feat/harmony_agent #220
@@ -1,2 +1,6 @@
|
||||
target/
|
||||
Dockerfile
|
||||
.git
|
||||
data
|
||||
target
|
||||
demos
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -24,3 +24,5 @@ Cargo.lock
|
||||
|
||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||
*.pdb
|
||||
|
||||
.harmony_generated
|
||||
|
||||
218
Cargo.lock
generated
218
Cargo.lock
generated
@@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"const-random",
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
@@ -450,6 +450,43 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-nats"
|
||||
version = "0.45.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"memchr",
|
||||
"nkeys",
|
||||
"nuid",
|
||||
"once_cell",
|
||||
"pin-project",
|
||||
"portable-atomic",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"ring",
|
||||
"rustls-native-certs 0.7.3",
|
||||
"rustls-pemfile 2.2.0",
|
||||
"rustls-webpki 0.102.8",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_nanos",
|
||||
"serde_repr",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.2",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tokio-websockets",
|
||||
"tracing",
|
||||
"tryhard",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.6"
|
||||
@@ -775,6 +812,9 @@ name = "bytes"
|
||||
version = "1.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bytestring"
|
||||
@@ -1583,6 +1623,7 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
"serde",
|
||||
"sha2",
|
||||
"signature",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
@@ -2456,21 +2497,21 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.3"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
||||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi 0.14.3+wasi-0.2.4",
|
||||
"wasip2",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
@@ -2572,6 +2613,7 @@ dependencies = [
|
||||
"env_logger",
|
||||
"fqdn",
|
||||
"futures-util",
|
||||
"harmony_execution",
|
||||
"harmony_inventory_agent",
|
||||
"harmony_macros",
|
||||
"harmony_secret",
|
||||
@@ -2619,6 +2661,43 @@ dependencies = [
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_agent"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"async-nats",
|
||||
"async-trait",
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"getrandom 0.3.4",
|
||||
"harmony",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"pretty_assertions",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.16",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_agent_deploy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cidr",
|
||||
"env_logger",
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"harmony_macros",
|
||||
"harmony_types",
|
||||
"log",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_cli"
|
||||
version = "0.1.0"
|
||||
@@ -2659,6 +2738,16 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_execution"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"directories",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"thiserror 2.0.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "harmony_inventory_agent"
|
||||
version = "0.1.0"
|
||||
@@ -3523,7 +3612,7 @@ version = "0.1.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"libc",
|
||||
]
|
||||
|
||||
@@ -3963,7 +4052,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@@ -3975,7 +4064,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
@@ -4022,6 +4111,21 @@ dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nkeys"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
|
||||
dependencies = [
|
||||
"data-encoding",
|
||||
"ed25519",
|
||||
"ed25519-dalek",
|
||||
"getrandom 0.2.16",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"signatory",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "non-blank-string-rs"
|
||||
version = "1.0.4"
|
||||
@@ -4040,6 +4144,15 @@ dependencies = [
|
||||
"winapi 0.3.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nuid"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
|
||||
dependencies = [
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.6"
|
||||
@@ -4660,7 +4773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"lru-slab",
|
||||
"rand 0.9.2",
|
||||
"ring",
|
||||
@@ -4765,7 +4878,7 @@ version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5301,6 +5414,16 @@ dependencies = [
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.102.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.103.4"
|
||||
@@ -5564,6 +5687,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_nanos"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_path_to_error"
|
||||
version = "0.1.17"
|
||||
@@ -5731,6 +5863,18 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signatory"
|
||||
version = "0.27.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
|
||||
dependencies = [
|
||||
"pkcs8",
|
||||
"rand_core 0.6.4",
|
||||
"signature",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signature"
|
||||
version = "2.2.0"
|
||||
@@ -6314,7 +6458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"once_cell",
|
||||
"rustix 1.0.8",
|
||||
"windows-sys 0.60.2",
|
||||
@@ -6538,6 +6682,27 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-websockets"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http 1.3.1",
|
||||
"httparse",
|
||||
"rand 0.8.5",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.2",
|
||||
"tokio-util",
|
||||
"webpki-roots 0.26.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.8.23"
|
||||
@@ -6689,6 +6854,16 @@ version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
|
||||
[[package]]
|
||||
name = "tryhard"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
|
||||
dependencies = [
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tui-logger"
|
||||
version = "0.14.5"
|
||||
@@ -6865,7 +7040,7 @@ version = "1.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
|
||||
dependencies = [
|
||||
"getrandom 0.3.3",
|
||||
"getrandom 0.3.4",
|
||||
"js-sys",
|
||||
"rand 0.9.2",
|
||||
"uuid-macro-internal",
|
||||
@@ -6936,10 +7111,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.3+wasi-0.2.4"
|
||||
name = "wasip2"
|
||||
version = "1.0.2+wasi-0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95"
|
||||
checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
|
||||
dependencies = [
|
||||
"wit-bindgen",
|
||||
]
|
||||
@@ -7061,6 +7236,15 @@ version = "0.25.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "0.26.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
|
||||
dependencies = [
|
||||
"webpki-roots 1.0.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "1.0.2"
|
||||
@@ -7438,9 +7622,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen"
|
||||
version = "0.45.0"
|
||||
version = "0.51.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814"
|
||||
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
|
||||
@@ -7,6 +7,7 @@ members = [
|
||||
"harmony_types",
|
||||
"harmony_macros",
|
||||
"harmony_tui",
|
||||
"harmony_execution",
|
||||
"opnsense-config",
|
||||
"opnsense-config-xml",
|
||||
"harmony_cli",
|
||||
@@ -17,6 +18,8 @@ members = [
|
||||
"harmony_secret",
|
||||
"adr/agent_discovery/mdns",
|
||||
"brocade",
|
||||
"harmony_agent",
|
||||
"harmony_agent/deploy",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
|
||||
|
||||
In other words, Harmony is a **next-generation platform engineering framework**.
|
||||
|
||||
_By [NationTech](https://nationtech.io)_
|
||||
|
||||
[](https://git.nationtech.io/nationtech/harmony)
|
||||
|
||||
141
adr/018-Template-Hydration-For-Workload-Deployment.md
Normal file
141
adr/018-Template-Hydration-For-Workload-Deployment.md
Normal file
@@ -0,0 +1,141 @@
|
||||
# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
|
||||
|
||||
Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
|
||||
|
||||
Initial Date: 2025-01-23
|
||||
|
||||
Last Updated Date: 2025-01-23
|
||||
|
||||
## Status
|
||||
|
||||
Implemented
|
||||
|
||||
## Context
|
||||
|
||||
Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time.
|
||||
|
||||
After investigating a few approaches, such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found that this approach also suffered from several fundamental limitations:
|
||||
|
||||
* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
|
||||
* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
|
||||
* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
|
||||
* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
|
||||
|
||||
We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
|
||||
|
||||
## Decision
|
||||
|
||||
We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
|
||||
|
||||
Specifically:
|
||||
|
||||
* **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
|
||||
* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
|
||||
* **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
|
||||
* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
|
||||
|
||||
The implementation in `backend_app.rs` demonstrates this pattern:
|
||||
|
||||
```rust
|
||||
let deployment = Deployment {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.name.clone()),
|
||||
labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(DeploymentSpec { /* ... */ }),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let deployment_yaml = serde_yaml::to_string(&deployment)?;
|
||||
fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
|
||||
```
|
||||
|
||||
## Rationale
|
||||
|
||||
**Aligns with "Infrastructure as Resilient Code"**
|
||||
|
||||
Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
|
||||
|
||||
* **Refactorability:** Rename a label and the compiler catches all usages.
|
||||
* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
|
||||
* **Code Navigation:** Jump to definition shows exactly where a value comes from.
|
||||
|
||||
**Achieves "Prove It Works — Before You Deploy"**
|
||||
|
||||
The compiler now validates that:
|
||||
|
||||
* All required fields are populated (Rust's `Option` type prevents missing fields).
|
||||
* Field types match expectations (ports are integers, not strings).
|
||||
* Enums contain valid values (e.g., `ServiceType::ClusterIP`).
|
||||
|
||||
This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
|
||||
|
||||
**Enables True Unit Testing**
|
||||
|
||||
Developers can now write unit tests that assert directly against typed objects:
|
||||
|
||||
```rust
|
||||
let deployment = create_deployment(&app);
|
||||
assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
|
||||
assert_eq!(deployment.metadata.name.unwrap(), "my-app");
|
||||
```
|
||||
|
||||
No string parsing, no YAML serialization, no fragile assertions against rendered output.
|
||||
|
||||
**Preserves Ecosystem Benefits**
|
||||
|
||||
By generating standard Helm chart structures, Harmony retains compatibility with:
|
||||
|
||||
* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
|
||||
* **ArgoCD:** Syncs and manages releases using the generated charts.
|
||||
* **Existing Workflows:** Teams already consuming Helm charts see no change.
|
||||
|
||||
The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
|
||||
* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
|
||||
* **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
|
||||
* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
|
||||
* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
|
||||
|
||||
### Negative
|
||||
|
||||
* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
|
||||
* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
|
||||
* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
|
||||
* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model.
|
||||
|
||||
## Alternatives Considered
|
||||
|
||||
### 1. Enhance Askama with Compile-Time Validation
|
||||
*Pros:* Stay within familiar templating paradigm; minimal code changes.
|
||||
*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
|
||||
|
||||
### 2. Use Helm SDK Programmatically (Go)
|
||||
*Pros:* Direct access to Helm's template engine; no YAML serialization step.
|
||||
*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
|
||||
|
||||
### 3. Raw YAML String Templating (Manual)
|
||||
*Pros:* Maximum control; no external dependencies.
|
||||
*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
|
||||
|
||||
### 4. Use Kustomize for All Manifests
|
||||
*Pros:* Declarative overlays; standard tool.
|
||||
*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
|
||||
|
||||
__Note that this template hydration architecture still allows templates to be overridden with tools like Kustomize when required__
|
||||
|
||||
## Additional Notes
|
||||
|
||||
**Scalability to Future Topologies**
|
||||
|
||||
The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
|
||||
|
||||
**Implementation Status**
|
||||
|
||||
As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::net::{IpAddr, Ipv4Addr};
|
||||
|
||||
use brocade::{BrocadeOptions, ssh};
|
||||
use harmony_secret::{Secret, SecretManager};
|
||||
use harmony_secret::Secret;
|
||||
use harmony_types::switch::PortLocation;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -56,6 +56,8 @@ async fn main() {
|
||||
)),
|
||||
};
|
||||
|
||||
// TODO exec pod commands to initialize secret store if not already done
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(),
|
||||
|
||||
@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
|
||||
opnsense-config-xml = { path = "../opnsense-config-xml" }
|
||||
harmony_macros = { path = "../harmony_macros" }
|
||||
harmony_types = { path = "../harmony_types" }
|
||||
harmony_execution = { path = "../harmony_execution" }
|
||||
uuid.workspace = true
|
||||
url.workspace = true
|
||||
kube = { workspace = true, features = ["derive"] }
|
||||
|
||||
801
harmony/src/modules/application/backend_app.rs
Normal file
801
harmony/src/modules/application/backend_app.rs
Normal file
@@ -0,0 +1,801 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, info, trace};
|
||||
use serde::Serialize;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::{
|
||||
config::{REGISTRY_PROJECT, REGISTRY_URL},
|
||||
modules::application::{
|
||||
Application, HelmPackage, OCICompliant,
|
||||
config::ApplicationNetworkPort,
|
||||
helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
|
||||
},
|
||||
};
|
||||
use harmony_execution::{RunnerOptions, run_command};
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct BuildCommand {
|
||||
pub program: String,
|
||||
pub args: Vec<String>,
|
||||
}
|
||||
|
||||
impl BuildCommand {
|
||||
pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
|
||||
Self {
|
||||
program: program.into(),
|
||||
args: args.into_iter().map(|s| s.into()).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_std_command(&self) -> std::process::Command {
|
||||
let mut cmd = std::process::Command::new(&self.program);
|
||||
cmd.args(&self.args);
|
||||
cmd
|
||||
}
|
||||
}
|
||||
|
||||
/// A backend application that Harmony can build into an OCI image and
/// package as a Helm chart.
#[derive(Debug, Clone, Serialize)]
pub struct BackendApp {
    // Application name; reused as the local image name, the Helm chart name,
    // and the Kubernetes `app.kubernetes.io/name` label value.
    pub name: String,
    // Root directory of the application's source tree; also the docker build
    // context and the parent of the generated `.harmony_generated/` output.
    pub project_root: std::path::PathBuf,
    // Network ports the application exposes; drives the generated
    // Deployment container ports and Service ports.
    pub network_ports: Vec<ApplicationNetworkPort>,
    // Environment variables injected into the container, as (key, value) pairs.
    pub env_vars: Vec<(String, String)>,
    // Command used to build the application artifact.
    pub build_cmd: BuildCommand,
    // Optional explicit Dockerfile path. When `None`,
    // `<project_root>/Dockerfile` is used if it exists (see `get_dockerfile`).
    pub dockerfile: Option<PathBuf>,
}
|
||||
|
||||
impl BackendApp {
|
||||
fn get_dockerfile(&self) -> Result<PathBuf, String> {
|
||||
debug!(
|
||||
"Looking for dockerfile, currently set to {:?}",
|
||||
self.dockerfile
|
||||
);
|
||||
if let Some(dockerfile) = &self.dockerfile {
|
||||
return match dockerfile.exists() {
|
||||
true => {
|
||||
info!(
|
||||
"Found dockerfile as intended at {}",
|
||||
dockerfile.to_string_lossy()
|
||||
);
|
||||
Ok(dockerfile.clone())
|
||||
}
|
||||
false => Err(format!(
|
||||
"Dockerfile explicitely set to {dockerfile} does not exist",
|
||||
dockerfile = dockerfile.to_string_lossy()
|
||||
)),
|
||||
};
|
||||
}
|
||||
|
||||
let existing_dockerfile = self.project_root.join("Dockerfile");
|
||||
|
||||
debug!("project_root = {:?}", self.project_root);
|
||||
|
||||
debug!("checking = {:?}", existing_dockerfile);
|
||||
if existing_dockerfile.exists() {
|
||||
debug!(
|
||||
"Checking path {:#?} for existing Dockerfile",
|
||||
self.project_root.clone()
|
||||
);
|
||||
return Ok(existing_dockerfile);
|
||||
}
|
||||
Err(format!(
|
||||
"Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}",
|
||||
project_root = self.project_root.to_string_lossy(),
|
||||
existing_dockerfile = existing_dockerfile.to_string_lossy(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl Application for BackendApp {
    /// The application's display name (owned copy for the caller).
    fn name(&self) -> String {
        self.name.to_owned()
    }
}
|
||||
|
||||
#[async_trait]
|
||||
impl OCICompliant for BackendApp {
|
||||
async fn build_push_oci_image(&self) -> Result<String, String> {
|
||||
let dockerfile = self.get_dockerfile()?;
|
||||
let image_tag = self.image_name();
|
||||
|
||||
// Run docker build command, streaming output to console and capturing it
|
||||
let output = run_command(
|
||||
std::process::Command::new("docker").args([
|
||||
"build",
|
||||
"-t",
|
||||
&image_tag,
|
||||
"-f",
|
||||
&dockerfile.to_string_lossy(),
|
||||
&self.project_root.to_string_lossy(),
|
||||
]),
|
||||
RunnerOptions::print_to_console(),
|
||||
)
|
||||
.map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
|
||||
|
||||
if output.is_success() {
|
||||
info!("Docker image build succeeded");
|
||||
Ok(image_tag)
|
||||
} else {
|
||||
Err(format!(
|
||||
"Docker image build FAILED:\n{}",
|
||||
output.format_output()
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
fn local_image_name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
fn image_name(&self) -> String {
|
||||
format!(
|
||||
"{}/{}/{}",
|
||||
*REGISTRY_URL,
|
||||
*REGISTRY_PROJECT,
|
||||
&self.local_image_name()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl HelmPackage for BackendApp {
|
||||
fn project_root(&self) -> PathBuf {
|
||||
self.project_root.clone()
|
||||
}
|
||||
|
||||
fn chart_name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
|
||||
let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
|
||||
|
||||
// Build the typed Deployment object using the builder with initial options
|
||||
helm_chart.add_resource(HelmResourceKind::Deployment(
|
||||
DeploymentBuilder::with_options(
|
||||
&self.name,
|
||||
image_url,
|
||||
Some(self.network_ports.clone()),
|
||||
Some(self.env_vars.clone()),
|
||||
None,
|
||||
)
|
||||
.build(),
|
||||
));
|
||||
|
||||
// Build the typed Service object using the helper function
|
||||
if let Some(service) =
|
||||
helm::create_service_from_ports(self.name.clone(), &self.network_ports)
|
||||
{
|
||||
helm_chart.add_resource(HelmResourceKind::Service(service));
|
||||
}
|
||||
|
||||
// Write the Helm chart metadata to the project root
|
||||
let chart_dir = helm_chart
|
||||
.write_to(&self.project_root.join(".harmony_generated/helm/"))
|
||||
.map_err(|e| format!("Failed to write Helm chart: {}", e))?;
|
||||
|
||||
info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);
|
||||
|
||||
Ok(chart_dir.to_string_lossy().to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::modules::application::config::ApplicationNetworkPort;
|
||||
use crate::modules::application::config::NetworkProtocol;
|
||||
use k8s_openapi::api::apps::v1::Deployment;
|
||||
use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
|
||||
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||
use serde_yaml::from_str;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use tempfile::tempdir;
|
||||
|
||||
// Test Helpers
|
||||
/// Reads the generated `service.yaml` for `chart_name` under `project_root`
/// and parses it into a typed Kubernetes `Service`; panics with a
/// descriptive message if the file is missing or malformed.
fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
    let path = project_root.join(format!(
        ".harmony_generated/helm/{chart_name}/templates/service.yaml"
    ));
    let content = fs::read_to_string(&path)
        .unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
    from_str(&content)
        .unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
}

/// Reads the generated `deployment.yaml` for `chart_name` under
/// `project_root` and parses it into a typed Kubernetes `Deployment`;
/// panics with a descriptive message if the file is missing or malformed.
fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
    let path = project_root.join(format!(
        ".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
    ));
    let content = fs::read_to_string(&path)
        .unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e));
    from_str(&content)
        .unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e))
}

/// Returns true if a `service.yaml` was generated for `chart_name`
/// (used to assert that port-less apps produce no Service).
fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool {
    let path = project_root.join(format!(
        ".harmony_generated/helm/{chart_name}/templates/service.yaml"
    ));
    path.exists()
}
|
||||
|
||||
// Service Assertions
|
||||
/// Asserts that the Service's metadata name equals `expected_name`.
fn assert_service_metadata(service: &K8sService, expected_name: &str) {
    assert_eq!(
        service.metadata.name.as_deref(),
        Some(expected_name),
        "Service name should be '{expected_name}'"
    );
}

/// Asserts that the Service's `spec.type` equals `expected_type`
/// (e.g. "ClusterIP").
fn assert_service_type(service: &K8sService, expected_type: &str) {
    assert_eq!(
        service.spec.as_ref().and_then(|s| s.type_.as_deref()),
        Some(expected_type),
        "Service type should be '{expected_type}'"
    );
}

/// Asserts that the Service declares exactly `expected_count` ports.
/// Panics if the Service has no spec or no ports list at all.
fn assert_service_port_count(service: &K8sService, expected_count: usize) {
    let ports = service
        .spec
        .as_ref()
        .and_then(|s| s.ports.as_ref())
        .unwrap_or_else(|| panic!("Service should have ports"));
    assert_eq!(
        ports.len(),
        expected_count,
        "Service should have {expected_count} ports"
    );
}

/// Asserts a single `ServicePort`'s name, protocol, and port number.
fn assert_service_port(
    port: &ServicePort,
    expected_name: &str,
    expected_protocol: &str,
    expected_number: i32,
) {
    assert_eq!(
        port.name.as_deref(),
        Some(expected_name),
        "Port name should be '{expected_name}'"
    );
    assert_eq!(
        port.protocol.as_deref(),
        Some(expected_protocol),
        "Port '{expected_name}' protocol should be '{expected_protocol}'"
    );
    assert_eq!(
        port.port, expected_number,
        "Port '{expected_name}' number should be {expected_number}"
    );
}

/// Asserts that the port's target port is an integer equal to the service
/// port itself (i.e. no named target ports and no port remapping).
fn assert_target_port_matches_service_port(port: &ServicePort) {
    match &port.target_port {
        Some(IntOrString::Int(target)) => {
            assert_eq!(
                *target,
                port.port,
                "Target port should match service port for '{}'",
                port.name.as_deref().unwrap_or("unknown")
            );
        }
        _ => panic!(
            "Target port should be Int for '{}'",
            port.name.as_deref().unwrap_or("unknown")
        ),
    }
}
|
||||
|
||||
// Deployment Assertions
|
||||
fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) {
|
||||
assert_eq!(
|
||||
deployment.metadata.name.as_deref(),
|
||||
Some(expected_name),
|
||||
"Deployment name should be '{expected_name}'"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) {
|
||||
let spec = deployment
|
||||
.spec
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||
assert_eq!(
|
||||
spec.replicas,
|
||||
Some(expected_replicas),
|
||||
"Deployment should have {expected_replicas} replicas"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) {
|
||||
let spec = deployment
|
||||
.spec
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||
assert_eq!(
|
||||
spec.selector
|
||||
.match_labels
|
||||
.as_ref()
|
||||
.and_then(|m| m.get("app.kubernetes.io/name")),
|
||||
Some(&expected_label_value.to_string()),
|
||||
"Selector should match app name '{expected_label_value}'"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_pod_labels(deployment: &Deployment, expected_name: &str) {
|
||||
let spec = deployment
|
||||
.spec
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||
let metadata = spec
|
||||
.template
|
||||
.metadata
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Pod template should have metadata"));
|
||||
let labels = metadata
|
||||
.labels
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Pod should have labels"));
|
||||
|
||||
assert_eq!(
|
||||
labels.get("app.kubernetes.io/name"),
|
||||
Some(&expected_name.to_string()),
|
||||
"Pod label app.kubernetes.io/name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
labels.get("app.kubernetes.io/instance"),
|
||||
Some(&expected_name.to_string()),
|
||||
"Pod label app.kubernetes.io/instance should be '{expected_name}'"
|
||||
);
|
||||
}
|
||||
|
||||
// Container Assertions
|
||||
fn assert_container_metadata(
|
||||
container: &Container,
|
||||
expected_name: &str,
|
||||
expected_image: &str,
|
||||
expected_pull_policy: &str,
|
||||
) {
|
||||
assert_eq!(
|
||||
container.name, expected_name,
|
||||
"Container name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
container.image.as_deref(),
|
||||
Some(expected_image),
|
||||
"Container image should be '{expected_image}'"
|
||||
);
|
||||
assert_eq!(
|
||||
container.image_pull_policy.as_deref(),
|
||||
Some(expected_pull_policy),
|
||||
"Image pull policy should be '{expected_pull_policy}'"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_ports_count(container: &Container, expected_count: usize) {
|
||||
let ports = container
|
||||
.ports
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Container should have ports"));
|
||||
assert_eq!(
|
||||
ports.len(),
|
||||
expected_count,
|
||||
"Container should have {expected_count} ports"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_port(
|
||||
port: &k8s_openapi::api::core::v1::ContainerPort,
|
||||
expected_name: &str,
|
||||
expected_protocol: &str,
|
||||
expected_number: i32,
|
||||
) {
|
||||
assert_eq!(
|
||||
port.name.as_deref(),
|
||||
Some(expected_name),
|
||||
"Container port name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
port.protocol.as_deref(),
|
||||
Some(expected_protocol),
|
||||
"Container port '{expected_name}' protocol should be '{expected_protocol}'"
|
||||
);
|
||||
assert_eq!(
|
||||
port.container_port, expected_number,
|
||||
"Container port '{expected_name}' number should be {expected_number}"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_env_vars_count(container: &Container, expected_count: usize) {
|
||||
let env_vars = container
|
||||
.env
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| panic!("Container should have env vars"));
|
||||
assert_eq!(
|
||||
env_vars.len(),
|
||||
expected_count,
|
||||
"Container should have {expected_count} env vars"
|
||||
);
|
||||
}
|
||||
|
||||
fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) {
|
||||
assert_eq!(
|
||||
env_var.name, expected_name,
|
||||
"Env var name should be '{expected_name}'"
|
||||
);
|
||||
assert_eq!(
|
||||
env_var.value.as_deref(),
|
||||
Some(expected_value),
|
||||
"Env var '{expected_name}' value should be '{expected_value}'"
|
||||
);
|
||||
}
|
||||
|
||||
/// Returns a clone of the first (and only expected) container in the
/// Deployment's pod template. Panics when the spec / template / containers
/// chain is missing or empty.
fn get_container(deployment: &Deployment) -> Container {
    // Single chained expression; `.expect` replaces the former
    // `unwrap_or_else(|| panic!(...))` closures with identical messages.
    deployment
        .spec
        .as_ref()
        .expect("Deployment should have spec")
        .template
        .spec
        .as_ref()
        .expect("Pod template should have spec")
        .containers
        .first()
        .expect("Should have exactly one container")
        .clone()
}
|
||||
|
||||
// Test Fixtures
|
||||
fn standard_test_ports() -> Vec<ApplicationNetworkPort> {
|
||||
vec![
|
||||
ApplicationNetworkPort {
|
||||
number: 8080,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "http".to_string(),
|
||||
},
|
||||
ApplicationNetworkPort {
|
||||
number: 9000,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "metrics".to_string(),
|
||||
},
|
||||
ApplicationNetworkPort {
|
||||
number: 50051,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "grpc".to_string(),
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
fn standard_test_env_vars() -> Vec<(String, String)> {
|
||||
vec![
|
||||
("ENV_VAR_1".to_string(), "value1".to_string()),
|
||||
("ENV_VAR_2".to_string(), "value2".to_string()),
|
||||
]
|
||||
}
|
||||
|
||||
fn udp_test_ports() -> Vec<ApplicationNetworkPort> {
|
||||
vec![
|
||||
ApplicationNetworkPort {
|
||||
number: 53,
|
||||
protocol: NetworkProtocol::UDP,
|
||||
name: "dns".to_string(),
|
||||
},
|
||||
ApplicationNetworkPort {
|
||||
number: 8080,
|
||||
protocol: NetworkProtocol::TCP,
|
||||
name: "http".to_string(),
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
// Test Builder

/// Fluent builder for `BackendApp` test instances; any field left unset
/// falls back to a default in `build` ("test-app" name, empty ports/env vars).
struct BackendAppTestBuilder {
    // Application name; `build` defaults this to "test-app".
    name: Option<String>,
    // Ports the app exposes; `build` defaults this to an empty Vec.
    network_ports: Option<Vec<ApplicationNetworkPort>>,
    // Environment variables as (key, value) pairs; `build` defaults to empty.
    env_vars: Option<Vec<(String, String)>>,
}
|
||||
|
||||
impl BackendAppTestBuilder {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
name: None,
|
||||
network_ports: None,
|
||||
env_vars: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn with_name(mut self, name: impl Into<String>) -> Self {
|
||||
self.name = Some(name.into());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_standard_ports(mut self) -> Self {
|
||||
self.network_ports = Some(standard_test_ports());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_udp_ports(mut self) -> Self {
|
||||
self.network_ports = Some(udp_test_ports());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_standard_env_vars(mut self) -> Self {
|
||||
self.env_vars = Some(standard_test_env_vars());
|
||||
self
|
||||
}
|
||||
|
||||
fn with_no_ports(mut self) -> Self {
|
||||
self.network_ports = Some(vec![]);
|
||||
self
|
||||
}
|
||||
|
||||
fn build(self, project_root: PathBuf) -> BackendApp {
|
||||
BackendApp {
|
||||
name: self.name.unwrap_or_else(|| "test-app".to_string()),
|
||||
project_root,
|
||||
network_ports: self.network_ports.unwrap_or_default(),
|
||||
env_vars: self.env_vars.unwrap_or_default(),
|
||||
build_cmd: BuildCommand::new("cargo", vec!["build"]),
|
||||
dockerfile: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BackendAppTestBuilder {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function for test setup

/// Builds and pushes the Helm package for `app` against `image_url`,
/// failing the calling test with the full error if the build fails.
async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) {
    let result = app.build_push_helm_package(image_url).await;
    assert!(
        result.is_ok(),
        "build_push_helm_package should succeed: {:?}",
        result
    );
}
|
||||
|
||||
// ===== SERVICE TESTS =====

/// The generated Service must be named after the application.
#[tokio::test]
async fn service_is_created_with_application_name() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    assert_service_metadata(&service, "test-app");
}

/// With no explicit type, the Service must default to ClusterIP.
#[tokio::test]
async fn service_has_default_clusterip_type() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    assert_service_type(&service, "ClusterIP");
}

/// Every configured application port must appear on the Service,
/// in declaration order.
#[tokio::test]
async fn service_exposes_all_network_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    assert_service_port_count(&service, 3);

    let ports = service.spec.unwrap().ports.unwrap();
    assert_service_port(&ports[0], "http", "TCP", 8080);
    assert_service_port(&ports[1], "metrics", "TCP", 9000);
    assert_service_port(&ports[2], "grpc", "TCP", 50051);
}

/// Each Service port's targetPort must equal its port number.
#[tokio::test]
async fn service_target_ports_match_service_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "test-app");
    let ports = service.spec.unwrap().ports.unwrap();

    for port in &ports {
        assert_target_port_matches_service_port(port);
    }
}

/// An application with zero ports must not generate a service.yaml at all.
#[tokio::test]
async fn service_not_created_when_application_has_no_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app-no-ports")
        .with_no_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await;

    assert!(
        !service_yaml_exists(&app.project_root, "test-app-no-ports"),
        "service.yaml should not exist when there are no network ports"
    );
}

/// UDP and TCP ports must each be rendered with their own protocol.
#[tokio::test]
async fn service_respects_port_protocol_type() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("udp-app")
        .with_udp_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await;

    let service = read_service_yaml(&app.project_root, "udp-app");
    let ports = service.spec.unwrap().ports.unwrap();

    assert_service_port(&ports[0], "dns", "UDP", 53);
    assert_service_port(&ports[1], "http", "TCP", 8080);
}
|
||||
|
||||
// ===== DEPLOYMENT METADATA TESTS =====

/// The generated Deployment must be named after the application.
#[tokio::test]
async fn deployment_has_application_name() {
    let temp_dir = tempdir().expect("Failed to create temp directory")
;
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_deployment_metadata(&deployment, "test-app");
}

/// Without explicit configuration, the Deployment must request one replica.
#[tokio::test]
async fn deployment_has_single_replica_by_default() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_deployment_replicas(&deployment, 1);
}

/// The Deployment's selector must target pods labeled with the app name.
#[tokio::test]
async fn deployment_selector_matches_application_name() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_selector_match_label(&deployment, "test-app");
}

/// Pods must carry the standard `app.kubernetes.io/name` and
/// `app.kubernetes.io/instance` labels.
#[tokio::test]
async fn pod_has_standard_kubernetes_labels() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    assert_pod_labels(&deployment, "test-app");
}
|
||||
|
||||
// ===== CONTAINER CONFIGURATION TESTS =====

/// The container must be named after the app, use the pushed image,
/// and default to IfNotPresent pull policy.
#[tokio::test]
async fn container_has_correct_name_and_image() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    let image_url = "registry.example.com/test/test-app:1.0.0";
    build_helm_chart_for_test(&app, image_url).await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    let container = get_container(&deployment);

    assert_container_metadata(&container, "test-app", image_url, "IfNotPresent");
}

/// Every configured application port must be declared on the container.
#[tokio::test]
async fn container_exposes_all_application_ports() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    let container = get_container(&deployment);

    assert_container_ports_count(&container, 3);

    let ports = container.ports.unwrap();
    assert_container_port(&ports[0], "http", "TCP", 8080);
    assert_container_port(&ports[1], "metrics", "TCP", 9000);
    assert_container_port(&ports[2], "grpc", "TCP", 50051);
}

/// All configured env vars must be present on the container, in order.
#[tokio::test]
async fn container_has_all_environment_variables() {
    let temp_dir = tempdir().expect("Failed to create temp directory");
    let app = BackendAppTestBuilder::new()
        .with_name("test-app")
        .with_standard_ports()
        .with_standard_env_vars()
        .build(temp_dir.path().to_path_buf());

    build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;

    let deployment = read_deployment_yaml(&app.project_root, "test-app");
    let container = get_container(&deployment);

    assert_container_env_vars_count(&container, 2);

    let env_vars = container.env.unwrap();
    assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1");
    assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2");
}
|
||||
|
||||
// ===== BUILD COMMAND UNIT TESTS =====

/// `BuildCommand::new` must store the program and its arguments verbatim.
#[test]
fn build_command_creation_sets_program_and_args() {
    let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
    assert_eq!(cmd.program, "docker");
    assert_eq!(cmd.args, vec!["build", "-t", "myimage"]);
}

/// Cloning a `BuildCommand` must copy both fields.
#[test]
fn build_command_clone_copies_all_fields() {
    let original = BuildCommand::new("cargo", vec!["build", "--release"]);
    let copy = original.clone();
    assert_eq!(original.program, copy.program);
    assert_eq!(original.args, copy.args);
}
|
||||
}
|
||||
29
harmony/src/modules/application/config.rs
Normal file
29
harmony/src/modules/application/config.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub enum NetworkProtocol {
|
||||
TCP,
|
||||
UDP,
|
||||
}
|
||||
|
||||
impl NetworkProtocol {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
NetworkProtocol::TCP => "TCP",
|
||||
NetworkProtocol::UDP => "UDP",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for NetworkProtocol {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// A single network port exposed by an application; used to generate both
/// container ports and Service ports in the Helm chart.
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationNetworkPort {
    /// Port number.
    pub number: u16,
    /// TCP or UDP.
    pub protocol: NetworkProtocol,
    /// Port name as it appears in the generated manifests (e.g. "http").
    pub name: String,
}
|
||||
@@ -48,11 +48,11 @@ use crate::{
|
||||
/// - ArgoCD to install/upgrade/rollback/inspect k8s resources
|
||||
/// - Kubernetes for runtime orchestration
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> {
|
||||
pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
|
||||
pub application: Arc<A>,
|
||||
}
|
||||
|
||||
impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
|
||||
impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
|
||||
async fn deploy_to_local_k3d(
|
||||
&self,
|
||||
app_name: String,
|
||||
@@ -138,7 +138,7 @@ impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
|
||||
|
||||
#[async_trait]
|
||||
impl<
|
||||
A: OCICompliant + HelmPackage + Webapp + Clone + 'static,
|
||||
A: OCICompliant + HelmPackage + Clone + 'static,
|
||||
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
|
||||
> ApplicationFeature<T> for PackagingDeployment<A>
|
||||
{
|
||||
@@ -148,24 +148,12 @@ impl<
|
||||
) -> Result<InstallationOutcome, InstallationError> {
|
||||
let image = self.application.image_name();
|
||||
|
||||
let domain = if topology.current_target() == DeploymentTarget::Production {
|
||||
self.application.dns()
|
||||
} else {
|
||||
topology
|
||||
.get_domain(&self.application.name())
|
||||
.await
|
||||
.map_err(|e| e.to_string())?
|
||||
};
|
||||
|
||||
// TODO Write CI/CD workflow files
|
||||
// we can autotedect the CI type using the remote url (default to github action for github
|
||||
// url, etc..)
|
||||
// Or ask for it when unknown
|
||||
|
||||
let helm_chart = self
|
||||
.application
|
||||
.build_push_helm_package(&image, &domain)
|
||||
.await?;
|
||||
let helm_chart = self.application.build_push_helm_package(&image).await?;
|
||||
|
||||
// TODO: Make building image configurable/skippable if image already exists (prompt)")
|
||||
// https://git.nationtech.io/NationTech/harmony/issues/104
|
||||
@@ -215,12 +203,12 @@ impl<
|
||||
};
|
||||
|
||||
Ok(InstallationOutcome::success_with_details(vec![format!(
|
||||
"{}: http://{domain}",
|
||||
"{}",
|
||||
self.application.name()
|
||||
)]))
|
||||
}
|
||||
fn name(&self) -> String {
|
||||
"ContinuousDelivery".to_string()
|
||||
"PackagingDeployment".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
446
harmony/src/modules/application/helm/mod.rs
Normal file
446
harmony/src/modules/application/helm/mod.rs
Normal file
@@ -0,0 +1,446 @@
|
||||
// Re-export common Kubernetes types for convenience
|
||||
pub use k8s_openapi::api::{
|
||||
apps::v1::{Deployment, DeploymentSpec},
|
||||
core::v1::{
|
||||
Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
|
||||
ServicePort, ServiceSpec,
|
||||
},
|
||||
};
|
||||
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||
use kube::core::ObjectMeta;
|
||||
|
||||
// Import domain types for the deployment builder
|
||||
use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Enum representing all supported Kubernetes resource types for Helm charts.
/// Supports built-in typed resources and custom CRDs via YAML strings.
///
/// Typed variants are serialized with serde_yaml at write time;
/// `CustomYaml` content is emitted verbatim.
pub enum HelmResourceKind {
    /// Built-in typed Service resource
    Service(K8sService),
    /// Built-in typed Deployment resource
    Deployment(Deployment),
    /// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
    CustomYaml { filename: String, content: String },
    // Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
}
|
||||
|
||||
impl HelmResourceKind {
|
||||
pub fn filename(&self) -> String {
|
||||
match self {
|
||||
HelmResourceKind::Service(_) => "service.yaml".to_string(),
|
||||
HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
|
||||
HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
|
||||
match self {
|
||||
HelmResourceKind::Service(s) => serde_yaml::to_string(s),
|
||||
HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
|
||||
HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_service(&self) -> Option<&K8sService> {
|
||||
match self {
|
||||
HelmResourceKind::Service(s) => Some(s),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_deployment(&self) -> Option<&Deployment> {
|
||||
match self {
|
||||
HelmResourceKind::Deployment(d) => Some(d),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a custom resource from any serializable type (e.g., CRDs, custom types)
|
||||
pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
|
||||
HelmResourceKind::CustomYaml {
|
||||
filename: filename.into(),
|
||||
content: content.into(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a custom resource from any type that implements Serialize
|
||||
pub fn from_serializable<T: serde::Serialize>(
|
||||
filename: impl Into<String>,
|
||||
resource: &T,
|
||||
) -> Result<Self, serde_yaml::Error> {
|
||||
Ok(HelmResourceKind::CustomYaml {
|
||||
filename: filename.into(),
|
||||
content: serde_yaml::to_string(resource)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The main orchestrator for building a Helm chart.
///
/// Collects typed/custom resources plus flat `key: value` entries for
/// values.yaml, then materializes the chart directory via `write_to`.
pub struct HelmChart {
    /// Chart name; also used as the output directory name.
    pub name: String,
    /// Chart version written to Chart.yaml (set to "0.1.0" by `new`).
    pub version: String,
    /// Application version written to Chart.yaml.
    pub app_version: String,
    /// Human-readable chart description.
    pub description: String,
    /// Resources rendered into `templates/`.
    pub resources: Vec<HelmResourceKind>,
    /// Raw `key: value` lines, newline-joined into values.yaml.
    pub values: Vec<String>,
}
|
||||
|
||||
impl HelmChart {
|
||||
pub fn new(name: String, app_version: String) -> Self {
|
||||
Self {
|
||||
name: name.clone(),
|
||||
version: "0.1.0".to_string(),
|
||||
app_version,
|
||||
description: format!("A Helm chart for {}", name),
|
||||
resources: Vec::new(),
|
||||
values: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_resource(&mut self, resource: HelmResourceKind) {
|
||||
self.resources.push(resource);
|
||||
}
|
||||
|
||||
pub fn add_value(&mut self, key: &str, value: &str) {
|
||||
self.values.push(format!("{}: {}", key, value));
|
||||
}
|
||||
|
||||
pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let chart_dir = base_path.join(&self.name);
|
||||
let templates_dir = chart_dir.join("templates");
|
||||
fs::create_dir_all(&templates_dir)?;
|
||||
|
||||
// 1. Render and write Chart.yaml
|
||||
let chart_yaml = ChartYaml {
|
||||
name: &self.name,
|
||||
description: &self.description,
|
||||
version: &self.version,
|
||||
app_version: &self.app_version,
|
||||
};
|
||||
fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
|
||||
|
||||
// 2. Write values.yaml (Constructed dynamically)
|
||||
let values_content = self.values.join("\n");
|
||||
fs::write(chart_dir.join("values.yaml"), values_content)?;
|
||||
|
||||
// 3. Serialize and write all added resources (Deployment, Service, etc.)
|
||||
for resource in &self.resources {
|
||||
let filename = resource.filename();
|
||||
let content = resource
|
||||
.serialize_to_yaml()
|
||||
.map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
|
||||
fs::write(templates_dir.join(filename), content)?;
|
||||
}
|
||||
|
||||
Ok(chart_dir)
|
||||
}
|
||||
}
|
||||
|
||||
use askama::Template;

/// Askama template context for rendering Chart.yaml
/// (template file: `helm/Chart.yaml.j2`).
#[derive(Template)]
#[template(path = "helm/Chart.yaml.j2")]
struct ChartYaml<'a> {
    name: &'a str,
    description: &'a str,
    version: &'a str,
    app_version: &'a str,
}
|
||||
|
||||
/// Builder for creating a Kubernetes Service with proper labels and selectors.
pub struct ServiceBuilder {
    // Service name; also used for the app.kubernetes.io/name label.
    name: String,
    // Kubernetes Service type; defaults to "ClusterIP" in `new`.
    service_type: String,
    // Accumulated ports; omitted from the spec entirely when empty.
    ports: Vec<ServicePort>,
    // Value for the app.kubernetes.io/name selector; empty until set.
    selector_label: String,
}
|
||||
|
||||
impl ServiceBuilder {
|
||||
pub fn new(name: impl Into<String>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
service_type: "ClusterIP".to_string(),
|
||||
ports: Vec::new(),
|
||||
selector_label: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
|
||||
self.service_type = service_type.into();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_port(
|
||||
mut self,
|
||||
name: impl Into<String>,
|
||||
port: i32,
|
||||
protocol: impl Into<String>,
|
||||
) -> Self {
|
||||
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||
self.ports.push(ServicePort {
|
||||
name: Some(name.into()),
|
||||
protocol: Some(protocol.into()),
|
||||
port,
|
||||
target_port: Some(IntOrString::Int(port)),
|
||||
..Default::default()
|
||||
});
|
||||
self
|
||||
}
|
||||
|
||||
pub fn selector_label(mut self, label: impl Into<String>) -> Self {
|
||||
self.selector_label = label.into();
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> K8sService {
|
||||
K8sService {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(self.name.clone()),
|
||||
labels: Some(
|
||||
[
|
||||
("app.kubernetes.io/name".to_string(), self.name.clone()),
|
||||
(
|
||||
"app.kubernetes.io/component".to_string(),
|
||||
"service".to_string(),
|
||||
),
|
||||
(
|
||||
"app.kubernetes.io/managed-by".to_string(),
|
||||
"harmony".to_string(),
|
||||
),
|
||||
]
|
||||
.into(),
|
||||
),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(ServiceSpec {
|
||||
type_: Some(self.service_type),
|
||||
selector: Some(
|
||||
[("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
|
||||
),
|
||||
ports: if self.ports.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(self.ports)
|
||||
},
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for creating a Kubernetes Deployment with pod template and container spec.
pub struct DeploymentBuilder {
    // Deployment/container name; also used for standard labels and selector.
    name: String,
    // Container image reference.
    image: String,
    // Replica count; defaults to 1 via `with_options`.
    replicas: i32,
    // Container ports declared on the single container.
    container_ports: Vec<ContainerPort>,
    // Literal-valued environment variables for the container.
    env_vars: Vec<EnvVar>,
    // Pull policy; `with_options` initializes it to Some("IfNotPresent").
    image_pull_policy: Option<String>,
}
|
||||
|
||||
impl DeploymentBuilder {
    /// Create a new DeploymentBuilder with minimal required fields.
    pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
        Self::with_options(name, image, None, None, None)
    }

    /// Create a new DeploymentBuilder with optional initial configuration.
    ///
    /// Arguments:
    /// - `name`: The deployment name
    /// - `image`: The container image to use
    /// - `ports`: Optional vector of initial application network ports
    /// - `env_vars`: Optional vector of initial environment variable key-value pairs
    /// - `replicas`: Optional number of replicas (defaults to 1)
    pub fn with_options(
        name: impl Into<String>,
        image: impl Into<String>,
        ports: Option<Vec<ApplicationNetworkPort>>,
        env_vars: Option<Vec<(String, String)>>,
        replicas: Option<i32>,
    ) -> Self {
        // Map domain ports (u16 + NetworkProtocol) onto k8s ContainerPort
        // (i32 + protocol string); u16 -> i32 cannot overflow.
        let container_ports: Vec<ContainerPort> = ports
            .unwrap_or_default()
            .into_iter()
            .map(|port| ContainerPort {
                container_port: port.number as i32,
                name: Some(port.name),
                protocol: Some(port.protocol.to_string()),
                ..Default::default()
            })
            .collect();

        // Literal-valued env vars only; no valueFrom sources are produced here.
        let k8s_env_vars: Vec<EnvVar> = env_vars
            .unwrap_or_default()
            .into_iter()
            .map(|(key, value)| EnvVar {
                name: key,
                value: Some(value),
                ..Default::default()
            })
            .collect();

        Self {
            name: name.into(),
            image: image.into(),
            replicas: replicas.unwrap_or(1),
            container_ports,
            env_vars: k8s_env_vars,
            image_pull_policy: Some("IfNotPresent".to_string()),
        }
    }

    /// Sets the replica count.
    pub fn replicas(mut self, replicas: i32) -> Self {
        self.replicas = replicas;
        self
    }

    /// Appends one container port.
    pub fn with_container_port(
        mut self,
        number: i32,
        name: impl Into<String>,
        protocol: impl Into<String>,
    ) -> Self {
        self.container_ports.push(ContainerPort {
            container_port: number,
            name: Some(name.into()),
            protocol: Some(protocol.into()),
            ..Default::default()
        });
        self
    }

    /// Appends one literal environment variable.
    pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
        self.env_vars.push(EnvVar {
            name: name.into(),
            value: Some(value.into()),
            ..Default::default()
        });
        self
    }

    /// Overrides the image pull policy (default "IfNotPresent").
    pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
        self.image_pull_policy = Some(policy.into());
        self
    }

    /// Builds the Deployment: standard harmony labels, a selector on
    /// app.kubernetes.io/name, and a single-container pod template.
    /// Empty port/env lists are omitted from the spec entirely.
    pub fn build(self) -> Deployment {
        let name = self.name.clone();
        Deployment {
            metadata: ObjectMeta {
                name: Some(name.clone()),
                labels: Some(
                    [
                        ("app.kubernetes.io/name".to_string(), name.clone()),
                        (
                            "app.kubernetes.io/component".to_string(),
                            "deployment".to_string(),
                        ),
                        (
                            "app.kubernetes.io/managed-by".to_string(),
                            "harmony".to_string(),
                        ),
                        // NOTE(review): version label is hardcoded; presumably it
                        // should track the chart's app_version — confirm.
                        ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
                    ]
                    .into(),
                ),
                ..Default::default()
            },
            spec: Some(DeploymentSpec {
                replicas: Some(self.replicas),
                selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
                    match_labels: Some(
                        [("app.kubernetes.io/name".to_string(), name.clone())].into(),
                    ),
                    ..Default::default()
                },
                template: PodTemplateSpec {
                    metadata: Some(ObjectMeta {
                        labels: Some(
                            [
                                ("app.kubernetes.io/name".to_string(), name.clone()),
                                ("app.kubernetes.io/instance".to_string(), name.clone()),
                            ]
                            .into(),
                        ),
                        ..Default::default()
                    }),
                    spec: Some(PodSpec {
                        containers: vec![Container {
                            name: name.clone(),
                            image: Some(self.image),
                            image_pull_policy: self.image_pull_policy,
                            ports: if self.container_ports.is_empty() {
                                None
                            } else {
                                Some(self.container_ports)
                            },
                            env: if self.env_vars.is_empty() {
                                None
                            } else {
                                Some(self.env_vars)
                            },
                            ..Default::default()
                        }],
                        ..Default::default()
                    }),
                },
                ..Default::default()
            }),
            ..Default::default()
        }
    }
}
|
||||
|
||||
/// Helper function to create a Service from network port configuration.
|
||||
/// Returns `None` if no ports are provided.
|
||||
pub fn create_service_from_ports(
|
||||
name: String,
|
||||
network_ports: &[ApplicationNetworkPort],
|
||||
) -> Option<K8sService> {
|
||||
if network_ports.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ports: Vec<ServicePort> = network_ports
|
||||
.into_iter()
|
||||
.map(|port| ServicePort {
|
||||
name: Some(port.name.clone()),
|
||||
protocol: Some(port.protocol.to_string()),
|
||||
port: port.number as i32,
|
||||
target_port: Some(IntOrString::Int(port.number as i32)),
|
||||
..Default::default()
|
||||
})
|
||||
.collect();
|
||||
|
||||
Some(K8sService {
|
||||
metadata: ObjectMeta {
|
||||
name: Some(name.clone()),
|
||||
labels: Some(
|
||||
[
|
||||
("app.kubernetes.io/name".to_string(), name.clone()),
|
||||
(
|
||||
"app.kubernetes.io/component".to_string(),
|
||||
"service".to_string(),
|
||||
),
|
||||
(
|
||||
"app.kubernetes.io/managed-by".to_string(),
|
||||
"harmony".to_string(),
|
||||
),
|
||||
]
|
||||
.into(),
|
||||
),
|
||||
..Default::default()
|
||||
},
|
||||
spec: Some(ServiceSpec {
|
||||
type_: Some("ClusterIP".to_string()),
|
||||
selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
|
||||
ports: Some(ports),
|
||||
..Default::default()
|
||||
}),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
@@ -1,5 +1,8 @@
|
||||
pub mod backend_app;
|
||||
pub mod config;
|
||||
mod feature;
|
||||
pub mod features;
|
||||
pub mod helm;
|
||||
pub mod oci;
|
||||
mod rust;
|
||||
mod webapp;
|
||||
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks the output of a process command for success.
|
||||
fn check_output(
|
||||
output: &std::process::Output,
|
||||
msg: &str,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
if !output.status.success() {
|
||||
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
|
||||
return Err(error_message.into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use crate::{
|
||||
config::{REGISTRY_PROJECT, REGISTRY_URL},
|
||||
modules::application::check_output,
|
||||
};
|
||||
|
||||
use super::Application;
|
||||
use async_trait::async_trait;
|
||||
use log::debug;
|
||||
|
||||
#[async_trait]
|
||||
pub trait OCICompliant: Application {
|
||||
@@ -17,9 +25,74 @@ pub trait HelmPackage: Application {
|
||||
/// # Arguments
|
||||
/// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
|
||||
/// * `domain` - The domain where the application is hosted.
|
||||
async fn build_push_helm_package(
|
||||
&self,
|
||||
image_url: &str,
|
||||
domain: &str,
|
||||
) -> Result<String, String>;
|
||||
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
|
||||
|
||||
fn project_root(&self) -> PathBuf;
|
||||
|
||||
fn chart_name(&self) -> String;
|
||||
|
||||
/// Packages a Helm chart directory into a .tgz file.
|
||||
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
|
||||
debug!(
|
||||
"Launching `helm package {}` cli with CWD {}",
|
||||
chart_dirname.to_string_lossy(),
|
||||
&self
|
||||
.project_root()
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.to_string_lossy()
|
||||
);
|
||||
let output = std::process::Command::new("helm")
|
||||
.args(["package", chart_dirname.to_str().unwrap()])
|
||||
.current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
|
||||
.output()?;
|
||||
|
||||
check_output(&output, "Failed to package Helm chart")?;
|
||||
|
||||
// Helm prints the path of the created chart to stdout.
|
||||
let tgz_name = String::from_utf8(output.stdout)?
|
||||
.split_whitespace()
|
||||
.last()
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
if tgz_name.is_empty() {
|
||||
return Err("Could not determine packaged chart filename.".into());
|
||||
}
|
||||
|
||||
// The output from helm is relative, so we join it with the execution directory.
|
||||
Ok(self
|
||||
.project_root()
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.join(tgz_name))
|
||||
}
|
||||
|
||||
/// Pushes a packaged Helm chart to an OCI registry.
|
||||
fn push_helm_chart(
|
||||
&self,
|
||||
packaged_chart_path: &Path,
|
||||
) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// The chart name is the file stem of the .tgz file
|
||||
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
|
||||
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
|
||||
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
|
||||
debug!(
|
||||
"Pushing Helm chart {} to {}",
|
||||
packaged_chart_path.to_string_lossy(),
|
||||
oci_push_url
|
||||
);
|
||||
|
||||
let output = std::process::Command::new("helm")
|
||||
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
|
||||
.output()?;
|
||||
|
||||
check_output(&output, "Pushing Helm chart failed")?;
|
||||
|
||||
// The final URL includes the version tag, which is part of the file name
|
||||
let version = chart_file_name.rsplit_once('-').unwrap().1;
|
||||
debug!("pull url {oci_pull_url}");
|
||||
debug!("push url {oci_push_url}");
|
||||
Ok(format!("{}:{}", oci_pull_url, version))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
|
||||
|
||||
#[async_trait]
|
||||
impl HelmPackage for RustWebapp {
|
||||
async fn build_push_helm_package(
|
||||
&self,
|
||||
image_url: &str,
|
||||
domain: &str,
|
||||
) -> Result<String, String> {
|
||||
fn project_root(&self) -> PathBuf {
|
||||
self.project_root.clone()
|
||||
}
|
||||
|
||||
fn chart_name(&self) -> String {
|
||||
self.name.clone()
|
||||
}
|
||||
|
||||
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
|
||||
let domain = self.dns();
|
||||
info!("Starting Helm chart build and push for '{}'", self.name);
|
||||
|
||||
// 1. Create the Helm chart files on disk.
|
||||
let chart_dir = self
|
||||
.create_helm_chart_files(image_url, domain)
|
||||
.create_helm_chart_files(image_url, &domain)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
|
||||
info!("Successfully created Helm chart files in {:?}", chart_dir);
|
||||
@@ -327,19 +332,6 @@ impl RustWebapp {
|
||||
Ok(image_tag.to_string())
|
||||
}
|
||||
|
||||
/// Checks the output of a process command for success.
|
||||
fn check_output(
|
||||
&self,
|
||||
output: &process::Output,
|
||||
msg: &str,
|
||||
) -> Result<(), Box<dyn std::error::Error>> {
|
||||
if !output.status.success() {
|
||||
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
|
||||
return Err(error_message.into());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
|
||||
match self.framework {
|
||||
Some(RustWebFramework::Leptos) => {
|
||||
@@ -640,71 +632,6 @@ spec:
|
||||
Ok(chart_dir)
|
||||
}
|
||||
|
||||
/// Packages a Helm chart directory into a .tgz file.
|
||||
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
|
||||
debug!(
|
||||
"Launching `helm package {}` cli with CWD {}",
|
||||
chart_dirname.to_string_lossy(),
|
||||
&self
|
||||
.project_root
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.to_string_lossy()
|
||||
);
|
||||
let output = process::Command::new("helm")
|
||||
.args(["package", chart_dirname.to_str().unwrap()])
|
||||
.current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
|
||||
.output()?;
|
||||
|
||||
self.check_output(&output, "Failed to package Helm chart")?;
|
||||
|
||||
// Helm prints the path of the created chart to stdout.
|
||||
let tgz_name = String::from_utf8(output.stdout)?
|
||||
.split_whitespace()
|
||||
.last()
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
if tgz_name.is_empty() {
|
||||
return Err("Could not determine packaged chart filename.".into());
|
||||
}
|
||||
|
||||
// The output from helm is relative, so we join it with the execution directory.
|
||||
Ok(self
|
||||
.project_root
|
||||
.join(".harmony_generated")
|
||||
.join("helm")
|
||||
.join(tgz_name))
|
||||
}
|
||||
|
||||
/// Pushes a packaged Helm chart to an OCI registry.
|
||||
fn push_helm_chart(
|
||||
&self,
|
||||
packaged_chart_path: &Path,
|
||||
) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// The chart name is the file stem of the .tgz file
|
||||
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
|
||||
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
|
||||
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
|
||||
debug!(
|
||||
"Pushing Helm chart {} to {}",
|
||||
packaged_chart_path.to_string_lossy(),
|
||||
oci_push_url
|
||||
);
|
||||
|
||||
let output = process::Command::new("helm")
|
||||
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
|
||||
.output()?;
|
||||
|
||||
self.check_output(&output, "Pushing Helm chart failed")?;
|
||||
|
||||
// The final URL includes the version tag, which is part of the file name
|
||||
let version = chart_file_name.rsplit_once('-').unwrap().1;
|
||||
debug!("pull url {oci_pull_url}");
|
||||
debug!("push url {oci_push_url}");
|
||||
Ok(format!("{}:{}", oci_pull_url, version))
|
||||
}
|
||||
|
||||
fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||
let existing_dockerfile = self.project_root.join("Dockerfile");
|
||||
|
||||
|
||||
6
harmony/templates/helm/Chart.yaml.j2
Normal file
6
harmony/templates/helm/Chart.yaml.j2
Normal file
@@ -0,0 +1,6 @@
|
||||
apiVersion: v2
|
||||
name: {{ name }}
|
||||
description: {{ description }}
|
||||
type: application
|
||||
version: {{ version }}
|
||||
appVersion: "{{ app_version }}"
|
||||
4
harmony_agent/.dockerignore
Normal file
4
harmony_agent/.dockerignore
Normal file
@@ -0,0 +1,4 @@
|
||||
.git
|
||||
data
|
||||
target
|
||||
demos
|
||||
26
harmony_agent/Cargo.toml
Normal file
26
harmony_agent/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "harmony_agent"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../harmony" }
|
||||
# harmony_cli = { path = "../harmony_cli" }
|
||||
harmony_types = { path = "../harmony_types" }
|
||||
harmony_macros = { path = "../harmony_macros" }
|
||||
cidr = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
env_logger = { workspace = true }
|
||||
async-nats = "0.45.0"
|
||||
async-trait = "0.1"
|
||||
# url = { workspace = true }
|
||||
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
getrandom = "0.3.4"
|
||||
|
||||
thiserror.workspace = true
|
||||
pretty_assertions.workspace = true
|
||||
44
harmony_agent/Dockerfile
Normal file
44
harmony_agent/Dockerfile
Normal file
@@ -0,0 +1,44 @@
|
||||
# Build stage
|
||||
FROM rust:slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy all required packages
|
||||
COPY . .
|
||||
|
||||
RUN ls -la1
|
||||
|
||||
# Build the application in release mode
|
||||
RUN cargo build --release -p harmony_agent
|
||||
|
||||
# Runtime stage
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy the binary from the builder stage
|
||||
COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
|
||||
|
||||
# Declare environment variables used by the Harmony Agent
|
||||
# These will be set from build-time environment variables if present
|
||||
# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
|
||||
ARG NATS_URL=nats://localhost:4222
|
||||
ENV NATS_URL=${NATS_URL}
|
||||
# NATS_CREDS_PATH: Optional path to NATS credentials file
|
||||
ARG NATS_CREDS_PATH
|
||||
ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
|
||||
# MY_CLUSTER_ID: This cluster's unique identifier (required)
|
||||
ARG MY_CLUSTER_ID
|
||||
ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
|
||||
# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
|
||||
ARG DESIRED_PRIMARY
|
||||
ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
|
||||
|
||||
# Run the application
|
||||
ENTRYPOINT ["./harmony_agent"]
|
||||
248
harmony_agent/README.md
Normal file
248
harmony_agent/README.md
Normal file
@@ -0,0 +1,248 @@
|
||||
TODO
|
||||
|
||||
DONE:
|
||||
1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
|
||||
2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
|
||||
3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
|
||||
4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
|
||||
5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
|
||||
6. ✅ failover_timeout added to AgentConfig
|
||||
7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
|
||||
8. ✅ startup reconciliation implemented via on_startup() method
|
||||
|
||||
REMAINING:
|
||||
- review all code and list implementation issues
|
||||
- review both workflow for each state transition
|
||||
- Complete replica workflow staleness detection (needs implementation in Watching state)
|
||||
- Implement state recovery from Failed state for both workflows
|
||||
- Implement subscribe in NATS store with watch() API
|
||||
- Implement config validation for failover_timeout constraints
|
||||
|
||||
TODO
|
||||
|
||||
1. store trait subscribe definition missing callback
|
||||
2. BUG, data integrity issue : nats store not actually using jetstream metadata
|
||||
3. review all code and list implementation issues
|
||||
4. review both workflow for each state transition
|
||||
5. fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
|
||||
6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself
|
||||
should hold it probably, every agent should be subscribed to the cluster_state object and
|
||||
keep it in memory to allow workflows to process against it efficiently)
|
||||
|
||||
## CRITICAL - Data Integrity Issues
|
||||
|
||||
1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
|
||||
- Currently uses `put()` which overwrites unconditionally
|
||||
- Must use `update()` with revision parameter for proper compare-and-set
|
||||
- Without this, concurrent promotion attempts can cause split brain
|
||||
|
||||
2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
|
||||
- Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
|
||||
- NATS Entry has `.revision` and `.created` fields that must be used
|
||||
- This defeats the entire purpose of store-provided timestamps
|
||||
|
||||
3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
|
||||
- Line ~156: TODO comment confirms missing metadata passing
|
||||
- Replica cannot calculate staleness without metadata.timestamp
|
||||
- Failover logic is broken
|
||||
|
||||
4. **No actual cluster state watching exists**
|
||||
- Replica workflow declares `ClusterState` but never updates it
|
||||
- No subscription to primary heartbeat or cluster_state key
|
||||
- Replica cannot detect primary liveness
|
||||
|
||||
## HIGH - Missing Core Functionality
|
||||
|
||||
5. **Replica Workflow incomplete** - All key logic is TODO:
|
||||
- Watching primary staleness (line 114)
|
||||
- Promotion attempt (line 118)
|
||||
- Original primary recovery detection (line 127)
|
||||
- Demotion/handshake (line 131)
|
||||
|
||||
6. **Missing replica "Failed" state**
|
||||
- `ReplicaState` enum has no `Failed` variant
|
||||
- User's TODO #5 correctly identifies this gap
|
||||
- What happens if replica's own heartbeats fail repeatedly?
|
||||
|
||||
7. **Primary Workflow incomplete** - Key logic missing:
|
||||
- No NATS check before recovering from `Fenced` state (line 95)
|
||||
- No NATS check in `Yielding` state for demotion handshake (line 101)
|
||||
- No actual fencing failure handling
|
||||
|
||||
8. **Store `subscribe` not implemented** (`store/mod.rs`)
|
||||
- Returns `todo!()` in NATS implementation
|
||||
- No callback mechanism defined in trait
|
||||
- Without this, agents cannot react to state changes
|
||||
|
||||
9. **Cluster state not tracked centrally**
|
||||
- User's TODO #6 correctly identifies this
|
||||
- Each agent should maintain a local copy of cluster_state
|
||||
- No subscription mechanism to update this local copy
|
||||
|
||||
10. **No validation of configuration constraints**
|
||||
- Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
|
||||
- Invalid config could cause split brain
|
||||
|
||||
## MEDIUM - Incorrect State Transitions
|
||||
|
||||
11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
|
||||
- Two state transitions happen in one heartbeat cycle
|
||||
- Should stay in `Failed` until fencing actually completes
|
||||
- What if fencing fails? State machine won't reflect it
|
||||
|
||||
12. **No fencing failure handling**
|
||||
- If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
|
||||
- ADR mentions escalating to radical measures, but no callback for failure
|
||||
|
||||
13. **Replica `Watching` state does nothing**
|
||||
- Line 115: Just logs, checks nothing
|
||||
- Should be checking staleness of primary heartbeat
|
||||
|
||||
14. **Demotion handshake not implemented**
|
||||
- ADR section 4 details this but code doesn't implement it
|
||||
- How does original primary know it should yield?
|
||||
|
||||
## LOW - Observability & Reliability
|
||||
|
||||
15. **No graceful shutdown mechanism**
|
||||
- `run_heartbeat_loop` runs forever
|
||||
- No signal handling (SIGTERM, SIGINT)
|
||||
|
||||
16. **Async task errors silently ignored**
|
||||
- `tokio::spawn` at lines 74, 83, 123
|
||||
- No `JoinHandle` retention or error handling
|
||||
|
||||
17. **No metrics/observability**
|
||||
- Only log output
|
||||
- No Prometheus metrics for state transitions, failure counts, etc.
|
||||
|
||||
18. **Hardcoded main() function** (`agent_loop.rs::main`)
|
||||
- Not production-ready entry point
|
||||
- Should load config from environment or file
|
||||
|
||||
19. **Store factory pattern missing**
|
||||
- TODO comment at line 54 confirms this
|
||||
- Can't switch between stores via config
|
||||
|
||||
20. **No backoff/retry logic for NATS operations**
|
||||
- Transient failures could trigger unnecessary fencing
|
||||
|
||||
21. **`AgentInfo` status is hardcoded to "HEALTHY"**
|
||||
- Line 137 in `store_heartbeat`
|
||||
- Should reflect actual workflow state
|
||||
|
||||
22. **Unused fields in structs**
|
||||
- `HeartbeatState.last_seq` set but never read
|
||||
- `ClusterState.current_primary` set but never read
|
||||
|
||||
## ADR-017-3 Compliance Issues
|
||||
|
||||
23. **ADR violation: Clock skew not avoided**
|
||||
- While ADR says use store metadata, code uses local time
|
||||
|
||||
24. **Failover timeout not configurable**
|
||||
- Defined in ADR but not in `AgentConfig`
|
||||
- Needed for replica staleness calculation
|
||||
|
||||
25. **Safety margin concept exists in ADR but not in code**
|
||||
- Configuration should include this margin
|
||||
|
||||
26. **No handling of Case 3 (Replica Network Lag)**
|
||||
- ADR describes NATS rejection prevention
|
||||
- But `set_strict` implementation accepts any write
|
||||
|
||||
## Code Quality Issues
|
||||
|
||||
27. **Inconsistent error handling**
|
||||
- Some paths return `Err`, others `todo!()`, others ignore
|
||||
|
||||
28. **Unnecessary `Clone` bounds**
|
||||
- `DeploymentConfig.clone()` used frequently
|
||||
- Could be optimized with `Arc`
|
||||
|
||||
29. **Missing lifetime annotations**
|
||||
- `KvStore::get` returns `String` key in error - inefficient
|
||||
|
||||
30. **No integration points mentioned**
|
||||
- PostgreSQL lifecycle control implementation missing
|
||||
- Fencing via CNPG not connected
|
||||
|
||||
## Production Readiness Checklist Summary
|
||||
|
||||
For battle testing preparation, you need:
|
||||
|
||||
**Immediate (blockers):**
|
||||
- Fix NATS store metadata usage (issues #1, #2)
|
||||
- Implement strict set_strict with actual CAS (#1)
|
||||
- Implement replica primary watching (#4, #5)
|
||||
- Add failover_timeout config + staleness logic (#3, #24)
|
||||
- Implement subscribe mechanism with callbacks (#8)
|
||||
|
||||
**High priority:**
|
||||
- Complete all workflow transitions (#5, #7, #11-14)
|
||||
- Add cluster state tracking (#6, #9)
|
||||
- Add configuration validation (#10)
|
||||
- Add Replica Failed state (#6)
|
||||
|
||||
**Before deployment:**
|
||||
- Implement graceful shutdown (#15)
|
||||
- Add error handling for spawned tasks (#16)
|
||||
- Remove hardcoded main function (#18)
|
||||
- Implement store factory (#19)
|
||||
- Add Prometheus metrics (#17)
|
||||
|
||||
**Documentation:**
|
||||
- Document all configuration parameters and their trade-offs
|
||||
- Add runbooks for each failure mode
|
||||
- Document battle test scenarios to cover
|
||||
|
||||
### Addendum: Missing Critical Issues
|
||||
|
||||
#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
|
||||
* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
|
||||
* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
|
||||
* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
|
||||
* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
|
||||
|
||||
#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
|
||||
* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
|
||||
* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
|
||||
* **Scenario:**
|
||||
1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
|
||||
2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
|
||||
3. `on_active` finishes *before* `on_failover`.
|
||||
4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
|
||||
* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
|
||||
|
||||
#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
|
||||
* **Location:** `agent_loop.rs` loop logic.
|
||||
* **The Bug:** There is no "Stop the World" gate.
|
||||
* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
|
||||
* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
|
||||
* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
|
||||
|
||||
#### 4. HIGH: NATS Bucket Name Collision
|
||||
* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
|
||||
* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
|
||||
* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
|
||||
* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
|
||||
|
||||
#### 5. HIGH: Startup State Reconciliation
|
||||
* **Location:** `HarmonyAgent::new`.
|
||||
* **The Bug:** Agents always start in `Initializing`.
|
||||
* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
|
||||
* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
|
||||
* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
|
||||
|
||||
### Summary of Tasks to Add
|
||||
|
||||
Please add these to your master list before starting implementation:
|
||||
|
||||
28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
|
||||
29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
|
||||
30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
|
||||
31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
|
||||
32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
|
||||
|
||||
* **Think about vacuum / stop-the-world operations**
|
||||
|
||||
20
harmony_agent/deploy/Cargo.toml
Normal file
20
harmony_agent/deploy/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[package]
|
||||
name = "harmony_agent_deploy"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { path = "../../harmony" }
|
||||
harmony_cli = { path = "../../harmony_cli" }
|
||||
harmony_types = { path = "../../harmony_types" }
|
||||
harmony_macros = { path = "../../harmony_macros" }
|
||||
cidr = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
log = { workspace = true }
|
||||
env_logger = { workspace = true }
|
||||
url = { workspace = true }
|
||||
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
63
harmony_agent/deploy/src/main.rs
Normal file
63
harmony_agent/deploy/src/main.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::{
|
||||
application::{
|
||||
ApplicationScore,
|
||||
backend_app::{BackendApp, BuildCommand},
|
||||
features::{Monitoring, PackagingDeployment},
|
||||
},
|
||||
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
|
||||
},
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
use harmony_macros::hurl;
|
||||
use harmony_types::k8s_name::K8sName;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let application = Arc::new(BackendApp {
|
||||
name: "harmony-agent".to_string(),
|
||||
// Since harmony_agent is part of the harmony workspace, the actual "project root"
|
||||
// is not harmony_agent folder but the workspace root.
|
||||
//
|
||||
// So using ../ here means we MUST run this deployment script from the harmony_agent
|
||||
// folder
|
||||
project_root: PathBuf::from("../"),
|
||||
network_ports: vec![],
|
||||
env_vars: vec![
|
||||
("NATS_URL".to_string(), "nats://nats".to_string()),
|
||||
("DESIRED_PRIMARY".to_string(), "site-1".to_string()),
|
||||
("MY_CLUSTER_ID".to_string(), "site-1".to_string()),
|
||||
("NATS_CREDS_PATH".to_string(), "".to_string()),
|
||||
],
|
||||
build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
|
||||
dockerfile: Some(PathBuf::from("Dockerfile")),
|
||||
});
|
||||
|
||||
let app = ApplicationScore {
|
||||
features: vec![
|
||||
Box::new(PackagingDeployment {
|
||||
application: application.clone(),
|
||||
}),
|
||||
Box::new(Monitoring {
|
||||
application: application.clone(),
|
||||
alert_receiver: vec![Box::new(DiscordWebhook {
|
||||
name: K8sName("test-discord".to_string()),
|
||||
url: hurl!("https://discord.doesnt.exist.com"),
|
||||
selectors: vec![],
|
||||
})],
|
||||
}),
|
||||
],
|
||||
application,
|
||||
};
|
||||
|
||||
harmony_cli::run(
|
||||
Inventory::autoload(),
|
||||
K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster
|
||||
vec![Box::new(app)],
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
79
harmony_agent/src/agent/config.rs
Normal file
79
harmony_agent/src/agent/config.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use harmony_types::id::Id;
|
||||
use log::info;
|
||||
|
||||
use super::heartbeat::HeartbeatFailure;
|
||||
use super::role::AgentRole;
|
||||
|
||||
#[derive(Debug, Clone)]
pub struct AgentConfig {
    /// Number of consecutive successful heartbeats required before the service transitions from
    /// failed to healthy.
    pub success_threshold: usize,
    /// Number of consecutive failed heartbeats required before the service transitions from
    /// healthy to failed.
    pub failure_threshold: usize,
    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
    /// considered failed.
    pub heartbeat_interval: Duration,
    /// Time since last observed primary heartbeat before replica considers primary stale.
    /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
    /// to avoid split brain during network partitions.
    pub failover_timeout: Duration,
    /// **UNSTABLE FIELD**
    ///
    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
    /// footprint is low, but managing multiple deployments in a single instance would be a
    /// significant resource usage reduction.
    ///
    /// Decoupling the deployment of the agent with the application's deployment could make things
    /// more complicated though, where we would have to be careful about version compatibility
    /// between all components managed by the agent instance. So for now it is a 1-1 map.
    ///
    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
    /// reader.
    pub deployment_config_unstable: DeploymentConfig,
    /// URL of the NATS server backing the agent's KV stores.
    pub nats_url: String,
    /// Optional path to a NATS credentials file; `None` means unauthenticated.
    pub nats_creds_path: Option<String>,
    /// Unique id of this agent instance; used as the heartbeat KV key suffix.
    pub agent_id: Id,
    /// Id of the failover cluster this agent participates in; used as the
    /// cluster-state KV key suffix.
    pub cluster_id: Id,
    /// Id of the agent that *should* be primary when the cluster is healthy.
    pub desired_primary_id: Id,
    /// The role this agent plays (Primary or Replica)
    pub role: AgentRole,
}
|
||||
|
||||
/// The kind of deployment this agent supervises; one variant per supported
/// failover strategy (only CNPG/PostgreSQL so far).
#[derive(Debug, Clone)]
pub enum DeploymentConfig {
    /// A CloudNativePG-managed PostgreSQL cluster with agent-driven failover.
    FailoverPostgreSQL(FailoverCNPGConfig),
}
|
||||
|
||||
/// Configuration for supervising a CloudNativePG (CNPG) PostgreSQL cluster.
#[derive(Debug, Clone)]
pub struct FailoverCNPGConfig {
    /// Name of the CNPG `Cluster` resource to health-check and fail over.
    pub cnpg_cluster_name: String,
}
|
||||
|
||||
impl DeploymentConfig {
    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
    ///
    /// NOTE: currently a stub — the PostgreSQL variant only logs and always
    /// returns `Ok(())`; the real health check is still TODO below.
    pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
        match self {
            DeploymentConfig::FailoverPostgreSQL(cfg) => {
                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
                // TODO: Implement actual PG check / NATS write here
                Ok(())
            }
        }
    }

    /// Callback: Transitioned from Unhealthy -> Healthy
    ///
    /// Stub — only logs for now.
    pub async fn on_active(&self) {
        info!("Service is now ACTIVE (Healthy)");
        // e.g., Remove fencing lock
    }

    /// Callback: Transitioned from Healthy -> Unhealthy
    ///
    /// Stub — only logs for now.
    pub async fn on_failover(&self) {
        info!("Service is now FAILED (Unhealthy)");
        // e.g., Initiate self-fencing, stop accepting traffic
    }
}
|
||||
35
harmony_agent/src/agent/heartbeat.rs
Normal file
35
harmony_agent/src/agent/heartbeat.rs
Normal file
@@ -0,0 +1,35 @@
|
||||
use harmony_types::id::Id;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::store::KvMetadata;
|
||||
|
||||
/// Agent-provided heartbeat information (no timestamps - those come from the store)
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentInfo {
    /// Id of the agent emitting the heartbeat.
    pub agent_id: Id,
    /// Id of the cluster the agent belongs to.
    pub cluster_id: Id,
    /// Free-form status string; in practice the workflow's state name.
    pub status: String,
}
|
||||
|
||||
/// Complete heartbeat with both agent data and store metadata
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct AgentHeartbeat {
    /// The payload the agent wrote.
    pub agent_info: AgentInfo,
    /// Store-assigned metadata (sequence, store-side timestamp); `None` until
    /// the heartbeat has been persisted at least once.
    pub metadata: Option<KvMetadata>,
}
|
||||
|
||||
/// Cluster state paired with the store metadata of the revision it was read
/// from (used as the expected sequence for compare-and-swap updates).
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterStateData {
    /// The persisted cluster state payload.
    pub cluster_info: ClusterState,
    /// Store metadata of this revision; `None` before the first persist.
    pub metadata: Option<KvMetadata>,
}
|
||||
|
||||
/// The shared, store-persisted view of who leads the cluster.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ClusterState {
    /// Id of the cluster this record describes.
    pub cluster_id: Id,
    /// Agent currently acting as primary; `None` on a fresh cluster before
    /// any agent has claimed leadership.
    pub current_primary: Option<Id>,
    /// Agent that should be primary when the cluster is healthy.
    pub desired_primary: Id,
}
|
||||
|
||||
/// Marker error: a heartbeat attempt failed (storage error, timeout, or a
/// failed deployment health check). Carries no detail yet.
#[derive(Debug)]
pub struct HeartbeatFailure {}
|
||||
507
harmony_agent/src/agent/mod.rs
Normal file
507
harmony_agent/src/agent/mod.rs
Normal file
@@ -0,0 +1,507 @@
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use std::{str::FromStr, sync::Arc, time::Duration};
|
||||
|
||||
use harmony_types::id::Id;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::time::{Instant, sleep};
|
||||
|
||||
use crate::agent::heartbeat::ClusterState;
|
||||
use crate::store::{KvMetadata, KvStore, KvStoreError};
|
||||
use crate::workflow::HeartbeatWorkflow;
|
||||
use crate::workflow::primary::PrimaryWorkflow;
|
||||
use crate::workflow::replica::ReplicaWorkflow;
|
||||
|
||||
// Submodules
|
||||
mod config;
|
||||
pub mod heartbeat;
|
||||
mod role;
|
||||
|
||||
// Re-exports for backwards compatibility
|
||||
pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
|
||||
pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
|
||||
pub use role::AgentRole;
|
||||
|
||||
/// Boots a single agent of the given `role` against the provided KV stores
/// and runs its heartbeat loop forever (only returns on startup errors).
///
/// NOTE(review): most of `AgentConfig` is hard-coded here (thresholds,
/// deployment config, cluster/primary ids) — presumably a dev harness; the
/// TODO below covers making this configurable.
pub async fn launch_agent<S>(
    role: AgentRole,
    health_kv: Arc<S>,
    cluster_kv: Arc<S>,
    heartbeat_interval: Duration,
    failover_timeout: Duration,
) -> Result<(), Box<dyn std::error::Error>>
where
    S: KvStore + Send + Sync + 'static,
{
    // Cheap ass fix when we boot two agents at the same time and the store does not exist, delay
    // one so they don't crash because of the race: both agents racing to create
    // the initial cluster-state record would collide, so the replica yields.
    match role {
        AgentRole::Primary => {}
        AgentRole::Replica => {
            sleep(Duration::from_millis(100)).await;
        }
    }

    // Agent id is derived from the role ("agent-primary" / "agent-replica").
    let my_agent_name = format!("agent-{}", role);
    let my_agent_id = Id::from_str(&my_agent_name).unwrap();

    let config = AgentConfig {
        role,
        success_threshold: 2,
        failure_threshold: 2,
        heartbeat_interval,
        failover_timeout,
        deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
            cnpg_cluster_name: String::from("cnpg_cluster_name"),
        }),
        nats_url: String::new(),
        nats_creds_path: None,
        agent_id: my_agent_id,
        cluster_id: "cluster_test_id".into(),
        desired_primary_id: "primary_id".into(),
    };

    log::info!("Harmony Agent Initialized");
    log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
    log::info!("Full config : {:?}", config);

    // TODO load store based on config, default to nats
    // probably a good use case for a factory pattern

    let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);

    // Sync local caches (cluster state, last heartbeat sequence) with the
    // store before entering the loop.
    agent.reconcile_startup().await?;

    // Run the heartbeat loop
    agent.run_heartbeat_loop().await;

    Ok(())
}
|
||||
|
||||
/// A running agent: one role-specific workflow plus the KV stores it
/// heartbeats into, with local caches to minimize store round-trips.
pub struct HarmonyAgent<S: KvStore> {
    /// Static configuration this agent was launched with.
    pub config: AgentConfig,
    /// Role-specific state machine (Primary or Replica) driven by
    /// heartbeat successes/failures.
    workflow: Box<dyn HeartbeatWorkflow>,
    /// Store for per-agent heartbeat records (`heartbeat.<agent_id>`).
    health_kv: Arc<S>,
    /// Store for the shared cluster-state record (`cluster.<cluster_id>`).
    cluster_kv: Arc<S>,
    /// Last successful heartbeat, used to track sequence number for next write
    /// This avoids doing a GET before every SET, reducing network round-trips
    last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
    /// Local copy of cluster state, updated via subscription
    /// This allows workflows to make decisions without querying NATS each time
    cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
}
|
||||
|
||||
impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
|
||||
pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
|
||||
let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
|
||||
AgentRole::Primary => {
|
||||
info!("Initializing agent as PRIMARY");
|
||||
Box::new(PrimaryWorkflow::new(
|
||||
config.success_threshold,
|
||||
config.failure_threshold,
|
||||
config.deployment_config_unstable.clone(),
|
||||
))
|
||||
}
|
||||
AgentRole::Replica => {
|
||||
info!("Initializing agent as REPLICA");
|
||||
Box::new(ReplicaWorkflow::new(
|
||||
config.success_threshold,
|
||||
config.failure_threshold,
|
||||
config.cluster_id.clone(),
|
||||
config.desired_primary_id.clone(),
|
||||
config.agent_id.clone(),
|
||||
config.failover_timeout,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
config,
|
||||
workflow,
|
||||
health_kv,
|
||||
cluster_kv,
|
||||
last_heartbeat: Arc::new(RwLock::new(None)),
|
||||
cluster_state: Arc::new(RwLock::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Generic helper to fetch and deserialize data from KV store
|
||||
/// Returns Ok(Some(data)) if key exists and deserializes successfully
|
||||
/// Returns Ok(None) if key doesn't exist
|
||||
/// Returns Err if deserialization fails or other errors occur
|
||||
async fn fetch_from_store<D>(
|
||||
&self,
|
||||
store: &Arc<S>,
|
||||
key: &str,
|
||||
) -> Result<Option<(D, KvMetadata)>, KvStoreError>
|
||||
where
|
||||
D: serde::de::DeserializeOwned,
|
||||
{
|
||||
debug!("Fetching data from key: {}", key);
|
||||
|
||||
let result = store.get(key).await;
|
||||
debug!("Got result from store: {:#?}", result);
|
||||
|
||||
match result {
|
||||
Ok(kv_result) => {
|
||||
if let Some(value) = kv_result.value {
|
||||
match serde_json::from_value::<D>(value.clone()) {
|
||||
Ok(data) => Ok(Some((data, kv_result.metadata))),
|
||||
Err(e) => {
|
||||
log::warn!("Failed to deserialize data from key {}: {}", key, e);
|
||||
Err(KvStoreError::DeserializationFailed {
|
||||
deserialization_error: format!(
|
||||
"Key exists but deserialization failed for {key}: {e}"
|
||||
),
|
||||
value: value.to_string(),
|
||||
})
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Err(KvStoreError::Unknown(format!(
|
||||
"Key exists but value is empty for {key}, this should not happen"
|
||||
)))
|
||||
}
|
||||
}
|
||||
Err(KvStoreError::KeyNotAvailable(_)) => {
|
||||
debug!("Key {} not found in store", key);
|
||||
Ok(None)
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Failed to fetch data from key {}: {}", key, e);
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Reconcile startup state by fetching cluster state and heartbeat from the store
|
||||
/// This allows the workflow to determine if it should resume as Primary/Replica
|
||||
/// based on the persisted cluster state
|
||||
pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
|
||||
let cluster_key = format!("cluster.{}", self.config.cluster_id);
|
||||
|
||||
debug!(
|
||||
"Fetching cluster state for startup reconciliation from key: {}",
|
||||
cluster_key
|
||||
);
|
||||
|
||||
let cluster_state_option = match self
|
||||
.fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
|
||||
.await?
|
||||
{
|
||||
Some((data, metadata)) => Some(ClusterStateData {
|
||||
cluster_info: data,
|
||||
metadata: Some(metadata),
|
||||
}),
|
||||
None => {
|
||||
debug!(
|
||||
"Cluster state key not found, this is a fresh cluster, initializing cluster state"
|
||||
);
|
||||
Some(self.store_cluster_state(None).await?)
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Found cluster state {cluster_state_option:#?}");
|
||||
self.workflow
|
||||
.on_startup(cluster_state_option.as_ref(), &self.config)
|
||||
.await;
|
||||
|
||||
// Cache the cluster state locally
|
||||
*self.cluster_state.write().await = cluster_state_option;
|
||||
// Fetch last heartbeat if it exists to avoid sequence conflicts
|
||||
let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
|
||||
debug!("Fetching last heartbeat from key: {}", heartbeat_key);
|
||||
|
||||
let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
|
||||
|
||||
let last_heartbeat = match last_heartbeat_option {
|
||||
Ok(kv_result) => {
|
||||
let value = kv_result
|
||||
.value
|
||||
.expect("When key exist it should always contain data");
|
||||
Some(AgentHeartbeat {
|
||||
agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
|
||||
|e| KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: value.to_string(),
|
||||
},
|
||||
)?,
|
||||
metadata: Some(kv_result.metadata),
|
||||
})
|
||||
}
|
||||
Err(e) => match e {
|
||||
KvStoreError::KeyNotAvailable(_) => None,
|
||||
_ => return Err(e),
|
||||
},
|
||||
};
|
||||
if let Some(heartbeat) = &last_heartbeat {
|
||||
debug!(
|
||||
"Found existing heartbeat with sequence: {}",
|
||||
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
|
||||
);
|
||||
} else {
|
||||
debug!("No existing heartbeat found, starting fresh");
|
||||
}
|
||||
|
||||
// Cache the last heartbeat for sequence tracking
|
||||
*self.last_heartbeat.write().await = last_heartbeat;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn store_cluster_state(
|
||||
&self,
|
||||
cluster_data: Option<ClusterStateData>,
|
||||
) -> Result<ClusterStateData, KvStoreError> {
|
||||
let key = format!("cluster.{}", self.config.cluster_id);
|
||||
match cluster_data {
|
||||
Some(cluster_data) => {
|
||||
debug!("found some cluster state {:#?}", cluster_data);
|
||||
|
||||
let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: format!("{:?}", cluster_data),
|
||||
}
|
||||
})?;
|
||||
|
||||
let expected_sequence = {
|
||||
let last = self.cluster_state.read().await;
|
||||
last.as_ref()
|
||||
.and_then(|hb| hb.metadata.as_ref())
|
||||
.map(|m| m.sequence)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
|
||||
debug!("expected sequence {:#?}", expected_sequence);
|
||||
let new_seq = self
|
||||
.cluster_kv
|
||||
.set_strict(&key, value, expected_sequence)
|
||||
.await?;
|
||||
|
||||
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
|
||||
debug!("cluster kv {:#?}", cluster_kv_result);
|
||||
|
||||
let cluster_data_new = ClusterStateData {
|
||||
cluster_info: cluster_data.cluster_info.clone(),
|
||||
metadata: Some(cluster_kv_result.metadata),
|
||||
};
|
||||
|
||||
*self.cluster_state.write().await = Some(cluster_data_new.clone());
|
||||
Ok(cluster_data)
|
||||
}
|
||||
None => {
|
||||
let cluster_info = ClusterState {
|
||||
cluster_id: self.config.cluster_id.clone(),
|
||||
current_primary: None,
|
||||
desired_primary: self.config.desired_primary_id.clone(),
|
||||
};
|
||||
|
||||
let value = serde_json::to_value(&cluster_info).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: format!("{:?}", cluster_info),
|
||||
}
|
||||
})?;
|
||||
|
||||
let cluster_data = ClusterStateData {
|
||||
cluster_info,
|
||||
metadata: None,
|
||||
};
|
||||
|
||||
let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
|
||||
|
||||
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
|
||||
debug!("cluster kv {:#?}", cluster_kv_result);
|
||||
|
||||
let cluster_data_new = ClusterStateData {
|
||||
cluster_info: cluster_data.cluster_info.clone(),
|
||||
metadata: Some(cluster_kv_result.metadata),
|
||||
};
|
||||
|
||||
*self.cluster_state.write().await = Some(cluster_data_new.clone());
|
||||
Ok(cluster_data_new)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sends agent heartbeat to the KV store
|
||||
///
|
||||
/// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
|
||||
/// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
|
||||
/// comparisons use the store's clock, not agent clocks.
|
||||
///
|
||||
/// This method uses the last successful heartbeat's sequence number to avoid an extra
|
||||
/// GET call before each SET, reducing network round-trips and latency exposure.
|
||||
async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
|
||||
let key = format!("heartbeat.{}", self.config.agent_id);
|
||||
|
||||
// Create agent info WITHOUT timestamp - the store will add metadata
|
||||
// Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
|
||||
let agent_info = AgentInfo {
|
||||
agent_id: self.config.agent_id.clone(),
|
||||
cluster_id: self.config.cluster_id.clone(),
|
||||
status: self.workflow.state_name().to_string(),
|
||||
};
|
||||
|
||||
debug!("Storing heartbeat for agent: {}", self.config.agent_id);
|
||||
let value =
|
||||
serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: format!("{:?}", agent_info),
|
||||
})?;
|
||||
|
||||
let expected_sequence = {
|
||||
let last = self.last_heartbeat.read().await;
|
||||
last.as_ref()
|
||||
.and_then(|hb| hb.metadata.as_ref())
|
||||
.map(|m| m.sequence)
|
||||
.unwrap_or(0)
|
||||
};
|
||||
|
||||
trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}");
|
||||
let new_seq = self
|
||||
.health_kv
|
||||
.set_strict(&key, value, expected_sequence)
|
||||
.await?;
|
||||
trace!("Got new sequence {new_seq}");
|
||||
let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
|
||||
|
||||
debug!("Heartbeat stored succsssfully with sequence: {}", new_seq);
|
||||
|
||||
// Construct complete heartbeat with metadata from store
|
||||
let heartbeat = AgentHeartbeat {
|
||||
agent_info,
|
||||
metadata: Some(kv_result.metadata),
|
||||
};
|
||||
|
||||
// Cache this successful heartbeat for next iteration
|
||||
*self.last_heartbeat.write().await = Some(heartbeat.clone());
|
||||
|
||||
Ok(heartbeat)
|
||||
}
|
||||
|
||||
pub async fn run_heartbeat_loop(&mut self) {
|
||||
let mut next_heartbeat_start;
|
||||
loop {
|
||||
let this_heartbeat_start = Instant::now();
|
||||
next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
|
||||
|
||||
// Perform the check via the config/strategy with a timeout
|
||||
//
|
||||
// FIXME There is too much stuff happening inside the timeout. There are some things like a
|
||||
// promotion, that we don't want to cancel within a single heartbeat interval timeout
|
||||
// I think that the timeout should only apply to the store_heartbeat().await call.
|
||||
// Logic happening after should not be affected in the exact same manner. There can be
|
||||
// other timeouts or other stuff to consider here.
|
||||
// However, the system does rely on heartbeats happening regularly, so we do not want
|
||||
// to delay the next heartbeat either. This is tricky.
|
||||
// An idea right now is to keep the heartbeat running but, when a processing event
|
||||
// occurs, set a flag on the local agent that there is a process running (promotion,
|
||||
// demotion, etc) and take no other decision until this process is not done. There is
|
||||
// one exception we can think of right now :
|
||||
// - a healthy primary starts running a process such as "calling mom"
|
||||
// - the primary keeps sending its heartbeat to prove to the rest of the cluster that
|
||||
// it is still healthy
|
||||
// - then the primary heartbeat fails up to failure_threshold
|
||||
// - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain.
|
||||
// - Another solution would be register the processing: "calling mom" in the primary
|
||||
// heartbeat store, and prevent the replica from promoting when there is a running
|
||||
// task on the primary.
|
||||
let result = tokio::time::timeout(self.config.heartbeat_interval, async {
|
||||
// Store heartbeat and perform deployment-specific health check
|
||||
match &self.store_heartbeat().await {
|
||||
Ok(heartbeat) => {
|
||||
// Heartbeat stored successfully, already cached by store_heartbeat
|
||||
debug!(
|
||||
"Heartbeat stored: seq={}",
|
||||
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
|
||||
);
|
||||
}
|
||||
Err(KvStoreError::WrongLastRevision) => {
|
||||
todo!("fetch and update correct last sequence number")
|
||||
// CAS failure could indicate:
|
||||
// 1. Network latency: our previous timeout heartbeat actually succeeded
|
||||
// 2. Agent ID conflict: another agent with same ID exists
|
||||
// 3. Clock/bucket corruption (unlikely)
|
||||
|
||||
// log::warn!(
|
||||
// "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
|
||||
// self.config.agent_id, expected, current, current
|
||||
// );
|
||||
// // Update cached heartbeat sequence to prevent repeated failures
|
||||
// if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
|
||||
// if let Some(metadata) = hb.metadata.as_mut() {
|
||||
// metadata.sequence = *current;
|
||||
// }
|
||||
// }
|
||||
}
|
||||
Err(e) => {
|
||||
// Actual storage failure - treat as heartbeat failure
|
||||
log::error!("Heartbeat storage error: {}", e);
|
||||
return Err(HeartbeatFailure {});
|
||||
}
|
||||
}
|
||||
self.config
|
||||
.deployment_config_unstable
|
||||
.perform_heartbeat()
|
||||
.await?;
|
||||
|
||||
// TODO: Pass the heartbeat with metadata to the workflow for staleness checks
|
||||
// The workflow needs access to metadata.timestamp for failover timeout calculations
|
||||
Ok::<(), HeartbeatFailure>(())
|
||||
})
|
||||
.await;
|
||||
|
||||
// Update Counters & Handle State Transitions
|
||||
// Timeout is also treated as a failure
|
||||
let heartbeat_result = match result {
|
||||
Ok(inner_result) => inner_result,
|
||||
Err(_) => Err(HeartbeatFailure {}),
|
||||
};
|
||||
|
||||
trace!("Got heartbeat_result : {heartbeat_result:?}");
|
||||
match heartbeat_result {
|
||||
Ok(_) => {
|
||||
let new_state = self
|
||||
.workflow
|
||||
.handle_heartbeat_success(
|
||||
self.cluster_state.read().await.as_ref(),
|
||||
&self.config,
|
||||
)
|
||||
.await;
|
||||
if let Some(new_state) = new_state {
|
||||
warn!("Got new cluster state : {new_state:#?}");
|
||||
self.store_cluster_state(Some(new_state))
|
||||
.await
|
||||
.expect(&format!("cluster state not able to be stored"));
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
self.workflow
|
||||
.handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
|
||||
success_threshold = self.config.success_threshold,
|
||||
failure_threshold = self.config.failure_threshold,
|
||||
state = self.workflow.state_name(),
|
||||
consecutive_successes = self.workflow.consecutive_successes(),
|
||||
consecutive_failures = self.workflow.consecutive_failures(),
|
||||
heartbeat_emoji = if heartbeat_result.is_ok() {
|
||||
"✅"
|
||||
} else {
|
||||
"❌"
|
||||
},
|
||||
heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
|
||||
);
|
||||
debug!(
|
||||
"Sleeping for {} ms before next heartbeat",
|
||||
(next_heartbeat_start - Instant::now()).as_millis()
|
||||
);
|
||||
tokio::time::sleep_until(next_heartbeat_start).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
17
harmony_agent/src/agent/role.rs
Normal file
17
harmony_agent/src/agent/role.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
use std::fmt;
|
||||
|
||||
/// The role of this agent instance
#[derive(Debug, Clone, PartialEq)]
pub enum AgentRole {
    /// The agent expected to lead and serve traffic.
    Primary,
    /// A standby agent watching the primary's heartbeats, ready to promote.
    Replica,
}
|
||||
|
||||
impl fmt::Display for AgentRole {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
AgentRole::Primary => write!(f, "primary"),
|
||||
AgentRole::Replica => write!(f, "replica"),
|
||||
}
|
||||
}
|
||||
}
|
||||
90
harmony_agent/src/config.rs
Normal file
90
harmony_agent/src/config.rs
Normal file
@@ -0,0 +1,90 @@
|
||||
use harmony_types::id::Id;
|
||||
use log::debug;
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Configuration for the Harmony Agent
#[derive(Debug, Clone)]
pub struct AgentConfig {
    /// NATS server URL (from `NATS_URL`; defaults to nats://localhost:4222).
    pub nats_url: String,
    /// Optional path to a NATS credentials file (from `NATS_CREDS_PATH`).
    pub nats_creds_path: Option<String>,
    /// Id of the cluster this agent belongs to (from `MY_CLUSTER_ID`).
    pub my_cluster_id: Id,
    /// Id of the agent that should be primary (from `DESIRED_PRIMARY`).
    pub desired_primary: Id,
    /// Time between heartbeats. Currently hard-coded to 1000ms in
    /// `load_from_env` — not yet configurable via environment.
    pub heartbeat_interval: Duration,
}
|
||||
|
||||
/// Env var: NATS server URL. Optional; defaults to nats://localhost:4222.
pub const NATS_URL: &str = "NATS_URL";
/// Env var: id of the agent that should be primary. Required, non-empty.
pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
/// Env var: id of the cluster this agent belongs to. Required, non-empty.
pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
/// Env var: path to a NATS credentials file. Optional.
pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
|
||||
|
||||
impl AgentConfig {
|
||||
pub fn load_from_env() -> Result<Self, String> {
|
||||
let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
|
||||
|
||||
// Validate NATS URL is not empty
|
||||
if nats_url.is_empty() {
|
||||
return Err(format!("{NATS_URL} cannot be empty"));
|
||||
}
|
||||
|
||||
// Validate NATS URL format
|
||||
if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
|
||||
return Err(format!(
|
||||
"Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
|
||||
nats_url
|
||||
));
|
||||
}
|
||||
|
||||
let nats_creds_path = env::var(NATS_CREDS_PATH)
|
||||
.ok()
|
||||
.filter(|creds_path| !creds_path.is_empty());
|
||||
|
||||
// Validate NATS creds path if provided
|
||||
if let Some(creds_path) = &nats_creds_path {
|
||||
debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
|
||||
let path = Path::new(creds_path);
|
||||
if !path.exists() {
|
||||
return Err(format!(
|
||||
"NATS credentials file does not exist: {}",
|
||||
creds_path
|
||||
));
|
||||
}
|
||||
if !path.is_file() {
|
||||
return Err(format!(
|
||||
"NATS credentials path is not a file: {}",
|
||||
creds_path
|
||||
));
|
||||
}
|
||||
// Check if file is readable by attempting to read metadata
|
||||
if std::fs::metadata(path).is_err() {
|
||||
return Err(format!(
|
||||
"NATS credentials file is not readable: {}",
|
||||
creds_path
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let my_cluster_id_str = env::var(MY_CLUSTER_ID)
|
||||
.map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?;
|
||||
|
||||
if my_cluster_id_str.is_empty() {
|
||||
return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
|
||||
}
|
||||
|
||||
let desired_primary_str = env::var(DESIRED_PRIMARY)
|
||||
.map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?;
|
||||
|
||||
if desired_primary_str.is_empty() {
|
||||
return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
nats_url,
|
||||
nats_creds_path,
|
||||
my_cluster_id: my_cluster_id_str.into(),
|
||||
desired_primary: desired_primary_str.into(),
|
||||
heartbeat_interval: Duration::from_millis(1000),
|
||||
})
|
||||
}
|
||||
}
|
||||
82
harmony_agent/src/main.rs
Normal file
82
harmony_agent/src/main.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use crate::{
|
||||
agent::AgentRole,
|
||||
store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
|
||||
};
|
||||
|
||||
// mod agent_loop;
|
||||
mod agent;
|
||||
pub mod store;
|
||||
mod workflow;
|
||||
|
||||
/// Dev harness entry point: runs one primary and one replica agent in the
/// same process, sharing a single NATS-backed KV store, so the failover
/// protocol can be exercised locally.
#[tokio::main]
async fn main() {
    env_logger::init();

    // Heartbeat cadence and the staleness window after which a replica
    // considers the primary dead. failover_timeout must comfortably exceed
    // heartbeat_interval * failure_threshold (see AgentConfig docs).
    let heartbeat_interval = Duration::from_millis(2000);
    let failover_timeout = Duration::from_secs(10);

    // let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout);

    // Both agents share the same bucket so the replica can observe the
    // primary's heartbeats and the shared cluster state.
    let nats_store = get_local_nats_store().await;
    let health_kv = nats_store.clone();
    let cluster_kv = nats_store.clone();

    // join! drives both agent loops concurrently; they only return on
    // startup errors, whose results are deliberately ignored here.
    let _ = tokio::join!(
        agent::launch_agent(
            AgentRole::Primary,
            health_kv.clone(),
            cluster_kv.clone(),
            heartbeat_interval,
            failover_timeout
        ),
        agent::launch_agent(
            AgentRole::Replica,
            health_kv,
            cluster_kv,
            heartbeat_interval,
            failover_timeout
        ),
    );
}
|
||||
|
||||
/// Builds a pair of chaos-wrapped in-memory stores for failure-injection
/// testing. Currently unused — see the commented-out call in `main`.
///
/// The health store gets 10% timeout / 10% failure odds with random delays
/// bounded by the heartbeat interval; the cluster store gets 5% / 5% with
/// delays bounded by the failover timeout.
fn get_chaos_store(
    heartbeat_interval: &Duration,
    failover_timeout: &Duration,
) -> (
    Arc<ChaosKvStore<InMemoryKvStore>>,
    Arc<ChaosKvStore<InMemoryKvStore>>,
) {
    let health_kv = Arc::new(ChaosKvStore::new(
        InMemoryKvStore::new(),
        10,
        10,
        heartbeat_interval.as_millis().try_into().unwrap(),
    ));
    let cluster_kv = Arc::new(ChaosKvStore::new(
        InMemoryKvStore::new(),
        5,
        5,
        failover_timeout.as_millis().try_into().unwrap(),
    ));

    (health_kv, cluster_kv)
}
|
||||
|
||||
async fn get_local_nats_store() -> Arc<NatsKvStore> {
|
||||
let client = async_nats::connect("localhost").await.unwrap();
|
||||
let jetstream = async_nats::jetstream::new(client);
|
||||
let kv = jetstream
|
||||
.create_key_value(async_nats::jetstream::kv::Config {
|
||||
bucket: "kv".to_string(),
|
||||
history: 10,
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
let status = kv.status().await.unwrap();
|
||||
println!("status: {:?}", status);
|
||||
|
||||
Arc::new(NatsKvStore::new(kv))
|
||||
}
|
||||
142
harmony_agent/src/store/chaos.rs
Normal file
142
harmony_agent/src/store/chaos.rs
Normal file
@@ -0,0 +1,142 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, trace, warn};
|
||||
use serde_json::Value;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Duration;
|
||||
|
||||
use crate::store::SubscriptionCallback;
|
||||
|
||||
use super::{KvStore, KvStoreError};
|
||||
|
||||
/// A chaos testing KV store that randomly times out or fails
/// Wraps another KvStore implementation and adds random failures
#[derive(Clone)]
pub struct ChaosKvStore<T: KvStore> {
    // The real store every call delegates to after chaos injection.
    inner: Arc<T>,
    // Chance (0-100) that a call hangs for an effectively infinite duration.
    timeout_probability_percent: u32,
    // Chance (0-100) that a call returns a KvStoreError::Unknown.
    failure_probability_percent: u32,
    // Upper bound (exclusive) of the random per-call delay, in milliseconds.
    max_delay_ms: u64,
}
|
||||
|
||||
impl<T: KvStore> ChaosKvStore<T> {
|
||||
pub fn new(
|
||||
inner: T,
|
||||
timeout_probability_percent: u32,
|
||||
failure_probability_percent: u32,
|
||||
max_delay_ms: u64,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: Arc::new(inner),
|
||||
timeout_probability_percent,
|
||||
failure_probability_percent,
|
||||
max_delay_ms,
|
||||
}
|
||||
}
|
||||
|
||||
async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
|
||||
trace!("Calculating chaos");
|
||||
// Random delay
|
||||
let delay = getrandom::u64().unwrap() % self.max_delay_ms;
|
||||
let delay = Duration::from_millis(delay);
|
||||
trace!("Sleeping until chaos maybe happens {delay:?}");
|
||||
tokio::time::sleep(delay).await;
|
||||
|
||||
// Random failure
|
||||
let failure_random = getrandom::u32().unwrap() % 100;
|
||||
if failure_random < self.failure_probability_percent {
|
||||
warn!(
|
||||
"Chaos causes an error : {failure_random} < {}",
|
||||
self.failure_probability_percent
|
||||
);
|
||||
return Err(KvStoreError::Unknown(format!(
|
||||
"Randomly failed thanks to chaos store with {}% chances, got {}",
|
||||
self.failure_probability_percent, failure_random
|
||||
)));
|
||||
}
|
||||
|
||||
// Random timeout (simulated as a very long delay)
|
||||
let failure_random = getrandom::u32().unwrap() % 100;
|
||||
if failure_random < self.timeout_probability_percent {
|
||||
warn!(
|
||||
"Chaos caused a timeout : {failure_random} < {}",
|
||||
self.failure_probability_percent
|
||||
);
|
||||
tokio::time::sleep(Duration::from_secs(189754678456784560)).await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
    // Every method first runs maybe_chaos() — which may delay, error out, or
    // hang — and only then delegates to the wrapped store unchanged.
    async fn get(&self, key: &str) -> Result<super::KvResult, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.get(key).await
    }

    async fn get_revision(
        &self,
        key: &str,
        expected_seq: u64,
    ) -> Result<super::KvResult, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.get_revision(key, expected_seq).await
    }

    async fn set_strict(
        &self,
        key: &str,
        value: Value,
        expected_sequence: u64,
    ) -> Result<u64, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.set_strict(key, value, expected_sequence).await
    }

    async fn subscribe(
        &self,
        key: &str,
        callback: SubscriptionCallback,
    ) -> Result<(), KvStoreError> {
        // Note: chaos applies to establishing the subscription, not to the
        // events the inner store later delivers through `callback`.
        self.maybe_chaos().await?;
        self.inner.subscribe(key, callback).await
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::store::InMemoryKvStore;
    use serde_json::json;

    /// With all chaos probabilities at 0 and a 1ms delay bound, the wrapper
    /// must behave exactly like the inner store: the first write gets
    /// sequence 1 and the value round-trips through get().
    #[tokio::test]
    async fn test_chaos_store_with_no_chaos() {
        let inner = InMemoryKvStore::new();
        let chaos = ChaosKvStore::new(inner, 0, 0, 1);

        let value = json!({"test": "value"});
        let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
        assert_eq!(result, 1);

        let retrieved = chaos.get("key").await.unwrap();
        assert_eq!(retrieved.value, Some(value));
    }

    /// A max delay of 100ms must keep the call under ~150ms total.
    #[tokio::test]
    async fn test_chaos_store_with_delay() {
        let inner = InMemoryKvStore::new();
        let chaos = ChaosKvStore::new(inner, 0, 0, 100);

        let start = tokio::time::Instant::now();
        let value = json!({"test": "value"});
        chaos.set_strict("key", value, 0).await.unwrap();
        let elapsed = start.elapsed();

        // NOTE(review): this asserts an *upper bound* on total time; it does
        // not prove a delay occurred (the original "Should have some delay"
        // comment was misleading — the random delay may well be 0).
        assert!(
            elapsed.as_millis() < 150,
            "Should complete within reasonable time"
        );
    }
}
|
||||
196
harmony_agent/src/store/memory.rs
Normal file
196
harmony_agent/src/store/memory.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, trace};
|
||||
use serde_json::Value;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::store::SubscriptionCallback;
|
||||
|
||||
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
|
||||
|
||||
/// An in-memory KV store that guarantees ordering like NATS JetStream
/// Each key maintains a full history of all writes, where the sequence number
/// is the length of the history (1-indexed)
#[derive(Clone)]
pub struct InMemoryKvStore {
    // key -> ordered write history of (value, timestamp-in-ms) pairs.
    // An entry's 1-indexed position in the Vec is its sequence number.
    // Cloned stores share the same map through the Arc, so clones see
    // each other's writes.
    data: Arc<RwLock<HashMap<String, Vec<(Value, u64)>>>>,
}
|
||||
|
||||
impl InMemoryKvStore {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
data: Arc::new(RwLock::new(HashMap::new())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the latest sequence number for a key (length of history)
|
||||
pub async fn get_seq(&self, key: &str) -> Option<u64> {
|
||||
self.data.read().await.get(key).map(|vec| vec.len() as u64)
|
||||
}
|
||||
|
||||
/// Get the value at a specific revision for a key
|
||||
pub async fn get_revision(&self, key: &str, seq: u64) -> Result<KvResult, KvStoreError> {
|
||||
let data = self.data.read().await;
|
||||
let entries = data
|
||||
.get(key)
|
||||
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
|
||||
|
||||
// Sequence numbers are 1-indexed, so seq must be >= 1 and <= len()
|
||||
if seq == 0 || seq > entries.len() as u64 {
|
||||
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||
}
|
||||
|
||||
let (value, timestamp) = entries[seq as usize - 1].clone();
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value.clone()),
|
||||
metadata: KvMetadata {
|
||||
timestamp,
|
||||
sequence: seq,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for InMemoryKvStore {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl KvStore for InMemoryKvStore {
|
||||
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
|
||||
self.get_revision(key, expected_seq).await
|
||||
}
|
||||
|
||||
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
|
||||
let data = self.data.read().await;
|
||||
let entries = data
|
||||
.get(key)
|
||||
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
|
||||
|
||||
let (value, timestamp) = entries.last().unwrap();
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value.clone()),
|
||||
metadata: KvMetadata {
|
||||
timestamp: *timestamp,
|
||||
sequence: entries.len() as u64,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
async fn set_strict(
|
||||
&self,
|
||||
key: &str,
|
||||
value: Value,
|
||||
expected_sequence: u64,
|
||||
) -> Result<u64, KvStoreError> {
|
||||
// Check current sequence (length of history for this key)
|
||||
let data = self.data.read().await;
|
||||
// This implemenetation does not seem to match the NATS sequence. In nats the
|
||||
// sequence updates one counter per bucket. This impl creates a counter per key
|
||||
let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
|
||||
drop(data);
|
||||
|
||||
// Verify expected sequence matches
|
||||
if current_sequence != expected_sequence {
|
||||
trace!("{current_sequence} != {expected_sequence}");
|
||||
return Err(KvStoreError::WrongLastRevision);
|
||||
}
|
||||
|
||||
// Get current timestamp
|
||||
let timestamp = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.expect("Time went backwards")
|
||||
.as_millis() as u64;
|
||||
|
||||
// Append to the history
|
||||
let mut data = self.data.write().await;
|
||||
data.entry(key.to_string())
|
||||
.or_insert_with(Vec::new)
|
||||
.push((value.clone(), timestamp));
|
||||
|
||||
let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
|
||||
|
||||
debug!(
|
||||
"Successfully inserted {key}(rev#{new_seq}) : {value}",
|
||||
value = value.to_string()
|
||||
);
|
||||
|
||||
Ok(new_seq)
|
||||
}
|
||||
|
||||
async fn subscribe(
|
||||
&self,
|
||||
key: &str,
|
||||
callback: SubscriptionCallback,
|
||||
) -> Result<(), KvStoreError> {
|
||||
// For now, subscribe just returns the current value
|
||||
// In a real implementation, this would return a stream of updates
|
||||
self.get(key).await;
|
||||
todo!() // register callback and call it when key is set ?
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Round-trip a single value and check the first revision is 1.
    #[tokio::test]
    async fn test_memory_store_basic() {
        let store = InMemoryKvStore::new();

        let payload = json!({"status": "healthy"});
        let seq = store
            .set_strict("test_key", payload.clone(), 0)
            .await
            .unwrap();
        assert_eq!(seq, 1);

        let fetched = store.get("test_key").await.unwrap();
        assert_eq!(fetched.value, Some(payload));
        assert_eq!(fetched.metadata.sequence, 1);
    }

    /// Consecutive writes to the same key must yield increasing sequences.
    #[tokio::test]
    async fn test_memory_store_sequence_numbers() {
        let store = InMemoryKvStore::new();

        let first = store.set_strict("key1", json!("value1"), 0).await.unwrap();
        let second = store.set_strict("key1", json!("value2"), 1).await.unwrap();

        assert!(second > first, "Sequence numbers should increment");
    }

    /// Reading a key that was never written reports KeyNotAvailable.
    #[tokio::test]
    async fn test_memory_store_key_not_found() {
        let store = InMemoryKvStore::new();
        assert!(matches!(
            store.get("nonexistent").await,
            Err(KvStoreError::KeyNotAvailable(_))
        ));
    }

    /// Compare-and-set: a stale expected sequence must be rejected.
    #[tokio::test]
    async fn test_memory_store_strict_ordering() {
        let store = InMemoryKvStore::new();

        // Two writes with the correct expected sequences succeed.
        assert_eq!(store.set_strict("key", json!("value1"), 0).await.unwrap(), 1);
        assert_eq!(store.set_strict("key", json!("value2"), 1).await.unwrap(), 2);

        // Reusing sequence 1 after the key advanced to 2 must fail.
        let stale = store.set_strict("key", json!("value3"), 1).await;
        assert!(matches!(stale, Err(KvStoreError::WrongLastRevision)));
    }
}
|
||||
120
harmony_agent/src/store/mod.rs
Normal file
120
harmony_agent/src/store/mod.rs
Normal file
@@ -0,0 +1,120 @@
|
||||
use async_trait::async_trait;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Handle for managing active subscriptions
// NOTE(review): nothing in this module uses this type — `subscribe` returns
// `()` rather than a handle. Presumably intended for future cancellation
// support; confirm before building on it.
#[derive(Debug, Clone)]
pub struct SubscriptionHandle {
    // Identifier of the subscription within its store.
    id: usize,
    // NOTE(review): PhantomData<()> is a no-op marker with no effect on
    // layout or variance — possibly a placeholder for a future type
    // parameter; confirm intent.
    _phantom: std::marker::PhantomData<()>,
}
|
||||
|
||||
/// Metadata returned by the KV store for all operations
/// Contains timing and ordering information set by the store
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct KvMetadata {
    /// Timestamp set by the store (milliseconds since UNIX epoch)
    pub timestamp: u64,
    /// Sequence number for strict ordering guarantees
    /// (in the in-memory store this is the per-key history length;
    /// in NATS it is the entry revision).
    pub sequence: u64,
}
|
||||
|
||||
/// Result returned by KV store operations
/// Contains both the value (if any) and store metadata
#[derive(Debug, Clone)]
pub struct KvResult {
    /// The value from the store (None if key doesn't exist)
    pub value: Option<Value>,
    /// Store-provided metadata (timestamp, sequence)
    pub metadata: KvMetadata,
}
|
||||
|
||||
/// Callback type for subscription updates
/// Callback receives: key, new value (None if deleted), and metadata
/// Boxed so implementations can store and invoke it across threads
/// (`Send + Sync`).
pub type SubscriptionCallback = Box<dyn Fn(String, Option<Value>, KvMetadata) + Send + Sync>;
|
||||
|
||||
/// Errors returned by [`KvStore`] implementations.
#[derive(Error, Debug)]
pub enum KvStoreError {
    /// Lost contact with the backing store; wraps the underlying I/O error.
    #[error("data store disconnected")]
    Disconnect(#[from] std::io::Error),
    /// The key is not valid for the backing store.
    #[error("invalid key")]
    InvalidKey,
    /// The operation did not complete in time.
    #[error("operation timed out")]
    Timeout,
    /// The key (or the requested revision of it) does not exist.
    #[error("the data for key `{0}` is not available")]
    KeyNotAvailable(String),
    /// A stored value could not be serialized to / deserialized from JSON.
    #[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)]
    DeserializationFailed {
        // Message from serde_json describing the failure.
        deserialization_error: String,
        // The offending raw value, lossily decoded for diagnostics.
        value: String,
    },
    /// Compare-and-set failed: the caller's expected sequence number did not
    /// match the store's current sequence for the key.
    #[error("Strict ordering violation, wrong last sequence number")]
    WrongLastRevision,
    /// Catch-all for errors that fit no other variant.
    #[error("unknown data store error {0}")]
    Unknown(String),
}
|
||||
|
||||
#[async_trait]
pub trait KvStore {
    /// Get a value from the store
    ///
    /// # Returns
    /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
    /// - `Err(KeyNotAvailable)`: If the key doesn't exist
    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;

    /// Get the value stored at a specific revision of `key`
    ///
    /// # Returns
    /// - `Ok(KvResult)`: The value and metadata recorded at revision `expected_seq`
    /// - `Err(KvStoreError)`: If the key or the requested revision doesn't exist
    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;

    /// Strict set operation with compare-and-set semantics
    ///
    /// Sets the value only if the current sequence number matches `expected_sequence`.
    /// This provides strict ordering guarantees needed for the failover algorithm.
    ///
    /// # Parameters
    /// - `key`: The key to set
    /// - `value`: The value to store
    /// - `expected_sequence`: The sequence number we expect the key to currently have.
    ///   Use 0 for the first write to a new key.
    ///
    /// # Returns
    /// - `Ok(u64)`: Returns the new sequence number
    /// - `Err(KvStoreError)`: If another write happened (current != expected)
    ///
    /// # Example Use Case
    /// For NATS JetStream, this maps to the conditional update operation that ensures
    /// only one agent can successfully promote to primary.
    async fn set_strict(
        &self,
        key: &str,
        value: Value,
        expected_sequence: u64,
    ) -> Result<u64, KvStoreError>;

    /// Subscribe to updates for a key
    ///
    /// # Parameters
    /// - `key`: The key to subscribe to
    /// - `callback`: Function to call on each update with key, value, and metadata
    ///
    /// # Returns
    /// - `Ok(())`: Subscription established successfully
    /// - `Err(KvStoreError)`: Subscription failed
    ///
    /// Note: For JetStream, this should use watch() API. Updates will invoke the callback
    /// asynchronously in the background.
    async fn subscribe(
        &self,
        key: &str,
        callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
        // callback
    ) -> Result<(), KvStoreError>;
}
|
||||
|
||||
mod chaos;
|
||||
mod memory;
|
||||
mod nats;
|
||||
|
||||
pub use chaos::ChaosKvStore;
|
||||
pub use memory::InMemoryKvStore;
|
||||
pub use nats::NatsKvStore;
|
||||
179
harmony_agent/src/store/nats.rs
Normal file
179
harmony_agent/src/store/nats.rs
Normal file
@@ -0,0 +1,179 @@
|
||||
use async_nats::jetstream::kv::{Store, UpdateError};
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, error, trace};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::store::SubscriptionCallback;
|
||||
|
||||
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
|
||||
|
||||
/// NATS JetStream-backed KV store
pub struct NatsKvStore {
    // Handle to the underlying JetStream KV bucket; every trait operation
    // below delegates to it.
    store: Store,
}
|
||||
|
||||
impl NatsKvStore {
|
||||
pub fn new(store: Store) -> Self {
|
||||
Self { store }
|
||||
}
|
||||
|
||||
pub async fn create(
|
||||
client: async_nats::Client,
|
||||
bucket_name: &str,
|
||||
history_size: i64,
|
||||
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||
let jetstream = async_nats::jetstream::new(client);
|
||||
|
||||
debug!("Creating NATS KV bucket: {}", bucket_name);
|
||||
let store = jetstream
|
||||
.create_key_value(async_nats::jetstream::kv::Config {
|
||||
bucket: bucket_name.to_string(),
|
||||
history: history_size,
|
||||
..Default::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
"Failed to initialize NATS KV bucket '{}': {}",
|
||||
bucket_name, e
|
||||
);
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok(Self::new(store))
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl KvStore for NatsKvStore {
|
||||
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
|
||||
let entry = self
|
||||
.store
|
||||
.entry_for_revision(key, expected_seq)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
error!("NATS get failed for key '{}': {}", key, e);
|
||||
KvStoreError::Disconnect(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
))
|
||||
})?;
|
||||
|
||||
if entry.is_none() {
|
||||
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||
}
|
||||
|
||||
let entry = entry.unwrap();
|
||||
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: String::from_utf8_lossy(&entry.value).to_string(),
|
||||
}
|
||||
})?;
|
||||
|
||||
// Extract metadata from NATS entry
|
||||
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
|
||||
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
|
||||
|
||||
let metadata = KvMetadata {
|
||||
timestamp,
|
||||
sequence: entry.revision,
|
||||
};
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value),
|
||||
metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
|
||||
let entry = self.store.entry(key).await.map_err(|e| {
|
||||
error!("NATS get failed for key '{}': {}", key, e);
|
||||
KvStoreError::Disconnect(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
e.to_string(),
|
||||
))
|
||||
})?;
|
||||
|
||||
if entry.is_none() {
|
||||
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||
}
|
||||
|
||||
let entry = entry.unwrap();
|
||||
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
|
||||
KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: String::from_utf8_lossy(&entry.value).to_string(),
|
||||
}
|
||||
})?;
|
||||
|
||||
// Extract metadata from NATS entry
|
||||
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
|
||||
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
|
||||
|
||||
let metadata = KvMetadata {
|
||||
timestamp,
|
||||
sequence: entry.revision,
|
||||
};
|
||||
|
||||
Ok(KvResult {
|
||||
value: Some(value),
|
||||
metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn set_strict(
|
||||
&self,
|
||||
key: &str,
|
||||
value: Value,
|
||||
expected_sequence: u64,
|
||||
) -> Result<u64, KvStoreError> {
|
||||
trace!(
|
||||
"Nats set strict {key} (#{expected_sequence}) : {}",
|
||||
value.to_string()
|
||||
);
|
||||
let bytes =
|
||||
serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
|
||||
deserialization_error: e.to_string(),
|
||||
value: value.to_string(),
|
||||
})?;
|
||||
|
||||
// Use update() for CAS semantics (Compare-And-Set)
|
||||
// This ensures we only write if the revision matches expected_sequence
|
||||
let revision = self
|
||||
.store
|
||||
.update(&key, bytes.into(), expected_sequence)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
// FIXME this is ugly, we should have a clean KvStoreError containing
|
||||
// proper information from nats instead
|
||||
error!("NATS update failed for key '{}': {}", key, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok(revision)
|
||||
}
|
||||
|
||||
async fn subscribe(
|
||||
&self,
|
||||
key: &str,
|
||||
callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
|
||||
) -> Result<(), KvStoreError> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<UpdateError> for KvStoreError {
|
||||
fn from(value: UpdateError) -> Self {
|
||||
match value.kind() {
|
||||
async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
|
||||
async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
|
||||
async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
|
||||
KvStoreError::WrongLastRevision
|
||||
}
|
||||
async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
|
||||
std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
39
harmony_agent/src/workflow/mod.rs
Normal file
39
harmony_agent/src/workflow/mod.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::agent::AgentConfig;
|
||||
use async_trait::async_trait;
|
||||
|
||||
pub mod primary;
|
||||
pub mod replica;
|
||||
|
||||
/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
#[async_trait]
pub trait HeartbeatWorkflow: Send + Sync {
    /// Handle a successful heartbeat
    ///
    /// Returns `Some(new_state)` when the workflow wants the caller to
    /// publish an updated cluster state, `None` otherwise.
    async fn handle_heartbeat_success(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData>;

    /// Handle a failed heartbeat
    async fn handle_heartbeat_failure(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
    );

    /// Called once when the agent starts, with the last known cluster state
    /// (if any), so the workflow can reconcile before the heartbeat loop runs.
    // NOTE(review): this method names the type as
    // `crate::agent::heartbeat::ClusterStateData` while the two methods above
    // use `crate::agent::ClusterStateData` — presumably the latter is a
    // re-export of the former; confirm they are the same type.
    async fn on_startup(
        &self,
        cluster_state: Option<&crate::agent::heartbeat::ClusterStateData>,
        agent_config: &AgentConfig,
    );

    /// Get the current state name for logging (also used for heartbeat status)
    fn state_name(&self) -> &'static str;

    /// Get current consecutive successes
    fn consecutive_successes(&self) -> usize;

    /// Get current consecutive failures
    fn consecutive_failures(&self) -> usize;
}
|
||||
330
harmony_agent/src/workflow/primary.rs
Normal file
330
harmony_agent/src/workflow/primary.rs
Normal file
@@ -0,0 +1,330 @@
|
||||
use async_trait::async_trait;
|
||||
use log::{debug, info, trace, warn};
|
||||
|
||||
use crate::{
|
||||
agent::{AgentConfig, DeploymentConfig},
|
||||
workflow::HeartbeatWorkflow,
|
||||
};
|
||||
|
||||
/// Lifecycle states for an agent running the primary workflow.
#[derive(Debug, Clone, PartialEq)]
pub enum PrimaryState {
    Initializing,
    Healthy,
    Failed,
    Fenced,
    Yielding,
}

impl PrimaryState {
    /// Human-readable state label used in logs and heartbeat status.
    pub fn name(&self) -> &'static str {
        match self {
            Self::Initializing => "Primary:Initializing",
            Self::Healthy => "Primary:Healthy",
            Self::Failed => "Primary:Failed",
            Self::Fenced => "Primary:Fenced",
            Self::Yielding => "Primary:Yielding",
        }
    }
}
|
||||
|
||||
/// State machine driving an agent that is the cluster's desired primary.
pub struct PrimaryWorkflow {
    // Current position in the primary state machine.
    state: PrimaryState,
    // Heartbeat streak counters; each success resets the failure count and
    // vice versa (see handle_heartbeat_success / handle_heartbeat_failure).
    consecutive_successes: usize,
    consecutive_failures: usize,

    // TODO these thresholds should not be copied into the workflow struct. They are configuration
    // level and should always be read from the context passed to the workflow functions
    success_threshold: usize,
    failure_threshold: usize,

    // TODO not sure if this should be known by the workflow or passed in the context to function
    // calls or just completely handled by the agent ?
    deployment_config: DeploymentConfig,
}
|
||||
|
||||
impl PrimaryWorkflow {
|
||||
pub fn new(
|
||||
success_threshold: usize,
|
||||
failure_threshold: usize,
|
||||
deployment_config: DeploymentConfig,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: PrimaryState::Initializing,
|
||||
consecutive_successes: 0,
|
||||
consecutive_failures: 0,
|
||||
success_threshold,
|
||||
failure_threshold,
|
||||
deployment_config,
|
||||
}
|
||||
}
|
||||
|
||||
fn transition_to(&mut self, new_state: PrimaryState) {
|
||||
if self.state != new_state {
|
||||
info!(
|
||||
"State transition: {} -> {}",
|
||||
self.state.name(),
|
||||
new_state.name()
|
||||
);
|
||||
self.state = new_state;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
impl HeartbeatWorkflow for PrimaryWorkflow {
    /// Log the last known cluster state at startup; deliberately does NOT
    /// fast-track any state transition (see comment in the body).
    // NOTE(review): `agent_config` is currently unused here.
    async fn on_startup(
        &self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) {
        if let Some(state) = cluster_state {
            info!(
                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
                state.cluster_info.current_primary, state.cluster_info.desired_primary
            );

            // No automatic fast-tracking - agent must earn healthy status
            // through successful heartbeats. This prevents duplicate agents
            // or crashloop agents from incorrectly claiming primary.
        } else {
            debug!("No cluster state on startup, starting from Initializing");
        }
    }
    /// Record a successful heartbeat and advance the state machine.
    ///
    /// Returns `Some(updated_state)` when this agent has just become healthy
    /// and should claim `current_primary` in the cluster state; `None`
    /// otherwise.
    async fn handle_heartbeat_success(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData> {
        trace!(
            "Handling heartbeat success, current counters success {} failures {}",
            self.consecutive_successes, self.consecutive_failures
        );
        // A success extends the success streak and resets the failure streak.
        self.consecutive_successes += 1;
        self.consecutive_failures = 0;

        match self.state {
            PrimaryState::Initializing => {
                if self.consecutive_successes >= self.success_threshold {
                    self.transition_to(PrimaryState::Healthy);
                    // Trigger on_active callback
                    // (spawned so the heartbeat loop isn't blocked by it)
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_active().await;
                    });
                    // Claim current_primary only when the cluster state says
                    // we are the desired primary.
                    if let Some(state) = cluster_state
                        && state.cluster_info.desired_primary == agent_config.desired_primary_id
                    {
                        debug!("state {:#?}", state);
                        let mut new_state = state.clone();
                        new_state.cluster_info.current_primary =
                            Some(agent_config.agent_id.clone());
                        return Some(new_state);
                    } else {
                        // NOTE(review): panics at runtime whenever the
                        // threshold is reached without a matching cluster
                        // state — reachable in production as written.
                        todo!(
                            "I cluster_state should not be an option, and we should throw an error when we are running a primary workflow but we are not the desired primary in the cluster state data"
                        );
                    }
                }
                None
            }
            PrimaryState::Failed => {
                if self.consecutive_successes >= self.success_threshold {
                    self.transition_to(PrimaryState::Healthy);
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_active().await;
                    });
                }
                // NOTE(review): this arm ALWAYS hits todo!() — even after a
                // successful transition to Healthy above — so recovery from
                // Failed currently panics.
                todo!()
            }
            PrimaryState::Healthy => {
                // Stay healthy
                debug!("Primary staying healthy");
                None
            }
            PrimaryState::Fenced => {
                // Recovery from fenced state
                if self.consecutive_successes >= self.success_threshold {
                    // TODO: Check NATS for current_primary status before recovering
                    info!("Recovered from fenced state, transitioning to yielding");
                    self.transition_to(PrimaryState::Yielding);
                }
                // NOTE(review): unconditional todo!() — same problem as the
                // Failed arm.
                todo!()
            }
            PrimaryState::Yielding => {
                // TODO: Check NATS to see if we can resume as primary
                trace!("Yielding, waiting for demotion handshake");
                todo!()
            }
        }
    }

    /// Record a failed heartbeat; at the failure threshold a Healthy primary
    /// is marked Failed and then immediately Fenced, triggering failover.
    // NOTE(review): `cluster_state` is currently unused here.
    async fn handle_heartbeat_failure(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
    ) {
        // A failure extends the failure streak and resets the success streak.
        self.consecutive_failures += 1;
        self.consecutive_successes = 0;

        match self.state {
            PrimaryState::Healthy => {
                if self.consecutive_failures >= self.failure_threshold {
                    warn!(
                        "Failure threshold reached ({}/{}), transitioning to Failed",
                        self.consecutive_failures, self.failure_threshold
                    );
                    self.transition_to(PrimaryState::Failed);

                    // Immediately fence
                    self.transition_to(PrimaryState::Fenced);
                    // on_failover runs detached so the heartbeat loop
                    // continues while fencing happens.
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_failover().await;
                    });
                }
            }
            PrimaryState::Initializing => {
                // Stay in initializing, just accumulate failures
                trace!("Heartbeat failed during initialization");
            }
            PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
                // Already in a degraded state
                trace!("Heartbeat failed in degraded state: {}", self.state.name());
            }
        }
    }

    /// Current state label for logging / heartbeat status.
    fn state_name(&self) -> &'static str {
        self.state.name()
    }

    /// Current success streak length.
    fn consecutive_successes(&self) -> usize {
        self.consecutive_successes
    }

    /// Current failure streak length.
    fn consecutive_failures(&self) -> usize {
        self.consecutive_failures
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use harmony_types::id::Id;
    use std::time::Duration;

    use crate::agent::{AgentRole, FailoverCNPGConfig};

    use pretty_assertions::assert_eq;

    use super::*;

    /// Below the success threshold, a heartbeat success must not publish a
    /// new cluster state.
    #[tokio::test]
    async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

        assert!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await
                .is_none()
        );
    }

    /// At the success threshold the primary claims current_primary in the
    /// returned cluster state.
    #[tokio::test]
    async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

        // The test agent's id is Id::empty(), so that is what it should claim.
        let mut expected_state = cluster_state.clone();
        expected_state.cluster_info.current_primary = Some(Id::empty());

        assert_eq!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await,
            None
        );
        assert_eq!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await,
            Some(expected_state)
        );
    }

    /// One failure below the threshold leaves a healthy primary healthy.
    #[tokio::test]
    async fn primary_stays_healthy_below_failure_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);

        // Reach healthy (success threshold is 1)
        let _ = primary
            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
            .await;
        assert_eq!(primary.state, PrimaryState::Healthy);

        // One failure below threshold
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Healthy);
        assert_eq!(primary.consecutive_failures(), 1);
        assert_eq!(primary.consecutive_successes(), 0);
    }

    /// At the failure threshold the primary leaves Healthy and ends up Fenced.
    #[tokio::test]
    async fn primary_transitions_to_failed_at_failure_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);

        // Reach healthy
        let _ = primary
            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
            .await;
        assert_eq!(primary.state, PrimaryState::Healthy);

        // First failure, still healthy
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Healthy);
        assert_eq!(primary.consecutive_failures(), 1);

        // Second failure reaches the threshold: the workflow transitions
        // through Failed and is immediately fenced, so the observable end
        // state is Fenced (not Failed).
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Fenced);
        assert_eq!(primary.consecutive_failures(), 2);
        assert_eq!(primary.consecutive_successes(), 0);
    }

    /// Build a fresh workflow plus matching cluster state and agent config.
    /// All ids are Id::empty(), so this agent IS the desired primary.
    fn default_test_state(
        success_threshold: usize,
        failure_threshold: usize,
    ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
        let cluster_state = crate::agent::ClusterStateData {
            cluster_info: crate::agent::heartbeat::ClusterState {
                cluster_id: Id::empty(),
                current_primary: None,
                desired_primary: Id::empty(),
            },
            metadata: None,
        };

        let agent_config = AgentConfig {
            success_threshold,
            failure_threshold,
            heartbeat_interval: Duration::from_nanos(0),
            failover_timeout: Duration::from_nanos(0),
            deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
                cnpg_cluster_name: "test".to_string(),
            }),
            nats_url: String::new(),
            nats_creds_path: None,
            agent_id: Id::empty(),
            cluster_id: Id::empty(),
            desired_primary_id: Id::empty(),
            role: AgentRole::Primary,
        };

        let primary = PrimaryWorkflow::new(
            agent_config.success_threshold,
            agent_config.failure_threshold,
            agent_config.deployment_config_unstable.clone(),
        );

        (primary, cluster_state, agent_config)
    }
}
|
||||
279
harmony_agent/src/workflow/replica.rs
Normal file
279
harmony_agent/src/workflow/replica.rs
Normal file
@@ -0,0 +1,279 @@
|
||||
use async_trait::async_trait;
|
||||
use harmony_types::id::Id;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use std::time::Duration;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::agent::{AgentConfig, AgentHeartbeat};
|
||||
use crate::workflow::HeartbeatWorkflow;
|
||||
|
||||
/// Tracks the last heartbeat observed for a single agent.
#[derive(Debug, Clone)]
pub struct HeartbeatState {
    /// The agent whose heartbeats are being watched.
    pub agent_id: Id,
    /// Sequence number of the last heartbeat seen; `None` until the first
    /// heartbeat is observed.
    pub last_seq: Option<u64>,
}
|
||||
|
||||
impl HeartbeatState {
|
||||
pub fn watch(agent_id: Id) -> Self {
|
||||
Self {
|
||||
agent_id,
|
||||
last_seq: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The replica's local view of the cluster it participates in.
#[derive(Debug, Clone)]
pub struct ClusterState {
    /// The cluster being watched.
    pub cluster_id: Id,
    /// The agent currently acting as primary, if known; `None` until
    /// observed.
    pub current_primary: Option<Id>,
}
|
||||
|
||||
impl ClusterState {
|
||||
pub fn watch(cluster_id: Id) -> Self {
|
||||
Self {
|
||||
cluster_id,
|
||||
current_primary: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Lifecycle states for an agent running the replica workflow.
#[derive(Debug, Clone, PartialEq)]
pub enum ReplicaState {
    Initializing,
    Watching,
    Promoting,
    PromotionFailed,
    Leader,
    Demoting,
    Failed,
}

impl ReplicaState {
    /// Human-readable state label used in logs and heartbeat status.
    pub fn name(&self) -> &'static str {
        match self {
            Self::Initializing => "Replica:Initializing",
            Self::Watching => "Replica:Watching",
            Self::Promoting => "Replica:Promoting",
            Self::PromotionFailed => "Replica:PromotionFailed",
            Self::Leader => "Replica:Leader",
            Self::Demoting => "Replica:Demoting",
            Self::Failed => "Replica:Failed",
        }
    }
}
|
||||
|
||||
/// State machine driving an agent that replicates the primary and may be
/// promoted when the primary's heartbeats go stale.
pub struct ReplicaWorkflow {
    // Current position in the replica state machine.
    state: ReplicaState,
    // Watch state for this agent's own heartbeats.
    heartbeat_state: HeartbeatState,
    // Watch state for the primary's heartbeats.
    primary_state: HeartbeatState,
    // This agent's view of the cluster (id + observed primary).
    cluster_state: ClusterState,
    // Heartbeat streak counters.
    consecutive_successes: usize,
    consecutive_failures: usize,
    // Thresholds for state transitions (copied from AgentConfig).
    success_threshold: usize,
    failure_threshold: usize,
    // Staleness window used by is_primary_stale (ADR-017-3).
    failover_timeout: Duration,
    /// Our own last heartbeat (for timestamp comparison against primary)
    last_my_heartbeat: Option<AgentHeartbeat>,
    /// Last observed primary heartbeat (metadata only, for staleness detection)
    // NOTE(review): an RwLock nested inside a field of a struct that is
    // already mutated through &mut self is unusual — presumably meant to be
    // shared with a watcher task; confirm the intended ownership model.
    last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
}
|
||||
|
||||
impl ReplicaWorkflow {
|
||||
pub fn new(
|
||||
success_threshold: usize,
|
||||
failure_threshold: usize,
|
||||
cluster_id: Id,
|
||||
primary_id: Id,
|
||||
my_id: Id,
|
||||
failover_timeout: Duration,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: ReplicaState::Initializing,
|
||||
consecutive_successes: 0,
|
||||
consecutive_failures: 0,
|
||||
success_threshold,
|
||||
failure_threshold,
|
||||
failover_timeout,
|
||||
cluster_state: ClusterState::watch(cluster_id),
|
||||
primary_state: HeartbeatState::watch(primary_id),
|
||||
heartbeat_state: HeartbeatState::watch(my_id),
|
||||
last_my_heartbeat: None,
|
||||
last_primary_heartbeat: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn transition_to(&mut self, new_state: ReplicaState) {
|
||||
if self.state != new_state {
|
||||
info!(
|
||||
"State transition: {} -> {}",
|
||||
self.state.name(),
|
||||
new_state.name()
|
||||
);
|
||||
self.state = new_state;
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if the primary heartbeat is stale compared to our own
|
||||
/// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
|
||||
async fn is_primary_stale(&mut self) -> bool {
|
||||
if let Some(my_hb) = &self.last_my_heartbeat {
|
||||
if let Some(my_metadata) = &my_hb.metadata {
|
||||
if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
|
||||
let primary_hb = primary_hb_ref.read().await;
|
||||
if let Some(primary_metadata) = &primary_hb.metadata {
|
||||
// Calculate time difference: replica_timestamp - primary_timestamp
|
||||
let time_diff_ms = my_metadata
|
||||
.timestamp
|
||||
.saturating_sub(primary_metadata.timestamp);
|
||||
let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
|
||||
|
||||
trace!(
|
||||
"Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
|
||||
my_metadata.timestamp,
|
||||
primary_metadata.timestamp,
|
||||
time_diff_ms,
|
||||
failover_timeout_ms
|
||||
);
|
||||
|
||||
if time_diff_ms > failover_timeout_ms {
|
||||
info!(
|
||||
"Primary heartbeat stale ({}ms > {}ms), attempting promotion",
|
||||
time_diff_ms, failover_timeout_ms
|
||||
);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl HeartbeatWorkflow for ReplicaWorkflow {
|
||||
async fn on_startup(
|
||||
&self,
|
||||
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||
agent_config: &AgentConfig,
|
||||
) {
|
||||
// todo!("not sure if the replica should do anything on startup")
|
||||
}
|
||||
|
||||
async fn handle_heartbeat_success(
|
||||
&mut self,
|
||||
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||
agent_config: &AgentConfig,
|
||||
) -> Option<crate::agent::ClusterStateData> {
|
||||
trace!(
|
||||
"Handling heartbeat success, current counters success {} failures {}",
|
||||
self.consecutive_successes, self.consecutive_failures
|
||||
);
|
||||
self.consecutive_successes += 1;
|
||||
self.consecutive_failures = 0;
|
||||
|
||||
match self.state {
|
||||
ReplicaState::Initializing => {
|
||||
if self.consecutive_successes >= self.success_threshold {
|
||||
self.transition_to(ReplicaState::Watching);
|
||||
}
|
||||
None
|
||||
}
|
||||
ReplicaState::Watching => {
|
||||
// TODO: Check primary staleness from NATS
|
||||
trace!("Replica watching primary");
|
||||
if self.is_primary_stale().await {
|
||||
panic!("Found stale primary, launching promotion");
|
||||
}
|
||||
debug!("perform the replica watch actions :
|
||||
- if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
|
||||
- check the last primary heartbeat kv timestamp
|
||||
- compare it with our latest kv heartbeat
|
||||
- if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
|
||||
- launching promotion will change the status of the replica
|
||||
");
|
||||
|
||||
None
|
||||
}
|
||||
ReplicaState::Promoting => {
|
||||
// TODO: Complete promotion attempt
|
||||
trace!("Replica promotion in progress");
|
||||
todo!(
|
||||
"When promoting, a heartbeat failure does not affect promotion unless failure_threshold is reached, a heartbeat success does nothing either"
|
||||
);
|
||||
}
|
||||
ReplicaState::PromotionFailed => {
|
||||
if self.consecutive_successes >= self.success_threshold {
|
||||
self.transition_to(ReplicaState::Watching);
|
||||
}
|
||||
todo!()
|
||||
}
|
||||
ReplicaState::Leader => {
|
||||
// TODO: Check for original primary recovery
|
||||
trace!("Replica acting as leader");
|
||||
todo!()
|
||||
}
|
||||
ReplicaState::Failed => {
|
||||
if self.consecutive_successes >= self.success_threshold {
|
||||
info!("Replica recovered from Failed state, transitioning to Watching");
|
||||
self.transition_to(ReplicaState::Watching);
|
||||
}
|
||||
todo!()
|
||||
}
|
||||
ReplicaState::Demoting => {
|
||||
// TODO: Complete demotion back to watching
|
||||
trace!("Replica demotion in progress");
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_heartbeat_failure(
|
||||
&mut self,
|
||||
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||
) {
|
||||
self.consecutive_failures += 1;
|
||||
self.consecutive_successes = 0;
|
||||
|
||||
// TODO revisit this. I think we should handle the agent healthiness (checking
|
||||
// consecutive_failures against failure_threshold) separately from handling the cluster
|
||||
// state.
|
||||
//
|
||||
// That said, there might be funny stuff we have to do when the agent reaches the failure
|
||||
// threshold, especially in promoting and demoting statuses.
|
||||
|
||||
match self.state {
|
||||
ReplicaState::Watching | ReplicaState::Initializing => {
|
||||
if self.consecutive_failures >= self.failure_threshold {
|
||||
info!(
|
||||
"Replica exceeded failure threshold ({}/{}), transitioning to Failed",
|
||||
self.consecutive_failures, self.failure_threshold
|
||||
);
|
||||
self.transition_to(ReplicaState::Failed);
|
||||
} else {
|
||||
trace!("Replica heartbeat failed, but below threshold");
|
||||
}
|
||||
}
|
||||
ReplicaState::Promoting
|
||||
| ReplicaState::PromotionFailed
|
||||
| ReplicaState::Leader
|
||||
| ReplicaState::Demoting
|
||||
| ReplicaState::Failed => {
|
||||
trace!("Replica heartbeat failed in state: {}", self.state.name());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn state_name(&self) -> &'static str {
|
||||
self.state.name()
|
||||
}
|
||||
|
||||
fn consecutive_successes(&self) -> usize {
|
||||
self.consecutive_successes
|
||||
}
|
||||
|
||||
fn consecutive_failures(&self) -> usize {
|
||||
self.consecutive_failures
|
||||
}
|
||||
}
|
||||
12
harmony_execution/Cargo.toml
Normal file
12
harmony_execution/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "harmony_execution"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
lazy_static.workspace = true
|
||||
directories.workspace = true
|
||||
log.workspace = true
|
||||
470
harmony_execution/src/command.rs
Normal file
470
harmony_execution/src/command.rs
Normal file
@@ -0,0 +1,470 @@
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::process::{Child, Command, Stdio};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
/// Captured output from a command execution
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CommandOutput {
|
||||
/// Captured stdout content
|
||||
pub stdout: String,
|
||||
/// Captured stderr content
|
||||
pub stderr: String,
|
||||
/// Exit status of the command
|
||||
pub status: CommandStatus,
|
||||
}
|
||||
|
||||
impl CommandOutput {
|
||||
/// Returns true if the command succeeded
|
||||
pub fn is_success(&self) -> bool {
|
||||
self.status.is_success()
|
||||
}
|
||||
|
||||
/// Formats the complete output for display
|
||||
pub fn format_output(&self) -> String {
|
||||
format!(
|
||||
"Stdout:\n{}\n\nStderr:\n{}",
|
||||
if self.stdout.is_empty() {
|
||||
"<empty>"
|
||||
} else {
|
||||
&self.stdout
|
||||
},
|
||||
if self.stderr.is_empty() {
|
||||
"<empty>"
|
||||
} else {
|
||||
&self.stderr
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Result status of a command execution
|
||||
/// Result status of a command execution
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CommandStatus {
    /// Command executed successfully (exit code 0)
    Success,
    /// Command failed with an exit code
    Failed(i32),
    /// Command was terminated by a signal
    Terminated(i32),
    /// Command execution could not be started
    Error(String),
}

impl CommandStatus {
    /// True only for [`CommandStatus::Success`]; every failure, termination,
    /// or spawn error counts as unsuccessful.
    pub fn is_success(&self) -> bool {
        match self {
            CommandStatus::Success => true,
            CommandStatus::Failed(_)
            | CommandStatus::Terminated(_)
            | CommandStatus::Error(_) => false,
        }
    }
}
|
||||
|
||||
impl From<std::process::ExitStatus> for CommandStatus {
|
||||
fn from(status: std::process::ExitStatus) -> Self {
|
||||
if status.success() {
|
||||
CommandStatus::Success
|
||||
} else if let Some(code) = status.code() {
|
||||
CommandStatus::Failed(code)
|
||||
} else {
|
||||
CommandStatus::Terminated(0) // Signal codes are platform-specific
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-line observer invoked for every line read from a stream. `Arc` + `Send`
// + `Sync` so the same callback can be cloned into the reader threads.
type Callback = Arc<dyn Fn(&str) + Send + Sync>;

/// Options for configuring command execution
#[derive(Clone)]
pub struct RunnerOptions {
    /// Whether to print stdout to console in real-time
    pub print_stdout: bool,
    /// Whether to print stderr to console in real-time
    pub print_stderr: bool,
    /// Optional callback for each stdout line
    pub stdout_callback: Callback,
    /// Optional callback for each stderr line
    pub stderr_callback: Callback,
}
|
||||
|
||||
impl RunnerOptions {
|
||||
fn empty_callback() -> Callback {
|
||||
Arc::new(|_| {})
|
||||
}
|
||||
/// Create default options with real-time printing enabled
|
||||
pub fn print_to_console() -> Self {
|
||||
Self {
|
||||
print_stdout: true,
|
||||
print_stderr: true,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Create options that capture output silently
|
||||
pub fn silent() -> Self {
|
||||
Self {
|
||||
print_stdout: false,
|
||||
print_stderr: false,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
/// Set custom callbacks for stdout and stderr lines
|
||||
pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
|
||||
where
|
||||
F1: Fn(&str) + Send + Sync + 'static,
|
||||
F2: Fn(&str) + Send + Sync + 'static,
|
||||
{
|
||||
self.stdout_callback = Arc::new(stdout_callback);
|
||||
self.stderr_callback = Arc::new(stderr_callback);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RunnerOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
print_stdout: true,
|
||||
print_stderr: true,
|
||||
stdout_callback: Self::empty_callback(),
|
||||
stderr_callback: Self::empty_callback(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error type for command execution failures
|
||||
#[derive(Debug)]
|
||||
pub struct CommandError {
|
||||
/// Human-readable error description
|
||||
pub message: String,
|
||||
/// Captured output if execution started
|
||||
pub output: Option<CommandOutput>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for CommandError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.message)?;
|
||||
if let Some(output) = &self.output {
|
||||
write!(f, "\n{}", output.format_output())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for CommandError {}
|
||||
|
||||
/// Runs a command and captures its output while streaming to console
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use harmony_execution::command::{run_command, RunnerOptions};
|
||||
/// use std::process::Command;
|
||||
///
|
||||
/// let output = run_command(
|
||||
/// Command::new("echo").arg("hello"),
|
||||
/// RunnerOptions::print_to_console()
|
||||
/// ).unwrap();
|
||||
/// assert!(output.is_success());
|
||||
/// assert_eq!(output.stdout, "hello\n");
|
||||
/// ```
|
||||
pub fn run_command(
|
||||
command: &mut Command,
|
||||
options: RunnerOptions,
|
||||
) -> Result<CommandOutput, CommandError> {
|
||||
let mut child = command
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.map_err(|e| CommandError {
|
||||
message: format!("Failed to spawn command: {}", e),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stdout = child.stdout.take().ok_or_else(|| CommandError {
|
||||
message: "Failed to capture stdout".to_string(),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stderr = child.stderr.take().ok_or_else(|| CommandError {
|
||||
message: "Failed to capture stderr".to_string(),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stdout_reader = BufReader::new(stdout);
|
||||
let stderr_reader = BufReader::new(stderr);
|
||||
|
||||
let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
|
||||
let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
|
||||
|
||||
// Spawn thread to handle stdout
|
||||
let stdout_handle = thread::spawn(move || {
|
||||
let mut output = String::new();
|
||||
for line in stdout_reader.lines() {
|
||||
match line {
|
||||
Ok(line_content) => {
|
||||
if options.print_stdout {
|
||||
println!("{}", line_content);
|
||||
}
|
||||
(options.stdout_callback)(&line_content);
|
||||
output.push_str(&line_content);
|
||||
output.push('\n');
|
||||
}
|
||||
Err(e) => {
|
||||
// Silently handle read errors - corrupted data at end is common
|
||||
log::trace!("Error reading stdout line: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
let _ = stdout_sender.send(output);
|
||||
});
|
||||
|
||||
// Spawn thread to handle stderr
|
||||
let stderr_handle = thread::spawn(move || {
|
||||
let mut output = String::new();
|
||||
for line in stderr_reader.lines() {
|
||||
match line {
|
||||
Ok(line_content) => {
|
||||
if options.print_stderr {
|
||||
eprintln!("{}", line_content);
|
||||
}
|
||||
(options.stderr_callback)(&line_content);
|
||||
output.push_str(&line_content);
|
||||
output.push('\n');
|
||||
}
|
||||
Err(e) => {
|
||||
log::trace!("Error reading stderr line: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
let _ = stderr_sender.send(output);
|
||||
});
|
||||
|
||||
let status = child.wait().map_err(|e| CommandError {
|
||||
message: format!("Failed to wait for command process: {}", e),
|
||||
output: None,
|
||||
})?;
|
||||
|
||||
let stdout_lines = stdout_handle
|
||||
.join()
|
||||
.map_err(|e| CommandError {
|
||||
message: format!("Stdout thread panicked: {:?}", e),
|
||||
output: None,
|
||||
})
|
||||
.and_then(|_| {
|
||||
stdout_receiver.recv().map_err(|e| CommandError {
|
||||
message: format!("Failed to receive stdout: {}", e),
|
||||
output: None,
|
||||
})
|
||||
})?;
|
||||
|
||||
let stderr_lines = stderr_handle
|
||||
.join()
|
||||
.map_err(|e| CommandError {
|
||||
message: format!("Stderr thread panicked: {:?}", e),
|
||||
output: None,
|
||||
})
|
||||
.and_then(|_| {
|
||||
stderr_receiver.recv().map_err(|e| CommandError {
|
||||
message: format!("Failed to receive stderr: {}", e),
|
||||
output: None,
|
||||
})
|
||||
})?;
|
||||
|
||||
Ok(CommandOutput {
|
||||
stdout: stdout_lines,
|
||||
stderr: stderr_lines,
|
||||
status: status.into(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Convenience function to run a command with default options (print to console)
|
||||
pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
|
||||
run_command(command, RunnerOptions::print_to_console())
|
||||
}
|
||||
|
||||
/// Convenience function to run a command silently (capture output only)
|
||||
pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
|
||||
run_command(command, RunnerOptions::silent())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Integration-style tests: these spawn real processes (`echo`, `sh`,
    //! `printf`, `true`, `false`), so they assume a Unix-like environment —
    //! TODO confirm they are gated/skipped on Windows CI.

    use super::*;
    use std::process::Command;

    #[test]
    fn test_simple_echo_command() {
        let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
        assert!(output.is_success());
        // `echo` appends a trailing newline, hence the trim.
        assert_eq!(output.stdout.trim(), "hello world");
        assert!(output.stderr.is_empty());
    }

    #[test]
    fn test_command_failure() {
        let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
        assert!(!output.is_success());
        assert_eq!(output.status, CommandStatus::Failed(42));
    }

    #[test]
    fn test_command_output_format() {
        let output = run_silent(Command::new("echo").arg("test")).unwrap();
        let formatted = output.format_output();
        assert!(formatted.contains("Stdout:"));
        assert!(formatted.contains("test"));
    }

    #[test]
    fn test_runner_options() {
        let opts = RunnerOptions::print_to_console();
        assert!(opts.print_stdout);
        assert!(opts.print_stderr);

        let opts = RunnerOptions::silent();
        assert!(!opts.print_stdout);
        assert!(!opts.print_stderr);
    }

    #[test]
    fn test_command_status_from_exit_status() {
        let output = run_silent(&mut Command::new("true")).unwrap();
        assert_eq!(output.status, CommandStatus::Success);

        let output = run_silent(&mut Command::new("false")).unwrap();
        assert_eq!(output.status, CommandStatus::Failed(1));
    }

    #[test]
    fn test_stdout_callback_receives_lines() {
        use std::sync::{Arc, Mutex};

        // Collect every callback invocation so we can assert on it after the
        // reader thread has been joined by run_command.
        let captured = Arc::new(Mutex::new(Vec::new()));
        let captured_clone = Arc::clone(&captured);

        let opts = RunnerOptions::silent().with_callbacks(
            move |line| captured_clone.lock().unwrap().push(line.to_string()),
            |_| {},
        );

        run_command(Command::new("echo").arg("hello world"), opts).unwrap();

        let lines = captured.lock().unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "hello world");
    }

    #[test]
    fn test_stderr_callback_receives_lines() {
        use std::sync::{Arc, Mutex};

        let captured = Arc::new(Mutex::new(Vec::new()));
        let captured_clone = Arc::clone(&captured);

        let opts = RunnerOptions::silent().with_callbacks(
            |_| {},
            move |line| captured_clone.lock().unwrap().push(line.to_string()),
        );

        run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();

        let lines = captured.lock().unwrap();
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "error");
    }

    #[test]
    fn test_callback_and_capture_both_work() {
        use std::sync::{Arc, Mutex};

        let callback_lines = Arc::new(Mutex::new(Vec::new()));
        let callback_clone = Arc::clone(&callback_lines);

        let opts = RunnerOptions::silent().with_callbacks(
            move |line| callback_clone.lock().unwrap().push(line.to_string()),
            |_| {},
        );

        let output =
            run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();

        // Verify captured output
        assert_eq!(output.stdout, "line1\nline2\nline3\n");

        // Verify callback received all lines
        let lines = callback_lines.lock().unwrap();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "line1");
        assert_eq!(lines[1], "line2");
        assert_eq!(lines[2], "line3");
    }

    #[test]
    fn test_multiline_output_capture() {
        let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();

        assert_eq!(output.stdout, "line1\nline2\nline3\n");
        assert!(output.stderr.trim().is_empty());
    }

    #[test]
    fn test_mixed_stdout_stderr_capture() {
        // Uses contains() rather than exact equality: the relative ordering
        // between the two pipes is not guaranteed.
        let output = run_silent(Command::new("sh").args([
            "-c",
            "echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
        ]))
        .unwrap();

        assert!(output.stdout.contains("stdout1"));
        assert!(output.stdout.contains("stdout2"));
        assert!(output.stderr.contains("stderr1"));
        assert!(output.stderr.contains("stderr2"));
    }

    #[test]
    fn test_empty_output_command() {
        let output = run_silent(&mut Command::new("true")).unwrap();

        assert!(output.stdout.is_empty());
        assert!(output.stderr.is_empty());
        assert!(output.is_success());
    }

    #[test]
    fn test_command_output_format_with_empty_streams() {
        let output = run_silent(&mut Command::new("true")).unwrap();
        let formatted = output.format_output();

        assert!(formatted.contains("Stdout:"));
        assert!(formatted.contains("<empty>"));
        assert!(formatted.contains("Stderr:"));
    }

    #[test]
    fn test_error_contains_message_and_output() {
        let error = CommandError {
            message: "Test error".to_string(),
            output: Some(CommandOutput {
                stdout: "captured stdout".to_string(),
                stderr: "captured stderr".to_string(),
                status: CommandStatus::Success,
            }),
        };

        let display = format!("{}", error);
        assert!(display.contains("Test error"));
        assert!(display.contains("captured stdout"));
        assert!(display.contains("captured stderr"));
    }

    #[test]
    fn test_error_without_output() {
        let error = CommandError {
            message: "Spawn failed".to_string(),
            output: None,
        };

        let display = format!("{}", error);
        assert!(display.contains("Spawn failed"));
        assert!(!display.contains("Stdout:"));
        assert!(!display.contains("Stderr:"));
    }
}
|
||||
5
harmony_execution/src/lib.rs
Normal file
5
harmony_execution/src/lib.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
pub mod command;
|
||||
|
||||
pub use command::{
|
||||
CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
|
||||
};
|
||||
@@ -32,6 +32,14 @@ impl Id {
|
||||
}
|
||||
}
|
||||
|
||||
impl Into<Id> for &str {
|
||||
fn into(self) -> Id {
|
||||
Id {
|
||||
value: self.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for Id {
|
||||
type Err = ();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user