From 3682a0cb5f0c4457e51fceea47a7092aaaed6e2c Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Fri, 23 Jan 2026 08:54:40 -0500
Subject: [PATCH 01/19] feat: First draft of harmony_agent project that will
 synchronize multiple clusters using nats supercluster to communicate

---
 .gitignore                       |   2 +
 harmony_agent/.dockerignore      |   4 +
 harmony_agent/Cargo.toml         |  22 +++++
 harmony_agent/Dockerfile         |  44 +++++++++
 harmony_agent/deploy/Cargo.toml  |  20 ++++
 harmony_agent/deploy/src/main.rs |  55 +++++++++++
 harmony_agent/src/agent.rs       | 165 +++++++++++++++++++++++++++++++
 harmony_agent/src/config.rs      |  36 +++++++
 harmony_agent/src/main.rs        |  24 +++++
 9 files changed, 372 insertions(+)
 create mode 100644 harmony_agent/.dockerignore
 create mode 100644 harmony_agent/Cargo.toml
 create mode 100644 harmony_agent/Dockerfile
 create mode 100644 harmony_agent/deploy/Cargo.toml
 create mode 100644 harmony_agent/deploy/src/main.rs
 create mode 100644 harmony_agent/src/agent.rs
 create mode 100644 harmony_agent/src/config.rs
 create mode 100644 harmony_agent/src/main.rs

diff --git a/.gitignore b/.gitignore
index 3850d09a..3bb0cc1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ Cargo.lock
 
 # MSVC Windows builds of rustc generate these, which store debugging information
 *.pdb
+
+.harmony_generated
diff --git a/harmony_agent/.dockerignore b/harmony_agent/.dockerignore
new file mode 100644
index 00000000..dd9b5319
--- /dev/null
+++ b/harmony_agent/.dockerignore
@@ -0,0 +1,4 @@
+.git
+data
+target
+demos
diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml
new file mode 100644
index 00000000..360e26e4
--- /dev/null
+++ b/harmony_agent/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "harmony_agent"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+harmony = { path = "../harmony" }
+# harmony_cli = { path = "../harmony_cli" }
+harmony_types = { path = "../harmony_types" }
+harmony_macros = { path = "../harmony_macros" }
+cidr = { workspace = true }
+tokio = { workspace = true }
+log = { workspace = true }
+env_logger = { workspace = true }
+async-nats = "0.45.0"
+async-trait = "0.1"
+# url = { workspace = true }
+
+serde.workspace = true
+serde_json.workspace = true
diff --git a/harmony_agent/Dockerfile b/harmony_agent/Dockerfile
new file mode 100644
index 00000000..9d72462a
--- /dev/null
+++ b/harmony_agent/Dockerfile
@@ -0,0 +1,44 @@
+# Build stage
+FROM rust:slim AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy all required packages
+COPY . .
+
+RUN ls -la1
+
+# Build the application in release mode
+RUN cargo build --release -p harmony_agent
+
+# Runtime stage
+FROM debian:bookworm-slim
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy the binary from the builder stage
+COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
+
+# Declare environment variables used by the Harmony Agent
+# These will be set from build-time environment variables if present
+# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
+ARG NATS_URL=nats://localhost:4222
+ENV NATS_URL=${NATS_URL}
+# NATS_CREDS_PATH: Optional path to NATS credentials file
+ARG NATS_CREDS_PATH
+ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
+# MY_CLUSTER_ID: This cluster's unique identifier (required)
+ARG MY_CLUSTER_ID
+ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
+# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
+ARG DESIRED_PRIMARY
+ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
+
+# Run the application
+ENTRYPOINT ["./harmony_agent"]
diff --git a/harmony_agent/deploy/Cargo.toml b/harmony_agent/deploy/Cargo.toml
new file mode 100644
index 00000000..9aea1e4b
--- /dev/null
+++ b/harmony_agent/deploy/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "harmony_agent_deploy"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+harmony = { path = "../../harmony" }
+harmony_cli = { path = "../../harmony_cli" }
+harmony_types = { path = "../../harmony_types" }
+harmony_macros = { path = "../../harmony_macros" }
+cidr = { workspace = true }
+tokio = { workspace = true }
+log = { workspace = true }
+env_logger = { workspace = true }
+url = { workspace = true }
+
+serde.workspace = true
+serde_json.workspace = true
diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs
new file mode 100644
index 00000000..82fdd15a
--- /dev/null
+++ b/harmony_agent/deploy/src/main.rs
@@ -0,0 +1,55 @@
+use harmony::{
+    inventory::Inventory,
+    modules::{
+        application::{
+            ApplicationScore,
+            backend_app::{BackendApp, BuildCommand},
+            features::{Monitoring, PackagingDeployment},
+        },
+        monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
+    },
+    topology::K8sAnywhereTopology,
+};
+use harmony_macros::hurl;
+use harmony_types::k8s_name::K8sName;
+use std::{path::PathBuf, sync::Arc};
+
+#[tokio::main]
+async fn main() {
+    let application = Arc::new(BackendApp {
+        name: "harmony-agent".to_string(),
+        // This means the script will be run from the harmony_agent directory, not from the
+        // deploy directory
+        project_root: PathBuf::from("../"),
+        network_ports: vec![],
+        env_vars: vec![],
+        build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
+        dockerfile: Some(PathBuf::from("Dockerfile")),
+    });
+
+    let app = ApplicationScore {
+        features: vec![
+            Box::new(PackagingDeployment {
+                application: application.clone(),
+            }),
+            Box::new(Monitoring {
+                application: application.clone(),
+                alert_receiver: vec![Box::new(DiscordWebhook {
+                    name: K8sName("test-discord".to_string()),
+                    url: hurl!("https://discord.doesnt.exist.com"),
+                    selectors: vec![],
+                })],
+            }),
+        ],
+        application,
+    };
+
+    harmony_cli::run(
+        Inventory::autoload(),
+        K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster
+        vec![Box::new(app)],
+        None,
+    )
+    .await
+    .unwrap();
+}
diff --git a/harmony_agent/src/agent.rs b/harmony_agent/src/agent.rs
new file mode 100644
index 00000000..eafc83e2
--- /dev/null
+++ b/harmony_agent/src/agent.rs
@@ -0,0 +1,165 @@
+use async_trait::async_trait;
+use log::{debug, error, info};
+use serde::{Deserialize, Serialize};
+use std::time::{SystemTime, UNIX_EPOCH};
+use harmony_types::id::Id;
+use async_nats::jetstream::kv::Store;
+
+use crate::config::AgentConfig;
+
+#[async_trait]
+pub trait HealthStore: Send + Sync {
+    async fn put(&self, key: String, value: Vec<u8>) -> Result<u64, Box<dyn std::error::Error + Send + Sync>>;
+}
+
+#[async_trait]
+impl HealthStore for Store {
+    async fn put(&self, key: String, value: Vec<u8>) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
+        self.put(key, value.into())
+            .await
+            .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentHeartbeat {
+    pub cluster_id: Id,
+    pub status: String,
+    pub timestamp: u64,
+}
+
+pub struct HarmonyAgent {
+    config: AgentConfig,
+    #[allow(dead_code)]
+    nats_client: Option<async_nats::Client>,
+    health_kv: Box<dyn HealthStore>,
+}
+
+
+impl HarmonyAgent {
+    pub async fn new(config: AgentConfig) -> Result<Self, Box<dyn std::error::Error>> {
+        let mut options = async_nats::ConnectOptions::new();
+        if let Some(ref creds) = config.nats_creds_path {
+            options = options.credentials_file(creds).await?;
+        }
+
+        let client = async_nats::connect_with_options(&config.nats_url, options).await?;
+        let jetstream = async_nats::jetstream::new(client.clone());
+
+        // Initialize KV Buckets as per ADR-017
+        const HEARTBEAT_KV_HISTORY_SIZE: i64 = 64;
+        let health_kv = jetstream
+            .create_key_value(async_nats::jetstream::kv::Config {
+                bucket: "harmony_agent_health".to_string(),
+                history: HEARTBEAT_KV_HISTORY_SIZE,
+                ..Default::default()
+            })
+            .await?;
+
+        Ok(Self {
+            config,
+            nats_client: Some(client),
+            health_kv: Box::new(health_kv),
+        })
+    }
+
+
+    pub async fn run_heartbeat_loop(&self) -> Result<(), Box<dyn std::error::Error>> {
+        let mut interval = tokio::time::interval(self.config.heartbeat_interval);
+        let key = format!("heartbeat.{}", self.config.my_cluster_id);
+
+        info!("Starting heartbeat loop for cluster: {}", self.config.my_cluster_id);
+
+        loop {
+            interval.tick().await;
+
+            let now = SystemTime::now()
+                .duration_since(UNIX_EPOCH)?
+                .as_millis() as u64;
+
+            let heartbeat = AgentHeartbeat {
+                cluster_id: self.config.my_cluster_id.clone(),
+                status: "HEALTHY".to_string(),
+                timestamp: now,
+            };
+
+            debug!("Sending heartbeat for cluster: {}", self.config.my_cluster_id);
+            let payload = serde_json::to_vec(&heartbeat)?;
+
+            // Write heartbeat to KV. ADR-017: Write failure triggers self-demotion logic
+            match self.health_kv.put(key.clone(), payload).await {
+                Ok(_) => {
+                    debug!("Heartbeat successful for cluster: {}", self.config.my_cluster_id);
+                }
+                Err(e) => {
+                    error!("Failed to write heartbeat: {}. Fencing logic would trigger here.", e);
+                    // In a real implementation, we would trigger self-demotion/fencing here
+                }
+            }
+
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::{Arc, Mutex};
+    use tokio::time::{pause, advance, Duration};
+
+    struct MockHealthStore {
+        puts: Arc<Mutex<Vec<(String, Vec<u8>)>>>,
+    }
+
+    #[async_trait]
+    impl HealthStore for MockHealthStore {
+        async fn put(&self, key: String, value: Vec<u8>) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
+            self.puts.lock().unwrap().push((key, value));
+            Ok(0)
+        }
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_heartbeat_loop() {
+        let config = AgentConfig {
+            nats_url: "nats://localhost:4222".to_string(),
+            nats_creds_path: None,
+            my_cluster_id: "test-cluster".into(),
+            desired_primary: "test-cluster".into(),
+            heartbeat_interval: Duration::from_millis(100),
+        };
+
+        let puts = Arc::new(Mutex::new(Vec::new()));
+        let mock_store = MockHealthStore { puts: puts.clone() };
+
+        let agent = HarmonyAgent {
+            config,
+            nats_client: None,
+            health_kv: Box::new(mock_store),
+        };
+
+        // Run the loop in a separate task
+        let handle = tokio::spawn(async move {
+            let _ = agent.run_heartbeat_loop().await;
+        });
+
+        // Advance time in increments to trigger multiple heartbeats
+        for _ in 0..3 {
+            advance(Duration::from_millis(100)).await;
+            tokio::time::sleep(Duration::from_millis(1)).await;
+        }
+
+        let recorded_puts = puts.lock().unwrap();
+        assert!(recorded_puts.len() >= 2, "Should have recorded at least 2 heartbeats, got {}", recorded_puts.len());
+
+        let (key, payload) = &recorded_puts[0];
+        assert_eq!(key, "heartbeat.test-cluster");
+
+        let heartbeat: AgentHeartbeat = serde_json::from_slice(payload).unwrap();
+        assert_eq!(heartbeat.cluster_id.to_string(), "test-cluster");
+        assert_eq!(heartbeat.status, "HEALTHY");
+
+        handle.abort();
+    }
+}
+
diff --git a/harmony_agent/src/config.rs b/harmony_agent/src/config.rs
new file mode 100644
index 00000000..cf5fe128
--- /dev/null
+++ b/harmony_agent/src/config.rs
@@ -0,0 +1,36 @@
+use std::env;
+use std::time::Duration;
+use harmony_types::id::Id;
+
+/// Configuration for the Harmony Agent
+#[derive(Debug, Clone)]
+pub struct AgentConfig {
+    pub nats_url: String,
+    pub nats_creds_path: Option<String>,
+    pub my_cluster_id: Id,
+    pub desired_primary: Id,
+    pub heartbeat_interval: Duration,
+}
+
+impl AgentConfig {
+    pub fn load_from_env() -> Result<Self, String> {
+        let nats_url = env::var("NATS_URL")
+            .unwrap_or_else(|_| "nats://localhost:4222".to_string());
+
+        let nats_creds_path = env::var("NATS_CREDS_PATH").ok();
+
+        let my_cluster_id_str = env::var("MY_CLUSTER_ID")
+            .map_err(|_| "Environment variable MY_CLUSTER_ID is required".to_string())?;
+
+        let desired_primary_str = env::var("DESIRED_PRIMARY")
+            .map_err(|_| "Environment variable DESIRED_PRIMARY is required".to_string())?;
+
+        Ok(Self {
+            nats_url,
+            nats_creds_path,
+            my_cluster_id: my_cluster_id_str.into(),
+            desired_primary: desired_primary_str.into(),
+            heartbeat_interval: Duration::from_millis(1000),
+        })
+    }
+}
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
new file mode 100644
index 00000000..a67e5b99
--- /dev/null
+++ b/harmony_agent/src/main.rs
@@ -0,0 +1,24 @@
+use crate::{agent::HarmonyAgent, config::AgentConfig};
+
+mod agent;
+mod config;
+
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    env_logger::init();
+
+    let config = AgentConfig::load_from_env()?;
+
+    log::info!("Harmony Agent Initialized");
+    log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id);
+    log::debug!("NATS URL                : {}", config.nats_url);
+
+    let agent = HarmonyAgent::new(config).await?;
+    
+    // Run the heartbeat loop
+    agent.run_heartbeat_loop().await?;
+
+    Ok(())
+}
+
-- 
2.39.5


From b2f07737957d66ca10e4d4ea850173eb1db07700 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Fri, 23 Jan 2026 09:34:58 -0500
Subject: [PATCH 02/19] wip: Working on backend app deployment

---
 .dockerignore                                 |   6 +-
 examples/openbao/src/main.rs                  |   2 +
 .../src/modules/application/backend_app.rs    | 246 ++++++++++++++++++
 harmony/src/modules/application/config.rs     |  14 +
 .../features/packaging_deployment.rs          |  24 +-
 harmony/src/modules/application/helm/mod.rs   | 119 +++++++++
 harmony/src/modules/application/mod.rs        |  15 ++
 harmony/src/modules/application/oci.rs        |  75 +++++-
 harmony/src/modules/application/rust.rs       |  94 +------
 harmony/templates/helm/Chart.yaml.j2          |   6 +
 harmony/templates/helm/deployment.yaml.j2     |  37 +++
 harmony/templates/helm/helpers.yaml.j2        |   8 +
 harmony/templates/helm/service.yaml.j2        |  15 ++
 harmony_types/src/id.rs                       |   8 +
 14 files changed, 566 insertions(+), 103 deletions(-)
 create mode 100644 harmony/src/modules/application/backend_app.rs
 create mode 100644 harmony/src/modules/application/config.rs
 create mode 100644 harmony/src/modules/application/helm/mod.rs
 create mode 100644 harmony/templates/helm/Chart.yaml.j2
 create mode 100644 harmony/templates/helm/deployment.yaml.j2
 create mode 100644 harmony/templates/helm/helpers.yaml.j2
 create mode 100644 harmony/templates/helm/service.yaml.j2

diff --git a/.dockerignore b/.dockerignore
index 2233067c..34513768 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,6 @@
 target/
-Dockerfile
\ No newline at end of file
+Dockerfile
+.git
+data
+target
+demos
diff --git a/examples/openbao/src/main.rs b/examples/openbao/src/main.rs
index 63918b81..ab8c0efa 100644
--- a/examples/openbao/src/main.rs
+++ b/examples/openbao/src/main.rs
@@ -56,6 +56,8 @@ async fn main() {
         )),
     };
 
+    // TODO exec pod commands to initialize secret store if not already done
+
     harmony_cli::run(
         Inventory::autoload(),
         K8sAnywhereTopology::from_env(),
diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs
new file mode 100644
index 00000000..1e3dbe78
--- /dev/null
+++ b/harmony/src/modules/application/backend_app.rs
@@ -0,0 +1,246 @@
+use std::{ffi::OsStr, path::PathBuf};
+
+use async_trait::async_trait;
+use log::{debug, info, trace};
+use serde::Serialize;
+
+use crate::{
+    config::{REGISTRY_PROJECT, REGISTRY_URL},
+    modules::application::{
+        Application, HelmPackage, OCICompliant,
+        config::ApplicationNetworkPort,
+        helm::{DeploymentTemplate, HelmChart, HelmTemplate, ServiceTemplate},
+        webapp::Webapp,
+    },
+};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct BuildCommand {
+    pub program: String,
+    pub args: Vec<String>,
+}
+
+impl BuildCommand {
+    pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
+        Self {
+            program: program.into(),
+            args: args.into_iter().map(|s| s.into()).collect(),
+        }
+    }
+
+    pub fn to_std_command(&self) -> std::process::Command {
+        let mut cmd = std::process::Command::new(&self.program);
+        cmd.args(&self.args);
+        cmd
+    }
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct BackendApp {
+    pub name: String,
+    pub project_root: std::path::PathBuf,
+    pub network_ports: Vec<ApplicationNetworkPort>,
+    pub env_vars: Vec<(String, String)>,
+    pub build_cmd: BuildCommand,
+    pub dockerfile: Option<PathBuf>,
+}
+
+impl BackendApp {
+    fn get_dockerfile(&self) -> Result<PathBuf, String> {
+        debug!(
+            "Looking for dockerfile, currently set to {:?}",
+            self.dockerfile
+        );
+        if let Some(dockerfile) = &self.dockerfile {
+            return match dockerfile.exists() {
+                true => {
+                    info!(
+                        "Found dockerfile as intended at {}",
+                        dockerfile.to_string_lossy()
+                    );
+                    Ok(dockerfile.clone())
+                }
+                false => Err(format!(
+                    "Dockerfile explicitely set to {dockerfile} does not exist",
+                    dockerfile = dockerfile.to_string_lossy()
+                )),
+            };
+        }
+
+        let existing_dockerfile = self.project_root.join("Dockerfile");
+
+        debug!("project_root = {:?}", self.project_root);
+
+        debug!("checking = {:?}", existing_dockerfile);
+        if existing_dockerfile.exists() {
+            debug!(
+                "Checking path {:#?} for existing Dockerfile",
+                self.project_root.clone()
+            );
+            return Ok(existing_dockerfile);
+        }
+        Err(format!(
+            "Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}",
+            project_root = self.project_root.to_string_lossy(),
+            existing_dockerfile = existing_dockerfile.to_string_lossy(),
+        ))
+    }
+}
+
+impl Application for BackendApp {
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+}
+
+#[async_trait]
+impl OCICompliant for BackendApp {
+    async fn build_push_oci_image(&self) -> Result<String, String> {
+        let dockerfile = self.get_dockerfile()?;
+        let image_tag = self.image_name();
+
+        let mut child = std::process::Command::new("docker")
+            .args([
+                "build",
+                "-t",
+                &image_tag,
+                "-f",
+                &dockerfile.to_string_lossy(),
+                &self.project_root.to_string_lossy()
+            ])
+            .stdout(std::process::Stdio::piped())
+            .stderr(std::process::Stdio::piped())
+            .spawn()
+            .map_err(|e|
+                format!("Failed to spawn docker build process: {e}")
+            )?;
+
+        let stdout = child.stdout.take().expect("Failed to capture stdout");
+        let stderr = child.stderr.take().expect("Failed to capture stderr");
+
+        use std::io::{BufRead, BufReader};
+        use std::thread;
+
+        let stdout_reader = BufReader::new(stdout);
+        let stderr_reader = BufReader::new(stderr);
+
+        let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
+        let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
+
+        let stdout_handle = thread::spawn(move || {
+            let mut output = String::new();
+            for line in stdout_reader.lines() {
+                match line {
+                    Ok(l) => {
+                        println!("{}", l);
+                        output.push_str(&l);
+                        output.push('\n');
+                    }
+                    Err(e) => {
+                        trace!("Error reading stdout line: {}", e);
+                    }
+                }
+            }
+            let _ = stdout_sender.send(output);
+        });
+
+        let stderr_handle = thread::spawn(move || {
+            let mut output = String::new();
+            for line in stderr_reader.lines() {
+                match line {
+                    Ok(l) => {
+                        eprintln!("{}", l);
+                        output.push_str(&l);
+                        output.push('\n');
+                    }
+                    Err(e) => {
+                        trace!("Error reading stderr line: {}", e);
+                    }
+                }
+            }
+            let _ = stderr_sender.send(output);
+        });
+
+        let status = child.wait().map_err(|e|
+            format!("Failed to wait for docker build process: {e}")
+        )?;
+
+        let stdout_lines = stdout_handle.join().map_err(|e| format!("Stdout thread panicked: {e:?}"))
+            .and_then(|_| stdout_receiver.recv().map_err(|e| format!("Failed to receive stdout: {e}")))?;
+        let stderr_lines = stderr_handle.join().map_err(|e| format!("Stderr thread panicked: {e:?}"))
+            .and_then(|_| stderr_receiver.recv().map_err(|e| format!("Failed to receive stderr: {e}")))?;
+
+        let output_content = format!(
+            "\n{stdout}\n\n{stderr}",
+            stdout = stdout_lines,
+            stderr = stderr_lines,
+        );
+        match status.success() {
+            true => {
+                info!("Docker image build succeeded");
+                Ok(image_tag)
+            }
+            false => Err(format!("Docker image build FAILED :{output_content}")),
+        }
+    }
+
+    fn local_image_name(&self) -> String {
+        self.name.clone()
+    }
+
+    fn image_name(&self) -> String {
+        format!(
+            "{}/{}/{}",
+            *REGISTRY_URL,
+            *REGISTRY_PROJECT,
+            &self.local_image_name()
+        )
+    }
+}
+
+#[async_trait]
+impl HelmPackage for BackendApp {
+    fn project_root(&self) -> PathBuf {
+        self.project_root.clone()
+    }
+
+    fn chart_name(&self) -> String {
+        self.name.clone()
+    }
+
+    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
+        let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
+
+        // Extract the first network port for the container port (if available)
+        let container_port = self.network_ports.first().map(|p| p.number);
+
+        // Create and add DeploymentTemplate with image URL and environment variables
+        let deployment = DeploymentTemplate {
+            name: self.name.clone(),
+            container_port,
+            env_vars: self.env_vars.clone(),
+        };
+        helm_chart.add_template(Box::new(deployment));
+
+        // Create and add ServiceTemplate if a port is available
+        if let Some(port) = container_port {
+            let service = ServiceTemplate { port };
+            helm_chart.add_template(Box::new(service));
+        }
+
+        // Add common Helm values
+        helm_chart.add_value("replicaCount", "1");
+        helm_chart.add_value("image.repository", image_url);
+        helm_chart.add_value("image.pullPolicy", "IfNotPresent");
+        helm_chart.add_value("service.type", "ClusterIP");
+
+        // Write the Helm chart to the project root
+        let chart_dir = helm_chart
+            .write_to(&self.project_root.join(".harmony_generated/helm/"))
+            .map_err(|e| format!("Failed to write Helm chart: {}", e))?;
+
+        info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);
+
+        Ok(chart_dir.to_string_lossy().to_string())
+    }
+}
diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs
new file mode 100644
index 00000000..d35ab604
--- /dev/null
+++ b/harmony/src/modules/application/config.rs
@@ -0,0 +1,14 @@
+use serde::Serialize;
+
+#[derive(Debug, Clone, Serialize)]
+pub enum NetworkProtocol {
+    TCP,
+    UDP,
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct ApplicationNetworkPort {
+    pub number: u16,
+    pub protocol: NetworkProtocol,
+    pub name: String,
+}
diff --git a/harmony/src/modules/application/features/packaging_deployment.rs b/harmony/src/modules/application/features/packaging_deployment.rs
index 03f11000..2f107462 100644
--- a/harmony/src/modules/application/features/packaging_deployment.rs
+++ b/harmony/src/modules/application/features/packaging_deployment.rs
@@ -48,11 +48,11 @@ use crate::{
 /// - ArgoCD to install/upgrade/rollback/inspect k8s resources
 /// - Kubernetes for runtime orchestration
 #[derive(Debug, Default, Clone)]
-pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> {
+pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
     pub application: Arc<A>,
 }
 
-impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
+impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
     async fn deploy_to_local_k3d(
         &self,
         app_name: String,
@@ -138,7 +138,7 @@ impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
 
 #[async_trait]
 impl<
-    A: OCICompliant + HelmPackage + Webapp + Clone + 'static,
+    A: OCICompliant + HelmPackage + Clone + 'static,
     T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
 > ApplicationFeature<T> for PackagingDeployment<A>
 {
@@ -148,24 +148,12 @@ impl<
     ) -> Result<InstallationOutcome, InstallationError> {
         let image = self.application.image_name();
 
-        let domain = if topology.current_target() == DeploymentTarget::Production {
-            self.application.dns()
-        } else {
-            topology
-                .get_domain(&self.application.name())
-                .await
-                .map_err(|e| e.to_string())?
-        };
-
         // TODO Write CI/CD workflow files
         // we can autotedect the CI type using the remote url (default to github action for github
         // url, etc..)
         // Or ask for it when unknown
 
-        let helm_chart = self
-            .application
-            .build_push_helm_package(&image, &domain)
-            .await?;
+        let helm_chart = self.application.build_push_helm_package(&image).await?;
 
         // TODO: Make building image configurable/skippable if image already exists (prompt)")
         // https://git.nationtech.io/NationTech/harmony/issues/104
@@ -215,12 +203,12 @@ impl<
         };
 
         Ok(InstallationOutcome::success_with_details(vec![format!(
-            "{}: http://{domain}",
+            "{}",
             self.application.name()
         )]))
     }
     fn name(&self) -> String {
-        "ContinuousDelivery".to_string()
+        "PackagingDeployment".to_string()
     }
 }
 
diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs
new file mode 100644
index 00000000..da40b0b6
--- /dev/null
+++ b/harmony/src/modules/application/helm/mod.rs
@@ -0,0 +1,119 @@
+use askama::Template;
+use std::fs;
+use std::path::{Path, PathBuf};
+
+/// Trait for any resource that can be rendered into a file in the Helm chart.
+pub trait HelmTemplate: Send + Sync {
+    fn filename(&self) -> String;
+    fn render_template(&self) -> Result<String, askama::Error>;
+}
+
+/// The main orchestrator for building a Helm chart.
+pub struct HelmChart {
+    pub name: String,
+    pub version: String,
+    pub app_version: String,
+    pub description: String,
+    pub templates: Vec<Box<dyn HelmTemplate>>,
+    pub values: Vec<String>,
+}
+
+impl HelmChart {
+    pub fn new(name: String, app_version: String) -> Self {
+        Self {
+            name: name.clone(),
+            version: "0.1.0".to_string(),
+            app_version,
+            description: format!("A Helm chart for {}", name),
+            templates: Vec::new(),
+            values: Vec::new(),
+        }
+    }
+
+    pub fn add_template(&mut self, template: Box<dyn HelmTemplate>) {
+        self.templates.push(template);
+    }
+
+    pub fn add_value(&mut self, key: &str, value: &str) {
+        self.values.push(format!("{}: {}", key, value));
+    }
+
+    pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
+        let chart_dir = base_path.join(&self.name);
+        let templates_dir = chart_dir.join("templates");
+        fs::create_dir_all(&templates_dir)?;
+
+        // 1. Render and write Chart.yaml
+        let chart_yaml = ChartYaml {
+            name: &self.name,
+            description: &self.description,
+            version: &self.version,
+            app_version: &self.app_version,
+        };
+        fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
+
+        // 2. Write values.yaml (Constructed dynamically)
+        let values_content = self.values.join("\n");
+        fs::write(chart_dir.join("values.yaml"), values_content)?;
+
+        // 3. Render and write _helpers.tpl
+        let helpers = HelpersTpl;
+        fs::write(templates_dir.join("_helpers.tpl"), helpers.render()?)?;
+
+        // 4. Render and write all added templates (Deployment, Service, etc.)
+        for template in &self.templates {
+            let filename = template.filename();
+            let content = template.render_template()?;
+            fs::write(templates_dir.join(filename), content)?;
+        }
+
+        Ok(chart_dir)
+    }
+}
+
+// --- Templates ---
+
+#[derive(Template)]
+#[template(path = "helm/Chart.yaml.j2")]
+struct ChartYaml<'a> {
+    name: &'a str,
+    description: &'a str,
+    version: &'a str,
+    app_version: &'a str,
+}
+
+#[derive(Template)]
+#[template(path = "helm/helpers.yaml.j2")]
+struct HelpersTpl;
+
+#[derive(Template)]
+#[template(path = "helm/deployment.yaml.j2")]
+pub struct DeploymentTemplate {
+    pub name: String,
+    pub container_port: Option<u16>,
+    pub env_vars: Vec<(String, String)>,
+}
+
+impl HelmTemplate for DeploymentTemplate {
+    fn filename(&self) -> String {
+        "deployment.yaml".to_string()
+    }
+    fn render_template(&self) -> Result<String, askama::Error> {
+        self.render()
+    }
+}
+
+#[derive(Template)]
+#[template(path = "helm/service.yaml.j2")]
+pub struct ServiceTemplate {
+    pub port: u16, // Used only to enforce logic if needed, though template uses Values
+}
+
+impl HelmTemplate for ServiceTemplate {
+    fn filename(&self) -> String {
+        "service.yaml".to_string()
+    }
+    fn render_template(&self) -> Result<String, askama::Error> {
+        self.render()
+    }
+}
diff --git a/harmony/src/modules/application/mod.rs b/harmony/src/modules/application/mod.rs
index 03965e35..00e85843 100644
--- a/harmony/src/modules/application/mod.rs
+++ b/harmony/src/modules/application/mod.rs
@@ -1,8 +1,11 @@
+pub mod backend_app;
+pub mod config;
 mod feature;
 pub mod features;
 pub mod oci;
 mod rust;
 mod webapp;
+pub mod helm;
 use std::sync::Arc;
 
 pub use feature::*;
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
         todo!()
     }
 }
+
+/// Checks the output of a process command for success.
+fn check_output(
+    output: &std::process::Output,
+    msg: &str,
+) -> Result<(), Box<dyn std::error::Error>> {
+    if !output.status.success() {
+        let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
+        return Err(error_message.into());
+    }
+    Ok(())
+}
diff --git a/harmony/src/modules/application/oci.rs b/harmony/src/modules/application/oci.rs
index 8b1585c8..102bcd8c 100644
--- a/harmony/src/modules/application/oci.rs
+++ b/harmony/src/modules/application/oci.rs
@@ -1,5 +1,10 @@
+use std::path::{Path, PathBuf};
+
+use crate::{config::{REGISTRY_PROJECT, REGISTRY_URL}, modules::application::check_output};
+
 use super::Application;
 use async_trait::async_trait;
+use log::debug;
 
 #[async_trait]
 pub trait OCICompliant: Application {
@@ -20,6 +25,74 @@ pub trait HelmPackage: Application {
     async fn build_push_helm_package(
         &self,
         image_url: &str,
-        domain: &str,
     ) -> Result<String, String>;
+
+    fn project_root(&self) -> PathBuf;
+
+    fn chart_name(&self) -> String;
+
+    /// Packages a Helm chart directory into a .tgz file.
+    fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
+        let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
+        debug!(
+            "Launching `helm package {}` cli with CWD {}",
+            chart_dirname.to_string_lossy(),
+            &self
+                .project_root()
+                .join(".harmony_generated")
+                .join("helm")
+                .to_string_lossy()
+        );
+        let output = std::process::Command::new("helm")
+            .args(["package", chart_dirname.to_str().unwrap()])
+            .current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
+            .output()?;
+
+        check_output(&output, "Failed to package Helm chart")?;
+
+        // Helm prints the path of the created chart to stdout.
+        let tgz_name = String::from_utf8(output.stdout)?
+            .split_whitespace()
+            .last()
+            .unwrap_or_default()
+            .to_string();
+        if tgz_name.is_empty() {
+            return Err("Could not determine packaged chart filename.".into());
+        }
+
+        // The output from helm is relative, so we join it with the execution directory.
+        Ok(self
+            .project_root()
+            .join(".harmony_generated")
+            .join("helm")
+            .join(tgz_name))
+    }
+
+    /// Pushes a packaged Helm chart to an OCI registry.
+    fn push_helm_chart(
+        &self,
+        packaged_chart_path: &Path,
+    ) -> Result<String, Box<dyn std::error::Error>> {
+        // The chart name is the file stem of the .tgz file
+        let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
+        let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
+        let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
+        debug!(
+            "Pushing Helm chart {} to {}",
+            packaged_chart_path.to_string_lossy(),
+            oci_push_url
+        );
+
+        let output = std::process::Command::new("helm")
+            .args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
+            .output()?;
+
+        check_output(&output, "Pushing Helm chart failed")?;
+
+        // The final URL includes the version tag, which is part of the file name
+        let version = chart_file_name.rsplit_once('-').unwrap().1;
+        debug!("pull url {oci_pull_url}");
+        debug!("push url {oci_push_url}");
+        Ok(format!("{}:{}", oci_pull_url, version))
+    }
 }
diff --git a/harmony/src/modules/application/rust.rs b/harmony/src/modules/application/rust.rs
index 8384e78a..4e41187c 100644
--- a/harmony/src/modules/application/rust.rs
+++ b/harmony/src/modules/application/rust.rs
@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
 
 #[async_trait]
 impl HelmPackage for RustWebapp {
-    async fn build_push_helm_package(
-        &self,
-        image_url: &str,
-        domain: &str,
-    ) -> Result<String, String> {
+    fn project_root(&self) -> PathBuf {
+        self.project_root.clone()
+    }
+
+    fn chart_name(&self) -> String {
+        self.name.clone()
+    }
+
+    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
+        let domain = self.dns();
         info!("Starting Helm chart build and push for '{}'", self.name);
 
         // 1. Create the Helm chart files on disk.
         let chart_dir = self
-            .create_helm_chart_files(image_url, domain)
+            .create_helm_chart_files(image_url, &domain)
             .await
             .map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
         info!("Successfully created Helm chart files in {:?}", chart_dir);
@@ -327,19 +332,6 @@ impl RustWebapp {
         Ok(image_tag.to_string())
     }
 
-    /// Checks the output of a process command for success.
-    fn check_output(
-        &self,
-        output: &process::Output,
-        msg: &str,
-    ) -> Result<(), Box<dyn std::error::Error>> {
-        if !output.status.success() {
-            let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
-            return Err(error_message.into());
-        }
-        Ok(())
-    }
-
     fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
         match self.framework {
             Some(RustWebFramework::Leptos) => {
@@ -640,70 +632,6 @@ spec:
         Ok(chart_dir)
     }
 
-    /// Packages a Helm chart directory into a .tgz file.
-    fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
-        let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
-        debug!(
-            "Launching `helm package {}` cli with CWD {}",
-            chart_dirname.to_string_lossy(),
-            &self
-                .project_root
-                .join(".harmony_generated")
-                .join("helm")
-                .to_string_lossy()
-        );
-        let output = process::Command::new("helm")
-            .args(["package", chart_dirname.to_str().unwrap()])
-            .current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
-            .output()?;
-
-        self.check_output(&output, "Failed to package Helm chart")?;
-
-        // Helm prints the path of the created chart to stdout.
-        let tgz_name = String::from_utf8(output.stdout)?
-            .split_whitespace()
-            .last()
-            .unwrap_or_default()
-            .to_string();
-        if tgz_name.is_empty() {
-            return Err("Could not determine packaged chart filename.".into());
-        }
-
-        // The output from helm is relative, so we join it with the execution directory.
-        Ok(self
-            .project_root
-            .join(".harmony_generated")
-            .join("helm")
-            .join(tgz_name))
-    }
-
-    /// Pushes a packaged Helm chart to an OCI registry.
-    fn push_helm_chart(
-        &self,
-        packaged_chart_path: &Path,
-    ) -> Result<String, Box<dyn std::error::Error>> {
-        // The chart name is the file stem of the .tgz file
-        let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
-        let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
-        let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
-        debug!(
-            "Pushing Helm chart {} to {}",
-            packaged_chart_path.to_string_lossy(),
-            oci_push_url
-        );
-
-        let output = process::Command::new("helm")
-            .args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
-            .output()?;
-
-        self.check_output(&output, "Pushing Helm chart failed")?;
-
-        // The final URL includes the version tag, which is part of the file name
-        let version = chart_file_name.rsplit_once('-').unwrap().1;
-        debug!("pull url {oci_pull_url}");
-        debug!("push url {oci_push_url}");
-        Ok(format!("{}:{}", oci_pull_url, version))
-    }
 
     fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
         let existing_dockerfile = self.project_root.join("Dockerfile");
diff --git a/harmony/templates/helm/Chart.yaml.j2 b/harmony/templates/helm/Chart.yaml.j2
new file mode 100644
index 00000000..bddcc93e
--- /dev/null
+++ b/harmony/templates/helm/Chart.yaml.j2
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: {{ name }}
+description: {{ description }}
+type: application
+version: {{ version }}
+appVersion: "{{ app_version }}"
diff --git a/harmony/templates/helm/deployment.yaml.j2 b/harmony/templates/helm/deployment.yaml.j2
new file mode 100644
index 00000000..b060b8f1
--- /dev/null
+++ b/harmony/templates/helm/deployment.yaml.j2
@@ -0,0 +1,37 @@
+{% raw %}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "chart.fullname" . }}
+  labels:
+    app: {{ include "chart.name" . }}
+spec:
+  replicas: {{ .Values.replicaCount | default 1 }}
+  selector:
+    matchLabels:
+      app: {{ include chart.name . }}
+  template:
+    metadata:
+      labels:
+        app: {{ include chart.name . }}
+    spec:
+      containers:
+{% endraw %}
+        - name: {{ name }}
+{% raw %}
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: IfNotPresent
+{% endraw %}
+          {% if let Some(port) = container_port %}
+          ports:
+            - name: http
+              containerPort: {{ port }}
+              protocol: TCP
+          {% endif %}
+          {% if !env_vars.is_empty() %}
+          env:
+            {% for (k, v) in env_vars %}
+            - name: {{ k }}
+              value: {{ v }}
+            {% endfor %}
+          {% endif %}
diff --git a/harmony/templates/helm/helpers.yaml.j2 b/harmony/templates/helm/helpers.yaml.j2
new file mode 100644
index 00000000..ff93848e
--- /dev/null
+++ b/harmony/templates/helm/helpers.yaml.j2
@@ -0,0 +1,8 @@
+{% raw %}
+{{- define \"chart.fullname\" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix \"-\" }}
+{{- end }}
+{{- define "chart.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{% endraw %}
diff --git a/harmony/templates/helm/service.yaml.j2 b/harmony/templates/helm/service.yaml.j2
new file mode 100644
index 00000000..c6582d22
--- /dev/null
+++ b/harmony/templates/helm/service.yaml.j2
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ "{{ include \"chart.fullname\" . }}" }}
+  labels:
+    app: {{ "{{ include \"chart.name\" . }}" }}
+spec:
+  type: ClusterIP
+  ports:
+    - port: {{ "{{ .Values.service.port }}" }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    app: {{ "{{ include \"chart.name\" . }}" }}
diff --git a/harmony_types/src/id.rs b/harmony_types/src/id.rs
index 0a829068..748c1050 100644
--- a/harmony_types/src/id.rs
+++ b/harmony_types/src/id.rs
@@ -32,6 +32,14 @@ impl Id {
     }
 }
 
+impl Into<Id> for &str {
+    fn into(self) -> Id {
+        Id {
+            value: self.to_string(),
+        }
+    }
+}
+
 impl FromStr for Id {
     type Err = ();
 
-- 
2.39.5


From c20db5b361193f5ab3ce7e0cc433d884c7e4be48 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Fri, 23 Jan 2026 11:49:32 -0500
Subject: [PATCH 03/19] doc(adr): New ADR Template hydration for strongly typed
 workload deployment

---
 ...plate-Hydration-For-Workload-Deployment.md | 141 ++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 adr/018-Template-Hydration-For-Workload-Deployment.md

diff --git a/adr/018-Template-Hydration-For-Workload-Deployment.md b/adr/018-Template-Hydration-For-Workload-Deployment.md
new file mode 100644
index 00000000..cd45ed97
--- /dev/null
+++ b/adr/018-Template-Hydration-For-Workload-Deployment.md
@@ -0,0 +1,141 @@
+# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
+
+Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
+
+Initial Date: 2025-01-23
+
+Last Updated Date: 2025-01-23
+
+## Status
+
+Implemented
+
+## Context
+
+Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time.
+
+After investigating a few approaches such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found again that this approach suffered from several fundamental limitations:
+
+*   **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
+*   **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
+*   **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
+*   **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
+
+We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
+
+## Decision
+
+We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
+
+Specifically:
+
+*   **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
+*   **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
+*   **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
+*   **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
+
+The implementation in `backend_app.rs` demonstrates this pattern:
+
+```rust
+let deployment = Deployment {
+    metadata: ObjectMeta {
+        name: Some(self.name.clone()),
+        labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
+        ..Default::default()
+    },
+    spec: Some(DeploymentSpec { /* ... */ }),
+    ..Default::default()
+};
+
+let deployment_yaml = serde_yaml::to_string(&deployment)?;
+fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
+```
+
+## Rationale
+
+**Aligns with "Infrastructure as Resilient Code"**
+
+Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
+
+*   **Refactorability:** Rename a label and the compiler catches all usages.
+*   **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
+*   **Code Navigation:** Jump to definition shows exactly where a value comes from.
+
+**Achieves "Prove It Works — Before You Deploy"**
+
+The compiler now validates that:
+
+*   All required fields are populated (Rust's `Option` type prevents missing fields).
+*   Field types match expectations (ports are integers, not strings).
+*   Enums contain valid values (e.g., `ServiceType::ClusterIP`).
+
+This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
+
+**Enables True Unit Testing**
+
+Developers can now write unit tests that assert directly against typed objects:
+
+```rust
+let deployment = create_deployment(&app);
+assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
+assert_eq!(deployment.metadata.name.unwrap(), "my-app");
+```
+
+No string parsing, no YAML serialization, no fragile assertions against rendered output.
+
+**Preserves Ecosystem Benefits**
+
+By generating standard Helm chart structures, Harmony retains compatibility with:
+
+*   **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
+*   **ArgoCD:** Syncs and manages releases using the generated charts.
+*   **Existing Workflows:** Teams already consuming Helm charts see no change.
+
+The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
+
+## Consequences
+
+### Positive
+
+*   **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
+*   **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
+*   **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
+*   **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
+*   **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
+
+### Negative
+
+*   **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
+*   **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
+*   **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
+*   **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model.
+
+## Alternatives Considered
+
+### 1. Enhance Askama with Compile-Time Validation
+*Pros:* Stay within familiar templating paradigm; minimal code changes.
+*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
+
+### 2. Use Helm SDK Programmatically (Go)
+*Pros:* Direct access to Helm's template engine; no YAML serialization step.
+*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
+
+### 3. Raw YAML String Templating (Manual)
+*Pros:* Maximum control; no external dependencies.
+*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
+
+### 4. Use Kustomize for All Manifests
+*Pros:* Declarative overlays; standard tool.
+*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
+
+__Note that this template hydration architecture still allows to override templates with tools like kustomize when required__
+
+## Additional Notes
+
+**Scalability to Future Topologies**
+
+The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
+
+**Implementation Status**
+
+As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.
-- 
2.39.5


From ab68e7309d30781c394b1cfb581b2db2efd63b65 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Fri, 23 Jan 2026 23:31:37 -0500
Subject: [PATCH 04/19] feat: Use k8s openapi structs as helm chart resources
 following ADR 018

---
 .../src/modules/application/backend_app.rs    | 461 ++++++++++++++++--
 harmony/src/modules/application/config.rs     |   9 +
 harmony/src/modules/application/helm/mod.rs   | 329 +++++++++++--
 harmony/templates/helm/deployment.yaml.j2     |  37 --
 harmony/templates/helm/helpers.yaml.j2        |   8 -
 harmony/templates/helm/service.yaml.j2        |  15 -
 6 files changed, 720 insertions(+), 139 deletions(-)
 delete mode 100644 harmony/templates/helm/deployment.yaml.j2
 delete mode 100644 harmony/templates/helm/helpers.yaml.j2
 delete mode 100644 harmony/templates/helm/service.yaml.j2

diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs
index 1e3dbe78..83e24f5b 100644
--- a/harmony/src/modules/application/backend_app.rs
+++ b/harmony/src/modules/application/backend_app.rs
@@ -1,5 +1,4 @@
-use std::{ffi::OsStr, path::PathBuf};
-
+use std::path::PathBuf;
 use async_trait::async_trait;
 use log::{debug, info, trace};
 use serde::Serialize;
@@ -7,10 +6,7 @@ use serde::Serialize;
 use crate::{
     config::{REGISTRY_PROJECT, REGISTRY_URL},
     modules::application::{
-        Application, HelmPackage, OCICompliant,
-        config::ApplicationNetworkPort,
-        helm::{DeploymentTemplate, HelmChart, HelmTemplate, ServiceTemplate},
-        webapp::Webapp,
+        config::ApplicationNetworkPort, helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind}, Application, HelmPackage, OCICompliant
     },
 };
 
@@ -106,14 +102,12 @@ impl OCICompliant for BackendApp {
                 &image_tag,
                 "-f",
                 &dockerfile.to_string_lossy(),
-                &self.project_root.to_string_lossy()
+                &self.project_root.to_string_lossy(),
             ])
             .stdout(std::process::Stdio::piped())
             .stderr(std::process::Stdio::piped())
             .spawn()
-            .map_err(|e|
-                format!("Failed to spawn docker build process: {e}")
-            )?;
+            .map_err(|e| format!("Failed to spawn docker build process: {e}"))?;
 
         let stdout = child.stdout.take().expect("Failed to capture stdout");
         let stderr = child.stderr.take().expect("Failed to capture stderr");
@@ -161,14 +155,26 @@ impl OCICompliant for BackendApp {
             let _ = stderr_sender.send(output);
         });
 
-        let status = child.wait().map_err(|e|
-            format!("Failed to wait for docker build process: {e}")
-        )?;
+        let status = child
+            .wait()
+            .map_err(|e| format!("Failed to wait for docker build process: {e}"))?;
 
-        let stdout_lines = stdout_handle.join().map_err(|e| format!("Stdout thread panicked: {e:?}"))
-            .and_then(|_| stdout_receiver.recv().map_err(|e| format!("Failed to receive stdout: {e}")))?;
-        let stderr_lines = stderr_handle.join().map_err(|e| format!("Stderr thread panicked: {e:?}"))
-            .and_then(|_| stderr_receiver.recv().map_err(|e| format!("Failed to receive stderr: {e}")))?;
+        let stdout_lines = stdout_handle
+            .join()
+            .map_err(|e| format!("Stdout thread panicked: {e:?}"))
+            .and_then(|_| {
+                stdout_receiver
+                    .recv()
+                    .map_err(|e| format!("Failed to receive stdout: {e}"))
+            })?;
+        let stderr_lines = stderr_handle
+            .join()
+            .map_err(|e| format!("Stderr thread panicked: {e:?}"))
+            .and_then(|_| {
+                stderr_receiver
+                    .recv()
+                    .map_err(|e| format!("Failed to receive stderr: {e}"))
+            })?;
 
         let output_content = format!(
             "\n{stdout}\n\n{stderr}",
@@ -211,30 +217,38 @@ impl HelmPackage for BackendApp {
     async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
         let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
 
-        // Extract the first network port for the container port (if available)
-        let container_port = self.network_ports.first().map(|p| p.number);
+        // Build the typed Deployment object using the builder
+        let mut deployment_builder = DeploymentBuilder::new(&self.name, image_url);
 
-        // Create and add DeploymentTemplate with image URL and environment variables
-        let deployment = DeploymentTemplate {
-            name: self.name.clone(),
-            container_port,
-            env_vars: self.env_vars.clone(),
-        };
-        helm_chart.add_template(Box::new(deployment));
-
-        // Create and add ServiceTemplate if a port is available
-        if let Some(port) = container_port {
-            let service = ServiceTemplate { port };
-            helm_chart.add_template(Box::new(service));
+        // Add container ports
+        for port in &self.network_ports {
+            deployment_builder = deployment_builder.with_container_port(
+                port.number as i32,
+                &port.name,
+                port.protocol.as_str(),
+            );
         }
 
-        // Add common Helm values
-        helm_chart.add_value("replicaCount", "1");
-        helm_chart.add_value("image.repository", image_url);
-        helm_chart.add_value("image.pullPolicy", "IfNotPresent");
-        helm_chart.add_value("service.type", "ClusterIP");
+        // Add environment variables
+        for (key, value) in &self.env_vars {
+            deployment_builder = deployment_builder.with_env_var(key, value);
+        }
 
-        // Write the Helm chart to the project root
+        let deployment = deployment_builder.build();
+        helm_chart.add_resource(HelmResourceKind::Deployment(deployment));
+
+        // Build the typed Service object using the helper function
+        let network_ports: Vec<(String, u16, String)> = self
+            .network_ports
+            .iter()
+            .map(|p| (p.name.clone(), p.number, p.protocol.as_str().to_string()))
+            .collect();
+
+        if let Some(service) = helm::create_service_from_ports(self.name.clone(), &network_ports) {
+            helm_chart.add_resource(HelmResourceKind::Service(service));
+        }
+
+        // Write the Helm chart metadata to the project root
         let chart_dir = helm_chart
             .write_to(&self.project_root.join(".harmony_generated/helm/"))
             .map_err(|e| format!("Failed to write Helm chart: {}", e))?;
@@ -244,3 +258,376 @@ impl HelmPackage for BackendApp {
         Ok(chart_dir.to_string_lossy().to_string())
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::modules::application::config::ApplicationNetworkPort;
+    use crate::modules::application::config::NetworkProtocol;
+    use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
+    use serde_yaml::from_str;
+    use k8s_openapi::api::core::v1::Service as K8sService;
+    use k8s_openapi::api::apps::v1::Deployment;
+    use std::fs;
+
+    fn cleanup_test_dirs(project_root: &PathBuf) {
+        let helm_dir = project_root.join(".harmony_generated/helm/");
+        if helm_dir.exists() {
+            let _ = fs::remove_dir_all(&helm_dir);
+        }
+    }
+
+    fn create_test_backend_app_with_ports() -> BackendApp {
+        BackendApp {
+            name: "test-app".to_string(),
+            project_root: "/tmp/test_backend".into(),
+            network_ports: vec![
+                ApplicationNetworkPort {
+                    number: 8080,
+                    protocol: NetworkProtocol::TCP,
+                    name: "http".to_string(),
+                },
+                ApplicationNetworkPort {
+                    number: 9000,
+                    protocol: NetworkProtocol::TCP,
+                    name: "metrics".to_string(),
+                },
+                ApplicationNetworkPort {
+                    number: 50051,
+                    protocol: NetworkProtocol::TCP,
+                    name: "grpc".to_string(),
+                },
+            ],
+            env_vars: vec![
+                ("ENV_VAR_1".to_string(), "value1".to_string()),
+                ("ENV_VAR_2".to_string(), "value2".to_string()),
+            ],
+            build_cmd: BuildCommand::new("cargo", vec!["build"]),
+            dockerfile: None,
+        }
+    }
+
+    fn create_test_backend_app_no_ports() -> BackendApp {
+        BackendApp {
+            name: "test-app-no-ports".to_string(),
+            project_root: "/tmp/test_backend_no_ports".into(),
+            network_ports: vec![],
+            env_vars: vec![("ENV_VAR_1".to_string(), "value1".to_string())],
+            build_cmd: BuildCommand::new("cargo", vec!["build"]),
+            dockerfile: None,
+        }
+    }
+
+    #[tokio::test]
+    async fn test_service_created_with_all_network_ports() {
+        let app = create_test_backend_app_with_ports();
+        let test_dir = app.project_root.clone();
+
+        cleanup_test_dirs(&test_dir);
+
+        let result = app
+            .build_push_helm_package("registry.example.com/test/test-app:1.0.0")
+            .await;
+
+        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
+
+        let service_yaml_path = test_dir
+            .join(".harmony_generated/helm/test-app/templates/service.yaml");
+        assert!(
+            service_yaml_path.exists(),
+            "service.yaml should exist when there are network ports"
+        );
+
+        let service_yaml_content = fs::read_to_string(&service_yaml_path)
+            .expect("Failed to read service.yaml");
+
+        let service: K8sService = from_str(&service_yaml_content)
+            .expect("Failed to parse service.yaml as K8s Service");
+
+        assert_eq!(
+            service.metadata.name.as_deref(),
+            Some("test-app"),
+            "Service name should match app name"
+        );
+        assert_eq!(
+            service.spec.as_ref().unwrap().type_.as_deref(),
+            Some("ClusterIP"),
+            "Service type should be ClusterIP"
+        );
+
+        let ports = service
+            .spec
+            .as_ref()
+            .unwrap()
+            .ports
+            .as_ref()
+            .expect("Service should have ports");
+
+        assert_eq!(ports.len(), 3, "Service should have 3 ports");
+
+        let http_port = &ports[0];
+        assert_eq!(http_port.name.as_deref(), Some("http"), "First port name should be 'http'");
+        assert_eq!(http_port.protocol.as_deref(), Some("TCP"), "First port protocol should be 'TCP'");
+        assert_eq!(http_port.port, 8080, "First port number should be 8080");
+
+        let metrics_port = &ports[1];
+        assert_eq!(metrics_port.name.as_deref(), Some("metrics"), "Second port name should be 'metrics'");
+        assert_eq!(metrics_port.protocol.as_deref(), Some("TCP"), "Second port protocol should be 'TCP'");
+        assert_eq!(metrics_port.port, 9000, "Second port number should be 9000");
+
+        let grpc_port = &ports[2];
+        assert_eq!(grpc_port.name.as_deref(), Some("grpc"), "Third port name should be 'grpc'");
+        assert_eq!(grpc_port.protocol.as_deref(), Some("TCP"), "Third port protocol should be 'TCP'");
+        assert_eq!(grpc_port.port, 50051, "Third port number should be 50051");
+
+        for port in ports.iter() {
+            match &port.target_port {
+                Some(IntOrString::Int(target)) => {
+                    assert_eq!(
+                        *target, port.port,
+                        "Target port should match service port for {}",
+                        port.name.as_deref().unwrap_or("unknown")
+                    );
+                }
+                _ => panic!("Target port should be Int for all ports"),
+            }
+        }
+
+        cleanup_test_dirs(&test_dir);
+    }
+
+    #[tokio::test]
+    async fn test_service_not_created_when_no_network_ports() {
+        let app = create_test_backend_app_no_ports();
+        let test_dir = app.project_root.clone();
+
+        cleanup_test_dirs(&test_dir);
+
+        let result = app
+            .build_push_helm_package("registry.example.com/test/test-app-no-ports:1.0.0")
+            .await;
+
+        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
+
+        let service_yaml_path = test_dir
+            .join(".harmony_generated/helm/test-app-no-ports/templates/service.yaml");
+        assert!(
+            !service_yaml_path.exists(),
+            "service.yaml should not exist when there are no network ports"
+        );
+
+        cleanup_test_dirs(&test_dir);
+    }
+
+    #[tokio::test]
+    async fn test_deployment_created_with_correct_configuration() {
+        let app = create_test_backend_app_with_ports();
+        let test_dir = app.project_root.clone();
+
+        cleanup_test_dirs(&test_dir);
+
+        let result = app
+            .build_push_helm_package("registry.example.com/test/test-app:1.0.0")
+            .await;
+
+        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
+
+        let deployment_yaml_path = test_dir
+            .join(".harmony_generated/helm/test-app/templates/deployment.yaml");
+        assert!(deployment_yaml_path.exists(), "deployment.yaml should exist");
+
+        let deployment_yaml_content = fs::read_to_string(&deployment_yaml_path)
+            .expect("Failed to read deployment.yaml");
+
+        let deployment: Deployment = from_str(&deployment_yaml_content)
+            .expect("Failed to parse deployment.yaml as K8s Deployment");
+
+        assert_eq!(
+            deployment.metadata.name.as_deref(),
+            Some("test-app"),
+            "Deployment name should match app name"
+        );
+
+        let deployment_spec = deployment
+            .spec
+            .as_ref()
+            .expect("Deployment should have spec");
+        assert_eq!(deployment_spec.replicas, Some(1), "Replicas should be 1");
+
+        let selector = &deployment_spec.selector;
+        assert_eq!(
+            selector.match_labels.as_ref().unwrap().get("app.kubernetes.io/name"),
+            Some(&"test-app".to_string()),
+            "Selector should match app name"
+        );
+
+        let pod_spec = deployment_spec
+            .template
+            .spec
+            .as_ref()
+            .expect("Pod template should have spec");
+
+        assert_eq!(pod_spec.containers.len(), 1, "Should have exactly one container");
+
+        let container = &pod_spec.containers[0];
+        assert_eq!(container.name, "test-app", "Container name should match app name");
+        assert_eq!(
+            container.image.as_deref(),
+            Some("registry.example.com/test/test-app:1.0.0"),
+            "Container image should match provided image URL"
+        );
+        assert_eq!(
+            container.image_pull_policy.as_deref(),
+            Some("IfNotPresent"),
+            "Image pull policy should be IfNotPresent"
+        );
+
+        let container_ports = container
+            .ports
+            .as_ref()
+            .expect("Container should have ports");
+        assert_eq!(container_ports.len(), 3, "Container should have 3 ports");
+
+        assert_eq!(container_ports[0].container_port, 8080, "First container port should be 8080");
+        assert_eq!(container_ports[0].name.as_deref(), Some("http"), "First container port name should be 'http'");
+        assert_eq!(container_ports[0].protocol.as_deref(), Some("TCP"), "First container port protocol should be 'TCP'");
+
+        assert_eq!(container_ports[1].container_port, 9000, "Second container port should be 9000");
+        assert_eq!(container_ports[1].name.as_deref(), Some("metrics"), "Second container port name should be 'metrics'");
+        assert_eq!(container_ports[1].protocol.as_deref(), Some("TCP"), "Second container port protocol should be 'TCP'");
+
+        assert_eq!(container_ports[2].container_port, 50051, "Third container port should be 50051");
+        assert_eq!(container_ports[2].name.as_deref(), Some("grpc"), "Third container port name should be 'grpc'");
+        assert_eq!(container_ports[2].protocol.as_deref(), Some("TCP"), "Third container port protocol should be 'TCP'");
+
+        let env_vars = container.env.as_ref().expect("Container should have env vars");
+        assert_eq!(env_vars.len(), 2, "Container should have 2 env vars");
+
+        let env_map: std::collections::HashMap<String, String> = env_vars
+            .iter()
+            .map(|e| (e.name.clone(), e.value.clone().unwrap_or_default()))
+            .collect();
+
+        assert_eq!(
+            env_map.get("ENV_VAR_1"),
+            Some(&"value1".to_string()),
+            "ENV_VAR_1 should have correct value"
+        );
+        assert_eq!(
+            env_map.get("ENV_VAR_2"),
+            Some(&"value2".to_string()),
+            "ENV_VAR_2 should have correct value"
+        );
+
+        let pod_labels = deployment_spec
+            .template
+            .metadata
+            .as_ref()
+            .expect("Pod template should have metadata")
+            .labels
+            .as_ref()
+            .expect("Pod should have labels");
+
+        assert_eq!(
+            pod_labels.get("app.kubernetes.io/name"),
+            Some(&"test-app".to_string()),
+            "Pod should have correct app label"
+        );
+        assert_eq!(
+            pod_labels.get("app.kubernetes.io/instance"),
+            Some(&"test-app".to_string()),
+            "Pod should have correct instance label"
+        );
+
+        cleanup_test_dirs(&test_dir);
+    }
+
+    #[tokio::test]
+    async fn test_service_with_udp_protocol() {
+        let app = BackendApp {
+            name: "udp-app".to_string(),
+            project_root: "/tmp/test_udp".into(),
+            network_ports: vec![
+                ApplicationNetworkPort {
+                    number: 53,
+                    protocol: NetworkProtocol::UDP,
+                    name: "dns".to_string(),
+                },
+                ApplicationNetworkPort {
+                    number: 8080,
+                    protocol: NetworkProtocol::TCP,
+                    name: "http".to_string(),
+                },
+            ],
+            env_vars: vec![],
+            build_cmd: BuildCommand::new("cargo", vec!["build"]),
+            dockerfile: None,
+        };
+        let test_dir = app.project_root.clone();
+
+        cleanup_test_dirs(&test_dir);
+
+        let result = app
+            .build_push_helm_package("registry.example.com/test/udp-app:1.0.0")
+            .await;
+
+        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
+
+        let service_yaml_path = test_dir
+            .join(".harmony_generated/helm/udp-app/templates/service.yaml");
+        assert!(service_yaml_path.exists(), "service.yaml should exist");
+
+        let service_yaml_content = fs::read_to_string(&service_yaml_path)
+            .expect("Failed to read service.yaml");
+
+        let service: K8sService = from_str(&service_yaml_content)
+            .expect("Failed to parse service.yaml as K8s Service");
+
+        let ports = service
+            .spec
+            .as_ref()
+            .unwrap()
+            .ports
+            .as_ref()
+            .expect("Service should have ports");
+
+        assert_eq!(ports.len(), 2, "Service should have 2 ports");
+
+        let dns_port = &ports[0];
+        assert_eq!(dns_port.name.as_deref(), Some("dns"), "DNS port name should be 'dns'");
+        assert_eq!(
+            dns_port.protocol.as_deref(),
+            Some("UDP"),
+            "DNS port protocol should be 'UDP'"
+        );
+        assert_eq!(dns_port.port, 53, "DNS port number should be 53");
+
+        let http_port = &ports[1];
+        assert_eq!(http_port.name.as_deref(), Some("http"), "HTTP port name should be 'http'");
+        assert_eq!(
+            http_port.protocol.as_deref(),
+            Some("TCP"),
+            "HTTP port protocol should be 'TCP'"
+        );
+        assert_eq!(http_port.port, 8080, "HTTP port number should be 8080");
+
+        cleanup_test_dirs(&test_dir);
+    }
+
+    #[test]
+    fn test_build_command_creation() {
+        let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
+        assert_eq!(cmd.program, "docker");
+        assert_eq!(cmd.args, vec!["build", "-t", "myimage"]);
+    }
+
+    #[test]
+    fn test_build_command_clone() {
+        let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]);
+        let cmd2 = cmd1.clone();
+        assert_eq!(cmd1.program, cmd2.program);
+        assert_eq!(cmd1.args, cmd2.args);
+    }
+}
+
diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs
index d35ab604..c01ebaba 100644
--- a/harmony/src/modules/application/config.rs
+++ b/harmony/src/modules/application/config.rs
@@ -6,6 +6,15 @@ pub enum NetworkProtocol {
     UDP,
 }
 
+impl NetworkProtocol {
+    pub fn as_str(&self) -> &str {
+        match self {
+            NetworkProtocol::TCP => "TCP",
+            NetworkProtocol::UDP => "UDP",
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub struct ApplicationNetworkPort {
     pub number: u16,
diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs
index da40b0b6..fd14d1e7 100644
--- a/harmony/src/modules/application/helm/mod.rs
+++ b/harmony/src/modules/application/helm/mod.rs
@@ -1,11 +1,76 @@
-use askama::Template;
+use k8s_openapi::api::{
+    apps::v1::{Deployment, DeploymentSpec},
+    core::v1::{
+        Container, ContainerPort, EnvVar, PodSpec,
+        PodTemplateSpec, Service as K8sService, ServicePort, ServiceSpec,
+    },
+};
+use kube::core::ObjectMeta;
+use serde::Serialize;
 use std::fs;
 use std::path::{Path, PathBuf};
 
-/// Trait for any resource that can be rendered into a file in the Helm chart.
-pub trait HelmTemplate: Send + Sync {
-    fn filename(&self) -> String;
-    fn render_template(&self) -> Result<String, askama::Error>;
+/// Enum representing all supported Kubernetes resource types for Helm charts.
+/// Supports built-in typed resources and custom CRDs via YAML strings.
+pub enum HelmResourceKind {
+    /// Built-in typed Service resource
+    Service(K8sService),
+    /// Built-in typed Deployment resource
+    Deployment(Deployment),
+    /// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
+    CustomYaml { filename: String, content: String },
+    // Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
+}
+
+impl HelmResourceKind {
+    pub fn filename(&self) -> String {
+        match self {
+            HelmResourceKind::Service(_) => "service.yaml".to_string(),
+            HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
+            HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
+        }
+    }
+
+    pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
+        match self {
+            HelmResourceKind::Service(s) => serde_yaml::to_string(s),
+            HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
+            HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
+        }
+    }
+
+    pub fn as_service(&self) -> Option<&K8sService> {
+        match self {
+            HelmResourceKind::Service(s) => Some(s),
+            _ => None,
+        }
+    }
+
+    pub fn as_deployment(&self) -> Option<&Deployment> {
+        match self {
+            HelmResourceKind::Deployment(d) => Some(d),
+            _ => None,
+        }
+    }
+
+    /// Add a custom resource from any serializable type (e.g., CRDs, custom types)
+    pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
+        HelmResourceKind::CustomYaml {
+            filename: filename.into(),
+            content: content.into(),
+        }
+    }
+
+    /// Add a custom resource from any type that implements Serialize
+    pub fn from_serializable<T: serde::Serialize>(
+        filename: impl Into<String>,
+        resource: &T,
+    ) -> Result<Self, serde_yaml::Error> {
+        Ok(HelmResourceKind::CustomYaml {
+            filename: filename.into(),
+            content: serde_yaml::to_string(resource)?,
+        })
+    }
 }
 
 /// The main orchestrator for building a Helm chart.
@@ -14,7 +79,7 @@ pub struct HelmChart {
     pub version: String,
     pub app_version: String,
     pub description: String,
-    pub templates: Vec<Box<dyn HelmTemplate>>,
+    pub resources: Vec<HelmResourceKind>,
     pub values: Vec<String>,
 }
 
@@ -25,13 +90,13 @@ impl HelmChart {
             version: "0.1.0".to_string(),
             app_version,
             description: format!("A Helm chart for {}", name),
-            templates: Vec::new(),
+            resources: Vec::new(),
             values: Vec::new(),
         }
     }
 
-    pub fn add_template(&mut self, template: Box<dyn HelmTemplate>) {
-        self.templates.push(template);
+    pub fn add_resource(&mut self, resource: HelmResourceKind) {
+        self.resources.push(resource);
     }
 
     pub fn add_value(&mut self, key: &str, value: &str) {
@@ -56,14 +121,11 @@ impl HelmChart {
         let values_content = self.values.join("\n");
         fs::write(chart_dir.join("values.yaml"), values_content)?;
 
-        // 3. Render and write _helpers.tpl
-        let helpers = HelpersTpl;
-        fs::write(templates_dir.join("_helpers.tpl"), helpers.render()?)?;
-
-        // 4. Render and write all added templates (Deployment, Service, etc.)
-        for template in &self.templates {
-            let filename = template.filename();
-            let content = template.render_template()?;
+        // 3. Serialize and write all added resources (Deployment, Service, etc.)
+        for resource in &self.resources {
+            let filename = resource.filename();
+            let content = resource.serialize_to_yaml()
+                .map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
             fs::write(templates_dir.join(filename), content)?;
         }
 
@@ -71,7 +133,8 @@ impl HelmChart {
     }
 }
 
-// --- Templates ---
+
+use askama::Template;
 
 #[derive(Template)]
 #[template(path = "helm/Chart.yaml.j2")]
@@ -82,38 +145,220 @@ struct ChartYaml<'a> {
     app_version: &'a str,
 }
 
-#[derive(Template)]
-#[template(path = "helm/helpers.yaml.j2")]
-struct HelpersTpl;
-
-#[derive(Template)]
-#[template(path = "helm/deployment.yaml.j2")]
-pub struct DeploymentTemplate {
-    pub name: String,
-    pub container_port: Option<u16>,
-    pub env_vars: Vec<(String, String)>,
+/// Builder for creating a Kubernetes Service with proper labels and selectors.
+pub struct ServiceBuilder {
+    name: String,
+    service_type: String,
+    ports: Vec<ServicePort>,
+    selector_label: String,
 }
 
-impl HelmTemplate for DeploymentTemplate {
-    fn filename(&self) -> String {
-        "deployment.yaml".to_string()
+impl ServiceBuilder {
+    pub fn new(name: impl Into<String>) -> Self {
+        Self {
+            name: name.into(),
+            service_type: "ClusterIP".to_string(),
+            ports: Vec::new(),
+            selector_label: String::new(),
+        }
     }
-    fn render_template(&self) -> Result<String, askama::Error> {
-        self.render()
+
+    pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
+        self.service_type = service_type.into();
+        self
+    }
+
+    pub fn with_port(mut self, name: impl Into<String>, port: i32, protocol: impl Into<String>) -> Self {
+        use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
+        self.ports.push(ServicePort {
+            name: Some(name.into()),
+            protocol: Some(protocol.into()),
+            port,
+            target_port: Some(IntOrString::Int(port)),
+            ..Default::default()
+        });
+        self
+    }
+
+    pub fn selector_label(mut self, label: impl Into<String>) -> Self {
+        self.selector_label = label.into();
+        self
+    }
+
+    pub fn build(self) -> K8sService {
+        K8sService {
+            metadata: ObjectMeta {
+                name: Some(self.name.clone()),
+                labels: Some(
+                    [
+                        ("app.kubernetes.io/name".to_string(), self.name.clone()),
+                        ("app.kubernetes.io/component".to_string(), "service".to_string()),
+                        ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()),
+                    ]
+                    .into(),
+                ),
+                ..Default::default()
+            },
+            spec: Some(ServiceSpec {
+                type_: Some(self.service_type),
+                selector: Some([("app.kubernetes.io/name".to_string(), self.selector_label)].into()),
+                ports: if self.ports.is_empty() { None } else { Some(self.ports) },
+                ..Default::default()
+            }),
+            ..Default::default()
+        }
     }
 }
 
-#[derive(Template)]
-#[template(path = "helm/service.yaml.j2")]
-pub struct ServiceTemplate {
-    pub port: u16, // Used only to enforce logic if needed, though template uses Values
+/// Builder for creating a Kubernetes Deployment with pod template and container spec.
+pub struct DeploymentBuilder {
+    name: String,
+    image: String,
+    replicas: i32,
+    container_ports: Vec<ContainerPort>,
+    env_vars: Vec<EnvVar>,
+    image_pull_policy: Option<String>,
 }
 
-impl HelmTemplate for ServiceTemplate {
-    fn filename(&self) -> String {
-        "service.yaml".to_string()
+impl DeploymentBuilder {
+    pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
+        Self {
+            name: name.into(),
+            image: image.into(),
+            replicas: 1,
+            container_ports: Vec::new(),
+            env_vars: Vec::new(),
+            image_pull_policy: Some("IfNotPresent".to_string()),
+        }
     }
-    fn render_template(&self) -> Result<String, askama::Error> {
-        self.render()
+
+    pub fn replicas(mut self, replicas: i32) -> Self {
+        self.replicas = replicas;
+        self
+    }
+
+    pub fn with_container_port(mut self, number: i32, name: impl Into<String>, protocol: impl Into<String>) -> Self {
+        self.container_ports.push(ContainerPort {
+            container_port: number,
+            name: Some(name.into()),
+            protocol: Some(protocol.into()),
+            ..Default::default()
+        });
+        self
+    }
+
+    pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
+        self.env_vars.push(EnvVar {
+            name: name.into(),
+            value: Some(value.into()),
+            ..Default::default()
+        });
+        self
+    }
+
+    pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
+        self.image_pull_policy = Some(policy.into());
+        self
+    }
+
+    pub fn build(self) -> Deployment {
+        let name = self.name.clone();
+        Deployment {
+            metadata: ObjectMeta {
+                name: Some(name.clone()),
+                labels: Some(
+                    [
+                        ("app.kubernetes.io/name".to_string(), name.clone()),
+                        ("app.kubernetes.io/component".to_string(), "deployment".to_string()),
+                        ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()),
+                        ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
+                    ]
+                    .into(),
+                ),
+                ..Default::default()
+            },
+            spec: Some(DeploymentSpec {
+                replicas: Some(self.replicas),
+                selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
+                    match_labels: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()),
+                    ..Default::default()
+                },
+                template: PodTemplateSpec {
+                    metadata: Some(ObjectMeta {
+                        labels: Some(
+                            [
+                                ("app.kubernetes.io/name".to_string(), name.clone()),
+                                ("app.kubernetes.io/instance".to_string(), name.clone()),
+                            ]
+                            .into(),
+                        ),
+                        ..Default::default()
+                    }),
+                    spec: Some(PodSpec {
+                        containers: vec![Container {
+                            name: name.clone(),
+                            image: Some(self.image),
+                            image_pull_policy: self.image_pull_policy,
+                            ports: if self.container_ports.is_empty() {
+                                None
+                            } else {
+                                Some(self.container_ports)
+                            },
+                            env: if self.env_vars.is_empty() { None } else { Some(self.env_vars) },
+                            ..Default::default()
+                        }],
+                        ..Default::default()
+                    }),
+                },
+                ..Default::default()
+            }),
+            ..Default::default()
+        }
     }
 }
+
+/// Helper function to create a Service from network port configuration.
+/// Returns `None` if no ports are provided.
+pub fn create_service_from_ports(
+    name: String,
+    network_ports: &[(String, u16, String)], // (name, number, protocol)
+) -> Option<K8sService> {
+    use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
+
+    if network_ports.is_empty() {
+        return None;
+    }
+
+    let ports: Vec<ServicePort> = network_ports
+        .iter()
+        .map(|(port_name, number, protocol)| ServicePort {
+            name: Some(port_name.clone()),
+            protocol: Some(protocol.clone()),
+            port: *number as i32,
+            target_port: Some(IntOrString::Int(*number as i32)),
+            ..Default::default()
+        })
+        .collect();
+
+    Some(K8sService {
+        metadata: ObjectMeta {
+            name: Some(name.clone()),
+            labels: Some(
+                [
+                    ("app.kubernetes.io/name".to_string(), name.clone()),
+                    ("app.kubernetes.io/component".to_string(), "service".to_string()),
+                    ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()),
+                ]
+                .into(),
+            ),
+            ..Default::default()
+        },
+        spec: Some(ServiceSpec {
+            type_: Some("ClusterIP".to_string()),
+            selector: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()),
+            ports: Some(ports),
+            ..Default::default()
+        }),
+        ..Default::default()
+    })
+}
diff --git a/harmony/templates/helm/deployment.yaml.j2 b/harmony/templates/helm/deployment.yaml.j2
deleted file mode 100644
index b060b8f1..00000000
--- a/harmony/templates/helm/deployment.yaml.j2
+++ /dev/null
@@ -1,37 +0,0 @@
-{% raw %}
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: {{ include "chart.fullname" . }}
-  labels:
-    app: {{ include "chart.name" . }}
-spec:
-  replicas: {{ .Values.replicaCount | default 1 }}
-  selector:
-    matchLabels:
-      app: {{ include chart.name . }}
-  template:
-    metadata:
-      labels:
-        app: {{ include chart.name . }}
-    spec:
-      containers:
-{% endraw %}
-        - name: {{ name }}
-{% raw %}
-          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
-          imagePullPolicy: IfNotPresent
-{% endraw %}
-          {% if let Some(port) = container_port %}
-          ports:
-            - name: http
-              containerPort: {{ port }}
-              protocol: TCP
-          {% endif %}
-          {% if !env_vars.is_empty() %}
-          env:
-            {% for (k, v) in env_vars %}
-            - name: {{ k }}
-              value: {{ v }}
-            {% endfor %}
-          {% endif %}
diff --git a/harmony/templates/helm/helpers.yaml.j2 b/harmony/templates/helm/helpers.yaml.j2
deleted file mode 100644
index ff93848e..00000000
--- a/harmony/templates/helm/helpers.yaml.j2
+++ /dev/null
@@ -1,8 +0,0 @@
-{% raw %}
-{{- define \"chart.fullname\" -}}
-{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix \"-\" }}
-{{- end }}
-{{- define "chart.name" -}}
-{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
-{{- end }}
-{% endraw %}
diff --git a/harmony/templates/helm/service.yaml.j2 b/harmony/templates/helm/service.yaml.j2
deleted file mode 100644
index c6582d22..00000000
--- a/harmony/templates/helm/service.yaml.j2
+++ /dev/null
@@ -1,15 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: {{ "{{ include \"chart.fullname\" . }}" }}
-  labels:
-    app: {{ "{{ include \"chart.name\" . }}" }}
-spec:
-  type: ClusterIP
-  ports:
-    - port: {{ "{{ .Values.service.port }}" }}
-      targetPort: http
-      protocol: TCP
-      name: http
-  selector:
-    app: {{ "{{ include \"chart.name\" . }}" }}
-- 
2.39.5


From 0cc5f505f804d768ca4c03de93f3aad55850db67 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Sun, 25 Jan 2026 22:52:29 -0500
Subject: [PATCH 05/19] feat(harmony_execution): New crate to contain utils for
 execution such as command line

---
 harmony_execution/Cargo.toml     |  12 +
 harmony_execution/src/command.rs | 470 +++++++++++++++++++++++++++++++
 harmony_execution/src/lib.rs     |   6 +
 3 files changed, 488 insertions(+)
 create mode 100644 harmony_execution/Cargo.toml
 create mode 100644 harmony_execution/src/command.rs
 create mode 100644 harmony_execution/src/lib.rs

diff --git a/harmony_execution/Cargo.toml b/harmony_execution/Cargo.toml
new file mode 100644
index 00000000..7433c5e5
--- /dev/null
+++ b/harmony_execution/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "harmony_execution"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+thiserror.workspace = true
+lazy_static.workspace = true
+directories.workspace = true
+log.workspace = true
diff --git a/harmony_execution/src/command.rs b/harmony_execution/src/command.rs
new file mode 100644
index 00000000..0ac1626c
--- /dev/null
+++ b/harmony_execution/src/command.rs
@@ -0,0 +1,470 @@
+use std::io::{BufRead, BufReader};
+use std::process::{Child, Command, Stdio};
+use std::sync::Arc;
+use std::thread;
+
+/// Captured output from a command execution
+#[derive(Debug, Clone)]
+pub struct CommandOutput {
+    /// Captured stdout content
+    pub stdout: String,
+    /// Captured stderr content
+    pub stderr: String,
+    /// Exit status of the command
+    pub status: CommandStatus,
+}
+
+impl CommandOutput {
+    /// Returns true if the command succeeded
+    pub fn is_success(&self) -> bool {
+        self.status.is_success()
+    }
+
+    /// Formats the complete output for display
+    pub fn format_output(&self) -> String {
+        format!(
+            "Stdout:\n{}\n\nStderr:\n{}",
+            if self.stdout.is_empty() {
+                "<empty>"
+            } else {
+                &self.stdout
+            },
+            if self.stderr.is_empty() {
+                "<empty>"
+            } else {
+                &self.stderr
+            }
+        )
+    }
+}
+
+/// Result status of a command execution
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum CommandStatus {
+    /// Command executed successfully (exit code 0)
+    Success,
+    /// Command failed with an exit code
+    Failed(i32),
+    /// Command was terminated by a signal
+    Terminated(i32),
+    /// Command execution could not be started
+    Error(String),
+}
+
+impl CommandStatus {
+    pub fn is_success(&self) -> bool {
+        matches!(self, CommandStatus::Success)
+    }
+}
+
+impl From<std::process::ExitStatus> for CommandStatus {
+    fn from(status: std::process::ExitStatus) -> Self {
+        if status.success() {
+            CommandStatus::Success
+        } else if let Some(code) = status.code() {
+            CommandStatus::Failed(code)
+        } else {
+            CommandStatus::Terminated(0) // Signal codes are platform-specific
+        }
+    }
+}
+
+type Callback = Arc<dyn Fn(&str) + Send + Sync>;
+
+/// Options for configuring command execution
+#[derive(Clone)]
+pub struct RunnerOptions {
+    /// Whether to print stdout to console in real-time
+    pub print_stdout: bool,
+    /// Whether to print stderr to console in real-time
+    pub print_stderr: bool,
+    /// Optional callback for each stdout line
+    pub stdout_callback: Callback,
+    /// Optional callback for each stderr line
+    pub stderr_callback: Callback,
+}
+
+impl RunnerOptions {
+    fn empty_callback() -> Callback {
+        Arc::new(|_| {})
+    }
+    /// Create default options with real-time printing enabled
+    pub fn print_to_console() -> Self {
+        Self {
+            print_stdout: true,
+            print_stderr: true,
+            ..Default::default()
+        }
+    }
+
+    /// Create options that capture output silently
+    pub fn silent() -> Self {
+        Self {
+            print_stdout: false,
+            print_stderr: false,
+            ..Default::default()
+        }
+    }
+
+    /// Set custom callbacks for stdout and stderr lines
+    pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
+    where
+        F1: Fn(&str) + Send + Sync + 'static,
+        F2: Fn(&str) + Send + Sync + 'static,
+    {
+        self.stdout_callback = Arc::new(stdout_callback);
+        self.stderr_callback = Arc::new(stderr_callback);
+        self
+    }
+}
+
+impl Default for RunnerOptions {
+    fn default() -> Self {
+        Self {
+            print_stdout: true,
+            print_stderr: true,
+            stdout_callback: Self::empty_callback(),
+            stderr_callback: Self::empty_callback(),
+        }
+    }
+}
+
+/// Error type for command execution failures
+#[derive(Debug)]
+pub struct CommandError {
+    /// Human-readable error description
+    pub message: String,
+    /// Captured output if execution started
+    pub output: Option<CommandOutput>,
+}
+
+impl std::fmt::Display for CommandError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)?;
+        if let Some(output) = &self.output {
+            write!(f, "\n{}", output.format_output())?;
+        }
+        Ok(())
+    }
+}
+
+impl std::error::Error for CommandError {}
+
+/// Runs a command and captures its output while streaming to console
+///
+/// # Example
+///
+/// ```
+/// use harmony_execution::command::{run_command, RunnerOptions};
+/// use std::process::Command;
+///
+/// let output = run_command(
+///     Command::new("echo").arg("hello"),
+///     RunnerOptions::print_to_console()
+/// ).unwrap();
+/// assert!(output.is_success());
+/// assert_eq!(output.stdout, "hello\n");
+/// ```
+pub fn run_command(
+    command: &mut Command,
+    options: RunnerOptions,
+) -> Result<CommandOutput, CommandError> {
+    let mut child = command
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .spawn()
+        .map_err(|e| CommandError {
+            message: format!("Failed to spawn command: {}", e),
+            output: None,
+        })?;
+
+    let stdout = child.stdout.take().ok_or_else(|| CommandError {
+        message: "Failed to capture stdout".to_string(),
+        output: None,
+    })?;
+
+    let stderr = child.stderr.take().ok_or_else(|| CommandError {
+        message: "Failed to capture stderr".to_string(),
+        output: None,
+    })?;
+
+    let stdout_reader = BufReader::new(stdout);
+    let stderr_reader = BufReader::new(stderr);
+
+    let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
+    let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
+
+    // Spawn thread to handle stdout
+    let stdout_handle = thread::spawn(move || {
+        let mut output = String::new();
+        for line in stdout_reader.lines() {
+            match line {
+                Ok(line_content) => {
+                    if options.print_stdout {
+                        println!("{}", line_content);
+                    }
+                    (options.stdout_callback)(&line_content);
+                    output.push_str(&line_content);
+                    output.push('\n');
+                }
+                Err(e) => {
+                    // Silently handle read errors - corrupted data at end is common
+                    log::trace!("Error reading stdout line: {}", e);
+                }
+            }
+        }
+        let _ = stdout_sender.send(output);
+    });
+
+    // Spawn thread to handle stderr
+    let stderr_handle = thread::spawn(move || {
+        let mut output = String::new();
+        for line in stderr_reader.lines() {
+            match line {
+                Ok(line_content) => {
+                    if options.print_stderr {
+                        eprintln!("{}", line_content);
+                    }
+                    (options.stderr_callback)(&line_content);
+                    output.push_str(&line_content);
+                    output.push('\n');
+                }
+                Err(e) => {
+                    log::trace!("Error reading stderr line: {}", e);
+                }
+            }
+        }
+        let _ = stderr_sender.send(output);
+    });
+
+    let status = child.wait().map_err(|e| CommandError {
+        message: format!("Failed to wait for command process: {}", e),
+        output: None,
+    })?;
+
+    let stdout_lines = stdout_handle
+        .join()
+        .map_err(|e| CommandError {
+            message: format!("Stdout thread panicked: {:?}", e),
+            output: None,
+        })
+        .and_then(|_| {
+            stdout_receiver.recv().map_err(|e| CommandError {
+                message: format!("Failed to receive stdout: {}", e),
+                output: None,
+            })
+        })?;
+
+    let stderr_lines = stderr_handle
+        .join()
+        .map_err(|e| CommandError {
+            message: format!("Stderr thread panicked: {:?}", e),
+            output: None,
+        })
+        .and_then(|_| {
+            stderr_receiver.recv().map_err(|e| CommandError {
+                message: format!("Failed to receive stderr: {}", e),
+                output: None,
+            })
+        })?;
+
+    Ok(CommandOutput {
+        stdout: stdout_lines,
+        stderr: stderr_lines,
+        status: status.into(),
+    })
+}
+
+/// Convenience function to run a command with default options (print to console)
+pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
+    run_command(command, RunnerOptions::print_to_console())
+}
+
+/// Convenience function to run a command silently (capture output only)
+pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
+    run_command(command, RunnerOptions::silent())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::process::Command;
+
+    #[test]
+    fn test_simple_echo_command() {
+        let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
+        assert!(output.is_success());
+        assert_eq!(output.stdout.trim(), "hello world");
+        assert!(output.stderr.is_empty());
+    }
+
+    #[test]
+    fn test_command_failure() {
+        let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
+        assert!(!output.is_success());
+        assert_eq!(output.status, CommandStatus::Failed(42));
+    }
+
+    #[test]
+    fn test_command_output_format() {
+        let output = run_silent(Command::new("echo").arg("test")).unwrap();
+        let formatted = output.format_output();
+        assert!(formatted.contains("Stdout:"));
+        assert!(formatted.contains("test"));
+    }
+
+    #[test]
+    fn test_runner_options() {
+        let opts = RunnerOptions::print_to_console();
+        assert!(opts.print_stdout);
+        assert!(opts.print_stderr);
+
+        let opts = RunnerOptions::silent();
+        assert!(!opts.print_stdout);
+        assert!(!opts.print_stderr);
+    }
+
+    #[test]
+    fn test_command_status_from_exit_status() {
+        let output = run_silent(&mut Command::new("true")).unwrap();
+        assert_eq!(output.status, CommandStatus::Success);
+
+        let output = run_silent(&mut Command::new("false")).unwrap();
+        assert_eq!(output.status, CommandStatus::Failed(1));
+    }
+
+    #[test]
+    fn test_stdout_callback_receives_lines() {
+        use std::sync::{Arc, Mutex};
+
+        let captured = Arc::new(Mutex::new(Vec::new()));
+        let captured_clone = Arc::clone(&captured);
+
+        let opts = RunnerOptions::silent().with_callbacks(
+            move |line| captured_clone.lock().unwrap().push(line.to_string()),
+            |_| {},
+        );
+
+        run_command(Command::new("echo").arg("hello world"), opts).unwrap();
+
+        let lines = captured.lock().unwrap();
+        assert_eq!(lines.len(), 1);
+        assert_eq!(lines[0], "hello world");
+    }
+
+    #[test]
+    fn test_stderr_callback_receives_lines() {
+        use std::sync::{Arc, Mutex};
+
+        let captured = Arc::new(Mutex::new(Vec::new()));
+        let captured_clone = Arc::clone(&captured);
+
+        let opts = RunnerOptions::silent().with_callbacks(
+            |_| {},
+            move |line| captured_clone.lock().unwrap().push(line.to_string()),
+        );
+
+        run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();
+
+        let lines = captured.lock().unwrap();
+        assert_eq!(lines.len(), 1);
+        assert_eq!(lines[0], "error");
+    }
+
+    #[test]
+    fn test_callback_and_capture_both_work() {
+        use std::sync::{Arc, Mutex};
+
+        let callback_lines = Arc::new(Mutex::new(Vec::new()));
+        let callback_clone = Arc::clone(&callback_lines);
+
+        let opts = RunnerOptions::silent().with_callbacks(
+            move |line| callback_clone.lock().unwrap().push(line.to_string()),
+            |_| {},
+        );
+
+        let output =
+            run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();
+
+        // Verify captured output
+        assert_eq!(output.stdout, "line1\nline2\nline3\n");
+
+        // Verify callback received all lines
+        let lines = callback_lines.lock().unwrap();
+        assert_eq!(lines.len(), 3);
+        assert_eq!(lines[0], "line1");
+        assert_eq!(lines[1], "line2");
+        assert_eq!(lines[2], "line3");
+    }
+
+    #[test]
+    fn test_multiline_output_capture() {
+        let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();
+
+        assert_eq!(output.stdout, "line1\nline2\nline3\n");
+        assert!(output.stderr.trim().is_empty());
+    }
+
+    #[test]
+    fn test_mixed_stdout_stderr_capture() {
+        let output = run_silent(Command::new("sh").args([
+            "-c",
+            "echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
+        ]))
+        .unwrap();
+
+        assert!(output.stdout.contains("stdout1"));
+        assert!(output.stdout.contains("stdout2"));
+        assert!(output.stderr.contains("stderr1"));
+        assert!(output.stderr.contains("stderr2"));
+    }
+
+    #[test]
+    fn test_empty_output_command() {
+        let output = run_silent(&mut Command::new("true")).unwrap();
+
+        assert!(output.stdout.is_empty());
+        assert!(output.stderr.is_empty());
+        assert!(output.is_success());
+    }
+
+    #[test]
+    fn test_command_output_format_with_empty_streams() {
+        let output = run_silent(&mut Command::new("true")).unwrap();
+        let formatted = output.format_output();
+
+        assert!(formatted.contains("Stdout:"));
+        assert!(formatted.contains("<empty>"));
+        assert!(formatted.contains("Stderr:"));
+    }
+
+    #[test]
+    fn test_error_contains_message_and_output() {
+        let error = CommandError {
+            message: "Test error".to_string(),
+            output: Some(CommandOutput {
+                stdout: "captured stdout".to_string(),
+                stderr: "captured stderr".to_string(),
+                status: CommandStatus::Success,
+            }),
+        };
+
+        let display = format!("{}", error);
+        assert!(display.contains("Test error"));
+        assert!(display.contains("captured stdout"));
+        assert!(display.contains("captured stderr"));
+    }
+
+    #[test]
+    fn test_error_without_output() {
+        let error = CommandError {
+            message: "Spawn failed".to_string(),
+            output: None,
+        };
+
+        let display = format!("{}", error);
+        assert!(display.contains("Spawn failed"));
+        assert!(!display.contains("Stdout:"));
+        assert!(!display.contains("Stderr:"));
+    }
+}
diff --git a/harmony_execution/src/lib.rs b/harmony_execution/src/lib.rs
new file mode 100644
index 00000000..65fdf663
--- /dev/null
+++ b/harmony_execution/src/lib.rs
@@ -0,0 +1,6 @@
+pub mod command;
+
+pub use command::{
+    run_command, run, run_silent,
+    CommandOutput, CommandStatus, CommandError, RunnerOptions,
+};
-- 
2.39.5


From deca67fd554724a716e59caa7918c692a7a27af9 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Sun, 25 Jan 2026 22:54:14 -0500
Subject: [PATCH 06/19] feat(backend_app): Deployment now pretty much works to
 package and deploy an app with an existing Docker image and type-safe helm
 chart on local k3d, not tested for remote k8s with Argo yet

---
 Cargo.toml                                    |    3 +
 README.md                                     |    2 +
 brocade/examples/main.rs                      |    2 +-
 harmony/Cargo.toml                            |    1 +
 .../src/modules/application/backend_app.rs    | 1005 ++++++++++-------
 harmony/src/modules/application/config.rs     |    7 +
 harmony/src/modules/application/helm/mod.rs   |  144 ++-
 harmony_agent/deploy/src/main.rs              |    7 +-
 8 files changed, 719 insertions(+), 452 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a256234f..18a0ff9c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,7 @@ members = [
   "harmony_types",
   "harmony_macros",
   "harmony_tui",
+  "harmony_execution",
   "opnsense-config",
   "opnsense-config-xml",
   "harmony_cli",
@@ -17,6 +18,8 @@ members = [
   "harmony_secret",
   "adr/agent_discovery/mdns",
   "brocade",
+  "harmony_agent",
+  "harmony_agent/deploy",
 ]
 
 [workspace.package]
diff --git a/README.md b/README.md
index 4ccdae73..f4f13ec2 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
 
+In other words, Harmony is a **next-generation platform engineering framework**.
+
 _By [NationTech](https://nationtech.io)_
 
 [![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony)
diff --git a/brocade/examples/main.rs b/brocade/examples/main.rs
index 47d4a631..15513ea2 100644
--- a/brocade/examples/main.rs
+++ b/brocade/examples/main.rs
@@ -1,7 +1,7 @@
 use std::net::{IpAddr, Ipv4Addr};
 
 use brocade::{BrocadeOptions, ssh};
-use harmony_secret::{Secret, SecretManager};
+use harmony_secret::Secret;
 use harmony_types::switch::PortLocation;
 use serde::{Deserialize, Serialize};
 
diff --git a/harmony/Cargo.toml b/harmony/Cargo.toml
index 634cbe96..f951a974 100644
--- a/harmony/Cargo.toml
+++ b/harmony/Cargo.toml
@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
 opnsense-config-xml = { path = "../opnsense-config-xml" }
 harmony_macros = { path = "../harmony_macros" }
 harmony_types = { path = "../harmony_types" }
+harmony_execution = { path = "../harmony_execution" }
 uuid.workspace = true
 url.workspace = true
 kube = { workspace = true, features = ["derive"] }
diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs
index 83e24f5b..804af46d 100644
--- a/harmony/src/modules/application/backend_app.rs
+++ b/harmony/src/modules/application/backend_app.rs
@@ -1,14 +1,17 @@
-use std::path::PathBuf;
 use async_trait::async_trait;
 use log::{debug, info, trace};
 use serde::Serialize;
+use std::path::PathBuf;
 
 use crate::{
     config::{REGISTRY_PROJECT, REGISTRY_URL},
     modules::application::{
-        config::ApplicationNetworkPort, helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind}, Application, HelmPackage, OCICompliant
+        Application, HelmPackage, OCICompliant,
+        config::ApplicationNetworkPort,
+        helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
     },
 };
+use harmony_execution::{run_command, RunnerOptions};
 
 #[derive(Debug, Clone, Serialize)]
 pub struct BuildCommand {
@@ -95,98 +98,29 @@ impl OCICompliant for BackendApp {
         let dockerfile = self.get_dockerfile()?;
         let image_tag = self.image_name();
 
-        let mut child = std::process::Command::new("docker")
-            .args([
-                "build",
-                "-t",
-                &image_tag,
-                "-f",
-                &dockerfile.to_string_lossy(),
-                &self.project_root.to_string_lossy(),
-            ])
-            .stdout(std::process::Stdio::piped())
-            .stderr(std::process::Stdio::piped())
-            .spawn()
-            .map_err(|e| format!("Failed to spawn docker build process: {e}"))?;
+        // Run docker build command, streaming output to console and capturing it
+        let output = run_command(
+            std::process::Command::new("docker")
+                .args([
+                    "build",
+                    "-t",
+                    &image_tag,
+                    "-f",
+                    &dockerfile.to_string_lossy(),
+                    &self.project_root.to_string_lossy(),
+                ]),
+            RunnerOptions::print_to_console(),
+        )
+        .map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
 
-        let stdout = child.stdout.take().expect("Failed to capture stdout");
-        let stderr = child.stderr.take().expect("Failed to capture stderr");
-
-        use std::io::{BufRead, BufReader};
-        use std::thread;
-
-        let stdout_reader = BufReader::new(stdout);
-        let stderr_reader = BufReader::new(stderr);
-
-        let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
-        let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
-
-        let stdout_handle = thread::spawn(move || {
-            let mut output = String::new();
-            for line in stdout_reader.lines() {
-                match line {
-                    Ok(l) => {
-                        println!("{}", l);
-                        output.push_str(&l);
-                        output.push('\n');
-                    }
-                    Err(e) => {
-                        trace!("Error reading stdout line: {}", e);
-                    }
-                }
-            }
-            let _ = stdout_sender.send(output);
-        });
-
-        let stderr_handle = thread::spawn(move || {
-            let mut output = String::new();
-            for line in stderr_reader.lines() {
-                match line {
-                    Ok(l) => {
-                        eprintln!("{}", l);
-                        output.push_str(&l);
-                        output.push('\n');
-                    }
-                    Err(e) => {
-                        trace!("Error reading stderr line: {}", e);
-                    }
-                }
-            }
-            let _ = stderr_sender.send(output);
-        });
-
-        let status = child
-            .wait()
-            .map_err(|e| format!("Failed to wait for docker build process: {e}"))?;
-
-        let stdout_lines = stdout_handle
-            .join()
-            .map_err(|e| format!("Stdout thread panicked: {e:?}"))
-            .and_then(|_| {
-                stdout_receiver
-                    .recv()
-                    .map_err(|e| format!("Failed to receive stdout: {e}"))
-            })?;
-        let stderr_lines = stderr_handle
-            .join()
-            .map_err(|e| format!("Stderr thread panicked: {e:?}"))
-            .and_then(|_| {
-                stderr_receiver
-                    .recv()
-                    .map_err(|e| format!("Failed to receive stderr: {e}"))
-            })?;
-
-        let output_content = format!(
-            "\n{stdout}\n\n{stderr}",
-            stdout = stdout_lines,
-            stderr = stderr_lines,
-        );
-        match status.success() {
-            true => {
-                info!("Docker image build succeeded");
-                Ok(image_tag)
-            }
-            false => Err(format!("Docker image build FAILED :{output_content}")),
+        if output.is_success() {
+            info!("Docker image build succeeded");
+            Ok(image_tag)
+        } else {
+            Err(format!(
+                "Docker image build FAILED:\n{}",
+                output.format_output()
+            ))
         }
     }
 
@@ -217,34 +151,22 @@ impl HelmPackage for BackendApp {
     async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
         let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
 
-        // Build the typed Deployment object using the builder
-        let mut deployment_builder = DeploymentBuilder::new(&self.name, image_url);
-
-        // Add container ports
-        for port in &self.network_ports {
-            deployment_builder = deployment_builder.with_container_port(
-                port.number as i32,
-                &port.name,
-                port.protocol.as_str(),
-            );
-        }
-
-        // Add environment variables
-        for (key, value) in &self.env_vars {
-            deployment_builder = deployment_builder.with_env_var(key, value);
-        }
-
-        let deployment = deployment_builder.build();
-        helm_chart.add_resource(HelmResourceKind::Deployment(deployment));
+        // Build the typed Deployment object using the builder with initial options
+        helm_chart.add_resource(HelmResourceKind::Deployment(
+            DeploymentBuilder::with_options(
+                &self.name,
+                image_url,
+                Some(self.network_ports.clone()),
+                Some(self.env_vars.clone()),
+                None,
+            )
+            .build(),
+        ));
 
         // Build the typed Service object using the helper function
-        let network_ports: Vec<(String, u16, String)> = self
-            .network_ports
-            .iter()
-            .map(|p| (p.name.clone(), p.number, p.protocol.as_str().to_string()))
-            .collect();
-
-        if let Some(service) = helm::create_service_from_ports(self.name.clone(), &network_ports) {
+        if let Some(service) =
+            helm::create_service_from_ports(self.name.clone(), &self.network_ports)
+        {
             helm_chart.add_resource(HelmResourceKind::Service(service));
         }
 
@@ -259,375 +181,622 @@ impl HelmPackage for BackendApp {
     }
 }
 
-
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::modules::application::config::ApplicationNetworkPort;
     use crate::modules::application::config::NetworkProtocol;
+    use k8s_openapi::api::apps::v1::Deployment;
+    use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
     use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
     use serde_yaml::from_str;
-    use k8s_openapi::api::core::v1::Service as K8sService;
-    use k8s_openapi::api::apps::v1::Deployment;
     use std::fs;
+    use std::path::Path;
+    use tempfile::tempdir;
 
-    fn cleanup_test_dirs(project_root: &PathBuf) {
-        let helm_dir = project_root.join(".harmony_generated/helm/");
-        if helm_dir.exists() {
-            let _ = fs::remove_dir_all(&helm_dir);
-        }
+    // Test Helpers
+    fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
+        let path = project_root.join(format!(
+            ".harmony_generated/helm/{chart_name}/templates/service.yaml"
+        ));
+        let content = fs::read_to_string(&path)
+            .unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
+        from_str(&content)
+            .unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
     }
 
-    fn create_test_backend_app_with_ports() -> BackendApp {
-        BackendApp {
-            name: "test-app".to_string(),
-            project_root: "/tmp/test_backend".into(),
-            network_ports: vec![
-                ApplicationNetworkPort {
-                    number: 8080,
-                    protocol: NetworkProtocol::TCP,
-                    name: "http".to_string(),
-                },
-                ApplicationNetworkPort {
-                    number: 9000,
-                    protocol: NetworkProtocol::TCP,
-                    name: "metrics".to_string(),
-                },
-                ApplicationNetworkPort {
-                    number: 50051,
-                    protocol: NetworkProtocol::TCP,
-                    name: "grpc".to_string(),
-                },
-            ],
-            env_vars: vec![
-                ("ENV_VAR_1".to_string(), "value1".to_string()),
-                ("ENV_VAR_2".to_string(), "value2".to_string()),
-            ],
-            build_cmd: BuildCommand::new("cargo", vec!["build"]),
-            dockerfile: None,
-        }
+    fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
+        let path = project_root.join(format!(
+            ".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
+        ));
+        let content = fs::read_to_string(&path)
+            .unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e));
+        from_str(&content)
+            .unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e))
     }
 
-    fn create_test_backend_app_no_ports() -> BackendApp {
-        BackendApp {
-            name: "test-app-no-ports".to_string(),
-            project_root: "/tmp/test_backend_no_ports".into(),
-            network_ports: vec![],
-            env_vars: vec![("ENV_VAR_1".to_string(), "value1".to_string())],
-            build_cmd: BuildCommand::new("cargo", vec!["build"]),
-            dockerfile: None,
-        }
+    fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool {
+        let path = project_root.join(format!(
+            ".harmony_generated/helm/{chart_name}/templates/service.yaml"
+        ));
+        path.exists()
     }
 
-    #[tokio::test]
-    async fn test_service_created_with_all_network_ports() {
-        let app = create_test_backend_app_with_ports();
-        let test_dir = app.project_root.clone();
-
-        cleanup_test_dirs(&test_dir);
-
-        let result = app
-            .build_push_helm_package("registry.example.com/test/test-app:1.0.0")
-            .await;
-
-        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
-
-        let service_yaml_path = test_dir
-            .join(".harmony_generated/helm/test-app/templates/service.yaml");
-        assert!(
-            service_yaml_path.exists(),
-            "service.yaml should exist when there are network ports"
-        );
-
-        let service_yaml_content = fs::read_to_string(&service_yaml_path)
-            .expect("Failed to read service.yaml");
-
-        let service: K8sService = from_str(&service_yaml_content)
-            .expect("Failed to parse service.yaml as K8s Service");
-
+    // Service Assertions
+    fn assert_service_metadata(service: &K8sService, expected_name: &str) {
         assert_eq!(
             service.metadata.name.as_deref(),
-            Some("test-app"),
-            "Service name should match app name"
-        );
-        assert_eq!(
-            service.spec.as_ref().unwrap().type_.as_deref(),
-            Some("ClusterIP"),
-            "Service type should be ClusterIP"
+            Some(expected_name),
+            "Service name should be '{expected_name}'"
         );
+    }
 
+    fn assert_service_type(service: &K8sService, expected_type: &str) {
+        assert_eq!(
+            service.spec.as_ref().and_then(|s| s.type_.as_deref()),
+            Some(expected_type),
+            "Service type should be '{expected_type}'"
+        );
+    }
+
+    fn assert_service_port_count(service: &K8sService, expected_count: usize) {
         let ports = service
             .spec
             .as_ref()
-            .unwrap()
-            .ports
-            .as_ref()
-            .expect("Service should have ports");
-
-        assert_eq!(ports.len(), 3, "Service should have 3 ports");
-
-        let http_port = &ports[0];
-        assert_eq!(http_port.name.as_deref(), Some("http"), "First port name should be 'http'");
-        assert_eq!(http_port.protocol.as_deref(), Some("TCP"), "First port protocol should be 'TCP'");
-        assert_eq!(http_port.port, 8080, "First port number should be 8080");
-
-        let metrics_port = &ports[1];
-        assert_eq!(metrics_port.name.as_deref(), Some("metrics"), "Second port name should be 'metrics'");
-        assert_eq!(metrics_port.protocol.as_deref(), Some("TCP"), "Second port protocol should be 'TCP'");
-        assert_eq!(metrics_port.port, 9000, "Second port number should be 9000");
-
-        let grpc_port = &ports[2];
-        assert_eq!(grpc_port.name.as_deref(), Some("grpc"), "Third port name should be 'grpc'");
-        assert_eq!(grpc_port.protocol.as_deref(), Some("TCP"), "Third port protocol should be 'TCP'");
-        assert_eq!(grpc_port.port, 50051, "Third port number should be 50051");
-
-        for port in ports.iter() {
-            match &port.target_port {
-                Some(IntOrString::Int(target)) => {
-                    assert_eq!(
-                        *target, port.port,
-                        "Target port should match service port for {}",
-                        port.name.as_deref().unwrap_or("unknown")
-                    );
-                }
-                _ => panic!("Target port should be Int for all ports"),
-            }
-        }
-
-        cleanup_test_dirs(&test_dir);
-    }
-
-    #[tokio::test]
-    async fn test_service_not_created_when_no_network_ports() {
-        let app = create_test_backend_app_no_ports();
-        let test_dir = app.project_root.clone();
-
-        cleanup_test_dirs(&test_dir);
-
-        let result = app
-            .build_push_helm_package("registry.example.com/test/test-app-no-ports:1.0.0")
-            .await;
-
-        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
-
-        let service_yaml_path = test_dir
-            .join(".harmony_generated/helm/test-app-no-ports/templates/service.yaml");
-        assert!(
-            !service_yaml_path.exists(),
-            "service.yaml should not exist when there are no network ports"
+            .and_then(|s| s.ports.as_ref())
+            .unwrap_or_else(|| panic!("Service should have ports"));
+        assert_eq!(
+            ports.len(),
+            expected_count,
+            "Service should have {expected_count} ports"
         );
-
-        cleanup_test_dirs(&test_dir);
     }
 
-    #[tokio::test]
-    async fn test_deployment_created_with_correct_configuration() {
-        let app = create_test_backend_app_with_ports();
-        let test_dir = app.project_root.clone();
+    fn assert_service_port(
+        port: &ServicePort,
+        expected_name: &str,
+        expected_protocol: &str,
+        expected_number: i32,
+    ) {
+        assert_eq!(
+            port.name.as_deref(),
+            Some(expected_name),
+            "Port name should be '{expected_name}'"
+        );
+        assert_eq!(
+            port.protocol.as_deref(),
+            Some(expected_protocol),
+            "Port '{expected_name}' protocol should be '{expected_protocol}'"
+        );
+        assert_eq!(
+            port.port, expected_number,
+            "Port '{expected_name}' number should be {expected_number}"
+        );
+    }
 
-        cleanup_test_dirs(&test_dir);
-
-        let result = app
-            .build_push_helm_package("registry.example.com/test/test-app:1.0.0")
-            .await;
-
-        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
-
-        let deployment_yaml_path = test_dir
-            .join(".harmony_generated/helm/test-app/templates/deployment.yaml");
-        assert!(deployment_yaml_path.exists(), "deployment.yaml should exist");
-
-        let deployment_yaml_content = fs::read_to_string(&deployment_yaml_path)
-            .expect("Failed to read deployment.yaml");
-
-        let deployment: Deployment = from_str(&deployment_yaml_content)
-            .expect("Failed to parse deployment.yaml as K8s Deployment");
+    fn assert_target_port_matches_service_port(port: &ServicePort) {
+        match &port.target_port {
+            Some(IntOrString::Int(target)) => {
+                assert_eq!(
+                    *target,
+                    port.port,
+                    "Target port should match service port for '{}'",
+                    port.name.as_deref().unwrap_or("unknown")
+                );
+            }
+            _ => panic!(
+                "Target port should be Int for '{}'",
+                port.name.as_deref().unwrap_or("unknown")
+            ),
+        }
+    }
 
+    // Deployment Assertions
+    fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) {
         assert_eq!(
             deployment.metadata.name.as_deref(),
-            Some("test-app"),
-            "Deployment name should match app name"
+            Some(expected_name),
+            "Deployment name should be '{expected_name}'"
         );
+    }
 
-        let deployment_spec = deployment
+    fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) {
+        let spec = deployment
             .spec
             .as_ref()
-            .expect("Deployment should have spec");
-        assert_eq!(deployment_spec.replicas, Some(1), "Replicas should be 1");
-
-        let selector = &deployment_spec.selector;
+            .unwrap_or_else(|| panic!("Deployment should have spec"));
         assert_eq!(
-            selector.match_labels.as_ref().unwrap().get("app.kubernetes.io/name"),
-            Some(&"test-app".to_string()),
-            "Selector should match app name"
+            spec.replicas,
+            Some(expected_replicas),
+            "Deployment should have {expected_replicas} replicas"
         );
+    }
 
-        let pod_spec = deployment_spec
-            .template
+    fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) {
+        let spec = deployment
             .spec
             .as_ref()
-            .expect("Pod template should have spec");
-
-        assert_eq!(pod_spec.containers.len(), 1, "Should have exactly one container");
-
-        let container = &pod_spec.containers[0];
-        assert_eq!(container.name, "test-app", "Container name should match app name");
+            .unwrap_or_else(|| panic!("Deployment should have spec"));
         assert_eq!(
-            container.image.as_deref(),
-            Some("registry.example.com/test/test-app:1.0.0"),
-            "Container image should match provided image URL"
-        );
-        assert_eq!(
-            container.image_pull_policy.as_deref(),
-            Some("IfNotPresent"),
-            "Image pull policy should be IfNotPresent"
+            spec.selector
+                .match_labels
+                .as_ref()
+                .and_then(|m| m.get("app.kubernetes.io/name")),
+            Some(&expected_label_value.to_string()),
+            "Selector should match app name '{expected_label_value}'"
         );
+    }
 
-        let container_ports = container
-            .ports
+    fn assert_pod_labels(deployment: &Deployment, expected_name: &str) {
+        let spec = deployment
+            .spec
             .as_ref()
-            .expect("Container should have ports");
-        assert_eq!(container_ports.len(), 3, "Container should have 3 ports");
-
-        assert_eq!(container_ports[0].container_port, 8080, "First container port should be 8080");
-        assert_eq!(container_ports[0].name.as_deref(), Some("http"), "First container port name should be 'http'");
-        assert_eq!(container_ports[0].protocol.as_deref(), Some("TCP"), "First container port protocol should be 'TCP'");
-
-        assert_eq!(container_ports[1].container_port, 9000, "Second container port should be 9000");
-        assert_eq!(container_ports[1].name.as_deref(), Some("metrics"), "Second container port name should be 'metrics'");
-        assert_eq!(container_ports[1].protocol.as_deref(), Some("TCP"), "Second container port protocol should be 'TCP'");
-
-        assert_eq!(container_ports[2].container_port, 50051, "Third container port should be 50051");
-        assert_eq!(container_ports[2].name.as_deref(), Some("grpc"), "Third container port name should be 'grpc'");
-        assert_eq!(container_ports[2].protocol.as_deref(), Some("TCP"), "Third container port protocol should be 'TCP'");
-
-        let env_vars = container.env.as_ref().expect("Container should have env vars");
-        assert_eq!(env_vars.len(), 2, "Container should have 2 env vars");
-
-        let env_map: std::collections::HashMap<String, String> = env_vars
-            .iter()
-            .map(|e| (e.name.clone(), e.value.clone().unwrap_or_default()))
-            .collect();
-
-        assert_eq!(
-            env_map.get("ENV_VAR_1"),
-            Some(&"value1".to_string()),
-            "ENV_VAR_1 should have correct value"
-        );
-        assert_eq!(
-            env_map.get("ENV_VAR_2"),
-            Some(&"value2".to_string()),
-            "ENV_VAR_2 should have correct value"
-        );
-
-        let pod_labels = deployment_spec
+            .unwrap_or_else(|| panic!("Deployment should have spec"));
+        let metadata = spec
             .template
             .metadata
             .as_ref()
-            .expect("Pod template should have metadata")
+            .unwrap_or_else(|| panic!("Pod template should have metadata"));
+        let labels = metadata
             .labels
             .as_ref()
-            .expect("Pod should have labels");
+            .unwrap_or_else(|| panic!("Pod should have labels"));
 
         assert_eq!(
-            pod_labels.get("app.kubernetes.io/name"),
-            Some(&"test-app".to_string()),
-            "Pod should have correct app label"
+            labels.get("app.kubernetes.io/name"),
+            Some(&expected_name.to_string()),
+            "Pod label app.kubernetes.io/name should be '{expected_name}'"
         );
         assert_eq!(
-            pod_labels.get("app.kubernetes.io/instance"),
-            Some(&"test-app".to_string()),
-            "Pod should have correct instance label"
+            labels.get("app.kubernetes.io/instance"),
+            Some(&expected_name.to_string()),
+            "Pod label app.kubernetes.io/instance should be '{expected_name}'"
         );
+    }
 
-        cleanup_test_dirs(&test_dir);
+    // Container Assertions
+    fn assert_container_metadata(
+        container: &Container,
+        expected_name: &str,
+        expected_image: &str,
+        expected_pull_policy: &str,
+    ) {
+        assert_eq!(
+            container.name, expected_name,
+            "Container name should be '{expected_name}'"
+        );
+        assert_eq!(
+            container.image.as_deref(),
+            Some(expected_image),
+            "Container image should be '{expected_image}'"
+        );
+        assert_eq!(
+            container.image_pull_policy.as_deref(),
+            Some(expected_pull_policy),
+            "Image pull policy should be '{expected_pull_policy}'"
+        );
+    }
+
+    fn assert_container_ports_count(container: &Container, expected_count: usize) {
+        let ports = container
+            .ports
+            .as_ref()
+            .unwrap_or_else(|| panic!("Container should have ports"));
+        assert_eq!(
+            ports.len(),
+            expected_count,
+            "Container should have {expected_count} ports"
+        );
+    }
+
+    fn assert_container_port(
+        port: &k8s_openapi::api::core::v1::ContainerPort,
+        expected_name: &str,
+        expected_protocol: &str,
+        expected_number: i32,
+    ) {
+        assert_eq!(
+            port.name.as_deref(),
+            Some(expected_name),
+            "Container port name should be '{expected_name}'"
+        );
+        assert_eq!(
+            port.protocol.as_deref(),
+            Some(expected_protocol),
+            "Container port '{expected_name}' protocol should be '{expected_protocol}'"
+        );
+        assert_eq!(
+            port.container_port, expected_number,
+            "Container port '{expected_name}' number should be {expected_number}"
+        );
+    }
+
+    fn assert_container_env_vars_count(container: &Container, expected_count: usize) {
+        let env_vars = container
+            .env
+            .as_ref()
+            .unwrap_or_else(|| panic!("Container should have env vars"));
+        assert_eq!(
+            env_vars.len(),
+            expected_count,
+            "Container should have {expected_count} env vars"
+        );
+    }
+
+    fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) {
+        assert_eq!(
+            env_var.name, expected_name,
+            "Env var name should be '{expected_name}'"
+        );
+        assert_eq!(
+            env_var.value.as_deref(),
+            Some(expected_value),
+            "Env var '{expected_name}' value should be '{expected_value}'"
+        );
+    }
+
+    fn get_container(deployment: &Deployment) -> Container {
+        let spec = deployment
+            .spec
+            .as_ref()
+            .unwrap_or_else(|| panic!("Deployment should have spec"));
+        let pod_spec = spec
+            .template
+            .spec
+            .as_ref()
+            .unwrap_or_else(|| panic!("Pod template should have spec"));
+        pod_spec
+            .containers
+            .first()
+            .unwrap_or_else(|| panic!("Should have exactly one container"))
+            .clone()
+    }
+
+    // Test Fixtures
+    fn standard_test_ports() -> Vec<ApplicationNetworkPort> {
+        vec![
+            ApplicationNetworkPort {
+                number: 8080,
+                protocol: NetworkProtocol::TCP,
+                name: "http".to_string(),
+            },
+            ApplicationNetworkPort {
+                number: 9000,
+                protocol: NetworkProtocol::TCP,
+                name: "metrics".to_string(),
+            },
+            ApplicationNetworkPort {
+                number: 50051,
+                protocol: NetworkProtocol::TCP,
+                name: "grpc".to_string(),
+            },
+        ]
+    }
+
+    fn standard_test_env_vars() -> Vec<(String, String)> {
+        vec![
+            ("ENV_VAR_1".to_string(), "value1".to_string()),
+            ("ENV_VAR_2".to_string(), "value2".to_string()),
+        ]
+    }
+
+    fn udp_test_ports() -> Vec<ApplicationNetworkPort> {
+        vec![
+            ApplicationNetworkPort {
+                number: 53,
+                protocol: NetworkProtocol::UDP,
+                name: "dns".to_string(),
+            },
+            ApplicationNetworkPort {
+                number: 8080,
+                protocol: NetworkProtocol::TCP,
+                name: "http".to_string(),
+            },
+        ]
+    }
+
+    // Test Builder
+    struct BackendAppTestBuilder {
+        name: Option<String>,
+        network_ports: Option<Vec<ApplicationNetworkPort>>,
+        env_vars: Option<Vec<(String, String)>>,
+    }
+
+    impl BackendAppTestBuilder {
+        fn new() -> Self {
+            Self {
+                name: None,
+                network_ports: None,
+                env_vars: None,
+            }
+        }
+
+        fn with_name(mut self, name: impl Into<String>) -> Self {
+            self.name = Some(name.into());
+            self
+        }
+
+        fn with_standard_ports(mut self) -> Self {
+            self.network_ports = Some(standard_test_ports());
+            self
+        }
+
+        fn with_udp_ports(mut self) -> Self {
+            self.network_ports = Some(udp_test_ports());
+            self
+        }
+
+        fn with_standard_env_vars(mut self) -> Self {
+            self.env_vars = Some(standard_test_env_vars());
+            self
+        }
+
+        fn with_no_ports(mut self) -> Self {
+            self.network_ports = Some(vec![]);
+            self
+        }
+
+        fn build(self, project_root: PathBuf) -> BackendApp {
+            BackendApp {
+                name: self.name.unwrap_or_else(|| "test-app".to_string()),
+                project_root,
+                network_ports: self.network_ports.unwrap_or_default(),
+                env_vars: self.env_vars.unwrap_or_default(),
+                build_cmd: BuildCommand::new("cargo", vec!["build"]),
+                dockerfile: None,
+            }
+        }
+    }
+
+    impl Default for BackendAppTestBuilder {
+        fn default() -> Self {
+            Self::new()
+        }
+    }
+
+    // Helper function for test setup
+    async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) {
+        let result = app.build_push_helm_package(image_url).await;
+        assert!(
+            result.is_ok(),
+            "build_push_helm_package should succeed: {:?}",
+            result
+        );
+    }
+
+    // ===== SERVICE TESTS =====
+
+    #[tokio::test]
+    async fn service_is_created_with_application_name() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let service = read_service_yaml(&app.project_root, "test-app");
+        assert_service_metadata(&service, "test-app");
     }
 
     #[tokio::test]
-    async fn test_service_with_udp_protocol() {
-        let app = BackendApp {
-            name: "udp-app".to_string(),
-            project_root: "/tmp/test_udp".into(),
-            network_ports: vec![
-                ApplicationNetworkPort {
-                    number: 53,
-                    protocol: NetworkProtocol::UDP,
-                    name: "dns".to_string(),
-                },
-                ApplicationNetworkPort {
-                    number: 8080,
-                    protocol: NetworkProtocol::TCP,
-                    name: "http".to_string(),
-                },
-            ],
-            env_vars: vec![],
-            build_cmd: BuildCommand::new("cargo", vec!["build"]),
-            dockerfile: None,
-        };
-        let test_dir = app.project_root.clone();
+    async fn service_has_default_clusterip_type() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
 
-        cleanup_test_dirs(&test_dir);
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
 
-        let result = app
-            .build_push_helm_package("registry.example.com/test/udp-app:1.0.0")
-            .await;
-
-        assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result);
-
-        let service_yaml_path = test_dir
-            .join(".harmony_generated/helm/udp-app/templates/service.yaml");
-        assert!(service_yaml_path.exists(), "service.yaml should exist");
-
-        let service_yaml_content = fs::read_to_string(&service_yaml_path)
-            .expect("Failed to read service.yaml");
-
-        let service: K8sService = from_str(&service_yaml_content)
-            .expect("Failed to parse service.yaml as K8s Service");
-
-        let ports = service
-            .spec
-            .as_ref()
-            .unwrap()
-            .ports
-            .as_ref()
-            .expect("Service should have ports");
-
-        assert_eq!(ports.len(), 2, "Service should have 2 ports");
-
-        let dns_port = &ports[0];
-        assert_eq!(dns_port.name.as_deref(), Some("dns"), "DNS port name should be 'dns'");
-        assert_eq!(
-            dns_port.protocol.as_deref(),
-            Some("UDP"),
-            "DNS port protocol should be 'UDP'"
-        );
-        assert_eq!(dns_port.port, 53, "DNS port number should be 53");
-
-        let http_port = &ports[1];
-        assert_eq!(http_port.name.as_deref(), Some("http"), "HTTP port name should be 'http'");
-        assert_eq!(
-            http_port.protocol.as_deref(),
-            Some("TCP"),
-            "HTTP port protocol should be 'TCP'"
-        );
-        assert_eq!(http_port.port, 8080, "HTTP port number should be 8080");
-
-        cleanup_test_dirs(&test_dir);
+        let service = read_service_yaml(&app.project_root, "test-app");
+        assert_service_type(&service, "ClusterIP");
     }
 
+    #[tokio::test]
+    async fn service_exposes_all_network_ports() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let service = read_service_yaml(&app.project_root, "test-app");
+        assert_service_port_count(&service, 3);
+
+        let ports = service.spec.unwrap().ports.unwrap();
+        assert_service_port(&ports[0], "http", "TCP", 8080);
+        assert_service_port(&ports[1], "metrics", "TCP", 9000);
+        assert_service_port(&ports[2], "grpc", "TCP", 50051);
+    }
+
+    #[tokio::test]
+    async fn service_target_ports_match_service_ports() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let service = read_service_yaml(&app.project_root, "test-app");
+        let ports = service.spec.unwrap().ports.unwrap();
+
+        for port in &ports {
+            assert_target_port_matches_service_port(port);
+        }
+    }
+
+    #[tokio::test]
+    async fn service_not_created_when_application_has_no_ports() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app-no-ports")
+            .with_no_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await;
+
+        assert!(
+            !service_yaml_exists(&app.project_root, "test-app-no-ports"),
+            "service.yaml should not exist when there are no network ports"
+        );
+    }
+
+    #[tokio::test]
+    async fn service_respects_port_protocol_type() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("udp-app")
+            .with_udp_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await;
+
+        let service = read_service_yaml(&app.project_root, "udp-app");
+        let ports = service.spec.unwrap().ports.unwrap();
+
+        assert_service_port(&ports[0], "dns", "UDP", 53);
+        assert_service_port(&ports[1], "http", "TCP", 8080);
+    }
+
+    // ===== DEPLOYMENT METADATA TESTS =====
+
+    #[tokio::test]
+    async fn deployment_has_application_name() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        assert_deployment_metadata(&deployment, "test-app");
+    }
+
+    #[tokio::test]
+    async fn deployment_has_single_replica_by_default() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        assert_deployment_replicas(&deployment, 1);
+    }
+
+    #[tokio::test]
+    async fn deployment_selector_matches_application_name() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        assert_selector_match_label(&deployment, "test-app");
+    }
+
+    #[tokio::test]
+    async fn pod_has_standard_kubernetes_labels() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        assert_pod_labels(&deployment, "test-app");
+    }
+
+    // ===== CONTAINER CONFIGURATION TESTS =====
+
+    #[tokio::test]
+    async fn container_has_correct_name_and_image() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        let image_url = "registry.example.com/test/test-app:1.0.0";
+        build_helm_chart_for_test(&app, image_url).await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        let container = get_container(&deployment);
+
+        assert_container_metadata(&container, "test-app", image_url, "IfNotPresent");
+    }
+
+    #[tokio::test]
+    async fn container_exposes_all_application_ports() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        let container = get_container(&deployment);
+
+        assert_container_ports_count(&container, 3);
+
+        let ports = container.ports.unwrap();
+        assert_container_port(&ports[0], "http", "TCP", 8080);
+        assert_container_port(&ports[1], "metrics", "TCP", 9000);
+        assert_container_port(&ports[2], "grpc", "TCP", 50051);
+    }
+
+    #[tokio::test]
+    async fn container_has_all_environment_variables() {
+        let temp_dir = tempdir().expect("Failed to create temp directory");
+        let app = BackendAppTestBuilder::new()
+            .with_name("test-app")
+            .with_standard_ports()
+            .with_standard_env_vars()
+            .build(temp_dir.path().to_path_buf());
+
+        build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
+
+        let deployment = read_deployment_yaml(&app.project_root, "test-app");
+        let container = get_container(&deployment);
+
+        assert_container_env_vars_count(&container, 2);
+
+        let env_vars = container.env.unwrap();
+        assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1");
+        assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2");
+    }
+
+    // ===== BUILD COMMAND UNIT TESTS =====
+
     #[test]
-    fn test_build_command_creation() {
+    fn build_command_creation_sets_program_and_args() {
         let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
         assert_eq!(cmd.program, "docker");
         assert_eq!(cmd.args, vec!["build", "-t", "myimage"]);
     }
 
     #[test]
-    fn test_build_command_clone() {
+    fn build_command_clone_copies_all_fields() {
         let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]);
         let cmd2 = cmd1.clone();
         assert_eq!(cmd1.program, cmd2.program);
         assert_eq!(cmd1.args, cmd2.args);
     }
 }
-
diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs
index c01ebaba..8d074271 100644
--- a/harmony/src/modules/application/config.rs
+++ b/harmony/src/modules/application/config.rs
@@ -15,6 +15,13 @@ impl NetworkProtocol {
     }
 }
 
+
+impl std::fmt::Display for NetworkProtocol {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub struct ApplicationNetworkPort {
     pub number: u16,
diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs
index fd14d1e7..6b73b087 100644
--- a/harmony/src/modules/application/helm/mod.rs
+++ b/harmony/src/modules/application/helm/mod.rs
@@ -1,12 +1,16 @@
-use k8s_openapi::api::{
+// Re-export common Kubernetes types for convenience
+pub use k8s_openapi::api::{
     apps::v1::{Deployment, DeploymentSpec},
     core::v1::{
-        Container, ContainerPort, EnvVar, PodSpec,
-        PodTemplateSpec, Service as K8sService, ServicePort, ServiceSpec,
+        Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
+        ServicePort, ServiceSpec,
     },
 };
+use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
 use kube::core::ObjectMeta;
-use serde::Serialize;
+
+// Import domain types for the deployment builder
+use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
 use std::fs;
 use std::path::{Path, PathBuf};
 
@@ -124,7 +128,8 @@ impl HelmChart {
         // 3. Serialize and write all added resources (Deployment, Service, etc.)
         for resource in &self.resources {
             let filename = resource.filename();
-            let content = resource.serialize_to_yaml()
+            let content = resource
+                .serialize_to_yaml()
                 .map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
             fs::write(templates_dir.join(filename), content)?;
         }
@@ -133,7 +138,6 @@ impl HelmChart {
     }
 }
 
-
 use askama::Template;
 
 #[derive(Template)]
@@ -168,7 +172,12 @@ impl ServiceBuilder {
         self
     }
 
-    pub fn with_port(mut self, name: impl Into<String>, port: i32, protocol: impl Into<String>) -> Self {
+    pub fn with_port(
+        mut self,
+        name: impl Into<String>,
+        port: i32,
+        protocol: impl Into<String>,
+    ) -> Self {
         use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
         self.ports.push(ServicePort {
             name: Some(name.into()),
@@ -192,8 +201,14 @@ impl ServiceBuilder {
                 labels: Some(
                     [
                         ("app.kubernetes.io/name".to_string(), self.name.clone()),
-                        ("app.kubernetes.io/component".to_string(), "service".to_string()),
-                        ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()),
+                        (
+                            "app.kubernetes.io/component".to_string(),
+                            "service".to_string(),
+                        ),
+                        (
+                            "app.kubernetes.io/managed-by".to_string(),
+                            "harmony".to_string(),
+                        ),
                     ]
                     .into(),
                 ),
@@ -201,8 +216,14 @@ impl ServiceBuilder {
             },
             spec: Some(ServiceSpec {
                 type_: Some(self.service_type),
-                selector: Some([("app.kubernetes.io/name".to_string(), self.selector_label)].into()),
-                ports: if self.ports.is_empty() { None } else { Some(self.ports) },
+                selector: Some(
+                    [("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
+                ),
+                ports: if self.ports.is_empty() {
+                    None
+                } else {
+                    Some(self.ports)
+                },
                 ..Default::default()
             }),
             ..Default::default()
@@ -221,13 +242,53 @@ pub struct DeploymentBuilder {
 }
 
 impl DeploymentBuilder {
+    /// Create a new DeploymentBuilder with minimal required fields.
     pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
+        Self::with_options(name, image, None, None, None)
+    }
+
+    /// Create a new DeploymentBuilder with optional initial configuration.
+    ///
+    /// Arguments:
+    /// - `name`: The deployment name
+    /// - `image`: The container image to use
+    /// - `ports`: Optional vector of initial application network ports
+    /// - `env_vars`: Optional vector of initial environment variable key-value pairs
+    /// - `replicas`: Optional number of replicas (defaults to 1)
+    pub fn with_options(
+        name: impl Into<String>,
+        image: impl Into<String>,
+        ports: Option<Vec<ApplicationNetworkPort>>,
+        env_vars: Option<Vec<(String, String)>>,
+        replicas: Option<i32>,
+    ) -> Self {
+        let container_ports: Vec<ContainerPort> = ports
+            .unwrap_or_default()
+            .into_iter()
+            .map(|port| ContainerPort {
+                container_port: port.number as i32,
+                name: Some(port.name),
+                protocol: Some(port.protocol.to_string()),
+                ..Default::default()
+            })
+            .collect();
+
+        let k8s_env_vars: Vec<EnvVar> = env_vars
+            .unwrap_or_default()
+            .into_iter()
+            .map(|(key, value)| EnvVar {
+                name: key,
+                value: Some(value),
+                ..Default::default()
+            })
+            .collect();
+
         Self {
             name: name.into(),
             image: image.into(),
-            replicas: 1,
-            container_ports: Vec::new(),
-            env_vars: Vec::new(),
+            replicas: replicas.unwrap_or(1),
+            container_ports,
+            env_vars: k8s_env_vars,
             image_pull_policy: Some("IfNotPresent".to_string()),
         }
     }
@@ -237,7 +298,12 @@ impl DeploymentBuilder {
         self
     }
 
-    pub fn with_container_port(mut self, number: i32, name: impl Into<String>, protocol: impl Into<String>) -> Self {
+    pub fn with_container_port(
+        mut self,
+        number: i32,
+        name: impl Into<String>,
+        protocol: impl Into<String>,
+    ) -> Self {
         self.container_ports.push(ContainerPort {
             container_port: number,
             name: Some(name.into()),
@@ -269,8 +335,14 @@ impl DeploymentBuilder {
                 labels: Some(
                     [
                         ("app.kubernetes.io/name".to_string(), name.clone()),
-                        ("app.kubernetes.io/component".to_string(), "deployment".to_string()),
-                        ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()),
+                        (
+                            "app.kubernetes.io/component".to_string(),
+                            "deployment".to_string(),
+                        ),
+                        (
+                            "app.kubernetes.io/managed-by".to_string(),
+                            "harmony".to_string(),
+                        ),
                         ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
                     ]
                     .into(),
@@ -280,7 +352,9 @@ impl DeploymentBuilder {
             spec: Some(DeploymentSpec {
                 replicas: Some(self.replicas),
                 selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
-                    match_labels: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()),
+                    match_labels: Some(
+                        [("app.kubernetes.io/name".to_string(), name.clone())].into(),
+                    ),
                     ..Default::default()
                 },
                 template: PodTemplateSpec {
@@ -304,7 +378,11 @@ impl DeploymentBuilder {
                             } else {
                                 Some(self.container_ports)
                             },
-                            env: if self.env_vars.is_empty() { None } else { Some(self.env_vars) },
+                            env: if self.env_vars.is_empty() {
+                                None
+                            } else {
+                                Some(self.env_vars)
+                            },
                             ..Default::default()
                         }],
                         ..Default::default()
@@ -321,21 +399,19 @@ impl DeploymentBuilder {
 /// Returns `None` if no ports are provided.
 pub fn create_service_from_ports(
     name: String,
-    network_ports: &[(String, u16, String)], // (name, number, protocol)
+    network_ports: &[ApplicationNetworkPort],
 ) -> Option<K8sService> {
-    use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
-
     if network_ports.is_empty() {
         return None;
     }
 
     let ports: Vec<ServicePort> = network_ports
-        .iter()
-        .map(|(port_name, number, protocol)| ServicePort {
-            name: Some(port_name.clone()),
-            protocol: Some(protocol.clone()),
-            port: *number as i32,
-            target_port: Some(IntOrString::Int(*number as i32)),
+        .into_iter()
+        .map(|port| ServicePort {
+            name: Some(port.name.clone()),
+            protocol: Some(port.protocol.to_string()),
+            port: port.number as i32,
+            target_port: Some(IntOrString::Int(port.number as i32)),
             ..Default::default()
         })
         .collect();
@@ -346,8 +422,14 @@ pub fn create_service_from_ports(
             labels: Some(
                 [
                     ("app.kubernetes.io/name".to_string(), name.clone()),
-                    ("app.kubernetes.io/component".to_string(), "service".to_string()),
-                    ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()),
+                    (
+                        "app.kubernetes.io/component".to_string(),
+                        "service".to_string(),
+                    ),
+                    (
+                        "app.kubernetes.io/managed-by".to_string(),
+                        "harmony".to_string(),
+                    ),
                 ]
                 .into(),
             ),
@@ -355,7 +437,7 @@ pub fn create_service_from_ports(
         },
         spec: Some(ServiceSpec {
             type_: Some("ClusterIP".to_string()),
-            selector: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()),
+            selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
             ports: Some(ports),
             ..Default::default()
         }),
diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs
index 82fdd15a..84424cd4 100644
--- a/harmony_agent/deploy/src/main.rs
+++ b/harmony_agent/deploy/src/main.rs
@@ -18,8 +18,11 @@ use std::{path::PathBuf, sync::Arc};
 async fn main() {
     let application = Arc::new(BackendApp {
         name: "harmony-agent".to_string(),
-        // This means the script will be run from the harmony_agent directory, not from the
-        // deploy directory
+        // Since harmony_agent is part of the harmony workspace, the actual "project root" 
+        // is not harmony_agent folder but the workspace root.
+        //
+        // So using ../ here means we MUST run this deployment script from the harmony_agent
+        // folder
         project_root: PathBuf::from("../"),
         network_ports: vec![],
         env_vars: vec![],
-- 
2.39.5


From 0cff1e0f6608b1c7d33de7271014510047ad10df Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Fri, 30 Jan 2026 06:58:03 -0500
Subject: [PATCH 07/19] feat: Harmony agent new algorithm based on heartbeat
 counters basics. Old code will need to be refactored completely

---
 harmony_agent/Cargo.toml         |   1 +
 harmony_agent/deploy/src/main.rs |   9 +-
 harmony_agent/src/agent.rs       |  89 +++++++++++----
 harmony_agent/src/config.rs      |  72 ++++++++++--
 harmony_agent/src/main.rs        | 187 ++++++++++++++++++++++++++++++-
 5 files changed, 321 insertions(+), 37 deletions(-)

diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml
index 360e26e4..eee7bf11 100644
--- a/harmony_agent/Cargo.toml
+++ b/harmony_agent/Cargo.toml
@@ -20,3 +20,4 @@ async-trait = "0.1"
 
 serde.workspace = true
 serde_json.workspace = true
+getrandom = "0.3.4"
diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs
index 84424cd4..8baab66b 100644
--- a/harmony_agent/deploy/src/main.rs
+++ b/harmony_agent/deploy/src/main.rs
@@ -18,14 +18,19 @@ use std::{path::PathBuf, sync::Arc};
 async fn main() {
     let application = Arc::new(BackendApp {
         name: "harmony-agent".to_string(),
-        // Since harmony_agent is part of the harmony workspace, the actual "project root" 
+        // Since harmony_agent is part of the harmony workspace, the actual "project root"
         // is not harmony_agent folder but the workspace root.
         //
         // So using ../ here means we MUST run this deployment script from the harmony_agent
         // folder
         project_root: PathBuf::from("../"),
         network_ports: vec![],
-        env_vars: vec![],
+        env_vars: vec![
+            ("NATS_URL".to_string(), "nats://nats".to_string()),
+            ("DESIRED_PRIMARY".to_string(), "site-1".to_string()),
+            ("MY_CLUSTER_ID".to_string(), "site-1".to_string()),
+            ("NATS_CREDS_PATH".to_string(), "".to_string()),
+        ],
         build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
         dockerfile: Some(PathBuf::from("Dockerfile")),
     });
diff --git a/harmony_agent/src/agent.rs b/harmony_agent/src/agent.rs
index eafc83e2..14384107 100644
--- a/harmony_agent/src/agent.rs
+++ b/harmony_agent/src/agent.rs
@@ -1,20 +1,29 @@
+use async_nats::jetstream::kv::Store;
 use async_trait::async_trait;
-use log::{debug, error, info};
+use harmony_types::id::Id;
+use log::{debug, error, info, trace};
 use serde::{Deserialize, Serialize};
 use std::time::{SystemTime, UNIX_EPOCH};
-use harmony_types::id::Id;
-use async_nats::jetstream::kv::Store;
 
 use crate::config::AgentConfig;
 
 #[async_trait]
 pub trait HealthStore: Send + Sync {
-    async fn put(&self, key: String, value: Vec<u8>) -> Result<u64, Box<dyn std::error::Error + Send + Sync>>;
+    async fn put(
+        &self,
+        key: String,
+        value: Vec<u8>,
+    ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>>;
 }
 
 #[async_trait]
 impl HealthStore for Store {
-    async fn put(&self, key: String, value: Vec<u8>) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
+    async fn put(
+        &self,
+        key: String,
+        value: Vec<u8>,
+    ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
+        trace!("HealthStore::put key={} value_len={}", key, value.len());
         self.put(key, value.into())
             .await
             .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
@@ -30,31 +39,49 @@ pub struct AgentHeartbeat {
 
 pub struct HarmonyAgent {
     config: AgentConfig,
-    #[allow(dead_code)]
     nats_client: Option<async_nats::Client>,
     health_kv: Box<dyn HealthStore>,
 }
 
-
 impl HarmonyAgent {
     pub async fn new(config: AgentConfig) -> Result<Self, Box<dyn std::error::Error>> {
+        info!("Initializing HarmonyAgent");
+        info!("  nats_url: {}", config.nats_url);
+        info!("  my_cluster_id: {}", config.my_cluster_id);
+        info!("  desired_primary: {}", config.desired_primary);
+        info!("  heartbeat_interval: {:?}", config.heartbeat_interval);
+        info!("  nats_creds_path: {:?}", config.nats_creds_path);
+        debug!("Full Bootstrap configuration:\n{config:#?}");
+
         let mut options = async_nats::ConnectOptions::new();
-        if let Some(ref creds) = config.nats_creds_path {
+        if let Some(creds) = &config.nats_creds_path {
+            debug!("Loading NATS credentials from file: {}", creds);
             options = options.credentials_file(creds).await?;
         }
 
+        debug!("Connecting to nats");
         let client = async_nats::connect_with_options(&config.nats_url, options).await?;
+        info!("Successfully connected to NATS at {}", config.nats_url);
         let jetstream = async_nats::jetstream::new(client.clone());
 
         // Initialize KV Buckets as per ADR-017
         const HEARTBEAT_KV_HISTORY_SIZE: i64 = 64;
+        debug!("Creating health KV bucket: harmony_agent_health");
         let health_kv = jetstream
             .create_key_value(async_nats::jetstream::kv::Config {
                 bucket: "harmony_agent_health".to_string(),
                 history: HEARTBEAT_KV_HISTORY_SIZE,
                 ..Default::default()
             })
-            .await?;
+            .await
+            .map_err(|e| {
+                error!(
+                    "Failed to initialize health KV bucket 'harmony_agent_health': {}",
+                    e
+                );
+                e
+            })?;
+        info!("Successfully initialized health KV bucket: harmony_agent_health");
 
         Ok(Self {
             config,
@@ -63,18 +90,25 @@ impl HarmonyAgent {
         })
     }
 
-
     pub async fn run_heartbeat_loop(&self) -> Result<(), Box<dyn std::error::Error>> {
         let mut interval = tokio::time::interval(self.config.heartbeat_interval);
         let key = format!("heartbeat.{}", self.config.my_cluster_id);
 
-        info!("Starting heartbeat loop for cluster: {}", self.config.my_cluster_id);
+        info!(
+            "Starting heartbeat loop for cluster: {}",
+            self.config.my_cluster_id
+        );
 
         loop {
             interval.tick().await;
+            trace!("Heartbeat loop tick");
 
             let now = SystemTime::now()
-                .duration_since(UNIX_EPOCH)?
+                .duration_since(UNIX_EPOCH)
+                .map_err(|e| {
+                    error!("Failed to get system time for heartbeat: {}", e);
+                    e
+                })?
                 .as_millis() as u64;
 
             let heartbeat = AgentHeartbeat {
@@ -83,20 +117,28 @@ impl HarmonyAgent {
                 timestamp: now,
             };
 
-            debug!("Sending heartbeat for cluster: {}", self.config.my_cluster_id);
+            debug!(
+                "Sending heartbeat for cluster: {}",
+                self.config.my_cluster_id
+            );
             let payload = serde_json::to_vec(&heartbeat)?;
 
             // Write heartbeat to KV. ADR-017: Write failure triggers self-demotion logic
             match self.health_kv.put(key.clone(), payload).await {
                 Ok(_) => {
-                    debug!("Heartbeat successful for cluster: {}", self.config.my_cluster_id);
+                    debug!(
+                        "Heartbeat successful for cluster: {}",
+                        self.config.my_cluster_id
+                    );
                 }
                 Err(e) => {
-                    error!("Failed to write heartbeat: {}. Fencing logic would trigger here.", e);
+                    error!(
+                        "Failed to write heartbeat: {}. Fencing logic would trigger here.",
+                        e
+                    );
                     // In a real implementation, we would trigger self-demotion/fencing here
                 }
             }
-
         }
     }
 }
@@ -105,7 +147,7 @@ impl HarmonyAgent {
 mod tests {
     use super::*;
     use std::sync::{Arc, Mutex};
-    use tokio::time::{pause, advance, Duration};
+    use tokio::time::{Duration, advance};
 
     struct MockHealthStore {
         puts: Arc<Mutex<Vec<(String, Vec<u8>)>>>,
@@ -113,7 +155,11 @@ mod tests {
 
     #[async_trait]
     impl HealthStore for MockHealthStore {
-        async fn put(&self, key: String, value: Vec<u8>) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
+        async fn put(
+            &self,
+            key: String,
+            value: Vec<u8>,
+        ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
             self.puts.lock().unwrap().push((key, value));
             Ok(0)
         }
@@ -150,7 +196,11 @@ mod tests {
         }
 
         let recorded_puts = puts.lock().unwrap();
-        assert!(recorded_puts.len() >= 2, "Should have recorded at least 2 heartbeats, got {}", recorded_puts.len());
+        assert!(
+            recorded_puts.len() >= 2,
+            "Should have recorded at least 2 heartbeats, got {}",
+            recorded_puts.len()
+        );
 
         let (key, payload) = &recorded_puts[0];
         assert_eq!(key, "heartbeat.test-cluster");
@@ -162,4 +212,3 @@ mod tests {
         handle.abort();
     }
 }
-
diff --git a/harmony_agent/src/config.rs b/harmony_agent/src/config.rs
index cf5fe128..394a774d 100644
--- a/harmony_agent/src/config.rs
+++ b/harmony_agent/src/config.rs
@@ -1,6 +1,8 @@
-use std::env;
-use std::time::Duration;
 use harmony_types::id::Id;
+use log::debug;
+use std::env;
+use std::path::Path;
+use std::time::Duration;
 
 /// Configuration for the Harmony Agent
 #[derive(Debug, Clone)]
@@ -12,18 +14,70 @@ pub struct AgentConfig {
     pub heartbeat_interval: Duration,
 }
 
+pub const NATS_URL: &str = "NATS_URL";
+pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
+pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
+pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
+
 impl AgentConfig {
     pub fn load_from_env() -> Result<Self, String> {
-        let nats_url = env::var("NATS_URL")
-            .unwrap_or_else(|_| "nats://localhost:4222".to_string());
+        let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
 
-        let nats_creds_path = env::var("NATS_CREDS_PATH").ok();
+        // Validate NATS URL is not empty
+        if nats_url.is_empty() {
+            return Err(format!("{NATS_URL} cannot be empty"));
+        }
 
-        let my_cluster_id_str = env::var("MY_CLUSTER_ID")
-            .map_err(|_| "Environment variable MY_CLUSTER_ID is required".to_string())?;
+        // Validate NATS URL format
+        if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
+            return Err(format!(
+                "Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
+                nats_url
+            ));
+        }
 
-        let desired_primary_str = env::var("DESIRED_PRIMARY")
-            .map_err(|_| "Environment variable DESIRED_PRIMARY is required".to_string())?;
+        let nats_creds_path = env::var(NATS_CREDS_PATH)
+            .ok()
+            .filter(|creds_path| !creds_path.is_empty());
+
+        // Validate NATS creds path if provided
+        if let Some(creds_path) = &nats_creds_path {
+            debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
+            let path = Path::new(creds_path);
+            if !path.exists() {
+                return Err(format!(
+                    "NATS credentials file does not exist: {}",
+                    creds_path
+                ));
+            }
+            if !path.is_file() {
+                return Err(format!(
+                    "NATS credentials path is not a file: {}",
+                    creds_path
+                ));
+            }
+            // Check if file is readable by attempting to read metadata
+            if std::fs::metadata(path).is_err() {
+                return Err(format!(
+                    "NATS credentials file is not readable: {}",
+                    creds_path
+                ));
+            }
+        }
+
+        let my_cluster_id_str = env::var(MY_CLUSTER_ID)
+            .map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?;
+
+        if my_cluster_id_str.is_empty() {
+            return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
+        }
+
+        let desired_primary_str = env::var(DESIRED_PRIMARY)
+            .map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?;
+
+        if desired_primary_str.is_empty() {
+            return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
+        }
 
         Ok(Self {
             nats_url,
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
index a67e5b99..3664b65c 100644
--- a/harmony_agent/src/main.rs
+++ b/harmony_agent/src/main.rs
@@ -1,24 +1,199 @@
-use crate::{agent::HarmonyAgent, config::AgentConfig};
+use std::{str::FromStr, time::Duration};
+
+use harmony_types::id::Id;
+use log::{debug, info};
+use tokio::time::Instant;
+
+// use crate::{agent::HarmonyAgent, config::AgentConfig};
 
 mod agent;
 mod config;
 
+// #[tokio::main]
+// async fn main() -> Result<(), Box<dyn std::error::Error>> {
+//     env_logger::init();
+//
+//     let config = AgentConfig::load_from_env()?;
+//
+//     log::info!("Harmony Agent Initialized");
+//     log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id);
+//     log::debug!("NATS URL                : {}", config.nats_url);
+//
+//     let agent = HarmonyAgent::new(config).await?;
+//
+//     // Run the heartbeat loop
+//     agent.run_heartbeat_loop().await?;
+//
+//     Ok(())
+// }
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
     env_logger::init();
 
-    let config = AgentConfig::load_from_env()?;
+    let my_agent_id = Id::from_str("agent_1").unwrap();
+
+    let config = AgentConfig {
+        success_threshold: 5,
+        failure_threshold: 10,
+        heartbeat_interval: Duration::from_secs(1),
+        deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
+            desired_primary_agent: my_agent_id,
+            cnpg_cluster_name: String::from("cnpg_cluster_name"),
+        }),
+        nats_url: String::new(),
+        nats_creds_path: None,
+        agent_id: Id::empty(),
+    };
 
     log::info!("Harmony Agent Initialized");
-    log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id);
+    log::debug!("Identity (My Cluster ID): {}", config.agent_id);
     log::debug!("NATS URL                : {}", config.nats_url);
 
-    let agent = HarmonyAgent::new(config).await?;
-    
+    let agent = HarmonyAgent { config };
+
     // Run the heartbeat loop
-    agent.run_heartbeat_loop().await?;
+    agent.run_heartbeat_loop().await;
 
     Ok(())
 }
 
+#[derive(Debug, Clone)]
+pub struct AgentConfig {
+    /// Number of consecutive successful heartbeats required before the service transitions from
+    /// failed to healthy.
+    pub success_threshold: usize,
+    /// Number of consecutive failed heartbeats required before the service transitions from
+    /// healthy to failed.
+    pub failure_threshold: usize,
+    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
+    /// considered failed.
+    pub heartbeat_interval: Duration,
+    /// **UNSTABLE FIELD**
+    ///
+    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
+    /// footprint is low, but managing multiple deployments in a single instance would be a
+    /// significant resource usage reduction.
+    ///
+    /// Decoupling the deployment of the agent with the application's deployment could make things
+    /// more complicated though, where we would have to be careful about version compatibility
+    /// between all components managed by the agent instance. So for now it is a 1-1 map.
+    ///
+    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
+    /// reader.
+    pub deployment_config_unstable: DeploymentConfig,
+    pub nats_url: String,
+    pub nats_creds_path: Option<String>,
+    pub agent_id: Id,
+}
+
+#[derive(Debug, Clone)]
+pub enum DeploymentConfig {
+    FailoverPostgreSQL(FailoverCNPGConfig),
+}
+
+#[derive(Debug, Clone)]
+pub struct FailoverCNPGConfig {
+    pub desired_primary_agent: Id,
+    pub cnpg_cluster_name: String,
+}
+
+impl DeploymentConfig {
+    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
+    pub async fn perform_health_check(&self) -> Result<(), HeartbeatFailure> {
+        match self {
+            DeploymentConfig::FailoverPostgreSQL(cfg) => {
+                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
+                // TODO: Implement actual PG check / NATS write here
+                Ok(())
+            }
+        }
+    }
+
+    /// Callback: Transitioned from Unhealthy -> Healthy
+    pub async fn on_active(&self) {
+        info!("Service is now ACTIVE (Healthy)");
+        // e.g., Remove fencing lock
+    }
+
+    /// Callback: Transitioned from Healthy -> Unhealthy
+    pub async fn on_failover(&self) {
+        info!("Service is now FAILED (Unhealthy)");
+        // e.g., Initiate self-fencing, stop accepting traffic
+    }
+}
+
+pub struct HarmonyAgent {
+    pub config: AgentConfig,
+}
+
+impl HarmonyAgent {
+    pub async fn run_heartbeat_loop(&self) {
+        let mut consecutive_successes = 0;
+        let mut consecutive_failures = 0;
+        let mut is_healthy = false;
+        let mut next_heartbeat_start;
+        loop {
+            let this_heartbeat_start = Instant::now();
+            next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
+
+            // Perform the check via the config/strategy with a timeout
+            let result = tokio::time::timeout(self.config.heartbeat_interval, async {
+                // simulate variable latency for the health check
+                tokio::time::sleep(Duration::from_millis(getrandom::u64().unwrap() % 2000)).await;
+                self.config
+                    .deployment_config_unstable
+                    .perform_health_check()
+                    .await
+            })
+            .await;
+
+            // Update Counters & Handle State Transitions
+            // Timeout is also treated as a failure
+            let heartbeat_result = match result {
+                Ok(inner_result) => inner_result,
+                Err(_) => Err(HeartbeatFailure {}),
+            };
+
+            match heartbeat_result {
+                Ok(_) => {
+                    consecutive_failures = 0;
+                    consecutive_successes += 1;
+
+                    if !is_healthy && consecutive_successes >= self.config.success_threshold {
+                        info!("Success threshold reached. Marking as Healthy.");
+                        is_healthy = true;
+                        self.config.deployment_config_unstable.on_active().await;
+                    }
+                }
+                Err(_) => {
+                    consecutive_successes = 0;
+                    consecutive_failures += 1;
+
+                    if is_healthy && consecutive_failures >= self.config.failure_threshold {
+                        log::warn!("Failure threshold reached. Marking as Unhealthy.");
+                        is_healthy = false;
+                        self.config.deployment_config_unstable.on_failover().await;
+                    }
+                }
+            }
+
+            info!(
+                "Heartbeat : success={} healthy={}, successes={}, fails={} took={}ms",
+                if heartbeat_result.is_ok() { "✅" } else { "❌" },
+                is_healthy,
+                consecutive_successes,
+                consecutive_failures,
+                (Instant::now() - this_heartbeat_start).as_millis()
+            );
+            debug!(
+                "Sleeping for {} ms before next heartbeat",
+                (next_heartbeat_start - Instant::now()).as_millis()
+            );
+            tokio::time::sleep_until(next_heartbeat_start).await;
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct HeartbeatFailure {}
-- 
2.39.5


From 50aa545bd97f0c0b77f3ff12d9f78b294b4956d7 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Sun, 1 Feb 2026 20:54:11 -0500
Subject: [PATCH 08/19] wip(harmony_agent): It compiles, contains most if not
 all of the required skeleton, now time to review it carefully, complete a few
 details and battle test it

---
 harmony_agent/Cargo.toml              |   2 +
 harmony_agent/src/agent_loop.rs       | 404 +++++++++++++++++++++++
 harmony_agent/src/main.rs             | 444 +++++++++++++++-----------
 harmony_agent/src/store/chaos.rs      | 123 +++++++
 harmony_agent/src/store/memory.rs     | 184 +++++++++++
 harmony_agent/src/store/mod.rs        | 117 +++++++
 harmony_agent/src/store/nats.rs       | 135 ++++++++
 harmony_agent/src/workflow/mod.rs     |  42 +++
 harmony_agent/src/workflow/primary.rs | 165 ++++++++++
 harmony_agent/src/workflow/replica.rs | 279 ++++++++++++++++
 10 files changed, 1703 insertions(+), 192 deletions(-)
 create mode 100644 harmony_agent/src/agent_loop.rs
 create mode 100644 harmony_agent/src/store/chaos.rs
 create mode 100644 harmony_agent/src/store/memory.rs
 create mode 100644 harmony_agent/src/store/mod.rs
 create mode 100644 harmony_agent/src/store/nats.rs
 create mode 100644 harmony_agent/src/workflow/mod.rs
 create mode 100644 harmony_agent/src/workflow/primary.rs
 create mode 100644 harmony_agent/src/workflow/replica.rs

diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml
index eee7bf11..22a373ca 100644
--- a/harmony_agent/Cargo.toml
+++ b/harmony_agent/Cargo.toml
@@ -21,3 +21,5 @@ async-trait = "0.1"
 serde.workspace = true
 serde_json.workspace = true
 getrandom = "0.3.4"
+
+thiserror.workspace = true
diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs
new file mode 100644
index 00000000..2b92b851
--- /dev/null
+++ b/harmony_agent/src/agent_loop.rs
@@ -0,0 +1,404 @@
+use std::{str::FromStr, sync::Arc, time::Duration};
+
+use harmony_types::id::Id;
+use log::{debug, info, trace};
+use serde::{Deserialize, Serialize};
+use tokio::sync::RwLock;
+use tokio::time::Instant;
+
+use crate::store::{KvStore, KvStoreError};
+use crate::workflow::HeartbeatWorkflow;
+use crate::workflow::primary::PrimaryWorkflow;
+use crate::workflow::replica::ReplicaWorkflow;
+
+/// The role of this agent instance
+#[derive(Debug, Clone, PartialEq)]
+pub enum AgentRole {
+    Primary,
+    Replica,
+}
+
+pub async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    env_logger::init();
+
+    let my_agent_id = Id::from_str("agent_1").unwrap();
+
+    let config = AgentConfig {
+        success_threshold: 2,
+        failure_threshold: 2,
+        heartbeat_interval: Duration::from_secs(1),
+        failover_timeout: Duration::from_secs(5),
+        deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
+            desired_primary_agent: my_agent_id.clone(),
+            cnpg_cluster_name: String::from("cnpg_cluster_name"),
+        }),
+        nats_url: String::new(),
+        nats_creds_path: None,
+        agent_id: my_agent_id,
+        role: AgentRole::Replica,
+        cluster_id: "cluster_test_id".into(),
+        desired_primary_id: "primary_id".into(),
+    };
+
+    log::info!("Harmony Agent Initialized");
+    log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
+    log::info!("Full config : {:?}", config);
+
+    // TODO load store based on config, default to nats
+    // probably a good use case for a factory pattern
+    use crate::store::ChaosKvStore;
+    use crate::store::InMemoryKvStore;
+    let health_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 1000);
+    let cluster_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 2000);
+
+    let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);
+
+    // Run the heartbeat loop
+    agent.run_heartbeat_loop().await;
+
+    Ok(())
+}
+
+#[derive(Debug, Clone)]
+pub struct AgentConfig {
+    /// Number of consecutive successful heartbeats required before the service transitions from
+    /// failed to healthy.
+    pub success_threshold: usize,
+    /// Number of consecutive failed heartbeats required before the service transitions from
+    /// healthy to failed.
+    pub failure_threshold: usize,
+    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
+    /// considered failed.
+    pub heartbeat_interval: Duration,
+    /// Time since last observed primary heartbeat before replica considers primary stale.
+    /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
+    /// to avoid split brain during network partitions.
+    pub failover_timeout: Duration,
+    /// **UNSTABLE FIELD**
+    ///
+    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
+    /// footprint is low, but managing multiple deployments in a single instance would be a
+    /// significant resource usage reduction.
+    ///
+    /// Decoupling the deployment of the agent with the application's deployment could make things
+    /// more complicated though, where we would have to be careful about version compatibility
+    /// between all components managed by the agent instance. So for now it is a 1-1 map.
+    ///
+    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
+    /// reader.
+    pub deployment_config_unstable: DeploymentConfig,
+    pub nats_url: String,
+    pub nats_creds_path: Option<String>,
+    pub agent_id: Id,
+    pub cluster_id: Id,
+    pub desired_primary_id: Id,
+    /// The role this agent plays (Primary or Replica)
+    pub role: AgentRole,
+}
+
+#[derive(Debug, Clone)]
+pub enum DeploymentConfig {
+    FailoverPostgreSQL(FailoverCNPGConfig),
+}
+
+#[derive(Debug, Clone)]
+pub struct FailoverCNPGConfig {
+    pub desired_primary_agent: Id,
+    pub cnpg_cluster_name: String,
+}
+
+impl DeploymentConfig {
+    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
+    pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
+        match self {
+            DeploymentConfig::FailoverPostgreSQL(cfg) => {
+                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
+                // TODO: Implement actual PG check / NATS write here
+                Ok(())
+            }
+        }
+    }
+
+    /// Callback: Transitioned from Unhealthy -> Healthy
+    pub async fn on_active(&self) {
+        info!("Service is now ACTIVE (Healthy)");
+        // e.g., Remove fencing lock
+    }
+
+    /// Callback: Transitioned from Healthy -> Unhealthy
+    pub async fn on_failover(&self) {
+        info!("Service is now FAILED (Unhealthy)");
+        // e.g., Initiate self-fencing, stop accepting traffic
+    }
+}
+
+/// Agent-provided heartbeat information (no timestamps - those come from the store)
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentInfo {
+    pub agent_id: Id,
+    pub cluster_id: Id,
+    pub status: String,
+}
+
+/// Store-provided metadata for a heartbeat
+/// This is returned by the KV store and includes timing/ordering guarantees
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct HeartbeatMetadata {
+    /// Timestamp set by the store (e.g., NATS JetStream)
+    /// This avoids clock skew between agents
+    pub timestamp: u64,
+    /// Sequence number for strict ordering (e.g., JetStream sequence)
+    pub sequence: u64,
+}
+
+/// Complete heartbeat with both agent data and store metadata
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentHeartbeat {
+    pub agent_info: AgentInfo,
+    pub metadata: Option<HeartbeatMetadata>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ClusterStateData {
+    pub cluster_id: Id,
+    pub current_primary: Option<Id>,
+    pub desired_primary: Id,
+    pub timestamp: u64,
+}
+
+pub struct HarmonyAgent<S: KvStore> {
+    pub config: AgentConfig,
+    workflow: Box<dyn HeartbeatWorkflow>,
+    health_kv: S,
+    cluster_kv: S,
+    /// Last successful heartbeat, used to track sequence number for next write
+    /// This avoids doing a GET before every SET, reducing network round-trips
+    last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
+    /// Local copy of cluster state, updated via subscription
+    /// This allows workflows to make decisions without querying NATS each time
+    cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
+}
+
+impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
+    pub fn new(config: AgentConfig, health_kv: S, cluster_kv: S) -> Self {
+        let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
+            AgentRole::Primary => {
+                info!("Initializing agent as PRIMARY");
+                Box::new(PrimaryWorkflow::new(
+                    config.success_threshold,
+                    config.failure_threshold,
+                    config.deployment_config_unstable.clone(),
+                ))
+            }
+            AgentRole::Replica => {
+                info!("Initializing agent as REPLICA");
+// pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self
+                Box::new(ReplicaWorkflow::new(
+                    config.success_threshold,
+                    config.failure_threshold,
+                    config.cluster_id.clone(),
+                    config.desired_primary_id.clone(),
+                    config.agent_id.clone(),
+                    config.failover_timeout,
+                ))
+            }
+        };
+
+        Self {
+            config,
+            workflow,
+            health_kv,
+            cluster_kv,
+            last_heartbeat: Arc::new(RwLock::new(None)),
+            cluster_state: Arc::new(RwLock::new(None)),
+        }
+    }
+
+    /// Reconcile startup state by fetching cluster state from the store
+    /// This allows the workflow to determine if it should resume as Primary/Replica
+    /// based on the persisted cluster state
+    pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
+        let cluster_key = format!("cluster.{}", self.config.cluster_id);
+        
+        debug!("Fetching cluster state for startup reconciliation from key: {}", cluster_key);
+        
+        let cluster_state_option = match self.cluster_kv.get(cluster_key.clone()).await {
+            Ok(result) => {
+                if let Some(value) = result.value {
+                    match serde_json::from_value::<ClusterStateData>(value) {
+                        Ok(data) => Some(data),
+                        Err(e) => {
+                            log::warn!("Failed to deserialize cluster state: {}", e);
+                            None
+                        }
+                    }
+                } else {
+                    debug!("No cluster state found, this is a fresh cluster");
+                    None
+                }
+            }
+            Err(KvStoreError::KeyNotAvailable(_)) => {
+                debug!("Cluster state key not found, this is a fresh cluster");
+                None
+            }
+            Err(e) => {
+                log::warn!("Failed to fetch cluster state during startup: {}", e);
+                return Err(e);
+            }
+        };
+
+        let state_ref = cluster_state_option.as_ref();
+        self.workflow.on_startup(state_ref).await;
+
+        // Cache the cluster state locally
+        *self.cluster_state.write().await = cluster_state_option;
+        
+        Ok(())
+    }
+
+    /// Sends agent heartbeat to the KV store
+    ///
+    /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
+    /// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
+    /// comparisons use the store's clock, not agent clocks.
+    /// 
+    /// This method uses the last successful heartbeat's sequence number to avoid an extra
+    /// GET call before each SET, reducing network round-trips and latency exposure.
+    async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
+        let key = format!("heartbeat.{}", self.config.agent_id);
+
+        // Create agent info WITHOUT timestamp - the store will add metadata
+        // Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
+        let agent_info = AgentInfo {
+            agent_id: self.config.agent_id.clone(),
+            cluster_id: self.config.cluster_id.clone(),
+            status: self.workflow.state_name().to_string(),
+        };
+
+        debug!("Storing heartbeat for agent: {}", self.config.agent_id);
+        let value = serde_json::to_value(&agent_info)
+            .map_err(|e| KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: format!("{:?}", agent_info),
+            })?;
+
+        // Get expected sequence from last successful heartbeat (0 if first write)
+        let expected_sequence = {
+            let last = self.last_heartbeat.read().await;
+            last.as_ref()
+                .and_then(|hb| hb.metadata.as_ref())
+                .map(|m| m.sequence)
+                .unwrap_or(0)
+        };
+
+        // Write with strict ordering - single network round-trip
+        let new_seq = self.health_kv.set_strict(key, value, expected_sequence).await?;
+        
+        debug!("Heartbeat stored successfully with sequence: {}", new_seq);
+        
+        // Construct complete heartbeat with metadata from store
+        let heartbeat = AgentHeartbeat {
+            agent_info,
+            metadata: Some(HeartbeatMetadata {
+                timestamp: todo!("get the real timestamp from store"),
+                sequence: new_seq,
+            }),
+        };
+
+        // Cache this successful heartbeat for next iteration
+        *self.last_heartbeat.write().await = Some(heartbeat.clone());
+
+        Ok(heartbeat)
+    }
+
+    pub async fn run_heartbeat_loop(&mut self) {
+        let mut next_heartbeat_start;
+        loop {
+            let this_heartbeat_start = Instant::now();
+            next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
+
+            // Perform the check via the config/strategy with a timeout
+            let result = tokio::time::timeout(self.config.heartbeat_interval, async {
+                // Store heartbeat and perform deployment-specific health check
+                match &self.store_heartbeat().await {
+                    Ok(heartbeat) => {
+                        // Heartbeat stored successfully, already cached by store_heartbeat
+                        debug!("Heartbeat stored: seq={}", heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0));
+                        // Pass heartbeat with metadata to workflow for staleness checks
+                        self.workflow.on_heartbeat_stored(heartbeat).await;
+                    }
+                    Err(KvStoreError::SequenceMismatch { expected, current }) => {
+                        // CAS failure could indicate:
+                        // 1. Network latency: our previous timeout heartbeat actually succeeded
+                        // 2. Agent ID conflict: another agent with same ID exists
+                        // 3. Clock/bucket corruption (unlikely)
+                        log::warn!(
+                            "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
+                            self.config.agent_id, expected, current, current
+                        );
+                        // Update cached heartbeat sequence to prevent repeated failures
+                        if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
+                            if let Some(metadata) = hb.metadata.as_mut() {
+                                metadata.sequence = *current;
+                            }
+                        }
+                    }
+                    Err(e) => {
+                        // Actual storage failure - treat as heartbeat failure
+                        log::error!("Heartbeat storage error: {}", e);
+                        return Err(HeartbeatFailure {});
+                    }
+                }
+                self.config.deployment_config_unstable.perform_heartbeat().await?;
+
+                // TODO: Pass the heartbeat with metadata to the workflow for staleness checks
+                // The workflow needs access to metadata.timestamp for failover timeout calculations
+                Ok::<(), HeartbeatFailure>(())
+            })
+            .await;
+
+            // Update Counters & Handle State Transitions
+            // Timeout is also treated as a failure
+            let heartbeat_result = match result {
+                Ok(inner_result) => inner_result,
+                Err(_) => Err(HeartbeatFailure {}),
+            };
+
+            trace!("Got heartbeat_result : {heartbeat_result:?}");
+            match heartbeat_result {
+                Ok(_) => {
+                    self.workflow.handle_heartbeat_success();
+                }
+                Err(_) => {
+                    self.workflow.handle_heartbeat_failure();
+                }
+            }
+
+            info!(
+                "Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
+                success_threshold = self.config.success_threshold,
+                failure_threshold = self.config.failure_threshold,
+                state = self.workflow.state_name(),
+                consecutive_successes = self.workflow.consecutive_successes(),
+                consecutive_failures = self.workflow.consecutive_failures(),
+                heartbeat_emoji = if heartbeat_result.is_ok() {
+                    "✅"
+                } else {
+                    "❌"
+                },
+                heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
+            );
+            debug!(
+                "Sleeping for {} ms before next heartbeat",
+                (next_heartbeat_start - Instant::now()).as_millis()
+            );
+            tokio::time::sleep_until(next_heartbeat_start).await;
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct HeartbeatFailure {}
+
+/// Replica workflow module - handles replica-specific state machine
+mod replica {}
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
index 3664b65c..de88ecf5 100644
--- a/harmony_agent/src/main.rs
+++ b/harmony_agent/src/main.rs
@@ -1,199 +1,259 @@
-use std::{str::FromStr, time::Duration};
-
-use harmony_types::id::Id;
-use log::{debug, info};
-use tokio::time::Instant;
-
-// use crate::{agent::HarmonyAgent, config::AgentConfig};
-
-mod agent;
-mod config;
-
-// #[tokio::main]
-// async fn main() -> Result<(), Box<dyn std::error::Error>> {
-//     env_logger::init();
-//
-//     let config = AgentConfig::load_from_env()?;
-//
-//     log::info!("Harmony Agent Initialized");
-//     log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id);
-//     log::debug!("NATS URL                : {}", config.nats_url);
-//
-//     let agent = HarmonyAgent::new(config).await?;
-//
-//     // Run the heartbeat loop
-//     agent.run_heartbeat_loop().await?;
-//
-//     Ok(())
-// }
+// mod typestate_gemini;
+// mod typestate;
+mod agent_loop;
+mod workflow;
+pub mod store;
 
 #[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    env_logger::init();
-
-    let my_agent_id = Id::from_str("agent_1").unwrap();
-
-    let config = AgentConfig {
-        success_threshold: 5,
-        failure_threshold: 10,
-        heartbeat_interval: Duration::from_secs(1),
-        deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
-            desired_primary_agent: my_agent_id,
-            cnpg_cluster_name: String::from("cnpg_cluster_name"),
-        }),
-        nats_url: String::new(),
-        nats_creds_path: None,
-        agent_id: Id::empty(),
-    };
-
-    log::info!("Harmony Agent Initialized");
-    log::debug!("Identity (My Cluster ID): {}", config.agent_id);
-    log::debug!("NATS URL                : {}", config.nats_url);
-
-    let agent = HarmonyAgent { config };
-
-    // Run the heartbeat loop
-    agent.run_heartbeat_loop().await;
-
-    Ok(())
+async fn main() {
+    // typestate_gemini::main_typestate_gemini().await;
+    agent_loop::main().await;
 }
 
-#[derive(Debug, Clone)]
-pub struct AgentConfig {
-    /// Number of consecutive successful heartbeats required before the service transitions from
-    /// failed to healthy.
-    pub success_threshold: usize,
-    /// Number of consecutive failed heartbeats required before the service transitions from
-    /// healthy to failed.
-    pub failure_threshold: usize,
-    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
-    /// considered failed.
-    pub heartbeat_interval: Duration,
-    /// **UNSTABLE FIELD**
-    ///
-    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
-    /// footprint is low, but managing multiple deployments in a single instance would be a
-    /// significant resource usage reduction.
-    ///
-    /// Decoupling the deployment of the agent with the application's deployment could make things
-    /// more complicated though, where we would have to be careful about version compatibility
-    /// between all components managed by the agent instance. So for now it is a 1-1 map.
-    ///
-    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
-    /// reader.
-    pub deployment_config_unstable: DeploymentConfig,
-    pub nats_url: String,
-    pub nats_creds_path: Option<String>,
-    pub agent_id: Id,
-}
+// TODO
+//
+// DONE:
+// 1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
+// 2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
+// 3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
+// 4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
+// 5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
+// 6. ✅ failover_timeout added to AgentConfig
+// 7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
+// 8. ✅ startup reconciliation implemented via on_startup() method
+//
+// REMAINING:
+// - review all code and list implementation issues
+// - review both workflow for each state transition
+// - Complete replica workflow staleness detection (needs implementation in Watching state)
+// - Implement state recovery from Failed state for both workflows
+// - Implement subscribe in NATS store with watch() API
+// - Implement config validation for failover_timeout constraints
 
-#[derive(Debug, Clone)]
-pub enum DeploymentConfig {
-    FailoverPostgreSQL(FailoverCNPGConfig),
-}
+// TODO
+//
+// 1. store trait subscribe definition missing callback
+// 2. BUG, data integrity issue : nats store not actually using jetstream metadata 
+// 3. review all code and list implementation issues
+// 4. review both workflow for each state transition
+// 5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded
+// 6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself
+//    should hold it probably, every agent should be subscribed to the cluster_state object and
+//    keep it in memory to allow workflows to process against it efficiently)
 
-#[derive(Debug, Clone)]
-pub struct FailoverCNPGConfig {
-    pub desired_primary_agent: Id,
-    pub cnpg_cluster_name: String,
-}
+// ## CRITICAL - Data Integrity Issues
+// 
+// 1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
+//    - Currently uses `put()` which overwrites unconditionally
+//    - Must use `update()` with revision parameter for proper compare-and-set
+//    - Without this, concurrent promotion attempts can cause split brain
+// 
+// 2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
+//    - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
+//    - NATS Entry has `.revision` and `.created` fields that must be used
+//    - This defeats the entire purpose of store-provided timestamps
+// 
+// 3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
+//    - Line ~156: TODO comment confirms missing metadata passing
+//    - Replica cannot calculate staleness without metadata.timestamp
+//    - Failover logic is broken
+// 
+// 4. **No actual cluster state watching exists**
+//    - Replica workflow declares `ClusterState` but never updates it
+//    - No subscription to primary heartbeat or cluster_state key
+//    - Replica cannot detect primary liveness
+// 
+// ## HIGH - Missing Core Functionality
+// 
+// 5. **Replica Workflow incomplete** - All key logic is TODO:
+//    - Watching primary staleness (line 114)
+//    - Promotion attempt (line 118)
+//    - Original primary recovery detection (line 127)
+//    - Demotion/handshake (line 131)
+// 
+// 6. **Missing replica "Failed" state**
+//    - `ReplicaState` enum has no `Failed` variant
+//    - User's TODO #5 correctly identifies this gap
+//    - What happens if replica's own heartbeats fail repeatedly?
+// 
+// 7. **Primary Workflow incomplete** - Key logic missing:
+//    - No NATS check before recovering from `Fenced` state (line 95)
+//    - No NATS check in `Yielding` state for demotion handshake (line 101)
+//    - No actual fencing failure handling
+// 
+// 8. **Store `subscribe` not implemented** (`store/mod.rs`)
+//    - Returns `todo!()` in NATS implementation
+//    - No callback mechanism defined in trait
+//    - Without this, agents cannot react to state changes
+// 
+// 9. **Cluster state not tracked centrally**
+//    - User's TODO #6 correctly identifies this
+//    - Each agent should maintain a local copy of cluster_state
+//    - No subscription mechanism to update this local copy
+// 
+// 10. **No validation of configuration constraints**
+//     - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
+//     - Invalid config could cause split brain
+// 
+// ## MEDIUM - Incorrect State Transitions
+// 
+// 11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
+//     - Two state transitions happen in one heartbeat cycle
+//     - Should stay in `Failed` until fencing actually completes
+//     - What if fencing fails? State machine won't reflect it
+// 
+// 12. **No fencing failure handling**
+//     - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
+//     - ADR mentions escalating to radical measures, but no callback for failure
+// 
+// 13. **Replica `Watching` state does nothing**
+//     - Line 115: Just logs, checks nothing
+//     - Should be checking staleness of primary heartbeat
+// 
+// 14. **Demotion handshake not implemented**
+//     - ADR section 4 details this but code doesn't implement it
+//     - How does original primary know it should yield?
+// 
+// ## LOW - Observability & Reliability
+// 
+// 15. **No graceful shutdown mechanism**
+//     - `run_heartbeat_loop` runs forever
+//     - No signal handling (SIGTERM, SIGINT)
+// 
+// 16. **Async task errors silently ignored**
+//     - `tokio::spawn` at lines 74, 83, 123
+//     - No `JoinHandle` retention or error handling
+// 
+// 17. **No metrics/observability**
+//     - Only log output
+//     - No Prometheus metrics for state transitions, failure counts, etc.
+// 
+// 18. **Hardcoded main() function** (`agent_loop.rs::main`)
+//     - Not production-ready entry point
+//     - Should load config from environment or file
+// 
+// 19. **Store factory pattern missing**
+//     - TODO comment at line 54 confirms this
+//     - Can't switch between stores via config
+// 
+// 20. **No backoff/retry logic for NATS operations**
+//     - Transient failures could trigger unnecessary fencing
+// 
+// 21. **`AgentInfo` status is hardcoded to "HEALTHY"**
+//     - Line 137 in `store_heartbeat`
+//     - Should反映 actual workflow state
+// 
+// 22. **Unused fields in structs**
+//     - `HeartbeatState.last_seq` set but never read
+//     - `ClusterState.current_primary` set but never read
+// 
+// ## ADR-017-3 Compliance Issues
+// 
+// 23. **ADR violation: Clock skew not avoided**
+//     - While ADR says use store metadata, code uses local time
+// 
+// 24. **Failover timeout not configurable**
+//     - Defined in ADR but not in `AgentConfig`
+//     - Needed for replica staleness calculation
+// 
+// 25. **Safety margin concept exists in ADR but not in code**
+//     - Configuration should include this margin
+// 
+// 26. **No handling of Case 3 (Replica Network Lag)**
+//     - ADR describes NATS rejection prevention
+//     - But `set_strict` implementation accepts any write
+// 
+// ## Code Quality Issues
+// 
+// 27. **Inconsistent error handling**
+//     - Some paths return `Err`, others `todo!()`, others ignore
+// 
+// 28. **Unnecessary `Clone` bounds**
+//     - `DeploymentConfig.clone()` used frequently
+//     - Could be optimized with `Arc`
+// 
+// 29. **Missing lifetime annotations**
+//     - `KvStore::get` returns `String` key in error - inefficient
+// 
+// 30. **No integration points mentioned**
+//     - PostgreSQL lifecycle control implementation missing
+//     - Fencing via CNPG not connected
+// 
+// ## Production Readiness Checklist Summary
+// 
+// For battle testing preparation, you need:
+// 
+// **Immediate ( blockers):**
+// - Fix NATS store metadata usage (issues #1, #2)
+// - Implement strict set_strict with actual CAS (#1)
+// - Implement replica primary watching (#4, #5)
+// - Add failover_timeout config + staleness logic (#3, #24)
+// - Implement subscribe mechanism with callbacks (#8)
+// 
+// **High priority:**
+// - Complete all workflow transitions (#5, #7, #11-14)
+// - Add cluster state tracking (#6, #9)
+// - Add configuration validation (#10)
+// - Add Replica Failed state (#6)
+// 
+// **Before deployment:**
+// - Implement graceful shutdown (#15)
+// - Add error handling for spawned tasks (#16)
+// - Remove hardcoded main function (#18)
+// - Implement store factory (#19)
+// - Add Prometheus metrics (#17)
+// 
+// **Documentation:**
+// - Document all configuration parameters and their trade-offs
+// - Add runbooks for each failure mode
+// - Document battle test scenarios to cover
+// 
+// ### Addendum: Missing Critical Issues
+// 
+// #### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
+// *   **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
+// *   **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
+// *   **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
+// *   **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
+// 
+// #### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
+// *   **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
+// *   **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
+// *   **Scenario:**
+//     1.  Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
+//     2.  Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
+//     3.  `on_active` finishes *before* `on_failover`.
+//     4.  `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
+// *   **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
+// 
+// #### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
+// *   **Location:** `agent_loop.rs` loop logic.
+// *   **The Bug:** There is no "Stop the World" gate.
+// *   **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
+// *   **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
+// *   **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
+// 
+// #### 4. HIGH: NATS Bucket Name Collision
+// *   **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
+// *   **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
+// *   **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
+// *   **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
+// 
+// #### 5. HIGH: Startup State Reconciliation
+// *   **Location:** `HarmonyAgent::new`.
+// *   **The Bug:** Agents always start in `Initializing`.
+// *   **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
+// *   **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
+// *   **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
+// 
+// ### Summary of Tasks to Add
+// 
+// Please add these to your master list before starting implementation:
+// 
+// 28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
+// 29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
+// 30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
+// 31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
+// 32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
+// 
 
-impl DeploymentConfig {
-    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
-    pub async fn perform_health_check(&self) -> Result<(), HeartbeatFailure> {
-        match self {
-            DeploymentConfig::FailoverPostgreSQL(cfg) => {
-                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
-                // TODO: Implement actual PG check / NATS write here
-                Ok(())
-            }
-        }
-    }
-
-    /// Callback: Transitioned from Unhealthy -> Healthy
-    pub async fn on_active(&self) {
-        info!("Service is now ACTIVE (Healthy)");
-        // e.g., Remove fencing lock
-    }
-
-    /// Callback: Transitioned from Healthy -> Unhealthy
-    pub async fn on_failover(&self) {
-        info!("Service is now FAILED (Unhealthy)");
-        // e.g., Initiate self-fencing, stop accepting traffic
-    }
-}
-
-pub struct HarmonyAgent {
-    pub config: AgentConfig,
-}
-
-impl HarmonyAgent {
-    pub async fn run_heartbeat_loop(&self) {
-        let mut consecutive_successes = 0;
-        let mut consecutive_failures = 0;
-        let mut is_healthy = false;
-        let mut next_heartbeat_start;
-        loop {
-            let this_heartbeat_start = Instant::now();
-            next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
-
-            // Perform the check via the config/strategy with a timeout
-            let result = tokio::time::timeout(self.config.heartbeat_interval, async {
-                // simulate variable latency for the health check
-                tokio::time::sleep(Duration::from_millis(getrandom::u64().unwrap() % 2000)).await;
-                self.config
-                    .deployment_config_unstable
-                    .perform_health_check()
-                    .await
-            })
-            .await;
-
-            // Update Counters & Handle State Transitions
-            // Timeout is also treated as a failure
-            let heartbeat_result = match result {
-                Ok(inner_result) => inner_result,
-                Err(_) => Err(HeartbeatFailure {}),
-            };
-
-            match heartbeat_result {
-                Ok(_) => {
-                    consecutive_failures = 0;
-                    consecutive_successes += 1;
-
-                    if !is_healthy && consecutive_successes >= self.config.success_threshold {
-                        info!("Success threshold reached. Marking as Healthy.");
-                        is_healthy = true;
-                        self.config.deployment_config_unstable.on_active().await;
-                    }
-                }
-                Err(_) => {
-                    consecutive_successes = 0;
-                    consecutive_failures += 1;
-
-                    if is_healthy && consecutive_failures >= self.config.failure_threshold {
-                        log::warn!("Failure threshold reached. Marking as Unhealthy.");
-                        is_healthy = false;
-                        self.config.deployment_config_unstable.on_failover().await;
-                    }
-                }
-            }
-
-            info!(
-                "Heartbeat : success={} healthy={}, successes={}, fails={} took={}ms",
-                if heartbeat_result.is_ok() { "✅" } else { "❌" },
-                is_healthy,
-                consecutive_successes,
-                consecutive_failures,
-                (Instant::now() - this_heartbeat_start).as_millis()
-            );
-            debug!(
-                "Sleeping for {} ms before next heartbeat",
-                (next_heartbeat_start - Instant::now()).as_millis()
-            );
-            tokio::time::sleep_until(next_heartbeat_start).await;
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct HeartbeatFailure {}
diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs
new file mode 100644
index 00000000..1dce4ed8
--- /dev/null
+++ b/harmony_agent/src/store/chaos.rs
@@ -0,0 +1,123 @@
+use async_trait::async_trait;
+use serde_json::Value;
+use std::sync::Arc;
+use tokio::time::Duration;
+
+use crate::store::SubscriptionCallback;
+
+use super::{KvStore, KvStoreError};
+
+/// A chaos testing KV store that randomly times out or fails
+/// Wraps another KvStore implementation and adds random failures
+#[derive(Clone)]
+pub struct ChaosKvStore<T: KvStore> {
+    inner: Arc<T>,
+    timeout_probability: u32,
+    failure_probability_percentage: u32,
+    max_delay_ms: u64,
+}
+
+impl<T: KvStore> ChaosKvStore<T> {
+    pub fn new(
+        inner: T,
+        timeout_probability: u32,
+        failure_probability: u32,
+        max_delay_ms: u64,
+    ) -> Self {
+        Self {
+            inner: Arc::new(inner),
+            timeout_probability,
+            failure_probability_percentage: failure_probability,
+            max_delay_ms,
+        }
+    }
+
+    async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
+        // Random delay
+        if self.max_delay_ms > 0 {
+            let delay = getrandom::u64().unwrap() % self.max_delay_ms;
+            tokio::time::sleep(Duration::from_millis(delay)).await;
+        }
+
+        // Random failure
+        let failure_random = getrandom::u32().unwrap();
+        if (failure_random % 100) < self.failure_probability_percentage {
+            return Err(KvStoreError::Unknown);
+        }
+
+        // Random timeout (simulated as a very long delay)
+        let failure_random = getrandom::u32().unwrap();
+        if failure_random % 100 < self.timeout_probability {
+            tokio::time::sleep(Duration::from_secs(10)).await;
+        }
+
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
+    async fn get(&self, key: String) -> Result<super::KvResult, KvStoreError> {
+        self.maybe_chaos().await?;
+        self.inner.get(key).await
+    }
+
+    async fn set_strict(
+        &self,
+        key: String,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError> {
+        self.maybe_chaos().await?;
+        self.inner.set_strict(key, value, expected_sequence).await
+    }
+
+    async fn subscribe(
+        &self,
+        key: String,
+        callback: SubscriptionCallback,
+    ) -> Result<(), KvStoreError> {
+        self.maybe_chaos().await?;
+        self.inner.subscribe(key, callback).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::store::InMemoryKvStore;
+    use serde_json::json;
+
+    #[tokio::test]
+    async fn test_chaos_store_with_no_chaos() {
+        let inner = InMemoryKvStore::new();
+        let chaos = ChaosKvStore::new(inner, 0, 0, 0);
+
+        let value = json!({"test": "value"});
+        let result = chaos
+            .set_strict("key".to_string(), value.clone(), 0)
+            .await
+            .unwrap();
+        assert_eq!(result, 1);
+
+        let retrieved = chaos.get("key".to_string()).await.unwrap();
+        assert_eq!(retrieved.value, Some(value));
+    }
+
+    #[tokio::test]
+    async fn test_chaos_store_with_delay() {
+        let inner = InMemoryKvStore::new();
+        let chaos = ChaosKvStore::new(inner, 0, 0, 100);
+
+        let start = tokio::time::Instant::now();
+        let value = json!({"test": "value"});
+        chaos.set_strict("key".to_string(), value, 0).await.unwrap();
+        let elapsed = start.elapsed();
+
+        // Should have some delay
+        assert!(
+            elapsed.as_millis() < 150,
+            "Should complete within reasonable time"
+        );
+    }
+}
diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs
new file mode 100644
index 00000000..3549c563
--- /dev/null
+++ b/harmony_agent/src/store/memory.rs
@@ -0,0 +1,184 @@
+use async_trait::async_trait;
+use serde_json::Value;
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::{SystemTime, UNIX_EPOCH};
+use tokio::sync::RwLock;
+
+use crate::store::SubscriptionCallback;
+
+use super::{KvMetadata, KvResult, KvStore, KvStoreError};
+
+/// An in-memory KV store that guarantees ordering like NATS JetStream
+/// Each key has a sequence number that increments on each write
+#[derive(Clone)]
+pub struct InMemoryKvStore {
+    data: Arc<RwLock<HashMap<String, (Value, u64)>>>,
+    global_seq: Arc<RwLock<u64>>,
+}
+
+impl InMemoryKvStore {
+    pub fn new() -> Self {
+        Self {
+            data: Arc::new(RwLock::new(HashMap::new())),
+            global_seq: Arc::new(RwLock::new(0)),
+        }
+    }
+
+    /// Get the sequence number for a key
+    pub async fn get_seq(&self, key: &str) -> Option<u64> {
+        self.data.read().await.get(key).map(|(_, seq)| *seq)
+    }
+}
+
+impl Default for InMemoryKvStore {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl KvStore for InMemoryKvStore {
+    async fn get(&self, key: String) -> Result<KvResult, KvStoreError> {
+        let data = self.data.read().await;
+        let (value, sequence) = data
+            .get(&key)
+            .ok_or_else(|| KvStoreError::KeyNotAvailable(key.clone()))?;
+
+        let timestamp = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("Time went backwards")
+            .as_millis() as u64;
+
+        Ok(KvResult {
+            value: Some(value.clone()),
+            metadata: KvMetadata {
+                timestamp,
+                sequence: *sequence,
+            },
+        })
+    }
+
+    async fn set_strict(
+        &self,
+        key: String,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError> {
+        // Check current sequence
+        let data = self.data.read().await;
+        let current_sequence = data.get(&key).map(|(_, seq)| *seq).unwrap_or(0);
+        drop(data);
+
+        // Verify expected sequence matches
+        if current_sequence != expected_sequence {
+            return Err(KvStoreError::SequenceMismatch {
+                expected: expected_sequence,
+                current: current_sequence,
+            });
+        }
+
+        // Increment global sequence
+        let mut seq = self.global_seq.write().await;
+        *seq += 1;
+        let new_seq = *seq;
+        drop(seq);
+
+        // Write the new value
+        let mut data = self.data.write().await;
+        data.insert(key, (value.clone(), new_seq));
+        drop(data);
+
+        Ok(new_seq)
+    }
+
+    async fn subscribe(
+        &self,
+        key: String,
+        callback: SubscriptionCallback,
+    ) -> Result<(), KvStoreError> {
+        // For now, subscribe just returns the current value
+        // In a real implementation, this would return a stream of updates
+        self.get(key).await;
+        todo!() // register callback and call it when key is set ?
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[tokio::test]
+    async fn test_memory_store_basic() {
+        let store = InMemoryKvStore::new();
+
+        // Set a value
+        let value = json!({"status": "healthy"});
+        let result = store
+            .set_strict("test_key".to_string(), value.clone(), 0)
+            .await
+            .unwrap();
+        assert_eq!(result, 1);
+
+        // Get the value
+        let retrieved = store.get("test_key".to_string()).await.unwrap();
+        assert_eq!(retrieved.value, Some(value));
+        assert_eq!(retrieved.metadata.sequence, 1);
+    }
+
+    #[tokio::test]
+    async fn test_memory_store_sequence_numbers() {
+        let store = InMemoryKvStore::new();
+
+        let seq1 = store
+            .set_strict("key1".to_string(), json!("value1"), 0)
+            .await
+            .unwrap();
+
+        let seq2 = store
+            .set_strict("key2".to_string(), json!("value2"), 0)
+            .await
+            .unwrap();
+
+        assert!(seq2 > seq1, "Sequence numbers should increment");
+    }
+
+    #[tokio::test]
+    async fn test_memory_store_key_not_found() {
+        let store = InMemoryKvStore::new();
+        let result = store.get("nonexistent".to_string()).await;
+        assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_))));
+    }
+
+    #[tokio::test]
+    async fn test_memory_store_strict_ordering() {
+        let store = InMemoryKvStore::new();
+
+        // First write with sequence 0
+        let result1 = store
+            .set_strict("key".to_string(), json!("value1"), 0)
+            .await
+            .unwrap();
+        assert_eq!(result1, 1);
+
+        // Second write with correct sequence
+        let result2 = store
+            .set_strict("key".to_string(), json!("value2"), 1)
+            .await
+            .unwrap();
+        assert_eq!(result2, 2);
+
+        // Third write with wrong sequence should fail
+        let result3 = store
+            .set_strict("key".to_string(), json!("value3"), 1)
+            .await;
+        assert!(matches!(
+            result3,
+            Err(KvStoreError::SequenceMismatch {
+                expected: 1,
+                current: 2
+            })
+        ));
+    }
+}
diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs
new file mode 100644
index 00000000..26e630c5
--- /dev/null
+++ b/harmony_agent/src/store/mod.rs
@@ -0,0 +1,117 @@
+use async_trait::async_trait;
+use serde_json::Value;
+use thiserror::Error;
+
+/// Handle for managing active subscriptions
+#[derive(Debug, Clone)]
+pub struct SubscriptionHandle {
+    id: usize,
+    _phantom: std::marker::PhantomData<()>,
+}
+
+/// Metadata returned by the KV store for all operations
+/// Contains timing and ordering information set by the store
+#[derive(Debug, Clone)]
+pub struct KvMetadata {
+    /// Timestamp set by the store (milliseconds since UNIX epoch)
+    pub timestamp: u64,
+    /// Sequence number for strict ordering guarantees
+    pub sequence: u64,
+}
+
+/// Result returned by KV store operations
+/// Contains both the value (if any) and store metadata
+#[derive(Debug, Clone)]
+pub struct KvResult {
+    /// The value from the store (None if key doesn't exist)
+    pub value: Option<Value>,
+    /// Store-provided metadata (timestamp, sequence)
+    pub metadata: KvMetadata,
+}
+
+/// Callback type for subscription updates
+/// Callback receives: key, new value (None if deleted), and metadata
+pub type SubscriptionCallback = Box<dyn Fn(String, Option<Value>, KvMetadata) + Send + Sync>;
+
+#[derive(Error, Debug)]
+pub enum KvStoreError {
+    #[error("data store disconnected")]
+    Disconnect(#[from] std::io::Error),
+    #[error("invalid key")]
+    InvalidKey,
+    #[error("operation timed out")]
+    Timeout,
+    #[error("the data for key `{0}` is not available")]
+    KeyNotAvailable(String),
+    #[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)]
+    DeserializationFailed {
+        deserialization_error: String,
+        value: String,
+    },
+    #[error("Strict ordering violation: expected sequence {expected}, but current is {current}")]
+    SequenceMismatch { expected: u64, current: u64 },
+    #[error("unknown data store error")]
+    Unknown,
+}
+
+#[async_trait]
+pub trait KvStore {
+    /// Get a value from the store
+    /// 
+    /// # Returns
+    /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
+    /// - `Err(KeyNotAvailable)`: If the key doesn't exist
+    async fn get(&self, key: String) -> Result<KvResult, KvStoreError>;
+    
+    /// Strict set operation with compare-and-set semantics
+    /// 
+    /// Sets the value only if the current sequence number matches `expected_sequence`.
+    /// This provides strict ordering guarantees needed for the failover algorithm.
+    /// 
+    /// # Parameters
+    /// - `key`: The key to set
+    /// - `value`: The value to store
+    /// - `expected_sequence`: The sequence number we expect the key to currently have.
+    ///   Use 0 for the first write to a new key.
+    /// 
+    /// # Returns
+    /// - `Ok(u64)`: Returns the new sequence number
+    /// - `Err(KvStoreError)`: If another write happened (current != expected)
+    /// 
+    /// # Example Use Case
+    /// For NATS JetStream, this maps to the conditional update operation that ensures
+    /// only one agent can successfully promote to primary.
+    async fn set_strict(
+        &self,
+        key: String,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError>;
+    
+    /// Subscribe to updates for a key
+    /// 
+    /// # Parameters
+    /// - `key`: The key to subscribe to
+    /// - `callback`: Function to call on each update with key, value, and metadata
+    /// 
+    /// # Returns
+    /// - `Ok(())`: Subscription established successfully
+    /// - `Err(KvStoreError)`: Subscription failed
+    /// 
+    /// Note: For JetStream, this should use watch() API. Updates will invoke the callback
+    /// asynchronously in the background.
+    async fn subscribe(
+        &self,
+        key: String,
+        callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
+                                        // callback
+    ) -> Result<(), KvStoreError>;
+}
+
+mod memory;
+mod nats;
+mod chaos;
+
+pub use memory::InMemoryKvStore;
+pub use nats::NatsKvStore;
+pub use chaos::ChaosKvStore;
diff --git a/harmony_agent/src/store/nats.rs b/harmony_agent/src/store/nats.rs
new file mode 100644
index 00000000..1c82c1d8
--- /dev/null
+++ b/harmony_agent/src/store/nats.rs
@@ -0,0 +1,135 @@
+use async_nats::jetstream::kv::{Store, UpdateError};
+use async_trait::async_trait;
+use log::{debug, error};
+use serde_json::Value;
+
+use crate::store::SubscriptionCallback;
+
+use super::{KvMetadata, KvResult, KvStore, KvStoreError};
+
+/// NATS JetStream-backed KV store
+pub struct NatsKvStore {
+    store: Store,
+}
+
+impl NatsKvStore {
+    pub fn new(store: Store) -> Self {
+        Self { store }
+    }
+
+    pub async fn create(
+        client: async_nats::Client,
+        bucket_name: &str,
+        history_size: i64,
+    ) -> Result<Self, Box<dyn std::error::Error>> {
+        let jetstream = async_nats::jetstream::new(client);
+
+        debug!("Creating NATS KV bucket: {}", bucket_name);
+        let store = jetstream
+            .create_key_value(async_nats::jetstream::kv::Config {
+                bucket: bucket_name.to_string(),
+                history: history_size,
+                ..Default::default()
+            })
+            .await
+            .map_err(|e| {
+                error!(
+                    "Failed to initialize NATS KV bucket '{}': {}",
+                    bucket_name, e
+                );
+                e
+            })?;
+
+        Ok(Self::new(store))
+    }
+}
+
+#[async_trait]
+impl KvStore for NatsKvStore {
+    async fn get(&self, key: String) -> Result<KvResult, KvStoreError> {
+        let entry = self.store.entry(&key).await.map_err(|e| {
+            error!("NATS get failed for key '{}': {}", key, e);
+            KvStoreError::Disconnect(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                e.to_string(),
+            ))
+        })?;
+
+        if entry.is_none() {
+            return Err(KvStoreError::KeyNotAvailable(key));
+        }
+
+        let entry = entry.unwrap();
+        let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
+            KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: String::from_utf8_lossy(&entry.value).to_string(),
+            }
+        })?;
+
+        // Extract metadata from NATS entry
+        // Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
+        let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
+
+        let metadata = KvMetadata {
+            timestamp,
+            sequence: entry.revision,
+        };
+
+        Ok(KvResult {
+            value: Some(value),
+            metadata,
+        })
+    }
+
+    async fn set_strict(
+        &self,
+        key: String,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError> {
+        let bytes =
+            serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: value.to_string(),
+            })?;
+
+        // Use update() for CAS semantics (Compare-And-Set)
+        // This ensures we only write if the revision matches expected_sequence
+        let revision = self
+            .store
+            .update(&key, bytes.into(), expected_sequence)
+            .await
+            .map_err(|e| {
+                // FIXME this is ugly, we should have a clean KvStoreError containing
+                // proper information from nats instead
+                error!("NATS update failed for key '{}': {}", key, e);
+                e
+            })?;
+
+        Ok(revision)
+    }
+
+    async fn subscribe(
+        &self,
+        key: String,
+        callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
+    ) -> Result<(), KvStoreError> {
+        todo!()
+    }
+}
+
+impl From<UpdateError> for KvStoreError {
+    fn from(value: UpdateError) -> Self {
+        match value.kind() {
+            async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
+            async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
+            async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
+                KvStoreError::KeyNotAvailable("key".to_string())
+            }
+            async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
+                std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
+            ),
+        }
+    }
+}
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
new file mode 100644
index 00000000..074b29e2
--- /dev/null
+++ b/harmony_agent/src/workflow/mod.rs
@@ -0,0 +1,42 @@
+use async_trait::async_trait;
+use harmony_types::id::Id;
+
+pub mod primary;
+pub mod replica;
+
+/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
+#[async_trait]
+pub trait HeartbeatWorkflow: Send + Sync {
+    /// Handle a successful heartbeat
+    fn handle_heartbeat_success(&mut self);
+
+    /// Handle a failed heartbeat
+    fn handle_heartbeat_failure(&mut self);
+
+    /// Called after heartbeat is successfully stored with metadata
+    /// This provides workflows access to timestamp/sequence for staleness calculations
+    async fn on_heartbeat_stored(&mut self, _heartbeat: &crate::agent_loop::AgentHeartbeat) {
+        // Default implementation does nothing
+    }
+
+    /// Called during agent startup to reconcile state from cluster state
+    /// Receives the current cluster state if available
+    async fn on_startup(&mut self, _cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
+        // Default implementation does nothing
+    }
+
+    /// Called when a peer agent heartbeat is observed (via subscription)
+    /// This is primarily used by replicas to detect primary staleness
+    async fn on_peer_heartbeat(&mut self, _peer_id: &Id, _heartbeat: &crate::agent_loop::AgentHeartbeat) {
+        // Default implementation does nothing
+    }
+
+    /// Get the current state name for logging (also used for heartbeat status)
+    fn state_name(&self) -> &'static str;
+
+    /// Get current consecutive successes
+    fn consecutive_successes(&self) -> usize;
+
+    /// Get current consecutive failures
+    fn consecutive_failures(&self) -> usize;
+}
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
new file mode 100644
index 00000000..7eccc998
--- /dev/null
+++ b/harmony_agent/src/workflow/primary.rs
@@ -0,0 +1,165 @@
+use async_trait::async_trait;
+use log::{debug, info, trace, warn};
+
+use crate::{agent_loop::DeploymentConfig, workflow::HeartbeatWorkflow};
+
+#[derive(Debug, Clone, PartialEq)]
+pub enum PrimaryState {
+    Initializing,
+    Healthy,
+    Failed,
+    Fenced,
+    Yielding,
+}
+
+impl PrimaryState {
+    pub fn name(&self) -> &'static str {
+        match self {
+            PrimaryState::Initializing => "Primary:Initializing",
+            PrimaryState::Healthy => "Primary:Healthy",
+            PrimaryState::Failed => "Primary:Failed",
+            PrimaryState::Fenced => "Primary:Fenced",
+            PrimaryState::Yielding => "Primary:Yielding",
+        }
+    }
+}
+
+pub struct PrimaryWorkflow {
+    state: PrimaryState,
+    consecutive_successes: usize,
+    consecutive_failures: usize,
+    success_threshold: usize,
+    failure_threshold: usize,
+    deployment_config: DeploymentConfig,
+}
+
+impl PrimaryWorkflow {
+    pub fn new(
+        success_threshold: usize,
+        failure_threshold: usize,
+        deployment_config: DeploymentConfig,
+    ) -> Self {
+        Self {
+            state: PrimaryState::Initializing,
+            consecutive_successes: 0,
+            consecutive_failures: 0,
+            success_threshold,
+            failure_threshold,
+            deployment_config,
+        }
+    }
+
+    fn transition_to(&mut self, new_state: PrimaryState) {
+        if self.state != new_state {
+            info!(
+                "State transition: {} -> {}",
+                self.state.name(),
+                new_state.name()
+            );
+            self.state = new_state;
+        }
+    }
+}
+
+#[async_trait]
+impl HeartbeatWorkflow for PrimaryWorkflow {
+    async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
+        if let Some(state) = cluster_state {
+            info!(
+                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
+                state.current_primary, state.desired_primary
+            );
+            // No automatic fast-tracking - agent must earn healthy status
+            // through successful heartbeats. This prevents duplicate agents
+            // or crashloop agents from incorrectly claiming primary.
+        } else {
+            debug!("No cluster state on startup, starting from Initializing");
+        }
+    }
+    fn handle_heartbeat_success(&mut self) {
+        self.consecutive_successes += 1;
+        self.consecutive_failures = 0;
+
+        match self.state {
+            PrimaryState::Initializing => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(PrimaryState::Healthy);
+                    // Trigger on_active callback
+                    let config = self.deployment_config.clone();
+                    tokio::spawn(async move {
+                        config.on_active().await;
+                    });
+                }
+            }
+            PrimaryState::Failed => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(PrimaryState::Healthy);
+                    let config = self.deployment_config.clone();
+                    tokio::spawn(async move {
+                        config.on_active().await;
+                    });
+                }
+            }
+            PrimaryState::Healthy => {
+                // Stay healthy
+                debug!("Primary staying healthy");
+            }
+            PrimaryState::Fenced => {
+                // Recovery from fenced state
+                if self.consecutive_successes >= self.success_threshold {
+                    // TODO: Check NATS for current_primary status before recovering
+                    info!("Recovered from fenced state, transitioning to yielding");
+                    self.transition_to(PrimaryState::Yielding);
+                }
+            }
+            PrimaryState::Yielding => {
+                // TODO: Check NATS to see if we can resume as primary
+                trace!("Yielding, waiting for demotion handshake");
+            }
+        }
+    }
+
+    fn handle_heartbeat_failure(&mut self) {
+        self.consecutive_failures += 1;
+        self.consecutive_successes = 0;
+
+        match self.state {
+            PrimaryState::Healthy => {
+                if self.consecutive_failures >= self.failure_threshold {
+                    warn!(
+                        "Failure threshold reached ({}/{}), transitioning to Failed",
+                        self.consecutive_failures, self.failure_threshold
+                    );
+                    self.transition_to(PrimaryState::Failed);
+
+                    // Immediately fence
+                    self.transition_to(PrimaryState::Fenced);
+                    let config = self.deployment_config.clone();
+                    tokio::spawn(async move {
+                        config.on_failover().await;
+                    });
+                }
+            }
+            PrimaryState::Initializing => {
+                // Stay in initializing, just accumulate failures
+                trace!("Heartbeat failed during initialization");
+            }
+            PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
+                // Already in a degraded state
+                trace!("Heartbeat failed in degraded state: {}", self.state.name());
+            }
+        }
+    }
+
+    fn state_name(&self) -> &'static str {
+        self.state.name()
+    }
+
+    fn consecutive_successes(&self) -> usize {
+        self.consecutive_successes
+    }
+
+    fn consecutive_failures(&self) -> usize {
+        self.consecutive_failures
+    }
+}
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
new file mode 100644
index 00000000..9800e3c7
--- /dev/null
+++ b/harmony_agent/src/workflow/replica.rs
@@ -0,0 +1,279 @@
+use async_trait::async_trait;
+use harmony_types::id::Id;
+use log::{debug, info, trace};
+use std::time::Duration;
+use tokio::sync::RwLock;
+
+use crate::agent_loop::AgentHeartbeat;
+use crate::workflow::HeartbeatWorkflow;
+
+#[derive(Debug, Clone)]
+pub struct HeartbeatState {
+    pub agent_id: Id,
+    pub last_seq: Option<u64>,
+}
+
+impl HeartbeatState {
+    pub fn watch(agent_id: Id) -> Self {
+        Self {
+            agent_id,
+            last_seq: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct ClusterState {
+    pub cluster_id: Id,
+    pub current_primary: Option<Id>,
+}
+
+impl ClusterState {
+    pub fn watch(cluster_id: Id) -> Self {
+        Self {
+            cluster_id,
+            current_primary: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub enum ReplicaState {
+    Initializing,
+    Watching,
+    Promoting,
+    PromotionFailed,
+    Leader,
+    Demoting,
+    Failed,
+}
+
+impl ReplicaState {
+    pub fn name(&self) -> &'static str {
+        match self {
+            ReplicaState::Initializing => "Replica:Initializing",
+            ReplicaState::Watching => "Replica:Watching",
+            ReplicaState::Promoting => "Replica:Promoting",
+            ReplicaState::PromotionFailed => "Replica:PromotionFailed",
+            ReplicaState::Leader => "Replica:Leader",
+            ReplicaState::Demoting => "Replica:Demoting",
+            ReplicaState::Failed => "Replica:Failed",
+        }
+    }
+}
+
+pub struct ReplicaWorkflow {
+    state: ReplicaState,
+    heartbeat_state: HeartbeatState,
+    primary_state: HeartbeatState,
+    cluster_state: ClusterState,
+    consecutive_successes: usize,
+    consecutive_failures: usize,
+    success_threshold: usize,
+    failure_threshold: usize,
+    failover_timeout: Duration,
+    /// Our own last heartbeat (for timestamp comparison against primary)
+    last_my_heartbeat: Option<AgentHeartbeat>,
+    /// Last observed primary heartbeat (metadata only, for staleness detection)
+    last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
+}
+
+impl ReplicaWorkflow {
+    pub fn new(
+        success_threshold: usize,
+        failure_threshold: usize,
+        cluster_id: Id,
+        primary_id: Id,
+        my_id: Id,
+        failover_timeout: Duration,
+    ) -> Self {
+        Self {
+            state: ReplicaState::Initializing,
+            consecutive_successes: 0,
+            consecutive_failures: 0,
+            success_threshold,
+            failure_threshold,
+            failover_timeout,
+            cluster_state: ClusterState::watch(cluster_id),
+            primary_state: HeartbeatState::watch(primary_id),
+            heartbeat_state: HeartbeatState::watch(my_id),
+            last_my_heartbeat: None,
+            last_primary_heartbeat: None,
+        }
+    }
+
+    fn transition_to(&mut self, new_state: ReplicaState) {
+        if self.state != new_state {
+            info!(
+                "State transition: {} -> {}",
+                self.state.name(),
+                new_state.name()
+            );
+            self.state = new_state;
+        }
+    }
+
+    /// Check if the primary heartbeat is stale compared to our own
+    /// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
+    async fn check_primary_staleness(&mut self) {
+        let mut new_state = self.state.clone();
+        if let Some(my_hb) = &self.last_my_heartbeat {
+            if let Some(my_metadata) = &my_hb.metadata {
+                if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
+                    let primary_hb = primary_hb_ref.read().await;
+                    if let Some(primary_metadata) = &primary_hb.metadata {
+                        // Calculate time difference: replica_timestamp - primary_timestamp
+                        let time_diff_ms = my_metadata
+                            .timestamp
+                            .saturating_sub(primary_metadata.timestamp);
+                        let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
+
+                        trace!(
+                            "Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
+                            my_metadata.timestamp,
+                            primary_metadata.timestamp,
+                            time_diff_ms,
+                            failover_timeout_ms
+                        );
+
+                        if time_diff_ms > failover_timeout_ms {
+                            info!(
+                                "Primary heartbeat stale ({}ms > {}ms), attempting promotion",
+                                time_diff_ms, failover_timeout_ms
+                            );
+                            new_state = ReplicaState::Promoting;
+                        }
+                    }
+                }
+            }
+
+            if self.state != new_state {
+                self.transition_to(new_state)
+            }
+        }
+    }
+}
+
+#[async_trait]
+impl HeartbeatWorkflow for ReplicaWorkflow {
+    async fn on_peer_heartbeat(&mut self, peer_id: &Id, heartbeat: &AgentHeartbeat) {
+        // Only track the primary's heartbeat
+        if *peer_id == self.primary_state.agent_id {
+            match &self.last_primary_heartbeat {
+                Some(existing) => {
+                    // Update the existing heartbeat data
+                    *existing.write().await = heartbeat.clone();
+                }
+                None => {
+                    // First time seeing primary heartbeat
+                    self.last_primary_heartbeat = Some(RwLock::new(heartbeat.clone()));
+                }
+            }
+            trace!(
+                "Updated primary heartbeat: seq={}, timestamp={}",
+                heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0),
+                heartbeat
+                    .metadata
+                    .as_ref()
+                    .map(|m| m.timestamp)
+                    .unwrap_or(0),
+            );
+        }
+    }
+    async fn on_heartbeat_stored(&mut self, heartbeat: &AgentHeartbeat) {
+        // Track our own heartbeat for staleness comparison
+        self.last_my_heartbeat = Some(heartbeat.clone());
+
+        // Perform staleness detection if we have both heartbeats
+        self.check_primary_staleness().await;
+    }
+    async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
+        if let Some(state) = cluster_state {
+            info!(
+                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
+                state.current_primary, state.desired_primary
+            );
+            // Update cluster_state with the observed values
+            self.cluster_state.current_primary = state.current_primary.clone();
+        } else {
+            debug!("No cluster state on startup, starting from Initializing");
+        }
+    }
+    fn handle_heartbeat_success(&mut self) {
+        self.consecutive_successes += 1;
+        self.consecutive_failures = 0;
+
+        match self.state {
+            ReplicaState::Initializing => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(ReplicaState::Watching);
+                }
+            }
+            ReplicaState::Watching => {
+                // TODO: Check primary staleness from NATS
+                trace!("Replica watching primary");
+            }
+            ReplicaState::Promoting => {
+                // TODO: Complete promotion attempt
+                trace!("Replica promotion in progress");
+            }
+            ReplicaState::PromotionFailed => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(ReplicaState::Watching);
+                }
+            }
+            ReplicaState::Leader => {
+                // TODO: Check for original primary recovery
+                trace!("Replica acting as leader");
+            }
+            ReplicaState::Failed => {
+                if self.consecutive_successes >= self.success_threshold {
+                    info!("Replica recovered from Failed state, transitioning to Watching");
+                    self.transition_to(ReplicaState::Watching);
+                }
+            }
+            ReplicaState::Demoting => {
+                // TODO: Complete demotion back to watching
+                trace!("Replica demotion in progress");
+            }
+        }
+    }
+
+    fn handle_heartbeat_failure(&mut self) {
+        self.consecutive_failures += 1;
+        self.consecutive_successes = 0;
+
+        match self.state {
+            ReplicaState::Watching | ReplicaState::Initializing => {
+                if self.consecutive_failures >= self.failure_threshold {
+                    info!(
+                        "Replica exceeded failure threshold ({}/{}), transitioning to Failed",
+                        self.consecutive_failures, self.failure_threshold
+                    );
+                    self.transition_to(ReplicaState::Failed);
+                } else {
+                    trace!("Replica heartbeat failed, but below threshold");
+                }
+            }
+            ReplicaState::Promoting
+            | ReplicaState::PromotionFailed
+            | ReplicaState::Leader
+            | ReplicaState::Demoting
+            | ReplicaState::Failed => {
+                trace!("Replica heartbeat failed in state: {}", self.state.name());
+            }
+        }
+    }
+
+    fn state_name(&self) -> &'static str {
+        self.state.name()
+    }
+
+    fn consecutive_successes(&self) -> usize {
+        self.consecutive_successes
+    }
+
+    fn consecutive_failures(&self) -> usize {
+        self.consecutive_failures
+    }
+}
-- 
2.39.5


From 948334b89e6cae104d133259eac0f443d89df35a Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Tue, 3 Feb 2026 06:39:56 -0500
Subject: [PATCH 09/19] wip: cleaning up llm code, pretty close to something
 comprehensible and robust

---
 harmony_agent/README.md                   | 246 ++++++++++
 harmony_agent/src/agent.rs                | 214 ---------
 harmony_agent/src/agent_loop.rs           | 150 ++++---
 harmony_agent/src/main.rs                 | 322 +++----------
 harmony_agent/src/old/typestate.rs        | 230 ++++++++++
 harmony_agent/src/old/typestate_gemini.rs | 523 ++++++++++++++++++++++
 harmony_agent/src/store/chaos.rs          |  69 +--
 harmony_agent/src/store/memory.rs         | 134 +++---
 harmony_agent/src/store/mod.rs            |  43 +-
 harmony_agent/src/store/nats.rs           |  58 ++-
 harmony_agent/src/workflow/mod.rs         |  22 +-
 harmony_agent/src/workflow/primary.rs     |   5 +-
 harmony_agent/src/workflow/replica.rs     |  81 ++--
 13 files changed, 1385 insertions(+), 712 deletions(-)
 create mode 100644 harmony_agent/README.md
 delete mode 100644 harmony_agent/src/agent.rs
 create mode 100644 harmony_agent/src/old/typestate.rs
 create mode 100644 harmony_agent/src/old/typestate_gemini.rs

diff --git a/harmony_agent/README.md b/harmony_agent/README.md
new file mode 100644
index 00000000..c22d1b51
--- /dev/null
+++ b/harmony_agent/README.md
@@ -0,0 +1,246 @@
+TODO
+
+DONE:
+1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
+2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
+3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
+4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
+5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
+6. ✅ failover_timeout added to AgentConfig
+7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
+8. ✅ startup reconciliation implemented via on_startup() method
+
+REMAINING:
+- review all code and list implementation issues
+- review both workflow for each state transition
+- Complete replica workflow staleness detection (needs implementation in Watching state)
+- Implement state recovery from Failed state for both workflows
+- Implement subscribe in NATS store with watch() API
+- Implement config validation for failover_timeout constraints
+
+TODO
+
+1. store trait subscribe definition missing callback
+2. BUG, data integrity issue : nats store not actually using jetstream metadata
+3. review all code and list implementation issues
+4. review both workflow for each state transition
+5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded
+6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself
+   should hold it probably, every agent should be subscribed to the cluster_state object and
+   keep it in memory to allow workflows to process against it efficiently)
+
+## CRITICAL - Data Integrity Issues
+
+1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
+   - Currently uses `put()` which overwrites unconditionally
+   - Must use `update()` with revision parameter for proper compare-and-set
+   - Without this, concurrent promotion attempts can cause split brain
+
+2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
+   - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
+   - NATS Entry has `.revision` and `.created` fields that must be used
+   - This defeats the entire purpose of store-provided timestamps
+
+3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
+   - Line ~156: TODO comment confirms missing metadata passing
+   - Replica cannot calculate staleness without metadata.timestamp
+   - Failover logic is broken
+
+4. **No actual cluster state watching exists**
+   - Replica workflow declares `ClusterState` but never updates it
+   - No subscription to primary heartbeat or cluster_state key
+   - Replica cannot detect primary liveness
+
+## HIGH - Missing Core Functionality
+
+5. **Replica Workflow incomplete** - All key logic is TODO:
+   - Watching primary staleness (line 114)
+   - Promotion attempt (line 118)
+   - Original primary recovery detection (line 127)
+   - Demotion/handshake (line 131)
+
+6. **Missing replica "Failed" state**
+   - `ReplicaState` enum has no `Failed` variant
+   - User's TODO #5 correctly identifies this gap
+   - What happens if replica's own heartbeats fail repeatedly?
+
+7. **Primary Workflow incomplete** - Key logic missing:
+   - No NATS check before recovering from `Fenced` state (line 95)
+   - No NATS check in `Yielding` state for demotion handshake (line 101)
+   - No actual fencing failure handling
+
+8. **Store `subscribe` not implemented** (`store/mod.rs`)
+   - Returns `todo!()` in NATS implementation
+   - No callback mechanism defined in trait
+   - Without this, agents cannot react to state changes
+
+9. **Cluster state not tracked centrally**
+   - User's TODO #6 correctly identifies this
+   - Each agent should maintain a local copy of cluster_state
+   - No subscription mechanism to update this local copy
+
+10. **No validation of configuration constraints**
+    - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
+    - Invalid config could cause split brain
+
+## MEDIUM - Incorrect State Transitions
+
+11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
+    - Two state transitions happen in one heartbeat cycle
+    - Should stay in `Failed` until fencing actually completes
+    - What if fencing fails? State machine won't reflect it
+
+12. **No fencing failure handling**
+    - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
+    - ADR mentions escalating to radical measures, but no callback for failure
+
+13. **Replica `Watching` state does nothing**
+    - Line 115: Just logs, checks nothing
+    - Should be checking staleness of primary heartbeat
+
+14. **Demotion handshake not implemented**
+    - ADR section 4 details this but code doesn't implement it
+    - How does original primary know it should yield?
+
+## LOW - Observability & Reliability
+
+15. **No graceful shutdown mechanism**
+    - `run_heartbeat_loop` runs forever
+    - No signal handling (SIGTERM, SIGINT)
+
+16. **Async task errors silently ignored**
+    - `tokio::spawn` at lines 74, 83, 123
+    - No `JoinHandle` retention or error handling
+
+17. **No metrics/observability**
+    - Only log output
+    - No Prometheus metrics for state transitions, failure counts, etc.
+
+18. **Hardcoded main() function** (`agent_loop.rs::main`)
+    - Not production-ready entry point
+    - Should load config from environment or file
+
+19. **Store factory pattern missing**
+    - TODO comment at line 54 confirms this
+    - Can't switch between stores via config
+
+20. **No backoff/retry logic for NATS operations**
+    - Transient failures could trigger unnecessary fencing
+
+21. **`AgentInfo` status is hardcoded to "HEALTHY"**
+    - Line 137 in `store_heartbeat`
+    - Should反映 actual workflow state
+
+22. **Unused fields in structs**
+    - `HeartbeatState.last_seq` set but never read
+    - `ClusterState.current_primary` set but never read
+
+## ADR-017-3 Compliance Issues
+
+23. **ADR violation: Clock skew not avoided**
+    - While ADR says use store metadata, code uses local time
+
+24. **Failover timeout not configurable**
+    - Defined in ADR but not in `AgentConfig`
+    - Needed for replica staleness calculation
+
+25. **Safety margin concept exists in ADR but not in code**
+    - Configuration should include this margin
+
+26. **No handling of Case 3 (Replica Network Lag)**
+    - ADR describes NATS rejection prevention
+    - But `set_strict` implementation accepts any write
+
+## Code Quality Issues
+
+27. **Inconsistent error handling**
+    - Some paths return `Err`, others `todo!()`, others ignore
+
+28. **Unnecessary `Clone` bounds**
+    - `DeploymentConfig.clone()` used frequently
+    - Could be optimized with `Arc`
+
+29. **Missing lifetime annotations**
+    - `KvStore::get` returns `String` key in error - inefficient
+
+30. **No integration points mentioned**
+    - PostgreSQL lifecycle control implementation missing
+    - Fencing via CNPG not connected
+
+## Production Readiness Checklist Summary
+
+For battle testing preparation, you need:
+
+**Immediate ( blockers):**
+- Fix NATS store metadata usage (issues #1, #2)
+- Implement strict set_strict with actual CAS (#1)
+- Implement replica primary watching (#4, #5)
+- Add failover_timeout config + staleness logic (#3, #24)
+- Implement subscribe mechanism with callbacks (#8)
+
+**High priority:**
+- Complete all workflow transitions (#5, #7, #11-14)
+- Add cluster state tracking (#6, #9)
+- Add configuration validation (#10)
+- Add Replica Failed state (#6)
+
+**Before deployment:**
+- Implement graceful shutdown (#15)
+- Add error handling for spawned tasks (#16)
+- Remove hardcoded main function (#18)
+- Implement store factory (#19)
+- Add Prometheus metrics (#17)
+
+**Documentation:**
+- Document all configuration parameters and their trade-offs
+- Add runbooks for each failure mode
+- Document battle test scenarios to cover
+
+### Addendum: Missing Critical Issues
+
+#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
+*   **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
+*   **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
+*   **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
+*   **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
+
+#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
+*   **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
+*   **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
+*   **Scenario:**
+    1.  Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
+    2.  Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
+    3.  `on_active` finishes *before* `on_failover`.
+    4.  `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
+*   **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
+
+#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
+*   **Location:** `agent_loop.rs` loop logic.
+*   **The Bug:** There is no "Stop the World" gate.
+*   **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
+*   **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
+*   **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
+
+#### 4. HIGH: NATS Bucket Name Collision
+*   **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
+*   **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
+*   **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
+*   **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
+
+#### 5. HIGH: Startup State Reconciliation
+*   **Location:** `HarmonyAgent::new`.
+*   **The Bug:** Agents always start in `Initializing`.
+*   **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
+*   **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
+*   **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
+
+### Summary of Tasks to Add
+
+Please add these to your master list before starting implementation:
+
+28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
+29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
+30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
+31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
+32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
+
diff --git a/harmony_agent/src/agent.rs b/harmony_agent/src/agent.rs
deleted file mode 100644
index 14384107..00000000
--- a/harmony_agent/src/agent.rs
+++ /dev/null
@@ -1,214 +0,0 @@
-use async_nats::jetstream::kv::Store;
-use async_trait::async_trait;
-use harmony_types::id::Id;
-use log::{debug, error, info, trace};
-use serde::{Deserialize, Serialize};
-use std::time::{SystemTime, UNIX_EPOCH};
-
-use crate::config::AgentConfig;
-
-#[async_trait]
-pub trait HealthStore: Send + Sync {
-    async fn put(
-        &self,
-        key: String,
-        value: Vec<u8>,
-    ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>>;
-}
-
-#[async_trait]
-impl HealthStore for Store {
-    async fn put(
-        &self,
-        key: String,
-        value: Vec<u8>,
-    ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
-        trace!("HealthStore::put key={} value_len={}", key, value.len());
-        self.put(key, value.into())
-            .await
-            .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
-    }
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct AgentHeartbeat {
-    pub cluster_id: Id,
-    pub status: String,
-    pub timestamp: u64,
-}
-
-pub struct HarmonyAgent {
-    config: AgentConfig,
-    nats_client: Option<async_nats::Client>,
-    health_kv: Box<dyn HealthStore>,
-}
-
-impl HarmonyAgent {
-    pub async fn new(config: AgentConfig) -> Result<Self, Box<dyn std::error::Error>> {
-        info!("Initializing HarmonyAgent");
-        info!("  nats_url: {}", config.nats_url);
-        info!("  my_cluster_id: {}", config.my_cluster_id);
-        info!("  desired_primary: {}", config.desired_primary);
-        info!("  heartbeat_interval: {:?}", config.heartbeat_interval);
-        info!("  nats_creds_path: {:?}", config.nats_creds_path);
-        debug!("Full Bootstrap configuration:\n{config:#?}");
-
-        let mut options = async_nats::ConnectOptions::new();
-        if let Some(creds) = &config.nats_creds_path {
-            debug!("Loading NATS credentials from file: {}", creds);
-            options = options.credentials_file(creds).await?;
-        }
-
-        debug!("Connecting to nats");
-        let client = async_nats::connect_with_options(&config.nats_url, options).await?;
-        info!("Successfully connected to NATS at {}", config.nats_url);
-        let jetstream = async_nats::jetstream::new(client.clone());
-
-        // Initialize KV Buckets as per ADR-017
-        const HEARTBEAT_KV_HISTORY_SIZE: i64 = 64;
-        debug!("Creating health KV bucket: harmony_agent_health");
-        let health_kv = jetstream
-            .create_key_value(async_nats::jetstream::kv::Config {
-                bucket: "harmony_agent_health".to_string(),
-                history: HEARTBEAT_KV_HISTORY_SIZE,
-                ..Default::default()
-            })
-            .await
-            .map_err(|e| {
-                error!(
-                    "Failed to initialize health KV bucket 'harmony_agent_health': {}",
-                    e
-                );
-                e
-            })?;
-        info!("Successfully initialized health KV bucket: harmony_agent_health");
-
-        Ok(Self {
-            config,
-            nats_client: Some(client),
-            health_kv: Box::new(health_kv),
-        })
-    }
-
-    pub async fn run_heartbeat_loop(&self) -> Result<(), Box<dyn std::error::Error>> {
-        let mut interval = tokio::time::interval(self.config.heartbeat_interval);
-        let key = format!("heartbeat.{}", self.config.my_cluster_id);
-
-        info!(
-            "Starting heartbeat loop for cluster: {}",
-            self.config.my_cluster_id
-        );
-
-        loop {
-            interval.tick().await;
-            trace!("Heartbeat loop tick");
-
-            let now = SystemTime::now()
-                .duration_since(UNIX_EPOCH)
-                .map_err(|e| {
-                    error!("Failed to get system time for heartbeat: {}", e);
-                    e
-                })?
-                .as_millis() as u64;
-
-            let heartbeat = AgentHeartbeat {
-                cluster_id: self.config.my_cluster_id.clone(),
-                status: "HEALTHY".to_string(),
-                timestamp: now,
-            };
-
-            debug!(
-                "Sending heartbeat for cluster: {}",
-                self.config.my_cluster_id
-            );
-            let payload = serde_json::to_vec(&heartbeat)?;
-
-            // Write heartbeat to KV. ADR-017: Write failure triggers self-demotion logic
-            match self.health_kv.put(key.clone(), payload).await {
-                Ok(_) => {
-                    debug!(
-                        "Heartbeat successful for cluster: {}",
-                        self.config.my_cluster_id
-                    );
-                }
-                Err(e) => {
-                    error!(
-                        "Failed to write heartbeat: {}. Fencing logic would trigger here.",
-                        e
-                    );
-                    // In a real implementation, we would trigger self-demotion/fencing here
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::sync::{Arc, Mutex};
-    use tokio::time::{Duration, advance};
-
-    struct MockHealthStore {
-        puts: Arc<Mutex<Vec<(String, Vec<u8>)>>>,
-    }
-
-    #[async_trait]
-    impl HealthStore for MockHealthStore {
-        async fn put(
-            &self,
-            key: String,
-            value: Vec<u8>,
-        ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
-            self.puts.lock().unwrap().push((key, value));
-            Ok(0)
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_heartbeat_loop() {
-        let config = AgentConfig {
-            nats_url: "nats://localhost:4222".to_string(),
-            nats_creds_path: None,
-            my_cluster_id: "test-cluster".into(),
-            desired_primary: "test-cluster".into(),
-            heartbeat_interval: Duration::from_millis(100),
-        };
-
-        let puts = Arc::new(Mutex::new(Vec::new()));
-        let mock_store = MockHealthStore { puts: puts.clone() };
-
-        let agent = HarmonyAgent {
-            config,
-            nats_client: None,
-            health_kv: Box::new(mock_store),
-        };
-
-        // Run the loop in a separate task
-        let handle = tokio::spawn(async move {
-            let _ = agent.run_heartbeat_loop().await;
-        });
-
-        // Advance time in increments to trigger multiple heartbeats
-        for _ in 0..3 {
-            advance(Duration::from_millis(100)).await;
-            tokio::time::sleep(Duration::from_millis(1)).await;
-        }
-
-        let recorded_puts = puts.lock().unwrap();
-        assert!(
-            recorded_puts.len() >= 2,
-            "Should have recorded at least 2 heartbeats, got {}",
-            recorded_puts.len()
-        );
-
-        let (key, payload) = &recorded_puts[0];
-        assert_eq!(key, "heartbeat.test-cluster");
-
-        let heartbeat: AgentHeartbeat = serde_json::from_slice(payload).unwrap();
-        assert_eq!(heartbeat.cluster_id.to_string(), "test-cluster");
-        assert_eq!(heartbeat.status, "HEALTHY");
-
-        handle.abort();
-    }
-}
diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs
index 2b92b851..089b013d 100644
--- a/harmony_agent/src/agent_loop.rs
+++ b/harmony_agent/src/agent_loop.rs
@@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
 use tokio::sync::RwLock;
 use tokio::time::Instant;
 
-use crate::store::{KvStore, KvStoreError};
+use crate::store::{KvMetadata, KvStore, KvStoreError};
 use crate::workflow::HeartbeatWorkflow;
 use crate::workflow::primary::PrimaryWorkflow;
 use crate::workflow::replica::ReplicaWorkflow;
@@ -18,16 +18,24 @@ pub enum AgentRole {
     Replica,
 }
 
-pub async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    env_logger::init();
-
+pub async fn launch_agent<S>(
+    role: AgentRole,
+    health_kv: Arc<S>,
+    cluster_kv: Arc<S>,
+    heartbeat_interval: Duration,
+    failover_timeout: Duration,
+) -> Result<(), Box<dyn std::error::Error>>
+where
+    S: KvStore + Send + Sync + 'static,
+{
     let my_agent_id = Id::from_str("agent_1").unwrap();
 
     let config = AgentConfig {
+        role,
         success_threshold: 2,
         failure_threshold: 2,
-        heartbeat_interval: Duration::from_secs(1),
-        failover_timeout: Duration::from_secs(5),
+        heartbeat_interval,
+        failover_timeout,
         deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
             desired_primary_agent: my_agent_id.clone(),
             cnpg_cluster_name: String::from("cnpg_cluster_name"),
@@ -35,7 +43,6 @@ pub async fn main() -> Result<(), Box<dyn std::error::Error>> {
         nats_url: String::new(),
         nats_creds_path: None,
         agent_id: my_agent_id,
-        role: AgentRole::Replica,
         cluster_id: "cluster_test_id".into(),
         desired_primary_id: "primary_id".into(),
     };
@@ -46,13 +53,11 @@ pub async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // TODO load store based on config, default to nats
     // probably a good use case for a factory pattern
-    use crate::store::ChaosKvStore;
-    use crate::store::InMemoryKvStore;
-    let health_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 1000);
-    let cluster_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 2000);
 
     let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);
 
+    agent.reconcile_startup().await?;
+
     // Run the heartbeat loop
     agent.run_heartbeat_loop().await;
 
@@ -140,22 +145,11 @@ pub struct AgentInfo {
     pub status: String,
 }
 
-/// Store-provided metadata for a heartbeat
-/// This is returned by the KV store and includes timing/ordering guarantees
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct HeartbeatMetadata {
-    /// Timestamp set by the store (e.g., NATS JetStream)
-    /// This avoids clock skew between agents
-    pub timestamp: u64,
-    /// Sequence number for strict ordering (e.g., JetStream sequence)
-    pub sequence: u64,
-}
-
 /// Complete heartbeat with both agent data and store metadata
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct AgentHeartbeat {
     pub agent_info: AgentInfo,
-    pub metadata: Option<HeartbeatMetadata>,
+    pub metadata: Option<KvMetadata>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -169,8 +163,8 @@ pub struct ClusterStateData {
 pub struct HarmonyAgent<S: KvStore> {
     pub config: AgentConfig,
     workflow: Box<dyn HeartbeatWorkflow>,
-    health_kv: S,
-    cluster_kv: S,
+    health_kv: Arc<S>,
+    cluster_kv: Arc<S>,
     /// Last successful heartbeat, used to track sequence number for next write
     /// This avoids doing a GET before every SET, reducing network round-trips
     last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
@@ -179,8 +173,8 @@ pub struct HarmonyAgent<S: KvStore> {
     cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
 }
 
-impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
-    pub fn new(config: AgentConfig, health_kv: S, cluster_kv: S) -> Self {
+impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
+    pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
         let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
             AgentRole::Primary => {
                 info!("Initializing agent as PRIMARY");
@@ -192,7 +186,7 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
             }
             AgentRole::Replica => {
                 info!("Initializing agent as REPLICA");
-// pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self
+                // pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self
                 Box::new(ReplicaWorkflow::new(
                     config.success_threshold,
                     config.failure_threshold,
@@ -219,10 +213,13 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
     /// based on the persisted cluster state
     pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
         let cluster_key = format!("cluster.{}", self.config.cluster_id);
-        
-        debug!("Fetching cluster state for startup reconciliation from key: {}", cluster_key);
-        
-        let cluster_state_option = match self.cluster_kv.get(cluster_key.clone()).await {
+
+        debug!(
+            "Fetching cluster state for startup reconciliation from key: {}",
+            cluster_key
+        );
+
+        let cluster_state_option = match self.cluster_kv.get(&cluster_key).await {
             Ok(result) => {
                 if let Some(value) = result.value {
                     match serde_json::from_value::<ClusterStateData>(value) {
@@ -252,7 +249,7 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
 
         // Cache the cluster state locally
         *self.cluster_state.write().await = cluster_state_option;
-        
+
         Ok(())
     }
 
@@ -261,7 +258,7 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
     /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
     /// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
     /// comparisons use the store's clock, not agent clocks.
-    /// 
+    ///
     /// This method uses the last successful heartbeat's sequence number to avoid an extra
     /// GET call before each SET, reducing network round-trips and latency exposure.
     async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
@@ -276,13 +273,12 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
         };
 
         debug!("Storing heartbeat for agent: {}", self.config.agent_id);
-        let value = serde_json::to_value(&agent_info)
-            .map_err(|e| KvStoreError::DeserializationFailed {
+        let value =
+            serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
                 deserialization_error: e.to_string(),
                 value: format!("{:?}", agent_info),
             })?;
 
-        // Get expected sequence from last successful heartbeat (0 if first write)
         let expected_sequence = {
             let last = self.last_heartbeat.read().await;
             last.as_ref()
@@ -291,18 +287,20 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
                 .unwrap_or(0)
         };
 
-        // Write with strict ordering - single network round-trip
-        let new_seq = self.health_kv.set_strict(key, value, expected_sequence).await?;
-        
-        debug!("Heartbeat stored successfully with sequence: {}", new_seq);
-        
+        trace!("Writing new heartbeat  {key} (#{expected_sequence}), value: {value:?}");
+        let new_seq = self
+            .health_kv
+            .set_strict(&key, value, expected_sequence)
+            .await?;
+        trace!("Got new sequence {new_seq}");
+        let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
+
+        debug!("Heartbeat stored succsssfully with sequence: {}", new_seq);
+
         // Construct complete heartbeat with metadata from store
         let heartbeat = AgentHeartbeat {
             agent_info,
-            metadata: Some(HeartbeatMetadata {
-                timestamp: todo!("get the real timestamp from store"),
-                sequence: new_seq,
-            }),
+            metadata: Some(kv_result.metadata),
         };
 
         // Cache this successful heartbeat for next iteration
@@ -318,30 +316,53 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
             next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
 
             // Perform the check via the config/strategy with a timeout
+            //
+            // FIXME There is too much stuff happening inside the timeout. There are some things like a
+            // promotion, that we don't want to cancel within a single heartbeat interval timeout
+            // I think that the timeout should only apply to the store_heartbeat().await call.
+            // Logic happening after should not be affected in the exact same manner. There can be
+            // other timeouts or other stuff to consider here.
+            // However, the system does rely on heartbeats happening regularly, so we do not want
+            // to delay the next heartbeat either. This is tricky.
+            // An idea right now is to keep the heartbeat running but, when a processing event
+            // occurs, set a flag on the local agent that there is a process running (promotion,
+            // demotion, etc) and take no other decision until this process is not done. There is
+            // one exception we can think of right now :
+            // - a healthy primary starts running a process such as "calling mom"
+            // - the primary keeps sending its heartbeat to prove to the rest of the cluster that
+            // it is still healthy
+            // - then the primary heartbeat fails up to failure_threshold
+            // - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain.
+            //  - Another solution would be register the processing: "calling mom" in the primary
+            //  heartbeat store, and prevent the replica from promoting when there is a running
+            //  task on the primary.
             let result = tokio::time::timeout(self.config.heartbeat_interval, async {
                 // Store heartbeat and perform deployment-specific health check
                 match &self.store_heartbeat().await {
                     Ok(heartbeat) => {
                         // Heartbeat stored successfully, already cached by store_heartbeat
-                        debug!("Heartbeat stored: seq={}", heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0));
-                        // Pass heartbeat with metadata to workflow for staleness checks
-                        self.workflow.on_heartbeat_stored(heartbeat).await;
+                        debug!(
+                            "Heartbeat stored: seq={}",
+                            heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
+                        );
                     }
-                    Err(KvStoreError::SequenceMismatch { expected, current }) => {
+                    Err(KvStoreError::WrongLastRevision) => {
+                        todo!("fetch and update correct last sequence number")
                         // CAS failure could indicate:
                         // 1. Network latency: our previous timeout heartbeat actually succeeded
                         // 2. Agent ID conflict: another agent with same ID exists
                         // 3. Clock/bucket corruption (unlikely)
-                        log::warn!(
-                            "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
-                            self.config.agent_id, expected, current, current
-                        );
-                        // Update cached heartbeat sequence to prevent repeated failures
-                        if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
-                            if let Some(metadata) = hb.metadata.as_mut() {
-                                metadata.sequence = *current;
-                            }
-                        }
+
+                        // log::warn!(
+                        //     "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
+                        //     self.config.agent_id, expected, current, current
+                        // );
+                        // // Update cached heartbeat sequence to prevent repeated failures
+                        // if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
+                        //     if let Some(metadata) = hb.metadata.as_mut() {
+                        //         metadata.sequence = *current;
+                        //     }
+                        // }
                     }
                     Err(e) => {
                         // Actual storage failure - treat as heartbeat failure
@@ -349,7 +370,10 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
                         return Err(HeartbeatFailure {});
                     }
                 }
-                self.config.deployment_config_unstable.perform_heartbeat().await?;
+                self.config
+                    .deployment_config_unstable
+                    .perform_heartbeat()
+                    .await?;
 
                 // TODO: Pass the heartbeat with metadata to the workflow for staleness checks
                 // The workflow needs access to metadata.timestamp for failover timeout calculations
@@ -367,10 +391,10 @@ impl<S: KvStore + Clone + Send + Sync + 'static> HarmonyAgent<S> {
             trace!("Got heartbeat_result : {heartbeat_result:?}");
             match heartbeat_result {
                 Ok(_) => {
-                    self.workflow.handle_heartbeat_success();
+                    self.workflow.handle_heartbeat_success().await;
                 }
                 Err(_) => {
-                    self.workflow.handle_heartbeat_failure();
+                    self.workflow.handle_heartbeat_failure().await;
                 }
             }
 
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
index de88ecf5..92a0fa09 100644
--- a/harmony_agent/src/main.rs
+++ b/harmony_agent/src/main.rs
@@ -1,259 +1,83 @@
-// mod typestate_gemini;
-// mod typestate;
+use std::{sync::Arc, time::Duration};
+
+use async_nats::jetstream::kv::Store;
+
+use crate::{
+    agent_loop::AgentRole,
+    store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
+};
+
 mod agent_loop;
-mod workflow;
 pub mod store;
+mod workflow;
 
 #[tokio::main]
 async fn main() {
-    // typestate_gemini::main_typestate_gemini().await;
-    agent_loop::main().await;
+    env_logger::init();
+
+    let heartbeat_interval = Duration::from_millis(2000);
+    let failover_timeout = Duration::from_secs(10);
+
+    // let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout);
+
+    let nats_store = get_local_nats_store().await;
+    let health_kv = nats_store.clone();
+    let cluster_kv = nats_store.clone();
+
+    let _ = tokio::join!(
+        agent_loop::launch_agent(
+            AgentRole::Primary,
+            health_kv.clone(),
+            cluster_kv.clone(),
+            heartbeat_interval,
+            failover_timeout
+        ),
+        agent_loop::launch_agent(
+            AgentRole::Replica,
+            health_kv,
+            cluster_kv,
+            heartbeat_interval,
+            failover_timeout
+        ),
+    );
 }
 
-// TODO
-//
-// DONE:
-// 1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
-// 2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
-// 3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
-// 4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
-// 5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
-// 6. ✅ failover_timeout added to AgentConfig
-// 7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
-// 8. ✅ startup reconciliation implemented via on_startup() method
-//
-// REMAINING:
-// - review all code and list implementation issues
-// - review both workflow for each state transition
-// - Complete replica workflow staleness detection (needs implementation in Watching state)
-// - Implement state recovery from Failed state for both workflows
-// - Implement subscribe in NATS store with watch() API
-// - Implement config validation for failover_timeout constraints
+fn get_chaos_store(
+    heartbeat_interval: &Duration,
+    failover_timeout: &Duration,
+) -> (
+    Arc<ChaosKvStore<InMemoryKvStore>>,
+    Arc<ChaosKvStore<InMemoryKvStore>>,
+) {
+    let health_kv = Arc::new(ChaosKvStore::new(
+        InMemoryKvStore::new(),
+        10,
+        10,
+        heartbeat_interval.as_millis().try_into().unwrap(),
+    ));
+    let cluster_kv = Arc::new(ChaosKvStore::new(
+        InMemoryKvStore::new(),
+        5,
+        5,
+        failover_timeout.as_millis().try_into().unwrap(),
+    ));
 
-// TODO
-//
-// 1. store trait subscribe definition missing callback
-// 2. BUG, data integrity issue : nats store not actually using jetstream metadata 
-// 3. review all code and list implementation issues
-// 4. review both workflow for each state transition
-// 5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded
-// 6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself
-//    should hold it probably, every agent should be subscribed to the cluster_state object and
-//    keep it in memory to allow workflows to process against it efficiently)
+    (health_kv, cluster_kv)
+}
 
-// ## CRITICAL - Data Integrity Issues
-// 
-// 1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
-//    - Currently uses `put()` which overwrites unconditionally
-//    - Must use `update()` with revision parameter for proper compare-and-set
-//    - Without this, concurrent promotion attempts can cause split brain
-// 
-// 2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
-//    - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
-//    - NATS Entry has `.revision` and `.created` fields that must be used
-//    - This defeats the entire purpose of store-provided timestamps
-// 
-// 3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
-//    - Line ~156: TODO comment confirms missing metadata passing
-//    - Replica cannot calculate staleness without metadata.timestamp
-//    - Failover logic is broken
-// 
-// 4. **No actual cluster state watching exists**
-//    - Replica workflow declares `ClusterState` but never updates it
-//    - No subscription to primary heartbeat or cluster_state key
-//    - Replica cannot detect primary liveness
-// 
-// ## HIGH - Missing Core Functionality
-// 
-// 5. **Replica Workflow incomplete** - All key logic is TODO:
-//    - Watching primary staleness (line 114)
-//    - Promotion attempt (line 118)
-//    - Original primary recovery detection (line 127)
-//    - Demotion/handshake (line 131)
-// 
-// 6. **Missing replica "Failed" state**
-//    - `ReplicaState` enum has no `Failed` variant
-//    - User's TODO #5 correctly identifies this gap
-//    - What happens if replica's own heartbeats fail repeatedly?
-// 
-// 7. **Primary Workflow incomplete** - Key logic missing:
-//    - No NATS check before recovering from `Fenced` state (line 95)
-//    - No NATS check in `Yielding` state for demotion handshake (line 101)
-//    - No actual fencing failure handling
-// 
-// 8. **Store `subscribe` not implemented** (`store/mod.rs`)
-//    - Returns `todo!()` in NATS implementation
-//    - No callback mechanism defined in trait
-//    - Without this, agents cannot react to state changes
-// 
-// 9. **Cluster state not tracked centrally**
-//    - User's TODO #6 correctly identifies this
-//    - Each agent should maintain a local copy of cluster_state
-//    - No subscription mechanism to update this local copy
-// 
-// 10. **No validation of configuration constraints**
-//     - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
-//     - Invalid config could cause split brain
-// 
-// ## MEDIUM - Incorrect State Transitions
-// 
-// 11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
-//     - Two state transitions happen in one heartbeat cycle
-//     - Should stay in `Failed` until fencing actually completes
-//     - What if fencing fails? State machine won't reflect it
-// 
-// 12. **No fencing failure handling**
-//     - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
-//     - ADR mentions escalating to radical measures, but no callback for failure
-// 
-// 13. **Replica `Watching` state does nothing**
-//     - Line 115: Just logs, checks nothing
-//     - Should be checking staleness of primary heartbeat
-// 
-// 14. **Demotion handshake not implemented**
-//     - ADR section 4 details this but code doesn't implement it
-//     - How does original primary know it should yield?
-// 
-// ## LOW - Observability & Reliability
-// 
-// 15. **No graceful shutdown mechanism**
-//     - `run_heartbeat_loop` runs forever
-//     - No signal handling (SIGTERM, SIGINT)
-// 
-// 16. **Async task errors silently ignored**
-//     - `tokio::spawn` at lines 74, 83, 123
-//     - No `JoinHandle` retention or error handling
-// 
-// 17. **No metrics/observability**
-//     - Only log output
-//     - No Prometheus metrics for state transitions, failure counts, etc.
-// 
-// 18. **Hardcoded main() function** (`agent_loop.rs::main`)
-//     - Not production-ready entry point
-//     - Should load config from environment or file
-// 
-// 19. **Store factory pattern missing**
-//     - TODO comment at line 54 confirms this
-//     - Can't switch between stores via config
-// 
-// 20. **No backoff/retry logic for NATS operations**
-//     - Transient failures could trigger unnecessary fencing
-// 
-// 21. **`AgentInfo` status is hardcoded to "HEALTHY"**
-//     - Line 137 in `store_heartbeat`
-//     - Should反映 actual workflow state
-// 
-// 22. **Unused fields in structs**
-//     - `HeartbeatState.last_seq` set but never read
-//     - `ClusterState.current_primary` set but never read
-// 
-// ## ADR-017-3 Compliance Issues
-// 
-// 23. **ADR violation: Clock skew not avoided**
-//     - While ADR says use store metadata, code uses local time
-// 
-// 24. **Failover timeout not configurable**
-//     - Defined in ADR but not in `AgentConfig`
-//     - Needed for replica staleness calculation
-// 
-// 25. **Safety margin concept exists in ADR but not in code**
-//     - Configuration should include this margin
-// 
-// 26. **No handling of Case 3 (Replica Network Lag)**
-//     - ADR describes NATS rejection prevention
-//     - But `set_strict` implementation accepts any write
-// 
-// ## Code Quality Issues
-// 
-// 27. **Inconsistent error handling**
-//     - Some paths return `Err`, others `todo!()`, others ignore
-// 
-// 28. **Unnecessary `Clone` bounds**
-//     - `DeploymentConfig.clone()` used frequently
-//     - Could be optimized with `Arc`
-// 
-// 29. **Missing lifetime annotations**
-//     - `KvStore::get` returns `String` key in error - inefficient
-// 
-// 30. **No integration points mentioned**
-//     - PostgreSQL lifecycle control implementation missing
-//     - Fencing via CNPG not connected
-// 
-// ## Production Readiness Checklist Summary
-// 
-// For battle testing preparation, you need:
-// 
-// **Immediate ( blockers):**
-// - Fix NATS store metadata usage (issues #1, #2)
-// - Implement strict set_strict with actual CAS (#1)
-// - Implement replica primary watching (#4, #5)
-// - Add failover_timeout config + staleness logic (#3, #24)
-// - Implement subscribe mechanism with callbacks (#8)
-// 
-// **High priority:**
-// - Complete all workflow transitions (#5, #7, #11-14)
-// - Add cluster state tracking (#6, #9)
-// - Add configuration validation (#10)
-// - Add Replica Failed state (#6)
-// 
-// **Before deployment:**
-// - Implement graceful shutdown (#15)
-// - Add error handling for spawned tasks (#16)
-// - Remove hardcoded main function (#18)
-// - Implement store factory (#19)
-// - Add Prometheus metrics (#17)
-// 
-// **Documentation:**
-// - Document all configuration parameters and their trade-offs
-// - Add runbooks for each failure mode
-// - Document battle test scenarios to cover
-// 
-// ### Addendum: Missing Critical Issues
-// 
-// #### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
-// *   **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
-// *   **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
-// *   **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
-// *   **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
-// 
-// #### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
-// *   **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
-// *   **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
-// *   **Scenario:**
-//     1.  Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
-//     2.  Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
-//     3.  `on_active` finishes *before* `on_failover`.
-//     4.  `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
-// *   **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
-// 
-// #### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
-// *   **Location:** `agent_loop.rs` loop logic.
-// *   **The Bug:** There is no "Stop the World" gate.
-// *   **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
-// *   **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
-// *   **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
-// 
-// #### 4. HIGH: NATS Bucket Name Collision
-// *   **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
-// *   **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
-// *   **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
-// *   **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
-// 
-// #### 5. HIGH: Startup State Reconciliation
-// *   **Location:** `HarmonyAgent::new`.
-// *   **The Bug:** Agents always start in `Initializing`.
-// *   **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
-// *   **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
-// *   **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
-// 
-// ### Summary of Tasks to Add
-// 
-// Please add these to your master list before starting implementation:
-// 
-// 28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
-// 29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
-// 30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
-// 31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
-// 32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
-// 
+async fn get_local_nats_store() -> Arc<NatsKvStore> {
+    let client = async_nats::connect("localhost").await.unwrap();
+    let jetstream = async_nats::jetstream::new(client);
+    let kv = jetstream
+        .create_key_value(async_nats::jetstream::kv::Config {
+            bucket: "kv".to_string(),
+            history: 10,
+            ..Default::default()
+        })
+        .await
+        .unwrap();
+    let status = kv.status().await.unwrap();
+    println!("status: {:?}", status);
 
+    Arc::new(NatsKvStore::new(kv))
+}
diff --git a/harmony_agent/src/old/typestate.rs b/harmony_agent/src/old/typestate.rs
new file mode 100644
index 00000000..78445d55
--- /dev/null
+++ b/harmony_agent/src/old/typestate.rs
@@ -0,0 +1,230 @@
+use std::{marker::PhantomData, time::Duration};
+
+/// Typestate pattern implementation for Primary and Replica state machines
+/// Based on Will Crichton's typestate pattern
+
+pub mod primary {
+  use super::Agent;
+
+  /// Primary state: Agent is initializing
+  pub struct Initializing {}
+
+  /// Primary state: Heartbeat failures exceeded threshold
+  pub struct Failed {}
+
+  /// Primary state: Database fenced/stopped
+  pub struct Fenced {}
+
+  /// Primary state: Heartbeat succeeding
+  pub struct Healthy {}
+
+  /// Primary state: Recovered from fence, waiting for demotion handshake
+  pub struct Yielding {}
+
+  impl Agent<Initializing> {
+    /// Transition from initializing to healthy
+    pub fn healthy(self) -> Agent<Healthy> {
+        self
+    }
+  }
+
+  impl Agent<Failed> {
+    /// Transition from failed to fenced
+    pub fn fence(self) -> Agent<Fenced> {
+      Agent {
+        consecutive_success: self.consecutive_success,
+        consecutive_failure: self.consecutive_failure,
+        failure_threshold: self.failure_threshold,
+        success_threshold: self.success_threshold,
+        heartbeat_timeout: self.heartbeat_timeout,
+        _state: PhantomData
+      }
+    }
+
+    /// Transition from failed to healthy (recovery)
+    pub fn recover(self) -> Agent<Healthy> {
+        self
+    }
+  }
+
+  impl Agent<Fenced> {
+    /// Transition from fenced to yielding (waiting for demotion)
+    pub fn await_demotion(self) -> Agent<Yielding> {
+        self
+    }
+
+    /// Transition from fenced to healthy (recovery after demotion completes)
+    pub fn recover(self) -> Agent<Healthy> {
+        self
+    }
+  }
+
+  impl Agent<Healthy> {
+    /// Transition from healthy to yielding (when original primary recovers)
+    pub fn yield_leadership(self) -> Agent<Yielding> {
+        self
+    }
+
+    /// Transition from healthy to failed (heartbeat failure)
+    pub fn fail(self) -> Agent<Failed> {
+        self
+    }
+  }
+
+  impl Agent<Yielding> {
+    /// Transition from yielding back to healthy (after demotion completes)
+    pub fn recover(self) -> Agent<Healthy> {
+        self
+    }
+
+    /// Transition from yielding back to healthy (if demotion cancelled)
+    pub fn recover_and_promote(self) -> Agent<Healthy> {
+        self
+    }
+  }
+}
+
+pub mod replica {
+  use super::Agent;
+
+  /// Replica state: Agent is initializing
+  pub struct Initializing {}
+
+  /// Replica state: Watching primary heartbeats
+  pub struct Watching {}
+
+  /// Replica state: Failover timeout exceeded, attempting promotion
+  pub struct Promoting {}
+
+  /// Replica state: Promotion attempt rejected by NATS
+  pub struct PromotionFailed {}
+
+  /// Replica state: Successfully promoted to leader
+  pub struct Leader {}
+
+  /// Replica state: Original primary recovered, yielding leadership
+  pub struct Demoting {}
+
+  impl Agent<Initializing> {
+    /// Transition from initializing to watching
+    pub fn start_watching(self) -> Agent<Watching> {
+        self
+    }
+  }
+
+  impl Agent<Watching> {
+    /// Transition from watching to promoting (failover timeout reached)
+    pub fn promote(self) -> Agent<Promoting> {
+        self
+    }
+
+    /// Transition from watching back to promoting (if demotion cancelled)
+    pub fn promote_again(self) -> Agent<Promoting> {
+        self
+    }
+  }
+
+  impl Agent<Promoting> {
+    /// Transition from promoting to leader (promotion successful)
+    pub fn become_leader(self) -> Agent<Leader> {
+        self
+    }
+
+    /// Transition from promoting to promotion_failed (NATS rejected)
+    pub fn promotion_rejected(self) -> Agent<PromotionFailed> {
+        self
+    }
+
+    /// Transition from promoting back to watching (reverted)
+    pub fn revert_to_watching(self) -> Agent<Watching> {
+        self
+    }
+  }
+
+  impl Agent<PromotionFailed> {
+    /// Transition from promotion_failed back to watching
+    pub fn continue_watching(self) -> Agent<Watching> {
+        self
+    }
+  }
+
+  impl Agent<Leader> {
+    /// Transition from leader to demoting (original primary recovered)
+    pub fn yield_leadership(self) -> Agent<Demoting> {
+        self
+    }
+
+    /// Transition from leader to watching (if demotion cancelled)
+    pub fn revert_to_watching(self) -> Agent<Watching> {
+        self
+    }
+  }
+
+  impl Agent<Demoting> {
+    /// Transition from demoting back to watching (if demotion cancelled)
+    pub fn revert_to_watching(self) -> Agent<Watching> {
+        self
+    }
+
+    /// Transition from demoting back to leader (if demotion cancelled)
+    pub fn promote_again(self) -> Agent<Leader> {
+        self
+    }
+  }
+}
+
+/// Main Agent struct using typestate pattern
+/// State is tracked through the generic type parameter
+pub struct Agent<State> {
+  pub consecutive_success: usize,
+  pub consecutive_failure: usize,
+  pub failure_threshold: usize,
+  pub success_threshold: usize,
+  pub heartbeat_timeout: Duration,
+  _state: PhantomData<State>
+}
+
+impl<State> Agent<State> {
+  /// Create a new agent in the given state with default thresholds
+  pub fn new(state: State) -> Self {
+    Agent {
+      consecutive_success: 0,
+      consecutive_failure: 0,
+      failure_threshold: 2,
+      success_threshold: 3,
+      heartbeat_timeout: Duration::from_secs(1),
+      _state: PhantomData
+    }
+  }
+
+  /// Create a new agent with custom thresholds
+  pub fn with_thresholds(state: State, success_threshold: usize, failure_threshold: usize, heartbeat_timeout: Duration) -> Self {
+    Agent {
+      consecutive_success: 0,
+      consecutive_failure: 0,
+      failure_threshold,
+      success_threshold,
+      heartbeat_timeout,
+      _state: PhantomData
+    }
+  }
+}
+
+impl<State> Clone for Agent<State> {
+  fn clone(&self) -> Self {
+    Agent {
+      consecutive_success: self.consecutive_success,
+      consecutive_failure: self.consecutive_failure,
+      failure_threshold: self.failure_threshold,
+      success_threshold: self.success_threshold,
+      heartbeat_timeout: self.heartbeat_timeout,
+      _state: PhantomData
+    }
+  }
+}
+
+impl<State> Default for Agent<State> {
+  fn default() -> Self {
+    Self::new(Initializing {})
+  }
+}
diff --git a/harmony_agent/src/old/typestate_gemini.rs b/harmony_agent/src/old/typestate_gemini.rs
new file mode 100644
index 00000000..e4285bdd
--- /dev/null
+++ b/harmony_agent/src/old/typestate_gemini.rs
@@ -0,0 +1,523 @@
+use std::marker::PhantomData;
+use std::time::Duration;
+use tokio::sync::mpsc;
+use tokio::time::Instant;
+
+// =============================================================================
+// FSM Library (Type State Pattern)
+// =============================================================================
+
+pub mod fsm {
+    use super::*;
+
+    /// Generic FSM container
+    pub struct FSM<S, E, U> {
+        pub user_data: Option<U>,
+        pub state: PhantomData<S>,
+        pub _phantom_event: PhantomData<E>,
+    }
+
+    impl<S, E, U> FSM<S, E, U> {
+        pub fn new(user_data: Option<U>) -> Self {
+            Self {
+                user_data,
+                state: PhantomData,
+                _phantom_event: PhantomData,
+            }
+        }
+    }
+
+    /// Trait to represent FSM behavior via dynamic dispatch
+    pub trait HandleEvent<E, U> {
+        fn handle_event(self: Box<Self>, event: E) -> Box<dyn ErasedState<E, U>>;
+    }
+
+    /// Implemented per-state by the macro to route event logic
+    pub trait ErasedState<E, U>: Send {
+        fn handle_event(self: Box<Self>, event: E) -> Box<dyn ErasedState<E, U>>;
+    }
+
+    impl<S, E, U> ErasedState<E, U> for FSM<S, E, U>
+    where
+        FSM<S, E, U>: HandleEvent<E, U> + Send + 'static,
+    {
+        fn handle_event(self: Box<Self>, event: E) -> Box<dyn ErasedState<E, U>> {
+            HandleEvent::handle_event(self, event)
+        }
+    }
+
+    /// Allows FSM to move from state `S` to `T`, retaining user data
+    pub trait StateMachine<S, E, U>: Send + 'static {
+        fn into_boxed<T>(self) -> Box<FSM<T, E, U>>;
+    }
+
+    impl<S, E, U> StateMachine<S, E, U> for FSM<S, E, U>
+    where
+        S: Send + 'static,
+        E: Send + 'static,
+        U: Send + 'static,
+    {
+        fn into_boxed<T>(self) -> Box<FSM<T, E, U>> {
+            Box::new(FSM {
+                user_data: self.user_data,
+                state: PhantomData,
+                _phantom_event: PhantomData,
+            })
+        }
+    }
+
+    /// Runs the FSM in an asynchronous loop
+    pub async fn run_machine<E, U>(
+        mut state: Box<dyn ErasedState<E, U>>,
+        mut rx: tokio::sync::mpsc::Receiver<E>,
+    ) where
+        E: Send + 'static,
+        U: Send + 'static,
+    {
+        while let Some(event) = rx.recv().await {
+            state = ErasedState::handle_event(state, event);
+        }
+    }
+}
+
+/// Macro for Declaring Transitions
+#[macro_export]
+macro_rules! define_fsm {
+    (
+        $struct:ident<$event:ident, $user:ident>, {
+            $(
+                $state:ty => {
+                    $(
+                        $pattern:pat => $next:ty => $action:expr
+                    ),* $(,)?
+                }
+            ),* $(,)?
+        }
+    ) => {
+        $(
+            impl $crate::fsm::HandleEvent<$event, $user> for $struct<$state, $event, $user> {
+                fn handle_event(mut self: Box<Self>, event: $event) -> Box<dyn $crate::fsm::ErasedState<$event, $user>> {
+                    match event {
+                        $(
+                            $pattern => {
+                                // log::debug!("FSM Transition: {:?} --[{:?}]--> {:?}", stringify!($state), e, stringify!($next));
+                                log::debug!("FSM Transition: {:?} --[:?]--> {:?}", stringify!($state), stringify!($next));
+                                $action(&mut self);
+                                self.into_boxed::<$next>()
+                            }
+                        )*
+                        // Default handler for unmapped events in this state: stay in current state
+                        _ => {
+                            // log::trace!("FSM Ignore: {:?} --[{:?}]--> (no transition)", stringify!($state), event);
+                            self
+                        }
+                    }
+                }
+            }
+        )*
+    };
+}
+
+// =============================================================================
+// Harmony Agent Domain Logic
+// =============================================================================
+
+use fsm::{ErasedState, StateMachine, FSM};
+
+// --- States ---
+#[derive(Debug)]
+struct RolePrimary; // Active Leader
+#[derive(Debug)]
+struct RoleReplica; // Passive Watchdog
+#[derive(Debug)]
+struct RoleFencing; // Transition: Shutting down
+#[derive(Debug)]
+struct RolePromoting; // Transition: Taking over
+#[derive(Debug)]
+struct RoleDemoting; // Transition: Yielding
+
+// --- Events ---
+#[derive(Debug, Clone)]
+enum AgentEvent {
+    /// Periodic timer tick (drives checks)
+    Tick,
+    /// Result of a local health check (Primary only)
+    HealthCheckResult { success: bool },
+    /// Update from NATS about the cluster state
+    ClusterStateUpdate { primary_id: String, timestamp: Instant },
+    /// Command to force a state change (e.g. admin intervention)
+    ForceDemote,
+}
+
+// --- Side Effect Commands (Outbound) ---
+#[derive(Debug)]
+enum WorkerCommand {
+    PerformHealthCheck,
+    PerformFencing,
+    PerformPromotion,
+    PerformDemotion,
+}
+
+// --- Context ---
+struct AgentContext {
+    // Config
+    agent_id: String,
+    success_threshold: usize,
+    failure_threshold: usize,
+    heartbeat_interval: Duration,
+    failover_timeout: Duration,
+
+    // Runtime State
+    consecutive_failures: usize,
+    last_primary_heartbeat: Option<Instant>,
+    
+    // Communication
+    worker_tx: mpsc::Sender<WorkerCommand>,
+}
+
+impl AgentContext {
+    fn send_command(&self, cmd: WorkerCommand) {
+        let tx = self.worker_tx.clone();
+        tokio::spawn(async move {
+            if let Err(e) = tx.send(cmd).await {
+                log::error!("Failed to send worker command: {}", e);
+            }
+        });
+    }
+}
+
+// --- FSM Definition ---
+
+define_fsm!(FSM<AgentEvent, AgentContext>, {
+    // -------------------------------------------------------------------------
+    // PRIMARY STATE (Self-Preservation)
+    // -------------------------------------------------------------------------
+    RolePrimary => {
+        // 1. On Tick: Trigger a health check (Async Side Effect)
+        AgentEvent::Tick => RolePrimary => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
+            if let Some(ctx) = &mut s.user_data {
+                ctx.send_command(WorkerCommand::PerformHealthCheck);
+            }
+        },
+
+        // 2. Health Check Success: Reset counters
+        AgentEvent::HealthCheckResult { success: true } => RolePrimary => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
+            if let Some(ctx) = &mut s.user_data {
+                ctx.consecutive_failures = 0;
+                log::info!("✅ Heartbeat Success (Primary)");
+            }
+        },
+
+        // 3. Health Check Failure: Increment counters & Check Threshold
+        AgentEvent::HealthCheckResult { success: false } => RolePrimary => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
+            // NOTE: We determine next state dynamically by checking threshold.
+            // Since the macro requires a static next type, we handle the "Stay" case here.
+            // If we need to transition, we assume the event loop sends a specific event,
+            // OR we use a separate state for "Checking".
+            // However, to keep it simple within this pattern, we will check threshold here.
+            // If threshold reached, we ideally want to return RoleFencing.
+            // But the macro forces `=> RolePrimary`.
+            //
+            // WORKAROUND: We use a specific event flow.
+            // Ideally, the `HealthCheckResult` logic would be:
+            // if fail >= threshold { transition Fencing } else { stay }
+            //
+            // To strictly follow the macro structure where destination is fixed per pattern:
+            // We can't branch to different types in one pattern.
+            // So we will stay in RolePrimary here, but if threshold is hit, we trigger Fencing immediately
+            // by sending a command, and we rely on the Worker to complete fencing and maybe restart us?
+            //
+            // BETTER APPROACH for this specific FSM pattern:
+            // We need an intermediate event or state if the destination depends on runtime data.
+            // But let's assume for this implementation that we handle the "Stay" case here,
+            // and if we fail, we transition to Fencing on the NEXT tick or via a self-generated event?
+            //
+            // Let's modify the logic: The Worker sends `HealthCheckResult { success: false }`.
+            // If we are still below threshold, we log.
+            // If we are at threshold, we treat this event as a trigger for Fencing?
+            // No, the pattern matches `Event => Type`.
+            //
+            // Revised: We need two patterns. But we can't match on values inside the struct in the macro easily
+            // unless we define specific events like `HealthCheckFailedFatal`.
+            //
+            // Let's use `consecutive_failures` check inside the action.
+            // If fatal, we return a new Box<RoleFencing>.
+            // Wait, the macro generates `self.into_boxed::<$next>()`. It hardcodes the return type.
+            //
+            // This is a limitation of the macro provided in the blog post.
+            // To solve this strictly following the provided code, we must ensure the event *itself* dictates the transition.
+            //
+            // So the Worker must know the threshold? No, that leaks logic.
+            //
+            // Solution: The FSM Action can mutate `ctx`.
+            // We will have `AgentEvent::HealthCheckFailed`.
+            // We stay in `RolePrimary`.
+            // Inside the action, if `ctx.failures >= threshold`, we `ctx.send_command(PerformFencing)`.
+            // And we transition to `RoleFencing`? We can't conditionally transition in the macro.
+            //
+            // OK, I will split the event.
+            // The Worker returns `HealthCheckResult`.
+            // The FSM handles it.
+            // If the FSM sees failure, it stays in Primary.
+            // But if it needs to fence, it needs to transition.
+            //
+            // I will add `AgentEvent::FencingTriggered` which is sent by the FSM to itself?
+            // Or simpler: The Worker sends `HealthCheckFailed`.
+            // If we want to fence, we need to move to `RoleFencing`.
+            //
+            // Let's adjust the macro usage slightly. The user said "follow exactly the FSM pattern".
+            // The pattern implies strict state transitions.
+            //
+            // I will implement `RolePrimary` -> `RoleFencing` on `ForceDemote` or similar.
+            // And I will assume the Worker sends `ForceDemote` if it detects critical failure?
+            // No, the logic belongs in the FSM.
+            //
+            // Let's use the `Tick` to check the counter.
+            // 1. Tick -> Check.
+            // 2. Result -> Update Counter.
+            // 3. Tick -> If counter > thresh -> Transition Fencing.
+            //
+            // Let's try that.
+            if let Some(ctx) = &mut s.user_data {
+                ctx.consecutive_failures += 1;
+                log::warn!("⚠️ Heartbeat Failed (Count: {}/{})", ctx.consecutive_failures, ctx.failure_threshold);
+            }
+        },
+
+        // 4. The actual Fencing Transition
+        // We use a specific pattern guard if possible, or just a separate event.
+        // Since we can't guard in the macro, we'll use a trick:
+        // If failures are high, the NEXT Tick will trigger transition?
+        // No, we want immediate.
+        //
+        // Let's add `AgentEvent::CriticalFailure` event.
+        // The `HealthCheckResult` handler (above) will check the threshold.
+        // If threshold reached, it cannot transition itself (locked to RolePrimary).
+        // BUT, it can emit a `CriticalFailure` event to the channel.
+        // Then the FSM loop picks it up and transitions.
+        AgentEvent::ForceDemote => RoleFencing => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
+             if let Some(ctx) = &mut s.user_data {
+                log::error!("🚨 Failure Threshold Reached. Initiating Fencing.");
+                ctx.send_command(WorkerCommand::PerformFencing);
+            }
+        },
+
+        // 5. Split Brain Prevention
+        AgentEvent::ClusterStateUpdate { primary_id, .. } => RoleDemoting => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
+             if let Some(ctx) = &mut s.user_data {
+                if primary_id != ctx.agent_id && !primary_id.is_empty() {
+                    log::warn!("Split Brain Detected! Another primary is active: {}. Demoting.", primary_id);
+                    ctx.send_command(WorkerCommand::PerformDemotion);
+                }
+            }
+        }
+    },
+
+    // -------------------------------------------------------------------------
+    // REPLICA STATE (Watchdog)
+    // -------------------------------------------------------------------------
+    RoleReplica => {
+        // 1. Receive Heartbeats from Primary
+        AgentEvent::ClusterStateUpdate { primary_id, timestamp } => RoleReplica => |s: &mut FSM<RoleReplica, AgentEvent, AgentContext>| {
+            if let Some(ctx) = &mut s.user_data {
+                if !primary_id.is_empty() {
+                    ctx.last_primary_heartbeat = Some(timestamp);
+                    // log::trace!("Replica: Saw primary {} at {:?}", primary_id, timestamp);
+                }
+            }
+        },
+
+        // 2. Tick: Check for Staleness
+        AgentEvent::Tick => RoleReplica => |s: &mut FSM<RoleReplica, AgentEvent, AgentContext>| {
+             // We can't transition conditionally here either.
+             // Same pattern: Check logic, if stale, send `ForcePromote` event to self.
+             if let Some(ctx) = &mut s.user_data {
+                 if let Some(last) = ctx.last_primary_heartbeat {
+                     let elapsed = Instant::now().duration_since(last);
+                     if elapsed > ctx.failover_timeout {
+                         log::warn!("⚡ Primary Stale ({}ms > {}ms). Triggering Promotion.", elapsed.as_millis(), ctx.failover_timeout.as_millis());
+                         // We need to trigger the transition.
+                         // We can't do it directly in this closure because the return type is fixed to RoleReplica.
+                         // So we assume the "Driver" or a self-send handles the trigger.
+                         // For this implementation, we'll assume we have a handle to the main loop channel in ctx?
+                         // No, ctx has `worker_tx`.
+                         //
+                         // We will send a command to worker to "ConfirmPromotionEligibility", which sends back `ForcePromote`.
+                         ctx.send_command(WorkerCommand::PerformPromotion); // This checks eligibility then triggers event
+                     }
+                 }
+             }
+        },
+
+        // 3. Promotion Triggered
+        AgentEvent::ForceDemote => RolePromoting => |s: &mut FSM<RoleReplica, AgentEvent, AgentContext>| {
+             log::info!("Promoting to Primary...");
+        }
+    },
+
+    // -------------------------------------------------------------------------
+    // FENCING STATE (Transient)
+    // -------------------------------------------------------------------------
+    RoleFencing => {
+        // Once fencing is done (simulated by Tick or specific event), we become a Replica (Clean Demotion)
+        AgentEvent::Tick => RoleReplica => |s: &mut FSM<RoleFencing, AgentEvent, AgentContext>| {
+            log::info!("Fencing/Demotion complete. Switching to Replica (Watchdog) mode.");
+            if let Some(ctx) = &mut s.user_data {
+                ctx.consecutive_failures = 0;
+            }
+        }
+    },
+
+    // -------------------------------------------------------------------------
+    // PROMOTING STATE (Transient)
+    // -------------------------------------------------------------------------
+    RolePromoting => {
+        // Promotion logic usually involves ensuring WAL catchup etc.
+        // We simulate success on next Tick.
+        AgentEvent::Tick => RolePrimary => |s: &mut FSM<RolePromoting, AgentEvent, AgentContext>| {
+            log::info!("Promotion Complete. I am now the PRIMARY.");
+            if let Some(ctx) = &mut s.user_data {
+                ctx.consecutive_failures = 0;
+                // Reset heartbeat timestamp so we don't fence immediately
+                ctx.last_primary_heartbeat = Some(Instant::now());
+            }
+        }
+    },
+
+    // -------------------------------------------------------------------------
+    // DEMOTING STATE (Transient)
+    // -------------------------------------------------------------------------
+    RoleDemoting => {
+         AgentEvent::Tick => RoleReplica => |s: &mut FSM<RoleDemoting, AgentEvent, AgentContext>| {
+            log::info!("Demotion Complete. Switching to Replica.");
+        }
+    }
+});
+
+// =============================================================================
+// Main & Runtime
+// =============================================================================
+
+pub async fn main_typestate_gemini() -> Result<(), Box<dyn std::error::Error>> {
+    env_logger::init();
+    log::info!("Harmony Agent FSM Starting...");
+
+    // 1. Setup Channels
+    let (event_tx, event_rx) = mpsc::channel::<AgentEvent>(100);
+    let (worker_tx, mut worker_rx) = mpsc::channel::<WorkerCommand>(100);
+
+    // 2. Configuration
+    let my_agent_id = "agent_1".to_string();
+    let desired_primary = "agent_1".to_string(); // Change to "agent_2" to test Replica start
+    let is_primary = my_agent_id == desired_primary;
+
+    let context = AgentContext {
+        agent_id: my_agent_id.clone(),
+        success_threshold: 2,
+        failure_threshold: 2,
+        heartbeat_interval: Duration::from_secs(1),
+        failover_timeout: Duration::from_secs(3), // 3s > 1s interval
+        consecutive_failures: 0,
+        last_primary_heartbeat: Some(Instant::now()),
+        worker_tx: worker_tx.clone(),
+    };
+
+    // 3. Spawn Worker (Simulates IO and Logic Glue)
+    let event_tx_worker = event_tx.clone();
+    tokio::spawn(async move {
+        while let Some(cmd) = worker_rx.recv().await {
+            match cmd {
+                WorkerCommand::PerformHealthCheck => {
+                    // Simulate IO latency
+                    tokio::time::sleep(Duration::from_millis(100)).await;
+                    
+                    // Simulate random failure (10% chance)
+                    let success = getrandom::u64().unwrap() % 100 > 10;
+                    
+                    // Send result back
+                    let _ = event_tx_worker.send(AgentEvent::HealthCheckResult { success }).await;
+                    
+                    // CRITICAL: Logic Glue for the FSM limitation
+                    // If we failed, we don't know the counter here easily without shared state.
+                    // But for the purpose of this demo, let's assume the FSM handles the counter.
+                    // If the FSM decides to fence, it sends PerformFencing.
+                    //
+                    // However, we need to trigger the transition event if threshold is hit.
+                    // Since FSM action is sync and can't send async events easily back to itself *during* the transition,
+                    // we rely on the FSM action checking the counter and sending a command to US (Worker),
+                    // and WE send the transition event back.
+                }
+                WorkerCommand::PerformFencing => {
+                    log::warn!("[Worker] Executing Fencing Procedure (Stop DB)...");
+                    tokio::time::sleep(Duration::from_millis(500)).await;
+                    // Trigger the state transition in FSM
+                    let _ = event_tx_worker.send(AgentEvent::ForceDemote).await;
+                }
+                WorkerCommand::PerformPromotion => {
+                    log::info!("[Worker] Checking Promotion Eligibility...");
+                    // Simulate check
+                    tokio::time::sleep(Duration::from_millis(200)).await;
+                    // Trigger transition
+                    let _ = event_tx_worker.send(AgentEvent::ForceDemote).await; // Reusing ForceDemote as "Trigger Transition" for Replica->Promote based on graph? 
+                    // Wait, Replica->Promote uses ForceDemote in the macro above? 
+                    // Yes: AgentEvent::ForceDemote => RolePromoting
+                }
+                WorkerCommand::PerformDemotion => {
+                    log::warn!("[Worker] Yielding Leadership...");
+                    tokio::time::sleep(Duration::from_millis(200)).await;
+                    // Trigger transition
+                    // We need an event that goes Primary -> Demoting.
+                    // In the macro: AgentEvent::ClusterStateUpdate handles the detection.
+                    // But we need to transition. 
+                    // Actually, the macro for ClusterStateUpdate transitions DIRECTLY to RoleDemoting.
+                    // So this command might just be for side-effects (stopping DB).
+                }
+            }
+        }
+    });
+
+    // 4. Spawn Timer (Heartbeat Tick)
+    let event_tx_timer = event_tx.clone();
+    tokio::spawn(async move {
+        let mut interval = tokio::time::interval(Duration::from_secs(1));
+        loop {
+            interval.tick().await;
+            let _ = event_tx_timer.send(AgentEvent::Tick).await;
+        }
+    });
+
+    // 5. Spawn NATS Watcher (Simulated)
+    let event_tx_nats = event_tx.clone();
+    tokio::spawn(async move {
+        // Simulate receiving heartbeats from "agent_1"
+        loop {
+            tokio::time::sleep(Duration::from_millis(500)).await;
+            // If we are agent_1, we are the primary, so we don't see external heartbeats usually,
+            // but for simulation, let's say we see ourselves or nothing.
+            // If we are agent_2 (Replica), we see agent_1.
+            
+            // Uncomment to simulate primary death for Replica:
+            // continue; 
+            
+            let _ = event_tx_nats.send(AgentEvent::ClusterStateUpdate {
+                primary_id: "agent_1".to_string(),
+                timestamp: Instant::now(),
+            }).await;
+        }
+    });
+
+    // 6. Initialize FSM
+    let initial_state: Box<dyn ErasedState<AgentEvent, AgentContext>> = if is_primary {
+        log::info!("Starting as PRIMARY");
+        Box::new(FSM::<RolePrimary, AgentEvent, AgentContext>::new(Some(context)))
+    } else {
+        log::info!("Starting as REPLICA");
+        Box::new(FSM::<RoleReplica, AgentEvent, AgentContext>::new(Some(context)))
+    };
+
+    // 7. Run
+    fsm::run_machine(initial_state, event_rx).await;
+
+    Ok(())
+}
+
diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs
index 1dce4ed8..9fa6fc83 100644
--- a/harmony_agent/src/store/chaos.rs
+++ b/harmony_agent/src/store/chaos.rs
@@ -1,4 +1,5 @@
 use async_trait::async_trait;
+use log::{debug, trace, warn};
 use serde_json::Value;
 use std::sync::Arc;
 use tokio::time::Duration;
@@ -12,43 +13,55 @@ use super::{KvStore, KvStoreError};
 #[derive(Clone)]
 pub struct ChaosKvStore<T: KvStore> {
     inner: Arc<T>,
-    timeout_probability: u32,
-    failure_probability_percentage: u32,
+    timeout_probability_percent: u32,
+    failure_probability_percent: u32,
     max_delay_ms: u64,
 }
 
 impl<T: KvStore> ChaosKvStore<T> {
     pub fn new(
         inner: T,
-        timeout_probability: u32,
-        failure_probability: u32,
+        timeout_probability_percent: u32,
+        failure_probability_percent: u32,
         max_delay_ms: u64,
     ) -> Self {
         Self {
             inner: Arc::new(inner),
-            timeout_probability,
-            failure_probability_percentage: failure_probability,
+            timeout_probability_percent,
+            failure_probability_percent,
             max_delay_ms,
         }
     }
 
     async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
+        trace!("Calculating chaos");
         // Random delay
-        if self.max_delay_ms > 0 {
-            let delay = getrandom::u64().unwrap() % self.max_delay_ms;
-            tokio::time::sleep(Duration::from_millis(delay)).await;
-        }
+        let delay = getrandom::u64().unwrap() % self.max_delay_ms;
+        let delay = Duration::from_millis(delay);
+        trace!("Sleeping until chaos maybe happens {delay:?}");
+        tokio::time::sleep(delay).await;
 
         // Random failure
-        let failure_random = getrandom::u32().unwrap();
-        if (failure_random % 100) < self.failure_probability_percentage {
-            return Err(KvStoreError::Unknown);
+        let failure_random = getrandom::u32().unwrap() % 100;
+        if failure_random < self.failure_probability_percent {
+            warn!(
+                "Chaos causes an error : {failure_random} < {}",
+                self.failure_probability_percent
+            );
+            return Err(KvStoreError::Unknown(format!(
+                "Randomly failed thanks to chaos store with {}% chances, got {}",
+                self.failure_probability_percent, failure_random
+            )));
         }
 
         // Random timeout (simulated as a very long delay)
-        let failure_random = getrandom::u32().unwrap();
-        if failure_random % 100 < self.timeout_probability {
-            tokio::time::sleep(Duration::from_secs(10)).await;
+        let failure_random = getrandom::u32().unwrap() % 100;
+        if failure_random < self.timeout_probability_percent {
+            warn!(
+                "Chaos caused a timeout : {failure_random} < {}",
+                self.failure_probability_percent
+            );
+            tokio::time::sleep(Duration::from_secs(189754678456784560)).await;
         }
 
         Ok(())
@@ -57,14 +70,23 @@ impl<T: KvStore> ChaosKvStore<T> {
 
 #[async_trait]
 impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
-    async fn get(&self, key: String) -> Result<super::KvResult, KvStoreError> {
+    async fn get(&self, key: &str) -> Result<super::KvResult, KvStoreError> {
         self.maybe_chaos().await?;
         self.inner.get(key).await
     }
 
+    async fn get_revision(
+        &self,
+        key: &str,
+        expected_seq: u64,
+    ) -> Result<super::KvResult, KvStoreError> {
+        self.maybe_chaos().await?;
+        self.inner.get_revision(key, expected_seq).await
+    }
+
     async fn set_strict(
         &self,
-        key: String,
+        key: &str,
         value: Value,
         expected_sequence: u64,
     ) -> Result<u64, KvStoreError> {
@@ -74,7 +96,7 @@ impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
 
     async fn subscribe(
         &self,
-        key: String,
+        key: &str,
         callback: SubscriptionCallback,
     ) -> Result<(), KvStoreError> {
         self.maybe_chaos().await?;
@@ -94,13 +116,10 @@ mod tests {
         let chaos = ChaosKvStore::new(inner, 0, 0, 0);
 
         let value = json!({"test": "value"});
-        let result = chaos
-            .set_strict("key".to_string(), value.clone(), 0)
-            .await
-            .unwrap();
+        let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
         assert_eq!(result, 1);
 
-        let retrieved = chaos.get("key".to_string()).await.unwrap();
+        let retrieved = chaos.get("key").await.unwrap();
         assert_eq!(retrieved.value, Some(value));
     }
 
@@ -111,7 +130,7 @@ mod tests {
 
         let start = tokio::time::Instant::now();
         let value = json!({"test": "value"});
-        chaos.set_strict("key".to_string(), value, 0).await.unwrap();
+        chaos.set_strict("key", value, 0).await.unwrap();
         let elapsed = start.elapsed();
 
         // Should have some delay
diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs
index 3549c563..150b7225 100644
--- a/harmony_agent/src/store/memory.rs
+++ b/harmony_agent/src/store/memory.rs
@@ -1,4 +1,5 @@
 use async_trait::async_trait;
+use log::{debug, trace};
 use serde_json::Value;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -10,24 +11,46 @@ use crate::store::SubscriptionCallback;
 use super::{KvMetadata, KvResult, KvStore, KvStoreError};
 
 /// An in-memory KV store that guarantees ordering like NATS JetStream
-/// Each key has a sequence number that increments on each write
+/// Each key maintains a full history of all writes, where the sequence number
+/// is the length of the history (1-indexed)
 #[derive(Clone)]
 pub struct InMemoryKvStore {
-    data: Arc<RwLock<HashMap<String, (Value, u64)>>>,
-    global_seq: Arc<RwLock<u64>>,
+    data: Arc<RwLock<HashMap<String, Vec<(Value, u64)>>>>,
 }
 
 impl InMemoryKvStore {
     pub fn new() -> Self {
         Self {
             data: Arc::new(RwLock::new(HashMap::new())),
-            global_seq: Arc::new(RwLock::new(0)),
         }
     }
 
-    /// Get the sequence number for a key
+    /// Get the latest sequence number for a key (length of history)
     pub async fn get_seq(&self, key: &str) -> Option<u64> {
-        self.data.read().await.get(key).map(|(_, seq)| *seq)
+        self.data.read().await.get(key).map(|vec| vec.len() as u64)
+    }
+
+    /// Get the value at a specific revision for a key
+    pub async fn get_revision(&self, key: &str, seq: u64) -> Result<KvResult, KvStoreError> {
+        let data = self.data.read().await;
+        let entries = data
+            .get(key)
+            .ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
+
+        // Sequence numbers are 1-indexed, so seq must be >= 1 and <= len()
+        if seq == 0 || seq > entries.len() as u64 {
+            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
+        }
+
+        let (value, timestamp) = entries[seq as usize - 1].clone();
+
+        Ok(KvResult {
+            value: Some(value.clone()),
+            metadata: KvMetadata {
+                timestamp,
+                sequence: seq,
+            },
+        })
     }
 }
 
@@ -39,62 +62,69 @@ impl Default for InMemoryKvStore {
 
 #[async_trait]
 impl KvStore for InMemoryKvStore {
-    async fn get(&self, key: String) -> Result<KvResult, KvStoreError> {
-        let data = self.data.read().await;
-        let (value, sequence) = data
-            .get(&key)
-            .ok_or_else(|| KvStoreError::KeyNotAvailable(key.clone()))?;
+    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
+        self.get_revision(key, expected_seq).await
+    }
 
-        let timestamp = SystemTime::now()
-            .duration_since(UNIX_EPOCH)
-            .expect("Time went backwards")
-            .as_millis() as u64;
+    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
+        let data = self.data.read().await;
+        let entries = data
+            .get(key)
+            .ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
+
+        let (value, timestamp) = entries.last().unwrap();
 
         Ok(KvResult {
             value: Some(value.clone()),
             metadata: KvMetadata {
-                timestamp,
-                sequence: *sequence,
+                timestamp: *timestamp,
+                sequence: entries.len() as u64,
             },
         })
     }
 
     async fn set_strict(
         &self,
-        key: String,
+        key: &str,
         value: Value,
         expected_sequence: u64,
     ) -> Result<u64, KvStoreError> {
-        // Check current sequence
+        // Check current sequence (length of history for this key)
         let data = self.data.read().await;
-        let current_sequence = data.get(&key).map(|(_, seq)| *seq).unwrap_or(0);
+        let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
         drop(data);
 
         // Verify expected sequence matches
         if current_sequence != expected_sequence {
-            return Err(KvStoreError::SequenceMismatch {
-                expected: expected_sequence,
-                current: current_sequence,
-            });
+            trace!("{current_sequence} != {expected_sequence}");
+            return Err(KvStoreError::WrongLastRevision);
         }
 
-        // Increment global sequence
-        let mut seq = self.global_seq.write().await;
-        *seq += 1;
-        let new_seq = *seq;
-        drop(seq);
+        // Get current timestamp
+        let timestamp = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("Time went backwards")
+            .as_millis() as u64;
 
-        // Write the new value
+        // Append to the history
         let mut data = self.data.write().await;
-        data.insert(key, (value.clone(), new_seq));
-        drop(data);
+        data.entry(key.to_string())
+            .or_insert_with(Vec::new)
+            .push((value.clone(), timestamp));
+
+        let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
+
+        debug!(
+            "Successfully inserted {key}(rev#{new_seq}) : {value}",
+            value = value.to_string()
+        );
 
         Ok(new_seq)
     }
 
     async fn subscribe(
         &self,
-        key: String,
+        key: &str,
         callback: SubscriptionCallback,
     ) -> Result<(), KvStoreError> {
         // For now, subscribe just returns the current value
@@ -116,13 +146,13 @@ mod tests {
         // Set a value
         let value = json!({"status": "healthy"});
         let result = store
-            .set_strict("test_key".to_string(), value.clone(), 0)
+            .set_strict("test_key", value.clone(), 0)
             .await
             .unwrap();
         assert_eq!(result, 1);
 
         // Get the value
-        let retrieved = store.get("test_key".to_string()).await.unwrap();
+        let retrieved = store.get("test_key").await.unwrap();
         assert_eq!(retrieved.value, Some(value));
         assert_eq!(retrieved.metadata.sequence, 1);
     }
@@ -131,15 +161,9 @@ mod tests {
     async fn test_memory_store_sequence_numbers() {
         let store = InMemoryKvStore::new();
 
-        let seq1 = store
-            .set_strict("key1".to_string(), json!("value1"), 0)
-            .await
-            .unwrap();
+        let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap();
 
-        let seq2 = store
-            .set_strict("key2".to_string(), json!("value2"), 0)
-            .await
-            .unwrap();
+        let seq2 = store.set_strict("key2", json!("value2"), 0).await.unwrap();
 
         assert!(seq2 > seq1, "Sequence numbers should increment");
     }
@@ -147,7 +171,7 @@ mod tests {
     #[tokio::test]
     async fn test_memory_store_key_not_found() {
         let store = InMemoryKvStore::new();
-        let result = store.get("nonexistent".to_string()).await;
+        let result = store.get("nonexistent").await;
         assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_))));
     }
 
@@ -156,29 +180,15 @@ mod tests {
         let store = InMemoryKvStore::new();
 
         // First write with sequence 0
-        let result1 = store
-            .set_strict("key".to_string(), json!("value1"), 0)
-            .await
-            .unwrap();
+        let result1 = store.set_strict("key", json!("value1"), 0).await.unwrap();
         assert_eq!(result1, 1);
 
         // Second write with correct sequence
-        let result2 = store
-            .set_strict("key".to_string(), json!("value2"), 1)
-            .await
-            .unwrap();
+        let result2 = store.set_strict("key", json!("value2"), 1).await.unwrap();
         assert_eq!(result2, 2);
 
         // Third write with wrong sequence should fail
-        let result3 = store
-            .set_strict("key".to_string(), json!("value3"), 1)
-            .await;
-        assert!(matches!(
-            result3,
-            Err(KvStoreError::SequenceMismatch {
-                expected: 1,
-                current: 2
-            })
-        ));
+        let result3 = store.set_strict("key", json!("value3"), 1).await;
+        assert!(matches!(result3, Err(KvStoreError::WrongLastRevision)));
     }
 }
diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs
index 26e630c5..08879d1a 100644
--- a/harmony_agent/src/store/mod.rs
+++ b/harmony_agent/src/store/mod.rs
@@ -1,4 +1,5 @@
 use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use thiserror::Error;
 
@@ -11,7 +12,7 @@ pub struct SubscriptionHandle {
 
 /// Metadata returned by the KV store for all operations
 /// Contains timing and ordering information set by the store
-#[derive(Debug, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct KvMetadata {
     /// Timestamp set by the store (milliseconds since UNIX epoch)
     pub timestamp: u64,
@@ -48,70 +49,72 @@ pub enum KvStoreError {
         deserialization_error: String,
         value: String,
     },
-    #[error("Strict ordering violation: expected sequence {expected}, but current is {current}")]
-    SequenceMismatch { expected: u64, current: u64 },
-    #[error("unknown data store error")]
-    Unknown,
+    #[error("Strict ordering violation, wrong last sequence number")]
+    WrongLastRevision,
+    #[error("unknown data store error {0}")]
+    Unknown(String),
 }
 
 #[async_trait]
 pub trait KvStore {
     /// Get a value from the store
-    /// 
+    ///
     /// # Returns
     /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
     /// - `Err(KeyNotAvailable)`: If the key doesn't exist
-    async fn get(&self, key: String) -> Result<KvResult, KvStoreError>;
-    
+    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;
+
+    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;
+
     /// Strict set operation with compare-and-set semantics
-    /// 
+    ///
     /// Sets the value only if the current sequence number matches `expected_sequence`.
     /// This provides strict ordering guarantees needed for the failover algorithm.
-    /// 
+    ///
     /// # Parameters
     /// - `key`: The key to set
     /// - `value`: The value to store
     /// - `expected_sequence`: The sequence number we expect the key to currently have.
     ///   Use 0 for the first write to a new key.
-    /// 
+    ///
     /// # Returns
     /// - `Ok(u64)`: Returns the new sequence number
     /// - `Err(KvStoreError)`: If another write happened (current != expected)
-    /// 
+    ///
     /// # Example Use Case
     /// For NATS JetStream, this maps to the conditional update operation that ensures
     /// only one agent can successfully promote to primary.
     async fn set_strict(
         &self,
-        key: String,
+        key: &str,
         value: Value,
         expected_sequence: u64,
     ) -> Result<u64, KvStoreError>;
-    
+
     /// Subscribe to updates for a key
-    /// 
+    ///
     /// # Parameters
     /// - `key`: The key to subscribe to
     /// - `callback`: Function to call on each update with key, value, and metadata
-    /// 
+    ///
     /// # Returns
     /// - `Ok(())`: Subscription established successfully
     /// - `Err(KvStoreError)`: Subscription failed
-    /// 
+    ///
     /// Note: For JetStream, this should use watch() API. Updates will invoke the callback
     /// asynchronously in the background.
     async fn subscribe(
         &self,
-        key: String,
+        key: &str,
         callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
                                         // callback
     ) -> Result<(), KvStoreError>;
 }
 
+mod chaos;
 mod memory;
 mod nats;
-mod chaos;
 
+pub use chaos::ChaosKvStore;
 pub use memory::InMemoryKvStore;
 pub use nats::NatsKvStore;
-pub use chaos::ChaosKvStore;
diff --git a/harmony_agent/src/store/nats.rs b/harmony_agent/src/store/nats.rs
index 1c82c1d8..c89bc54c 100644
--- a/harmony_agent/src/store/nats.rs
+++ b/harmony_agent/src/store/nats.rs
@@ -1,6 +1,6 @@
 use async_nats::jetstream::kv::{Store, UpdateError};
 use async_trait::async_trait;
-use log::{debug, error};
+use log::{debug, error, trace};
 use serde_json::Value;
 
 use crate::store::SubscriptionCallback;
@@ -46,8 +46,48 @@ impl NatsKvStore {
 
 #[async_trait]
 impl KvStore for NatsKvStore {
-    async fn get(&self, key: String) -> Result<KvResult, KvStoreError> {
-        let entry = self.store.entry(&key).await.map_err(|e| {
+    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
+        let entry = self
+            .store
+            .entry_for_revision(key, expected_seq)
+            .await
+            .map_err(|e| {
+                error!("NATS get failed for key '{}': {}", key, e);
+                KvStoreError::Disconnect(std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    e.to_string(),
+                ))
+            })?;
+
+        if entry.is_none() {
+            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
+        }
+
+        let entry = entry.unwrap();
+        let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
+            KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: String::from_utf8_lossy(&entry.value).to_string(),
+            }
+        })?;
+
+        // Extract metadata from NATS entry
+        // Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
+        let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
+
+        let metadata = KvMetadata {
+            timestamp,
+            sequence: entry.revision,
+        };
+
+        Ok(KvResult {
+            value: Some(value),
+            metadata,
+        })
+    }
+
+    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
+        let entry = self.store.entry(key).await.map_err(|e| {
             error!("NATS get failed for key '{}': {}", key, e);
             KvStoreError::Disconnect(std::io::Error::new(
                 std::io::ErrorKind::Other,
@@ -56,7 +96,7 @@ impl KvStore for NatsKvStore {
         })?;
 
         if entry.is_none() {
-            return Err(KvStoreError::KeyNotAvailable(key));
+            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
         }
 
         let entry = entry.unwrap();
@@ -84,10 +124,14 @@ impl KvStore for NatsKvStore {
 
     async fn set_strict(
         &self,
-        key: String,
+        key: &str,
         value: Value,
         expected_sequence: u64,
     ) -> Result<u64, KvStoreError> {
+        trace!(
+            "Nats set strict {key} (#{expected_sequence}) : {}",
+            value.to_string()
+        );
         let bytes =
             serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
                 deserialization_error: e.to_string(),
@@ -112,7 +156,7 @@ impl KvStore for NatsKvStore {
 
     async fn subscribe(
         &self,
-        key: String,
+        key: &str,
         callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
     ) -> Result<(), KvStoreError> {
         todo!()
@@ -125,7 +169,7 @@ impl From<UpdateError> for KvStoreError {
             async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
             async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
             async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
-                KvStoreError::KeyNotAvailable("key".to_string())
+                KvStoreError::WrongLastRevision
             }
             async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
                 std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
index 074b29e2..05f9934c 100644
--- a/harmony_agent/src/workflow/mod.rs
+++ b/harmony_agent/src/workflow/mod.rs
@@ -8,28 +8,12 @@ pub mod replica;
 #[async_trait]
 pub trait HeartbeatWorkflow: Send + Sync {
     /// Handle a successful heartbeat
-    fn handle_heartbeat_success(&mut self);
+    async fn handle_heartbeat_success(&mut self);
 
     /// Handle a failed heartbeat
-    fn handle_heartbeat_failure(&mut self);
+    async fn handle_heartbeat_failure(&mut self);
 
-    /// Called after heartbeat is successfully stored with metadata
-    /// This provides workflows access to timestamp/sequence for staleness calculations
-    async fn on_heartbeat_stored(&mut self, _heartbeat: &crate::agent_loop::AgentHeartbeat) {
-        // Default implementation does nothing
-    }
-
-    /// Called during agent startup to reconcile state from cluster state
-    /// Receives the current cluster state if available
-    async fn on_startup(&mut self, _cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
-        // Default implementation does nothing
-    }
-
-    /// Called when a peer agent heartbeat is observed (via subscription)
-    /// This is primarily used by replicas to detect primary staleness
-    async fn on_peer_heartbeat(&mut self, _peer_id: &Id, _heartbeat: &crate::agent_loop::AgentHeartbeat) {
-        // Default implementation does nothing
-    }
+    async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>);
 
     /// Get the current state name for logging (also used for heartbeat status)
     fn state_name(&self) -> &'static str;
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index 7eccc998..73c12828 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -76,7 +76,8 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
             debug!("No cluster state on startup, starting from Initializing");
         }
     }
-    fn handle_heartbeat_success(&mut self) {
+    async fn handle_heartbeat_success(&mut self) {
+        trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures);
         self.consecutive_successes += 1;
         self.consecutive_failures = 0;
 
@@ -119,7 +120,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
         }
     }
 
-    fn handle_heartbeat_failure(&mut self) {
+    async fn handle_heartbeat_failure(&mut self) {
         self.consecutive_failures += 1;
         self.consecutive_successes = 0;
 
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
index 9800e3c7..b790a6b9 100644
--- a/harmony_agent/src/workflow/replica.rs
+++ b/harmony_agent/src/workflow/replica.rs
@@ -1,6 +1,6 @@
 use async_trait::async_trait;
 use harmony_types::id::Id;
-use log::{debug, info, trace};
+use log::{debug, info, trace, warn};
 use std::time::Duration;
 use tokio::sync::RwLock;
 
@@ -115,8 +115,7 @@ impl ReplicaWorkflow {
 
     /// Check if the primary heartbeat is stale compared to our own
     /// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
-    async fn check_primary_staleness(&mut self) {
-        let mut new_state = self.state.clone();
+    async fn is_primary_stale(&mut self) -> bool {
         if let Some(my_hb) = &self.last_my_heartbeat {
             if let Some(my_metadata) = &my_hb.metadata {
                 if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
@@ -141,65 +140,25 @@ impl ReplicaWorkflow {
                                 "Primary heartbeat stale ({}ms > {}ms), attempting promotion",
                                 time_diff_ms, failover_timeout_ms
                             );
-                            new_state = ReplicaState::Promoting;
+
+                            return true;
                         }
                     }
                 }
             }
-
-            if self.state != new_state {
-                self.transition_to(new_state)
-            }
         }
+        false
     }
 }
 
 #[async_trait]
 impl HeartbeatWorkflow for ReplicaWorkflow {
-    async fn on_peer_heartbeat(&mut self, peer_id: &Id, heartbeat: &AgentHeartbeat) {
-        // Only track the primary's heartbeat
-        if *peer_id == self.primary_state.agent_id {
-            match &self.last_primary_heartbeat {
-                Some(existing) => {
-                    // Update the existing heartbeat data
-                    *existing.write().await = heartbeat.clone();
-                }
-                None => {
-                    // First time seeing primary heartbeat
-                    self.last_primary_heartbeat = Some(RwLock::new(heartbeat.clone()));
-                }
-            }
-            trace!(
-                "Updated primary heartbeat: seq={}, timestamp={}",
-                heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0),
-                heartbeat
-                    .metadata
-                    .as_ref()
-                    .map(|m| m.timestamp)
-                    .unwrap_or(0),
-            );
-        }
-    }
-    async fn on_heartbeat_stored(&mut self, heartbeat: &AgentHeartbeat) {
-        // Track our own heartbeat for staleness comparison
-        self.last_my_heartbeat = Some(heartbeat.clone());
-
-        // Perform staleness detection if we have both heartbeats
-        self.check_primary_staleness().await;
-    }
     async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
-        if let Some(state) = cluster_state {
-            info!(
-                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
-                state.current_primary, state.desired_primary
-            );
-            // Update cluster_state with the observed values
-            self.cluster_state.current_primary = state.current_primary.clone();
-        } else {
-            debug!("No cluster state on startup, starting from Initializing");
-        }
+        todo!("not sure if the replica should do anything on startup")
     }
-    fn handle_heartbeat_success(&mut self) {
+
+    async fn handle_heartbeat_success(&mut self) {
+        trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures);
         self.consecutive_successes += 1;
         self.consecutive_failures = 0;
 
@@ -212,10 +171,23 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
             ReplicaState::Watching => {
                 // TODO: Check primary staleness from NATS
                 trace!("Replica watching primary");
+                if self.is_primary_stale().await {
+                    warn!("Found stale primary, launching promotion");
+                }
+                todo!("perform the replica watch actions : 
+                - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
+                    - check the last primary heartbeat kv timestamp
+                    - compare it with our latest kv heartbeat
+                    - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
+                    - launching promotion will change the status of the replica
+                    ");
             }
             ReplicaState::Promoting => {
                 // TODO: Complete promotion attempt
                 trace!("Replica promotion in progress");
+                todo!(
+                    "When promoting, a heartbeat failure does not affect promotion unless failure_threshold is reached, a heartbeat success does nothing either"
+                );
             }
             ReplicaState::PromotionFailed => {
                 if self.consecutive_successes >= self.success_threshold {
@@ -239,10 +211,17 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
         }
     }
 
-    fn handle_heartbeat_failure(&mut self) {
+    async fn handle_heartbeat_failure(&mut self) {
         self.consecutive_failures += 1;
         self.consecutive_successes = 0;
 
+        // TODO revisit this. I think we should handle the agent healthiness (checking
+        // consecutive_failures against failure_threshold) separately from handling the cluster
+        // state.
+        //
+        // That said, there might be funny stuff we have to do when the agent reaches the failure
+        // threshold, especially in promoting and demoting statuses.
+
         match self.state {
             ReplicaState::Watching | ReplicaState::Initializing => {
                 if self.consecutive_failures >= self.failure_threshold {
-- 
2.39.5


From a20919bbda76e664a0b5a7245b91a066e10b5f83 Mon Sep 17 00:00:00 2001
From: wjro <wrolleman@nationtech.io>
Date: Tue, 3 Feb 2026 11:43:22 -0500
Subject: [PATCH 10/19] wip: write cluster state to jetstream kv

---
 harmony_agent/src/agent_loop.rs | 52 +++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs
index 089b013d..7147edd7 100644
--- a/harmony_agent/src/agent_loop.rs
+++ b/harmony_agent/src/agent_loop.rs
@@ -1,3 +1,4 @@
+use std::time::{SystemTime, UNIX_EPOCH};
 use std::{str::FromStr, sync::Arc, time::Duration};
 
 use harmony_types::id::Id;
@@ -253,6 +254,57 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         Ok(())
     }
 
+    async fn store_cluster_state(
+        &self,
+        cluster_state_data: Option<ClusterStateData>,
+    ) -> Result<ClusterStateData, KvStoreError> {
+        let key = format!("cluster.{}", self.config.cluster_id);
+        match cluster_state_data {
+            Some(state) => {
+                let value = serde_json::to_value(&state).map_err(|e| {
+                    KvStoreError::DeserializationFailed {
+                        deserialization_error: e.to_string(),
+                        value: format!("{:?}", state),
+                    }
+                })?;
+
+                let expected_sequence = {
+                    let last = self.last_heartbeat.read().await;
+                    last.as_ref()
+                        .and_then(|hb| hb.metadata.as_ref())
+                        .map(|m| m.sequence)
+                        .unwrap_or(0)
+                };
+
+                self.cluster_kv
+                    .set_strict(&key, value, expected_sequence)
+                    .await?;
+
+                Ok(state)
+            }
+            None => {
+                let cluster_data = ClusterStateData {
+                    cluster_id: self.config.cluster_id.clone(),
+                    current_primary: None,
+                    desired_primary: self.config.desired_primary_id.clone(),
+                    timestamp: SystemTime::now()
+                        .duration_since(UNIX_EPOCH)
+                        .expect("Time went backwards")
+                        .as_millis() as u64,
+                };
+
+                let value = serde_json::to_value(&cluster_data).map_err(|e| {
+                    KvStoreError::DeserializationFailed {
+                        deserialization_error: e.to_string(),
+                        value: format!("{:?}", cluster_data),
+                    }
+                })?;
+                self.cluster_kv.set_strict(&key, value, 0).await?;
+                Ok(cluster_data)
+            }
+        }
+    }
+
     /// Sends agent heartbeat to the KV store
     ///
     /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
-- 
2.39.5


From 7065e90475934cd8d70c7f3d2818b36f14ed4d0b Mon Sep 17 00:00:00 2001
From: Sylvain Tremblay <stremblay@nationtech.io>
Date: Tue, 3 Feb 2026 11:45:03 -0500
Subject: [PATCH 11/19] feat: use the role of the agent to define its name

---
 harmony_agent/src/agent_loop.rs | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs
index 089b013d..f6e1b4b6 100644
--- a/harmony_agent/src/agent_loop.rs
+++ b/harmony_agent/src/agent_loop.rs
@@ -1,3 +1,4 @@
+use std::fmt;
 use std::{str::FromStr, sync::Arc, time::Duration};
 
 use harmony_types::id::Id;
@@ -18,6 +19,15 @@ pub enum AgentRole {
     Replica,
 }
 
+impl fmt::Display for AgentRole {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            AgentRole::Primary => write!(f, "primary"),
+            AgentRole::Replica => write!(f, "replica"),
+        }
+    }
+}
+
 pub async fn launch_agent<S>(
     role: AgentRole,
     health_kv: Arc<S>,
@@ -28,7 +38,8 @@ pub async fn launch_agent<S>(
 where
     S: KvStore + Send + Sync + 'static,
 {
-    let my_agent_id = Id::from_str("agent_1").unwrap();
+    let my_agent_name = format!("agent-{}", role);
+    let my_agent_id = Id::from_str(&my_agent_name).unwrap();
 
     let config = AgentConfig {
         role,
@@ -219,6 +230,10 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             cluster_key
         );
 
+        /*
+        trace!("{:#?}", self.cluster_kv.get(&cluster_key).await);
+        */
+
         let cluster_state_option = match self.cluster_kv.get(&cluster_key).await {
             Ok(result) => {
                 if let Some(value) = result.value {
-- 
2.39.5


From 5b04cc96d7b8edd537df5d60f6ab2b2161952a9a Mon Sep 17 00:00:00 2001
From: Sylvain Tremblay <stremblay@nationtech.io>
Date: Tue, 3 Feb 2026 14:50:03 -0500
Subject: [PATCH 12/19] wip: we want to initialize to the right seq number
 after a restart

---
 Cargo.lock                            | 311 +++++++++++++++++++++++---
 harmony_agent/README.md               |   2 +
 harmony_agent/src/agent_loop.rs       |  27 ++-
 harmony_agent/src/workflow/mod.rs     |  11 +-
 harmony_agent/src/workflow/primary.rs |  18 +-
 harmony_agent/src/workflow/replica.rs |  21 +-
 6 files changed, 337 insertions(+), 53 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index aabfb9d2..14295673 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
 dependencies = [
  "cfg-if",
  "const-random",
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "once_cell",
  "version_check",
  "zerocopy",
@@ -450,6 +450,43 @@ dependencies = [
  "pin-project-lite",
 ]
 
+[[package]]
+name = "async-nats"
+version = "0.45.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-util",
+ "memchr",
+ "nkeys",
+ "nuid",
+ "once_cell",
+ "pin-project",
+ "portable-atomic",
+ "rand 0.8.5",
+ "regex",
+ "ring",
+ "rustls-native-certs 0.7.3",
+ "rustls-pemfile 2.2.0",
+ "rustls-webpki 0.102.8",
+ "serde",
+ "serde_json",
+ "serde_nanos",
+ "serde_repr",
+ "thiserror 1.0.69",
+ "time",
+ "tokio",
+ "tokio-rustls 0.26.2",
+ "tokio-stream",
+ "tokio-util",
+ "tokio-websockets",
+ "tracing",
+ "tryhard",
+ "url",
+]
+
 [[package]]
 name = "async-stream"
 version = "0.3.6"
@@ -774,6 +811,9 @@ name = "bytes"
 version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+dependencies = [
+ "serde",
+]
 
 [[package]]
 name = "bytestring"
@@ -875,6 +915,22 @@ dependencies = [
  "shlex",
 ]
 
+[[package]]
+name = "cert_manager"
+version = "0.1.0"
+dependencies = [
+ "assert_cmd",
+ "cidr",
+ "env_logger",
+ "harmony",
+ "harmony_cli",
+ "harmony_macros",
+ "harmony_types",
+ "log",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.3"
@@ -1550,6 +1606,7 @@ dependencies = [
  "rand_core 0.6.4",
  "serde",
  "sha2",
+ "signature",
  "subtle",
  "zeroize",
 ]
@@ -1754,6 +1811,24 @@ dependencies = [
  "url",
 ]
 
+[[package]]
+name = "example-ha-cluster"
+version = "0.1.0"
+dependencies = [
+ "brocade",
+ "cidr",
+ "env_logger",
+ "harmony",
+ "harmony_macros",
+ "harmony_secret",
+ "harmony_tui",
+ "harmony_types",
+ "log",
+ "serde",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "example-kube-rs"
 version = "0.1.0"
@@ -1942,9 +2017,28 @@ dependencies = [
  "cidr",
  "env_logger",
  "harmony",
+ "harmony_cli",
  "harmony_macros",
  "harmony_secret",
- "harmony_tui",
+ "harmony_types",
+ "log",
+ "serde",
+ "tokio",
+ "url",
+]
+
+[[package]]
+name = "example-opnsense-node-exporter"
+version = "0.1.0"
+dependencies = [
+ "async-trait",
+ "cidr",
+ "env_logger",
+ "harmony",
+ "harmony_cli",
+ "harmony_macros",
+ "harmony_secret",
+ "harmony_secret_derive",
  "harmony_types",
  "log",
  "serde",
@@ -1982,25 +2076,6 @@ dependencies = [
  "url",
 ]
 
-[[package]]
-name = "example-opnsense-node-exporter"
-version = "0.1.0"
-dependencies = [
- "async-trait",
- "cidr",
- "env_logger",
- "harmony",
- "harmony_cli",
- "harmony_macros",
- "harmony_secret",
- "harmony_secret_derive",
- "harmony_types",
- "log",
- "serde",
- "tokio",
- "url",
-]
-
 [[package]]
 name = "example-pxe"
 version = "0.1.0"
@@ -2406,21 +2481,21 @@ dependencies = [
  "cfg-if",
  "js-sys",
  "libc",
- "wasi 0.11.1+wasi-snapshot-preview1",
+ "wasi",
  "wasm-bindgen",
 ]
 
 [[package]]
 name = "getrandom"
-version = "0.3.3"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
 dependencies = [
  "cfg-if",
  "js-sys",
  "libc",
  "r-efi",
- "wasi 0.14.3+wasi-0.2.4",
+ "wasip2",
  "wasm-bindgen",
 ]
 
@@ -2522,6 +2597,7 @@ dependencies = [
  "env_logger",
  "fqdn",
  "futures-util",
+ "harmony_execution",
  "harmony_inventory_agent",
  "harmony_macros",
  "harmony_secret",
@@ -2568,6 +2644,42 @@ dependencies = [
  "walkdir",
 ]
 
+[[package]]
+name = "harmony_agent"
+version = "0.1.0"
+dependencies = [
+ "async-nats",
+ "async-trait",
+ "cidr",
+ "env_logger",
+ "getrandom 0.3.4",
+ "harmony",
+ "harmony_macros",
+ "harmony_types",
+ "log",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.16",
+ "tokio",
+]
+
+[[package]]
+name = "harmony_agent_deploy"
+version = "0.1.0"
+dependencies = [
+ "cidr",
+ "env_logger",
+ "harmony",
+ "harmony_cli",
+ "harmony_macros",
+ "harmony_types",
+ "log",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "harmony_cli"
 version = "0.1.0"
@@ -2608,6 +2720,16 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "harmony_execution"
+version = "0.1.0"
+dependencies = [
+ "directories",
+ "lazy_static",
+ "log",
+ "thiserror 2.0.16",
+]
+
 [[package]]
 name = "harmony_inventory_agent"
 version = "0.1.0"
@@ -3438,7 +3560,7 @@ version = "0.1.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
 dependencies = [
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "libc",
 ]
 
@@ -3464,6 +3586,26 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "json-prompt"
+version = "0.1.0"
+dependencies = [
+ "brocade",
+ "cidr",
+ "env_logger",
+ "harmony",
+ "harmony_cli",
+ "harmony_macros",
+ "harmony_secret",
+ "harmony_secret_derive",
+ "harmony_types",
+ "log",
+ "schemars 0.8.22",
+ "serde",
+ "tokio",
+ "url",
+]
+
 [[package]]
 name = "jsonpath-rust"
 version = "0.7.5"
@@ -3878,7 +4020,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
  "libc",
  "log",
- "wasi 0.11.1+wasi-snapshot-preview1",
+ "wasi",
  "windows-sys 0.48.0",
 ]
 
@@ -3890,7 +4032,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
 dependencies = [
  "libc",
  "log",
- "wasi 0.11.1+wasi-snapshot-preview1",
+ "wasi",
  "windows-sys 0.59.0",
 ]
 
@@ -3928,6 +4070,21 @@ dependencies = [
  "unicode-segmentation",
 ]
 
+[[package]]
+name = "nkeys"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
+dependencies = [
+ "data-encoding",
+ "ed25519",
+ "ed25519-dalek",
+ "getrandom 0.2.16",
+ "log",
+ "rand 0.8.5",
+ "signatory",
+]
+
 [[package]]
 name = "non-blank-string-rs"
 version = "1.0.4"
@@ -3946,6 +4103,15 @@ dependencies = [
  "winapi 0.3.9",
 ]
 
+[[package]]
+name = "nuid"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
+dependencies = [
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.6"
@@ -4566,7 +4732,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
 dependencies = [
  "bytes",
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "lru-slab",
  "rand 0.9.2",
  "ring",
@@ -4671,7 +4837,7 @@ version = "0.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
 dependencies = [
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
 ]
 
 [[package]]
@@ -5207,6 +5373,16 @@ dependencies = [
  "untrusted",
 ]
 
+[[package]]
+name = "rustls-webpki"
+version = "0.102.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
+dependencies = [
+ "rustls-pki-types",
+ "untrusted",
+]
+
 [[package]]
 name = "rustls-webpki"
 version = "0.103.4"
@@ -5470,6 +5646,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "serde_nanos"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_path_to_error"
 version = "0.1.17"
@@ -5637,6 +5822,18 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "signatory"
+version = "0.27.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
+dependencies = [
+ "pkcs8",
+ "rand_core 0.6.4",
+ "signature",
+ "zeroize",
+]
+
 [[package]]
 name = "signature"
 version = "2.2.0"
@@ -6200,7 +6397,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
 dependencies = [
  "fastrand",
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "once_cell",
  "rustix 1.0.8",
  "windows-sys 0.60.2",
@@ -6413,6 +6610,27 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-websockets"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "http 1.3.1",
+ "httparse",
+ "rand 0.8.5",
+ "ring",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls 0.26.2",
+ "tokio-util",
+ "webpki-roots 0.26.11",
+]
+
 [[package]]
 name = "toml"
 version = "0.8.23"
@@ -6564,6 +6782,16 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
+[[package]]
+name = "tryhard"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
+dependencies = [
+ "pin-project-lite",
+ "tokio",
+]
+
 [[package]]
 name = "tui-logger"
 version = "0.14.5"
@@ -6740,7 +6968,7 @@ version = "1.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
 dependencies = [
- "getrandom 0.3.3",
+ "getrandom 0.3.4",
  "js-sys",
  "rand 0.9.2",
  "uuid-macro-internal",
@@ -6811,10 +7039,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
 
 [[package]]
-name = "wasi"
-version = "0.14.3+wasi-0.2.4"
+name = "wasip2"
+version = "1.0.2+wasi-0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95"
+checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
 dependencies = [
  "wit-bindgen",
 ]
@@ -6936,6 +7164,15 @@ version = "0.25.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
 
+[[package]]
+name = "webpki-roots"
+version = "0.26.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
+dependencies = [
+ "webpki-roots 1.0.2",
+]
+
 [[package]]
 name = "webpki-roots"
 version = "1.0.2"
@@ -7313,9 +7550,9 @@ dependencies = [
 
 [[package]]
 name = "wit-bindgen"
-version = "0.45.0"
+version = "0.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
 
 [[package]]
 name = "writeable"
diff --git a/harmony_agent/README.md b/harmony_agent/README.md
index c22d1b51..189e8145 100644
--- a/harmony_agent/README.md
+++ b/harmony_agent/README.md
@@ -244,3 +244,5 @@ Please add these to your master list before starting implementation:
 31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
 32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
 
+*   **Think about vacuum / stop-the-world operations**
+
diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs
index 68cb5844..f63f35a0 100644
--- a/harmony_agent/src/agent_loop.rs
+++ b/harmony_agent/src/agent_loop.rs
@@ -6,7 +6,7 @@ use harmony_types::id::Id;
 use log::{debug, info, trace};
 use serde::{Deserialize, Serialize};
 use tokio::sync::RwLock;
-use tokio::time::Instant;
+use tokio::time::{Instant, sleep};
 
 use crate::store::{KvMetadata, KvStore, KvStoreError};
 use crate::workflow::HeartbeatWorkflow;
@@ -39,6 +39,12 @@ pub async fn launch_agent<S>(
 where
     S: KvStore + Send + Sync + 'static,
 {
+    match role {
+        AgentRole::Primary => {}
+        AgentRole::Replica => {
+            sleep(Duration::from_millis(100)).await;
+        }
+    }
     let my_agent_name = format!("agent-{}", role);
     let my_agent_id = Id::from_str(&my_agent_name).unwrap();
 
@@ -238,21 +244,27 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         let cluster_state_option = match self.cluster_kv.get(&cluster_key).await {
             Ok(result) => {
                 if let Some(value) = result.value {
-                    match serde_json::from_value::<ClusterStateData>(value) {
+                    match serde_json::from_value::<ClusterStateData>(value.clone()) {
                         Ok(data) => Some(data),
                         Err(e) => {
                             log::warn!("Failed to deserialize cluster state: {}", e);
-                            None
+                            return Err(KvStoreError::DeserializationFailed {
+                                deserialization_error: format!(
+                                    "Cluster key exist but is empty {cluster_key} : {e}"
+                                ),
+                                value: value.to_string(),
+                            });
                         }
                     }
                 } else {
-                    debug!("No cluster state found, this is a fresh cluster");
-                    None
+                    return Err(KvStoreError::Unknown(format!(
+                        "Cluster key exist but is empty {cluster_key}"
+                    )));
                 }
             }
             Err(KvStoreError::KeyNotAvailable(_)) => {
                 debug!("Cluster state key not found, this is a fresh cluster");
-                None
+                Some(self.store_cluster_state(None).await?)
             }
             Err(e) => {
                 log::warn!("Failed to fetch cluster state during startup: {}", e);
@@ -261,7 +273,8 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         };
 
         let state_ref = cluster_state_option.as_ref();
-        self.workflow.on_startup(state_ref).await;
+
+        self.workflow.on_startup(state_ref, self.health_kv.as_ref(), &self.config).await;
 
         // Cache the cluster state locally
         *self.cluster_state.write().await = cluster_state_option;
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
index 05f9934c..561ce434 100644
--- a/harmony_agent/src/workflow/mod.rs
+++ b/harmony_agent/src/workflow/mod.rs
@@ -1,6 +1,10 @@
+use std::sync::Arc;
+
 use async_trait::async_trait;
 use harmony_types::id::Id;
 
+use crate::{agent_loop::AgentConfig, store::KvStore};
+
 pub mod primary;
 pub mod replica;
 
@@ -13,7 +17,12 @@ pub trait HeartbeatWorkflow: Send + Sync {
     /// Handle a failed heartbeat
     async fn handle_heartbeat_failure(&mut self);
 
-    async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>);
+    async fn on_startup(
+        &mut self,
+        cluster_state: Option<&crate::agent_loop::ClusterStateData>,
+        health_kv: &dyn KvStore,
+        agent_config: &AgentConfig,
+    );
 
     /// Get the current state name for logging (also used for heartbeat status)
     fn state_name(&self) -> &'static str;
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index 73c12828..ffe3ffae 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -1,7 +1,7 @@
 use async_trait::async_trait;
 use log::{debug, info, trace, warn};
 
-use crate::{agent_loop::DeploymentConfig, workflow::HeartbeatWorkflow};
+use crate::{agent_loop::{AgentConfig, DeploymentConfig}, store::KvStore, workflow::HeartbeatWorkflow};
 
 #[derive(Debug, Clone, PartialEq)]
 pub enum PrimaryState {
@@ -63,12 +63,21 @@ impl PrimaryWorkflow {
 
 #[async_trait]
 impl HeartbeatWorkflow for PrimaryWorkflow {
-    async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
+    async fn on_startup(
+        &mut self,
+        cluster_state: Option<&crate::agent_loop::ClusterStateData>,
+        health_kv: &dyn KvStore,
+        agent_config: &AgentConfig,
+    ) {
         if let Some(state) = cluster_state {
             info!(
                 "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
                 state.current_primary, state.desired_primary
             );
+
+            let key = format!("heartbeat.{}", agent_config.agent_id.clone());
+            // let hb = health_kv.get(&key);
+
             // No automatic fast-tracking - agent must earn healthy status
             // through successful heartbeats. This prevents duplicate agents
             // or crashloop agents from incorrectly claiming primary.
@@ -77,7 +86,10 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
         }
     }
     async fn handle_heartbeat_success(&mut self) {
-        trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures);
+        trace!(
+            "Handling heartbeat success, current counters success {} failures {}",
+            self.consecutive_successes, self.consecutive_failures
+        );
         self.consecutive_successes += 1;
         self.consecutive_failures = 0;
 
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
index b790a6b9..eda4bbdc 100644
--- a/harmony_agent/src/workflow/replica.rs
+++ b/harmony_agent/src/workflow/replica.rs
@@ -4,7 +4,8 @@ use log::{debug, info, trace, warn};
 use std::time::Duration;
 use tokio::sync::RwLock;
 
-use crate::agent_loop::AgentHeartbeat;
+use crate::agent_loop::{AgentConfig, AgentHeartbeat};
+use crate::store::KvStore;
 use crate::workflow::HeartbeatWorkflow;
 
 #[derive(Debug, Clone)]
@@ -153,12 +154,20 @@ impl ReplicaWorkflow {
 
 #[async_trait]
 impl HeartbeatWorkflow for ReplicaWorkflow {
-    async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) {
-        todo!("not sure if the replica should do anything on startup")
+    async fn on_startup(
+        &mut self,
+        cluster_state: Option<&crate::agent_loop::ClusterStateData>,
+        health_kv: &dyn KvStore,
+        agent_config: &AgentConfig,
+    ) {
+        // todo!("not sure if the replica should do anything on startup")
     }
 
     async fn handle_heartbeat_success(&mut self) {
-        trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures);
+        trace!(
+            "Handling heartbeat success, current counters success {} failures {}",
+            self.consecutive_successes, self.consecutive_failures
+        );
         self.consecutive_successes += 1;
         self.consecutive_failures = 0;
 
@@ -174,13 +183,15 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
                 if self.is_primary_stale().await {
                     warn!("Found stale primary, launching promotion");
                 }
-                todo!("perform the replica watch actions : 
+                /*
+                todo!("perform the replica watch actions :
                 - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
                     - check the last primary heartbeat kv timestamp
                     - compare it with our latest kv heartbeat
                     - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
                     - launching promotion will change the status of the replica
                     ");
+                */
             }
             ReplicaState::Promoting => {
                 // TODO: Complete promotion attempt
-- 
2.39.5


From a88d67627aa9e001a5b0348339bf77b0a91218ed Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Tue, 3 Feb 2026 20:46:18 -0500
Subject: [PATCH 13/19] chore: Add a note and delete old code

---
 harmony_agent/src/agent_loop.rs           |   3 +
 harmony_agent/src/old/typestate.rs        | 230 ----------
 harmony_agent/src/old/typestate_gemini.rs | 523 ----------------------
 3 files changed, 3 insertions(+), 753 deletions(-)
 delete mode 100644 harmony_agent/src/old/typestate.rs
 delete mode 100644 harmony_agent/src/old/typestate_gemini.rs

diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs
index f63f35a0..7111db92 100644
--- a/harmony_agent/src/agent_loop.rs
+++ b/harmony_agent/src/agent_loop.rs
@@ -39,12 +39,15 @@ pub async fn launch_agent<S>(
 where
     S: KvStore + Send + Sync + 'static,
 {
+    // Cheap ass fix when we boot two agents at the same time and the store does not exist, delay
+    // one so they don't crash because of the race
     match role {
         AgentRole::Primary => {}
         AgentRole::Replica => {
             sleep(Duration::from_millis(100)).await;
         }
     }
+
     let my_agent_name = format!("agent-{}", role);
     let my_agent_id = Id::from_str(&my_agent_name).unwrap();
 
diff --git a/harmony_agent/src/old/typestate.rs b/harmony_agent/src/old/typestate.rs
deleted file mode 100644
index 78445d55..00000000
--- a/harmony_agent/src/old/typestate.rs
+++ /dev/null
@@ -1,230 +0,0 @@
-use std::{marker::PhantomData, time::Duration};
-
-/// Typestate pattern implementation for Primary and Replica state machines
-/// Based on Will Crichton's typestate pattern
-
-pub mod primary {
-  use super::Agent;
-
-  /// Primary state: Agent is initializing
-  pub struct Initializing {}
-
-  /// Primary state: Heartbeat failures exceeded threshold
-  pub struct Failed {}
-
-  /// Primary state: Database fenced/stopped
-  pub struct Fenced {}
-
-  /// Primary state: Heartbeat succeeding
-  pub struct Healthy {}
-
-  /// Primary state: Recovered from fence, waiting for demotion handshake
-  pub struct Yielding {}
-
-  impl Agent<Initializing> {
-    /// Transition from initializing to healthy
-    pub fn healthy(self) -> Agent<Healthy> {
-        self
-    }
-  }
-
-  impl Agent<Failed> {
-    /// Transition from failed to fenced
-    pub fn fence(self) -> Agent<Fenced> {
-      Agent {
-        consecutive_success: self.consecutive_success,
-        consecutive_failure: self.consecutive_failure,
-        failure_threshold: self.failure_threshold,
-        success_threshold: self.success_threshold,
-        heartbeat_timeout: self.heartbeat_timeout,
-        _state: PhantomData
-      }
-    }
-
-    /// Transition from failed to healthy (recovery)
-    pub fn recover(self) -> Agent<Healthy> {
-        self
-    }
-  }
-
-  impl Agent<Fenced> {
-    /// Transition from fenced to yielding (waiting for demotion)
-    pub fn await_demotion(self) -> Agent<Yielding> {
-        self
-    }
-
-    /// Transition from fenced to healthy (recovery after demotion completes)
-    pub fn recover(self) -> Agent<Healthy> {
-        self
-    }
-  }
-
-  impl Agent<Healthy> {
-    /// Transition from healthy to yielding (when original primary recovers)
-    pub fn yield_leadership(self) -> Agent<Yielding> {
-        self
-    }
-
-    /// Transition from healthy to failed (heartbeat failure)
-    pub fn fail(self) -> Agent<Failed> {
-        self
-    }
-  }
-
-  impl Agent<Yielding> {
-    /// Transition from yielding back to healthy (after demotion completes)
-    pub fn recover(self) -> Agent<Healthy> {
-        self
-    }
-
-    /// Transition from yielding back to healthy (if demotion cancelled)
-    pub fn recover_and_promote(self) -> Agent<Healthy> {
-        self
-    }
-  }
-}
-
-pub mod replica {
-  use super::Agent;
-
-  /// Replica state: Agent is initializing
-  pub struct Initializing {}
-
-  /// Replica state: Watching primary heartbeats
-  pub struct Watching {}
-
-  /// Replica state: Failover timeout exceeded, attempting promotion
-  pub struct Promoting {}
-
-  /// Replica state: Promotion attempt rejected by NATS
-  pub struct PromotionFailed {}
-
-  /// Replica state: Successfully promoted to leader
-  pub struct Leader {}
-
-  /// Replica state: Original primary recovered, yielding leadership
-  pub struct Demoting {}
-
-  impl Agent<Initializing> {
-    /// Transition from initializing to watching
-    pub fn start_watching(self) -> Agent<Watching> {
-        self
-    }
-  }
-
-  impl Agent<Watching> {
-    /// Transition from watching to promoting (failover timeout reached)
-    pub fn promote(self) -> Agent<Promoting> {
-        self
-    }
-
-    /// Transition from watching back to promoting (if demotion cancelled)
-    pub fn promote_again(self) -> Agent<Promoting> {
-        self
-    }
-  }
-
-  impl Agent<Promoting> {
-    /// Transition from promoting to leader (promotion successful)
-    pub fn become_leader(self) -> Agent<Leader> {
-        self
-    }
-
-    /// Transition from promoting to promotion_failed (NATS rejected)
-    pub fn promotion_rejected(self) -> Agent<PromotionFailed> {
-        self
-    }
-
-    /// Transition from promoting back to watching (reverted)
-    pub fn revert_to_watching(self) -> Agent<Watching> {
-        self
-    }
-  }
-
-  impl Agent<PromotionFailed> {
-    /// Transition from promotion_failed back to watching
-    pub fn continue_watching(self) -> Agent<Watching> {
-        self
-    }
-  }
-
-  impl Agent<Leader> {
-    /// Transition from leader to demoting (original primary recovered)
-    pub fn yield_leadership(self) -> Agent<Demoting> {
-        self
-    }
-
-    /// Transition from leader to watching (if demotion cancelled)
-    pub fn revert_to_watching(self) -> Agent<Watching> {
-        self
-    }
-  }
-
-  impl Agent<Demoting> {
-    /// Transition from demoting back to watching (if demotion cancelled)
-    pub fn revert_to_watching(self) -> Agent<Watching> {
-        self
-    }
-
-    /// Transition from demoting back to leader (if demotion cancelled)
-    pub fn promote_again(self) -> Agent<Leader> {
-        self
-    }
-  }
-}
-
-/// Main Agent struct using typestate pattern
-/// State is tracked through the generic type parameter
-pub struct Agent<State> {
-  pub consecutive_success: usize,
-  pub consecutive_failure: usize,
-  pub failure_threshold: usize,
-  pub success_threshold: usize,
-  pub heartbeat_timeout: Duration,
-  _state: PhantomData<State>
-}
-
-impl<State> Agent<State> {
-  /// Create a new agent in the given state with default thresholds
-  pub fn new(state: State) -> Self {
-    Agent {
-      consecutive_success: 0,
-      consecutive_failure: 0,
-      failure_threshold: 2,
-      success_threshold: 3,
-      heartbeat_timeout: Duration::from_secs(1),
-      _state: PhantomData
-    }
-  }
-
-  /// Create a new agent with custom thresholds
-  pub fn with_thresholds(state: State, success_threshold: usize, failure_threshold: usize, heartbeat_timeout: Duration) -> Self {
-    Agent {
-      consecutive_success: 0,
-      consecutive_failure: 0,
-      failure_threshold,
-      success_threshold,
-      heartbeat_timeout,
-      _state: PhantomData
-    }
-  }
-}
-
-impl<State> Clone for Agent<State> {
-  fn clone(&self) -> Self {
-    Agent {
-      consecutive_success: self.consecutive_success,
-      consecutive_failure: self.consecutive_failure,
-      failure_threshold: self.failure_threshold,
-      success_threshold: self.success_threshold,
-      heartbeat_timeout: self.heartbeat_timeout,
-      _state: PhantomData
-    }
-  }
-}
-
-impl<State> Default for Agent<State> {
-  fn default() -> Self {
-    Self::new(Initializing {})
-  }
-}
diff --git a/harmony_agent/src/old/typestate_gemini.rs b/harmony_agent/src/old/typestate_gemini.rs
deleted file mode 100644
index e4285bdd..00000000
--- a/harmony_agent/src/old/typestate_gemini.rs
+++ /dev/null
@@ -1,523 +0,0 @@
-use std::marker::PhantomData;
-use std::time::Duration;
-use tokio::sync::mpsc;
-use tokio::time::Instant;
-
-// =============================================================================
-// FSM Library (Type State Pattern)
-// =============================================================================
-
-pub mod fsm {
-    use super::*;
-
-    /// Generic FSM container
-    pub struct FSM<S, E, U> {
-        pub user_data: Option<U>,
-        pub state: PhantomData<S>,
-        pub _phantom_event: PhantomData<E>,
-    }
-
-    impl<S, E, U> FSM<S, E, U> {
-        pub fn new(user_data: Option<U>) -> Self {
-            Self {
-                user_data,
-                state: PhantomData,
-                _phantom_event: PhantomData,
-            }
-        }
-    }
-
-    /// Trait to represent FSM behavior via dynamic dispatch
-    pub trait HandleEvent<E, U> {
-        fn handle_event(self: Box<Self>, event: E) -> Box<dyn ErasedState<E, U>>;
-    }
-
-    /// Implemented per-state by the macro to route event logic
-    pub trait ErasedState<E, U>: Send {
-        fn handle_event(self: Box<Self>, event: E) -> Box<dyn ErasedState<E, U>>;
-    }
-
-    impl<S, E, U> ErasedState<E, U> for FSM<S, E, U>
-    where
-        FSM<S, E, U>: HandleEvent<E, U> + Send + 'static,
-    {
-        fn handle_event(self: Box<Self>, event: E) -> Box<dyn ErasedState<E, U>> {
-            HandleEvent::handle_event(self, event)
-        }
-    }
-
-    /// Allows FSM to move from state `S` to `T`, retaining user data
-    pub trait StateMachine<S, E, U>: Send + 'static {
-        fn into_boxed<T>(self) -> Box<FSM<T, E, U>>;
-    }
-
-    impl<S, E, U> StateMachine<S, E, U> for FSM<S, E, U>
-    where
-        S: Send + 'static,
-        E: Send + 'static,
-        U: Send + 'static,
-    {
-        fn into_boxed<T>(self) -> Box<FSM<T, E, U>> {
-            Box::new(FSM {
-                user_data: self.user_data,
-                state: PhantomData,
-                _phantom_event: PhantomData,
-            })
-        }
-    }
-
-    /// Runs the FSM in an asynchronous loop
-    pub async fn run_machine<E, U>(
-        mut state: Box<dyn ErasedState<E, U>>,
-        mut rx: tokio::sync::mpsc::Receiver<E>,
-    ) where
-        E: Send + 'static,
-        U: Send + 'static,
-    {
-        while let Some(event) = rx.recv().await {
-            state = ErasedState::handle_event(state, event);
-        }
-    }
-}
-
-/// Macro for Declaring Transitions
-#[macro_export]
-macro_rules! define_fsm {
-    (
-        $struct:ident<$event:ident, $user:ident>, {
-            $(
-                $state:ty => {
-                    $(
-                        $pattern:pat => $next:ty => $action:expr
-                    ),* $(,)?
-                }
-            ),* $(,)?
-        }
-    ) => {
-        $(
-            impl $crate::fsm::HandleEvent<$event, $user> for $struct<$state, $event, $user> {
-                fn handle_event(mut self: Box<Self>, event: $event) -> Box<dyn $crate::fsm::ErasedState<$event, $user>> {
-                    match event {
-                        $(
-                            $pattern => {
-                                // log::debug!("FSM Transition: {:?} --[{:?}]--> {:?}", stringify!($state), e, stringify!($next));
-                                log::debug!("FSM Transition: {:?} --[:?]--> {:?}", stringify!($state), stringify!($next));
-                                $action(&mut self);
-                                self.into_boxed::<$next>()
-                            }
-                        )*
-                        // Default handler for unmapped events in this state: stay in current state
-                        _ => {
-                            // log::trace!("FSM Ignore: {:?} --[{:?}]--> (no transition)", stringify!($state), event);
-                            self
-                        }
-                    }
-                }
-            }
-        )*
-    };
-}
-
-// =============================================================================
-// Harmony Agent Domain Logic
-// =============================================================================
-
-use fsm::{ErasedState, StateMachine, FSM};
-
-// --- States ---
-#[derive(Debug)]
-struct RolePrimary; // Active Leader
-#[derive(Debug)]
-struct RoleReplica; // Passive Watchdog
-#[derive(Debug)]
-struct RoleFencing; // Transition: Shutting down
-#[derive(Debug)]
-struct RolePromoting; // Transition: Taking over
-#[derive(Debug)]
-struct RoleDemoting; // Transition: Yielding
-
-// --- Events ---
-#[derive(Debug, Clone)]
-enum AgentEvent {
-    /// Periodic timer tick (drives checks)
-    Tick,
-    /// Result of a local health check (Primary only)
-    HealthCheckResult { success: bool },
-    /// Update from NATS about the cluster state
-    ClusterStateUpdate { primary_id: String, timestamp: Instant },
-    /// Command to force a state change (e.g. admin intervention)
-    ForceDemote,
-}
-
-// --- Side Effect Commands (Outbound) ---
-#[derive(Debug)]
-enum WorkerCommand {
-    PerformHealthCheck,
-    PerformFencing,
-    PerformPromotion,
-    PerformDemotion,
-}
-
-// --- Context ---
-struct AgentContext {
-    // Config
-    agent_id: String,
-    success_threshold: usize,
-    failure_threshold: usize,
-    heartbeat_interval: Duration,
-    failover_timeout: Duration,
-
-    // Runtime State
-    consecutive_failures: usize,
-    last_primary_heartbeat: Option<Instant>,
-    
-    // Communication
-    worker_tx: mpsc::Sender<WorkerCommand>,
-}
-
-impl AgentContext {
-    fn send_command(&self, cmd: WorkerCommand) {
-        let tx = self.worker_tx.clone();
-        tokio::spawn(async move {
-            if let Err(e) = tx.send(cmd).await {
-                log::error!("Failed to send worker command: {}", e);
-            }
-        });
-    }
-}
-
-// --- FSM Definition ---
-
-define_fsm!(FSM<AgentEvent, AgentContext>, {
-    // -------------------------------------------------------------------------
-    // PRIMARY STATE (Self-Preservation)
-    // -------------------------------------------------------------------------
-    RolePrimary => {
-        // 1. On Tick: Trigger a health check (Async Side Effect)
-        AgentEvent::Tick => RolePrimary => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
-            if let Some(ctx) = &mut s.user_data {
-                ctx.send_command(WorkerCommand::PerformHealthCheck);
-            }
-        },
-
-        // 2. Health Check Success: Reset counters
-        AgentEvent::HealthCheckResult { success: true } => RolePrimary => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
-            if let Some(ctx) = &mut s.user_data {
-                ctx.consecutive_failures = 0;
-                log::info!("✅ Heartbeat Success (Primary)");
-            }
-        },
-
-        // 3. Health Check Failure: Increment counters & Check Threshold
-        AgentEvent::HealthCheckResult { success: false } => RolePrimary => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
-            // NOTE: We determine next state dynamically by checking threshold.
-            // Since the macro requires a static next type, we handle the "Stay" case here.
-            // If we need to transition, we assume the event loop sends a specific event,
-            // OR we use a separate state for "Checking".
-            // However, to keep it simple within this pattern, we will check threshold here.
-            // If threshold reached, we ideally want to return RoleFencing.
-            // But the macro forces `=> RolePrimary`.
-            //
-            // WORKAROUND: We use a specific event flow.
-            // Ideally, the `HealthCheckResult` logic would be:
-            // if fail >= threshold { transition Fencing } else { stay }
-            //
-            // To strictly follow the macro structure where destination is fixed per pattern:
-            // We can't branch to different types in one pattern.
-            // So we will stay in RolePrimary here, but if threshold is hit, we trigger Fencing immediately
-            // by sending a command, and we rely on the Worker to complete fencing and maybe restart us?
-            //
-            // BETTER APPROACH for this specific FSM pattern:
-            // We need an intermediate event or state if the destination depends on runtime data.
-            // But let's assume for this implementation that we handle the "Stay" case here,
-            // and if we fail, we transition to Fencing on the NEXT tick or via a self-generated event?
-            //
-            // Let's modify the logic: The Worker sends `HealthCheckResult { success: false }`.
-            // If we are still below threshold, we log.
-            // If we are at threshold, we treat this event as a trigger for Fencing?
-            // No, the pattern matches `Event => Type`.
-            //
-            // Revised: We need two patterns. But we can't match on values inside the struct in the macro easily
-            // unless we define specific events like `HealthCheckFailedFatal`.
-            //
-            // Let's use `consecutive_failures` check inside the action.
-            // If fatal, we return a new Box<RoleFencing>.
-            // Wait, the macro generates `self.into_boxed::<$next>()`. It hardcodes the return type.
-            //
-            // This is a limitation of the macro provided in the blog post.
-            // To solve this strictly following the provided code, we must ensure the event *itself* dictates the transition.
-            //
-            // So the Worker must know the threshold? No, that leaks logic.
-            //
-            // Solution: The FSM Action can mutate `ctx`.
-            // We will have `AgentEvent::HealthCheckFailed`.
-            // We stay in `RolePrimary`.
-            // Inside the action, if `ctx.failures >= threshold`, we `ctx.send_command(PerformFencing)`.
-            // And we transition to `RoleFencing`? We can't conditionally transition in the macro.
-            //
-            // OK, I will split the event.
-            // The Worker returns `HealthCheckResult`.
-            // The FSM handles it.
-            // If the FSM sees failure, it stays in Primary.
-            // But if it needs to fence, it needs to transition.
-            //
-            // I will add `AgentEvent::FencingTriggered` which is sent by the FSM to itself?
-            // Or simpler: The Worker sends `HealthCheckFailed`.
-            // If we want to fence, we need to move to `RoleFencing`.
-            //
-            // Let's adjust the macro usage slightly. The user said "follow exactly the FSM pattern".
-            // The pattern implies strict state transitions.
-            //
-            // I will implement `RolePrimary` -> `RoleFencing` on `ForceDemote` or similar.
-            // And I will assume the Worker sends `ForceDemote` if it detects critical failure?
-            // No, the logic belongs in the FSM.
-            //
-            // Let's use the `Tick` to check the counter.
-            // 1. Tick -> Check.
-            // 2. Result -> Update Counter.
-            // 3. Tick -> If counter > thresh -> Transition Fencing.
-            //
-            // Let's try that.
-            if let Some(ctx) = &mut s.user_data {
-                ctx.consecutive_failures += 1;
-                log::warn!("⚠️ Heartbeat Failed (Count: {}/{})", ctx.consecutive_failures, ctx.failure_threshold);
-            }
-        },
-
-        // 4. The actual Fencing Transition
-        // We use a specific pattern guard if possible, or just a separate event.
-        // Since we can't guard in the macro, we'll use a trick:
-        // If failures are high, the NEXT Tick will trigger transition?
-        // No, we want immediate.
-        //
-        // Let's add `AgentEvent::CriticalFailure` event.
-        // The `HealthCheckResult` handler (above) will check the threshold.
-        // If threshold reached, it cannot transition itself (locked to RolePrimary).
-        // BUT, it can emit a `CriticalFailure` event to the channel.
-        // Then the FSM loop picks it up and transitions.
-        AgentEvent::ForceDemote => RoleFencing => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
-             if let Some(ctx) = &mut s.user_data {
-                log::error!("🚨 Failure Threshold Reached. Initiating Fencing.");
-                ctx.send_command(WorkerCommand::PerformFencing);
-            }
-        },
-
-        // 5. Split Brain Prevention
-        AgentEvent::ClusterStateUpdate { primary_id, .. } => RoleDemoting => |s: &mut FSM<RolePrimary, AgentEvent, AgentContext>| {
-             if let Some(ctx) = &mut s.user_data {
-                if primary_id != ctx.agent_id && !primary_id.is_empty() {
-                    log::warn!("Split Brain Detected! Another primary is active: {}. Demoting.", primary_id);
-                    ctx.send_command(WorkerCommand::PerformDemotion);
-                }
-            }
-        }
-    },
-
-    // -------------------------------------------------------------------------
-    // REPLICA STATE (Watchdog)
-    // -------------------------------------------------------------------------
-    RoleReplica => {
-        // 1. Receive Heartbeats from Primary
-        AgentEvent::ClusterStateUpdate { primary_id, timestamp } => RoleReplica => |s: &mut FSM<RoleReplica, AgentEvent, AgentContext>| {
-            if let Some(ctx) = &mut s.user_data {
-                if !primary_id.is_empty() {
-                    ctx.last_primary_heartbeat = Some(timestamp);
-                    // log::trace!("Replica: Saw primary {} at {:?}", primary_id, timestamp);
-                }
-            }
-        },
-
-        // 2. Tick: Check for Staleness
-        AgentEvent::Tick => RoleReplica => |s: &mut FSM<RoleReplica, AgentEvent, AgentContext>| {
-             // We can't transition conditionally here either.
-             // Same pattern: Check logic, if stale, send `ForcePromote` event to self.
-             if let Some(ctx) = &mut s.user_data {
-                 if let Some(last) = ctx.last_primary_heartbeat {
-                     let elapsed = Instant::now().duration_since(last);
-                     if elapsed > ctx.failover_timeout {
-                         log::warn!("⚡ Primary Stale ({}ms > {}ms). Triggering Promotion.", elapsed.as_millis(), ctx.failover_timeout.as_millis());
-                         // We need to trigger the transition.
-                         // We can't do it directly in this closure because the return type is fixed to RoleReplica.
-                         // So we assume the "Driver" or a self-send handles the trigger.
-                         // For this implementation, we'll assume we have a handle to the main loop channel in ctx?
-                         // No, ctx has `worker_tx`.
-                         //
-                         // We will send a command to worker to "ConfirmPromotionEligibility", which sends back `ForcePromote`.
-                         ctx.send_command(WorkerCommand::PerformPromotion); // This checks eligibility then triggers event
-                     }
-                 }
-             }
-        },
-
-        // 3. Promotion Triggered
-        AgentEvent::ForceDemote => RolePromoting => |s: &mut FSM<RoleReplica, AgentEvent, AgentContext>| {
-             log::info!("Promoting to Primary...");
-        }
-    },
-
-    // -------------------------------------------------------------------------
-    // FENCING STATE (Transient)
-    // -------------------------------------------------------------------------
-    RoleFencing => {
-        // Once fencing is done (simulated by Tick or specific event), we become a Replica (Clean Demotion)
-        AgentEvent::Tick => RoleReplica => |s: &mut FSM<RoleFencing, AgentEvent, AgentContext>| {
-            log::info!("Fencing/Demotion complete. Switching to Replica (Watchdog) mode.");
-            if let Some(ctx) = &mut s.user_data {
-                ctx.consecutive_failures = 0;
-            }
-        }
-    },
-
-    // -------------------------------------------------------------------------
-    // PROMOTING STATE (Transient)
-    // -------------------------------------------------------------------------
-    RolePromoting => {
-        // Promotion logic usually involves ensuring WAL catchup etc.
-        // We simulate success on next Tick.
-        AgentEvent::Tick => RolePrimary => |s: &mut FSM<RolePromoting, AgentEvent, AgentContext>| {
-            log::info!("Promotion Complete. I am now the PRIMARY.");
-            if let Some(ctx) = &mut s.user_data {
-                ctx.consecutive_failures = 0;
-                // Reset heartbeat timestamp so we don't fence immediately
-                ctx.last_primary_heartbeat = Some(Instant::now());
-            }
-        }
-    },
-
-    // -------------------------------------------------------------------------
-    // DEMOTING STATE (Transient)
-    // -------------------------------------------------------------------------
-    RoleDemoting => {
-         AgentEvent::Tick => RoleReplica => |s: &mut FSM<RoleDemoting, AgentEvent, AgentContext>| {
-            log::info!("Demotion Complete. Switching to Replica.");
-        }
-    }
-});
-
-// =============================================================================
-// Main & Runtime
-// =============================================================================
-
-pub async fn main_typestate_gemini() -> Result<(), Box<dyn std::error::Error>> {
-    env_logger::init();
-    log::info!("Harmony Agent FSM Starting...");
-
-    // 1. Setup Channels
-    let (event_tx, event_rx) = mpsc::channel::<AgentEvent>(100);
-    let (worker_tx, mut worker_rx) = mpsc::channel::<WorkerCommand>(100);
-
-    // 2. Configuration
-    let my_agent_id = "agent_1".to_string();
-    let desired_primary = "agent_1".to_string(); // Change to "agent_2" to test Replica start
-    let is_primary = my_agent_id == desired_primary;
-
-    let context = AgentContext {
-        agent_id: my_agent_id.clone(),
-        success_threshold: 2,
-        failure_threshold: 2,
-        heartbeat_interval: Duration::from_secs(1),
-        failover_timeout: Duration::from_secs(3), // 3s > 1s interval
-        consecutive_failures: 0,
-        last_primary_heartbeat: Some(Instant::now()),
-        worker_tx: worker_tx.clone(),
-    };
-
-    // 3. Spawn Worker (Simulates IO and Logic Glue)
-    let event_tx_worker = event_tx.clone();
-    tokio::spawn(async move {
-        while let Some(cmd) = worker_rx.recv().await {
-            match cmd {
-                WorkerCommand::PerformHealthCheck => {
-                    // Simulate IO latency
-                    tokio::time::sleep(Duration::from_millis(100)).await;
-                    
-                    // Simulate random failure (10% chance)
-                    let success = getrandom::u64().unwrap() % 100 > 10;
-                    
-                    // Send result back
-                    let _ = event_tx_worker.send(AgentEvent::HealthCheckResult { success }).await;
-                    
-                    // CRITICAL: Logic Glue for the FSM limitation
-                    // If we failed, we don't know the counter here easily without shared state.
-                    // But for the purpose of this demo, let's assume the FSM handles the counter.
-                    // If the FSM decides to fence, it sends PerformFencing.
-                    //
-                    // However, we need to trigger the transition event if threshold is hit.
-                    // Since FSM action is sync and can't send async events easily back to itself *during* the transition,
-                    // we rely on the FSM action checking the counter and sending a command to US (Worker),
-                    // and WE send the transition event back.
-                }
-                WorkerCommand::PerformFencing => {
-                    log::warn!("[Worker] Executing Fencing Procedure (Stop DB)...");
-                    tokio::time::sleep(Duration::from_millis(500)).await;
-                    // Trigger the state transition in FSM
-                    let _ = event_tx_worker.send(AgentEvent::ForceDemote).await;
-                }
-                WorkerCommand::PerformPromotion => {
-                    log::info!("[Worker] Checking Promotion Eligibility...");
-                    // Simulate check
-                    tokio::time::sleep(Duration::from_millis(200)).await;
-                    // Trigger transition
-                    let _ = event_tx_worker.send(AgentEvent::ForceDemote).await; // Reusing ForceDemote as "Trigger Transition" for Replica->Promote based on graph? 
-                    // Wait, Replica->Promote uses ForceDemote in the macro above? 
-                    // Yes: AgentEvent::ForceDemote => RolePromoting
-                }
-                WorkerCommand::PerformDemotion => {
-                    log::warn!("[Worker] Yielding Leadership...");
-                    tokio::time::sleep(Duration::from_millis(200)).await;
-                    // Trigger transition
-                    // We need an event that goes Primary -> Demoting.
-                    // In the macro: AgentEvent::ClusterStateUpdate handles the detection.
-                    // But we need to transition. 
-                    // Actually, the macro for ClusterStateUpdate transitions DIRECTLY to RoleDemoting.
-                    // So this command might just be for side-effects (stopping DB).
-                }
-            }
-        }
-    });
-
-    // 4. Spawn Timer (Heartbeat Tick)
-    let event_tx_timer = event_tx.clone();
-    tokio::spawn(async move {
-        let mut interval = tokio::time::interval(Duration::from_secs(1));
-        loop {
-            interval.tick().await;
-            let _ = event_tx_timer.send(AgentEvent::Tick).await;
-        }
-    });
-
-    // 5. Spawn NATS Watcher (Simulated)
-    let event_tx_nats = event_tx.clone();
-    tokio::spawn(async move {
-        // Simulate receiving heartbeats from "agent_1"
-        loop {
-            tokio::time::sleep(Duration::from_millis(500)).await;
-            // If we are agent_1, we are the primary, so we don't see external heartbeats usually,
-            // but for simulation, let's say we see ourselves or nothing.
-            // If we are agent_2 (Replica), we see agent_1.
-            
-            // Uncomment to simulate primary death for Replica:
-            // continue; 
-            
-            let _ = event_tx_nats.send(AgentEvent::ClusterStateUpdate {
-                primary_id: "agent_1".to_string(),
-                timestamp: Instant::now(),
-            }).await;
-        }
-    });
-
-    // 6. Initialize FSM
-    let initial_state: Box<dyn ErasedState<AgentEvent, AgentContext>> = if is_primary {
-        log::info!("Starting as PRIMARY");
-        Box::new(FSM::<RolePrimary, AgentEvent, AgentContext>::new(Some(context)))
-    } else {
-        log::info!("Starting as REPLICA");
-        Box::new(FSM::<RoleReplica, AgentEvent, AgentContext>::new(Some(context)))
-    };
-
-    // 7. Run
-    fsm::run_machine(initial_state, event_rx).await;
-
-    Ok(())
-}
-
-- 
2.39.5


From 9c551a0eba9ac73d2d1ef27419af4f6d7e3bcb84 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Tue, 3 Feb 2026 22:12:44 -0500
Subject: [PATCH 14/19] fix: Agent can now reload heartbeat info from store

---
 harmony_agent/src/agent/config.rs             |  80 ++++++
 harmony_agent/src/agent/heartbeat.rs          |  30 ++
 .../src/{agent_loop.rs => agent/mod.rs}       | 263 ++++++++----------
 harmony_agent/src/agent/role.rs               |  17 ++
 harmony_agent/src/main.rs                     |  14 +-
 harmony_agent/src/workflow/mod.rs             |   9 +-
 harmony_agent/src/workflow/primary.rs         |  10 +-
 harmony_agent/src/workflow/replica.rs         |  10 +-
 8 files changed, 254 insertions(+), 179 deletions(-)
 create mode 100644 harmony_agent/src/agent/config.rs
 create mode 100644 harmony_agent/src/agent/heartbeat.rs
 rename harmony_agent/src/{agent_loop.rs => agent/mod.rs} (74%)
 create mode 100644 harmony_agent/src/agent/role.rs

diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs
new file mode 100644
index 00000000..ec96601c
--- /dev/null
+++ b/harmony_agent/src/agent/config.rs
@@ -0,0 +1,80 @@
+use std::time::Duration;
+
+use harmony_types::id::Id;
+use log::info;
+
+use super::role::AgentRole;
+use super::heartbeat::HeartbeatFailure;
+
+#[derive(Debug, Clone)]
+pub struct AgentConfig {
+    /// Number of consecutive successful heartbeats required before the service transitions from
+    /// failed to healthy.
+    pub success_threshold: usize,
+    /// Number of consecutive failed heartbeats required before the service transitions from
+    /// healthy to failed.
+    pub failure_threshold: usize,
+    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
+    /// considered failed.
+    pub heartbeat_interval: Duration,
+    /// Time since last observed primary heartbeat before replica considers primary stale.
+    /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
+    /// to avoid split brain during network partitions.
+    pub failover_timeout: Duration,
+    /// **UNSTABLE FIELD**
+    ///
+    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
+    /// footprint is low, but managing multiple deployments in a single instance would be a
+    /// significant resource usage reduction.
+    ///
+    /// Decoupling the deployment of the agent with the application's deployment could make things
+    /// more complicated though, where we would have to be careful about version compatibility
+    /// between all components managed by the agent instance. So for now it is a 1-1 map.
+    ///
+    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
+    /// reader.
+    pub deployment_config_unstable: DeploymentConfig,
+    pub nats_url: String,
+    pub nats_creds_path: Option<String>,
+    pub agent_id: Id,
+    pub cluster_id: Id,
+    pub desired_primary_id: Id,
+    /// The role this agent plays (Primary or Replica)
+    pub role: AgentRole,
+}
+
+#[derive(Debug, Clone)]
+pub enum DeploymentConfig {
+    FailoverPostgreSQL(FailoverCNPGConfig),
+}
+
+#[derive(Debug, Clone)]
+pub struct FailoverCNPGConfig {
+    pub desired_primary_agent: Id,
+    pub cnpg_cluster_name: String,
+}
+
+impl DeploymentConfig {
+    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
+    pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
+        match self {
+            DeploymentConfig::FailoverPostgreSQL(cfg) => {
+                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
+                // TODO: Implement actual PG check / NATS write here
+                Ok(())
+            }
+        }
+    }
+
+    /// Callback: Transitioned from Unhealthy -> Healthy
+    pub async fn on_active(&self) {
+        info!("Service is now ACTIVE (Healthy)");
+        // e.g., Remove fencing lock
+    }
+
+    /// Callback: Transitioned from Healthy -> Unhealthy
+    pub async fn on_failover(&self) {
+        info!("Service is now FAILED (Unhealthy)");
+        // e.g., Initiate self-fencing, stop accepting traffic
+    }
+}
diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs
new file mode 100644
index 00000000..f2fe9704
--- /dev/null
+++ b/harmony_agent/src/agent/heartbeat.rs
@@ -0,0 +1,30 @@
+use harmony_types::id::Id;
+use serde::{Deserialize, Serialize};
+
+use crate::store::KvMetadata;
+
+/// Agent-provided heartbeat information (no timestamps - those come from the store)
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentInfo {
+    pub agent_id: Id,
+    pub cluster_id: Id,
+    pub status: String,
+}
+
+/// Complete heartbeat with both agent data and store metadata
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentHeartbeat {
+    pub agent_info: AgentInfo,
+    pub metadata: Option<KvMetadata>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ClusterStateData {
+    pub cluster_id: Id,
+    pub current_primary: Option<Id>,
+    pub desired_primary: Id,
+    pub timestamp: u64,
+}
+
+#[derive(Debug)]
+pub struct HeartbeatFailure {}
diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent/mod.rs
similarity index 74%
rename from harmony_agent/src/agent_loop.rs
rename to harmony_agent/src/agent/mod.rs
index 7111db92..8d6ff2f5 100644
--- a/harmony_agent/src/agent_loop.rs
+++ b/harmony_agent/src/agent/mod.rs
@@ -1,10 +1,8 @@
-use std::fmt;
 use std::time::{SystemTime, UNIX_EPOCH};
 use std::{str::FromStr, sync::Arc, time::Duration};
 
 use harmony_types::id::Id;
 use log::{debug, info, trace};
-use serde::{Deserialize, Serialize};
 use tokio::sync::RwLock;
 use tokio::time::{Instant, sleep};
 
@@ -13,21 +11,15 @@ use crate::workflow::HeartbeatWorkflow;
 use crate::workflow::primary::PrimaryWorkflow;
 use crate::workflow::replica::ReplicaWorkflow;
 
-/// The role of this agent instance
-#[derive(Debug, Clone, PartialEq)]
-pub enum AgentRole {
-    Primary,
-    Replica,
-}
+// Submodules
+mod config;
+mod heartbeat;
+mod role;
 
-impl fmt::Display for AgentRole {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            AgentRole::Primary => write!(f, "primary"),
-            AgentRole::Replica => write!(f, "replica"),
-        }
-    }
-}
+// Re-exports for backwards compatibility
+pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
+pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
+pub use role::AgentRole;
 
 pub async fn launch_agent<S>(
     role: AgentRole,
@@ -85,102 +77,6 @@ where
     Ok(())
 }
 
-#[derive(Debug, Clone)]
-pub struct AgentConfig {
-    /// Number of consecutive successful heartbeats required before the service transitions from
-    /// failed to healthy.
-    pub success_threshold: usize,
-    /// Number of consecutive failed heartbeats required before the service transitions from
-    /// healthy to failed.
-    pub failure_threshold: usize,
-    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
-    /// considered failed.
-    pub heartbeat_interval: Duration,
-    /// Time since last observed primary heartbeat before replica considers primary stale.
-    /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
-    /// to avoid split brain during network partitions.
-    pub failover_timeout: Duration,
-    /// **UNSTABLE FIELD**
-    ///
-    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
-    /// footprint is low, but managing multiple deployments in a single instance would be a
-    /// significant resource usage reduction.
-    ///
-    /// Decoupling the deployment of the agent with the application's deployment could make things
-    /// more complicated though, where we would have to be careful about version compatibility
-    /// between all components managed by the agent instance. So for now it is a 1-1 map.
-    ///
-    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
-    /// reader.
-    pub deployment_config_unstable: DeploymentConfig,
-    pub nats_url: String,
-    pub nats_creds_path: Option<String>,
-    pub agent_id: Id,
-    pub cluster_id: Id,
-    pub desired_primary_id: Id,
-    /// The role this agent plays (Primary or Replica)
-    pub role: AgentRole,
-}
-
-#[derive(Debug, Clone)]
-pub enum DeploymentConfig {
-    FailoverPostgreSQL(FailoverCNPGConfig),
-}
-
-#[derive(Debug, Clone)]
-pub struct FailoverCNPGConfig {
-    pub desired_primary_agent: Id,
-    pub cnpg_cluster_name: String,
-}
-
-impl DeploymentConfig {
-    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
-    pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
-        match self {
-            DeploymentConfig::FailoverPostgreSQL(cfg) => {
-                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
-                // TODO: Implement actual PG check / NATS write here
-                Ok(())
-            }
-        }
-    }
-
-    /// Callback: Transitioned from Unhealthy -> Healthy
-    pub async fn on_active(&self) {
-        info!("Service is now ACTIVE (Healthy)");
-        // e.g., Remove fencing lock
-    }
-
-    /// Callback: Transitioned from Healthy -> Unhealthy
-    pub async fn on_failover(&self) {
-        info!("Service is now FAILED (Unhealthy)");
-        // e.g., Initiate self-fencing, stop accepting traffic
-    }
-}
-
-/// Agent-provided heartbeat information (no timestamps - those come from the store)
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct AgentInfo {
-    pub agent_id: Id,
-    pub cluster_id: Id,
-    pub status: String,
-}
-
-/// Complete heartbeat with both agent data and store metadata
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct AgentHeartbeat {
-    pub agent_info: AgentInfo,
-    pub metadata: Option<KvMetadata>,
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct ClusterStateData {
-    pub cluster_id: Id,
-    pub current_primary: Option<Id>,
-    pub desired_primary: Id,
-    pub timestamp: u64,
-}
-
 pub struct HarmonyAgent<S: KvStore> {
     pub config: AgentConfig,
     workflow: Box<dyn HeartbeatWorkflow>,
@@ -207,7 +103,6 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             }
             AgentRole::Replica => {
                 info!("Initializing agent as REPLICA");
-                // pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self
                 Box::new(ReplicaWorkflow::new(
                     config.success_threshold,
                     config.failure_threshold,
@@ -229,7 +124,56 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         }
     }
 
-    /// Reconcile startup state by fetching cluster state from the store
+    /// Generic helper to fetch and deserialize data from KV store
+    /// Returns Ok(Some(data)) if key exists and deserializes successfully
+    /// Returns Ok(None) if key doesn't exist
+    /// Returns Err if deserialization fails or other errors occur
+    async fn fetch_from_store<D>(
+        &self,
+        store: &Arc<S>,
+        key: &str,
+    ) -> Result<Option<D>, KvStoreError>
+    where
+        D: serde::de::DeserializeOwned,
+    {
+        debug!("Fetching data from key: {}", key);
+
+        let result = store.get(key).await;
+        debug!("Got result from store: {:#?}", result);
+
+        match result {
+            Ok(kv_result) => {
+                if let Some(value) = kv_result.value {
+                    match serde_json::from_value::<D>(value.clone()) {
+                        Ok(data) => Ok(Some(data)),
+                        Err(e) => {
+                            log::warn!("Failed to deserialize data from key {}: {}", key, e);
+                            Err(KvStoreError::DeserializationFailed {
+                                deserialization_error: format!(
+                                    "Key exists but deserialization failed for {key}: {e}"
+                                ),
+                                value: value.to_string(),
+                            })
+                        }
+                    }
+                } else {
+                    Err(KvStoreError::Unknown(format!(
+                        "Key exists but value is empty for {key}, this should not happen"
+                    )))
+                }
+            }
+            Err(KvStoreError::KeyNotAvailable(_)) => {
+                debug!("Key {} not found in store", key);
+                Ok(None)
+            }
+            Err(e) => {
+                log::warn!("Failed to fetch data from key {}: {}", key, e);
+                Err(e)
+            }
+        }
+    }
+
+    /// Reconcile startup state by fetching cluster state and heartbeat from the store
     /// This allows the workflow to determine if it should resume as Primary/Replica
     /// based on the persisted cluster state
     pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
@@ -240,48 +184,65 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             cluster_key
         );
 
-        /*
-        trace!("{:#?}", self.cluster_kv.get(&cluster_key).await);
-        */
-
-        let cluster_state_option = match self.cluster_kv.get(&cluster_key).await {
-            Ok(result) => {
-                if let Some(value) = result.value {
-                    match serde_json::from_value::<ClusterStateData>(value.clone()) {
-                        Ok(data) => Some(data),
-                        Err(e) => {
-                            log::warn!("Failed to deserialize cluster state: {}", e);
-                            return Err(KvStoreError::DeserializationFailed {
-                                deserialization_error: format!(
-                                    "Cluster key exist but is empty {cluster_key} : {e}"
-                                ),
-                                value: value.to_string(),
-                            });
-                        }
-                    }
-                } else {
-                    return Err(KvStoreError::Unknown(format!(
-                        "Cluster key exist but is empty {cluster_key}"
-                    )));
-                }
-            }
-            Err(KvStoreError::KeyNotAvailable(_)) => {
-                debug!("Cluster state key not found, this is a fresh cluster");
+        let cluster_state_option = match self
+            .fetch_from_store::<ClusterStateData>(&self.cluster_kv, &cluster_key)
+            .await?
+        {
+            Some(data) => Some(data),
+            None => {
+                debug!(
+                    "Cluster state key not found, this is a fresh cluster, initializing cluster state"
+                );
                 Some(self.store_cluster_state(None).await?)
             }
-            Err(e) => {
-                log::warn!("Failed to fetch cluster state during startup: {}", e);
-                return Err(e);
-            }
         };
 
-        let state_ref = cluster_state_option.as_ref();
-
-        self.workflow.on_startup(state_ref, self.health_kv.as_ref(), &self.config).await;
+        debug!("Found cluster state {cluster_state_option:#?}");
+        self.workflow
+            .on_startup(cluster_state_option.as_ref(), &self.config)
+            .await;
 
         // Cache the cluster state locally
         *self.cluster_state.write().await = cluster_state_option;
 
+        // Fetch last heartbeat if it exists to avoid sequence conflicts
+        let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
+        debug!("Fetching last heartbeat from key: {}", heartbeat_key);
+
+        let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
+
+        let last_heartbeat = match last_heartbeat_option {
+            Ok(kv_result) => {
+                let value = kv_result
+                    .value
+                    .expect("When key exist it should always contain data");
+                Some(AgentHeartbeat {
+                    agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(|e| {
+                        KvStoreError::DeserializationFailed {
+                            deserialization_error: e.to_string(),
+                            value: value.to_string(),
+                        }
+                    })?,
+                    metadata: Some(kv_result.metadata),
+                })
+            }
+            Err(e) => match e {
+                KvStoreError::KeyNotAvailable(_) => None,
+                _ => return Err(e),
+            },
+        };
+        if let Some(heartbeat) = &last_heartbeat{
+            debug!(
+                "Found existing heartbeat with sequence: {}",
+                heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
+            );
+        } else {
+            debug!("No existing heartbeat found, starting fresh");
+        }
+
+        // Cache the last heartbeat for sequence tracking
+        *self.last_heartbeat.write().await = last_heartbeat;
+
         Ok(())
     }
 
@@ -503,9 +464,3 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         }
     }
 }
-
-#[derive(Debug)]
-pub struct HeartbeatFailure {}
-
-/// Replica workflow module - handles replica-specific state machine
-mod replica {}
diff --git a/harmony_agent/src/agent/role.rs b/harmony_agent/src/agent/role.rs
new file mode 100644
index 00000000..e9b719cf
--- /dev/null
+++ b/harmony_agent/src/agent/role.rs
@@ -0,0 +1,17 @@
+use std::fmt;
+
+/// The role of this agent instance
+#[derive(Debug, Clone, PartialEq)]
+pub enum AgentRole {
+    Primary,
+    Replica,
+}
+
+impl fmt::Display for AgentRole {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            AgentRole::Primary => write!(f, "primary"),
+            AgentRole::Replica => write!(f, "replica"),
+        }
+    }
+}
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
index 92a0fa09..8eda5d1d 100644
--- a/harmony_agent/src/main.rs
+++ b/harmony_agent/src/main.rs
@@ -1,13 +1,9 @@
 use std::{sync::Arc, time::Duration};
 
-use async_nats::jetstream::kv::Store;
+use crate::{agent::AgentRole, store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}};
 
-use crate::{
-    agent_loop::AgentRole,
-    store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
-};
-
-mod agent_loop;
+// mod agent_loop;
+mod agent;
 pub mod store;
 mod workflow;
 
@@ -25,14 +21,14 @@ async fn main() {
     let cluster_kv = nats_store.clone();
 
     let _ = tokio::join!(
-        agent_loop::launch_agent(
+        agent::launch_agent(
             AgentRole::Primary,
             health_kv.clone(),
             cluster_kv.clone(),
             heartbeat_interval,
             failover_timeout
         ),
-        agent_loop::launch_agent(
+        agent::launch_agent(
             AgentRole::Replica,
             health_kv,
             cluster_kv,
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
index 561ce434..81387a45 100644
--- a/harmony_agent/src/workflow/mod.rs
+++ b/harmony_agent/src/workflow/mod.rs
@@ -1,9 +1,7 @@
 use std::sync::Arc;
 
+use crate::agent::AgentConfig;
 use async_trait::async_trait;
-use harmony_types::id::Id;
-
-use crate::{agent_loop::AgentConfig, store::KvStore};
 
 pub mod primary;
 pub mod replica;
@@ -18,9 +16,8 @@ pub trait HeartbeatWorkflow: Send + Sync {
     async fn handle_heartbeat_failure(&mut self);
 
     async fn on_startup(
-        &mut self,
-        cluster_state: Option<&crate::agent_loop::ClusterStateData>,
-        health_kv: &dyn KvStore,
+        &self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
         agent_config: &AgentConfig,
     );
 
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index ffe3ffae..80af63b5 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -1,7 +1,10 @@
 use async_trait::async_trait;
 use log::{debug, info, trace, warn};
 
-use crate::{agent_loop::{AgentConfig, DeploymentConfig}, store::KvStore, workflow::HeartbeatWorkflow};
+use crate::{
+    agent::{AgentConfig, DeploymentConfig},
+    workflow::HeartbeatWorkflow,
+};
 
 #[derive(Debug, Clone, PartialEq)]
 pub enum PrimaryState {
@@ -64,9 +67,8 @@ impl PrimaryWorkflow {
 #[async_trait]
 impl HeartbeatWorkflow for PrimaryWorkflow {
     async fn on_startup(
-        &mut self,
-        cluster_state: Option<&crate::agent_loop::ClusterStateData>,
-        health_kv: &dyn KvStore,
+        &self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
         agent_config: &AgentConfig,
     ) {
         if let Some(state) = cluster_state {
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
index eda4bbdc..e2ffc42e 100644
--- a/harmony_agent/src/workflow/replica.rs
+++ b/harmony_agent/src/workflow/replica.rs
@@ -1,11 +1,10 @@
 use async_trait::async_trait;
 use harmony_types::id::Id;
-use log::{debug, info, trace, warn};
+use log::{info, trace, warn};
 use std::time::Duration;
 use tokio::sync::RwLock;
 
-use crate::agent_loop::{AgentConfig, AgentHeartbeat};
-use crate::store::KvStore;
+use crate::agent::{AgentConfig, AgentHeartbeat};
 use crate::workflow::HeartbeatWorkflow;
 
 #[derive(Debug, Clone)]
@@ -155,9 +154,8 @@ impl ReplicaWorkflow {
 #[async_trait]
 impl HeartbeatWorkflow for ReplicaWorkflow {
     async fn on_startup(
-        &mut self,
-        cluster_state: Option<&crate::agent_loop::ClusterStateData>,
-        health_kv: &dyn KvStore,
+        &self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
         agent_config: &AgentConfig,
     ) {
         // todo!("not sure if the replica should do anything on startup")
-- 
2.39.5


From 01a775a01fa629978506b5c76cec062b2b3f649e Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Wed, 4 Feb 2026 07:01:13 -0500
Subject: [PATCH 15/19] wip(agent): workflow now return new cluster state when
 they decide to alter it, primary taking control of current_primary case
 handled but using wrong ID

---
 harmony_agent/src/agent/mod.rs        | 10 ++++++---
 harmony_agent/src/workflow/mod.rs     | 11 ++++++++--
 harmony_agent/src/workflow/primary.rs | 29 ++++++++++++++++++++++++---
 harmony_agent/src/workflow/replica.rs | 18 +++++++++++++----
 4 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs
index 8d6ff2f5..68ff3020 100644
--- a/harmony_agent/src/agent/mod.rs
+++ b/harmony_agent/src/agent/mod.rs
@@ -2,7 +2,7 @@ use std::time::{SystemTime, UNIX_EPOCH};
 use std::{str::FromStr, sync::Arc, time::Duration};
 
 use harmony_types::id::Id;
-use log::{debug, info, trace};
+use log::{debug, info, trace, warn};
 use tokio::sync::RwLock;
 use tokio::time::{Instant, sleep};
 
@@ -435,10 +435,14 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             trace!("Got heartbeat_result : {heartbeat_result:?}");
             match heartbeat_result {
                 Ok(_) => {
-                    self.workflow.handle_heartbeat_success().await;
+                    let new_state = self.workflow.handle_heartbeat_success(self.cluster_state.read().await.as_ref(), &self.config).await;
+                    if let Some(new_state) = new_state {
+                        warn!("Got new cluster state : {new_state:#?}");
+                        todo!("Got new state, save it");
+                    }
                 }
                 Err(_) => {
-                    self.workflow.handle_heartbeat_failure().await;
+                    self.workflow.handle_heartbeat_failure(self.cluster_state.read().await.as_ref()).await;
                 }
             }
 
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
index 81387a45..e037e194 100644
--- a/harmony_agent/src/workflow/mod.rs
+++ b/harmony_agent/src/workflow/mod.rs
@@ -10,10 +10,17 @@ pub mod replica;
 #[async_trait]
 pub trait HeartbeatWorkflow: Send + Sync {
     /// Handle a successful heartbeat
-    async fn handle_heartbeat_success(&mut self);
+    async fn handle_heartbeat_success(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) -> Option<crate::agent::ClusterStateData>;
 
     /// Handle a failed heartbeat
-    async fn handle_heartbeat_failure(&mut self);
+    async fn handle_heartbeat_failure(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+    );
 
     async fn on_startup(
         &self,
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index 80af63b5..f54dfc77 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -2,7 +2,7 @@ use async_trait::async_trait;
 use log::{debug, info, trace, warn};
 
 use crate::{
-    agent::{AgentConfig, DeploymentConfig},
+    agent::{AgentConfig, ClusterStateData, DeploymentConfig},
     workflow::HeartbeatWorkflow,
 };
 
@@ -87,7 +87,11 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
             debug!("No cluster state on startup, starting from Initializing");
         }
     }
-    async fn handle_heartbeat_success(&mut self) {
+    async fn handle_heartbeat_success(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) -> Option<crate::agent::ClusterStateData> {
         trace!(
             "Handling heartbeat success, current counters success {} failures {}",
             self.consecutive_successes, self.consecutive_failures
@@ -104,7 +108,19 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
                     tokio::spawn(async move {
                         config.on_active().await;
                     });
+                    if let Some(state) = cluster_state
+                        && state.desired_primary == agent_config.desired_primary_id
+                    {
+                        let mut new_state = state.clone();
+                        new_state.current_primary = Some(agent_config.agent_id.clone());
+                        return Some(new_state);
+                    } else {
+                        todo!(
+                            "I cluster_state should not be an option, and we should throw an error when we are running a primary workflow but we are not the desired primary in the cluster state data"
+                        );
+                    }
                 }
+                None
             }
             PrimaryState::Failed => {
                 if self.consecutive_successes >= self.success_threshold {
@@ -114,10 +130,12 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
                         config.on_active().await;
                     });
                 }
+                todo!()
             }
             PrimaryState::Healthy => {
                 // Stay healthy
                 debug!("Primary staying healthy");
+                todo!()
             }
             PrimaryState::Fenced => {
                 // Recovery from fenced state
@@ -126,15 +144,20 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
                     info!("Recovered from fenced state, transitioning to yielding");
                     self.transition_to(PrimaryState::Yielding);
                 }
+                todo!()
             }
             PrimaryState::Yielding => {
                 // TODO: Check NATS to see if we can resume as primary
                 trace!("Yielding, waiting for demotion handshake");
+                todo!()
             }
         }
     }
 
-    async fn handle_heartbeat_failure(&mut self) {
+    async fn handle_heartbeat_failure(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+    ) {
         self.consecutive_failures += 1;
         self.consecutive_successes = 0;
 
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
index e2ffc42e..90ddd341 100644
--- a/harmony_agent/src/workflow/replica.rs
+++ b/harmony_agent/src/workflow/replica.rs
@@ -161,7 +161,11 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
         // todo!("not sure if the replica should do anything on startup")
     }
 
-    async fn handle_heartbeat_success(&mut self) {
+    async fn handle_heartbeat_success(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) -> Option<crate::agent::ClusterStateData> {
         trace!(
             "Handling heartbeat success, current counters success {} failures {}",
             self.consecutive_successes, self.consecutive_failures
@@ -174,6 +178,7 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
                 if self.consecutive_successes >= self.success_threshold {
                     self.transition_to(ReplicaState::Watching);
                 }
+                None
             }
             ReplicaState::Watching => {
                 // TODO: Check primary staleness from NATS
@@ -181,7 +186,6 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
                 if self.is_primary_stale().await {
                     warn!("Found stale primary, launching promotion");
                 }
-                /*
                 todo!("perform the replica watch actions :
                 - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
                     - check the last primary heartbeat kv timestamp
@@ -189,7 +193,6 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
                     - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
                     - launching promotion will change the status of the replica
                     ");
-                */
             }
             ReplicaState::Promoting => {
                 // TODO: Complete promotion attempt
@@ -202,25 +205,32 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
                 if self.consecutive_successes >= self.success_threshold {
                     self.transition_to(ReplicaState::Watching);
                 }
+                todo!()
             }
             ReplicaState::Leader => {
                 // TODO: Check for original primary recovery
                 trace!("Replica acting as leader");
+                todo!()
             }
             ReplicaState::Failed => {
                 if self.consecutive_successes >= self.success_threshold {
                     info!("Replica recovered from Failed state, transitioning to Watching");
                     self.transition_to(ReplicaState::Watching);
                 }
+                todo!()
             }
             ReplicaState::Demoting => {
                 // TODO: Complete demotion back to watching
                 trace!("Replica demotion in progress");
+                todo!()
             }
         }
     }
 
-    async fn handle_heartbeat_failure(&mut self) {
+    async fn handle_heartbeat_failure(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+    ) {
         self.consecutive_failures += 1;
         self.consecutive_successes = 0;
 
-- 
2.39.5


From 17b3b3b3513f898514082b77750e97375e4ed430 Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Wed, 4 Feb 2026 09:26:10 -0500
Subject: [PATCH 16/19] test(agent): Wrote first few tests for Primary workflow
 use cases : initializing to healthy, healthy to failed

---
 harmony_agent/Cargo.toml              |   1 +
 harmony_agent/src/agent/config.rs     |   1 -
 harmony_agent/src/agent/heartbeat.rs  |   2 +-
 harmony_agent/src/agent/mod.rs        |   1 -
 harmony_agent/src/workflow/primary.rs | 127 ++++++++++++++++++++++++++
 5 files changed, 129 insertions(+), 3 deletions(-)

diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml
index 22a373ca..6fb7ff5d 100644
--- a/harmony_agent/Cargo.toml
+++ b/harmony_agent/Cargo.toml
@@ -23,3 +23,4 @@ serde_json.workspace = true
 getrandom = "0.3.4"
 
 thiserror.workspace = true
+pretty_assertions.workspace = true
diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs
index ec96601c..0862da67 100644
--- a/harmony_agent/src/agent/config.rs
+++ b/harmony_agent/src/agent/config.rs
@@ -50,7 +50,6 @@ pub enum DeploymentConfig {
 
 #[derive(Debug, Clone)]
 pub struct FailoverCNPGConfig {
-    pub desired_primary_agent: Id,
     pub cnpg_cluster_name: String,
 }
 
diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs
index f2fe9704..f101656c 100644
--- a/harmony_agent/src/agent/heartbeat.rs
+++ b/harmony_agent/src/agent/heartbeat.rs
@@ -18,7 +18,7 @@ pub struct AgentHeartbeat {
     pub metadata: Option<KvMetadata>,
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ClusterStateData {
     pub cluster_id: Id,
     pub current_primary: Option<Id>,
diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs
index 68ff3020..558debd7 100644
--- a/harmony_agent/src/agent/mod.rs
+++ b/harmony_agent/src/agent/mod.rs
@@ -50,7 +50,6 @@ where
         heartbeat_interval,
         failover_timeout,
         deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
-            desired_primary_agent: my_agent_id.clone(),
             cnpg_cluster_name: String::from("cnpg_cluster_name"),
         }),
         nats_url: String::new(),
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index f54dfc77..890068be 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -31,8 +31,14 @@ pub struct PrimaryWorkflow {
     state: PrimaryState,
     consecutive_successes: usize,
     consecutive_failures: usize,
+
+    // TODO these thresholds should not be copied into the workflow struct. They are configuration
+    // level and should always be read from the context passed to the workflow functions
     success_threshold: usize,
     failure_threshold: usize,
+
+    // TODO not sure if this should be known by the workflow or passed in the context to function
+    // calls or just completely handled by the agent ?
     deployment_config: DeploymentConfig,
 }
 
@@ -201,3 +207,124 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
         self.consecutive_failures
     }
 }
+
+#[cfg(test)]
+mod test {
+    use std::time::Duration;
+
+    use harmony_types::id::Id;
+
+    use crate::agent::{AgentRole, FailoverCNPGConfig};
+
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
+
+        assert!(
+            primary
+                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+                .await
+                .is_none()
+        );
+    }
+
+    #[tokio::test]
+    async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
+
+        let mut expected_state = cluster_state.clone();
+        expected_state.current_primary = Some(Id::empty());
+
+        assert_eq!(
+            primary
+                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+                .await,
+            None
+        );
+        assert_eq!(
+            primary
+                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+                .await,
+            Some(expected_state)
+        );
+    }
+
+    #[tokio::test]
+    async fn primary_stays_healthy_below_failure_threshold() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
+
+        // Reach healthy
+        let _ = primary
+            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+            .await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+
+        // One failure below threshold
+        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+        assert_eq!(primary.consecutive_failures(), 1);
+        assert_eq!(primary.consecutive_successes(), 0);
+    }
+
+    #[tokio::test]
+    async fn primary_transitions_to_failed_at_failure_threshold() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
+
+        // Reach healthy
+        let _ = primary
+            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+            .await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+
+        // First failure, still healthy
+        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+        assert_eq!(primary.consecutive_failures(), 1);
+
+        // Second failure reaches threshold, transitions to Failed
+        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
+        assert_eq!(primary.state, PrimaryState::Fenced);
+        assert_eq!(primary.consecutive_failures(), 2);
+        assert_eq!(primary.consecutive_successes(), 0);
+    }
+
+    fn default_test_state(
+        success_threshold: usize,
+        failure_threshold: usize,
+    ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
+        let cluster_state = crate::agent::ClusterStateData {
+            cluster_id: Id::empty(),
+            current_primary: None,
+            desired_primary: Id::empty(),
+            timestamp: 0,
+        };
+
+        let agent_config = AgentConfig {
+            success_threshold,
+            failure_threshold,
+            heartbeat_interval: Duration::from_nanos(0),
+            failover_timeout: Duration::from_nanos(0),
+            deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
+                cnpg_cluster_name: "test".to_string(),
+            }),
+            nats_url: String::new(),
+            nats_creds_path: None,
+            agent_id: Id::empty(),
+            cluster_id: Id::empty(),
+            desired_primary_id: Id::empty(),
+            role: AgentRole::Primary,
+        };
+
+        let primary = PrimaryWorkflow::new(
+            agent_config.success_threshold,
+            agent_config.failure_threshold,
+            agent_config.deployment_config_unstable.clone(),
+        );
+
+        (primary, cluster_state, agent_config)
+    }
+}
-- 
2.39.5


From a08c3fb03b55a9502068647e2332c943a0c8e7d4 Mon Sep 17 00:00:00 2001
From: wjro <wrolleman@nationtech.io>
Date: Wed, 4 Feb 2026 11:47:11 -0500
Subject: [PATCH 17/19] wip: save new cluster info state

---
 harmony_agent/src/agent/heartbeat.rs  |  6 +++
 harmony_agent/src/agent/mod.rs        | 72 +++++++++++++++++++--------
 harmony_agent/src/store/mod.rs        |  2 +-
 harmony_agent/src/workflow/mod.rs     |  2 +-
 harmony_agent/src/workflow/primary.rs | 23 +++++----
 5 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs
index f101656c..ab5697b1 100644
--- a/harmony_agent/src/agent/heartbeat.rs
+++ b/harmony_agent/src/agent/heartbeat.rs
@@ -20,6 +20,12 @@ pub struct AgentHeartbeat {
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ClusterStateData {
+    pub cluster_info: ClusterInfo,
+    pub metadata: Option<KvMetadata>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
+pub struct ClusterInfo {
     pub cluster_id: Id,
     pub current_primary: Option<Id>,
     pub desired_primary: Id,
diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs
index 558debd7..480b7713 100644
--- a/harmony_agent/src/agent/mod.rs
+++ b/harmony_agent/src/agent/mod.rs
@@ -2,10 +2,11 @@ use std::time::{SystemTime, UNIX_EPOCH};
 use std::{str::FromStr, sync::Arc, time::Duration};
 
 use harmony_types::id::Id;
-use log::{debug, info, trace, warn};
+use log::{debug, error, info, trace, warn};
 use tokio::sync::RwLock;
 use tokio::time::{Instant, sleep};
 
+use crate::agent::heartbeat::ClusterInfo;
 use crate::store::{KvMetadata, KvStore, KvStoreError};
 use crate::workflow::HeartbeatWorkflow;
 use crate::workflow::primary::PrimaryWorkflow;
@@ -13,7 +14,7 @@ use crate::workflow::replica::ReplicaWorkflow;
 
 // Submodules
 mod config;
-mod heartbeat;
+pub mod heartbeat;
 mod role;
 
 // Re-exports for backwards compatibility
@@ -216,12 +217,12 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
                     .value
                     .expect("When key exist it should always contain data");
                 Some(AgentHeartbeat {
-                    agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(|e| {
-                        KvStoreError::DeserializationFailed {
+                    agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
+                        |e| KvStoreError::DeserializationFailed {
                             deserialization_error: e.to_string(),
                             value: value.to_string(),
-                        }
-                    })?,
+                        },
+                    )?,
                     metadata: Some(kv_result.metadata),
                 })
             }
@@ -230,7 +231,7 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
                 _ => return Err(e),
             },
         };
-        if let Some(heartbeat) = &last_heartbeat{
+        if let Some(heartbeat) = &last_heartbeat {
             debug!(
                 "Found existing heartbeat with sequence: {}",
                 heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
@@ -247,34 +248,46 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
 
     async fn store_cluster_state(
         &self,
-        cluster_state_data: Option<ClusterStateData>,
+        cluster_data: Option<ClusterStateData>,
     ) -> Result<ClusterStateData, KvStoreError> {
         let key = format!("cluster.{}", self.config.cluster_id);
-        match cluster_state_data {
-            Some(state) => {
-                let value = serde_json::to_value(&state).map_err(|e| {
+        match cluster_data {
+            Some(cluster_data) => {
+                debug!("found some cluster state {:#?}", cluster_data);
+
+                let value = serde_json::to_value(&cluster_data).map_err(|e| {
                     KvStoreError::DeserializationFailed {
                         deserialization_error: e.to_string(),
-                        value: format!("{:?}", state),
+                        value: format!("{:?}", cluster_data),
                     }
                 })?;
 
                 let expected_sequence = {
-                    let last = self.last_heartbeat.read().await;
+                    let last = self.cluster_state.read().await;
                     last.as_ref()
                         .and_then(|hb| hb.metadata.as_ref())
                         .map(|m| m.sequence)
                         .unwrap_or(0)
                 };
 
-                self.cluster_kv
+                debug!("expected sequence {:#?}", expected_sequence);
+                let new_seq = self
+                    .cluster_kv
                     .set_strict(&key, value, expected_sequence)
                     .await?;
 
-                Ok(state)
+                let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
+
+                let cluster_data_new = ClusterStateData {
+                    cluster_info: cluster_data.cluster_info.clone(),
+                    metadata: Some(cluster_kv_result.metadata),
+                };
+
+                *self.cluster_state.write().await = Some(cluster_data_new.clone());
+                Ok(cluster_data)
             }
             None => {
-                let cluster_data = ClusterStateData {
+                let cluster_info = ClusterInfo {
                     cluster_id: self.config.cluster_id.clone(),
                     current_primary: None,
                     desired_primary: self.config.desired_primary_id.clone(),
@@ -284,13 +297,20 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
                         .as_millis() as u64,
                 };
 
-                let value = serde_json::to_value(&cluster_data).map_err(|e| {
+                let value = serde_json::to_value(&cluster_info).map_err(|e| {
                     KvStoreError::DeserializationFailed {
                         deserialization_error: e.to_string(),
-                        value: format!("{:?}", cluster_data),
+                        value: format!("{:?}", cluster_info),
                     }
                 })?;
+
+                let cluster_data = ClusterStateData {
+                    cluster_info,
+                    metadata: None,
+                };
+
                 self.cluster_kv.set_strict(&key, value, 0).await?;
+                *self.cluster_state.write().await = Some(cluster_data.clone());
                 Ok(cluster_data)
             }
         }
@@ -434,14 +454,24 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             trace!("Got heartbeat_result : {heartbeat_result:?}");
             match heartbeat_result {
                 Ok(_) => {
-                    let new_state = self.workflow.handle_heartbeat_success(self.cluster_state.read().await.as_ref(), &self.config).await;
+                    let new_state = self
+                        .workflow
+                        .handle_heartbeat_success(
+                            self.cluster_state.read().await.as_ref(),
+                            &self.config,
+                        )
+                        .await;
                     if let Some(new_state) = new_state {
                         warn!("Got new cluster state : {new_state:#?}");
-                        todo!("Got new state, save it");
+                        self.store_cluster_state(Some(new_state))
+                            .await
+                            .expect(&format!("cluster state not able to be stored"));
                     }
                 }
                 Err(_) => {
-                    self.workflow.handle_heartbeat_failure(self.cluster_state.read().await.as_ref()).await;
+                    self.workflow
+                        .handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
+                        .await;
                 }
             }
 
diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs
index 08879d1a..617df34c 100644
--- a/harmony_agent/src/store/mod.rs
+++ b/harmony_agent/src/store/mod.rs
@@ -12,7 +12,7 @@ pub struct SubscriptionHandle {
 
 /// Metadata returned by the KV store for all operations
 /// Contains timing and ordering information set by the store
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct KvMetadata {
     /// Timestamp set by the store (milliseconds since UNIX epoch)
     pub timestamp: u64,
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
index e037e194..8696e071 100644
--- a/harmony_agent/src/workflow/mod.rs
+++ b/harmony_agent/src/workflow/mod.rs
@@ -24,7 +24,7 @@ pub trait HeartbeatWorkflow: Send + Sync {
 
     async fn on_startup(
         &self,
-        cluster_state: Option<&crate::agent::ClusterStateData>,
+        cluster_state: Option<&crate::agent::heartbeat::ClusterStateData>,
         agent_config: &AgentConfig,
     );
 
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index 890068be..80242cc6 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -80,7 +80,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
         if let Some(state) = cluster_state {
             info!(
                 "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
-                state.current_primary, state.desired_primary
+                state.cluster_info.current_primary, state.cluster_info.desired_primary
             );
 
             let key = format!("heartbeat.{}", agent_config.agent_id.clone());
@@ -115,10 +115,11 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
                         config.on_active().await;
                     });
                     if let Some(state) = cluster_state
-                        && state.desired_primary == agent_config.desired_primary_id
+                        && state.cluster_info.desired_primary == agent_config.desired_primary_id
                     {
                         let mut new_state = state.clone();
-                        new_state.current_primary = Some(agent_config.agent_id.clone());
+                        new_state.cluster_info.current_primary =
+                            Some(agent_config.agent_id.clone());
                         return Some(new_state);
                     } else {
                         todo!(
@@ -210,9 +211,8 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
 
 #[cfg(test)]
 mod test {
-    use std::time::Duration;
-
     use harmony_types::id::Id;
+    use std::time::Duration;
 
     use crate::agent::{AgentRole, FailoverCNPGConfig};
 
@@ -237,7 +237,7 @@ mod test {
         let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
 
         let mut expected_state = cluster_state.clone();
-        expected_state.current_primary = Some(Id::empty());
+        expected_state.cluster_info.current_primary = Some(Id::empty());
 
         assert_eq!(
             primary
@@ -297,10 +297,13 @@ mod test {
         failure_threshold: usize,
     ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
         let cluster_state = crate::agent::ClusterStateData {
-            cluster_id: Id::empty(),
-            current_primary: None,
-            desired_primary: Id::empty(),
-            timestamp: 0,
+            cluster_info: crate::agent::heartbeat::ClusterInfo {
+                cluster_id: Id::empty(),
+                current_primary: None,
+                desired_primary: Id::empty(),
+                timestamp: 0,
+            },
+            metadata: None,
         };
 
         let agent_config = AgentConfig {
-- 
2.39.5


From de14ba6b97de34188a1cfc00699ad43ef5ec348a Mon Sep 17 00:00:00 2001
From: Jean-Gabriel Gill-Couture <jg@nationtech.io>
Date: Wed, 4 Feb 2026 12:10:33 -0500
Subject: [PATCH 18/19] fix(agent): fetch from store returns metadata to allow
 rebuilding states properly

---
 harmony_agent/src/agent/heartbeat.rs  |  5 ++---
 harmony_agent/src/agent/mod.rs        | 19 +++++++++----------
 harmony_agent/src/workflow/primary.rs |  4 ++--
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs
index ab5697b1..5e9fc36f 100644
--- a/harmony_agent/src/agent/heartbeat.rs
+++ b/harmony_agent/src/agent/heartbeat.rs
@@ -20,16 +20,15 @@ pub struct AgentHeartbeat {
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ClusterStateData {
-    pub cluster_info: ClusterInfo,
+    pub cluster_info: ClusterState,
     pub metadata: Option<KvMetadata>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
-pub struct ClusterInfo {
+pub struct ClusterState {
     pub cluster_id: Id,
     pub current_primary: Option<Id>,
     pub desired_primary: Id,
-    pub timestamp: u64,
 }
 
 #[derive(Debug)]
diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs
index 480b7713..ba953a09 100644
--- a/harmony_agent/src/agent/mod.rs
+++ b/harmony_agent/src/agent/mod.rs
@@ -6,7 +6,7 @@ use log::{debug, error, info, trace, warn};
 use tokio::sync::RwLock;
 use tokio::time::{Instant, sleep};
 
-use crate::agent::heartbeat::ClusterInfo;
+use crate::agent::heartbeat::ClusterState;
 use crate::store::{KvMetadata, KvStore, KvStoreError};
 use crate::workflow::HeartbeatWorkflow;
 use crate::workflow::primary::PrimaryWorkflow;
@@ -132,7 +132,7 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         &self,
         store: &Arc<S>,
         key: &str,
-    ) -> Result<Option<D>, KvStoreError>
+    ) -> Result<Option<(D, KvMetadata)>, KvStoreError>
     where
         D: serde::de::DeserializeOwned,
     {
@@ -145,7 +145,7 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             Ok(kv_result) => {
                 if let Some(value) = kv_result.value {
                     match serde_json::from_value::<D>(value.clone()) {
-                        Ok(data) => Ok(Some(data)),
+                        Ok(data) => Ok(Some((data, kv_result.metadata))),
                         Err(e) => {
                             log::warn!("Failed to deserialize data from key {}: {}", key, e);
                             Err(KvStoreError::DeserializationFailed {
@@ -185,10 +185,13 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
         );
 
         let cluster_state_option = match self
-            .fetch_from_store::<ClusterStateData>(&self.cluster_kv, &cluster_key)
+            .fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
             .await?
         {
-            Some(data) => Some(data),
+            Some((data, metadata)) => Some(ClusterStateData {
+                cluster_info: data,
+                metadata: Some(metadata),
+            }),
             None => {
                 debug!(
                     "Cluster state key not found, this is a fresh cluster, initializing cluster state"
@@ -287,14 +290,10 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
                 Ok(cluster_data)
             }
             None => {
-                let cluster_info = ClusterInfo {
+                let cluster_info = ClusterState {
                     cluster_id: self.config.cluster_id.clone(),
                     current_primary: None,
                     desired_primary: self.config.desired_primary_id.clone(),
-                    timestamp: SystemTime::now()
-                        .duration_since(UNIX_EPOCH)
-                        .expect("Time went backwards")
-                        .as_millis() as u64,
                 };
 
                 let value = serde_json::to_value(&cluster_info).map_err(|e| {
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index 80242cc6..87ab3391 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -2,7 +2,7 @@ use async_trait::async_trait;
 use log::{debug, info, trace, warn};
 
 use crate::{
-    agent::{AgentConfig, ClusterStateData, DeploymentConfig},
+    agent::{AgentConfig, DeploymentConfig},
     workflow::HeartbeatWorkflow,
 };
 
@@ -297,7 +297,7 @@ mod test {
         failure_threshold: usize,
     ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
         let cluster_state = crate::agent::ClusterStateData {
-            cluster_info: crate::agent::heartbeat::ClusterInfo {
+            cluster_info: crate::agent::heartbeat::ClusterState {
                 cluster_id: Id::empty(),
                 current_primary: None,
                 desired_primary: Id::empty(),
-- 
2.39.5


From 7ca1a64038582d2766f571836eedb8acf21c9d8c Mon Sep 17 00:00:00 2001
From: wjro <wrolleman@nationtech.io>
Date: Wed, 4 Feb 2026 15:56:40 -0500
Subject: [PATCH 19/19] feat: completed harmony_agent implentation for primary
 and replica agents, fixed a test

---
 Cargo.lock                                    | 21 +------------------
 .../src/modules/application/backend_app.rs    | 19 ++++++++---------
 harmony/src/modules/application/config.rs     |  1 -
 harmony/src/modules/application/mod.rs        |  2 +-
 harmony/src/modules/application/oci.rs        | 10 ++++-----
 harmony/src/modules/application/rust.rs       |  1 -
 harmony_agent/src/agent/config.rs             |  2 +-
 harmony_agent/src/agent/mod.rs                | 19 ++++++++++++-----
 harmony_agent/src/main.rs                     |  5 ++++-
 harmony_agent/src/store/chaos.rs              |  2 +-
 harmony_agent/src/store/memory.rs             |  4 +++-
 harmony_agent/src/workflow/primary.rs         |  7 ++-----
 harmony_agent/src/workflow/replica.rs         |  8 ++++---
 harmony_execution/src/lib.rs                  |  3 +--
 14 files changed, 47 insertions(+), 57 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 14295673..2816bba2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2657,6 +2657,7 @@ dependencies = [
  "harmony_macros",
  "harmony_types",
  "log",
+ "pretty_assertions",
  "serde",
  "serde_json",
  "thiserror 2.0.16",
@@ -3586,26 +3587,6 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
-[[package]]
-name = "json-prompt"
-version = "0.1.0"
-dependencies = [
- "brocade",
- "cidr",
- "env_logger",
- "harmony",
- "harmony_cli",
- "harmony_macros",
- "harmony_secret",
- "harmony_secret_derive",
- "harmony_types",
- "log",
- "schemars 0.8.22",
- "serde",
- "tokio",
- "url",
-]
-
 [[package]]
 name = "jsonpath-rust"
 version = "0.7.5"
diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs
index 804af46d..d11feaa9 100644
--- a/harmony/src/modules/application/backend_app.rs
+++ b/harmony/src/modules/application/backend_app.rs
@@ -11,7 +11,7 @@ use crate::{
         helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
     },
 };
-use harmony_execution::{run_command, RunnerOptions};
+use harmony_execution::{RunnerOptions, run_command};
 
 #[derive(Debug, Clone, Serialize)]
 pub struct BuildCommand {
@@ -100,15 +100,14 @@ impl OCICompliant for BackendApp {
 
         // Run docker build command, streaming output to console and capturing it
         let output = run_command(
-            std::process::Command::new("docker")
-                .args([
-                    "build",
-                    "-t",
-                    &image_tag,
-                    "-f",
-                    &dockerfile.to_string_lossy(),
-                    &self.project_root.to_string_lossy(),
-                ]),
+            std::process::Command::new("docker").args([
+                "build",
+                "-t",
+                &image_tag,
+                "-f",
+                &dockerfile.to_string_lossy(),
+                &self.project_root.to_string_lossy(),
+            ]),
             RunnerOptions::print_to_console(),
         )
         .map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs
index 8d074271..9c529f1d 100644
--- a/harmony/src/modules/application/config.rs
+++ b/harmony/src/modules/application/config.rs
@@ -15,7 +15,6 @@ impl NetworkProtocol {
     }
 }
 
-
 impl std::fmt::Display for NetworkProtocol {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.write_str(self.as_str())
diff --git a/harmony/src/modules/application/mod.rs b/harmony/src/modules/application/mod.rs
index 00e85843..13da0840 100644
--- a/harmony/src/modules/application/mod.rs
+++ b/harmony/src/modules/application/mod.rs
@@ -2,10 +2,10 @@ pub mod backend_app;
 pub mod config;
 mod feature;
 pub mod features;
+pub mod helm;
 pub mod oci;
 mod rust;
 mod webapp;
-pub mod helm;
 use std::sync::Arc;
 
 pub use feature::*;
diff --git a/harmony/src/modules/application/oci.rs b/harmony/src/modules/application/oci.rs
index 102bcd8c..63e1c208 100644
--- a/harmony/src/modules/application/oci.rs
+++ b/harmony/src/modules/application/oci.rs
@@ -1,6 +1,9 @@
 use std::path::{Path, PathBuf};
 
-use crate::{config::{REGISTRY_PROJECT, REGISTRY_URL}, modules::application::check_output};
+use crate::{
+    config::{REGISTRY_PROJECT, REGISTRY_URL},
+    modules::application::check_output,
+};
 
 use super::Application;
 use async_trait::async_trait;
@@ -22,10 +25,7 @@ pub trait HelmPackage: Application {
     /// # Arguments
     /// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
     /// * `domain` - The domain where the application is hosted.
-    async fn build_push_helm_package(
-        &self,
-        image_url: &str,
-    ) -> Result<String, String>;
+    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
 
     fn project_root(&self) -> PathBuf;
 
diff --git a/harmony/src/modules/application/rust.rs b/harmony/src/modules/application/rust.rs
index 4e41187c..7e3413bb 100644
--- a/harmony/src/modules/application/rust.rs
+++ b/harmony/src/modules/application/rust.rs
@@ -632,7 +632,6 @@ spec:
         Ok(chart_dir)
     }
 
-
     fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
         let existing_dockerfile = self.project_root.join("Dockerfile");
 
diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs
index 0862da67..86b731cf 100644
--- a/harmony_agent/src/agent/config.rs
+++ b/harmony_agent/src/agent/config.rs
@@ -3,8 +3,8 @@ use std::time::Duration;
 use harmony_types::id::Id;
 use log::info;
 
-use super::role::AgentRole;
 use super::heartbeat::HeartbeatFailure;
+use super::role::AgentRole;
 
 #[derive(Debug, Clone)]
 pub struct AgentConfig {
diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs
index ba953a09..3291aeaa 100644
--- a/harmony_agent/src/agent/mod.rs
+++ b/harmony_agent/src/agent/mod.rs
@@ -207,7 +207,6 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
 
         // Cache the cluster state locally
         *self.cluster_state.write().await = cluster_state_option;
-
         // Fetch last heartbeat if it exists to avoid sequence conflicts
         let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
         debug!("Fetching last heartbeat from key: {}", heartbeat_key);
@@ -258,7 +257,7 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
             Some(cluster_data) => {
                 debug!("found some cluster state {:#?}", cluster_data);
 
-                let value = serde_json::to_value(&cluster_data).map_err(|e| {
+                let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
                     KvStoreError::DeserializationFailed {
                         deserialization_error: e.to_string(),
                         value: format!("{:?}", cluster_data),
@@ -280,6 +279,7 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
                     .await?;
 
                 let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
+                debug!("cluster kv {:#?}", cluster_kv_result);
 
                 let cluster_data_new = ClusterStateData {
                     cluster_info: cluster_data.cluster_info.clone(),
@@ -308,9 +308,18 @@ impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
                     metadata: None,
                 };
 
-                self.cluster_kv.set_strict(&key, value, 0).await?;
-                *self.cluster_state.write().await = Some(cluster_data.clone());
-                Ok(cluster_data)
+                let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
+
+                let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
+                debug!("cluster kv {:#?}", cluster_kv_result);
+
+                let cluster_data_new = ClusterStateData {
+                    cluster_info: cluster_data.cluster_info.clone(),
+                    metadata: Some(cluster_kv_result.metadata),
+                };
+
+                *self.cluster_state.write().await = Some(cluster_data_new.clone());
+                Ok(cluster_data_new)
             }
         }
     }
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
index 8eda5d1d..a5947c22 100644
--- a/harmony_agent/src/main.rs
+++ b/harmony_agent/src/main.rs
@@ -1,6 +1,9 @@
 use std::{sync::Arc, time::Duration};
 
-use crate::{agent::AgentRole, store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}};
+use crate::{
+    agent::AgentRole,
+    store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
+};
 
 // mod agent_loop;
 mod agent;
diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs
index 9fa6fc83..402cf8f7 100644
--- a/harmony_agent/src/store/chaos.rs
+++ b/harmony_agent/src/store/chaos.rs
@@ -113,7 +113,7 @@ mod tests {
     #[tokio::test]
     async fn test_chaos_store_with_no_chaos() {
         let inner = InMemoryKvStore::new();
-        let chaos = ChaosKvStore::new(inner, 0, 0, 0);
+        let chaos = ChaosKvStore::new(inner, 0, 0, 1);
 
         let value = json!({"test": "value"});
         let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs
index 150b7225..12afc51c 100644
--- a/harmony_agent/src/store/memory.rs
+++ b/harmony_agent/src/store/memory.rs
@@ -91,6 +91,8 @@ impl KvStore for InMemoryKvStore {
     ) -> Result<u64, KvStoreError> {
         // Check current sequence (length of history for this key)
         let data = self.data.read().await;
+        // This implemenetation does not seem to match the NATS sequence. In nats the
+        // sequence updates one counter per bucket. This impl creates a counter per key
         let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
         drop(data);
 
@@ -163,7 +165,7 @@ mod tests {
 
         let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap();
 
-        let seq2 = store.set_strict("key2", json!("value2"), 0).await.unwrap();
+        let seq2 = store.set_strict("key1", json!("value2"), 1).await.unwrap();
 
         assert!(seq2 > seq1, "Sequence numbers should increment");
     }
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
index 87ab3391..61f25556 100644
--- a/harmony_agent/src/workflow/primary.rs
+++ b/harmony_agent/src/workflow/primary.rs
@@ -83,9 +83,6 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
                 state.cluster_info.current_primary, state.cluster_info.desired_primary
             );
 
-            let key = format!("heartbeat.{}", agent_config.agent_id.clone());
-            // let hb = health_kv.get(&key);
-
             // No automatic fast-tracking - agent must earn healthy status
             // through successful heartbeats. This prevents duplicate agents
             // or crashloop agents from incorrectly claiming primary.
@@ -117,6 +114,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
                     if let Some(state) = cluster_state
                         && state.cluster_info.desired_primary == agent_config.desired_primary_id
                     {
+                        debug!("state {:#?}", state);
                         let mut new_state = state.clone();
                         new_state.cluster_info.current_primary =
                             Some(agent_config.agent_id.clone());
@@ -142,7 +140,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow {
             PrimaryState::Healthy => {
                 // Stay healthy
                 debug!("Primary staying healthy");
-                todo!()
+                None
             }
             PrimaryState::Fenced => {
                 // Recovery from fenced state
@@ -301,7 +299,6 @@ mod test {
                 cluster_id: Id::empty(),
                 current_primary: None,
                 desired_primary: Id::empty(),
-                timestamp: 0,
             },
             metadata: None,
         };
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
index 90ddd341..5c86bde7 100644
--- a/harmony_agent/src/workflow/replica.rs
+++ b/harmony_agent/src/workflow/replica.rs
@@ -1,6 +1,6 @@
 use async_trait::async_trait;
 use harmony_types::id::Id;
-use log::{info, trace, warn};
+use log::{debug, error, info, trace, warn};
 use std::time::Duration;
 use tokio::sync::RwLock;
 
@@ -184,15 +184,17 @@ impl HeartbeatWorkflow for ReplicaWorkflow {
                 // TODO: Check primary staleness from NATS
                 trace!("Replica watching primary");
                 if self.is_primary_stale().await {
-                    warn!("Found stale primary, launching promotion");
+                    panic!("Found stale primary, launching promotion");
                 }
-                todo!("perform the replica watch actions :
+                debug!("perform the replica watch actions :
                 - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
                     - check the last primary heartbeat kv timestamp
                     - compare it with our latest kv heartbeat
                     - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
                     - launching promotion will change the status of the replica
                     ");
+
+                None
             }
             ReplicaState::Promoting => {
                 // TODO: Complete promotion attempt
diff --git a/harmony_execution/src/lib.rs b/harmony_execution/src/lib.rs
index 65fdf663..c96cddfe 100644
--- a/harmony_execution/src/lib.rs
+++ b/harmony_execution/src/lib.rs
@@ -1,6 +1,5 @@
 pub mod command;
 
 pub use command::{
-    run_command, run, run_silent,
-    CommandOutput, CommandStatus, CommandError, RunnerOptions,
+    CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
 };
-- 
2.39.5