From 3682a0cb5f0c4457e51fceea47a7092aaaed6e2c Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 23 Jan 2026 08:54:40 -0500 Subject: [PATCH 01/19] feat: First draft of harmony_agent project that will synchronize multiple clusters using nats supercluster to communicate --- .gitignore | 2 + harmony_agent/.dockerignore | 4 + harmony_agent/Cargo.toml | 22 +++++ harmony_agent/Dockerfile | 44 +++++++++ harmony_agent/deploy/Cargo.toml | 20 ++++ harmony_agent/deploy/src/main.rs | 55 +++++++++++ harmony_agent/src/agent.rs | 165 +++++++++++++++++++++++++++++++ harmony_agent/src/config.rs | 36 +++++++ harmony_agent/src/main.rs | 24 +++++ 9 files changed, 372 insertions(+) create mode 100644 harmony_agent/.dockerignore create mode 100644 harmony_agent/Cargo.toml create mode 100644 harmony_agent/Dockerfile create mode 100644 harmony_agent/deploy/Cargo.toml create mode 100644 harmony_agent/deploy/src/main.rs create mode 100644 harmony_agent/src/agent.rs create mode 100644 harmony_agent/src/config.rs create mode 100644 harmony_agent/src/main.rs diff --git a/.gitignore b/.gitignore index 3850d09a..3bb0cc1b 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ Cargo.lock # MSVC Windows builds of rustc generate these, which store debugging information *.pdb + +.harmony_generated diff --git a/harmony_agent/.dockerignore b/harmony_agent/.dockerignore new file mode 100644 index 00000000..dd9b5319 --- /dev/null +++ b/harmony_agent/.dockerignore @@ -0,0 +1,4 @@ +.git +data +target +demos diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml new file mode 100644 index 00000000..360e26e4 --- /dev/null +++ b/harmony_agent/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "harmony_agent" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +harmony = { path = "../harmony" } +# harmony_cli = { path = "../harmony_cli" } +harmony_types = { path = "../harmony_types" } +harmony_macros = { path = "../harmony_macros" } +cidr = { workspace = true } +tokio = { workspace = true } +log = { workspace = true } +env_logger = { workspace = true } +async-nats = "0.45.0" +async-trait = "0.1" +# url = { workspace = true } + +serde.workspace = true +serde_json.workspace = true diff --git a/harmony_agent/Dockerfile b/harmony_agent/Dockerfile new file mode 100644 index 00000000..9d72462a --- /dev/null +++ b/harmony_agent/Dockerfile @@ -0,0 +1,44 @@ +# Build stage +FROM rust:slim AS builder + +# Install build dependencies +RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy all required packages +COPY . . + +RUN ls -la1 + +# Build the application in release mode +RUN cargo build --release -p harmony_agent + +# Runtime stage +FROM debian:bookworm-slim + +# Install runtime dependencies +RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy the binary from the builder stage +COPY --from=builder /app/target/release/harmony_agent ./harmony_agent + +# Declare environment variables used by the Harmony Agent +# These will be set from build-time environment variables if present +# NATS_URL: URL of the NATS server (default: nats://localhost:4222) +ARG NATS_URL=nats://localhost:4222 +ENV NATS_URL=${NATS_URL} +# NATS_CREDS_PATH: Optional path to NATS credentials file +ARG NATS_CREDS_PATH +ENV NATS_CREDS_PATH=${NATS_CREDS_PATH} +# MY_CLUSTER_ID: This cluster's unique identifier (required) +ARG MY_CLUSTER_ID +ENV MY_CLUSTER_ID=${MY_CLUSTER_ID} +# DESIRED_PRIMARY: The ID of the desired primary cluster (required) +ARG DESIRED_PRIMARY +ENV DESIRED_PRIMARY=${DESIRED_PRIMARY} + +# Run the application +ENTRYPOINT ["./harmony_agent"] diff --git a/harmony_agent/deploy/Cargo.toml b/harmony_agent/deploy/Cargo.toml new file mode 100644 index 00000000..9aea1e4b --- /dev/null +++ b/harmony_agent/deploy/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "harmony_agent_deploy" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +harmony_macros = { path = "../../harmony_macros" } +cidr = { workspace = true } +tokio = { workspace = true } +log = { workspace = true } +env_logger = { workspace = true } +url = { workspace = true } + +serde.workspace = true +serde_json.workspace = true diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs new file mode 100644 index 00000000..82fdd15a --- /dev/null +++ b/harmony_agent/deploy/src/main.rs @@ -0,0 +1,55 @@ +use harmony::{ + inventory::Inventory, + modules::{ + application::{ + ApplicationScore, + backend_app::{BackendApp, BuildCommand}, + features::{Monitoring, PackagingDeployment}, + }, + monitoring::alert_channel::discord_alert_channel::DiscordWebhook, + }, + topology::K8sAnywhereTopology, +}; +use harmony_macros::hurl; +use harmony_types::k8s_name::K8sName; +use std::{path::PathBuf, sync::Arc}; + +#[tokio::main] +async fn main() { + let application = Arc::new(BackendApp { + name: "harmony-agent".to_string(), + // This means the script will be run from the harmony_agent directory, not from the + // deploy directory + project_root: PathBuf::from("../"), + network_ports: vec![], + env_vars: vec![], + build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]), + dockerfile: Some(PathBuf::from("Dockerfile")), + }); + + let app = ApplicationScore { + features: vec![ + Box::new(PackagingDeployment { + application: application.clone(), + }), + Box::new(Monitoring { + application: application.clone(), + alert_receiver: vec![Box::new(DiscordWebhook { + name: K8sName("test-discord".to_string()), + url: hurl!("https://discord.doesnt.exist.com"), + selectors: vec![], + })], + }), + ], + application, + }; + + harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster + vec![Box::new(app)], + None, + ) + .await + .unwrap(); +} diff --git a/harmony_agent/src/agent.rs b/harmony_agent/src/agent.rs new file mode 100644 index 00000000..eafc83e2 --- /dev/null +++ b/harmony_agent/src/agent.rs @@ -0,0 +1,165 @@ +use async_trait::async_trait; +use log::{debug, error, info}; +use serde::{Deserialize, Serialize}; +use std::time::{SystemTime, UNIX_EPOCH}; +use harmony_types::id::Id; +use async_nats::jetstream::kv::Store; + +use crate::config::AgentConfig; + +#[async_trait] +pub trait HealthStore: Send + Sync { + async fn put(&self, key: String, value: Vec) -> Result>; +} + +#[async_trait] +impl HealthStore for Store { + async fn put(&self, key: String, value: Vec) -> Result> { + self.put(key, value.into()) + .await + .map_err(|e| Box::new(e) as Box) + } +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentHeartbeat { + pub cluster_id: Id, + pub status: String, + pub timestamp: u64, +} + +pub struct HarmonyAgent { + config: AgentConfig, + #[allow(dead_code)] + nats_client: Option, + health_kv: Box, +} + + +impl HarmonyAgent { + pub async fn new(config: AgentConfig) -> Result> { + let mut options = async_nats::ConnectOptions::new(); + if let Some(ref creds) = config.nats_creds_path { + options = options.credentials_file(creds).await?; + } + + let client = async_nats::connect_with_options(&config.nats_url, options).await?; + let jetstream = async_nats::jetstream::new(client.clone()); + + // Initialize KV Buckets as per ADR-017 + const HEARTBEAT_KV_HISTORY_SIZE: i64 = 64; + let health_kv = jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: "harmony_agent_health".to_string(), + history: HEARTBEAT_KV_HISTORY_SIZE, + ..Default::default() + }) + .await?; + + Ok(Self { + config, + nats_client: Some(client), + health_kv: Box::new(health_kv), + }) + } + + + pub async fn run_heartbeat_loop(&self) -> Result<(), Box> { + let mut interval = tokio::time::interval(self.config.heartbeat_interval); + let key = format!("heartbeat.{}", self.config.my_cluster_id); + + info!("Starting heartbeat loop for cluster: {}", self.config.my_cluster_id); + + loop { + interval.tick().await; + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH)? + .as_millis() as u64; + + let heartbeat = AgentHeartbeat { + cluster_id: self.config.my_cluster_id.clone(), + status: "HEALTHY".to_string(), + timestamp: now, + }; + + debug!("Sending heartbeat for cluster: {}", self.config.my_cluster_id); + let payload = serde_json::to_vec(&heartbeat)?; + + // Write heartbeat to KV. ADR-017: Write failure triggers self-demotion logic + match self.health_kv.put(key.clone(), payload).await { + Ok(_) => { + debug!("Heartbeat successful for cluster: {}", self.config.my_cluster_id); + } + Err(e) => { + error!("Failed to write heartbeat: {}. Fencing logic would trigger here.", e); + // In a real implementation, we would trigger self-demotion/fencing here + } + } + + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{Arc, Mutex}; + use tokio::time::{pause, advance, Duration}; + + struct MockHealthStore { + puts: Arc)>>>, + } + + #[async_trait] + impl HealthStore for MockHealthStore { + async fn put(&self, key: String, value: Vec) -> Result> { + self.puts.lock().unwrap().push((key, value)); + Ok(0) + } + } + + #[tokio::test(start_paused = true)] + async fn test_heartbeat_loop() { + let config = AgentConfig { + nats_url: "nats://localhost:4222".to_string(), + nats_creds_path: None, + my_cluster_id: "test-cluster".into(), + desired_primary: "test-cluster".into(), + heartbeat_interval: Duration::from_millis(100), + }; + + let puts = Arc::new(Mutex::new(Vec::new())); + let mock_store = MockHealthStore { puts: puts.clone() }; + + let agent = HarmonyAgent { + config, + nats_client: None, + health_kv: Box::new(mock_store), + }; + + // Run the loop in a separate task + let handle = tokio::spawn(async move { + let _ = agent.run_heartbeat_loop().await; + }); + + // Advance time in increments to trigger multiple heartbeats + for _ in 0..3 { + advance(Duration::from_millis(100)).await; + tokio::time::sleep(Duration::from_millis(1)).await; + } + + let recorded_puts = puts.lock().unwrap(); + assert!(recorded_puts.len() >= 2, "Should have recorded at least 2 heartbeats, got {}", recorded_puts.len()); + + let (key, payload) = &recorded_puts[0]; + assert_eq!(key, "heartbeat.test-cluster"); + + let heartbeat: AgentHeartbeat = serde_json::from_slice(payload).unwrap(); + assert_eq!(heartbeat.cluster_id.to_string(), "test-cluster"); + assert_eq!(heartbeat.status, "HEALTHY"); + + handle.abort(); + } +} + diff --git a/harmony_agent/src/config.rs b/harmony_agent/src/config.rs new file mode 100644 index 00000000..cf5fe128 --- /dev/null +++ b/harmony_agent/src/config.rs @@ -0,0 +1,36 @@ +use std::env; +use std::time::Duration; +use harmony_types::id::Id; + +/// Configuration for the Harmony Agent +#[derive(Debug, Clone)] +pub struct AgentConfig { + pub nats_url: String, + pub nats_creds_path: Option, + pub my_cluster_id: Id, + pub desired_primary: Id, + pub heartbeat_interval: Duration, +} + +impl AgentConfig { + pub fn load_from_env() -> Result { + let nats_url = env::var("NATS_URL") + .unwrap_or_else(|_| "nats://localhost:4222".to_string()); + + let nats_creds_path = env::var("NATS_CREDS_PATH").ok(); + + let my_cluster_id_str = env::var("MY_CLUSTER_ID") + .map_err(|_| "Environment variable MY_CLUSTER_ID is required".to_string())?; + + let desired_primary_str = env::var("DESIRED_PRIMARY") + .map_err(|_| "Environment variable DESIRED_PRIMARY is required".to_string())?; + + Ok(Self { + nats_url, + nats_creds_path, + my_cluster_id: my_cluster_id_str.into(), + desired_primary: desired_primary_str.into(), + heartbeat_interval: Duration::from_millis(1000), + }) + } +} diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs new file mode 100644 index 00000000..a67e5b99 --- /dev/null +++ b/harmony_agent/src/main.rs @@ -0,0 +1,24 @@ +use crate::{agent::HarmonyAgent, config::AgentConfig}; + +mod agent; +mod config; + + +#[tokio::main] +async fn main() -> Result<(), Box> { + env_logger::init(); + + let config = AgentConfig::load_from_env()?; + + log::info!("Harmony Agent Initialized"); + log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id); + log::debug!("NATS URL : {}", config.nats_url); + + let agent = HarmonyAgent::new(config).await?; + + // Run the heartbeat loop + agent.run_heartbeat_loop().await?; + + Ok(()) +} + -- 2.39.5 From b2f07737957d66ca10e4d4ea850173eb1db07700 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 23 Jan 2026 09:34:58 -0500 Subject: [PATCH 02/19] wip: Working on backend app deployment --- .dockerignore | 6 +- examples/openbao/src/main.rs | 2 + .../src/modules/application/backend_app.rs | 246 ++++++++++++++++++ harmony/src/modules/application/config.rs | 14 + .../features/packaging_deployment.rs | 24 +- harmony/src/modules/application/helm/mod.rs | 119 +++++++++ harmony/src/modules/application/mod.rs | 15 ++ harmony/src/modules/application/oci.rs | 75 +++++- harmony/src/modules/application/rust.rs | 94 +------ harmony/templates/helm/Chart.yaml.j2 | 6 + harmony/templates/helm/deployment.yaml.j2 | 37 +++ harmony/templates/helm/helpers.yaml.j2 | 8 + harmony/templates/helm/service.yaml.j2 | 15 ++ harmony_types/src/id.rs | 8 + 14 files changed, 566 insertions(+), 103 deletions(-) create mode 100644 harmony/src/modules/application/backend_app.rs create mode 100644 harmony/src/modules/application/config.rs create mode 100644 harmony/src/modules/application/helm/mod.rs create mode 100644 harmony/templates/helm/Chart.yaml.j2 create mode 100644 harmony/templates/helm/deployment.yaml.j2 create mode 100644 harmony/templates/helm/helpers.yaml.j2 create mode 100644 harmony/templates/helm/service.yaml.j2 diff --git a/.dockerignore b/.dockerignore index 2233067c..34513768 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,6 @@ target/ -Dockerfile \ No newline at end of file +Dockerfile +.git +data +target +demos diff --git a/examples/openbao/src/main.rs b/examples/openbao/src/main.rs index 63918b81..ab8c0efa 100644 --- a/examples/openbao/src/main.rs +++ b/examples/openbao/src/main.rs @@ -56,6 +56,8 @@ async fn main() { )), }; + // TODO exec pod commands to initialize secret store if not already done + harmony_cli::run( Inventory::autoload(), K8sAnywhereTopology::from_env(), diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs new file mode 100644 index 00000000..1e3dbe78 --- /dev/null +++ b/harmony/src/modules/application/backend_app.rs @@ -0,0 +1,246 @@ +use std::{ffi::OsStr, path::PathBuf}; + +use async_trait::async_trait; +use log::{debug, info, trace}; +use serde::Serialize; + +use crate::{ + config::{REGISTRY_PROJECT, REGISTRY_URL}, + modules::application::{ + Application, HelmPackage, OCICompliant, + config::ApplicationNetworkPort, + helm::{DeploymentTemplate, HelmChart, HelmTemplate, ServiceTemplate}, + webapp::Webapp, + }, +}; + +#[derive(Debug, Clone, Serialize)] +pub struct BuildCommand { + pub program: String, + pub args: Vec, +} + +impl BuildCommand { + pub fn new(program: impl Into, args: Vec>) -> Self { + Self { + program: program.into(), + args: args.into_iter().map(|s| s.into()).collect(), + } + } + + pub fn to_std_command(&self) -> std::process::Command { + let mut cmd = std::process::Command::new(&self.program); + cmd.args(&self.args); + cmd + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct BackendApp { + pub name: String, + pub project_root: std::path::PathBuf, + pub network_ports: Vec, + pub env_vars: Vec<(String, String)>, + pub build_cmd: BuildCommand, + pub dockerfile: Option, +} + +impl BackendApp { + fn get_dockerfile(&self) -> Result { + debug!( + "Looking for dockerfile, currently set to {:?}", + self.dockerfile + ); + if let Some(dockerfile) = &self.dockerfile { + return match dockerfile.exists() { + true => { + info!( + "Found dockerfile as intended at {}", + dockerfile.to_string_lossy() + ); + Ok(dockerfile.clone()) + } + false => Err(format!( + "Dockerfile explicitely set to {dockerfile} does not exist", + dockerfile = dockerfile.to_string_lossy() + )), + }; + } + + let existing_dockerfile = self.project_root.join("Dockerfile"); + + debug!("project_root = {:?}", self.project_root); + + debug!("checking = {:?}", existing_dockerfile); + if existing_dockerfile.exists() { + debug!( + "Checking path {:#?} for existing Dockerfile", + self.project_root.clone() + ); + return Ok(existing_dockerfile); + } + Err(format!( + "Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}", + project_root = self.project_root.to_string_lossy(), + existing_dockerfile = existing_dockerfile.to_string_lossy(), + )) + } +} + +impl Application for BackendApp { + fn name(&self) -> String { + self.name.clone() + } +} + +#[async_trait] +impl OCICompliant for BackendApp { + async fn build_push_oci_image(&self) -> Result { + let dockerfile = self.get_dockerfile()?; + let image_tag = self.image_name(); + + let mut child = std::process::Command::new("docker") + .args([ + "build", + "-t", + &image_tag, + "-f", + &dockerfile.to_string_lossy(), + &self.project_root.to_string_lossy() + ]) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .map_err(|e| + format!("Failed to spawn docker build process: {e}") + )?; + + let stdout = child.stdout.take().expect("Failed to capture stdout"); + let stderr = child.stderr.take().expect("Failed to capture stderr"); + + use std::io::{BufRead, BufReader}; + use std::thread; + + let stdout_reader = BufReader::new(stdout); + let stderr_reader = BufReader::new(stderr); + + let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel(); + let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel(); + + let stdout_handle = thread::spawn(move || { + let mut output = String::new(); + for line in stdout_reader.lines() { + match line { + Ok(l) => { + println!("{}", l); + output.push_str(&l); + output.push('\n'); + } + Err(e) => { + trace!("Error reading stdout line: {}", e); + } + } + } + let _ = stdout_sender.send(output); + }); + + let stderr_handle = thread::spawn(move || { + let mut output = String::new(); + for line in stderr_reader.lines() { + match line { + Ok(l) => { + eprintln!("{}", l); + output.push_str(&l); + output.push('\n'); + } + Err(e) => { + trace!("Error reading stderr line: {}", e); + } + } + } + let _ = stderr_sender.send(output); + }); + + let status = child.wait().map_err(|e| + format!("Failed to wait for docker build process: {e}") + )?; + + let stdout_lines = stdout_handle.join().map_err(|e| format!("Stdout thread panicked: {e:?}")) + .and_then(|_| stdout_receiver.recv().map_err(|e| format!("Failed to receive stdout: {e}")))?; + let stderr_lines = stderr_handle.join().map_err(|e| format!("Stderr thread panicked: {e:?}")) + .and_then(|_| stderr_receiver.recv().map_err(|e| format!("Failed to receive stderr: {e}")))?; + + let output_content = format!( + "\n{stdout}\n\n{stderr}", + stdout = stdout_lines, + stderr = stderr_lines, + ); + match status.success() { + true => { + info!("Docker image build succeeded"); + Ok(image_tag) + } + false => Err(format!("Docker image build FAILED :{output_content}")), + } + } + + fn local_image_name(&self) -> String { + self.name.clone() + } + + fn image_name(&self) -> String { + format!( + "{}/{}/{}", + *REGISTRY_URL, + *REGISTRY_PROJECT, + &self.local_image_name() + ) + } +} + +#[async_trait] +impl HelmPackage for BackendApp { + fn project_root(&self) -> PathBuf { + self.project_root.clone() + } + + fn chart_name(&self) -> String { + self.name.clone() + } + + async fn build_push_helm_package(&self, image_url: &str) -> Result { + let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string()); + + // Extract the first network port for the container port (if available) + let container_port = self.network_ports.first().map(|p| p.number); + + // Create and add DeploymentTemplate with image URL and environment variables + let deployment = DeploymentTemplate { + name: self.name.clone(), + container_port, + env_vars: self.env_vars.clone(), + }; + helm_chart.add_template(Box::new(deployment)); + + // Create and add ServiceTemplate if a port is available + if let Some(port) = container_port { + let service = ServiceTemplate { port }; + helm_chart.add_template(Box::new(service)); + } + + // Add common Helm values + helm_chart.add_value("replicaCount", "1"); + helm_chart.add_value("image.repository", image_url); + helm_chart.add_value("image.pullPolicy", "IfNotPresent"); + helm_chart.add_value("service.type", "ClusterIP"); + + // Write the Helm chart to the project root + let chart_dir = helm_chart + .write_to(&self.project_root.join(".harmony_generated/helm/")) + .map_err(|e| format!("Failed to write Helm chart: {}", e))?; + + info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir); + + Ok(chart_dir.to_string_lossy().to_string()) + } +} diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs new file mode 100644 index 00000000..d35ab604 --- /dev/null +++ b/harmony/src/modules/application/config.rs @@ -0,0 +1,14 @@ +use serde::Serialize; + +#[derive(Debug, Clone, Serialize)] +pub enum NetworkProtocol { + TCP, + UDP, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ApplicationNetworkPort { + pub number: u16, + pub protocol: NetworkProtocol, + pub name: String, +} diff --git a/harmony/src/modules/application/features/packaging_deployment.rs b/harmony/src/modules/application/features/packaging_deployment.rs index 03f11000..2f107462 100644 --- a/harmony/src/modules/application/features/packaging_deployment.rs +++ b/harmony/src/modules/application/features/packaging_deployment.rs @@ -48,11 +48,11 @@ use crate::{ /// - ArgoCD to install/upgrade/rollback/inspect k8s resources /// - Kubernetes for runtime orchestration #[derive(Debug, Default, Clone)] -pub struct PackagingDeployment { +pub struct PackagingDeployment { pub application: Arc, } -impl PackagingDeployment { +impl PackagingDeployment { async fn deploy_to_local_k3d( &self, app_name: String, @@ -138,7 +138,7 @@ impl PackagingDeployment { #[async_trait] impl< - A: OCICompliant + HelmPackage + Webapp + Clone + 'static, + A: OCICompliant + HelmPackage + Clone + 'static, T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static, > ApplicationFeature for PackagingDeployment { @@ -148,24 +148,12 @@ impl< ) -> Result { let image = self.application.image_name(); - let domain = if topology.current_target() == DeploymentTarget::Production { - self.application.dns() - } else { - topology - .get_domain(&self.application.name()) - .await - .map_err(|e| e.to_string())? - }; - // TODO Write CI/CD workflow files // we can autotedect the CI type using the remote url (default to github action for github // url, etc..) // Or ask for it when unknown - let helm_chart = self - .application - .build_push_helm_package(&image, &domain) - .await?; + let helm_chart = self.application.build_push_helm_package(&image).await?; // TODO: Make building image configurable/skippable if image already exists (prompt)") // https://git.nationtech.io/NationTech/harmony/issues/104 @@ -215,12 +203,12 @@ impl< }; Ok(InstallationOutcome::success_with_details(vec![format!( - "{}: http://{domain}", + "{}", self.application.name() )])) } fn name(&self) -> String { - "ContinuousDelivery".to_string() + "PackagingDeployment".to_string() } } diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs new file mode 100644 index 00000000..da40b0b6 --- /dev/null +++ b/harmony/src/modules/application/helm/mod.rs @@ -0,0 +1,119 @@ +use askama::Template; +use std::fs; +use std::path::{Path, PathBuf}; + +/// Trait for any resource that can be rendered into a file in the Helm chart. +pub trait HelmTemplate: Send + Sync { + fn filename(&self) -> String; + fn render_template(&self) -> Result; +} + +/// The main orchestrator for building a Helm chart. +pub struct HelmChart { + pub name: String, + pub version: String, + pub app_version: String, + pub description: String, + pub templates: Vec>, + pub values: Vec, +} + +impl HelmChart { + pub fn new(name: String, app_version: String) -> Self { + Self { + name: name.clone(), + version: "0.1.0".to_string(), + app_version, + description: format!("A Helm chart for {}", name), + templates: Vec::new(), + values: Vec::new(), + } + } + + pub fn add_template(&mut self, template: Box) { + self.templates.push(template); + } + + pub fn add_value(&mut self, key: &str, value: &str) { + self.values.push(format!("{}: {}", key, value)); + } + + pub fn write_to(&self, base_path: &Path) -> Result> { + let chart_dir = base_path.join(&self.name); + let templates_dir = chart_dir.join("templates"); + fs::create_dir_all(&templates_dir)?; + + // 1. Render and write Chart.yaml + let chart_yaml = ChartYaml { + name: &self.name, + description: &self.description, + version: &self.version, + app_version: &self.app_version, + }; + fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?; + + // 2. Write values.yaml (Constructed dynamically) + let values_content = self.values.join("\n"); + fs::write(chart_dir.join("values.yaml"), values_content)?; + + // 3. Render and write _helpers.tpl + let helpers = HelpersTpl; + fs::write(templates_dir.join("_helpers.tpl"), helpers.render()?)?; + + // 4. Render and write all added templates (Deployment, Service, etc.) + for template in &self.templates { + let filename = template.filename(); + let content = template.render_template()?; + fs::write(templates_dir.join(filename), content)?; + } + + Ok(chart_dir) + } +} + +// --- Templates --- + +#[derive(Template)] +#[template(path = "helm/Chart.yaml.j2")] +struct ChartYaml<'a> { + name: &'a str, + description: &'a str, + version: &'a str, + app_version: &'a str, +} + +#[derive(Template)] +#[template(path = "helm/helpers.yaml.j2")] +struct HelpersTpl; + +#[derive(Template)] +#[template(path = "helm/deployment.yaml.j2")] +pub struct DeploymentTemplate { + pub name: String, + pub container_port: Option, + pub env_vars: Vec<(String, String)>, +} + +impl HelmTemplate for DeploymentTemplate { + fn filename(&self) -> String { + "deployment.yaml".to_string() + } + fn render_template(&self) -> Result { + self.render() + } +} + +#[derive(Template)] +#[template(path = "helm/service.yaml.j2")] +pub struct ServiceTemplate { + pub port: u16, // Used only to enforce logic if needed, though template uses Values +} + +impl HelmTemplate for ServiceTemplate { + fn filename(&self) -> String { + "service.yaml".to_string() + } + fn render_template(&self) -> Result { + self.render() + } +} diff --git a/harmony/src/modules/application/mod.rs b/harmony/src/modules/application/mod.rs index 03965e35..00e85843 100644 --- a/harmony/src/modules/application/mod.rs +++ b/harmony/src/modules/application/mod.rs @@ -1,8 +1,11 @@ +pub mod backend_app; +pub mod config; mod feature; pub mod features; pub mod oci; mod rust; mod webapp; +pub mod helm; use std::sync::Arc; pub use feature::*; @@ -124,3 +127,15 @@ impl Serialize for dyn Application { todo!() } } + +/// Checks the output of a process command for success. +fn check_output( + output: &std::process::Output, + msg: &str, +) -> Result<(), Box> { + if !output.status.success() { + let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr)); + return Err(error_message.into()); + } + Ok(()) +} diff --git a/harmony/src/modules/application/oci.rs b/harmony/src/modules/application/oci.rs index 8b1585c8..102bcd8c 100644 --- a/harmony/src/modules/application/oci.rs +++ b/harmony/src/modules/application/oci.rs @@ -1,5 +1,10 @@ +use std::path::{Path, PathBuf}; + +use crate::{config::{REGISTRY_PROJECT, REGISTRY_URL}, modules::application::check_output}; + use super::Application; use async_trait::async_trait; +use log::debug; #[async_trait] pub trait OCICompliant: Application { @@ -20,6 +25,74 @@ pub trait HelmPackage: Application { async fn build_push_helm_package( &self, image_url: &str, - domain: &str, ) -> Result; + + fn project_root(&self) -> PathBuf; + + fn chart_name(&self) -> String; + + /// Packages a Helm chart directory into a .tgz file. + fn package_helm_chart(&self, chart_dir: &Path) -> Result> { + let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname"); + debug!( + "Launching `helm package {}` cli with CWD {}", + chart_dirname.to_string_lossy(), + &self + .project_root() + .join(".harmony_generated") + .join("helm") + .to_string_lossy() + ); + let output = std::process::Command::new("helm") + .args(["package", chart_dirname.to_str().unwrap()]) + .current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir + .output()?; + + check_output(&output, "Failed to package Helm chart")?; + + // Helm prints the path of the created chart to stdout. + let tgz_name = String::from_utf8(output.stdout)? + .split_whitespace() + .last() + .unwrap_or_default() + .to_string(); + if tgz_name.is_empty() { + return Err("Could not determine packaged chart filename.".into()); + } + + // The output from helm is relative, so we join it with the execution directory. + Ok(self + .project_root() + .join(".harmony_generated") + .join("helm") + .join(tgz_name)) + } + + /// Pushes a packaged Helm chart to an OCI registry. + fn push_helm_chart( + &self, + packaged_chart_path: &Path, + ) -> Result> { + // The chart name is the file stem of the .tgz file + let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap(); + let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT); + let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name()); + debug!( + "Pushing Helm chart {} to {}", + packaged_chart_path.to_string_lossy(), + oci_push_url + ); + + let output = std::process::Command::new("helm") + .args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url]) + .output()?; + + check_output(&output, "Pushing Helm chart failed")?; + + // The final URL includes the version tag, which is part of the file name + let version = chart_file_name.rsplit_once('-').unwrap().1; + debug!("pull url {oci_pull_url}"); + debug!("push url {oci_push_url}"); + Ok(format!("{}:{}", oci_pull_url, version)) + } } diff --git a/harmony/src/modules/application/rust.rs b/harmony/src/modules/application/rust.rs index 8384e78a..4e41187c 100644 --- a/harmony/src/modules/application/rust.rs +++ b/harmony/src/modules/application/rust.rs @@ -81,16 +81,21 @@ impl Webapp for RustWebapp { #[async_trait] impl HelmPackage for RustWebapp { - async fn build_push_helm_package( - &self, - image_url: &str, - domain: &str, - ) -> Result { + fn project_root(&self) -> PathBuf { + self.project_root.clone() + } + + fn chart_name(&self) -> String { + self.name.clone() + } + + async fn build_push_helm_package(&self, image_url: &str) -> Result { + let domain = self.dns(); info!("Starting Helm chart build and push for '{}'", self.name); // 1. Create the Helm chart files on disk. let chart_dir = self - .create_helm_chart_files(image_url, domain) + .create_helm_chart_files(image_url, &domain) .await .map_err(|e| format!("Failed to create Helm chart files: {}", e))?; info!("Successfully created Helm chart files in {:?}", chart_dir); @@ -327,19 +332,6 @@ impl RustWebapp { Ok(image_tag.to_string()) } - /// Checks the output of a process command for success. - fn check_output( - &self, - output: &process::Output, - msg: &str, - ) -> Result<(), Box> { - if !output.status.success() { - let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr)); - return Err(error_message.into()); - } - Ok(()) - } - fn build_builder_image(&self, dockerfile: &mut Dockerfile) { match self.framework { Some(RustWebFramework::Leptos) => { @@ -640,70 +632,6 @@ spec: Ok(chart_dir) } - /// Packages a Helm chart directory into a .tgz file. - fn package_helm_chart(&self, chart_dir: &Path) -> Result> { - let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname"); - debug!( - "Launching `helm package {}` cli with CWD {}", - chart_dirname.to_string_lossy(), - &self - .project_root - .join(".harmony_generated") - .join("helm") - .to_string_lossy() - ); - let output = process::Command::new("helm") - .args(["package", chart_dirname.to_str().unwrap()]) - .current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir - .output()?; - - self.check_output(&output, "Failed to package Helm chart")?; - - // Helm prints the path of the created chart to stdout. - let tgz_name = String::from_utf8(output.stdout)? - .split_whitespace() - .last() - .unwrap_or_default() - .to_string(); - if tgz_name.is_empty() { - return Err("Could not determine packaged chart filename.".into()); - } - - // The output from helm is relative, so we join it with the execution directory. - Ok(self - .project_root - .join(".harmony_generated") - .join("helm") - .join(tgz_name)) - } - - /// Pushes a packaged Helm chart to an OCI registry. - fn push_helm_chart( - &self, - packaged_chart_path: &Path, - ) -> Result> { - // The chart name is the file stem of the .tgz file - let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap(); - let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT); - let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name); - debug!( - "Pushing Helm chart {} to {}", - packaged_chart_path.to_string_lossy(), - oci_push_url - ); - - let output = process::Command::new("helm") - .args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url]) - .output()?; - - self.check_output(&output, "Pushing Helm chart failed")?; - - // The final URL includes the version tag, which is part of the file name - let version = chart_file_name.rsplit_once('-').unwrap().1; - debug!("pull url {oci_pull_url}"); - debug!("push url {oci_push_url}"); - Ok(format!("{}:{}", oci_pull_url, version)) - } fn get_or_build_dockerfile(&self) -> Result> { let existing_dockerfile = self.project_root.join("Dockerfile"); diff --git a/harmony/templates/helm/Chart.yaml.j2 b/harmony/templates/helm/Chart.yaml.j2 new file mode 100644 index 00000000..bddcc93e --- /dev/null +++ b/harmony/templates/helm/Chart.yaml.j2 @@ -0,0 +1,6 @@ +apiVersion: v2 +name: {{ name }} +description: {{ description }} +type: application +version: {{ version }} +appVersion: "{{ app_version }}" diff --git a/harmony/templates/helm/deployment.yaml.j2 b/harmony/templates/helm/deployment.yaml.j2 new file mode 100644 index 00000000..b060b8f1 --- /dev/null +++ b/harmony/templates/helm/deployment.yaml.j2 @@ -0,0 +1,37 @@ +{% raw %} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "chart.fullname" . }} + labels: + app: {{ include "chart.name" . }} +spec: + replicas: {{ .Values.replicaCount | default 1 }} + selector: + matchLabels: + app: {{ include chart.name . }} + template: + metadata: + labels: + app: {{ include chart.name . }} + spec: + containers: +{% endraw %} + - name: {{ name }} +{% raw %} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent +{% endraw %} + {% if let Some(port) = container_port %} + ports: + - name: http + containerPort: {{ port }} + protocol: TCP + {% endif %} + {% if !env_vars.is_empty() %} + env: + {% for (k, v) in env_vars %} + - name: {{ k }} + value: {{ v }} + {% endfor %} + {% endif %} diff --git a/harmony/templates/helm/helpers.yaml.j2 b/harmony/templates/helm/helpers.yaml.j2 new file mode 100644 index 00000000..ff93848e --- /dev/null +++ b/harmony/templates/helm/helpers.yaml.j2 @@ -0,0 +1,8 @@ +{% raw %} +{{- define \"chart.fullname\" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix \"-\" }} +{{- end }} +{{- define "chart.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} +{% endraw %} diff --git a/harmony/templates/helm/service.yaml.j2 b/harmony/templates/helm/service.yaml.j2 new file mode 100644 index 00000000..c6582d22 --- /dev/null +++ b/harmony/templates/helm/service.yaml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ "{{ include \"chart.fullname\" . }}" }} + labels: + app: {{ "{{ include \"chart.name\" . }}" }} +spec: + type: ClusterIP + ports: + - port: {{ "{{ .Values.service.port }}" }} + targetPort: http + protocol: TCP + name: http + selector: + app: {{ "{{ include \"chart.name\" . }}" }} diff --git a/harmony_types/src/id.rs b/harmony_types/src/id.rs index 0a829068..748c1050 100644 --- a/harmony_types/src/id.rs +++ b/harmony_types/src/id.rs @@ -32,6 +32,14 @@ impl Id { } } +impl Into for &str { + fn into(self) -> Id { + Id { + value: self.to_string(), + } + } +} + impl FromStr for Id { type Err = (); -- 2.39.5 From c20db5b361193f5ab3ce7e0cc433d884c7e4be48 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 23 Jan 2026 11:49:32 -0500 Subject: [PATCH 03/19] doc(adr): New ADR Template hydration for strongly typed workload deployment --- ...plate-Hydration-For-Workload-Deployment.md | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 adr/018-Template-Hydration-For-Workload-Deployment.md diff --git a/adr/018-Template-Hydration-For-Workload-Deployment.md b/adr/018-Template-Hydration-For-Workload-Deployment.md new file mode 100644 index 00000000..cd45ed97 --- /dev/null +++ b/adr/018-Template-Hydration-For-Workload-Deployment.md @@ -0,0 +1,141 @@ +# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation + +Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay + +Initial Date: 2025-01-23 + +Last Updated Date: 2025-01-23 + +## Status + +Implemented + +## Context + +Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time. + +After investigating a few approaches such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found again that this approach suffered from several fundamental limitations: + +* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template. +* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage. +* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures. +* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation. + +We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos. + +## Decision + +We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts. + +Specifically: + +* **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`. +* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not. +* **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm. +* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files. + +The implementation in `backend_app.rs` demonstrates this pattern: + +```rust +let deployment = Deployment { + metadata: ObjectMeta { + name: Some(self.name.clone()), + labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()), + ..Default::default() + }, + spec: Some(DeploymentSpec { /* ... */ }), + ..Default::default() +}; + +let deployment_yaml = serde_yaml::to_string(&deployment)?; +fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?; +``` + +## Rationale + +**Aligns with "Infrastructure as Resilient Code"** + +Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain: + +* **Refactorability:** Rename a label and the compiler catches all usages. +* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline. +* **Code Navigation:** Jump to definition shows exactly where a value comes from. + +**Achieves "Prove It Works — Before You Deploy"** + +The compiler now validates that: + +* All required fields are populated (Rust's `Option` type prevents missing fields). +* Field types match expectations (ports are integers, not strings). +* Enums contain valid values (e.g., `ServiceType::ClusterIP`). + +This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise. + +**Enables True Unit Testing** + +Developers can now write unit tests that assert directly against typed objects: + +```rust +let deployment = create_deployment(&app); +assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3); +assert_eq!(deployment.metadata.name.unwrap(), "my-app"); +``` + +No string parsing, no YAML serialization, no fragile assertions against rendered output. + +**Preserves Ecosystem Benefits** + +By generating standard Helm chart structures, Harmony retains compatibility with: + +* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before. +* **ArgoCD:** Syncs and manages releases using the generated charts. +* **Existing Workflows:** Teams already consuming Helm charts see no change. + +The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role. + +## Consequences + +### Positive + +* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time. +* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests. +* **Testability:** Unit tests can validate manifest structure without integration or runtime checks. +* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files. +* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing. + +### Negative + +* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated. +* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable. +* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML. +* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model. + +## Alternatives Considered + +### 1. Enhance Askama with Compile-Time Validation +*Pros:* Stay within familiar templating paradigm; minimal code changes. +*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation. + +### 2. Use Helm SDK Programmatically (Go) +*Pros:* Direct access to Helm's template engine; no YAML serialization step. +*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety. + +### 3. Raw YAML String Templating (Manual) +*Pros:* Maximum control; no external dependencies. +*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound. + +### 4. Use Kustomize for All Manifests +*Pros:* Declarative overlays; standard tool. +*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase. + +__Note that this template hydration architecture still allows to override templates with tools like kustomize when required__ + +## Additional Notes + +**Scalability to Future Topologies** + +The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization. + +**Implementation Status** + +As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations. -- 2.39.5 From ab68e7309d30781c394b1cfb581b2db2efd63b65 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 23 Jan 2026 23:31:37 -0500 Subject: [PATCH 04/19] feat: Use k8s openapi structs as helm chart resources following ADR 018 --- .../src/modules/application/backend_app.rs | 461 ++++++++++++++++-- harmony/src/modules/application/config.rs | 9 + harmony/src/modules/application/helm/mod.rs | 329 +++++++++++-- harmony/templates/helm/deployment.yaml.j2 | 37 -- harmony/templates/helm/helpers.yaml.j2 | 8 - harmony/templates/helm/service.yaml.j2 | 15 - 6 files changed, 720 insertions(+), 139 deletions(-) delete mode 100644 harmony/templates/helm/deployment.yaml.j2 delete mode 100644 harmony/templates/helm/helpers.yaml.j2 delete mode 100644 harmony/templates/helm/service.yaml.j2 diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs index 1e3dbe78..83e24f5b 100644 --- a/harmony/src/modules/application/backend_app.rs +++ b/harmony/src/modules/application/backend_app.rs @@ -1,5 +1,4 @@ -use std::{ffi::OsStr, path::PathBuf}; - +use std::path::PathBuf; use async_trait::async_trait; use log::{debug, info, trace}; use serde::Serialize; @@ -7,10 +6,7 @@ use serde::Serialize; use crate::{ config::{REGISTRY_PROJECT, REGISTRY_URL}, modules::application::{ - Application, HelmPackage, OCICompliant, - config::ApplicationNetworkPort, - helm::{DeploymentTemplate, HelmChart, HelmTemplate, ServiceTemplate}, - webapp::Webapp, + config::ApplicationNetworkPort, helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind}, Application, HelmPackage, OCICompliant }, }; @@ -106,14 +102,12 @@ impl OCICompliant for BackendApp { &image_tag, "-f", &dockerfile.to_string_lossy(), - &self.project_root.to_string_lossy() + &self.project_root.to_string_lossy(), ]) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() - .map_err(|e| - format!("Failed to spawn docker build process: {e}") - )?; + .map_err(|e| format!("Failed to spawn docker build process: {e}"))?; let stdout = child.stdout.take().expect("Failed to capture stdout"); let stderr = child.stderr.take().expect("Failed to capture stderr"); @@ -161,14 +155,26 @@ impl OCICompliant for BackendApp { let _ = stderr_sender.send(output); }); - let status = child.wait().map_err(|e| - format!("Failed to wait for docker build process: {e}") - )?; + let status = child + .wait() + .map_err(|e| format!("Failed to wait for docker build process: {e}"))?; - let stdout_lines = stdout_handle.join().map_err(|e| format!("Stdout thread panicked: {e:?}")) - .and_then(|_| stdout_receiver.recv().map_err(|e| format!("Failed to receive stdout: {e}")))?; - let stderr_lines = stderr_handle.join().map_err(|e| format!("Stderr thread panicked: {e:?}")) - .and_then(|_| stderr_receiver.recv().map_err(|e| format!("Failed to receive stderr: {e}")))?; + let stdout_lines = stdout_handle + .join() + .map_err(|e| format!("Stdout thread panicked: {e:?}")) + .and_then(|_| { + stdout_receiver + .recv() + .map_err(|e| format!("Failed to receive stdout: {e}")) + })?; + let stderr_lines = stderr_handle + .join() + .map_err(|e| format!("Stderr thread panicked: {e:?}")) + .and_then(|_| { + stderr_receiver + .recv() + .map_err(|e| format!("Failed to receive stderr: {e}")) + })?; let output_content = format!( "\n{stdout}\n\n{stderr}", @@ -211,30 +217,38 @@ impl HelmPackage for BackendApp { async fn build_push_helm_package(&self, image_url: &str) -> Result { let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string()); - // Extract the first network port for the container port (if available) - let container_port = self.network_ports.first().map(|p| p.number); + // Build the typed Deployment object using the builder + let mut deployment_builder = DeploymentBuilder::new(&self.name, image_url); - // Create and add DeploymentTemplate with image URL and environment variables - let deployment = DeploymentTemplate { - name: self.name.clone(), - container_port, - env_vars: self.env_vars.clone(), - }; - helm_chart.add_template(Box::new(deployment)); - - // Create and add ServiceTemplate if a port is available - if let Some(port) = container_port { - let service = ServiceTemplate { port }; - helm_chart.add_template(Box::new(service)); + // Add container ports + for port in &self.network_ports { + deployment_builder = deployment_builder.with_container_port( + port.number as i32, + &port.name, + port.protocol.as_str(), + ); } - // Add common Helm values - helm_chart.add_value("replicaCount", "1"); - helm_chart.add_value("image.repository", image_url); - helm_chart.add_value("image.pullPolicy", "IfNotPresent"); - helm_chart.add_value("service.type", "ClusterIP"); + // Add environment variables + for (key, value) in &self.env_vars { + deployment_builder = deployment_builder.with_env_var(key, value); + } - // Write the Helm chart to the project root + let deployment = deployment_builder.build(); + helm_chart.add_resource(HelmResourceKind::Deployment(deployment)); + + // Build the typed Service object using the helper function + let network_ports: Vec<(String, u16, String)> = self + .network_ports + .iter() + .map(|p| (p.name.clone(), p.number, p.protocol.as_str().to_string())) + .collect(); + + if let Some(service) = helm::create_service_from_ports(self.name.clone(), &network_ports) { + helm_chart.add_resource(HelmResourceKind::Service(service)); + } + + // Write the Helm chart metadata to the project root let chart_dir = helm_chart .write_to(&self.project_root.join(".harmony_generated/helm/")) .map_err(|e| format!("Failed to write Helm chart: {}", e))?; @@ -244,3 +258,376 @@ impl HelmPackage for BackendApp { Ok(chart_dir.to_string_lossy().to_string()) } } + + +#[cfg(test)] +mod tests { + use super::*; + use crate::modules::application::config::ApplicationNetworkPort; + use crate::modules::application::config::NetworkProtocol; + use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; + use serde_yaml::from_str; + use k8s_openapi::api::core::v1::Service as K8sService; + use k8s_openapi::api::apps::v1::Deployment; + use std::fs; + + fn cleanup_test_dirs(project_root: &PathBuf) { + let helm_dir = project_root.join(".harmony_generated/helm/"); + if helm_dir.exists() { + let _ = fs::remove_dir_all(&helm_dir); + } + } + + fn create_test_backend_app_with_ports() -> BackendApp { + BackendApp { + name: "test-app".to_string(), + project_root: "/tmp/test_backend".into(), + network_ports: vec![ + ApplicationNetworkPort { + number: 8080, + protocol: NetworkProtocol::TCP, + name: "http".to_string(), + }, + ApplicationNetworkPort { + number: 9000, + protocol: NetworkProtocol::TCP, + name: "metrics".to_string(), + }, + ApplicationNetworkPort { + number: 50051, + protocol: NetworkProtocol::TCP, + name: "grpc".to_string(), + }, + ], + env_vars: vec![ + ("ENV_VAR_1".to_string(), "value1".to_string()), + ("ENV_VAR_2".to_string(), "value2".to_string()), + ], + build_cmd: BuildCommand::new("cargo", vec!["build"]), + dockerfile: None, + } + } + + fn create_test_backend_app_no_ports() -> BackendApp { + BackendApp { + name: "test-app-no-ports".to_string(), + project_root: "/tmp/test_backend_no_ports".into(), + network_ports: vec![], + env_vars: vec![("ENV_VAR_1".to_string(), "value1".to_string())], + build_cmd: BuildCommand::new("cargo", vec!["build"]), + dockerfile: None, + } + } + + #[tokio::test] + async fn test_service_created_with_all_network_ports() { + let app = create_test_backend_app_with_ports(); + let test_dir = app.project_root.clone(); + + cleanup_test_dirs(&test_dir); + + let result = app + .build_push_helm_package("registry.example.com/test/test-app:1.0.0") + .await; + + assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); + + let service_yaml_path = test_dir + .join(".harmony_generated/helm/test-app/templates/service.yaml"); + assert!( + service_yaml_path.exists(), + "service.yaml should exist when there are network ports" + ); + + let service_yaml_content = fs::read_to_string(&service_yaml_path) + .expect("Failed to read service.yaml"); + + let service: K8sService = from_str(&service_yaml_content) + .expect("Failed to parse service.yaml as K8s Service"); + + assert_eq!( + service.metadata.name.as_deref(), + Some("test-app"), + "Service name should match app name" + ); + assert_eq!( + service.spec.as_ref().unwrap().type_.as_deref(), + Some("ClusterIP"), + "Service type should be ClusterIP" + ); + + let ports = service + .spec + .as_ref() + .unwrap() + .ports + .as_ref() + .expect("Service should have ports"); + + assert_eq!(ports.len(), 3, "Service should have 3 ports"); + + let http_port = &ports[0]; + assert_eq!(http_port.name.as_deref(), Some("http"), "First port name should be 'http'"); + assert_eq!(http_port.protocol.as_deref(), Some("TCP"), "First port protocol should be 'TCP'"); + assert_eq!(http_port.port, 8080, "First port number should be 8080"); + + let metrics_port = &ports[1]; + assert_eq!(metrics_port.name.as_deref(), Some("metrics"), "Second port name should be 'metrics'"); + assert_eq!(metrics_port.protocol.as_deref(), Some("TCP"), "Second port protocol should be 'TCP'"); + assert_eq!(metrics_port.port, 9000, "Second port number should be 9000"); + + let grpc_port = &ports[2]; + assert_eq!(grpc_port.name.as_deref(), Some("grpc"), "Third port name should be 'grpc'"); + assert_eq!(grpc_port.protocol.as_deref(), Some("TCP"), "Third port protocol should be 'TCP'"); + assert_eq!(grpc_port.port, 50051, "Third port number should be 50051"); + + for port in ports.iter() { + match &port.target_port { + Some(IntOrString::Int(target)) => { + assert_eq!( + *target, port.port, + "Target port should match service port for {}", + port.name.as_deref().unwrap_or("unknown") + ); + } + _ => panic!("Target port should be Int for all ports"), + } + } + + cleanup_test_dirs(&test_dir); + } + + #[tokio::test] + async fn test_service_not_created_when_no_network_ports() { + let app = create_test_backend_app_no_ports(); + let test_dir = app.project_root.clone(); + + cleanup_test_dirs(&test_dir); + + let result = app + .build_push_helm_package("registry.example.com/test/test-app-no-ports:1.0.0") + .await; + + assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); + + let service_yaml_path = test_dir + .join(".harmony_generated/helm/test-app-no-ports/templates/service.yaml"); + assert!( + !service_yaml_path.exists(), + "service.yaml should not exist when there are no network ports" + ); + + cleanup_test_dirs(&test_dir); + } + + #[tokio::test] + async fn test_deployment_created_with_correct_configuration() { + let app = create_test_backend_app_with_ports(); + let test_dir = app.project_root.clone(); + + cleanup_test_dirs(&test_dir); + + let result = app + .build_push_helm_package("registry.example.com/test/test-app:1.0.0") + .await; + + assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); + + let deployment_yaml_path = test_dir + .join(".harmony_generated/helm/test-app/templates/deployment.yaml"); + assert!(deployment_yaml_path.exists(), "deployment.yaml should exist"); + + let deployment_yaml_content = fs::read_to_string(&deployment_yaml_path) + .expect("Failed to read deployment.yaml"); + + let deployment: Deployment = from_str(&deployment_yaml_content) + .expect("Failed to parse deployment.yaml as K8s Deployment"); + + assert_eq!( + deployment.metadata.name.as_deref(), + Some("test-app"), + "Deployment name should match app name" + ); + + let deployment_spec = deployment + .spec + .as_ref() + .expect("Deployment should have spec"); + assert_eq!(deployment_spec.replicas, Some(1), "Replicas should be 1"); + + let selector = &deployment_spec.selector; + assert_eq!( + selector.match_labels.as_ref().unwrap().get("app.kubernetes.io/name"), + Some(&"test-app".to_string()), + "Selector should match app name" + ); + + let pod_spec = deployment_spec + .template + .spec + .as_ref() + .expect("Pod template should have spec"); + + assert_eq!(pod_spec.containers.len(), 1, "Should have exactly one container"); + + let container = &pod_spec.containers[0]; + assert_eq!(container.name, "test-app", "Container name should match app name"); + assert_eq!( + container.image.as_deref(), + Some("registry.example.com/test/test-app:1.0.0"), + "Container image should match provided image URL" + ); + assert_eq!( + container.image_pull_policy.as_deref(), + Some("IfNotPresent"), + "Image pull policy should be IfNotPresent" + ); + + let container_ports = container + .ports + .as_ref() + .expect("Container should have ports"); + assert_eq!(container_ports.len(), 3, "Container should have 3 ports"); + + assert_eq!(container_ports[0].container_port, 8080, "First container port should be 8080"); + assert_eq!(container_ports[0].name.as_deref(), Some("http"), "First container port name should be 'http'"); + assert_eq!(container_ports[0].protocol.as_deref(), Some("TCP"), "First container port protocol should be 'TCP'"); + + assert_eq!(container_ports[1].container_port, 9000, "Second container port should be 9000"); + assert_eq!(container_ports[1].name.as_deref(), Some("metrics"), "Second container port name should be 'metrics'"); + assert_eq!(container_ports[1].protocol.as_deref(), Some("TCP"), "Second container port protocol should be 'TCP'"); + + assert_eq!(container_ports[2].container_port, 50051, "Third container port should be 50051"); + assert_eq!(container_ports[2].name.as_deref(), Some("grpc"), "Third container port name should be 'grpc'"); + assert_eq!(container_ports[2].protocol.as_deref(), Some("TCP"), "Third container port protocol should be 'TCP'"); + + let env_vars = container.env.as_ref().expect("Container should have env vars"); + assert_eq!(env_vars.len(), 2, "Container should have 2 env vars"); + + let env_map: std::collections::HashMap = env_vars + .iter() + .map(|e| (e.name.clone(), e.value.clone().unwrap_or_default())) + .collect(); + + assert_eq!( + env_map.get("ENV_VAR_1"), + Some(&"value1".to_string()), + "ENV_VAR_1 should have correct value" + ); + assert_eq!( + env_map.get("ENV_VAR_2"), + Some(&"value2".to_string()), + "ENV_VAR_2 should have correct value" + ); + + let pod_labels = deployment_spec + .template + .metadata + .as_ref() + .expect("Pod template should have metadata") + .labels + .as_ref() + .expect("Pod should have labels"); + + assert_eq!( + pod_labels.get("app.kubernetes.io/name"), + Some(&"test-app".to_string()), + "Pod should have correct app label" + ); + assert_eq!( + pod_labels.get("app.kubernetes.io/instance"), + Some(&"test-app".to_string()), + "Pod should have correct instance label" + ); + + cleanup_test_dirs(&test_dir); + } + + #[tokio::test] + async fn test_service_with_udp_protocol() { + let app = BackendApp { + name: "udp-app".to_string(), + project_root: "/tmp/test_udp".into(), + network_ports: vec![ + ApplicationNetworkPort { + number: 53, + protocol: NetworkProtocol::UDP, + name: "dns".to_string(), + }, + ApplicationNetworkPort { + number: 8080, + protocol: NetworkProtocol::TCP, + name: "http".to_string(), + }, + ], + env_vars: vec![], + build_cmd: BuildCommand::new("cargo", vec!["build"]), + dockerfile: None, + }; + let test_dir = app.project_root.clone(); + + cleanup_test_dirs(&test_dir); + + let result = app + .build_push_helm_package("registry.example.com/test/udp-app:1.0.0") + .await; + + assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); + + let service_yaml_path = test_dir + .join(".harmony_generated/helm/udp-app/templates/service.yaml"); + assert!(service_yaml_path.exists(), "service.yaml should exist"); + + let service_yaml_content = fs::read_to_string(&service_yaml_path) + .expect("Failed to read service.yaml"); + + let service: K8sService = from_str(&service_yaml_content) + .expect("Failed to parse service.yaml as K8s Service"); + + let ports = service + .spec + .as_ref() + .unwrap() + .ports + .as_ref() + .expect("Service should have ports"); + + assert_eq!(ports.len(), 2, "Service should have 2 ports"); + + let dns_port = &ports[0]; + assert_eq!(dns_port.name.as_deref(), Some("dns"), "DNS port name should be 'dns'"); + assert_eq!( + dns_port.protocol.as_deref(), + Some("UDP"), + "DNS port protocol should be 'UDP'" + ); + assert_eq!(dns_port.port, 53, "DNS port number should be 53"); + + let http_port = &ports[1]; + assert_eq!(http_port.name.as_deref(), Some("http"), "HTTP port name should be 'http'"); + assert_eq!( + http_port.protocol.as_deref(), + Some("TCP"), + "HTTP port protocol should be 'TCP'" + ); + assert_eq!(http_port.port, 8080, "HTTP port number should be 8080"); + + cleanup_test_dirs(&test_dir); + } + + #[test] + fn test_build_command_creation() { + let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]); + assert_eq!(cmd.program, "docker"); + assert_eq!(cmd.args, vec!["build", "-t", "myimage"]); + } + + #[test] + fn test_build_command_clone() { + let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]); + let cmd2 = cmd1.clone(); + assert_eq!(cmd1.program, cmd2.program); + assert_eq!(cmd1.args, cmd2.args); + } +} + diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs index d35ab604..c01ebaba 100644 --- a/harmony/src/modules/application/config.rs +++ b/harmony/src/modules/application/config.rs @@ -6,6 +6,15 @@ pub enum NetworkProtocol { UDP, } +impl NetworkProtocol { + pub fn as_str(&self) -> &str { + match self { + NetworkProtocol::TCP => "TCP", + NetworkProtocol::UDP => "UDP", + } + } +} + #[derive(Debug, Clone, Serialize)] pub struct ApplicationNetworkPort { pub number: u16, diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs index da40b0b6..fd14d1e7 100644 --- a/harmony/src/modules/application/helm/mod.rs +++ b/harmony/src/modules/application/helm/mod.rs @@ -1,11 +1,76 @@ -use askama::Template; +use k8s_openapi::api::{ + apps::v1::{Deployment, DeploymentSpec}, + core::v1::{ + Container, ContainerPort, EnvVar, PodSpec, + PodTemplateSpec, Service as K8sService, ServicePort, ServiceSpec, + }, +}; +use kube::core::ObjectMeta; +use serde::Serialize; use std::fs; use std::path::{Path, PathBuf}; -/// Trait for any resource that can be rendered into a file in the Helm chart. -pub trait HelmTemplate: Send + Sync { - fn filename(&self) -> String; - fn render_template(&self) -> Result; +/// Enum representing all supported Kubernetes resource types for Helm charts. +/// Supports built-in typed resources and custom CRDs via YAML strings. +pub enum HelmResourceKind { + /// Built-in typed Service resource + Service(K8sService), + /// Built-in typed Deployment resource + Deployment(Deployment), + /// Custom resource as pre-serialized YAML (e.g., CRDs, custom types) + CustomYaml { filename: String, content: String }, + // Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc. +} + +impl HelmResourceKind { + pub fn filename(&self) -> String { + match self { + HelmResourceKind::Service(_) => "service.yaml".to_string(), + HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(), + HelmResourceKind::CustomYaml { filename, .. } => filename.clone(), + } + } + + pub fn serialize_to_yaml(&self) -> Result { + match self { + HelmResourceKind::Service(s) => serde_yaml::to_string(s), + HelmResourceKind::Deployment(d) => serde_yaml::to_string(d), + HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()), + } + } + + pub fn as_service(&self) -> Option<&K8sService> { + match self { + HelmResourceKind::Service(s) => Some(s), + _ => None, + } + } + + pub fn as_deployment(&self) -> Option<&Deployment> { + match self { + HelmResourceKind::Deployment(d) => Some(d), + _ => None, + } + } + + /// Add a custom resource from any serializable type (e.g., CRDs, custom types) + pub fn from_yaml(filename: impl Into, content: impl Into) -> Self { + HelmResourceKind::CustomYaml { + filename: filename.into(), + content: content.into(), + } + } + + /// Add a custom resource from any type that implements Serialize + pub fn from_serializable( + filename: impl Into, + resource: &T, + ) -> Result { + Ok(HelmResourceKind::CustomYaml { + filename: filename.into(), + content: serde_yaml::to_string(resource)?, + }) + } } /// The main orchestrator for building a Helm chart. @@ -14,7 +79,7 @@ pub struct HelmChart { pub version: String, pub app_version: String, pub description: String, - pub templates: Vec>, + pub resources: Vec, pub values: Vec, } @@ -25,13 +90,13 @@ impl HelmChart { version: "0.1.0".to_string(), app_version, description: format!("A Helm chart for {}", name), - templates: Vec::new(), + resources: Vec::new(), values: Vec::new(), } } - pub fn add_template(&mut self, template: Box) { - self.templates.push(template); + pub fn add_resource(&mut self, resource: HelmResourceKind) { + self.resources.push(resource); } pub fn add_value(&mut self, key: &str, value: &str) { @@ -56,14 +121,11 @@ impl HelmChart { let values_content = self.values.join("\n"); fs::write(chart_dir.join("values.yaml"), values_content)?; - // 3. Render and write _helpers.tpl - let helpers = HelpersTpl; - fs::write(templates_dir.join("_helpers.tpl"), helpers.render()?)?; - - // 4. Render and write all added templates (Deployment, Service, etc.) - for template in &self.templates { - let filename = template.filename(); - let content = template.render_template()?; + // 3. Serialize and write all added resources (Deployment, Service, etc.) + for resource in &self.resources { + let filename = resource.filename(); + let content = resource.serialize_to_yaml() + .map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?; fs::write(templates_dir.join(filename), content)?; } @@ -71,7 +133,8 @@ impl HelmChart { } } -// --- Templates --- + +use askama::Template; #[derive(Template)] #[template(path = "helm/Chart.yaml.j2")] @@ -82,38 +145,220 @@ struct ChartYaml<'a> { app_version: &'a str, } -#[derive(Template)] -#[template(path = "helm/helpers.yaml.j2")] -struct HelpersTpl; - -#[derive(Template)] -#[template(path = "helm/deployment.yaml.j2")] -pub struct DeploymentTemplate { - pub name: String, - pub container_port: Option, - pub env_vars: Vec<(String, String)>, +/// Builder for creating a Kubernetes Service with proper labels and selectors. +pub struct ServiceBuilder { + name: String, + service_type: String, + ports: Vec, + selector_label: String, } -impl HelmTemplate for DeploymentTemplate { - fn filename(&self) -> String { - "deployment.yaml".to_string() +impl ServiceBuilder { + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + service_type: "ClusterIP".to_string(), + ports: Vec::new(), + selector_label: String::new(), + } } - fn render_template(&self) -> Result { - self.render() + + pub fn service_type(mut self, service_type: impl Into) -> Self { + self.service_type = service_type.into(); + self + } + + pub fn with_port(mut self, name: impl Into, port: i32, protocol: impl Into) -> Self { + use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; + self.ports.push(ServicePort { + name: Some(name.into()), + protocol: Some(protocol.into()), + port, + target_port: Some(IntOrString::Int(port)), + ..Default::default() + }); + self + } + + pub fn selector_label(mut self, label: impl Into) -> Self { + self.selector_label = label.into(); + self + } + + pub fn build(self) -> K8sService { + K8sService { + metadata: ObjectMeta { + name: Some(self.name.clone()), + labels: Some( + [ + ("app.kubernetes.io/name".to_string(), self.name.clone()), + ("app.kubernetes.io/component".to_string(), "service".to_string()), + ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()), + ] + .into(), + ), + ..Default::default() + }, + spec: Some(ServiceSpec { + type_: Some(self.service_type), + selector: Some([("app.kubernetes.io/name".to_string(), self.selector_label)].into()), + ports: if self.ports.is_empty() { None } else { Some(self.ports) }, + ..Default::default() + }), + ..Default::default() + } } } -#[derive(Template)] -#[template(path = "helm/service.yaml.j2")] -pub struct ServiceTemplate { - pub port: u16, // Used only to enforce logic if needed, though template uses Values +/// Builder for creating a Kubernetes Deployment with pod template and container spec. +pub struct DeploymentBuilder { + name: String, + image: String, + replicas: i32, + container_ports: Vec, + env_vars: Vec, + image_pull_policy: Option, } -impl HelmTemplate for ServiceTemplate { - fn filename(&self) -> String { - "service.yaml".to_string() +impl DeploymentBuilder { + pub fn new(name: impl Into, image: impl Into) -> Self { + Self { + name: name.into(), + image: image.into(), + replicas: 1, + container_ports: Vec::new(), + env_vars: Vec::new(), + image_pull_policy: Some("IfNotPresent".to_string()), + } } - fn render_template(&self) -> Result { - self.render() + + pub fn replicas(mut self, replicas: i32) -> Self { + self.replicas = replicas; + self + } + + pub fn with_container_port(mut self, number: i32, name: impl Into, protocol: impl Into) -> Self { + self.container_ports.push(ContainerPort { + container_port: number, + name: Some(name.into()), + protocol: Some(protocol.into()), + ..Default::default() + }); + self + } + + pub fn with_env_var(mut self, name: impl Into, value: impl Into) -> Self { + self.env_vars.push(EnvVar { + name: name.into(), + value: Some(value.into()), + ..Default::default() + }); + self + } + + pub fn image_pull_policy(mut self, policy: impl Into) -> Self { + self.image_pull_policy = Some(policy.into()); + self + } + + pub fn build(self) -> Deployment { + let name = self.name.clone(); + Deployment { + metadata: ObjectMeta { + name: Some(name.clone()), + labels: Some( + [ + ("app.kubernetes.io/name".to_string(), name.clone()), + ("app.kubernetes.io/component".to_string(), "deployment".to_string()), + ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()), + ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()), + ] + .into(), + ), + ..Default::default() + }, + spec: Some(DeploymentSpec { + replicas: Some(self.replicas), + selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector { + match_labels: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()), + ..Default::default() + }, + template: PodTemplateSpec { + metadata: Some(ObjectMeta { + labels: Some( + [ + ("app.kubernetes.io/name".to_string(), name.clone()), + ("app.kubernetes.io/instance".to_string(), name.clone()), + ] + .into(), + ), + ..Default::default() + }), + spec: Some(PodSpec { + containers: vec![Container { + name: name.clone(), + image: Some(self.image), + image_pull_policy: self.image_pull_policy, + ports: if self.container_ports.is_empty() { + None + } else { + Some(self.container_ports) + }, + env: if self.env_vars.is_empty() { None } else { Some(self.env_vars) }, + ..Default::default() + }], + ..Default::default() + }), + }, + ..Default::default() + }), + ..Default::default() + } } } + +/// Helper function to create a Service from network port configuration. +/// Returns `None` if no ports are provided. +pub fn create_service_from_ports( + name: String, + network_ports: &[(String, u16, String)], // (name, number, protocol) +) -> Option { + use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; + + if network_ports.is_empty() { + return None; + } + + let ports: Vec = network_ports + .iter() + .map(|(port_name, number, protocol)| ServicePort { + name: Some(port_name.clone()), + protocol: Some(protocol.clone()), + port: *number as i32, + target_port: Some(IntOrString::Int(*number as i32)), + ..Default::default() + }) + .collect(); + + Some(K8sService { + metadata: ObjectMeta { + name: Some(name.clone()), + labels: Some( + [ + ("app.kubernetes.io/name".to_string(), name.clone()), + ("app.kubernetes.io/component".to_string(), "service".to_string()), + ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()), + ] + .into(), + ), + ..Default::default() + }, + spec: Some(ServiceSpec { + type_: Some("ClusterIP".to_string()), + selector: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()), + ports: Some(ports), + ..Default::default() + }), + ..Default::default() + }) +} diff --git a/harmony/templates/helm/deployment.yaml.j2 b/harmony/templates/helm/deployment.yaml.j2 deleted file mode 100644 index b060b8f1..00000000 --- a/harmony/templates/helm/deployment.yaml.j2 +++ /dev/null @@ -1,37 +0,0 @@ -{% raw %} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "chart.fullname" . }} - labels: - app: {{ include "chart.name" . }} -spec: - replicas: {{ .Values.replicaCount | default 1 }} - selector: - matchLabels: - app: {{ include chart.name . }} - template: - metadata: - labels: - app: {{ include chart.name . }} - spec: - containers: -{% endraw %} - - name: {{ name }} -{% raw %} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: IfNotPresent -{% endraw %} - {% if let Some(port) = container_port %} - ports: - - name: http - containerPort: {{ port }} - protocol: TCP - {% endif %} - {% if !env_vars.is_empty() %} - env: - {% for (k, v) in env_vars %} - - name: {{ k }} - value: {{ v }} - {% endfor %} - {% endif %} diff --git a/harmony/templates/helm/helpers.yaml.j2 b/harmony/templates/helm/helpers.yaml.j2 deleted file mode 100644 index ff93848e..00000000 --- a/harmony/templates/helm/helpers.yaml.j2 +++ /dev/null @@ -1,8 +0,0 @@ -{% raw %} -{{- define \"chart.fullname\" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix \"-\" }} -{{- end }} -{{- define "chart.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} -{% endraw %} diff --git a/harmony/templates/helm/service.yaml.j2 b/harmony/templates/helm/service.yaml.j2 deleted file mode 100644 index c6582d22..00000000 --- a/harmony/templates/helm/service.yaml.j2 +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ "{{ include \"chart.fullname\" . }}" }} - labels: - app: {{ "{{ include \"chart.name\" . }}" }} -spec: - type: ClusterIP - ports: - - port: {{ "{{ .Values.service.port }}" }} - targetPort: http - protocol: TCP - name: http - selector: - app: {{ "{{ include \"chart.name\" . }}" }} -- 2.39.5 From 0cc5f505f804d768ca4c03de93f3aad55850db67 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Sun, 25 Jan 2026 22:52:29 -0500 Subject: [PATCH 05/19] feat(harmony_execution): New crate to contain utils for execution such as command line --- harmony_execution/Cargo.toml | 12 + harmony_execution/src/command.rs | 470 +++++++++++++++++++++++++++++++ harmony_execution/src/lib.rs | 6 + 3 files changed, 488 insertions(+) create mode 100644 harmony_execution/Cargo.toml create mode 100644 harmony_execution/src/command.rs create mode 100644 harmony_execution/src/lib.rs diff --git a/harmony_execution/Cargo.toml b/harmony_execution/Cargo.toml new file mode 100644 index 00000000..7433c5e5 --- /dev/null +++ b/harmony_execution/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "harmony_execution" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +thiserror.workspace = true +lazy_static.workspace = true +directories.workspace = true +log.workspace = true diff --git a/harmony_execution/src/command.rs b/harmony_execution/src/command.rs new file mode 100644 index 00000000..0ac1626c --- /dev/null +++ b/harmony_execution/src/command.rs @@ -0,0 +1,470 @@ +use std::io::{BufRead, BufReader}; +use std::process::{Child, Command, Stdio}; +use std::sync::Arc; +use std::thread; + +/// Captured output from a command execution +#[derive(Debug, Clone)] +pub struct CommandOutput { + /// Captured stdout content + pub stdout: String, + /// Captured stderr content + pub stderr: String, + /// Exit status of the command + pub status: CommandStatus, +} + +impl CommandOutput { + /// Returns true if the command succeeded + pub fn is_success(&self) -> bool { + self.status.is_success() + } + + /// Formats the complete output for display + pub fn format_output(&self) -> String { + format!( + "Stdout:\n{}\n\nStderr:\n{}", + if self.stdout.is_empty() { + "" + } else { + &self.stdout + }, + if self.stderr.is_empty() { + "" + } else { + &self.stderr + } + ) + } +} + +/// Result status of a command execution +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CommandStatus { + /// Command executed successfully (exit code 0) + Success, + /// Command failed with an exit code + Failed(i32), + /// Command was terminated by a signal + Terminated(i32), + /// Command execution could not be started + Error(String), +} + +impl CommandStatus { + pub fn is_success(&self) -> bool { + matches!(self, CommandStatus::Success) + } +} + +impl From for CommandStatus { + fn from(status: std::process::ExitStatus) -> Self { + if status.success() { + CommandStatus::Success + } else if let Some(code) = status.code() { + CommandStatus::Failed(code) + } else { + CommandStatus::Terminated(0) // Signal codes are platform-specific + } + } +} + +type Callback = Arc; + +/// Options for configuring command execution +#[derive(Clone)] +pub struct RunnerOptions { + /// Whether to print stdout to console in real-time + pub print_stdout: bool, + /// Whether to print stderr to console in real-time + pub print_stderr: bool, + /// Optional callback for each stdout line + pub stdout_callback: Callback, + /// Optional callback for each stderr line + pub stderr_callback: Callback, +} + +impl RunnerOptions { + fn empty_callback() -> Callback { + Arc::new(|_| {}) + } + /// Create default options with real-time printing enabled + pub fn print_to_console() -> Self { + Self { + print_stdout: true, + print_stderr: true, + ..Default::default() + } + } + + /// Create options that capture output silently + pub fn silent() -> Self { + Self { + print_stdout: false, + print_stderr: false, + ..Default::default() + } + } + + /// Set custom callbacks for stdout and stderr lines + pub fn with_callbacks(mut self, stdout_callback: F1, stderr_callback: F2) -> Self + where + F1: Fn(&str) + Send + Sync + 'static, + F2: Fn(&str) + Send + Sync + 'static, + { + self.stdout_callback = Arc::new(stdout_callback); + self.stderr_callback = Arc::new(stderr_callback); + self + } +} + +impl Default for RunnerOptions { + fn default() -> Self { + Self { + print_stdout: true, + print_stderr: true, + stdout_callback: Self::empty_callback(), + stderr_callback: Self::empty_callback(), + } + } +} + +/// Error type for command execution failures +#[derive(Debug)] +pub struct CommandError { + /// Human-readable error description + pub message: String, + /// Captured output if execution started + pub output: Option, +} + +impl std::fmt::Display for CommandError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message)?; + if let Some(output) = &self.output { + write!(f, "\n{}", output.format_output())?; + } + Ok(()) + } +} + +impl std::error::Error for CommandError {} + +/// Runs a command and captures its output while streaming to console +/// +/// # Example +/// +/// ``` +/// use harmony_execution::command::{run_command, RunnerOptions}; +/// use std::process::Command; +/// +/// let output = run_command( +/// Command::new("echo").arg("hello"), +/// RunnerOptions::print_to_console() +/// ).unwrap(); +/// assert!(output.is_success()); +/// assert_eq!(output.stdout, "hello\n"); +/// ``` +pub fn run_command( + command: &mut Command, + options: RunnerOptions, +) -> Result { + let mut child = command + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| CommandError { + message: format!("Failed to spawn command: {}", e), + output: None, + })?; + + let stdout = child.stdout.take().ok_or_else(|| CommandError { + message: "Failed to capture stdout".to_string(), + output: None, + })?; + + let stderr = child.stderr.take().ok_or_else(|| CommandError { + message: "Failed to capture stderr".to_string(), + output: None, + })?; + + let stdout_reader = BufReader::new(stdout); + let stderr_reader = BufReader::new(stderr); + + let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel(); + let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel(); + + // Spawn thread to handle stdout + let stdout_handle = thread::spawn(move || { + let mut output = String::new(); + for line in stdout_reader.lines() { + match line { + Ok(line_content) => { + if options.print_stdout { + println!("{}", line_content); + } + (options.stdout_callback)(&line_content); + output.push_str(&line_content); + output.push('\n'); + } + Err(e) => { + // Silently handle read errors - corrupted data at end is common + log::trace!("Error reading stdout line: {}", e); + } + } + } + let _ = stdout_sender.send(output); + }); + + // Spawn thread to handle stderr + let stderr_handle = thread::spawn(move || { + let mut output = String::new(); + for line in stderr_reader.lines() { + match line { + Ok(line_content) => { + if options.print_stderr { + eprintln!("{}", line_content); + } + (options.stderr_callback)(&line_content); + output.push_str(&line_content); + output.push('\n'); + } + Err(e) => { + log::trace!("Error reading stderr line: {}", e); + } + } + } + let _ = stderr_sender.send(output); + }); + + let status = child.wait().map_err(|e| CommandError { + message: format!("Failed to wait for command process: {}", e), + output: None, + })?; + + let stdout_lines = stdout_handle + .join() + .map_err(|e| CommandError { + message: format!("Stdout thread panicked: {:?}", e), + output: None, + }) + .and_then(|_| { + stdout_receiver.recv().map_err(|e| CommandError { + message: format!("Failed to receive stdout: {}", e), + output: None, + }) + })?; + + let stderr_lines = stderr_handle + .join() + .map_err(|e| CommandError { + message: format!("Stderr thread panicked: {:?}", e), + output: None, + }) + .and_then(|_| { + stderr_receiver.recv().map_err(|e| CommandError { + message: format!("Failed to receive stderr: {}", e), + output: None, + }) + })?; + + Ok(CommandOutput { + stdout: stdout_lines, + stderr: stderr_lines, + status: status.into(), + }) +} + +/// Convenience function to run a command with default options (print to console) +pub fn run(command: &mut Command) -> Result { + run_command(command, RunnerOptions::print_to_console()) +} + +/// Convenience function to run a command silently (capture output only) +pub fn run_silent(command: &mut Command) -> Result { + run_command(command, RunnerOptions::silent()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + + #[test] + fn test_simple_echo_command() { + let output = run_silent(Command::new("echo").arg("hello world")).unwrap(); + assert!(output.is_success()); + assert_eq!(output.stdout.trim(), "hello world"); + assert!(output.stderr.is_empty()); + } + + #[test] + fn test_command_failure() { + let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap(); + assert!(!output.is_success()); + assert_eq!(output.status, CommandStatus::Failed(42)); + } + + #[test] + fn test_command_output_format() { + let output = run_silent(Command::new("echo").arg("test")).unwrap(); + let formatted = output.format_output(); + assert!(formatted.contains("Stdout:")); + assert!(formatted.contains("test")); + } + + #[test] + fn test_runner_options() { + let opts = RunnerOptions::print_to_console(); + assert!(opts.print_stdout); + assert!(opts.print_stderr); + + let opts = RunnerOptions::silent(); + assert!(!opts.print_stdout); + assert!(!opts.print_stderr); + } + + #[test] + fn test_command_status_from_exit_status() { + let output = run_silent(&mut Command::new("true")).unwrap(); + assert_eq!(output.status, CommandStatus::Success); + + let output = run_silent(&mut Command::new("false")).unwrap(); + assert_eq!(output.status, CommandStatus::Failed(1)); + } + + #[test] + fn test_stdout_callback_receives_lines() { + use std::sync::{Arc, Mutex}; + + let captured = Arc::new(Mutex::new(Vec::new())); + let captured_clone = Arc::clone(&captured); + + let opts = RunnerOptions::silent().with_callbacks( + move |line| captured_clone.lock().unwrap().push(line.to_string()), + |_| {}, + ); + + run_command(Command::new("echo").arg("hello world"), opts).unwrap(); + + let lines = captured.lock().unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0], "hello world"); + } + + #[test] + fn test_stderr_callback_receives_lines() { + use std::sync::{Arc, Mutex}; + + let captured = Arc::new(Mutex::new(Vec::new())); + let captured_clone = Arc::clone(&captured); + + let opts = RunnerOptions::silent().with_callbacks( + |_| {}, + move |line| captured_clone.lock().unwrap().push(line.to_string()), + ); + + run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap(); + + let lines = captured.lock().unwrap(); + assert_eq!(lines.len(), 1); + assert_eq!(lines[0], "error"); + } + + #[test] + fn test_callback_and_capture_both_work() { + use std::sync::{Arc, Mutex}; + + let callback_lines = Arc::new(Mutex::new(Vec::new())); + let callback_clone = Arc::clone(&callback_lines); + + let opts = RunnerOptions::silent().with_callbacks( + move |line| callback_clone.lock().unwrap().push(line.to_string()), + |_| {}, + ); + + let output = + run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap(); + + // Verify captured output + assert_eq!(output.stdout, "line1\nline2\nline3\n"); + + // Verify callback received all lines + let lines = callback_lines.lock().unwrap(); + assert_eq!(lines.len(), 3); + assert_eq!(lines[0], "line1"); + assert_eq!(lines[1], "line2"); + assert_eq!(lines[2], "line3"); + } + + #[test] + fn test_multiline_output_capture() { + let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap(); + + assert_eq!(output.stdout, "line1\nline2\nline3\n"); + assert!(output.stderr.trim().is_empty()); + } + + #[test] + fn test_mixed_stdout_stderr_capture() { + let output = run_silent(Command::new("sh").args([ + "-c", + "echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2", + ])) + .unwrap(); + + assert!(output.stdout.contains("stdout1")); + assert!(output.stdout.contains("stdout2")); + assert!(output.stderr.contains("stderr1")); + assert!(output.stderr.contains("stderr2")); + } + + #[test] + fn test_empty_output_command() { + let output = run_silent(&mut Command::new("true")).unwrap(); + + assert!(output.stdout.is_empty()); + assert!(output.stderr.is_empty()); + assert!(output.is_success()); + } + + #[test] + fn test_command_output_format_with_empty_streams() { + let output = run_silent(&mut Command::new("true")).unwrap(); + let formatted = output.format_output(); + + assert!(formatted.contains("Stdout:")); + assert!(formatted.contains("")); + assert!(formatted.contains("Stderr:")); + } + + #[test] + fn test_error_contains_message_and_output() { + let error = CommandError { + message: "Test error".to_string(), + output: Some(CommandOutput { + stdout: "captured stdout".to_string(), + stderr: "captured stderr".to_string(), + status: CommandStatus::Success, + }), + }; + + let display = format!("{}", error); + assert!(display.contains("Test error")); + assert!(display.contains("captured stdout")); + assert!(display.contains("captured stderr")); + } + + #[test] + fn test_error_without_output() { + let error = CommandError { + message: "Spawn failed".to_string(), + output: None, + }; + + let display = format!("{}", error); + assert!(display.contains("Spawn failed")); + assert!(!display.contains("Stdout:")); + assert!(!display.contains("Stderr:")); + } +} diff --git a/harmony_execution/src/lib.rs b/harmony_execution/src/lib.rs new file mode 100644 index 00000000..65fdf663 --- /dev/null +++ b/harmony_execution/src/lib.rs @@ -0,0 +1,6 @@ +pub mod command; + +pub use command::{ + run_command, run, run_silent, + CommandOutput, CommandStatus, CommandError, RunnerOptions, +}; -- 2.39.5 From deca67fd554724a716e59caa7918c692a7a27af9 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Sun, 25 Jan 2026 22:54:14 -0500 Subject: [PATCH 06/19] feat(backend_app): Deployment now pretty much works to package and deploy an app with an existing Docker image and type-safe helm chart on local k3d, not tested for remote k8s with Argo yet --- Cargo.toml | 3 + README.md | 2 + brocade/examples/main.rs | 2 +- harmony/Cargo.toml | 1 + .../src/modules/application/backend_app.rs | 1005 ++++++++++------- harmony/src/modules/application/config.rs | 7 + harmony/src/modules/application/helm/mod.rs | 144 ++- harmony_agent/deploy/src/main.rs | 7 +- 8 files changed, 719 insertions(+), 452 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a256234f..18a0ff9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "harmony_types", "harmony_macros", "harmony_tui", + "harmony_execution", "opnsense-config", "opnsense-config-xml", "harmony_cli", @@ -17,6 +18,8 @@ members = [ "harmony_secret", "adr/agent_discovery/mdns", "brocade", + "harmony_agent", + "harmony_agent/deploy", ] [workspace.package] diff --git a/README.md b/README.md index 4ccdae73..f4f13ec2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Harmony : Open-source infrastructure orchestration that treats your platform like first-class code +In other words, Harmony is a **next-generation platform engineering framework**. + _By [NationTech](https://nationtech.io)_ [![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony) diff --git a/brocade/examples/main.rs b/brocade/examples/main.rs index 47d4a631..15513ea2 100644 --- a/brocade/examples/main.rs +++ b/brocade/examples/main.rs @@ -1,7 +1,7 @@ use std::net::{IpAddr, Ipv4Addr}; use brocade::{BrocadeOptions, ssh}; -use harmony_secret::{Secret, SecretManager}; +use harmony_secret::Secret; use harmony_types::switch::PortLocation; use serde::{Deserialize, Serialize}; diff --git a/harmony/Cargo.toml b/harmony/Cargo.toml index 634cbe96..f951a974 100644 --- a/harmony/Cargo.toml +++ b/harmony/Cargo.toml @@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" } opnsense-config-xml = { path = "../opnsense-config-xml" } harmony_macros = { path = "../harmony_macros" } harmony_types = { path = "../harmony_types" } +harmony_execution = { path = "../harmony_execution" } uuid.workspace = true url.workspace = true kube = { workspace = true, features = ["derive"] } diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs index 83e24f5b..804af46d 100644 --- a/harmony/src/modules/application/backend_app.rs +++ b/harmony/src/modules/application/backend_app.rs @@ -1,14 +1,17 @@ -use std::path::PathBuf; use async_trait::async_trait; use log::{debug, info, trace}; use serde::Serialize; +use std::path::PathBuf; use crate::{ config::{REGISTRY_PROJECT, REGISTRY_URL}, modules::application::{ - config::ApplicationNetworkPort, helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind}, Application, HelmPackage, OCICompliant + Application, HelmPackage, OCICompliant, + config::ApplicationNetworkPort, + helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind}, }, }; +use harmony_execution::{run_command, RunnerOptions}; #[derive(Debug, Clone, Serialize)] pub struct BuildCommand { @@ -95,98 +98,29 @@ impl OCICompliant for BackendApp { let dockerfile = self.get_dockerfile()?; let image_tag = self.image_name(); - let mut child = std::process::Command::new("docker") - .args([ - "build", - "-t", - &image_tag, - "-f", - &dockerfile.to_string_lossy(), - &self.project_root.to_string_lossy(), - ]) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .map_err(|e| format!("Failed to spawn docker build process: {e}"))?; + // Run docker build command, streaming output to console and capturing it + let output = run_command( + std::process::Command::new("docker") + .args([ + "build", + "-t", + &image_tag, + "-f", + &dockerfile.to_string_lossy(), + &self.project_root.to_string_lossy(), + ]), + RunnerOptions::print_to_console(), + ) + .map_err(|e| format!("Failed to spawn docker build process: {}", e))?; - let stdout = child.stdout.take().expect("Failed to capture stdout"); - let stderr = child.stderr.take().expect("Failed to capture stderr"); - - use std::io::{BufRead, BufReader}; - use std::thread; - - let stdout_reader = BufReader::new(stdout); - let stderr_reader = BufReader::new(stderr); - - let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel(); - let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel(); - - let stdout_handle = thread::spawn(move || { - let mut output = String::new(); - for line in stdout_reader.lines() { - match line { - Ok(l) => { - println!("{}", l); - output.push_str(&l); - output.push('\n'); - } - Err(e) => { - trace!("Error reading stdout line: {}", e); - } - } - } - let _ = stdout_sender.send(output); - }); - - let stderr_handle = thread::spawn(move || { - let mut output = String::new(); - for line in stderr_reader.lines() { - match line { - Ok(l) => { - eprintln!("{}", l); - output.push_str(&l); - output.push('\n'); - } - Err(e) => { - trace!("Error reading stderr line: {}", e); - } - } - } - let _ = stderr_sender.send(output); - }); - - let status = child - .wait() - .map_err(|e| format!("Failed to wait for docker build process: {e}"))?; - - let stdout_lines = stdout_handle - .join() - .map_err(|e| format!("Stdout thread panicked: {e:?}")) - .and_then(|_| { - stdout_receiver - .recv() - .map_err(|e| format!("Failed to receive stdout: {e}")) - })?; - let stderr_lines = stderr_handle - .join() - .map_err(|e| format!("Stderr thread panicked: {e:?}")) - .and_then(|_| { - stderr_receiver - .recv() - .map_err(|e| format!("Failed to receive stderr: {e}")) - })?; - - let output_content = format!( - "\n{stdout}\n\n{stderr}", - stdout = stdout_lines, - stderr = stderr_lines, - ); - match status.success() { - true => { - info!("Docker image build succeeded"); - Ok(image_tag) - } - false => Err(format!("Docker image build FAILED :{output_content}")), + if output.is_success() { + info!("Docker image build succeeded"); + Ok(image_tag) + } else { + Err(format!( + "Docker image build FAILED:\n{}", + output.format_output() + )) } } @@ -217,34 +151,22 @@ impl HelmPackage for BackendApp { async fn build_push_helm_package(&self, image_url: &str) -> Result { let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string()); - // Build the typed Deployment object using the builder - let mut deployment_builder = DeploymentBuilder::new(&self.name, image_url); - - // Add container ports - for port in &self.network_ports { - deployment_builder = deployment_builder.with_container_port( - port.number as i32, - &port.name, - port.protocol.as_str(), - ); - } - - // Add environment variables - for (key, value) in &self.env_vars { - deployment_builder = deployment_builder.with_env_var(key, value); - } - - let deployment = deployment_builder.build(); - helm_chart.add_resource(HelmResourceKind::Deployment(deployment)); + // Build the typed Deployment object using the builder with initial options + helm_chart.add_resource(HelmResourceKind::Deployment( + DeploymentBuilder::with_options( + &self.name, + image_url, + Some(self.network_ports.clone()), + Some(self.env_vars.clone()), + None, + ) + .build(), + )); // Build the typed Service object using the helper function - let network_ports: Vec<(String, u16, String)> = self - .network_ports - .iter() - .map(|p| (p.name.clone(), p.number, p.protocol.as_str().to_string())) - .collect(); - - if let Some(service) = helm::create_service_from_ports(self.name.clone(), &network_ports) { + if let Some(service) = + helm::create_service_from_ports(self.name.clone(), &self.network_ports) + { helm_chart.add_resource(HelmResourceKind::Service(service)); } @@ -259,375 +181,622 @@ impl HelmPackage for BackendApp { } } - #[cfg(test)] mod tests { use super::*; use crate::modules::application::config::ApplicationNetworkPort; use crate::modules::application::config::NetworkProtocol; + use k8s_openapi::api::apps::v1::Deployment; + use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort}; use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; use serde_yaml::from_str; - use k8s_openapi::api::core::v1::Service as K8sService; - use k8s_openapi::api::apps::v1::Deployment; use std::fs; + use std::path::Path; + use tempfile::tempdir; - fn cleanup_test_dirs(project_root: &PathBuf) { - let helm_dir = project_root.join(".harmony_generated/helm/"); - if helm_dir.exists() { - let _ = fs::remove_dir_all(&helm_dir); - } + // Test Helpers + fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService { + let path = project_root.join(format!( + ".harmony_generated/helm/{chart_name}/templates/service.yaml" + )); + let content = fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e)); + from_str(&content) + .unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e)) } - fn create_test_backend_app_with_ports() -> BackendApp { - BackendApp { - name: "test-app".to_string(), - project_root: "/tmp/test_backend".into(), - network_ports: vec![ - ApplicationNetworkPort { - number: 8080, - protocol: NetworkProtocol::TCP, - name: "http".to_string(), - }, - ApplicationNetworkPort { - number: 9000, - protocol: NetworkProtocol::TCP, - name: "metrics".to_string(), - }, - ApplicationNetworkPort { - number: 50051, - protocol: NetworkProtocol::TCP, - name: "grpc".to_string(), - }, - ], - env_vars: vec![ - ("ENV_VAR_1".to_string(), "value1".to_string()), - ("ENV_VAR_2".to_string(), "value2".to_string()), - ], - build_cmd: BuildCommand::new("cargo", vec!["build"]), - dockerfile: None, - } + fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment { + let path = project_root.join(format!( + ".harmony_generated/helm/{chart_name}/templates/deployment.yaml" + )); + let content = fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e)); + from_str(&content) + .unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e)) } - fn create_test_backend_app_no_ports() -> BackendApp { - BackendApp { - name: "test-app-no-ports".to_string(), - project_root: "/tmp/test_backend_no_ports".into(), - network_ports: vec![], - env_vars: vec![("ENV_VAR_1".to_string(), "value1".to_string())], - build_cmd: BuildCommand::new("cargo", vec!["build"]), - dockerfile: None, - } + fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool { + let path = project_root.join(format!( + ".harmony_generated/helm/{chart_name}/templates/service.yaml" + )); + path.exists() } - #[tokio::test] - async fn test_service_created_with_all_network_ports() { - let app = create_test_backend_app_with_ports(); - let test_dir = app.project_root.clone(); - - cleanup_test_dirs(&test_dir); - - let result = app - .build_push_helm_package("registry.example.com/test/test-app:1.0.0") - .await; - - assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); - - let service_yaml_path = test_dir - .join(".harmony_generated/helm/test-app/templates/service.yaml"); - assert!( - service_yaml_path.exists(), - "service.yaml should exist when there are network ports" - ); - - let service_yaml_content = fs::read_to_string(&service_yaml_path) - .expect("Failed to read service.yaml"); - - let service: K8sService = from_str(&service_yaml_content) - .expect("Failed to parse service.yaml as K8s Service"); - + // Service Assertions + fn assert_service_metadata(service: &K8sService, expected_name: &str) { assert_eq!( service.metadata.name.as_deref(), - Some("test-app"), - "Service name should match app name" - ); - assert_eq!( - service.spec.as_ref().unwrap().type_.as_deref(), - Some("ClusterIP"), - "Service type should be ClusterIP" + Some(expected_name), + "Service name should be '{expected_name}'" ); + } + fn assert_service_type(service: &K8sService, expected_type: &str) { + assert_eq!( + service.spec.as_ref().and_then(|s| s.type_.as_deref()), + Some(expected_type), + "Service type should be '{expected_type}'" + ); + } + + fn assert_service_port_count(service: &K8sService, expected_count: usize) { let ports = service .spec .as_ref() - .unwrap() - .ports - .as_ref() - .expect("Service should have ports"); - - assert_eq!(ports.len(), 3, "Service should have 3 ports"); - - let http_port = &ports[0]; - assert_eq!(http_port.name.as_deref(), Some("http"), "First port name should be 'http'"); - assert_eq!(http_port.protocol.as_deref(), Some("TCP"), "First port protocol should be 'TCP'"); - assert_eq!(http_port.port, 8080, "First port number should be 8080"); - - let metrics_port = &ports[1]; - assert_eq!(metrics_port.name.as_deref(), Some("metrics"), "Second port name should be 'metrics'"); - assert_eq!(metrics_port.protocol.as_deref(), Some("TCP"), "Second port protocol should be 'TCP'"); - assert_eq!(metrics_port.port, 9000, "Second port number should be 9000"); - - let grpc_port = &ports[2]; - assert_eq!(grpc_port.name.as_deref(), Some("grpc"), "Third port name should be 'grpc'"); - assert_eq!(grpc_port.protocol.as_deref(), Some("TCP"), "Third port protocol should be 'TCP'"); - assert_eq!(grpc_port.port, 50051, "Third port number should be 50051"); - - for port in ports.iter() { - match &port.target_port { - Some(IntOrString::Int(target)) => { - assert_eq!( - *target, port.port, - "Target port should match service port for {}", - port.name.as_deref().unwrap_or("unknown") - ); - } - _ => panic!("Target port should be Int for all ports"), - } - } - - cleanup_test_dirs(&test_dir); - } - - #[tokio::test] - async fn test_service_not_created_when_no_network_ports() { - let app = create_test_backend_app_no_ports(); - let test_dir = app.project_root.clone(); - - cleanup_test_dirs(&test_dir); - - let result = app - .build_push_helm_package("registry.example.com/test/test-app-no-ports:1.0.0") - .await; - - assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); - - let service_yaml_path = test_dir - .join(".harmony_generated/helm/test-app-no-ports/templates/service.yaml"); - assert!( - !service_yaml_path.exists(), - "service.yaml should not exist when there are no network ports" + .and_then(|s| s.ports.as_ref()) + .unwrap_or_else(|| panic!("Service should have ports")); + assert_eq!( + ports.len(), + expected_count, + "Service should have {expected_count} ports" ); - - cleanup_test_dirs(&test_dir); } - #[tokio::test] - async fn test_deployment_created_with_correct_configuration() { - let app = create_test_backend_app_with_ports(); - let test_dir = app.project_root.clone(); + fn assert_service_port( + port: &ServicePort, + expected_name: &str, + expected_protocol: &str, + expected_number: i32, + ) { + assert_eq!( + port.name.as_deref(), + Some(expected_name), + "Port name should be '{expected_name}'" + ); + assert_eq!( + port.protocol.as_deref(), + Some(expected_protocol), + "Port '{expected_name}' protocol should be '{expected_protocol}'" + ); + assert_eq!( + port.port, expected_number, + "Port '{expected_name}' number should be {expected_number}" + ); + } - cleanup_test_dirs(&test_dir); - - let result = app - .build_push_helm_package("registry.example.com/test/test-app:1.0.0") - .await; - - assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); - - let deployment_yaml_path = test_dir - .join(".harmony_generated/helm/test-app/templates/deployment.yaml"); - assert!(deployment_yaml_path.exists(), "deployment.yaml should exist"); - - let deployment_yaml_content = fs::read_to_string(&deployment_yaml_path) - .expect("Failed to read deployment.yaml"); - - let deployment: Deployment = from_str(&deployment_yaml_content) - .expect("Failed to parse deployment.yaml as K8s Deployment"); + fn assert_target_port_matches_service_port(port: &ServicePort) { + match &port.target_port { + Some(IntOrString::Int(target)) => { + assert_eq!( + *target, + port.port, + "Target port should match service port for '{}'", + port.name.as_deref().unwrap_or("unknown") + ); + } + _ => panic!( + "Target port should be Int for '{}'", + port.name.as_deref().unwrap_or("unknown") + ), + } + } + // Deployment Assertions + fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) { assert_eq!( deployment.metadata.name.as_deref(), - Some("test-app"), - "Deployment name should match app name" + Some(expected_name), + "Deployment name should be '{expected_name}'" ); + } - let deployment_spec = deployment + fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) { + let spec = deployment .spec .as_ref() - .expect("Deployment should have spec"); - assert_eq!(deployment_spec.replicas, Some(1), "Replicas should be 1"); - - let selector = &deployment_spec.selector; + .unwrap_or_else(|| panic!("Deployment should have spec")); assert_eq!( - selector.match_labels.as_ref().unwrap().get("app.kubernetes.io/name"), - Some(&"test-app".to_string()), - "Selector should match app name" + spec.replicas, + Some(expected_replicas), + "Deployment should have {expected_replicas} replicas" ); + } - let pod_spec = deployment_spec - .template + fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) { + let spec = deployment .spec .as_ref() - .expect("Pod template should have spec"); - - assert_eq!(pod_spec.containers.len(), 1, "Should have exactly one container"); - - let container = &pod_spec.containers[0]; - assert_eq!(container.name, "test-app", "Container name should match app name"); + .unwrap_or_else(|| panic!("Deployment should have spec")); assert_eq!( - container.image.as_deref(), - Some("registry.example.com/test/test-app:1.0.0"), - "Container image should match provided image URL" - ); - assert_eq!( - container.image_pull_policy.as_deref(), - Some("IfNotPresent"), - "Image pull policy should be IfNotPresent" + spec.selector + .match_labels + .as_ref() + .and_then(|m| m.get("app.kubernetes.io/name")), + Some(&expected_label_value.to_string()), + "Selector should match app name '{expected_label_value}'" ); + } - let container_ports = container - .ports + fn assert_pod_labels(deployment: &Deployment, expected_name: &str) { + let spec = deployment + .spec .as_ref() - .expect("Container should have ports"); - assert_eq!(container_ports.len(), 3, "Container should have 3 ports"); - - assert_eq!(container_ports[0].container_port, 8080, "First container port should be 8080"); - assert_eq!(container_ports[0].name.as_deref(), Some("http"), "First container port name should be 'http'"); - assert_eq!(container_ports[0].protocol.as_deref(), Some("TCP"), "First container port protocol should be 'TCP'"); - - assert_eq!(container_ports[1].container_port, 9000, "Second container port should be 9000"); - assert_eq!(container_ports[1].name.as_deref(), Some("metrics"), "Second container port name should be 'metrics'"); - assert_eq!(container_ports[1].protocol.as_deref(), Some("TCP"), "Second container port protocol should be 'TCP'"); - - assert_eq!(container_ports[2].container_port, 50051, "Third container port should be 50051"); - assert_eq!(container_ports[2].name.as_deref(), Some("grpc"), "Third container port name should be 'grpc'"); - assert_eq!(container_ports[2].protocol.as_deref(), Some("TCP"), "Third container port protocol should be 'TCP'"); - - let env_vars = container.env.as_ref().expect("Container should have env vars"); - assert_eq!(env_vars.len(), 2, "Container should have 2 env vars"); - - let env_map: std::collections::HashMap = env_vars - .iter() - .map(|e| (e.name.clone(), e.value.clone().unwrap_or_default())) - .collect(); - - assert_eq!( - env_map.get("ENV_VAR_1"), - Some(&"value1".to_string()), - "ENV_VAR_1 should have correct value" - ); - assert_eq!( - env_map.get("ENV_VAR_2"), - Some(&"value2".to_string()), - "ENV_VAR_2 should have correct value" - ); - - let pod_labels = deployment_spec + .unwrap_or_else(|| panic!("Deployment should have spec")); + let metadata = spec .template .metadata .as_ref() - .expect("Pod template should have metadata") + .unwrap_or_else(|| panic!("Pod template should have metadata")); + let labels = metadata .labels .as_ref() - .expect("Pod should have labels"); + .unwrap_or_else(|| panic!("Pod should have labels")); assert_eq!( - pod_labels.get("app.kubernetes.io/name"), - Some(&"test-app".to_string()), - "Pod should have correct app label" + labels.get("app.kubernetes.io/name"), + Some(&expected_name.to_string()), + "Pod label app.kubernetes.io/name should be '{expected_name}'" ); assert_eq!( - pod_labels.get("app.kubernetes.io/instance"), - Some(&"test-app".to_string()), - "Pod should have correct instance label" + labels.get("app.kubernetes.io/instance"), + Some(&expected_name.to_string()), + "Pod label app.kubernetes.io/instance should be '{expected_name}'" ); + } - cleanup_test_dirs(&test_dir); + // Container Assertions + fn assert_container_metadata( + container: &Container, + expected_name: &str, + expected_image: &str, + expected_pull_policy: &str, + ) { + assert_eq!( + container.name, expected_name, + "Container name should be '{expected_name}'" + ); + assert_eq!( + container.image.as_deref(), + Some(expected_image), + "Container image should be '{expected_image}'" + ); + assert_eq!( + container.image_pull_policy.as_deref(), + Some(expected_pull_policy), + "Image pull policy should be '{expected_pull_policy}'" + ); + } + + fn assert_container_ports_count(container: &Container, expected_count: usize) { + let ports = container + .ports + .as_ref() + .unwrap_or_else(|| panic!("Container should have ports")); + assert_eq!( + ports.len(), + expected_count, + "Container should have {expected_count} ports" + ); + } + + fn assert_container_port( + port: &k8s_openapi::api::core::v1::ContainerPort, + expected_name: &str, + expected_protocol: &str, + expected_number: i32, + ) { + assert_eq!( + port.name.as_deref(), + Some(expected_name), + "Container port name should be '{expected_name}'" + ); + assert_eq!( + port.protocol.as_deref(), + Some(expected_protocol), + "Container port '{expected_name}' protocol should be '{expected_protocol}'" + ); + assert_eq!( + port.container_port, expected_number, + "Container port '{expected_name}' number should be {expected_number}" + ); + } + + fn assert_container_env_vars_count(container: &Container, expected_count: usize) { + let env_vars = container + .env + .as_ref() + .unwrap_or_else(|| panic!("Container should have env vars")); + assert_eq!( + env_vars.len(), + expected_count, + "Container should have {expected_count} env vars" + ); + } + + fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) { + assert_eq!( + env_var.name, expected_name, + "Env var name should be '{expected_name}'" + ); + assert_eq!( + env_var.value.as_deref(), + Some(expected_value), + "Env var '{expected_name}' value should be '{expected_value}'" + ); + } + + fn get_container(deployment: &Deployment) -> Container { + let spec = deployment + .spec + .as_ref() + .unwrap_or_else(|| panic!("Deployment should have spec")); + let pod_spec = spec + .template + .spec + .as_ref() + .unwrap_or_else(|| panic!("Pod template should have spec")); + pod_spec + .containers + .first() + .unwrap_or_else(|| panic!("Should have exactly one container")) + .clone() + } + + // Test Fixtures + fn standard_test_ports() -> Vec { + vec![ + ApplicationNetworkPort { + number: 8080, + protocol: NetworkProtocol::TCP, + name: "http".to_string(), + }, + ApplicationNetworkPort { + number: 9000, + protocol: NetworkProtocol::TCP, + name: "metrics".to_string(), + }, + ApplicationNetworkPort { + number: 50051, + protocol: NetworkProtocol::TCP, + name: "grpc".to_string(), + }, + ] + } + + fn standard_test_env_vars() -> Vec<(String, String)> { + vec![ + ("ENV_VAR_1".to_string(), "value1".to_string()), + ("ENV_VAR_2".to_string(), "value2".to_string()), + ] + } + + fn udp_test_ports() -> Vec { + vec![ + ApplicationNetworkPort { + number: 53, + protocol: NetworkProtocol::UDP, + name: "dns".to_string(), + }, + ApplicationNetworkPort { + number: 8080, + protocol: NetworkProtocol::TCP, + name: "http".to_string(), + }, + ] + } + + // Test Builder + struct BackendAppTestBuilder { + name: Option, + network_ports: Option>, + env_vars: Option>, + } + + impl BackendAppTestBuilder { + fn new() -> Self { + Self { + name: None, + network_ports: None, + env_vars: None, + } + } + + fn with_name(mut self, name: impl Into) -> Self { + self.name = Some(name.into()); + self + } + + fn with_standard_ports(mut self) -> Self { + self.network_ports = Some(standard_test_ports()); + self + } + + fn with_udp_ports(mut self) -> Self { + self.network_ports = Some(udp_test_ports()); + self + } + + fn with_standard_env_vars(mut self) -> Self { + self.env_vars = Some(standard_test_env_vars()); + self + } + + fn with_no_ports(mut self) -> Self { + self.network_ports = Some(vec![]); + self + } + + fn build(self, project_root: PathBuf) -> BackendApp { + BackendApp { + name: self.name.unwrap_or_else(|| "test-app".to_string()), + project_root, + network_ports: self.network_ports.unwrap_or_default(), + env_vars: self.env_vars.unwrap_or_default(), + build_cmd: BuildCommand::new("cargo", vec!["build"]), + dockerfile: None, + } + } + } + + impl Default for BackendAppTestBuilder { + fn default() -> Self { + Self::new() + } + } + + // Helper function for test setup + async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) { + let result = app.build_push_helm_package(image_url).await; + assert!( + result.is_ok(), + "build_push_helm_package should succeed: {:?}", + result + ); + } + + // ===== SERVICE TESTS ===== + + #[tokio::test] + async fn service_is_created_with_application_name() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "test-app"); + assert_service_metadata(&service, "test-app"); } #[tokio::test] - async fn test_service_with_udp_protocol() { - let app = BackendApp { - name: "udp-app".to_string(), - project_root: "/tmp/test_udp".into(), - network_ports: vec![ - ApplicationNetworkPort { - number: 53, - protocol: NetworkProtocol::UDP, - name: "dns".to_string(), - }, - ApplicationNetworkPort { - number: 8080, - protocol: NetworkProtocol::TCP, - name: "http".to_string(), - }, - ], - env_vars: vec![], - build_cmd: BuildCommand::new("cargo", vec!["build"]), - dockerfile: None, - }; - let test_dir = app.project_root.clone(); + async fn service_has_default_clusterip_type() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); - cleanup_test_dirs(&test_dir); + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; - let result = app - .build_push_helm_package("registry.example.com/test/udp-app:1.0.0") - .await; - - assert!(result.is_ok(), "build_push_helm_package should succeed: {:?}", result); - - let service_yaml_path = test_dir - .join(".harmony_generated/helm/udp-app/templates/service.yaml"); - assert!(service_yaml_path.exists(), "service.yaml should exist"); - - let service_yaml_content = fs::read_to_string(&service_yaml_path) - .expect("Failed to read service.yaml"); - - let service: K8sService = from_str(&service_yaml_content) - .expect("Failed to parse service.yaml as K8s Service"); - - let ports = service - .spec - .as_ref() - .unwrap() - .ports - .as_ref() - .expect("Service should have ports"); - - assert_eq!(ports.len(), 2, "Service should have 2 ports"); - - let dns_port = &ports[0]; - assert_eq!(dns_port.name.as_deref(), Some("dns"), "DNS port name should be 'dns'"); - assert_eq!( - dns_port.protocol.as_deref(), - Some("UDP"), - "DNS port protocol should be 'UDP'" - ); - assert_eq!(dns_port.port, 53, "DNS port number should be 53"); - - let http_port = &ports[1]; - assert_eq!(http_port.name.as_deref(), Some("http"), "HTTP port name should be 'http'"); - assert_eq!( - http_port.protocol.as_deref(), - Some("TCP"), - "HTTP port protocol should be 'TCP'" - ); - assert_eq!(http_port.port, 8080, "HTTP port number should be 8080"); - - cleanup_test_dirs(&test_dir); + let service = read_service_yaml(&app.project_root, "test-app"); + assert_service_type(&service, "ClusterIP"); } + #[tokio::test] + async fn service_exposes_all_network_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "test-app"); + assert_service_port_count(&service, 3); + + let ports = service.spec.unwrap().ports.unwrap(); + assert_service_port(&ports[0], "http", "TCP", 8080); + assert_service_port(&ports[1], "metrics", "TCP", 9000); + assert_service_port(&ports[2], "grpc", "TCP", 50051); + } + + #[tokio::test] + async fn service_target_ports_match_service_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "test-app"); + let ports = service.spec.unwrap().ports.unwrap(); + + for port in &ports { + assert_target_port_matches_service_port(port); + } + } + + #[tokio::test] + async fn service_not_created_when_application_has_no_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app-no-ports") + .with_no_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await; + + assert!( + !service_yaml_exists(&app.project_root, "test-app-no-ports"), + "service.yaml should not exist when there are no network ports" + ); + } + + #[tokio::test] + async fn service_respects_port_protocol_type() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("udp-app") + .with_udp_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "udp-app"); + let ports = service.spec.unwrap().ports.unwrap(); + + assert_service_port(&ports[0], "dns", "UDP", 53); + assert_service_port(&ports[1], "http", "TCP", 8080); + } + + // ===== DEPLOYMENT METADATA TESTS ===== + + #[tokio::test] + async fn deployment_has_application_name() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_deployment_metadata(&deployment, "test-app"); + } + + #[tokio::test] + async fn deployment_has_single_replica_by_default() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_deployment_replicas(&deployment, 1); + } + + #[tokio::test] + async fn deployment_selector_matches_application_name() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_selector_match_label(&deployment, "test-app"); + } + + #[tokio::test] + async fn pod_has_standard_kubernetes_labels() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_pod_labels(&deployment, "test-app"); + } + + // ===== CONTAINER CONFIGURATION TESTS ===== + + #[tokio::test] + async fn container_has_correct_name_and_image() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + let image_url = "registry.example.com/test/test-app:1.0.0"; + build_helm_chart_for_test(&app, image_url).await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + let container = get_container(&deployment); + + assert_container_metadata(&container, "test-app", image_url, "IfNotPresent"); + } + + #[tokio::test] + async fn container_exposes_all_application_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + let container = get_container(&deployment); + + assert_container_ports_count(&container, 3); + + let ports = container.ports.unwrap(); + assert_container_port(&ports[0], "http", "TCP", 8080); + assert_container_port(&ports[1], "metrics", "TCP", 9000); + assert_container_port(&ports[2], "grpc", "TCP", 50051); + } + + #[tokio::test] + async fn container_has_all_environment_variables() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .with_standard_env_vars() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + let container = get_container(&deployment); + + assert_container_env_vars_count(&container, 2); + + let env_vars = container.env.unwrap(); + assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1"); + assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2"); + } + + // ===== BUILD COMMAND UNIT TESTS ===== + #[test] - fn test_build_command_creation() { + fn build_command_creation_sets_program_and_args() { let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]); assert_eq!(cmd.program, "docker"); assert_eq!(cmd.args, vec!["build", "-t", "myimage"]); } #[test] - fn test_build_command_clone() { + fn build_command_clone_copies_all_fields() { let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]); let cmd2 = cmd1.clone(); assert_eq!(cmd1.program, cmd2.program); assert_eq!(cmd1.args, cmd2.args); } } - diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs index c01ebaba..8d074271 100644 --- a/harmony/src/modules/application/config.rs +++ b/harmony/src/modules/application/config.rs @@ -15,6 +15,13 @@ impl NetworkProtocol { } } + +impl std::fmt::Display for NetworkProtocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + #[derive(Debug, Clone, Serialize)] pub struct ApplicationNetworkPort { pub number: u16, diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs index fd14d1e7..6b73b087 100644 --- a/harmony/src/modules/application/helm/mod.rs +++ b/harmony/src/modules/application/helm/mod.rs @@ -1,12 +1,16 @@ -use k8s_openapi::api::{ +// Re-export common Kubernetes types for convenience +pub use k8s_openapi::api::{ apps::v1::{Deployment, DeploymentSpec}, core::v1::{ - Container, ContainerPort, EnvVar, PodSpec, - PodTemplateSpec, Service as K8sService, ServicePort, ServiceSpec, + Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService, + ServicePort, ServiceSpec, }, }; +use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; use kube::core::ObjectMeta; -use serde::Serialize; + +// Import domain types for the deployment builder +use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol}; use std::fs; use std::path::{Path, PathBuf}; @@ -124,7 +128,8 @@ impl HelmChart { // 3. Serialize and write all added resources (Deployment, Service, etc.) for resource in &self.resources { let filename = resource.filename(); - let content = resource.serialize_to_yaml() + let content = resource + .serialize_to_yaml() .map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?; fs::write(templates_dir.join(filename), content)?; } @@ -133,7 +138,6 @@ impl HelmChart { } } - use askama::Template; #[derive(Template)] @@ -168,7 +172,12 @@ impl ServiceBuilder { self } - pub fn with_port(mut self, name: impl Into, port: i32, protocol: impl Into) -> Self { + pub fn with_port( + mut self, + name: impl Into, + port: i32, + protocol: impl Into, + ) -> Self { use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; self.ports.push(ServicePort { name: Some(name.into()), @@ -192,8 +201,14 @@ impl ServiceBuilder { labels: Some( [ ("app.kubernetes.io/name".to_string(), self.name.clone()), - ("app.kubernetes.io/component".to_string(), "service".to_string()), - ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()), + ( + "app.kubernetes.io/component".to_string(), + "service".to_string(), + ), + ( + "app.kubernetes.io/managed-by".to_string(), + "harmony".to_string(), + ), ] .into(), ), @@ -201,8 +216,14 @@ impl ServiceBuilder { }, spec: Some(ServiceSpec { type_: Some(self.service_type), - selector: Some([("app.kubernetes.io/name".to_string(), self.selector_label)].into()), - ports: if self.ports.is_empty() { None } else { Some(self.ports) }, + selector: Some( + [("app.kubernetes.io/name".to_string(), self.selector_label)].into(), + ), + ports: if self.ports.is_empty() { + None + } else { + Some(self.ports) + }, ..Default::default() }), ..Default::default() @@ -221,13 +242,53 @@ pub struct DeploymentBuilder { } impl DeploymentBuilder { + /// Create a new DeploymentBuilder with minimal required fields. pub fn new(name: impl Into, image: impl Into) -> Self { + Self::with_options(name, image, None, None, None) + } + + /// Create a new DeploymentBuilder with optional initial configuration. + /// + /// Arguments: + /// - `name`: The deployment name + /// - `image`: The container image to use + /// - `ports`: Optional vector of initial application network ports + /// - `env_vars`: Optional vector of initial environment variable key-value pairs + /// - `replicas`: Optional number of replicas (defaults to 1) + pub fn with_options( + name: impl Into, + image: impl Into, + ports: Option>, + env_vars: Option>, + replicas: Option, + ) -> Self { + let container_ports: Vec = ports + .unwrap_or_default() + .into_iter() + .map(|port| ContainerPort { + container_port: port.number as i32, + name: Some(port.name), + protocol: Some(port.protocol.to_string()), + ..Default::default() + }) + .collect(); + + let k8s_env_vars: Vec = env_vars + .unwrap_or_default() + .into_iter() + .map(|(key, value)| EnvVar { + name: key, + value: Some(value), + ..Default::default() + }) + .collect(); + Self { name: name.into(), image: image.into(), - replicas: 1, - container_ports: Vec::new(), - env_vars: Vec::new(), + replicas: replicas.unwrap_or(1), + container_ports, + env_vars: k8s_env_vars, image_pull_policy: Some("IfNotPresent".to_string()), } } @@ -237,7 +298,12 @@ impl DeploymentBuilder { self } - pub fn with_container_port(mut self, number: i32, name: impl Into, protocol: impl Into) -> Self { + pub fn with_container_port( + mut self, + number: i32, + name: impl Into, + protocol: impl Into, + ) -> Self { self.container_ports.push(ContainerPort { container_port: number, name: Some(name.into()), @@ -269,8 +335,14 @@ impl DeploymentBuilder { labels: Some( [ ("app.kubernetes.io/name".to_string(), name.clone()), - ("app.kubernetes.io/component".to_string(), "deployment".to_string()), - ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()), + ( + "app.kubernetes.io/component".to_string(), + "deployment".to_string(), + ), + ( + "app.kubernetes.io/managed-by".to_string(), + "harmony".to_string(), + ), ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()), ] .into(), @@ -280,7 +352,9 @@ impl DeploymentBuilder { spec: Some(DeploymentSpec { replicas: Some(self.replicas), selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector { - match_labels: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()), + match_labels: Some( + [("app.kubernetes.io/name".to_string(), name.clone())].into(), + ), ..Default::default() }, template: PodTemplateSpec { @@ -304,7 +378,11 @@ impl DeploymentBuilder { } else { Some(self.container_ports) }, - env: if self.env_vars.is_empty() { None } else { Some(self.env_vars) }, + env: if self.env_vars.is_empty() { + None + } else { + Some(self.env_vars) + }, ..Default::default() }], ..Default::default() @@ -321,21 +399,19 @@ impl DeploymentBuilder { /// Returns `None` if no ports are provided. pub fn create_service_from_ports( name: String, - network_ports: &[(String, u16, String)], // (name, number, protocol) + network_ports: &[ApplicationNetworkPort], ) -> Option { - use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString; - if network_ports.is_empty() { return None; } let ports: Vec = network_ports - .iter() - .map(|(port_name, number, protocol)| ServicePort { - name: Some(port_name.clone()), - protocol: Some(protocol.clone()), - port: *number as i32, - target_port: Some(IntOrString::Int(*number as i32)), + .into_iter() + .map(|port| ServicePort { + name: Some(port.name.clone()), + protocol: Some(port.protocol.to_string()), + port: port.number as i32, + target_port: Some(IntOrString::Int(port.number as i32)), ..Default::default() }) .collect(); @@ -346,8 +422,14 @@ pub fn create_service_from_ports( labels: Some( [ ("app.kubernetes.io/name".to_string(), name.clone()), - ("app.kubernetes.io/component".to_string(), "service".to_string()), - ("app.kubernetes.io/managed-by".to_string(), "harmony".to_string()), + ( + "app.kubernetes.io/component".to_string(), + "service".to_string(), + ), + ( + "app.kubernetes.io/managed-by".to_string(), + "harmony".to_string(), + ), ] .into(), ), @@ -355,7 +437,7 @@ pub fn create_service_from_ports( }, spec: Some(ServiceSpec { type_: Some("ClusterIP".to_string()), - selector: Some([("app.kubernetes.io/name".to_string(), name.clone())].into()), + selector: Some([("app.kubernetes.io/name".to_string(), name)].into()), ports: Some(ports), ..Default::default() }), diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs index 82fdd15a..84424cd4 100644 --- a/harmony_agent/deploy/src/main.rs +++ b/harmony_agent/deploy/src/main.rs @@ -18,8 +18,11 @@ use std::{path::PathBuf, sync::Arc}; async fn main() { let application = Arc::new(BackendApp { name: "harmony-agent".to_string(), - // This means the script will be run from the harmony_agent directory, not from the - // deploy directory + // Since harmony_agent is part of the harmony workspace, the actual "project root" + // is not harmony_agent folder but the workspace root. + // + // So using ../ here means we MUST run this deployment script from the harmony_agent + // folder project_root: PathBuf::from("../"), network_ports: vec![], env_vars: vec![], -- 2.39.5 From 0cff1e0f6608b1c7d33de7271014510047ad10df Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Fri, 30 Jan 2026 06:58:03 -0500 Subject: [PATCH 07/19] feat: Harmony agent new algorithm based on heartbeat counters basics. Old code will need to be refactored completely --- harmony_agent/Cargo.toml | 1 + harmony_agent/deploy/src/main.rs | 9 +- harmony_agent/src/agent.rs | 89 +++++++++++---- harmony_agent/src/config.rs | 72 ++++++++++-- harmony_agent/src/main.rs | 187 ++++++++++++++++++++++++++++++- 5 files changed, 321 insertions(+), 37 deletions(-) diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml index 360e26e4..eee7bf11 100644 --- a/harmony_agent/Cargo.toml +++ b/harmony_agent/Cargo.toml @@ -20,3 +20,4 @@ async-trait = "0.1" serde.workspace = true serde_json.workspace = true +getrandom = "0.3.4" diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs index 84424cd4..8baab66b 100644 --- a/harmony_agent/deploy/src/main.rs +++ b/harmony_agent/deploy/src/main.rs @@ -18,14 +18,19 @@ use std::{path::PathBuf, sync::Arc}; async fn main() { let application = Arc::new(BackendApp { name: "harmony-agent".to_string(), - // Since harmony_agent is part of the harmony workspace, the actual "project root" + // Since harmony_agent is part of the harmony workspace, the actual "project root" // is not harmony_agent folder but the workspace root. // // So using ../ here means we MUST run this deployment script from the harmony_agent // folder project_root: PathBuf::from("../"), network_ports: vec![], - env_vars: vec![], + env_vars: vec![ + ("NATS_URL".to_string(), "nats://nats".to_string()), + ("DESIRED_PRIMARY".to_string(), "site-1".to_string()), + ("MY_CLUSTER_ID".to_string(), "site-1".to_string()), + ("NATS_CREDS_PATH".to_string(), "".to_string()), + ], build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]), dockerfile: Some(PathBuf::from("Dockerfile")), }); diff --git a/harmony_agent/src/agent.rs b/harmony_agent/src/agent.rs index eafc83e2..14384107 100644 --- a/harmony_agent/src/agent.rs +++ b/harmony_agent/src/agent.rs @@ -1,20 +1,29 @@ +use async_nats::jetstream::kv::Store; use async_trait::async_trait; -use log::{debug, error, info}; +use harmony_types::id::Id; +use log::{debug, error, info, trace}; use serde::{Deserialize, Serialize}; use std::time::{SystemTime, UNIX_EPOCH}; -use harmony_types::id::Id; -use async_nats::jetstream::kv::Store; use crate::config::AgentConfig; #[async_trait] pub trait HealthStore: Send + Sync { - async fn put(&self, key: String, value: Vec) -> Result>; + async fn put( + &self, + key: String, + value: Vec, + ) -> Result>; } #[async_trait] impl HealthStore for Store { - async fn put(&self, key: String, value: Vec) -> Result> { + async fn put( + &self, + key: String, + value: Vec, + ) -> Result> { + trace!("HealthStore::put key={} value_len={}", key, value.len()); self.put(key, value.into()) .await .map_err(|e| Box::new(e) as Box) @@ -30,31 +39,49 @@ pub struct AgentHeartbeat { pub struct HarmonyAgent { config: AgentConfig, - #[allow(dead_code)] nats_client: Option, health_kv: Box, } - impl HarmonyAgent { pub async fn new(config: AgentConfig) -> Result> { + info!("Initializing HarmonyAgent"); + info!(" nats_url: {}", config.nats_url); + info!(" my_cluster_id: {}", config.my_cluster_id); + info!(" desired_primary: {}", config.desired_primary); + info!(" heartbeat_interval: {:?}", config.heartbeat_interval); + info!(" nats_creds_path: {:?}", config.nats_creds_path); + debug!("Full Bootstrap configuration:\n{config:#?}"); + let mut options = async_nats::ConnectOptions::new(); - if let Some(ref creds) = config.nats_creds_path { + if let Some(creds) = &config.nats_creds_path { + debug!("Loading NATS credentials from file: {}", creds); options = options.credentials_file(creds).await?; } + debug!("Connecting to nats"); let client = async_nats::connect_with_options(&config.nats_url, options).await?; + info!("Successfully connected to NATS at {}", config.nats_url); let jetstream = async_nats::jetstream::new(client.clone()); // Initialize KV Buckets as per ADR-017 const HEARTBEAT_KV_HISTORY_SIZE: i64 = 64; + debug!("Creating health KV bucket: harmony_agent_health"); let health_kv = jetstream .create_key_value(async_nats::jetstream::kv::Config { bucket: "harmony_agent_health".to_string(), history: HEARTBEAT_KV_HISTORY_SIZE, ..Default::default() }) - .await?; + .await + .map_err(|e| { + error!( + "Failed to initialize health KV bucket 'harmony_agent_health': {}", + e + ); + e + })?; + info!("Successfully initialized health KV bucket: harmony_agent_health"); Ok(Self { config, @@ -63,18 +90,25 @@ impl HarmonyAgent { }) } - pub async fn run_heartbeat_loop(&self) -> Result<(), Box> { let mut interval = tokio::time::interval(self.config.heartbeat_interval); let key = format!("heartbeat.{}", self.config.my_cluster_id); - info!("Starting heartbeat loop for cluster: {}", self.config.my_cluster_id); + info!( + "Starting heartbeat loop for cluster: {}", + self.config.my_cluster_id + ); loop { interval.tick().await; + trace!("Heartbeat loop tick"); let now = SystemTime::now() - .duration_since(UNIX_EPOCH)? + .duration_since(UNIX_EPOCH) + .map_err(|e| { + error!("Failed to get system time for heartbeat: {}", e); + e + })? .as_millis() as u64; let heartbeat = AgentHeartbeat { @@ -83,20 +117,28 @@ impl HarmonyAgent { timestamp: now, }; - debug!("Sending heartbeat for cluster: {}", self.config.my_cluster_id); + debug!( + "Sending heartbeat for cluster: {}", + self.config.my_cluster_id + ); let payload = serde_json::to_vec(&heartbeat)?; // Write heartbeat to KV. ADR-017: Write failure triggers self-demotion logic match self.health_kv.put(key.clone(), payload).await { Ok(_) => { - debug!("Heartbeat successful for cluster: {}", self.config.my_cluster_id); + debug!( + "Heartbeat successful for cluster: {}", + self.config.my_cluster_id + ); } Err(e) => { - error!("Failed to write heartbeat: {}. Fencing logic would trigger here.", e); + error!( + "Failed to write heartbeat: {}. Fencing logic would trigger here.", + e + ); // In a real implementation, we would trigger self-demotion/fencing here } } - } } } @@ -105,7 +147,7 @@ impl HarmonyAgent { mod tests { use super::*; use std::sync::{Arc, Mutex}; - use tokio::time::{pause, advance, Duration}; + use tokio::time::{Duration, advance}; struct MockHealthStore { puts: Arc)>>>, @@ -113,7 +155,11 @@ mod tests { #[async_trait] impl HealthStore for MockHealthStore { - async fn put(&self, key: String, value: Vec) -> Result> { + async fn put( + &self, + key: String, + value: Vec, + ) -> Result> { self.puts.lock().unwrap().push((key, value)); Ok(0) } @@ -150,7 +196,11 @@ mod tests { } let recorded_puts = puts.lock().unwrap(); - assert!(recorded_puts.len() >= 2, "Should have recorded at least 2 heartbeats, got {}", recorded_puts.len()); + assert!( + recorded_puts.len() >= 2, + "Should have recorded at least 2 heartbeats, got {}", + recorded_puts.len() + ); let (key, payload) = &recorded_puts[0]; assert_eq!(key, "heartbeat.test-cluster"); @@ -162,4 +212,3 @@ mod tests { handle.abort(); } } - diff --git a/harmony_agent/src/config.rs b/harmony_agent/src/config.rs index cf5fe128..394a774d 100644 --- a/harmony_agent/src/config.rs +++ b/harmony_agent/src/config.rs @@ -1,6 +1,8 @@ -use std::env; -use std::time::Duration; use harmony_types::id::Id; +use log::debug; +use std::env; +use std::path::Path; +use std::time::Duration; /// Configuration for the Harmony Agent #[derive(Debug, Clone)] @@ -12,18 +14,70 @@ pub struct AgentConfig { pub heartbeat_interval: Duration, } +pub const NATS_URL: &str = "NATS_URL"; +pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY"; +pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID"; +pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH"; + impl AgentConfig { pub fn load_from_env() -> Result { - let nats_url = env::var("NATS_URL") - .unwrap_or_else(|_| "nats://localhost:4222".to_string()); + let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string()); - let nats_creds_path = env::var("NATS_CREDS_PATH").ok(); + // Validate NATS URL is not empty + if nats_url.is_empty() { + return Err(format!("{NATS_URL} cannot be empty")); + } - let my_cluster_id_str = env::var("MY_CLUSTER_ID") - .map_err(|_| "Environment variable MY_CLUSTER_ID is required".to_string())?; + // Validate NATS URL format + if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") { + return Err(format!( + "Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'", + nats_url + )); + } - let desired_primary_str = env::var("DESIRED_PRIMARY") - .map_err(|_| "Environment variable DESIRED_PRIMARY is required".to_string())?; + let nats_creds_path = env::var(NATS_CREDS_PATH) + .ok() + .filter(|creds_path| !creds_path.is_empty()); + + // Validate NATS creds path if provided + if let Some(creds_path) = &nats_creds_path { + debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}"); + let path = Path::new(creds_path); + if !path.exists() { + return Err(format!( + "NATS credentials file does not exist: {}", + creds_path + )); + } + if !path.is_file() { + return Err(format!( + "NATS credentials path is not a file: {}", + creds_path + )); + } + // Check if file is readable by attempting to read metadata + if std::fs::metadata(path).is_err() { + return Err(format!( + "NATS credentials file is not readable: {}", + creds_path + )); + } + } + + let my_cluster_id_str = env::var(MY_CLUSTER_ID) + .map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?; + + if my_cluster_id_str.is_empty() { + return Err(format!("{MY_CLUSTER_ID} cannot be empty")); + } + + let desired_primary_str = env::var(DESIRED_PRIMARY) + .map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?; + + if desired_primary_str.is_empty() { + return Err(format!("{DESIRED_PRIMARY} cannot be empty")); + } Ok(Self { nats_url, diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs index a67e5b99..3664b65c 100644 --- a/harmony_agent/src/main.rs +++ b/harmony_agent/src/main.rs @@ -1,24 +1,199 @@ -use crate::{agent::HarmonyAgent, config::AgentConfig}; +use std::{str::FromStr, time::Duration}; + +use harmony_types::id::Id; +use log::{debug, info}; +use tokio::time::Instant; + +// use crate::{agent::HarmonyAgent, config::AgentConfig}; mod agent; mod config; +// #[tokio::main] +// async fn main() -> Result<(), Box> { +// env_logger::init(); +// +// let config = AgentConfig::load_from_env()?; +// +// log::info!("Harmony Agent Initialized"); +// log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id); +// log::debug!("NATS URL : {}", config.nats_url); +// +// let agent = HarmonyAgent::new(config).await?; +// +// // Run the heartbeat loop +// agent.run_heartbeat_loop().await?; +// +// Ok(()) +// } #[tokio::main] async fn main() -> Result<(), Box> { env_logger::init(); - let config = AgentConfig::load_from_env()?; + let my_agent_id = Id::from_str("agent_1").unwrap(); + + let config = AgentConfig { + success_threshold: 5, + failure_threshold: 10, + heartbeat_interval: Duration::from_secs(1), + deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig { + desired_primary_agent: my_agent_id, + cnpg_cluster_name: String::from("cnpg_cluster_name"), + }), + nats_url: String::new(), + nats_creds_path: None, + agent_id: Id::empty(), + }; log::info!("Harmony Agent Initialized"); - log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id); + log::debug!("Identity (My Cluster ID): {}", config.agent_id); log::debug!("NATS URL : {}", config.nats_url); - let agent = HarmonyAgent::new(config).await?; - + let agent = HarmonyAgent { config }; + // Run the heartbeat loop - agent.run_heartbeat_loop().await?; + agent.run_heartbeat_loop().await; Ok(()) } +#[derive(Debug, Clone)] +pub struct AgentConfig { + /// Number of consecutive successful heartbeats required before the service transitions from + /// failed to healthy. + pub success_threshold: usize, + /// Number of consecutive failed heartbeats required before the service transitions from + /// healthy to failed. + pub failure_threshold: usize, + /// Time between each heartbeat. If a heartbeat takes longer than this, it will be + /// considered failed. + pub heartbeat_interval: Duration, + /// **UNSTABLE FIELD** + /// + /// For now, an agent instance only serves one deployment. This is probably fine as an agent's + /// footprint is low, but managing multiple deployments in a single instance would be a + /// significant resource usage reduction. + /// + /// Decoupling the deployment of the agent with the application's deployment could make things + /// more complicated though, where we would have to be careful about version compatibility + /// between all components managed by the agent instance. So for now it is a 1-1 map. + /// + /// But I have a feeling this could change so I am marking this field unstable to warn you, the + /// reader. + pub deployment_config_unstable: DeploymentConfig, + pub nats_url: String, + pub nats_creds_path: Option, + pub agent_id: Id, +} + +#[derive(Debug, Clone)] +pub enum DeploymentConfig { + FailoverPostgreSQL(FailoverCNPGConfig), +} + +#[derive(Debug, Clone)] +pub struct FailoverCNPGConfig { + pub desired_primary_agent: Id, + pub cnpg_cluster_name: String, +} + +impl DeploymentConfig { + /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres) + pub async fn perform_health_check(&self) -> Result<(), HeartbeatFailure> { + match self { + DeploymentConfig::FailoverPostgreSQL(cfg) => { + info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name); + // TODO: Implement actual PG check / NATS write here + Ok(()) + } + } + } + + /// Callback: Transitioned from Unhealthy -> Healthy + pub async fn on_active(&self) { + info!("Service is now ACTIVE (Healthy)"); + // e.g., Remove fencing lock + } + + /// Callback: Transitioned from Healthy -> Unhealthy + pub async fn on_failover(&self) { + info!("Service is now FAILED (Unhealthy)"); + // e.g., Initiate self-fencing, stop accepting traffic + } +} + +pub struct HarmonyAgent { + pub config: AgentConfig, +} + +impl HarmonyAgent { + pub async fn run_heartbeat_loop(&self) { + let mut consecutive_successes = 0; + let mut consecutive_failures = 0; + let mut is_healthy = false; + let mut next_heartbeat_start; + loop { + let this_heartbeat_start = Instant::now(); + next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval; + + // Perform the check via the config/strategy with a timeout + let result = tokio::time::timeout(self.config.heartbeat_interval, async { + // simulate variable latency for the health check + tokio::time::sleep(Duration::from_millis(getrandom::u64().unwrap() % 2000)).await; + self.config + .deployment_config_unstable + .perform_health_check() + .await + }) + .await; + + // Update Counters & Handle State Transitions + // Timeout is also treated as a failure + let heartbeat_result = match result { + Ok(inner_result) => inner_result, + Err(_) => Err(HeartbeatFailure {}), + }; + + match heartbeat_result { + Ok(_) => { + consecutive_failures = 0; + consecutive_successes += 1; + + if !is_healthy && consecutive_successes >= self.config.success_threshold { + info!("Success threshold reached. Marking as Healthy."); + is_healthy = true; + self.config.deployment_config_unstable.on_active().await; + } + } + Err(_) => { + consecutive_successes = 0; + consecutive_failures += 1; + + if is_healthy && consecutive_failures >= self.config.failure_threshold { + log::warn!("Failure threshold reached. Marking as Unhealthy."); + is_healthy = false; + self.config.deployment_config_unstable.on_failover().await; + } + } + } + + info!( + "Heartbeat : success={} healthy={}, successes={}, fails={} took={}ms", + if heartbeat_result.is_ok() { "✅" } else { "❌" }, + is_healthy, + consecutive_successes, + consecutive_failures, + (Instant::now() - this_heartbeat_start).as_millis() + ); + debug!( + "Sleeping for {} ms before next heartbeat", + (next_heartbeat_start - Instant::now()).as_millis() + ); + tokio::time::sleep_until(next_heartbeat_start).await; + } + } +} + +#[derive(Debug)] +pub struct HeartbeatFailure {} -- 2.39.5 From 50aa545bd97f0c0b77f3ff12d9f78b294b4956d7 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Sun, 1 Feb 2026 20:54:11 -0500 Subject: [PATCH 08/19] wip(harmony_agent): It compiles, contains most if not all of the required skeleton, now time to review it carefully, complete a few details and battle test it --- harmony_agent/Cargo.toml | 2 + harmony_agent/src/agent_loop.rs | 404 +++++++++++++++++++++++ harmony_agent/src/main.rs | 444 +++++++++++++++----------- harmony_agent/src/store/chaos.rs | 123 +++++++ harmony_agent/src/store/memory.rs | 184 +++++++++++ harmony_agent/src/store/mod.rs | 117 +++++++ harmony_agent/src/store/nats.rs | 135 ++++++++ harmony_agent/src/workflow/mod.rs | 42 +++ harmony_agent/src/workflow/primary.rs | 165 ++++++++++ harmony_agent/src/workflow/replica.rs | 279 ++++++++++++++++ 10 files changed, 1703 insertions(+), 192 deletions(-) create mode 100644 harmony_agent/src/agent_loop.rs create mode 100644 harmony_agent/src/store/chaos.rs create mode 100644 harmony_agent/src/store/memory.rs create mode 100644 harmony_agent/src/store/mod.rs create mode 100644 harmony_agent/src/store/nats.rs create mode 100644 harmony_agent/src/workflow/mod.rs create mode 100644 harmony_agent/src/workflow/primary.rs create mode 100644 harmony_agent/src/workflow/replica.rs diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml index eee7bf11..22a373ca 100644 --- a/harmony_agent/Cargo.toml +++ b/harmony_agent/Cargo.toml @@ -21,3 +21,5 @@ async-trait = "0.1" serde.workspace = true serde_json.workspace = true getrandom = "0.3.4" + +thiserror.workspace = true diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs new file mode 100644 index 00000000..2b92b851 --- /dev/null +++ b/harmony_agent/src/agent_loop.rs @@ -0,0 +1,404 @@ +use std::{str::FromStr, sync::Arc, time::Duration}; + +use harmony_types::id::Id; +use log::{debug, info, trace}; +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; +use tokio::time::Instant; + +use crate::store::{KvStore, KvStoreError}; +use crate::workflow::HeartbeatWorkflow; +use crate::workflow::primary::PrimaryWorkflow; +use crate::workflow::replica::ReplicaWorkflow; + +/// The role of this agent instance +#[derive(Debug, Clone, PartialEq)] +pub enum AgentRole { + Primary, + Replica, +} + +pub async fn main() -> Result<(), Box> { + env_logger::init(); + + let my_agent_id = Id::from_str("agent_1").unwrap(); + + let config = AgentConfig { + success_threshold: 2, + failure_threshold: 2, + heartbeat_interval: Duration::from_secs(1), + failover_timeout: Duration::from_secs(5), + deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig { + desired_primary_agent: my_agent_id.clone(), + cnpg_cluster_name: String::from("cnpg_cluster_name"), + }), + nats_url: String::new(), + nats_creds_path: None, + agent_id: my_agent_id, + role: AgentRole::Replica, + cluster_id: "cluster_test_id".into(), + desired_primary_id: "primary_id".into(), + }; + + log::info!("Harmony Agent Initialized"); + log::info!("Initializing Harmony Agent Id : {}", config.agent_id); + log::info!("Full config : {:?}", config); + + // TODO load store based on config, default to nats + // probably a good use case for a factory pattern + use crate::store::ChaosKvStore; + use crate::store::InMemoryKvStore; + let health_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 1000); + let cluster_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 2000); + + let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv); + + // Run the heartbeat loop + agent.run_heartbeat_loop().await; + + Ok(()) +} + +#[derive(Debug, Clone)] +pub struct AgentConfig { + /// Number of consecutive successful heartbeats required before the service transitions from + /// failed to healthy. + pub success_threshold: usize, + /// Number of consecutive failed heartbeats required before the service transitions from + /// healthy to failed. + pub failure_threshold: usize, + /// Time between each heartbeat. If a heartbeat takes longer than this, it will be + /// considered failed. + pub heartbeat_interval: Duration, + /// Time since last observed primary heartbeat before replica considers primary stale. + /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin + /// to avoid split brain during network partitions. + pub failover_timeout: Duration, + /// **UNSTABLE FIELD** + /// + /// For now, an agent instance only serves one deployment. This is probably fine as an agent's + /// footprint is low, but managing multiple deployments in a single instance would be a + /// significant resource usage reduction. + /// + /// Decoupling the deployment of the agent with the application's deployment could make things + /// more complicated though, where we would have to be careful about version compatibility + /// between all components managed by the agent instance. So for now it is a 1-1 map. + /// + /// But I have a feeling this could change so I am marking this field unstable to warn you, the + /// reader. + pub deployment_config_unstable: DeploymentConfig, + pub nats_url: String, + pub nats_creds_path: Option, + pub agent_id: Id, + pub cluster_id: Id, + pub desired_primary_id: Id, + /// The role this agent plays (Primary or Replica) + pub role: AgentRole, +} + +#[derive(Debug, Clone)] +pub enum DeploymentConfig { + FailoverPostgreSQL(FailoverCNPGConfig), +} + +#[derive(Debug, Clone)] +pub struct FailoverCNPGConfig { + pub desired_primary_agent: Id, + pub cnpg_cluster_name: String, +} + +impl DeploymentConfig { + /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres) + pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> { + match self { + DeploymentConfig::FailoverPostgreSQL(cfg) => { + info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name); + // TODO: Implement actual PG check / NATS write here + Ok(()) + } + } + } + + /// Callback: Transitioned from Unhealthy -> Healthy + pub async fn on_active(&self) { + info!("Service is now ACTIVE (Healthy)"); + // e.g., Remove fencing lock + } + + /// Callback: Transitioned from Healthy -> Unhealthy + pub async fn on_failover(&self) { + info!("Service is now FAILED (Unhealthy)"); + // e.g., Initiate self-fencing, stop accepting traffic + } +} + +/// Agent-provided heartbeat information (no timestamps - those come from the store) +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentInfo { + pub agent_id: Id, + pub cluster_id: Id, + pub status: String, +} + +/// Store-provided metadata for a heartbeat +/// This is returned by the KV store and includes timing/ordering guarantees +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct HeartbeatMetadata { + /// Timestamp set by the store (e.g., NATS JetStream) + /// This avoids clock skew between agents + pub timestamp: u64, + /// Sequence number for strict ordering (e.g., JetStream sequence) + pub sequence: u64, +} + +/// Complete heartbeat with both agent data and store metadata +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentHeartbeat { + pub agent_info: AgentInfo, + pub metadata: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ClusterStateData { + pub cluster_id: Id, + pub current_primary: Option, + pub desired_primary: Id, + pub timestamp: u64, +} + +pub struct HarmonyAgent { + pub config: AgentConfig, + workflow: Box, + health_kv: S, + cluster_kv: S, + /// Last successful heartbeat, used to track sequence number for next write + /// This avoids doing a GET before every SET, reducing network round-trips + last_heartbeat: Arc>>, + /// Local copy of cluster state, updated via subscription + /// This allows workflows to make decisions without querying NATS each time + cluster_state: Arc>>, +} + +impl HarmonyAgent { + pub fn new(config: AgentConfig, health_kv: S, cluster_kv: S) -> Self { + let workflow: Box = match config.role { + AgentRole::Primary => { + info!("Initializing agent as PRIMARY"); + Box::new(PrimaryWorkflow::new( + config.success_threshold, + config.failure_threshold, + config.deployment_config_unstable.clone(), + )) + } + AgentRole::Replica => { + info!("Initializing agent as REPLICA"); +// pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self + Box::new(ReplicaWorkflow::new( + config.success_threshold, + config.failure_threshold, + config.cluster_id.clone(), + config.desired_primary_id.clone(), + config.agent_id.clone(), + config.failover_timeout, + )) + } + }; + + Self { + config, + workflow, + health_kv, + cluster_kv, + last_heartbeat: Arc::new(RwLock::new(None)), + cluster_state: Arc::new(RwLock::new(None)), + } + } + + /// Reconcile startup state by fetching cluster state from the store + /// This allows the workflow to determine if it should resume as Primary/Replica + /// based on the persisted cluster state + pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> { + let cluster_key = format!("cluster.{}", self.config.cluster_id); + + debug!("Fetching cluster state for startup reconciliation from key: {}", cluster_key); + + let cluster_state_option = match self.cluster_kv.get(cluster_key.clone()).await { + Ok(result) => { + if let Some(value) = result.value { + match serde_json::from_value::(value) { + Ok(data) => Some(data), + Err(e) => { + log::warn!("Failed to deserialize cluster state: {}", e); + None + } + } + } else { + debug!("No cluster state found, this is a fresh cluster"); + None + } + } + Err(KvStoreError::KeyNotAvailable(_)) => { + debug!("Cluster state key not found, this is a fresh cluster"); + None + } + Err(e) => { + log::warn!("Failed to fetch cluster state during startup: {}", e); + return Err(e); + } + }; + + let state_ref = cluster_state_option.as_ref(); + self.workflow.on_startup(state_ref).await; + + // Cache the cluster state locally + *self.cluster_state.write().await = cluster_state_option; + + Ok(()) + } + + /// Sends agent heartbeat to the KV store + /// + /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence) + /// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp + /// comparisons use the store's clock, not agent clocks. + /// + /// This method uses the last successful heartbeat's sequence number to avoid an extra + /// GET call before each SET, reducing network round-trips and latency exposure. + async fn store_heartbeat(&self) -> Result { + let key = format!("heartbeat.{}", self.config.agent_id); + + // Create agent info WITHOUT timestamp - the store will add metadata + // Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching) + let agent_info = AgentInfo { + agent_id: self.config.agent_id.clone(), + cluster_id: self.config.cluster_id.clone(), + status: self.workflow.state_name().to_string(), + }; + + debug!("Storing heartbeat for agent: {}", self.config.agent_id); + let value = serde_json::to_value(&agent_info) + .map_err(|e| KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: format!("{:?}", agent_info), + })?; + + // Get expected sequence from last successful heartbeat (0 if first write) + let expected_sequence = { + let last = self.last_heartbeat.read().await; + last.as_ref() + .and_then(|hb| hb.metadata.as_ref()) + .map(|m| m.sequence) + .unwrap_or(0) + }; + + // Write with strict ordering - single network round-trip + let new_seq = self.health_kv.set_strict(key, value, expected_sequence).await?; + + debug!("Heartbeat stored successfully with sequence: {}", new_seq); + + // Construct complete heartbeat with metadata from store + let heartbeat = AgentHeartbeat { + agent_info, + metadata: Some(HeartbeatMetadata { + timestamp: todo!("get the real timestamp from store"), + sequence: new_seq, + }), + }; + + // Cache this successful heartbeat for next iteration + *self.last_heartbeat.write().await = Some(heartbeat.clone()); + + Ok(heartbeat) + } + + pub async fn run_heartbeat_loop(&mut self) { + let mut next_heartbeat_start; + loop { + let this_heartbeat_start = Instant::now(); + next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval; + + // Perform the check via the config/strategy with a timeout + let result = tokio::time::timeout(self.config.heartbeat_interval, async { + // Store heartbeat and perform deployment-specific health check + match &self.store_heartbeat().await { + Ok(heartbeat) => { + // Heartbeat stored successfully, already cached by store_heartbeat + debug!("Heartbeat stored: seq={}", heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)); + // Pass heartbeat with metadata to workflow for staleness checks + self.workflow.on_heartbeat_stored(heartbeat).await; + } + Err(KvStoreError::SequenceMismatch { expected, current }) => { + // CAS failure could indicate: + // 1. Network latency: our previous timeout heartbeat actually succeeded + // 2. Agent ID conflict: another agent with same ID exists + // 3. Clock/bucket corruption (unlikely) + log::warn!( + "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}", + self.config.agent_id, expected, current, current + ); + // Update cached heartbeat sequence to prevent repeated failures + if let Some(hb) = self.last_heartbeat.write().await.as_mut() { + if let Some(metadata) = hb.metadata.as_mut() { + metadata.sequence = *current; + } + } + } + Err(e) => { + // Actual storage failure - treat as heartbeat failure + log::error!("Heartbeat storage error: {}", e); + return Err(HeartbeatFailure {}); + } + } + self.config.deployment_config_unstable.perform_heartbeat().await?; + + // TODO: Pass the heartbeat with metadata to the workflow for staleness checks + // The workflow needs access to metadata.timestamp for failover timeout calculations + Ok::<(), HeartbeatFailure>(()) + }) + .await; + + // Update Counters & Handle State Transitions + // Timeout is also treated as a failure + let heartbeat_result = match result { + Ok(inner_result) => inner_result, + Err(_) => Err(HeartbeatFailure {}), + }; + + trace!("Got heartbeat_result : {heartbeat_result:?}"); + match heartbeat_result { + Ok(_) => { + self.workflow.handle_heartbeat_success(); + } + Err(_) => { + self.workflow.handle_heartbeat_failure(); + } + } + + info!( + "Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms", + success_threshold = self.config.success_threshold, + failure_threshold = self.config.failure_threshold, + state = self.workflow.state_name(), + consecutive_successes = self.workflow.consecutive_successes(), + consecutive_failures = self.workflow.consecutive_failures(), + heartbeat_emoji = if heartbeat_result.is_ok() { + "✅" + } else { + "❌" + }, + heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(), + ); + debug!( + "Sleeping for {} ms before next heartbeat", + (next_heartbeat_start - Instant::now()).as_millis() + ); + tokio::time::sleep_until(next_heartbeat_start).await; + } + } +} + +#[derive(Debug)] +pub struct HeartbeatFailure {} + +/// Replica workflow module - handles replica-specific state machine +mod replica {} diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs index 3664b65c..de88ecf5 100644 --- a/harmony_agent/src/main.rs +++ b/harmony_agent/src/main.rs @@ -1,199 +1,259 @@ -use std::{str::FromStr, time::Duration}; - -use harmony_types::id::Id; -use log::{debug, info}; -use tokio::time::Instant; - -// use crate::{agent::HarmonyAgent, config::AgentConfig}; - -mod agent; -mod config; - -// #[tokio::main] -// async fn main() -> Result<(), Box> { -// env_logger::init(); -// -// let config = AgentConfig::load_from_env()?; -// -// log::info!("Harmony Agent Initialized"); -// log::debug!("Identity (My Cluster ID): {}", config.my_cluster_id); -// log::debug!("NATS URL : {}", config.nats_url); -// -// let agent = HarmonyAgent::new(config).await?; -// -// // Run the heartbeat loop -// agent.run_heartbeat_loop().await?; -// -// Ok(()) -// } +// mod typestate_gemini; +// mod typestate; +mod agent_loop; +mod workflow; +pub mod store; #[tokio::main] -async fn main() -> Result<(), Box> { - env_logger::init(); - - let my_agent_id = Id::from_str("agent_1").unwrap(); - - let config = AgentConfig { - success_threshold: 5, - failure_threshold: 10, - heartbeat_interval: Duration::from_secs(1), - deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig { - desired_primary_agent: my_agent_id, - cnpg_cluster_name: String::from("cnpg_cluster_name"), - }), - nats_url: String::new(), - nats_creds_path: None, - agent_id: Id::empty(), - }; - - log::info!("Harmony Agent Initialized"); - log::debug!("Identity (My Cluster ID): {}", config.agent_id); - log::debug!("NATS URL : {}", config.nats_url); - - let agent = HarmonyAgent { config }; - - // Run the heartbeat loop - agent.run_heartbeat_loop().await; - - Ok(()) +async fn main() { + // typestate_gemini::main_typestate_gemini().await; + agent_loop::main().await; } -#[derive(Debug, Clone)] -pub struct AgentConfig { - /// Number of consecutive successful heartbeats required before the service transitions from - /// failed to healthy. - pub success_threshold: usize, - /// Number of consecutive failed heartbeats required before the service transitions from - /// healthy to failed. - pub failure_threshold: usize, - /// Time between each heartbeat. If a heartbeat takes longer than this, it will be - /// considered failed. - pub heartbeat_interval: Duration, - /// **UNSTABLE FIELD** - /// - /// For now, an agent instance only serves one deployment. This is probably fine as an agent's - /// footprint is low, but managing multiple deployments in a single instance would be a - /// significant resource usage reduction. - /// - /// Decoupling the deployment of the agent with the application's deployment could make things - /// more complicated though, where we would have to be careful about version compatibility - /// between all components managed by the agent instance. So for now it is a 1-1 map. - /// - /// But I have a feeling this could change so I am marking this field unstable to warn you, the - /// reader. - pub deployment_config_unstable: DeploymentConfig, - pub nats_url: String, - pub nats_creds_path: Option, - pub agent_id: Id, -} +// TODO +// +// DONE: +// 1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type +// 2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision) +// 3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded +// 4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent +// 5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback +// 6. ✅ failover_timeout added to AgentConfig +// 7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error +// 8. ✅ startup reconciliation implemented via on_startup() method +// +// REMAINING: +// - review all code and list implementation issues +// - review both workflow for each state transition +// - Complete replica workflow staleness detection (needs implementation in Watching state) +// - Implement state recovery from Failed state for both workflows +// - Implement subscribe in NATS store with watch() API +// - Implement config validation for failover_timeout constraints -#[derive(Debug, Clone)] -pub enum DeploymentConfig { - FailoverPostgreSQL(FailoverCNPGConfig), -} +// TODO +// +// 1. store trait subscribe definition missing callback +// 2. BUG, data integrity issue : nats store not actually using jetstream metadata +// 3. review all code and list implementation issues +// 4. review both workflow for each state transition +// 5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded +// 6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself +// should hold it probably, every agent should be subscribed to the cluster_state object and +// keep it in memory to allow workflows to process against it efficiently) -#[derive(Debug, Clone)] -pub struct FailoverCNPGConfig { - pub desired_primary_agent: Id, - pub cnpg_cluster_name: String, -} +// ## CRITICAL - Data Integrity Issues +// +// 1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`) +// - Currently uses `put()` which overwrites unconditionally +// - Must use `update()` with revision parameter for proper compare-and-set +// - Without this, concurrent promotion attempts can cause split brain +// +// 2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`) +// - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3 +// - NATS Entry has `.revision` and `.created` fields that must be used +// - This defeats the entire purpose of store-provided timestamps +// +// 3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`) +// - Line ~156: TODO comment confirms missing metadata passing +// - Replica cannot calculate staleness without metadata.timestamp +// - Failover logic is broken +// +// 4. **No actual cluster state watching exists** +// - Replica workflow declares `ClusterState` but never updates it +// - No subscription to primary heartbeat or cluster_state key +// - Replica cannot detect primary liveness +// +// ## HIGH - Missing Core Functionality +// +// 5. **Replica Workflow incomplete** - All key logic is TODO: +// - Watching primary staleness (line 114) +// - Promotion attempt (line 118) +// - Original primary recovery detection (line 127) +// - Demotion/handshake (line 131) +// +// 6. **Missing replica "Failed" state** +// - `ReplicaState` enum has no `Failed` variant +// - User's TODO #5 correctly identifies this gap +// - What happens if replica's own heartbeats fail repeatedly? +// +// 7. **Primary Workflow incomplete** - Key logic missing: +// - No NATS check before recovering from `Fenced` state (line 95) +// - No NATS check in `Yielding` state for demotion handshake (line 101) +// - No actual fencing failure handling +// +// 8. **Store `subscribe` not implemented** (`store/mod.rs`) +// - Returns `todo!()` in NATS implementation +// - No callback mechanism defined in trait +// - Without this, agents cannot react to state changes +// +// 9. **Cluster state not tracked centrally** +// - User's TODO #6 correctly identifies this +// - Each agent should maintain a local copy of cluster_state +// - No subscription mechanism to update this local copy +// +// 10. **No validation of configuration constraints** +// - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin` +// - Invalid config could cause split brain +// +// ## MEDIUM - Incorrect State Transitions +// +// 11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`) +// - Two state transitions happen in one heartbeat cycle +// - Should stay in `Failed` until fencing actually completes +// - What if fencing fails? State machine won't reflect it +// +// 12. **No fencing failure handling** +// - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes +// - ADR mentions escalating to radical measures, but no callback for failure +// +// 13. **Replica `Watching` state does nothing** +// - Line 115: Just logs, checks nothing +// - Should be checking staleness of primary heartbeat +// +// 14. **Demotion handshake not implemented** +// - ADR section 4 details this but code doesn't implement it +// - How does original primary know it should yield? +// +// ## LOW - Observability & Reliability +// +// 15. **No graceful shutdown mechanism** +// - `run_heartbeat_loop` runs forever +// - No signal handling (SIGTERM, SIGINT) +// +// 16. **Async task errors silently ignored** +// - `tokio::spawn` at lines 74, 83, 123 +// - No `JoinHandle` retention or error handling +// +// 17. **No metrics/observability** +// - Only log output +// - No Prometheus metrics for state transitions, failure counts, etc. +// +// 18. **Hardcoded main() function** (`agent_loop.rs::main`) +// - Not production-ready entry point +// - Should load config from environment or file +// +// 19. **Store factory pattern missing** +// - TODO comment at line 54 confirms this +// - Can't switch between stores via config +// +// 20. **No backoff/retry logic for NATS operations** +// - Transient failures could trigger unnecessary fencing +// +// 21. **`AgentInfo` status is hardcoded to "HEALTHY"** +// - Line 137 in `store_heartbeat` +// - Should反映 actual workflow state +// +// 22. **Unused fields in structs** +// - `HeartbeatState.last_seq` set but never read +// - `ClusterState.current_primary` set but never read +// +// ## ADR-017-3 Compliance Issues +// +// 23. **ADR violation: Clock skew not avoided** +// - While ADR says use store metadata, code uses local time +// +// 24. **Failover timeout not configurable** +// - Defined in ADR but not in `AgentConfig` +// - Needed for replica staleness calculation +// +// 25. **Safety margin concept exists in ADR but not in code** +// - Configuration should include this margin +// +// 26. **No handling of Case 3 (Replica Network Lag)** +// - ADR describes NATS rejection prevention +// - But `set_strict` implementation accepts any write +// +// ## Code Quality Issues +// +// 27. **Inconsistent error handling** +// - Some paths return `Err`, others `todo!()`, others ignore +// +// 28. **Unnecessary `Clone` bounds** +// - `DeploymentConfig.clone()` used frequently +// - Could be optimized with `Arc` +// +// 29. **Missing lifetime annotations** +// - `KvStore::get` returns `String` key in error - inefficient +// +// 30. **No integration points mentioned** +// - PostgreSQL lifecycle control implementation missing +// - Fencing via CNPG not connected +// +// ## Production Readiness Checklist Summary +// +// For battle testing preparation, you need: +// +// **Immediate ( blockers):** +// - Fix NATS store metadata usage (issues #1, #2) +// - Implement strict set_strict with actual CAS (#1) +// - Implement replica primary watching (#4, #5) +// - Add failover_timeout config + staleness logic (#3, #24) +// - Implement subscribe mechanism with callbacks (#8) +// +// **High priority:** +// - Complete all workflow transitions (#5, #7, #11-14) +// - Add cluster state tracking (#6, #9) +// - Add configuration validation (#10) +// - Add Replica Failed state (#6) +// +// **Before deployment:** +// - Implement graceful shutdown (#15) +// - Add error handling for spawned tasks (#16) +// - Remove hardcoded main function (#18) +// - Implement store factory (#19) +// - Add Prometheus metrics (#17) +// +// **Documentation:** +// - Document all configuration parameters and their trade-offs +// - Add runbooks for each failure mode +// - Document battle test scenarios to cover +// +// ### Addendum: Missing Critical Issues +// +// #### 1. CRITICAL: Heartbeat "Lying" (Data Integrity) +// * **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`. +// * **The Bug:** `status: "HEALTHY".to_string()` is hardcoded. +// * **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY". +// * **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely. +// +// #### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption) +// * **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`). +// * **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks. +// * **Scenario:** +// 1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s). +// 2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s). +// 3. `on_active` finishes *before* `on_failover`. +// 4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy. +// * **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one. +// +// #### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk) +// * **Location:** `agent_loop.rs` loop logic. +// * **The Bug:** There is no "Stop the World" gate. +// * **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*. +// * **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again. +// * **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter. +// +// #### 4. HIGH: NATS Bucket Name Collision +// * **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`. +// * **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`. +// * **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state. +// * **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`. +// +// #### 5. HIGH: Startup State Reconciliation +// * **Location:** `HarmonyAgent::new`. +// * **The Bug:** Agents always start in `Initializing`. +// * **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader. +// * **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime. +// * **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check). +// +// ### Summary of Tasks to Add +// +// Please add these to your master list before starting implementation: +// +// 28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY". +// 29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping. +// 30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection). +// 31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`. +// 32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid. +// -impl DeploymentConfig { - /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres) - pub async fn perform_health_check(&self) -> Result<(), HeartbeatFailure> { - match self { - DeploymentConfig::FailoverPostgreSQL(cfg) => { - info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name); - // TODO: Implement actual PG check / NATS write here - Ok(()) - } - } - } - - /// Callback: Transitioned from Unhealthy -> Healthy - pub async fn on_active(&self) { - info!("Service is now ACTIVE (Healthy)"); - // e.g., Remove fencing lock - } - - /// Callback: Transitioned from Healthy -> Unhealthy - pub async fn on_failover(&self) { - info!("Service is now FAILED (Unhealthy)"); - // e.g., Initiate self-fencing, stop accepting traffic - } -} - -pub struct HarmonyAgent { - pub config: AgentConfig, -} - -impl HarmonyAgent { - pub async fn run_heartbeat_loop(&self) { - let mut consecutive_successes = 0; - let mut consecutive_failures = 0; - let mut is_healthy = false; - let mut next_heartbeat_start; - loop { - let this_heartbeat_start = Instant::now(); - next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval; - - // Perform the check via the config/strategy with a timeout - let result = tokio::time::timeout(self.config.heartbeat_interval, async { - // simulate variable latency for the health check - tokio::time::sleep(Duration::from_millis(getrandom::u64().unwrap() % 2000)).await; - self.config - .deployment_config_unstable - .perform_health_check() - .await - }) - .await; - - // Update Counters & Handle State Transitions - // Timeout is also treated as a failure - let heartbeat_result = match result { - Ok(inner_result) => inner_result, - Err(_) => Err(HeartbeatFailure {}), - }; - - match heartbeat_result { - Ok(_) => { - consecutive_failures = 0; - consecutive_successes += 1; - - if !is_healthy && consecutive_successes >= self.config.success_threshold { - info!("Success threshold reached. Marking as Healthy."); - is_healthy = true; - self.config.deployment_config_unstable.on_active().await; - } - } - Err(_) => { - consecutive_successes = 0; - consecutive_failures += 1; - - if is_healthy && consecutive_failures >= self.config.failure_threshold { - log::warn!("Failure threshold reached. Marking as Unhealthy."); - is_healthy = false; - self.config.deployment_config_unstable.on_failover().await; - } - } - } - - info!( - "Heartbeat : success={} healthy={}, successes={}, fails={} took={}ms", - if heartbeat_result.is_ok() { "✅" } else { "❌" }, - is_healthy, - consecutive_successes, - consecutive_failures, - (Instant::now() - this_heartbeat_start).as_millis() - ); - debug!( - "Sleeping for {} ms before next heartbeat", - (next_heartbeat_start - Instant::now()).as_millis() - ); - tokio::time::sleep_until(next_heartbeat_start).await; - } - } -} - -#[derive(Debug)] -pub struct HeartbeatFailure {} diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs new file mode 100644 index 00000000..1dce4ed8 --- /dev/null +++ b/harmony_agent/src/store/chaos.rs @@ -0,0 +1,123 @@ +use async_trait::async_trait; +use serde_json::Value; +use std::sync::Arc; +use tokio::time::Duration; + +use crate::store::SubscriptionCallback; + +use super::{KvStore, KvStoreError}; + +/// A chaos testing KV store that randomly times out or fails +/// Wraps another KvStore implementation and adds random failures +#[derive(Clone)] +pub struct ChaosKvStore { + inner: Arc, + timeout_probability: u32, + failure_probability_percentage: u32, + max_delay_ms: u64, +} + +impl ChaosKvStore { + pub fn new( + inner: T, + timeout_probability: u32, + failure_probability: u32, + max_delay_ms: u64, + ) -> Self { + Self { + inner: Arc::new(inner), + timeout_probability, + failure_probability_percentage: failure_probability, + max_delay_ms, + } + } + + async fn maybe_chaos(&self) -> Result<(), KvStoreError> { + // Random delay + if self.max_delay_ms > 0 { + let delay = getrandom::u64().unwrap() % self.max_delay_ms; + tokio::time::sleep(Duration::from_millis(delay)).await; + } + + // Random failure + let failure_random = getrandom::u32().unwrap(); + if (failure_random % 100) < self.failure_probability_percentage { + return Err(KvStoreError::Unknown); + } + + // Random timeout (simulated as a very long delay) + let failure_random = getrandom::u32().unwrap(); + if failure_random % 100 < self.timeout_probability { + tokio::time::sleep(Duration::from_secs(10)).await; + } + + Ok(()) + } +} + +#[async_trait] +impl KvStore for ChaosKvStore { + async fn get(&self, key: String) -> Result { + self.maybe_chaos().await?; + self.inner.get(key).await + } + + async fn set_strict( + &self, + key: String, + value: Value, + expected_sequence: u64, + ) -> Result { + self.maybe_chaos().await?; + self.inner.set_strict(key, value, expected_sequence).await + } + + async fn subscribe( + &self, + key: String, + callback: SubscriptionCallback, + ) -> Result<(), KvStoreError> { + self.maybe_chaos().await?; + self.inner.subscribe(key, callback).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::store::InMemoryKvStore; + use serde_json::json; + + #[tokio::test] + async fn test_chaos_store_with_no_chaos() { + let inner = InMemoryKvStore::new(); + let chaos = ChaosKvStore::new(inner, 0, 0, 0); + + let value = json!({"test": "value"}); + let result = chaos + .set_strict("key".to_string(), value.clone(), 0) + .await + .unwrap(); + assert_eq!(result, 1); + + let retrieved = chaos.get("key".to_string()).await.unwrap(); + assert_eq!(retrieved.value, Some(value)); + } + + #[tokio::test] + async fn test_chaos_store_with_delay() { + let inner = InMemoryKvStore::new(); + let chaos = ChaosKvStore::new(inner, 0, 0, 100); + + let start = tokio::time::Instant::now(); + let value = json!({"test": "value"}); + chaos.set_strict("key".to_string(), value, 0).await.unwrap(); + let elapsed = start.elapsed(); + + // Should have some delay + assert!( + elapsed.as_millis() < 150, + "Should complete within reasonable time" + ); + } +} diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs new file mode 100644 index 00000000..3549c563 --- /dev/null +++ b/harmony_agent/src/store/memory.rs @@ -0,0 +1,184 @@ +use async_trait::async_trait; +use serde_json::Value; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; +use tokio::sync::RwLock; + +use crate::store::SubscriptionCallback; + +use super::{KvMetadata, KvResult, KvStore, KvStoreError}; + +/// An in-memory KV store that guarantees ordering like NATS JetStream +/// Each key has a sequence number that increments on each write +#[derive(Clone)] +pub struct InMemoryKvStore { + data: Arc>>, + global_seq: Arc>, +} + +impl InMemoryKvStore { + pub fn new() -> Self { + Self { + data: Arc::new(RwLock::new(HashMap::new())), + global_seq: Arc::new(RwLock::new(0)), + } + } + + /// Get the sequence number for a key + pub async fn get_seq(&self, key: &str) -> Option { + self.data.read().await.get(key).map(|(_, seq)| *seq) + } +} + +impl Default for InMemoryKvStore { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl KvStore for InMemoryKvStore { + async fn get(&self, key: String) -> Result { + let data = self.data.read().await; + let (value, sequence) = data + .get(&key) + .ok_or_else(|| KvStoreError::KeyNotAvailable(key.clone()))?; + + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_millis() as u64; + + Ok(KvResult { + value: Some(value.clone()), + metadata: KvMetadata { + timestamp, + sequence: *sequence, + }, + }) + } + + async fn set_strict( + &self, + key: String, + value: Value, + expected_sequence: u64, + ) -> Result { + // Check current sequence + let data = self.data.read().await; + let current_sequence = data.get(&key).map(|(_, seq)| *seq).unwrap_or(0); + drop(data); + + // Verify expected sequence matches + if current_sequence != expected_sequence { + return Err(KvStoreError::SequenceMismatch { + expected: expected_sequence, + current: current_sequence, + }); + } + + // Increment global sequence + let mut seq = self.global_seq.write().await; + *seq += 1; + let new_seq = *seq; + drop(seq); + + // Write the new value + let mut data = self.data.write().await; + data.insert(key, (value.clone(), new_seq)); + drop(data); + + Ok(new_seq) + } + + async fn subscribe( + &self, + key: String, + callback: SubscriptionCallback, + ) -> Result<(), KvStoreError> { + // For now, subscribe just returns the current value + // In a real implementation, this would return a stream of updates + self.get(key).await; + todo!() // register callback and call it when key is set ? + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + async fn test_memory_store_basic() { + let store = InMemoryKvStore::new(); + + // Set a value + let value = json!({"status": "healthy"}); + let result = store + .set_strict("test_key".to_string(), value.clone(), 0) + .await + .unwrap(); + assert_eq!(result, 1); + + // Get the value + let retrieved = store.get("test_key".to_string()).await.unwrap(); + assert_eq!(retrieved.value, Some(value)); + assert_eq!(retrieved.metadata.sequence, 1); + } + + #[tokio::test] + async fn test_memory_store_sequence_numbers() { + let store = InMemoryKvStore::new(); + + let seq1 = store + .set_strict("key1".to_string(), json!("value1"), 0) + .await + .unwrap(); + + let seq2 = store + .set_strict("key2".to_string(), json!("value2"), 0) + .await + .unwrap(); + + assert!(seq2 > seq1, "Sequence numbers should increment"); + } + + #[tokio::test] + async fn test_memory_store_key_not_found() { + let store = InMemoryKvStore::new(); + let result = store.get("nonexistent".to_string()).await; + assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_)))); + } + + #[tokio::test] + async fn test_memory_store_strict_ordering() { + let store = InMemoryKvStore::new(); + + // First write with sequence 0 + let result1 = store + .set_strict("key".to_string(), json!("value1"), 0) + .await + .unwrap(); + assert_eq!(result1, 1); + + // Second write with correct sequence + let result2 = store + .set_strict("key".to_string(), json!("value2"), 1) + .await + .unwrap(); + assert_eq!(result2, 2); + + // Third write with wrong sequence should fail + let result3 = store + .set_strict("key".to_string(), json!("value3"), 1) + .await; + assert!(matches!( + result3, + Err(KvStoreError::SequenceMismatch { + expected: 1, + current: 2 + }) + )); + } +} diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs new file mode 100644 index 00000000..26e630c5 --- /dev/null +++ b/harmony_agent/src/store/mod.rs @@ -0,0 +1,117 @@ +use async_trait::async_trait; +use serde_json::Value; +use thiserror::Error; + +/// Handle for managing active subscriptions +#[derive(Debug, Clone)] +pub struct SubscriptionHandle { + id: usize, + _phantom: std::marker::PhantomData<()>, +} + +/// Metadata returned by the KV store for all operations +/// Contains timing and ordering information set by the store +#[derive(Debug, Clone)] +pub struct KvMetadata { + /// Timestamp set by the store (milliseconds since UNIX epoch) + pub timestamp: u64, + /// Sequence number for strict ordering guarantees + pub sequence: u64, +} + +/// Result returned by KV store operations +/// Contains both the value (if any) and store metadata +#[derive(Debug, Clone)] +pub struct KvResult { + /// The value from the store (None if key doesn't exist) + pub value: Option, + /// Store-provided metadata (timestamp, sequence) + pub metadata: KvMetadata, +} + +/// Callback type for subscription updates +/// Callback receives: key, new value (None if deleted), and metadata +pub type SubscriptionCallback = Box, KvMetadata) + Send + Sync>; + +#[derive(Error, Debug)] +pub enum KvStoreError { + #[error("data store disconnected")] + Disconnect(#[from] std::io::Error), + #[error("invalid key")] + InvalidKey, + #[error("operation timed out")] + Timeout, + #[error("the data for key `{0}` is not available")] + KeyNotAvailable(String), + #[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)] + DeserializationFailed { + deserialization_error: String, + value: String, + }, + #[error("Strict ordering violation: expected sequence {expected}, but current is {current}")] + SequenceMismatch { expected: u64, current: u64 }, + #[error("unknown data store error")] + Unknown, +} + +#[async_trait] +pub trait KvStore { + /// Get a value from the store + /// + /// # Returns + /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence) + /// - `Err(KeyNotAvailable)`: If the key doesn't exist + async fn get(&self, key: String) -> Result; + + /// Strict set operation with compare-and-set semantics + /// + /// Sets the value only if the current sequence number matches `expected_sequence`. + /// This provides strict ordering guarantees needed for the failover algorithm. + /// + /// # Parameters + /// - `key`: The key to set + /// - `value`: The value to store + /// - `expected_sequence`: The sequence number we expect the key to currently have. + /// Use 0 for the first write to a new key. + /// + /// # Returns + /// - `Ok(u64)`: Returns the new sequence number + /// - `Err(KvStoreError)`: If another write happened (current != expected) + /// + /// # Example Use Case + /// For NATS JetStream, this maps to the conditional update operation that ensures + /// only one agent can successfully promote to primary. + async fn set_strict( + &self, + key: String, + value: Value, + expected_sequence: u64, + ) -> Result; + + /// Subscribe to updates for a key + /// + /// # Parameters + /// - `key`: The key to subscribe to + /// - `callback`: Function to call on each update with key, value, and metadata + /// + /// # Returns + /// - `Ok(())`: Subscription established successfully + /// - `Err(KvStoreError)`: Subscription failed + /// + /// Note: For JetStream, this should use watch() API. Updates will invoke the callback + /// asynchronously in the background. + async fn subscribe( + &self, + key: String, + callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a + // callback + ) -> Result<(), KvStoreError>; +} + +mod memory; +mod nats; +mod chaos; + +pub use memory::InMemoryKvStore; +pub use nats::NatsKvStore; +pub use chaos::ChaosKvStore; diff --git a/harmony_agent/src/store/nats.rs b/harmony_agent/src/store/nats.rs new file mode 100644 index 00000000..1c82c1d8 --- /dev/null +++ b/harmony_agent/src/store/nats.rs @@ -0,0 +1,135 @@ +use async_nats::jetstream::kv::{Store, UpdateError}; +use async_trait::async_trait; +use log::{debug, error}; +use serde_json::Value; + +use crate::store::SubscriptionCallback; + +use super::{KvMetadata, KvResult, KvStore, KvStoreError}; + +/// NATS JetStream-backed KV store +pub struct NatsKvStore { + store: Store, +} + +impl NatsKvStore { + pub fn new(store: Store) -> Self { + Self { store } + } + + pub async fn create( + client: async_nats::Client, + bucket_name: &str, + history_size: i64, + ) -> Result> { + let jetstream = async_nats::jetstream::new(client); + + debug!("Creating NATS KV bucket: {}", bucket_name); + let store = jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: bucket_name.to_string(), + history: history_size, + ..Default::default() + }) + .await + .map_err(|e| { + error!( + "Failed to initialize NATS KV bucket '{}': {}", + bucket_name, e + ); + e + })?; + + Ok(Self::new(store)) + } +} + +#[async_trait] +impl KvStore for NatsKvStore { + async fn get(&self, key: String) -> Result { + let entry = self.store.entry(&key).await.map_err(|e| { + error!("NATS get failed for key '{}': {}", key, e); + KvStoreError::Disconnect(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + )) + })?; + + if entry.is_none() { + return Err(KvStoreError::KeyNotAvailable(key)); + } + + let entry = entry.unwrap(); + let value: Value = serde_json::from_slice(&entry.value).map_err(|e| { + KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: String::from_utf8_lossy(&entry.value).to_string(), + } + })?; + + // Extract metadata from NATS entry + // Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime + let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64; + + let metadata = KvMetadata { + timestamp, + sequence: entry.revision, + }; + + Ok(KvResult { + value: Some(value), + metadata, + }) + } + + async fn set_strict( + &self, + key: String, + value: Value, + expected_sequence: u64, + ) -> Result { + let bytes = + serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: value.to_string(), + })?; + + // Use update() for CAS semantics (Compare-And-Set) + // This ensures we only write if the revision matches expected_sequence + let revision = self + .store + .update(&key, bytes.into(), expected_sequence) + .await + .map_err(|e| { + // FIXME this is ugly, we should have a clean KvStoreError containing + // proper information from nats instead + error!("NATS update failed for key '{}': {}", key, e); + e + })?; + + Ok(revision) + } + + async fn subscribe( + &self, + key: String, + callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a + ) -> Result<(), KvStoreError> { + todo!() + } +} + +impl From for KvStoreError { + fn from(value: UpdateError) -> Self { + match value.kind() { + async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey, + async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout, + async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => { + KvStoreError::KeyNotAvailable("key".to_string()) + } + async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect( + std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"), + ), + } + } +} diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs new file mode 100644 index 00000000..074b29e2 --- /dev/null +++ b/harmony_agent/src/workflow/mod.rs @@ -0,0 +1,42 @@ +use async_trait::async_trait; +use harmony_types::id::Id; + +pub mod primary; +pub mod replica; + +/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events +#[async_trait] +pub trait HeartbeatWorkflow: Send + Sync { + /// Handle a successful heartbeat + fn handle_heartbeat_success(&mut self); + + /// Handle a failed heartbeat + fn handle_heartbeat_failure(&mut self); + + /// Called after heartbeat is successfully stored with metadata + /// This provides workflows access to timestamp/sequence for staleness calculations + async fn on_heartbeat_stored(&mut self, _heartbeat: &crate::agent_loop::AgentHeartbeat) { + // Default implementation does nothing + } + + /// Called during agent startup to reconcile state from cluster state + /// Receives the current cluster state if available + async fn on_startup(&mut self, _cluster_state: Option<&crate::agent_loop::ClusterStateData>) { + // Default implementation does nothing + } + + /// Called when a peer agent heartbeat is observed (via subscription) + /// This is primarily used by replicas to detect primary staleness + async fn on_peer_heartbeat(&mut self, _peer_id: &Id, _heartbeat: &crate::agent_loop::AgentHeartbeat) { + // Default implementation does nothing + } + + /// Get the current state name for logging (also used for heartbeat status) + fn state_name(&self) -> &'static str; + + /// Get current consecutive successes + fn consecutive_successes(&self) -> usize; + + /// Get current consecutive failures + fn consecutive_failures(&self) -> usize; +} diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs new file mode 100644 index 00000000..7eccc998 --- /dev/null +++ b/harmony_agent/src/workflow/primary.rs @@ -0,0 +1,165 @@ +use async_trait::async_trait; +use log::{debug, info, trace, warn}; + +use crate::{agent_loop::DeploymentConfig, workflow::HeartbeatWorkflow}; + +#[derive(Debug, Clone, PartialEq)] +pub enum PrimaryState { + Initializing, + Healthy, + Failed, + Fenced, + Yielding, +} + +impl PrimaryState { + pub fn name(&self) -> &'static str { + match self { + PrimaryState::Initializing => "Primary:Initializing", + PrimaryState::Healthy => "Primary:Healthy", + PrimaryState::Failed => "Primary:Failed", + PrimaryState::Fenced => "Primary:Fenced", + PrimaryState::Yielding => "Primary:Yielding", + } + } +} + +pub struct PrimaryWorkflow { + state: PrimaryState, + consecutive_successes: usize, + consecutive_failures: usize, + success_threshold: usize, + failure_threshold: usize, + deployment_config: DeploymentConfig, +} + +impl PrimaryWorkflow { + pub fn new( + success_threshold: usize, + failure_threshold: usize, + deployment_config: DeploymentConfig, + ) -> Self { + Self { + state: PrimaryState::Initializing, + consecutive_successes: 0, + consecutive_failures: 0, + success_threshold, + failure_threshold, + deployment_config, + } + } + + fn transition_to(&mut self, new_state: PrimaryState) { + if self.state != new_state { + info!( + "State transition: {} -> {}", + self.state.name(), + new_state.name() + ); + self.state = new_state; + } + } +} + +#[async_trait] +impl HeartbeatWorkflow for PrimaryWorkflow { + async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) { + if let Some(state) = cluster_state { + info!( + "Startup reconciliation: current primary is {:?}, desired primary is {:?}", + state.current_primary, state.desired_primary + ); + // No automatic fast-tracking - agent must earn healthy status + // through successful heartbeats. This prevents duplicate agents + // or crashloop agents from incorrectly claiming primary. + } else { + debug!("No cluster state on startup, starting from Initializing"); + } + } + fn handle_heartbeat_success(&mut self) { + self.consecutive_successes += 1; + self.consecutive_failures = 0; + + match self.state { + PrimaryState::Initializing => { + if self.consecutive_successes >= self.success_threshold { + self.transition_to(PrimaryState::Healthy); + // Trigger on_active callback + let config = self.deployment_config.clone(); + tokio::spawn(async move { + config.on_active().await; + }); + } + } + PrimaryState::Failed => { + if self.consecutive_successes >= self.success_threshold { + self.transition_to(PrimaryState::Healthy); + let config = self.deployment_config.clone(); + tokio::spawn(async move { + config.on_active().await; + }); + } + } + PrimaryState::Healthy => { + // Stay healthy + debug!("Primary staying healthy"); + } + PrimaryState::Fenced => { + // Recovery from fenced state + if self.consecutive_successes >= self.success_threshold { + // TODO: Check NATS for current_primary status before recovering + info!("Recovered from fenced state, transitioning to yielding"); + self.transition_to(PrimaryState::Yielding); + } + } + PrimaryState::Yielding => { + // TODO: Check NATS to see if we can resume as primary + trace!("Yielding, waiting for demotion handshake"); + } + } + } + + fn handle_heartbeat_failure(&mut self) { + self.consecutive_failures += 1; + self.consecutive_successes = 0; + + match self.state { + PrimaryState::Healthy => { + if self.consecutive_failures >= self.failure_threshold { + warn!( + "Failure threshold reached ({}/{}), transitioning to Failed", + self.consecutive_failures, self.failure_threshold + ); + self.transition_to(PrimaryState::Failed); + + // Immediately fence + self.transition_to(PrimaryState::Fenced); + let config = self.deployment_config.clone(); + tokio::spawn(async move { + config.on_failover().await; + }); + } + } + PrimaryState::Initializing => { + // Stay in initializing, just accumulate failures + trace!("Heartbeat failed during initialization"); + } + PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => { + // Already in a degraded state + trace!("Heartbeat failed in degraded state: {}", self.state.name()); + } + } + } + + fn state_name(&self) -> &'static str { + self.state.name() + } + + fn consecutive_successes(&self) -> usize { + self.consecutive_successes + } + + fn consecutive_failures(&self) -> usize { + self.consecutive_failures + } +} diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs new file mode 100644 index 00000000..9800e3c7 --- /dev/null +++ b/harmony_agent/src/workflow/replica.rs @@ -0,0 +1,279 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use log::{debug, info, trace}; +use std::time::Duration; +use tokio::sync::RwLock; + +use crate::agent_loop::AgentHeartbeat; +use crate::workflow::HeartbeatWorkflow; + +#[derive(Debug, Clone)] +pub struct HeartbeatState { + pub agent_id: Id, + pub last_seq: Option, +} + +impl HeartbeatState { + pub fn watch(agent_id: Id) -> Self { + Self { + agent_id, + last_seq: None, + } + } +} + +#[derive(Debug, Clone)] +pub struct ClusterState { + pub cluster_id: Id, + pub current_primary: Option, +} + +impl ClusterState { + pub fn watch(cluster_id: Id) -> Self { + Self { + cluster_id, + current_primary: None, + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum ReplicaState { + Initializing, + Watching, + Promoting, + PromotionFailed, + Leader, + Demoting, + Failed, +} + +impl ReplicaState { + pub fn name(&self) -> &'static str { + match self { + ReplicaState::Initializing => "Replica:Initializing", + ReplicaState::Watching => "Replica:Watching", + ReplicaState::Promoting => "Replica:Promoting", + ReplicaState::PromotionFailed => "Replica:PromotionFailed", + ReplicaState::Leader => "Replica:Leader", + ReplicaState::Demoting => "Replica:Demoting", + ReplicaState::Failed => "Replica:Failed", + } + } +} + +pub struct ReplicaWorkflow { + state: ReplicaState, + heartbeat_state: HeartbeatState, + primary_state: HeartbeatState, + cluster_state: ClusterState, + consecutive_successes: usize, + consecutive_failures: usize, + success_threshold: usize, + failure_threshold: usize, + failover_timeout: Duration, + /// Our own last heartbeat (for timestamp comparison against primary) + last_my_heartbeat: Option, + /// Last observed primary heartbeat (metadata only, for staleness detection) + last_primary_heartbeat: Option>, +} + +impl ReplicaWorkflow { + pub fn new( + success_threshold: usize, + failure_threshold: usize, + cluster_id: Id, + primary_id: Id, + my_id: Id, + failover_timeout: Duration, + ) -> Self { + Self { + state: ReplicaState::Initializing, + consecutive_successes: 0, + consecutive_failures: 0, + success_threshold, + failure_threshold, + failover_timeout, + cluster_state: ClusterState::watch(cluster_id), + primary_state: HeartbeatState::watch(primary_id), + heartbeat_state: HeartbeatState::watch(my_id), + last_my_heartbeat: None, + last_primary_heartbeat: None, + } + } + + fn transition_to(&mut self, new_state: ReplicaState) { + if self.state != new_state { + info!( + "State transition: {} -> {}", + self.state.name(), + new_state.name() + ); + self.state = new_state; + } + } + + /// Check if the primary heartbeat is stale compared to our own + /// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout + async fn check_primary_staleness(&mut self) { + let mut new_state = self.state.clone(); + if let Some(my_hb) = &self.last_my_heartbeat { + if let Some(my_metadata) = &my_hb.metadata { + if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() { + let primary_hb = primary_hb_ref.read().await; + if let Some(primary_metadata) = &primary_hb.metadata { + // Calculate time difference: replica_timestamp - primary_timestamp + let time_diff_ms = my_metadata + .timestamp + .saturating_sub(primary_metadata.timestamp); + let failover_timeout_ms = self.failover_timeout.as_millis() as u64; + + trace!( + "Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms", + my_metadata.timestamp, + primary_metadata.timestamp, + time_diff_ms, + failover_timeout_ms + ); + + if time_diff_ms > failover_timeout_ms { + info!( + "Primary heartbeat stale ({}ms > {}ms), attempting promotion", + time_diff_ms, failover_timeout_ms + ); + new_state = ReplicaState::Promoting; + } + } + } + } + + if self.state != new_state { + self.transition_to(new_state) + } + } + } +} + +#[async_trait] +impl HeartbeatWorkflow for ReplicaWorkflow { + async fn on_peer_heartbeat(&mut self, peer_id: &Id, heartbeat: &AgentHeartbeat) { + // Only track the primary's heartbeat + if *peer_id == self.primary_state.agent_id { + match &self.last_primary_heartbeat { + Some(existing) => { + // Update the existing heartbeat data + *existing.write().await = heartbeat.clone(); + } + None => { + // First time seeing primary heartbeat + self.last_primary_heartbeat = Some(RwLock::new(heartbeat.clone())); + } + } + trace!( + "Updated primary heartbeat: seq={}, timestamp={}", + heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0), + heartbeat + .metadata + .as_ref() + .map(|m| m.timestamp) + .unwrap_or(0), + ); + } + } + async fn on_heartbeat_stored(&mut self, heartbeat: &AgentHeartbeat) { + // Track our own heartbeat for staleness comparison + self.last_my_heartbeat = Some(heartbeat.clone()); + + // Perform staleness detection if we have both heartbeats + self.check_primary_staleness().await; + } + async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) { + if let Some(state) = cluster_state { + info!( + "Startup reconciliation: current primary is {:?}, desired primary is {:?}", + state.current_primary, state.desired_primary + ); + // Update cluster_state with the observed values + self.cluster_state.current_primary = state.current_primary.clone(); + } else { + debug!("No cluster state on startup, starting from Initializing"); + } + } + fn handle_heartbeat_success(&mut self) { + self.consecutive_successes += 1; + self.consecutive_failures = 0; + + match self.state { + ReplicaState::Initializing => { + if self.consecutive_successes >= self.success_threshold { + self.transition_to(ReplicaState::Watching); + } + } + ReplicaState::Watching => { + // TODO: Check primary staleness from NATS + trace!("Replica watching primary"); + } + ReplicaState::Promoting => { + // TODO: Complete promotion attempt + trace!("Replica promotion in progress"); + } + ReplicaState::PromotionFailed => { + if self.consecutive_successes >= self.success_threshold { + self.transition_to(ReplicaState::Watching); + } + } + ReplicaState::Leader => { + // TODO: Check for original primary recovery + trace!("Replica acting as leader"); + } + ReplicaState::Failed => { + if self.consecutive_successes >= self.success_threshold { + info!("Replica recovered from Failed state, transitioning to Watching"); + self.transition_to(ReplicaState::Watching); + } + } + ReplicaState::Demoting => { + // TODO: Complete demotion back to watching + trace!("Replica demotion in progress"); + } + } + } + + fn handle_heartbeat_failure(&mut self) { + self.consecutive_failures += 1; + self.consecutive_successes = 0; + + match self.state { + ReplicaState::Watching | ReplicaState::Initializing => { + if self.consecutive_failures >= self.failure_threshold { + info!( + "Replica exceeded failure threshold ({}/{}), transitioning to Failed", + self.consecutive_failures, self.failure_threshold + ); + self.transition_to(ReplicaState::Failed); + } else { + trace!("Replica heartbeat failed, but below threshold"); + } + } + ReplicaState::Promoting + | ReplicaState::PromotionFailed + | ReplicaState::Leader + | ReplicaState::Demoting + | ReplicaState::Failed => { + trace!("Replica heartbeat failed in state: {}", self.state.name()); + } + } + } + + fn state_name(&self) -> &'static str { + self.state.name() + } + + fn consecutive_successes(&self) -> usize { + self.consecutive_successes + } + + fn consecutive_failures(&self) -> usize { + self.consecutive_failures + } +} -- 2.39.5 From 948334b89e6cae104d133259eac0f443d89df35a Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Tue, 3 Feb 2026 06:39:56 -0500 Subject: [PATCH 09/19] wip: cleaning up llm code, pretty close to something comprehensible and robust --- harmony_agent/README.md | 246 ++++++++++ harmony_agent/src/agent.rs | 214 --------- harmony_agent/src/agent_loop.rs | 150 ++++--- harmony_agent/src/main.rs | 322 +++---------- harmony_agent/src/old/typestate.rs | 230 ++++++++++ harmony_agent/src/old/typestate_gemini.rs | 523 ++++++++++++++++++++++ harmony_agent/src/store/chaos.rs | 69 +-- harmony_agent/src/store/memory.rs | 134 +++--- harmony_agent/src/store/mod.rs | 43 +- harmony_agent/src/store/nats.rs | 58 ++- harmony_agent/src/workflow/mod.rs | 22 +- harmony_agent/src/workflow/primary.rs | 5 +- harmony_agent/src/workflow/replica.rs | 81 ++-- 13 files changed, 1385 insertions(+), 712 deletions(-) create mode 100644 harmony_agent/README.md delete mode 100644 harmony_agent/src/agent.rs create mode 100644 harmony_agent/src/old/typestate.rs create mode 100644 harmony_agent/src/old/typestate_gemini.rs diff --git a/harmony_agent/README.md b/harmony_agent/README.md new file mode 100644 index 00000000..c22d1b51 --- /dev/null +++ b/harmony_agent/README.md @@ -0,0 +1,246 @@ +TODO + +DONE: +1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type +2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision) +3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded +4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent +5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback +6. ✅ failover_timeout added to AgentConfig +7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error +8. ✅ startup reconciliation implemented via on_startup() method + +REMAINING: +- review all code and list implementation issues +- review both workflow for each state transition +- Complete replica workflow staleness detection (needs implementation in Watching state) +- Implement state recovery from Failed state for both workflows +- Implement subscribe in NATS store with watch() API +- Implement config validation for failover_timeout constraints + +TODO + +1. store trait subscribe definition missing callback +2. BUG, data integrity issue : nats store not actually using jetstream metadata +3. review all code and list implementation issues +4. review both workflow for each state transition +5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded +6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself + should hold it probably, every agent should be subscribed to the cluster_state object and + keep it in memory to allow workflows to process against it efficiently) + +## CRITICAL - Data Integrity Issues + +1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`) + - Currently uses `put()` which overwrites unconditionally + - Must use `update()` with revision parameter for proper compare-and-set + - Without this, concurrent promotion attempts can cause split brain + +2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`) + - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3 + - NATS Entry has `.revision` and `.created` fields that must be used + - This defeats the entire purpose of store-provided timestamps + +3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`) + - Line ~156: TODO comment confirms missing metadata passing + - Replica cannot calculate staleness without metadata.timestamp + - Failover logic is broken + +4. **No actual cluster state watching exists** + - Replica workflow declares `ClusterState` but never updates it + - No subscription to primary heartbeat or cluster_state key + - Replica cannot detect primary liveness + +## HIGH - Missing Core Functionality + +5. **Replica Workflow incomplete** - All key logic is TODO: + - Watching primary staleness (line 114) + - Promotion attempt (line 118) + - Original primary recovery detection (line 127) + - Demotion/handshake (line 131) + +6. **Missing replica "Failed" state** + - `ReplicaState` enum has no `Failed` variant + - User's TODO #5 correctly identifies this gap + - What happens if replica's own heartbeats fail repeatedly? + +7. **Primary Workflow incomplete** - Key logic missing: + - No NATS check before recovering from `Fenced` state (line 95) + - No NATS check in `Yielding` state for demotion handshake (line 101) + - No actual fencing failure handling + +8. **Store `subscribe` not implemented** (`store/mod.rs`) + - Returns `todo!()` in NATS implementation + - No callback mechanism defined in trait + - Without this, agents cannot react to state changes + +9. **Cluster state not tracked centrally** + - User's TODO #6 correctly identifies this + - Each agent should maintain a local copy of cluster_state + - No subscription mechanism to update this local copy + +10. **No validation of configuration constraints** + - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin` + - Invalid config could cause split brain + +## MEDIUM - Incorrect State Transitions + +11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`) + - Two state transitions happen in one heartbeat cycle + - Should stay in `Failed` until fencing actually completes + - What if fencing fails? State machine won't reflect it + +12. **No fencing failure handling** + - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes + - ADR mentions escalating to radical measures, but no callback for failure + +13. **Replica `Watching` state does nothing** + - Line 115: Just logs, checks nothing + - Should be checking staleness of primary heartbeat + +14. **Demotion handshake not implemented** + - ADR section 4 details this but code doesn't implement it + - How does original primary know it should yield? + +## LOW - Observability & Reliability + +15. **No graceful shutdown mechanism** + - `run_heartbeat_loop` runs forever + - No signal handling (SIGTERM, SIGINT) + +16. **Async task errors silently ignored** + - `tokio::spawn` at lines 74, 83, 123 + - No `JoinHandle` retention or error handling + +17. **No metrics/observability** + - Only log output + - No Prometheus metrics for state transitions, failure counts, etc. + +18. **Hardcoded main() function** (`agent_loop.rs::main`) + - Not production-ready entry point + - Should load config from environment or file + +19. **Store factory pattern missing** + - TODO comment at line 54 confirms this + - Can't switch between stores via config + +20. **No backoff/retry logic for NATS operations** + - Transient failures could trigger unnecessary fencing + +21. **`AgentInfo` status is hardcoded to "HEALTHY"** + - Line 137 in `store_heartbeat` + - Should反映 actual workflow state + +22. **Unused fields in structs** + - `HeartbeatState.last_seq` set but never read + - `ClusterState.current_primary` set but never read + +## ADR-017-3 Compliance Issues + +23. **ADR violation: Clock skew not avoided** + - While ADR says use store metadata, code uses local time + +24. **Failover timeout not configurable** + - Defined in ADR but not in `AgentConfig` + - Needed for replica staleness calculation + +25. **Safety margin concept exists in ADR but not in code** + - Configuration should include this margin + +26. **No handling of Case 3 (Replica Network Lag)** + - ADR describes NATS rejection prevention + - But `set_strict` implementation accepts any write + +## Code Quality Issues + +27. **Inconsistent error handling** + - Some paths return `Err`, others `todo!()`, others ignore + +28. **Unnecessary `Clone` bounds** + - `DeploymentConfig.clone()` used frequently + - Could be optimized with `Arc` + +29. **Missing lifetime annotations** + - `KvStore::get` returns `String` key in error - inefficient + +30. **No integration points mentioned** + - PostgreSQL lifecycle control implementation missing + - Fencing via CNPG not connected + +## Production Readiness Checklist Summary + +For battle testing preparation, you need: + +**Immediate ( blockers):** +- Fix NATS store metadata usage (issues #1, #2) +- Implement strict set_strict with actual CAS (#1) +- Implement replica primary watching (#4, #5) +- Add failover_timeout config + staleness logic (#3, #24) +- Implement subscribe mechanism with callbacks (#8) + +**High priority:** +- Complete all workflow transitions (#5, #7, #11-14) +- Add cluster state tracking (#6, #9) +- Add configuration validation (#10) +- Add Replica Failed state (#6) + +**Before deployment:** +- Implement graceful shutdown (#15) +- Add error handling for spawned tasks (#16) +- Remove hardcoded main function (#18) +- Implement store factory (#19) +- Add Prometheus metrics (#17) + +**Documentation:** +- Document all configuration parameters and their trade-offs +- Add runbooks for each failure mode +- Document battle test scenarios to cover + +### Addendum: Missing Critical Issues + +#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity) +* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`. +* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded. +* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY". +* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely. + +#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption) +* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`). +* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks. +* **Scenario:** + 1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s). + 2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s). + 3. `on_active` finishes *before* `on_failover`. + 4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy. +* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one. + +#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk) +* **Location:** `agent_loop.rs` loop logic. +* **The Bug:** There is no "Stop the World" gate. +* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*. +* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again. +* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter. + +#### 4. HIGH: NATS Bucket Name Collision +* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`. +* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`. +* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state. +* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`. + +#### 5. HIGH: Startup State Reconciliation +* **Location:** `HarmonyAgent::new`. +* **The Bug:** Agents always start in `Initializing`. +* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader. +* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime. +* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check). + +### Summary of Tasks to Add + +Please add these to your master list before starting implementation: + +28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY". +29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping. +30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection). +31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`. +32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid. + diff --git a/harmony_agent/src/agent.rs b/harmony_agent/src/agent.rs deleted file mode 100644 index 14384107..00000000 --- a/harmony_agent/src/agent.rs +++ /dev/null @@ -1,214 +0,0 @@ -use async_nats::jetstream::kv::Store; -use async_trait::async_trait; -use harmony_types::id::Id; -use log::{debug, error, info, trace}; -use serde::{Deserialize, Serialize}; -use std::time::{SystemTime, UNIX_EPOCH}; - -use crate::config::AgentConfig; - -#[async_trait] -pub trait HealthStore: Send + Sync { - async fn put( - &self, - key: String, - value: Vec, - ) -> Result>; -} - -#[async_trait] -impl HealthStore for Store { - async fn put( - &self, - key: String, - value: Vec, - ) -> Result> { - trace!("HealthStore::put key={} value_len={}", key, value.len()); - self.put(key, value.into()) - .await - .map_err(|e| Box::new(e) as Box) - } -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct AgentHeartbeat { - pub cluster_id: Id, - pub status: String, - pub timestamp: u64, -} - -pub struct HarmonyAgent { - config: AgentConfig, - nats_client: Option, - health_kv: Box, -} - -impl HarmonyAgent { - pub async fn new(config: AgentConfig) -> Result> { - info!("Initializing HarmonyAgent"); - info!(" nats_url: {}", config.nats_url); - info!(" my_cluster_id: {}", config.my_cluster_id); - info!(" desired_primary: {}", config.desired_primary); - info!(" heartbeat_interval: {:?}", config.heartbeat_interval); - info!(" nats_creds_path: {:?}", config.nats_creds_path); - debug!("Full Bootstrap configuration:\n{config:#?}"); - - let mut options = async_nats::ConnectOptions::new(); - if let Some(creds) = &config.nats_creds_path { - debug!("Loading NATS credentials from file: {}", creds); - options = options.credentials_file(creds).await?; - } - - debug!("Connecting to nats"); - let client = async_nats::connect_with_options(&config.nats_url, options).await?; - info!("Successfully connected to NATS at {}", config.nats_url); - let jetstream = async_nats::jetstream::new(client.clone()); - - // Initialize KV Buckets as per ADR-017 - const HEARTBEAT_KV_HISTORY_SIZE: i64 = 64; - debug!("Creating health KV bucket: harmony_agent_health"); - let health_kv = jetstream - .create_key_value(async_nats::jetstream::kv::Config { - bucket: "harmony_agent_health".to_string(), - history: HEARTBEAT_KV_HISTORY_SIZE, - ..Default::default() - }) - .await - .map_err(|e| { - error!( - "Failed to initialize health KV bucket 'harmony_agent_health': {}", - e - ); - e - })?; - info!("Successfully initialized health KV bucket: harmony_agent_health"); - - Ok(Self { - config, - nats_client: Some(client), - health_kv: Box::new(health_kv), - }) - } - - pub async fn run_heartbeat_loop(&self) -> Result<(), Box> { - let mut interval = tokio::time::interval(self.config.heartbeat_interval); - let key = format!("heartbeat.{}", self.config.my_cluster_id); - - info!( - "Starting heartbeat loop for cluster: {}", - self.config.my_cluster_id - ); - - loop { - interval.tick().await; - trace!("Heartbeat loop tick"); - - let now = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|e| { - error!("Failed to get system time for heartbeat: {}", e); - e - })? - .as_millis() as u64; - - let heartbeat = AgentHeartbeat { - cluster_id: self.config.my_cluster_id.clone(), - status: "HEALTHY".to_string(), - timestamp: now, - }; - - debug!( - "Sending heartbeat for cluster: {}", - self.config.my_cluster_id - ); - let payload = serde_json::to_vec(&heartbeat)?; - - // Write heartbeat to KV. ADR-017: Write failure triggers self-demotion logic - match self.health_kv.put(key.clone(), payload).await { - Ok(_) => { - debug!( - "Heartbeat successful for cluster: {}", - self.config.my_cluster_id - ); - } - Err(e) => { - error!( - "Failed to write heartbeat: {}. Fencing logic would trigger here.", - e - ); - // In a real implementation, we would trigger self-demotion/fencing here - } - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::{Arc, Mutex}; - use tokio::time::{Duration, advance}; - - struct MockHealthStore { - puts: Arc)>>>, - } - - #[async_trait] - impl HealthStore for MockHealthStore { - async fn put( - &self, - key: String, - value: Vec, - ) -> Result> { - self.puts.lock().unwrap().push((key, value)); - Ok(0) - } - } - - #[tokio::test(start_paused = true)] - async fn test_heartbeat_loop() { - let config = AgentConfig { - nats_url: "nats://localhost:4222".to_string(), - nats_creds_path: None, - my_cluster_id: "test-cluster".into(), - desired_primary: "test-cluster".into(), - heartbeat_interval: Duration::from_millis(100), - }; - - let puts = Arc::new(Mutex::new(Vec::new())); - let mock_store = MockHealthStore { puts: puts.clone() }; - - let agent = HarmonyAgent { - config, - nats_client: None, - health_kv: Box::new(mock_store), - }; - - // Run the loop in a separate task - let handle = tokio::spawn(async move { - let _ = agent.run_heartbeat_loop().await; - }); - - // Advance time in increments to trigger multiple heartbeats - for _ in 0..3 { - advance(Duration::from_millis(100)).await; - tokio::time::sleep(Duration::from_millis(1)).await; - } - - let recorded_puts = puts.lock().unwrap(); - assert!( - recorded_puts.len() >= 2, - "Should have recorded at least 2 heartbeats, got {}", - recorded_puts.len() - ); - - let (key, payload) = &recorded_puts[0]; - assert_eq!(key, "heartbeat.test-cluster"); - - let heartbeat: AgentHeartbeat = serde_json::from_slice(payload).unwrap(); - assert_eq!(heartbeat.cluster_id.to_string(), "test-cluster"); - assert_eq!(heartbeat.status, "HEALTHY"); - - handle.abort(); - } -} diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs index 2b92b851..089b013d 100644 --- a/harmony_agent/src/agent_loop.rs +++ b/harmony_agent/src/agent_loop.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; use tokio::sync::RwLock; use tokio::time::Instant; -use crate::store::{KvStore, KvStoreError}; +use crate::store::{KvMetadata, KvStore, KvStoreError}; use crate::workflow::HeartbeatWorkflow; use crate::workflow::primary::PrimaryWorkflow; use crate::workflow::replica::ReplicaWorkflow; @@ -18,16 +18,24 @@ pub enum AgentRole { Replica, } -pub async fn main() -> Result<(), Box> { - env_logger::init(); - +pub async fn launch_agent( + role: AgentRole, + health_kv: Arc, + cluster_kv: Arc, + heartbeat_interval: Duration, + failover_timeout: Duration, +) -> Result<(), Box> +where + S: KvStore + Send + Sync + 'static, +{ let my_agent_id = Id::from_str("agent_1").unwrap(); let config = AgentConfig { + role, success_threshold: 2, failure_threshold: 2, - heartbeat_interval: Duration::from_secs(1), - failover_timeout: Duration::from_secs(5), + heartbeat_interval, + failover_timeout, deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig { desired_primary_agent: my_agent_id.clone(), cnpg_cluster_name: String::from("cnpg_cluster_name"), @@ -35,7 +43,6 @@ pub async fn main() -> Result<(), Box> { nats_url: String::new(), nats_creds_path: None, agent_id: my_agent_id, - role: AgentRole::Replica, cluster_id: "cluster_test_id".into(), desired_primary_id: "primary_id".into(), }; @@ -46,13 +53,11 @@ pub async fn main() -> Result<(), Box> { // TODO load store based on config, default to nats // probably a good use case for a factory pattern - use crate::store::ChaosKvStore; - use crate::store::InMemoryKvStore; - let health_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 1000); - let cluster_kv = ChaosKvStore::new(InMemoryKvStore::new(), 30, 30, 2000); let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv); + agent.reconcile_startup().await?; + // Run the heartbeat loop agent.run_heartbeat_loop().await; @@ -140,22 +145,11 @@ pub struct AgentInfo { pub status: String, } -/// Store-provided metadata for a heartbeat -/// This is returned by the KV store and includes timing/ordering guarantees -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct HeartbeatMetadata { - /// Timestamp set by the store (e.g., NATS JetStream) - /// This avoids clock skew between agents - pub timestamp: u64, - /// Sequence number for strict ordering (e.g., JetStream sequence) - pub sequence: u64, -} - /// Complete heartbeat with both agent data and store metadata #[derive(Debug, Serialize, Deserialize, Clone)] pub struct AgentHeartbeat { pub agent_info: AgentInfo, - pub metadata: Option, + pub metadata: Option, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -169,8 +163,8 @@ pub struct ClusterStateData { pub struct HarmonyAgent { pub config: AgentConfig, workflow: Box, - health_kv: S, - cluster_kv: S, + health_kv: Arc, + cluster_kv: Arc, /// Last successful heartbeat, used to track sequence number for next write /// This avoids doing a GET before every SET, reducing network round-trips last_heartbeat: Arc>>, @@ -179,8 +173,8 @@ pub struct HarmonyAgent { cluster_state: Arc>>, } -impl HarmonyAgent { - pub fn new(config: AgentConfig, health_kv: S, cluster_kv: S) -> Self { +impl HarmonyAgent { + pub fn new(config: AgentConfig, health_kv: Arc, cluster_kv: Arc) -> Self { let workflow: Box = match config.role { AgentRole::Primary => { info!("Initializing agent as PRIMARY"); @@ -192,7 +186,7 @@ impl HarmonyAgent { } AgentRole::Replica => { info!("Initializing agent as REPLICA"); -// pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self + // pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self Box::new(ReplicaWorkflow::new( config.success_threshold, config.failure_threshold, @@ -219,10 +213,13 @@ impl HarmonyAgent { /// based on the persisted cluster state pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> { let cluster_key = format!("cluster.{}", self.config.cluster_id); - - debug!("Fetching cluster state for startup reconciliation from key: {}", cluster_key); - - let cluster_state_option = match self.cluster_kv.get(cluster_key.clone()).await { + + debug!( + "Fetching cluster state for startup reconciliation from key: {}", + cluster_key + ); + + let cluster_state_option = match self.cluster_kv.get(&cluster_key).await { Ok(result) => { if let Some(value) = result.value { match serde_json::from_value::(value) { @@ -252,7 +249,7 @@ impl HarmonyAgent { // Cache the cluster state locally *self.cluster_state.write().await = cluster_state_option; - + Ok(()) } @@ -261,7 +258,7 @@ impl HarmonyAgent { /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence) /// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp /// comparisons use the store's clock, not agent clocks. - /// + /// /// This method uses the last successful heartbeat's sequence number to avoid an extra /// GET call before each SET, reducing network round-trips and latency exposure. async fn store_heartbeat(&self) -> Result { @@ -276,13 +273,12 @@ impl HarmonyAgent { }; debug!("Storing heartbeat for agent: {}", self.config.agent_id); - let value = serde_json::to_value(&agent_info) - .map_err(|e| KvStoreError::DeserializationFailed { + let value = + serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed { deserialization_error: e.to_string(), value: format!("{:?}", agent_info), })?; - // Get expected sequence from last successful heartbeat (0 if first write) let expected_sequence = { let last = self.last_heartbeat.read().await; last.as_ref() @@ -291,18 +287,20 @@ impl HarmonyAgent { .unwrap_or(0) }; - // Write with strict ordering - single network round-trip - let new_seq = self.health_kv.set_strict(key, value, expected_sequence).await?; - - debug!("Heartbeat stored successfully with sequence: {}", new_seq); - + trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}"); + let new_seq = self + .health_kv + .set_strict(&key, value, expected_sequence) + .await?; + trace!("Got new sequence {new_seq}"); + let kv_result = self.health_kv.get_revision(&key, new_seq).await?; + + debug!("Heartbeat stored succsssfully with sequence: {}", new_seq); + // Construct complete heartbeat with metadata from store let heartbeat = AgentHeartbeat { agent_info, - metadata: Some(HeartbeatMetadata { - timestamp: todo!("get the real timestamp from store"), - sequence: new_seq, - }), + metadata: Some(kv_result.metadata), }; // Cache this successful heartbeat for next iteration @@ -318,30 +316,53 @@ impl HarmonyAgent { next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval; // Perform the check via the config/strategy with a timeout + // + // FIXME There is too much stuff happening inside the timeout. There are some things like a + // promotion, that we don't want to cancel within a single heartbeat interval timeout + // I think that the timeout should only apply to the store_heartbeat().await call. + // Logic happening after should not be affected in the exact same manner. There can be + // other timeouts or other stuff to consider here. + // However, the system does rely on heartbeats happening regularly, so we do not want + // to delay the next heartbeat either. This is tricky. + // An idea right now is to keep the heartbeat running but, when a processing event + // occurs, set a flag on the local agent that there is a process running (promotion, + // demotion, etc) and take no other decision until this process is not done. There is + // one exception we can think of right now : + // - a healthy primary starts running a process such as "calling mom" + // - the primary keeps sending its heartbeat to prove to the rest of the cluster that + // it is still healthy + // - then the primary heartbeat fails up to failure_threshold + // - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain. + // - Another solution would be register the processing: "calling mom" in the primary + // heartbeat store, and prevent the replica from promoting when there is a running + // task on the primary. let result = tokio::time::timeout(self.config.heartbeat_interval, async { // Store heartbeat and perform deployment-specific health check match &self.store_heartbeat().await { Ok(heartbeat) => { // Heartbeat stored successfully, already cached by store_heartbeat - debug!("Heartbeat stored: seq={}", heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)); - // Pass heartbeat with metadata to workflow for staleness checks - self.workflow.on_heartbeat_stored(heartbeat).await; + debug!( + "Heartbeat stored: seq={}", + heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0) + ); } - Err(KvStoreError::SequenceMismatch { expected, current }) => { + Err(KvStoreError::WrongLastRevision) => { + todo!("fetch and update correct last sequence number") // CAS failure could indicate: // 1. Network latency: our previous timeout heartbeat actually succeeded // 2. Agent ID conflict: another agent with same ID exists // 3. Clock/bucket corruption (unlikely) - log::warn!( - "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}", - self.config.agent_id, expected, current, current - ); - // Update cached heartbeat sequence to prevent repeated failures - if let Some(hb) = self.last_heartbeat.write().await.as_mut() { - if let Some(metadata) = hb.metadata.as_mut() { - metadata.sequence = *current; - } - } + + // log::warn!( + // "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}", + // self.config.agent_id, expected, current, current + // ); + // // Update cached heartbeat sequence to prevent repeated failures + // if let Some(hb) = self.last_heartbeat.write().await.as_mut() { + // if let Some(metadata) = hb.metadata.as_mut() { + // metadata.sequence = *current; + // } + // } } Err(e) => { // Actual storage failure - treat as heartbeat failure @@ -349,7 +370,10 @@ impl HarmonyAgent { return Err(HeartbeatFailure {}); } } - self.config.deployment_config_unstable.perform_heartbeat().await?; + self.config + .deployment_config_unstable + .perform_heartbeat() + .await?; // TODO: Pass the heartbeat with metadata to the workflow for staleness checks // The workflow needs access to metadata.timestamp for failover timeout calculations @@ -367,10 +391,10 @@ impl HarmonyAgent { trace!("Got heartbeat_result : {heartbeat_result:?}"); match heartbeat_result { Ok(_) => { - self.workflow.handle_heartbeat_success(); + self.workflow.handle_heartbeat_success().await; } Err(_) => { - self.workflow.handle_heartbeat_failure(); + self.workflow.handle_heartbeat_failure().await; } } diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs index de88ecf5..92a0fa09 100644 --- a/harmony_agent/src/main.rs +++ b/harmony_agent/src/main.rs @@ -1,259 +1,83 @@ -// mod typestate_gemini; -// mod typestate; +use std::{sync::Arc, time::Duration}; + +use async_nats::jetstream::kv::Store; + +use crate::{ + agent_loop::AgentRole, + store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}, +}; + mod agent_loop; -mod workflow; pub mod store; +mod workflow; #[tokio::main] async fn main() { - // typestate_gemini::main_typestate_gemini().await; - agent_loop::main().await; + env_logger::init(); + + let heartbeat_interval = Duration::from_millis(2000); + let failover_timeout = Duration::from_secs(10); + + // let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout); + + let nats_store = get_local_nats_store().await; + let health_kv = nats_store.clone(); + let cluster_kv = nats_store.clone(); + + let _ = tokio::join!( + agent_loop::launch_agent( + AgentRole::Primary, + health_kv.clone(), + cluster_kv.clone(), + heartbeat_interval, + failover_timeout + ), + agent_loop::launch_agent( + AgentRole::Replica, + health_kv, + cluster_kv, + heartbeat_interval, + failover_timeout + ), + ); } -// TODO -// -// DONE: -// 1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type -// 2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision) -// 3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded -// 4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent -// 5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback -// 6. ✅ failover_timeout added to AgentConfig -// 7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error -// 8. ✅ startup reconciliation implemented via on_startup() method -// -// REMAINING: -// - review all code and list implementation issues -// - review both workflow for each state transition -// - Complete replica workflow staleness detection (needs implementation in Watching state) -// - Implement state recovery from Failed state for both workflows -// - Implement subscribe in NATS store with watch() API -// - Implement config validation for failover_timeout constraints +fn get_chaos_store( + heartbeat_interval: &Duration, + failover_timeout: &Duration, +) -> ( + Arc>, + Arc>, +) { + let health_kv = Arc::new(ChaosKvStore::new( + InMemoryKvStore::new(), + 10, + 10, + heartbeat_interval.as_millis().try_into().unwrap(), + )); + let cluster_kv = Arc::new(ChaosKvStore::new( + InMemoryKvStore::new(), + 5, + 5, + failover_timeout.as_millis().try_into().unwrap(), + )); -// TODO -// -// 1. store trait subscribe definition missing callback -// 2. BUG, data integrity issue : nats store not actually using jetstream metadata -// 3. review all code and list implementation issues -// 4. review both workflow for each state transition -// 5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded -// 6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself -// should hold it probably, every agent should be subscribed to the cluster_state object and -// keep it in memory to allow workflows to process against it efficiently) + (health_kv, cluster_kv) +} -// ## CRITICAL - Data Integrity Issues -// -// 1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`) -// - Currently uses `put()` which overwrites unconditionally -// - Must use `update()` with revision parameter for proper compare-and-set -// - Without this, concurrent promotion attempts can cause split brain -// -// 2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`) -// - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3 -// - NATS Entry has `.revision` and `.created` fields that must be used -// - This defeats the entire purpose of store-provided timestamps -// -// 3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`) -// - Line ~156: TODO comment confirms missing metadata passing -// - Replica cannot calculate staleness without metadata.timestamp -// - Failover logic is broken -// -// 4. **No actual cluster state watching exists** -// - Replica workflow declares `ClusterState` but never updates it -// - No subscription to primary heartbeat or cluster_state key -// - Replica cannot detect primary liveness -// -// ## HIGH - Missing Core Functionality -// -// 5. **Replica Workflow incomplete** - All key logic is TODO: -// - Watching primary staleness (line 114) -// - Promotion attempt (line 118) -// - Original primary recovery detection (line 127) -// - Demotion/handshake (line 131) -// -// 6. **Missing replica "Failed" state** -// - `ReplicaState` enum has no `Failed` variant -// - User's TODO #5 correctly identifies this gap -// - What happens if replica's own heartbeats fail repeatedly? -// -// 7. **Primary Workflow incomplete** - Key logic missing: -// - No NATS check before recovering from `Fenced` state (line 95) -// - No NATS check in `Yielding` state for demotion handshake (line 101) -// - No actual fencing failure handling -// -// 8. **Store `subscribe` not implemented** (`store/mod.rs`) -// - Returns `todo!()` in NATS implementation -// - No callback mechanism defined in trait -// - Without this, agents cannot react to state changes -// -// 9. **Cluster state not tracked centrally** -// - User's TODO #6 correctly identifies this -// - Each agent should maintain a local copy of cluster_state -// - No subscription mechanism to update this local copy -// -// 10. **No validation of configuration constraints** -// - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin` -// - Invalid config could cause split brain -// -// ## MEDIUM - Incorrect State Transitions -// -// 11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`) -// - Two state transitions happen in one heartbeat cycle -// - Should stay in `Failed` until fencing actually completes -// - What if fencing fails? State machine won't reflect it -// -// 12. **No fencing failure handling** -// - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes -// - ADR mentions escalating to radical measures, but no callback for failure -// -// 13. **Replica `Watching` state does nothing** -// - Line 115: Just logs, checks nothing -// - Should be checking staleness of primary heartbeat -// -// 14. **Demotion handshake not implemented** -// - ADR section 4 details this but code doesn't implement it -// - How does original primary know it should yield? -// -// ## LOW - Observability & Reliability -// -// 15. **No graceful shutdown mechanism** -// - `run_heartbeat_loop` runs forever -// - No signal handling (SIGTERM, SIGINT) -// -// 16. **Async task errors silently ignored** -// - `tokio::spawn` at lines 74, 83, 123 -// - No `JoinHandle` retention or error handling -// -// 17. **No metrics/observability** -// - Only log output -// - No Prometheus metrics for state transitions, failure counts, etc. -// -// 18. **Hardcoded main() function** (`agent_loop.rs::main`) -// - Not production-ready entry point -// - Should load config from environment or file -// -// 19. **Store factory pattern missing** -// - TODO comment at line 54 confirms this -// - Can't switch between stores via config -// -// 20. **No backoff/retry logic for NATS operations** -// - Transient failures could trigger unnecessary fencing -// -// 21. **`AgentInfo` status is hardcoded to "HEALTHY"** -// - Line 137 in `store_heartbeat` -// - Should反映 actual workflow state -// -// 22. **Unused fields in structs** -// - `HeartbeatState.last_seq` set but never read -// - `ClusterState.current_primary` set but never read -// -// ## ADR-017-3 Compliance Issues -// -// 23. **ADR violation: Clock skew not avoided** -// - While ADR says use store metadata, code uses local time -// -// 24. **Failover timeout not configurable** -// - Defined in ADR but not in `AgentConfig` -// - Needed for replica staleness calculation -// -// 25. **Safety margin concept exists in ADR but not in code** -// - Configuration should include this margin -// -// 26. **No handling of Case 3 (Replica Network Lag)** -// - ADR describes NATS rejection prevention -// - But `set_strict` implementation accepts any write -// -// ## Code Quality Issues -// -// 27. **Inconsistent error handling** -// - Some paths return `Err`, others `todo!()`, others ignore -// -// 28. **Unnecessary `Clone` bounds** -// - `DeploymentConfig.clone()` used frequently -// - Could be optimized with `Arc` -// -// 29. **Missing lifetime annotations** -// - `KvStore::get` returns `String` key in error - inefficient -// -// 30. **No integration points mentioned** -// - PostgreSQL lifecycle control implementation missing -// - Fencing via CNPG not connected -// -// ## Production Readiness Checklist Summary -// -// For battle testing preparation, you need: -// -// **Immediate ( blockers):** -// - Fix NATS store metadata usage (issues #1, #2) -// - Implement strict set_strict with actual CAS (#1) -// - Implement replica primary watching (#4, #5) -// - Add failover_timeout config + staleness logic (#3, #24) -// - Implement subscribe mechanism with callbacks (#8) -// -// **High priority:** -// - Complete all workflow transitions (#5, #7, #11-14) -// - Add cluster state tracking (#6, #9) -// - Add configuration validation (#10) -// - Add Replica Failed state (#6) -// -// **Before deployment:** -// - Implement graceful shutdown (#15) -// - Add error handling for spawned tasks (#16) -// - Remove hardcoded main function (#18) -// - Implement store factory (#19) -// - Add Prometheus metrics (#17) -// -// **Documentation:** -// - Document all configuration parameters and their trade-offs -// - Add runbooks for each failure mode -// - Document battle test scenarios to cover -// -// ### Addendum: Missing Critical Issues -// -// #### 1. CRITICAL: Heartbeat "Lying" (Data Integrity) -// * **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`. -// * **The Bug:** `status: "HEALTHY".to_string()` is hardcoded. -// * **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY". -// * **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely. -// -// #### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption) -// * **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`). -// * **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks. -// * **Scenario:** -// 1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s). -// 2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s). -// 3. `on_active` finishes *before* `on_failover`. -// 4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy. -// * **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one. -// -// #### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk) -// * **Location:** `agent_loop.rs` loop logic. -// * **The Bug:** There is no "Stop the World" gate. -// * **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*. -// * **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again. -// * **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter. -// -// #### 4. HIGH: NATS Bucket Name Collision -// * **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`. -// * **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`. -// * **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state. -// * **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`. -// -// #### 5. HIGH: Startup State Reconciliation -// * **Location:** `HarmonyAgent::new`. -// * **The Bug:** Agents always start in `Initializing`. -// * **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader. -// * **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime. -// * **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check). -// -// ### Summary of Tasks to Add -// -// Please add these to your master list before starting implementation: -// -// 28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY". -// 29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping. -// 30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection). -// 31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`. -// 32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid. -// +async fn get_local_nats_store() -> Arc { + let client = async_nats::connect("localhost").await.unwrap(); + let jetstream = async_nats::jetstream::new(client); + let kv = jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: "kv".to_string(), + history: 10, + ..Default::default() + }) + .await + .unwrap(); + let status = kv.status().await.unwrap(); + println!("status: {:?}", status); + Arc::new(NatsKvStore::new(kv)) +} diff --git a/harmony_agent/src/old/typestate.rs b/harmony_agent/src/old/typestate.rs new file mode 100644 index 00000000..78445d55 --- /dev/null +++ b/harmony_agent/src/old/typestate.rs @@ -0,0 +1,230 @@ +use std::{marker::PhantomData, time::Duration}; + +/// Typestate pattern implementation for Primary and Replica state machines +/// Based on Will Crichton's typestate pattern + +pub mod primary { + use super::Agent; + + /// Primary state: Agent is initializing + pub struct Initializing {} + + /// Primary state: Heartbeat failures exceeded threshold + pub struct Failed {} + + /// Primary state: Database fenced/stopped + pub struct Fenced {} + + /// Primary state: Heartbeat succeeding + pub struct Healthy {} + + /// Primary state: Recovered from fence, waiting for demotion handshake + pub struct Yielding {} + + impl Agent { + /// Transition from initializing to healthy + pub fn healthy(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from failed to fenced + pub fn fence(self) -> Agent { + Agent { + consecutive_success: self.consecutive_success, + consecutive_failure: self.consecutive_failure, + failure_threshold: self.failure_threshold, + success_threshold: self.success_threshold, + heartbeat_timeout: self.heartbeat_timeout, + _state: PhantomData + } + } + + /// Transition from failed to healthy (recovery) + pub fn recover(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from fenced to yielding (waiting for demotion) + pub fn await_demotion(self) -> Agent { + self + } + + /// Transition from fenced to healthy (recovery after demotion completes) + pub fn recover(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from healthy to yielding (when original primary recovers) + pub fn yield_leadership(self) -> Agent { + self + } + + /// Transition from healthy to failed (heartbeat failure) + pub fn fail(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from yielding back to healthy (after demotion completes) + pub fn recover(self) -> Agent { + self + } + + /// Transition from yielding back to healthy (if demotion cancelled) + pub fn recover_and_promote(self) -> Agent { + self + } + } +} + +pub mod replica { + use super::Agent; + + /// Replica state: Agent is initializing + pub struct Initializing {} + + /// Replica state: Watching primary heartbeats + pub struct Watching {} + + /// Replica state: Failover timeout exceeded, attempting promotion + pub struct Promoting {} + + /// Replica state: Promotion attempt rejected by NATS + pub struct PromotionFailed {} + + /// Replica state: Successfully promoted to leader + pub struct Leader {} + + /// Replica state: Original primary recovered, yielding leadership + pub struct Demoting {} + + impl Agent { + /// Transition from initializing to watching + pub fn start_watching(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from watching to promoting (failover timeout reached) + pub fn promote(self) -> Agent { + self + } + + /// Transition from watching back to promoting (if demotion cancelled) + pub fn promote_again(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from promoting to leader (promotion successful) + pub fn become_leader(self) -> Agent { + self + } + + /// Transition from promoting to promotion_failed (NATS rejected) + pub fn promotion_rejected(self) -> Agent { + self + } + + /// Transition from promoting back to watching (reverted) + pub fn revert_to_watching(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from promotion_failed back to watching + pub fn continue_watching(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from leader to demoting (original primary recovered) + pub fn yield_leadership(self) -> Agent { + self + } + + /// Transition from leader to watching (if demotion cancelled) + pub fn revert_to_watching(self) -> Agent { + self + } + } + + impl Agent { + /// Transition from demoting back to watching (if demotion cancelled) + pub fn revert_to_watching(self) -> Agent { + self + } + + /// Transition from demoting back to leader (if demotion cancelled) + pub fn promote_again(self) -> Agent { + self + } + } +} + +/// Main Agent struct using typestate pattern +/// State is tracked through the generic type parameter +pub struct Agent { + pub consecutive_success: usize, + pub consecutive_failure: usize, + pub failure_threshold: usize, + pub success_threshold: usize, + pub heartbeat_timeout: Duration, + _state: PhantomData +} + +impl Agent { + /// Create a new agent in the given state with default thresholds + pub fn new(state: State) -> Self { + Agent { + consecutive_success: 0, + consecutive_failure: 0, + failure_threshold: 2, + success_threshold: 3, + heartbeat_timeout: Duration::from_secs(1), + _state: PhantomData + } + } + + /// Create a new agent with custom thresholds + pub fn with_thresholds(state: State, success_threshold: usize, failure_threshold: usize, heartbeat_timeout: Duration) -> Self { + Agent { + consecutive_success: 0, + consecutive_failure: 0, + failure_threshold, + success_threshold, + heartbeat_timeout, + _state: PhantomData + } + } +} + +impl Clone for Agent { + fn clone(&self) -> Self { + Agent { + consecutive_success: self.consecutive_success, + consecutive_failure: self.consecutive_failure, + failure_threshold: self.failure_threshold, + success_threshold: self.success_threshold, + heartbeat_timeout: self.heartbeat_timeout, + _state: PhantomData + } + } +} + +impl Default for Agent { + fn default() -> Self { + Self::new(Initializing {}) + } +} diff --git a/harmony_agent/src/old/typestate_gemini.rs b/harmony_agent/src/old/typestate_gemini.rs new file mode 100644 index 00000000..e4285bdd --- /dev/null +++ b/harmony_agent/src/old/typestate_gemini.rs @@ -0,0 +1,523 @@ +use std::marker::PhantomData; +use std::time::Duration; +use tokio::sync::mpsc; +use tokio::time::Instant; + +// ============================================================================= +// FSM Library (Type State Pattern) +// ============================================================================= + +pub mod fsm { + use super::*; + + /// Generic FSM container + pub struct FSM { + pub user_data: Option, + pub state: PhantomData, + pub _phantom_event: PhantomData, + } + + impl FSM { + pub fn new(user_data: Option) -> Self { + Self { + user_data, + state: PhantomData, + _phantom_event: PhantomData, + } + } + } + + /// Trait to represent FSM behavior via dynamic dispatch + pub trait HandleEvent { + fn handle_event(self: Box, event: E) -> Box>; + } + + /// Implemented per-state by the macro to route event logic + pub trait ErasedState: Send { + fn handle_event(self: Box, event: E) -> Box>; + } + + impl ErasedState for FSM + where + FSM: HandleEvent + Send + 'static, + { + fn handle_event(self: Box, event: E) -> Box> { + HandleEvent::handle_event(self, event) + } + } + + /// Allows FSM to move from state `S` to `T`, retaining user data + pub trait StateMachine: Send + 'static { + fn into_boxed(self) -> Box>; + } + + impl StateMachine for FSM + where + S: Send + 'static, + E: Send + 'static, + U: Send + 'static, + { + fn into_boxed(self) -> Box> { + Box::new(FSM { + user_data: self.user_data, + state: PhantomData, + _phantom_event: PhantomData, + }) + } + } + + /// Runs the FSM in an asynchronous loop + pub async fn run_machine( + mut state: Box>, + mut rx: tokio::sync::mpsc::Receiver, + ) where + E: Send + 'static, + U: Send + 'static, + { + while let Some(event) = rx.recv().await { + state = ErasedState::handle_event(state, event); + } + } +} + +/// Macro for Declaring Transitions +#[macro_export] +macro_rules! define_fsm { + ( + $struct:ident<$event:ident, $user:ident>, { + $( + $state:ty => { + $( + $pattern:pat => $next:ty => $action:expr + ),* $(,)? + } + ),* $(,)? + } + ) => { + $( + impl $crate::fsm::HandleEvent<$event, $user> for $struct<$state, $event, $user> { + fn handle_event(mut self: Box, event: $event) -> Box> { + match event { + $( + $pattern => { + // log::debug!("FSM Transition: {:?} --[{:?}]--> {:?}", stringify!($state), e, stringify!($next)); + log::debug!("FSM Transition: {:?} --[:?]--> {:?}", stringify!($state), stringify!($next)); + $action(&mut self); + self.into_boxed::<$next>() + } + )* + // Default handler for unmapped events in this state: stay in current state + _ => { + // log::trace!("FSM Ignore: {:?} --[{:?}]--> (no transition)", stringify!($state), event); + self + } + } + } + } + )* + }; +} + +// ============================================================================= +// Harmony Agent Domain Logic +// ============================================================================= + +use fsm::{ErasedState, StateMachine, FSM}; + +// --- States --- +#[derive(Debug)] +struct RolePrimary; // Active Leader +#[derive(Debug)] +struct RoleReplica; // Passive Watchdog +#[derive(Debug)] +struct RoleFencing; // Transition: Shutting down +#[derive(Debug)] +struct RolePromoting; // Transition: Taking over +#[derive(Debug)] +struct RoleDemoting; // Transition: Yielding + +// --- Events --- +#[derive(Debug, Clone)] +enum AgentEvent { + /// Periodic timer tick (drives checks) + Tick, + /// Result of a local health check (Primary only) + HealthCheckResult { success: bool }, + /// Update from NATS about the cluster state + ClusterStateUpdate { primary_id: String, timestamp: Instant }, + /// Command to force a state change (e.g. admin intervention) + ForceDemote, +} + +// --- Side Effect Commands (Outbound) --- +#[derive(Debug)] +enum WorkerCommand { + PerformHealthCheck, + PerformFencing, + PerformPromotion, + PerformDemotion, +} + +// --- Context --- +struct AgentContext { + // Config + agent_id: String, + success_threshold: usize, + failure_threshold: usize, + heartbeat_interval: Duration, + failover_timeout: Duration, + + // Runtime State + consecutive_failures: usize, + last_primary_heartbeat: Option, + + // Communication + worker_tx: mpsc::Sender, +} + +impl AgentContext { + fn send_command(&self, cmd: WorkerCommand) { + let tx = self.worker_tx.clone(); + tokio::spawn(async move { + if let Err(e) = tx.send(cmd).await { + log::error!("Failed to send worker command: {}", e); + } + }); + } +} + +// --- FSM Definition --- + +define_fsm!(FSM, { + // ------------------------------------------------------------------------- + // PRIMARY STATE (Self-Preservation) + // ------------------------------------------------------------------------- + RolePrimary => { + // 1. On Tick: Trigger a health check (Async Side Effect) + AgentEvent::Tick => RolePrimary => |s: &mut FSM| { + if let Some(ctx) = &mut s.user_data { + ctx.send_command(WorkerCommand::PerformHealthCheck); + } + }, + + // 2. Health Check Success: Reset counters + AgentEvent::HealthCheckResult { success: true } => RolePrimary => |s: &mut FSM| { + if let Some(ctx) = &mut s.user_data { + ctx.consecutive_failures = 0; + log::info!("✅ Heartbeat Success (Primary)"); + } + }, + + // 3. Health Check Failure: Increment counters & Check Threshold + AgentEvent::HealthCheckResult { success: false } => RolePrimary => |s: &mut FSM| { + // NOTE: We determine next state dynamically by checking threshold. + // Since the macro requires a static next type, we handle the "Stay" case here. + // If we need to transition, we assume the event loop sends a specific event, + // OR we use a separate state for "Checking". + // However, to keep it simple within this pattern, we will check threshold here. + // If threshold reached, we ideally want to return RoleFencing. + // But the macro forces `=> RolePrimary`. + // + // WORKAROUND: We use a specific event flow. + // Ideally, the `HealthCheckResult` logic would be: + // if fail >= threshold { transition Fencing } else { stay } + // + // To strictly follow the macro structure where destination is fixed per pattern: + // We can't branch to different types in one pattern. + // So we will stay in RolePrimary here, but if threshold is hit, we trigger Fencing immediately + // by sending a command, and we rely on the Worker to complete fencing and maybe restart us? + // + // BETTER APPROACH for this specific FSM pattern: + // We need an intermediate event or state if the destination depends on runtime data. + // But let's assume for this implementation that we handle the "Stay" case here, + // and if we fail, we transition to Fencing on the NEXT tick or via a self-generated event? + // + // Let's modify the logic: The Worker sends `HealthCheckResult { success: false }`. + // If we are still below threshold, we log. + // If we are at threshold, we treat this event as a trigger for Fencing? + // No, the pattern matches `Event => Type`. + // + // Revised: We need two patterns. But we can't match on values inside the struct in the macro easily + // unless we define specific events like `HealthCheckFailedFatal`. + // + // Let's use `consecutive_failures` check inside the action. + // If fatal, we return a new Box. + // Wait, the macro generates `self.into_boxed::<$next>()`. It hardcodes the return type. + // + // This is a limitation of the macro provided in the blog post. + // To solve this strictly following the provided code, we must ensure the event *itself* dictates the transition. + // + // So the Worker must know the threshold? No, that leaks logic. + // + // Solution: The FSM Action can mutate `ctx`. + // We will have `AgentEvent::HealthCheckFailed`. + // We stay in `RolePrimary`. + // Inside the action, if `ctx.failures >= threshold`, we `ctx.send_command(PerformFencing)`. + // And we transition to `RoleFencing`? We can't conditionally transition in the macro. + // + // OK, I will split the event. + // The Worker returns `HealthCheckResult`. + // The FSM handles it. + // If the FSM sees failure, it stays in Primary. + // But if it needs to fence, it needs to transition. + // + // I will add `AgentEvent::FencingTriggered` which is sent by the FSM to itself? + // Or simpler: The Worker sends `HealthCheckFailed`. + // If we want to fence, we need to move to `RoleFencing`. + // + // Let's adjust the macro usage slightly. The user said "follow exactly the FSM pattern". + // The pattern implies strict state transitions. + // + // I will implement `RolePrimary` -> `RoleFencing` on `ForceDemote` or similar. + // And I will assume the Worker sends `ForceDemote` if it detects critical failure? + // No, the logic belongs in the FSM. + // + // Let's use the `Tick` to check the counter. + // 1. Tick -> Check. + // 2. Result -> Update Counter. + // 3. Tick -> If counter > thresh -> Transition Fencing. + // + // Let's try that. + if let Some(ctx) = &mut s.user_data { + ctx.consecutive_failures += 1; + log::warn!("⚠️ Heartbeat Failed (Count: {}/{})", ctx.consecutive_failures, ctx.failure_threshold); + } + }, + + // 4. The actual Fencing Transition + // We use a specific pattern guard if possible, or just a separate event. + // Since we can't guard in the macro, we'll use a trick: + // If failures are high, the NEXT Tick will trigger transition? + // No, we want immediate. + // + // Let's add `AgentEvent::CriticalFailure` event. + // The `HealthCheckResult` handler (above) will check the threshold. + // If threshold reached, it cannot transition itself (locked to RolePrimary). + // BUT, it can emit a `CriticalFailure` event to the channel. + // Then the FSM loop picks it up and transitions. + AgentEvent::ForceDemote => RoleFencing => |s: &mut FSM| { + if let Some(ctx) = &mut s.user_data { + log::error!("🚨 Failure Threshold Reached. Initiating Fencing."); + ctx.send_command(WorkerCommand::PerformFencing); + } + }, + + // 5. Split Brain Prevention + AgentEvent::ClusterStateUpdate { primary_id, .. } => RoleDemoting => |s: &mut FSM| { + if let Some(ctx) = &mut s.user_data { + if primary_id != ctx.agent_id && !primary_id.is_empty() { + log::warn!("Split Brain Detected! Another primary is active: {}. Demoting.", primary_id); + ctx.send_command(WorkerCommand::PerformDemotion); + } + } + } + }, + + // ------------------------------------------------------------------------- + // REPLICA STATE (Watchdog) + // ------------------------------------------------------------------------- + RoleReplica => { + // 1. Receive Heartbeats from Primary + AgentEvent::ClusterStateUpdate { primary_id, timestamp } => RoleReplica => |s: &mut FSM| { + if let Some(ctx) = &mut s.user_data { + if !primary_id.is_empty() { + ctx.last_primary_heartbeat = Some(timestamp); + // log::trace!("Replica: Saw primary {} at {:?}", primary_id, timestamp); + } + } + }, + + // 2. Tick: Check for Staleness + AgentEvent::Tick => RoleReplica => |s: &mut FSM| { + // We can't transition conditionally here either. + // Same pattern: Check logic, if stale, send `ForcePromote` event to self. + if let Some(ctx) = &mut s.user_data { + if let Some(last) = ctx.last_primary_heartbeat { + let elapsed = Instant::now().duration_since(last); + if elapsed > ctx.failover_timeout { + log::warn!("⚡ Primary Stale ({}ms > {}ms). Triggering Promotion.", elapsed.as_millis(), ctx.failover_timeout.as_millis()); + // We need to trigger the transition. + // We can't do it directly in this closure because the return type is fixed to RoleReplica. + // So we assume the "Driver" or a self-send handles the trigger. + // For this implementation, we'll assume we have a handle to the main loop channel in ctx? + // No, ctx has `worker_tx`. + // + // We will send a command to worker to "ConfirmPromotionEligibility", which sends back `ForcePromote`. + ctx.send_command(WorkerCommand::PerformPromotion); // This checks eligibility then triggers event + } + } + } + }, + + // 3. Promotion Triggered + AgentEvent::ForceDemote => RolePromoting => |s: &mut FSM| { + log::info!("Promoting to Primary..."); + } + }, + + // ------------------------------------------------------------------------- + // FENCING STATE (Transient) + // ------------------------------------------------------------------------- + RoleFencing => { + // Once fencing is done (simulated by Tick or specific event), we become a Replica (Clean Demotion) + AgentEvent::Tick => RoleReplica => |s: &mut FSM| { + log::info!("Fencing/Demotion complete. Switching to Replica (Watchdog) mode."); + if let Some(ctx) = &mut s.user_data { + ctx.consecutive_failures = 0; + } + } + }, + + // ------------------------------------------------------------------------- + // PROMOTING STATE (Transient) + // ------------------------------------------------------------------------- + RolePromoting => { + // Promotion logic usually involves ensuring WAL catchup etc. + // We simulate success on next Tick. + AgentEvent::Tick => RolePrimary => |s: &mut FSM| { + log::info!("Promotion Complete. I am now the PRIMARY."); + if let Some(ctx) = &mut s.user_data { + ctx.consecutive_failures = 0; + // Reset heartbeat timestamp so we don't fence immediately + ctx.last_primary_heartbeat = Some(Instant::now()); + } + } + }, + + // ------------------------------------------------------------------------- + // DEMOTING STATE (Transient) + // ------------------------------------------------------------------------- + RoleDemoting => { + AgentEvent::Tick => RoleReplica => |s: &mut FSM| { + log::info!("Demotion Complete. Switching to Replica."); + } + } +}); + +// ============================================================================= +// Main & Runtime +// ============================================================================= + +pub async fn main_typestate_gemini() -> Result<(), Box> { + env_logger::init(); + log::info!("Harmony Agent FSM Starting..."); + + // 1. Setup Channels + let (event_tx, event_rx) = mpsc::channel::(100); + let (worker_tx, mut worker_rx) = mpsc::channel::(100); + + // 2. Configuration + let my_agent_id = "agent_1".to_string(); + let desired_primary = "agent_1".to_string(); // Change to "agent_2" to test Replica start + let is_primary = my_agent_id == desired_primary; + + let context = AgentContext { + agent_id: my_agent_id.clone(), + success_threshold: 2, + failure_threshold: 2, + heartbeat_interval: Duration::from_secs(1), + failover_timeout: Duration::from_secs(3), // 3s > 1s interval + consecutive_failures: 0, + last_primary_heartbeat: Some(Instant::now()), + worker_tx: worker_tx.clone(), + }; + + // 3. Spawn Worker (Simulates IO and Logic Glue) + let event_tx_worker = event_tx.clone(); + tokio::spawn(async move { + while let Some(cmd) = worker_rx.recv().await { + match cmd { + WorkerCommand::PerformHealthCheck => { + // Simulate IO latency + tokio::time::sleep(Duration::from_millis(100)).await; + + // Simulate random failure (10% chance) + let success = getrandom::u64().unwrap() % 100 > 10; + + // Send result back + let _ = event_tx_worker.send(AgentEvent::HealthCheckResult { success }).await; + + // CRITICAL: Logic Glue for the FSM limitation + // If we failed, we don't know the counter here easily without shared state. + // But for the purpose of this demo, let's assume the FSM handles the counter. + // If the FSM decides to fence, it sends PerformFencing. + // + // However, we need to trigger the transition event if threshold is hit. + // Since FSM action is sync and can't send async events easily back to itself *during* the transition, + // we rely on the FSM action checking the counter and sending a command to US (Worker), + // and WE send the transition event back. + } + WorkerCommand::PerformFencing => { + log::warn!("[Worker] Executing Fencing Procedure (Stop DB)..."); + tokio::time::sleep(Duration::from_millis(500)).await; + // Trigger the state transition in FSM + let _ = event_tx_worker.send(AgentEvent::ForceDemote).await; + } + WorkerCommand::PerformPromotion => { + log::info!("[Worker] Checking Promotion Eligibility..."); + // Simulate check + tokio::time::sleep(Duration::from_millis(200)).await; + // Trigger transition + let _ = event_tx_worker.send(AgentEvent::ForceDemote).await; // Reusing ForceDemote as "Trigger Transition" for Replica->Promote based on graph? + // Wait, Replica->Promote uses ForceDemote in the macro above? + // Yes: AgentEvent::ForceDemote => RolePromoting + } + WorkerCommand::PerformDemotion => { + log::warn!("[Worker] Yielding Leadership..."); + tokio::time::sleep(Duration::from_millis(200)).await; + // Trigger transition + // We need an event that goes Primary -> Demoting. + // In the macro: AgentEvent::ClusterStateUpdate handles the detection. + // But we need to transition. + // Actually, the macro for ClusterStateUpdate transitions DIRECTLY to RoleDemoting. + // So this command might just be for side-effects (stopping DB). + } + } + } + }); + + // 4. Spawn Timer (Heartbeat Tick) + let event_tx_timer = event_tx.clone(); + tokio::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(1)); + loop { + interval.tick().await; + let _ = event_tx_timer.send(AgentEvent::Tick).await; + } + }); + + // 5. Spawn NATS Watcher (Simulated) + let event_tx_nats = event_tx.clone(); + tokio::spawn(async move { + // Simulate receiving heartbeats from "agent_1" + loop { + tokio::time::sleep(Duration::from_millis(500)).await; + // If we are agent_1, we are the primary, so we don't see external heartbeats usually, + // but for simulation, let's say we see ourselves or nothing. + // If we are agent_2 (Replica), we see agent_1. + + // Uncomment to simulate primary death for Replica: + // continue; + + let _ = event_tx_nats.send(AgentEvent::ClusterStateUpdate { + primary_id: "agent_1".to_string(), + timestamp: Instant::now(), + }).await; + } + }); + + // 6. Initialize FSM + let initial_state: Box> = if is_primary { + log::info!("Starting as PRIMARY"); + Box::new(FSM::::new(Some(context))) + } else { + log::info!("Starting as REPLICA"); + Box::new(FSM::::new(Some(context))) + }; + + // 7. Run + fsm::run_machine(initial_state, event_rx).await; + + Ok(()) +} + diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs index 1dce4ed8..9fa6fc83 100644 --- a/harmony_agent/src/store/chaos.rs +++ b/harmony_agent/src/store/chaos.rs @@ -1,4 +1,5 @@ use async_trait::async_trait; +use log::{debug, trace, warn}; use serde_json::Value; use std::sync::Arc; use tokio::time::Duration; @@ -12,43 +13,55 @@ use super::{KvStore, KvStoreError}; #[derive(Clone)] pub struct ChaosKvStore { inner: Arc, - timeout_probability: u32, - failure_probability_percentage: u32, + timeout_probability_percent: u32, + failure_probability_percent: u32, max_delay_ms: u64, } impl ChaosKvStore { pub fn new( inner: T, - timeout_probability: u32, - failure_probability: u32, + timeout_probability_percent: u32, + failure_probability_percent: u32, max_delay_ms: u64, ) -> Self { Self { inner: Arc::new(inner), - timeout_probability, - failure_probability_percentage: failure_probability, + timeout_probability_percent, + failure_probability_percent, max_delay_ms, } } async fn maybe_chaos(&self) -> Result<(), KvStoreError> { + trace!("Calculating chaos"); // Random delay - if self.max_delay_ms > 0 { - let delay = getrandom::u64().unwrap() % self.max_delay_ms; - tokio::time::sleep(Duration::from_millis(delay)).await; - } + let delay = getrandom::u64().unwrap() % self.max_delay_ms; + let delay = Duration::from_millis(delay); + trace!("Sleeping until chaos maybe happens {delay:?}"); + tokio::time::sleep(delay).await; // Random failure - let failure_random = getrandom::u32().unwrap(); - if (failure_random % 100) < self.failure_probability_percentage { - return Err(KvStoreError::Unknown); + let failure_random = getrandom::u32().unwrap() % 100; + if failure_random < self.failure_probability_percent { + warn!( + "Chaos causes an error : {failure_random} < {}", + self.failure_probability_percent + ); + return Err(KvStoreError::Unknown(format!( + "Randomly failed thanks to chaos store with {}% chances, got {}", + self.failure_probability_percent, failure_random + ))); } // Random timeout (simulated as a very long delay) - let failure_random = getrandom::u32().unwrap(); - if failure_random % 100 < self.timeout_probability { - tokio::time::sleep(Duration::from_secs(10)).await; + let failure_random = getrandom::u32().unwrap() % 100; + if failure_random < self.timeout_probability_percent { + warn!( + "Chaos caused a timeout : {failure_random} < {}", + self.failure_probability_percent + ); + tokio::time::sleep(Duration::from_secs(189754678456784560)).await; } Ok(()) @@ -57,14 +70,23 @@ impl ChaosKvStore { #[async_trait] impl KvStore for ChaosKvStore { - async fn get(&self, key: String) -> Result { + async fn get(&self, key: &str) -> Result { self.maybe_chaos().await?; self.inner.get(key).await } + async fn get_revision( + &self, + key: &str, + expected_seq: u64, + ) -> Result { + self.maybe_chaos().await?; + self.inner.get_revision(key, expected_seq).await + } + async fn set_strict( &self, - key: String, + key: &str, value: Value, expected_sequence: u64, ) -> Result { @@ -74,7 +96,7 @@ impl KvStore for ChaosKvStore { async fn subscribe( &self, - key: String, + key: &str, callback: SubscriptionCallback, ) -> Result<(), KvStoreError> { self.maybe_chaos().await?; @@ -94,13 +116,10 @@ mod tests { let chaos = ChaosKvStore::new(inner, 0, 0, 0); let value = json!({"test": "value"}); - let result = chaos - .set_strict("key".to_string(), value.clone(), 0) - .await - .unwrap(); + let result = chaos.set_strict("key", value.clone(), 0).await.unwrap(); assert_eq!(result, 1); - let retrieved = chaos.get("key".to_string()).await.unwrap(); + let retrieved = chaos.get("key").await.unwrap(); assert_eq!(retrieved.value, Some(value)); } @@ -111,7 +130,7 @@ mod tests { let start = tokio::time::Instant::now(); let value = json!({"test": "value"}); - chaos.set_strict("key".to_string(), value, 0).await.unwrap(); + chaos.set_strict("key", value, 0).await.unwrap(); let elapsed = start.elapsed(); // Should have some delay diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs index 3549c563..150b7225 100644 --- a/harmony_agent/src/store/memory.rs +++ b/harmony_agent/src/store/memory.rs @@ -1,4 +1,5 @@ use async_trait::async_trait; +use log::{debug, trace}; use serde_json::Value; use std::collections::HashMap; use std::sync::Arc; @@ -10,24 +11,46 @@ use crate::store::SubscriptionCallback; use super::{KvMetadata, KvResult, KvStore, KvStoreError}; /// An in-memory KV store that guarantees ordering like NATS JetStream -/// Each key has a sequence number that increments on each write +/// Each key maintains a full history of all writes, where the sequence number +/// is the length of the history (1-indexed) #[derive(Clone)] pub struct InMemoryKvStore { - data: Arc>>, - global_seq: Arc>, + data: Arc>>>, } impl InMemoryKvStore { pub fn new() -> Self { Self { data: Arc::new(RwLock::new(HashMap::new())), - global_seq: Arc::new(RwLock::new(0)), } } - /// Get the sequence number for a key + /// Get the latest sequence number for a key (length of history) pub async fn get_seq(&self, key: &str) -> Option { - self.data.read().await.get(key).map(|(_, seq)| *seq) + self.data.read().await.get(key).map(|vec| vec.len() as u64) + } + + /// Get the value at a specific revision for a key + pub async fn get_revision(&self, key: &str, seq: u64) -> Result { + let data = self.data.read().await; + let entries = data + .get(key) + .ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?; + + // Sequence numbers are 1-indexed, so seq must be >= 1 and <= len() + if seq == 0 || seq > entries.len() as u64 { + return Err(KvStoreError::KeyNotAvailable(key.to_string())); + } + + let (value, timestamp) = entries[seq as usize - 1].clone(); + + Ok(KvResult { + value: Some(value.clone()), + metadata: KvMetadata { + timestamp, + sequence: seq, + }, + }) } } @@ -39,62 +62,69 @@ impl Default for InMemoryKvStore { #[async_trait] impl KvStore for InMemoryKvStore { - async fn get(&self, key: String) -> Result { - let data = self.data.read().await; - let (value, sequence) = data - .get(&key) - .ok_or_else(|| KvStoreError::KeyNotAvailable(key.clone()))?; + async fn get_revision(&self, key: &str, expected_seq: u64) -> Result { + self.get_revision(key, expected_seq).await + } - let timestamp = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("Time went backwards") - .as_millis() as u64; + async fn get(&self, key: &str) -> Result { + let data = self.data.read().await; + let entries = data + .get(key) + .ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?; + + let (value, timestamp) = entries.last().unwrap(); Ok(KvResult { value: Some(value.clone()), metadata: KvMetadata { - timestamp, - sequence: *sequence, + timestamp: *timestamp, + sequence: entries.len() as u64, }, }) } async fn set_strict( &self, - key: String, + key: &str, value: Value, expected_sequence: u64, ) -> Result { - // Check current sequence + // Check current sequence (length of history for this key) let data = self.data.read().await; - let current_sequence = data.get(&key).map(|(_, seq)| *seq).unwrap_or(0); + let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0); drop(data); // Verify expected sequence matches if current_sequence != expected_sequence { - return Err(KvStoreError::SequenceMismatch { - expected: expected_sequence, - current: current_sequence, - }); + trace!("{current_sequence} != {expected_sequence}"); + return Err(KvStoreError::WrongLastRevision); } - // Increment global sequence - let mut seq = self.global_seq.write().await; - *seq += 1; - let new_seq = *seq; - drop(seq); + // Get current timestamp + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_millis() as u64; - // Write the new value + // Append to the history let mut data = self.data.write().await; - data.insert(key, (value.clone(), new_seq)); - drop(data); + data.entry(key.to_string()) + .or_insert_with(Vec::new) + .push((value.clone(), timestamp)); + + let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0); + + debug!( + "Successfully inserted {key}(rev#{new_seq}) : {value}", + value = value.to_string() + ); Ok(new_seq) } async fn subscribe( &self, - key: String, + key: &str, callback: SubscriptionCallback, ) -> Result<(), KvStoreError> { // For now, subscribe just returns the current value @@ -116,13 +146,13 @@ mod tests { // Set a value let value = json!({"status": "healthy"}); let result = store - .set_strict("test_key".to_string(), value.clone(), 0) + .set_strict("test_key", value.clone(), 0) .await .unwrap(); assert_eq!(result, 1); // Get the value - let retrieved = store.get("test_key".to_string()).await.unwrap(); + let retrieved = store.get("test_key").await.unwrap(); assert_eq!(retrieved.value, Some(value)); assert_eq!(retrieved.metadata.sequence, 1); } @@ -131,15 +161,9 @@ mod tests { async fn test_memory_store_sequence_numbers() { let store = InMemoryKvStore::new(); - let seq1 = store - .set_strict("key1".to_string(), json!("value1"), 0) - .await - .unwrap(); + let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap(); - let seq2 = store - .set_strict("key2".to_string(), json!("value2"), 0) - .await - .unwrap(); + let seq2 = store.set_strict("key2", json!("value2"), 0).await.unwrap(); assert!(seq2 > seq1, "Sequence numbers should increment"); } @@ -147,7 +171,7 @@ mod tests { #[tokio::test] async fn test_memory_store_key_not_found() { let store = InMemoryKvStore::new(); - let result = store.get("nonexistent".to_string()).await; + let result = store.get("nonexistent").await; assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_)))); } @@ -156,29 +180,15 @@ mod tests { let store = InMemoryKvStore::new(); // First write with sequence 0 - let result1 = store - .set_strict("key".to_string(), json!("value1"), 0) - .await - .unwrap(); + let result1 = store.set_strict("key", json!("value1"), 0).await.unwrap(); assert_eq!(result1, 1); // Second write with correct sequence - let result2 = store - .set_strict("key".to_string(), json!("value2"), 1) - .await - .unwrap(); + let result2 = store.set_strict("key", json!("value2"), 1).await.unwrap(); assert_eq!(result2, 2); // Third write with wrong sequence should fail - let result3 = store - .set_strict("key".to_string(), json!("value3"), 1) - .await; - assert!(matches!( - result3, - Err(KvStoreError::SequenceMismatch { - expected: 1, - current: 2 - }) - )); + let result3 = store.set_strict("key", json!("value3"), 1).await; + assert!(matches!(result3, Err(KvStoreError::WrongLastRevision))); } } diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs index 26e630c5..08879d1a 100644 --- a/harmony_agent/src/store/mod.rs +++ b/harmony_agent/src/store/mod.rs @@ -1,4 +1,5 @@ use async_trait::async_trait; +use serde::{Deserialize, Serialize}; use serde_json::Value; use thiserror::Error; @@ -11,7 +12,7 @@ pub struct SubscriptionHandle { /// Metadata returned by the KV store for all operations /// Contains timing and ordering information set by the store -#[derive(Debug, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone)] pub struct KvMetadata { /// Timestamp set by the store (milliseconds since UNIX epoch) pub timestamp: u64, @@ -48,70 +49,72 @@ pub enum KvStoreError { deserialization_error: String, value: String, }, - #[error("Strict ordering violation: expected sequence {expected}, but current is {current}")] - SequenceMismatch { expected: u64, current: u64 }, - #[error("unknown data store error")] - Unknown, + #[error("Strict ordering violation, wrong last sequence number")] + WrongLastRevision, + #[error("unknown data store error {0}")] + Unknown(String), } #[async_trait] pub trait KvStore { /// Get a value from the store - /// + /// /// # Returns /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence) /// - `Err(KeyNotAvailable)`: If the key doesn't exist - async fn get(&self, key: String) -> Result; - + async fn get(&self, key: &str) -> Result; + + async fn get_revision(&self, key: &str, expected_seq: u64) -> Result; + /// Strict set operation with compare-and-set semantics - /// + /// /// Sets the value only if the current sequence number matches `expected_sequence`. /// This provides strict ordering guarantees needed for the failover algorithm. - /// + /// /// # Parameters /// - `key`: The key to set /// - `value`: The value to store /// - `expected_sequence`: The sequence number we expect the key to currently have. /// Use 0 for the first write to a new key. - /// + /// /// # Returns /// - `Ok(u64)`: Returns the new sequence number /// - `Err(KvStoreError)`: If another write happened (current != expected) - /// + /// /// # Example Use Case /// For NATS JetStream, this maps to the conditional update operation that ensures /// only one agent can successfully promote to primary. async fn set_strict( &self, - key: String, + key: &str, value: Value, expected_sequence: u64, ) -> Result; - + /// Subscribe to updates for a key - /// + /// /// # Parameters /// - `key`: The key to subscribe to /// - `callback`: Function to call on each update with key, value, and metadata - /// + /// /// # Returns /// - `Ok(())`: Subscription established successfully /// - `Err(KvStoreError)`: Subscription failed - /// + /// /// Note: For JetStream, this should use watch() API. Updates will invoke the callback /// asynchronously in the background. async fn subscribe( &self, - key: String, + key: &str, callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a // callback ) -> Result<(), KvStoreError>; } +mod chaos; mod memory; mod nats; -mod chaos; +pub use chaos::ChaosKvStore; pub use memory::InMemoryKvStore; pub use nats::NatsKvStore; -pub use chaos::ChaosKvStore; diff --git a/harmony_agent/src/store/nats.rs b/harmony_agent/src/store/nats.rs index 1c82c1d8..c89bc54c 100644 --- a/harmony_agent/src/store/nats.rs +++ b/harmony_agent/src/store/nats.rs @@ -1,6 +1,6 @@ use async_nats::jetstream::kv::{Store, UpdateError}; use async_trait::async_trait; -use log::{debug, error}; +use log::{debug, error, trace}; use serde_json::Value; use crate::store::SubscriptionCallback; @@ -46,8 +46,48 @@ impl NatsKvStore { #[async_trait] impl KvStore for NatsKvStore { - async fn get(&self, key: String) -> Result { - let entry = self.store.entry(&key).await.map_err(|e| { + async fn get_revision(&self, key: &str, expected_seq: u64) -> Result { + let entry = self + .store + .entry_for_revision(key, expected_seq) + .await + .map_err(|e| { + error!("NATS get failed for key '{}': {}", key, e); + KvStoreError::Disconnect(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + )) + })?; + + if entry.is_none() { + return Err(KvStoreError::KeyNotAvailable(key.to_string())); + } + + let entry = entry.unwrap(); + let value: Value = serde_json::from_slice(&entry.value).map_err(|e| { + KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: String::from_utf8_lossy(&entry.value).to_string(), + } + })?; + + // Extract metadata from NATS entry + // Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime + let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64; + + let metadata = KvMetadata { + timestamp, + sequence: entry.revision, + }; + + Ok(KvResult { + value: Some(value), + metadata, + }) + } + + async fn get(&self, key: &str) -> Result { + let entry = self.store.entry(key).await.map_err(|e| { error!("NATS get failed for key '{}': {}", key, e); KvStoreError::Disconnect(std::io::Error::new( std::io::ErrorKind::Other, @@ -56,7 +96,7 @@ impl KvStore for NatsKvStore { })?; if entry.is_none() { - return Err(KvStoreError::KeyNotAvailable(key)); + return Err(KvStoreError::KeyNotAvailable(key.to_string())); } let entry = entry.unwrap(); @@ -84,10 +124,14 @@ impl KvStore for NatsKvStore { async fn set_strict( &self, - key: String, + key: &str, value: Value, expected_sequence: u64, ) -> Result { + trace!( + "Nats set strict {key} (#{expected_sequence}) : {}", + value.to_string() + ); let bytes = serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed { deserialization_error: e.to_string(), @@ -112,7 +156,7 @@ impl KvStore for NatsKvStore { async fn subscribe( &self, - key: String, + key: &str, callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a ) -> Result<(), KvStoreError> { todo!() @@ -125,7 +169,7 @@ impl From for KvStoreError { async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey, async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout, async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => { - KvStoreError::KeyNotAvailable("key".to_string()) + KvStoreError::WrongLastRevision } async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect( std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"), diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs index 074b29e2..05f9934c 100644 --- a/harmony_agent/src/workflow/mod.rs +++ b/harmony_agent/src/workflow/mod.rs @@ -8,28 +8,12 @@ pub mod replica; #[async_trait] pub trait HeartbeatWorkflow: Send + Sync { /// Handle a successful heartbeat - fn handle_heartbeat_success(&mut self); + async fn handle_heartbeat_success(&mut self); /// Handle a failed heartbeat - fn handle_heartbeat_failure(&mut self); + async fn handle_heartbeat_failure(&mut self); - /// Called after heartbeat is successfully stored with metadata - /// This provides workflows access to timestamp/sequence for staleness calculations - async fn on_heartbeat_stored(&mut self, _heartbeat: &crate::agent_loop::AgentHeartbeat) { - // Default implementation does nothing - } - - /// Called during agent startup to reconcile state from cluster state - /// Receives the current cluster state if available - async fn on_startup(&mut self, _cluster_state: Option<&crate::agent_loop::ClusterStateData>) { - // Default implementation does nothing - } - - /// Called when a peer agent heartbeat is observed (via subscription) - /// This is primarily used by replicas to detect primary staleness - async fn on_peer_heartbeat(&mut self, _peer_id: &Id, _heartbeat: &crate::agent_loop::AgentHeartbeat) { - // Default implementation does nothing - } + async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>); /// Get the current state name for logging (also used for heartbeat status) fn state_name(&self) -> &'static str; diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index 7eccc998..73c12828 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -76,7 +76,8 @@ impl HeartbeatWorkflow for PrimaryWorkflow { debug!("No cluster state on startup, starting from Initializing"); } } - fn handle_heartbeat_success(&mut self) { + async fn handle_heartbeat_success(&mut self) { + trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures); self.consecutive_successes += 1; self.consecutive_failures = 0; @@ -119,7 +120,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow { } } - fn handle_heartbeat_failure(&mut self) { + async fn handle_heartbeat_failure(&mut self) { self.consecutive_failures += 1; self.consecutive_successes = 0; diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs index 9800e3c7..b790a6b9 100644 --- a/harmony_agent/src/workflow/replica.rs +++ b/harmony_agent/src/workflow/replica.rs @@ -1,6 +1,6 @@ use async_trait::async_trait; use harmony_types::id::Id; -use log::{debug, info, trace}; +use log::{debug, info, trace, warn}; use std::time::Duration; use tokio::sync::RwLock; @@ -115,8 +115,7 @@ impl ReplicaWorkflow { /// Check if the primary heartbeat is stale compared to our own /// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout - async fn check_primary_staleness(&mut self) { - let mut new_state = self.state.clone(); + async fn is_primary_stale(&mut self) -> bool { if let Some(my_hb) = &self.last_my_heartbeat { if let Some(my_metadata) = &my_hb.metadata { if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() { @@ -141,65 +140,25 @@ impl ReplicaWorkflow { "Primary heartbeat stale ({}ms > {}ms), attempting promotion", time_diff_ms, failover_timeout_ms ); - new_state = ReplicaState::Promoting; + + return true; } } } } - - if self.state != new_state { - self.transition_to(new_state) - } } + false } } #[async_trait] impl HeartbeatWorkflow for ReplicaWorkflow { - async fn on_peer_heartbeat(&mut self, peer_id: &Id, heartbeat: &AgentHeartbeat) { - // Only track the primary's heartbeat - if *peer_id == self.primary_state.agent_id { - match &self.last_primary_heartbeat { - Some(existing) => { - // Update the existing heartbeat data - *existing.write().await = heartbeat.clone(); - } - None => { - // First time seeing primary heartbeat - self.last_primary_heartbeat = Some(RwLock::new(heartbeat.clone())); - } - } - trace!( - "Updated primary heartbeat: seq={}, timestamp={}", - heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0), - heartbeat - .metadata - .as_ref() - .map(|m| m.timestamp) - .unwrap_or(0), - ); - } - } - async fn on_heartbeat_stored(&mut self, heartbeat: &AgentHeartbeat) { - // Track our own heartbeat for staleness comparison - self.last_my_heartbeat = Some(heartbeat.clone()); - - // Perform staleness detection if we have both heartbeats - self.check_primary_staleness().await; - } async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) { - if let Some(state) = cluster_state { - info!( - "Startup reconciliation: current primary is {:?}, desired primary is {:?}", - state.current_primary, state.desired_primary - ); - // Update cluster_state with the observed values - self.cluster_state.current_primary = state.current_primary.clone(); - } else { - debug!("No cluster state on startup, starting from Initializing"); - } + todo!("not sure if the replica should do anything on startup") } - fn handle_heartbeat_success(&mut self) { + + async fn handle_heartbeat_success(&mut self) { + trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures); self.consecutive_successes += 1; self.consecutive_failures = 0; @@ -212,10 +171,23 @@ impl HeartbeatWorkflow for ReplicaWorkflow { ReplicaState::Watching => { // TODO: Check primary staleness from NATS trace!("Replica watching primary"); + if self.is_primary_stale().await { + warn!("Found stale primary, launching promotion"); + } + todo!("perform the replica watch actions : + - if a primary exists in the cluster (cluster_state.current_primary == expected_primary) + - check the last primary heartbeat kv timestamp + - compare it with our latest kv heartbeat + - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself) + - launching promotion will change the status of the replica + "); } ReplicaState::Promoting => { // TODO: Complete promotion attempt trace!("Replica promotion in progress"); + todo!( + "When promoting, a heartbeat failure does not affect promotion unless failure_threshold is reached, a heartbeat success does nothing either" + ); } ReplicaState::PromotionFailed => { if self.consecutive_successes >= self.success_threshold { @@ -239,10 +211,17 @@ impl HeartbeatWorkflow for ReplicaWorkflow { } } - fn handle_heartbeat_failure(&mut self) { + async fn handle_heartbeat_failure(&mut self) { self.consecutive_failures += 1; self.consecutive_successes = 0; + // TODO revisit this. I think we should handle the agent healthiness (checking + // consecutive_failures against failure_threshold) separately from handling the cluster + // state. + // + // That said, there might be funny stuff we have to do when the agent reaches the failure + // threshold, especially in promoting and demoting statuses. + match self.state { ReplicaState::Watching | ReplicaState::Initializing => { if self.consecutive_failures >= self.failure_threshold { -- 2.39.5 From a20919bbda76e664a0b5a7245b91a066e10b5f83 Mon Sep 17 00:00:00 2001 From: wjro Date: Tue, 3 Feb 2026 11:43:22 -0500 Subject: [PATCH 10/19] wip: write cluster state to jetstream kv --- harmony_agent/src/agent_loop.rs | 52 +++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs index 089b013d..7147edd7 100644 --- a/harmony_agent/src/agent_loop.rs +++ b/harmony_agent/src/agent_loop.rs @@ -1,3 +1,4 @@ +use std::time::{SystemTime, UNIX_EPOCH}; use std::{str::FromStr, sync::Arc, time::Duration}; use harmony_types::id::Id; @@ -253,6 +254,57 @@ impl HarmonyAgent { Ok(()) } + async fn store_cluster_state( + &self, + cluster_state_data: Option, + ) -> Result { + let key = format!("cluster.{}", self.config.cluster_id); + match cluster_state_data { + Some(state) => { + let value = serde_json::to_value(&state).map_err(|e| { + KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: format!("{:?}", state), + } + })?; + + let expected_sequence = { + let last = self.last_heartbeat.read().await; + last.as_ref() + .and_then(|hb| hb.metadata.as_ref()) + .map(|m| m.sequence) + .unwrap_or(0) + }; + + self.cluster_kv + .set_strict(&key, value, expected_sequence) + .await?; + + Ok(state) + } + None => { + let cluster_data = ClusterStateData { + cluster_id: self.config.cluster_id.clone(), + current_primary: None, + desired_primary: self.config.desired_primary_id.clone(), + timestamp: SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_millis() as u64, + }; + + let value = serde_json::to_value(&cluster_data).map_err(|e| { + KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: format!("{:?}", cluster_data), + } + })?; + self.cluster_kv.set_strict(&key, value, 0).await?; + Ok(cluster_data) + } + } + } + /// Sends agent heartbeat to the KV store /// /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence) -- 2.39.5 From 7065e90475934cd8d70c7f3d2818b36f14ed4d0b Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 3 Feb 2026 11:45:03 -0500 Subject: [PATCH 11/19] feat: use the role of the agent to define its name --- harmony_agent/src/agent_loop.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs index 089b013d..f6e1b4b6 100644 --- a/harmony_agent/src/agent_loop.rs +++ b/harmony_agent/src/agent_loop.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::{str::FromStr, sync::Arc, time::Duration}; use harmony_types::id::Id; @@ -18,6 +19,15 @@ pub enum AgentRole { Replica, } +impl fmt::Display for AgentRole { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + AgentRole::Primary => write!(f, "primary"), + AgentRole::Replica => write!(f, "replica"), + } + } +} + pub async fn launch_agent( role: AgentRole, health_kv: Arc, @@ -28,7 +38,8 @@ pub async fn launch_agent( where S: KvStore + Send + Sync + 'static, { - let my_agent_id = Id::from_str("agent_1").unwrap(); + let my_agent_name = format!("agent-{}", role); + let my_agent_id = Id::from_str(&my_agent_name).unwrap(); let config = AgentConfig { role, @@ -219,6 +230,10 @@ impl HarmonyAgent { cluster_key ); + /* + trace!("{:#?}", self.cluster_kv.get(&cluster_key).await); + */ + let cluster_state_option = match self.cluster_kv.get(&cluster_key).await { Ok(result) => { if let Some(value) = result.value { -- 2.39.5 From 5b04cc96d7b8edd537df5d60f6ab2b2161952a9a Mon Sep 17 00:00:00 2001 From: Sylvain Tremblay Date: Tue, 3 Feb 2026 14:50:03 -0500 Subject: [PATCH 12/19] wip: we want to initialize to the right seq number after a restart --- Cargo.lock | 311 +++++++++++++++++++++++--- harmony_agent/README.md | 2 + harmony_agent/src/agent_loop.rs | 27 ++- harmony_agent/src/workflow/mod.rs | 11 +- harmony_agent/src/workflow/primary.rs | 18 +- harmony_agent/src/workflow/replica.rs | 21 +- 6 files changed, 337 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aabfb9d2..14295673 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -450,6 +450,43 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-nats" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-util", + "memchr", + "nkeys", + "nuid", + "once_cell", + "pin-project", + "portable-atomic", + "rand 0.8.5", + "regex", + "ring", + "rustls-native-certs 0.7.3", + "rustls-pemfile 2.2.0", + "rustls-webpki 0.102.8", + "serde", + "serde_json", + "serde_nanos", + "serde_repr", + "thiserror 1.0.69", + "time", + "tokio", + "tokio-rustls 0.26.2", + "tokio-stream", + "tokio-util", + "tokio-websockets", + "tracing", + "tryhard", + "url", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -774,6 +811,9 @@ name = "bytes" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +dependencies = [ + "serde", +] [[package]] name = "bytestring" @@ -875,6 +915,22 @@ dependencies = [ "shlex", ] +[[package]] +name = "cert_manager" +version = "0.1.0" +dependencies = [ + "assert_cmd", + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "log", + "tokio", + "url", +] + [[package]] name = "cfg-if" version = "1.0.3" @@ -1550,6 +1606,7 @@ dependencies = [ "rand_core 0.6.4", "serde", "sha2", + "signature", "subtle", "zeroize", ] @@ -1754,6 +1811,24 @@ dependencies = [ "url", ] +[[package]] +name = "example-ha-cluster" +version = "0.1.0" +dependencies = [ + "brocade", + "cidr", + "env_logger", + "harmony", + "harmony_macros", + "harmony_secret", + "harmony_tui", + "harmony_types", + "log", + "serde", + "tokio", + "url", +] + [[package]] name = "example-kube-rs" version = "0.1.0" @@ -1942,9 +2017,28 @@ dependencies = [ "cidr", "env_logger", "harmony", + "harmony_cli", "harmony_macros", "harmony_secret", - "harmony_tui", + "harmony_types", + "log", + "serde", + "tokio", + "url", +] + +[[package]] +name = "example-opnsense-node-exporter" +version = "0.1.0" +dependencies = [ + "async-trait", + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_secret", + "harmony_secret_derive", "harmony_types", "log", "serde", @@ -1982,25 +2076,6 @@ dependencies = [ "url", ] -[[package]] -name = "example-opnsense-node-exporter" -version = "0.1.0" -dependencies = [ - "async-trait", - "cidr", - "env_logger", - "harmony", - "harmony_cli", - "harmony_macros", - "harmony_secret", - "harmony_secret_derive", - "harmony_types", - "log", - "serde", - "tokio", - "url", -] - [[package]] name = "example-pxe" version = "0.1.0" @@ -2406,21 +2481,21 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "js-sys", "libc", "r-efi", - "wasi 0.14.3+wasi-0.2.4", + "wasip2", "wasm-bindgen", ] @@ -2522,6 +2597,7 @@ dependencies = [ "env_logger", "fqdn", "futures-util", + "harmony_execution", "harmony_inventory_agent", "harmony_macros", "harmony_secret", @@ -2568,6 +2644,42 @@ dependencies = [ "walkdir", ] +[[package]] +name = "harmony_agent" +version = "0.1.0" +dependencies = [ + "async-nats", + "async-trait", + "cidr", + "env_logger", + "getrandom 0.3.4", + "harmony", + "harmony_macros", + "harmony_types", + "log", + "serde", + "serde_json", + "thiserror 2.0.16", + "tokio", +] + +[[package]] +name = "harmony_agent_deploy" +version = "0.1.0" +dependencies = [ + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "log", + "serde", + "serde_json", + "tokio", + "url", +] + [[package]] name = "harmony_cli" version = "0.1.0" @@ -2608,6 +2720,16 @@ dependencies = [ "tokio", ] +[[package]] +name = "harmony_execution" +version = "0.1.0" +dependencies = [ + "directories", + "lazy_static", + "log", + "thiserror 2.0.16", +] + [[package]] name = "harmony_inventory_agent" version = "0.1.0" @@ -3438,7 +3560,7 @@ version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "libc", ] @@ -3464,6 +3586,26 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "json-prompt" +version = "0.1.0" +dependencies = [ + "brocade", + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_secret", + "harmony_secret_derive", + "harmony_types", + "log", + "schemars 0.8.22", + "serde", + "tokio", + "url", +] + [[package]] name = "jsonpath-rust" version = "0.7.5" @@ -3878,7 +4020,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "windows-sys 0.48.0", ] @@ -3890,7 +4032,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" dependencies = [ "libc", "log", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "windows-sys 0.59.0", ] @@ -3928,6 +4070,21 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "nkeys" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf" +dependencies = [ + "data-encoding", + "ed25519", + "ed25519-dalek", + "getrandom 0.2.16", + "log", + "rand 0.8.5", + "signatory", +] + [[package]] name = "non-blank-string-rs" version = "1.0.4" @@ -3946,6 +4103,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "nuid" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83" +dependencies = [ + "rand 0.8.5", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -4566,7 +4732,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", - "getrandom 0.3.3", + "getrandom 0.3.4", "lru-slab", "rand 0.9.2", "ring", @@ -4671,7 +4837,7 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", ] [[package]] @@ -5207,6 +5373,16 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustls-webpki" version = "0.103.4" @@ -5470,6 +5646,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_nanos" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985" +dependencies = [ + "serde", +] + [[package]] name = "serde_path_to_error" version = "0.1.17" @@ -5637,6 +5822,18 @@ dependencies = [ "libc", ] +[[package]] +name = "signatory" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31" +dependencies = [ + "pkcs8", + "rand_core 0.6.4", + "signature", + "zeroize", +] + [[package]] name = "signature" version = "2.2.0" @@ -6200,7 +6397,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "rustix 1.0.8", "windows-sys 0.60.2", @@ -6413,6 +6610,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-websockets" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "futures-sink", + "http 1.3.1", + "httparse", + "rand 0.8.5", + "ring", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.2", + "tokio-util", + "webpki-roots 0.26.11", +] + [[package]] name = "toml" version = "0.8.23" @@ -6564,6 +6782,16 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tryhard" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tui-logger" version = "0.14.5" @@ -6740,7 +6968,7 @@ version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "js-sys", "rand 0.9.2", "uuid-macro-internal", @@ -6811,10 +7039,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] -name = "wasi" -version = "0.14.3+wasi-0.2.4" +name = "wasip2" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] @@ -6936,6 +7164,15 @@ version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.2", +] + [[package]] name = "webpki-roots" version = "1.0.2" @@ -7313,9 +7550,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.45.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "writeable" diff --git a/harmony_agent/README.md b/harmony_agent/README.md index c22d1b51..189e8145 100644 --- a/harmony_agent/README.md +++ b/harmony_agent/README.md @@ -244,3 +244,5 @@ Please add these to your master list before starting implementation: 31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`. 32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid. +* **Think about vacuum / stop-the-world operations** + diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs index 68cb5844..f63f35a0 100644 --- a/harmony_agent/src/agent_loop.rs +++ b/harmony_agent/src/agent_loop.rs @@ -6,7 +6,7 @@ use harmony_types::id::Id; use log::{debug, info, trace}; use serde::{Deserialize, Serialize}; use tokio::sync::RwLock; -use tokio::time::Instant; +use tokio::time::{Instant, sleep}; use crate::store::{KvMetadata, KvStore, KvStoreError}; use crate::workflow::HeartbeatWorkflow; @@ -39,6 +39,12 @@ pub async fn launch_agent( where S: KvStore + Send + Sync + 'static, { + match role { + AgentRole::Primary => {} + AgentRole::Replica => { + sleep(Duration::from_millis(100)).await; + } + } let my_agent_name = format!("agent-{}", role); let my_agent_id = Id::from_str(&my_agent_name).unwrap(); @@ -238,21 +244,27 @@ impl HarmonyAgent { let cluster_state_option = match self.cluster_kv.get(&cluster_key).await { Ok(result) => { if let Some(value) = result.value { - match serde_json::from_value::(value) { + match serde_json::from_value::(value.clone()) { Ok(data) => Some(data), Err(e) => { log::warn!("Failed to deserialize cluster state: {}", e); - None + return Err(KvStoreError::DeserializationFailed { + deserialization_error: format!( + "Cluster key exist but is empty {cluster_key} : {e}" + ), + value: value.to_string(), + }); } } } else { - debug!("No cluster state found, this is a fresh cluster"); - None + return Err(KvStoreError::Unknown(format!( + "Cluster key exist but is empty {cluster_key}" + ))); } } Err(KvStoreError::KeyNotAvailable(_)) => { debug!("Cluster state key not found, this is a fresh cluster"); - None + Some(self.store_cluster_state(None).await?) } Err(e) => { log::warn!("Failed to fetch cluster state during startup: {}", e); @@ -261,7 +273,8 @@ impl HarmonyAgent { }; let state_ref = cluster_state_option.as_ref(); - self.workflow.on_startup(state_ref).await; + + self.workflow.on_startup(state_ref, self.health_kv.as_ref(), &self.config).await; // Cache the cluster state locally *self.cluster_state.write().await = cluster_state_option; diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs index 05f9934c..561ce434 100644 --- a/harmony_agent/src/workflow/mod.rs +++ b/harmony_agent/src/workflow/mod.rs @@ -1,6 +1,10 @@ +use std::sync::Arc; + use async_trait::async_trait; use harmony_types::id::Id; +use crate::{agent_loop::AgentConfig, store::KvStore}; + pub mod primary; pub mod replica; @@ -13,7 +17,12 @@ pub trait HeartbeatWorkflow: Send + Sync { /// Handle a failed heartbeat async fn handle_heartbeat_failure(&mut self); - async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>); + async fn on_startup( + &mut self, + cluster_state: Option<&crate::agent_loop::ClusterStateData>, + health_kv: &dyn KvStore, + agent_config: &AgentConfig, + ); /// Get the current state name for logging (also used for heartbeat status) fn state_name(&self) -> &'static str; diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index 73c12828..ffe3ffae 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -1,7 +1,7 @@ use async_trait::async_trait; use log::{debug, info, trace, warn}; -use crate::{agent_loop::DeploymentConfig, workflow::HeartbeatWorkflow}; +use crate::{agent_loop::{AgentConfig, DeploymentConfig}, store::KvStore, workflow::HeartbeatWorkflow}; #[derive(Debug, Clone, PartialEq)] pub enum PrimaryState { @@ -63,12 +63,21 @@ impl PrimaryWorkflow { #[async_trait] impl HeartbeatWorkflow for PrimaryWorkflow { - async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) { + async fn on_startup( + &mut self, + cluster_state: Option<&crate::agent_loop::ClusterStateData>, + health_kv: &dyn KvStore, + agent_config: &AgentConfig, + ) { if let Some(state) = cluster_state { info!( "Startup reconciliation: current primary is {:?}, desired primary is {:?}", state.current_primary, state.desired_primary ); + + let key = format!("heartbeat.{}", agent_config.agent_id.clone()); + // let hb = health_kv.get(&key); + // No automatic fast-tracking - agent must earn healthy status // through successful heartbeats. This prevents duplicate agents // or crashloop agents from incorrectly claiming primary. @@ -77,7 +86,10 @@ impl HeartbeatWorkflow for PrimaryWorkflow { } } async fn handle_heartbeat_success(&mut self) { - trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures); + trace!( + "Handling heartbeat success, current counters success {} failures {}", + self.consecutive_successes, self.consecutive_failures + ); self.consecutive_successes += 1; self.consecutive_failures = 0; diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs index b790a6b9..eda4bbdc 100644 --- a/harmony_agent/src/workflow/replica.rs +++ b/harmony_agent/src/workflow/replica.rs @@ -4,7 +4,8 @@ use log::{debug, info, trace, warn}; use std::time::Duration; use tokio::sync::RwLock; -use crate::agent_loop::AgentHeartbeat; +use crate::agent_loop::{AgentConfig, AgentHeartbeat}; +use crate::store::KvStore; use crate::workflow::HeartbeatWorkflow; #[derive(Debug, Clone)] @@ -153,12 +154,20 @@ impl ReplicaWorkflow { #[async_trait] impl HeartbeatWorkflow for ReplicaWorkflow { - async fn on_startup(&mut self, cluster_state: Option<&crate::agent_loop::ClusterStateData>) { - todo!("not sure if the replica should do anything on startup") + async fn on_startup( + &mut self, + cluster_state: Option<&crate::agent_loop::ClusterStateData>, + health_kv: &dyn KvStore, + agent_config: &AgentConfig, + ) { + // todo!("not sure if the replica should do anything on startup") } async fn handle_heartbeat_success(&mut self) { - trace!("Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures); + trace!( + "Handling heartbeat success, current counters success {} failures {}", + self.consecutive_successes, self.consecutive_failures + ); self.consecutive_successes += 1; self.consecutive_failures = 0; @@ -174,13 +183,15 @@ impl HeartbeatWorkflow for ReplicaWorkflow { if self.is_primary_stale().await { warn!("Found stale primary, launching promotion"); } - todo!("perform the replica watch actions : + /* + todo!("perform the replica watch actions : - if a primary exists in the cluster (cluster_state.current_primary == expected_primary) - check the last primary heartbeat kv timestamp - compare it with our latest kv heartbeat - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself) - launching promotion will change the status of the replica "); + */ } ReplicaState::Promoting => { // TODO: Complete promotion attempt -- 2.39.5 From a88d67627aa9e001a5b0348339bf77b0a91218ed Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Tue, 3 Feb 2026 20:46:18 -0500 Subject: [PATCH 13/19] chore: Add a note and delete old code --- harmony_agent/src/agent_loop.rs | 3 + harmony_agent/src/old/typestate.rs | 230 ---------- harmony_agent/src/old/typestate_gemini.rs | 523 ---------------------- 3 files changed, 3 insertions(+), 753 deletions(-) delete mode 100644 harmony_agent/src/old/typestate.rs delete mode 100644 harmony_agent/src/old/typestate_gemini.rs diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent_loop.rs index f63f35a0..7111db92 100644 --- a/harmony_agent/src/agent_loop.rs +++ b/harmony_agent/src/agent_loop.rs @@ -39,12 +39,15 @@ pub async fn launch_agent( where S: KvStore + Send + Sync + 'static, { + // Cheap ass fix when we boot two agents at the same time and the store does not exist, delay + // one so they don't crash because of the race match role { AgentRole::Primary => {} AgentRole::Replica => { sleep(Duration::from_millis(100)).await; } } + let my_agent_name = format!("agent-{}", role); let my_agent_id = Id::from_str(&my_agent_name).unwrap(); diff --git a/harmony_agent/src/old/typestate.rs b/harmony_agent/src/old/typestate.rs deleted file mode 100644 index 78445d55..00000000 --- a/harmony_agent/src/old/typestate.rs +++ /dev/null @@ -1,230 +0,0 @@ -use std::{marker::PhantomData, time::Duration}; - -/// Typestate pattern implementation for Primary and Replica state machines -/// Based on Will Crichton's typestate pattern - -pub mod primary { - use super::Agent; - - /// Primary state: Agent is initializing - pub struct Initializing {} - - /// Primary state: Heartbeat failures exceeded threshold - pub struct Failed {} - - /// Primary state: Database fenced/stopped - pub struct Fenced {} - - /// Primary state: Heartbeat succeeding - pub struct Healthy {} - - /// Primary state: Recovered from fence, waiting for demotion handshake - pub struct Yielding {} - - impl Agent { - /// Transition from initializing to healthy - pub fn healthy(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from failed to fenced - pub fn fence(self) -> Agent { - Agent { - consecutive_success: self.consecutive_success, - consecutive_failure: self.consecutive_failure, - failure_threshold: self.failure_threshold, - success_threshold: self.success_threshold, - heartbeat_timeout: self.heartbeat_timeout, - _state: PhantomData - } - } - - /// Transition from failed to healthy (recovery) - pub fn recover(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from fenced to yielding (waiting for demotion) - pub fn await_demotion(self) -> Agent { - self - } - - /// Transition from fenced to healthy (recovery after demotion completes) - pub fn recover(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from healthy to yielding (when original primary recovers) - pub fn yield_leadership(self) -> Agent { - self - } - - /// Transition from healthy to failed (heartbeat failure) - pub fn fail(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from yielding back to healthy (after demotion completes) - pub fn recover(self) -> Agent { - self - } - - /// Transition from yielding back to healthy (if demotion cancelled) - pub fn recover_and_promote(self) -> Agent { - self - } - } -} - -pub mod replica { - use super::Agent; - - /// Replica state: Agent is initializing - pub struct Initializing {} - - /// Replica state: Watching primary heartbeats - pub struct Watching {} - - /// Replica state: Failover timeout exceeded, attempting promotion - pub struct Promoting {} - - /// Replica state: Promotion attempt rejected by NATS - pub struct PromotionFailed {} - - /// Replica state: Successfully promoted to leader - pub struct Leader {} - - /// Replica state: Original primary recovered, yielding leadership - pub struct Demoting {} - - impl Agent { - /// Transition from initializing to watching - pub fn start_watching(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from watching to promoting (failover timeout reached) - pub fn promote(self) -> Agent { - self - } - - /// Transition from watching back to promoting (if demotion cancelled) - pub fn promote_again(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from promoting to leader (promotion successful) - pub fn become_leader(self) -> Agent { - self - } - - /// Transition from promoting to promotion_failed (NATS rejected) - pub fn promotion_rejected(self) -> Agent { - self - } - - /// Transition from promoting back to watching (reverted) - pub fn revert_to_watching(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from promotion_failed back to watching - pub fn continue_watching(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from leader to demoting (original primary recovered) - pub fn yield_leadership(self) -> Agent { - self - } - - /// Transition from leader to watching (if demotion cancelled) - pub fn revert_to_watching(self) -> Agent { - self - } - } - - impl Agent { - /// Transition from demoting back to watching (if demotion cancelled) - pub fn revert_to_watching(self) -> Agent { - self - } - - /// Transition from demoting back to leader (if demotion cancelled) - pub fn promote_again(self) -> Agent { - self - } - } -} - -/// Main Agent struct using typestate pattern -/// State is tracked through the generic type parameter -pub struct Agent { - pub consecutive_success: usize, - pub consecutive_failure: usize, - pub failure_threshold: usize, - pub success_threshold: usize, - pub heartbeat_timeout: Duration, - _state: PhantomData -} - -impl Agent { - /// Create a new agent in the given state with default thresholds - pub fn new(state: State) -> Self { - Agent { - consecutive_success: 0, - consecutive_failure: 0, - failure_threshold: 2, - success_threshold: 3, - heartbeat_timeout: Duration::from_secs(1), - _state: PhantomData - } - } - - /// Create a new agent with custom thresholds - pub fn with_thresholds(state: State, success_threshold: usize, failure_threshold: usize, heartbeat_timeout: Duration) -> Self { - Agent { - consecutive_success: 0, - consecutive_failure: 0, - failure_threshold, - success_threshold, - heartbeat_timeout, - _state: PhantomData - } - } -} - -impl Clone for Agent { - fn clone(&self) -> Self { - Agent { - consecutive_success: self.consecutive_success, - consecutive_failure: self.consecutive_failure, - failure_threshold: self.failure_threshold, - success_threshold: self.success_threshold, - heartbeat_timeout: self.heartbeat_timeout, - _state: PhantomData - } - } -} - -impl Default for Agent { - fn default() -> Self { - Self::new(Initializing {}) - } -} diff --git a/harmony_agent/src/old/typestate_gemini.rs b/harmony_agent/src/old/typestate_gemini.rs deleted file mode 100644 index e4285bdd..00000000 --- a/harmony_agent/src/old/typestate_gemini.rs +++ /dev/null @@ -1,523 +0,0 @@ -use std::marker::PhantomData; -use std::time::Duration; -use tokio::sync::mpsc; -use tokio::time::Instant; - -// ============================================================================= -// FSM Library (Type State Pattern) -// ============================================================================= - -pub mod fsm { - use super::*; - - /// Generic FSM container - pub struct FSM { - pub user_data: Option, - pub state: PhantomData, - pub _phantom_event: PhantomData, - } - - impl FSM { - pub fn new(user_data: Option) -> Self { - Self { - user_data, - state: PhantomData, - _phantom_event: PhantomData, - } - } - } - - /// Trait to represent FSM behavior via dynamic dispatch - pub trait HandleEvent { - fn handle_event(self: Box, event: E) -> Box>; - } - - /// Implemented per-state by the macro to route event logic - pub trait ErasedState: Send { - fn handle_event(self: Box, event: E) -> Box>; - } - - impl ErasedState for FSM - where - FSM: HandleEvent + Send + 'static, - { - fn handle_event(self: Box, event: E) -> Box> { - HandleEvent::handle_event(self, event) - } - } - - /// Allows FSM to move from state `S` to `T`, retaining user data - pub trait StateMachine: Send + 'static { - fn into_boxed(self) -> Box>; - } - - impl StateMachine for FSM - where - S: Send + 'static, - E: Send + 'static, - U: Send + 'static, - { - fn into_boxed(self) -> Box> { - Box::new(FSM { - user_data: self.user_data, - state: PhantomData, - _phantom_event: PhantomData, - }) - } - } - - /// Runs the FSM in an asynchronous loop - pub async fn run_machine( - mut state: Box>, - mut rx: tokio::sync::mpsc::Receiver, - ) where - E: Send + 'static, - U: Send + 'static, - { - while let Some(event) = rx.recv().await { - state = ErasedState::handle_event(state, event); - } - } -} - -/// Macro for Declaring Transitions -#[macro_export] -macro_rules! define_fsm { - ( - $struct:ident<$event:ident, $user:ident>, { - $( - $state:ty => { - $( - $pattern:pat => $next:ty => $action:expr - ),* $(,)? - } - ),* $(,)? - } - ) => { - $( - impl $crate::fsm::HandleEvent<$event, $user> for $struct<$state, $event, $user> { - fn handle_event(mut self: Box, event: $event) -> Box> { - match event { - $( - $pattern => { - // log::debug!("FSM Transition: {:?} --[{:?}]--> {:?}", stringify!($state), e, stringify!($next)); - log::debug!("FSM Transition: {:?} --[:?]--> {:?}", stringify!($state), stringify!($next)); - $action(&mut self); - self.into_boxed::<$next>() - } - )* - // Default handler for unmapped events in this state: stay in current state - _ => { - // log::trace!("FSM Ignore: {:?} --[{:?}]--> (no transition)", stringify!($state), event); - self - } - } - } - } - )* - }; -} - -// ============================================================================= -// Harmony Agent Domain Logic -// ============================================================================= - -use fsm::{ErasedState, StateMachine, FSM}; - -// --- States --- -#[derive(Debug)] -struct RolePrimary; // Active Leader -#[derive(Debug)] -struct RoleReplica; // Passive Watchdog -#[derive(Debug)] -struct RoleFencing; // Transition: Shutting down -#[derive(Debug)] -struct RolePromoting; // Transition: Taking over -#[derive(Debug)] -struct RoleDemoting; // Transition: Yielding - -// --- Events --- -#[derive(Debug, Clone)] -enum AgentEvent { - /// Periodic timer tick (drives checks) - Tick, - /// Result of a local health check (Primary only) - HealthCheckResult { success: bool }, - /// Update from NATS about the cluster state - ClusterStateUpdate { primary_id: String, timestamp: Instant }, - /// Command to force a state change (e.g. admin intervention) - ForceDemote, -} - -// --- Side Effect Commands (Outbound) --- -#[derive(Debug)] -enum WorkerCommand { - PerformHealthCheck, - PerformFencing, - PerformPromotion, - PerformDemotion, -} - -// --- Context --- -struct AgentContext { - // Config - agent_id: String, - success_threshold: usize, - failure_threshold: usize, - heartbeat_interval: Duration, - failover_timeout: Duration, - - // Runtime State - consecutive_failures: usize, - last_primary_heartbeat: Option, - - // Communication - worker_tx: mpsc::Sender, -} - -impl AgentContext { - fn send_command(&self, cmd: WorkerCommand) { - let tx = self.worker_tx.clone(); - tokio::spawn(async move { - if let Err(e) = tx.send(cmd).await { - log::error!("Failed to send worker command: {}", e); - } - }); - } -} - -// --- FSM Definition --- - -define_fsm!(FSM, { - // ------------------------------------------------------------------------- - // PRIMARY STATE (Self-Preservation) - // ------------------------------------------------------------------------- - RolePrimary => { - // 1. On Tick: Trigger a health check (Async Side Effect) - AgentEvent::Tick => RolePrimary => |s: &mut FSM| { - if let Some(ctx) = &mut s.user_data { - ctx.send_command(WorkerCommand::PerformHealthCheck); - } - }, - - // 2. Health Check Success: Reset counters - AgentEvent::HealthCheckResult { success: true } => RolePrimary => |s: &mut FSM| { - if let Some(ctx) = &mut s.user_data { - ctx.consecutive_failures = 0; - log::info!("✅ Heartbeat Success (Primary)"); - } - }, - - // 3. Health Check Failure: Increment counters & Check Threshold - AgentEvent::HealthCheckResult { success: false } => RolePrimary => |s: &mut FSM| { - // NOTE: We determine next state dynamically by checking threshold. - // Since the macro requires a static next type, we handle the "Stay" case here. - // If we need to transition, we assume the event loop sends a specific event, - // OR we use a separate state for "Checking". - // However, to keep it simple within this pattern, we will check threshold here. - // If threshold reached, we ideally want to return RoleFencing. - // But the macro forces `=> RolePrimary`. - // - // WORKAROUND: We use a specific event flow. - // Ideally, the `HealthCheckResult` logic would be: - // if fail >= threshold { transition Fencing } else { stay } - // - // To strictly follow the macro structure where destination is fixed per pattern: - // We can't branch to different types in one pattern. - // So we will stay in RolePrimary here, but if threshold is hit, we trigger Fencing immediately - // by sending a command, and we rely on the Worker to complete fencing and maybe restart us? - // - // BETTER APPROACH for this specific FSM pattern: - // We need an intermediate event or state if the destination depends on runtime data. - // But let's assume for this implementation that we handle the "Stay" case here, - // and if we fail, we transition to Fencing on the NEXT tick or via a self-generated event? - // - // Let's modify the logic: The Worker sends `HealthCheckResult { success: false }`. - // If we are still below threshold, we log. - // If we are at threshold, we treat this event as a trigger for Fencing? - // No, the pattern matches `Event => Type`. - // - // Revised: We need two patterns. But we can't match on values inside the struct in the macro easily - // unless we define specific events like `HealthCheckFailedFatal`. - // - // Let's use `consecutive_failures` check inside the action. - // If fatal, we return a new Box. - // Wait, the macro generates `self.into_boxed::<$next>()`. It hardcodes the return type. - // - // This is a limitation of the macro provided in the blog post. - // To solve this strictly following the provided code, we must ensure the event *itself* dictates the transition. - // - // So the Worker must know the threshold? No, that leaks logic. - // - // Solution: The FSM Action can mutate `ctx`. - // We will have `AgentEvent::HealthCheckFailed`. - // We stay in `RolePrimary`. - // Inside the action, if `ctx.failures >= threshold`, we `ctx.send_command(PerformFencing)`. - // And we transition to `RoleFencing`? We can't conditionally transition in the macro. - // - // OK, I will split the event. - // The Worker returns `HealthCheckResult`. - // The FSM handles it. - // If the FSM sees failure, it stays in Primary. - // But if it needs to fence, it needs to transition. - // - // I will add `AgentEvent::FencingTriggered` which is sent by the FSM to itself? - // Or simpler: The Worker sends `HealthCheckFailed`. - // If we want to fence, we need to move to `RoleFencing`. - // - // Let's adjust the macro usage slightly. The user said "follow exactly the FSM pattern". - // The pattern implies strict state transitions. - // - // I will implement `RolePrimary` -> `RoleFencing` on `ForceDemote` or similar. - // And I will assume the Worker sends `ForceDemote` if it detects critical failure? - // No, the logic belongs in the FSM. - // - // Let's use the `Tick` to check the counter. - // 1. Tick -> Check. - // 2. Result -> Update Counter. - // 3. Tick -> If counter > thresh -> Transition Fencing. - // - // Let's try that. - if let Some(ctx) = &mut s.user_data { - ctx.consecutive_failures += 1; - log::warn!("⚠️ Heartbeat Failed (Count: {}/{})", ctx.consecutive_failures, ctx.failure_threshold); - } - }, - - // 4. The actual Fencing Transition - // We use a specific pattern guard if possible, or just a separate event. - // Since we can't guard in the macro, we'll use a trick: - // If failures are high, the NEXT Tick will trigger transition? - // No, we want immediate. - // - // Let's add `AgentEvent::CriticalFailure` event. - // The `HealthCheckResult` handler (above) will check the threshold. - // If threshold reached, it cannot transition itself (locked to RolePrimary). - // BUT, it can emit a `CriticalFailure` event to the channel. - // Then the FSM loop picks it up and transitions. - AgentEvent::ForceDemote => RoleFencing => |s: &mut FSM| { - if let Some(ctx) = &mut s.user_data { - log::error!("🚨 Failure Threshold Reached. Initiating Fencing."); - ctx.send_command(WorkerCommand::PerformFencing); - } - }, - - // 5. Split Brain Prevention - AgentEvent::ClusterStateUpdate { primary_id, .. } => RoleDemoting => |s: &mut FSM| { - if let Some(ctx) = &mut s.user_data { - if primary_id != ctx.agent_id && !primary_id.is_empty() { - log::warn!("Split Brain Detected! Another primary is active: {}. Demoting.", primary_id); - ctx.send_command(WorkerCommand::PerformDemotion); - } - } - } - }, - - // ------------------------------------------------------------------------- - // REPLICA STATE (Watchdog) - // ------------------------------------------------------------------------- - RoleReplica => { - // 1. Receive Heartbeats from Primary - AgentEvent::ClusterStateUpdate { primary_id, timestamp } => RoleReplica => |s: &mut FSM| { - if let Some(ctx) = &mut s.user_data { - if !primary_id.is_empty() { - ctx.last_primary_heartbeat = Some(timestamp); - // log::trace!("Replica: Saw primary {} at {:?}", primary_id, timestamp); - } - } - }, - - // 2. Tick: Check for Staleness - AgentEvent::Tick => RoleReplica => |s: &mut FSM| { - // We can't transition conditionally here either. - // Same pattern: Check logic, if stale, send `ForcePromote` event to self. - if let Some(ctx) = &mut s.user_data { - if let Some(last) = ctx.last_primary_heartbeat { - let elapsed = Instant::now().duration_since(last); - if elapsed > ctx.failover_timeout { - log::warn!("⚡ Primary Stale ({}ms > {}ms). Triggering Promotion.", elapsed.as_millis(), ctx.failover_timeout.as_millis()); - // We need to trigger the transition. - // We can't do it directly in this closure because the return type is fixed to RoleReplica. - // So we assume the "Driver" or a self-send handles the trigger. - // For this implementation, we'll assume we have a handle to the main loop channel in ctx? - // No, ctx has `worker_tx`. - // - // We will send a command to worker to "ConfirmPromotionEligibility", which sends back `ForcePromote`. - ctx.send_command(WorkerCommand::PerformPromotion); // This checks eligibility then triggers event - } - } - } - }, - - // 3. Promotion Triggered - AgentEvent::ForceDemote => RolePromoting => |s: &mut FSM| { - log::info!("Promoting to Primary..."); - } - }, - - // ------------------------------------------------------------------------- - // FENCING STATE (Transient) - // ------------------------------------------------------------------------- - RoleFencing => { - // Once fencing is done (simulated by Tick or specific event), we become a Replica (Clean Demotion) - AgentEvent::Tick => RoleReplica => |s: &mut FSM| { - log::info!("Fencing/Demotion complete. Switching to Replica (Watchdog) mode."); - if let Some(ctx) = &mut s.user_data { - ctx.consecutive_failures = 0; - } - } - }, - - // ------------------------------------------------------------------------- - // PROMOTING STATE (Transient) - // ------------------------------------------------------------------------- - RolePromoting => { - // Promotion logic usually involves ensuring WAL catchup etc. - // We simulate success on next Tick. - AgentEvent::Tick => RolePrimary => |s: &mut FSM| { - log::info!("Promotion Complete. I am now the PRIMARY."); - if let Some(ctx) = &mut s.user_data { - ctx.consecutive_failures = 0; - // Reset heartbeat timestamp so we don't fence immediately - ctx.last_primary_heartbeat = Some(Instant::now()); - } - } - }, - - // ------------------------------------------------------------------------- - // DEMOTING STATE (Transient) - // ------------------------------------------------------------------------- - RoleDemoting => { - AgentEvent::Tick => RoleReplica => |s: &mut FSM| { - log::info!("Demotion Complete. Switching to Replica."); - } - } -}); - -// ============================================================================= -// Main & Runtime -// ============================================================================= - -pub async fn main_typestate_gemini() -> Result<(), Box> { - env_logger::init(); - log::info!("Harmony Agent FSM Starting..."); - - // 1. Setup Channels - let (event_tx, event_rx) = mpsc::channel::(100); - let (worker_tx, mut worker_rx) = mpsc::channel::(100); - - // 2. Configuration - let my_agent_id = "agent_1".to_string(); - let desired_primary = "agent_1".to_string(); // Change to "agent_2" to test Replica start - let is_primary = my_agent_id == desired_primary; - - let context = AgentContext { - agent_id: my_agent_id.clone(), - success_threshold: 2, - failure_threshold: 2, - heartbeat_interval: Duration::from_secs(1), - failover_timeout: Duration::from_secs(3), // 3s > 1s interval - consecutive_failures: 0, - last_primary_heartbeat: Some(Instant::now()), - worker_tx: worker_tx.clone(), - }; - - // 3. Spawn Worker (Simulates IO and Logic Glue) - let event_tx_worker = event_tx.clone(); - tokio::spawn(async move { - while let Some(cmd) = worker_rx.recv().await { - match cmd { - WorkerCommand::PerformHealthCheck => { - // Simulate IO latency - tokio::time::sleep(Duration::from_millis(100)).await; - - // Simulate random failure (10% chance) - let success = getrandom::u64().unwrap() % 100 > 10; - - // Send result back - let _ = event_tx_worker.send(AgentEvent::HealthCheckResult { success }).await; - - // CRITICAL: Logic Glue for the FSM limitation - // If we failed, we don't know the counter here easily without shared state. - // But for the purpose of this demo, let's assume the FSM handles the counter. - // If the FSM decides to fence, it sends PerformFencing. - // - // However, we need to trigger the transition event if threshold is hit. - // Since FSM action is sync and can't send async events easily back to itself *during* the transition, - // we rely on the FSM action checking the counter and sending a command to US (Worker), - // and WE send the transition event back. - } - WorkerCommand::PerformFencing => { - log::warn!("[Worker] Executing Fencing Procedure (Stop DB)..."); - tokio::time::sleep(Duration::from_millis(500)).await; - // Trigger the state transition in FSM - let _ = event_tx_worker.send(AgentEvent::ForceDemote).await; - } - WorkerCommand::PerformPromotion => { - log::info!("[Worker] Checking Promotion Eligibility..."); - // Simulate check - tokio::time::sleep(Duration::from_millis(200)).await; - // Trigger transition - let _ = event_tx_worker.send(AgentEvent::ForceDemote).await; // Reusing ForceDemote as "Trigger Transition" for Replica->Promote based on graph? - // Wait, Replica->Promote uses ForceDemote in the macro above? - // Yes: AgentEvent::ForceDemote => RolePromoting - } - WorkerCommand::PerformDemotion => { - log::warn!("[Worker] Yielding Leadership..."); - tokio::time::sleep(Duration::from_millis(200)).await; - // Trigger transition - // We need an event that goes Primary -> Demoting. - // In the macro: AgentEvent::ClusterStateUpdate handles the detection. - // But we need to transition. - // Actually, the macro for ClusterStateUpdate transitions DIRECTLY to RoleDemoting. - // So this command might just be for side-effects (stopping DB). - } - } - } - }); - - // 4. Spawn Timer (Heartbeat Tick) - let event_tx_timer = event_tx.clone(); - tokio::spawn(async move { - let mut interval = tokio::time::interval(Duration::from_secs(1)); - loop { - interval.tick().await; - let _ = event_tx_timer.send(AgentEvent::Tick).await; - } - }); - - // 5. Spawn NATS Watcher (Simulated) - let event_tx_nats = event_tx.clone(); - tokio::spawn(async move { - // Simulate receiving heartbeats from "agent_1" - loop { - tokio::time::sleep(Duration::from_millis(500)).await; - // If we are agent_1, we are the primary, so we don't see external heartbeats usually, - // but for simulation, let's say we see ourselves or nothing. - // If we are agent_2 (Replica), we see agent_1. - - // Uncomment to simulate primary death for Replica: - // continue; - - let _ = event_tx_nats.send(AgentEvent::ClusterStateUpdate { - primary_id: "agent_1".to_string(), - timestamp: Instant::now(), - }).await; - } - }); - - // 6. Initialize FSM - let initial_state: Box> = if is_primary { - log::info!("Starting as PRIMARY"); - Box::new(FSM::::new(Some(context))) - } else { - log::info!("Starting as REPLICA"); - Box::new(FSM::::new(Some(context))) - }; - - // 7. Run - fsm::run_machine(initial_state, event_rx).await; - - Ok(()) -} - -- 2.39.5 From 9c551a0eba9ac73d2d1ef27419af4f6d7e3bcb84 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Tue, 3 Feb 2026 22:12:44 -0500 Subject: [PATCH 14/19] fix: Agent can now reload heartbeat info from store --- harmony_agent/src/agent/config.rs | 80 ++++++ harmony_agent/src/agent/heartbeat.rs | 30 ++ .../src/{agent_loop.rs => agent/mod.rs} | 263 ++++++++---------- harmony_agent/src/agent/role.rs | 17 ++ harmony_agent/src/main.rs | 14 +- harmony_agent/src/workflow/mod.rs | 9 +- harmony_agent/src/workflow/primary.rs | 10 +- harmony_agent/src/workflow/replica.rs | 10 +- 8 files changed, 254 insertions(+), 179 deletions(-) create mode 100644 harmony_agent/src/agent/config.rs create mode 100644 harmony_agent/src/agent/heartbeat.rs rename harmony_agent/src/{agent_loop.rs => agent/mod.rs} (74%) create mode 100644 harmony_agent/src/agent/role.rs diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs new file mode 100644 index 00000000..ec96601c --- /dev/null +++ b/harmony_agent/src/agent/config.rs @@ -0,0 +1,80 @@ +use std::time::Duration; + +use harmony_types::id::Id; +use log::info; + +use super::role::AgentRole; +use super::heartbeat::HeartbeatFailure; + +#[derive(Debug, Clone)] +pub struct AgentConfig { + /// Number of consecutive successful heartbeats required before the service transitions from + /// failed to healthy. + pub success_threshold: usize, + /// Number of consecutive failed heartbeats required before the service transitions from + /// healthy to failed. + pub failure_threshold: usize, + /// Time between each heartbeat. If a heartbeat takes longer than this, it will be + /// considered failed. + pub heartbeat_interval: Duration, + /// Time since last observed primary heartbeat before replica considers primary stale. + /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin + /// to avoid split brain during network partitions. + pub failover_timeout: Duration, + /// **UNSTABLE FIELD** + /// + /// For now, an agent instance only serves one deployment. This is probably fine as an agent's + /// footprint is low, but managing multiple deployments in a single instance would be a + /// significant resource usage reduction. + /// + /// Decoupling the deployment of the agent with the application's deployment could make things + /// more complicated though, where we would have to be careful about version compatibility + /// between all components managed by the agent instance. So for now it is a 1-1 map. + /// + /// But I have a feeling this could change so I am marking this field unstable to warn you, the + /// reader. + pub deployment_config_unstable: DeploymentConfig, + pub nats_url: String, + pub nats_creds_path: Option, + pub agent_id: Id, + pub cluster_id: Id, + pub desired_primary_id: Id, + /// The role this agent plays (Primary or Replica) + pub role: AgentRole, +} + +#[derive(Debug, Clone)] +pub enum DeploymentConfig { + FailoverPostgreSQL(FailoverCNPGConfig), +} + +#[derive(Debug, Clone)] +pub struct FailoverCNPGConfig { + pub desired_primary_agent: Id, + pub cnpg_cluster_name: String, +} + +impl DeploymentConfig { + /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres) + pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> { + match self { + DeploymentConfig::FailoverPostgreSQL(cfg) => { + info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name); + // TODO: Implement actual PG check / NATS write here + Ok(()) + } + } + } + + /// Callback: Transitioned from Unhealthy -> Healthy + pub async fn on_active(&self) { + info!("Service is now ACTIVE (Healthy)"); + // e.g., Remove fencing lock + } + + /// Callback: Transitioned from Healthy -> Unhealthy + pub async fn on_failover(&self) { + info!("Service is now FAILED (Unhealthy)"); + // e.g., Initiate self-fencing, stop accepting traffic + } +} diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs new file mode 100644 index 00000000..f2fe9704 --- /dev/null +++ b/harmony_agent/src/agent/heartbeat.rs @@ -0,0 +1,30 @@ +use harmony_types::id::Id; +use serde::{Deserialize, Serialize}; + +use crate::store::KvMetadata; + +/// Agent-provided heartbeat information (no timestamps - those come from the store) +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentInfo { + pub agent_id: Id, + pub cluster_id: Id, + pub status: String, +} + +/// Complete heartbeat with both agent data and store metadata +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AgentHeartbeat { + pub agent_info: AgentInfo, + pub metadata: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ClusterStateData { + pub cluster_id: Id, + pub current_primary: Option, + pub desired_primary: Id, + pub timestamp: u64, +} + +#[derive(Debug)] +pub struct HeartbeatFailure {} diff --git a/harmony_agent/src/agent_loop.rs b/harmony_agent/src/agent/mod.rs similarity index 74% rename from harmony_agent/src/agent_loop.rs rename to harmony_agent/src/agent/mod.rs index 7111db92..8d6ff2f5 100644 --- a/harmony_agent/src/agent_loop.rs +++ b/harmony_agent/src/agent/mod.rs @@ -1,10 +1,8 @@ -use std::fmt; use std::time::{SystemTime, UNIX_EPOCH}; use std::{str::FromStr, sync::Arc, time::Duration}; use harmony_types::id::Id; use log::{debug, info, trace}; -use serde::{Deserialize, Serialize}; use tokio::sync::RwLock; use tokio::time::{Instant, sleep}; @@ -13,21 +11,15 @@ use crate::workflow::HeartbeatWorkflow; use crate::workflow::primary::PrimaryWorkflow; use crate::workflow::replica::ReplicaWorkflow; -/// The role of this agent instance -#[derive(Debug, Clone, PartialEq)] -pub enum AgentRole { - Primary, - Replica, -} +// Submodules +mod config; +mod heartbeat; +mod role; -impl fmt::Display for AgentRole { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - AgentRole::Primary => write!(f, "primary"), - AgentRole::Replica => write!(f, "replica"), - } - } -} +// Re-exports for backwards compatibility +pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig}; +pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure}; +pub use role::AgentRole; pub async fn launch_agent( role: AgentRole, @@ -85,102 +77,6 @@ where Ok(()) } -#[derive(Debug, Clone)] -pub struct AgentConfig { - /// Number of consecutive successful heartbeats required before the service transitions from - /// failed to healthy. - pub success_threshold: usize, - /// Number of consecutive failed heartbeats required before the service transitions from - /// healthy to failed. - pub failure_threshold: usize, - /// Time between each heartbeat. If a heartbeat takes longer than this, it will be - /// considered failed. - pub heartbeat_interval: Duration, - /// Time since last observed primary heartbeat before replica considers primary stale. - /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin - /// to avoid split brain during network partitions. - pub failover_timeout: Duration, - /// **UNSTABLE FIELD** - /// - /// For now, an agent instance only serves one deployment. This is probably fine as an agent's - /// footprint is low, but managing multiple deployments in a single instance would be a - /// significant resource usage reduction. - /// - /// Decoupling the deployment of the agent with the application's deployment could make things - /// more complicated though, where we would have to be careful about version compatibility - /// between all components managed by the agent instance. So for now it is a 1-1 map. - /// - /// But I have a feeling this could change so I am marking this field unstable to warn you, the - /// reader. - pub deployment_config_unstable: DeploymentConfig, - pub nats_url: String, - pub nats_creds_path: Option, - pub agent_id: Id, - pub cluster_id: Id, - pub desired_primary_id: Id, - /// The role this agent plays (Primary or Replica) - pub role: AgentRole, -} - -#[derive(Debug, Clone)] -pub enum DeploymentConfig { - FailoverPostgreSQL(FailoverCNPGConfig), -} - -#[derive(Debug, Clone)] -pub struct FailoverCNPGConfig { - pub desired_primary_agent: Id, - pub cnpg_cluster_name: String, -} - -impl DeploymentConfig { - /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres) - pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> { - match self { - DeploymentConfig::FailoverPostgreSQL(cfg) => { - info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name); - // TODO: Implement actual PG check / NATS write here - Ok(()) - } - } - } - - /// Callback: Transitioned from Unhealthy -> Healthy - pub async fn on_active(&self) { - info!("Service is now ACTIVE (Healthy)"); - // e.g., Remove fencing lock - } - - /// Callback: Transitioned from Healthy -> Unhealthy - pub async fn on_failover(&self) { - info!("Service is now FAILED (Unhealthy)"); - // e.g., Initiate self-fencing, stop accepting traffic - } -} - -/// Agent-provided heartbeat information (no timestamps - those come from the store) -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct AgentInfo { - pub agent_id: Id, - pub cluster_id: Id, - pub status: String, -} - -/// Complete heartbeat with both agent data and store metadata -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct AgentHeartbeat { - pub agent_info: AgentInfo, - pub metadata: Option, -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ClusterStateData { - pub cluster_id: Id, - pub current_primary: Option, - pub desired_primary: Id, - pub timestamp: u64, -} - pub struct HarmonyAgent { pub config: AgentConfig, workflow: Box, @@ -207,7 +103,6 @@ impl HarmonyAgent { } AgentRole::Replica => { info!("Initializing agent as REPLICA"); - // pub fn new(success_threshold: usize, failure_threshold: usize, cluster_id: Id, primary_id: Id, my_id: Id) -> Self Box::new(ReplicaWorkflow::new( config.success_threshold, config.failure_threshold, @@ -229,7 +124,56 @@ impl HarmonyAgent { } } - /// Reconcile startup state by fetching cluster state from the store + /// Generic helper to fetch and deserialize data from KV store + /// Returns Ok(Some(data)) if key exists and deserializes successfully + /// Returns Ok(None) if key doesn't exist + /// Returns Err if deserialization fails or other errors occur + async fn fetch_from_store( + &self, + store: &Arc, + key: &str, + ) -> Result, KvStoreError> + where + D: serde::de::DeserializeOwned, + { + debug!("Fetching data from key: {}", key); + + let result = store.get(key).await; + debug!("Got result from store: {:#?}", result); + + match result { + Ok(kv_result) => { + if let Some(value) = kv_result.value { + match serde_json::from_value::(value.clone()) { + Ok(data) => Ok(Some(data)), + Err(e) => { + log::warn!("Failed to deserialize data from key {}: {}", key, e); + Err(KvStoreError::DeserializationFailed { + deserialization_error: format!( + "Key exists but deserialization failed for {key}: {e}" + ), + value: value.to_string(), + }) + } + } + } else { + Err(KvStoreError::Unknown(format!( + "Key exists but value is empty for {key}, this should not happen" + ))) + } + } + Err(KvStoreError::KeyNotAvailable(_)) => { + debug!("Key {} not found in store", key); + Ok(None) + } + Err(e) => { + log::warn!("Failed to fetch data from key {}: {}", key, e); + Err(e) + } + } + } + + /// Reconcile startup state by fetching cluster state and heartbeat from the store /// This allows the workflow to determine if it should resume as Primary/Replica /// based on the persisted cluster state pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> { @@ -240,48 +184,65 @@ impl HarmonyAgent { cluster_key ); - /* - trace!("{:#?}", self.cluster_kv.get(&cluster_key).await); - */ - - let cluster_state_option = match self.cluster_kv.get(&cluster_key).await { - Ok(result) => { - if let Some(value) = result.value { - match serde_json::from_value::(value.clone()) { - Ok(data) => Some(data), - Err(e) => { - log::warn!("Failed to deserialize cluster state: {}", e); - return Err(KvStoreError::DeserializationFailed { - deserialization_error: format!( - "Cluster key exist but is empty {cluster_key} : {e}" - ), - value: value.to_string(), - }); - } - } - } else { - return Err(KvStoreError::Unknown(format!( - "Cluster key exist but is empty {cluster_key}" - ))); - } - } - Err(KvStoreError::KeyNotAvailable(_)) => { - debug!("Cluster state key not found, this is a fresh cluster"); + let cluster_state_option = match self + .fetch_from_store::(&self.cluster_kv, &cluster_key) + .await? + { + Some(data) => Some(data), + None => { + debug!( + "Cluster state key not found, this is a fresh cluster, initializing cluster state" + ); Some(self.store_cluster_state(None).await?) } - Err(e) => { - log::warn!("Failed to fetch cluster state during startup: {}", e); - return Err(e); - } }; - let state_ref = cluster_state_option.as_ref(); - - self.workflow.on_startup(state_ref, self.health_kv.as_ref(), &self.config).await; + debug!("Found cluster state {cluster_state_option:#?}"); + self.workflow + .on_startup(cluster_state_option.as_ref(), &self.config) + .await; // Cache the cluster state locally *self.cluster_state.write().await = cluster_state_option; + // Fetch last heartbeat if it exists to avoid sequence conflicts + let heartbeat_key = format!("heartbeat.{}", self.config.agent_id); + debug!("Fetching last heartbeat from key: {}", heartbeat_key); + + let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await; + + let last_heartbeat = match last_heartbeat_option { + Ok(kv_result) => { + let value = kv_result + .value + .expect("When key exist it should always contain data"); + Some(AgentHeartbeat { + agent_info: serde_json::from_value::(value.clone()).map_err(|e| { + KvStoreError::DeserializationFailed { + deserialization_error: e.to_string(), + value: value.to_string(), + } + })?, + metadata: Some(kv_result.metadata), + }) + } + Err(e) => match e { + KvStoreError::KeyNotAvailable(_) => None, + _ => return Err(e), + }, + }; + if let Some(heartbeat) = &last_heartbeat{ + debug!( + "Found existing heartbeat with sequence: {}", + heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0) + ); + } else { + debug!("No existing heartbeat found, starting fresh"); + } + + // Cache the last heartbeat for sequence tracking + *self.last_heartbeat.write().await = last_heartbeat; + Ok(()) } @@ -503,9 +464,3 @@ impl HarmonyAgent { } } } - -#[derive(Debug)] -pub struct HeartbeatFailure {} - -/// Replica workflow module - handles replica-specific state machine -mod replica {} diff --git a/harmony_agent/src/agent/role.rs b/harmony_agent/src/agent/role.rs new file mode 100644 index 00000000..e9b719cf --- /dev/null +++ b/harmony_agent/src/agent/role.rs @@ -0,0 +1,17 @@ +use std::fmt; + +/// The role of this agent instance +#[derive(Debug, Clone, PartialEq)] +pub enum AgentRole { + Primary, + Replica, +} + +impl fmt::Display for AgentRole { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + AgentRole::Primary => write!(f, "primary"), + AgentRole::Replica => write!(f, "replica"), + } + } +} diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs index 92a0fa09..8eda5d1d 100644 --- a/harmony_agent/src/main.rs +++ b/harmony_agent/src/main.rs @@ -1,13 +1,9 @@ use std::{sync::Arc, time::Duration}; -use async_nats::jetstream::kv::Store; +use crate::{agent::AgentRole, store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}}; -use crate::{ - agent_loop::AgentRole, - store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}, -}; - -mod agent_loop; +// mod agent_loop; +mod agent; pub mod store; mod workflow; @@ -25,14 +21,14 @@ async fn main() { let cluster_kv = nats_store.clone(); let _ = tokio::join!( - agent_loop::launch_agent( + agent::launch_agent( AgentRole::Primary, health_kv.clone(), cluster_kv.clone(), heartbeat_interval, failover_timeout ), - agent_loop::launch_agent( + agent::launch_agent( AgentRole::Replica, health_kv, cluster_kv, diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs index 561ce434..81387a45 100644 --- a/harmony_agent/src/workflow/mod.rs +++ b/harmony_agent/src/workflow/mod.rs @@ -1,9 +1,7 @@ use std::sync::Arc; +use crate::agent::AgentConfig; use async_trait::async_trait; -use harmony_types::id::Id; - -use crate::{agent_loop::AgentConfig, store::KvStore}; pub mod primary; pub mod replica; @@ -18,9 +16,8 @@ pub trait HeartbeatWorkflow: Send + Sync { async fn handle_heartbeat_failure(&mut self); async fn on_startup( - &mut self, - cluster_state: Option<&crate::agent_loop::ClusterStateData>, - health_kv: &dyn KvStore, + &self, + cluster_state: Option<&crate::agent::ClusterStateData>, agent_config: &AgentConfig, ); diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index ffe3ffae..80af63b5 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -1,7 +1,10 @@ use async_trait::async_trait; use log::{debug, info, trace, warn}; -use crate::{agent_loop::{AgentConfig, DeploymentConfig}, store::KvStore, workflow::HeartbeatWorkflow}; +use crate::{ + agent::{AgentConfig, DeploymentConfig}, + workflow::HeartbeatWorkflow, +}; #[derive(Debug, Clone, PartialEq)] pub enum PrimaryState { @@ -64,9 +67,8 @@ impl PrimaryWorkflow { #[async_trait] impl HeartbeatWorkflow for PrimaryWorkflow { async fn on_startup( - &mut self, - cluster_state: Option<&crate::agent_loop::ClusterStateData>, - health_kv: &dyn KvStore, + &self, + cluster_state: Option<&crate::agent::ClusterStateData>, agent_config: &AgentConfig, ) { if let Some(state) = cluster_state { diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs index eda4bbdc..e2ffc42e 100644 --- a/harmony_agent/src/workflow/replica.rs +++ b/harmony_agent/src/workflow/replica.rs @@ -1,11 +1,10 @@ use async_trait::async_trait; use harmony_types::id::Id; -use log::{debug, info, trace, warn}; +use log::{info, trace, warn}; use std::time::Duration; use tokio::sync::RwLock; -use crate::agent_loop::{AgentConfig, AgentHeartbeat}; -use crate::store::KvStore; +use crate::agent::{AgentConfig, AgentHeartbeat}; use crate::workflow::HeartbeatWorkflow; #[derive(Debug, Clone)] @@ -155,9 +154,8 @@ impl ReplicaWorkflow { #[async_trait] impl HeartbeatWorkflow for ReplicaWorkflow { async fn on_startup( - &mut self, - cluster_state: Option<&crate::agent_loop::ClusterStateData>, - health_kv: &dyn KvStore, + &self, + cluster_state: Option<&crate::agent::ClusterStateData>, agent_config: &AgentConfig, ) { // todo!("not sure if the replica should do anything on startup") -- 2.39.5 From 01a775a01fa629978506b5c76cec062b2b3f649e Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Wed, 4 Feb 2026 07:01:13 -0500 Subject: [PATCH 15/19] wip(agent): workflow now return new cluster state when they decide to alter it, primary taking control of current_primary case handled but using wrong ID --- harmony_agent/src/agent/mod.rs | 10 ++++++--- harmony_agent/src/workflow/mod.rs | 11 ++++++++-- harmony_agent/src/workflow/primary.rs | 29 ++++++++++++++++++++++++--- harmony_agent/src/workflow/replica.rs | 18 +++++++++++++---- 4 files changed, 56 insertions(+), 12 deletions(-) diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs index 8d6ff2f5..68ff3020 100644 --- a/harmony_agent/src/agent/mod.rs +++ b/harmony_agent/src/agent/mod.rs @@ -2,7 +2,7 @@ use std::time::{SystemTime, UNIX_EPOCH}; use std::{str::FromStr, sync::Arc, time::Duration}; use harmony_types::id::Id; -use log::{debug, info, trace}; +use log::{debug, info, trace, warn}; use tokio::sync::RwLock; use tokio::time::{Instant, sleep}; @@ -435,10 +435,14 @@ impl HarmonyAgent { trace!("Got heartbeat_result : {heartbeat_result:?}"); match heartbeat_result { Ok(_) => { - self.workflow.handle_heartbeat_success().await; + let new_state = self.workflow.handle_heartbeat_success(self.cluster_state.read().await.as_ref(), &self.config).await; + if let Some(new_state) = new_state { + warn!("Got new cluster state : {new_state:#?}"); + todo!("Got new state, save it"); + } } Err(_) => { - self.workflow.handle_heartbeat_failure().await; + self.workflow.handle_heartbeat_failure(self.cluster_state.read().await.as_ref()).await; } } diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs index 81387a45..e037e194 100644 --- a/harmony_agent/src/workflow/mod.rs +++ b/harmony_agent/src/workflow/mod.rs @@ -10,10 +10,17 @@ pub mod replica; #[async_trait] pub trait HeartbeatWorkflow: Send + Sync { /// Handle a successful heartbeat - async fn handle_heartbeat_success(&mut self); + async fn handle_heartbeat_success( + &mut self, + cluster_state: Option<&crate::agent::ClusterStateData>, + agent_config: &AgentConfig, + ) -> Option; /// Handle a failed heartbeat - async fn handle_heartbeat_failure(&mut self); + async fn handle_heartbeat_failure( + &mut self, + cluster_state: Option<&crate::agent::ClusterStateData>, + ); async fn on_startup( &self, diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index 80af63b5..f54dfc77 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use log::{debug, info, trace, warn}; use crate::{ - agent::{AgentConfig, DeploymentConfig}, + agent::{AgentConfig, ClusterStateData, DeploymentConfig}, workflow::HeartbeatWorkflow, }; @@ -87,7 +87,11 @@ impl HeartbeatWorkflow for PrimaryWorkflow { debug!("No cluster state on startup, starting from Initializing"); } } - async fn handle_heartbeat_success(&mut self) { + async fn handle_heartbeat_success( + &mut self, + cluster_state: Option<&crate::agent::ClusterStateData>, + agent_config: &AgentConfig, + ) -> Option { trace!( "Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures @@ -104,7 +108,19 @@ impl HeartbeatWorkflow for PrimaryWorkflow { tokio::spawn(async move { config.on_active().await; }); + if let Some(state) = cluster_state + && state.desired_primary == agent_config.desired_primary_id + { + let mut new_state = state.clone(); + new_state.current_primary = Some(agent_config.agent_id.clone()); + return Some(new_state); + } else { + todo!( + "I cluster_state should not be an option, and we should throw an error when we are running a primary workflow but we are not the desired primary in the cluster state data" + ); + } } + None } PrimaryState::Failed => { if self.consecutive_successes >= self.success_threshold { @@ -114,10 +130,12 @@ impl HeartbeatWorkflow for PrimaryWorkflow { config.on_active().await; }); } + todo!() } PrimaryState::Healthy => { // Stay healthy debug!("Primary staying healthy"); + todo!() } PrimaryState::Fenced => { // Recovery from fenced state @@ -126,15 +144,20 @@ impl HeartbeatWorkflow for PrimaryWorkflow { info!("Recovered from fenced state, transitioning to yielding"); self.transition_to(PrimaryState::Yielding); } + todo!() } PrimaryState::Yielding => { // TODO: Check NATS to see if we can resume as primary trace!("Yielding, waiting for demotion handshake"); + todo!() } } } - async fn handle_heartbeat_failure(&mut self) { + async fn handle_heartbeat_failure( + &mut self, + cluster_state: Option<&crate::agent::ClusterStateData>, + ) { self.consecutive_failures += 1; self.consecutive_successes = 0; diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs index e2ffc42e..90ddd341 100644 --- a/harmony_agent/src/workflow/replica.rs +++ b/harmony_agent/src/workflow/replica.rs @@ -161,7 +161,11 @@ impl HeartbeatWorkflow for ReplicaWorkflow { // todo!("not sure if the replica should do anything on startup") } - async fn handle_heartbeat_success(&mut self) { + async fn handle_heartbeat_success( + &mut self, + cluster_state: Option<&crate::agent::ClusterStateData>, + agent_config: &AgentConfig, + ) -> Option { trace!( "Handling heartbeat success, current counters success {} failures {}", self.consecutive_successes, self.consecutive_failures @@ -174,6 +178,7 @@ impl HeartbeatWorkflow for ReplicaWorkflow { if self.consecutive_successes >= self.success_threshold { self.transition_to(ReplicaState::Watching); } + None } ReplicaState::Watching => { // TODO: Check primary staleness from NATS @@ -181,7 +186,6 @@ impl HeartbeatWorkflow for ReplicaWorkflow { if self.is_primary_stale().await { warn!("Found stale primary, launching promotion"); } - /* todo!("perform the replica watch actions : - if a primary exists in the cluster (cluster_state.current_primary == expected_primary) - check the last primary heartbeat kv timestamp @@ -189,7 +193,6 @@ impl HeartbeatWorkflow for ReplicaWorkflow { - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself) - launching promotion will change the status of the replica "); - */ } ReplicaState::Promoting => { // TODO: Complete promotion attempt @@ -202,25 +205,32 @@ impl HeartbeatWorkflow for ReplicaWorkflow { if self.consecutive_successes >= self.success_threshold { self.transition_to(ReplicaState::Watching); } + todo!() } ReplicaState::Leader => { // TODO: Check for original primary recovery trace!("Replica acting as leader"); + todo!() } ReplicaState::Failed => { if self.consecutive_successes >= self.success_threshold { info!("Replica recovered from Failed state, transitioning to Watching"); self.transition_to(ReplicaState::Watching); } + todo!() } ReplicaState::Demoting => { // TODO: Complete demotion back to watching trace!("Replica demotion in progress"); + todo!() } } } - async fn handle_heartbeat_failure(&mut self) { + async fn handle_heartbeat_failure( + &mut self, + cluster_state: Option<&crate::agent::ClusterStateData>, + ) { self.consecutive_failures += 1; self.consecutive_successes = 0; -- 2.39.5 From 17b3b3b3513f898514082b77750e97375e4ed430 Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Wed, 4 Feb 2026 09:26:10 -0500 Subject: [PATCH 16/19] test(agent): Wrote first few tests for Primary workflow use cases : initializing to healthy, healthy to failed --- harmony_agent/Cargo.toml | 1 + harmony_agent/src/agent/config.rs | 1 - harmony_agent/src/agent/heartbeat.rs | 2 +- harmony_agent/src/agent/mod.rs | 1 - harmony_agent/src/workflow/primary.rs | 127 ++++++++++++++++++++++++++ 5 files changed, 129 insertions(+), 3 deletions(-) diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml index 22a373ca..6fb7ff5d 100644 --- a/harmony_agent/Cargo.toml +++ b/harmony_agent/Cargo.toml @@ -23,3 +23,4 @@ serde_json.workspace = true getrandom = "0.3.4" thiserror.workspace = true +pretty_assertions.workspace = true diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs index ec96601c..0862da67 100644 --- a/harmony_agent/src/agent/config.rs +++ b/harmony_agent/src/agent/config.rs @@ -50,7 +50,6 @@ pub enum DeploymentConfig { #[derive(Debug, Clone)] pub struct FailoverCNPGConfig { - pub desired_primary_agent: Id, pub cnpg_cluster_name: String, } diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs index f2fe9704..f101656c 100644 --- a/harmony_agent/src/agent/heartbeat.rs +++ b/harmony_agent/src/agent/heartbeat.rs @@ -18,7 +18,7 @@ pub struct AgentHeartbeat { pub metadata: Option, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ClusterStateData { pub cluster_id: Id, pub current_primary: Option, diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs index 68ff3020..558debd7 100644 --- a/harmony_agent/src/agent/mod.rs +++ b/harmony_agent/src/agent/mod.rs @@ -50,7 +50,6 @@ where heartbeat_interval, failover_timeout, deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig { - desired_primary_agent: my_agent_id.clone(), cnpg_cluster_name: String::from("cnpg_cluster_name"), }), nats_url: String::new(), diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index f54dfc77..890068be 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -31,8 +31,14 @@ pub struct PrimaryWorkflow { state: PrimaryState, consecutive_successes: usize, consecutive_failures: usize, + + // TODO these thresholds should not be copied into the workflow struct. They are configuration + // level and should always be read from the context passed to the workflow functions success_threshold: usize, failure_threshold: usize, + + // TODO not sure if this should be known by the workflow or passed in the context to function + // calls or just completely handled by the agent ? deployment_config: DeploymentConfig, } @@ -201,3 +207,124 @@ impl HeartbeatWorkflow for PrimaryWorkflow { self.consecutive_failures } } + +#[cfg(test)] +mod test { + use std::time::Duration; + + use harmony_types::id::Id; + + use crate::agent::{AgentRole, FailoverCNPGConfig}; + + use pretty_assertions::assert_eq; + + use super::*; + + #[tokio::test] + async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() { + let (mut primary, cluster_state, agent_config) = default_test_state(2, 2); + + assert!( + primary + .handle_heartbeat_success(Some(&cluster_state), &agent_config) + .await + .is_none() + ); + } + + #[tokio::test] + async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() { + let (mut primary, cluster_state, agent_config) = default_test_state(2, 2); + + let mut expected_state = cluster_state.clone(); + expected_state.current_primary = Some(Id::empty()); + + assert_eq!( + primary + .handle_heartbeat_success(Some(&cluster_state), &agent_config) + .await, + None + ); + assert_eq!( + primary + .handle_heartbeat_success(Some(&cluster_state), &agent_config) + .await, + Some(expected_state) + ); + } + + #[tokio::test] + async fn primary_stays_healthy_below_failure_threshold() { + let (mut primary, cluster_state, agent_config) = default_test_state(1, 2); + + // Reach healthy + let _ = primary + .handle_heartbeat_success(Some(&cluster_state), &agent_config) + .await; + assert_eq!(primary.state, PrimaryState::Healthy); + + // One failure below threshold + primary.handle_heartbeat_failure(Some(&cluster_state)).await; + assert_eq!(primary.state, PrimaryState::Healthy); + assert_eq!(primary.consecutive_failures(), 1); + assert_eq!(primary.consecutive_successes(), 0); + } + + #[tokio::test] + async fn primary_transitions_to_failed_at_failure_threshold() { + let (mut primary, cluster_state, agent_config) = default_test_state(1, 2); + + // Reach healthy + let _ = primary + .handle_heartbeat_success(Some(&cluster_state), &agent_config) + .await; + assert_eq!(primary.state, PrimaryState::Healthy); + + // First failure, still healthy + primary.handle_heartbeat_failure(Some(&cluster_state)).await; + assert_eq!(primary.state, PrimaryState::Healthy); + assert_eq!(primary.consecutive_failures(), 1); + + // Second failure reaches threshold, transitions to Failed + primary.handle_heartbeat_failure(Some(&cluster_state)).await; + assert_eq!(primary.state, PrimaryState::Fenced); + assert_eq!(primary.consecutive_failures(), 2); + assert_eq!(primary.consecutive_successes(), 0); + } + + fn default_test_state( + success_threshold: usize, + failure_threshold: usize, + ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) { + let cluster_state = crate::agent::ClusterStateData { + cluster_id: Id::empty(), + current_primary: None, + desired_primary: Id::empty(), + timestamp: 0, + }; + + let agent_config = AgentConfig { + success_threshold, + failure_threshold, + heartbeat_interval: Duration::from_nanos(0), + failover_timeout: Duration::from_nanos(0), + deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig { + cnpg_cluster_name: "test".to_string(), + }), + nats_url: String::new(), + nats_creds_path: None, + agent_id: Id::empty(), + cluster_id: Id::empty(), + desired_primary_id: Id::empty(), + role: AgentRole::Primary, + }; + + let primary = PrimaryWorkflow::new( + agent_config.success_threshold, + agent_config.failure_threshold, + agent_config.deployment_config_unstable.clone(), + ); + + (primary, cluster_state, agent_config) + } +} -- 2.39.5 From a08c3fb03b55a9502068647e2332c943a0c8e7d4 Mon Sep 17 00:00:00 2001 From: wjro Date: Wed, 4 Feb 2026 11:47:11 -0500 Subject: [PATCH 17/19] wip: save new cluster info state --- harmony_agent/src/agent/heartbeat.rs | 6 +++ harmony_agent/src/agent/mod.rs | 72 +++++++++++++++++++-------- harmony_agent/src/store/mod.rs | 2 +- harmony_agent/src/workflow/mod.rs | 2 +- harmony_agent/src/workflow/primary.rs | 23 +++++---- 5 files changed, 72 insertions(+), 33 deletions(-) diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs index f101656c..ab5697b1 100644 --- a/harmony_agent/src/agent/heartbeat.rs +++ b/harmony_agent/src/agent/heartbeat.rs @@ -20,6 +20,12 @@ pub struct AgentHeartbeat { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ClusterStateData { + pub cluster_info: ClusterInfo, + pub metadata: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct ClusterInfo { pub cluster_id: Id, pub current_primary: Option, pub desired_primary: Id, diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs index 558debd7..480b7713 100644 --- a/harmony_agent/src/agent/mod.rs +++ b/harmony_agent/src/agent/mod.rs @@ -2,10 +2,11 @@ use std::time::{SystemTime, UNIX_EPOCH}; use std::{str::FromStr, sync::Arc, time::Duration}; use harmony_types::id::Id; -use log::{debug, info, trace, warn}; +use log::{debug, error, info, trace, warn}; use tokio::sync::RwLock; use tokio::time::{Instant, sleep}; +use crate::agent::heartbeat::ClusterInfo; use crate::store::{KvMetadata, KvStore, KvStoreError}; use crate::workflow::HeartbeatWorkflow; use crate::workflow::primary::PrimaryWorkflow; @@ -13,7 +14,7 @@ use crate::workflow::replica::ReplicaWorkflow; // Submodules mod config; -mod heartbeat; +pub mod heartbeat; mod role; // Re-exports for backwards compatibility @@ -216,12 +217,12 @@ impl HarmonyAgent { .value .expect("When key exist it should always contain data"); Some(AgentHeartbeat { - agent_info: serde_json::from_value::(value.clone()).map_err(|e| { - KvStoreError::DeserializationFailed { + agent_info: serde_json::from_value::(value.clone()).map_err( + |e| KvStoreError::DeserializationFailed { deserialization_error: e.to_string(), value: value.to_string(), - } - })?, + }, + )?, metadata: Some(kv_result.metadata), }) } @@ -230,7 +231,7 @@ impl HarmonyAgent { _ => return Err(e), }, }; - if let Some(heartbeat) = &last_heartbeat{ + if let Some(heartbeat) = &last_heartbeat { debug!( "Found existing heartbeat with sequence: {}", heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0) @@ -247,34 +248,46 @@ impl HarmonyAgent { async fn store_cluster_state( &self, - cluster_state_data: Option, + cluster_data: Option, ) -> Result { let key = format!("cluster.{}", self.config.cluster_id); - match cluster_state_data { - Some(state) => { - let value = serde_json::to_value(&state).map_err(|e| { + match cluster_data { + Some(cluster_data) => { + debug!("found some cluster state {:#?}", cluster_data); + + let value = serde_json::to_value(&cluster_data).map_err(|e| { KvStoreError::DeserializationFailed { deserialization_error: e.to_string(), - value: format!("{:?}", state), + value: format!("{:?}", cluster_data), } })?; let expected_sequence = { - let last = self.last_heartbeat.read().await; + let last = self.cluster_state.read().await; last.as_ref() .and_then(|hb| hb.metadata.as_ref()) .map(|m| m.sequence) .unwrap_or(0) }; - self.cluster_kv + debug!("expected sequence {:#?}", expected_sequence); + let new_seq = self + .cluster_kv .set_strict(&key, value, expected_sequence) .await?; - Ok(state) + let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?; + + let cluster_data_new = ClusterStateData { + cluster_info: cluster_data.cluster_info.clone(), + metadata: Some(cluster_kv_result.metadata), + }; + + *self.cluster_state.write().await = Some(cluster_data_new.clone()); + Ok(cluster_data) } None => { - let cluster_data = ClusterStateData { + let cluster_info = ClusterInfo { cluster_id: self.config.cluster_id.clone(), current_primary: None, desired_primary: self.config.desired_primary_id.clone(), @@ -284,13 +297,20 @@ impl HarmonyAgent { .as_millis() as u64, }; - let value = serde_json::to_value(&cluster_data).map_err(|e| { + let value = serde_json::to_value(&cluster_info).map_err(|e| { KvStoreError::DeserializationFailed { deserialization_error: e.to_string(), - value: format!("{:?}", cluster_data), + value: format!("{:?}", cluster_info), } })?; + + let cluster_data = ClusterStateData { + cluster_info, + metadata: None, + }; + self.cluster_kv.set_strict(&key, value, 0).await?; + *self.cluster_state.write().await = Some(cluster_data.clone()); Ok(cluster_data) } } @@ -434,14 +454,24 @@ impl HarmonyAgent { trace!("Got heartbeat_result : {heartbeat_result:?}"); match heartbeat_result { Ok(_) => { - let new_state = self.workflow.handle_heartbeat_success(self.cluster_state.read().await.as_ref(), &self.config).await; + let new_state = self + .workflow + .handle_heartbeat_success( + self.cluster_state.read().await.as_ref(), + &self.config, + ) + .await; if let Some(new_state) = new_state { warn!("Got new cluster state : {new_state:#?}"); - todo!("Got new state, save it"); + self.store_cluster_state(Some(new_state)) + .await + .expect(&format!("cluster state not able to be stored")); } } Err(_) => { - self.workflow.handle_heartbeat_failure(self.cluster_state.read().await.as_ref()).await; + self.workflow + .handle_heartbeat_failure(self.cluster_state.read().await.as_ref()) + .await; } } diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs index 08879d1a..617df34c 100644 --- a/harmony_agent/src/store/mod.rs +++ b/harmony_agent/src/store/mod.rs @@ -12,7 +12,7 @@ pub struct SubscriptionHandle { /// Metadata returned by the KV store for all operations /// Contains timing and ordering information set by the store -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct KvMetadata { /// Timestamp set by the store (milliseconds since UNIX epoch) pub timestamp: u64, diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs index e037e194..8696e071 100644 --- a/harmony_agent/src/workflow/mod.rs +++ b/harmony_agent/src/workflow/mod.rs @@ -24,7 +24,7 @@ pub trait HeartbeatWorkflow: Send + Sync { async fn on_startup( &self, - cluster_state: Option<&crate::agent::ClusterStateData>, + cluster_state: Option<&crate::agent::heartbeat::ClusterStateData>, agent_config: &AgentConfig, ); diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index 890068be..80242cc6 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -80,7 +80,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow { if let Some(state) = cluster_state { info!( "Startup reconciliation: current primary is {:?}, desired primary is {:?}", - state.current_primary, state.desired_primary + state.cluster_info.current_primary, state.cluster_info.desired_primary ); let key = format!("heartbeat.{}", agent_config.agent_id.clone()); @@ -115,10 +115,11 @@ impl HeartbeatWorkflow for PrimaryWorkflow { config.on_active().await; }); if let Some(state) = cluster_state - && state.desired_primary == agent_config.desired_primary_id + && state.cluster_info.desired_primary == agent_config.desired_primary_id { let mut new_state = state.clone(); - new_state.current_primary = Some(agent_config.agent_id.clone()); + new_state.cluster_info.current_primary = + Some(agent_config.agent_id.clone()); return Some(new_state); } else { todo!( @@ -210,9 +211,8 @@ impl HeartbeatWorkflow for PrimaryWorkflow { #[cfg(test)] mod test { - use std::time::Duration; - use harmony_types::id::Id; + use std::time::Duration; use crate::agent::{AgentRole, FailoverCNPGConfig}; @@ -237,7 +237,7 @@ mod test { let (mut primary, cluster_state, agent_config) = default_test_state(2, 2); let mut expected_state = cluster_state.clone(); - expected_state.current_primary = Some(Id::empty()); + expected_state.cluster_info.current_primary = Some(Id::empty()); assert_eq!( primary @@ -297,10 +297,13 @@ mod test { failure_threshold: usize, ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) { let cluster_state = crate::agent::ClusterStateData { - cluster_id: Id::empty(), - current_primary: None, - desired_primary: Id::empty(), - timestamp: 0, + cluster_info: crate::agent::heartbeat::ClusterInfo { + cluster_id: Id::empty(), + current_primary: None, + desired_primary: Id::empty(), + timestamp: 0, + }, + metadata: None, }; let agent_config = AgentConfig { -- 2.39.5 From de14ba6b97de34188a1cfc00699ad43ef5ec348a Mon Sep 17 00:00:00 2001 From: Jean-Gabriel Gill-Couture Date: Wed, 4 Feb 2026 12:10:33 -0500 Subject: [PATCH 18/19] fix(agent): fetch from store returns metadata to allow rebuilding states properly --- harmony_agent/src/agent/heartbeat.rs | 5 ++--- harmony_agent/src/agent/mod.rs | 19 +++++++++---------- harmony_agent/src/workflow/primary.rs | 4 ++-- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs index ab5697b1..5e9fc36f 100644 --- a/harmony_agent/src/agent/heartbeat.rs +++ b/harmony_agent/src/agent/heartbeat.rs @@ -20,16 +20,15 @@ pub struct AgentHeartbeat { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ClusterStateData { - pub cluster_info: ClusterInfo, + pub cluster_info: ClusterState, pub metadata: Option, } #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] -pub struct ClusterInfo { +pub struct ClusterState { pub cluster_id: Id, pub current_primary: Option, pub desired_primary: Id, - pub timestamp: u64, } #[derive(Debug)] diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs index 480b7713..ba953a09 100644 --- a/harmony_agent/src/agent/mod.rs +++ b/harmony_agent/src/agent/mod.rs @@ -6,7 +6,7 @@ use log::{debug, error, info, trace, warn}; use tokio::sync::RwLock; use tokio::time::{Instant, sleep}; -use crate::agent::heartbeat::ClusterInfo; +use crate::agent::heartbeat::ClusterState; use crate::store::{KvMetadata, KvStore, KvStoreError}; use crate::workflow::HeartbeatWorkflow; use crate::workflow::primary::PrimaryWorkflow; @@ -132,7 +132,7 @@ impl HarmonyAgent { &self, store: &Arc, key: &str, - ) -> Result, KvStoreError> + ) -> Result, KvStoreError> where D: serde::de::DeserializeOwned, { @@ -145,7 +145,7 @@ impl HarmonyAgent { Ok(kv_result) => { if let Some(value) = kv_result.value { match serde_json::from_value::(value.clone()) { - Ok(data) => Ok(Some(data)), + Ok(data) => Ok(Some((data, kv_result.metadata))), Err(e) => { log::warn!("Failed to deserialize data from key {}: {}", key, e); Err(KvStoreError::DeserializationFailed { @@ -185,10 +185,13 @@ impl HarmonyAgent { ); let cluster_state_option = match self - .fetch_from_store::(&self.cluster_kv, &cluster_key) + .fetch_from_store::(&self.cluster_kv, &cluster_key) .await? { - Some(data) => Some(data), + Some((data, metadata)) => Some(ClusterStateData { + cluster_info: data, + metadata: Some(metadata), + }), None => { debug!( "Cluster state key not found, this is a fresh cluster, initializing cluster state" @@ -287,14 +290,10 @@ impl HarmonyAgent { Ok(cluster_data) } None => { - let cluster_info = ClusterInfo { + let cluster_info = ClusterState { cluster_id: self.config.cluster_id.clone(), current_primary: None, desired_primary: self.config.desired_primary_id.clone(), - timestamp: SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("Time went backwards") - .as_millis() as u64, }; let value = serde_json::to_value(&cluster_info).map_err(|e| { diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index 80242cc6..87ab3391 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use log::{debug, info, trace, warn}; use crate::{ - agent::{AgentConfig, ClusterStateData, DeploymentConfig}, + agent::{AgentConfig, DeploymentConfig}, workflow::HeartbeatWorkflow, }; @@ -297,7 +297,7 @@ mod test { failure_threshold: usize, ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) { let cluster_state = crate::agent::ClusterStateData { - cluster_info: crate::agent::heartbeat::ClusterInfo { + cluster_info: crate::agent::heartbeat::ClusterState { cluster_id: Id::empty(), current_primary: None, desired_primary: Id::empty(), -- 2.39.5 From 7ca1a64038582d2766f571836eedb8acf21c9d8c Mon Sep 17 00:00:00 2001 From: wjro Date: Wed, 4 Feb 2026 15:56:40 -0500 Subject: [PATCH 19/19] feat: completed harmony_agent implentation for primary and replica agents, fixed a test --- Cargo.lock | 21 +------------------ .../src/modules/application/backend_app.rs | 19 ++++++++--------- harmony/src/modules/application/config.rs | 1 - harmony/src/modules/application/mod.rs | 2 +- harmony/src/modules/application/oci.rs | 10 ++++----- harmony/src/modules/application/rust.rs | 1 - harmony_agent/src/agent/config.rs | 2 +- harmony_agent/src/agent/mod.rs | 19 ++++++++++++----- harmony_agent/src/main.rs | 5 ++++- harmony_agent/src/store/chaos.rs | 2 +- harmony_agent/src/store/memory.rs | 4 +++- harmony_agent/src/workflow/primary.rs | 7 ++----- harmony_agent/src/workflow/replica.rs | 8 ++++--- harmony_execution/src/lib.rs | 3 +-- 14 files changed, 47 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 14295673..2816bba2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2657,6 +2657,7 @@ dependencies = [ "harmony_macros", "harmony_types", "log", + "pretty_assertions", "serde", "serde_json", "thiserror 2.0.16", @@ -3586,26 +3587,6 @@ dependencies = [ "thiserror 1.0.69", ] -[[package]] -name = "json-prompt" -version = "0.1.0" -dependencies = [ - "brocade", - "cidr", - "env_logger", - "harmony", - "harmony_cli", - "harmony_macros", - "harmony_secret", - "harmony_secret_derive", - "harmony_types", - "log", - "schemars 0.8.22", - "serde", - "tokio", - "url", -] - [[package]] name = "jsonpath-rust" version = "0.7.5" diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs index 804af46d..d11feaa9 100644 --- a/harmony/src/modules/application/backend_app.rs +++ b/harmony/src/modules/application/backend_app.rs @@ -11,7 +11,7 @@ use crate::{ helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind}, }, }; -use harmony_execution::{run_command, RunnerOptions}; +use harmony_execution::{RunnerOptions, run_command}; #[derive(Debug, Clone, Serialize)] pub struct BuildCommand { @@ -100,15 +100,14 @@ impl OCICompliant for BackendApp { // Run docker build command, streaming output to console and capturing it let output = run_command( - std::process::Command::new("docker") - .args([ - "build", - "-t", - &image_tag, - "-f", - &dockerfile.to_string_lossy(), - &self.project_root.to_string_lossy(), - ]), + std::process::Command::new("docker").args([ + "build", + "-t", + &image_tag, + "-f", + &dockerfile.to_string_lossy(), + &self.project_root.to_string_lossy(), + ]), RunnerOptions::print_to_console(), ) .map_err(|e| format!("Failed to spawn docker build process: {}", e))?; diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs index 8d074271..9c529f1d 100644 --- a/harmony/src/modules/application/config.rs +++ b/harmony/src/modules/application/config.rs @@ -15,7 +15,6 @@ impl NetworkProtocol { } } - impl std::fmt::Display for NetworkProtocol { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(self.as_str()) diff --git a/harmony/src/modules/application/mod.rs b/harmony/src/modules/application/mod.rs index 00e85843..13da0840 100644 --- a/harmony/src/modules/application/mod.rs +++ b/harmony/src/modules/application/mod.rs @@ -2,10 +2,10 @@ pub mod backend_app; pub mod config; mod feature; pub mod features; +pub mod helm; pub mod oci; mod rust; mod webapp; -pub mod helm; use std::sync::Arc; pub use feature::*; diff --git a/harmony/src/modules/application/oci.rs b/harmony/src/modules/application/oci.rs index 102bcd8c..63e1c208 100644 --- a/harmony/src/modules/application/oci.rs +++ b/harmony/src/modules/application/oci.rs @@ -1,6 +1,9 @@ use std::path::{Path, PathBuf}; -use crate::{config::{REGISTRY_PROJECT, REGISTRY_URL}, modules::application::check_output}; +use crate::{ + config::{REGISTRY_PROJECT, REGISTRY_URL}, + modules::application::check_output, +}; use super::Application; use async_trait::async_trait; @@ -22,10 +25,7 @@ pub trait HelmPackage: Application { /// # Arguments /// * `image_url` - The full URL of the OCI container image to be used in the Deployment. /// * `domain` - The domain where the application is hosted. - async fn build_push_helm_package( - &self, - image_url: &str, - ) -> Result; + async fn build_push_helm_package(&self, image_url: &str) -> Result; fn project_root(&self) -> PathBuf; diff --git a/harmony/src/modules/application/rust.rs b/harmony/src/modules/application/rust.rs index 4e41187c..7e3413bb 100644 --- a/harmony/src/modules/application/rust.rs +++ b/harmony/src/modules/application/rust.rs @@ -632,7 +632,6 @@ spec: Ok(chart_dir) } - fn get_or_build_dockerfile(&self) -> Result> { let existing_dockerfile = self.project_root.join("Dockerfile"); diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs index 0862da67..86b731cf 100644 --- a/harmony_agent/src/agent/config.rs +++ b/harmony_agent/src/agent/config.rs @@ -3,8 +3,8 @@ use std::time::Duration; use harmony_types::id::Id; use log::info; -use super::role::AgentRole; use super::heartbeat::HeartbeatFailure; +use super::role::AgentRole; #[derive(Debug, Clone)] pub struct AgentConfig { diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs index ba953a09..3291aeaa 100644 --- a/harmony_agent/src/agent/mod.rs +++ b/harmony_agent/src/agent/mod.rs @@ -207,7 +207,6 @@ impl HarmonyAgent { // Cache the cluster state locally *self.cluster_state.write().await = cluster_state_option; - // Fetch last heartbeat if it exists to avoid sequence conflicts let heartbeat_key = format!("heartbeat.{}", self.config.agent_id); debug!("Fetching last heartbeat from key: {}", heartbeat_key); @@ -258,7 +257,7 @@ impl HarmonyAgent { Some(cluster_data) => { debug!("found some cluster state {:#?}", cluster_data); - let value = serde_json::to_value(&cluster_data).map_err(|e| { + let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| { KvStoreError::DeserializationFailed { deserialization_error: e.to_string(), value: format!("{:?}", cluster_data), @@ -280,6 +279,7 @@ impl HarmonyAgent { .await?; let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?; + debug!("cluster kv {:#?}", cluster_kv_result); let cluster_data_new = ClusterStateData { cluster_info: cluster_data.cluster_info.clone(), @@ -308,9 +308,18 @@ impl HarmonyAgent { metadata: None, }; - self.cluster_kv.set_strict(&key, value, 0).await?; - *self.cluster_state.write().await = Some(cluster_data.clone()); - Ok(cluster_data) + let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?; + + let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?; + debug!("cluster kv {:#?}", cluster_kv_result); + + let cluster_data_new = ClusterStateData { + cluster_info: cluster_data.cluster_info.clone(), + metadata: Some(cluster_kv_result.metadata), + }; + + *self.cluster_state.write().await = Some(cluster_data_new.clone()); + Ok(cluster_data_new) } } } diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs index 8eda5d1d..a5947c22 100644 --- a/harmony_agent/src/main.rs +++ b/harmony_agent/src/main.rs @@ -1,6 +1,9 @@ use std::{sync::Arc, time::Duration}; -use crate::{agent::AgentRole, store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}}; +use crate::{ + agent::AgentRole, + store::{ChaosKvStore, InMemoryKvStore, NatsKvStore}, +}; // mod agent_loop; mod agent; diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs index 9fa6fc83..402cf8f7 100644 --- a/harmony_agent/src/store/chaos.rs +++ b/harmony_agent/src/store/chaos.rs @@ -113,7 +113,7 @@ mod tests { #[tokio::test] async fn test_chaos_store_with_no_chaos() { let inner = InMemoryKvStore::new(); - let chaos = ChaosKvStore::new(inner, 0, 0, 0); + let chaos = ChaosKvStore::new(inner, 0, 0, 1); let value = json!({"test": "value"}); let result = chaos.set_strict("key", value.clone(), 0).await.unwrap(); diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs index 150b7225..12afc51c 100644 --- a/harmony_agent/src/store/memory.rs +++ b/harmony_agent/src/store/memory.rs @@ -91,6 +91,8 @@ impl KvStore for InMemoryKvStore { ) -> Result { // Check current sequence (length of history for this key) let data = self.data.read().await; + // This implemenetation does not seem to match the NATS sequence. In nats the + // sequence updates one counter per bucket. This impl creates a counter per key let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0); drop(data); @@ -163,7 +165,7 @@ mod tests { let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap(); - let seq2 = store.set_strict("key2", json!("value2"), 0).await.unwrap(); + let seq2 = store.set_strict("key1", json!("value2"), 1).await.unwrap(); assert!(seq2 > seq1, "Sequence numbers should increment"); } diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs index 87ab3391..61f25556 100644 --- a/harmony_agent/src/workflow/primary.rs +++ b/harmony_agent/src/workflow/primary.rs @@ -83,9 +83,6 @@ impl HeartbeatWorkflow for PrimaryWorkflow { state.cluster_info.current_primary, state.cluster_info.desired_primary ); - let key = format!("heartbeat.{}", agent_config.agent_id.clone()); - // let hb = health_kv.get(&key); - // No automatic fast-tracking - agent must earn healthy status // through successful heartbeats. This prevents duplicate agents // or crashloop agents from incorrectly claiming primary. @@ -117,6 +114,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow { if let Some(state) = cluster_state && state.cluster_info.desired_primary == agent_config.desired_primary_id { + debug!("state {:#?}", state); let mut new_state = state.clone(); new_state.cluster_info.current_primary = Some(agent_config.agent_id.clone()); @@ -142,7 +140,7 @@ impl HeartbeatWorkflow for PrimaryWorkflow { PrimaryState::Healthy => { // Stay healthy debug!("Primary staying healthy"); - todo!() + None } PrimaryState::Fenced => { // Recovery from fenced state @@ -301,7 +299,6 @@ mod test { cluster_id: Id::empty(), current_primary: None, desired_primary: Id::empty(), - timestamp: 0, }, metadata: None, }; diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs index 90ddd341..5c86bde7 100644 --- a/harmony_agent/src/workflow/replica.rs +++ b/harmony_agent/src/workflow/replica.rs @@ -1,6 +1,6 @@ use async_trait::async_trait; use harmony_types::id::Id; -use log::{info, trace, warn}; +use log::{debug, error, info, trace, warn}; use std::time::Duration; use tokio::sync::RwLock; @@ -184,15 +184,17 @@ impl HeartbeatWorkflow for ReplicaWorkflow { // TODO: Check primary staleness from NATS trace!("Replica watching primary"); if self.is_primary_stale().await { - warn!("Found stale primary, launching promotion"); + panic!("Found stale primary, launching promotion"); } - todo!("perform the replica watch actions : + debug!("perform the replica watch actions : - if a primary exists in the cluster (cluster_state.current_primary == expected_primary) - check the last primary heartbeat kv timestamp - compare it with our latest kv heartbeat - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself) - launching promotion will change the status of the replica "); + + None } ReplicaState::Promoting => { // TODO: Complete promotion attempt diff --git a/harmony_execution/src/lib.rs b/harmony_execution/src/lib.rs index 65fdf663..c96cddfe 100644 --- a/harmony_execution/src/lib.rs +++ b/harmony_execution/src/lib.rs @@ -1,6 +1,5 @@ pub mod command; pub use command::{ - run_command, run, run_silent, - CommandOutput, CommandStatus, CommandError, RunnerOptions, + CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent, }; -- 2.39.5