diff --git a/.dockerignore b/.dockerignore index 2233067c..34513768 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,6 @@ target/ -Dockerfile \ No newline at end of file +Dockerfile +.git +data +target +demos diff --git a/.gitignore b/.gitignore index 3850d09a..3bb0cc1b 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ Cargo.lock # MSVC Windows builds of rustc generate these, which store debugging information *.pdb + +.harmony_generated diff --git a/Cargo.lock b/Cargo.lock index c53f46c8..ed76c0ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -450,6 +450,43 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-nats" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-util", + "memchr", + "nkeys", + "nuid", + "once_cell", + "pin-project", + "portable-atomic", + "rand 0.8.5", + "regex", + "ring", + "rustls-native-certs 0.7.3", + "rustls-pemfile 2.2.0", + "rustls-webpki 0.102.8", + "serde", + "serde_json", + "serde_nanos", + "serde_repr", + "thiserror 1.0.69", + "time", + "tokio", + "tokio-rustls 0.26.2", + "tokio-stream", + "tokio-util", + "tokio-websockets", + "tracing", + "tryhard", + "url", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -775,6 +812,9 @@ name = "bytes" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +dependencies = [ + "serde", +] [[package]] name = "bytestring" @@ -1583,6 +1623,7 @@ dependencies = [ "rand_core 0.6.4", "serde", "sha2", + "signature", "subtle", "zeroize", ] @@ -2456,21 +2497,21 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "js-sys", "libc", "r-efi", - "wasi 0.14.3+wasi-0.2.4", + "wasip2", "wasm-bindgen", ] @@ -2572,6 +2613,7 @@ dependencies = [ "env_logger", "fqdn", "futures-util", + "harmony_execution", "harmony_inventory_agent", "harmony_macros", "harmony_secret", @@ -2619,6 +2661,43 @@ dependencies = [ "walkdir", ] +[[package]] +name = "harmony_agent" +version = "0.1.0" +dependencies = [ + "async-nats", + "async-trait", + "cidr", + "env_logger", + "getrandom 0.3.4", + "harmony", + "harmony_macros", + "harmony_types", + "log", + "pretty_assertions", + "serde", + "serde_json", + "thiserror 2.0.16", + "tokio", +] + +[[package]] +name = "harmony_agent_deploy" +version = "0.1.0" +dependencies = [ + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "log", + "serde", + "serde_json", + "tokio", + "url", +] + [[package]] name = "harmony_cli" version = "0.1.0" @@ -2659,6 +2738,16 @@ dependencies = [ "tokio", ] +[[package]] +name = "harmony_execution" +version = "0.1.0" +dependencies = [ + "directories", + "lazy_static", + "log", + "thiserror 2.0.16", +] + [[package]] name = 
"harmony_inventory_agent" version = "0.1.0" @@ -3523,7 +3612,7 @@ version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "libc", ] @@ -3963,7 +4052,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "windows-sys 0.48.0", ] @@ -3975,7 +4064,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" dependencies = [ "libc", "log", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "windows-sys 0.59.0", ] @@ -4022,6 +4111,21 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "nkeys" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf" +dependencies = [ + "data-encoding", + "ed25519", + "ed25519-dalek", + "getrandom 0.2.16", + "log", + "rand 0.8.5", + "signatory", +] + [[package]] name = "non-blank-string-rs" version = "1.0.4" @@ -4040,6 +4144,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "nuid" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83" +dependencies = [ + "rand 0.8.5", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -4660,7 +4773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", - "getrandom 0.3.3", + "getrandom 0.3.4", "lru-slab", "rand 0.9.2", "ring", @@ -4765,7 +4878,7 @@ version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", ] [[package]] @@ -5301,6 +5414,16 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustls-webpki" version = "0.103.4" @@ -5564,6 +5687,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_nanos" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985" +dependencies = [ + "serde", +] + [[package]] name = "serde_path_to_error" version = "0.1.17" @@ -5731,6 +5863,18 @@ dependencies = [ "libc", ] +[[package]] +name = "signatory" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31" +dependencies = [ + "pkcs8", + "rand_core 0.6.4", + "signature", + "zeroize", +] + [[package]] name = "signature" version = "2.2.0" @@ -6314,7 +6458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "rustix 1.0.8", "windows-sys 0.60.2", @@ -6538,6 +6682,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-websockets" +version = "0.10.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "futures-sink", + "http 1.3.1", + "httparse", + "rand 0.8.5", + "ring", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.2", + "tokio-util", + "webpki-roots 0.26.11", +] + [[package]] name = "toml" version = "0.8.23" @@ -6689,6 +6854,16 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tryhard" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tui-logger" version = "0.14.5" @@ -6865,7 +7040,7 @@ version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "js-sys", "rand 0.9.2", "uuid-macro-internal", @@ -6936,10 +7111,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] -name = "wasi" -version = "0.14.3+wasi-0.2.4" +name = "wasip2" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] @@ -7061,6 +7236,15 @@ version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.2", +] + [[package]] name = "webpki-roots" version = "1.0.2" @@ -7438,9 +7622,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.45.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "writeable" diff --git a/Cargo.toml b/Cargo.toml index a256234f..18a0ff9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "harmony_types", "harmony_macros", "harmony_tui", + "harmony_execution", "opnsense-config", "opnsense-config-xml", "harmony_cli", @@ -17,6 +18,8 @@ members = [ "harmony_secret", "adr/agent_discovery/mdns", "brocade", + "harmony_agent", + "harmony_agent/deploy", ] [workspace.package] diff --git a/README.md b/README.md index 4ccdae73..f4f13ec2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Harmony : Open-source infrastructure orchestration that treats your platform like first-class code +In other words, Harmony is a **next-generation platform engineering framework**. 
+
 _By [NationTech](https://nationtech.io)_
 
 [![Build](https://git.nationtech.io/NationTech/harmony/actions/workflows/check.yml/badge.svg)](https://git.nationtech.io/nationtech/harmony)
diff --git a/adr/018-Template-Hydration-For-Workload-Deployment.md b/adr/018-Template-Hydration-For-Workload-Deployment.md
new file mode 100644
index 00000000..cd45ed97
--- /dev/null
+++ b/adr/018-Template-Hydration-For-Workload-Deployment.md
@@ -0,0 +1,141 @@
+# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
+
+Initial Authors: Jean-Gabriel Gill-Couture & Sylvain Tremblay
+
+Initial Date: 2025-01-23
+
+Last Updated Date: 2025-01-23
+
+## Status
+
+Implemented
+
+## Context
+
+Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as far left as possible—ideally to compile time—rather than discovering errors at deploy time.
+
+After investigating approaches such as compile-checked Askama templates for generating Kubernetes manifests for Helm charts, we found that this approach suffered from several fundamental limitations:
+
+* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
+* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
+* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
+* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
+
+We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
+
+## Decision
+
+We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
+
+Specifically:
+
+* **Write strongly typed `k8s_openapi` structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
+* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML serves purely as a data-transfer format rather than as the templating/programming language it was never designed to be.
+* **Helm as Packaging-Only:** Helm's role is reduced to packaging the pre-rendered manifests into a tarball and pushing it to OCI registries. No template rendering logic resides within Helm.
+* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
+ +The implementation in `backend_app.rs` demonstrates this pattern: + +```rust +let deployment = Deployment { + metadata: ObjectMeta { + name: Some(self.name.clone()), + labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()), + ..Default::default() + }, + spec: Some(DeploymentSpec { /* ... */ }), + ..Default::default() +}; + +let deployment_yaml = serde_yaml::to_string(&deployment)?; +fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?; +``` + +## Rationale + +**Aligns with "Infrastructure as Resilient Code"** + +Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain: + +* **Refactorability:** Rename a label and the compiler catches all usages. +* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline. +* **Code Navigation:** Jump to definition shows exactly where a value comes from. + +**Achieves "Prove It Works — Before You Deploy"** + +The compiler now validates that: + +* All required fields are populated (Rust's `Option` type prevents missing fields). +* Field types match expectations (ports are integers, not strings). +* Enums contain valid values (e.g., `ServiceType::ClusterIP`). + +This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise. + +**Enables True Unit Testing** + +Developers can now write unit tests that assert directly against typed objects: + +```rust +let deployment = create_deployment(&app); +assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3); +assert_eq!(deployment.metadata.name.unwrap(), "my-app"); +``` + +No string parsing, no YAML serialization, no fragile assertions against rendered output. + +**Preserves Ecosystem Benefits** + +By generating standard Helm chart structures, Harmony retains compatibility with: + +* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before. +* **ArgoCD:** Syncs and manages releases using the generated charts. +* **Existing Workflows:** Teams already consuming Helm charts see no change. + +The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role. + +## Consequences + +### Positive + +* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time. +* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests. +* **Testability:** Unit tests can validate manifest structure without integration or runtime checks. +* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files. +* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing. + +### Negative + +* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated. +* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable. +* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML. 
+* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise, but a different mental model.
+
+## Alternatives Considered
+
+### 1. Enhance Askama with Compile-Time Validation
+*Pros:* Stay within the familiar templating paradigm; minimal code changes.
+*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not at compilation.
+
+### 2. Use Helm SDK Programmatically (Go)
+*Pros:* Direct access to Helm's template engine; no YAML serialization step.
+*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
+
+### 3. Raw YAML String Templating (Manual)
+*Pros:* Maximum control; no external dependencies.
+*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
+
+### 4. Use Kustomize for All Manifests
+*Pros:* Declarative overlays; standard tool.
+*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
+
+__Note that this template hydration architecture still allows overriding the generated templates with tools like Kustomize when required.__
+
+## Additional Notes
+
+**Scalability to Future Topologies**
+
+The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
+
+**Implementation Status**
+
+As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend the pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for new implementations.
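+
+**Illustrative Sketch: Topology-Driven Manifest Adjustment**
+
+To make the "Scalability to Future Topologies" note concrete, here is a minimal sketch of such an adjustment, using only `k8s_openapi` types. The `CostPolicy` struct and its `max_replicas` field are illustrative assumptions, not an existing Harmony API:
+
+```rust
+use k8s_openapi::api::apps::v1::Deployment;
+
+/// Hypothetical policy a topology could provide (illustrative only).
+struct CostPolicy {
+    max_replicas: i32,
+}
+
+/// Mutate the typed Deployment before it is serialized into the chart.
+fn apply_cost_policy(mut deployment: Deployment, policy: &CostPolicy) -> Deployment {
+    if let Some(spec) = deployment.spec.as_mut() {
+        // Clamp replicas to what the topology's pricing allows.
+        spec.replicas = Some(spec.replicas.unwrap_or(1).min(policy.max_replicas));
+    }
+    deployment
+}
+```
+
+Because the manifest is still a plain struct at this point, such a policy composes with the builders before `serde_yaml::to_string` runs.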
diff --git a/brocade/examples/main.rs b/brocade/examples/main.rs
index 05d74b6a..ae47de5e 100644
--- a/brocade/examples/main.rs
+++ b/brocade/examples/main.rs
@@ -1,7 +1,7 @@
 use std::net::{IpAddr, Ipv4Addr};
 
 use brocade::{BrocadeOptions, ssh};
-use harmony_secret::{Secret, SecretManager};
+use harmony_secret::Secret;
 use harmony_types::switch::PortLocation;
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
diff --git a/examples/openbao/src/main.rs b/examples/openbao/src/main.rs
index 63918b81..ab8c0efa 100644
--- a/examples/openbao/src/main.rs
+++ b/examples/openbao/src/main.rs
@@ -56,6 +56,8 @@ async fn main() {
         )),
     };
 
+    // TODO exec pod commands to initialize secret store if not already done
+
     harmony_cli::run(
         Inventory::autoload(),
         K8sAnywhereTopology::from_env(),
diff --git a/harmony/Cargo.toml b/harmony/Cargo.toml
index b4a0bed6..d1542778 100644
--- a/harmony/Cargo.toml
+++ b/harmony/Cargo.toml
@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
 opnsense-config-xml = { path = "../opnsense-config-xml" }
 harmony_macros = { path = "../harmony_macros" }
 harmony_types = { path = "../harmony_types" }
+harmony_execution = { path = "../harmony_execution" }
 uuid.workspace = true
 url.workspace = true
 kube = { workspace = true, features = ["derive"] }
diff --git a/harmony/src/modules/application/backend_app.rs b/harmony/src/modules/application/backend_app.rs
new file mode 100644
index 00000000..d11feaa9
--- /dev/null
+++ b/harmony/src/modules/application/backend_app.rs
@@ -0,0 +1,801 @@
+use async_trait::async_trait;
+use log::{debug, info, trace};
+use serde::Serialize;
+use std::path::PathBuf;
+
+use crate::{
+    config::{REGISTRY_PROJECT, REGISTRY_URL},
+    modules::application::{
+        Application, HelmPackage, OCICompliant,
+        config::ApplicationNetworkPort,
+        helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
+    },
+};
+use harmony_execution::{RunnerOptions, run_command};
+
+#[derive(Debug, Clone, Serialize)]
+pub struct BuildCommand {
+    pub program: String,
+    pub args: Vec<String>,
+}
+
+impl BuildCommand {
+    pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
+        Self {
+            program: program.into(),
+            args: args.into_iter().map(|s| s.into()).collect(),
+        }
+    }
+
+    pub fn to_std_command(&self) -> std::process::Command {
+        let mut cmd = std::process::Command::new(&self.program);
+        cmd.args(&self.args);
+        cmd
+    }
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct BackendApp {
+    pub name: String,
+    pub project_root: std::path::PathBuf,
+    pub network_ports: Vec<ApplicationNetworkPort>,
+    pub env_vars: Vec<(String, String)>,
+    pub build_cmd: BuildCommand,
+    pub dockerfile: Option<PathBuf>,
+}
+
+impl BackendApp {
+    fn get_dockerfile(&self) -> Result<PathBuf, String> {
+        debug!(
+            "Looking for dockerfile, currently set to {:?}",
+            self.dockerfile
+        );
+        if let Some(dockerfile) = &self.dockerfile {
+            return match dockerfile.exists() {
+                true => {
+                    info!(
+                        "Found dockerfile as intended at {}",
+                        dockerfile.to_string_lossy()
+                    );
+                    Ok(dockerfile.clone())
+                }
+                false => Err(format!(
+                    "Dockerfile explicitly set to {dockerfile} does not exist",
+                    dockerfile = dockerfile.to_string_lossy()
+                )),
+            };
+        }
+
+        let existing_dockerfile = self.project_root.join("Dockerfile");
+
+        debug!("project_root = {:?}", self.project_root);
+
+        debug!("checking = {:?}", existing_dockerfile);
+        if existing_dockerfile.exists() {
+            debug!(
+                "Checking path {:#?} for existing Dockerfile",
+                self.project_root.clone()
+            );
+            return Ok(existing_dockerfile);
+        }
+        Err(format!(
+            "Could not find a Dockerfile in the {project_root} folder. Tried {existing_dockerfile}",
+            project_root = self.project_root.to_string_lossy(),
+            existing_dockerfile = existing_dockerfile.to_string_lossy(),
+        ))
+    }
+}
+
+impl Application for BackendApp {
+    fn name(&self) -> String {
+        self.name.clone()
+    }
+}
+
+#[async_trait]
+impl OCICompliant for BackendApp {
+    async fn build_push_oci_image(&self) -> Result<String, String> {
+        let dockerfile = self.get_dockerfile()?;
+        let image_tag = self.image_name();
+
+        // Run docker build command, streaming output to console and capturing it
+        let output = run_command(
+            std::process::Command::new("docker").args([
+                "build",
+                "-t",
+                &image_tag,
+                "-f",
+                &dockerfile.to_string_lossy(),
+                &self.project_root.to_string_lossy(),
+            ]),
+            RunnerOptions::print_to_console(),
+        )
+        .map_err(|e| format!("Failed to spawn docker build process: {}", e))?;
+
+        if output.is_success() {
+            info!("Docker image build succeeded");
+            Ok(image_tag)
+        } else {
+            Err(format!(
+                "Docker image build FAILED:\n{}",
+                output.format_output()
+            ))
+        }
+    }
+
+    fn local_image_name(&self) -> String {
+        self.name.clone()
+    }
+
+    fn image_name(&self) -> String {
+        format!(
+            "{}/{}/{}",
+            *REGISTRY_URL,
+            *REGISTRY_PROJECT,
+            &self.local_image_name()
+        )
+    }
+}
+
+#[async_trait]
+impl HelmPackage for BackendApp {
+    fn project_root(&self) -> PathBuf {
+        self.project_root.clone()
+    }
+
+    fn chart_name(&self) -> String {
+        self.name.clone()
+    }
+
+    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
+        let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());
+
+        // Build the typed Deployment object using the builder with initial options
+        helm_chart.add_resource(HelmResourceKind::Deployment(
+            DeploymentBuilder::with_options(
+                &self.name,
+                image_url,
+                Some(self.network_ports.clone()),
+                Some(self.env_vars.clone()),
+                None,
+            )
+            .build(),
+        ));
+
+        // Build the typed Service object using the helper function
+        if let Some(service) =
+            helm::create_service_from_ports(self.name.clone(), &self.network_ports)
+        {
+            helm_chart.add_resource(HelmResourceKind::Service(service));
+        }
+
+        // Write the Helm chart metadata to the project root
+        let chart_dir = helm_chart
+            .write_to(&self.project_root.join(".harmony_generated/helm/"))
+            .map_err(|e| format!("Failed to write Helm chart: {}", e))?;
+
+        info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);
+
+        Ok(chart_dir.to_string_lossy().to_string())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::modules::application::config::ApplicationNetworkPort;
+    use crate::modules::application::config::NetworkProtocol;
+    use k8s_openapi::api::apps::v1::Deployment;
+    use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
+    use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
+    use serde_yaml::from_str;
+    use std::fs;
+    use std::path::Path;
+    use tempfile::tempdir;
+
+    // Test Helpers
+    fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
+        let path = project_root.join(format!(
+            ".harmony_generated/helm/{chart_name}/templates/service.yaml"
+        ));
+        let content = fs::read_to_string(&path)
+            .unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
+        from_str(&content)
+            .unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
+    }
+
+    fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
+        let path = project_root.join(format!(
+            ".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
+        ));
+        let content =
fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e)); + from_str(&content) + .unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e)) + } + + fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool { + let path = project_root.join(format!( + ".harmony_generated/helm/{chart_name}/templates/service.yaml" + )); + path.exists() + } + + // Service Assertions + fn assert_service_metadata(service: &K8sService, expected_name: &str) { + assert_eq!( + service.metadata.name.as_deref(), + Some(expected_name), + "Service name should be '{expected_name}'" + ); + } + + fn assert_service_type(service: &K8sService, expected_type: &str) { + assert_eq!( + service.spec.as_ref().and_then(|s| s.type_.as_deref()), + Some(expected_type), + "Service type should be '{expected_type}'" + ); + } + + fn assert_service_port_count(service: &K8sService, expected_count: usize) { + let ports = service + .spec + .as_ref() + .and_then(|s| s.ports.as_ref()) + .unwrap_or_else(|| panic!("Service should have ports")); + assert_eq!( + ports.len(), + expected_count, + "Service should have {expected_count} ports" + ); + } + + fn assert_service_port( + port: &ServicePort, + expected_name: &str, + expected_protocol: &str, + expected_number: i32, + ) { + assert_eq!( + port.name.as_deref(), + Some(expected_name), + "Port name should be '{expected_name}'" + ); + assert_eq!( + port.protocol.as_deref(), + Some(expected_protocol), + "Port '{expected_name}' protocol should be '{expected_protocol}'" + ); + assert_eq!( + port.port, expected_number, + "Port '{expected_name}' number should be {expected_number}" + ); + } + + fn assert_target_port_matches_service_port(port: &ServicePort) { + match &port.target_port { + Some(IntOrString::Int(target)) => { + assert_eq!( + *target, + port.port, + "Target port should match service port for '{}'", + port.name.as_deref().unwrap_or("unknown") + ); + } + _ => panic!( + "Target port should be Int for '{}'", + port.name.as_deref().unwrap_or("unknown") + ), + } + } + + // Deployment Assertions + fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) { + assert_eq!( + deployment.metadata.name.as_deref(), + Some(expected_name), + "Deployment name should be '{expected_name}'" + ); + } + + fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) { + let spec = deployment + .spec + .as_ref() + .unwrap_or_else(|| panic!("Deployment should have spec")); + assert_eq!( + spec.replicas, + Some(expected_replicas), + "Deployment should have {expected_replicas} replicas" + ); + } + + fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) { + let spec = deployment + .spec + .as_ref() + .unwrap_or_else(|| panic!("Deployment should have spec")); + assert_eq!( + spec.selector + .match_labels + .as_ref() + .and_then(|m| m.get("app.kubernetes.io/name")), + Some(&expected_label_value.to_string()), + "Selector should match app name '{expected_label_value}'" + ); + } + + fn assert_pod_labels(deployment: &Deployment, expected_name: &str) { + let spec = deployment + .spec + .as_ref() + .unwrap_or_else(|| panic!("Deployment should have spec")); + let metadata = spec + .template + .metadata + .as_ref() + .unwrap_or_else(|| panic!("Pod template should have metadata")); + let labels = metadata + .labels + .as_ref() + .unwrap_or_else(|| panic!("Pod should have labels")); + + assert_eq!( + labels.get("app.kubernetes.io/name"), + Some(&expected_name.to_string()), + 
"Pod label app.kubernetes.io/name should be '{expected_name}'" + ); + assert_eq!( + labels.get("app.kubernetes.io/instance"), + Some(&expected_name.to_string()), + "Pod label app.kubernetes.io/instance should be '{expected_name}'" + ); + } + + // Container Assertions + fn assert_container_metadata( + container: &Container, + expected_name: &str, + expected_image: &str, + expected_pull_policy: &str, + ) { + assert_eq!( + container.name, expected_name, + "Container name should be '{expected_name}'" + ); + assert_eq!( + container.image.as_deref(), + Some(expected_image), + "Container image should be '{expected_image}'" + ); + assert_eq!( + container.image_pull_policy.as_deref(), + Some(expected_pull_policy), + "Image pull policy should be '{expected_pull_policy}'" + ); + } + + fn assert_container_ports_count(container: &Container, expected_count: usize) { + let ports = container + .ports + .as_ref() + .unwrap_or_else(|| panic!("Container should have ports")); + assert_eq!( + ports.len(), + expected_count, + "Container should have {expected_count} ports" + ); + } + + fn assert_container_port( + port: &k8s_openapi::api::core::v1::ContainerPort, + expected_name: &str, + expected_protocol: &str, + expected_number: i32, + ) { + assert_eq!( + port.name.as_deref(), + Some(expected_name), + "Container port name should be '{expected_name}'" + ); + assert_eq!( + port.protocol.as_deref(), + Some(expected_protocol), + "Container port '{expected_name}' protocol should be '{expected_protocol}'" + ); + assert_eq!( + port.container_port, expected_number, + "Container port '{expected_name}' number should be {expected_number}" + ); + } + + fn assert_container_env_vars_count(container: &Container, expected_count: usize) { + let env_vars = container + .env + .as_ref() + .unwrap_or_else(|| panic!("Container should have env vars")); + assert_eq!( + env_vars.len(), + expected_count, + "Container should have {expected_count} env vars" + ); + } + + fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) { + assert_eq!( + env_var.name, expected_name, + "Env var name should be '{expected_name}'" + ); + assert_eq!( + env_var.value.as_deref(), + Some(expected_value), + "Env var '{expected_name}' value should be '{expected_value}'" + ); + } + + fn get_container(deployment: &Deployment) -> Container { + let spec = deployment + .spec + .as_ref() + .unwrap_or_else(|| panic!("Deployment should have spec")); + let pod_spec = spec + .template + .spec + .as_ref() + .unwrap_or_else(|| panic!("Pod template should have spec")); + pod_spec + .containers + .first() + .unwrap_or_else(|| panic!("Should have exactly one container")) + .clone() + } + + // Test Fixtures + fn standard_test_ports() -> Vec { + vec![ + ApplicationNetworkPort { + number: 8080, + protocol: NetworkProtocol::TCP, + name: "http".to_string(), + }, + ApplicationNetworkPort { + number: 9000, + protocol: NetworkProtocol::TCP, + name: "metrics".to_string(), + }, + ApplicationNetworkPort { + number: 50051, + protocol: NetworkProtocol::TCP, + name: "grpc".to_string(), + }, + ] + } + + fn standard_test_env_vars() -> Vec<(String, String)> { + vec![ + ("ENV_VAR_1".to_string(), "value1".to_string()), + ("ENV_VAR_2".to_string(), "value2".to_string()), + ] + } + + fn udp_test_ports() -> Vec { + vec![ + ApplicationNetworkPort { + number: 53, + protocol: NetworkProtocol::UDP, + name: "dns".to_string(), + }, + ApplicationNetworkPort { + number: 8080, + protocol: NetworkProtocol::TCP, + name: "http".to_string(), + }, + ] + } + + // Test Builder + 
struct BackendAppTestBuilder { + name: Option, + network_ports: Option>, + env_vars: Option>, + } + + impl BackendAppTestBuilder { + fn new() -> Self { + Self { + name: None, + network_ports: None, + env_vars: None, + } + } + + fn with_name(mut self, name: impl Into) -> Self { + self.name = Some(name.into()); + self + } + + fn with_standard_ports(mut self) -> Self { + self.network_ports = Some(standard_test_ports()); + self + } + + fn with_udp_ports(mut self) -> Self { + self.network_ports = Some(udp_test_ports()); + self + } + + fn with_standard_env_vars(mut self) -> Self { + self.env_vars = Some(standard_test_env_vars()); + self + } + + fn with_no_ports(mut self) -> Self { + self.network_ports = Some(vec![]); + self + } + + fn build(self, project_root: PathBuf) -> BackendApp { + BackendApp { + name: self.name.unwrap_or_else(|| "test-app".to_string()), + project_root, + network_ports: self.network_ports.unwrap_or_default(), + env_vars: self.env_vars.unwrap_or_default(), + build_cmd: BuildCommand::new("cargo", vec!["build"]), + dockerfile: None, + } + } + } + + impl Default for BackendAppTestBuilder { + fn default() -> Self { + Self::new() + } + } + + // Helper function for test setup + async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) { + let result = app.build_push_helm_package(image_url).await; + assert!( + result.is_ok(), + "build_push_helm_package should succeed: {:?}", + result + ); + } + + // ===== SERVICE TESTS ===== + + #[tokio::test] + async fn service_is_created_with_application_name() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "test-app"); + assert_service_metadata(&service, "test-app"); + } + + #[tokio::test] + async fn service_has_default_clusterip_type() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "test-app"); + assert_service_type(&service, "ClusterIP"); + } + + #[tokio::test] + async fn service_exposes_all_network_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "test-app"); + assert_service_port_count(&service, 3); + + let ports = service.spec.unwrap().ports.unwrap(); + assert_service_port(&ports[0], "http", "TCP", 8080); + assert_service_port(&ports[1], "metrics", "TCP", 9000); + assert_service_port(&ports[2], "grpc", "TCP", 50051); + } + + #[tokio::test] + async fn service_target_ports_match_service_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, 
"test-app"); + let ports = service.spec.unwrap().ports.unwrap(); + + for port in &ports { + assert_target_port_matches_service_port(port); + } + } + + #[tokio::test] + async fn service_not_created_when_application_has_no_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app-no-ports") + .with_no_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await; + + assert!( + !service_yaml_exists(&app.project_root, "test-app-no-ports"), + "service.yaml should not exist when there are no network ports" + ); + } + + #[tokio::test] + async fn service_respects_port_protocol_type() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("udp-app") + .with_udp_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await; + + let service = read_service_yaml(&app.project_root, "udp-app"); + let ports = service.spec.unwrap().ports.unwrap(); + + assert_service_port(&ports[0], "dns", "UDP", 53); + assert_service_port(&ports[1], "http", "TCP", 8080); + } + + // ===== DEPLOYMENT METADATA TESTS ===== + + #[tokio::test] + async fn deployment_has_application_name() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_deployment_metadata(&deployment, "test-app"); + } + + #[tokio::test] + async fn deployment_has_single_replica_by_default() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_deployment_replicas(&deployment, 1); + } + + #[tokio::test] + async fn deployment_selector_matches_application_name() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_selector_match_label(&deployment, "test-app"); + } + + #[tokio::test] + async fn pod_has_standard_kubernetes_labels() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + assert_pod_labels(&deployment, "test-app"); + } + + // ===== CONTAINER CONFIGURATION TESTS ===== + + #[tokio::test] + async fn container_has_correct_name_and_image() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() 
+ .build(temp_dir.path().to_path_buf()); + + let image_url = "registry.example.com/test/test-app:1.0.0"; + build_helm_chart_for_test(&app, image_url).await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + let container = get_container(&deployment); + + assert_container_metadata(&container, "test-app", image_url, "IfNotPresent"); + } + + #[tokio::test] + async fn container_exposes_all_application_ports() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + let container = get_container(&deployment); + + assert_container_ports_count(&container, 3); + + let ports = container.ports.unwrap(); + assert_container_port(&ports[0], "http", "TCP", 8080); + assert_container_port(&ports[1], "metrics", "TCP", 9000); + assert_container_port(&ports[2], "grpc", "TCP", 50051); + } + + #[tokio::test] + async fn container_has_all_environment_variables() { + let temp_dir = tempdir().expect("Failed to create temp directory"); + let app = BackendAppTestBuilder::new() + .with_name("test-app") + .with_standard_ports() + .with_standard_env_vars() + .build(temp_dir.path().to_path_buf()); + + build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await; + + let deployment = read_deployment_yaml(&app.project_root, "test-app"); + let container = get_container(&deployment); + + assert_container_env_vars_count(&container, 2); + + let env_vars = container.env.unwrap(); + assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1"); + assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2"); + } + + // ===== BUILD COMMAND UNIT TESTS ===== + + #[test] + fn build_command_creation_sets_program_and_args() { + let cmd = BuildCommand::new("docker", vec!["build", "-t", "myimage"]); + assert_eq!(cmd.program, "docker"); + assert_eq!(cmd.args, vec!["build", "-t", "myimage"]); + } + + #[test] + fn build_command_clone_copies_all_fields() { + let cmd1 = BuildCommand::new("cargo", vec!["build", "--release"]); + let cmd2 = cmd1.clone(); + assert_eq!(cmd1.program, cmd2.program); + assert_eq!(cmd1.args, cmd2.args); + } +} diff --git a/harmony/src/modules/application/config.rs b/harmony/src/modules/application/config.rs new file mode 100644 index 00000000..9c529f1d --- /dev/null +++ b/harmony/src/modules/application/config.rs @@ -0,0 +1,29 @@ +use serde::Serialize; + +#[derive(Debug, Clone, Serialize)] +pub enum NetworkProtocol { + TCP, + UDP, +} + +impl NetworkProtocol { + pub fn as_str(&self) -> &str { + match self { + NetworkProtocol::TCP => "TCP", + NetworkProtocol::UDP => "UDP", + } + } +} + +impl std::fmt::Display for NetworkProtocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct ApplicationNetworkPort { + pub number: u16, + pub protocol: NetworkProtocol, + pub name: String, +} diff --git a/harmony/src/modules/application/features/packaging_deployment.rs b/harmony/src/modules/application/features/packaging_deployment.rs index 03f11000..2f107462 100644 --- a/harmony/src/modules/application/features/packaging_deployment.rs +++ b/harmony/src/modules/application/features/packaging_deployment.rs @@ -48,11 +48,11 @@ use crate::{ /// - ArgoCD to 
install/upgrade/rollback/inspect k8s resources
 /// - Kubernetes for runtime orchestration
 #[derive(Debug, Default, Clone)]
-pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> {
+pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
     pub application: Arc<A>,
 }
 
-impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
+impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
     async fn deploy_to_local_k3d(
         &self,
         app_name: String,
@@ -138,7 +138,7 @@ impl PackagingDeployment {
 #[async_trait]
 impl<
-    A: OCICompliant + HelmPackage + Webapp + Clone + 'static,
+    A: OCICompliant + HelmPackage + Clone + 'static,
     T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
 > ApplicationFeature<T> for PackagingDeployment<A>
 {
@@ -148,24 +148,12 @@ impl<
     ) -> Result<InstallationOutcome, String> {
         let image = self.application.image_name();
 
-        let domain = if topology.current_target() == DeploymentTarget::Production {
-            self.application.dns()
-        } else {
-            topology
-                .get_domain(&self.application.name())
-                .await
-                .map_err(|e| e.to_string())?
-        };
-
         // TODO Write CI/CD workflow files
         // we can autodetect the CI type using the remote url (default to github action for github
        // url, etc..)
         // Or ask for it when unknown
-        let helm_chart = self
-            .application
-            .build_push_helm_package(&image, &domain)
-            .await?;
+        let helm_chart = self.application.build_push_helm_package(&image).await?;
 
         // TODO: Make building image configurable/skippable if image already exists (prompt)
         // https://git.nationtech.io/NationTech/harmony/issues/104
@@ -215,12 +203,12 @@ impl<
         };
 
         Ok(InstallationOutcome::success_with_details(vec![format!(
-            "{}: http://{domain}",
+            "{}",
             self.application.name()
         )]))
     }
 
     fn name(&self) -> String {
-        "ContinuousDelivery".to_string()
+        "PackagingDeployment".to_string()
     }
 }
diff --git a/harmony/src/modules/application/helm/mod.rs b/harmony/src/modules/application/helm/mod.rs
new file mode 100644
index 00000000..6b73b087
--- /dev/null
+++ b/harmony/src/modules/application/helm/mod.rs
@@ -0,0 +1,446 @@
+// Re-export common Kubernetes types for convenience
+pub use k8s_openapi::api::{
+    apps::v1::{Deployment, DeploymentSpec},
+    core::v1::{
+        Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
+        ServicePort, ServiceSpec,
+    },
+};
+use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
+use kube::core::ObjectMeta;
+
+// Import domain types for the deployment builder
+use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
+use std::fs;
+use std::path::{Path, PathBuf};
+
+/// Enum representing all supported Kubernetes resource types for Helm charts.
+/// Supports built-in typed resources and custom CRDs via YAML strings.
+pub enum HelmResourceKind {
+    /// Built-in typed Service resource
+    Service(K8sService),
+    /// Built-in typed Deployment resource
+    Deployment(Deployment),
+    /// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
+    CustomYaml { filename: String, content: String },
+    // Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
+}
+
+impl HelmResourceKind {
+    pub fn filename(&self) -> String {
+        match self {
+            HelmResourceKind::Service(_) => "service.yaml".to_string(),
+            HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
+            HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
+        }
+    }
+
+    pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
+        match self {
+            HelmResourceKind::Service(s) => serde_yaml::to_string(s),
+            HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
+            HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
+        }
+    }
+
+    pub fn as_service(&self) -> Option<&K8sService> {
+        match self {
+            HelmResourceKind::Service(s) => Some(s),
+            _ => None,
+        }
+    }
+
+    pub fn as_deployment(&self) -> Option<&Deployment> {
+        match self {
+            HelmResourceKind::Deployment(d) => Some(d),
+            _ => None,
+        }
+    }
+
+    /// Add a custom resource from any serializable type (e.g., CRDs, custom types)
+    pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
+        HelmResourceKind::CustomYaml {
+            filename: filename.into(),
+            content: content.into(),
+        }
+    }
+
+    /// Add a custom resource from any type that implements Serialize
+    pub fn from_serializable<T: serde::Serialize>(
+        filename: impl Into<String>,
+        resource: &T,
+    ) -> Result<Self, serde_yaml::Error> {
+        Ok(HelmResourceKind::CustomYaml {
+            filename: filename.into(),
+            content: serde_yaml::to_string(resource)?,
+        })
+    }
+}
+
+/// The main orchestrator for building a Helm chart.
+pub struct HelmChart {
+    pub name: String,
+    pub version: String,
+    pub app_version: String,
+    pub description: String,
+    pub resources: Vec<HelmResourceKind>,
+    pub values: Vec<String>,
+}
+
+impl HelmChart {
+    pub fn new(name: String, app_version: String) -> Self {
+        Self {
+            name: name.clone(),
+            version: "0.1.0".to_string(),
+            app_version,
+            description: format!("A Helm chart for {}", name),
+            resources: Vec::new(),
+            values: Vec::new(),
+        }
+    }
+
+    pub fn add_resource(&mut self, resource: HelmResourceKind) {
+        self.resources.push(resource);
+    }
+
+    pub fn add_value(&mut self, key: &str, value: &str) {
+        self.values.push(format!("{}: {}", key, value));
+    }
+
+    pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
+        let chart_dir = base_path.join(&self.name);
+        let templates_dir = chart_dir.join("templates");
+        fs::create_dir_all(&templates_dir)?;
+
+        // 1. Render and write Chart.yaml
+        let chart_yaml = ChartYaml {
+            name: &self.name,
+            description: &self.description,
+            version: &self.version,
+            app_version: &self.app_version,
+        };
+        fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
+
+        // 2. Write values.yaml (Constructed dynamically)
+        let values_content = self.values.join("\n");
+        fs::write(chart_dir.join("values.yaml"), values_content)?;
+
+        // 3. Serialize and write all added resources (Deployment, Service, etc.)
+        for resource in &self.resources {
+            let filename = resource.filename();
+            let content = resource
+                .serialize_to_yaml()
+                .map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
+            fs::write(templates_dir.join(filename), content)?;
+        }
+
+        Ok(chart_dir)
+    }
+}
+
+use askama::Template;
+
+#[derive(Template)]
+#[template(path = "helm/Chart.yaml.j2")]
+struct ChartYaml<'a> {
+    name: &'a str,
+    description: &'a str,
+    version: &'a str,
+    app_version: &'a str,
+}
+
+/// Builder for creating a Kubernetes Service with proper labels and selectors.
+pub struct ServiceBuilder {
+    name: String,
+    service_type: String,
+    ports: Vec<ServicePort>,
+    selector_label: String,
+}
+
+impl ServiceBuilder {
+    pub fn new(name: impl Into<String>) -> Self {
+        Self {
+            name: name.into(),
+            service_type: "ClusterIP".to_string(),
+            ports: Vec::new(),
+            selector_label: String::new(),
+        }
+    }
+
+    pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
+        self.service_type = service_type.into();
+        self
+    }
+
+    pub fn with_port(
+        mut self,
+        name: impl Into<String>,
+        port: i32,
+        protocol: impl Into<String>,
+    ) -> Self {
+        use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
+        self.ports.push(ServicePort {
+            name: Some(name.into()),
+            protocol: Some(protocol.into()),
+            port,
+            target_port: Some(IntOrString::Int(port)),
+            ..Default::default()
+        });
+        self
+    }
+
+    pub fn selector_label(mut self, label: impl Into<String>) -> Self {
+        self.selector_label = label.into();
+        self
+    }
+
+    pub fn build(self) -> K8sService {
+        K8sService {
+            metadata: ObjectMeta {
+                name: Some(self.name.clone()),
+                labels: Some(
+                    [
+                        ("app.kubernetes.io/name".to_string(), self.name.clone()),
+                        (
+                            "app.kubernetes.io/component".to_string(),
+                            "service".to_string(),
+                        ),
+                        (
+                            "app.kubernetes.io/managed-by".to_string(),
+                            "harmony".to_string(),
+                        ),
+                    ]
+                    .into(),
+                ),
+                ..Default::default()
+            },
+            spec: Some(ServiceSpec {
+                type_: Some(self.service_type),
+                selector: Some(
+                    [("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
+                ),
+                ports: if self.ports.is_empty() {
+                    None
+                } else {
+                    Some(self.ports)
+                },
+                ..Default::default()
+            }),
+            ..Default::default()
+        }
+    }
+}
+
+/// Builder for creating a Kubernetes Deployment with pod template and container spec.
+pub struct DeploymentBuilder {
+    name: String,
+    image: String,
+    replicas: i32,
+    container_ports: Vec<ContainerPort>,
+    env_vars: Vec<EnvVar>,
+    image_pull_policy: Option<String>,
+}
+
+impl DeploymentBuilder {
+    /// Create a new DeploymentBuilder with minimal required fields.
+    pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
+        Self::with_options(name, image, None, None, None)
+    }
+
+    /// Create a new DeploymentBuilder with optional initial configuration.
+    ///
+    /// Arguments:
+    /// - `name`: The deployment name
+    /// - `image`: The container image to use
+    /// - `ports`: Optional vector of initial application network ports
+    /// - `env_vars`: Optional vector of initial environment variable key-value pairs
+    /// - `replicas`: Optional number of replicas (defaults to 1)
+    pub fn with_options(
+        name: impl Into<String>,
+        image: impl Into<String>,
+        ports: Option<Vec<ApplicationNetworkPort>>,
+        env_vars: Option<Vec<(String, String)>>,
+        replicas: Option<i32>,
+    ) -> Self {
+        let container_ports: Vec<ContainerPort> = ports
+            .unwrap_or_default()
+            .into_iter()
+            .map(|port| ContainerPort {
+                container_port: port.number as i32,
+                name: Some(port.name),
+                protocol: Some(port.protocol.to_string()),
+                ..Default::default()
+            })
+            .collect();
+
+        let k8s_env_vars: Vec<EnvVar> = env_vars
+            .unwrap_or_default()
+            .into_iter()
+            .map(|(key, value)| EnvVar {
+                name: key,
+                value: Some(value),
+                ..Default::default()
+            })
+            .collect();
+
+        Self {
+            name: name.into(),
+            image: image.into(),
+            replicas: replicas.unwrap_or(1),
+            container_ports,
+            env_vars: k8s_env_vars,
+            image_pull_policy: Some("IfNotPresent".to_string()),
+        }
+    }
+
+    pub fn replicas(mut self, replicas: i32) -> Self {
+        self.replicas = replicas;
+        self
+    }
+
+    pub fn with_container_port(
+        mut self,
+        number: i32,
+        name: impl Into<String>,
+        protocol: impl Into<String>,
+    ) -> Self {
+        self.container_ports.push(ContainerPort {
+            container_port: number,
+            name: Some(name.into()),
+            protocol: Some(protocol.into()),
+            ..Default::default()
+        });
+        self
+    }
+
+    pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
+        self.env_vars.push(EnvVar {
+            name: name.into(),
+            value: Some(value.into()),
+            ..Default::default()
+        });
+        self
+    }
+
+    pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
+        self.image_pull_policy = Some(policy.into());
+        self
+    }
+
+    pub fn build(self) -> Deployment {
+        let name = self.name.clone();
+        Deployment {
+            metadata: ObjectMeta {
+                name: Some(name.clone()),
+                labels: Some(
+                    [
+                        ("app.kubernetes.io/name".to_string(), name.clone()),
+                        (
+                            "app.kubernetes.io/component".to_string(),
+                            "deployment".to_string(),
+                        ),
+                        (
+                            "app.kubernetes.io/managed-by".to_string(),
+                            "harmony".to_string(),
+                        ),
+                        ("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
+                    ]
+                    .into(),
+                ),
+                ..Default::default()
+            },
+            spec: Some(DeploymentSpec {
+                replicas: Some(self.replicas),
+                selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
+                    match_labels: Some(
+                        [("app.kubernetes.io/name".to_string(), name.clone())].into(),
+                    ),
+                    ..Default::default()
+                },
+                template: PodTemplateSpec {
+                    metadata: Some(ObjectMeta {
+                        labels: Some(
+                            [
+                                ("app.kubernetes.io/name".to_string(), name.clone()),
+                                ("app.kubernetes.io/instance".to_string(), name.clone()),
+                            ]
+                            .into(),
+                        ),
+                        ..Default::default()
+                    }),
+                    spec: Some(PodSpec {
+                        containers: vec![Container {
+                            name: name.clone(),
+                            image: Some(self.image),
+                            image_pull_policy: self.image_pull_policy,
+                            ports: if self.container_ports.is_empty() {
+                                None
+                            } else {
+                                Some(self.container_ports)
+                            },
+                            env: if self.env_vars.is_empty() {
+                                None
+                            } else {
+                                Some(self.env_vars)
+                            },
+                            ..Default::default()
+                        }],
+                        ..Default::default()
+                    }),
+                },
+                ..Default::default()
+            }),
+            ..Default::default()
+        }
+    }
+}
+
+/// Helper function to create a Service from network port configuration.
+/// Returns `None` if no ports are provided.
+pub fn create_service_from_ports(
+    name: String,
+    network_ports: &[ApplicationNetworkPort],
+) -> Option<K8sService> {
+    if network_ports.is_empty() {
+        return None;
+    }
+
+    let ports: Vec<ServicePort> = network_ports
+        .iter()
+        .map(|port| ServicePort {
+            name: Some(port.name.clone()),
+            protocol: Some(port.protocol.to_string()),
+            port: port.number as i32,
+            target_port: Some(IntOrString::Int(port.number as i32)),
+            ..Default::default()
+        })
+        .collect();
+
+    Some(K8sService {
+        metadata: ObjectMeta {
+            name: Some(name.clone()),
+            labels: Some(
+                [
+                    ("app.kubernetes.io/name".to_string(), name.clone()),
+                    (
+                        "app.kubernetes.io/component".to_string(),
+                        "service".to_string(),
+                    ),
+                    (
+                        "app.kubernetes.io/managed-by".to_string(),
+                        "harmony".to_string(),
+                    ),
+                ]
+                .into(),
+            ),
+            ..Default::default()
+        },
+        spec: Some(ServiceSpec {
+            type_: Some("ClusterIP".to_string()),
+            selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
+            ports: Some(ports),
+            ..Default::default()
+        }),
+        ..Default::default()
+    })
+}
diff --git a/harmony/src/modules/application/mod.rs b/harmony/src/modules/application/mod.rs
index 03965e35..13da0840 100644
--- a/harmony/src/modules/application/mod.rs
+++ b/harmony/src/modules/application/mod.rs
@@ -1,5 +1,8 @@
+pub mod backend_app;
+pub mod config;
 mod feature;
 pub mod features;
+pub mod helm;
 pub mod oci;
 mod rust;
 mod webapp;
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
         todo!()
     }
 }
+
+/// Checks the output of a process command for success.
+fn check_output(
+    output: &std::process::Output,
+    msg: &str,
+) -> Result<(), Box<dyn std::error::Error>> {
+    if !output.status.success() {
+        let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
+        return Err(error_message.into());
+    }
+    Ok(())
+}
diff --git a/harmony/src/modules/application/oci.rs b/harmony/src/modules/application/oci.rs
index 8b1585c8..63e1c208 100644
--- a/harmony/src/modules/application/oci.rs
+++ b/harmony/src/modules/application/oci.rs
@@ -1,5 +1,13 @@
+use std::path::{Path, PathBuf};
+
+use crate::{
+    config::{REGISTRY_PROJECT, REGISTRY_URL},
+    modules::application::check_output,
+};
+
 use super::Application;
 use async_trait::async_trait;
+use log::debug;
 
 #[async_trait]
 pub trait OCICompliant: Application {
@@ -17,9 +25,74 @@ pub trait HelmPackage: Application {
     /// # Arguments
     /// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
     /// * `domain` - The domain where the application is hosted.
-    async fn build_push_helm_package(
+    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
+
+    fn project_root(&self) -> PathBuf;
+
+    fn chart_name(&self) -> String;
+
+    /// Packages a Helm chart directory into a .tgz file.
+    fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
+        let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
+        debug!(
+            "Launching `helm package {}` cli with CWD {}",
+            chart_dirname.to_string_lossy(),
+            &self
+                .project_root()
+                .join(".harmony_generated")
+                .join("helm")
+                .to_string_lossy()
+        );
+        let output = std::process::Command::new("helm")
+            .args(["package", chart_dirname.to_str().unwrap()])
+            .current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
+            .output()?;
+
+        check_output(&output, "Failed to package Helm chart")?;
+
+        // Helm prints the path of the created chart to stdout.
+        let tgz_name = String::from_utf8(output.stdout)?
+            .split_whitespace()
+            .last()
+            .unwrap_or_default()
+            .to_string();
+        if tgz_name.is_empty() {
+            return Err("Could not determine packaged chart filename.".into());
+        }
+
+        // The output from helm is relative, so we join it with the execution directory.
+        Ok(self
+            .project_root()
+            .join(".harmony_generated")
+            .join("helm")
+            .join(tgz_name))
+    }
+
+    /// Pushes a packaged Helm chart to an OCI registry.
+    fn push_helm_chart(
         &self,
-        image_url: &str,
-        domain: &str,
-    ) -> Result<String, String>;
+        packaged_chart_path: &Path,
+    ) -> Result<String, Box<dyn std::error::Error>> {
+        // The chart name is the file stem of the .tgz file
+        let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
+        let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
+        let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
+        debug!(
+            "Pushing Helm chart {} to {}",
+            packaged_chart_path.to_string_lossy(),
+            oci_push_url
+        );
+
+        let output = std::process::Command::new("helm")
+            .args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
+            .output()?;
+
+        check_output(&output, "Pushing Helm chart failed")?;
+
+        // The final URL includes the version tag, which is part of the file name
+        let version = chart_file_name.rsplit_once('-').unwrap().1;
+        debug!("pull url {oci_pull_url}");
+        debug!("push url {oci_push_url}");
+        Ok(format!("{}:{}", oci_pull_url, version))
+    }
 }
diff --git a/harmony/src/modules/application/rust.rs b/harmony/src/modules/application/rust.rs
index 8384e78a..7e3413bb 100644
--- a/harmony/src/modules/application/rust.rs
+++ b/harmony/src/modules/application/rust.rs
@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
 
 #[async_trait]
 impl HelmPackage for RustWebapp {
-    async fn build_push_helm_package(
-        &self,
-        image_url: &str,
-        domain: &str,
-    ) -> Result<String, String> {
+    fn project_root(&self) -> PathBuf {
+        self.project_root.clone()
+    }
+
+    fn chart_name(&self) -> String {
+        self.name.clone()
+    }
+
+    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
+        let domain = self.dns();
         info!("Starting Helm chart build and push for '{}'", self.name);
 
         // 1. Create the Helm chart files on disk.
         let chart_dir = self
-            .create_helm_chart_files(image_url, domain)
+            .create_helm_chart_files(image_url, &domain)
             .await
            .map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
         info!("Successfully created Helm chart files in {:?}", chart_dir);
@@ -327,19 +332,6 @@ impl RustWebapp {
         Ok(image_tag.to_string())
     }
 
-    /// Checks the output of a process command for success.
-    fn check_output(
-        &self,
-        output: &process::Output,
-        msg: &str,
-    ) -> Result<(), Box<dyn std::error::Error>> {
-        if !output.status.success() {
-            let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
-            return Err(error_message.into());
-        }
-        Ok(())
-    }
-
     fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
         match self.framework {
             Some(RustWebFramework::Leptos) => {
@@ -640,71 +632,6 @@ spec:
         Ok(chart_dir)
     }
 
-    /// Packages a Helm chart directory into a .tgz file.
-    fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
-        let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
-        debug!(
-            "Launching `helm package {}` cli with CWD {}",
-            chart_dirname.to_string_lossy(),
-            &self
-                .project_root
-                .join(".harmony_generated")
-                .join("helm")
-                .to_string_lossy()
-        );
-        let output = process::Command::new("helm")
-            .args(["package", chart_dirname.to_str().unwrap()])
-            .current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
-            .output()?;
-
-        self.check_output(&output, "Failed to package Helm chart")?;
-
-        // Helm prints the path of the created chart to stdout.
-        let tgz_name = String::from_utf8(output.stdout)?
-            .split_whitespace()
-            .last()
-            .unwrap_or_default()
-            .to_string();
-        if tgz_name.is_empty() {
-            return Err("Could not determine packaged chart filename.".into());
-        }
-
-        // The output from helm is relative, so we join it with the execution directory.
-        Ok(self
-            .project_root
-            .join(".harmony_generated")
-            .join("helm")
-            .join(tgz_name))
-    }
-
-    /// Pushes a packaged Helm chart to an OCI registry.
-    fn push_helm_chart(
-        &self,
-        packaged_chart_path: &Path,
-    ) -> Result<String, Box<dyn std::error::Error>> {
-        // The chart name is the file stem of the .tgz file
-        let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
-        let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
-        let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
-        debug!(
-            "Pushing Helm chart {} to {}",
-            packaged_chart_path.to_string_lossy(),
-            oci_push_url
-        );
-
-        let output = process::Command::new("helm")
-            .args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
-            .output()?;
-
-        self.check_output(&output, "Pushing Helm chart failed")?;
-
-        // The final URL includes the version tag, which is part of the file name
-        let version = chart_file_name.rsplit_once('-').unwrap().1;
-        debug!("pull url {oci_pull_url}");
-        debug!("push url {oci_push_url}");
-        Ok(format!("{}:{}", oci_pull_url, version))
-    }
-
     fn get_or_build_dockerfile(&self) -> Result<Dockerfile, Box<dyn std::error::Error>> {
         let existing_dockerfile = self.project_root.join("Dockerfile");
diff --git a/harmony/templates/helm/Chart.yaml.j2 b/harmony/templates/helm/Chart.yaml.j2
new file mode 100644
index 00000000..bddcc93e
--- /dev/null
+++ b/harmony/templates/helm/Chart.yaml.j2
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: {{ name }}
+description: {{ description }}
+type: application
+version: {{ version }}
+appVersion: "{{ app_version }}"
diff --git a/harmony_agent/.dockerignore b/harmony_agent/.dockerignore
new file mode 100644
index 00000000..dd9b5319
--- /dev/null
+++ b/harmony_agent/.dockerignore
@@ -0,0 +1,4 @@
+.git
+data
+target
+demos
diff --git a/harmony_agent/Cargo.toml b/harmony_agent/Cargo.toml
new file mode 100644
index 00000000..6fb7ff5d
--- /dev/null
+++ b/harmony_agent/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "harmony_agent"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+harmony = { path = "../harmony" }
+# harmony_cli = { path = "../harmony_cli" }
+harmony_types = { path = "../harmony_types" }
+harmony_macros = { path = "../harmony_macros" }
+cidr = { workspace = true }
+tokio = { workspace = true }
+log = { workspace = true }
+env_logger = { workspace = true }
+async-nats = "0.45.0"
+async-trait = "0.1"
+# url = { workspace = true }
+
+serde.workspace = true
+serde_json.workspace = true
+getrandom = "0.3.4"
+
+thiserror.workspace = true
+pretty_assertions.workspace = true
diff --git a/harmony_agent/Dockerfile b/harmony_agent/Dockerfile
new file mode 100644
index 00000000..9d72462a
--- /dev/null
+++ b/harmony_agent/Dockerfile
@@ -0,0 +1,44 @@
+# Build stage
+FROM rust:slim AS builder
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy all required packages
+COPY . .
+
+RUN ls -la1
+
+# Build the application in release mode
+RUN cargo build --release -p harmony_agent
+
+# Runtime stage
+FROM debian:bookworm-slim
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy the binary from the builder stage
+COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
+
+# Declare environment variables used by the Harmony Agent
+# These will be set from build-time environment variables if present
+# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
+ARG NATS_URL=nats://localhost:4222
+ENV NATS_URL=${NATS_URL}
+# NATS_CREDS_PATH: Optional path to NATS credentials file
+ARG NATS_CREDS_PATH
+ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
+# MY_CLUSTER_ID: This cluster's unique identifier (required)
+ARG MY_CLUSTER_ID
+ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
+# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
+ARG DESIRED_PRIMARY
+ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
+
+# Run the application
+ENTRYPOINT ["./harmony_agent"]
diff --git a/harmony_agent/README.md b/harmony_agent/README.md
new file mode 100644
index 00000000..189e8145
--- /dev/null
+++ b/harmony_agent/README.md
@@ -0,0 +1,248 @@
+TODO
+
+DONE:
+1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
+2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
+3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
+4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
+5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
+6. ✅ failover_timeout added to AgentConfig
+7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
+8. ✅ startup reconciliation implemented via on_startup() method
+
+REMAINING:
+- review all code and list implementation issues
+- review both workflows for each state transition
+- Complete replica workflow staleness detection (needs implementation in Watching state)
+- Implement state recovery from Failed state for both workflows
+- Implement subscribe in NATS store with watch() API
+- Implement config validation for failover_timeout constraints
+
+TODO
+
+1. store trait subscribe definition missing callback
+2. BUG, data integrity issue: nats store not actually using jetstream metadata
+3. review all code and list implementation issues
+4. review both workflows for each state transition
+5. fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
+6. fix replica workflow to also hold a copy of the cluster state (actually the agent itself
+   should hold it probably, every agent should be subscribed to the cluster_state object and
+   keep it in memory to allow workflows to process against it efficiently)
+
+## CRITICAL - Data Integrity Issues
+
+1.
**NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`) + - Currently uses `put()` which overwrites unconditionally + - Must use `update()` with revision parameter for proper compare-and-set + - Without this, concurrent promotion attempts can cause split brain + +2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`) + - Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3 + - NATS Entry has `.revision` and `.created` fields that must be used + - This defeats the entire purpose of store-provided timestamps + +3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`) + - Line ~156: TODO comment confirms missing metadata passing + - Replica cannot calculate staleness without metadata.timestamp + - Failover logic is broken + +4. **No actual cluster state watching exists** + - Replica workflow declares `ClusterState` but never updates it + - No subscription to primary heartbeat or cluster_state key + - Replica cannot detect primary liveness + +## HIGH - Missing Core Functionality + +5. **Replica Workflow incomplete** - All key logic is TODO: + - Watching primary staleness (line 114) + - Promotion attempt (line 118) + - Original primary recovery detection (line 127) + - Demotion/handshake (line 131) + +6. **Missing replica "Failed" state** + - `ReplicaState` enum has no `Failed` variant + - User's TODO #5 correctly identifies this gap + - What happens if replica's own heartbeats fail repeatedly? + +7. **Primary Workflow incomplete** - Key logic missing: + - No NATS check before recovering from `Fenced` state (line 95) + - No NATS check in `Yielding` state for demotion handshake (line 101) + - No actual fencing failure handling + +8. **Store `subscribe` not implemented** (`store/mod.rs`) + - Returns `todo!()` in NATS implementation + - No callback mechanism defined in trait + - Without this, agents cannot react to state changes + +9. **Cluster state not tracked centrally** + - User's TODO #6 correctly identifies this + - Each agent should maintain a local copy of cluster_state + - No subscription mechanism to update this local copy + +10. **No validation of configuration constraints** + - Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin` + - Invalid config could cause split brain + +## MEDIUM - Incorrect State Transitions + +11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`) + - Two state transitions happen in one heartbeat cycle + - Should stay in `Failed` until fencing actually completes + - What if fencing fails? State machine won't reflect it + +12. **No fencing failure handling** + - If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes + - ADR mentions escalating to radical measures, but no callback for failure + +13. **Replica `Watching` state does nothing** + - Line 115: Just logs, checks nothing + - Should be checking staleness of primary heartbeat + +14. **Demotion handshake not implemented** + - ADR section 4 details this but code doesn't implement it + - How does original primary know it should yield? + +## LOW - Observability & Reliability + +15. **No graceful shutdown mechanism** + - `run_heartbeat_loop` runs forever + - No signal handling (SIGTERM, SIGINT) + +16. **Async task errors silently ignored** + - `tokio::spawn` at lines 74, 83, 123 + - No `JoinHandle` retention or error handling + +17. 
**No metrics/observability**
+    - Only log output
+    - No Prometheus metrics for state transitions, failure counts, etc.
+
+18. **Hardcoded main() function** (`agent_loop.rs::main`)
+    - Not a production-ready entry point
+    - Should load config from environment or file
+
+19. **Store factory pattern missing**
+    - TODO comment at line 54 confirms this
+    - Can't switch between stores via config
+
+20. **No backoff/retry logic for NATS operations**
+    - Transient failures could trigger unnecessary fencing
+
+21. **`AgentInfo` status is hardcoded to "HEALTHY"**
+    - Line 137 in `store_heartbeat`
+    - Should reflect the actual workflow state
+
+22. **Unused fields in structs**
+    - `HeartbeatState.last_seq` set but never read
+    - `ClusterState.current_primary` set but never read
+
+## ADR-017-3 Compliance Issues
+
+23. **ADR violation: Clock skew not avoided**
+    - While the ADR says to use store metadata, the code uses local time
+
+24. **Failover timeout not configurable**
+    - Defined in ADR but not in `AgentConfig`
+    - Needed for replica staleness calculation
+
+25. **Safety margin concept exists in ADR but not in code**
+    - Configuration should include this margin
+
+26. **No handling of Case 3 (Replica Network Lag)**
+    - ADR describes NATS rejection prevention
+    - But the `set_strict` implementation accepts any write
+
+## Code Quality Issues
+
+27. **Inconsistent error handling**
+    - Some paths return `Err`, others `todo!()`, others ignore errors
+
+28. **Unnecessary `Clone` bounds**
+    - `DeploymentConfig.clone()` used frequently
+    - Could be optimized with `Arc`
+
+29. **Missing lifetime annotations**
+    - `KvStore::get` returns a `String` key in its error - inefficient
+
+30. **No integration points mentioned**
+    - PostgreSQL lifecycle control implementation missing
+    - Fencing via CNPG not connected
+
+## Production Readiness Checklist Summary
+
+For battle testing preparation, you need:
+
+**Immediate (blockers):**
+- Fix NATS store metadata usage (issues #1, #2)
+- Implement real CAS in `set_strict` (#1)
+- Implement replica primary watching (#4, #5)
+- Add failover_timeout config + staleness logic (#3, #24)
+- Implement subscribe mechanism with callbacks (#8)
+
+**High priority:**
+- Complete all workflow transitions (#5, #7, #11-14)
+- Add cluster state tracking (#6, #9)
+- Add configuration validation (#10)
+- Add Replica Failed state (#6)
+
+**Before deployment:**
+- Implement graceful shutdown (#15)
+- Add error handling for spawned tasks (#16)
+- Remove hardcoded main function (#18)
+- Implement store factory (#19)
+- Add Prometheus metrics (#17)
+
+**Documentation:**
+- Document all configuration parameters and their trade-offs
+- Add runbooks for each failure mode
+- Document battle test scenarios to cover
+
+### Addendum: Missing Critical Issues
+
+#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
+* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
+* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
+* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
+* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
+
+#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
+* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
+* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks. +* **Scenario:** + 1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s). + 2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s). + 3. `on_active` finishes *before* `on_failover`. + 4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy. +* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one. + +#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk) +* **Location:** `agent_loop.rs` loop logic. +* **The Bug:** There is no "Stop the World" gate. +* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*. +* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again. +* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter. + +#### 4. HIGH: NATS Bucket Name Collision +* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`. +* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`. +* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state. +* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`. + +#### 5. HIGH: Startup State Reconciliation +* **Location:** `HarmonyAgent::new`. +* **The Bug:** Agents always start in `Initializing`. +* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader. +* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime. +* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check). + +### Summary of Tasks to Add + +Please add these to your master list before starting implementation: + +28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY". +29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping. +30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection). +31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`. +32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid. 
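+
+To make task 30 concrete, here is a minimal sketch against the `KvStore` trait in `src/store/mod.rs`. `HeartbeatOutcome` and `write_heartbeat` are hypothetical names introduced only for illustration, not existing code; the point is that a CAS mismatch on our own heartbeat key is classified as a fatal "I have been replaced" signal rather than a retryable failure:
+
+```rust
+use serde_json::Value;
+
+use crate::store::{KvStore, KvStoreError};
+
+/// Hypothetical outcome type for one heartbeat write.
+pub enum HeartbeatOutcome {
+    /// Write succeeded; carries the new sequence number to cache locally.
+    Stored(u64),
+    /// Transient store error; should keep feeding the failure counter as today.
+    Transient,
+    /// CAS mismatch on our own heartbeat key: another writer advanced it,
+    /// so this node has been replaced and must fence immediately (no retry).
+    Replaced,
+}
+
+pub async fn write_heartbeat<S: KvStore>(
+    store: &S,
+    key: &str,
+    info: Value,
+    expected_seq: u64,
+) -> HeartbeatOutcome {
+    match store.set_strict(key, info, expected_seq).await {
+        Ok(new_seq) => HeartbeatOutcome::Stored(new_seq),
+        Err(KvStoreError::WrongLastRevision) => HeartbeatOutcome::Replaced,
+        Err(e) => {
+            log::warn!("transient heartbeat store error: {e}");
+            HeartbeatOutcome::Transient
+        }
+    }
+}
+```
+
+In `run_heartbeat_loop`, `Replaced` would trigger immediate fencing and exit from the loop, while `Transient` would keep incrementing the existing consecutive-failure counter.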
+ +* **Think about vacuum / stop-the-world operations** + diff --git a/harmony_agent/deploy/Cargo.toml b/harmony_agent/deploy/Cargo.toml new file mode 100644 index 00000000..9aea1e4b --- /dev/null +++ b/harmony_agent/deploy/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "harmony_agent_deploy" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +harmony_types = { path = "../../harmony_types" } +harmony_macros = { path = "../../harmony_macros" } +cidr = { workspace = true } +tokio = { workspace = true } +log = { workspace = true } +env_logger = { workspace = true } +url = { workspace = true } + +serde.workspace = true +serde_json.workspace = true diff --git a/harmony_agent/deploy/src/main.rs b/harmony_agent/deploy/src/main.rs new file mode 100644 index 00000000..8baab66b --- /dev/null +++ b/harmony_agent/deploy/src/main.rs @@ -0,0 +1,63 @@ +use harmony::{ + inventory::Inventory, + modules::{ + application::{ + ApplicationScore, + backend_app::{BackendApp, BuildCommand}, + features::{Monitoring, PackagingDeployment}, + }, + monitoring::alert_channel::discord_alert_channel::DiscordWebhook, + }, + topology::K8sAnywhereTopology, +}; +use harmony_macros::hurl; +use harmony_types::k8s_name::K8sName; +use std::{path::PathBuf, sync::Arc}; + +#[tokio::main] +async fn main() { + let application = Arc::new(BackendApp { + name: "harmony-agent".to_string(), + // Since harmony_agent is part of the harmony workspace, the actual "project root" + // is not harmony_agent folder but the workspace root. + // + // So using ../ here means we MUST run this deployment script from the harmony_agent + // folder + project_root: PathBuf::from("../"), + network_ports: vec![], + env_vars: vec![ + ("NATS_URL".to_string(), "nats://nats".to_string()), + ("DESIRED_PRIMARY".to_string(), "site-1".to_string()), + ("MY_CLUSTER_ID".to_string(), "site-1".to_string()), + ("NATS_CREDS_PATH".to_string(), "".to_string()), + ], + build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]), + dockerfile: Some(PathBuf::from("Dockerfile")), + }); + + let app = ApplicationScore { + features: vec![ + Box::new(PackagingDeployment { + application: application.clone(), + }), + Box::new(Monitoring { + application: application.clone(), + alert_receiver: vec![Box::new(DiscordWebhook { + name: K8sName("test-discord".to_string()), + url: hurl!("https://discord.doesnt.exist.com"), + selectors: vec![], + })], + }), + ], + application, + }; + + harmony_cli::run( + Inventory::autoload(), + K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster + vec![Box::new(app)], + None, + ) + .await + .unwrap(); +} diff --git a/harmony_agent/src/agent/config.rs b/harmony_agent/src/agent/config.rs new file mode 100644 index 00000000..86b731cf --- /dev/null +++ b/harmony_agent/src/agent/config.rs @@ -0,0 +1,79 @@ +use std::time::Duration; + +use harmony_types::id::Id; +use log::info; + +use super::heartbeat::HeartbeatFailure; +use super::role::AgentRole; + +#[derive(Debug, Clone)] +pub struct AgentConfig { + /// Number of consecutive successful heartbeats required before the service transitions from + /// failed to healthy. + pub success_threshold: usize, + /// Number of consecutive failed heartbeats required before the service transitions from + /// healthy to failed. 
+    pub failure_threshold: usize,
+    /// Time between each heartbeat. If a heartbeat takes longer than this, it will be
+    /// considered failed.
+    pub heartbeat_interval: Duration,
+    /// Time since last observed primary heartbeat before replica considers primary stale.
+    /// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
+    /// to avoid split brain during network partitions.
+    pub failover_timeout: Duration,
+    /// **UNSTABLE FIELD**
+    ///
+    /// For now, an agent instance only serves one deployment. This is probably fine as an agent's
+    /// footprint is low, but managing multiple deployments in a single instance would be a
+    /// significant resource usage reduction.
+    ///
+    /// Decoupling the deployment of the agent from the application's deployment could make things
+    /// more complicated though, where we would have to be careful about version compatibility
+    /// between all components managed by the agent instance. So for now it is a 1-1 map.
+    ///
+    /// But I have a feeling this could change so I am marking this field unstable to warn you, the
+    /// reader.
+    pub deployment_config_unstable: DeploymentConfig,
+    pub nats_url: String,
+    pub nats_creds_path: Option<String>,
+    pub agent_id: Id,
+    pub cluster_id: Id,
+    pub desired_primary_id: Id,
+    /// The role this agent plays (Primary or Replica)
+    pub role: AgentRole,
+}
+
+#[derive(Debug, Clone)]
+pub enum DeploymentConfig {
+    FailoverPostgreSQL(FailoverCNPGConfig),
+}
+
+#[derive(Debug, Clone)]
+pub struct FailoverCNPGConfig {
+    pub cnpg_cluster_name: String,
+}
+
+impl DeploymentConfig {
+    /// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
+    pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
+        match self {
+            DeploymentConfig::FailoverPostgreSQL(cfg) => {
+                info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
+                // TODO: Implement actual PG check / NATS write here
+                Ok(())
+            }
+        }
+    }
+
+    /// Callback: Transitioned from Unhealthy -> Healthy
+    pub async fn on_active(&self) {
+        info!("Service is now ACTIVE (Healthy)");
+        // e.g., Remove fencing lock
+    }
+
+    /// Callback: Transitioned from Healthy -> Unhealthy
+    pub async fn on_failover(&self) {
+        info!("Service is now FAILED (Unhealthy)");
+        // e.g., Initiate self-fencing, stop accepting traffic
+    }
+}
diff --git a/harmony_agent/src/agent/heartbeat.rs b/harmony_agent/src/agent/heartbeat.rs
new file mode 100644
index 00000000..5e9fc36f
--- /dev/null
+++ b/harmony_agent/src/agent/heartbeat.rs
@@ -0,0 +1,35 @@
+use harmony_types::id::Id;
+use serde::{Deserialize, Serialize};
+
+use crate::store::KvMetadata;
+
+/// Agent-provided heartbeat information (no timestamps - those come from the store)
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentInfo {
+    pub agent_id: Id,
+    pub cluster_id: Id,
+    pub status: String,
+}
+
+/// Complete heartbeat with both agent data and store metadata
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct AgentHeartbeat {
+    pub agent_info: AgentInfo,
+    pub metadata: Option<KvMetadata>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
+pub struct ClusterStateData {
+    pub cluster_info: ClusterState,
+    pub metadata: Option<KvMetadata>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
+pub struct ClusterState {
+    pub cluster_id: Id,
+    pub current_primary: Option<Id>,
+    pub desired_primary: Id,
+}
+
+#[derive(Debug)]
+pub struct HeartbeatFailure {}
diff --git a/harmony_agent/src/agent/mod.rs b/harmony_agent/src/agent/mod.rs
new file mode 100644
index 00000000..3291aeaa
--- /dev/null
+++ b/harmony_agent/src/agent/mod.rs
@@ -0,0 +1,507 @@
+use std::time::{SystemTime, UNIX_EPOCH};
+use std::{str::FromStr, sync::Arc, time::Duration};
+
+use harmony_types::id::Id;
+use log::{debug, error, info, trace, warn};
+use tokio::sync::RwLock;
+use tokio::time::{Instant, sleep};
+
+use crate::agent::heartbeat::ClusterState;
+use crate::store::{KvMetadata, KvStore, KvStoreError};
+use crate::workflow::HeartbeatWorkflow;
+use crate::workflow::primary::PrimaryWorkflow;
+use crate::workflow::replica::ReplicaWorkflow;
+
+// Submodules
+mod config;
+pub mod heartbeat;
+mod role;
+
+// Re-exports for backwards compatibility
+pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
+pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
+pub use role::AgentRole;
+
+pub async fn launch_agent<S>(
+    role: AgentRole,
+    health_kv: Arc<S>,
+    cluster_kv: Arc<S>,
+    heartbeat_interval: Duration,
+    failover_timeout: Duration,
+) -> Result<(), Box<dyn std::error::Error>>
+where
+    S: KvStore + Send + Sync + 'static,
+{
+    // Cheap ass fix when we boot two agents at the same time and the store does not exist: delay
+    // one so they don't crash because of the race
+    match role {
+        AgentRole::Primary => {}
+        AgentRole::Replica => {
+            sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    let my_agent_name = format!("agent-{}", role);
+    let my_agent_id = Id::from_str(&my_agent_name).unwrap();
+
+    let config = AgentConfig {
+        role,
+        success_threshold: 2,
+        failure_threshold: 2,
+        heartbeat_interval,
+        failover_timeout,
+        deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
+            cnpg_cluster_name: String::from("cnpg_cluster_name"),
+        }),
+        nats_url: String::new(),
+        nats_creds_path: None,
+        agent_id: my_agent_id,
+        cluster_id: "cluster_test_id".into(),
+        desired_primary_id: "primary_id".into(),
+    };
+
+    log::info!("Harmony Agent Initialized");
+    log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
+    log::info!("Full config : {:?}", config);
+
+    // TODO load store based on config, default to nats
+    // probably a good use case for a factory pattern
+
+    let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);
+
+    agent.reconcile_startup().await?;
+
+    // Run the heartbeat loop
+    agent.run_heartbeat_loop().await;
+
+    Ok(())
+}
+
+pub struct HarmonyAgent<S: KvStore> {
+    pub config: AgentConfig,
+    workflow: Box<dyn HeartbeatWorkflow>,
+    health_kv: Arc<S>,
+    cluster_kv: Arc<S>,
+    /// Last successful heartbeat, used to track sequence number for next write
+    /// This avoids doing a GET before every SET, reducing network round-trips
+    last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
+    /// Local copy of cluster state, updated via subscription
+    /// This allows workflows to make decisions without querying NATS each time
+    cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
+}
+
+impl<S: KvStore> HarmonyAgent<S> {
+    pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
+        let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
+            AgentRole::Primary => {
+                info!("Initializing agent as PRIMARY");
+                Box::new(PrimaryWorkflow::new(
+                    config.success_threshold,
+                    config.failure_threshold,
+                    config.deployment_config_unstable.clone(),
+                ))
+            }
+            AgentRole::Replica => {
+                info!("Initializing agent as REPLICA");
+                Box::new(ReplicaWorkflow::new(
+                    config.success_threshold,
+                    config.failure_threshold,
+                    config.cluster_id.clone(),
+                    config.desired_primary_id.clone(),
+                    config.agent_id.clone(),
+                    config.failover_timeout,
+                ))
+            }
+        };
+
+        Self {
+            config,
+            workflow,
+            health_kv,
+            cluster_kv,
+            last_heartbeat: Arc::new(RwLock::new(None)),
+            cluster_state: Arc::new(RwLock::new(None)),
+        }
+    }
+
+    /// Generic helper to fetch and deserialize data from KV store
+    /// Returns Ok(Some(data)) if key exists and deserializes successfully
+    /// Returns Ok(None) if key doesn't exist
+    /// Returns Err if deserialization fails or other errors occur
+    async fn fetch_from_store<D>(
+        &self,
+        store: &Arc<S>,
+        key: &str,
+    ) -> Result<Option<(D, KvMetadata)>, KvStoreError>
+    where
+        D: serde::de::DeserializeOwned,
+    {
+        debug!("Fetching data from key: {}", key);
+
+        let result = store.get(key).await;
+        debug!("Got result from store: {:#?}", result);
+
+        match result {
+            Ok(kv_result) => {
+                if let Some(value) = kv_result.value {
+                    match serde_json::from_value::<D>(value.clone()) {
+                        Ok(data) => Ok(Some((data, kv_result.metadata))),
+                        Err(e) => {
+                            log::warn!("Failed to deserialize data from key {}: {}", key, e);
+                            Err(KvStoreError::DeserializationFailed {
+                                deserialization_error: format!(
+                                    "Key exists but deserialization failed for {key}: {e}"
+                                ),
+                                value: value.to_string(),
+                            })
+                        }
+                    }
+                } else {
+                    Err(KvStoreError::Unknown(format!(
+                        "Key exists but value is empty for {key}, this should not happen"
+                    )))
+                }
+            }
+            Err(KvStoreError::KeyNotAvailable(_)) => {
+                debug!("Key {} not found in store", key);
+                Ok(None)
+            }
+            Err(e) => {
+                log::warn!("Failed to fetch data from key {}: {}", key, e);
+                Err(e)
+            }
+        }
+    }
+
+    /// Reconcile startup state by fetching cluster state and heartbeat from the store
+    /// This allows the workflow to determine if it should resume as Primary/Replica
+    /// based on the persisted cluster state
+    pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
+        let cluster_key = format!("cluster.{}", self.config.cluster_id);
+
+        debug!(
+            "Fetching cluster state for startup reconciliation from key: {}",
+            cluster_key
+        );
+
+        let cluster_state_option = match self
+            .fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
+            .await?
+        {
+            Some((data, metadata)) => Some(ClusterStateData {
+                cluster_info: data,
+                metadata: Some(metadata),
+            }),
+            None => {
+                debug!(
+                    "Cluster state key not found, this is a fresh cluster, initializing cluster state"
+                );
+                Some(self.store_cluster_state(None).await?)
+            }
+        };
+
+        debug!("Found cluster state {cluster_state_option:#?}");
+        self.workflow
+            .on_startup(cluster_state_option.as_ref(), &self.config)
+            .await;
+
+        // Cache the cluster state locally
+        *self.cluster_state.write().await = cluster_state_option;
+
+        // Fetch last heartbeat if it exists to avoid sequence conflicts
+        let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
+        debug!("Fetching last heartbeat from key: {}", heartbeat_key);
+
+        let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
+
+        let last_heartbeat = match last_heartbeat_option {
+            Ok(kv_result) => {
+                let value = kv_result
+                    .value
+                    .expect("When the key exists it should always contain data");
+                Some(AgentHeartbeat {
+                    agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
+                        |e| KvStoreError::DeserializationFailed {
+                            deserialization_error: e.to_string(),
+                            value: value.to_string(),
+                        },
+                    )?,
+                    metadata: Some(kv_result.metadata),
+                })
+            }
+            Err(e) => match e {
+                KvStoreError::KeyNotAvailable(_) => None,
+                _ => return Err(e),
+            },
+        };
+        if let Some(heartbeat) = &last_heartbeat {
+            debug!(
+                "Found existing heartbeat with sequence: {}",
+                heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
+            );
+        } else {
+            debug!("No existing heartbeat found, starting fresh");
+        }
+
+        // Cache the last heartbeat for sequence tracking
+        *self.last_heartbeat.write().await = last_heartbeat;
+
+        Ok(())
+    }
+
+    async fn store_cluster_state(
+        &self,
+        cluster_data: Option<ClusterStateData>,
+    ) -> Result<ClusterStateData, KvStoreError> {
+        let key = format!("cluster.{}", self.config.cluster_id);
+        match cluster_data {
+            Some(cluster_data) => {
+                debug!("found some cluster state {:#?}", cluster_data);
+
+                let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
+                    KvStoreError::DeserializationFailed {
+                        deserialization_error: e.to_string(),
+                        value: format!("{:?}", cluster_data),
+                    }
+                })?;
+
+                let expected_sequence = {
+                    let last = self.cluster_state.read().await;
+                    last.as_ref()
+                        .and_then(|hb| hb.metadata.as_ref())
+                        .map(|m| m.sequence)
+                        .unwrap_or(0)
+                };
+
+                debug!("expected sequence {:#?}", expected_sequence);
+                let new_seq = self
+                    .cluster_kv
+                    .set_strict(&key, value, expected_sequence)
+                    .await?;
+
+                let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
+                debug!("cluster kv {:#?}", cluster_kv_result);
+
+                let cluster_data_new = ClusterStateData {
+                    cluster_info: cluster_data.cluster_info.clone(),
+                    metadata: Some(cluster_kv_result.metadata),
+                };
+
+                *self.cluster_state.write().await = Some(cluster_data_new.clone());
+                Ok(cluster_data_new)
+            }
+            None => {
+                let cluster_info = ClusterState {
+                    cluster_id: self.config.cluster_id.clone(),
+                    current_primary: None,
+                    desired_primary: self.config.desired_primary_id.clone(),
+                };
+
+                let value = serde_json::to_value(&cluster_info).map_err(|e| {
+                    KvStoreError::DeserializationFailed {
+                        deserialization_error: e.to_string(),
+                        value: format!("{:?}", cluster_info),
+                    }
+                })?;
+
+                let cluster_data = ClusterStateData {
+                    cluster_info,
+                    metadata: None,
+                };
+
+                let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
+
+                let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
+                debug!("cluster kv {:#?}", cluster_kv_result);
+
+                let cluster_data_new = ClusterStateData {
+                    cluster_info: cluster_data.cluster_info.clone(),
+                    metadata: Some(cluster_kv_result.metadata),
+                };
+
+                *self.cluster_state.write().await = Some(cluster_data_new.clone());
+                Ok(cluster_data_new)
+            }
+        }
+    }
+
+    /// Sends agent heartbeat to the KV store
+    ///
+    /// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
+    /// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
+    /// comparisons use the store's clock, not agent clocks.
+    ///
+    /// This method uses the last successful heartbeat's sequence number to avoid an extra
+    /// GET call before each SET, reducing network round-trips and latency exposure.
+    async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
+        let key = format!("heartbeat.{}", self.config.agent_id);
+
+        // Create agent info WITHOUT timestamp - the store will add metadata
+        // Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
+        let agent_info = AgentInfo {
+            agent_id: self.config.agent_id.clone(),
+            cluster_id: self.config.cluster_id.clone(),
+            status: self.workflow.state_name().to_string(),
+        };
+
+        debug!("Storing heartbeat for agent: {}", self.config.agent_id);
+        let value =
+            serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: format!("{:?}", agent_info),
+            })?;
+
+        let expected_sequence = {
+            let last = self.last_heartbeat.read().await;
+            last.as_ref()
+                .and_then(|hb| hb.metadata.as_ref())
+                .map(|m| m.sequence)
+                .unwrap_or(0)
+        };
+
+        trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}");
+        let new_seq = self
+            .health_kv
+            .set_strict(&key, value, expected_sequence)
+            .await?;
+        trace!("Got new sequence {new_seq}");
+        let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
+
+        debug!("Heartbeat stored successfully with sequence: {}", new_seq);
+
+        // Construct complete heartbeat with metadata from store
+        let heartbeat = AgentHeartbeat {
+            agent_info,
+            metadata: Some(kv_result.metadata),
+        };
+
+        // Cache this successful heartbeat for next iteration
+        *self.last_heartbeat.write().await = Some(heartbeat.clone());
+
+        Ok(heartbeat)
+    }
+
+    pub async fn run_heartbeat_loop(&mut self) {
+        let mut next_heartbeat_start;
+        loop {
+            let this_heartbeat_start = Instant::now();
+            next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
+
+            // Perform the check via the config/strategy with a timeout
+            //
+            // FIXME There is too much stuff happening inside the timeout. There are some things,
+            // like a promotion, that we don't want to cancel within a single heartbeat interval
+            // timeout. I think that the timeout should only apply to the store_heartbeat().await
+            // call. Logic happening after should not be affected in the exact same manner. There
+            // can be other timeouts or other stuff to consider here.
+            // However, the system does rely on heartbeats happening regularly, so we do not want
+            // to delay the next heartbeat either. This is tricky.
+            // An idea right now is to keep the heartbeat running but, when a processing event
+            // occurs, set a flag on the local agent that there is a process running (promotion,
+            // demotion, etc) and take no other decision until this process is done. There is
+            // one exception we can think of right now:
+            // - a healthy primary starts running a process such as "calling mom"
+            // - the primary keeps sending its heartbeat to prove to the rest of the cluster that
+            //   it is still healthy
+            // - then the primary heartbeat fails up to failure_threshold
+            // - at this moment the "calling mom" process must not prevent the primary from
+            //   fencing itself. Otherwise the replica that promotes itself when it realises that
+            //   the primary is dead will cause a split brain.
+            // - Another solution would be to register the running process, e.g. "calling mom",
+            //   in the primary heartbeat store, and prevent the replica from promoting when
+            //   there is a running task on the primary.
+            let result = tokio::time::timeout(self.config.heartbeat_interval, async {
+                // Store heartbeat and perform deployment-specific health check
+                match &self.store_heartbeat().await {
+                    Ok(heartbeat) => {
+                        // Heartbeat stored successfully, already cached by store_heartbeat
+                        debug!(
+                            "Heartbeat stored: seq={}",
+                            heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
+                        );
+                    }
+                    Err(KvStoreError::WrongLastRevision) => {
+                        todo!("fetch and update correct last sequence number")
+                        // CAS failure could indicate:
+                        // 1. Network latency: our previous timeout heartbeat actually succeeded
+                        // 2. Agent ID conflict: another agent with same ID exists
+                        // 3. Clock/bucket corruption (unlikely)
+
+                        // log::warn!(
+                        //     "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
+                        //     self.config.agent_id, expected, current, current
+                        // );
+                        // // Update cached heartbeat sequence to prevent repeated failures
+                        // if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
+                        //     if let Some(metadata) = hb.metadata.as_mut() {
+                        //         metadata.sequence = *current;
+                        //     }
+                        // }
+                    }
+                    Err(e) => {
+                        // Actual storage failure - treat as heartbeat failure
+                        log::error!("Heartbeat storage error: {}", e);
+                        return Err(HeartbeatFailure {});
+                    }
+                }
+                self.config
+                    .deployment_config_unstable
+                    .perform_heartbeat()
+                    .await?;
+
+                // TODO: Pass the heartbeat with metadata to the workflow for staleness checks
+                // The workflow needs access to metadata.timestamp for failover timeout calculations
+                Ok::<(), HeartbeatFailure>(())
+            })
+            .await;
+
+            // Update counters & handle state transitions
+            // Timeout is also treated as a failure
+            let heartbeat_result = match result {
+                Ok(inner_result) => inner_result,
+                Err(_) => Err(HeartbeatFailure {}),
+            };
+
+            trace!("Got heartbeat_result : {heartbeat_result:?}");
+            match heartbeat_result {
+                Ok(_) => {
+                    let new_state = self
+                        .workflow
+                        .handle_heartbeat_success(
+                            self.cluster_state.read().await.as_ref(),
+                            &self.config,
+                        )
+                        .await;
+                    if let Some(new_state) = new_state {
+                        warn!("Got new cluster state : {new_state:#?}");
+                        self.store_cluster_state(Some(new_state))
+                            .await
+                            .expect("cluster state could not be stored");
+                    }
+                }
+                Err(_) => {
+                    self.workflow
+                        .handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
+                        .await;
+                }
+            }
+
+            info!(
+                "Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
+                success_threshold = self.config.success_threshold,
+                failure_threshold = self.config.failure_threshold,
+                state = self.workflow.state_name(),
+                consecutive_successes = self.workflow.consecutive_successes(),
+                consecutive_failures = self.workflow.consecutive_failures(),
+                heartbeat_emoji = if heartbeat_result.is_ok() {
+                    "✅"
+                } else {
+                    "❌"
+                },
+                heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
+            );
+            debug!(
+                "Sleeping for {} ms before next heartbeat",
+                (next_heartbeat_start - Instant::now()).as_millis()
+            );
+            tokio::time::sleep_until(next_heartbeat_start).await;
+        }
+    }
+}
diff --git a/harmony_agent/src/agent/role.rs b/harmony_agent/src/agent/role.rs
new file mode 100644
index 00000000..e9b719cf
--- /dev/null
+++ b/harmony_agent/src/agent/role.rs
@@ -0,0 +1,17 @@
+use std::fmt;
+
+/// The role of this agent instance
+#[derive(Debug, Clone, PartialEq)]
+pub enum AgentRole {
+    Primary,
+    Replica,
+}
+
+impl fmt::Display for AgentRole {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            AgentRole::Primary => write!(f, "primary"),
+            AgentRole::Replica => write!(f, "replica"),
+        }
+    }
+}
diff --git a/harmony_agent/src/config.rs b/harmony_agent/src/config.rs
new file mode 100644
index 00000000..394a774d
--- /dev/null
+++ b/harmony_agent/src/config.rs
@@ -0,0 +1,90 @@
+use harmony_types::id::Id;
+use log::debug;
+use std::env;
+use std::path::Path;
+use std::time::Duration;
+
+/// Configuration for the Harmony Agent
+#[derive(Debug, Clone)]
+pub struct AgentConfig {
+    pub nats_url: String,
+    pub nats_creds_path: Option<String>,
+    pub my_cluster_id: Id,
+    pub desired_primary: Id,
+    pub heartbeat_interval: Duration,
+}
+
+pub const NATS_URL: &str = "NATS_URL";
+pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
+pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
+pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
+
+impl AgentConfig {
+    pub fn load_from_env() -> Result<Self, String> {
+        let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
+
+        // Validate NATS URL is not empty
+        if nats_url.is_empty() {
+            return Err(format!("{NATS_URL} cannot be empty"));
+        }
+
+        // Validate NATS URL format
+        if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
+            return Err(format!(
+                "Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
+                nats_url
+            ));
+        }
+
+        let nats_creds_path = env::var(NATS_CREDS_PATH)
+            .ok()
+            .filter(|creds_path| !creds_path.is_empty());
+
+        // Validate NATS creds path if provided
+        if let Some(creds_path) = &nats_creds_path {
+            debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
+            let path = Path::new(creds_path);
+            if !path.exists() {
+                return Err(format!(
+                    "NATS credentials file does not exist: {}",
+                    creds_path
+                ));
+            }
+            if !path.is_file() {
+                return Err(format!(
+                    "NATS credentials path is not a file: {}",
+                    creds_path
+                ));
+            }
+            // Check if file is readable by attempting to read metadata
+            if std::fs::metadata(path).is_err() {
+                return Err(format!(
+                    "NATS credentials file is not readable: {}",
+                    creds_path
+                ));
+            }
+        }
+
+        let my_cluster_id_str = env::var(MY_CLUSTER_ID)
+            .map_err(|_| format!("Environment variable {MY_CLUSTER_ID} is required"))?;
+
+        if my_cluster_id_str.is_empty() {
+            return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
+        }
+
+        let desired_primary_str = env::var(DESIRED_PRIMARY)
+            .map_err(|_| format!("Environment variable {DESIRED_PRIMARY} is required"))?;
+
+        if desired_primary_str.is_empty() {
+            return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
+        }
+
+        Ok(Self {
+            nats_url,
+            nats_creds_path,
+            my_cluster_id: my_cluster_id_str.into(),
+            desired_primary: desired_primary_str.into(),
+            heartbeat_interval: Duration::from_millis(1000),
+        })
+    }
+}
diff --git a/harmony_agent/src/main.rs b/harmony_agent/src/main.rs
new file mode 100644
index 00000000..a5947c22
--- /dev/null
+++ b/harmony_agent/src/main.rs
@@ -0,0 +1,82 @@
+use std::{sync::Arc, time::Duration};
+
+use crate::{
+    agent::AgentRole,
+    store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
+};
+
+// mod agent_loop;
+mod agent;
+pub mod store;
+mod workflow;
+
+#[tokio::main]
+async fn main() {
+    env_logger::init();
+
+    let heartbeat_interval
= Duration::from_millis(2000); + let failover_timeout = Duration::from_secs(10); + + // let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout); + + let nats_store = get_local_nats_store().await; + let health_kv = nats_store.clone(); + let cluster_kv = nats_store.clone(); + + let _ = tokio::join!( + agent::launch_agent( + AgentRole::Primary, + health_kv.clone(), + cluster_kv.clone(), + heartbeat_interval, + failover_timeout + ), + agent::launch_agent( + AgentRole::Replica, + health_kv, + cluster_kv, + heartbeat_interval, + failover_timeout + ), + ); +} + +fn get_chaos_store( + heartbeat_interval: &Duration, + failover_timeout: &Duration, +) -> ( + Arc>, + Arc>, +) { + let health_kv = Arc::new(ChaosKvStore::new( + InMemoryKvStore::new(), + 10, + 10, + heartbeat_interval.as_millis().try_into().unwrap(), + )); + let cluster_kv = Arc::new(ChaosKvStore::new( + InMemoryKvStore::new(), + 5, + 5, + failover_timeout.as_millis().try_into().unwrap(), + )); + + (health_kv, cluster_kv) +} + +async fn get_local_nats_store() -> Arc { + let client = async_nats::connect("localhost").await.unwrap(); + let jetstream = async_nats::jetstream::new(client); + let kv = jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: "kv".to_string(), + history: 10, + ..Default::default() + }) + .await + .unwrap(); + let status = kv.status().await.unwrap(); + println!("status: {:?}", status); + + Arc::new(NatsKvStore::new(kv)) +} diff --git a/harmony_agent/src/store/chaos.rs b/harmony_agent/src/store/chaos.rs new file mode 100644 index 00000000..402cf8f7 --- /dev/null +++ b/harmony_agent/src/store/chaos.rs @@ -0,0 +1,142 @@ +use async_trait::async_trait; +use log::{debug, trace, warn}; +use serde_json::Value; +use std::sync::Arc; +use tokio::time::Duration; + +use crate::store::SubscriptionCallback; + +use super::{KvStore, KvStoreError}; + +/// A chaos testing KV store that randomly times out or fails +/// Wraps another KvStore implementation and adds random failures +#[derive(Clone)] +pub struct ChaosKvStore { + inner: Arc, + timeout_probability_percent: u32, + failure_probability_percent: u32, + max_delay_ms: u64, +} + +impl ChaosKvStore { + pub fn new( + inner: T, + timeout_probability_percent: u32, + failure_probability_percent: u32, + max_delay_ms: u64, + ) -> Self { + Self { + inner: Arc::new(inner), + timeout_probability_percent, + failure_probability_percent, + max_delay_ms, + } + } + + async fn maybe_chaos(&self) -> Result<(), KvStoreError> { + trace!("Calculating chaos"); + // Random delay + let delay = getrandom::u64().unwrap() % self.max_delay_ms; + let delay = Duration::from_millis(delay); + trace!("Sleeping until chaos maybe happens {delay:?}"); + tokio::time::sleep(delay).await; + + // Random failure + let failure_random = getrandom::u32().unwrap() % 100; + if failure_random < self.failure_probability_percent { + warn!( + "Chaos causes an error : {failure_random} < {}", + self.failure_probability_percent + ); + return Err(KvStoreError::Unknown(format!( + "Randomly failed thanks to chaos store with {}% chances, got {}", + self.failure_probability_percent, failure_random + ))); + } + + // Random timeout (simulated as a very long delay) + let failure_random = getrandom::u32().unwrap() % 100; + if failure_random < self.timeout_probability_percent { + warn!( + "Chaos caused a timeout : {failure_random} < {}", + self.failure_probability_percent + ); + tokio::time::sleep(Duration::from_secs(189754678456784560)).await; + } + + Ok(()) + } +} + +#[async_trait] +impl 
KvStore for ChaosKvStore { + async fn get(&self, key: &str) -> Result { + self.maybe_chaos().await?; + self.inner.get(key).await + } + + async fn get_revision( + &self, + key: &str, + expected_seq: u64, + ) -> Result { + self.maybe_chaos().await?; + self.inner.get_revision(key, expected_seq).await + } + + async fn set_strict( + &self, + key: &str, + value: Value, + expected_sequence: u64, + ) -> Result { + self.maybe_chaos().await?; + self.inner.set_strict(key, value, expected_sequence).await + } + + async fn subscribe( + &self, + key: &str, + callback: SubscriptionCallback, + ) -> Result<(), KvStoreError> { + self.maybe_chaos().await?; + self.inner.subscribe(key, callback).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::store::InMemoryKvStore; + use serde_json::json; + + #[tokio::test] + async fn test_chaos_store_with_no_chaos() { + let inner = InMemoryKvStore::new(); + let chaos = ChaosKvStore::new(inner, 0, 0, 1); + + let value = json!({"test": "value"}); + let result = chaos.set_strict("key", value.clone(), 0).await.unwrap(); + assert_eq!(result, 1); + + let retrieved = chaos.get("key").await.unwrap(); + assert_eq!(retrieved.value, Some(value)); + } + + #[tokio::test] + async fn test_chaos_store_with_delay() { + let inner = InMemoryKvStore::new(); + let chaos = ChaosKvStore::new(inner, 0, 0, 100); + + let start = tokio::time::Instant::now(); + let value = json!({"test": "value"}); + chaos.set_strict("key", value, 0).await.unwrap(); + let elapsed = start.elapsed(); + + // Should have some delay + assert!( + elapsed.as_millis() < 150, + "Should complete within reasonable time" + ); + } +} diff --git a/harmony_agent/src/store/memory.rs b/harmony_agent/src/store/memory.rs new file mode 100644 index 00000000..12afc51c --- /dev/null +++ b/harmony_agent/src/store/memory.rs @@ -0,0 +1,196 @@ +use async_trait::async_trait; +use log::{debug, trace}; +use serde_json::Value; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; +use tokio::sync::RwLock; + +use crate::store::SubscriptionCallback; + +use super::{KvMetadata, KvResult, KvStore, KvStoreError}; + +/// An in-memory KV store that guarantees ordering like NATS JetStream +/// Each key maintains a full history of all writes, where the sequence number +/// is the length of the history (1-indexed) +#[derive(Clone)] +pub struct InMemoryKvStore { + data: Arc>>>, +} + +impl InMemoryKvStore { + pub fn new() -> Self { + Self { + data: Arc::new(RwLock::new(HashMap::new())), + } + } + + /// Get the latest sequence number for a key (length of history) + pub async fn get_seq(&self, key: &str) -> Option { + self.data.read().await.get(key).map(|vec| vec.len() as u64) + } + + /// Get the value at a specific revision for a key + pub async fn get_revision(&self, key: &str, seq: u64) -> Result { + let data = self.data.read().await; + let entries = data + .get(key) + .ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?; + + // Sequence numbers are 1-indexed, so seq must be >= 1 and <= len() + if seq == 0 || seq > entries.len() as u64 { + return Err(KvStoreError::KeyNotAvailable(key.to_string())); + } + + let (value, timestamp) = entries[seq as usize - 1].clone(); + + Ok(KvResult { + value: Some(value.clone()), + metadata: KvMetadata { + timestamp, + sequence: seq, + }, + }) + } +} + +impl Default for InMemoryKvStore { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl KvStore for InMemoryKvStore { + async fn get_revision(&self, key: &str, expected_seq: u64) -> Result 
{
+        self.get_revision(key, expected_seq).await
+    }
+
+    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
+        let data = self.data.read().await;
+        let entries = data
+            .get(key)
+            .ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
+
+        let (value, timestamp) = entries.last().unwrap();
+
+        Ok(KvResult {
+            value: Some(value.clone()),
+            metadata: KvMetadata {
+                timestamp: *timestamp,
+                sequence: entries.len() as u64,
+            },
+        })
+    }
+
+    async fn set_strict(
+        &self,
+        key: &str,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError> {
+        // Check current sequence (length of history for this key)
+        let data = self.data.read().await;
+        // This implementation does not seem to match the NATS sequence. In NATS the
+        // sequence updates one counter per bucket. This impl creates a counter per key
+        let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
+        drop(data);
+
+        // Verify expected sequence matches
+        if current_sequence != expected_sequence {
+            trace!("{current_sequence} != {expected_sequence}");
+            return Err(KvStoreError::WrongLastRevision);
+        }
+
+        // Get current timestamp
+        let timestamp = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("Time went backwards")
+            .as_millis() as u64;
+
+        // Append to the history
+        let mut data = self.data.write().await;
+        data.entry(key.to_string())
+            .or_insert_with(Vec::new)
+            .push((value.clone(), timestamp));
+
+        let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
+
+        debug!(
+            "Successfully inserted {key}(rev#{new_seq}) : {value}",
+            value = value.to_string()
+        );
+
+        Ok(new_seq)
+    }
+
+    async fn subscribe(
+        &self,
+        key: &str,
+        callback: SubscriptionCallback,
+    ) -> Result<(), KvStoreError> {
+        // For now, subscribe just returns the current value
+        // In a real implementation, this would return a stream of updates
+        let _ = self.get(key).await;
+        todo!() // register callback and call it when key is set?
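+        // A possible implementation sketch (hypothetical, untested): keep a
+        // registry next to `data`, e.g.
+        //
+        //     subscribers: Arc<RwLock<HashMap<String, Vec<SubscriptionCallback>>>>
+        //
+        // `subscribe` would push `callback` into `subscribers[key]`, and
+        // `set_strict` would, after a successful append, invoke every callback
+        // registered for that key:
+        //
+        //     for cb in subs.get(key).into_iter().flatten() {
+        //         cb(key.to_string(), Some(value.clone()), metadata.clone());
+        //     }
+        //
+        // `KvMetadata` derives `Clone`, so handing each subscriber its own
+        // copy of the metadata is cheap.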
+ } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[tokio::test] + async fn test_memory_store_basic() { + let store = InMemoryKvStore::new(); + + // Set a value + let value = json!({"status": "healthy"}); + let result = store + .set_strict("test_key", value.clone(), 0) + .await + .unwrap(); + assert_eq!(result, 1); + + // Get the value + let retrieved = store.get("test_key").await.unwrap(); + assert_eq!(retrieved.value, Some(value)); + assert_eq!(retrieved.metadata.sequence, 1); + } + + #[tokio::test] + async fn test_memory_store_sequence_numbers() { + let store = InMemoryKvStore::new(); + + let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap(); + + let seq2 = store.set_strict("key1", json!("value2"), 1).await.unwrap(); + + assert!(seq2 > seq1, "Sequence numbers should increment"); + } + + #[tokio::test] + async fn test_memory_store_key_not_found() { + let store = InMemoryKvStore::new(); + let result = store.get("nonexistent").await; + assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_)))); + } + + #[tokio::test] + async fn test_memory_store_strict_ordering() { + let store = InMemoryKvStore::new(); + + // First write with sequence 0 + let result1 = store.set_strict("key", json!("value1"), 0).await.unwrap(); + assert_eq!(result1, 1); + + // Second write with correct sequence + let result2 = store.set_strict("key", json!("value2"), 1).await.unwrap(); + assert_eq!(result2, 2); + + // Third write with wrong sequence should fail + let result3 = store.set_strict("key", json!("value3"), 1).await; + assert!(matches!(result3, Err(KvStoreError::WrongLastRevision))); + } +} diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs new file mode 100644 index 00000000..617df34c --- /dev/null +++ b/harmony_agent/src/store/mod.rs @@ -0,0 +1,120 @@ +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use thiserror::Error; + +/// Handle for managing active subscriptions +#[derive(Debug, Clone)] +pub struct SubscriptionHandle { + id: usize, + _phantom: std::marker::PhantomData<()>, +} + +/// Metadata returned by the KV store for all operations +/// Contains timing and ordering information set by the store +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct KvMetadata { + /// Timestamp set by the store (milliseconds since UNIX epoch) + pub timestamp: u64, + /// Sequence number for strict ordering guarantees + pub sequence: u64, +} + +/// Result returned by KV store operations +/// Contains both the value (if any) and store metadata +#[derive(Debug, Clone)] +pub struct KvResult { + /// The value from the store (None if key doesn't exist) + pub value: Option, + /// Store-provided metadata (timestamp, sequence) + pub metadata: KvMetadata, +} + +/// Callback type for subscription updates +/// Callback receives: key, new value (None if deleted), and metadata +pub type SubscriptionCallback = Box, KvMetadata) + Send + Sync>; + +#[derive(Error, Debug)] +pub enum KvStoreError { + #[error("data store disconnected")] + Disconnect(#[from] std::io::Error), + #[error("invalid key")] + InvalidKey, + #[error("operation timed out")] + Timeout, + #[error("the data for key `{0}` is not available")] + KeyNotAvailable(String), + #[error("Failed to deserialize value to json. 
diff --git a/harmony_agent/src/store/mod.rs b/harmony_agent/src/store/mod.rs
new file mode 100644
index 00000000..617df34c
--- /dev/null
+++ b/harmony_agent/src/store/mod.rs
@@ -0,0 +1,120 @@
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use thiserror::Error;
+
+/// Handle for managing active subscriptions
+#[derive(Debug, Clone)]
+pub struct SubscriptionHandle {
+    id: usize,
+    _phantom: std::marker::PhantomData<()>,
+}
+
+/// Metadata returned by the KV store for all operations
+/// Contains timing and ordering information set by the store
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
+pub struct KvMetadata {
+    /// Timestamp set by the store (milliseconds since UNIX epoch)
+    pub timestamp: u64,
+    /// Sequence number for strict ordering guarantees
+    pub sequence: u64,
+}
+
+/// Result returned by KV store operations
+/// Contains both the value (if any) and store metadata
+#[derive(Debug, Clone)]
+pub struct KvResult {
+    /// The value from the store (None if key doesn't exist)
+    pub value: Option<Value>,
+    /// Store-provided metadata (timestamp, sequence)
+    pub metadata: KvMetadata,
+}
+
+/// Callback type for subscription updates
+/// Callback receives: key, new value (None if deleted), and metadata
+pub type SubscriptionCallback = Box<dyn Fn(&str, Option<Value>, KvMetadata) + Send + Sync>;
+
+#[derive(Error, Debug)]
+pub enum KvStoreError {
+    #[error("data store disconnected")]
+    Disconnect(#[from] std::io::Error),
+    #[error("invalid key")]
+    InvalidKey,
+    #[error("operation timed out")]
+    Timeout,
+    #[error("the data for key `{0}` is not available")]
+    KeyNotAvailable(String),
+    #[error("Failed to deserialize value to json. Error: {0}, value: {1}", .deserialization_error, .value)]
+    DeserializationFailed {
+        deserialization_error: String,
+        value: String,
+    },
+    #[error("Strict ordering violation, wrong last sequence number")]
+    WrongLastRevision,
+    #[error("unknown data store error {0}")]
+    Unknown(String),
+}
+
+#[async_trait]
+pub trait KvStore {
+    /// Get a value from the store
+    ///
+    /// # Returns
+    /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
+    /// - `Err(KeyNotAvailable)`: If the key doesn't exist
+    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;
+
+    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;
+
+    /// Strict set operation with compare-and-set semantics
+    ///
+    /// Sets the value only if the current sequence number matches `expected_sequence`.
+    /// This provides the strict ordering guarantees needed for the failover algorithm.
+    ///
+    /// # Parameters
+    /// - `key`: The key to set
+    /// - `value`: The value to store
+    /// - `expected_sequence`: The sequence number we expect the key to currently have.
+    ///   Use 0 for the first write to a new key.
+    ///
+    /// # Returns
+    /// - `Ok(u64)`: Returns the new sequence number
+    /// - `Err(KvStoreError)`: If another write happened (current != expected)
+    ///
+    /// # Example Use Case
+    /// For NATS JetStream, this maps to the conditional update operation that ensures
+    /// only one agent can successfully promote to primary.
+    async fn set_strict(
+        &self,
+        key: &str,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError>;
+
+    /// Subscribe to updates for a key
+    ///
+    /// # Parameters
+    /// - `key`: The key to subscribe to
+    /// - `callback`: Function to call on each update with key, value, and metadata
+    ///
+    /// # Returns
+    /// - `Ok(())`: Subscription established successfully
+    /// - `Err(KvStoreError)`: Subscription failed
+    ///
+    /// Note: For JetStream, this should use the watch() API. Updates will invoke the
+    /// callback asynchronously in the background.
+    async fn subscribe(
+        &self,
+        key: &str,
+        // TODO: this should return an iterator/stream instead of taking a callback
+        callback: SubscriptionCallback,
+    ) -> Result<(), KvStoreError>;
+}
+
+mod chaos;
+mod memory;
+mod nats;
+
+pub use chaos::ChaosKvStore;
+pub use memory::InMemoryKvStore;
+pub use nats::NatsKvStore;
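The TODO on `subscribe` hints at an alternative shape. One possibility, sketched here only (this trait is not part of the patch, and it assumes a `futures-util` dependency), is to expose the subscription as an async stream, which also maps naturally onto JetStream's `watch()`:

```rust
use async_trait::async_trait;
use futures_util::stream::BoxStream;
use serde_json::Value;

// Hypothetical stream-based variant of `subscribe`: the caller drives the
// updates instead of handing over a callback. `KvMetadata` and `KvStoreError`
// are the types defined in the module above.
#[async_trait]
pub trait KvWatch {
    async fn watch(
        &self,
        key: &str,
    ) -> Result<BoxStream<'static, (String, Option<Value>, KvMetadata)>, KvStoreError>;
}
```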
diff --git a/harmony_agent/src/store/nats.rs b/harmony_agent/src/store/nats.rs
new file mode 100644
index 00000000..c89bc54c
--- /dev/null
+++ b/harmony_agent/src/store/nats.rs
@@ -0,0 +1,179 @@
+use async_nats::jetstream::kv::{Store, UpdateError};
+use async_trait::async_trait;
+use log::{debug, error, trace};
+use serde_json::Value;
+
+use crate::store::SubscriptionCallback;
+
+use super::{KvMetadata, KvResult, KvStore, KvStoreError};
+
+/// NATS JetStream-backed KV store
+pub struct NatsKvStore {
+    store: Store,
+}
+
+impl NatsKvStore {
+    pub fn new(store: Store) -> Self {
+        Self { store }
+    }
+
+    pub async fn create(
+        client: async_nats::Client,
+        bucket_name: &str,
+        history_size: i64,
+    ) -> Result<Self, Box<dyn std::error::Error>> {
+        let jetstream = async_nats::jetstream::new(client);
+
+        debug!("Creating NATS KV bucket: {}", bucket_name);
+        let store = jetstream
+            .create_key_value(async_nats::jetstream::kv::Config {
+                bucket: bucket_name.to_string(),
+                history: history_size,
+                ..Default::default()
+            })
+            .await
+            .map_err(|e| {
+                error!(
+                    "Failed to initialize NATS KV bucket '{}': {}",
+                    bucket_name, e
+                );
+                e
+            })?;
+
+        Ok(Self::new(store))
+    }
+}
+
+#[async_trait]
+impl KvStore for NatsKvStore {
+    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
+        let entry = self
+            .store
+            .entry_for_revision(key, expected_seq)
+            .await
+            .map_err(|e| {
+                error!("NATS get failed for key '{}': {}", key, e);
+                KvStoreError::Disconnect(std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    e.to_string(),
+                ))
+            })?;
+
+        if entry.is_none() {
+            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
+        }
+
+        let entry = entry.unwrap();
+        let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
+            KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: String::from_utf8_lossy(&entry.value).to_string(),
+            }
+        })?;
+
+        // Extract metadata from NATS entry
+        // Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
+        let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
+
+        let metadata = KvMetadata {
+            timestamp,
+            sequence: entry.revision,
+        };
+
+        Ok(KvResult {
+            value: Some(value),
+            metadata,
+        })
+    }
+
+    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
+        let entry = self.store.entry(key).await.map_err(|e| {
+            error!("NATS get failed for key '{}': {}", key, e);
+            KvStoreError::Disconnect(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                e.to_string(),
+            ))
+        })?;
+
+        if entry.is_none() {
+            return Err(KvStoreError::KeyNotAvailable(key.to_string()));
+        }
+
+        let entry = entry.unwrap();
+        let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
+            KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: String::from_utf8_lossy(&entry.value).to_string(),
+            }
+        })?;
+
+        // Extract metadata from NATS entry
+        // Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
+        let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
+
+        let metadata = KvMetadata {
+            timestamp,
+            sequence: entry.revision,
+        };
+
+        Ok(KvResult {
+            value: Some(value),
+            metadata,
+        })
+    }
+
+    async fn set_strict(
+        &self,
+        key: &str,
+        value: Value,
+        expected_sequence: u64,
+    ) -> Result<u64, KvStoreError> {
+        trace!(
+            "Nats set strict {key} (#{expected_sequence}) : {}",
+            value.to_string()
+        );
+        let bytes =
+            serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
+                deserialization_error: e.to_string(),
+                value: value.to_string(),
+            })?;
+
+        // Use update() for CAS semantics (Compare-And-Set)
+        // This ensures we only write if the revision matches expected_sequence
+        let revision = self
+            .store
+            .update(&key, bytes.into(), expected_sequence)
+            .await
+            .map_err(|e| {
+                // FIXME this is ugly, we should have a clean KvStoreError containing
+                // proper information from nats instead
+                error!("NATS update failed for key '{}': {}", key, e);
+                e
+            })?;
+
+        Ok(revision)
+    }
+
+    async fn subscribe(
+        &self,
+        key: &str,
+        // TODO: this should return an iterator/stream instead of taking a callback
+        callback: SubscriptionCallback,
+    ) -> Result<(), KvStoreError> {
+        todo!()
+    }
+}
+
+impl From<UpdateError> for KvStoreError {
+    fn from(value: UpdateError) -> Self {
+        match value.kind() {
+            async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
+            async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
+            async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
+                KvStoreError::WrongLastRevision
+            }
+            async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
+                std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
+            ),
+        }
+    }
+}
diff --git a/harmony_agent/src/workflow/mod.rs b/harmony_agent/src/workflow/mod.rs
new file mode 100644
index 00000000..8696e071
--- /dev/null
+++ b/harmony_agent/src/workflow/mod.rs
@@ -0,0 +1,39 @@
+use std::sync::Arc;
+
+use crate::agent::AgentConfig;
+use async_trait::async_trait;
+
+pub mod primary;
+pub mod replica;
+
+/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
+#[async_trait]
+pub trait HeartbeatWorkflow: Send + Sync {
+    /// Handle a successful heartbeat
+    async fn handle_heartbeat_success(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) -> Option<crate::agent::ClusterStateData>;
+
+    /// Handle a failed heartbeat
+    async fn handle_heartbeat_failure(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+    );
+
+    async fn on_startup(
+        &self,
+        cluster_state: Option<&crate::agent::heartbeat::ClusterStateData>,
+        agent_config: &AgentConfig,
+    );
+
+    /// Get the current state name for logging (also used for heartbeat status)
+    fn state_name(&self) -> &'static str;
+
+    /// Get current consecutive successes
+    fn consecutive_successes(&self) -> usize;
+
+    /// Get current consecutive failures
+    fn consecutive_failures(&self) -> usize;
+}
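An agent would drive either workflow implementation from a single heartbeat loop. A rough sketch of that driver, under stated assumptions: `fetch_cluster_state`, `check_health`, and `publish_state` are hypothetical stand-ins for the real agent plumbing, which is not part of this patch:

```rust
use crate::agent::{AgentConfig, ClusterStateData};
use crate::workflow::HeartbeatWorkflow;

// Hypothetical helpers standing in for the real agent plumbing.
async fn fetch_cluster_state() -> Option<ClusterStateData> { unimplemented!() }
async fn check_health() -> bool { unimplemented!() }
async fn publish_state(_state: ClusterStateData) { unimplemented!() }

// One tick per heartbeat interval, for either workflow implementation.
async fn heartbeat_loop<W: HeartbeatWorkflow>(mut workflow: W, config: AgentConfig) {
    loop {
        tokio::time::sleep(config.heartbeat_interval).await;
        let cluster_state = fetch_cluster_state().await;

        let desired_update = if check_health().await {
            workflow
                .handle_heartbeat_success(cluster_state.as_ref(), &config)
                .await
        } else {
            workflow.handle_heartbeat_failure(cluster_state.as_ref()).await;
            None
        };

        // A returned state is a request to CAS-write the new cluster state
        // (e.g. via `KvStore::set_strict`) so that only one agent can win.
        if let Some(new_state) = desired_update {
            publish_state(new_state).await;
        }
    }
}
```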
diff --git a/harmony_agent/src/workflow/primary.rs b/harmony_agent/src/workflow/primary.rs
new file mode 100644
index 00000000..61f25556
--- /dev/null
+++ b/harmony_agent/src/workflow/primary.rs
@@ -0,0 +1,330 @@
+use async_trait::async_trait;
+use log::{debug, info, trace, warn};
+
+use crate::{
+    agent::{AgentConfig, DeploymentConfig},
+    workflow::HeartbeatWorkflow,
+};
+
+#[derive(Debug, Clone, PartialEq)]
+pub enum PrimaryState {
+    Initializing,
+    Healthy,
+    Failed,
+    Fenced,
+    Yielding,
+}
+
+impl PrimaryState {
+    pub fn name(&self) -> &'static str {
+        match self {
+            PrimaryState::Initializing => "Primary:Initializing",
+            PrimaryState::Healthy => "Primary:Healthy",
+            PrimaryState::Failed => "Primary:Failed",
+            PrimaryState::Fenced => "Primary:Fenced",
+            PrimaryState::Yielding => "Primary:Yielding",
+        }
+    }
+}
+
+pub struct PrimaryWorkflow {
+    state: PrimaryState,
+    consecutive_successes: usize,
+    consecutive_failures: usize,
+
+    // TODO: these thresholds should not be copied into the workflow struct. They are
+    // configuration-level values and should always be read from the context passed to
+    // the workflow functions.
+    success_threshold: usize,
+    failure_threshold: usize,
+
+    // TODO: not sure whether this should be known by the workflow, passed in the
+    // context to function calls, or handled entirely by the agent?
+    deployment_config: DeploymentConfig,
+}
+
+impl PrimaryWorkflow {
+    pub fn new(
+        success_threshold: usize,
+        failure_threshold: usize,
+        deployment_config: DeploymentConfig,
+    ) -> Self {
+        Self {
+            state: PrimaryState::Initializing,
+            consecutive_successes: 0,
+            consecutive_failures: 0,
+            success_threshold,
+            failure_threshold,
+            deployment_config,
+        }
+    }
+
+    fn transition_to(&mut self, new_state: PrimaryState) {
+        if self.state != new_state {
+            info!(
+                "State transition: {} -> {}",
+                self.state.name(),
+                new_state.name()
+            );
+            self.state = new_state;
+        }
+    }
+}
+
+#[async_trait]
+impl HeartbeatWorkflow for PrimaryWorkflow {
+    async fn on_startup(
+        &self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) {
+        if let Some(state) = cluster_state {
+            info!(
+                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
+                state.cluster_info.current_primary, state.cluster_info.desired_primary
+            );
+
+            // No automatic fast-tracking - agent must earn healthy status
+            // through successful heartbeats. This prevents duplicate agents
+            // or crashloop agents from incorrectly claiming primary.
+        } else {
+            debug!("No cluster state on startup, starting from Initializing");
+        }
+    }
+
+    async fn handle_heartbeat_success(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) -> Option<crate::agent::ClusterStateData> {
+        trace!(
+            "Handling heartbeat success, current counters success {} failures {}",
+            self.consecutive_successes, self.consecutive_failures
+        );
+        self.consecutive_successes += 1;
+        self.consecutive_failures = 0;
+
+        match self.state {
+            PrimaryState::Initializing => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(PrimaryState::Healthy);
+                    // Trigger on_active callback
+                    let config = self.deployment_config.clone();
+                    tokio::spawn(async move {
+                        config.on_active().await;
+                    });
+                    if let Some(state) = cluster_state
+                        && state.cluster_info.desired_primary == agent_config.desired_primary_id
+                    {
+                        debug!("state {:#?}", state);
+                        let mut new_state = state.clone();
+                        new_state.cluster_info.current_primary =
+                            Some(agent_config.agent_id.clone());
+                        return Some(new_state);
+                    } else {
+                        todo!(
+                            "cluster_state should not be an Option, and we should return an error when running a primary workflow while this agent is not the desired primary in the cluster state data"
+                        );
+                    }
+                }
+                None
+            }
+            PrimaryState::Failed => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(PrimaryState::Healthy);
+                    let config = self.deployment_config.clone();
+                    tokio::spawn(async move {
+                        config.on_active().await;
+                    });
+                }
+                todo!()
+            }
+            PrimaryState::Healthy => {
+                // Stay healthy
+                debug!("Primary staying healthy");
+                None
+            }
+            PrimaryState::Fenced => {
+                // Recovery from fenced state
+                if self.consecutive_successes >= self.success_threshold {
+                    // TODO: Check NATS for current_primary status before recovering
+                    info!("Recovered from fenced state, transitioning to yielding");
+                    self.transition_to(PrimaryState::Yielding);
+                }
+                todo!()
+            }
+            PrimaryState::Yielding => {
+                // TODO: Check NATS to see if we can resume as primary
+                trace!("Yielding, waiting for demotion handshake");
+                todo!()
+            }
+        }
+    }
+
+    async fn handle_heartbeat_failure(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+    ) {
+        self.consecutive_failures += 1;
+        self.consecutive_successes = 0;
+
+        match self.state {
+            PrimaryState::Healthy => {
+                if self.consecutive_failures >= self.failure_threshold {
+                    warn!(
+                        "Failure threshold reached ({}/{}), transitioning to Failed",
+                        self.consecutive_failures, self.failure_threshold
+                    );
+                    self.transition_to(PrimaryState::Failed);
+
+                    // Immediately fence
+                    self.transition_to(PrimaryState::Fenced);
+                    let config = self.deployment_config.clone();
+                    tokio::spawn(async move {
+                        config.on_failover().await;
+                    });
+                }
+            }
+            PrimaryState::Initializing => {
+                // Stay in initializing, just accumulate failures
+                trace!("Heartbeat failed during initialization");
+            }
+            PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
+                // Already in a degraded state
+                trace!("Heartbeat failed in degraded state: {}", self.state.name());
+            }
+        }
+    }
+
+    fn state_name(&self) -> &'static str {
+        self.state.name()
+    }
+
+    fn consecutive_successes(&self) -> usize {
+        self.consecutive_successes
+    }
+
+    fn consecutive_failures(&self) -> usize {
+        self.consecutive_failures
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use harmony_types::id::Id;
+    use std::time::Duration;
+
+    use crate::agent::{AgentRole, FailoverCNPGConfig};
+
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
+
+        assert!(
+            primary
+                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+                .await
+                .is_none()
+        );
+    }
+
+    #[tokio::test]
+    async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);
+
+        let mut expected_state = cluster_state.clone();
+        expected_state.cluster_info.current_primary = Some(Id::empty());
+
+        assert_eq!(
+            primary
+                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+                .await,
+            None
+        );
+        assert_eq!(
+            primary
+                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+                .await,
+            Some(expected_state)
+        );
+    }
+
+    #[tokio::test]
+    async fn primary_stays_healthy_below_failure_threshold() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
+
+        // Reach healthy
+        let _ = primary
+            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+            .await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+
+        // One failure below threshold
+        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+        assert_eq!(primary.consecutive_failures(), 1);
+        assert_eq!(primary.consecutive_successes(), 0);
+    }
+
+    #[tokio::test]
+    async fn primary_transitions_to_failed_at_failure_threshold() {
+        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);
+
+        // Reach healthy
+        let _ = primary
+            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
+            .await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+
+        // First failure, still healthy
+        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
+        assert_eq!(primary.state, PrimaryState::Healthy);
+        assert_eq!(primary.consecutive_failures(), 1);
+
+        // Second failure reaches threshold: transitions to Failed, then immediately to Fenced
+        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
+        assert_eq!(primary.state, PrimaryState::Fenced);
+        assert_eq!(primary.consecutive_failures(), 2);
+        assert_eq!(primary.consecutive_successes(), 0);
+    }
+
+    fn default_test_state(
+        success_threshold: usize,
+        failure_threshold: usize,
+    ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
+        let cluster_state = crate::agent::ClusterStateData {
+            cluster_info: crate::agent::heartbeat::ClusterState {
+                cluster_id: Id::empty(),
+                current_primary: None,
+                desired_primary: Id::empty(),
+            },
+            metadata: None,
+        };
+
+        let agent_config = AgentConfig {
+            success_threshold,
+            failure_threshold,
+            heartbeat_interval: Duration::from_nanos(0),
+            failover_timeout: Duration::from_nanos(0),
+            deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
+                cnpg_cluster_name: "test".to_string(),
+            }),
+            nats_url: String::new(),
+            nats_creds_path: None,
+            agent_id: Id::empty(),
+            cluster_id: Id::empty(),
+            desired_primary_id: Id::empty(),
+            role: AgentRole::Primary,
+        };
+
+        let primary = PrimaryWorkflow::new(
+            agent_config.success_threshold,
+            agent_config.failure_threshold,
+            agent_config.deployment_config_unstable.clone(),
+        );
+
+        (primary, cluster_state, agent_config)
+    }
+}
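Given `default_test_state` above, the failure path from a clean start can be pinned down with one more test. A sketch in the same shape as the existing tests (not part of the patch):

```rust
#[tokio::test]
async fn primary_fences_after_reaching_failure_threshold_from_healthy() {
    let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

    // Two successes: Initializing -> Healthy (a cluster-state update is returned)
    let _ = primary
        .handle_heartbeat_success(Some(&cluster_state), &agent_config)
        .await;
    let _ = primary
        .handle_heartbeat_success(Some(&cluster_state), &agent_config)
        .await;
    assert_eq!(primary.state_name(), "Primary:Healthy");

    // Two failures: Healthy -> Failed -> Fenced (on_failover is spawned)
    primary.handle_heartbeat_failure(Some(&cluster_state)).await;
    primary.handle_heartbeat_failure(Some(&cluster_state)).await;
    assert_eq!(primary.state_name(), "Primary:Fenced");
}
```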
diff --git a/harmony_agent/src/workflow/replica.rs b/harmony_agent/src/workflow/replica.rs
new file mode 100644
index 00000000..5c86bde7
--- /dev/null
+++ b/harmony_agent/src/workflow/replica.rs
@@ -0,0 +1,279 @@
+use async_trait::async_trait;
+use harmony_types::id::Id;
+use log::{debug, error, info, trace, warn};
+use std::time::Duration;
+use tokio::sync::RwLock;
+
+use crate::agent::{AgentConfig, AgentHeartbeat};
+use crate::workflow::HeartbeatWorkflow;
+
+#[derive(Debug, Clone)]
+pub struct HeartbeatState {
+    pub agent_id: Id,
+    pub last_seq: Option<u64>,
+}
+
+impl HeartbeatState {
+    pub fn watch(agent_id: Id) -> Self {
+        Self {
+            agent_id,
+            last_seq: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct ClusterState {
+    pub cluster_id: Id,
+    pub current_primary: Option<Id>,
+}
+
+impl ClusterState {
+    pub fn watch(cluster_id: Id) -> Self {
+        Self {
+            cluster_id,
+            current_primary: None,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub enum ReplicaState {
+    Initializing,
+    Watching,
+    Promoting,
+    PromotionFailed,
+    Leader,
+    Demoting,
+    Failed,
+}
+
+impl ReplicaState {
+    pub fn name(&self) -> &'static str {
+        match self {
+            ReplicaState::Initializing => "Replica:Initializing",
+            ReplicaState::Watching => "Replica:Watching",
+            ReplicaState::Promoting => "Replica:Promoting",
+            ReplicaState::PromotionFailed => "Replica:PromotionFailed",
+            ReplicaState::Leader => "Replica:Leader",
+            ReplicaState::Demoting => "Replica:Demoting",
+            ReplicaState::Failed => "Replica:Failed",
+        }
+    }
+}
+
+pub struct ReplicaWorkflow {
+    state: ReplicaState,
+    heartbeat_state: HeartbeatState,
+    primary_state: HeartbeatState,
+    cluster_state: ClusterState,
+    consecutive_successes: usize,
+    consecutive_failures: usize,
+    success_threshold: usize,
+    failure_threshold: usize,
+    failover_timeout: Duration,
+    /// Our own last heartbeat (for timestamp comparison against primary)
+    last_my_heartbeat: Option<AgentHeartbeat>,
+    /// Last observed primary heartbeat (metadata only, for staleness detection)
+    last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
+}
+
+impl ReplicaWorkflow {
+    pub fn new(
+        success_threshold: usize,
+        failure_threshold: usize,
+        cluster_id: Id,
+        primary_id: Id,
+        my_id: Id,
+        failover_timeout: Duration,
+    ) -> Self {
+        Self {
+            state: ReplicaState::Initializing,
+            consecutive_successes: 0,
+            consecutive_failures: 0,
+            success_threshold,
+            failure_threshold,
+            failover_timeout,
+            cluster_state: ClusterState::watch(cluster_id),
+            primary_state: HeartbeatState::watch(primary_id),
+            heartbeat_state: HeartbeatState::watch(my_id),
+            last_my_heartbeat: None,
+            last_primary_heartbeat: None,
+        }
+    }
+
+    fn transition_to(&mut self, new_state: ReplicaState) {
+        if self.state != new_state {
+            info!(
+                "State transition: {} -> {}",
+                self.state.name(),
+                new_state.name()
+            );
+            self.state = new_state;
+        }
+    }
+
+    /// Check if the primary heartbeat is stale compared to our own
+    /// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
+    async fn is_primary_stale(&mut self) -> bool {
+        if let Some(my_hb) = &self.last_my_heartbeat {
+            if let Some(my_metadata) = &my_hb.metadata {
+                if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
+                    let primary_hb = primary_hb_ref.read().await;
+                    if let Some(primary_metadata) = &primary_hb.metadata {
+                        // Calculate time difference: replica_timestamp - primary_timestamp
+                        let time_diff_ms = my_metadata
+                            .timestamp
+                            .saturating_sub(primary_metadata.timestamp);
+                        let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
+
+                        trace!(
+                            "Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
+                            my_metadata.timestamp,
+                            primary_metadata.timestamp,
+                            time_diff_ms,
+                            failover_timeout_ms
+                        );
+
+                        if time_diff_ms > failover_timeout_ms {
+                            info!(
+                                "Primary heartbeat stale ({}ms > {}ms), attempting promotion",
+                                time_diff_ms, failover_timeout_ms
+                            );
+
+                            return true;
+                        }
+                    }
+                }
+            }
+        }
+        false
+    }
+}
+
+#[async_trait]
+impl HeartbeatWorkflow for ReplicaWorkflow {
+    async fn on_startup(
+        &self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) {
+        // todo!("not sure if the replica should do anything on startup")
+    }
+
+    async fn handle_heartbeat_success(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+        agent_config: &AgentConfig,
+    ) -> Option<crate::agent::ClusterStateData> {
+        trace!(
+            "Handling heartbeat success, current counters success {} failures {}",
+            self.consecutive_successes, self.consecutive_failures
+        );
+        self.consecutive_successes += 1;
+        self.consecutive_failures = 0;
+
+        match self.state {
+            ReplicaState::Initializing => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(ReplicaState::Watching);
+                }
+                None
+            }
+            ReplicaState::Watching => {
+                // TODO: Check primary staleness from NATS
+                trace!("Replica watching primary");
+                if self.is_primary_stale().await {
+                    panic!("Found stale primary, launching promotion");
+                }
+                debug!(
+                    "perform the replica watch actions :
+                     - if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
+                     - check the last primary heartbeat kv timestamp
+                     - compare it with our latest kv heartbeat
+                     - if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
+                     - launching promotion will change the status of the replica
+                    "
+                );
+
+                None
+            }
+            ReplicaState::Promoting => {
+                // TODO: Complete promotion attempt
+                trace!("Replica promotion in progress");
+                todo!(
+                    "When promoting, a heartbeat failure does not affect promotion unless failure_threshold is reached; a heartbeat success does nothing either"
+                );
+            }
+            ReplicaState::PromotionFailed => {
+                if self.consecutive_successes >= self.success_threshold {
+                    self.transition_to(ReplicaState::Watching);
+                }
+                todo!()
+            }
+            ReplicaState::Leader => {
+                // TODO: Check for original primary recovery
+                trace!("Replica acting as leader");
+                todo!()
+            }
+            ReplicaState::Failed => {
+                if self.consecutive_successes >= self.success_threshold {
+                    info!("Replica recovered from Failed state, transitioning to Watching");
+                    self.transition_to(ReplicaState::Watching);
+                }
+                todo!()
+            }
+            ReplicaState::Demoting => {
+                // TODO: Complete demotion back to watching
+                trace!("Replica demotion in progress");
+                todo!()
+            }
+        }
+    }
+
+    async fn handle_heartbeat_failure(
+        &mut self,
+        cluster_state: Option<&crate::agent::ClusterStateData>,
+    ) {
+        self.consecutive_failures += 1;
+        self.consecutive_successes = 0;
+
+        // TODO: revisit this. I think we should handle the agent healthiness (checking
+        // consecutive_failures against failure_threshold) separately from handling the
+        // cluster state.
+        //
+        // That said, there might be funny stuff we have to do when the agent reaches the
+        // failure threshold, especially in the promoting and demoting statuses.
+
+        match self.state {
+            ReplicaState::Watching | ReplicaState::Initializing => {
+                if self.consecutive_failures >= self.failure_threshold {
+                    info!(
+                        "Replica exceeded failure threshold ({}/{}), transitioning to Failed",
+                        self.consecutive_failures, self.failure_threshold
+                    );
+                    self.transition_to(ReplicaState::Failed);
+                } else {
+                    trace!("Replica heartbeat failed, but below threshold");
+                }
+            }
+            ReplicaState::Promoting
+            | ReplicaState::PromotionFailed
+            | ReplicaState::Leader
+            | ReplicaState::Demoting
+            | ReplicaState::Failed => {
+                trace!("Replica heartbeat failed in state: {}", self.state.name());
+            }
+        }
+    }
+
+    fn state_name(&self) -> &'static str {
+        self.state.name()
+    }
+
+    fn consecutive_successes(&self) -> usize {
+        self.consecutive_successes
+    }
+
+    fn consecutive_failures(&self) -> usize {
+        self.consecutive_failures
+    }
+}
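To make the ADR-017-3 staleness rule concrete, here is the arithmetic with an assumed 10 s failover timeout. Because `is_primary_stale` uses `saturating_sub`, a primary whose heartbeat timestamp is newer than the replica's yields a diff of 0 and is never considered stale:

```rust
// Replica's own heartbeat was written 15s after the primary's last one.
let my_ts: u64 = 1_700_000_025_000; // ms since UNIX epoch (illustrative values)
let primary_ts: u64 = 1_700_000_010_000;
let failover_timeout_ms: u64 = 10_000;

let diff_ms = my_ts.saturating_sub(primary_ts); // 15_000
assert!(diff_ms > failover_timeout_ms); // primary is stale: start promotion

// Clock skew the other way is harmless: saturating_sub floors at 0.
assert_eq!(primary_ts.saturating_sub(my_ts), 0);
```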
diff --git a/harmony_execution/Cargo.toml b/harmony_execution/Cargo.toml
new file mode 100644
index 00000000..7433c5e5
--- /dev/null
+++ b/harmony_execution/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "harmony_execution"
+edition = "2024"
+version.workspace = true
+readme.workspace = true
+license.workspace = true
+
+[dependencies]
+thiserror.workspace = true
+lazy_static.workspace = true
+directories.workspace = true
+log.workspace = true
diff --git a/harmony_execution/src/command.rs b/harmony_execution/src/command.rs
new file mode 100644
index 00000000..0ac1626c
--- /dev/null
+++ b/harmony_execution/src/command.rs
@@ -0,0 +1,470 @@
+use std::io::{BufRead, BufReader};
+use std::process::{Child, Command, Stdio};
+use std::sync::Arc;
+use std::thread;
+
+/// Captured output from a command execution
+#[derive(Debug, Clone)]
+pub struct CommandOutput {
+    /// Captured stdout content
+    pub stdout: String,
+    /// Captured stderr content
+    pub stderr: String,
+    /// Exit status of the command
+    pub status: CommandStatus,
+}
+
+impl CommandOutput {
+    /// Returns true if the command succeeded
+    pub fn is_success(&self) -> bool {
+        self.status.is_success()
+    }
+
+    /// Formats the complete output for display
+    pub fn format_output(&self) -> String {
+        format!(
+            "Stdout:\n{}\n\nStderr:\n{}",
+            if self.stdout.is_empty() {
+                "<empty>"
+            } else {
+                &self.stdout
+            },
+            if self.stderr.is_empty() {
+                "<empty>"
+            } else {
+                &self.stderr
+            }
+        )
+    }
+}
+
+/// Result status of a command execution
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum CommandStatus {
+    /// Command executed successfully (exit code 0)
+    Success,
+    /// Command failed with an exit code
+    Failed(i32),
+    /// Command was terminated by a signal
+    Terminated(i32),
+    /// Command execution could not be started
+    Error(String),
+}
+
+impl CommandStatus {
+    pub fn is_success(&self) -> bool {
+        matches!(self, CommandStatus::Success)
+    }
+}
+
+impl From<std::process::ExitStatus> for CommandStatus {
+    fn from(status: std::process::ExitStatus) -> Self {
+        if status.success() {
+            CommandStatus::Success
+        } else if let Some(code) = status.code() {
+            CommandStatus::Failed(code)
+        } else {
+            CommandStatus::Terminated(0) // Signal codes are platform-specific
+        }
+    }
+}
+
+type Callback = Arc<dyn Fn(&str) + Send + Sync>;
+
+/// Options for configuring command execution
+#[derive(Clone)]
+pub struct RunnerOptions {
+    /// Whether to print stdout to console in real-time
+    pub print_stdout: bool,
+    /// Whether to print stderr to console in real-time
+    pub print_stderr: bool,
+    /// Optional callback for each stdout line
+    pub stdout_callback: Callback,
+    /// Optional callback for each stderr line
+    pub stderr_callback: Callback,
+}
+
+impl RunnerOptions {
+    fn empty_callback() -> Callback {
+        Arc::new(|_| {})
+    }
+
+    /// Create default options with real-time printing enabled
+    pub fn print_to_console() -> Self {
+        Self {
+            print_stdout: true,
+            print_stderr: true,
+            ..Default::default()
+        }
+    }
+
+    /// Create options that capture output silently
+    pub fn silent() -> Self {
+        Self {
+            print_stdout: false,
+            print_stderr: false,
+            ..Default::default()
+        }
+    }
+
+    /// Set custom callbacks for stdout and stderr lines
+    pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
+    where
+        F1: Fn(&str) + Send + Sync + 'static,
+        F2: Fn(&str) + Send + Sync + 'static,
+    {
+        self.stdout_callback = Arc::new(stdout_callback);
+        self.stderr_callback = Arc::new(stderr_callback);
+        self
+    }
+}
+
+impl Default for RunnerOptions {
+    fn default() -> Self {
+        Self {
+            print_stdout: true,
+            print_stderr: true,
+            stdout_callback: Self::empty_callback(),
+            stderr_callback: Self::empty_callback(),
+        }
+    }
+}
+
+/// Error type for command execution failures
+#[derive(Debug)]
+pub struct CommandError {
+    /// Human-readable error description
+    pub message: String,
+    /// Captured output if execution started
+    pub output: Option<CommandOutput>,
+}
+
+impl std::fmt::Display for CommandError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.message)?;
+        if let Some(output) = &self.output {
+            write!(f, "\n{}", output.format_output())?;
+        }
+        Ok(())
+    }
+}
+
+impl std::error::Error for CommandError {}
+
+/// Runs a command and captures its output while streaming to console
+///
+/// # Example
+///
+/// ```
+/// use harmony_execution::command::{run_command, RunnerOptions};
+/// use std::process::Command;
+///
+/// let output = run_command(
+///     Command::new("echo").arg("hello"),
+///     RunnerOptions::print_to_console()
+/// ).unwrap();
+/// assert!(output.is_success());
+/// assert_eq!(output.stdout, "hello\n");
+/// ```
+pub fn run_command(
+    command: &mut Command,
+    options: RunnerOptions,
+) -> Result<CommandOutput, CommandError> {
+    let mut child = command
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .spawn()
+        .map_err(|e| CommandError {
+            message: format!("Failed to spawn command: {}", e),
+            output: None,
+        })?;
+
+    let stdout = child.stdout.take().ok_or_else(|| CommandError {
+        message: "Failed to capture stdout".to_string(),
+        output: None,
+    })?;
+
+    let stderr = child.stderr.take().ok_or_else(|| CommandError {
+        message: "Failed to capture stderr".to_string(),
+        output: None,
+    })?;
+
+    let stdout_reader = BufReader::new(stdout);
+    let stderr_reader = BufReader::new(stderr);
+
+    let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
+    let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
+
+    // `options` is moved into the stdout thread below, so clone it first for the
+    // stderr thread (the callbacks are behind `Arc`, making the clone cheap).
+    let stderr_options = options.clone();
+
+    // Spawn thread to handle stdout
+    let stdout_handle = thread::spawn(move || {
+        let mut output = String::new();
+        for line in stdout_reader.lines() {
+            match line {
+                Ok(line_content) => {
+                    if options.print_stdout {
+                        println!("{}", line_content);
+                    }
+                    (options.stdout_callback)(&line_content);
+                    output.push_str(&line_content);
+                    output.push('\n');
+                }
+                Err(e) => {
+                    // Silently handle read errors - corrupted data at end is common
+                    log::trace!("Error reading stdout line: {}", e);
+                }
+            }
+        }
+        let _ = stdout_sender.send(output);
+    });
+
+    // Spawn thread to handle stderr
+    let stderr_handle = thread::spawn(move || {
+        let mut output = String::new();
+        for line in stderr_reader.lines() {
+            match line {
+                Ok(line_content) => {
+                    if stderr_options.print_stderr {
+                        eprintln!("{}", line_content);
+                    }
+                    (stderr_options.stderr_callback)(&line_content);
+                    output.push_str(&line_content);
+                    output.push('\n');
+                }
+                Err(e) => {
+                    log::trace!("Error reading stderr line: {}", e);
+                }
+            }
+        }
+        let _ = stderr_sender.send(output);
+    });
+
+    let status = child.wait().map_err(|e| CommandError {
+        message: format!("Failed to wait for command process: {}", e),
+        output: None,
+    })?;
+
+    let stdout_lines = stdout_handle
+        .join()
+        .map_err(|e| CommandError {
+            message: format!("Stdout thread panicked: {:?}", e),
+            output: None,
+        })
+        .and_then(|_| {
+            stdout_receiver.recv().map_err(|e| CommandError {
+                message: format!("Failed to receive stdout: {}", e),
+                output: None,
+            })
+        })?;
+
+    let stderr_lines = stderr_handle
+        .join()
+        .map_err(|e| CommandError {
+            message: format!("Stderr thread panicked: {:?}", e),
+            output: None,
+        })
+        .and_then(|_| {
+            stderr_receiver.recv().map_err(|e| CommandError {
+                message: format!("Failed to receive stderr: {}", e),
+                output: None,
+            })
+        })?;
+
+    Ok(CommandOutput {
+        stdout: stdout_lines,
+        stderr: stderr_lines,
+        status: status.into(),
+    })
+}
+
+/// Convenience function to run a command with default options (print to console)
+pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
+    run_command(command, RunnerOptions::print_to_console())
+}
+
+/// Convenience function to run a command silently (capture output only)
+pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
+    run_command(command, RunnerOptions::silent())
+}
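As a usage sketch beyond the unit tests below, the callback hooks compose with the `log` crate for long-running commands. The `kubectl` invocation and manifest path here are purely illustrative:

```rust
use harmony_execution::command::{run_command, RunnerOptions};
use std::process::Command;

// Stream a long-running command's output into the `log` crate while
// still capturing it for later inspection.
fn apply_manifest() -> Result<(), Box<dyn std::error::Error>> {
    let opts = RunnerOptions::silent().with_callbacks(
        |line| log::info!("kubectl: {line}"),
        |line| log::warn!("kubectl: {line}"),
    );

    let output = run_command(
        Command::new("kubectl").args(["apply", "-f", "manifest.yaml"]),
        opts,
    )?;

    if !output.is_success() {
        return Err(format!("kubectl failed: {}", output.format_output()).into());
    }
    Ok(())
}
```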
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::process::Command;
+
+    #[test]
+    fn test_simple_echo_command() {
+        let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
+        assert!(output.is_success());
+        assert_eq!(output.stdout.trim(), "hello world");
+        assert!(output.stderr.is_empty());
+    }
+
+    #[test]
+    fn test_command_failure() {
+        let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
+        assert!(!output.is_success());
+        assert_eq!(output.status, CommandStatus::Failed(42));
+    }
+
+    #[test]
+    fn test_command_output_format() {
+        let output = run_silent(Command::new("echo").arg("test")).unwrap();
+        let formatted = output.format_output();
+        assert!(formatted.contains("Stdout:"));
+        assert!(formatted.contains("test"));
+    }
+
+    #[test]
+    fn test_runner_options() {
+        let opts = RunnerOptions::print_to_console();
+        assert!(opts.print_stdout);
+        assert!(opts.print_stderr);
+
+        let opts = RunnerOptions::silent();
+        assert!(!opts.print_stdout);
+        assert!(!opts.print_stderr);
+    }
+
+    #[test]
+    fn test_command_status_from_exit_status() {
+        let output = run_silent(&mut Command::new("true")).unwrap();
+        assert_eq!(output.status, CommandStatus::Success);
+
+        let output = run_silent(&mut Command::new("false")).unwrap();
+        assert_eq!(output.status, CommandStatus::Failed(1));
+    }
+
+    #[test]
+    fn test_stdout_callback_receives_lines() {
+        use std::sync::{Arc, Mutex};
+
+        let captured = Arc::new(Mutex::new(Vec::new()));
+        let captured_clone = Arc::clone(&captured);
+
+        let opts = RunnerOptions::silent().with_callbacks(
+            move |line| captured_clone.lock().unwrap().push(line.to_string()),
+            |_| {},
+        );
+
+        run_command(Command::new("echo").arg("hello world"), opts).unwrap();
+
+        let lines = captured.lock().unwrap();
+        assert_eq!(lines.len(), 1);
+        assert_eq!(lines[0], "hello world");
+    }
+
+    #[test]
+    fn test_stderr_callback_receives_lines() {
+        use std::sync::{Arc, Mutex};
+
+        let captured = Arc::new(Mutex::new(Vec::new()));
+        let captured_clone = Arc::clone(&captured);
+
+        let opts = RunnerOptions::silent().with_callbacks(
+            |_| {},
+            move |line| captured_clone.lock().unwrap().push(line.to_string()),
+        );
+
+        run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();
+
+        let lines = captured.lock().unwrap();
+        assert_eq!(lines.len(), 1);
+        assert_eq!(lines[0], "error");
+    }
+
+    #[test]
+    fn test_callback_and_capture_both_work() {
+        use std::sync::{Arc, Mutex};
+
+        let callback_lines = Arc::new(Mutex::new(Vec::new()));
+        let callback_clone = Arc::clone(&callback_lines);
+
+        let opts = RunnerOptions::silent().with_callbacks(
+            move |line| callback_clone.lock().unwrap().push(line.to_string()),
+            |_| {},
+        );
+
+        let output =
+            run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();
+
+        // Verify captured output
+        assert_eq!(output.stdout, "line1\nline2\nline3\n");
+
+        // Verify callback received all lines
+        let lines = callback_lines.lock().unwrap();
+        assert_eq!(lines.len(), 3);
+        assert_eq!(lines[0], "line1");
+        assert_eq!(lines[1], "line2");
+        assert_eq!(lines[2], "line3");
+    }
+
+    #[test]
+    fn test_multiline_output_capture() {
+        let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();
+
+        assert_eq!(output.stdout, "line1\nline2\nline3\n");
+        assert!(output.stderr.trim().is_empty());
+    }
+
+    #[test]
+    fn test_mixed_stdout_stderr_capture() {
+        let output = run_silent(Command::new("sh").args([
+            "-c",
+            "echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
+        ]))
+        .unwrap();
+
+        assert!(output.stdout.contains("stdout1"));
+        assert!(output.stdout.contains("stdout2"));
+        assert!(output.stderr.contains("stderr1"));
+        assert!(output.stderr.contains("stderr2"));
+    }
+
+    #[test]
+    fn test_empty_output_command() {
+        let output = run_silent(&mut Command::new("true")).unwrap();
+
+        assert!(output.stdout.is_empty());
+        assert!(output.stderr.is_empty());
+        assert!(output.is_success());
+    }
+
+    #[test]
+    fn test_command_output_format_with_empty_streams() {
+        let output = run_silent(&mut Command::new("true")).unwrap();
+        let formatted = output.format_output();
+
+        assert!(formatted.contains("Stdout:"));
+        assert!(formatted.contains("<empty>"));
+        assert!(formatted.contains("Stderr:"));
+    }
+
+    #[test]
+    fn test_error_contains_message_and_output() {
+        let error = CommandError {
+            message: "Test error".to_string(),
+            output: Some(CommandOutput {
+                stdout: "captured stdout".to_string(),
+                stderr: "captured stderr".to_string(),
+                status: CommandStatus::Success,
+            }),
+        };
+
+        let display = format!("{}", error);
+        assert!(display.contains("Test error"));
+        assert!(display.contains("captured stdout"));
+        assert!(display.contains("captured stderr"));
+    }
+
+    #[test]
+    fn test_error_without_output() {
+        let error = CommandError {
+            message: "Spawn failed".to_string(),
+            output: None,
+        };
+
+        let display = format!("{}", error);
+        assert!(display.contains("Spawn failed"));
+        assert!(!display.contains("Stdout:"));
+        assert!(!display.contains("Stderr:"));
+    }
+}
diff --git a/harmony_execution/src/lib.rs b/harmony_execution/src/lib.rs
new file mode 100644
index 00000000..c96cddfe
--- /dev/null
+++ b/harmony_execution/src/lib.rs
@@ -0,0 +1,5 @@
+pub mod command;
+
+pub use command::{
+    CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
+};
diff --git a/harmony_types/src/id.rs b/harmony_types/src/id.rs
index 0a829068..748c1050 100644
--- a/harmony_types/src/id.rs
+++ b/harmony_types/src/id.rs
@@ -32,6 +32,14 @@ impl Id {
     }
 }
 
+impl Into<Id> for &str {
+    fn into(self) -> Id {
+        Id {
+            value: self.to_string(),
+        }
+    }
+}
+
 impl FromStr for Id {
     type Err = ();
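One note on the `Into` impl above: Clippy's `from_over_into` lint prefers implementing `From`, which provides `Into` for free and is also callable directly. An equivalent formulation (a sketch, same behaviour):

```rust
impl From<&str> for Id {
    fn from(value: &str) -> Self {
        Id {
            value: value.to_string(),
        }
    }
}
```

Call sites such as `let id: Id = "node-a".into();` (identifier illustrative) keep working, and `Id::from("node-a")` becomes available as well.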