feat/harmony_agent #220
@@ -1,2 +1,6 @@
|
|||||||
target/
|
target/
|
||||||
Dockerfile
|
Dockerfile
|
||||||
|
.git
|
||||||
|
data
|
||||||
|
target
|
||||||
|
demos
|
||||||
|
|||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -24,3 +24,5 @@ Cargo.lock
|
|||||||
|
|
||||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||||
*.pdb
|
*.pdb
|
||||||
|
|
||||||
|
.harmony_generated
|
||||||
|
|||||||
218
Cargo.lock
generated
218
Cargo.lock
generated
@@ -243,7 +243,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"const-random",
|
"const-random",
|
||||||
"getrandom 0.3.3",
|
"getrandom 0.3.4",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"version_check",
|
"version_check",
|
||||||
"zerocopy",
|
"zerocopy",
|
||||||
@@ -450,6 +450,43 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "async-nats"
|
||||||
|
version = "0.45.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"bytes",
|
||||||
|
"futures-util",
|
||||||
|
"memchr",
|
||||||
|
"nkeys",
|
||||||
|
"nuid",
|
||||||
|
"once_cell",
|
||||||
|
"pin-project",
|
||||||
|
"portable-atomic",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"regex",
|
||||||
|
"ring",
|
||||||
|
"rustls-native-certs 0.7.3",
|
||||||
|
"rustls-pemfile 2.2.0",
|
||||||
|
"rustls-webpki 0.102.8",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"serde_nanos",
|
||||||
|
"serde_repr",
|
||||||
|
"thiserror 1.0.69",
|
||||||
|
"time",
|
||||||
|
"tokio",
|
||||||
|
"tokio-rustls 0.26.2",
|
||||||
|
"tokio-stream",
|
||||||
|
"tokio-util",
|
||||||
|
"tokio-websockets",
|
||||||
|
"tracing",
|
||||||
|
"tryhard",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-stream"
|
name = "async-stream"
|
||||||
version = "0.3.6"
|
version = "0.3.6"
|
||||||
@@ -775,6 +812,9 @@ name = "bytes"
|
|||||||
version = "1.10.1"
|
version = "1.10.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bytestring"
|
name = "bytestring"
|
||||||
@@ -1583,6 +1623,7 @@ dependencies = [
|
|||||||
"rand_core 0.6.4",
|
"rand_core 0.6.4",
|
||||||
"serde",
|
"serde",
|
||||||
"sha2",
|
"sha2",
|
||||||
|
"signature",
|
||||||
"subtle",
|
"subtle",
|
||||||
"zeroize",
|
"zeroize",
|
||||||
]
|
]
|
||||||
@@ -2456,21 +2497,21 @@ dependencies = [
|
|||||||
"cfg-if",
|
"cfg-if",
|
||||||
"js-sys",
|
"js-sys",
|
||||||
"libc",
|
"libc",
|
||||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
"wasi",
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.3.3"
|
version = "0.3.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if",
|
"cfg-if",
|
||||||
"js-sys",
|
"js-sys",
|
||||||
"libc",
|
"libc",
|
||||||
"r-efi",
|
"r-efi",
|
||||||
"wasi 0.14.3+wasi-0.2.4",
|
"wasip2",
|
||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -2572,6 +2613,7 @@ dependencies = [
|
|||||||
"env_logger",
|
"env_logger",
|
||||||
"fqdn",
|
"fqdn",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
|
"harmony_execution",
|
||||||
"harmony_inventory_agent",
|
"harmony_inventory_agent",
|
||||||
"harmony_macros",
|
"harmony_macros",
|
||||||
"harmony_secret",
|
"harmony_secret",
|
||||||
@@ -2619,6 +2661,43 @@ dependencies = [
|
|||||||
"walkdir",
|
"walkdir",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "harmony_agent"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"async-nats",
|
||||||
|
"async-trait",
|
||||||
|
"cidr",
|
||||||
|
"env_logger",
|
||||||
|
"getrandom 0.3.4",
|
||||||
|
"harmony",
|
||||||
|
"harmony_macros",
|
||||||
|
"harmony_types",
|
||||||
|
"log",
|
||||||
|
"pretty_assertions",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"thiserror 2.0.16",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "harmony_agent_deploy"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"cidr",
|
||||||
|
"env_logger",
|
||||||
|
"harmony",
|
||||||
|
"harmony_cli",
|
||||||
|
"harmony_macros",
|
||||||
|
"harmony_types",
|
||||||
|
"log",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"tokio",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "harmony_cli"
|
name = "harmony_cli"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
@@ -2659,6 +2738,16 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "harmony_execution"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"directories",
|
||||||
|
"lazy_static",
|
||||||
|
"log",
|
||||||
|
"thiserror 2.0.16",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "harmony_inventory_agent"
|
name = "harmony_inventory_agent"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
@@ -3523,7 +3612,7 @@ version = "0.1.34"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.3.3",
|
"getrandom 0.3.4",
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -3963,7 +4052,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"log",
|
"log",
|
||||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
"wasi",
|
||||||
"windows-sys 0.48.0",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -3975,7 +4064,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
"log",
|
"log",
|
||||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
"wasi",
|
||||||
"windows-sys 0.59.0",
|
"windows-sys 0.59.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -4022,6 +4111,21 @@ dependencies = [
|
|||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nkeys"
|
||||||
|
version = "0.4.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "879011babc47a1c7fdf5a935ae3cfe94f34645ca0cac1c7f6424b36fc743d1bf"
|
||||||
|
dependencies = [
|
||||||
|
"data-encoding",
|
||||||
|
"ed25519",
|
||||||
|
"ed25519-dalek",
|
||||||
|
"getrandom 0.2.16",
|
||||||
|
"log",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"signatory",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "non-blank-string-rs"
|
name = "non-blank-string-rs"
|
||||||
version = "1.0.4"
|
version = "1.0.4"
|
||||||
@@ -4040,6 +4144,15 @@ dependencies = [
|
|||||||
"winapi 0.3.9",
|
"winapi 0.3.9",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nuid"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fc895af95856f929163a0aa20c26a78d26bfdc839f51b9d5aa7a5b79e52b7e83"
|
||||||
|
dependencies = [
|
||||||
|
"rand 0.8.5",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-bigint"
|
name = "num-bigint"
|
||||||
version = "0.4.6"
|
version = "0.4.6"
|
||||||
@@ -4660,7 +4773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
|
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytes",
|
"bytes",
|
||||||
"getrandom 0.3.3",
|
"getrandom 0.3.4",
|
||||||
"lru-slab",
|
"lru-slab",
|
||||||
"rand 0.9.2",
|
"rand 0.9.2",
|
||||||
"ring",
|
"ring",
|
||||||
@@ -4765,7 +4878,7 @@ version = "0.9.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.3.3",
|
"getrandom 0.3.4",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5301,6 +5414,16 @@ dependencies = [
|
|||||||
"untrusted",
|
"untrusted",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustls-webpki"
|
||||||
|
version = "0.102.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
|
||||||
|
dependencies = [
|
||||||
|
"rustls-pki-types",
|
||||||
|
"untrusted",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustls-webpki"
|
name = "rustls-webpki"
|
||||||
version = "0.103.4"
|
version = "0.103.4"
|
||||||
@@ -5564,6 +5687,15 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_nanos"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a93142f0367a4cc53ae0fead1bcda39e85beccfad3dcd717656cacab94b12985"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_path_to_error"
|
name = "serde_path_to_error"
|
||||||
version = "0.1.17"
|
version = "0.1.17"
|
||||||
@@ -5731,6 +5863,18 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "signatory"
|
||||||
|
version = "0.27.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c1e303f8205714074f6068773f0e29527e0453937fe837c9717d066635b65f31"
|
||||||
|
dependencies = [
|
||||||
|
"pkcs8",
|
||||||
|
"rand_core 0.6.4",
|
||||||
|
"signature",
|
||||||
|
"zeroize",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "signature"
|
name = "signature"
|
||||||
version = "2.2.0"
|
version = "2.2.0"
|
||||||
@@ -6314,7 +6458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
|
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fastrand",
|
"fastrand",
|
||||||
"getrandom 0.3.3",
|
"getrandom 0.3.4",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"rustix 1.0.8",
|
"rustix 1.0.8",
|
||||||
"windows-sys 0.60.2",
|
"windows-sys 0.60.2",
|
||||||
@@ -6538,6 +6682,27 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokio-websockets"
|
||||||
|
version = "0.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.22.1",
|
||||||
|
"bytes",
|
||||||
|
"futures-core",
|
||||||
|
"futures-sink",
|
||||||
|
"http 1.3.1",
|
||||||
|
"httparse",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"ring",
|
||||||
|
"rustls-pki-types",
|
||||||
|
"tokio",
|
||||||
|
"tokio-rustls 0.26.2",
|
||||||
|
"tokio-util",
|
||||||
|
"webpki-roots 0.26.11",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "toml"
|
name = "toml"
|
||||||
version = "0.8.23"
|
version = "0.8.23"
|
||||||
@@ -6689,6 +6854,16 @@ version = "0.2.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tryhard"
|
||||||
|
version = "0.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9fe58ebd5edd976e0fe0f8a14d2a04b7c81ef153ea9a54eebc42e67c2c23b4e5"
|
||||||
|
dependencies = [
|
||||||
|
"pin-project-lite",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tui-logger"
|
name = "tui-logger"
|
||||||
version = "0.14.5"
|
version = "0.14.5"
|
||||||
@@ -6865,7 +7040,7 @@ version = "1.18.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
|
checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom 0.3.3",
|
"getrandom 0.3.4",
|
||||||
"js-sys",
|
"js-sys",
|
||||||
"rand 0.9.2",
|
"rand 0.9.2",
|
||||||
"uuid-macro-internal",
|
"uuid-macro-internal",
|
||||||
@@ -6936,10 +7111,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasi"
|
name = "wasip2"
|
||||||
version = "0.14.3+wasi-0.2.4"
|
version = "1.0.2+wasi-0.2.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95"
|
checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"wit-bindgen",
|
"wit-bindgen",
|
||||||
]
|
]
|
||||||
@@ -7061,6 +7236,15 @@ version = "0.25.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
|
checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "webpki-roots"
|
||||||
|
version = "0.26.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
|
||||||
|
dependencies = [
|
||||||
|
"webpki-roots 1.0.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "webpki-roots"
|
name = "webpki-roots"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
@@ -7438,9 +7622,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wit-bindgen"
|
name = "wit-bindgen"
|
||||||
version = "0.45.0"
|
version = "0.51.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814"
|
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "writeable"
|
name = "writeable"
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ members = [
|
|||||||
"harmony_types",
|
"harmony_types",
|
||||||
"harmony_macros",
|
"harmony_macros",
|
||||||
"harmony_tui",
|
"harmony_tui",
|
||||||
|
"harmony_execution",
|
||||||
"opnsense-config",
|
"opnsense-config",
|
||||||
"opnsense-config-xml",
|
"opnsense-config-xml",
|
||||||
"harmony_cli",
|
"harmony_cli",
|
||||||
@@ -17,6 +18,8 @@ members = [
|
|||||||
"harmony_secret",
|
"harmony_secret",
|
||||||
"adr/agent_discovery/mdns",
|
"adr/agent_discovery/mdns",
|
||||||
"brocade",
|
"brocade",
|
||||||
|
"harmony_agent",
|
||||||
|
"harmony_agent/deploy",
|
||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
|
# Harmony : Open-source infrastructure orchestration that treats your platform like first-class code
|
||||||
|
|
||||||
|
In other words, Harmony is a **next-generation platform engineering framework**.
|
||||||
|
|
||||||
_By [NationTech](https://nationtech.io)_
|
_By [NationTech](https://nationtech.io)_
|
||||||
|
|
||||||
[](https://git.nationtech.io/nationtech/harmony)
|
[](https://git.nationtech.io/nationtech/harmony)
|
||||||
|
|||||||
141
adr/018-Template-Hydration-For-Workload-Deployment.md
Normal file
141
adr/018-Template-Hydration-For-Workload-Deployment.md
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
# Architecture Decision Record: Template Hydration for Kubernetes Manifest Generation
|
||||||
|
|
||||||
|
Initial Author: Jean-Gabriel Gill-Couture & Sylvain Tremblay
|
||||||
|
|
||||||
|
Initial Date: 2025-01-23
|
||||||
|
|
||||||
|
Last Updated Date: 2025-01-23
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Implemented
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Harmony's philosophy is built on three guiding principles: Infrastructure as Resilient Code, Prove It Works — Before You Deploy, and One Unified Model. Our goal is to shift validation and verification as left as possible—ideally to compile time—rather than discovering errors at deploy time.
|
||||||
|
|
||||||
|
After investigating a few approaches such as compile-checked Askama templates to generate Kubernetes manifests for Helm charts, we found again that this approach suffered from several fundamental limitations:
|
||||||
|
|
||||||
|
* **Late Validation:** Typos in template syntax or field names are only discovered at deployment time, not during compilation. A mistyped `metadata.name` won't surface until Helm attempts to render the template.
|
||||||
|
* **Brittle Maintenance:** Templates are string-based with limited IDE support. Refactoring requires grep-and-replace across YAML-like template files, risking subtle breakage.
|
||||||
|
* **Hard-to-Test Logic:** Testing template output requires mocking the template engine and comparing serialized strings rather than asserting against typed data structures.
|
||||||
|
* **No Type Safety:** There is no guarantee that the generated YAML will be valid Kubernetes resources without runtime validation.
|
||||||
|
|
||||||
|
We also faced a strategic choice around Helm: use it as both *templating engine* and *packaging mechanism*, or decouple these concerns. While Helm's ecosystem integration (Harbor, ArgoCD, OCI registry support) is valuable, the Jinja-like templating is at odds with Harmony's "code-first" ethos.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
We will adopt the **Template Hydration Pattern**—constructing Kubernetes manifests programmatically using strongly-typed `kube-rs` objects, then serializing them to YAML files for packaging into Helm charts.
|
||||||
|
|
||||||
|
Specifically:
|
||||||
|
|
||||||
|
* **Write strongly typed `k8s_openapi` Structs:** All Kubernetes resources (Deployment, Service, ConfigMap, etc.) will be constructed using the typed structs generated by `k8s_openapi`.
|
||||||
|
* **Direct Serialization to YAML:** Rather than rendering templates, we use `serde_yaml::to_string()` to serialize typed objects directly into YAML manifests. This way, YAML is only used as a data-transfer format and not a templating/programming language - which it is not.
|
||||||
|
* **Helm as Packaging-Only:** Helm's role is reduced to packaging pre-rendered templates into a tarball and pushing to OCI registries. No template rendering logic resides within Helm.
|
||||||
|
* **Ecosystem Preservation:** The generated Helm charts remain fully compatible with Harbor, ArgoCD, and any Helm-compatible tool—the only difference is that the `templates/` directory contains static YAML files.
|
||||||
|
|
||||||
|
The implementation in `backend_app.rs` demonstrates this pattern:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let deployment = Deployment {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(self.name.clone()),
|
||||||
|
labels: Some([("app.kubernetes.io/name".to_string(), self.name.clone())].into()),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
spec: Some(DeploymentSpec { /* ... */ }),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let deployment_yaml = serde_yaml::to_string(&deployment)?;
|
||||||
|
fs::write(templates_dir.join("deployment.yaml"), deployment_yaml)?;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Rationale
|
||||||
|
|
||||||
|
**Aligns with "Infrastructure as Resilient Code"**
|
||||||
|
|
||||||
|
Harmony's first principle states that infrastructure should be treated like application code. By expressing Kubernetes manifests as Rust structs, we gain:
|
||||||
|
|
||||||
|
* **Refactorability:** Rename a label and the compiler catches all usages.
|
||||||
|
* **IDE Support:** Autocomplete for all Kubernetes API fields; documentation inline.
|
||||||
|
* **Code Navigation:** Jump to definition shows exactly where a value comes from.
|
||||||
|
|
||||||
|
**Achieves "Prove It Works — Before You Deploy"**
|
||||||
|
|
||||||
|
The compiler now validates that:
|
||||||
|
|
||||||
|
* All required fields are populated (Rust's `Option` type prevents missing fields).
|
||||||
|
* Field types match expectations (ports are integers, not strings).
|
||||||
|
* Enums contain valid values (e.g., `ServiceType::ClusterIP`).
|
||||||
|
|
||||||
|
This moves what was runtime validation into compile-time checks, fulfilling the "shift left" promise.
|
||||||
|
|
||||||
|
**Enables True Unit Testing**
|
||||||
|
|
||||||
|
Developers can now write unit tests that assert directly against typed objects:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
let deployment = create_deployment(&app);
|
||||||
|
assert_eq!(deployment.spec.unwrap().replicas.unwrap(), 3);
|
||||||
|
assert_eq!(deployment.metadata.name.unwrap(), "my-app");
|
||||||
|
```
|
||||||
|
|
||||||
|
No string parsing, no YAML serialization, no fragile assertions against rendered output.
|
||||||
|
|
||||||
|
**Preserves Ecosystem Benefits**
|
||||||
|
|
||||||
|
By generating standard Helm chart structures, Harmony retains compatibility with:
|
||||||
|
|
||||||
|
* **OCI Registries (Harbor, GHCR):** `helm push` works exactly as before.
|
||||||
|
* **ArgoCD:** Syncs and manages releases using the generated charts.
|
||||||
|
* **Existing Workflows:** Teams already consuming Helm charts see no change.
|
||||||
|
|
||||||
|
The Helm tarball becomes a "dumb pipe" for transport, which is arguably its ideal role.
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
* **Compile-Time Safety:** A broad class of errors (typos, missing fields, type mismatches) is now caught at build time.
|
||||||
|
* **Better Developer Experience:** IDE autocomplete, inline documentation, and refactor support significantly reduce the learning curve for Kubernetes manifests.
|
||||||
|
* **Testability:** Unit tests can validate manifest structure without integration or runtime checks.
|
||||||
|
* **Auditability:** The source-of-truth for manifests is now pure Rust—easier to review in pull requests than template logic scattered across files.
|
||||||
|
* **Future-Extensibility:** CustomResources (CRDs) can be supported via `kopium`-generated Rust types, maintaining the same strong typing.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
* **API Schema Drift:** Kubernetes API changes require regenerating `k8s_openapi` types and updating code. A change in a struct field will cause the build to fail—intentionally, but still requiring the pipeline to be updated.
|
||||||
|
* **Verbosity:** Typed construction is more verbose than the equivalent template. Builder patterns or helper functions will be needed to keep code readable.
|
||||||
|
* **Learning Curve:** Contributors must understand both the Kubernetes resource spec *and* the Rust type system, rather than just YAML.
|
||||||
|
* **Debugging Shift:** When debugging generated YAML, you now trace through Rust code rather than template files—more precise but different mental model.
|
||||||
|
|
||||||
|
## Alternatives Considered
|
||||||
|
|
||||||
|
### 1. Enhance Askama with Compile-Time Validation
|
||||||
|
*Pros:* Stay within familiar templating paradigm; minimal code changes.
|
||||||
|
*Cons:* Rust's type system cannot fully express Kubernetes schema validation without significant macro boilerplate. Errors would still surface at template evaluation time, not compilation.
|
||||||
|
|
||||||
|
### 2. Use Helm SDK Programmatically (Go)
|
||||||
|
*Pros:* Direct access to Helm's template engine; no YAML serialization step.
|
||||||
|
*Cons:* Would introduce a second language (Go) into a Rust codebase, increasing cognitive load and compilation complexity. No improvement in compile-time safety.
|
||||||
|
|
||||||
|
### 3. Raw YAML String Templating (Manual)
|
||||||
|
*Pros:* Maximum control; no external dependencies.
|
||||||
|
*Cons:* Even more error-prone than Askama; no structure validation; string concatenation errors abound.
|
||||||
|
|
||||||
|
### 4. Use Kustomize for All Manifests
|
||||||
|
*Pros:* Declarative overlays; standard tool.
|
||||||
|
*Cons:* Kustomize is itself a layer over YAML templates with its own DSL. It does not provide compile-time type safety and would require externalizing manifest management outside Harmony's codebase.
|
||||||
|
|
||||||
|
__Note that this template hydration architecture still allows to override templates with tools like kustomize when required__
|
||||||
|
|
||||||
|
## Additional Notes
|
||||||
|
|
||||||
|
**Scalability to Future Topologies**
|
||||||
|
|
||||||
|
The Template Hydration pattern enables future Harmony architectures to generate manifests dynamically based on topology context. For example, a `CostTopology` might adjust resource requests based on cluster pricing, manipulating the typed `Deployment::spec` directly before serialization.
|
||||||
|
|
||||||
|
**Implementation Status**
|
||||||
|
|
||||||
|
As of this writing, the pattern is implemented for `BackendApp` deployments (`backend_app.rs`). The next phase is to extend this pattern across all application modules (`webapp.rs`, etc.) and to standardize on this approach for any new implementations.
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
use std::net::{IpAddr, Ipv4Addr};
|
use std::net::{IpAddr, Ipv4Addr};
|
||||||
|
|
||||||
use brocade::{BrocadeOptions, ssh};
|
use brocade::{BrocadeOptions, ssh};
|
||||||
use harmony_secret::{Secret, SecretManager};
|
use harmony_secret::Secret;
|
||||||
use harmony_types::switch::PortLocation;
|
use harmony_types::switch::PortLocation;
|
||||||
use schemars::JsonSchema;
|
use schemars::JsonSchema;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|||||||
@@ -56,6 +56,8 @@ async fn main() {
|
|||||||
)),
|
)),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// TODO exec pod commands to initialize secret store if not already done
|
||||||
|
|
||||||
harmony_cli::run(
|
harmony_cli::run(
|
||||||
Inventory::autoload(),
|
Inventory::autoload(),
|
||||||
K8sAnywhereTopology::from_env(),
|
K8sAnywhereTopology::from_env(),
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ opnsense-config = { path = "../opnsense-config" }
|
|||||||
opnsense-config-xml = { path = "../opnsense-config-xml" }
|
opnsense-config-xml = { path = "../opnsense-config-xml" }
|
||||||
harmony_macros = { path = "../harmony_macros" }
|
harmony_macros = { path = "../harmony_macros" }
|
||||||
harmony_types = { path = "../harmony_types" }
|
harmony_types = { path = "../harmony_types" }
|
||||||
|
harmony_execution = { path = "../harmony_execution" }
|
||||||
uuid.workspace = true
|
uuid.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
kube = { workspace = true, features = ["derive"] }
|
kube = { workspace = true, features = ["derive"] }
|
||||||
|
|||||||
801
harmony/src/modules/application/backend_app.rs
Normal file
801
harmony/src/modules/application/backend_app.rs
Normal file
@@ -0,0 +1,801 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use log::{debug, info, trace};
|
||||||
|
use serde::Serialize;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
config::{REGISTRY_PROJECT, REGISTRY_URL},
|
||||||
|
modules::application::{
|
||||||
|
Application, HelmPackage, OCICompliant,
|
||||||
|
config::ApplicationNetworkPort,
|
||||||
|
helm::{self, DeploymentBuilder, HelmChart, HelmResourceKind},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
use harmony_execution::{RunnerOptions, run_command};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
pub struct BuildCommand {
|
||||||
|
pub program: String,
|
||||||
|
pub args: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BuildCommand {
|
||||||
|
pub fn new(program: impl Into<String>, args: Vec<impl Into<String>>) -> Self {
|
||||||
|
Self {
|
||||||
|
program: program.into(),
|
||||||
|
args: args.into_iter().map(|s| s.into()).collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn to_std_command(&self) -> std::process::Command {
|
||||||
|
let mut cmd = std::process::Command::new(&self.program);
|
||||||
|
cmd.args(&self.args);
|
||||||
|
cmd
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
pub struct BackendApp {
|
||||||
|
pub name: String,
|
||||||
|
pub project_root: std::path::PathBuf,
|
||||||
|
pub network_ports: Vec<ApplicationNetworkPort>,
|
||||||
|
pub env_vars: Vec<(String, String)>,
|
||||||
|
pub build_cmd: BuildCommand,
|
||||||
|
pub dockerfile: Option<PathBuf>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BackendApp {
|
||||||
|
fn get_dockerfile(&self) -> Result<PathBuf, String> {
|
||||||
|
debug!(
|
||||||
|
"Looking for dockerfile, currently set to {:?}",
|
||||||
|
self.dockerfile
|
||||||
|
);
|
||||||
|
if let Some(dockerfile) = &self.dockerfile {
|
||||||
|
return match dockerfile.exists() {
|
||||||
|
true => {
|
||||||
|
info!(
|
||||||
|
"Found dockerfile as intended at {}",
|
||||||
|
dockerfile.to_string_lossy()
|
||||||
|
);
|
||||||
|
Ok(dockerfile.clone())
|
||||||
|
}
|
||||||
|
false => Err(format!(
|
||||||
|
"Dockerfile explicitely set to {dockerfile} does not exist",
|
||||||
|
dockerfile = dockerfile.to_string_lossy()
|
||||||
|
)),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
let existing_dockerfile = self.project_root.join("Dockerfile");
|
||||||
|
|
||||||
|
debug!("project_root = {:?}", self.project_root);
|
||||||
|
|
||||||
|
debug!("checking = {:?}", existing_dockerfile);
|
||||||
|
if existing_dockerfile.exists() {
|
||||||
|
debug!(
|
||||||
|
"Checking path {:#?} for existing Dockerfile",
|
||||||
|
self.project_root.clone()
|
||||||
|
);
|
||||||
|
return Ok(existing_dockerfile);
|
||||||
|
}
|
||||||
|
Err(format!(
|
||||||
|
"Could not find a dockerfile in {project_root} folder. Tried {existing_dockerfile}",
|
||||||
|
project_root = self.project_root.to_string_lossy(),
|
||||||
|
existing_dockerfile = existing_dockerfile.to_string_lossy(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Application for BackendApp {
    /// The application's display name, shared with the chart and image names.
    fn name(&self) -> String {
        self.name.to_owned()
    }
}
|
||||||
|
|
||||||
|
#[async_trait]
impl OCICompliant for BackendApp {
    /// Build the application's OCI image via `docker build` and return the
    /// full image tag on success, or the captured docker output on failure.
    ///
    /// NOTE(review): despite the name, only a `docker build` is visible in
    /// this block — no `docker push` is issued here; confirm the push happens
    /// elsewhere in the pipeline.
    async fn build_push_oci_image(&self) -> Result<String, String> {
        // Resolve the Dockerfile first; propagate a descriptive error if absent.
        let dockerfile = self.get_dockerfile()?;
        // Fully-qualified tag: <registry>/<project>/<name>.
        let image_tag = self.image_name();

        // Run docker build command, streaming output to console and capturing it
        let output = run_command(
            std::process::Command::new("docker").args([
                "build",
                "-t",
                &image_tag,
                "-f",
                &dockerfile.to_string_lossy(),
                // Build context is the project root.
                &self.project_root.to_string_lossy(),
            ]),
            RunnerOptions::print_to_console(),
        )
        .map_err(|e| format!("Failed to spawn docker build process: {}", e))?;

        if output.is_success() {
            info!("Docker image build succeeded");
            Ok(image_tag)
        } else {
            // Surface the full build output so the caller can diagnose the failure.
            Err(format!(
                "Docker image build FAILED:\n{}",
                output.format_output()
            ))
        }
    }

    /// Image name without registry prefix (just the app name).
    fn local_image_name(&self) -> String {
        self.name.clone()
    }

    /// Fully-qualified image name: `<REGISTRY_URL>/<REGISTRY_PROJECT>/<name>`,
    /// taken from the crate-level registry statics.
    fn image_name(&self) -> String {
        format!(
            "{}/{}/{}",
            *REGISTRY_URL,
            *REGISTRY_PROJECT,
            &self.local_image_name()
        )
    }
}
|
||||||
|
|
||||||
|
#[async_trait]
impl HelmPackage for BackendApp {
    /// Root of the source tree; charts are generated beneath it.
    fn project_root(&self) -> PathBuf {
        self.project_root.clone()
    }

    /// Chart name mirrors the application name.
    fn chart_name(&self) -> String {
        self.name.clone()
    }

    /// Generate a Helm chart (Deployment + optional Service) referencing
    /// `image_url`, write it under `<project_root>/.harmony_generated/helm/`,
    /// and return the written chart directory as a string.
    ///
    /// NOTE(review): chart version is hard-coded to "1.0.0" — confirm whether
    /// it should track the application version instead.
    async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
        let mut helm_chart = HelmChart::new(self.name.clone(), "1.0.0".to_string());

        // Build the typed Deployment object using the builder with initial options
        helm_chart.add_resource(HelmResourceKind::Deployment(
            DeploymentBuilder::with_options(
                &self.name,
                image_url,
                Some(self.network_ports.clone()),
                Some(self.env_vars.clone()),
                None,
            )
            .build(),
        ));

        // Build the typed Service object using the helper function
        // (returns None when the app exposes no ports, so no Service is emitted).
        if let Some(service) =
            helm::create_service_from_ports(self.name.clone(), &self.network_ports)
        {
            helm_chart.add_resource(HelmResourceKind::Service(service));
        }

        // Write the Helm chart metadata to the project root
        let chart_dir = helm_chart
            .write_to(&self.project_root.join(".harmony_generated/helm/"))
            .map_err(|e| format!("Failed to write Helm chart: {}", e))?;

        info!("Helm chart for '{}' written to: {:?}", self.name, chart_dir);

        Ok(chart_dir.to_string_lossy().to_string())
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::modules::application::config::ApplicationNetworkPort;
|
||||||
|
use crate::modules::application::config::NetworkProtocol;
|
||||||
|
use k8s_openapi::api::apps::v1::Deployment;
|
||||||
|
use k8s_openapi::api::core::v1::{Container, EnvVar, Service as K8sService, ServicePort};
|
||||||
|
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||||
|
use serde_yaml::from_str;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
use tempfile::tempdir;
|
||||||
|
|
||||||
|
// Test Helpers
|
||||||
|
fn read_service_yaml(project_root: &Path, chart_name: &str) -> K8sService {
|
||||||
|
let path = project_root.join(format!(
|
||||||
|
".harmony_generated/helm/{chart_name}/templates/service.yaml"
|
||||||
|
));
|
||||||
|
let content = fs::read_to_string(&path)
|
||||||
|
.unwrap_or_else(|e| panic!("Failed to read service.yaml at {:?}: {}", path, e));
|
||||||
|
from_str(&content)
|
||||||
|
.unwrap_or_else(|e| panic!("Failed to parse service.yaml as K8s Service: {}", e))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_deployment_yaml(project_root: &Path, chart_name: &str) -> Deployment {
|
||||||
|
let path = project_root.join(format!(
|
||||||
|
".harmony_generated/helm/{chart_name}/templates/deployment.yaml"
|
||||||
|
));
|
||||||
|
let content = fs::read_to_string(&path)
|
||||||
|
.unwrap_or_else(|e| panic!("Failed to read deployment.yaml at {:?}: {}", path, e));
|
||||||
|
from_str(&content)
|
||||||
|
.unwrap_or_else(|e| panic!("Failed to parse deployment.yaml as K8s Deployment: {}", e))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True when a service.yaml template was generated for the given chart.
fn service_yaml_exists(project_root: &Path, chart_name: &str) -> bool {
    let relative = format!(".harmony_generated/helm/{chart_name}/templates/service.yaml");
    project_root.join(relative).exists()
}
|
||||||
|
|
||||||
|
// Service Assertions
|
||||||
|
fn assert_service_metadata(service: &K8sService, expected_name: &str) {
|
||||||
|
assert_eq!(
|
||||||
|
service.metadata.name.as_deref(),
|
||||||
|
Some(expected_name),
|
||||||
|
"Service name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_service_type(service: &K8sService, expected_type: &str) {
|
||||||
|
assert_eq!(
|
||||||
|
service.spec.as_ref().and_then(|s| s.type_.as_deref()),
|
||||||
|
Some(expected_type),
|
||||||
|
"Service type should be '{expected_type}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_service_port_count(service: &K8sService, expected_count: usize) {
|
||||||
|
let ports = service
|
||||||
|
.spec
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|s| s.ports.as_ref())
|
||||||
|
.unwrap_or_else(|| panic!("Service should have ports"));
|
||||||
|
assert_eq!(
|
||||||
|
ports.len(),
|
||||||
|
expected_count,
|
||||||
|
"Service should have {expected_count} ports"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_service_port(
|
||||||
|
port: &ServicePort,
|
||||||
|
expected_name: &str,
|
||||||
|
expected_protocol: &str,
|
||||||
|
expected_number: i32,
|
||||||
|
) {
|
||||||
|
assert_eq!(
|
||||||
|
port.name.as_deref(),
|
||||||
|
Some(expected_name),
|
||||||
|
"Port name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
port.protocol.as_deref(),
|
||||||
|
Some(expected_protocol),
|
||||||
|
"Port '{expected_name}' protocol should be '{expected_protocol}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
port.port, expected_number,
|
||||||
|
"Port '{expected_name}' number should be {expected_number}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_target_port_matches_service_port(port: &ServicePort) {
|
||||||
|
match &port.target_port {
|
||||||
|
Some(IntOrString::Int(target)) => {
|
||||||
|
assert_eq!(
|
||||||
|
*target,
|
||||||
|
port.port,
|
||||||
|
"Target port should match service port for '{}'",
|
||||||
|
port.name.as_deref().unwrap_or("unknown")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
_ => panic!(
|
||||||
|
"Target port should be Int for '{}'",
|
||||||
|
port.name.as_deref().unwrap_or("unknown")
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deployment Assertions
|
||||||
|
fn assert_deployment_metadata(deployment: &Deployment, expected_name: &str) {
|
||||||
|
assert_eq!(
|
||||||
|
deployment.metadata.name.as_deref(),
|
||||||
|
Some(expected_name),
|
||||||
|
"Deployment name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_deployment_replicas(deployment: &Deployment, expected_replicas: i32) {
|
||||||
|
let spec = deployment
|
||||||
|
.spec
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||||
|
assert_eq!(
|
||||||
|
spec.replicas,
|
||||||
|
Some(expected_replicas),
|
||||||
|
"Deployment should have {expected_replicas} replicas"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_selector_match_label(deployment: &Deployment, expected_label_value: &str) {
|
||||||
|
let spec = deployment
|
||||||
|
.spec
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||||
|
assert_eq!(
|
||||||
|
spec.selector
|
||||||
|
.match_labels
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|m| m.get("app.kubernetes.io/name")),
|
||||||
|
Some(&expected_label_value.to_string()),
|
||||||
|
"Selector should match app name '{expected_label_value}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_pod_labels(deployment: &Deployment, expected_name: &str) {
|
||||||
|
let spec = deployment
|
||||||
|
.spec
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Deployment should have spec"));
|
||||||
|
let metadata = spec
|
||||||
|
.template
|
||||||
|
.metadata
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Pod template should have metadata"));
|
||||||
|
let labels = metadata
|
||||||
|
.labels
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Pod should have labels"));
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
labels.get("app.kubernetes.io/name"),
|
||||||
|
Some(&expected_name.to_string()),
|
||||||
|
"Pod label app.kubernetes.io/name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
labels.get("app.kubernetes.io/instance"),
|
||||||
|
Some(&expected_name.to_string()),
|
||||||
|
"Pod label app.kubernetes.io/instance should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Container Assertions
|
||||||
|
fn assert_container_metadata(
|
||||||
|
container: &Container,
|
||||||
|
expected_name: &str,
|
||||||
|
expected_image: &str,
|
||||||
|
expected_pull_policy: &str,
|
||||||
|
) {
|
||||||
|
assert_eq!(
|
||||||
|
container.name, expected_name,
|
||||||
|
"Container name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
container.image.as_deref(),
|
||||||
|
Some(expected_image),
|
||||||
|
"Container image should be '{expected_image}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
container.image_pull_policy.as_deref(),
|
||||||
|
Some(expected_pull_policy),
|
||||||
|
"Image pull policy should be '{expected_pull_policy}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_container_ports_count(container: &Container, expected_count: usize) {
|
||||||
|
let ports = container
|
||||||
|
.ports
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Container should have ports"));
|
||||||
|
assert_eq!(
|
||||||
|
ports.len(),
|
||||||
|
expected_count,
|
||||||
|
"Container should have {expected_count} ports"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_container_port(
|
||||||
|
port: &k8s_openapi::api::core::v1::ContainerPort,
|
||||||
|
expected_name: &str,
|
||||||
|
expected_protocol: &str,
|
||||||
|
expected_number: i32,
|
||||||
|
) {
|
||||||
|
assert_eq!(
|
||||||
|
port.name.as_deref(),
|
||||||
|
Some(expected_name),
|
||||||
|
"Container port name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
port.protocol.as_deref(),
|
||||||
|
Some(expected_protocol),
|
||||||
|
"Container port '{expected_name}' protocol should be '{expected_protocol}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
port.container_port, expected_number,
|
||||||
|
"Container port '{expected_name}' number should be {expected_number}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_container_env_vars_count(container: &Container, expected_count: usize) {
|
||||||
|
let env_vars = container
|
||||||
|
.env
|
||||||
|
.as_ref()
|
||||||
|
.unwrap_or_else(|| panic!("Container should have env vars"));
|
||||||
|
assert_eq!(
|
||||||
|
env_vars.len(),
|
||||||
|
expected_count,
|
||||||
|
"Container should have {expected_count} env vars"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn assert_container_env_var(env_var: &EnvVar, expected_name: &str, expected_value: &str) {
|
||||||
|
assert_eq!(
|
||||||
|
env_var.name, expected_name,
|
||||||
|
"Env var name should be '{expected_name}'"
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
env_var.value.as_deref(),
|
||||||
|
Some(expected_value),
|
||||||
|
"Env var '{expected_name}' value should be '{expected_value}'"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract a clone of the first (and only expected) container from the
/// Deployment's pod template, panicking with a clear message on any missing link.
fn get_container(deployment: &Deployment) -> Container {
    // `expect` replaces `unwrap_or_else(|| panic!(...))` with constant
    // messages (clippy: expect_fun_call); the chain reads top-down:
    // deployment -> spec -> pod template -> pod spec -> first container.
    deployment
        .spec
        .as_ref()
        .expect("Deployment should have spec")
        .template
        .spec
        .as_ref()
        .expect("Pod template should have spec")
        .containers
        .first()
        .expect("Should have exactly one container")
        .clone()
}
|
||||||
|
|
||||||
|
// Test Fixtures
|
||||||
|
fn standard_test_ports() -> Vec<ApplicationNetworkPort> {
|
||||||
|
vec![
|
||||||
|
ApplicationNetworkPort {
|
||||||
|
number: 8080,
|
||||||
|
protocol: NetworkProtocol::TCP,
|
||||||
|
name: "http".to_string(),
|
||||||
|
},
|
||||||
|
ApplicationNetworkPort {
|
||||||
|
number: 9000,
|
||||||
|
protocol: NetworkProtocol::TCP,
|
||||||
|
name: "metrics".to_string(),
|
||||||
|
},
|
||||||
|
ApplicationNetworkPort {
|
||||||
|
number: 50051,
|
||||||
|
protocol: NetworkProtocol::TCP,
|
||||||
|
name: "grpc".to_string(),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn standard_test_env_vars() -> Vec<(String, String)> {
|
||||||
|
vec![
|
||||||
|
("ENV_VAR_1".to_string(), "value1".to_string()),
|
||||||
|
("ENV_VAR_2".to_string(), "value2".to_string()),
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn udp_test_ports() -> Vec<ApplicationNetworkPort> {
|
||||||
|
vec![
|
||||||
|
ApplicationNetworkPort {
|
||||||
|
number: 53,
|
||||||
|
protocol: NetworkProtocol::UDP,
|
||||||
|
name: "dns".to_string(),
|
||||||
|
},
|
||||||
|
ApplicationNetworkPort {
|
||||||
|
number: 8080,
|
||||||
|
protocol: NetworkProtocol::TCP,
|
||||||
|
name: "http".to_string(),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test Builder
|
||||||
|
/// Fluent builder for `BackendApp` test fixtures; unset fields are filled
/// with defaults by `build` (name "test-app", empty ports/env vars).
struct BackendAppTestBuilder {
    // Optional app name override.
    name: Option<String>,
    // Optional network-ports fixture.
    network_ports: Option<Vec<ApplicationNetworkPort>>,
    // Optional environment-variable fixture.
    env_vars: Option<Vec<(String, String)>>,
}
|
||||||
|
|
||||||
|
impl BackendAppTestBuilder {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
name: None,
|
||||||
|
network_ports: None,
|
||||||
|
env_vars: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_name(mut self, name: impl Into<String>) -> Self {
|
||||||
|
self.name = Some(name.into());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_standard_ports(mut self) -> Self {
|
||||||
|
self.network_ports = Some(standard_test_ports());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_udp_ports(mut self) -> Self {
|
||||||
|
self.network_ports = Some(udp_test_ports());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_standard_env_vars(mut self) -> Self {
|
||||||
|
self.env_vars = Some(standard_test_env_vars());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn with_no_ports(mut self) -> Self {
|
||||||
|
self.network_ports = Some(vec![]);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build(self, project_root: PathBuf) -> BackendApp {
|
||||||
|
BackendApp {
|
||||||
|
name: self.name.unwrap_or_else(|| "test-app".to_string()),
|
||||||
|
project_root,
|
||||||
|
network_ports: self.network_ports.unwrap_or_default(),
|
||||||
|
env_vars: self.env_vars.unwrap_or_default(),
|
||||||
|
build_cmd: BuildCommand::new("cargo", vec!["build"]),
|
||||||
|
dockerfile: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for BackendAppTestBuilder {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function for test setup
|
||||||
|
/// Build the Helm package for `app` against `image_url`, failing the test
/// with the underlying error message if packaging does not succeed.
async fn build_helm_chart_for_test(app: &BackendApp, image_url: &str) {
    let result = app.build_push_helm_package(image_url).await;
    assert!(
        result.is_ok(),
        "build_push_helm_package should succeed: {:?}",
        result
    );
}
|
||||||
|
|
||||||
|
// ===== SERVICE TESTS =====
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn service_is_created_with_application_name() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let service = read_service_yaml(&app.project_root, "test-app");
|
||||||
|
assert_service_metadata(&service, "test-app");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn service_has_default_clusterip_type() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let service = read_service_yaml(&app.project_root, "test-app");
|
||||||
|
assert_service_type(&service, "ClusterIP");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn service_exposes_all_network_ports() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let service = read_service_yaml(&app.project_root, "test-app");
|
||||||
|
assert_service_port_count(&service, 3);
|
||||||
|
|
||||||
|
let ports = service.spec.unwrap().ports.unwrap();
|
||||||
|
assert_service_port(&ports[0], "http", "TCP", 8080);
|
||||||
|
assert_service_port(&ports[1], "metrics", "TCP", 9000);
|
||||||
|
assert_service_port(&ports[2], "grpc", "TCP", 50051);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn service_target_ports_match_service_ports() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let service = read_service_yaml(&app.project_root, "test-app");
|
||||||
|
let ports = service.spec.unwrap().ports.unwrap();
|
||||||
|
|
||||||
|
for port in &ports {
|
||||||
|
assert_target_port_matches_service_port(port);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn service_not_created_when_application_has_no_ports() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app-no-ports")
|
||||||
|
.with_no_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app-no-ports:1.0.0").await;
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
!service_yaml_exists(&app.project_root, "test-app-no-ports"),
|
||||||
|
"service.yaml should not exist when there are no network ports"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn service_respects_port_protocol_type() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("udp-app")
|
||||||
|
.with_udp_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/udp-app:1.0.0").await;
|
||||||
|
|
||||||
|
let service = read_service_yaml(&app.project_root, "udp-app");
|
||||||
|
let ports = service.spec.unwrap().ports.unwrap();
|
||||||
|
|
||||||
|
assert_service_port(&ports[0], "dns", "UDP", 53);
|
||||||
|
assert_service_port(&ports[1], "http", "TCP", 8080);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ===== DEPLOYMENT METADATA TESTS =====
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn deployment_has_application_name() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
assert_deployment_metadata(&deployment, "test-app");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn deployment_has_single_replica_by_default() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
assert_deployment_replicas(&deployment, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn deployment_selector_matches_application_name() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
assert_selector_match_label(&deployment, "test-app");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn pod_has_standard_kubernetes_labels() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
assert_pod_labels(&deployment, "test-app");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ===== CONTAINER CONFIGURATION TESTS =====
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn container_has_correct_name_and_image() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
let image_url = "registry.example.com/test/test-app:1.0.0";
|
||||||
|
build_helm_chart_for_test(&app, image_url).await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
let container = get_container(&deployment);
|
||||||
|
|
||||||
|
assert_container_metadata(&container, "test-app", image_url, "IfNotPresent");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn container_exposes_all_application_ports() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
let container = get_container(&deployment);
|
||||||
|
|
||||||
|
assert_container_ports_count(&container, 3);
|
||||||
|
|
||||||
|
let ports = container.ports.unwrap();
|
||||||
|
assert_container_port(&ports[0], "http", "TCP", 8080);
|
||||||
|
assert_container_port(&ports[1], "metrics", "TCP", 9000);
|
||||||
|
assert_container_port(&ports[2], "grpc", "TCP", 50051);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn container_has_all_environment_variables() {
|
||||||
|
let temp_dir = tempdir().expect("Failed to create temp directory");
|
||||||
|
let app = BackendAppTestBuilder::new()
|
||||||
|
.with_name("test-app")
|
||||||
|
.with_standard_ports()
|
||||||
|
.with_standard_env_vars()
|
||||||
|
.build(temp_dir.path().to_path_buf());
|
||||||
|
|
||||||
|
build_helm_chart_for_test(&app, "registry.example.com/test/test-app:1.0.0").await;
|
||||||
|
|
||||||
|
let deployment = read_deployment_yaml(&app.project_root, "test-app");
|
||||||
|
let container = get_container(&deployment);
|
||||||
|
|
||||||
|
assert_container_env_vars_count(&container, 2);
|
||||||
|
|
||||||
|
let env_vars = container.env.unwrap();
|
||||||
|
assert_container_env_var(&env_vars[0], "ENV_VAR_1", "value1");
|
||||||
|
assert_container_env_var(&env_vars[1], "ENV_VAR_2", "value2");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ===== BUILD COMMAND UNIT TESTS =====
|
||||||
|
|
||||||
|
#[test]
fn build_command_creation_sets_program_and_args() {
    // `new` should store the program verbatim and convert each arg to String.
    let command = BuildCommand::new("docker", vec!["build", "-t", "myimage"]);
    assert_eq!(command.program, "docker");
    assert_eq!(command.args, ["build", "-t", "myimage"]);
}
|
||||||
|
|
||||||
|
#[test]
fn build_command_clone_copies_all_fields() {
    // Clone must be a deep, field-for-field copy.
    let original = BuildCommand::new("cargo", vec!["build", "--release"]);
    let copy = original.clone();
    assert_eq!(original.program, copy.program);
    assert_eq!(original.args, copy.args);
}
|
||||||
|
}
|
||||||
29
harmony/src/modules/application/config.rs
Normal file
29
harmony/src/modules/application/config.rs
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
|
/// Layer-4 protocol of an application port, matching the Kubernetes
/// `protocol` field values ("TCP" / "UDP").
#[derive(Debug, Clone, Serialize)]
pub enum NetworkProtocol {
    TCP,
    UDP,
}
|
||||||
|
|
||||||
|
impl NetworkProtocol {
|
||||||
|
pub fn as_str(&self) -> &str {
|
||||||
|
match self {
|
||||||
|
NetworkProtocol::TCP => "TCP",
|
||||||
|
NetworkProtocol::UDP => "UDP",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for NetworkProtocol {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.write_str(self.as_str())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A single network port exposed by an application; mapped to both a
/// container port and a Service port in the generated Helm chart.
#[derive(Debug, Clone, Serialize)]
pub struct ApplicationNetworkPort {
    // Port number (1-65535).
    pub number: u16,
    // L4 protocol, rendered via `NetworkProtocol::as_str` as "TCP"/"UDP".
    pub protocol: NetworkProtocol,
    // Port name (e.g. "http", "metrics"); used as the Kubernetes port name.
    pub name: String,
}
|
||||||
@@ -48,11 +48,11 @@ use crate::{
|
|||||||
/// - ArgoCD to install/upgrade/rollback/inspect k8s resources
|
/// - ArgoCD to install/upgrade/rollback/inspect k8s resources
|
||||||
/// - Kubernetes for runtime orchestration
|
/// - Kubernetes for runtime orchestration
|
||||||
#[derive(Debug, Default, Clone)]
|
#[derive(Debug, Default, Clone)]
|
||||||
pub struct PackagingDeployment<A: OCICompliant + HelmPackage + Webapp> {
|
pub struct PackagingDeployment<A: OCICompliant + HelmPackage> {
|
||||||
pub application: Arc<A>,
|
pub application: Arc<A>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
|
impl<A: OCICompliant + HelmPackage> PackagingDeployment<A> {
|
||||||
async fn deploy_to_local_k3d(
|
async fn deploy_to_local_k3d(
|
||||||
&self,
|
&self,
|
||||||
app_name: String,
|
app_name: String,
|
||||||
@@ -138,7 +138,7 @@ impl<A: OCICompliant + HelmPackage + Webapp> PackagingDeployment<A> {
|
|||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<
|
impl<
|
||||||
A: OCICompliant + HelmPackage + Webapp + Clone + 'static,
|
A: OCICompliant + HelmPackage + Clone + 'static,
|
||||||
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
|
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + Ingress + 'static,
|
||||||
> ApplicationFeature<T> for PackagingDeployment<A>
|
> ApplicationFeature<T> for PackagingDeployment<A>
|
||||||
{
|
{
|
||||||
@@ -148,24 +148,12 @@ impl<
|
|||||||
) -> Result<InstallationOutcome, InstallationError> {
|
) -> Result<InstallationOutcome, InstallationError> {
|
||||||
let image = self.application.image_name();
|
let image = self.application.image_name();
|
||||||
|
|
||||||
let domain = if topology.current_target() == DeploymentTarget::Production {
|
|
||||||
self.application.dns()
|
|
||||||
} else {
|
|
||||||
topology
|
|
||||||
.get_domain(&self.application.name())
|
|
||||||
.await
|
|
||||||
.map_err(|e| e.to_string())?
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO Write CI/CD workflow files
|
// TODO Write CI/CD workflow files
|
||||||
// we can autotedect the CI type using the remote url (default to github action for github
|
// we can autotedect the CI type using the remote url (default to github action for github
|
||||||
// url, etc..)
|
// url, etc..)
|
||||||
// Or ask for it when unknown
|
// Or ask for it when unknown
|
||||||
|
|
||||||
let helm_chart = self
|
let helm_chart = self.application.build_push_helm_package(&image).await?;
|
||||||
.application
|
|
||||||
.build_push_helm_package(&image, &domain)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
// TODO: Make building image configurable/skippable if image already exists (prompt)")
|
// TODO: Make building image configurable/skippable if image already exists (prompt)")
|
||||||
// https://git.nationtech.io/NationTech/harmony/issues/104
|
// https://git.nationtech.io/NationTech/harmony/issues/104
|
||||||
@@ -215,12 +203,12 @@ impl<
|
|||||||
};
|
};
|
||||||
|
|
||||||
Ok(InstallationOutcome::success_with_details(vec![format!(
|
Ok(InstallationOutcome::success_with_details(vec![format!(
|
||||||
"{}: http://{domain}",
|
"{}",
|
||||||
self.application.name()
|
self.application.name()
|
||||||
)]))
|
)]))
|
||||||
}
|
}
|
||||||
fn name(&self) -> String {
|
fn name(&self) -> String {
|
||||||
"ContinuousDelivery".to_string()
|
"PackagingDeployment".to_string()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
446
harmony/src/modules/application/helm/mod.rs
Normal file
446
harmony/src/modules/application/helm/mod.rs
Normal file
@@ -0,0 +1,446 @@
|
|||||||
|
// Re-export common Kubernetes types for convenience
|
||||||
|
pub use k8s_openapi::api::{
|
||||||
|
apps::v1::{Deployment, DeploymentSpec},
|
||||||
|
core::v1::{
|
||||||
|
Container, ContainerPort, EnvVar, PodSpec, PodTemplateSpec, Service as K8sService,
|
||||||
|
ServicePort, ServiceSpec,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||||
|
use kube::core::ObjectMeta;
|
||||||
|
|
||||||
|
// Import domain types for the deployment builder
|
||||||
|
use crate::modules::application::config::{ApplicationNetworkPort, NetworkProtocol};
|
||||||
|
use std::fs;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
/// Enum representing all supported Kubernetes resource types for Helm charts.
|
||||||
|
/// Supports built-in typed resources and custom CRDs via YAML strings.
|
||||||
|
pub enum HelmResourceKind {
|
||||||
|
/// Built-in typed Service resource
|
||||||
|
Service(K8sService),
|
||||||
|
/// Built-in typed Deployment resource
|
||||||
|
Deployment(Deployment),
|
||||||
|
/// Custom resource as pre-serialized YAML (e.g., CRDs, custom types)
|
||||||
|
CustomYaml { filename: String, content: String },
|
||||||
|
// Can add more typed variants as needed: ConfigMap, Secret, Ingress, etc.
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HelmResourceKind {
|
||||||
|
pub fn filename(&self) -> String {
|
||||||
|
match self {
|
||||||
|
HelmResourceKind::Service(_) => "service.yaml".to_string(),
|
||||||
|
HelmResourceKind::Deployment(_) => "deployment.yaml".to_string(),
|
||||||
|
HelmResourceKind::CustomYaml { filename, .. } => filename.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn serialize_to_yaml(&self) -> Result<String, serde_yaml::Error> {
|
||||||
|
match self {
|
||||||
|
HelmResourceKind::Service(s) => serde_yaml::to_string(s),
|
||||||
|
HelmResourceKind::Deployment(d) => serde_yaml::to_string(d),
|
||||||
|
HelmResourceKind::CustomYaml { content, .. } => Ok(content.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_service(&self) -> Option<&K8sService> {
|
||||||
|
match self {
|
||||||
|
HelmResourceKind::Service(s) => Some(s),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_deployment(&self) -> Option<&Deployment> {
|
||||||
|
match self {
|
||||||
|
HelmResourceKind::Deployment(d) => Some(d),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a custom resource from any serializable type (e.g., CRDs, custom types)
|
||||||
|
pub fn from_yaml(filename: impl Into<String>, content: impl Into<String>) -> Self {
|
||||||
|
HelmResourceKind::CustomYaml {
|
||||||
|
filename: filename.into(),
|
||||||
|
content: content.into(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a custom resource from any type that implements Serialize
|
||||||
|
pub fn from_serializable<T: serde::Serialize>(
|
||||||
|
filename: impl Into<String>,
|
||||||
|
resource: &T,
|
||||||
|
) -> Result<Self, serde_yaml::Error> {
|
||||||
|
Ok(HelmResourceKind::CustomYaml {
|
||||||
|
filename: filename.into(),
|
||||||
|
content: serde_yaml::to_string(resource)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The main orchestrator for building a Helm chart.
|
||||||
|
pub struct HelmChart {
|
||||||
|
pub name: String,
|
||||||
|
pub version: String,
|
||||||
|
pub app_version: String,
|
||||||
|
pub description: String,
|
||||||
|
pub resources: Vec<HelmResourceKind>,
|
||||||
|
pub values: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HelmChart {
|
||||||
|
pub fn new(name: String, app_version: String) -> Self {
|
||||||
|
Self {
|
||||||
|
name: name.clone(),
|
||||||
|
version: "0.1.0".to_string(),
|
||||||
|
app_version,
|
||||||
|
description: format!("A Helm chart for {}", name),
|
||||||
|
resources: Vec::new(),
|
||||||
|
values: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_resource(&mut self, resource: HelmResourceKind) {
|
||||||
|
self.resources.push(resource);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_value(&mut self, key: &str, value: &str) {
|
||||||
|
self.values.push(format!("{}: {}", key, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write_to(&self, base_path: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||||
|
let chart_dir = base_path.join(&self.name);
|
||||||
|
let templates_dir = chart_dir.join("templates");
|
||||||
|
fs::create_dir_all(&templates_dir)?;
|
||||||
|
|
||||||
|
// 1. Render and write Chart.yaml
|
||||||
|
let chart_yaml = ChartYaml {
|
||||||
|
name: &self.name,
|
||||||
|
description: &self.description,
|
||||||
|
version: &self.version,
|
||||||
|
app_version: &self.app_version,
|
||||||
|
};
|
||||||
|
fs::write(chart_dir.join("Chart.yaml"), chart_yaml.render()?)?;
|
||||||
|
|
||||||
|
// 2. Write values.yaml (Constructed dynamically)
|
||||||
|
let values_content = self.values.join("\n");
|
||||||
|
fs::write(chart_dir.join("values.yaml"), values_content)?;
|
||||||
|
|
||||||
|
// 3. Serialize and write all added resources (Deployment, Service, etc.)
|
||||||
|
for resource in &self.resources {
|
||||||
|
let filename = resource.filename();
|
||||||
|
let content = resource
|
||||||
|
.serialize_to_yaml()
|
||||||
|
.map_err(|e| format!("Failed to serialize resource {}: {}", filename, e))?;
|
||||||
|
fs::write(templates_dir.join(filename), content)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(chart_dir)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
use askama::Template;
|
||||||
|
|
||||||
|
#[derive(Template)]
|
||||||
|
#[template(path = "helm/Chart.yaml.j2")]
|
||||||
|
struct ChartYaml<'a> {
|
||||||
|
name: &'a str,
|
||||||
|
description: &'a str,
|
||||||
|
version: &'a str,
|
||||||
|
app_version: &'a str,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builder for creating a Kubernetes Service with proper labels and selectors.
|
||||||
|
pub struct ServiceBuilder {
|
||||||
|
name: String,
|
||||||
|
service_type: String,
|
||||||
|
ports: Vec<ServicePort>,
|
||||||
|
selector_label: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ServiceBuilder {
|
||||||
|
pub fn new(name: impl Into<String>) -> Self {
|
||||||
|
Self {
|
||||||
|
name: name.into(),
|
||||||
|
service_type: "ClusterIP".to_string(),
|
||||||
|
ports: Vec::new(),
|
||||||
|
selector_label: String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn service_type(mut self, service_type: impl Into<String>) -> Self {
|
||||||
|
self.service_type = service_type.into();
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_port(
|
||||||
|
mut self,
|
||||||
|
name: impl Into<String>,
|
||||||
|
port: i32,
|
||||||
|
protocol: impl Into<String>,
|
||||||
|
) -> Self {
|
||||||
|
use k8s_openapi::apimachinery::pkg::util::intstr::IntOrString;
|
||||||
|
self.ports.push(ServicePort {
|
||||||
|
name: Some(name.into()),
|
||||||
|
protocol: Some(protocol.into()),
|
||||||
|
port,
|
||||||
|
target_port: Some(IntOrString::Int(port)),
|
||||||
|
..Default::default()
|
||||||
|
});
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn selector_label(mut self, label: impl Into<String>) -> Self {
|
||||||
|
self.selector_label = label.into();
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build(self) -> K8sService {
|
||||||
|
K8sService {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(self.name.clone()),
|
||||||
|
labels: Some(
|
||||||
|
[
|
||||||
|
("app.kubernetes.io/name".to_string(), self.name.clone()),
|
||||||
|
(
|
||||||
|
"app.kubernetes.io/component".to_string(),
|
||||||
|
"service".to_string(),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"app.kubernetes.io/managed-by".to_string(),
|
||||||
|
"harmony".to_string(),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
.into(),
|
||||||
|
),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
spec: Some(ServiceSpec {
|
||||||
|
type_: Some(self.service_type),
|
||||||
|
selector: Some(
|
||||||
|
[("app.kubernetes.io/name".to_string(), self.selector_label)].into(),
|
||||||
|
),
|
||||||
|
ports: if self.ports.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(self.ports)
|
||||||
|
},
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builder for creating a Kubernetes Deployment with pod template and container spec.
|
||||||
|
pub struct DeploymentBuilder {
|
||||||
|
name: String,
|
||||||
|
image: String,
|
||||||
|
replicas: i32,
|
||||||
|
container_ports: Vec<ContainerPort>,
|
||||||
|
env_vars: Vec<EnvVar>,
|
||||||
|
image_pull_policy: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeploymentBuilder {
|
||||||
|
/// Create a new DeploymentBuilder with minimal required fields.
|
||||||
|
pub fn new(name: impl Into<String>, image: impl Into<String>) -> Self {
|
||||||
|
Self::with_options(name, image, None, None, None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new DeploymentBuilder with optional initial configuration.
|
||||||
|
///
|
||||||
|
/// Arguments:
|
||||||
|
/// - `name`: The deployment name
|
||||||
|
/// - `image`: The container image to use
|
||||||
|
/// - `ports`: Optional vector of initial application network ports
|
||||||
|
/// - `env_vars`: Optional vector of initial environment variable key-value pairs
|
||||||
|
/// - `replicas`: Optional number of replicas (defaults to 1)
|
||||||
|
pub fn with_options(
|
||||||
|
name: impl Into<String>,
|
||||||
|
image: impl Into<String>,
|
||||||
|
ports: Option<Vec<ApplicationNetworkPort>>,
|
||||||
|
env_vars: Option<Vec<(String, String)>>,
|
||||||
|
replicas: Option<i32>,
|
||||||
|
) -> Self {
|
||||||
|
let container_ports: Vec<ContainerPort> = ports
|
||||||
|
.unwrap_or_default()
|
||||||
|
.into_iter()
|
||||||
|
.map(|port| ContainerPort {
|
||||||
|
container_port: port.number as i32,
|
||||||
|
name: Some(port.name),
|
||||||
|
protocol: Some(port.protocol.to_string()),
|
||||||
|
..Default::default()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let k8s_env_vars: Vec<EnvVar> = env_vars
|
||||||
|
.unwrap_or_default()
|
||||||
|
.into_iter()
|
||||||
|
.map(|(key, value)| EnvVar {
|
||||||
|
name: key,
|
||||||
|
value: Some(value),
|
||||||
|
..Default::default()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Self {
|
||||||
|
name: name.into(),
|
||||||
|
image: image.into(),
|
||||||
|
replicas: replicas.unwrap_or(1),
|
||||||
|
container_ports,
|
||||||
|
env_vars: k8s_env_vars,
|
||||||
|
image_pull_policy: Some("IfNotPresent".to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn replicas(mut self, replicas: i32) -> Self {
|
||||||
|
self.replicas = replicas;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_container_port(
|
||||||
|
mut self,
|
||||||
|
number: i32,
|
||||||
|
name: impl Into<String>,
|
||||||
|
protocol: impl Into<String>,
|
||||||
|
) -> Self {
|
||||||
|
self.container_ports.push(ContainerPort {
|
||||||
|
container_port: number,
|
||||||
|
name: Some(name.into()),
|
||||||
|
protocol: Some(protocol.into()),
|
||||||
|
..Default::default()
|
||||||
|
});
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_env_var(mut self, name: impl Into<String>, value: impl Into<String>) -> Self {
|
||||||
|
self.env_vars.push(EnvVar {
|
||||||
|
name: name.into(),
|
||||||
|
value: Some(value.into()),
|
||||||
|
..Default::default()
|
||||||
|
});
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn image_pull_policy(mut self, policy: impl Into<String>) -> Self {
|
||||||
|
self.image_pull_policy = Some(policy.into());
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build(self) -> Deployment {
|
||||||
|
let name = self.name.clone();
|
||||||
|
Deployment {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(name.clone()),
|
||||||
|
labels: Some(
|
||||||
|
[
|
||||||
|
("app.kubernetes.io/name".to_string(), name.clone()),
|
||||||
|
(
|
||||||
|
"app.kubernetes.io/component".to_string(),
|
||||||
|
"deployment".to_string(),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"app.kubernetes.io/managed-by".to_string(),
|
||||||
|
"harmony".to_string(),
|
||||||
|
),
|
||||||
|
("app.kubernetes.io/version".to_string(), "1.0.0".to_string()),
|
||||||
|
]
|
||||||
|
.into(),
|
||||||
|
),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
spec: Some(DeploymentSpec {
|
||||||
|
replicas: Some(self.replicas),
|
||||||
|
selector: k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector {
|
||||||
|
match_labels: Some(
|
||||||
|
[("app.kubernetes.io/name".to_string(), name.clone())].into(),
|
||||||
|
),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
template: PodTemplateSpec {
|
||||||
|
metadata: Some(ObjectMeta {
|
||||||
|
labels: Some(
|
||||||
|
[
|
||||||
|
("app.kubernetes.io/name".to_string(), name.clone()),
|
||||||
|
("app.kubernetes.io/instance".to_string(), name.clone()),
|
||||||
|
]
|
||||||
|
.into(),
|
||||||
|
),
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
spec: Some(PodSpec {
|
||||||
|
containers: vec![Container {
|
||||||
|
name: name.clone(),
|
||||||
|
image: Some(self.image),
|
||||||
|
image_pull_policy: self.image_pull_policy,
|
||||||
|
ports: if self.container_ports.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(self.container_ports)
|
||||||
|
},
|
||||||
|
env: if self.env_vars.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(self.env_vars)
|
||||||
|
},
|
||||||
|
..Default::default()
|
||||||
|
}],
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to create a Service from network port configuration.
|
||||||
|
/// Returns `None` if no ports are provided.
|
||||||
|
pub fn create_service_from_ports(
|
||||||
|
name: String,
|
||||||
|
network_ports: &[ApplicationNetworkPort],
|
||||||
|
) -> Option<K8sService> {
|
||||||
|
if network_ports.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let ports: Vec<ServicePort> = network_ports
|
||||||
|
.into_iter()
|
||||||
|
.map(|port| ServicePort {
|
||||||
|
name: Some(port.name.clone()),
|
||||||
|
protocol: Some(port.protocol.to_string()),
|
||||||
|
port: port.number as i32,
|
||||||
|
target_port: Some(IntOrString::Int(port.number as i32)),
|
||||||
|
..Default::default()
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Some(K8sService {
|
||||||
|
metadata: ObjectMeta {
|
||||||
|
name: Some(name.clone()),
|
||||||
|
labels: Some(
|
||||||
|
[
|
||||||
|
("app.kubernetes.io/name".to_string(), name.clone()),
|
||||||
|
(
|
||||||
|
"app.kubernetes.io/component".to_string(),
|
||||||
|
"service".to_string(),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"app.kubernetes.io/managed-by".to_string(),
|
||||||
|
"harmony".to_string(),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
.into(),
|
||||||
|
),
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
spec: Some(ServiceSpec {
|
||||||
|
type_: Some("ClusterIP".to_string()),
|
||||||
|
selector: Some([("app.kubernetes.io/name".to_string(), name)].into()),
|
||||||
|
ports: Some(ports),
|
||||||
|
..Default::default()
|
||||||
|
}),
|
||||||
|
..Default::default()
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -1,5 +1,8 @@
|
|||||||
|
pub mod backend_app;
|
||||||
|
pub mod config;
|
||||||
mod feature;
|
mod feature;
|
||||||
pub mod features;
|
pub mod features;
|
||||||
|
pub mod helm;
|
||||||
pub mod oci;
|
pub mod oci;
|
||||||
mod rust;
|
mod rust;
|
||||||
mod webapp;
|
mod webapp;
|
||||||
@@ -124,3 +127,15 @@ impl Serialize for dyn Application {
|
|||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks the output of a process command for success.
|
||||||
|
fn check_output(
|
||||||
|
output: &std::process::Output,
|
||||||
|
msg: &str,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
if !output.status.success() {
|
||||||
|
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
|
||||||
|
return Err(error_message.into());
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,5 +1,13 @@
|
|||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
config::{REGISTRY_PROJECT, REGISTRY_URL},
|
||||||
|
modules::application::check_output,
|
||||||
|
};
|
||||||
|
|
||||||
use super::Application;
|
use super::Application;
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
use log::debug;
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
pub trait OCICompliant: Application {
|
pub trait OCICompliant: Application {
|
||||||
@@ -17,9 +25,74 @@ pub trait HelmPackage: Application {
|
|||||||
/// # Arguments
|
/// # Arguments
|
||||||
/// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
|
/// * `image_url` - The full URL of the OCI container image to be used in the Deployment.
|
||||||
/// * `domain` - The domain where the application is hosted.
|
/// * `domain` - The domain where the application is hosted.
|
||||||
async fn build_push_helm_package(
|
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String>;
|
||||||
&self,
|
|
||||||
image_url: &str,
|
fn project_root(&self) -> PathBuf;
|
||||||
domain: &str,
|
|
||||||
) -> Result<String, String>;
|
fn chart_name(&self) -> String;
|
||||||
|
|
||||||
|
/// Packages a Helm chart directory into a .tgz file.
|
||||||
|
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||||
|
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
|
||||||
|
debug!(
|
||||||
|
"Launching `helm package {}` cli with CWD {}",
|
||||||
|
chart_dirname.to_string_lossy(),
|
||||||
|
&self
|
||||||
|
.project_root()
|
||||||
|
.join(".harmony_generated")
|
||||||
|
.join("helm")
|
||||||
|
.to_string_lossy()
|
||||||
|
);
|
||||||
|
let output = std::process::Command::new("helm")
|
||||||
|
.args(["package", chart_dirname.to_str().unwrap()])
|
||||||
|
.current_dir(self.project_root().join(".harmony_generated").join("helm")) // Run package from the parent dir
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
check_output(&output, "Failed to package Helm chart")?;
|
||||||
|
|
||||||
|
// Helm prints the path of the created chart to stdout.
|
||||||
|
let tgz_name = String::from_utf8(output.stdout)?
|
||||||
|
.split_whitespace()
|
||||||
|
.last()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.to_string();
|
||||||
|
if tgz_name.is_empty() {
|
||||||
|
return Err("Could not determine packaged chart filename.".into());
|
||||||
|
}
|
||||||
|
|
||||||
|
// The output from helm is relative, so we join it with the execution directory.
|
||||||
|
Ok(self
|
||||||
|
.project_root()
|
||||||
|
.join(".harmony_generated")
|
||||||
|
.join("helm")
|
||||||
|
.join(tgz_name))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pushes a packaged Helm chart to an OCI registry.
|
||||||
|
fn push_helm_chart(
|
||||||
|
&self,
|
||||||
|
packaged_chart_path: &Path,
|
||||||
|
) -> Result<String, Box<dyn std::error::Error>> {
|
||||||
|
// The chart name is the file stem of the .tgz file
|
||||||
|
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
|
||||||
|
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
|
||||||
|
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.chart_name());
|
||||||
|
debug!(
|
||||||
|
"Pushing Helm chart {} to {}",
|
||||||
|
packaged_chart_path.to_string_lossy(),
|
||||||
|
oci_push_url
|
||||||
|
);
|
||||||
|
|
||||||
|
let output = std::process::Command::new("helm")
|
||||||
|
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
|
||||||
|
.output()?;
|
||||||
|
|
||||||
|
check_output(&output, "Pushing Helm chart failed")?;
|
||||||
|
|
||||||
|
// The final URL includes the version tag, which is part of the file name
|
||||||
|
let version = chart_file_name.rsplit_once('-').unwrap().1;
|
||||||
|
debug!("pull url {oci_pull_url}");
|
||||||
|
debug!("push url {oci_push_url}");
|
||||||
|
Ok(format!("{}:{}", oci_pull_url, version))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,16 +81,21 @@ impl Webapp for RustWebapp {
|
|||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl HelmPackage for RustWebapp {
|
impl HelmPackage for RustWebapp {
|
||||||
async fn build_push_helm_package(
|
fn project_root(&self) -> PathBuf {
|
||||||
&self,
|
self.project_root.clone()
|
||||||
image_url: &str,
|
}
|
||||||
domain: &str,
|
|
||||||
) -> Result<String, String> {
|
fn chart_name(&self) -> String {
|
||||||
|
self.name.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn build_push_helm_package(&self, image_url: &str) -> Result<String, String> {
|
||||||
|
let domain = self.dns();
|
||||||
info!("Starting Helm chart build and push for '{}'", self.name);
|
info!("Starting Helm chart build and push for '{}'", self.name);
|
||||||
|
|
||||||
// 1. Create the Helm chart files on disk.
|
// 1. Create the Helm chart files on disk.
|
||||||
let chart_dir = self
|
let chart_dir = self
|
||||||
.create_helm_chart_files(image_url, domain)
|
.create_helm_chart_files(image_url, &domain)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
|
.map_err(|e| format!("Failed to create Helm chart files: {}", e))?;
|
||||||
info!("Successfully created Helm chart files in {:?}", chart_dir);
|
info!("Successfully created Helm chart files in {:?}", chart_dir);
|
||||||
@@ -327,19 +332,6 @@ impl RustWebapp {
|
|||||||
Ok(image_tag.to_string())
|
Ok(image_tag.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Checks the output of a process command for success.
|
|
||||||
fn check_output(
|
|
||||||
&self,
|
|
||||||
output: &process::Output,
|
|
||||||
msg: &str,
|
|
||||||
) -> Result<(), Box<dyn std::error::Error>> {
|
|
||||||
if !output.status.success() {
|
|
||||||
let error_message = format!("{}: {}", msg, String::from_utf8_lossy(&output.stderr));
|
|
||||||
return Err(error_message.into());
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
|
fn build_builder_image(&self, dockerfile: &mut Dockerfile) {
|
||||||
match self.framework {
|
match self.framework {
|
||||||
Some(RustWebFramework::Leptos) => {
|
Some(RustWebFramework::Leptos) => {
|
||||||
@@ -640,71 +632,6 @@ spec:
|
|||||||
Ok(chart_dir)
|
Ok(chart_dir)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Packages a Helm chart directory into a .tgz file.
|
|
||||||
fn package_helm_chart(&self, chart_dir: &Path) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
|
||||||
let chart_dirname = chart_dir.file_name().expect("Should find a chart dirname");
|
|
||||||
debug!(
|
|
||||||
"Launching `helm package {}` cli with CWD {}",
|
|
||||||
chart_dirname.to_string_lossy(),
|
|
||||||
&self
|
|
||||||
.project_root
|
|
||||||
.join(".harmony_generated")
|
|
||||||
.join("helm")
|
|
||||||
.to_string_lossy()
|
|
||||||
);
|
|
||||||
let output = process::Command::new("helm")
|
|
||||||
.args(["package", chart_dirname.to_str().unwrap()])
|
|
||||||
.current_dir(self.project_root.join(".harmony_generated").join("helm")) // Run package from the parent dir
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
self.check_output(&output, "Failed to package Helm chart")?;
|
|
||||||
|
|
||||||
// Helm prints the path of the created chart to stdout.
|
|
||||||
let tgz_name = String::from_utf8(output.stdout)?
|
|
||||||
.split_whitespace()
|
|
||||||
.last()
|
|
||||||
.unwrap_or_default()
|
|
||||||
.to_string();
|
|
||||||
if tgz_name.is_empty() {
|
|
||||||
return Err("Could not determine packaged chart filename.".into());
|
|
||||||
}
|
|
||||||
|
|
||||||
// The output from helm is relative, so we join it with the execution directory.
|
|
||||||
Ok(self
|
|
||||||
.project_root
|
|
||||||
.join(".harmony_generated")
|
|
||||||
.join("helm")
|
|
||||||
.join(tgz_name))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Pushes a packaged Helm chart to an OCI registry.
|
|
||||||
fn push_helm_chart(
|
|
||||||
&self,
|
|
||||||
packaged_chart_path: &Path,
|
|
||||||
) -> Result<String, Box<dyn std::error::Error>> {
|
|
||||||
// The chart name is the file stem of the .tgz file
|
|
||||||
let chart_file_name = packaged_chart_path.file_stem().unwrap().to_str().unwrap();
|
|
||||||
let oci_push_url = format!("oci://{}/{}", *REGISTRY_URL, *REGISTRY_PROJECT);
|
|
||||||
let oci_pull_url = format!("{oci_push_url}/{}-chart", self.name);
|
|
||||||
debug!(
|
|
||||||
"Pushing Helm chart {} to {}",
|
|
||||||
packaged_chart_path.to_string_lossy(),
|
|
||||||
oci_push_url
|
|
||||||
);
|
|
||||||
|
|
||||||
let output = process::Command::new("helm")
|
|
||||||
.args(["push", packaged_chart_path.to_str().unwrap(), &oci_push_url])
|
|
||||||
.output()?;
|
|
||||||
|
|
||||||
self.check_output(&output, "Pushing Helm chart failed")?;
|
|
||||||
|
|
||||||
// The final URL includes the version tag, which is part of the file name
|
|
||||||
let version = chart_file_name.rsplit_once('-').unwrap().1;
|
|
||||||
debug!("pull url {oci_pull_url}");
|
|
||||||
debug!("push url {oci_push_url}");
|
|
||||||
Ok(format!("{}:{}", oci_pull_url, version))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
fn get_or_build_dockerfile(&self) -> Result<PathBuf, Box<dyn std::error::Error>> {
|
||||||
let existing_dockerfile = self.project_root.join("Dockerfile");
|
let existing_dockerfile = self.project_root.join("Dockerfile");
|
||||||
|
|
||||||
|
|||||||
6
harmony/templates/helm/Chart.yaml.j2
Normal file
6
harmony/templates/helm/Chart.yaml.j2
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
apiVersion: v2
|
||||||
|
name: {{ name }}
|
||||||
|
description: {{ description }}
|
||||||
|
type: application
|
||||||
|
version: {{ version }}
|
||||||
|
appVersion: "{{ app_version }}"
|
||||||
4
harmony_agent/.dockerignore
Normal file
4
harmony_agent/.dockerignore
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
.git
|
||||||
|
data
|
||||||
|
target
|
||||||
|
demos
|
||||||
26
harmony_agent/Cargo.toml
Normal file
26
harmony_agent/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
[package]
|
||||||
|
name = "harmony_agent"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
harmony = { path = "../harmony" }
|
||||||
|
# harmony_cli = { path = "../harmony_cli" }
|
||||||
|
harmony_types = { path = "../harmony_types" }
|
||||||
|
harmony_macros = { path = "../harmony_macros" }
|
||||||
|
cidr = { workspace = true }
|
||||||
|
tokio = { workspace = true }
|
||||||
|
log = { workspace = true }
|
||||||
|
env_logger = { workspace = true }
|
||||||
|
async-nats = "0.45.0"
|
||||||
|
async-trait = "0.1"
|
||||||
|
# url = { workspace = true }
|
||||||
|
|
||||||
|
serde.workspace = true
|
||||||
|
serde_json.workspace = true
|
||||||
|
getrandom = "0.3.4"
|
||||||
|
|
||||||
|
thiserror.workspace = true
|
||||||
|
pretty_assertions.workspace = true
|
||||||
44
harmony_agent/Dockerfile
Normal file
44
harmony_agent/Dockerfile
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# Build stage
|
||||||
|
FROM rust:slim AS builder
|
||||||
|
|
||||||
|
# Install build dependencies
|
||||||
|
RUN apt-get update && apt-get install -y pkg-config && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy all required packages
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
RUN ls -la1
|
||||||
|
|
||||||
|
# Build the application in release mode
|
||||||
|
RUN cargo build --release -p harmony_agent
|
||||||
|
|
||||||
|
# Runtime stage
|
||||||
|
FROM debian:bookworm-slim
|
||||||
|
|
||||||
|
# Install runtime dependencies
|
||||||
|
RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy the binary from the builder stage
|
||||||
|
COPY --from=builder /app/target/release/harmony_agent ./harmony_agent
|
||||||
|
|
||||||
|
# Declare environment variables used by the Harmony Agent
|
||||||
|
# These will be set from build-time environment variables if present
|
||||||
|
# NATS_URL: URL of the NATS server (default: nats://localhost:4222)
|
||||||
|
ARG NATS_URL=nats://localhost:4222
|
||||||
|
ENV NATS_URL=${NATS_URL}
|
||||||
|
# NATS_CREDS_PATH: Optional path to NATS credentials file
|
||||||
|
ARG NATS_CREDS_PATH
|
||||||
|
ENV NATS_CREDS_PATH=${NATS_CREDS_PATH}
|
||||||
|
# MY_CLUSTER_ID: This cluster's unique identifier (required)
|
||||||
|
ARG MY_CLUSTER_ID
|
||||||
|
ENV MY_CLUSTER_ID=${MY_CLUSTER_ID}
|
||||||
|
# DESIRED_PRIMARY: The ID of the desired primary cluster (required)
|
||||||
|
ARG DESIRED_PRIMARY
|
||||||
|
ENV DESIRED_PRIMARY=${DESIRED_PRIMARY}
|
||||||
|
|
||||||
|
# Run the application
|
||||||
|
ENTRYPOINT ["./harmony_agent"]
|
||||||
248
harmony_agent/README.md
Normal file
248
harmony_agent/README.md
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
TODO
|
||||||
|
|
||||||
|
DONE:
|
||||||
|
1. ✅ store trait subscribe definition missing callback - Fixed with SubscriptionCallback type
|
||||||
|
2. ✅ BUG: data integrity issue: nats store now using jetstream metadata (entry.created, entry.revision)
|
||||||
|
3. ✅ fix replica workflow not transitioning to "failed" when failure_threshold is exceeded
|
||||||
|
4. ✅ fix replica workflow to hold copy of cluster state - cluster_state field added to HarmonyAgent
|
||||||
|
5. ✅ heartbeat metadata now passed to workflow via on_heartbeat_stored() callback
|
||||||
|
6. ✅ failover_timeout added to AgentConfig
|
||||||
|
7. ✅ NATS store properly detects SequenceMismatch and returns SequenceMismatch error
|
||||||
|
8. ✅ startup reconciliation implemented via on_startup() method
|
||||||
|
|
||||||
|
REMAINING:
|
||||||
|
- review all code and list implementation issues
|
||||||
|
- review both workflow for each state transition
|
||||||
|
- Complete replica workflow staleness detection (needs implementation in Watching state)
|
||||||
|
- Implement state recovery from Failed state for both workflows
|
||||||
|
- Implement subscribe in NATS store with watch() API
|
||||||
|
- Implement config validation for failover_timeout constraints
|
||||||
|
|
||||||
|
TODO
|
||||||
|
|
||||||
|
1. store trait subscribe definition missing callback
|
||||||
|
2. BUG, data integrity issue : nats store not actually using jetstream metadata
|
||||||
|
3. review all code and list implementation issues
|
||||||
|
4. review both workflow for each state transition
|
||||||
|
5. fix replica workflow not transitionning to "failed" when failure_threshold is exceeded
|
||||||
|
6. fix replica workflow to hold also a copy of the cluster state (actually the agent itself
|
||||||
|
should hold it probably, every agent should be subscribed to the cluster_state object and
|
||||||
|
keep it in memory to allow workflows to process against it efficiently)
|
||||||
|
|
||||||
|
## CRITICAL - Data Integrity Issues
|
||||||
|
|
||||||
|
1. **NATS Store `set_strict` doesn't enforce CAS** (`store/nats.rs`)
|
||||||
|
- Currently uses `put()` which overwrites unconditionally
|
||||||
|
- Must use `update()` with revision parameter for proper compare-and-set
|
||||||
|
- Without this, concurrent promotion attempts can cause split brain
|
||||||
|
|
||||||
|
2. **NATS Store uses local clock instead of JetStream metadata** (`store/nats.rs`)
|
||||||
|
- Lines 55, 68: Using `SystemTime::now()` violates ADR-017-3
|
||||||
|
- NATS Entry has `.revision` and `.created` fields that must be used
|
||||||
|
- This defeats the entire purpose of store-provided timestamps
|
||||||
|
|
||||||
|
3. **Heartbeat metadata not passed to ReplicaWorkflow** (`agent_loop.rs::run_heartbeat_loop`)
|
||||||
|
- Line ~156: TODO comment confirms missing metadata passing
|
||||||
|
- Replica cannot calculate staleness without metadata.timestamp
|
||||||
|
- Failover logic is broken
|
||||||
|
|
||||||
|
4. **No actual cluster state watching exists**
|
||||||
|
- Replica workflow declares `ClusterState` but never updates it
|
||||||
|
- No subscription to primary heartbeat or cluster_state key
|
||||||
|
- Replica cannot detect primary liveness
|
||||||
|
|
||||||
|
## HIGH - Missing Core Functionality
|
||||||
|
|
||||||
|
5. **Replica Workflow incomplete** - All key logic is TODO:
|
||||||
|
- Watching primary staleness (line 114)
|
||||||
|
- Promotion attempt (line 118)
|
||||||
|
- Original primary recovery detection (line 127)
|
||||||
|
- Demotion/handshake (line 131)
|
||||||
|
|
||||||
|
6. **Missing replica "Failed" state**
|
||||||
|
- `ReplicaState` enum has no `Failed` variant
|
||||||
|
- User's TODO #5 correctly identifies this gap
|
||||||
|
- What happens if replica's own heartbeats fail repeatedly?
|
||||||
|
|
||||||
|
7. **Primary Workflow incomplete** - Key logic missing:
|
||||||
|
- No NATS check before recovering from `Fenced` state (line 95)
|
||||||
|
- No NATS check in `Yielding` state for demotion handshake (line 101)
|
||||||
|
- No actual fencing failure handling
|
||||||
|
|
||||||
|
8. **Store `subscribe` not implemented** (`store/mod.rs`)
|
||||||
|
- Returns `todo!()` in NATS implementation
|
||||||
|
- No callback mechanism defined in trait
|
||||||
|
- Without this, agents cannot react to state changes
|
||||||
|
|
||||||
|
9. **Cluster state not tracked centrally**
|
||||||
|
- User's TODO #6 correctly identifies this
|
||||||
|
- Each agent should maintain a local copy of cluster_state
|
||||||
|
- No subscription mechanism to update this local copy
|
||||||
|
|
||||||
|
10. **No validation of configuration constraints**
|
||||||
|
- Should validate: `failover_timeout > heartbeat_timeout * failure_threshold + safety_margin`
|
||||||
|
- Invalid config could cause split brain
|
||||||
|
|
||||||
|
## MEDIUM - Incorrect State Transitions
|
||||||
|
|
||||||
|
11. **Primary immediately transitions `Failed -> Fenced`** (`workflow/primary.rs:120-121`)
|
||||||
|
- Two state transitions happen in one heartbeat cycle
|
||||||
|
- Should stay in `Failed` until fencing actually completes
|
||||||
|
- What if fencing fails? State machine won't reflect it
|
||||||
|
|
||||||
|
12. **No fencing failure handling**
|
||||||
|
- If `on_failover()` fails, node thinks it's fenced but DB is still accepting writes
|
||||||
|
- ADR mentions escalating to radical measures, but no callback for failure
|
||||||
|
|
||||||
|
13. **Replica `Watching` state does nothing**
|
||||||
|
- Line 115: Just logs, checks nothing
|
||||||
|
- Should be checking staleness of primary heartbeat
|
||||||
|
|
||||||
|
14. **Demotion handshake not implemented**
|
||||||
|
- ADR section 4 details this but code doesn't implement it
|
||||||
|
- How does original primary know it should yield?
|
||||||
|
|
||||||
|
## LOW - Observability & Reliability
|
||||||
|
|
||||||
|
15. **No graceful shutdown mechanism**
|
||||||
|
- `run_heartbeat_loop` runs forever
|
||||||
|
- No signal handling (SIGTERM, SIGINT)
|
||||||
|
|
||||||
|
16. **Async task errors silently ignored**
|
||||||
|
- `tokio::spawn` at lines 74, 83, 123
|
||||||
|
- No `JoinHandle` retention or error handling
|
||||||
|
|
||||||
|
17. **No metrics/observability**
|
||||||
|
- Only log output
|
||||||
|
- No Prometheus metrics for state transitions, failure counts, etc.
|
||||||
|
|
||||||
|
18. **Hardcoded main() function** (`agent_loop.rs::main`)
|
||||||
|
- Not production-ready entry point
|
||||||
|
- Should load config from environment or file
|
||||||
|
|
||||||
|
19. **Store factory pattern missing**
|
||||||
|
- TODO comment at line 54 confirms this
|
||||||
|
- Can't switch between stores via config
|
||||||
|
|
||||||
|
20. **No backoff/retry logic for NATS operations**
|
||||||
|
- Transient failures could trigger unnecessary fencing
|
||||||
|
|
||||||
|
21. **`AgentInfo` status is hardcoded to "HEALTHY"**
|
||||||
|
- Line 137 in `store_heartbeat`
|
||||||
|
- Should反映 actual workflow state
|
||||||
|
|
||||||
|
22. **Unused fields in structs**
|
||||||
|
- `HeartbeatState.last_seq` set but never read
|
||||||
|
- `ClusterState.current_primary` set but never read
|
||||||
|
|
||||||
|
## ADR-017-3 Compliance Issues
|
||||||
|
|
||||||
|
23. **ADR violation: Clock skew not avoided**
|
||||||
|
- While ADR says use store metadata, code uses local time
|
||||||
|
|
||||||
|
24. **Failover timeout not configurable**
|
||||||
|
- Defined in ADR but not in `AgentConfig`
|
||||||
|
- Needed for replica staleness calculation
|
||||||
|
|
||||||
|
25. **Safety margin concept exists in ADR but not in code**
|
||||||
|
- Configuration should include this margin
|
||||||
|
|
||||||
|
26. **No handling of Case 3 (Replica Network Lag)**
|
||||||
|
- ADR describes NATS rejection prevention
|
||||||
|
- But `set_strict` implementation accepts any write
|
||||||
|
|
||||||
|
## Code Quality Issues
|
||||||
|
|
||||||
|
27. **Inconsistent error handling**
|
||||||
|
- Some paths return `Err`, others `todo!()`, others ignore
|
||||||
|
|
||||||
|
28. **Unnecessary `Clone` bounds**
|
||||||
|
- `DeploymentConfig.clone()` used frequently
|
||||||
|
- Could be optimized with `Arc`
|
||||||
|
|
||||||
|
29. **Missing lifetime annotations**
|
||||||
|
- `KvStore::get` returns `String` key in error - inefficient
|
||||||
|
|
||||||
|
30. **No integration points mentioned**
|
||||||
|
- PostgreSQL lifecycle control implementation missing
|
||||||
|
- Fencing via CNPG not connected
|
||||||
|
|
||||||
|
## Production Readiness Checklist Summary
|
||||||
|
|
||||||
|
For battle testing preparation, you need:
|
||||||
|
|
||||||
|
**Immediate ( blockers):**
|
||||||
|
- Fix NATS store metadata usage (issues #1, #2)
|
||||||
|
- Implement strict set_strict with actual CAS (#1)
|
||||||
|
- Implement replica primary watching (#4, #5)
|
||||||
|
- Add failover_timeout config + staleness logic (#3, #24)
|
||||||
|
- Implement subscribe mechanism with callbacks (#8)
|
||||||
|
|
||||||
|
**High priority:**
|
||||||
|
- Complete all workflow transitions (#5, #7, #11-14)
|
||||||
|
- Add cluster state tracking (#6, #9)
|
||||||
|
- Add configuration validation (#10)
|
||||||
|
- Add Replica Failed state (#6)
|
||||||
|
|
||||||
|
**Before deployment:**
|
||||||
|
- Implement graceful shutdown (#15)
|
||||||
|
- Add error handling for spawned tasks (#16)
|
||||||
|
- Remove hardcoded main function (#18)
|
||||||
|
- Implement store factory (#19)
|
||||||
|
- Add Prometheus metrics (#17)
|
||||||
|
|
||||||
|
**Documentation:**
|
||||||
|
- Document all configuration parameters and their trade-offs
|
||||||
|
- Add runbooks for each failure mode
|
||||||
|
- Document battle test scenarios to cover
|
||||||
|
|
||||||
|
### Addendum: Missing Critical Issues
|
||||||
|
|
||||||
|
#### 1. CRITICAL: Heartbeat "Lying" (Data Integrity)
|
||||||
|
* **Location:** `agent_loop.rs` line 137 inside `store_heartbeat`.
|
||||||
|
* **The Bug:** `status: "HEALTHY".to_string()` is hardcoded.
|
||||||
|
* **The Impact:** The agent loop runs regardless of the workflow state. If the Primary transitions to `Fenced` or `Failed`, it continues to write a heartbeat saying "I am HEALTHY".
|
||||||
|
* **The Fix:** The `store_heartbeat` function must accept the current status from the `workflow` (e.g., `self.workflow.status()`) to serialize into the JSON. A fenced agent must broadcast "FENCED" or stop writing entirely.
|
||||||
|
|
||||||
|
#### 2. CRITICAL: Async Task Race Conditions (State Machine Corruption)
|
||||||
|
* **Location:** `workflow/primary.rs` lines 74, 83, 123 (`tokio::spawn`).
|
||||||
|
* **The Bug:** The callbacks (`on_active`, `on_failover`) are spawned as fire-and-forget background tasks.
|
||||||
|
* **Scenario:**
|
||||||
|
1. Primary fails -> transitions to `Fenced` -> spawns `on_failover` (takes 5s).
|
||||||
|
2. Network recovers immediately -> transitions to `Healthy` -> spawns `on_active` (takes 1s).
|
||||||
|
3. `on_active` finishes *before* `on_failover`.
|
||||||
|
4. `on_failover` finishes last, killing the DB *after* the agent decided it was healthy.
|
||||||
|
* **The Fix:** You need a `JoinHandle` or a cancellation token. When transitioning states, any pending conflicting background tasks must be aborted before starting the new one.
|
||||||
|
|
||||||
|
#### 3. CRITICAL: Zombie Leader Prevention (Split Brain Risk)
|
||||||
|
* **Location:** `agent_loop.rs` loop logic.
|
||||||
|
* **The Bug:** There is no "Stop the World" gate.
|
||||||
|
* **Scenario:** If `store_heartbeat` fails (NATS unreachable), the code returns `Err`, triggers `handle_heartbeat_failure`, and the loop *continues*.
|
||||||
|
* **The Risk:** If the NATS write fails because of a CAS error (meaning a Replica has already promoted), this Primary is now a Zombie. It *must* immediately cease all operations. The current loop just sleeps and tries again.
|
||||||
|
* **The Fix:** If `store_heartbeat` returns a `SequenceMismatch` error, the agent must treat this as a fatal demotion event, immediately fencing itself, rather than just incrementing a failure counter.
|
||||||
|
|
||||||
|
#### 4. HIGH: NATS Bucket Name Collision
|
||||||
|
* **Location:** `agent_loop.rs` (Config) vs `store/nats.rs`.
|
||||||
|
* **The Bug:** `FailoverCNPGConfig` has `cnpg_cluster_name`, and `AgentConfig` has `cluster_id`.
|
||||||
|
* **The Impact:** If you run two different Harmony clusters on the same NATS server, and they use the same bucket name logic (or hardcoded names), they will overwrite each other's state.
|
||||||
|
* **The Fix:** The NATS KV bucket name must be namespaced dynamically, e.g., `format!("harmony_{}", config.cluster_id)`.
|
||||||
|
|
||||||
|
#### 5. HIGH: Startup State Reconciliation
|
||||||
|
* **Location:** `HarmonyAgent::new`.
|
||||||
|
* **The Bug:** Agents always start in `Initializing`.
|
||||||
|
* **Scenario:** The process crashes while it is the `Leader`. It restarts. It enters `Initializing`. It doesn't know it *should* be the leader.
|
||||||
|
* **The Impact:** The cluster might be leaderless until the `failover_timeout` expires, causing unnecessary downtime.
|
||||||
|
* **The Fix:** On startup, the agent must fetch the `ClusterState` from NATS. If `current_primary == my_id`, it should jump directly to `Healthy`/`Leader` state (possibly after a sanity check).
|
||||||
|
|
||||||
|
### Summary of Tasks to Add
|
||||||
|
|
||||||
|
Please add these to your master list before starting implementation:
|
||||||
|
|
||||||
|
28. **Dynamic Heartbeat Status:** Pass workflow state to `store_heartbeat` to prevent Fenced nodes from reporting "HEALTHY".
|
||||||
|
29. **Async Task Cancellation:** Implement `AbortHandle` for `on_active`/`on_failover` tasks to prevent race conditions during rapid state flapping.
|
||||||
|
30. **Fatal CAS Handling:** Treat `SequenceMismatch` in `store_heartbeat` as an immediate "I have been replaced" signal (Zombie detection).
|
||||||
|
31. **NATS Namespace Isolation:** Ensure KV bucket names include `cluster_id`.
|
||||||
|
32. **Startup Reconciliation:** Check NATS on boot to restore previous state if valid.
|
||||||
|
|
||||||
|
* **Think about vacuum / stop-the-world operations**
|
||||||
|
|
||||||
20
harmony_agent/deploy/Cargo.toml
Normal file
20
harmony_agent/deploy/Cargo.toml
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
[package]
|
||||||
|
name = "harmony_agent_deploy"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
harmony = { path = "../../harmony" }
|
||||||
|
harmony_cli = { path = "../../harmony_cli" }
|
||||||
|
harmony_types = { path = "../../harmony_types" }
|
||||||
|
harmony_macros = { path = "../../harmony_macros" }
|
||||||
|
cidr = { workspace = true }
|
||||||
|
tokio = { workspace = true }
|
||||||
|
log = { workspace = true }
|
||||||
|
env_logger = { workspace = true }
|
||||||
|
url = { workspace = true }
|
||||||
|
|
||||||
|
serde.workspace = true
|
||||||
|
serde_json.workspace = true
|
||||||
63
harmony_agent/deploy/src/main.rs
Normal file
63
harmony_agent/deploy/src/main.rs
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
use harmony::{
|
||||||
|
inventory::Inventory,
|
||||||
|
modules::{
|
||||||
|
application::{
|
||||||
|
ApplicationScore,
|
||||||
|
backend_app::{BackendApp, BuildCommand},
|
||||||
|
features::{Monitoring, PackagingDeployment},
|
||||||
|
},
|
||||||
|
monitoring::alert_channel::discord_alert_channel::DiscordWebhook,
|
||||||
|
},
|
||||||
|
topology::K8sAnywhereTopology,
|
||||||
|
};
|
||||||
|
use harmony_macros::hurl;
|
||||||
|
use harmony_types::k8s_name::K8sName;
|
||||||
|
use std::{path::PathBuf, sync::Arc};
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let application = Arc::new(BackendApp {
|
||||||
|
name: "harmony-agent".to_string(),
|
||||||
|
// Since harmony_agent is part of the harmony workspace, the actual "project root"
|
||||||
|
// is not harmony_agent folder but the workspace root.
|
||||||
|
//
|
||||||
|
// So using ../ here means we MUST run this deployment script from the harmony_agent
|
||||||
|
// folder
|
||||||
|
project_root: PathBuf::from("../"),
|
||||||
|
network_ports: vec![],
|
||||||
|
env_vars: vec![
|
||||||
|
("NATS_URL".to_string(), "nats://nats".to_string()),
|
||||||
|
("DESIRED_PRIMARY".to_string(), "site-1".to_string()),
|
||||||
|
("MY_CLUSTER_ID".to_string(), "site-1".to_string()),
|
||||||
|
("NATS_CREDS_PATH".to_string(), "".to_string()),
|
||||||
|
],
|
||||||
|
build_cmd: BuildCommand::new("cargo", vec!["build", "--release", "-p", "harmony_agent"]),
|
||||||
|
dockerfile: Some(PathBuf::from("Dockerfile")),
|
||||||
|
});
|
||||||
|
|
||||||
|
let app = ApplicationScore {
|
||||||
|
features: vec![
|
||||||
|
Box::new(PackagingDeployment {
|
||||||
|
application: application.clone(),
|
||||||
|
}),
|
||||||
|
Box::new(Monitoring {
|
||||||
|
application: application.clone(),
|
||||||
|
alert_receiver: vec![Box::new(DiscordWebhook {
|
||||||
|
name: K8sName("test-discord".to_string()),
|
||||||
|
url: hurl!("https://discord.doesnt.exist.com"),
|
||||||
|
selectors: vec![],
|
||||||
|
})],
|
||||||
|
}),
|
||||||
|
],
|
||||||
|
application,
|
||||||
|
};
|
||||||
|
|
||||||
|
harmony_cli::run(
|
||||||
|
Inventory::autoload(),
|
||||||
|
K8sAnywhereTopology::from_env(), // <== Deploy to local automatically provisioned k3d by default or connect to any kubernetes cluster
|
||||||
|
vec![Box::new(app)],
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
79
harmony_agent/src/agent/config.rs
Normal file
79
harmony_agent/src/agent/config.rs
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use harmony_types::id::Id;
|
||||||
|
use log::info;
|
||||||
|
|
||||||
|
use super::heartbeat::HeartbeatFailure;
|
||||||
|
use super::role::AgentRole;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct AgentConfig {
|
||||||
|
/// Number of consecutive successful heartbeats required before the service transitions from
|
||||||
|
/// failed to healthy.
|
||||||
|
pub success_threshold: usize,
|
||||||
|
/// Number of consecutive failed heartbeats required before the service transitions from
|
||||||
|
/// healthy to failed.
|
||||||
|
pub failure_threshold: usize,
|
||||||
|
/// Time between each heartbeat. If a heartbeat takes longer than this, it will be
|
||||||
|
/// considered failed.
|
||||||
|
pub heartbeat_interval: Duration,
|
||||||
|
/// Time since last observed primary heartbeat before replica considers primary stale.
|
||||||
|
/// This must be configured such that failover_timeout > heartbeat_interval * failure_threshold + safety_margin
|
||||||
|
/// to avoid split brain during network partitions.
|
||||||
|
pub failover_timeout: Duration,
|
||||||
|
/// **UNSTABLE FIELD**
|
||||||
|
///
|
||||||
|
/// For now, an agent instance only serves one deployment. This is probably fine as an agent's
|
||||||
|
/// footprint is low, but managing multiple deployments in a single instance would be a
|
||||||
|
/// significant resource usage reduction.
|
||||||
|
///
|
||||||
|
/// Decoupling the deployment of the agent with the application's deployment could make things
|
||||||
|
/// more complicated though, where we would have to be careful about version compatibility
|
||||||
|
/// between all components managed by the agent instance. So for now it is a 1-1 map.
|
||||||
|
///
|
||||||
|
/// But I have a feeling this could change so I am marking this field unstable to warn you, the
|
||||||
|
/// reader.
|
||||||
|
pub deployment_config_unstable: DeploymentConfig,
|
||||||
|
pub nats_url: String,
|
||||||
|
pub nats_creds_path: Option<String>,
|
||||||
|
pub agent_id: Id,
|
||||||
|
pub cluster_id: Id,
|
||||||
|
pub desired_primary_id: Id,
|
||||||
|
/// The role this agent plays (Primary or Replica)
|
||||||
|
pub role: AgentRole,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum DeploymentConfig {
|
||||||
|
FailoverPostgreSQL(FailoverCNPGConfig),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct FailoverCNPGConfig {
|
||||||
|
pub cnpg_cluster_name: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeploymentConfig {
|
||||||
|
/// The actual "work" of the heartbeat (e.g., write to NATS, check Postgres)
|
||||||
|
pub async fn perform_heartbeat(&self) -> Result<(), HeartbeatFailure> {
|
||||||
|
match self {
|
||||||
|
DeploymentConfig::FailoverPostgreSQL(cfg) => {
|
||||||
|
info!("Checking PG Cluster: {}", cfg.cnpg_cluster_name);
|
||||||
|
// TODO: Implement actual PG check / NATS write here
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Callback: Transitioned from Unhealthy -> Healthy
|
||||||
|
pub async fn on_active(&self) {
|
||||||
|
info!("Service is now ACTIVE (Healthy)");
|
||||||
|
// e.g., Remove fencing lock
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Callback: Transitioned from Healthy -> Unhealthy
|
||||||
|
pub async fn on_failover(&self) {
|
||||||
|
info!("Service is now FAILED (Unhealthy)");
|
||||||
|
// e.g., Initiate self-fencing, stop accepting traffic
|
||||||
|
}
|
||||||
|
}
|
||||||
35
harmony_agent/src/agent/heartbeat.rs
Normal file
35
harmony_agent/src/agent/heartbeat.rs
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
use harmony_types::id::Id;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::store::KvMetadata;
|
||||||
|
|
||||||
|
/// Agent-provided heartbeat information (no timestamps - those come from the store)
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
|
pub struct AgentInfo {
|
||||||
|
pub agent_id: Id,
|
||||||
|
pub cluster_id: Id,
|
||||||
|
pub status: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Complete heartbeat with both agent data and store metadata
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||||
|
pub struct AgentHeartbeat {
|
||||||
|
pub agent_info: AgentInfo,
|
||||||
|
pub metadata: Option<KvMetadata>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||||
|
pub struct ClusterStateData {
|
||||||
|
pub cluster_info: ClusterState,
|
||||||
|
pub metadata: Option<KvMetadata>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||||
|
pub struct ClusterState {
|
||||||
|
pub cluster_id: Id,
|
||||||
|
pub current_primary: Option<Id>,
|
||||||
|
pub desired_primary: Id,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct HeartbeatFailure {}
|
||||||
507
harmony_agent/src/agent/mod.rs
Normal file
507
harmony_agent/src/agent/mod.rs
Normal file
@@ -0,0 +1,507 @@
|
|||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
use std::{str::FromStr, sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
use harmony_types::id::Id;
|
||||||
|
use log::{debug, error, info, trace, warn};
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
use tokio::time::{Instant, sleep};
|
||||||
|
|
||||||
|
use crate::agent::heartbeat::ClusterState;
|
||||||
|
use crate::store::{KvMetadata, KvStore, KvStoreError};
|
||||||
|
use crate::workflow::HeartbeatWorkflow;
|
||||||
|
use crate::workflow::primary::PrimaryWorkflow;
|
||||||
|
use crate::workflow::replica::ReplicaWorkflow;
|
||||||
|
|
||||||
|
// Submodules
|
||||||
|
mod config;
|
||||||
|
pub mod heartbeat;
|
||||||
|
mod role;
|
||||||
|
|
||||||
|
// Re-exports for backwards compatibility
|
||||||
|
pub use config::{AgentConfig, DeploymentConfig, FailoverCNPGConfig};
|
||||||
|
pub use heartbeat::{AgentHeartbeat, AgentInfo, ClusterStateData, HeartbeatFailure};
|
||||||
|
pub use role::AgentRole;
|
||||||
|
|
||||||
|
pub async fn launch_agent<S>(
|
||||||
|
role: AgentRole,
|
||||||
|
health_kv: Arc<S>,
|
||||||
|
cluster_kv: Arc<S>,
|
||||||
|
heartbeat_interval: Duration,
|
||||||
|
failover_timeout: Duration,
|
||||||
|
) -> Result<(), Box<dyn std::error::Error>>
|
||||||
|
where
|
||||||
|
S: KvStore + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
// Cheap ass fix when we boot two agents at the same time and the store does not exist, delay
|
||||||
|
// one so they don't crash because of the race
|
||||||
|
match role {
|
||||||
|
AgentRole::Primary => {}
|
||||||
|
AgentRole::Replica => {
|
||||||
|
sleep(Duration::from_millis(100)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let my_agent_name = format!("agent-{}", role);
|
||||||
|
let my_agent_id = Id::from_str(&my_agent_name).unwrap();
|
||||||
|
|
||||||
|
let config = AgentConfig {
|
||||||
|
role,
|
||||||
|
success_threshold: 2,
|
||||||
|
failure_threshold: 2,
|
||||||
|
heartbeat_interval,
|
||||||
|
failover_timeout,
|
||||||
|
deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
|
||||||
|
cnpg_cluster_name: String::from("cnpg_cluster_name"),
|
||||||
|
}),
|
||||||
|
nats_url: String::new(),
|
||||||
|
nats_creds_path: None,
|
||||||
|
agent_id: my_agent_id,
|
||||||
|
cluster_id: "cluster_test_id".into(),
|
||||||
|
desired_primary_id: "primary_id".into(),
|
||||||
|
};
|
||||||
|
|
||||||
|
log::info!("Harmony Agent Initialized");
|
||||||
|
log::info!("Initializing Harmony Agent Id : {}", config.agent_id);
|
||||||
|
log::info!("Full config : {:?}", config);
|
||||||
|
|
||||||
|
// TODO load store based on config, default to nats
|
||||||
|
// probably a good use case for a factory pattern
|
||||||
|
|
||||||
|
let mut agent = HarmonyAgent::new(config, health_kv, cluster_kv);
|
||||||
|
|
||||||
|
agent.reconcile_startup().await?;
|
||||||
|
|
||||||
|
// Run the heartbeat loop
|
||||||
|
agent.run_heartbeat_loop().await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct HarmonyAgent<S: KvStore> {
|
||||||
|
pub config: AgentConfig,
|
||||||
|
workflow: Box<dyn HeartbeatWorkflow>,
|
||||||
|
health_kv: Arc<S>,
|
||||||
|
cluster_kv: Arc<S>,
|
||||||
|
/// Last successful heartbeat, used to track sequence number for next write
|
||||||
|
/// This avoids doing a GET before every SET, reducing network round-trips
|
||||||
|
last_heartbeat: Arc<RwLock<Option<AgentHeartbeat>>>,
|
||||||
|
/// Local copy of cluster state, updated via subscription
|
||||||
|
/// This allows workflows to make decisions without querying NATS each time
|
||||||
|
cluster_state: Arc<RwLock<Option<ClusterStateData>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<S: KvStore + Send + Sync + 'static> HarmonyAgent<S> {
|
||||||
|
pub fn new(config: AgentConfig, health_kv: Arc<S>, cluster_kv: Arc<S>) -> Self {
|
||||||
|
let workflow: Box<dyn HeartbeatWorkflow> = match config.role {
|
||||||
|
AgentRole::Primary => {
|
||||||
|
info!("Initializing agent as PRIMARY");
|
||||||
|
Box::new(PrimaryWorkflow::new(
|
||||||
|
config.success_threshold,
|
||||||
|
config.failure_threshold,
|
||||||
|
config.deployment_config_unstable.clone(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
AgentRole::Replica => {
|
||||||
|
info!("Initializing agent as REPLICA");
|
||||||
|
Box::new(ReplicaWorkflow::new(
|
||||||
|
config.success_threshold,
|
||||||
|
config.failure_threshold,
|
||||||
|
config.cluster_id.clone(),
|
||||||
|
config.desired_primary_id.clone(),
|
||||||
|
config.agent_id.clone(),
|
||||||
|
config.failover_timeout,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Self {
|
||||||
|
config,
|
||||||
|
workflow,
|
||||||
|
health_kv,
|
||||||
|
cluster_kv,
|
||||||
|
last_heartbeat: Arc::new(RwLock::new(None)),
|
||||||
|
cluster_state: Arc::new(RwLock::new(None)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generic helper to fetch and deserialize data from KV store
|
||||||
|
/// Returns Ok(Some(data)) if key exists and deserializes successfully
|
||||||
|
/// Returns Ok(None) if key doesn't exist
|
||||||
|
/// Returns Err if deserialization fails or other errors occur
|
||||||
|
async fn fetch_from_store<D>(
|
||||||
|
&self,
|
||||||
|
store: &Arc<S>,
|
||||||
|
key: &str,
|
||||||
|
) -> Result<Option<(D, KvMetadata)>, KvStoreError>
|
||||||
|
where
|
||||||
|
D: serde::de::DeserializeOwned,
|
||||||
|
{
|
||||||
|
debug!("Fetching data from key: {}", key);
|
||||||
|
|
||||||
|
let result = store.get(key).await;
|
||||||
|
debug!("Got result from store: {:#?}", result);
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok(kv_result) => {
|
||||||
|
if let Some(value) = kv_result.value {
|
||||||
|
match serde_json::from_value::<D>(value.clone()) {
|
||||||
|
Ok(data) => Ok(Some((data, kv_result.metadata))),
|
||||||
|
Err(e) => {
|
||||||
|
log::warn!("Failed to deserialize data from key {}: {}", key, e);
|
||||||
|
Err(KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: format!(
|
||||||
|
"Key exists but deserialization failed for {key}: {e}"
|
||||||
|
),
|
||||||
|
value: value.to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Err(KvStoreError::Unknown(format!(
|
||||||
|
"Key exists but value is empty for {key}, this should not happen"
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(KvStoreError::KeyNotAvailable(_)) => {
|
||||||
|
debug!("Key {} not found in store", key);
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
log::warn!("Failed to fetch data from key {}: {}", key, e);
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reconcile startup state by fetching cluster state and heartbeat from the store
|
||||||
|
/// This allows the workflow to determine if it should resume as Primary/Replica
|
||||||
|
/// based on the persisted cluster state
|
||||||
|
pub async fn reconcile_startup(&mut self) -> Result<(), KvStoreError> {
|
||||||
|
let cluster_key = format!("cluster.{}", self.config.cluster_id);
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Fetching cluster state for startup reconciliation from key: {}",
|
||||||
|
cluster_key
|
||||||
|
);
|
||||||
|
|
||||||
|
let cluster_state_option = match self
|
||||||
|
.fetch_from_store::<ClusterState>(&self.cluster_kv, &cluster_key)
|
||||||
|
.await?
|
||||||
|
{
|
||||||
|
Some((data, metadata)) => Some(ClusterStateData {
|
||||||
|
cluster_info: data,
|
||||||
|
metadata: Some(metadata),
|
||||||
|
}),
|
||||||
|
None => {
|
||||||
|
debug!(
|
||||||
|
"Cluster state key not found, this is a fresh cluster, initializing cluster state"
|
||||||
|
);
|
||||||
|
Some(self.store_cluster_state(None).await?)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!("Found cluster state {cluster_state_option:#?}");
|
||||||
|
self.workflow
|
||||||
|
.on_startup(cluster_state_option.as_ref(), &self.config)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Cache the cluster state locally
|
||||||
|
*self.cluster_state.write().await = cluster_state_option;
|
||||||
|
// Fetch last heartbeat if it exists to avoid sequence conflicts
|
||||||
|
let heartbeat_key = format!("heartbeat.{}", self.config.agent_id);
|
||||||
|
debug!("Fetching last heartbeat from key: {}", heartbeat_key);
|
||||||
|
|
||||||
|
let last_heartbeat_option = self.health_kv.get(&heartbeat_key).await;
|
||||||
|
|
||||||
|
let last_heartbeat = match last_heartbeat_option {
|
||||||
|
Ok(kv_result) => {
|
||||||
|
let value = kv_result
|
||||||
|
.value
|
||||||
|
.expect("When key exist it should always contain data");
|
||||||
|
Some(AgentHeartbeat {
|
||||||
|
agent_info: serde_json::from_value::<AgentInfo>(value.clone()).map_err(
|
||||||
|
|e| KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: value.to_string(),
|
||||||
|
},
|
||||||
|
)?,
|
||||||
|
metadata: Some(kv_result.metadata),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Err(e) => match e {
|
||||||
|
KvStoreError::KeyNotAvailable(_) => None,
|
||||||
|
_ => return Err(e),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
if let Some(heartbeat) = &last_heartbeat {
|
||||||
|
debug!(
|
||||||
|
"Found existing heartbeat with sequence: {}",
|
||||||
|
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
debug!("No existing heartbeat found, starting fresh");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cache the last heartbeat for sequence tracking
|
||||||
|
*self.last_heartbeat.write().await = last_heartbeat;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn store_cluster_state(
|
||||||
|
&self,
|
||||||
|
cluster_data: Option<ClusterStateData>,
|
||||||
|
) -> Result<ClusterStateData, KvStoreError> {
|
||||||
|
let key = format!("cluster.{}", self.config.cluster_id);
|
||||||
|
match cluster_data {
|
||||||
|
Some(cluster_data) => {
|
||||||
|
debug!("found some cluster state {:#?}", cluster_data);
|
||||||
|
|
||||||
|
let value = serde_json::to_value(&cluster_data.cluster_info).map_err(|e| {
|
||||||
|
KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: format!("{:?}", cluster_data),
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let expected_sequence = {
|
||||||
|
let last = self.cluster_state.read().await;
|
||||||
|
last.as_ref()
|
||||||
|
.and_then(|hb| hb.metadata.as_ref())
|
||||||
|
.map(|m| m.sequence)
|
||||||
|
.unwrap_or(0)
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!("expected sequence {:#?}", expected_sequence);
|
||||||
|
let new_seq = self
|
||||||
|
.cluster_kv
|
||||||
|
.set_strict(&key, value, expected_sequence)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
|
||||||
|
debug!("cluster kv {:#?}", cluster_kv_result);
|
||||||
|
|
||||||
|
let cluster_data_new = ClusterStateData {
|
||||||
|
cluster_info: cluster_data.cluster_info.clone(),
|
||||||
|
metadata: Some(cluster_kv_result.metadata),
|
||||||
|
};
|
||||||
|
|
||||||
|
*self.cluster_state.write().await = Some(cluster_data_new.clone());
|
||||||
|
Ok(cluster_data)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let cluster_info = ClusterState {
|
||||||
|
cluster_id: self.config.cluster_id.clone(),
|
||||||
|
current_primary: None,
|
||||||
|
desired_primary: self.config.desired_primary_id.clone(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let value = serde_json::to_value(&cluster_info).map_err(|e| {
|
||||||
|
KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: format!("{:?}", cluster_info),
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let cluster_data = ClusterStateData {
|
||||||
|
cluster_info,
|
||||||
|
metadata: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let new_seq = self.cluster_kv.set_strict(&key, value, 0).await?;
|
||||||
|
|
||||||
|
let cluster_kv_result = self.cluster_kv.get_revision(&key, new_seq).await?;
|
||||||
|
debug!("cluster kv {:#?}", cluster_kv_result);
|
||||||
|
|
||||||
|
let cluster_data_new = ClusterStateData {
|
||||||
|
cluster_info: cluster_data.cluster_info.clone(),
|
||||||
|
metadata: Some(cluster_kv_result.metadata),
|
||||||
|
};
|
||||||
|
|
||||||
|
*self.cluster_state.write().await = Some(cluster_data_new.clone());
|
||||||
|
Ok(cluster_data_new)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sends agent heartbeat to the KV store
|
||||||
|
///
|
||||||
|
/// Note: We only send AgentInfo. The store will add HeartbeatMetadata (timestamp, sequence)
|
||||||
|
/// to avoid clock skew issues. This follows the ADR-017-3 principle that all timestamp
|
||||||
|
/// comparisons use the store's clock, not agent clocks.
|
||||||
|
///
|
||||||
|
/// This method uses the last successful heartbeat's sequence number to avoid an extra
|
||||||
|
/// GET call before each SET, reducing network round-trips and latency exposure.
|
||||||
|
async fn store_heartbeat(&self) -> Result<AgentHeartbeat, KvStoreError> {
|
||||||
|
let key = format!("heartbeat.{}", self.config.agent_id);
|
||||||
|
|
||||||
|
// Create agent info WITHOUT timestamp - the store will add metadata
|
||||||
|
// Use workflow state to report actual status (e.g. Primary:Fenced, Replica:Watching)
|
||||||
|
let agent_info = AgentInfo {
|
||||||
|
agent_id: self.config.agent_id.clone(),
|
||||||
|
cluster_id: self.config.cluster_id.clone(),
|
||||||
|
status: self.workflow.state_name().to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!("Storing heartbeat for agent: {}", self.config.agent_id);
|
||||||
|
let value =
|
||||||
|
serde_json::to_value(&agent_info).map_err(|e| KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: format!("{:?}", agent_info),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let expected_sequence = {
|
||||||
|
let last = self.last_heartbeat.read().await;
|
||||||
|
last.as_ref()
|
||||||
|
.and_then(|hb| hb.metadata.as_ref())
|
||||||
|
.map(|m| m.sequence)
|
||||||
|
.unwrap_or(0)
|
||||||
|
};
|
||||||
|
|
||||||
|
trace!("Writing new heartbeat {key} (#{expected_sequence}), value: {value:?}");
|
||||||
|
let new_seq = self
|
||||||
|
.health_kv
|
||||||
|
.set_strict(&key, value, expected_sequence)
|
||||||
|
.await?;
|
||||||
|
trace!("Got new sequence {new_seq}");
|
||||||
|
let kv_result = self.health_kv.get_revision(&key, new_seq).await?;
|
||||||
|
|
||||||
|
debug!("Heartbeat stored succsssfully with sequence: {}", new_seq);
|
||||||
|
|
||||||
|
// Construct complete heartbeat with metadata from store
|
||||||
|
let heartbeat = AgentHeartbeat {
|
||||||
|
agent_info,
|
||||||
|
metadata: Some(kv_result.metadata),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Cache this successful heartbeat for next iteration
|
||||||
|
*self.last_heartbeat.write().await = Some(heartbeat.clone());
|
||||||
|
|
||||||
|
Ok(heartbeat)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn run_heartbeat_loop(&mut self) {
|
||||||
|
let mut next_heartbeat_start;
|
||||||
|
loop {
|
||||||
|
let this_heartbeat_start = Instant::now();
|
||||||
|
next_heartbeat_start = this_heartbeat_start + self.config.heartbeat_interval;
|
||||||
|
|
||||||
|
// Perform the check via the config/strategy with a timeout
|
||||||
|
//
|
||||||
|
// FIXME There is too much stuff happening inside the timeout. There are some things like a
|
||||||
|
// promotion, that we don't want to cancel within a single heartbeat interval timeout
|
||||||
|
// I think that the timeout should only apply to the store_heartbeat().await call.
|
||||||
|
// Logic happening after should not be affected in the exact same manner. There can be
|
||||||
|
// other timeouts or other stuff to consider here.
|
||||||
|
// However, the system does rely on heartbeats happening regularly, so we do not want
|
||||||
|
// to delay the next heartbeat either. This is tricky.
|
||||||
|
// An idea right now is to keep the heartbeat running but, when a processing event
|
||||||
|
// occurs, set a flag on the local agent that there is a process running (promotion,
|
||||||
|
// demotion, etc) and take no other decision until this process is not done. There is
|
||||||
|
// one exception we can think of right now :
|
||||||
|
// - a healthy primary starts running a process such as "calling mom"
|
||||||
|
// - the primary keeps sending its heartbeat to prove to the rest of the cluster that
|
||||||
|
// it is still healthy
|
||||||
|
// - then the primary heartbeat fails up to failure_threshold
|
||||||
|
// - at this moment the "calling mom" process must not prevent the primary from fencing itself. Otherwise the replica that promotes itself when it realises that the primary is dead will cause a split brain.
|
||||||
|
// - Another solution would be register the processing: "calling mom" in the primary
|
||||||
|
// heartbeat store, and prevent the replica from promoting when there is a running
|
||||||
|
// task on the primary.
|
||||||
|
let result = tokio::time::timeout(self.config.heartbeat_interval, async {
|
||||||
|
// Store heartbeat and perform deployment-specific health check
|
||||||
|
match &self.store_heartbeat().await {
|
||||||
|
Ok(heartbeat) => {
|
||||||
|
// Heartbeat stored successfully, already cached by store_heartbeat
|
||||||
|
debug!(
|
||||||
|
"Heartbeat stored: seq={}",
|
||||||
|
heartbeat.metadata.as_ref().map(|m| m.sequence).unwrap_or(0)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(KvStoreError::WrongLastRevision) => {
|
||||||
|
todo!("fetch and update correct last sequence number")
|
||||||
|
// CAS failure could indicate:
|
||||||
|
// 1. Network latency: our previous timeout heartbeat actually succeeded
|
||||||
|
// 2. Agent ID conflict: another agent with same ID exists
|
||||||
|
// 3. Clock/bucket corruption (unlikely)
|
||||||
|
|
||||||
|
// log::warn!(
|
||||||
|
// "CAS mismatch for agent {}: expected sequence {}, got {}. Possible causes: network latency, agent ID conflict, or clock issue. Updating local sequence to {}",
|
||||||
|
// self.config.agent_id, expected, current, current
|
||||||
|
// );
|
||||||
|
// // Update cached heartbeat sequence to prevent repeated failures
|
||||||
|
// if let Some(hb) = self.last_heartbeat.write().await.as_mut() {
|
||||||
|
// if let Some(metadata) = hb.metadata.as_mut() {
|
||||||
|
// metadata.sequence = *current;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Actual storage failure - treat as heartbeat failure
|
||||||
|
log::error!("Heartbeat storage error: {}", e);
|
||||||
|
return Err(HeartbeatFailure {});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.config
|
||||||
|
.deployment_config_unstable
|
||||||
|
.perform_heartbeat()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// TODO: Pass the heartbeat with metadata to the workflow for staleness checks
|
||||||
|
// The workflow needs access to metadata.timestamp for failover timeout calculations
|
||||||
|
Ok::<(), HeartbeatFailure>(())
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Update Counters & Handle State Transitions
|
||||||
|
// Timeout is also treated as a failure
|
||||||
|
let heartbeat_result = match result {
|
||||||
|
Ok(inner_result) => inner_result,
|
||||||
|
Err(_) => Err(HeartbeatFailure {}),
|
||||||
|
};
|
||||||
|
|
||||||
|
trace!("Got heartbeat_result : {heartbeat_result:?}");
|
||||||
|
match heartbeat_result {
|
||||||
|
Ok(_) => {
|
||||||
|
let new_state = self
|
||||||
|
.workflow
|
||||||
|
.handle_heartbeat_success(
|
||||||
|
self.cluster_state.read().await.as_ref(),
|
||||||
|
&self.config,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
if let Some(new_state) = new_state {
|
||||||
|
warn!("Got new cluster state : {new_state:#?}");
|
||||||
|
self.store_cluster_state(Some(new_state))
|
||||||
|
.await
|
||||||
|
.expect(&format!("cluster state not able to be stored"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
self.workflow
|
||||||
|
.handle_heartbeat_failure(self.cluster_state.read().await.as_ref())
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"Heartbeat : success={heartbeat_emoji} state={state}, successes={consecutive_successes}/{success_threshold}, fails={consecutive_failures}/{failure_threshold} took={heartbeat_duration}ms",
|
||||||
|
success_threshold = self.config.success_threshold,
|
||||||
|
failure_threshold = self.config.failure_threshold,
|
||||||
|
state = self.workflow.state_name(),
|
||||||
|
consecutive_successes = self.workflow.consecutive_successes(),
|
||||||
|
consecutive_failures = self.workflow.consecutive_failures(),
|
||||||
|
heartbeat_emoji = if heartbeat_result.is_ok() {
|
||||||
|
"✅"
|
||||||
|
} else {
|
||||||
|
"❌"
|
||||||
|
},
|
||||||
|
heartbeat_duration = (Instant::now() - this_heartbeat_start).as_millis(),
|
||||||
|
);
|
||||||
|
debug!(
|
||||||
|
"Sleeping for {} ms before next heartbeat",
|
||||||
|
(next_heartbeat_start - Instant::now()).as_millis()
|
||||||
|
);
|
||||||
|
tokio::time::sleep_until(next_heartbeat_start).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
17
harmony_agent/src/agent/role.rs
Normal file
17
harmony_agent/src/agent/role.rs
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
use std::fmt;
|
||||||
|
|
||||||
|
/// The role of this agent instance
#[derive(Debug, Clone, PartialEq)]
pub enum AgentRole {
    Primary,
    Replica,
}

impl fmt::Display for AgentRole {
    /// Renders the role as the lowercase label used in logs and keys.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            AgentRole::Primary => "primary",
            AgentRole::Replica => "replica",
        };
        f.write_str(label)
    }
}
|
||||||
90
harmony_agent/src/config.rs
Normal file
90
harmony_agent/src/config.rs
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
use harmony_types::id::Id;
|
||||||
|
use log::debug;
|
||||||
|
use std::env;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Configuration for the Harmony Agent
#[derive(Debug, Clone)]
pub struct AgentConfig {
    /// NATS server URL, e.g. "nats://localhost:4222" or "tls://…".
    pub nats_url: String,
    /// Optional path to a NATS credentials file (validated at load time).
    pub nats_creds_path: Option<String>,
    /// Identifier of the cluster this agent belongs to.
    pub my_cluster_id: Id,
    /// Identifier of the agent that should be primary when healthy.
    pub desired_primary: Id,
    /// Interval between heartbeat writes.
    pub heartbeat_interval: Duration,
}

// Names of the environment variables read by `AgentConfig::load_from_env`.
pub const NATS_URL: &str = "NATS_URL";
pub const DESIRED_PRIMARY: &str = "DESIRED_PRIMARY";
pub const MY_CLUSTER_ID: &str = "MY_CLUSTER_ID";
pub const NATS_CREDS_PATH: &str = "NATS_CREDS_PATH";
|
||||||
|
|
||||||
|
impl AgentConfig {
|
||||||
|
pub fn load_from_env() -> Result<Self, String> {
|
||||||
|
let nats_url = env::var(NATS_URL).unwrap_or_else(|_| "nats://localhost:4222".to_string());
|
||||||
|
|
||||||
|
// Validate NATS URL is not empty
|
||||||
|
if nats_url.is_empty() {
|
||||||
|
return Err(format!("{NATS_URL} cannot be empty"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate NATS URL format
|
||||||
|
if !nats_url.starts_with("nats://") && !nats_url.starts_with("tls://") {
|
||||||
|
return Err(format!(
|
||||||
|
"Invalid NATS URL format: {}. Must start with 'nats://' or 'tls://'",
|
||||||
|
nats_url
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let nats_creds_path = env::var(NATS_CREDS_PATH)
|
||||||
|
.ok()
|
||||||
|
.filter(|creds_path| !creds_path.is_empty());
|
||||||
|
|
||||||
|
// Validate NATS creds path if provided
|
||||||
|
if let Some(creds_path) = &nats_creds_path {
|
||||||
|
debug!("Validating nats creds path from env var {NATS_CREDS_PATH} : {nats_creds_path:?}");
|
||||||
|
let path = Path::new(creds_path);
|
||||||
|
if !path.exists() {
|
||||||
|
return Err(format!(
|
||||||
|
"NATS credentials file does not exist: {}",
|
||||||
|
creds_path
|
||||||
|
));
|
||||||
|
}
|
||||||
|
if !path.is_file() {
|
||||||
|
return Err(format!(
|
||||||
|
"NATS credentials path is not a file: {}",
|
||||||
|
creds_path
|
||||||
|
));
|
||||||
|
}
|
||||||
|
// Check if file is readable by attempting to read metadata
|
||||||
|
if std::fs::metadata(path).is_err() {
|
||||||
|
return Err(format!(
|
||||||
|
"NATS credentials file is not readable: {}",
|
||||||
|
creds_path
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let my_cluster_id_str = env::var(MY_CLUSTER_ID)
|
||||||
|
.map_err(|_| "Environment variable {MY_CLUSTER_ID} is required".to_string())?;
|
||||||
|
|
||||||
|
if my_cluster_id_str.is_empty() {
|
||||||
|
return Err(format!("{MY_CLUSTER_ID} cannot be empty"));
|
||||||
|
}
|
||||||
|
|
||||||
|
let desired_primary_str = env::var(DESIRED_PRIMARY)
|
||||||
|
.map_err(|_| "Environment variable {DESIRED_PRIMARY} is required".to_string())?;
|
||||||
|
|
||||||
|
if desired_primary_str.is_empty() {
|
||||||
|
return Err(format!("{DESIRED_PRIMARY} cannot be empty"));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
nats_url,
|
||||||
|
nats_creds_path,
|
||||||
|
my_cluster_id: my_cluster_id_str.into(),
|
||||||
|
desired_primary: desired_primary_str.into(),
|
||||||
|
heartbeat_interval: Duration::from_millis(1000),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
82
harmony_agent/src/main.rs
Normal file
82
harmony_agent/src/main.rs
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
agent::AgentRole,
|
||||||
|
store::{ChaosKvStore, InMemoryKvStore, NatsKvStore},
|
||||||
|
};
|
||||||
|
|
||||||
|
// mod agent_loop;
|
||||||
|
mod agent;
|
||||||
|
pub mod store;
|
||||||
|
mod workflow;
|
||||||
|
|
||||||
|
#[tokio::main]
async fn main() {
    // Entry point: runs a Primary and a Replica agent concurrently against a
    // shared local NATS-backed KV store. Both agents in one process —
    // presumably a local demo/dev setup; TODO confirm.
    env_logger::init();

    let heartbeat_interval = Duration::from_millis(2000);
    let failover_timeout = Duration::from_secs(10);

    // Alternative backing store for chaos testing (kept for convenience):
    // let (health_kv, cluster_kv) = get_chaos_store(&heartbeat_interval, &failover_timeout);

    let nats_store = get_local_nats_store().await;
    // Both buckets share the same underlying store (Arc clones).
    let health_kv = nats_store.clone();
    let cluster_kv = nats_store.clone();

    // Run both agents to completion concurrently; their results are ignored.
    let _ = tokio::join!(
        agent::launch_agent(
            AgentRole::Primary,
            health_kv.clone(),
            cluster_kv.clone(),
            heartbeat_interval,
            failover_timeout
        ),
        agent::launch_agent(
            AgentRole::Replica,
            health_kv,
            cluster_kv,
            heartbeat_interval,
            failover_timeout
        ),
    );
}
|
||||||
|
|
||||||
|
fn get_chaos_store(
|
||||||
|
heartbeat_interval: &Duration,
|
||||||
|
failover_timeout: &Duration,
|
||||||
|
) -> (
|
||||||
|
Arc<ChaosKvStore<InMemoryKvStore>>,
|
||||||
|
Arc<ChaosKvStore<InMemoryKvStore>>,
|
||||||
|
) {
|
||||||
|
let health_kv = Arc::new(ChaosKvStore::new(
|
||||||
|
InMemoryKvStore::new(),
|
||||||
|
10,
|
||||||
|
10,
|
||||||
|
heartbeat_interval.as_millis().try_into().unwrap(),
|
||||||
|
));
|
||||||
|
let cluster_kv = Arc::new(ChaosKvStore::new(
|
||||||
|
InMemoryKvStore::new(),
|
||||||
|
5,
|
||||||
|
5,
|
||||||
|
failover_timeout.as_millis().try_into().unwrap(),
|
||||||
|
));
|
||||||
|
|
||||||
|
(health_kv, cluster_kv)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Connects to a NATS server on localhost and returns a JetStream-backed
/// KV store wrapped in an `Arc`.
///
/// NOTE(review): this panics (`unwrap`) on any connection or bucket-creation
/// failure — acceptable for a local demo entry point, not for production.
async fn get_local_nats_store() -> Arc<NatsKvStore> {
    let client = async_nats::connect("localhost").await.unwrap();
    let jetstream = async_nats::jetstream::new(client);
    // Create (or open) the "kv" bucket, keeping 10 revisions of history per key.
    let kv = jetstream
        .create_key_value(async_nats::jetstream::kv::Config {
            bucket: "kv".to_string(),
            history: 10,
            ..Default::default()
        })
        .await
        .unwrap();
    let status = kv.status().await.unwrap();
    println!("status: {:?}", status);

    Arc::new(NatsKvStore::new(kv))
}
|
||||||
142
harmony_agent/src/store/chaos.rs
Normal file
142
harmony_agent/src/store/chaos.rs
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use log::{debug, trace, warn};
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::time::Duration;
|
||||||
|
|
||||||
|
use crate::store::SubscriptionCallback;
|
||||||
|
|
||||||
|
use super::{KvStore, KvStoreError};
|
||||||
|
|
||||||
|
/// A chaos testing KV store that randomly times out or fails
|
||||||
|
/// Wraps another KvStore implementation and adds random failures
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct ChaosKvStore<T: KvStore> {
|
||||||
|
inner: Arc<T>,
|
||||||
|
timeout_probability_percent: u32,
|
||||||
|
failure_probability_percent: u32,
|
||||||
|
max_delay_ms: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: KvStore> ChaosKvStore<T> {
|
||||||
|
pub fn new(
|
||||||
|
inner: T,
|
||||||
|
timeout_probability_percent: u32,
|
||||||
|
failure_probability_percent: u32,
|
||||||
|
max_delay_ms: u64,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
inner: Arc::new(inner),
|
||||||
|
timeout_probability_percent,
|
||||||
|
failure_probability_percent,
|
||||||
|
max_delay_ms,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn maybe_chaos(&self) -> Result<(), KvStoreError> {
|
||||||
|
trace!("Calculating chaos");
|
||||||
|
// Random delay
|
||||||
|
let delay = getrandom::u64().unwrap() % self.max_delay_ms;
|
||||||
|
let delay = Duration::from_millis(delay);
|
||||||
|
trace!("Sleeping until chaos maybe happens {delay:?}");
|
||||||
|
tokio::time::sleep(delay).await;
|
||||||
|
|
||||||
|
// Random failure
|
||||||
|
let failure_random = getrandom::u32().unwrap() % 100;
|
||||||
|
if failure_random < self.failure_probability_percent {
|
||||||
|
warn!(
|
||||||
|
"Chaos causes an error : {failure_random} < {}",
|
||||||
|
self.failure_probability_percent
|
||||||
|
);
|
||||||
|
return Err(KvStoreError::Unknown(format!(
|
||||||
|
"Randomly failed thanks to chaos store with {}% chances, got {}",
|
||||||
|
self.failure_probability_percent, failure_random
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Random timeout (simulated as a very long delay)
|
||||||
|
let failure_random = getrandom::u32().unwrap() % 100;
|
||||||
|
if failure_random < self.timeout_probability_percent {
|
||||||
|
warn!(
|
||||||
|
"Chaos caused a timeout : {failure_random} < {}",
|
||||||
|
self.failure_probability_percent
|
||||||
|
);
|
||||||
|
tokio::time::sleep(Duration::from_secs(189754678456784560)).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
impl<T: KvStore + Send + Sync> KvStore for ChaosKvStore<T> {
    // Every method first rolls the chaos dice (random delay, random error,
    // or a simulated hang) and then delegates, unmodified, to the wrapped
    // store. A chaos-induced error short-circuits via `?` before the inner
    // store is touched.

    async fn get(&self, key: &str) -> Result<super::KvResult, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.get(key).await
    }

    async fn get_revision(
        &self,
        key: &str,
        expected_seq: u64,
    ) -> Result<super::KvResult, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.get_revision(key, expected_seq).await
    }

    async fn set_strict(
        &self,
        key: &str,
        value: Value,
        expected_sequence: u64,
    ) -> Result<u64, KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.set_strict(key, value, expected_sequence).await
    }

    async fn subscribe(
        &self,
        key: &str,
        callback: SubscriptionCallback,
    ) -> Result<(), KvStoreError> {
        self.maybe_chaos().await?;
        self.inner.subscribe(key, callback).await
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::store::InMemoryKvStore;
    use serde_json::json;

    // With both probabilities at 0 the wrapper must behave exactly like the
    // inner store (set then get round-trips the value).
    #[tokio::test]
    async fn test_chaos_store_with_no_chaos() {
        let inner = InMemoryKvStore::new();
        let chaos = ChaosKvStore::new(inner, 0, 0, 1);

        let value = json!({"test": "value"});
        let result = chaos.set_strict("key", value.clone(), 0).await.unwrap();
        assert_eq!(result, 1);

        let retrieved = chaos.get("key").await.unwrap();
        assert_eq!(retrieved.value, Some(value));
    }

    #[tokio::test]
    async fn test_chaos_store_with_delay() {
        let inner = InMemoryKvStore::new();
        let chaos = ChaosKvStore::new(inner, 0, 0, 100);

        let start = tokio::time::Instant::now();
        let value = json!({"test": "value"});
        chaos.set_strict("key", value, 0).await.unwrap();
        let elapsed = start.elapsed();

        // The random delay is in 0..100ms, so only an upper bound can be
        // asserted: the call must finish well within the chaos ceiling.
        assert!(
            elapsed.as_millis() < 150,
            "Should complete within reasonable time"
        );
    }
}
|
||||||
196
harmony_agent/src/store/memory.rs
Normal file
196
harmony_agent/src/store/memory.rs
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use log::{debug, trace};
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::{SystemTime, UNIX_EPOCH};
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
use crate::store::SubscriptionCallback;
|
||||||
|
|
||||||
|
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
|
||||||
|
|
||||||
|
/// An in-memory KV store that guarantees ordering like NATS JetStream
|
||||||
|
/// Each key maintains a full history of all writes, where the sequence number
|
||||||
|
/// is the length of the history (1-indexed)
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct InMemoryKvStore {
|
||||||
|
data: Arc<RwLock<HashMap<String, Vec<(Value, u64)>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl InMemoryKvStore {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
data: Arc::new(RwLock::new(HashMap::new())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the latest sequence number for a key (length of history)
|
||||||
|
pub async fn get_seq(&self, key: &str) -> Option<u64> {
|
||||||
|
self.data.read().await.get(key).map(|vec| vec.len() as u64)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the value at a specific revision for a key
|
||||||
|
pub async fn get_revision(&self, key: &str, seq: u64) -> Result<KvResult, KvStoreError> {
|
||||||
|
let data = self.data.read().await;
|
||||||
|
let entries = data
|
||||||
|
.get(key)
|
||||||
|
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
|
||||||
|
|
||||||
|
// Sequence numbers are 1-indexed, so seq must be >= 1 and <= len()
|
||||||
|
if seq == 0 || seq > entries.len() as u64 {
|
||||||
|
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let (value, timestamp) = entries[seq as usize - 1].clone();
|
||||||
|
|
||||||
|
Ok(KvResult {
|
||||||
|
value: Some(value.clone()),
|
||||||
|
metadata: KvMetadata {
|
||||||
|
timestamp,
|
||||||
|
sequence: seq,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for InMemoryKvStore {
    // Equivalent to `new()`: an empty store.
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl KvStore for InMemoryKvStore {
|
||||||
|
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
|
||||||
|
self.get_revision(key, expected_seq).await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
|
||||||
|
let data = self.data.read().await;
|
||||||
|
let entries = data
|
||||||
|
.get(key)
|
||||||
|
.ok_or_else(|| KvStoreError::KeyNotAvailable(key.to_string()))?;
|
||||||
|
|
||||||
|
let (value, timestamp) = entries.last().unwrap();
|
||||||
|
|
||||||
|
Ok(KvResult {
|
||||||
|
value: Some(value.clone()),
|
||||||
|
metadata: KvMetadata {
|
||||||
|
timestamp: *timestamp,
|
||||||
|
sequence: entries.len() as u64,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn set_strict(
|
||||||
|
&self,
|
||||||
|
key: &str,
|
||||||
|
value: Value,
|
||||||
|
expected_sequence: u64,
|
||||||
|
) -> Result<u64, KvStoreError> {
|
||||||
|
// Check current sequence (length of history for this key)
|
||||||
|
let data = self.data.read().await;
|
||||||
|
// This implemenetation does not seem to match the NATS sequence. In nats the
|
||||||
|
// sequence updates one counter per bucket. This impl creates a counter per key
|
||||||
|
let current_sequence = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
|
||||||
|
drop(data);
|
||||||
|
|
||||||
|
// Verify expected sequence matches
|
||||||
|
if current_sequence != expected_sequence {
|
||||||
|
trace!("{current_sequence} != {expected_sequence}");
|
||||||
|
return Err(KvStoreError::WrongLastRevision);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get current timestamp
|
||||||
|
let timestamp = SystemTime::now()
|
||||||
|
.duration_since(UNIX_EPOCH)
|
||||||
|
.expect("Time went backwards")
|
||||||
|
.as_millis() as u64;
|
||||||
|
|
||||||
|
// Append to the history
|
||||||
|
let mut data = self.data.write().await;
|
||||||
|
data.entry(key.to_string())
|
||||||
|
.or_insert_with(Vec::new)
|
||||||
|
.push((value.clone(), timestamp));
|
||||||
|
|
||||||
|
let new_seq = data.get(key).map(|vec| vec.len() as u64).unwrap_or(0);
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Successfully inserted {key}(rev#{new_seq}) : {value}",
|
||||||
|
value = value.to_string()
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(new_seq)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn subscribe(
|
||||||
|
&self,
|
||||||
|
key: &str,
|
||||||
|
callback: SubscriptionCallback,
|
||||||
|
) -> Result<(), KvStoreError> {
|
||||||
|
// For now, subscribe just returns the current value
|
||||||
|
// In a real implementation, this would return a stream of updates
|
||||||
|
self.get(key).await;
|
||||||
|
todo!() // register callback and call it when key is set ?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Round-trip: a first write (expected seq 0) yields sequence 1 and the
    // same value on read.
    #[tokio::test]
    async fn test_memory_store_basic() {
        let store = InMemoryKvStore::new();

        // Set a value
        let value = json!({"status": "healthy"});
        let result = store
            .set_strict("test_key", value.clone(), 0)
            .await
            .unwrap();
        assert_eq!(result, 1);

        // Get the value
        let retrieved = store.get("test_key").await.unwrap();
        assert_eq!(retrieved.value, Some(value));
        assert_eq!(retrieved.metadata.sequence, 1);
    }

    // Sequence numbers grow monotonically per key.
    #[tokio::test]
    async fn test_memory_store_sequence_numbers() {
        let store = InMemoryKvStore::new();

        let seq1 = store.set_strict("key1", json!("value1"), 0).await.unwrap();

        let seq2 = store.set_strict("key1", json!("value2"), 1).await.unwrap();

        assert!(seq2 > seq1, "Sequence numbers should increment");
    }

    // Missing keys surface as KeyNotAvailable, not a panic.
    #[tokio::test]
    async fn test_memory_store_key_not_found() {
        let store = InMemoryKvStore::new();
        let result = store.get("nonexistent").await;
        assert!(matches!(result, Err(KvStoreError::KeyNotAvailable(_))));
    }

    // CAS semantics: a write with a stale expected sequence is rejected.
    #[tokio::test]
    async fn test_memory_store_strict_ordering() {
        let store = InMemoryKvStore::new();

        // First write with sequence 0
        let result1 = store.set_strict("key", json!("value1"), 0).await.unwrap();
        assert_eq!(result1, 1);

        // Second write with correct sequence
        let result2 = store.set_strict("key", json!("value2"), 1).await.unwrap();
        assert_eq!(result2, 2);

        // Third write with wrong sequence should fail
        let result3 = store.set_strict("key", json!("value3"), 1).await;
        assert!(matches!(result3, Err(KvStoreError::WrongLastRevision)));
    }
}
|
||||||
120
harmony_agent/src/store/mod.rs
Normal file
120
harmony_agent/src/store/mod.rs
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::Value;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Handle for managing active subscriptions
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct SubscriptionHandle {
|
||||||
|
id: usize,
|
||||||
|
_phantom: std::marker::PhantomData<()>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Metadata returned by the KV store for all operations
|
||||||
|
/// Contains timing and ordering information set by the store
|
||||||
|
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||||
|
pub struct KvMetadata {
|
||||||
|
/// Timestamp set by the store (milliseconds since UNIX epoch)
|
||||||
|
pub timestamp: u64,
|
||||||
|
/// Sequence number for strict ordering guarantees
|
||||||
|
pub sequence: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result returned by KV store operations
/// Contains both the value (if any) and store metadata
#[derive(Debug, Clone)]
pub struct KvResult {
    /// The value from the store (None if key doesn't exist)
    pub value: Option<Value>,
    /// Store-provided metadata (timestamp, sequence)
    pub metadata: KvMetadata,
}
|
||||||
|
|
||||||
|
/// Callback type for subscription updates
/// Callback receives: key, new value (None if deleted), and metadata
/// The callback must be `Send + Sync` because store implementations may
/// invoke it from a background task.
pub type SubscriptionCallback = Box<dyn Fn(String, Option<Value>, KvMetadata) + Send + Sync>;
|
||||||
|
|
||||||
|
/// Errors surfaced by [`KvStore`] implementations.
#[derive(Error, Debug)]
pub enum KvStoreError {
    /// Transport-level failure talking to the backing store.
    #[error("data store disconnected")]
    Disconnect(#[from] std::io::Error),
    /// The key is not accepted by the backing store.
    #[error("invalid key")]
    InvalidKey,
    /// The operation did not complete within the store's deadline.
    #[error("operation timed out")]
    Timeout,
    /// No entry exists for the requested key (or key/revision pair).
    #[error("the data for key `{0}` is not available")]
    KeyNotAvailable(String),
    /// A payload could not be serialized to / deserialized from JSON.
    #[error("Failed to deserialize value to json. Error {0} , value: {1}", .deserialization_error, .value)]
    DeserializationFailed {
        // Human-readable serde error text.
        deserialization_error: String,
        // Lossy UTF-8 rendering of the offending payload, for diagnostics.
        value: String,
    },
    /// Compare-and-set failed: the key's current sequence did not match the
    /// caller's `expected_sequence` (see [`KvStore::set_strict`]).
    #[error("Strict ordering violation, wrong last sequence number")]
    WrongLastRevision,
    /// Any backend error not covered by a more specific variant.
    #[error("unknown data store error {0}")]
    Unknown(String),
}
|
||||||
|
|
||||||
|
#[async_trait]
pub trait KvStore {
    /// Get a value from the store
    ///
    /// # Returns
    /// - `Ok(KvResult)`: Contains the value and metadata (timestamp, sequence)
    /// - `Err(KeyNotAvailable)`: If the key doesn't exist
    async fn get(&self, key: &str) -> Result<KvResult, KvStoreError>;

    /// Get the value a key held at a specific sequence (revision) number
    ///
    /// # Returns
    /// - `Ok(KvResult)`: The value and metadata for that revision
    /// - `Err(KeyNotAvailable)`: If no entry exists for that key/revision
    async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError>;

    /// Strict set operation with compare-and-set semantics
    ///
    /// Sets the value only if the current sequence number matches `expected_sequence`.
    /// This provides strict ordering guarantees needed for the failover algorithm.
    ///
    /// # Parameters
    /// - `key`: The key to set
    /// - `value`: The value to store
    /// - `expected_sequence`: The sequence number we expect the key to currently have.
    ///   Use 0 for the first write to a new key.
    ///
    /// # Returns
    /// - `Ok(u64)`: Returns the new sequence number
    /// - `Err(KvStoreError)`: If another write happened (current != expected)
    ///
    /// # Example Use Case
    /// For NATS JetStream, this maps to the conditional update operation that ensures
    /// only one agent can successfully promote to primary.
    async fn set_strict(
        &self,
        key: &str,
        value: Value,
        expected_sequence: u64,
    ) -> Result<u64, KvStoreError>;

    /// Subscribe to updates for a key
    ///
    /// # Parameters
    /// - `key`: The key to subscribe to
    /// - `callback`: Function to call on each update with key, value, and metadata
    ///
    /// # Returns
    /// - `Ok(())`: Subscription established successfully
    /// - `Err(KvStoreError)`: Subscription failed
    ///
    /// Note: For JetStream, this should use watch() API. Updates will invoke the callback
    /// asynchronously in the background.
    async fn subscribe(
        &self,
        key: &str,
        callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
        // callback
    ) -> Result<(), KvStoreError>;
}
|
||||||
|
|
||||||
|
mod chaos;
|
||||||
|
mod memory;
|
||||||
|
mod nats;
|
||||||
|
|
||||||
|
pub use chaos::ChaosKvStore;
|
||||||
|
pub use memory::InMemoryKvStore;
|
||||||
|
pub use nats::NatsKvStore;
|
||||||
179
harmony_agent/src/store/nats.rs
Normal file
179
harmony_agent/src/store/nats.rs
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
use async_nats::jetstream::kv::{Store, UpdateError};
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use log::{debug, error, trace};
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use crate::store::SubscriptionCallback;
|
||||||
|
|
||||||
|
use super::{KvMetadata, KvResult, KvStore, KvStoreError};
|
||||||
|
|
||||||
|
/// NATS JetStream-backed KV store
///
/// Thin wrapper over a JetStream KV bucket ([`Store`]) implementing the
/// store-agnostic [`KvStore`] trait.
pub struct NatsKvStore {
    // The underlying JetStream KV bucket handle.
    store: Store,
}
|
||||||
|
|
||||||
|
impl NatsKvStore {
|
||||||
|
pub fn new(store: Store) -> Self {
|
||||||
|
Self { store }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn create(
|
||||||
|
client: async_nats::Client,
|
||||||
|
bucket_name: &str,
|
||||||
|
history_size: i64,
|
||||||
|
) -> Result<Self, Box<dyn std::error::Error>> {
|
||||||
|
let jetstream = async_nats::jetstream::new(client);
|
||||||
|
|
||||||
|
debug!("Creating NATS KV bucket: {}", bucket_name);
|
||||||
|
let store = jetstream
|
||||||
|
.create_key_value(async_nats::jetstream::kv::Config {
|
||||||
|
bucket: bucket_name.to_string(),
|
||||||
|
history: history_size,
|
||||||
|
..Default::default()
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
error!(
|
||||||
|
"Failed to initialize NATS KV bucket '{}': {}",
|
||||||
|
bucket_name, e
|
||||||
|
);
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(Self::new(store))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl KvStore for NatsKvStore {
|
||||||
|
async fn get_revision(&self, key: &str, expected_seq: u64) -> Result<KvResult, KvStoreError> {
|
||||||
|
let entry = self
|
||||||
|
.store
|
||||||
|
.entry_for_revision(key, expected_seq)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
error!("NATS get failed for key '{}': {}", key, e);
|
||||||
|
KvStoreError::Disconnect(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::Other,
|
||||||
|
e.to_string(),
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if entry.is_none() {
|
||||||
|
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let entry = entry.unwrap();
|
||||||
|
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
|
||||||
|
KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: String::from_utf8_lossy(&entry.value).to_string(),
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Extract metadata from NATS entry
|
||||||
|
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
|
||||||
|
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
|
||||||
|
|
||||||
|
let metadata = KvMetadata {
|
||||||
|
timestamp,
|
||||||
|
sequence: entry.revision,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(KvResult {
|
||||||
|
value: Some(value),
|
||||||
|
metadata,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get(&self, key: &str) -> Result<KvResult, KvStoreError> {
|
||||||
|
let entry = self.store.entry(key).await.map_err(|e| {
|
||||||
|
error!("NATS get failed for key '{}': {}", key, e);
|
||||||
|
KvStoreError::Disconnect(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::Other,
|
||||||
|
e.to_string(),
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
if entry.is_none() {
|
||||||
|
return Err(KvStoreError::KeyNotAvailable(key.to_string()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let entry = entry.unwrap();
|
||||||
|
let value: Value = serde_json::from_slice(&entry.value).map_err(|e| {
|
||||||
|
KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: String::from_utf8_lossy(&entry.value).to_string(),
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Extract metadata from NATS entry
|
||||||
|
// Using unix_timestamp_nanos / 1_000_000 to get milliseconds from OffsetDateTime
|
||||||
|
let timestamp = (entry.created.unix_timestamp_nanos() / 1_000_000) as u64;
|
||||||
|
|
||||||
|
let metadata = KvMetadata {
|
||||||
|
timestamp,
|
||||||
|
sequence: entry.revision,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(KvResult {
|
||||||
|
value: Some(value),
|
||||||
|
metadata,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn set_strict(
|
||||||
|
&self,
|
||||||
|
key: &str,
|
||||||
|
value: Value,
|
||||||
|
expected_sequence: u64,
|
||||||
|
) -> Result<u64, KvStoreError> {
|
||||||
|
trace!(
|
||||||
|
"Nats set strict {key} (#{expected_sequence}) : {}",
|
||||||
|
value.to_string()
|
||||||
|
);
|
||||||
|
let bytes =
|
||||||
|
serde_json::to_vec(&value).map_err(|e| KvStoreError::DeserializationFailed {
|
||||||
|
deserialization_error: e.to_string(),
|
||||||
|
value: value.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Use update() for CAS semantics (Compare-And-Set)
|
||||||
|
// This ensures we only write if the revision matches expected_sequence
|
||||||
|
let revision = self
|
||||||
|
.store
|
||||||
|
.update(&key, bytes.into(), expected_sequence)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
// FIXME this is ugly, we should have a clean KvStoreError containing
|
||||||
|
// proper information from nats instead
|
||||||
|
error!("NATS update failed for key '{}': {}", key, e);
|
||||||
|
e
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(revision)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn subscribe(
|
||||||
|
&self,
|
||||||
|
key: &str,
|
||||||
|
callback: SubscriptionCallback, // TODO this should return an iterator instead of taking a
|
||||||
|
) -> Result<(), KvStoreError> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<UpdateError> for KvStoreError {
|
||||||
|
fn from(value: UpdateError) -> Self {
|
||||||
|
match value.kind() {
|
||||||
|
async_nats::jetstream::kv::UpdateErrorKind::InvalidKey => KvStoreError::InvalidKey,
|
||||||
|
async_nats::jetstream::kv::UpdateErrorKind::TimedOut => KvStoreError::Timeout,
|
||||||
|
async_nats::jetstream::kv::UpdateErrorKind::WrongLastRevision => {
|
||||||
|
KvStoreError::WrongLastRevision
|
||||||
|
}
|
||||||
|
async_nats::jetstream::kv::UpdateErrorKind::Other => KvStoreError::Disconnect(
|
||||||
|
std::io::Error::new(std::io::ErrorKind::Other, "NATS update error"),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
39
harmony_agent/src/workflow/mod.rs
Normal file
39
harmony_agent/src/workflow/mod.rs
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use crate::agent::AgentConfig;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
|
||||||
|
pub mod primary;
|
||||||
|
pub mod replica;
|
||||||
|
|
||||||
|
/// Trait that defines how a workflow (Primary or Replica) handles heartbeat events
#[async_trait]
pub trait HeartbeatWorkflow: Send + Sync {
    /// Handle a successful heartbeat
    ///
    /// Returns `Some(new_state)` when the workflow wants the cluster state
    /// changed (e.g. the primary claiming `current_primary` once healthy),
    /// `None` when no state change is requested.
    async fn handle_heartbeat_success(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData>;

    /// Handle a failed heartbeat
    async fn handle_heartbeat_failure(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
    );

    /// One-time reconciliation of local state against the published cluster
    /// state when the agent boots.
    // NOTE(review): this signature names `crate::agent::heartbeat::ClusterStateData`
    // while the other methods use `crate::agent::ClusterStateData` — presumably the
    // same type via a re-export; confirm and unify the paths.
    async fn on_startup(
        &self,
        cluster_state: Option<&crate::agent::heartbeat::ClusterStateData>,
        agent_config: &AgentConfig,
    );

    /// Get the current state name for logging (also used for heartbeat status)
    fn state_name(&self) -> &'static str;

    /// Get current consecutive successes
    fn consecutive_successes(&self) -> usize;

    /// Get current consecutive failures
    fn consecutive_failures(&self) -> usize;
}
|
||||||
330
harmony_agent/src/workflow/primary.rs
Normal file
330
harmony_agent/src/workflow/primary.rs
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use log::{debug, info, trace, warn};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
agent::{AgentConfig, DeploymentConfig},
|
||||||
|
workflow::HeartbeatWorkflow,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// States of the primary workflow's heartbeat-driven state machine.
#[derive(Debug, Clone, PartialEq)]
pub enum PrimaryState {
    /// Accumulating successful heartbeats before claiming primary.
    Initializing,
    /// Active primary with passing heartbeats.
    Healthy,
    /// Failure threshold reached; immediately followed by `Fenced`.
    Failed,
    /// Self-fenced after failure; the deployment's `on_failover` hook is
    /// triggered on entry.
    Fenced,
    /// Recovered after fencing; waiting for the demotion handshake.
    Yielding,
}
|
||||||
|
|
||||||
|
impl PrimaryState {
|
||||||
|
pub fn name(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
PrimaryState::Initializing => "Primary:Initializing",
|
||||||
|
PrimaryState::Healthy => "Primary:Healthy",
|
||||||
|
PrimaryState::Failed => "Primary:Failed",
|
||||||
|
PrimaryState::Fenced => "Primary:Fenced",
|
||||||
|
PrimaryState::Yielding => "Primary:Yielding",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Heartbeat-driven state machine for the agent designated as the cluster's
/// primary.
pub struct PrimaryWorkflow {
    // Current position in the primary state machine.
    state: PrimaryState,
    // Success/failure streak counters; each resets the other on the
    // opposite event.
    consecutive_successes: usize,
    consecutive_failures: usize,

    // TODO these thresholds should not be copied into the workflow struct. They are configuration
    // level and should always be read from the context passed to the workflow functions
    success_threshold: usize,
    failure_threshold: usize,

    // TODO not sure if this should be known by the workflow or passed in the context to function
    // calls or just completely handled by the agent ?
    deployment_config: DeploymentConfig,
}
|
||||||
|
|
||||||
|
impl PrimaryWorkflow {
|
||||||
|
pub fn new(
|
||||||
|
success_threshold: usize,
|
||||||
|
failure_threshold: usize,
|
||||||
|
deployment_config: DeploymentConfig,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
state: PrimaryState::Initializing,
|
||||||
|
consecutive_successes: 0,
|
||||||
|
consecutive_failures: 0,
|
||||||
|
success_threshold,
|
||||||
|
failure_threshold,
|
||||||
|
deployment_config,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn transition_to(&mut self, new_state: PrimaryState) {
|
||||||
|
if self.state != new_state {
|
||||||
|
info!(
|
||||||
|
"State transition: {} -> {}",
|
||||||
|
self.state.name(),
|
||||||
|
new_state.name()
|
||||||
|
);
|
||||||
|
self.state = new_state;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
impl HeartbeatWorkflow for PrimaryWorkflow {
    /// Logs the published cluster state at boot; deliberately does NOT
    /// fast-track to Healthy — primary status must be re-earned through
    /// heartbeats.
    async fn on_startup(
        &self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) {
        if let Some(state) = cluster_state {
            info!(
                "Startup reconciliation: current primary is {:?}, desired primary is {:?}",
                state.cluster_info.current_primary, state.cluster_info.desired_primary
            );

            // No automatic fast-tracking - agent must earn healthy status
            // through successful heartbeats. This prevents duplicate agents
            // or crashloop agents from incorrectly claiming primary.
        } else {
            debug!("No cluster state on startup, starting from Initializing");
        }
    }
    /// Advances the state machine on a heartbeat success. Returns
    /// `Some(new cluster state)` only when transitioning Initializing ->
    /// Healthy, to claim `current_primary`.
    async fn handle_heartbeat_success(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
        agent_config: &AgentConfig,
    ) -> Option<crate::agent::ClusterStateData> {
        trace!(
            "Handling heartbeat success, current counters success {} failures {}",
            self.consecutive_successes, self.consecutive_failures
        );
        self.consecutive_successes += 1;
        self.consecutive_failures = 0;

        match self.state {
            PrimaryState::Initializing => {
                if self.consecutive_successes >= self.success_threshold {
                    self.transition_to(PrimaryState::Healthy);
                    // Trigger on_active callback
                    // (spawned so the heartbeat loop is not blocked by it)
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_active().await;
                    });
                    // NOTE(review): this compares the cluster's desired_primary
                    // against agent_config.desired_primary_id — both Id::empty()
                    // in the tests. Confirm it shouldn't compare against
                    // agent_config.agent_id (i.e. "am I the desired primary?").
                    if let Some(state) = cluster_state
                        && state.cluster_info.desired_primary == agent_config.desired_primary_id
                    {
                        debug!("state {:#?}", state);
                        let mut new_state = state.clone();
                        // Claim primary for ourselves in the published state.
                        new_state.cluster_info.current_primary =
                            Some(agent_config.agent_id.clone());
                        return Some(new_state);
                    } else {
                        todo!(
                            "I cluster_state should not be an option, and we should throw an error when we are running a primary workflow but we are not the desired primary in the cluster state data"
                        );
                    }
                }
                None
            }
            PrimaryState::Failed => {
                // Recovery path: re-earn Healthy after enough successes.
                if self.consecutive_successes >= self.success_threshold {
                    self.transition_to(PrimaryState::Healthy);
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_active().await;
                    });
                }
                // WIP: falls through to todo!() even after a successful
                // transition above — unreachable None is intentional-looking
                // but unfinished.
                todo!()
            }
            PrimaryState::Healthy => {
                // Stay healthy
                debug!("Primary staying healthy");
                None
            }
            PrimaryState::Fenced => {
                // Recovery from fenced state
                if self.consecutive_successes >= self.success_threshold {
                    // TODO: Check NATS for current_primary status before recovering
                    info!("Recovered from fenced state, transitioning to yielding");
                    self.transition_to(PrimaryState::Yielding);
                }
                todo!()
            }
            PrimaryState::Yielding => {
                // TODO: Check NATS to see if we can resume as primary
                trace!("Yielding, waiting for demotion handshake");
                todo!()
            }
        }
    }

    /// Advances the state machine on a heartbeat failure. On reaching the
    /// failure threshold while Healthy, the primary fences itself and fires
    /// the deployment's `on_failover` hook.
    async fn handle_heartbeat_failure(
        &mut self,
        cluster_state: Option<&crate::agent::ClusterStateData>,
    ) {
        self.consecutive_failures += 1;
        self.consecutive_successes = 0;

        match self.state {
            PrimaryState::Healthy => {
                if self.consecutive_failures >= self.failure_threshold {
                    warn!(
                        "Failure threshold reached ({}/{}), transitioning to Failed",
                        self.consecutive_failures, self.failure_threshold
                    );
                    self.transition_to(PrimaryState::Failed);

                    // Immediately fence
                    // (Failed is transient: observable state after this call
                    // is Fenced — see the tests in this file)
                    self.transition_to(PrimaryState::Fenced);
                    let config = self.deployment_config.clone();
                    tokio::spawn(async move {
                        config.on_failover().await;
                    });
                }
            }
            PrimaryState::Initializing => {
                // Stay in initializing, just accumulate failures
                trace!("Heartbeat failed during initialization");
            }
            PrimaryState::Failed | PrimaryState::Fenced | PrimaryState::Yielding => {
                // Already in a degraded state
                trace!("Heartbeat failed in degraded state: {}", self.state.name())
            }
        }
    }

    fn state_name(&self) -> &'static str {
        self.state.name()
    }

    fn consecutive_successes(&self) -> usize {
        self.consecutive_successes
    }

    fn consecutive_failures(&self) -> usize {
        self.consecutive_failures
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod test {
    use harmony_types::id::Id;
    use std::time::Duration;

    use crate::agent::{AgentRole, FailoverCNPGConfig};

    use pretty_assertions::assert_eq;

    use super::*;

    // One success with a threshold of 2 must not emit a cluster state update.
    #[tokio::test]
    async fn primary_does_nothing_when_on_heartbeat_success_below_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

        assert!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await
                .is_none()
        );
    }

    // On reaching the success threshold, the workflow claims primary by
    // setting current_primary to its own agent id (Id::empty() in the fixture).
    #[tokio::test]
    async fn primary_transitions_cluster_state_when_consecutive_success_threshold_reached() {
        let (mut primary, cluster_state, agent_config) = default_test_state(2, 2);

        let mut expected_state = cluster_state.clone();
        expected_state.cluster_info.current_primary = Some(Id::empty());

        assert_eq!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await,
            None
        );
        assert_eq!(
            primary
                .handle_heartbeat_success(Some(&cluster_state), &agent_config)
                .await,
            Some(expected_state)
        );
    }

    #[tokio::test]
    async fn primary_stays_healthy_below_failure_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);

        // Reach healthy
        let _ = primary
            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
            .await;
        assert_eq!(primary.state, PrimaryState::Healthy);

        // One failure below threshold
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Healthy);
        assert_eq!(primary.consecutive_failures(), 1);
        assert_eq!(primary.consecutive_successes(), 0);
    }

    // NOTE(review): the name says "failed" but the observable state is Fenced,
    // because the workflow fences immediately after Failed — consider renaming.
    #[tokio::test]
    async fn primary_transitions_to_failed_at_failure_threshold() {
        let (mut primary, cluster_state, agent_config) = default_test_state(1, 2);

        // Reach healthy
        let _ = primary
            .handle_heartbeat_success(Some(&cluster_state), &agent_config)
            .await;
        assert_eq!(primary.state, PrimaryState::Healthy);

        // First failure, still healthy
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Healthy);
        assert_eq!(primary.consecutive_failures(), 1);

        // Second failure reaches threshold, transitions to Failed
        // (then immediately to Fenced)
        primary.handle_heartbeat_failure(Some(&cluster_state)).await;
        assert_eq!(primary.state, PrimaryState::Fenced);
        assert_eq!(primary.consecutive_failures(), 2);
        assert_eq!(primary.consecutive_successes(), 0);
    }

    // Builds a workflow plus fixture cluster state / config where every id is
    // Id::empty() and timing is zeroed, so thresholds are the only variable.
    fn default_test_state(
        success_threshold: usize,
        failure_threshold: usize,
    ) -> (PrimaryWorkflow, crate::agent::ClusterStateData, AgentConfig) {
        let cluster_state = crate::agent::ClusterStateData {
            cluster_info: crate::agent::heartbeat::ClusterState {
                cluster_id: Id::empty(),
                current_primary: None,
                desired_primary: Id::empty(),
            },
            metadata: None,
        };

        let agent_config = AgentConfig {
            success_threshold,
            failure_threshold,
            heartbeat_interval: Duration::from_nanos(0),
            failover_timeout: Duration::from_nanos(0),
            deployment_config_unstable: DeploymentConfig::FailoverPostgreSQL(FailoverCNPGConfig {
                cnpg_cluster_name: "test".to_string(),
            }),
            nats_url: String::new(),
            nats_creds_path: None,
            agent_id: Id::empty(),
            cluster_id: Id::empty(),
            desired_primary_id: Id::empty(),
            role: AgentRole::Primary,
        };

        let primary = PrimaryWorkflow::new(
            agent_config.success_threshold,
            agent_config.failure_threshold,
            agent_config.deployment_config_unstable.clone(),
        );

        (primary, cluster_state, agent_config)
    }
}
|
||||||
279
harmony_agent/src/workflow/replica.rs
Normal file
279
harmony_agent/src/workflow/replica.rs
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
use async_trait::async_trait;
|
||||||
|
use harmony_types::id::Id;
|
||||||
|
use log::{debug, error, info, trace, warn};
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
use crate::agent::{AgentConfig, AgentHeartbeat};
|
||||||
|
use crate::workflow::HeartbeatWorkflow;
|
||||||
|
|
||||||
|
/// Tracks the heartbeat KV entry of one agent.
#[derive(Debug, Clone)]
pub struct HeartbeatState {
    /// Agent whose heartbeat key is being tracked.
    pub agent_id: Id,
    /// Sequence of the last observed heartbeat entry; `None` until the first
    /// observation.
    pub last_seq: Option<u64>,
}

impl HeartbeatState {
    /// Starts tracking `agent_id` with no heartbeat observed yet.
    pub fn watch(agent_id: Id) -> Self {
        Self {
            agent_id,
            last_seq: None,
        }
    }
}
|
||||||
|
|
||||||
|
/// Replica-local view of the cluster's leadership state.
#[derive(Debug, Clone)]
pub struct ClusterState {
    /// Cluster being observed.
    pub cluster_id: Id,
    /// Last known primary; `None` until observed.
    pub current_primary: Option<Id>,
}

impl ClusterState {
    /// Starts observing `cluster_id` with no primary known yet.
    pub fn watch(cluster_id: Id) -> Self {
        Self {
            cluster_id,
            current_primary: None,
        }
    }
}
|
||||||
|
|
||||||
|
/// States of the replica workflow's heartbeat-driven state machine.
#[derive(Debug, Clone, PartialEq)]
pub enum ReplicaState {
    /// Accumulating successful heartbeats before watching the primary.
    Initializing,
    /// Healthy replica monitoring the primary's heartbeat for staleness.
    Watching,
    /// Promotion to primary in progress (logic still TODO).
    Promoting,
    /// A promotion attempt failed; returns to Watching after enough successes.
    PromotionFailed,
    /// Acting as leader after a successful promotion (logic still TODO).
    Leader,
    /// Stepping back down to replica (logic still TODO).
    Demoting,
    /// Failure threshold exceeded; recovers to Watching after successes.
    Failed,
}
|
||||||
|
|
||||||
|
impl ReplicaState {
|
||||||
|
pub fn name(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
ReplicaState::Initializing => "Replica:Initializing",
|
||||||
|
ReplicaState::Watching => "Replica:Watching",
|
||||||
|
ReplicaState::Promoting => "Replica:Promoting",
|
||||||
|
ReplicaState::PromotionFailed => "Replica:PromotionFailed",
|
||||||
|
ReplicaState::Leader => "Replica:Leader",
|
||||||
|
ReplicaState::Demoting => "Replica:Demoting",
|
||||||
|
ReplicaState::Failed => "Replica:Failed",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Heartbeat-driven state machine for a replica agent that watches the
/// primary and promotes itself when the primary's heartbeat goes stale.
pub struct ReplicaWorkflow {
    // Current position in the replica state machine.
    state: ReplicaState,
    // Our own heartbeat tracking. NOTE(review): not read in this file yet.
    heartbeat_state: HeartbeatState,
    // The watched primary's heartbeat tracking. NOTE(review): not read in
    // this file yet.
    primary_state: HeartbeatState,
    // Observed cluster leadership. NOTE(review): not read in this file yet.
    cluster_state: ClusterState,
    // Success/failure streak counters; each resets the other.
    consecutive_successes: usize,
    consecutive_failures: usize,
    success_threshold: usize,
    failure_threshold: usize,
    // Max allowed primary-heartbeat lag before promotion (see is_primary_stale).
    failover_timeout: Duration,
    /// Our own last heartbeat (for timestamp comparison against primary)
    last_my_heartbeat: Option<AgentHeartbeat>,
    /// Last observed primary heartbeat (metadata only, for staleness detection)
    last_primary_heartbeat: Option<RwLock<AgentHeartbeat>>,
}
|
||||||
|
|
||||||
|
impl ReplicaWorkflow {
|
||||||
|
pub fn new(
|
||||||
|
success_threshold: usize,
|
||||||
|
failure_threshold: usize,
|
||||||
|
cluster_id: Id,
|
||||||
|
primary_id: Id,
|
||||||
|
my_id: Id,
|
||||||
|
failover_timeout: Duration,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
state: ReplicaState::Initializing,
|
||||||
|
consecutive_successes: 0,
|
||||||
|
consecutive_failures: 0,
|
||||||
|
success_threshold,
|
||||||
|
failure_threshold,
|
||||||
|
failover_timeout,
|
||||||
|
cluster_state: ClusterState::watch(cluster_id),
|
||||||
|
primary_state: HeartbeatState::watch(primary_id),
|
||||||
|
heartbeat_state: HeartbeatState::watch(my_id),
|
||||||
|
last_my_heartbeat: None,
|
||||||
|
last_primary_heartbeat: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn transition_to(&mut self, new_state: ReplicaState) {
|
||||||
|
if self.state != new_state {
|
||||||
|
info!(
|
||||||
|
"State transition: {} -> {}",
|
||||||
|
self.state.name(),
|
||||||
|
new_state.name()
|
||||||
|
);
|
||||||
|
self.state = new_state;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if the primary heartbeat is stale compared to our own
|
||||||
|
/// Per ADR-017-3: primary is stale if (replica_timestamp - primary_timestamp) > failover_timeout
|
||||||
|
async fn is_primary_stale(&mut self) -> bool {
|
||||||
|
if let Some(my_hb) = &self.last_my_heartbeat {
|
||||||
|
if let Some(my_metadata) = &my_hb.metadata {
|
||||||
|
if let Some(primary_hb_ref) = self.last_primary_heartbeat.as_ref() {
|
||||||
|
let primary_hb = primary_hb_ref.read().await;
|
||||||
|
if let Some(primary_metadata) = &primary_hb.metadata {
|
||||||
|
// Calculate time difference: replica_timestamp - primary_timestamp
|
||||||
|
let time_diff_ms = my_metadata
|
||||||
|
.timestamp
|
||||||
|
.saturating_sub(primary_metadata.timestamp);
|
||||||
|
let failover_timeout_ms = self.failover_timeout.as_millis() as u64;
|
||||||
|
|
||||||
|
trace!(
|
||||||
|
"Staleness check: my_ts={}, primary_ts={}, diff={}ms, timeout={}ms",
|
||||||
|
my_metadata.timestamp,
|
||||||
|
primary_metadata.timestamp,
|
||||||
|
time_diff_ms,
|
||||||
|
failover_timeout_ms
|
||||||
|
);
|
||||||
|
|
||||||
|
if time_diff_ms > failover_timeout_ms {
|
||||||
|
info!(
|
||||||
|
"Primary heartbeat stale ({}ms > {}ms), attempting promotion",
|
||||||
|
time_diff_ms, failover_timeout_ms
|
||||||
|
);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl HeartbeatWorkflow for ReplicaWorkflow {
|
||||||
|
async fn on_startup(
|
||||||
|
&self,
|
||||||
|
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||||
|
agent_config: &AgentConfig,
|
||||||
|
) {
|
||||||
|
// todo!("not sure if the replica should do anything on startup")
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn handle_heartbeat_success(
|
||||||
|
&mut self,
|
||||||
|
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||||
|
agent_config: &AgentConfig,
|
||||||
|
) -> Option<crate::agent::ClusterStateData> {
|
||||||
|
trace!(
|
||||||
|
"Handling heartbeat success, current counters success {} failures {}",
|
||||||
|
self.consecutive_successes, self.consecutive_failures
|
||||||
|
);
|
||||||
|
self.consecutive_successes += 1;
|
||||||
|
self.consecutive_failures = 0;
|
||||||
|
|
||||||
|
match self.state {
|
||||||
|
ReplicaState::Initializing => {
|
||||||
|
if self.consecutive_successes >= self.success_threshold {
|
||||||
|
self.transition_to(ReplicaState::Watching);
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
ReplicaState::Watching => {
|
||||||
|
// TODO: Check primary staleness from NATS
|
||||||
|
trace!("Replica watching primary");
|
||||||
|
if self.is_primary_stale().await {
|
||||||
|
panic!("Found stale primary, launching promotion");
|
||||||
|
}
|
||||||
|
debug!("perform the replica watch actions :
|
||||||
|
- if a primary exists in the cluster (cluster_state.current_primary == expected_primary)
|
||||||
|
- check the last primary heartbeat kv timestamp
|
||||||
|
- compare it with our latest kv heartbeat
|
||||||
|
- if longer than failover timeout, launch promotion (we assume that primary has already fenced itself)
|
||||||
|
- launching promotion will change the status of the replica
|
||||||
|
");
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
ReplicaState::Promoting => {
|
||||||
|
// TODO: Complete promotion attempt
|
||||||
|
trace!("Replica promotion in progress");
|
||||||
|
todo!(
|
||||||
|
"When promoting, a heartbeat failure does not affect promotion unless failure_threshold is reached, a heartbeat success does nothing either"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
ReplicaState::PromotionFailed => {
|
||||||
|
if self.consecutive_successes >= self.success_threshold {
|
||||||
|
self.transition_to(ReplicaState::Watching);
|
||||||
|
}
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
ReplicaState::Leader => {
|
||||||
|
// TODO: Check for original primary recovery
|
||||||
|
trace!("Replica acting as leader");
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
ReplicaState::Failed => {
|
||||||
|
if self.consecutive_successes >= self.success_threshold {
|
||||||
|
info!("Replica recovered from Failed state, transitioning to Watching");
|
||||||
|
self.transition_to(ReplicaState::Watching);
|
||||||
|
}
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
ReplicaState::Demoting => {
|
||||||
|
// TODO: Complete demotion back to watching
|
||||||
|
trace!("Replica demotion in progress");
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn handle_heartbeat_failure(
|
||||||
|
&mut self,
|
||||||
|
cluster_state: Option<&crate::agent::ClusterStateData>,
|
||||||
|
) {
|
||||||
|
self.consecutive_failures += 1;
|
||||||
|
self.consecutive_successes = 0;
|
||||||
|
|
||||||
|
// TODO revisit this. I think we should handle the agent healthiness (checking
|
||||||
|
// consecutive_failures against failure_threshold) separately from handling the cluster
|
||||||
|
// state.
|
||||||
|
//
|
||||||
|
// That said, there might be funny stuff we have to do when the agent reaches the failure
|
||||||
|
// threshold, especially in promoting and demoting statuses.
|
||||||
|
|
||||||
|
match self.state {
|
||||||
|
ReplicaState::Watching | ReplicaState::Initializing => {
|
||||||
|
if self.consecutive_failures >= self.failure_threshold {
|
||||||
|
info!(
|
||||||
|
"Replica exceeded failure threshold ({}/{}), transitioning to Failed",
|
||||||
|
self.consecutive_failures, self.failure_threshold
|
||||||
|
);
|
||||||
|
self.transition_to(ReplicaState::Failed);
|
||||||
|
} else {
|
||||||
|
trace!("Replica heartbeat failed, but below threshold");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ReplicaState::Promoting
|
||||||
|
| ReplicaState::PromotionFailed
|
||||||
|
| ReplicaState::Leader
|
||||||
|
| ReplicaState::Demoting
|
||||||
|
| ReplicaState::Failed => {
|
||||||
|
trace!("Replica heartbeat failed in state: {}", self.state.name());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn state_name(&self) -> &'static str {
|
||||||
|
self.state.name()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn consecutive_successes(&self) -> usize {
|
||||||
|
self.consecutive_successes
|
||||||
|
}
|
||||||
|
|
||||||
|
fn consecutive_failures(&self) -> usize {
|
||||||
|
self.consecutive_failures
|
||||||
|
}
|
||||||
|
}
|
||||||
12
harmony_execution/Cargo.toml
Normal file
12
harmony_execution/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
[package]
|
||||||
|
name = "harmony_execution"
|
||||||
|
edition = "2024"
|
||||||
|
version.workspace = true
|
||||||
|
readme.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
thiserror.workspace = true
|
||||||
|
lazy_static.workspace = true
|
||||||
|
directories.workspace = true
|
||||||
|
log.workspace = true
|
||||||
470
harmony_execution/src/command.rs
Normal file
470
harmony_execution/src/command.rs
Normal file
@@ -0,0 +1,470 @@
|
|||||||
|
use std::io::{BufRead, BufReader};
|
||||||
|
use std::process::{Child, Command, Stdio};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::thread;
|
||||||
|
|
||||||
|
/// Captured output from a command execution
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct CommandOutput {
|
||||||
|
/// Captured stdout content
|
||||||
|
pub stdout: String,
|
||||||
|
/// Captured stderr content
|
||||||
|
pub stderr: String,
|
||||||
|
/// Exit status of the command
|
||||||
|
pub status: CommandStatus,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CommandOutput {
|
||||||
|
/// Returns true if the command succeeded
|
||||||
|
pub fn is_success(&self) -> bool {
|
||||||
|
self.status.is_success()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Formats the complete output for display
|
||||||
|
pub fn format_output(&self) -> String {
|
||||||
|
format!(
|
||||||
|
"Stdout:\n{}\n\nStderr:\n{}",
|
||||||
|
if self.stdout.is_empty() {
|
||||||
|
"<empty>"
|
||||||
|
} else {
|
||||||
|
&self.stdout
|
||||||
|
},
|
||||||
|
if self.stderr.is_empty() {
|
||||||
|
"<empty>"
|
||||||
|
} else {
|
||||||
|
&self.stderr
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result status of a command execution
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
pub enum CommandStatus {
|
||||||
|
/// Command executed successfully (exit code 0)
|
||||||
|
Success,
|
||||||
|
/// Command failed with an exit code
|
||||||
|
Failed(i32),
|
||||||
|
/// Command was terminated by a signal
|
||||||
|
Terminated(i32),
|
||||||
|
/// Command execution could not be started
|
||||||
|
Error(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CommandStatus {
|
||||||
|
pub fn is_success(&self) -> bool {
|
||||||
|
matches!(self, CommandStatus::Success)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<std::process::ExitStatus> for CommandStatus {
|
||||||
|
fn from(status: std::process::ExitStatus) -> Self {
|
||||||
|
if status.success() {
|
||||||
|
CommandStatus::Success
|
||||||
|
} else if let Some(code) = status.code() {
|
||||||
|
CommandStatus::Failed(code)
|
||||||
|
} else {
|
||||||
|
CommandStatus::Terminated(0) // Signal codes are platform-specific
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type Callback = Arc<dyn Fn(&str) + Send + Sync>;
|
||||||
|
|
||||||
|
/// Options for configuring command execution
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct RunnerOptions {
|
||||||
|
/// Whether to print stdout to console in real-time
|
||||||
|
pub print_stdout: bool,
|
||||||
|
/// Whether to print stderr to console in real-time
|
||||||
|
pub print_stderr: bool,
|
||||||
|
/// Optional callback for each stdout line
|
||||||
|
pub stdout_callback: Callback,
|
||||||
|
/// Optional callback for each stderr line
|
||||||
|
pub stderr_callback: Callback,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RunnerOptions {
|
||||||
|
fn empty_callback() -> Callback {
|
||||||
|
Arc::new(|_| {})
|
||||||
|
}
|
||||||
|
/// Create default options with real-time printing enabled
|
||||||
|
pub fn print_to_console() -> Self {
|
||||||
|
Self {
|
||||||
|
print_stdout: true,
|
||||||
|
print_stderr: true,
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create options that capture output silently
|
||||||
|
pub fn silent() -> Self {
|
||||||
|
Self {
|
||||||
|
print_stdout: false,
|
||||||
|
print_stderr: false,
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set custom callbacks for stdout and stderr lines
|
||||||
|
pub fn with_callbacks<F1, F2>(mut self, stdout_callback: F1, stderr_callback: F2) -> Self
|
||||||
|
where
|
||||||
|
F1: Fn(&str) + Send + Sync + 'static,
|
||||||
|
F2: Fn(&str) + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
self.stdout_callback = Arc::new(stdout_callback);
|
||||||
|
self.stderr_callback = Arc::new(stderr_callback);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for RunnerOptions {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
print_stdout: true,
|
||||||
|
print_stderr: true,
|
||||||
|
stdout_callback: Self::empty_callback(),
|
||||||
|
stderr_callback: Self::empty_callback(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Error type for command execution failures
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct CommandError {
|
||||||
|
/// Human-readable error description
|
||||||
|
pub message: String,
|
||||||
|
/// Captured output if execution started
|
||||||
|
pub output: Option<CommandOutput>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for CommandError {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{}", self.message)?;
|
||||||
|
if let Some(output) = &self.output {
|
||||||
|
write!(f, "\n{}", output.format_output())?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::error::Error for CommandError {}
|
||||||
|
|
||||||
|
/// Runs a command and captures its output while streaming to console
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use harmony_execution::command::{run_command, RunnerOptions};
|
||||||
|
/// use std::process::Command;
|
||||||
|
///
|
||||||
|
/// let output = run_command(
|
||||||
|
/// Command::new("echo").arg("hello"),
|
||||||
|
/// RunnerOptions::print_to_console()
|
||||||
|
/// ).unwrap();
|
||||||
|
/// assert!(output.is_success());
|
||||||
|
/// assert_eq!(output.stdout, "hello\n");
|
||||||
|
/// ```
|
||||||
|
pub fn run_command(
|
||||||
|
command: &mut Command,
|
||||||
|
options: RunnerOptions,
|
||||||
|
) -> Result<CommandOutput, CommandError> {
|
||||||
|
let mut child = command
|
||||||
|
.stdout(Stdio::piped())
|
||||||
|
.stderr(Stdio::piped())
|
||||||
|
.spawn()
|
||||||
|
.map_err(|e| CommandError {
|
||||||
|
message: format!("Failed to spawn command: {}", e),
|
||||||
|
output: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let stdout = child.stdout.take().ok_or_else(|| CommandError {
|
||||||
|
message: "Failed to capture stdout".to_string(),
|
||||||
|
output: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let stderr = child.stderr.take().ok_or_else(|| CommandError {
|
||||||
|
message: "Failed to capture stderr".to_string(),
|
||||||
|
output: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let stdout_reader = BufReader::new(stdout);
|
||||||
|
let stderr_reader = BufReader::new(stderr);
|
||||||
|
|
||||||
|
let (stdout_sender, stdout_receiver) = std::sync::mpsc::channel();
|
||||||
|
let (stderr_sender, stderr_receiver) = std::sync::mpsc::channel();
|
||||||
|
|
||||||
|
// Spawn thread to handle stdout
|
||||||
|
let stdout_handle = thread::spawn(move || {
|
||||||
|
let mut output = String::new();
|
||||||
|
for line in stdout_reader.lines() {
|
||||||
|
match line {
|
||||||
|
Ok(line_content) => {
|
||||||
|
if options.print_stdout {
|
||||||
|
println!("{}", line_content);
|
||||||
|
}
|
||||||
|
(options.stdout_callback)(&line_content);
|
||||||
|
output.push_str(&line_content);
|
||||||
|
output.push('\n');
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
// Silently handle read errors - corrupted data at end is common
|
||||||
|
log::trace!("Error reading stdout line: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let _ = stdout_sender.send(output);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Spawn thread to handle stderr
|
||||||
|
let stderr_handle = thread::spawn(move || {
|
||||||
|
let mut output = String::new();
|
||||||
|
for line in stderr_reader.lines() {
|
||||||
|
match line {
|
||||||
|
Ok(line_content) => {
|
||||||
|
if options.print_stderr {
|
||||||
|
eprintln!("{}", line_content);
|
||||||
|
}
|
||||||
|
(options.stderr_callback)(&line_content);
|
||||||
|
output.push_str(&line_content);
|
||||||
|
output.push('\n');
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
log::trace!("Error reading stderr line: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let _ = stderr_sender.send(output);
|
||||||
|
});
|
||||||
|
|
||||||
|
let status = child.wait().map_err(|e| CommandError {
|
||||||
|
message: format!("Failed to wait for command process: {}", e),
|
||||||
|
output: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let stdout_lines = stdout_handle
|
||||||
|
.join()
|
||||||
|
.map_err(|e| CommandError {
|
||||||
|
message: format!("Stdout thread panicked: {:?}", e),
|
||||||
|
output: None,
|
||||||
|
})
|
||||||
|
.and_then(|_| {
|
||||||
|
stdout_receiver.recv().map_err(|e| CommandError {
|
||||||
|
message: format!("Failed to receive stdout: {}", e),
|
||||||
|
output: None,
|
||||||
|
})
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let stderr_lines = stderr_handle
|
||||||
|
.join()
|
||||||
|
.map_err(|e| CommandError {
|
||||||
|
message: format!("Stderr thread panicked: {:?}", e),
|
||||||
|
output: None,
|
||||||
|
})
|
||||||
|
.and_then(|_| {
|
||||||
|
stderr_receiver.recv().map_err(|e| CommandError {
|
||||||
|
message: format!("Failed to receive stderr: {}", e),
|
||||||
|
output: None,
|
||||||
|
})
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(CommandOutput {
|
||||||
|
stdout: stdout_lines,
|
||||||
|
stderr: stderr_lines,
|
||||||
|
status: status.into(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience function to run a command with default options (print to console)
|
||||||
|
pub fn run(command: &mut Command) -> Result<CommandOutput, CommandError> {
|
||||||
|
run_command(command, RunnerOptions::print_to_console())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience function to run a command silently (capture output only)
|
||||||
|
pub fn run_silent(command: &mut Command) -> Result<CommandOutput, CommandError> {
|
||||||
|
run_command(command, RunnerOptions::silent())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_simple_echo_command() {
|
||||||
|
let output = run_silent(Command::new("echo").arg("hello world")).unwrap();
|
||||||
|
assert!(output.is_success());
|
||||||
|
assert_eq!(output.stdout.trim(), "hello world");
|
||||||
|
assert!(output.stderr.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_command_failure() {
|
||||||
|
let output = run_silent(Command::new("sh").args(["-c", "exit 42"])).unwrap();
|
||||||
|
assert!(!output.is_success());
|
||||||
|
assert_eq!(output.status, CommandStatus::Failed(42));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_command_output_format() {
|
||||||
|
let output = run_silent(Command::new("echo").arg("test")).unwrap();
|
||||||
|
let formatted = output.format_output();
|
||||||
|
assert!(formatted.contains("Stdout:"));
|
||||||
|
assert!(formatted.contains("test"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_runner_options() {
|
||||||
|
let opts = RunnerOptions::print_to_console();
|
||||||
|
assert!(opts.print_stdout);
|
||||||
|
assert!(opts.print_stderr);
|
||||||
|
|
||||||
|
let opts = RunnerOptions::silent();
|
||||||
|
assert!(!opts.print_stdout);
|
||||||
|
assert!(!opts.print_stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_command_status_from_exit_status() {
|
||||||
|
let output = run_silent(&mut Command::new("true")).unwrap();
|
||||||
|
assert_eq!(output.status, CommandStatus::Success);
|
||||||
|
|
||||||
|
let output = run_silent(&mut Command::new("false")).unwrap();
|
||||||
|
assert_eq!(output.status, CommandStatus::Failed(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_stdout_callback_receives_lines() {
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
let captured = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
let captured_clone = Arc::clone(&captured);
|
||||||
|
|
||||||
|
let opts = RunnerOptions::silent().with_callbacks(
|
||||||
|
move |line| captured_clone.lock().unwrap().push(line.to_string()),
|
||||||
|
|_| {},
|
||||||
|
);
|
||||||
|
|
||||||
|
run_command(Command::new("echo").arg("hello world"), opts).unwrap();
|
||||||
|
|
||||||
|
let lines = captured.lock().unwrap();
|
||||||
|
assert_eq!(lines.len(), 1);
|
||||||
|
assert_eq!(lines[0], "hello world");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_stderr_callback_receives_lines() {
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
let captured = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
let captured_clone = Arc::clone(&captured);
|
||||||
|
|
||||||
|
let opts = RunnerOptions::silent().with_callbacks(
|
||||||
|
|_| {},
|
||||||
|
move |line| captured_clone.lock().unwrap().push(line.to_string()),
|
||||||
|
);
|
||||||
|
|
||||||
|
run_command(Command::new("sh").args(["-c", "echo error >&2"]), opts).unwrap();
|
||||||
|
|
||||||
|
let lines = captured.lock().unwrap();
|
||||||
|
assert_eq!(lines.len(), 1);
|
||||||
|
assert_eq!(lines[0], "error");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_callback_and_capture_both_work() {
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
let callback_lines = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
let callback_clone = Arc::clone(&callback_lines);
|
||||||
|
|
||||||
|
let opts = RunnerOptions::silent().with_callbacks(
|
||||||
|
move |line| callback_clone.lock().unwrap().push(line.to_string()),
|
||||||
|
|_| {},
|
||||||
|
);
|
||||||
|
|
||||||
|
let output =
|
||||||
|
run_command(Command::new("printf").args(["line1\nline2\nline3\n"]), opts).unwrap();
|
||||||
|
|
||||||
|
// Verify captured output
|
||||||
|
assert_eq!(output.stdout, "line1\nline2\nline3\n");
|
||||||
|
|
||||||
|
// Verify callback received all lines
|
||||||
|
let lines = callback_lines.lock().unwrap();
|
||||||
|
assert_eq!(lines.len(), 3);
|
||||||
|
assert_eq!(lines[0], "line1");
|
||||||
|
assert_eq!(lines[1], "line2");
|
||||||
|
assert_eq!(lines[2], "line3");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multiline_output_capture() {
|
||||||
|
let output = run_silent(Command::new("printf").args(["line1\nline2\nline3\n"])).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(output.stdout, "line1\nline2\nline3\n");
|
||||||
|
assert!(output.stderr.trim().is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_mixed_stdout_stderr_capture() {
|
||||||
|
let output = run_silent(Command::new("sh").args([
|
||||||
|
"-c",
|
||||||
|
"echo stdout1 && echo stderr1 >&2 && echo stdout2 && echo stderr2 >&2",
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(output.stdout.contains("stdout1"));
|
||||||
|
assert!(output.stdout.contains("stdout2"));
|
||||||
|
assert!(output.stderr.contains("stderr1"));
|
||||||
|
assert!(output.stderr.contains("stderr2"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_empty_output_command() {
|
||||||
|
let output = run_silent(&mut Command::new("true")).unwrap();
|
||||||
|
|
||||||
|
assert!(output.stdout.is_empty());
|
||||||
|
assert!(output.stderr.is_empty());
|
||||||
|
assert!(output.is_success());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_command_output_format_with_empty_streams() {
|
||||||
|
let output = run_silent(&mut Command::new("true")).unwrap();
|
||||||
|
let formatted = output.format_output();
|
||||||
|
|
||||||
|
assert!(formatted.contains("Stdout:"));
|
||||||
|
assert!(formatted.contains("<empty>"));
|
||||||
|
assert!(formatted.contains("Stderr:"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_error_contains_message_and_output() {
|
||||||
|
let error = CommandError {
|
||||||
|
message: "Test error".to_string(),
|
||||||
|
output: Some(CommandOutput {
|
||||||
|
stdout: "captured stdout".to_string(),
|
||||||
|
stderr: "captured stderr".to_string(),
|
||||||
|
status: CommandStatus::Success,
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
let display = format!("{}", error);
|
||||||
|
assert!(display.contains("Test error"));
|
||||||
|
assert!(display.contains("captured stdout"));
|
||||||
|
assert!(display.contains("captured stderr"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_error_without_output() {
|
||||||
|
let error = CommandError {
|
||||||
|
message: "Spawn failed".to_string(),
|
||||||
|
output: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let display = format!("{}", error);
|
||||||
|
assert!(display.contains("Spawn failed"));
|
||||||
|
assert!(!display.contains("Stdout:"));
|
||||||
|
assert!(!display.contains("Stderr:"));
|
||||||
|
}
|
||||||
|
}
|
||||||
5
harmony_execution/src/lib.rs
Normal file
5
harmony_execution/src/lib.rs
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
pub mod command;
|
||||||
|
|
||||||
|
pub use command::{
|
||||||
|
CommandError, CommandOutput, CommandStatus, RunnerOptions, run, run_command, run_silent,
|
||||||
|
};
|
||||||
@@ -32,6 +32,14 @@ impl Id {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Into<Id> for &str {
|
||||||
|
fn into(self) -> Id {
|
||||||
|
Id {
|
||||||
|
value: self.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl FromStr for Id {
|
impl FromStr for Id {
|
||||||
type Err = ();
|
type Err = ();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user