diff --git a/Cargo.lock b/Cargo.lock index db2929d..9cf1129 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,7 +9,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f7b0a21988c1bf877cf4759ef5ddaac04c1c9fe808c9142ecb78ba97d97a28a" dependencies = [ "bitflags 2.11.0", - "bytes", + "bytes 1.11.1", "futures-core", "futures-sink", "memchr", @@ -32,7 +32,7 @@ dependencies = [ "base64 0.22.1", "bitflags 2.11.0", "brotli", - "bytes", + "bytes 1.11.1", "bytestring", "derive_more", "encoding_rs", @@ -145,7 +145,7 @@ dependencies = [ "actix-service", "actix-utils", "actix-web-codegen", - "bytes", + "bytes 1.11.1", "bytestring", "cfg-if", "cookie 0.16.2", @@ -456,13 +456,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "futures-util", "memchr", "nkeys", "nuid", "once_cell", - "pin-project", + "pin-project 1.1.11", "portable-atomic", "rand 0.8.5", "regex", @@ -558,7 +558,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", - "bytes", + "bytes 1.11.1", "fastrand", "hex", "http 1.4.0", @@ -619,7 +619,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", - "bytes", + "bytes 1.11.1", "bytes-utils", "fastrand", "http 0.2.12", @@ -651,7 +651,7 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "bytes", + "bytes 1.11.1", "fastrand", "hex", "hmac", @@ -682,7 +682,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", - "bytes", + "bytes 1.11.1", "fastrand", "http 0.2.12", "http 1.4.0", @@ -706,7 +706,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", - "bytes", + "bytes 1.11.1", "fastrand", "http 0.2.12", "http 1.4.0", @@ -750,7 +750,7 @@ dependencies = [ "aws-smithy-http 0.63.6", "aws-smithy-runtime-api", "aws-smithy-types", - "bytes", + "bytes 1.11.1", 
"crypto-bigint 0.5.5", "form_urlencoded", "hex", @@ -786,7 +786,7 @@ checksum = "87294a084b43d649d967efe58aa1f9e0adc260e13a6938eb904c0ae9b45824ae" dependencies = [ "aws-smithy-http 0.62.6", "aws-smithy-types", - "bytes", + "bytes 1.11.1", "crc-fast", "hex", "http 0.2.12", @@ -805,7 +805,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf09d74e5e32f76b8762da505a3cd59303e367a664ca67295387baa8c1d7548" dependencies = [ "aws-smithy-types", - "bytes", + "bytes 1.11.1", "crc32fast", ] @@ -818,7 +818,7 @@ dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", "aws-smithy-types", - "bytes", + "bytes 1.11.1", "bytes-utils", "futures-core", "futures-util", @@ -839,7 +839,7 @@ checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", - "bytes", + "bytes 1.11.1", "bytes-utils", "futures-core", "futures-util", @@ -931,7 +931,7 @@ dependencies = [ "aws-smithy-observability", "aws-smithy-runtime-api", "aws-smithy-types", - "bytes", + "bytes 1.11.1", "fastrand", "http 0.2.12", "http 1.4.0", @@ -952,7 +952,7 @@ checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" dependencies = [ "aws-smithy-async", "aws-smithy-types", - "bytes", + "bytes 1.11.1", "http 0.2.12", "http 1.4.0", "pin-project-lite", @@ -968,7 +968,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c" dependencies = [ "base64-simd", - "bytes", + "bytes 1.11.1", "bytes-utils", "futures-core", "http 0.2.12", @@ -1048,6 +1048,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -1173,7 +1179,7 @@ checksum = "87a52479c9237eb04047ddb94788c41ca0d26eaff8b697ecfbb4c32f7fdc3b1b" dependencies = [ "base64 0.22.1", "bollard-stubs", - "bytes", + "bytes 1.11.1", "futures-core", "futures-util", "hex", @@ -1182,7 +1188,7 @@ dependencies = [ "hyper 1.8.1", "hyper-named-pipe", "hyper-util", - "hyperlocal", + "hyperlocal 0.9.1", "log", "pin-project-lite", "serde", @@ -1306,6 +1312,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" + [[package]] name = "bytes" version = "1.11.1" @@ -1321,7 +1333,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" dependencies = [ - "bytes", + "bytes 1.11.1", "either", ] @@ -1331,7 +1343,7 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "113b4343b5f6617e7ad401ced8de3cc8b012e73a594347c307b90db3e9271289" dependencies = [ - "bytes", + "bytes 1.11.1", ] [[package]] @@ -1652,6 +1664,30 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "containers-api" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef94b0ff8338282b35bafb408eb0a3e53ba05bdb3b36840589ab9a67a6682593" +dependencies = [ + "chrono", + "flate2", + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "hyperlocal 0.8.0", + "log", + "mime", + "paste", + "pin-project 1.1.11", + "serde", + "serde_json", + "tar", 
+ "thiserror 1.0.69", + "tokio", + "url", +] + [[package]] name = "convert_case" version = "0.8.0" @@ -3382,6 +3418,18 @@ dependencies = [ "slab", ] +[[package]] +name = "futures_codec" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce54d63f8b0c75023ed920d46fd71d0cbbb830b0ee012726b5b4f506fb6dea5b" +dependencies = [ + "bytes 0.5.6", + "futures", + "memchr", + "pin-project 0.4.30", +] + [[package]] name = "fuzzy-matcher" version = "0.3.7" @@ -3520,7 +3568,7 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" dependencies = [ - "bytes", + "bytes 1.11.1", "fnv", "futures-core", "futures-sink", @@ -3540,7 +3588,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" dependencies = [ "atomic-waker", - "bytes", + "bytes 1.11.1", "fnv", "futures-core", "futures-sink", @@ -3597,6 +3645,7 @@ dependencies = [ "opnsense-config", "opnsense-config-xml", "option-ext", + "podman-api", "pretty_assertions", "rand 0.9.2", "reqwest 0.11.27", @@ -3963,7 +4012,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3314d5adb5d94bcdf56771f2e50dbbc80bb4bdf88967526706205ac9eff24eb" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "headers-core", "http 1.4.0", "httpdate", @@ -4061,7 +4110,7 @@ version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" dependencies = [ - "bytes", + "bytes 1.11.1", "fnv", "itoa", ] @@ -4072,7 +4121,7 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" dependencies = [ - "bytes", + "bytes 1.11.1", "itoa", ] @@ -4082,7 +4131,7 @@ version = "0.4.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ - "bytes", + "bytes 1.11.1", "http 0.2.12", "pin-project-lite", ] @@ -4093,7 +4142,7 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ - "bytes", + "bytes 1.11.1", "http 1.4.0", ] @@ -4103,7 +4152,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ - "bytes", + "bytes 1.11.1", "futures-core", "http 1.4.0", "http-body 1.0.1", @@ -4129,7 +4178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a422b4c865d103368628ae1247be6159ad8041f803eb9e2176cf69ad7d13da40" dependencies = [ "bstr", - "bytes", + "bytes 1.11.1", "crossbeam-channel", "form_urlencoded", "futures", @@ -4152,7 +4201,7 @@ version = "0.14.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" dependencies = [ - "bytes", + "bytes 1.11.1", "futures-channel", "futures-core", "futures-util", @@ -4177,7 +4226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" dependencies = [ "atomic-waker", - "bytes", + "bytes 1.11.1", "futures-channel", "futures-core", "h2 0.4.13", @@ -4199,7 +4248,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ad4b0a1e37510028bc4ba81d0e38d239c39671b0f0ce9e02dfa93a8133f7c08" dependencies = [ - "bytes", + "bytes 1.11.1", "futures-util", "headers", "http 1.4.0", @@ -4282,7 +4331,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "futures-channel", "futures-util", "http 1.4.0", @@ -4298,6 +4347,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "hyperlocal" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fafdf7b2b2de7c9784f76e02c0935e65a8117ec3b768644379983ab333ac98c" +dependencies = [ + "futures-util", + "hex", + "hyper 0.14.32", + "pin-project 1.1.11", + "tokio", +] + [[package]] name = "hyperlocal" version = "0.9.1" @@ -4624,6 +4686,44 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "iot-agent-v0" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-nats", + "chrono", + "clap", + "futures-util", + "harmony", + "serde", + "serde_json", + "tokio", + "toml", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "iot-operator-v0" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-nats", + "clap", + "futures-util", + "k8s-openapi", + "kube", + "schemars 0.8.22", + "serde", + "serde_json", + "serde_yaml", + "thiserror 2.0.18", + "tokio", + "tracing", + "tracing-subscriber", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -4806,7 +4906,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cb276b85b6e94ded00ac8ea2c68fcf4697ea0553cb25fddc35d4a0ab718db8d" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "chrono", "either", "futures", @@ -4888,7 +4988,7 @@ dependencies = [ "k8s-openapi", "kube-client", "parking_lot", - "pin-project", + "pin-project 1.1.11", "serde", "serde_json", "thiserror 2.0.18", @@ -5058,6 +5158,15 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "md-5" version = "0.10.6" @@ -5237,6 +5346,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "nuid" version = "0.5.0" @@ -5337,7 +5455,7 @@ dependencies = [ "arc-swap", "async-trait", "base64 0.22.1", - "bytes", + "bytes 1.11.1", "cfg-if", "chrono", "either", @@ -5353,7 +5471,7 @@ dependencies = [ "jsonwebtoken", "once_cell", "percent-encoding", - "pin-project", + "pin-project 1.1.11", "secrecy", "serde", "serde_json", @@ -5733,13 +5851,33 @@ dependencies = [ "sha2", ] +[[package]] +name = "pin-project" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ef0f924a5ee7ea9cbcea77529dba45f8a9ba9f622419fe3386ca581a3ae9d5a" +dependencies = [ + "pin-project-internal 0.4.30", +] + [[package]] name = "pin-project" version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" dependencies = [ - "pin-project-internal", + "pin-project-internal 1.1.11", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "851c8d0ce9bebe43790dedfc86614c23494ac9f423dd618d3a61fc693eafe61e" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", ] [[package]] @@ -5825,6 +5963,42 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +[[package]] +name = "podman-api" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7697e9e1fdcfd452699eb8c419994a8fd120f0f5ac5a7dd26398a9a983b8dc89" +dependencies = [ + "base64 0.13.1", + "byteorder", + "bytes 1.11.1", + "chrono", + "containers-api", + "flate2", + "futures-util", + "futures_codec", + "log", + "paste", + "podman-api-stubs", + "serde", + "serde_json", + "tar", + "thiserror 1.0.69", + "tokio", + "url", +] + +[[package]] +name = "podman-api-stubs" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d280c623f633a0dded88feab9e387f98451506431d5b7308a858c643305dcee" +dependencies = [ + "chrono", + "serde", + "serde_json", +] + [[package]] name = "poly1305" version = "0.8.0" @@ -6021,7 +6195,7 @@ version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" dependencies = [ - "bytes", + "bytes 1.11.1", "cfg_aliases", "pin-project-lite", "quinn-proto", @@ -6041,7 +6215,7 @@ version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ - "bytes", + "bytes 1.11.1", "getrandom 0.3.4", "lru-slab", "rand 0.9.2", @@ -6305,7 +6479,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" dependencies = [ "base64 0.21.7", - "bytes", + "bytes 1.11.1", "cookie 0.17.0", "cookie_store", "encoding_rs", @@ -6348,7 +6522,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "futures-channel", "futures-core", "futures-util", @@ -6566,7 +6740,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3bb94393cafad0530145b8f626d8687f1ee1dedb93d7ba7740d6ae81868b13b5" dependencies = [ "bitflags 2.11.0", - "bytes", + "bytes 1.11.1", 
"chrono", "flurry", "log", @@ -6620,7 +6794,7 @@ checksum = "759a090a17ce545d1adcffcc48207d5136c8984d8153bd8247b1ad4a71e49f5f" dependencies = [ "anyhow", "async-trait", - "bytes", + "bytes 1.11.1", "http 1.4.0", "reqwest 0.12.28", "rustify_derive", @@ -7413,7 +7587,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee6798b1838b6a0f69c007c133b8df5866302197e404e8b6ee8ed3e3a5e68dc6" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "crc", "crossbeam-queue", "either", @@ -7488,7 +7662,7 @@ dependencies = [ "base64 0.22.1", "bitflags 2.11.0", "byteorder", - "bytes", + "bytes 1.11.1", "crc", "digest", "dotenvy", @@ -8000,7 +8174,7 @@ version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ - "bytes", + "bytes 1.11.1", "libc", "mio 1.1.1", "parking_lot", @@ -8028,7 +8202,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f57eb36ecbe0fc510036adff84824dd3c24bb781e21bfa67b69d556aa85214f" dependencies = [ - "pin-project", + "pin-project 1.1.11", "rand 0.8.5", "tokio", ] @@ -8093,7 +8267,7 @@ version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ - "bytes", + "bytes 1.11.1", "futures-core", "futures-sink", "pin-project-lite", @@ -8108,7 +8282,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f591660438b3038dd04d16c938271c79e7e06260ad2ea2885a4861bfb238605d" dependencies = [ "base64 0.22.1", - "bytes", + "bytes 1.11.1", "futures-core", "futures-sink", "http 1.4.0", @@ -8218,7 +8392,7 @@ checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ "base64 0.22.1", "bitflags 2.11.0", - "bytes", + "bytes 1.11.1", "futures-util", "http 1.4.0", "http-body 1.0.1", @@ -8286,15 +8460,33 
@@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", "sharded-slab", + "smallvec", "thread_local", + "tracing", "tracing-core", + "tracing-log", ] [[package]] @@ -8334,7 +8526,7 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4793cb5e56680ecbb1d843515b23b6de9a75eb04b66643e256a396d43be33c13" dependencies = [ - "bytes", + "bytes 1.11.1", "data-encoding", "http 1.4.0", "httparse", @@ -8515,7 +8707,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81eb4d9221ca29bad43d4b6871b6d2e7656e1af2cfca624a87e5d17880d831d" dependencies = [ "async-trait", - "bytes", + "bytes 1.11.1", "derive_builder 0.12.0", "http 1.4.0", "reqwest 0.12.28", diff --git a/Cargo.toml b/Cargo.toml index ce0fdd1..929354b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,8 @@ members = [ "harmony_node_readiness", "harmony-k8s", "harmony_assets", "opnsense-codegen", "opnsense-api", + "iot/iot-operator-v0", + "iot/iot-agent-v0", ] [workspace.package] @@ -96,4 +98,8 @@ reqwest = { version = "0.12", features = [ assertor = "0.0.4" tokio-test = "0.4" anyhow = "1.0" -clap = { version = "4", features = ["derive"] } +clap = { version = "4", features = ["derive", "env"] } +async-nats = "0.45.0" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +toml = "0.8" diff --git a/ROADMAP/iot_platform/context_conversation.md b/ROADMAP/iot_platform/context_conversation.md 
new file mode 100644 index 0000000..8c8f588 --- /dev/null +++ b/ROADMAP/iot_platform/context_conversation.md @@ -0,0 +1,245 @@ +# Conversation Summary: IoT Platform Architecture + +**For agents implementing this system:** This document captures the full decision trail that led to the final `iot-platform-v0-walking-skeleton.md` plan. Understanding *why* decisions were made is as important as understanding *what* was decided — especially for judgment calls during implementation where the plan doesn't spell something out explicitly. + +--- + +## The original ask + +Sylvain (CTO of NationTech) wanted to build an IoT platform with these specific requirements: + +- SSO via Zitadel +- Secrets via OpenBao +- Per-device identity, devices belong to groups +- Full CI/CD integration +- A "mini-kubelet" with NATS as the storage backend — each device is a node, reads its own resources, reconciles in a loop, reports status back to NATS KV +- Central operator with CRDs for deployments, device groups, devices — operator writes to NATS on CRD change and reports deployment status back +- CI/CD pipeline publishes hydrated Helm charts to Harbor registry; ArgoCD applies them; operator picks them up and pushes to NATS +- Devices run containers declared via Harmony Scores +- Strong consistency assumed free (NATS provides it) +- Zitadel/OpenBao integration already ~99% done in Harmony + +Original constraints: simplicity key throughout, production-ready, don't go down rabbit holes, deadline and cost discipline. + +--- + +## Phase 1: Initial architecture research and design + +Claude researched NATS, Zitadel, and OpenBao integration patterns in depth using primary sources. Key findings that shaped the design: + +**NATS auth callout with bearer-token JWTs is the right identity primitive.** Devices don't hold NATS signing material. 
An auth callout service mints a scoped per-connection user JWT with `bearer_token: true` (skips nonce signing, per the `nats-io/jwt` source) after verifying a Zitadel token the device presents at CONNECT time. This is cleaner than distributing long-lived NATS NKeys to devices. ADR-26 is the authoritative spec. + +**Zitadel JWT Profile grant is the device auth path.** Service accounts with public keys registered in Zitadel, devices sign self-JWTs with their private key, exchange for access tokens. Zitadel Discussion #8406 documents exactly this pattern with working Go code for an IoT/TPM case. Key gotcha: external-OpenSSL keys need `ParsePKCS8PrivateKey`, not PKCS1. + +**OpenBao's JWT auth method + templated policies** with `token_policies_template_claims` (PR #618) lets one policy resolve per-device based on JWT claims. One policy for N devices instead of N policies. + +The three systems compose cleanly: +- Zitadel = identity (who or what) +- OpenBao = policy + secrets (what they can access) +- NATS = transport + subject-level authorization (where messages go) + +A ~900-line architecture document captured this with primary-source citations. It remains the reference for implementation detail on auth flows. + +--- + +## Phase 2: Planning iterations and scope calibration + +Claude produced three planning documents in sequence, each refining the approach: + +1. **Issue breakdown (22 issues, 9 tracks)** — human-executable parallel tracks +2. **Autonomous agent harness plan** — contract-first with phase gates, for agent-driven execution +3. **Walking skeleton plan** — thin end-to-end thread shipping Tuesday + +**Sylvain's critical intervention:** when Claude produced the parallel-tracks plan targeting day 14, Sylvain pushed back. The real approach should be **walking skeleton** (Cockburn) / **tracer bullet** — ship a naive end-to-end loop first, let architecture emerge from running code, harden from there. This reduces the risk of reaching day 14 with nothing integrated. 
+ +Claude acknowledged overreach: the three documents shouldn't all exist. Walking skeleton supersedes the other two. The first two became reference material only. + +--- + +## Phase 3: The "start from scratch objectively" challenge + +Sylvain asked Claude to reconsider the architecture from scratch, with access to NationTech's full resource context (k8s clusters, ArgoCD, Harbor, Zitadel, OpenBao, Harmony ownership) but without emotional attachment to the previous design. + +**Claude's initial recommendation:** k3s on each Pi + ArgoCD + external-secrets-operator. Boring, CNCF-standard, maintained by the ecosystem rather than by NationTech. Argued the custom NATS-mini-kubelet approach was "building a platform when you could buy one." + +**Sylvain's decisive pushback** reframed this correctly. Claude had under-weighted several things: + +1. **End-customer engineers are mechanical/electrical/chemical, not Kubernetes-literate.** They debug with `systemctl`, `journalctl`, `ps`. A k3s device forces them to learn kubectl/CRDs/CNI — a real productivity tax on a team that shouldn't have to pay it. A single Rust binary + podman is inspectable with tools they already know. + +2. **The platform bet is strategic, not technical.** NationTech's positioning as "no vendor lock-in, decentralized, open-source enterprise cloud" gains credibility from having a product (Harmony), not from being "extraordinary plumbers for off-the-shelf CNCF." Building a custom platform on this bet is how you become a platform company instead of an integration shop. + +3. **NationTech is its own largest customer.** Multiple OKD clusters already need coordination; manually connecting to each to make deployments is a major operational pain that hinders growth. The same architecture (agent reconciling against NATS KV) eventually manages podman on Pis, `kubectl apply` on OKD clusters, and VM-level operations. One abstraction, three instantiations. + +4. 
**NATS is architecturally superior for federation.** ArgoCD doesn't naturally federate — it manages clusters *from one place*. A NATS supercluster with strict ordering across regions supports "operator in multiple clusters, ArgoCD instances all over, deployments coming from everywhere." For the long-term decentralized control plane, NATS is the correct substrate. + +5. **Rancher code quality (k3s provenance) is real data, not nostalgia.** Sylvain has direct experience; Claude had over-indexed on CNCF graduation as a quality proxy. + +6. **Harmony daemon-mode `Interpret` is already solved.** Claude had repeatedly flagged "does `Score::interpret()` work in a loop?" as a major unknown. Reality: `s.clone().interpret().await` is exactly the TUI's daemon pattern, and `harmony_agent` runs this in production for distributed CNPG PostgreSQL management. The concern was unfounded. + +**Result:** Claude updated. The custom NATS-based platform is correct for this context. The k3s alternative genuinely doesn't fit. The walking skeleton plan stands. + +Remaining real risks (acknowledged, not architecture-invalidating): +- Platform scope creep → walking skeleton discipline +- Bus factor → normal Harmony collaboration patterns with Jean-Gabriel +- Customers #2-N for the federation story → business question, not technical + +--- + +## Phase 4: Strategic alignment and scope clarifications + +Sylvain provided specific clarifications that shaped the final plan: + +**Balena was considered and rejected.** It's the closest viable alternative, open-source, but requires custom OS (balenaOS — lock-in of a different kind), lacks native SSO + secrets integration, and positions NationTech as a Balena integrator rather than a platform company. AGPL Harmony vs. Balena has similar license profiles; NationTech can deliver honest no-lock-in positioning. 
+ +**Three-way relationship structure:** NationTech → Partner (custom software shop, engineering-quality-focused, does coaching) → End-customer (whose field-deployed Pi 5 devices run the partner's application). Tuesday's demo is for the Partner. Production deployment may involve direct end-customer contact later. + +**Partner relationship is healthy and collaborative.** They want NationTech to succeed. Demo failure modes tolerable. Platform partnership is an active topic between the teams — they explicitly value having a platform partner they trust for landing their own customers. + +**Other potential customers exist but aren't paying.** NationTech is managing their OKD clusters via other means for now. They can wait. NationTech's own OKD coordination pain is the largest driver. + +--- + +## Phase 5: Technical nitty-gritty corrections + +Sylvain corrected several technical details Claude had gotten wrong or overdesigned: + +**No `harmony-podman-score` new crate.** It's a new module in `harmony/src/modules/podman/` following existing Harmony module conventions. Corrected in the plan. + +**Use `podman-api` Rust crate, not shell-out.** Strongly typed API preferred. Requires `systemctl --user enable --now podman.socket` on the device. `podlet` crate worth evaluating later when Quadlet comes back in scope (v0.1+). + +**Graceful shutdown is just `podman stop` with 5-min timeout then SIGKILL.** Not kubelet-style pod termination. Claude had overcomplicated this. + +**Score envelope was overdesigned — drop it.** The "ScoreEnvelope with format/encoding/content_hash/data" pattern reminded Sylvain of SOAP. Use adjacently-tagged serde enum instead: `#[serde(tag = "type", content = "data")]`. Rust type name is the discriminator. Agent deserializes directly into the typed Score. No double-deserialization, no opaque bytes, no format version strings. 
+ +**Change detection via string comparison, not content hash.** Comparing serialized Score strings is cheap enough at this scale (a couple times per minute). Removes hashing-algorithm risk. More deterministic. + +**Agent config is flat TOML for v0.** Long-term target is zero-config — device boots (PXE if budget allows), has a Zitadel URL + initial token, fetches real config from OpenBao, connects to NATS. OpenBao as source of truth for NATS credentials. v0 uses simple shared NATS credentials directly in TOML. + +**OpenBao outage must not break NATS reconnect if token is still valid.** The auth callout in v0.2 should validate Zitadel tokens against JWKS directly; OpenBao lookup for group permissions should be cached in the callout. Availability-favoring design — reboot isn't more of a security event than a passing minute, and NATS rejects on actual token expiry anyway. No degradation of real security posture. + +--- + +## Phase 6: aarch64 discovery + +Late in the conversation, the single most important technical issue surfaced. Claude had been flagging "does Harmony `Interpret` work in daemon mode?" as the biggest Friday risk. Sylvain corrected that this was a non-issue. + +**The real issue:** Harmony doesn't currently compile on aarch64. When `harmony_agent` was cross-compiled for ARM64, an upstream dependency had to be pulled out. Sylvain's 80% confidence: single sub-dependency used by only a few modules, feature-gatable, those modules become unavailable on ARM (acceptable — device doesn't need every Harmony feature). + +**This replaces the Friday-evening "§6 decision" in the plan.** It becomes the first-hour investigation. Fallback paths exist: build agent against minimum aarch64-clean Harmony subset, or (worst case) pure Rust without Harmony Score traits for v0, adopt them in v0.1. + +--- + +## Phase 7: Final plan adjustments + +The walking skeleton plan was updated with all agreed decisions in a single coherent revision. 
Key decisions baked into the final doc: + +**Section-by-section:** + +- **§1 Strategic framing** now explicitly names NationTech as largest customer and describes the decentralized cloud vision (heating buildings, sovereign, etc.) so collaborators reading the plan understand this is long-term investment, not a side project. + +- **§5.4 agent scope:** kubelet compatibility explicitly NOT a goal, kubelet architecture as north star only, v0 absolutely minimal. One paragraph, no enforced limits — discipline through inherent minimalism. + +- **§5.5 Score message format:** adjacently-tagged serde enum, no envelope, no content hash, string comparison for change detection. + +- **§6.7 agent config:** flat TOML for v0. v0.2 narrows to Zitadel-token-bootstrap model. + +- **§7 aarch64 investigation** is the Friday-evening critical path. Fallbacks documented. + +- **§8 Hour 1-2 field readiness:** heavy power-cycle testing, network-out-during-boot, agent crash loop. SD card wear / thermal / PoE explicitly ruled out per partner conversation. + +- **Agent task cards:** A1 uses new Score format. A2 targets `harmony/src/modules/rpi/`. A3 commits to `podman-api` crate. Graceful shutdown simplified. + +- **§12 v0.2 roadmap** includes availability-favoring auth callout design (cached OpenBao permissions, NATS handles token expiry). + +- **§13 partner conversation:** technical strategy only; "others in your network" question dropped per Sylvain ("overstepping into sales, not your concern"). + +**Explicitly removed:** +- OKD-as-device future spike (kept in strategic framing only, not execution) +- Three-level Jean-Gabriel review process (normal Harmony collaboration applies) +- ScoreEnvelope wrapping +- Content hash in Score messages +- `iot-contracts` crate in v0 (extract v0.1) +- Thesis document Sunday dispatch (moved to v0.1 Week 2) + +--- + +## Key principles for implementing agents + +Drawing these out as they're load-bearing for judgment calls: + +1. 
**The walking skeleton is the plan. Ship Tuesday with something crude but working.** Not production-ready, not complete. Working end-to-end thread from git push to container running on Pi. + +2. **Inherent discipline over enforced limits.** The plan doesn't have line-count budgets or anti-scope lists because Sylvain argued (correctly) that walking-skeleton discipline makes them redundant. If you find yourself wanting to add PLEG event streams, per-workload worker pools, or housekeeping sweeps to v0 — don't. Periodic relist is enough. + +3. **Architectural boundaries (§6) must survive v0 even under deadline pressure.** Score enum polymorphic from day one. Credentials behind a trait. Topology generalizable. CRD spec forward-compatible. NATS subject grammar matches long-term. These cost little now and save big later. Don't take shortcuts here to save 20 minutes. + +4. **Scope cuts (§4) are real, not aspirational.** Zitadel/OpenBao deferred to v0.2. One device for Tuesday. No groups. No rollout state machine. No API. No TUI. No observability beyond journalctl. Fighting these cuts is the plan's biggest risk. + +5. **Availability favored over strict security posture.** The auth callout caches OpenBao lookups. Token expiry is the authoritative revocation mechanism, not real-time policy lookup. A disconnected OpenBao doesn't brick the fleet. + +6. **The `podman-api` crate is the happy path.** Shell-out to `podman` is fallback-only. Strong typing wins when available. + +7. **Sylvain owns the critical code himself.** Agent A1 (operator), A2 (Pi provisioning), A3 (installer), A4 (demo script) are agent-dispatched. The agent binary itself and the `PodmanV0Score` implementation are Sylvain's work. The auth callout (v0.2) will also be human-written. Don't propose that agents take over these pieces. + +8. **The partner relationship is strategic.** Tuesday demo conversation is half the Tuesday deliverable. 
Framing the v0.1/v0.2/v0.3 roadmap to them matters as much as the running code. + +9. **End-customer debuggability is a UX constraint.** Mechanical/electrical/chemical engineers will touch these devices. `systemctl status iot-agent` must tell them what's happening. `journalctl -u iot-agent` must be parseable by humans. Error messages must be understandable without Kubernetes knowledge. + +10. **NATS is the long-term architectural commitment.** Everything on NATS — not as a queue, as a coordination fabric. The "decentralized cluster management" future depends on this choice. Implementation decisions that weaken this (e.g., "let's just put a database in the middle") should be pushed back on. + +--- + +## What failed or went wrong in the planning process + +Noted for meta-awareness — avoid repeating: + +- **Claude overproduced.** Three planning documents when two would do. Under deadline pressure, planning documents are distractions from execution. Sylvain eventually said this directly. + +- **Claude under-weighted end-customer UX.** Initial k3s recommendation treated "Kubernetes is easy" as universal when it's only easy for people who already know Kubernetes. + +- **Claude under-weighted strategic positioning.** Platform-building vs. integration-consulting is a business choice; Claude treated it as purely technical. + +- **Claude repeatedly flagged the Harmony daemon-mode concern** despite it being already solved. A better first question would have been "does this work today?" rather than "what if this doesn't work?" + +- **Claude's initial Zitadel/OpenBao integration estimate was too large** because Claude didn't fully internalize "integration is 99% done in Harmony." The remaining work is wiring, not implementing. + +- **Claude started with the ScoreEnvelope pattern** before understanding Harmony's native serde patterns. The "SOAP" reaction was deserved. 
+ +--- + +## What's in the final plan + +The final document `iot-platform-v0-walking-skeleton.md` (~700 lines) contains: + +- Strategic framing (§1) +- Walking skeleton vs. alternatives comparison (§2) +- Tuesday demo definition (§3) +- Scope cuts with milestones (§4) +- End-to-end architecture (§5) +- Architecture boundaries to preserve (§6) +- Friday aarch64 investigation and fallbacks (§7) +- Hour-by-hour Friday-Tuesday plan (§8) +- Four agent task cards (§9) +- Anti-patterns prevented (§10) +- Failure-mode decision tree (§11) +- Post-Tuesday roadmap v0.1→v0.4+ (§12) +- Partner conversation structure for Tuesday (§13) + +Companion documents for deep-dive reference: +- `iot-platform-architecture.md` — full architecture with primary-source citations, useful for v0.2+ when auth is implemented + +--- + +## What agents should do when uncertain + +The plan cannot anticipate everything. When an agent hits an ambiguity, the decision hierarchy is: + +1. **Does this preserve the end-to-end thread for Tuesday?** If yes, proceed. If it breaks the thread, stop and escalate. +2. **Does this preserve architectural boundaries §6?** If unsure, favor the boundary. +3. **Does this add scope beyond §4's in-scope list?** If yes, don't do it, regardless of how easy it seems. +4. **Is this security-critical?** If yes, don't add new code — flag for human review. Especially relevant for v0.2 auth callout work. +5. **Would this be more elegant but take an extra hour?** Don't do it. Ship Tuesday. +6. **Is the end-customer engineer's debuggability harmed by this choice?** If yes, don't do it. +7. **Is this on the path to the OKD-cluster-as-device future?** Don't optimize for this in v0. The abstractions are correct; don't over-invest. + +The walking skeleton's entire value is shipping Tuesday. Every decision that serves that goal is correct. Every decision that defers it (no matter how well-intentioned) is wrong. 
diff --git a/ROADMAP/iot_platform/v0_walking_skeleton.md b/ROADMAP/iot_platform/v0_walking_skeleton.md new file mode 100644 index 0000000..4990334 --- /dev/null +++ b/ROADMAP/iot_platform/v0_walking_skeleton.md @@ -0,0 +1,707 @@ +# IoT Platform v0 — Walking Skeleton + +**Approach:** Walking skeleton (Cockburn). Thin end-to-end thread through every architectural component. Naive first, architecture emerges from running code, hardening follows real-world feedback. + +## 1. Strategic framing + +**Near-term product:** IoT platform for an internal partner (a custom software shop with strong engineering practices — tests, CI/CD, coaching). They are developing an application for their end-customer whose field devices are Raspberry Pi 5s with 8/16 GB RAM, ARM64. The end-customer's engineers are mechanical/electrical/chemical, not Kubernetes-literate; on-device debuggability using standard Linux tools is a genuine UX concern. + +**Long-term product:** This is the foundation for NationTech's decentralized enterprise cloud orchestration. NationTech itself is effectively our largest customer for this platform — we already run multiple OKD clusters in different locations and need to coordinate deployments, updates, and observability across them without connecting into each one manually. An "agent" reconciling against NATS KV looks the same whether it runs `podman` on a Pi, `kubectl apply` on an OKD cluster, or a VM-level operation. The abstraction has been chosen to support all three eventually; v0 demonstrates it on the simplest target (podman on Pi). + +**Why this matters for collaborators reading this plan:** this is not a side project or a one-off customer integration. NationTech's positioning as a no-vendor-lock-in, decentralized, open-source cloud solution is gaining traction specifically because we have a product (Harmony) and not just bespoke integration work. This IoT platform extends that thesis. Resource investment is long-term. 
+ +**Deadlines:** +- **Tuesday (day 4):** internal partner sees `git push → container running on Pi`. Confidence-building, low-stakes. +- **Day 14 (~2 weeks):** solid product foundation, before other NationTech projects claim attention. +- **2 months (partner's deadline):** hardened production delivery for the partner's end-customer. + +**Hour budget:** +- Friday evening (now): 3-4 hours focused +- Saturday: light supervision of agents, 2-3 hours +- Sunday: light supervision of agents, 2-3 hours +- Monday: 8 focused hours +- Tuesday morning: ship + polish, 4 hours +- Week 2 (Wed-Fri): v0.1 hardening, ~4 hours/day +- Week 3: v0.2 auth layer, ~4 hours/day +- Remaining weeks: partner-driven hardening as their application development reveals needs + +Sustainable hours non-negotiable. + +**Terminology used consistently below:** NationTech = us. Partner = the software shop we're directly working with. End-customer = the partner's customer whose field devices we're managing. + +--- + +## 2. Walking skeleton vs. parallel-tracks: the honest choice + +I considered both. For this context, walking skeleton wins on every axis: + +| Axis | Walking skeleton | Parallel-tracks autonomous | +|------|------------------|----------------------------| +| Partner sees progress | Tuesday (day 4) | Day 11+ | +| Integration risk | Discovered day 3 | Discovered day 11 | +| Weekend pressure | Natural stopping points | Merge-gate pressure | +| Adapts to "OKD cluster as device" future | Trivial — new Score variant later | Expensive mid-architecture pivot | +| Risk of day-14 slip | Low (partner has seen it work) | High (integration bugs in final days) | +| Hours sustainability | Good | Poor | + +--- + +## 3. The demo (= the product for Tuesday) + +``` +Partner edits ArgoCD Operator NATS KV Agent on podman +YAML in git ──push→ syncs ──apply→ writes ──store→ watch ──pull→ Pi reads ──run→ container + to k8s to NATS Score running +``` + +**Success criterion for Tuesday:** +1. 
`git push` on a workload repo. +2. Within 2 minutes, a container is running on a Raspberry Pi 5 in our lab. +3. Partner can `curl` the container (on the Pi's IP) and get hello-world. +4. Partner can edit the YAML (change image or port), push, watch the container transition within 2 minutes. + +**Invisible to partner but critical:** +- Pi is pre-provisioned with agent installed. +- ArgoCD is pre-configured with the partner's workload repo. +- Agent uses a shared NATS credential from a TOML file. + +**Partner-explicit framing for Tuesday conversation:** +- "This proves the mechanism end-to-end." +- "v0.1 next week: Harmony Score polished, second Pi added, status aggregation." +- "v0.2 week 2: real authentication via Zitadel + OpenBao, no more shared creds." +- "Here's what you can start building against today." + +--- + +## 4. Scope cuts — explicit deferrals + +Each cut has a target milestone. This is the foundation for the "here's what's coming" partner conversation. + +| Deferred | v0 replacement | Milestone | +|----------|----------------|-----------| +| Zitadel device auth | Shared NATS credential in agent TOML | v0.2 | +| OpenBao | Shared credentials in agent TOML | v0.2 | +| Auth callout service | Direct NATS user/pass | v0.2 | +| Scoping tests | None (single-tenant demo) | v0.2 | +| Multiple Pi devices | One Pi for Tuesday; second added v0.1 | v0.1 | +| Quadlet interpretation | `podman-api` crate direct control | v0.1 considers Quadlet | +| Status aggregation in CRD | Agent writes status, operator doesn't aggregate | v0.1 | +| Inventory reporting | Not in v0 | v0.1 | +| Log streaming via NATS | `journalctl` over SSH | v0.1 | +| API service | None | v0.2+ | +| TUI for IoT | `kubectl` + `nats` CLI | v0.2+ | +| Rollout state machine | All-at-once (one Pi for Tuesday, moot) | v0.1+ | +| Failure injection harness | None formal | v0.1 | +| Observability (Prom+Grafana) | `journalctl` + `kubectl logs` | v0.1+ | +| OKD-cluster-as-device | Not in v0; not in v0.x at all 
| Strategic roadmap, separate | + +**What's kept in v0 despite cost:** +- **Harmony Score on device.** Friday builds a minimal podman Score as a module in `harmony/src/modules/podman/`. Adds 1-2 hours Friday but proves the abstraction works in daemon mode. +- **Real `kube-rs` operator** (not a cron script). The operator's shape matters for long-term stability. +- **NATS KV transport.** Proven now so we don't switch later. +- **CRD-based partner API.** `kubectl apply -f deployment.yaml` is the partner's long-term interface. +- **Pi provisioning via Harmony Score** when achievable in <1hr (§7 Hour 1); manual runbook as fallback. + +--- + +## 5. The thread end-to-end + +### 5.1 Partner's git repo + +``` +iot-workload-hello/ +├── deployment.yaml # Deployment CR +├── README.md # "Edit, git push, done." +``` + +`deployment.yaml`: +```yaml +apiVersion: iot.nationtech.io/v1alpha1 +kind: Deployment +metadata: + name: hello-world + namespace: iot-demo +spec: + targetDevices: + - pi-demo-01 + score: + type: PodmanV0 # Rust enum discriminator (serde adjacently-tagged) + data: + services: + - name: hello + image: docker.io/library/nginx:alpine + ports: ["8080:80"] + rollout: + strategy: Immediate +``` + +### 5.2 Central cluster setup + +Existing k8s cluster. Namespaces: +- `iot-system` — operator, NATS (single-node for v0) +- `iot-demo` — `Deployment` CRs + +ArgoCD application pre-configured to sync `iot-workload-hello` repo into `iot-demo` namespace. + +### 5.3 Raspberry Pi 5 setup + +One Pi 5 in the lab, provisioned via Harmony Pi-provisioning Score (if achievable in <1hr Friday) or manually via SD card flash (fallback). + +Base OS: **Ubuntu Server 24.04 LTS ARM64** (ships Podman 4.9 in repos). Raspberry Pi OS 64-bit bookworm acceptable fallback. 
+ +Installed: +- `podman` (4.4+, ARM64) with `systemctl --user enable --now podman.socket` (required for `podman-api` crate) +- `iot-agent` binary (cross-compiled to aarch64 via existing Harmony aarch64 toolchain) +- `/etc/iot-agent/config.toml` with NATS URL + shared credential +- systemd unit `iot-agent.service` + +### 5.4 What the code does + +**Operator:** +1. Watches `Deployment` CRs cluster-wide. +2. For each, for each `device_id` in `spec.targetDevices`, writes `desired-state.<device_id>.<deployment_name>` in `desired-state` JetStream KV bucket with the Score message (see §5.5). +3. Updates `.status.observedScoreString` (the last-written Score as stored string, used for change detection via string comparison). +4. On deletion, removes corresponding KV entries. + +**Agent on Pi:** +1. Connect to NATS (TOML-configured user/pass). +2. Watch `desired-state.<device_id>.>` KV keys. +3. For each entry: deserialize Score message, dispatch to Harmony `Score::interpret(&topology)` via `s.clone().interpret().await` pattern (already the TUI's daemon-mode pattern, battle-tested in `harmony_agent` for CNPG management). +4. For v0, only `PodmanV0` Score variant exists. Interprets against a `PiDeviceTopology` (arch=aarch64, runtime=podman) and uses the `podman-api` crate to manage containers via the Podman REST API (over the user socket activated at §5.3 setup). +5. **Change detection via serialized string comparison** (not content hash). Cheap at this scale (a couple times per minute expected), removes hashing-algorithm risk, deterministic. +6. Status writer: every 30s, write current state to `status.<device_id>`. + +**Kubelet compatibility is explicitly NOT a goal.** Kubelet architecture serves as a north star for proven reconcile-loop patterns; the v0 implementation stays absolutely minimal. No PLEG event stream in v0, no per-workload worker pool, no housekeeping sweep — just a single reconcile loop with periodic relist. Scope discipline through inherent minimalism, not enforced limits. 
+ +### 5.5 Score message on NATS + +Adjacently tagged serde enum. One Rust type per Score variant, `#[serde(tag = "type", content = "data")]` for clean discriminator/payload separation: + +```rust +#[derive(Serialize, Deserialize, Clone)] +#[serde(tag = "type", content = "data")] +pub enum Score { + PodmanV0(PodmanV0Score), + // Future: OkdApplyV0(OkdApplyScore), KubectlApplyV0(...), etc. +} +``` + +JSON wire format: +```json +{ + "type": "PodmanV0", + "data": { /* PodmanV0Score fields */ } +} +``` + +No envelope. No encoding field. No format version string. The Rust type name is the discriminator; serde handles polymorphism cleanly. Adding a new Score variant (for OKD management later) is `enum Score { ..., OkdApplyV0(OkdApplyScore) }` — additive, not breaking. + +### 5.6 What's deliberately dumb in v0 + +- **Polling instead of event-driven PLEG.** Agent polls podman-api every 30s as ground truth; KV watch events are accelerators. +- **No idempotency beyond string-equality.** Current score matches stored → no-op, mismatch → stop old container, run new. Brief downtime on updates. Fine for v0. +- **Graceful shutdown = `podman stop` with 5min timeout, then SIGKILL.** Sufficient. +- **No auth between operator and NATS.** Same k8s cluster, same namespace. Network trust. +- **No state persistence beyond podman itself.** Agent restart = re-read NATS, re-query podman, reconcile differences. +- **No multi-service coordination.** A Score with three services starts them all immediately, no dependency ordering. + +--- + +## 6. Architecture boundaries we keep even in v0 + +Decisions that cost little now and save real time later. + +### 6.1 Score enum polymorphic from day 1 + +Even with one variant (`PodmanV0`), the enum shape is already polymorphic. Adding `OkdApplyV0` later is trivial. + +### 6.2 `Score` + `Interpret` traits used consistently + +Use Harmony's existing traits. Cost: ~1 hour Friday. Benefit: agent is structurally ready for a second Score type in v0.3+. 
+ +### 6.3 Credentials behind a trait + +```rust +trait CredentialSource: Send + Sync { + async fn nats_connect_options(&self) -> Result<ConnectOptions>; +} +``` + +v0: `TomlFileCredentialSource` reading `/etc/iot-agent/config.toml`. +v0.2: `ZitadelBootstrappedCredentialSource` — same trait, swapped via config. + +30 minutes Friday. Saves 3 hours of refactor in v0.2. + +### 6.4 Device topology generalizes + +`PiDeviceTopology` for v0. Trait interface supports other topologies — OKD cluster as `OkdClusterTopology` later. The v0 Score validates at compile time that its topology requirements match (arch=aarch64, runtime=podman). The OKD Score will validate different requirements (has_kube_api, has_argo). Same pattern. + +### 6.5 CRD spec forward-compatible + +```yaml +spec: + targetDevices: [id1, id2] # v0. v1 adds targetGroups. + score: {type: ..., data: ...} # polymorphic enum + rollout: + strategy: Immediate # v0. v1 adds Progressive. +``` + +### 6.6 NATS subject grammar matches long-term + +Even with one Pi, use `desired-state.<device_id>.<deployment_name>` and `status.<device_id>`. Don't take shortcuts. + +### 6.7 Agent config is TOML, flat for v0 + +```toml +[agent] +device_id = "pi-demo-01" + +[credentials] +type = "toml-shared" +nats_user = "iot-agent" +nats_pass = "dev-shared-password" + +[nats] +urls = ["nats://central:4222"] +``` + +v0.2 adds a `[zitadel]` section enabling the bootstrap-via-token flow (see §12 roadmap). Additive, not breaking. Target long-term state: device boots → PXE or minimal TOML → Zitadel URL + token → fetches real config from OpenBao → connects to NATS. OpenBao outage doesn't break reconnect because the NATS auth callout validates tokens against Zitadel JWKS directly (with cached group permissions); NATS rejects only when the token actually expires. + +--- + +## 7. Friday evening critical path — the aarch64 investigation + +**The previous walking skeleton draft had a "§6 decision point" on whether Harmony's `Interpret` works in daemon mode. 
That's resolved — the TUI does `s.clone().interpret().await` as a daemon pattern, and `harmony_agent` manages distributed CNPG in production using exactly this. Not a concern.** + +**The real concern that replaces it:** Harmony does not currently compile on aarch64. When `harmony_agent` was cross-compiled for ARM64, an upstream dependency had to be pulled out. This was likely a single sub-dependency used by only a few modules, feature-gatable so those modules become unavailable on ARM (acceptable — the device doesn't need every Harmony feature). Estimated as a quick fix (~80% confidence, per Sylvain's recollection). + +**Friday evening investigation (30-60 min, first):** + +1. `cargo build --target aarch64-unknown-linux-musl -p harmony` on the workspace. Capture the error. +2. Identify the offending crate and the module(s) in Harmony that depend on it. +3. Apply feature-gate: add a `cfg(not(target_arch = "aarch64"))` attribute to the offending module, or introduce a Cargo feature flag (`--features x86-only`) that the ARM build skips. +4. Verify: `cargo build --target aarch64-unknown-linux-musl -p harmony --features <arm-safe feature set>` succeeds. +5. Run the unit tests that exist for the feature-gated modules on x86_64 to confirm we haven't broken anything on the primary platform. + +**Budget:** 2 hours max Friday night. **If not resolved in 2 hours:** +- Fallback A: Build the agent against only the crates that do compile on aarch64 (`harmony_agent`, `harmony_types`, whatever subset). Implement the `PodmanV0` Score directly in the agent crate using its own trait impls for now. Reunify with the main Harmony codebase in v0.1 after the compile fix is properly done. +- Fallback B: (only if Fallback A also blocks) Write the v0 agent as pure Rust without Harmony Score traits. Adopt them in v0.1 after the aarch64 fix lands. This is the walking-skeleton-surfaces-real-issue scenario from §10. + +Document findings in the Friday night log regardless of outcome. 
v0.1 work includes proper fix if we took a shortcut. + +**This is the single most important investigation of the weekend.** Do it before anything else Friday. Every downstream decision (can the agent use Harmony Score traits? what's agent A3 cross-compiling?) depends on it. + +--- + +## 8. Hour-by-hour plan + +### Friday evening (3-4 hours) + +**Goal by end of Friday night:** aarch64 path clear; operator running in central cluster writes to NATS on CR apply; agent crate compiling on laptop, talking to NATS; Pi provisioning plan chosen. + +**Hour 1 — aarch64 investigation + decisions + dispatches** + +*Your work:* +- **aarch64 investigation per §7** (30-60 min, first thing). +- Write 1-page `v0-demo.md`: demo script, success criteria, fallback plan. +- Decide Pi OS: Ubuntu 24.04 ARM64 (default) vs Raspberry Pi OS 64-bit. Don't agonize beyond 10 min. + +*Dispatch agent A1 (operator):* "Create Rust crate `iot/iot-operator-v0/` using `kube-rs` implementing a Deployment CRD controller that writes to NATS KV. Exact spec in task card §9.A1. Self-verify: `kubectl apply` → `nats kv get` shows entry. Under 300 lines main.rs. No auth." + +*Dispatch agent A2 (Pi provisioning, fallback-aware):* "Attempt Harmony-based Raspberry Pi 5 provisioning Score. Target: fresh Pi flashed via SD card, boots, static IP, Ubuntu 24.04 ARM64 with Podman 4.9, podman user socket enabled, user `iot-agent` with linger enabled, `/etc/iot-agent/` ready. If Harmony doesn't have Pi primitives, document the gap and produce a manual provisioning runbook instead (rpi-imager + cloud-init). Hard time limit: 90 min. Self-verify: `ssh iot-agent@ 'podman --version'` returns 4.4+." + +**Hour 2 — your work: agent crate** + +Start writing the agent yourself. Core customer-experience code; you own its shape. + +Crate in `harmony/src/modules/iot_agent/` or a new binary in the Harmony workspace (follow existing conventions — Harmony modules live in `harmony/src/modules/`): +- Under 500 lines for v0. 
+- Dependencies: `async-nats`, `serde`, `serde_yaml`, `tokio`, `tracing`, `anyhow`, `podman-api`, plus Harmony workspace deps. +- Main loop per §5.4. +- `CredentialSource` trait (§6.3) with `TomlFileCredentialSource` impl. +- Score enum (§5.5) with `PodmanV0` variant. +- `PodmanV0Score` implements Harmony's `Score` + `Interpret` traits. Score lives in `harmony/src/modules/podman/` (new module) following existing Harmony module conventions. `podman-api` crate for container operations — no shell-out. + +**Hour 3 — local integration** + +- Review agent A1's operator. Deploy to central cluster `iot-system` namespace. +- Deploy NATS to `iot-system` if not already (single-node JetStream). +- Review agent A2's Pi provisioning. If Harmony Score succeeded, note for demo; if manual runbook, accept and move on. +- Agent compiles on laptop. Connects to central NATS. + +**Hour 4 — first partial handshake** + +- `kubectl apply` a `Deployment` CR targeting `pi-demo-01`. +- Verify: `nats kv get desired-state pi-demo-01.test-deploy` shows entry. +- Run agent locally on laptop with `DEVICE_ID=pi-demo-01`, confirm it reads the KV entry and prints what it would do. +- **First success:** local end-to-end without actual podman execution. Good for tonight. + +*Stop by 10 PM.* + +### Saturday (2-3 hours, light supervision) + +**Goal:** local end-to-end working — laptop agent starts a podman container when CR is applied. Pi provisioning rehearsed if Harmony path succeeded. + +**Morning check-in (30 min).** + +**Dispatch agent A3 (installer) Saturday morning.** Task card §9.A3. + +**Your work (2 hours):** +- Finish agent's happy path: start container via podman-api, remove on CR deletion, transition on Score string change. +- End-to-end test on laptop: agent + central NATS + central operator. Expect a design bug. Budget an extra hour. + +**Dispatch agent A4 (demo script) Saturday afternoon.** Task card §9.A4. 
+ +### Sunday (2-3 hours, light supervision) + +**Goal:** Demo script works against the real Pi in the lab. Not polished; works. + +**Your work:** +- Run agent A4's demo against real Pi. Fix what breaks. +- First clean Pi success = shipping-confidence milestone. +- Run 3 more clean-room. Document failure modes. + +**No A5 agent task this time** — thesis doc deferred to Week 2 per §2 decision. + +### Monday (8 focused hours) + +**Goal:** Demo runs reliably. Tuesday ship-ready. + +**Hour 1-2 — field deployment readiness:** + +Named subsection: the most important class of failures for Pi-in-field deployment. The partner's devices will be power-cycled, their networks will flap. These matter more than polish. + +- **Power cycle test:** unplug Pi, wait 30s, plug back in. Target: boot-to-reconciled within 90s. Run this 5+ times. **This is the most important test of the weekend.** +- **Network-out-during-boot:** boot Pi with NATS blocked (iptables or network shutdown). Agent starts, waits, reconciles when NATS is reachable. No crash loop. +- **Agent crash loop:** corrupt the config, let systemd restart loop kick in. Back-off works; device not bricked; `systemctl status` shows the failure clearly. + +*Not tested (ruled out per explicit partner conversation):* SD card wear (partner's app has few IOPS), thermal throttling (environment doesn't cause sustained high temps), PoE-specific failure modes (not relevant here). + +**Hour 3-4 — demo polish:** +- `./demo.sh` is one command, no manual steps. +- Output is clean: clear PASS/FAIL with per-phase timings. +- `kubectl get deployments.iot.nationtech.io` output is readable. + +**Hour 5-6 — partner-facing polish:** +- README in workload repo: 4 lines. "Edit this, git push, done." +- ArgoCD auto-sync enabled on partner's repo. +- CR `.status` updates within ~10s of agent report. + +**Hour 7 — failure demo prep:** + +Partner will ask "what if X fails." 
Prep answers, demonstrate 1-2 live: +- "Pi loses network" → container keeps running, reconnects when network returns. +- "Central cluster down" → already-running containers keep running. +- "Agent crashes" → systemd restarts, re-reads NATS, no data loss. +- "NATS down" → agent shows clear "not connected" status; recovers on NATS return. + +**Hour 8 — rest case:** +One full clean-room demo run. Time it. Write Tuesday runbook. Stop by 9 PM. + +### Tuesday morning (4 hours) — ship + +**Hour 1:** Final clean-room run on our infra. Passes → proceed. +**Hour 2:** Setup for partner demo (on-site or screenshare). +**Hour 3:** Demo walkthrough. Git push → container. Edit → transition. One failure demo. Q&A. +**Hour 4:** Handoff + v0.1/v0.2 plan conversation. Align on next-week milestone. + +--- + +## 9. Agent task cards + +Each card is self-contained. Hand the entire card to an agent. + +**Mandatory verification for every agent task — must pass before completion:** + +```bash +# Native CI check (check + fmt + clippy + test) +./build/check.sh + +# Cross-compilation — aarch64 builds must succeed for all IoT-critical crates. +# Note: harmony is built with --no-default-features to exclude KVM (libvirt cannot cross-compile to aarch64). +# The 5 KVM examples (kvm_vm_examples, kvm_okd_ha_cluster, opnsense_vm_integration, +# opnsense_pair_integration, example_linux_vm) are x86_64-only by design. +cargo build --target x86_64-unknown-linux-gnu -p harmony -p harmony_agent -p iot-agent-v0 -p iot-operator-v0 +cargo build --target aarch64-unknown-linux-gnu -p harmony --no-default-features -p harmony_agent -p iot-agent-v0 -p iot-operator-v0 +``` + +All three must exit 0. Note: `cargo test --target aarch64-unknown-linux-gnu` cannot run on x86_64 (exec format error) — that's expected. Test execution is only for the host architecture via `./build/check.sh`. If any check fails, fix the issue before marking the task complete. Include the output in the PR description. 
+ +### A1: Operator skeleton (Friday) + +**Goal:** `kube-rs` operator that watches `Deployment` CRs and writes the Score to NATS KV. + +**Deliverable:** Crate `iot/iot-operator-v0/`: +- `Cargo.toml`: `kube`, `k8s-openapi`, `async-nats`, `serde`, `serde_yaml`, `serde_json`, `tokio`, `tracing`, `tracing-subscriber`, `anyhow`. +- `src/main.rs` under 300 lines. +- `deploy/operator.yaml` — Deployment, ServiceAccount, ClusterRole, ClusterRoleBinding. +- `deploy/crd.yaml` — `Deployment` CRD for `iot.nationtech.io/v1alpha1`. + +**Behavior:** +1. Connect to NATS on startup (`NATS_URL` env, no auth). +2. Ensure JetStream KV bucket `desired-state` exists (create if not). +3. Watch `Deployment` CRs cluster-wide. +4. On reconcile: for each `device_id` in `spec.targetDevices`, write key `.` in `desired-state` bucket with the serialized Score (adjacently-tagged JSON per §5.5). +5. On CR delete: remove corresponding KV keys. +6. Update `.status.observedScoreString` with the string that was written (for human inspection and change detection). +7. Log every reconcile. + +**CRD schema:** +```yaml +spec: + targetDevices: [string] # required, at least 1 + score: + type: string # required, e.g. "PodmanV0" + data: object # required, Score-type-specific + rollout: + strategy: string # v0: only "Immediate" +status: + observedScoreString: string + conditions: [stdCondition] +``` + +**Self-verification:** +```bash +cd iot/iot-operator-v0 +cargo build && cargo clippy -- -D warnings + +# Test against k3d: +k3d cluster create iot-test --wait +kubectl apply -f deploy/crd.yaml +docker run -d --rm -p 4222:4222 -p 8222:8222 --name nats nats:latest -js +NATS_URL=nats://localhost:4222 RUST_LOG=info cargo run & +OP_PID=$! 
+ +sleep 3 +kubectl apply -f - < 'podman --version' +# Must be 4.4+ (target 4.9+) +ssh iot-agent@ 'systemctl --user is-active podman.socket' +# Must print "active" +ssh iot-agent@ 'loginctl show-user iot-agent | grep Linger=yes' +ssh iot-agent@ 'uname -m' +# Must print aarch64 +``` + +**Time limit:** 90 min agent time. + +**Forbidden:** Docker. x86_64 base images. Hard-coded credentials. + +### A3: Agent installer (Saturday) + +**Goal:** Deploy agent to Pi via SSH using aarch64-cross-compiled binary. + +**Prerequisites:** Agent binary exists (Sylvain writes Friday). + +**Deliverable:** `iot/iot-agent-v0/scripts/install.sh`: +1. Args: `--host `, `--device-id `, `--nats-url `, `--nats-user `, `--nats-pass

`. +2. Cross-builds for aarch64 using existing Harmony aarch64 toolchain. +3. `scp` binary to Pi, `sudo mv` to `/usr/local/bin/iot-agent`. +4. Templates `/etc/iot-agent/config.toml` from args. +5. Installs `/etc/systemd/system/iot-agent.service`. +6. `systemctl daemon-reload && systemctl enable --now iot-agent`. +7. Waits up to 15s for "connected to NATS" in journal. + +**systemd unit:** +```ini +[Unit] +Description=IoT Agent +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=iot-agent +ExecStart=/usr/local/bin/iot-agent +Restart=on-failure +RestartSec=5 +StandardOutput=journal +StandardError=journal +Environment=RUST_LOG=info + +[Install] +WantedBy=multi-user.target +``` + +**Self-verification:** +```bash +./install.sh --host --device-id pi-demo-01 \ + --nats-url nats://central:4222 \ + --nats-user iot-agent --nats-pass dev-shared-password +ssh iot-agent@ 'sudo systemctl status iot-agent' # active (running) +ssh iot-agent@ 'sudo journalctl -u iot-agent --since "2 minutes ago"' | grep "connected to NATS" +``` + +**Time limit:** 2 hours agent time. + +### A4: End-to-end demo script (Saturday) + +**Goal:** One command runs full demo flow. + +**Deliverable:** `iot/scripts/demo.sh`: +1. Verifies Pi reachable + agent running. +2. Applies `scripts/demo-deployment.yaml`. +3. Waits up to 120s for container on Pi (ssh + `podman ps`). +4. `curl http://:8080` — expects nginx page. +5. Deletes CR, waits up to 60s for removal. +6. Prints PASS or FAIL with per-phase timings. +7. Cleans up on failure. + +**Self-verification:** +```bash +./iot/scripts/demo.sh +# Ends with "PASS", total < 5 min +``` + +**Time limit:** 2 hours agent time. + +--- + +## 10. Anti-patterns the plan prevents + +- **Premature contract extraction.** No `iot-contracts` crate in v0. Inline types. Extract v0.1 when they've proven their shape through use. +- **Quadlet under deadline.** Direct `podman-api` for v0. 
Quadlet evaluation in v0.1+ (possibly via `podlet` crate for code generation). User systemd quirks are a real cost under deadline pressure. +- **Agent-driven refactors.** If an agent suggests "I could clean this up," say no. v0 ships first. +- **Harmony rewrite.** Use what fits. If something doesn't fit cleanly, document and work around. +- **Second device in v0.** One Pi Tuesday. Second in v0.1. +- **Dashboards/TUI/API for v0.** `kubectl` and `nats` CLI are v0 operator UX. Partner UX is `git push`. +- **OKD-cluster-as-device in v0 or v0.x.** Strategic roadmap, not execution plan. Keep focus. +- **Weekend overwork.** 2-3 hrs/day max Sat/Sun. Monday is where the hours are. + +--- + +## 11. If the demo doesn't work Monday night + +**Flaky polish (reconnect timing, status lag):** Ship Tuesday with minor caveats. Partner tolerates. + +**End-to-end happy path unreliable:** Push ship by half a day or full day. Broken demo hurts partner trust more than 1-day slip. Communicate early Tuesday morning. + +**Genuine architectural flaw (e.g., NATS KV watches lose events under load):** The walking skeleton has done its job — problem discovered cheap, not in week 3. Regroup Tuesday morning, push by 2-3 days, present to partner as "we found a design issue, here's the fix." They respect honesty. + +--- + +## 12. Post-Tuesday milestones + +**v0.1 (Wed-Fri week 2):** Hardening informed by v0 deployment. +- Harmony aarch64 compile properly fixed if we took a Friday shortcut. +- `iot-contracts` crate extracted (consolidate inline types). +- Second Pi added, regression-tested. +- Status aggregation in operator (CRD `.status.aggregate`). +- Inventory reporting from agent. +- Basic journald log streaming prototype. +- Field-readiness test suite running automated against a VM (power cycle, network-out, agent crash loop). +- Thesis document: 2 pages covering 3-year platform vision, written after seeing v0 run. + +**v0.2 (Mon-Fri week 3):** Auth layer. +- Zitadel service accounts per device. 
+- Device-side JWT Profile client. +- OpenBao JWT auth method configured. +- Auth callout service implementing the bearer-token NATS JWT minting pattern from the architecture doc. +- **Availability-favoring design:** auth callout caches OpenBao policy lookups; on OpenBao failure, cached permissions are used; NATS rejects only on actual token expiry. A reboot doesn't force re-verification more than a passing minute does. +- Scoping test suite. +- Shared credentials removed. +- Bootstrap flow: device has Zitadel URL + initial token on disk → fetches NATS config from OpenBao → connects to NATS. Device TOML narrows to minimal bootstrap-only config. + +**v0.3 (week 4+):** Scale + partner-driven features. +- Multiple workloads per device. +- Progressive rollout. +- Real log streaming. +- API service. +- Observability (Prometheus + Grafana). +- Automated field-readiness tests running on real Pi in CI. + +**v0.4+ (weeks 5-8, partner's 2-month target):** Production hardening. +- TPM-backed device keys. +- Scale testing with partner's real fleet size. +- Runbook maturation. +- First non-demo production deployment for end-customer. + +--- + +## 13. Tuesday partner conversation + +Don't just demo. Frame. Prepared talking points: + +**"Here's what we shipped today."** +- Git push → container on Pi. +- CRD surface they can start building against. +- No auth yet — shared credentials for internal use. + +**"Here's what's coming next week (v0.1)."** +- Harmony Score integration polished. +- Second Pi, multi-device demo. +- Status visibility. + +**"Here's what's coming week 2 (v0.2)."** +- Real authentication: Zitadel + OpenBao. +- Per-device scoped credentials. +- Production-grade security. + +**"Here's how we're doing it."** +- Walking skeleton: ship early, harden based on real use. +- We want them to start building against v0 today. +- Feedback from their real use shapes v0.1/v0.2 priorities. 
+ +**"Here's what we need from them."** +- Early feedback on the CRD surface — does it fit how they want to deploy? +- Access to a test Pi from their fleet (if available) for v0.1/v0.2 testing. +- Rough timeline for their application development so we can sequence hardening with them. + +This conversation is as much of Tuesday's deliverable as the running demo. Don't skip it. diff --git a/examples/example_linux_vm/Cargo.toml b/examples/example_linux_vm/Cargo.toml index f6620c7..09930c0 100644 --- a/examples/example_linux_vm/Cargo.toml +++ b/examples/example_linux_vm/Cargo.toml @@ -9,7 +9,7 @@ name = "example_linux_vm" path = "src/main.rs" [dependencies] -harmony = { path = "../../harmony" } +harmony = { path = "../../harmony", features = ["kvm"] } tokio.workspace = true log.workspace = true env_logger.workspace = true diff --git a/examples/kvm_okd_ha_cluster/Cargo.toml b/examples/kvm_okd_ha_cluster/Cargo.toml index b349f7f..1eb7e69 100644 --- a/examples/kvm_okd_ha_cluster/Cargo.toml +++ b/examples/kvm_okd_ha_cluster/Cargo.toml @@ -9,7 +9,7 @@ name = "kvm_okd_ha_cluster" path = "src/main.rs" [dependencies] -harmony = { path = "../../harmony" } +harmony = { path = "../../harmony", features = ["kvm"] } tokio.workspace = true log.workspace = true env_logger.workspace = true diff --git a/examples/kvm_vm_examples/Cargo.toml b/examples/kvm_vm_examples/Cargo.toml index d9b667b..63e1681 100644 --- a/examples/kvm_vm_examples/Cargo.toml +++ b/examples/kvm_vm_examples/Cargo.toml @@ -9,7 +9,7 @@ name = "kvm-vm-examples" path = "src/main.rs" [dependencies] -harmony = { path = "../../harmony" } +harmony = { path = "../../harmony", features = ["kvm"] } tokio.workspace = true log.workspace = true env_logger.workspace = true diff --git a/examples/opnsense_pair_integration/Cargo.toml b/examples/opnsense_pair_integration/Cargo.toml index 9966388..1fec669 100644 --- a/examples/opnsense_pair_integration/Cargo.toml +++ b/examples/opnsense_pair_integration/Cargo.toml @@ -9,7 +9,7 @@ name 
= "opnsense-pair-integration" path = "src/main.rs" [dependencies] -harmony = { path = "../../harmony" } +harmony = { path = "../../harmony", features = ["kvm"] } harmony_cli = { path = "../../harmony_cli" } harmony_inventory_agent = { path = "../../harmony_inventory_agent" } harmony_macros = { path = "../../harmony_macros" } diff --git a/examples/opnsense_vm_integration/Cargo.toml b/examples/opnsense_vm_integration/Cargo.toml index ac4ec1a..bf7317e 100644 --- a/examples/opnsense_vm_integration/Cargo.toml +++ b/examples/opnsense_vm_integration/Cargo.toml @@ -9,7 +9,7 @@ name = "opnsense-vm-integration" path = "src/main.rs" [dependencies] -harmony = { path = "../../harmony" } +harmony = { path = "../../harmony", features = ["kvm"] } harmony_cli = { path = "../../harmony_cli" } harmony_inventory_agent = { path = "../../harmony_inventory_agent" } harmony_macros = { path = "../../harmony_macros" } diff --git a/harmony/Cargo.toml b/harmony/Cargo.toml index 7caea1a..38f5497 100644 --- a/harmony/Cargo.toml +++ b/harmony/Cargo.toml @@ -6,6 +6,9 @@ readme.workspace = true license.workspace = true [features] +default = ["kvm"] +kvm = ["dep:virt"] +podman = ["dep:podman-api"] testing = [] [dependencies] @@ -86,7 +89,8 @@ inquire.workspace = true brocade = { path = "../brocade" } option-ext = "0.2.0" rand.workspace = true -virt = "0.4.3" +virt = { version = "0.4.3", optional = true } +podman-api = { version = "0.9", optional = true } [dev-dependencies] pretty_assertions.workspace = true diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs index ec3ad1d..47f570e 100644 --- a/harmony/src/domain/interpret/mod.rs +++ b/harmony/src/domain/interpret/mod.rs @@ -37,6 +37,7 @@ pub enum InterpretName { Custom(&'static str), RHOBAlerting, K8sIngress, + PodmanV0, } impl std::fmt::Display for InterpretName { @@ -70,6 +71,7 @@ impl std::fmt::Display for InterpretName { InterpretName::Custom(name) => f.write_str(name), InterpretName::RHOBAlerting => 
f.write_str("RHOBAlerting"), InterpretName::K8sIngress => f.write_str("K8sIngress"), + InterpretName::PodmanV0 => f.write_str("PodmanV0"), } } } diff --git a/harmony/src/modules/mod.rs b/harmony/src/modules/mod.rs index 1f47e8a..d1eb345 100644 --- a/harmony/src/modules/mod.rs +++ b/harmony/src/modules/mod.rs @@ -10,6 +10,7 @@ pub mod http; pub mod inventory; pub mod k3d; pub mod k8s; +#[cfg(feature = "kvm")] pub mod kvm; pub mod lamp; pub mod load_balancer; @@ -20,6 +21,8 @@ pub mod node_health; pub mod okd; pub mod openbao; pub mod opnsense; +#[cfg(feature = "podman")] +pub mod podman; pub mod postgresql; pub mod prometheus; pub mod storage; diff --git a/harmony/src/modules/podman/interpret.rs b/harmony/src/modules/podman/interpret.rs new file mode 100644 index 0000000..6fda65f --- /dev/null +++ b/harmony/src/modules/podman/interpret.rs @@ -0,0 +1,67 @@ +use async_trait::async_trait; +use harmony_types::id::Id; +use log::info; + +use crate::{ + data::Version, + interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome}, + inventory::Inventory, + topology::Topology, +}; + +use super::score::PodmanV0Score; + +#[derive(Debug)] +pub struct PodmanV0Interpret { + score: PodmanV0Score, + version: Version, + status: InterpretStatus, +} + +impl PodmanV0Interpret { + pub fn new(score: PodmanV0Score) -> Self { + let version = Version::from("0.1.0").expect("Version should be valid"); + Self { + score, + version, + status: InterpretStatus::QUEUED, + } + } +} + +#[async_trait] +impl Interpret for PodmanV0Interpret { + fn get_name(&self) -> InterpretName { + InterpretName::PodmanV0 + } + + fn get_version(&self) -> Version { + self.version.clone() + } + + fn get_status(&self) -> InterpretStatus { + self.status.clone() + } + + fn get_children(&self) -> Vec { + vec![] + } + + async fn execute( + &self, + _inventory: &Inventory, + _topology: &T, + ) -> Result { + for service in &self.score.services { + info!( + "PodmanV0: would create container '{}' from image 
'{}' with ports {:?}", + service.name, service.image, service.ports + ); + } + + Ok(Outcome::success(format!( + "PodmanV0: {} services would be deployed (stub)", + self.score.services.len() + ))) + } +} diff --git a/harmony/src/modules/podman/mod.rs b/harmony/src/modules/podman/mod.rs new file mode 100644 index 0000000..7ae0e42 --- /dev/null +++ b/harmony/src/modules/podman/mod.rs @@ -0,0 +1,5 @@ +mod interpret; +mod score; + +pub use interpret::PodmanV0Interpret; +pub use score::{IotScore, PodmanService, PodmanV0Score}; diff --git a/harmony/src/modules/podman/score.rs b/harmony/src/modules/podman/score.rs new file mode 100644 index 0000000..a39c010 --- /dev/null +++ b/harmony/src/modules/podman/score.rs @@ -0,0 +1,87 @@ +use serde::{Deserialize, Serialize}; + +use crate::{interpret::Interpret, score::Score, topology::Topology}; + +use super::interpret::PodmanV0Interpret; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct PodmanService { + pub name: String, + pub image: String, + pub ports: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct PodmanV0Score { + pub services: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "type", content = "data")] +pub enum IotScore { + PodmanV0(PodmanV0Score), +} + +impl Score for PodmanV0Score { + fn create_interpret(&self) -> Box> { + Box::new(PodmanV0Interpret::new(self.clone())) + } + + fn name(&self) -> String { + "PodmanV0Score".to_string() + } +} + +impl Score for IotScore { + fn create_interpret(&self) -> Box> { + match self { + IotScore::PodmanV0(score) => score.create_interpret(), + } + } + + fn name(&self) -> String { + match self { + IotScore::PodmanV0(_) => "PodmanV0Score".to_string(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn podman_v0_score_serializes_with_adjacent_tag() { + let score = IotScore::PodmanV0(PodmanV0Score { + services: vec![PodmanService { + name: "web".to_string(), + image: 
"nginx:latest".to_string(), + ports: vec!["8080:80".to_string()], + }], + }); + let json = serde_json::to_string(&score).unwrap(); + assert!(json.contains("\"type\":\"PodmanV0\"")); + assert!(json.contains("\"data\"")); + } + + #[test] + fn podman_v0_score_roundtrip() { + let score = IotScore::PodmanV0(PodmanV0Score { + services: vec![ + PodmanService { + name: "web".to_string(), + image: "nginx:latest".to_string(), + ports: vec!["8080:80".to_string()], + }, + PodmanService { + name: "api".to_string(), + image: "myapp:1.0".to_string(), + ports: vec!["3000:3000".to_string(), "9090:9090".to_string()], + }, + ], + }); + let serialized = serde_json::to_string(&score).unwrap(); + let deserialized: IotScore = serde_json::from_str(&serialized).unwrap(); + assert_eq!(score, deserialized); + } +} diff --git a/iot/iot-agent-v0/Cargo.toml b/iot/iot-agent-v0/Cargo.toml new file mode 100644 index 0000000..c56d847 --- /dev/null +++ b/iot/iot-agent-v0/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "iot-agent-v0" +version = "0.1.0" +edition = "2024" +rust-version = "1.85" + +[dependencies] +harmony = { path = "../../harmony", default-features = false, features = ["podman"] } +async-nats = { workspace = true } +chrono = { workspace = true } +futures-util = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +anyhow = { workspace = true } +clap = { workspace = true } +toml = { workspace = true } \ No newline at end of file diff --git a/iot/iot-agent-v0/src/config.rs b/iot/iot-agent-v0/src/config.rs new file mode 100644 index 0000000..7dd4ae7 --- /dev/null +++ b/iot/iot-agent-v0/src/config.rs @@ -0,0 +1,68 @@ +use serde::Deserialize; +use std::path::Path; + +#[derive(Debug, Clone, Deserialize)] +pub struct AgentConfig { + pub agent: AgentSection, + pub nats: NatsSection, + pub credentials: CredentialsSection, +} + +#[derive(Debug, Clone, 
Deserialize)] +pub struct AgentSection { + pub device_id: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct NatsSection { + pub urls: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct CredentialsSection { + #[serde(rename = "type")] + pub source_type: String, + pub nats_user: Option, + pub nats_pass: Option, +} + +pub trait CredentialSource: Send + Sync { + fn nats_credentials(&self) -> anyhow::Result<(String, String)>; +} + +pub struct TomlFileCredentialSource<'a> { + config: &'a AgentConfig, +} + +impl<'a> TomlFileCredentialSource<'a> { + pub fn new(config: &'a AgentConfig) -> Self { + Self { config } + } +} + +impl CredentialSource for TomlFileCredentialSource<'_> { + fn nats_credentials(&self) -> anyhow::Result<(String, String)> { + let creds = &self.config.credentials; + if creds.source_type != "toml-shared" { + anyhow::bail!( + "unsupported credentials.type '{}' (v0 only supports 'toml-shared')", + creds.source_type + ); + } + let user = creds + .nats_user + .as_deref() + .ok_or_else(|| anyhow::anyhow!("missing nats_user in credentials"))?; + let pass = creds + .nats_pass + .as_deref() + .ok_or_else(|| anyhow::anyhow!("missing nats_pass in credentials"))?; + Ok((user.to_string(), pass.to_string())) + } +} + +pub fn load_config(path: &Path) -> anyhow::Result { + let content = std::fs::read_to_string(path)?; + let config: AgentConfig = toml::from_str(&content)?; + Ok(config) +} diff --git a/iot/iot-agent-v0/src/main.rs b/iot/iot-agent-v0/src/main.rs new file mode 100644 index 0000000..23518af --- /dev/null +++ b/iot/iot-agent-v0/src/main.rs @@ -0,0 +1,141 @@ +mod config; + +use std::time::Duration; + +use anyhow::Result; +use clap::Parser; +use config::{AgentConfig, CredentialSource, TomlFileCredentialSource}; +use futures_util::StreamExt; + +use harmony::modules::podman::IotScore; + +#[derive(Parser)] +#[command(name = "iot-agent-v0", about = "IoT agent for Raspberry Pi devices")] +struct Cli { + #[arg( + long, + env = 
"IOT_AGENT_CONFIG", + default_value = "/etc/iot-agent/config.toml" + )] + config: std::path::PathBuf, +} + +async fn connect_nats(cfg: &AgentConfig) -> Result { + let (user, pass) = TomlFileCredentialSource::new(cfg).nats_credentials()?; + let client = async_nats::ConnectOptions::with_user_and_password(user, pass) + .ping_interval(Duration::from_secs(10)) + .connect(cfg.nats.urls.as_slice()) + .await?; + tracing::info!(urls = ?cfg.nats.urls, "connected to NATS"); + Ok(client) +} + +async fn watch_desired_state(client: async_nats::Client, device_id: String) -> Result<()> { + let jetstream = async_nats::jetstream::new(client); + let bucket = jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: "desired-state".to_string(), + ..Default::default() + }) + .await?; + + let key_filter = format!("{device_id}.>"); + tracing::info!(filter = %key_filter, "watching KV keys"); + + let mut watch = bucket.watch(&key_filter).await?; + while let Some(result) = watch.next().await { + let entry = match result { + Ok(e) => e, + Err(e) => { + tracing::warn!(error = %e, "watch error"); + continue; + } + }; + match entry.operation { + async_nats::jetstream::kv::Operation::Put => { + match serde_json::from_slice::(&entry.value) { + Ok(score) => { + tracing::info!(key = %entry.key, score = ?score, "received desired state"); + } + Err(e) => { + tracing::warn!( + key = %entry.key, + error = %e, + "failed to deserialize score" + ); + } + } + } + async_nats::jetstream::kv::Operation::Delete + | async_nats::jetstream::kv::Operation::Purge => { + tracing::info!(key = %entry.key, "desired state removed"); + } + } + } + Ok(()) +} + +async fn report_status(client: async_nats::Client, device_id: String) -> Result<()> { + let jetstream = async_nats::jetstream::new(client); + let bucket = jetstream + .create_key_value(async_nats::jetstream::kv::Config { + bucket: "agent-status".to_string(), + ..Default::default() + }) + .await?; + + let key = format!("status.{}", device_id); + 
let mut interval = tokio::time::interval(Duration::from_secs(30)); + + loop { + interval.tick().await; + let status = serde_json::json!({ + "device_id": device_id, + "status": "running", + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + bucket.put(&key, status.to_string().into()).await?; + tracing::debug!(key = %key, "reported status"); + } +} + +#[tokio::main] +async fn main() -> Result<()> { + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); + + let cli = Cli::parse(); + let cfg = config::load_config(&cli.config)?; + tracing::info!(device_id = %cfg.agent.device_id, "iot-agent-v0 starting"); + + let device_id = cfg.agent.device_id.clone(); + let client = connect_nats(&cfg).await?; + + let client_watch = client.clone(); + let device_id_watch = device_id.clone(); + + let ctrlc = async { + tokio::signal::ctrl_c().await.ok(); + tracing::info!("received SIGINT, shutting down"); + }; + let sigterm = async { + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())? + .recv() + .await; + tracing::info!("received SIGTERM, shutting down"); + Ok::<(), anyhow::Error>(()) + }; + + let watch = watch_desired_state(client_watch, device_id_watch); + let status = report_status(client, device_id); + + tokio::select! 
{ + _ = ctrlc => {}, + r = sigterm => { r?; } + r = watch => { r?; } + r = status => { r?; } + } + + Ok(()) +} diff --git a/iot/iot-operator-v0/Cargo.toml b/iot/iot-operator-v0/Cargo.toml new file mode 100644 index 0000000..c96e06e --- /dev/null +++ b/iot/iot-operator-v0/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "iot-operator-v0" +version = "0.1.0" +edition = "2024" +rust-version = "1.85" + +[dependencies] +kube = { workspace = true, features = ["runtime", "derive"] } +k8s-openapi.workspace = true +async-nats = { workspace = true } +serde.workspace = true +serde_json.workspace = true +serde_yaml.workspace = true +schemars = "0.8.22" +tokio.workspace = true +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +anyhow.workspace = true +clap.workspace = true +futures-util = { workspace = true } +thiserror.workspace = true \ No newline at end of file diff --git a/iot/iot-operator-v0/deploy/crd.yaml b/iot/iot-operator-v0/deploy/crd.yaml new file mode 100644 index 0000000..c713cef --- /dev/null +++ b/iot/iot-operator-v0/deploy/crd.yaml @@ -0,0 +1,71 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: deployments.iot.nationtech.io +spec: + group: iot.nationtech.io + names: + categories: [] + kind: Deployment + plural: deployments + shortNames: + - iotdep + singular: deployment + scope: Namespaced + versions: + - additionalPrinterColumns: [] + name: v1alpha1 + schema: + openAPIV3Schema: + description: Auto-generated derived type for DeploymentSpec via `CustomResource` + properties: + spec: + properties: + rollout: + properties: + strategy: + enum: + - Immediate + type: string + required: + - strategy + type: object + score: + properties: + data: + x-kubernetes-preserve-unknown-fields: true + type: + minLength: 1 + type: string + required: + - data + - type + type: object + x-kubernetes-validations: + - message: score.type must be a valid Rust identifier matching the struct name of the score variant (e.g. 
PodmanV0) + rule: self.type.matches('^[A-Za-z_][A-Za-z0-9_]*$') + targetDevices: + items: + type: string + type: array + required: + - rollout + - score + - targetDevices + type: object + status: + nullable: true + properties: + observedScoreString: + nullable: true + type: string + type: object + required: + - spec + title: Deployment + type: object + served: true + storage: true + subresources: + status: {} + diff --git a/iot/iot-operator-v0/deploy/operator.yaml b/iot/iot-operator-v0/deploy/operator.yaml new file mode 100644 index 0000000..aae6f6d --- /dev/null +++ b/iot/iot-operator-v0/deploy/operator.yaml @@ -0,0 +1,75 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: iot-system +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: iot-operator + namespace: iot-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: iot-operator +rules: + - apiGroups: ["iot.nationtech.io"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "patch", "update"] + - apiGroups: ["iot.nationtech.io"] + resources: ["deployments/status"] + verbs: ["get", "patch", "update"] + - apiGroups: ["iot.nationtech.io"] + resources: ["deployments/finalizers"] + verbs: ["update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: iot-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: iot-operator +subjects: + - kind: ServiceAccount + name: iot-operator + namespace: iot-system +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: iot-operator + namespace: iot-system + labels: + app.kubernetes.io/name: iot-operator +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: iot-operator + template: + metadata: + labels: + app.kubernetes.io/name: iot-operator + spec: + serviceAccountName: iot-operator + containers: + - name: operator + image: ghcr.io/nationtech/iot-operator-v0:latest + imagePullPolicy: IfNotPresent + env: + - name: NATS_URL + 
value: nats://nats.iot-system.svc.cluster.local:4222 + - name: KV_BUCKET + value: desired-state + - name: RUST_LOG + value: info + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 500m + memory: 256Mi diff --git a/iot/iot-operator-v0/src/controller.rs b/iot/iot-operator-v0/src/controller.rs new file mode 100644 index 0000000..5c0eb09 --- /dev/null +++ b/iot/iot-operator-v0/src/controller.rs @@ -0,0 +1,136 @@ +use std::sync::Arc; +use std::time::Duration; + +use async_nats::jetstream::kv::Store; +use futures_util::StreamExt; +use kube::api::{Patch, PatchParams}; +use kube::runtime::Controller; +use kube::runtime::controller::Action; +use kube::runtime::finalizer::{Event as FinalizerEvent, finalizer}; +use kube::runtime::watcher::Config as WatcherConfig; +use kube::{Api, Client, ResourceExt}; +use serde_json::json; + +use crate::crd::{Deployment, DeploymentStatus, ScorePayload}; + +const FINALIZER: &str = "iot.nationtech.io/finalizer"; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("kube api: {0}")] + Kube(#[from] kube::Error), + #[error("nats kv: {0}")] + Kv(String), + #[error("serde: {0}")] + Serde(#[from] serde_json::Error), + #[error("missing namespace on resource")] + MissingNamespace, + #[error("missing target devices")] + MissingTargets, +} + +pub struct Context { + pub client: Client, + pub kv: Store, +} + +pub async fn run(client: Client, kv: Store) -> anyhow::Result<()> { + let api: Api = Api::all(client.clone()); + let ctx = Arc::new(Context { client, kv }); + + tracing::info!("starting Deployment controller"); + Controller::new(api, WatcherConfig::default()) + .run(reconcile, error_policy, ctx) + .for_each(|res| async move { + match res { + Ok((obj, _)) => tracing::debug!(?obj, "reconciled"), + Err(e) => tracing::warn!(error = %e, "reconcile error"), + } + }) + .await; + Ok(()) +} + +async fn reconcile(obj: Arc, ctx: Arc) -> Result { + let ns = obj.namespace().ok_or(Error::MissingNamespace)?; + let name = 
obj.name_any(); + tracing::info!(%ns, %name, "reconcile"); + + let api: Api = Api::namespaced(ctx.client.clone(), &ns); + finalizer(&api, FINALIZER, obj, |event| async { + match event { + FinalizerEvent::Apply(d) => apply(d, &api, &ctx.kv).await, + FinalizerEvent::Cleanup(d) => cleanup(d, &ctx.kv).await, + } + }) + .await + .map_err(|e| match e { + kube::runtime::finalizer::Error::ApplyFailed(e) + | kube::runtime::finalizer::Error::CleanupFailed(e) => e, + kube::runtime::finalizer::Error::AddFinalizer(e) + | kube::runtime::finalizer::Error::RemoveFinalizer(e) => Error::Kube(e), + kube::runtime::finalizer::Error::UnnamedObject => Error::Kv("unnamed object".into()), + kube::runtime::finalizer::Error::InvalidFinalizer => Error::Kv("invalid finalizer".into()), + }) +} + +async fn apply(obj: Arc, api: &Api, kv: &Store) -> Result { + let name = obj.name_any(); + if obj.spec.target_devices.is_empty() { + return Err(Error::MissingTargets); + } + let score_json = serialize_score(&obj.spec.score)?; + + let already_observed = obj + .status + .as_ref() + .and_then(|s| s.observed_score_string.as_deref()) + == Some(score_json.as_str()); + if already_observed { + tracing::debug!(%name, "score unchanged; skipping KV write and status patch"); + return Ok(Action::requeue(Duration::from_secs(300))); + } + + for device_id in &obj.spec.target_devices { + let key = kv_key(device_id, &name); + kv.put(key.clone(), score_json.clone().into_bytes().into()) + .await + .map_err(|e| Error::Kv(e.to_string()))?; + tracing::info!(%key, "wrote desired state"); + } + + let status = json!({ + "status": DeploymentStatus { + observed_score_string: Some(score_json), + } + }); + api.patch_status(&name, &PatchParams::default(), &Patch::Merge(&status)) + .await?; + + Ok(Action::requeue(Duration::from_secs(300))) +} + +async fn cleanup(obj: Arc, kv: &Store) -> Result { + let name = obj.name_any(); + for device_id in &obj.spec.target_devices { + let key = kv_key(device_id, &name); + kv.delete(&key) + .await 
+ .map_err(|e| Error::Kv(e.to_string()))?; + tracing::info!(%key, "deleted desired state"); + } + Ok(Action::await_change()) +} + +fn serialize_score(score: &ScorePayload) -> Result { + Ok(serde_json::to_string(score)?) +} + +fn kv_key(device_id: &str, deployment_name: &str) -> String { + format!("{device_id}.{deployment_name}") +} + +fn error_policy(_obj: Arc, err: &Error, _ctx: Arc) -> Action { + tracing::warn!(error = %err, "requeueing after error"); + Action::requeue(Duration::from_secs(30)) +} diff --git a/iot/iot-operator-v0/src/crd.rs b/iot/iot-operator-v0/src/crd.rs new file mode 100644 index 0000000..f815ac7 --- /dev/null +++ b/iot/iot-operator-v0/src/crd.rs @@ -0,0 +1,105 @@ +use kube::CustomResource; +use schemars::JsonSchema; +use schemars::schema::{ + InstanceType, ObjectValidation, Schema, SchemaObject, SingleOrVec, StringValidation, +}; +use serde::{Deserialize, Serialize}; + +#[derive(CustomResource, Serialize, Deserialize, Clone, Debug, JsonSchema)] +#[kube( + group = "iot.nationtech.io", + version = "v1alpha1", + kind = "Deployment", + plural = "deployments", + shortname = "iotdep", + namespaced, + status = "DeploymentStatus" +)] +#[serde(rename_all = "camelCase")] +pub struct DeploymentSpec { + pub target_devices: Vec, + #[schemars(schema_with = "score_payload_schema")] + pub score: ScorePayload, + pub rollout: Rollout, +} + +#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema)] +pub struct ScorePayload { + #[serde(rename = "type")] + pub type_: String, + pub data: serde_json::Value, +} + +/// Hand-rolled schema for `ScorePayload` so we can attach two apiserver +/// concessions that `schemars` can't derive: +/// +/// 1. `x-kubernetes-preserve-unknown-fields: true` on `data` — the payload +/// is routed opaquely; its shape is enforced on-device by the agent's +/// typed `IotScore` deserialization, not by the apiserver. +/// 2. 
An `x-kubernetes-validations` CEL rule on the enclosing `score` object +/// requiring `type` to be a valid Rust identifier, so typos (`"pdoman"`) +/// are rejected at `kubectl apply` time rather than silently reaching +/// the agent. This validates the *shape* of the discriminator without +/// listing the known variant catalog — the operator stays a generic +/// router (v0.3+ can add `OkdApplyV0` etc. without an operator release). +fn score_payload_schema(_: &mut schemars::r#gen::SchemaGenerator) -> Schema { + let type_schema = Schema::Object(SchemaObject { + instance_type: Some(SingleOrVec::Single(Box::new(InstanceType::String))), + string: Some(Box::new(StringValidation { + min_length: Some(1), + ..Default::default() + })), + ..Default::default() + }); + + let mut data_schema = SchemaObject::default(); + data_schema.extensions.insert( + "x-kubernetes-preserve-unknown-fields".to_string(), + serde_json::Value::Bool(true), + ); + + let object = ObjectValidation { + required: ["type".to_string(), "data".to_string()] + .into_iter() + .collect(), + properties: [ + ("type".to_string(), type_schema), + ("data".to_string(), Schema::Object(data_schema)), + ] + .into_iter() + .collect(), + ..Default::default() + }; + + let mut obj = SchemaObject { + instance_type: Some(SingleOrVec::Single(Box::new(InstanceType::Object))), + object: Some(Box::new(object)), + ..Default::default() + }; + obj.extensions.insert( + "x-kubernetes-validations".to_string(), + serde_json::json!([{ + "rule": "self.type.matches('^[A-Za-z_][A-Za-z0-9_]*$')", + "message": "score.type must be a valid Rust identifier matching the struct name of the score variant (e.g. 
PodmanV0)" + }]), + ); + + Schema::Object(obj) +} + +#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema)] +pub struct Rollout { + pub strategy: RolloutStrategy, +} + +#[derive(Serialize, Deserialize, Clone, Debug, JsonSchema, PartialEq, Eq)] +pub enum RolloutStrategy { + Immediate, +} + +#[derive(Serialize, Deserialize, Clone, Debug, Default, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct DeploymentStatus { + #[serde(skip_serializing_if = "Option::is_none")] + pub observed_score_string: Option, +} diff --git a/iot/iot-operator-v0/src/main.rs b/iot/iot-operator-v0/src/main.rs new file mode 100644 index 0000000..6ee5a78 --- /dev/null +++ b/iot/iot-operator-v0/src/main.rs @@ -0,0 +1,75 @@ +mod controller; +mod crd; + +use anyhow::Result; +use async_nats::jetstream; +use clap::{Parser, Subcommand}; +use kube::{Client, CustomResourceExt}; + +use crate::crd::Deployment; + +#[derive(Parser)] +#[command( + name = "iot-operator-v0", + about = "IoT operator — Deployment CRD → NATS KV" +)] +struct Cli { + #[command(subcommand)] + command: Option, + + #[arg( + long, + env = "NATS_URL", + default_value = "nats://localhost:4222", + global = true + )] + nats_url: String, + + #[arg( + long, + env = "KV_BUCKET", + default_value = "desired-state", + global = true + )] + kv_bucket: String, +} + +#[derive(Subcommand)] +enum Command { + /// Run the controller (default when no subcommand is given). + Run, + /// Print the Deployment CRD as YAML. 
+ GenCrd, +} + +#[tokio::main] +async fn main() -> Result<()> { + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); + + let cli = Cli::parse(); + match cli.command.unwrap_or(Command::Run) { + Command::GenCrd => { + println!("{}", serde_yaml::to_string(&Deployment::crd())?); + Ok(()) + } + Command::Run => run(&cli.nats_url, &cli.kv_bucket).await, + } +} + +async fn run(nats_url: &str, bucket: &str) -> Result<()> { + let nats = async_nats::connect(nats_url).await?; + tracing::info!(url = %nats_url, "connected to NATS"); + let js = jetstream::new(nats); + let kv = js + .create_key_value(jetstream::kv::Config { + bucket: bucket.to_string(), + ..Default::default() + }) + .await?; + tracing::info!(bucket = %bucket, "KV bucket ready"); + + let client = Client::try_default().await?; + controller::run(client, kv).await +} diff --git a/iot/scripts/smoke-a1.sh b/iot/scripts/smoke-a1.sh new file mode 100755 index 0000000..805ea20 --- /dev/null +++ b/iot/scripts/smoke-a1.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash +# A1 smoke test — the end-to-end verification from ROADMAP/iot_platform/v0_walking_skeleton.md §9.A1. +# +# Deployment CR ──apply──▶ operator ──KV put──▶ NATS JetStream +# │ +# nats kv get ◀┘ +# +# Stands up a NATS server container + a k3d cluster, runs the operator against +# them, applies a test CR, asserts the key appears in NATS KV, deletes the CR, +# asserts the key disappears. Everything is torn down in the cleanup trap. +# +# Requirements on the host: +# - podman (rootless OK) +# - cargo (for building/running the operator) +# - kubectl +# - a k3d binary (defaults to Harmony's downloaded copy) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +OPERATOR_DIR="$REPO_ROOT/iot/iot-operator-v0" + +K3D_BIN="${K3D_BIN:-$HOME/.local/share/harmony/k3d/k3d}" +CLUSTER_NAME="${CLUSTER_NAME:-iot-smoke}" +NATS_CONTAINER="${NATS_CONTAINER:-iot-smoke-nats}" +NATS_NET_NAME="${NATS_NET_NAME:-iot-smoke-net}" +NATS_IMAGE="${NATS_IMAGE:-docker.io/library/nats:2.10-alpine}" +NATSBOX_IMAGE="${NATSBOX_IMAGE:-docker.io/natsio/nats-box:latest}" +NATS_PORT="${NATS_PORT:-4222}" +TARGET_DEVICE="${TARGET_DEVICE:-pi-demo-01}" +DEPLOY_NAME="${DEPLOY_NAME:-hello-world}" +DEPLOY_NS="${DEPLOY_NS:-iot-demo}" + +OPERATOR_LOG="$(mktemp -t iot-operator.XXXXXX.log)" +OPERATOR_PID="" +KUBECONFIG_FILE="" + +log() { printf '\033[1;34m[smoke]\033[0m %s\n' "$*"; } +fail() { printf '\033[1;31m[smoke FAIL]\033[0m %s\n' "$*" >&2; exit 1; } + +cleanup() { + local rc=$? + log "cleanup…" + if [[ -n "$OPERATOR_PID" ]] && kill -0 "$OPERATOR_PID" 2>/dev/null; then + kill "$OPERATOR_PID" 2>/dev/null || true + wait "$OPERATOR_PID" 2>/dev/null || true + fi + if [[ "${KEEP:-0}" != "1" ]]; then + "$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true + podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true + podman network rm "$NATS_NET_NAME" >/dev/null 2>&1 || true + [[ -n "$KUBECONFIG_FILE" ]] && rm -f "$KUBECONFIG_FILE" + else + log "KEEP=1 — leaving cluster '$CLUSTER_NAME' and container '$NATS_CONTAINER' running" + log "KUBECONFIG=$KUBECONFIG_FILE" + fi + if [[ $rc -ne 0 ]]; then + log "operator log at $OPERATOR_LOG" + echo "----- operator log tail -----" + tail -n 60 "$OPERATOR_LOG" 2>/dev/null || true + else + rm -f "$OPERATOR_LOG" + fi + exit $rc +} +trap cleanup EXIT INT TERM + +require() { command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"; } +require podman +require cargo +require kubectl +[[ -x "$K3D_BIN" ]] || fail "k3d binary not executable at $K3D_BIN (set K3D_BIN=…)" + +natsbox() { + podman run --rm --network "$NATS_NET_NAME" "$NATSBOX_IMAGE" \ + nats --server "nats://$NATS_CONTAINER:$NATS_PORT" "$@" +} + 
+############################################################################### +# phase 1 — NATS +############################################################################### +log "phase 1: start NATS" +podman network exists "$NATS_NET_NAME" || podman network create "$NATS_NET_NAME" >/dev/null +podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true +podman run -d \ + --name "$NATS_CONTAINER" \ + --network "$NATS_NET_NAME" \ + -p "$NATS_PORT:4222" \ + "$NATS_IMAGE" -js >/dev/null +log "waiting for NATS" +for _ in $(seq 1 30); do + if podman run --rm --network "$NATS_NET_NAME" "$NATSBOX_IMAGE" \ + nats --server "nats://$NATS_CONTAINER:4222" server check connection >/dev/null 2>&1; then + break + fi + sleep 1 +done +natsbox server check connection >/dev/null || fail "NATS never became ready" + +############################################################################### +# phase 2 — k3d cluster + CRD +############################################################################### +log "phase 2: create k3d cluster '$CLUSTER_NAME'" +"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true +"$K3D_BIN" cluster create "$CLUSTER_NAME" --wait --timeout 90s >/dev/null + +KUBECONFIG_FILE="$(mktemp -t iot-smoke-kubeconfig.XXXXXX)" +"$K3D_BIN" kubeconfig get "$CLUSTER_NAME" > "$KUBECONFIG_FILE" +export KUBECONFIG="$KUBECONFIG_FILE" + +log "generate + apply CRD" +( cd "$OPERATOR_DIR" && cargo run -q -- gen-crd ) | kubectl apply -f - >/dev/null +kubectl wait --for=condition=Established "crd/deployments.iot.nationtech.io" --timeout=30s >/dev/null + +kubectl get ns "$DEPLOY_NS" >/dev/null 2>&1 || kubectl create namespace "$DEPLOY_NS" >/dev/null + +############################################################################### +# phase 2b — CEL discriminator guardrail: an invalid score.type must be rejected +# by the apiserver (tests x-kubernetes-validations on spec.score) +############################################################################### +log "phase 
2b: apiserver rejects invalid score.type"
+BAD_CR=$(cat <<EOF
+apiVersion: iot.nationtech.io/v1alpha1
+kind: Deployment
+metadata:
+  name: bad-discriminator
+  namespace: $DEPLOY_NS
+spec:
+  targetDevices: [$TARGET_DEVICE]
+  score:
+    type: "has spaces"
+    data: {}
+  rollout:
+    strategy: Immediate
+EOF
+)
+BAD_OUT="$(echo "$BAD_CR" | kubectl apply -f - 2>&1 || true)"
+if echo "$BAD_OUT" | grep -q "must be a valid Rust identifier"; then
+  log "apiserver rejected invalid discriminator as expected"
+else
+  fail "expected CEL rejection for score.type='has spaces'; got: $BAD_OUT"
+fi
+# Belt-and-braces: make sure nothing was persisted
+if kubectl -n "$DEPLOY_NS" get deployment.iot.nationtech.io bad-discriminator >/dev/null 2>&1; then
+  kubectl -n "$DEPLOY_NS" delete deployment.iot.nationtech.io bad-discriminator >/dev/null 2>&1 || true
+  fail "apiserver should have rejected 'bad-discriminator' but it was persisted"
+fi
+
+###############################################################################
+# phase 3 — operator
+###############################################################################
+log "phase 3: start operator"
+(
+  cd "$OPERATOR_DIR"
+  cargo build -q
+)
+NATS_URL="nats://127.0.0.1:$NATS_PORT" \
+KV_BUCKET="desired-state" \
+RUST_LOG="info,kube_runtime=warn" \
+  "$REPO_ROOT/target/debug/iot-operator-v0" \
+  >"$OPERATOR_LOG" 2>&1 &
+OPERATOR_PID=$!
+log "operator pid=$OPERATOR_PID (log: $OPERATOR_LOG)"
+
+for _ in $(seq 1 30); do
+  if grep -q "starting Deployment controller" "$OPERATOR_LOG"; then break; fi
+  if ! 
kill -0 "$OPERATOR_PID" 2>/dev/null; then fail "operator exited early"; fi
+  sleep 0.5
+done
+grep -q "starting Deployment controller" "$OPERATOR_LOG" \
+  || fail "operator never logged 'starting Deployment controller'"
+grep -q "KV bucket ready" "$OPERATOR_LOG" \
+  || fail "operator never confirmed KV bucket ready"
+
+###############################################################################
+# phase 4 — apply Deployment CR
+###############################################################################
+log "phase 4: apply Deployment CR"
+cat <<EOF | kubectl apply -f - >/dev/null
+apiVersion: iot.nationtech.io/v1alpha1
+kind: Deployment
+metadata:
+  name: $DEPLOY_NAME
+  namespace: $DEPLOY_NS
+spec:
+  targetDevices: [$TARGET_DEVICE]
+  score:
+    type: PodmanV0
+    data:
+      services:
+        - name: hello
+          image: docker.io/library/nginx:alpine
+          ports: ["8080:80"]
+  rollout:
+    strategy: Immediate
+EOF
+
+log "wait for KV key $TARGET_DEVICE.$DEPLOY_NAME"
+KV_VALUE=""
+for _ in $(seq 1 30); do
+  if KV_VALUE="$(natsbox kv get desired-state "$TARGET_DEVICE.$DEPLOY_NAME" --raw 2>/dev/null)"; then
+    [[ -n "$KV_VALUE" ]] && break
+  fi
+  sleep 1
+done
+[[ -n "$KV_VALUE" ]] || fail "KV key never appeared"
+echo "$KV_VALUE" | grep -q '"type":"PodmanV0"' \
+  || fail "KV value missing \"type\":\"PodmanV0\" discriminator — got: $KV_VALUE"
+echo "$KV_VALUE" | grep -q '"image":"docker.io/library/nginx:alpine"' \
+  || fail "KV value missing nginx image — got: $KV_VALUE"
+
+log "wait for .status.observedScoreString"
+OBSERVED=""
+for _ in $(seq 1 30); do
+  OBSERVED="$(kubectl -n "$DEPLOY_NS" get deployment.iot.nationtech.io "$DEPLOY_NAME" \
+    -o jsonpath='{.status.observedScoreString}' 2>/dev/null || true)"
+  [[ -n "$OBSERVED" ]] && break
+  sleep 1
+done
+[[ -n "$OBSERVED" ]] || fail ".status.observedScoreString never set"
+[[ "$OBSERVED" == "$KV_VALUE" ]] \
+  || fail "observedScoreString does not match KV value:\n status=$OBSERVED\n kv =$KV_VALUE"
+
+############################################################################### +# phase 5 — delete CR, expect cleanup via finalizer +############################################################################### +log "phase 5: delete Deployment CR — finalizer should remove KV key" +kubectl -n "$DEPLOY_NS" delete deployment.iot.nationtech.io "$DEPLOY_NAME" --wait=true >/dev/null + +log "wait for KV key removal" +for _ in $(seq 1 30); do + if ! natsbox kv get desired-state "$TARGET_DEVICE.$DEPLOY_NAME" --raw >/dev/null 2>&1; then + log "KV key gone" + break + fi + sleep 1 +done +if natsbox kv get desired-state "$TARGET_DEVICE.$DEPLOY_NAME" --raw >/dev/null 2>&1; then + fail "KV key still present after CR delete" +fi + +log "PASS"