Files
harmony/iot/scripts/smoke-a1.sh
Jean-Gabriel Gill-Couture d21bdef050
Some checks are pending
Run Check Script / check (pull_request) Waiting to run
feat(iot-operator): CEL-validate score.type as a Rust identifier
The CRD previously accepted any string for `score.type`, so typos like
`"pdoman"` or `"PodmnV0"` would be persisted by the apiserver and only
surface on-device as agent-side deserialize warnings. That class of
failure is silent at admission time and hard to debug.

Replace the auto-derived schema for `ScorePayload` with a hand-rolled
one that keeps the same visible shape but adds two apiserver-level
guardrails:

- `score.type` gets `minLength: 1` and an `x-kubernetes-validations`
  CEL rule requiring it to match `^[A-Za-z_][A-Za-z0-9_]*$` — a valid
  Rust identifier, since score variants *are* Rust struct names in
  `harmony::modules::podman::IotScore`. Message points operators at
  the concrete example `PodmanV0`.
- `score.data` still carries only `x-kubernetes-preserve-unknown-
  fields: true`. The rule validates the discriminator's *shape*, not
  its *value*, so v0.3+ variants (OkdApplyV0, KubectlApplyV0) don't
  require an operator release — preserves ROADMAP §6.1's
  generic-router design.

The `x-kubernetes-preserve-unknown-fields` extension stays scoped to
`score.data` alone; every other field in the CRD has a strict schema.
The generated document contains exactly one preserve-unknown-fields
marker and exactly one validations block.

Smoke test extended: phase 2b applies a CR with `score.type: "has
spaces"` and asserts the apiserver rejects it with the CEL message
before the operator ever sees it. Positive phases (kubectl apply ->
NATS KV put -> status observed -> delete -> KV key removed) still
PASS end-to-end.

Matches the `preserve_arbitrary` pattern used by ArgoCD
(`Application.spec.source.helm.valuesObject`) and Flux
(`HelmRelease.spec.values`), both of which similarly use narrow
preserve-unknown-fields on a payload field without coupling the CRD
to their variant catalog.
2026-04-18 10:35:59 -04:00

244 lines
9.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# A1 smoke test — the end-to-end verification from ROADMAP/iot_platform/v0_walking_skeleton.md §9.A1.
#
# Deployment CR ──apply──▶ operator ──KV put──▶ NATS JetStream
# │
# nats kv get ◀┘
#
# Stands up a NATS server container + a k3d cluster, runs the operator against
# them, applies a test CR, asserts the key appears in NATS KV, deletes the CR,
# asserts the key disappears. Everything is torn down in the cleanup trap.
#
# Requirements on the host:
# - podman (rootless OK)
# - cargo (for building/running the operator)
# - kubectl
# - a k3d binary (defaults to Harmony's downloaded copy)
set -euo pipefail
# Resolve this script's directory so the test can be invoked from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
OPERATOR_DIR="$REPO_ROOT/iot/iot-operator-v0"
# Everything below is overridable from the environment (VAR=… ./smoke-a1.sh).
K3D_BIN="${K3D_BIN:-$HOME/.local/share/harmony/k3d/k3d}"
CLUSTER_NAME="${CLUSTER_NAME:-iot-smoke}"
NATS_CONTAINER="${NATS_CONTAINER:-iot-smoke-nats}"
NATS_NET_NAME="${NATS_NET_NAME:-iot-smoke-net}"
NATS_IMAGE="${NATS_IMAGE:-docker.io/library/nats:2.10-alpine}"
NATSBOX_IMAGE="${NATSBOX_IMAGE:-docker.io/natsio/nats-box:latest}"
# Host-side port published for the NATS container (container-internal port
# stays 4222 — see the `-p "$NATS_PORT:4222"` mapping in phase 1).
NATS_PORT="${NATS_PORT:-4222}"
TARGET_DEVICE="${TARGET_DEVICE:-pi-demo-01}"
DEPLOY_NAME="${DEPLOY_NAME:-hello-world}"
DEPLOY_NS="${DEPLOY_NS:-iot-demo}"
# Operator output is captured here; kept (and tailed) only on failure.
OPERATOR_LOG="$(mktemp -t iot-operator.XXXXXX.log)"
# Filled in later; cleanup() consults both, so they must exist from the start.
OPERATOR_PID=""
KUBECONFIG_FILE=""
# Informational progress line (bold-blue prefix) on stdout.
log() {
  printf '\033[1;34m[smoke]\033[0m %s\n' "$*"
}

# Abort the run: bold-red prefix on stderr, then exit 1 (fires the cleanup trap).
fail() {
  printf '\033[1;31m[smoke FAIL]\033[0m %s\n' "$*" >&2
  exit 1
}
# Runs on every exit path (EXIT/INT/TERM): stop the operator, tear down the
# cluster and NATS container/network (unless KEEP=1), and tail the operator
# log when the run failed so the cause is visible in CI output.
cleanup() {
  local exit_code=$?
  log "cleanup…"
  # Stop the backgrounded operator if it is still alive.
  if [[ -n "$OPERATOR_PID" ]]; then
    if kill -0 "$OPERATOR_PID" 2>/dev/null; then
      kill "$OPERATOR_PID" 2>/dev/null || true
      wait "$OPERATOR_PID" 2>/dev/null || true
    fi
  fi
  if [[ "${KEEP:-0}" == "1" ]]; then
    # Debug mode: leave the environment up for manual inspection.
    log "KEEP=1 — leaving cluster '$CLUSTER_NAME' and container '$NATS_CONTAINER' running"
    log "KUBECONFIG=$KUBECONFIG_FILE"
  else
    "$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
    podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
    podman network rm "$NATS_NET_NAME" >/dev/null 2>&1 || true
    [[ -n "$KUBECONFIG_FILE" ]] && rm -f "$KUBECONFIG_FILE"
  fi
  if [[ $exit_code -eq 0 ]]; then
    rm -f "$OPERATOR_LOG"
  else
    log "operator log at $OPERATOR_LOG"
    echo "----- operator log tail -----"
    tail -n 60 "$OPERATOR_LOG" 2>/dev/null || true
  fi
  exit $exit_code
}
trap cleanup EXIT INT TERM
# Assert that a tool is on PATH; abort the whole run otherwise.
require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    fail "missing required tool: $1"
  fi
}
# Host-tool preflight — everything below assumes these exist.
for tool in podman cargo kubectl; do
  require "$tool"
done
[[ -x "$K3D_BIN" ]] || fail "k3d binary not executable at $K3D_BIN (set K3D_BIN=…)"
# Run a one-shot `nats` CLI command against the smoke NATS server via a
# nats-box container attached to the same podman network.
#
# Inside the container network the server always listens on its default
# port 4222; $NATS_PORT is only the *host*-side publish mapping (see
# `podman run -p "$NATS_PORT:4222"` in phase 1). The previous version dialed
# "$NATS_CONTAINER:$NATS_PORT", which points at a port nothing listens on
# whenever NATS_PORT is overridden away from 4222.
natsbox() {
  podman run --rm --network "$NATS_NET_NAME" "$NATSBOX_IMAGE" \
    nats --server "nats://$NATS_CONTAINER:4222" "$@"
}
###############################################################################
# phase 1 — NATS
###############################################################################
log "phase 1: start NATS"
# Network + container are created idempotently: reuse the network, replace
# any leftover container from a previous run.
podman network exists "$NATS_NET_NAME" || podman network create "$NATS_NET_NAME" >/dev/null
podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
podman run -d \
  --name "$NATS_CONTAINER" \
  --network "$NATS_NET_NAME" \
  -p "$NATS_PORT:4222" \
  "$NATS_IMAGE" -js >/dev/null
log "waiting for NATS"
# Poll up to ~30s for the server to accept connections (container-internal
# port is always 4222, regardless of the host-side $NATS_PORT mapping).
for ((attempt = 0; attempt < 30; attempt++)); do
  if podman run --rm --network "$NATS_NET_NAME" "$NATSBOX_IMAGE" \
      nats --server "nats://$NATS_CONTAINER:4222" server check connection >/dev/null 2>&1; then
    break
  fi
  sleep 1
done
natsbox server check connection >/dev/null || fail "NATS never became ready"
###############################################################################
# phase 2 — k3d cluster + CRD
###############################################################################
log "phase 2: create k3d cluster '$CLUSTER_NAME'"
# Start from a clean slate; a missing cluster is not an error.
"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
"$K3D_BIN" cluster create "$CLUSTER_NAME" --wait --timeout 90s >/dev/null
KUBECONFIG_FILE="$(mktemp -t iot-smoke-kubeconfig.XXXXXX)"
"$K3D_BIN" kubeconfig get "$CLUSTER_NAME" > "$KUBECONFIG_FILE"
export KUBECONFIG="$KUBECONFIG_FILE"
log "generate + apply CRD"
# The operator binary emits its own CRD YAML; pipe it straight into kubectl.
( cd "$OPERATOR_DIR" && cargo run -q -- gen-crd ) | kubectl apply -f - >/dev/null
kubectl wait --for=condition=Established "crd/deployments.iot.nationtech.io" --timeout=30s >/dev/null
if ! kubectl get ns "$DEPLOY_NS" >/dev/null 2>&1; then
  kubectl create namespace "$DEPLOY_NS" >/dev/null
fi
###############################################################################
# phase 2b — CEL discriminator guardrail: an invalid score.type must be rejected
# by the apiserver (tests x-kubernetes-validations on spec.score)
###############################################################################
log "phase 2b: apiserver rejects invalid score.type"
BAD_CR=$(cat <<EOF
apiVersion: iot.nationtech.io/v1alpha1
kind: Deployment
metadata:
  name: bad-discriminator
  namespace: $DEPLOY_NS
spec:
  targetDevices: [$TARGET_DEVICE]
  score:
    type: "has spaces"
    data: {}
  rollout:
    strategy: Immediate
EOF
)
# The apply is *expected* to fail — capture combined output for the assertion.
BAD_OUT="$(printf '%s\n' "$BAD_CR" | kubectl apply -f - 2>&1 || true)"
if printf '%s\n' "$BAD_OUT" | grep -q "must be a valid Rust identifier"; then
  log "apiserver rejected invalid discriminator as expected"
else
  fail "expected CEL rejection for score.type='has spaces'; got: $BAD_OUT"
fi
# Belt-and-braces: make sure nothing was persisted
if kubectl -n "$DEPLOY_NS" get deployment.iot.nationtech.io bad-discriminator >/dev/null 2>&1; then
  kubectl -n "$DEPLOY_NS" delete deployment.iot.nationtech.io bad-discriminator >/dev/null 2>&1 || true
  fail "apiserver should have rejected 'bad-discriminator' but it was persisted"
fi
###############################################################################
# phase 3 — operator
###############################################################################
log "phase 3: start operator"
# Build first so the background process below is the operator itself, not a
# cargo wrapper (keeps $OPERATOR_PID pointing at the right process).
(
  cd "$OPERATOR_DIR"
  cargo build -q
)
NATS_URL="nats://127.0.0.1:$NATS_PORT" \
KV_BUCKET="desired-state" \
RUST_LOG="info,kube_runtime=warn" \
"$REPO_ROOT/target/debug/iot-operator-v0" \
  >"$OPERATOR_LOG" 2>&1 &
OPERATOR_PID=$!
log "operator pid=$OPERATOR_PID (log: $OPERATOR_LOG)"
# Wait until the operator has logged BOTH readiness lines. The previous loop
# waited only for the controller line and then immediately asserted
# "KV bucket ready", which raced the operator's KV setup whenever the bucket
# line is written after the controller line.
for _ in $(seq 1 30); do
  if grep -q "starting Deployment controller" "$OPERATOR_LOG" \
    && grep -q "KV bucket ready" "$OPERATOR_LOG"; then
    break
  fi
  if ! kill -0 "$OPERATOR_PID" 2>/dev/null; then fail "operator exited early"; fi
  sleep 0.5
done
grep -q "starting Deployment controller" "$OPERATOR_LOG" \
  || fail "operator never logged 'starting Deployment controller'"
grep -q "KV bucket ready" "$OPERATOR_LOG" \
  || fail "operator never confirmed KV bucket ready"
###############################################################################
# phase 4 — apply Deployment CR
###############################################################################
log "phase 4: apply Deployment CR"
cat <<EOF | kubectl apply -f - >/dev/null
apiVersion: iot.nationtech.io/v1alpha1
kind: Deployment
metadata:
  name: $DEPLOY_NAME
  namespace: $DEPLOY_NS
spec:
  targetDevices: [$TARGET_DEVICE]
  score:
    type: PodmanV0
    data:
      services:
        - name: hello
          image: docker.io/library/nginx:alpine
          ports: ["8080:80"]
  rollout:
    strategy: Immediate
EOF
log "wait for KV key $TARGET_DEVICE.$DEPLOY_NAME"
# Poll up to ~30s for the operator to put the desired-state key.
KV_VALUE=""
for _ in $(seq 1 30); do
  if KV_VALUE="$(natsbox kv get desired-state "$TARGET_DEVICE.$DEPLOY_NAME" --raw 2>/dev/null)"; then
    [[ -n "$KV_VALUE" ]] && break
  fi
  sleep 1
done
[[ -n "$KV_VALUE" ]] || fail "KV key never appeared"
# Sanity-check the serialized payload (compact JSON — no spaces after colons).
echo "$KV_VALUE" | grep -q '"type":"PodmanV0"' \
  || fail "KV value missing \"type\":\"PodmanV0\" discriminator — got: $KV_VALUE"
echo "$KV_VALUE" | grep -q '"image":"docker.io/library/nginx:alpine"' \
  || fail "KV value missing nginx image — got: $KV_VALUE"
log "wait for .status.observedScoreString"
OBSERVED=""
for _ in $(seq 1 30); do
  OBSERVED="$(kubectl -n "$DEPLOY_NS" get deployment.iot.nationtech.io "$DEPLOY_NAME" \
    -o jsonpath='{.status.observedScoreString}' 2>/dev/null || true)"
  [[ -n "$OBSERVED" ]] && break
  sleep 1
done
[[ -n "$OBSERVED" ]] || fail ".status.observedScoreString never set"
# $'…' makes the embedded \n real newlines — fail()'s printf uses a plain
# '%s' format, so a literal "\n" in the argument would be printed verbatim.
[[ "$OBSERVED" == "$KV_VALUE" ]] \
  || fail $'observedScoreString does not match KV value:\n status='"$OBSERVED"$'\n kv     ='"$KV_VALUE"
###############################################################################
# phase 5 — delete CR, expect cleanup via finalizer
###############################################################################
log "phase 5: delete Deployment CR — finalizer should remove KV key"
kubectl -n "$DEPLOY_NS" delete deployment.iot.nationtech.io "$DEPLOY_NAME" --wait=true >/dev/null
log "wait for KV key removal"
# Poll up to ~30s for the finalizer to delete the desired-state key.
attempt=0
while [[ $attempt -lt 30 ]]; do
  if ! natsbox kv get desired-state "$TARGET_DEVICE.$DEPLOY_NAME" --raw >/dev/null 2>&1; then
    log "KV key gone"
    break
  fi
  sleep 1
  attempt=$((attempt + 1))
done
# Final authoritative check — the key must be absent by now.
if natsbox kv get desired-state "$TARGET_DEVICE.$DEPLOY_NAME" --raw >/dev/null 2>&1; then
  fail "KV key still present after CR delete"
fi
log "PASS"