#!/usr/bin/env bash
# End-to-end hands-on demo: operator + in-cluster NATS + ARM VM agent.
#
#   [k3d cluster]
#   ├── NATS (single-node, NodePort 4222)
#   └── CRD: iot.nationtech.io/v1alpha1/Deployment
#         ▲
#         │ kubectl apply / iot_apply_deployment
#         │
#   [host]
#   ├── operator (cargo run) ──▶ NATS KV desired-state
#   └── libvirt VM
#         └── iot-agent ──▶ NATS KV (watch) ──▶ podman container
#
# By default the script brings the whole stack up, applies no
# Deployment CR, prints a "command menu" of user-runnable one-liners,
# and blocks on Ctrl-C. With `--auto`, it also drives an apply +
# upgrade + delete cycle for regression coverage.
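#
# Typical invocations (the script filename below is illustrative; use
# whatever path this file lives at in your checkout):
#
#   ./smoke-a4.sh                       # hand-off mode: bring stack up, print menu, wait for Ctrl-C
#   ./smoke-a4.sh --auto                # regression mode: apply → upgrade → delete, then tear down
#   ARCH=aarch64 ./smoke-a4.sh --auto   # same cycle against a TCG aarch64 VM
#   KEEP=1 ./smoke-a4.sh                # leave the cluster + VM running after Ctrl-C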
#
# Prereqs on the runner host (one-time, generic):
#   1. podman (rootless), cargo, kubectl, virsh, xorriso, python3,
#      libvirt, qemu-system-x86_64/aarch64 + edk2 firmware for the
#      chosen ARCH.
#   2. Be in the `libvirt` group.
#   3. `sudo virsh net-start default` (once per boot unless autostart).
#   4. Rootless podman user socket running:
#      `systemctl --user start podman.socket`.
#   5. k3d binary at $K3D_BIN (defaults to Harmony's downloaded copy).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
OPERATOR_DIR="$REPO_ROOT/iot/iot-operator-v0"

# ---- config -----------------------------------------------------------------

K3D_BIN="${K3D_BIN:-$HOME/.local/share/harmony/k3d/k3d}"
CLUSTER_NAME="${CLUSTER_NAME:-iot-demo}"

ARCH="${ARCH:-x86-64}"
VM_NAME="${VM_NAME:-iot-demo-vm}"
DEVICE_ID="${DEVICE_ID:-$VM_NAME}"
GROUP="${GROUP:-group-a}"
LIBVIRT_URI="${LIBVIRT_URI:-qemu:///system}"

NATS_NAMESPACE="${NATS_NAMESPACE:-iot-system}"
NATS_NAME="${NATS_NAME:-iot-nats}"
NATS_NODE_PORT="${NATS_NODE_PORT:-4222}"

DEPLOY_NS="${DEPLOY_NS:-iot-demo}"
DEPLOY_NAME="${DEPLOY_NAME:-hello-world}"
DEPLOY_PORT="${DEPLOY_PORT:-8080:80}"

# Source image we sideload into the VM's podman. Defaults to the
# `nginx:alpine` variant (~60 MB) which is almost always cached on
# dev boxes and keeps TCG-aarch64 boot budgets sane. The tarball
# transport + podman IfNotPresent semantics mean the agent never
# hits a public registry for this image.
SRC_IMAGE="${SRC_IMAGE:-docker.io/library/nginx:alpine}"

AUTO=0
[[ "${1:-}" == "--auto" ]] && AUTO=1

OPERATOR_LOG="$(mktemp -t iot-operator.XXXXXX.log)"
OPERATOR_PID=""
KUBECONFIG_FILE=""

# ---- arch demux -------------------------------------------------------------

case "$ARCH" in
  x86-64|x86_64)
    EXAMPLE_ARCH=x86-64
    AGENT_TARGET=
    # Native-KVM x86: podman pull + layer unpack is seconds.
    CONTAINER_WAIT_STEPS=90   # 180 s
    ;;
  aarch64|arm64)
    EXAMPLE_ARCH=aarch64
    AGENT_TARGET=aarch64-unknown-linux-gnu
    # TCG aarch64: the network stack and userns layer unpack run
    # ~3-5× slower than native. A cold `nginx:latest` pull (~250 MB)
    # has taken 4-8 min here. Give it 15.
    CONTAINER_WAIT_STEPS=450  # 900 s
    ;;
  *) printf '[smoke-a4 FAIL] unsupported ARCH=%s (expected: x86-64 | aarch64)\n' "$ARCH" >&2; exit 1 ;;
esac

log() { printf '\033[1;34m[smoke-a4]\033[0m %s\n' "$*"; }
fail() { printf '\033[1;31m[smoke-a4 FAIL]\033[0m %s\n' "$*" >&2; exit 1; }

cleanup() {
  local rc=$?
  log "cleanup…"
  if [[ -n "$OPERATOR_PID" ]] && kill -0 "$OPERATOR_PID" 2>/dev/null; then
    kill "$OPERATOR_PID" 2>/dev/null || true
    wait "$OPERATOR_PID" 2>/dev/null || true
  fi
  if [[ "${KEEP:-0}" != "1" ]]; then
    virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true
    virsh --connect "$LIBVIRT_URI" undefine --nvram \
      --remove-all-storage "$VM_NAME" 2>/dev/null || true
    "$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
    [[ -n "$KUBECONFIG_FILE" ]] && rm -f "$KUBECONFIG_FILE"
  else
    log "KEEP=1 — leaving cluster '$CLUSTER_NAME' and VM '$VM_NAME' running"
    [[ -n "$KUBECONFIG_FILE" ]] && log "KUBECONFIG=$KUBECONFIG_FILE"
  fi
  if [[ $rc -ne 0 && -s "$OPERATOR_LOG" ]]; then
    log "operator log at $OPERATOR_LOG"
    echo "----- operator log tail -----"
    tail -n 40 "$OPERATOR_LOG" 2>/dev/null || true
  else
    rm -f "$OPERATOR_LOG"
  fi
  exit $rc
}
trap cleanup EXIT INT TERM

require() { command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"; }
require cargo
require kubectl
require virsh
require podman
require docker   # cross-runtime image transfer for k3d sideload
[[ -x "$K3D_BIN" ]] || fail "k3d binary not executable at $K3D_BIN (set K3D_BIN=…)"

# ---- phase 1: k3d cluster with NATS port exposed ----------------------------

log "phase 1: create k3d cluster '$CLUSTER_NAME' (host port $NATS_NODE_PORT → loadbalancer)"
"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
"$K3D_BIN" cluster create "$CLUSTER_NAME" \
  --wait --timeout 90s \
  -p "${NATS_NODE_PORT}:${NATS_NODE_PORT}@loadbalancer" \
  >/dev/null
KUBECONFIG_FILE="$(mktemp -t iot-demo-kubeconfig.XXXXXX)"
"$K3D_BIN" kubeconfig get "$CLUSTER_NAME" > "$KUBECONFIG_FILE"
export KUBECONFIG="$KUBECONFIG_FILE"

# ---- phase 2: NATS in-cluster via NatsBasicScore ----------------------------

NATS_IMAGE="${NATS_IMAGE:-docker.io/library/nats:2.10-alpine}"

# Sideload the NATS image into k3d so the install doesn't race the
# Docker Hub rate limiter. `docker inspect` + `podman save` + `docker
# load` is the cross-runtime bridge on hosts that have both (rootful
# docker for k3d, rootless podman for IoT smokes). Cheap when the
# image is already in podman's store; a one-time Hub pull when not.
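# (A pipe, `podman save "$NATS_IMAGE" | docker load`, should achieve the
#  same bridge without the temp tarball; the tarball variant below is
#  what this script uses.)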
log "phase 2a: sideload NATS image ($NATS_IMAGE) into k3d cluster"
if ! docker image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
  if ! podman image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
    log "NATS image not cached locally — pulling from Docker Hub"
    podman pull "$NATS_IMAGE" >/dev/null || fail "podman pull $NATS_IMAGE failed"
  fi
  tmptar="$(mktemp -t nats-image.XXXXXX.tar)"
  podman save "$NATS_IMAGE" -o "$tmptar" >/dev/null
  docker load -i "$tmptar" >/dev/null
  rm -f "$tmptar"
fi
"$K3D_BIN" image import "$NATS_IMAGE" -c "$CLUSTER_NAME" >/dev/null

log "phase 2b: install NATS in-cluster via NatsBasicScore (namespace=$NATS_NAMESPACE, expose=load-balancer)"
(
  cd "$REPO_ROOT"
  cargo run -q --release -p example_iot_nats_install -- \
    --namespace "$NATS_NAMESPACE" \
    --name "$NATS_NAME" \
    --expose load-balancer
)
log "waiting for NATS Deployment to be Available"
kubectl -n "$NATS_NAMESPACE" wait --for=condition=Available \
  "deployment/$NATS_NAME" --timeout=120s >/dev/null

# kubectl "Available" reports on pod readiness — k3d's klipper-lb
# takes a further few seconds to wire the host loadbalancer port to
# the Service endpoints. Probe the actual TCP port from the host
# before declaring NATS routable, else the operator's connect will
# race and die with "expected INFO, got nothing."
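# (The probe below uses bash's built-in /dev/tcp/<host>/<port> redirection:
#  `echo >/dev/tcp/127.0.0.1/4222` only succeeds once something is listening
#  on the port, so no nc/netcat dependency is needed.)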
log "probing nats://localhost:$NATS_NODE_PORT end-to-end"
for _ in $(seq 1 60); do
  if (echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null; then
    break
  fi
  sleep 1
done
(echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null \
  || fail "TCP localhost:$NATS_NODE_PORT never came up after Deployment Available"

# ---- phase 3: install Deployment CRD via operator's Score-based install -----

log "phase 3: install Deployment CRD via operator \`install\` subcommand"
(
  cd "$OPERATOR_DIR"
  cargo run -q -- install
)
kubectl wait --for=condition=Established \
  "crd/deployments.iot.nationtech.io" --timeout=30s >/dev/null

kubectl get ns "$DEPLOY_NS" >/dev/null 2>&1 || \
  kubectl create namespace "$DEPLOY_NS" >/dev/null

# ---- phase 4: operator running host-side ------------------------------------

log "phase 4: start operator (host-side) connected to nats://localhost:$NATS_NODE_PORT"
(
  cd "$OPERATOR_DIR"
  cargo build -q --release
)
NATS_URL="nats://localhost:$NATS_NODE_PORT" \
  KV_BUCKET="desired-state" \
  RUST_LOG="info,kube_runtime=warn" \
  "$REPO_ROOT/target/release/iot-operator-v0" \
  >"$OPERATOR_LOG" 2>&1 &
OPERATOR_PID=$!
log "operator pid=$OPERATOR_PID (log: $OPERATOR_LOG)"
for _ in $(seq 1 30); do
  if grep -q "starting Deployment controller" "$OPERATOR_LOG"; then break; fi
  if ! kill -0 "$OPERATOR_PID" 2>/dev/null; then fail "operator exited early"; fi
  sleep 0.5
done
grep -q "starting Deployment controller" "$OPERATOR_LOG" \
  || fail "operator never logged 'starting Deployment controller'"
grep -q "KV bucket ready" "$OPERATOR_LOG" \
  || fail "operator never confirmed KV bucket ready"

# ---- phase 4.5: export the workload image to a tarball ----------------------
# Instead of running a local OCI registry (which needs `registry:2` from
# Docker Hub — rate-limited!), sideload the image straight into the VM's
# podman via `podman save`/`scp`/`podman load`. Paired with harmony's
# `PodmanTopology::ensure_image_present` (IfNotPresent semantics: present
# = skip pull), the agent never touches a public registry for known
# images. This is the same compounding-framework-value move as the k3d
# NATS sideload in phase 2a.

NAT_GW="$(virsh --connect "$LIBVIRT_URI" net-dumpxml default \
  | grep -oP "ip address='\K[^']+" | head -1)"
[[ -n "$NAT_GW" ]] || fail "couldn't determine libvirt 'default' gateway IP"
log "libvirt network gateway = $NAT_GW (VM agent will dial nats://$NAT_GW:$NATS_NODE_PORT)"

log "phase 4.5: export $SRC_IMAGE to a local tarball for VM sideload"
# Arch the VM expects.
case "$ARCH" in
  x86-64|x86_64) EXPECTED_IMAGE_ARCH=amd64 ;;
  aarch64|arm64) EXPECTED_IMAGE_ARCH=arm64 ;;
esac
if ! podman image inspect "$SRC_IMAGE" >/dev/null 2>&1; then
  log "source image $SRC_IMAGE not cached — attempting pull (platform=$EXPECTED_IMAGE_ARCH)"
  podman pull --platform="linux/$EXPECTED_IMAGE_ARCH" "$SRC_IMAGE" >/dev/null || \
    fail "podman pull $SRC_IMAGE failed (Docker Hub rate limit?). \
Pre-pull it when the quota is available (\`podman pull --platform=linux/$EXPECTED_IMAGE_ARCH $SRC_IMAGE\`), then re-run."
fi
# Verify arch matches. A podman cache shared across ARCH= runs can
# end up with a tag pointing at the wrong arch (pulling `nginx:alpine`
# for arm64 repoints the local tag that previously held the amd64
# image, and vice versa). Better to fail loudly here than ship the VM
# an image it can't exec.
IMAGE_ACTUAL_ARCH="$(podman inspect "$SRC_IMAGE" --format '{{.Architecture}}' 2>/dev/null || true)"
if [[ "$IMAGE_ACTUAL_ARCH" != "$EXPECTED_IMAGE_ARCH" ]]; then
  fail "$SRC_IMAGE is arch '$IMAGE_ACTUAL_ARCH' but ARCH=$ARCH needs '$EXPECTED_IMAGE_ARCH'. \
Either pre-pull the right platform (\`podman pull --platform=linux/$EXPECTED_IMAGE_ARCH $SRC_IMAGE\`) \
or point SRC_IMAGE at a locally-tagged variant."
fi

# The smoke upgrade test asserts container id change on image-tag
# change, so we'll expose two distinct local tag names pointing at
# the same bits. Tagging happens on the VM side after `podman load`
# so we stay compatible with older podman versions that don't grok
# the multi-image archive format (`podman save -m`).
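# (With a newer podman on both ends, a multi-image archive would also work
#  and skip the remote tagging, roughly:
#    podman tag "$SRC_IMAGE" localdev/nginx:v1 localdev/nginx:v2
#    podman save -m -o images.tar localdev/nginx:v1 localdev/nginx:v2
#  then a single `podman load -i images.tar` on the VM restores both tags.)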
V1_IMAGE="localdev/nginx:v1"
V2_IMAGE="localdev/nginx:v2"

IMAGE_TARBALL="$(mktemp -t iot-demo-images.XXXXXX.tar)"
podman save -o "$IMAGE_TARBALL" "$SRC_IMAGE" >/dev/null \
  || fail "podman save failed"
log "exported $SRC_IMAGE → $IMAGE_TARBALL ($(du -h "$IMAGE_TARBALL" | cut -f1))"

# ---- phase 5: provision VM + install agent ----------------------------------

log "phase 5: build iot-agent-v0 for arch=$ARCH + provision VM"
(
  cd "$REPO_ROOT"
  if [[ -n "$AGENT_TARGET" ]]; then
    rustup target add "$AGENT_TARGET" >/dev/null
    cargo build -q --release --target "$AGENT_TARGET" -p iot-agent-v0
  else
    cargo build -q --release -p iot-agent-v0
  fi
)
if [[ -n "$AGENT_TARGET" ]]; then
  AGENT_BINARY="$REPO_ROOT/target/$AGENT_TARGET/release/iot-agent-v0"
else
  AGENT_BINARY="$REPO_ROOT/target/release/iot-agent-v0"
fi
[[ -f "$AGENT_BINARY" ]] || fail "agent binary missing: $AGENT_BINARY"

(
  cd "$REPO_ROOT"
  cargo run -q --release -p example_iot_vm_setup -- \
    --arch "$EXAMPLE_ARCH" \
    --vm-name "$VM_NAME" \
    --device-id "$DEVICE_ID" \
    --group "$GROUP" \
    --agent-binary "$AGENT_BINARY" \
    --nats-url "nats://$NAT_GW:$NATS_NODE_PORT"
)

VM_IP="$(virsh --connect "$LIBVIRT_URI" domifaddr "$VM_NAME" \
  | awk '/ipv4/ { print $4 }' | head -1 | cut -d/ -f1)"
[[ -n "$VM_IP" ]] || fail "couldn't resolve VM IP"

# ---- phase 5c: sideload workload images into iot-agent's podman -------------

log "phase 5c: sideload $V1_IMAGE + $V2_IMAGE into iot-agent's podman on VM"
# scp the tarball (ssh as the admin user, the only one with sshd
# access), then `podman load` inside an iot-agent user session.
# Post-load the iot-agent's podman has both tags locally, so
# `ensure_image_present` in harmony's PodmanTopology takes the
# "already present, skip pull" branch — no Docker Hub hit.
scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
  -i "$HOME/.local/share/harmony/iot/ssh/id_ed25519" \
  "$IMAGE_TARBALL" "iot-admin@$VM_IP:/tmp/iot-demo-images.tar" >/dev/null \
  || fail "scp image tarball to VM failed"
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
  -i "$HOME/.local/share/harmony/iot/ssh/id_ed25519" \
  "iot-admin@$VM_IP" -- \
  "sudo chown iot-agent:iot-agent /tmp/iot-demo-images.tar && \
   sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman load -i /tmp/iot-demo-images.tar' && \
   sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman tag $SRC_IMAGE $V1_IMAGE' && \
   sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman tag $SRC_IMAGE $V2_IMAGE' && \
   sudo rm -f /tmp/iot-demo-images.tar" >/dev/null \
  || fail "podman load + tag on VM failed"
rm -f "$IMAGE_TARBALL"
log "sideload complete — iot-agent's podman has $V1_IMAGE + $V2_IMAGE"

# ---- phase 6: sanity --------------------------------------------------------

log "phase 6: sanity — operator + agent + KV"
for _ in $(seq 1 60); do
  if kubectl -n "$NATS_NAMESPACE" get pod -l app="$NATS_NAME" \
      -o jsonpath='{.items[0].status.phase}' 2>/dev/null \
      | grep -q Running; then
    break
  fi
  sleep 1
done

# NATS box one-liner we'll reuse in the hand-off too. Uses the host
# loadbalancer port so no pod-network plumbing needed.
NATSBOX_HOST="podman run --rm docker.io/natsio/nats-box:latest \
  nats --server nats://host.containers.internal:$NATS_NODE_PORT"
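# (Inside a podman container, `host.containers.internal` resolves to the
#  host, so the nats-box container can reach the k3d loadbalancer port
#  published on localhost.)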

log "checking agent heartbeat in NATS KV (device-heartbeat bucket)"
for _ in $(seq 1 30); do
  if $NATSBOX_HOST kv get device-heartbeat "heartbeat.$DEVICE_ID" --raw \
      >/dev/null 2>&1; then
    break
  fi
  sleep 2
done
$NATSBOX_HOST kv get device-heartbeat "heartbeat.$DEVICE_ID" --raw >/dev/null \
  || fail "agent never published heartbeat to NATS"
log "agent heartbeat present: heartbeat.$DEVICE_ID"

# ---- phase 7: either hand off to user, or drive regression ------------------

if [[ "$AUTO" == "1" ]]; then
  log "phase 7 (--auto): apply nginx via typed CR, verify, upgrade, delete"

  log "applying $V1_IMAGE deployment"
  (
    cd "$REPO_ROOT"
    cargo run -q -p example_iot_apply_deployment -- \
      --namespace "$DEPLOY_NS" \
      --name "$DEPLOY_NAME" \
      --target-device "$DEVICE_ID" \
      --image "$V1_IMAGE" \
      --port "$DEPLOY_PORT"
  )

  log "waiting for container on VM (up to $((CONTAINER_WAIT_STEPS * 2))s)"
  CONTAINER_ID_V1=""
  for _ in $(seq 1 "$CONTAINER_WAIT_STEPS"); do
    id="$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      -i "$HOME/.local/share/harmony/iot/ssh/id_ed25519" \
      "iot-admin@$VM_IP" -- \
      "sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman ps -q --filter name=$DEPLOY_NAME'" \
      2>/dev/null | head -1)" || true
    if [[ -n "$id" ]]; then CONTAINER_ID_V1="$id"; break; fi
    sleep 2
  done
  [[ -n "$CONTAINER_ID_V1" ]] || fail "nginx container never appeared on VM"
  log "container id (v1): $CONTAINER_ID_V1"

  log "curl http://$VM_IP:${DEPLOY_PORT%%:*}/"
  for _ in $(seq 1 30); do
    if curl -sf "http://$VM_IP:${DEPLOY_PORT%%:*}/" >/dev/null; then
      log "nginx responded (v1)"; break
    fi
    sleep 2
  done

  log "waiting for operator to aggregate .status.aggregate.succeeded == 1"
  for _ in $(seq 1 30); do
    got="$(kubectl -n "$DEPLOY_NS" get deployment.iot.nationtech.io "$DEPLOY_NAME" \
      -o jsonpath='{.status.aggregate.succeeded}' 2>/dev/null || true)"
    if [[ "$got" == "1" ]]; then
      log ".status.aggregate.succeeded = 1 — aggregator reflected agent state"
      break
    fi
    sleep 2
  done
  got="$(kubectl -n "$DEPLOY_NS" get deployment.iot.nationtech.io "$DEPLOY_NAME" \
    -o jsonpath='{.status.aggregate.succeeded}' 2>/dev/null || true)"
  [[ "$got" == "1" ]] || fail ".status.aggregate.succeeded never reached 1 (got '$got')"

  log "upgrading to $V2_IMAGE"
  (
    cd "$REPO_ROOT"
    cargo run -q -p example_iot_apply_deployment -- \
      --namespace "$DEPLOY_NS" \
      --name "$DEPLOY_NAME" \
      --target-device "$DEVICE_ID" \
      --image "$V2_IMAGE" \
      --port "$DEPLOY_PORT"
  )
  log "waiting for container id to change (upgrade, up to $((CONTAINER_WAIT_STEPS * 2))s)"
  CONTAINER_ID_V2=""
  for _ in $(seq 1 "$CONTAINER_WAIT_STEPS"); do
    id="$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      -i "$HOME/.local/share/harmony/iot/ssh/id_ed25519" \
      "iot-admin@$VM_IP" -- \
      "sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman ps -q --filter name=$DEPLOY_NAME'" \
      2>/dev/null | head -1)" || true
    if [[ -n "$id" && "$id" != "$CONTAINER_ID_V1" ]]; then
      CONTAINER_ID_V2="$id"; break
    fi
    sleep 2
  done
  [[ -n "$CONTAINER_ID_V2" ]] || fail "container id did not change after upgrade"
  log "container id (v2): $CONTAINER_ID_V2 — upgrade confirmed"

  log "deleting deployment"
  (
    cd "$REPO_ROOT"
    cargo run -q -p example_iot_apply_deployment -- \
      --namespace "$DEPLOY_NS" \
      --name "$DEPLOY_NAME" \
      --target-device "$DEVICE_ID" \
      --delete
  )
  for _ in $(seq 1 60); do
    if ! ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
        -i "$HOME/.local/share/harmony/iot/ssh/id_ed25519" \
        "iot-admin@$VM_IP" -- \
        "sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman ps -q --filter name=$DEPLOY_NAME'" 2>/dev/null \
        | grep -q .; then
      log "container removed from VM"
      break
    fi
    sleep 2
  done

  log "PASS (--auto)"
  exit 0
fi

# ---- hand-off mode ----------------------------------------------------------

SSH_KEY="$HOME/.local/share/harmony/iot/ssh/id_ed25519"

cat <<EOF

$(printf '\033[1;32m[smoke-a4]\033[0m full stack is up. Your playground:\n')

  KUBECONFIG=$KUBECONFIG_FILE
  VM IP:      $VM_IP
  device id:  $DEVICE_ID
  libvirt NAT gateway (VM's view of the host): $NAT_GW
  NATS URL (from host):   nats://localhost:$NATS_NODE_PORT
  NATS URL (from the VM): nats://$NAT_GW:$NATS_NODE_PORT

$(printf '\033[1mWatch CRs reconcile:\033[0m\n')
  kubectl get deployments.iot.nationtech.io -A -w

$(printf '\033[1mApply an nginx deployment (typed Rust):\033[0m\n')
  cargo run -q -p example_iot_apply_deployment -- \\
    --namespace $DEPLOY_NS \\
    --name $DEPLOY_NAME \\
    --target-device $DEVICE_ID \\
    --image docker.io/library/nginx:latest

$(printf '\033[1mUpgrade it:\033[0m\n')
  cargo run -q -p example_iot_apply_deployment -- \\
    --namespace $DEPLOY_NS --name $DEPLOY_NAME --target-device $DEVICE_ID \\
    --image docker.io/library/nginx:1.26
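
$(printf '\033[1mDelete it (agent removes the container):\033[0m\n')
  cargo run -q -p example_iot_apply_deployment -- \\
    --namespace $DEPLOY_NS --name $DEPLOY_NAME --target-device $DEVICE_ID \\
    --delete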

$(printf '\033[1mPreview the CR as JSON (and apply via kubectl):\033[0m\n')
  cargo run -q -p example_iot_apply_deployment -- \\
    --name $DEPLOY_NAME --target-device $DEVICE_ID \\
    --image docker.io/library/nginx:latest --print | kubectl apply -f -

$(printf '\033[1mConnect to the device:\033[0m\n')
  ssh -i $SSH_KEY iot-admin@$VM_IP
  virsh --connect $LIBVIRT_URI console $VM_NAME --force   # alternative
  # list containers (agent runs rootless as iot-agent, not iot-admin):
  ssh -i $SSH_KEY iot-admin@$VM_IP "sudo su - iot-agent -c 'XDG_RUNTIME_DIR=/run/user/\$(id -u) podman ps'"

$(printf '\033[1mInspect NATS KV (natsbox):\033[0m\n')
  alias natsbox='podman run --rm docker.io/natsio/nats-box:latest nats --server nats://host.containers.internal:$NATS_NODE_PORT'
  natsbox kv ls desired-state
  natsbox kv get desired-state '$DEVICE_ID.$DEPLOY_NAME' --raw
  natsbox kv ls device-state
  natsbox kv ls device-heartbeat
  natsbox kv get device-heartbeat 'heartbeat.$DEVICE_ID' --raw

$(printf '\033[1mHit the deployed nginx:\033[0m\n')
  curl http://$VM_IP:${DEPLOY_PORT%%:*}/

$(printf '\033[1mOperator log:\033[0m\n')
  tail -F $OPERATOR_LOG

$(printf '\033[1;33mCtrl-C to tear everything down.\033[0m\n')
EOF

# Block until user interrupts; cleanup trap handles teardown.
while true; do sleep 60; done