Files
harmony/iot/scripts/smoke-a3.sh
Jean-Gabriel Gill-Couture 9e42c15901 refactor(iot/smoke): update smoke scripts for new KV wire layout
- agent-status bucket -> device-heartbeat bucket
- status.<device> key -> heartbeat.<device>
- drop parity check summary from smoke-a4 (legacy path is gone)
- tidy stale AgentStatus comment in agent main
2026-04-22 21:10:55 -04:00

202 lines
8.1 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# End-to-end smoke test for the VM-as-device flow.
#
# [libvirt qemu:///system] ──KvmVmScore──▶ VM (Ubuntu 24.04, cloud-init'd)
# │
# ssh+Ansible ◀────┘
# │
# ▼
# IotDeviceSetupScore ──▶ podman + iot-agent on VM
# │
# ▼
# existing operator ──NATS────────┘ (agent joins fleet, reconciles CR)
# │
# ▼ [phase 5]
# virsh reboot → agent reconnects
#
# Prerequisites on the runner host — all one-time, all generic:
# 1. libvirt + qemu + xorriso + python3 + podman + cargo + kubectl
# (Arch: pacman -S libvirt qemu-full libisoburn python podman
# Debian/Ubuntu: apt install libvirt-daemon-system qemu-kvm
# xorriso python3 python3-venv podman)
# 2. Be in the `libvirt` group (`sudo usermod -aG libvirt $USER`)
# 3. `sudo virsh net-start default && sudo virsh net-autostart default`
#
# Harmony handles *everything else*: cloud image download, SSH key
# generation, libvirt pool creation, ansible install, agent build.
# First run costs ~2 min to populate caches; subsequent runs hit the
# cache in <1 s.
set -euo pipefail

# Resolve the repo root relative to this script so it runs from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Every knob is env-overridable; defaults keep this run in its own
# "a3" namespace so parallel smoke scripts don't collide.
VM_NAME="${VM_NAME:-iot-smoke-vm}"
DEVICE_ID="${DEVICE_ID:-$VM_NAME}"
GROUP="${GROUP:-group-a}"
LIBVIRT_URI="${LIBVIRT_URI:-qemu:///system}"

# Guest architecture. `x86-64` runs native KVM; `aarch64` runs under
# qemu-system-aarch64 TCG on x86 hosts (3-5× slower but exercises the
# real Pi target). Changes: cloud-image URL, qemu binary, agent build
# target, phase 4 timeout.
ARCH="${ARCH:-x86-64}"

NATS_CONTAINER="${NATS_CONTAINER:-iot-smoke-nats-a3}"
NATS_NET_NAME="${NATS_NET_NAME:-iot-smoke-net-a3}"
NATS_IMAGE="${NATS_IMAGE:-docker.io/library/nats:2.10-alpine}"
NATS_PORT="${NATS_PORT:-4222}"

# Coloured log helpers; fail() aborts the whole script.
log() {
  printf '\033[1;34m[smoke-a3]\033[0m %s\n' "$*"
}
fail() {
  printf '\033[1;31m[smoke-a3 FAIL]\033[0m %s\n' "$*" >&2
  exit 1
}

# Map the user-facing ARCH spelling onto the example's --arch flag and
# the cargo --target triple (empty triple = host-native build).
if [[ "$ARCH" == "x86-64" || "$ARCH" == "x86_64" ]]; then
  EXAMPLE_ARCH=x86-64
  AGENT_TARGET=
elif [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then
  EXAMPLE_ARCH=aarch64
  AGENT_TARGET=aarch64-unknown-linux-gnu
else
  fail "unsupported ARCH=$ARCH (expected: x86-64 | aarch64)"
fi
# Tear down everything this run created. Registered for EXIT as well as
# INT/TERM; clearing the traps on entry prevents a double run on Ctrl-C
# (the INT trap would call `exit`, which would otherwise re-fire the
# EXIT trap and run cleanup a second time).
cleanup() {
  local rc=$?
  trap - EXIT INT TERM
  log "cleanup…"
  if [[ "${KEEP:-0}" != "1" ]]; then
    virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true
    # `--nvram` is required for aarch64 domains (which have a
    # per-VM NVRAM file); harmless on x86_64 where no NVRAM is
    # registered. Without it, `undefine` refuses and the next
    # run sees a stale domain with whatever XML the previous
    # run defined — masking XML changes until manually cleaned.
    virsh --connect "$LIBVIRT_URI" undefine --nvram \
      --remove-all-storage "$VM_NAME" 2>/dev/null || true
    podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
    podman network rm "$NATS_NET_NAME" >/dev/null 2>&1 || true
  else
    log "KEEP=1 — leaving VM '$VM_NAME' and NATS '$NATS_CONTAINER' running"
  fi
  exit "$rc"
}
trap cleanup EXIT INT TERM
# Abort with a clear message when a host prerequisite is missing.
require() {
  command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"
}
for tool in podman cargo virsh; do
  require "$tool"
done
# ---------------------------- phase 1: NATS ----------------------------
log "phase 1: start NATS container on host"
# Idempotent setup: reuse the network from a previous run if it exists,
# but always replace the NATS container so its state is fresh.
podman network exists "$NATS_NET_NAME" || podman network create "$NATS_NET_NAME" >/dev/null
podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
# `-js` enables JetStream, which backs the device-heartbeat KV bucket the
# agent writes to in phases 4/5.
podman run -d \
  --name "$NATS_CONTAINER" \
  --network "$NATS_NET_NAME" \
  -p "$NATS_PORT:4222" \
  "$NATS_IMAGE" -js >/dev/null
# The VM reaches the host via the libvirt NAT gateway, so pull the gateway
# IP out of the `default` network XML. `grep -oP` with `\K` is GNU-grep
# specific (fine on the Linux runner hosts this targets).
NAT_GW="$(virsh --connect "$LIBVIRT_URI" net-dumpxml default \
  | grep -oP "ip address='\K[^']+" | head -1)"
[[ -n "$NAT_GW" ]] || fail "couldn't determine libvirt 'default' gateway IP"
log "libvirt network gateway = $NAT_GW (VM will dial NATS at nats://$NAT_GW:$NATS_PORT)"
# ---------------------------- phase 2: build ---------------------------
log "phase 2: build iot-agent-v0 for guest arch=$ARCH (release — debug binary fills cloud rootfs)"

# Expected artifact location: cross builds land in target/<triple>/release,
# native builds in target/release. `${AGENT_TARGET:+…}` inserts the triple
# path segment only when a cross target is set.
AGENT_BINARY="$REPO_ROOT/target/${AGENT_TARGET:+$AGENT_TARGET/}release/iot-agent-v0"

(
  cd "$REPO_ROOT"
  if [[ -z "$AGENT_TARGET" ]]; then
    cargo build -q --release -p iot-agent-v0
  else
    rustup target add "$AGENT_TARGET" >/dev/null
    cargo build -q --release --target "$AGENT_TARGET" -p iot-agent-v0
  fi
)

[[ -f "$AGENT_BINARY" ]] || fail "agent binary missing after build: $AGENT_BINARY"
# ---------------------------- phase 3: bootstrap + provision + setup ----------------------------
log "phase 3: bootstrap assets + provision VM + onboard device (arch=$EXAMPLE_ARCH)"
(
  cd "$REPO_ROOT"
  # One harmony example drives the whole flow: asset bootstrap, VM
  # provisioning, and device onboarding against the NATS from phase 1.
  setup_args=(
    --arch "$EXAMPLE_ARCH"
    --vm-name "$VM_NAME"
    --device-id "$DEVICE_ID"
    --group "$GROUP"
    --agent-binary "$AGENT_BINARY"
    --nats-url "nats://$NAT_GW:$NATS_PORT"
  )
  cargo run -q --release -p example_iot_vm_setup -- "${setup_args[@]}"
)
# ---------------------------- phase 4: initial status ----------------------------
# TCG emulation slows agent boot + first NATS publish significantly.
# 60s is fine for native KVM but too tight for aarch64-on-x86.
if [[ "$ARCH" == "aarch64" || "$ARCH" == "arm64" ]]; then
  STATUS_TIMEOUT=300
else
  STATUS_TIMEOUT=60
fi
log "phase 4: wait for agent to report heartbeat to NATS (timeout=${STATUS_TIMEOUT}s)"
# Poll the device-heartbeat KV bucket until heartbeat.<device> exists.
# Arguments: $1 - wall-clock timeout in seconds.
# Returns:   0 once the key is readable, 1 on timeout.
wait_for_status() {
  local timeout=$1
  # Deadline on bash's $SECONDS rather than a fixed iteration count:
  # every probe spawns a nats-box container, which can itself take
  # several seconds, so "N iterations" could silently stretch the
  # advertised timeout to a multiple of N seconds.
  local deadline=$(( SECONDS + timeout ))
  while (( SECONDS < deadline )); do
    if podman run --rm --network "$NATS_NET_NAME" \
        docker.io/natsio/nats-box:latest \
        nats --server "nats://$NATS_CONTAINER:4222" kv get device-heartbeat \
        "heartbeat.$DEVICE_ID" --raw >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
# Hard gate: phase 5 is meaningless if the agent never came up at all.
if ! wait_for_status "$STATUS_TIMEOUT"; then
  fail "device-heartbeat never appeared for $DEVICE_ID"
fi
log "agent heartbeat present on NATS"
# ---------------------------- phase 5: hard power-cycle, expect recovery ----------------------------
log "phase 5: power-cycle VM (virsh destroy + start) → agent must reconnect to NATS"

# Print the "at" field of the heartbeat.<device> KV entry, or nothing if
# the key (or NATS) is unreachable. Never returns non-zero, so callers
# under `set -e` can use it unguarded.
nats_status_timestamp() {
  local raw
  raw="$(podman run --rm --network "$NATS_NET_NAME" \
    docker.io/natsio/nats-box:latest \
    nats --server "nats://$NATS_CONTAINER:4222" kv get device-heartbeat \
    "heartbeat.$DEVICE_ID" --raw 2>/dev/null)" || raw=""
  grep -oE '"at":"[^"]+"' <<<"$raw" | head -1 | cut -d'"' -f4 || true
}
virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" >/dev/null
# `virsh destroy` returns before the qemu process is fully torn down;
# wait a couple seconds to be sure the agent is dead and can't flush a
# final status update after our gate.
sleep 3
# Gate timestamp in explicit RFC 3339 UTC form with a "+00:00" offset
# and second precision — the exact shape the string comparison below
# depends on.
REBOOT_GATE="$(date -u +%Y-%m-%dT%H:%M:%S+00:00)"
log "reboot gate = $REBOOT_GATE (any agent timestamp > this is post-reboot)"
virsh --connect "$LIBVIRT_URI" start "$VM_NAME" >/dev/null
# Poll budget: each step below sleeps 2s, so steps × 2 ≈ total seconds.
case "$ARCH" in
  aarch64|arm64) REBOOT_STEPS=900 ;; # ~30 min under TCG
  *) REBOOT_STEPS=120 ;; # ~4 min on native KVM
esac
log "waiting for agent to re-report status (post-reboot, up to $((REBOOT_STEPS*2))s)…"
TS_AFTER=""
for _ in $(seq 1 "$REBOOT_STEPS"); do
  sleep 2
  ts="$(nats_status_timestamp)"
  # ISO-8601 timestamps compare correctly lexicographically only when
  # both sides use the identical format. NOTE(review): this assumes the
  # agent emits the same `YYYY-MM-DDTHH:MM:SS+00:00` shape as the gate
  # above — if the agent ever switches to a `Z` suffix or fractional
  # seconds, the `>` comparison silently weakens at second granularity;
  # confirm against the agent's timestamp serializer.
  if [[ -n "$ts" && "$ts" > "$REBOOT_GATE" ]]; then
    TS_AFTER="$ts"
    break
  fi
done
if [[ -z "$TS_AFTER" ]]; then
  fail "agent did not write a post-reboot status within ~$((REBOOT_STEPS*2))s (gate: $REBOOT_GATE)"
fi
log "post-reboot status seen at $TS_AFTER"
log "PASS — VM $VM_NAME power-cycled and re-onboarded (group=$GROUP)"