- agent-status bucket -> device-heartbeat bucket
- status.<device> key -> heartbeat.<device>
- drop parity check summary from smoke-a4 (legacy path is gone)
- tidy stale AgentStatus comment in agent main

202 lines · 8.1 KiB · Bash · Executable File
#!/usr/bin/env bash
# End-to-end smoke test for the VM-as-device flow.
#
# [libvirt qemu:///system] ──KvmVmScore──▶ VM (Ubuntu 24.04, cloud-init'd)
#                                            │
#                        ssh+Ansible ◀───────┘
#                             │
#                             ▼
#        IotDeviceSetupScore ──▶ podman + iot-agent on VM
#                                     │
#                                     ▼
#   existing operator ──NATS──────────┘ (agent joins fleet, reconciles CR)
#                                     │
#                                     ▼ [phase 5]
#                      virsh reboot → agent reconnects
#
# Prerequisites on the runner host — all one-time, all generic:
#   1. libvirt + qemu + xorriso + python3 + podman + cargo + kubectl
#      (Arch: pacman -S libvirt qemu-full libisoburn python podman
#       Debian/Ubuntu: apt install libvirt-daemon-system qemu-kvm
#                      xorriso python3 python3-venv podman)
#   2. Be in the `libvirt` group (`sudo usermod -aG libvirt $USER`)
#   3. `sudo virsh net-start default && sudo virsh net-autostart default`
#
# Harmony handles *everything else*: cloud image download, SSH key
# generation, libvirt pool creation, ansible install, agent build.
# First run costs ~2 min to populate caches; subsequent runs hit the
# cache in <1 s.
set -euo pipefail

# Resolve the script's own directory, then the repository root two
# levels up — both absolute, so the script works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Tunables — every knob is an env var with a sensible default.
: "${VM_NAME:=iot-smoke-vm}"
: "${DEVICE_ID:=$VM_NAME}"
: "${GROUP:=group-a}"
: "${LIBVIRT_URI:=qemu:///system}"

# Guest architecture. `x86-64` runs native KVM; `aarch64` runs under
# qemu-system-aarch64 TCG on x86 hosts (3-5× slower but exercises the
# real Pi target). Changes: cloud-image URL, qemu binary, agent build
# target, phase 4 timeout.
: "${ARCH:=x86-64}"

# Host-side NATS container settings (phase 1).
: "${NATS_CONTAINER:=iot-smoke-nats-a3}"
: "${NATS_NET_NAME:=iot-smoke-net-a3}"
: "${NATS_IMAGE:=docker.io/library/nats:2.10-alpine}"
: "${NATS_PORT:=4222}"

# Colored log prefix for progress lines; fail() reports to stderr and
# aborts the run (the EXIT trap then handles cleanup).
log() {
  printf '\033[1;34m[smoke-a3]\033[0m %s\n' "$*"
}
fail() {
  printf '\033[1;31m[smoke-a3 FAIL]\033[0m %s\n' "$*" >&2
  exit 1
}
# Map the requested guest arch onto the example's --arch flag and the
# cargo cross-compile target (empty target = host-native build).
if [[ "$ARCH" == x86-64 || "$ARCH" == x86_64 ]]; then
  EXAMPLE_ARCH=x86-64
  AGENT_TARGET=
elif [[ "$ARCH" == aarch64 || "$ARCH" == arm64 ]]; then
  EXAMPLE_ARCH=aarch64
  AGENT_TARGET=aarch64-unknown-linux-gnu
else
  fail "unsupported ARCH=$ARCH (expected: x86-64 | aarch64)"
fi
# Teardown for every exit path (normal, failure, Ctrl-C); registered on
# EXIT/INT/TERM below. Captures and re-raises the script's exit code so
# the trap doesn't mask a failure. Set KEEP=1 to leave the VM and NATS
# container running for post-mortem inspection.
cleanup() {
  local rc=$?
  log "cleanup…"
  if [[ "${KEEP:-0}" != "1" ]]; then
    # Best-effort teardown: each step tolerates "already gone".
    virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true
    # `--nvram` is required for aarch64 domains (which have a
    # per-VM NVRAM file); harmless on x86_64 where no NVRAM is
    # registered. Without it, `undefine` refuses and the next
    # run sees a stale domain with whatever XML the previous
    # run defined — masking XML changes until manually cleaned.
    virsh --connect "$LIBVIRT_URI" undefine --nvram \
        --remove-all-storage "$VM_NAME" 2>/dev/null || true
    podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
    podman network rm "$NATS_NET_NAME" >/dev/null 2>&1 || true
  else
    log "KEEP=1 — leaving VM '$VM_NAME' and NATS '$NATS_CONTAINER' running"
  fi
  # Propagate the original exit status captured on entry.
  exit $rc
}
trap cleanup EXIT INT TERM
# Abort early with a clear message when a host-side tool is absent.
require() {
  command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"
}

# Hard requirements for the smoke test itself; phase-specific tools
# (virt-install, ansible, …) are checked/installed later by Harmony.
for tool in podman cargo virsh; do
  require "$tool"
done
# ---------------------------- phase 1: NATS ----------------------------
# One JetStream-enabled NATS server in a podman container, on a named
# network so the nats-box probe containers (phases 4/5) can reach it by
# container name.
log "phase 1: start NATS container on host"
podman network exists "$NATS_NET_NAME" || podman network create "$NATS_NET_NAME" >/dev/null
# Remove any leftover container from a previous run before starting fresh.
podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
podman run -d \
    --name "$NATS_CONTAINER" \
    --network "$NATS_NET_NAME" \
    -p "$NATS_PORT:4222" \
    "$NATS_IMAGE" -js >/dev/null

# The VM reaches the host through the libvirt 'default' NAT gateway;
# scrape its IP out of the network XML. NOTE(review): `grep -P` with
# `\K` is a GNU extension — the runner needs GNU grep here.
NAT_GW="$(virsh --connect "$LIBVIRT_URI" net-dumpxml default \
    | grep -oP "ip address='\K[^']+" | head -1)"
[[ -n "$NAT_GW" ]] || fail "couldn't determine libvirt 'default' gateway IP"
log "libvirt network gateway = $NAT_GW (VM will dial NATS at nats://$NAT_GW:$NATS_PORT)"
# ---------------------------- phase 2: build ---------------------------
log "phase 2: build iot-agent-v0 for guest arch=$ARCH (release — debug binary fills cloud rootfs)"
# Build in a subshell so the `cd` doesn't leak into the rest of the script.
(
  cd "$REPO_ROOT"
  if [[ -n "$AGENT_TARGET" ]]; then
    # Cross build: ensure the rustup target is installed first
    # (idempotent, quiet on repeat runs).
    rustup target add "$AGENT_TARGET" >/dev/null
    cargo build -q --release --target "$AGENT_TARGET" -p iot-agent-v0
  else
    cargo build -q --release -p iot-agent-v0
  fi
)
# Cargo puts cross builds under target/<triple>/release, native builds
# under target/release — pick the matching path.
if [[ -n "$AGENT_TARGET" ]]; then
  AGENT_BINARY="$REPO_ROOT/target/$AGENT_TARGET/release/iot-agent-v0"
else
  AGENT_BINARY="$REPO_ROOT/target/release/iot-agent-v0"
fi
[[ -f "$AGENT_BINARY" ]] || fail "agent binary missing after build: $AGENT_BINARY"
# ---------------------------- phase 3: bootstrap + provision + setup ----------------------------
log "phase 3: bootstrap assets + provision VM + onboard device (arch=$EXAMPLE_ARCH)"
# The example binary drives the whole flow (cloud image, SSH keys,
# libvirt domain, cloud-init, Ansible, agent install). Run from the
# repo root in a subshell so the `cd` doesn't leak.
(
  cd "$REPO_ROOT"
  cargo run -q --release -p example_iot_vm_setup -- \
    --arch "$EXAMPLE_ARCH" \
    --vm-name "$VM_NAME" \
    --device-id "$DEVICE_ID" \
    --group "$GROUP" \
    --agent-binary "$AGENT_BINARY" \
    --nats-url "nats://$NAT_GW:$NATS_PORT"
)
# ---------------------------- phase 4: initial status ----------------------------
# TCG emulation slows agent boot + first NATS publish significantly.
# 60s is fine for native KVM but too tight for aarch64-on-x86.
case "$ARCH" in
  aarch64|arm64) STATUS_TIMEOUT=300 ;;
  *) STATUS_TIMEOUT=60 ;;
esac
log "phase 4: wait for agent to report heartbeat to NATS (timeout=${STATUS_TIMEOUT}s)"
# Poll the device-heartbeat KV bucket until the agent's heartbeat key is
# readable, or `timeout` seconds of wall clock elapse.
#
# $1 — timeout in seconds.
# Returns 0 once `heartbeat.$DEVICE_ID` exists, 1 on timeout.
#
# NOTE: the previous version looped `seq 1 $timeout` times, but each
# probe spawns a `podman run` that itself takes seconds — so a "60s"
# timeout could silently stretch to several minutes. Bound the wait with
# bash's SECONDS counter instead, so the timeout means what the phase-4
# log line claims.
wait_for_status() {
  local timeout=$1
  local deadline=$(( SECONDS + timeout ))
  while (( SECONDS < deadline )); do
    # nats-box probe; all output suppressed — only the exit status matters.
    if podman run --rm --network "$NATS_NET_NAME" \
        docker.io/natsio/nats-box:latest \
        nats --server "nats://$NATS_CONTAINER:4222" kv get device-heartbeat \
        "heartbeat.$DEVICE_ID" --raw >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
# Gate the rest of the test on the first heartbeat arriving.
wait_for_status "$STATUS_TIMEOUT" || fail "device-heartbeat never appeared for $DEVICE_ID"
log "agent heartbeat present on NATS"
# ---------------------------- phase 5: hard power-cycle, expect recovery ----------------------------
log "phase 5: power-cycle VM (virsh destroy + start) → agent must reconnect to NATS"
# Prints the "at" field of the heartbeat.<device> entry, or "".
# Never errors (for `set -e` safety): the trailing `|| true` covers the
# whole pipeline, so a missing key (grep exiting 1 under `pipefail`)
# still yields empty output and status 0.
#
# NOTE(review): this scrapes JSON with grep/cut; assumes the heartbeat
# payload carries a flat `"at":"<timestamp>"` field with no escaped
# quotes — confirm against the agent's payload schema.
nats_status_timestamp() {
  podman run --rm --network "$NATS_NET_NAME" \
    docker.io/natsio/nats-box:latest \
    nats --server "nats://$NATS_CONTAINER:4222" kv get device-heartbeat \
    "heartbeat.$DEVICE_ID" --raw 2>/dev/null \
    | grep -oE '"at":"[^"]+"' \
    | head -1 | cut -d'"' -f4 || true
}
# Hard power-off — no guest shutdown, simulating abrupt power loss.
virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" >/dev/null
# `virsh destroy` returns before the qemu process is fully torn down;
# wait a couple seconds to be sure the agent is dead and can't flush a
# final status update after our gate.
sleep 3
# Any heartbeat timestamp strictly newer than this instant must come
# from the rebooted VM. NOTE(review): the gate uses an explicit `+00:00`
# offset — assumes the agent emits the same RFC 3339 shape (not a
# `Z`-suffixed one); confirm, since the `>` comparison below is purely
# lexicographic.
REBOOT_GATE="$(date -u +%Y-%m-%dT%H:%M:%S+00:00)"
log "reboot gate = $REBOOT_GATE (any agent timestamp > this is post-reboot)"
virsh --connect "$LIBVIRT_URI" start "$VM_NAME" >/dev/null

# Each poll sleeps 2s (plus nats-box startup overhead), so REBOOT_STEPS
# bounds the wait at roughly 2×REBOOT_STEPS seconds.
case "$ARCH" in
  aarch64|arm64) REBOOT_STEPS=900 ;; # ~30 min under TCG
  *) REBOOT_STEPS=120 ;; # ~4 min on native KVM
esac
log "waiting for agent to re-report status (post-reboot, up to $((REBOOT_STEPS*2))s)…"
TS_AFTER=""
for _ in $(seq 1 "$REBOOT_STEPS"); do
  sleep 2
  ts="$(nats_status_timestamp)"
  # Identically-formatted timestamps compare correctly as plain strings:
  # the gate above is built to match the agent's RFC 3339 UTC layout, so
  # lexicographic `>` orders them chronologically.
  if [[ -n "$ts" && "$ts" > "$REBOOT_GATE" ]]; then
    TS_AFTER="$ts"
    break
  fi
done
if [[ -z "$TS_AFTER" ]]; then
  fail "agent did not write a post-reboot status within ~$((REBOOT_STEPS*2))s (gate: $REBOOT_GATE)"
fi
log "post-reboot status seen at $TS_AFTER"

log "PASS — VM $VM_NAME power-cycled and re-onboarded (group=$GROUP)"