harmony/iot/scripts/smoke-a3.sh
Jean-Gabriel Gill-Couture 1bde5691fb feat(kvm/aarch64): TCG perf overrides + entropy + cleanup fixes
Three fixes landed during ARM smoke debugging. Each is a real
correctness/perf issue that would bite anyone running aarch64
under TCG via libvirt, independent of any particular firmware.

**xml.rs — qemu:commandline overrides for -cpu and -accel**

`pauth-impdef=on` is a QEMU property of `-cpu max`, not a libvirt
`<feature>` entry. Declaring it as `<cpu><feature policy='require'
name='pauth-impdef'/>` gets rejected by libvirt with:

    error: unsupported configuration: unknown CPU feature: pauth-impdef

Route it instead via `<qemu:commandline>` (with the qemu namespace
declared on `<domain>`). QEMU takes the LAST `-cpu` arg as
authoritative, so libvirt's `-cpu max` followed by our
`-cpu max,pauth-impdef=on` yields max + pauth-impdef.
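
A minimal sketch of the wiring (the namespace URI is libvirt's stock
qemu schema; everything else in the domain is elided):

    <domain type='qemu' xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
      ...
      <qemu:commandline>
        <qemu:arg value='-cpu'/>
        <qemu:arg value='max,pauth-impdef=on'/>
      </qemu:commandline>
    </domain>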

Same mechanism forces MTTCG: despite docs claiming QEMU ≥ 9.1
defaults to `thread=multi` on aarch64, observation on QEMU 10.2
shows cross-arch `-accel tcg` runs single-threaded (`vcpu.1.time`
stays at 0 forever). Appending `-accel tcg,thread=multi` creates
a real per-vcpu thread and roughly halves cold-boot wall time.
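
One way to spot-check that MTTCG took effect, using the same vcpu
stats the observation above came from (domain name is this script's
default):

    # single-threaded TCG: vcpu.1.time stays pinned at 0 while vcpu.0 climbs
    virsh --connect qemu:///system domstats iot-smoke-vm --vcpu | grep '\.time='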

Also added a `<rng model='virtio'>` device feeding host `/dev/urandom`.
Without it, aarch64 cloud-init blocks for minutes on first-boot SSH
host-key generation under TCG (the entropy pool never fills on its
own). Cheap insurance on x86_64 too.
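
The device itself is the stock libvirt virtio-rng stanza:

    <rng model='virtio'>
      <backend model='random'>/dev/urandom</backend>
    </rng>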

**topology.rs — 30-min wait_for_ip budget for aarch64**

Cold boot under TCG on an 8-core x86 host is 10-15 min even with
virtio-rng + pauth-impdef + MTTCG. The previous 900s ceiling
trips healthy boots; 1800s covers slower CI workers.

**smoke-a3.sh — cleanup must pass --nvram**

`virsh undefine --remove-all-storage` refuses to remove an aarch64
domain without `--nvram`, because NVRAM files aren't considered
"storage." Before this, a failed run left the domain definition
behind with yesterday's XML — subsequent runs would replay the
stale XML (ensure_vm is idempotent and doesn't redefine when the
domain already exists), masking any XML change until a manual
`virsh undefine` was issued. Also bump REBOOT_STEPS to match the
new topology-side budget.
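
For reference, the manual unblocking sequence (shown with this
script's default connection URI and VM name):

    virsh --connect qemu:///system destroy iot-smoke-vm || true
    virsh --connect qemu:///system undefine --nvram --remove-all-storage iot-smoke-vm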

Verified: `cargo test -p harmony --lib kvm::xml` passes (26/26),
including the 5 aarch64 assertions (namespace, cpu block, pflash
wiring, qemu:commandline contents for both -cpu and -accel).
2026-04-21 12:43:11 -04:00


#!/usr/bin/env bash
# End-to-end smoke test for the VM-as-device flow.
#
# [libvirt qemu:///system] ──KvmVmScore──▶ VM (Ubuntu 24.04, cloud-init'd)
#                                              │
#                      ssh+Ansible ◀───────────┘
#                           │
#                           ▼
#                 IotDeviceSetupScore ──▶ podman + iot-agent on VM
#                                              │
#                                              ▼
#                 existing operator ◀──NATS────┘ (agent joins fleet, reconciles CR)
#                                              │
#                                              ▼ [phase 5]
#                          virsh reboot → agent reconnects
# Prerequisites on the runner host — all one-time, all generic:
#   1. libvirt + qemu + xorriso + python3 + podman + cargo + kubectl
#      (Arch:          pacman -S libvirt qemu-full libisoburn python podman
#       Debian/Ubuntu: apt install libvirt-daemon-system qemu-kvm
#                      xorriso python3 python3-venv podman)
#   2. Be in the `libvirt` group (`sudo usermod -aG libvirt $USER`)
#   3. `sudo virsh net-start default && sudo virsh net-autostart default`
#
# Harmony handles *everything else*: cloud image download, SSH key
# generation, libvirt pool creation, ansible install, agent build.
# First run costs ~2 min to populate caches; subsequent runs hit the
# cache in <1 s.
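#
# Typical invocations (env vars as defined below; KEEP=1 skips the
# cleanup trap):
#   ./smoke-a3.sh                      # native x86-64 under KVM
#   ARCH=aarch64 ./smoke-a3.sh         # aarch64 guest under TCG
#   KEEP=1 ARCH=aarch64 ./smoke-a3.sh  # leave VM + NATS up for debugging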
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
VM_NAME="${VM_NAME:-iot-smoke-vm}"
DEVICE_ID="${DEVICE_ID:-$VM_NAME}"
GROUP="${GROUP:-group-a}"
LIBVIRT_URI="${LIBVIRT_URI:-qemu:///system}"
# Guest architecture. `x86-64` runs native KVM; `aarch64` runs under
# qemu-system-aarch64 TCG on x86 hosts (3-5× slower but exercises the
# real Pi target). Changes: cloud-image URL, qemu binary, agent build
# target, phase 4 timeout.
ARCH="${ARCH:-x86-64}"
NATS_CONTAINER="${NATS_CONTAINER:-iot-smoke-nats-a3}"
NATS_NET_NAME="${NATS_NET_NAME:-iot-smoke-net-a3}"
NATS_IMAGE="${NATS_IMAGE:-docker.io/library/nats:2.10-alpine}"
NATS_PORT="${NATS_PORT:-4222}"
log() { printf '\033[1;34m[smoke-a3]\033[0m %s\n' "$*"; }
fail() { printf '\033[1;31m[smoke-a3 FAIL]\033[0m %s\n' "$*" >&2; exit 1; }
case "$ARCH" in
x86-64|x86_64) EXAMPLE_ARCH=x86-64; AGENT_TARGET= ;;
aarch64|arm64) EXAMPLE_ARCH=aarch64; AGENT_TARGET=aarch64-unknown-linux-gnu ;;
*) fail "unsupported ARCH=$ARCH (expected: x86-64 | aarch64)" ;;
esac
cleanup() {
  local rc=$?
  log "cleanup…"
  if [[ "${KEEP:-0}" != "1" ]]; then
    virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" 2>/dev/null || true
    # `--nvram` is required for aarch64 domains (which have a
    # per-VM NVRAM file); harmless on x86_64 where no NVRAM is
    # registered. Without it, `undefine` refuses and the next
    # run sees a stale domain with whatever XML the previous
    # run defined — masking XML changes until manually cleaned.
    virsh --connect "$LIBVIRT_URI" undefine --nvram \
      --remove-all-storage "$VM_NAME" 2>/dev/null || true
    podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
    podman network rm "$NATS_NET_NAME" >/dev/null 2>&1 || true
  else
    log "KEEP=1 — leaving VM '$VM_NAME' and NATS '$NATS_CONTAINER' running"
  fi
  exit $rc
}
trap cleanup EXIT INT TERM
require() { command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"; }
require podman
require cargo
require virsh
# ---------------------------- phase 1: NATS ----------------------------
log "phase 1: start NATS container on host"
podman network exists "$NATS_NET_NAME" || podman network create "$NATS_NET_NAME" >/dev/null
podman rm -f "$NATS_CONTAINER" >/dev/null 2>&1 || true
podman run -d \
  --name "$NATS_CONTAINER" \
  --network "$NATS_NET_NAME" \
  -p "$NATS_PORT:4222" \
  "$NATS_IMAGE" -js >/dev/null
NAT_GW="$(virsh --connect "$LIBVIRT_URI" net-dumpxml default \
| grep -oP "ip address='\K[^']+" | head -1)"
[[ -n "$NAT_GW" ]] || fail "couldn't determine libvirt 'default' gateway IP"
log "libvirt network gateway = $NAT_GW (VM will dial NATS at nats://$NAT_GW:$NATS_PORT)"
# ---------------------------- phase 2: build ---------------------------
log "phase 2: build iot-agent-v0 for guest arch=$ARCH (release — debug binary fills cloud rootfs)"
(
  cd "$REPO_ROOT"
  if [[ -n "$AGENT_TARGET" ]]; then
    rustup target add "$AGENT_TARGET" >/dev/null
    cargo build -q --release --target "$AGENT_TARGET" -p iot-agent-v0
  else
    cargo build -q --release -p iot-agent-v0
  fi
)
if [[ -n "$AGENT_TARGET" ]]; then
  AGENT_BINARY="$REPO_ROOT/target/$AGENT_TARGET/release/iot-agent-v0"
else
  AGENT_BINARY="$REPO_ROOT/target/release/iot-agent-v0"
fi
[[ -f "$AGENT_BINARY" ]] || fail "agent binary missing after build: $AGENT_BINARY"
# ---------------------------- phase 3: bootstrap + provision + setup ----------------------------
log "phase 3: bootstrap assets + provision VM + onboard device (arch=$EXAMPLE_ARCH)"
(
  cd "$REPO_ROOT"
  cargo run -q --release -p example_iot_vm_setup -- \
    --arch "$EXAMPLE_ARCH" \
    --vm-name "$VM_NAME" \
    --device-id "$DEVICE_ID" \
    --group "$GROUP" \
    --agent-binary "$AGENT_BINARY" \
    --nats-url "nats://$NAT_GW:$NATS_PORT"
)
# ---------------------------- phase 4: initial status ----------------------------
# TCG emulation slows agent boot + first NATS publish significantly.
# 60s is fine for native KVM but too tight for aarch64-on-x86.
case "$ARCH" in
aarch64|arm64) STATUS_TIMEOUT=300 ;;
*) STATUS_TIMEOUT=60 ;;
esac
log "phase 4: wait for agent to report status to NATS (timeout=${STATUS_TIMEOUT}s)"
wait_for_status() {
  local timeout=$1
  for _ in $(seq 1 "$timeout"); do
    if podman run --rm --network "$NATS_NET_NAME" \
         docker.io/natsio/nats-box:latest \
         nats --server "nats://$NATS_CONTAINER:4222" kv get agent-status \
         "status.$DEVICE_ID" --raw >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
wait_for_status "$STATUS_TIMEOUT" || fail "agent-status never appeared for $DEVICE_ID"
log "agent status present on NATS"
# ---------------------------- phase 5: hard power-cycle, expect recovery ----------------------------
log "phase 5: power-cycle VM (virsh destroy + start) → agent must reconnect to NATS"
nats_status_timestamp() {
  # Prints the "timestamp" field of the status.<device> entry, or "".
  # Never errors (for `set -e` safety).
  podman run --rm --network "$NATS_NET_NAME" \
    docker.io/natsio/nats-box:latest \
    nats --server "nats://$NATS_CONTAINER:4222" kv get agent-status \
    "status.$DEVICE_ID" --raw 2>/dev/null \
    | grep -oE '"timestamp":"[^"]+"' \
    | head -1 | cut -d'"' -f4 || true
}
virsh --connect "$LIBVIRT_URI" destroy "$VM_NAME" >/dev/null
# `virsh destroy` returns before the qemu process is fully torn down;
# wait a couple seconds to be sure the agent is dead and can't flush a
# final status update after our gate.
sleep 3
REBOOT_GATE="$(date -u +%Y-%m-%dT%H:%M:%S+00:00)"
log "reboot gate = $REBOOT_GATE (any agent timestamp > this is post-reboot)"
virsh --connect "$LIBVIRT_URI" start "$VM_NAME" >/dev/null
case "$ARCH" in
aarch64|arm64) REBOOT_STEPS=900 ;; # ~30 min under TCG
*) REBOOT_STEPS=120 ;; # ~4 min on native KVM
esac
log "waiting for agent to re-report status (post-reboot, up to $((REBOOT_STEPS*2))s)…"
TS_AFTER=""
for _ in $(seq 1 "$REBOOT_STEPS"); do
  sleep 2
  ts="$(nats_status_timestamp)"
  # ISO-8601 timestamps compare correctly lexicographically when the
  # format is identical. Both the agent and the REBOOT_GATE `date -u`
  # format above produce RFC 3339 UTC strings, so bash's lexical `>` works.
  if [[ -n "$ts" && "$ts" > "$REBOOT_GATE" ]]; then
    TS_AFTER="$ts"
    break
  fi
done
if [[ -z "$TS_AFTER" ]]; then
  fail "agent did not write a post-reboot status within ~$((REBOOT_STEPS*2))s (gate: $REBOOT_GATE)"
fi
log "post-reboot status seen at $TS_AFTER"
log "PASS — VM $VM_NAME power-cycled and re-onboarded (group=$GROUP)"