Files
harmony/iot/scripts/load-test.sh
Jean-Gabriel Gill-Couture 5e8e72df52
Some checks failed
Run Check Script / check (pull_request) Failing after 52s
feat(iot-load-test): stable paths + HOLD=1 interactive mode
- Stable working dir under /tmp/iot-load-test/ — kubeconfig at
  /tmp/iot-load-test/kubeconfig, operator log at
  /tmp/iot-load-test/operator.log. No more chasing mktemp paths.

- Print an explore banner before the load run so the user can
  `export KUBECONFIG=...` and `kubectl get deployments -w` in
  another terminal while the load actually runs.

- HOLD=1 env var keeps the stack alive after the load completes;
  script blocks on sleep until Ctrl-C. Forwards --keep to the
  binary so CRs + KV entries stay in place for inspection.

- DEBUG=1 bumps operator RUST_LOG to surface every status patch.

- Keep operator.log after successful runs (cheap, often useful).

- Load-test binary: --cleanup bool → --keep flag (clap bool with
  default_value_t = true doesn't accept `--cleanup=false`).
2026-04-22 21:59:26 -04:00

253 lines
9.0 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Load-test harness for the IoT operator's fleet_aggregator.
#
# Brings up the minimum stack (k3d + in-cluster NATS + CRD + operator)
# with no VM or real agent, then runs the `iot_load_test` binary
# which simulates N devices pushing DeploymentState to NATS.
#
# All stable paths under $WORK_DIR (default /tmp/iot-load-test) so you
# can point kubectl / tail at them while the test is running.
#
# Quick usage:
# iot/scripts/load-test.sh # 100-device default (55 + 9×5)
# HOLD=1 iot/scripts/load-test.sh # leave stack running for exploration
# DEVICES=10000 GROUP_SIZES=5500,500,500,500,500,500,500,500,500,500 \
# DURATION=90 iot/scripts/load-test.sh
#
# While it's running, in another terminal:
# export KUBECONFIG=/tmp/iot-load-test/kubeconfig
# kubectl get deployments.iot.nationtech.io -A -w
# kubectl get deployments.iot.nationtech.io -A \
# -o custom-columns=NAME:.metadata.name,RUN:.status.aggregate.succeeded,FAIL:.status.aggregate.failed,PEND:.status.aggregate.pending
# tail -f /tmp/iot-load-test/operator.log
#
# Set DEBUG=1 to bump RUST_LOG so the operator logs every status patch.
set -euo pipefail

# Resolve this script's on-disk location, then derive the repo root and the
# operator crate directory from it so the harness works from any CWD.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "$SCRIPT_DIR/../.." && pwd)"
OPERATOR_DIR="$REPO_ROOT/iot/iot-operator-v0"
# ---- config -----------------------------------------------------------------
# Every knob is an env var with a sane default; override per run, e.g.
#   DEVICES=10000 DURATION=90 iot/scripts/load-test.sh
: "${K3D_BIN:=$HOME/.local/share/harmony/k3d/k3d}"
: "${CLUSTER_NAME:=iot-load}"
: "${NATS_NAMESPACE:=iot-system}"
: "${NATS_NAME:=iot-nats}"
: "${NATS_NODE_PORT:=4222}"
: "${NATS_IMAGE:=docker.io/library/nats:2.10-alpine}"
: "${DEVICES:=100}"
: "${GROUP_SIZES:=55,5,5,5,5,5,5,5,5,5}"
: "${TICK_MS:=1000}"
: "${DURATION:=60}"
: "${NAMESPACE:=iot-load}"
# Keep the stack alive after the test completes so the user can poke
# at CRs + NATS interactively. Ctrl-C to tear everything down.
: "${HOLD:=0}"
# Stable working dir so kubectl + tail targets are predictable.
: "${WORK_DIR:=/tmp/iot-load-test}"
mkdir -p "$WORK_DIR"
KUBECONFIG_FILE="$WORK_DIR/kubeconfig"
OPERATOR_LOG="$WORK_DIR/operator.log"
# PID of the backgrounded operator; set in phase 4, reaped by cleanup().
OPERATOR_PID=""
# Progress line on stdout with a bold-blue "[load-test]" tag.
log() {
  local tag=$'\033[1;34m[load-test]\033[0m'
  printf '%s %s\n' "$tag" "$*"
}
# Red failure banner on stderr, then abort the script with status 1.
fail() {
  local tag=$'\033[1;31m[load-test FAIL]\033[0m'
  printf '%s %s\n' "$tag" "$*" >&2
  exit 1
}
# Teardown handler, registered for EXIT/INT/TERM below. Best-effort: every
# step tolerates the resource already being gone.
cleanup() {
# Capture the script's exit status FIRST, before any command here can
# clobber $?; it is re-raised at the end so CI sees the real result.
local rc=$?
log "cleanup…"
# Stop the backgrounded operator if it is still alive (kill -0 = probe).
if [[ -n "$OPERATOR_PID" ]] && kill -0 "$OPERATOR_PID" 2>/dev/null; then
kill "$OPERATOR_PID" 2>/dev/null || true
wait "$OPERATOR_PID" 2>/dev/null || true
fi
# Delete the whole k3d cluster; it may not exist yet on early failures.
"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
if [[ $rc -ne 0 && -s "$OPERATOR_LOG" ]]; then
# On failure, surface the tail of the operator log inline for triage.
log "operator log at $OPERATOR_LOG (kept for inspection)"
echo "----- operator log tail -----"
tail -n 60 "$OPERATOR_LOG" 2>/dev/null || true
else
# Leave the operator log on success too — cheap, often useful.
log "operator log at $OPERATOR_LOG"
fi
exit $rc
}
trap cleanup EXIT INT TERM

# Abort early with a clear message when a host-side dependency is absent.
require() {
  command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"
}
# Both container engines are genuinely needed: podman pulls/saves the NATS
# image, docker is where k3d imports images from.
for tool in cargo kubectl podman docker; do
  require "$tool"
done
[[ -x "$K3D_BIN" ]] || fail "k3d binary not executable at $K3D_BIN"
# ---- phase 1: k3d cluster ---------------------------------------------------
log "phase 1: create k3d cluster '$CLUSTER_NAME' (host port $NATS_NODE_PORT → loadbalancer)"
# Delete any leftover cluster from a previous run so re-runs are idempotent.
"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
# Publish the NATS port through the k3d loadbalancer so host-side processes
# (operator + load-test binary) can reach in-cluster NATS via localhost.
"$K3D_BIN" cluster create "$CLUSTER_NAME" \
--wait --timeout 90s \
-p "${NATS_NODE_PORT}:${NATS_NODE_PORT}@loadbalancer" \
>/dev/null
# Write the kubeconfig to the stable path so other terminals can reuse it.
"$K3D_BIN" kubeconfig get "$CLUSTER_NAME" > "$KUBECONFIG_FILE"
export KUBECONFIG="$KUBECONFIG_FILE"
# ---- phase 2: NATS in-cluster ------------------------------------------------
log "phase 2a: sideload NATS image ($NATS_IMAGE)"
# k3d imports images from the docker daemon, but this host pulls with podman —
# bridge the two: pull via podman if needed, then save/load the tarball into
# docker before `k3d image import`. Skipped entirely if docker already has it.
if ! docker image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
if ! podman image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
podman pull "$NATS_IMAGE" >/dev/null || fail "podman pull $NATS_IMAGE failed"
fi
tmptar="$(mktemp -t nats-image.XXXXXX.tar)"
podman save "$NATS_IMAGE" -o "$tmptar" >/dev/null
docker load -i "$tmptar" >/dev/null
rm -f "$tmptar"
fi
"$K3D_BIN" image import "$NATS_IMAGE" -c "$CLUSTER_NAME" >/dev/null
log "phase 2b: install NATS via NatsBasicScore"
# Subshell keeps the cd scoped; the example binary installs NATS into the
# cluster (presumably by applying the NatsBasicScore — see that crate).
(
cd "$REPO_ROOT"
cargo run -q --release -p example_iot_nats_install -- \
--namespace "$NATS_NAMESPACE" \
--name "$NATS_NAME" \
--expose load-balancer
)
kubectl -n "$NATS_NAMESPACE" wait --for=condition=Available \
"deployment/$NATS_NAME" --timeout=120s >/dev/null
log "probing nats://localhost:$NATS_NODE_PORT end-to-end"
# Bash /dev/tcp probe (bashism): retry up to 60s for the loadbalancer port
# mapping to come up…
for _ in $(seq 1 60); do
(echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null && break
sleep 1
done
# …then probe once more so a timeout fails loudly instead of silently.
(echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null \
|| fail "TCP localhost:$NATS_NODE_PORT never came up"
# ---- phase 3: CRD + operator ------------------------------------------------
log "phase 3: install CRD"
(
cd "$OPERATOR_DIR"
cargo run -q -- install
)
# Don't start the operator until the API server reports the CRD Established,
# otherwise the initial watch could race the CRD registration.
kubectl wait --for=condition=Established \
"crd/deployments.iot.nationtech.io" --timeout=30s >/dev/null
log "phase 4: start operator"
(
cd "$OPERATOR_DIR"
cargo build -q --release
)
# Default log level exposes the CR patch loop + watch attach; DEBUG=1
# bumps it so every status patch + transition is printed.
if [[ "${DEBUG:-0}" == "1" ]]; then
OPERATOR_RUST_LOG="debug,async_nats=warn,hyper=warn,rustls=warn,kube=info"
else
OPERATOR_RUST_LOG="info,kube_runtime=warn"
fi
# Run the operator on the host (no in-cluster deploy), logging to the stable
# $OPERATOR_LOG path so other terminals can tail it while the test runs.
NATS_URL="nats://localhost:$NATS_NODE_PORT" \
KV_BUCKET="desired-state" \
RUST_LOG="$OPERATOR_RUST_LOG" \
"$REPO_ROOT/target/release/iot-operator-v0" \
>"$OPERATOR_LOG" 2>&1 &
OPERATOR_PID=$!
log "operator pid=$OPERATOR_PID"
# Readiness gate: poll (30 × 0.5s ≈ 15s) for the controller-startup log
# line, bailing out immediately if the process already died.
for _ in $(seq 1 30); do
if grep -q "starting Deployment controller" "$OPERATOR_LOG"; then break; fi
if ! kill -0 "$OPERATOR_PID" 2>/dev/null; then fail "operator exited early"; fi
sleep 0.5
done
grep -q "starting Deployment controller" "$OPERATOR_LOG" \
|| fail "operator never logged controller startup"
# ---- explore banner (before the load run so the user can start watching) ----
# Prints copy-pasteable commands for a second terminal. The heredoc delimiter
# is unquoted, so $-variables expand when the banner is printed, and the
# $(printf …) substitutions are just a trick to embed ANSI bold/colored
# headings inside the heredoc. Do NOT indent the heredoc body — it is
# emitted verbatim.
print_banner() {
cat <<EOF
$(printf '\033[1;32m[load-test]\033[0m stack ready. In another terminal:')
$(printf '\033[1mPoint kubectl at the k3d cluster:\033[0m')
export KUBECONFIG=$KUBECONFIG_FILE
$(printf '\033[1mWatch CRs as they update:\033[0m')
kubectl -n $NAMESPACE get deployments.iot.nationtech.io -w
$(printf '\033[1mSnapshot aggregate columns:\033[0m')
kubectl -n $NAMESPACE get deployments.iot.nationtech.io \\
-o custom-columns=NAME:.metadata.name,SUCCEEDED:.status.aggregate.succeeded,FAILED:.status.aggregate.failed,PENDING:.status.aggregate.pending,LAST_ERR:.status.aggregate.lastError.message
$(printf '\033[1mFull CR status JSON for one CR (first group):\033[0m')
kubectl -n $NAMESPACE get deployments.iot.nationtech.io/load-group-00 -o jsonpath='{.status.aggregate}' | jq
$(printf '\033[1mOperator log:\033[0m')
tail -F $OPERATOR_LOG
$(printf '\033[1mPeek at NATS KV directly (natsbox):\033[0m')
alias natsbox='podman run --rm docker.io/natsio/nats-box:latest nats --server nats://host.containers.internal:$NATS_NODE_PORT'
natsbox kv ls device-state
natsbox kv get device-state 'state.load-dev-00001.load-group-00' --raw
natsbox kv ls device-heartbeat
natsbox kv get device-heartbeat 'heartbeat.load-dev-00001' --raw
EOF
}
# Shown before the load run (per the header) so the user can start watching.
print_banner
# ---- phase 5: load test ------------------------------------------------------
# NOTE(review): DEVICES appears only in this log line; the actual device
# count is derived from GROUP_SIZES — keep the two in sync when overriding.
log "phase 5: run iot_load_test (devices=$DEVICES, tick=${TICK_MS}ms, duration=${DURATION}s)"
(
cd "$REPO_ROOT"
cargo build -q --release -p example_iot_load_test
)
# `--keep` (forwarded when HOLD=1) keeps the CRs + KV entries around after
# the run so you can inspect steady-state aggregate numbers after the
# duration elapses.
LOAD_ARGS=(
--nats-url "nats://localhost:$NATS_NODE_PORT"
--namespace "$NAMESPACE"
--groups "$GROUP_SIZES"
--tick-ms "$TICK_MS"
--duration-s "$DURATION"
)
if [[ "$HOLD" == "1" ]]; then
LOAD_ARGS+=(--keep)
fi
RUST_LOG="info" "$REPO_ROOT/target/release/iot_load_test" "${LOAD_ARGS[@]}"
# ---- phase 6: operator log stats --------------------------------------------
log "phase 6: operator log summary"
# Count occurrences of a pattern in the operator log, always printing a
# single integer. NB: `grep -c` prints "0" AND exits 1 when there are no
# matches, so the naive `$(grep -c … || echo 0)` yields the two-line string
# "0\n0" and breaks the numeric [[ -gt ]] comparison below. Tolerate grep's
# exit status instead, and default to 0 only when grep produced no output
# (e.g. the log file is missing).
count_log() {
  local n
  n="$(grep -c -- "$1" "$OPERATOR_LOG" 2>/dev/null)" || true
  printf '%s\n' "${n:-0}"
}
patches="$(count_log "aggregator: status patched")"
warnings="$(count_log " WARN ")"
errors="$(count_log " ERROR ")"
log " CR status patches logged (DEBUG-level; use DEBUG=1 to surface): $patches"
log " operator warnings: $warnings errors: $errors"
if [[ "$errors" -gt 0 ]]; then
  echo "----- operator error lines -----"
  grep " ERROR " "$OPERATOR_LOG" | tail -20
fi
# ---- hold open (optional) ---------------------------------------------------
if [[ "$HOLD" == "1" ]]; then
# Re-print the explore commands — the load-run output likely scrolled the
# first banner away.
print_banner
log "HOLD=1 — stack is still running. Ctrl-C to tear down."
# Block until user interrupts; cleanup trap does the teardown.
while true; do sleep 60; done
fi
log "PASS"