- Stable working dir under /tmp/iot-load-test/ — kubeconfig at /tmp/iot-load-test/kubeconfig, operator log at /tmp/iot-load-test/operator.log. No more chasing mktemp paths.
- Print an explore banner before the load run so the user can `export KUBECONFIG=...` and `kubectl get deployments -w` in another terminal while the load actually runs.
- HOLD=1 env var keeps the stack alive after the load completes; the script blocks on sleep until Ctrl-C. Forwards --keep to the binary so CRs + KV entries stay in place for inspection.
- DEBUG=1 bumps operator RUST_LOG to surface every status patch.
- Keep operator.log after successful runs (cheap, often useful).
- Load-test binary: --cleanup bool → --keep flag (a clap bool with default_value_t = true doesn't accept `--cleanup=false`).
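For example, a hold-and-debug run with the paths and names wired into the script below:

    HOLD=1 DEBUG=1 iot/scripts/load-test.sh
    # then, in another terminal:
    export KUBECONFIG=/tmp/iot-load-test/kubeconfig
    kubectl -n iot-load get deployments.iot.nationtech.io -w
    tail -F /tmp/iot-load-test/operator.log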
#!/usr/bin/env bash
# Load-test harness for the IoT operator's fleet_aggregator.
#
# Brings up the minimum stack (k3d + in-cluster NATS + CRD + operator)
# with no VM or real agent, then runs the `iot_load_test` binary,
# which simulates N devices pushing DeploymentState to NATS.
#
# All stable paths live under $WORK_DIR (default /tmp/iot-load-test) so you
# can point kubectl / tail at them while the test is running.
#
# Quick usage:
#   iot/scripts/load-test.sh           # 100-device default (55 + 9×5)
#   HOLD=1 iot/scripts/load-test.sh    # leave stack running for exploration
#   DEVICES=10000 GROUP_SIZES=5500,500,500,500,500,500,500,500,500,500 \
#     DURATION=90 iot/scripts/load-test.sh
#
# While it's running, in another terminal:
#   export KUBECONFIG=/tmp/iot-load-test/kubeconfig
#   kubectl get deployments.iot.nationtech.io -A -w
#   kubectl get deployments.iot.nationtech.io -A \
#     -o custom-columns=NAME:.metadata.name,RUN:.status.aggregate.succeeded,FAIL:.status.aggregate.failed,PEND:.status.aggregate.pending
#   tail -f /tmp/iot-load-test/operator.log
#
# Set DEBUG=1 to bump RUST_LOG so the operator logs every status patch.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
OPERATOR_DIR="$REPO_ROOT/iot/iot-operator-v0"

# ---- config -----------------------------------------------------------------

K3D_BIN="${K3D_BIN:-$HOME/.local/share/harmony/k3d/k3d}"
CLUSTER_NAME="${CLUSTER_NAME:-iot-load}"
NATS_NAMESPACE="${NATS_NAMESPACE:-iot-system}"
NATS_NAME="${NATS_NAME:-iot-nats}"
NATS_NODE_PORT="${NATS_NODE_PORT:-4222}"
NATS_IMAGE="${NATS_IMAGE:-docker.io/library/nats:2.10-alpine}"

DEVICES="${DEVICES:-100}"
GROUP_SIZES="${GROUP_SIZES:-55,5,5,5,5,5,5,5,5,5}"
TICK_MS="${TICK_MS:-1000}"
DURATION="${DURATION:-60}"
NAMESPACE="${NAMESPACE:-iot-load}"
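# Note: of the knobs above, DEVICES is informational only (it just appears in
# the phase-5 log line); the actual device count is whatever GROUP_SIZES sums
# to, since only --groups is passed to the load binary. Keep them consistent
# when overriding.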

# Keep the stack alive after the test completes so the user can poke
# at CRs + NATS interactively. Ctrl-C to tear everything down.
HOLD="${HOLD:-0}"

# Stable working dir so kubectl + tail targets are predictable.
WORK_DIR="${WORK_DIR:-/tmp/iot-load-test}"
mkdir -p "$WORK_DIR"

KUBECONFIG_FILE="$WORK_DIR/kubeconfig"
OPERATOR_LOG="$WORK_DIR/operator.log"
OPERATOR_PID=""

log() { printf '\033[1;34m[load-test]\033[0m %s\n' "$*"; }
fail() { printf '\033[1;31m[load-test FAIL]\033[0m %s\n' "$*" >&2; exit 1; }

cleanup() {
  local rc=$?
  # Drop the traps so an INT/TERM invocation doesn't re-run cleanup via EXIT.
  trap - EXIT INT TERM
  log "cleanup…"
  if [[ -n "$OPERATOR_PID" ]] && kill -0 "$OPERATOR_PID" 2>/dev/null; then
    kill "$OPERATOR_PID" 2>/dev/null || true
    wait "$OPERATOR_PID" 2>/dev/null || true
  fi
  "$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
  if [[ $rc -ne 0 && -s "$OPERATOR_LOG" ]]; then
    log "operator log at $OPERATOR_LOG (kept for inspection)"
    echo "----- operator log tail -----"
    tail -n 60 "$OPERATOR_LOG" 2>/dev/null || true
  else
    # Leave the operator log on success too — cheap, often useful.
    log "operator log at $OPERATOR_LOG"
  fi
  exit $rc
}
trap cleanup EXIT INT TERM

require() { command -v "$1" >/dev/null 2>&1 || fail "missing required tool: $1"; }
require cargo
require kubectl
require podman
require docker
[[ -x "$K3D_BIN" ]] || fail "k3d binary not executable at $K3D_BIN"
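# Both runtimes really are required: k3d manages the cluster through docker,
# while the NATS image sideload and the natsbox helper below use podman.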

# ---- phase 1: k3d cluster ---------------------------------------------------

log "phase 1: create k3d cluster '$CLUSTER_NAME' (host port $NATS_NODE_PORT → loadbalancer)"
"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
"$K3D_BIN" cluster create "$CLUSTER_NAME" \
  --wait --timeout 90s \
  -p "${NATS_NODE_PORT}:${NATS_NODE_PORT}@loadbalancer" \
  >/dev/null
"$K3D_BIN" kubeconfig get "$CLUSTER_NAME" > "$KUBECONFIG_FILE"
export KUBECONFIG="$KUBECONFIG_FILE"

# ---- phase 2: NATS in-cluster ------------------------------------------------

log "phase 2a: sideload NATS image ($NATS_IMAGE)"
if ! docker image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
  if ! podman image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
    podman pull "$NATS_IMAGE" >/dev/null || fail "podman pull $NATS_IMAGE failed"
  fi
  tmptar="$(mktemp -t nats-image.XXXXXX.tar)"
  podman save "$NATS_IMAGE" -o "$tmptar" >/dev/null
  docker load -i "$tmptar" >/dev/null
  rm -f "$tmptar"
fi
"$K3D_BIN" image import "$NATS_IMAGE" -c "$CLUSTER_NAME" >/dev/null

log "phase 2b: install NATS via NatsBasicScore"
(
  cd "$REPO_ROOT"
  cargo run -q --release -p example_iot_nats_install -- \
    --namespace "$NATS_NAMESPACE" \
    --name "$NATS_NAME" \
    --expose load-balancer
)
kubectl -n "$NATS_NAMESPACE" wait --for=condition=Available \
  "deployment/$NATS_NAME" --timeout=120s >/dev/null

log "probing TCP localhost:$NATS_NODE_PORT"
for _ in $(seq 1 60); do
  (echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null && break
  sleep 1
done
(echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null \
  || fail "TCP localhost:$NATS_NODE_PORT never came up"
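# /dev/tcp is a bash redirection builtin, so the probe needs no extra tools,
# but it only proves the port accepts connections. For an actual NATS-level
# check, use the natsbox commands printed in the banner below.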

# ---- phase 3: CRD + operator ------------------------------------------------

log "phase 3: install CRD"
(
  cd "$OPERATOR_DIR"
  cargo run -q -- install
)
kubectl wait --for=condition=Established \
  "crd/deployments.iot.nationtech.io" --timeout=30s >/dev/null

log "phase 4: start operator"
(
  cd "$OPERATOR_DIR"
  cargo build -q --release
)

# Default log level exposes the CR patch loop + watch attach; DEBUG=1
# bumps it so every status patch + transition is printed.
if [[ "${DEBUG:-0}" == "1" ]]; then
  OPERATOR_RUST_LOG="debug,async_nats=warn,hyper=warn,rustls=warn,kube=info"
else
  OPERATOR_RUST_LOG="info,kube_runtime=warn"
fi
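# e.g. a DEBUG=1 run makes the phase-6 patch counter meaningful:
#   DEBUG=1 iot/scripts/load-test.sh
#   grep -c "aggregator: status patched" /tmp/iot-load-test/operator.log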

NATS_URL="nats://localhost:$NATS_NODE_PORT" \
  KV_BUCKET="desired-state" \
  RUST_LOG="$OPERATOR_RUST_LOG" \
  "$REPO_ROOT/target/release/iot-operator-v0" \
  >"$OPERATOR_LOG" 2>&1 &
OPERATOR_PID=$!
log "operator pid=$OPERATOR_PID"
for _ in $(seq 1 30); do
  if grep -q "starting Deployment controller" "$OPERATOR_LOG"; then break; fi
  if ! kill -0 "$OPERATOR_PID" 2>/dev/null; then fail "operator exited early"; fi
  sleep 0.5
done
grep -q "starting Deployment controller" "$OPERATOR_LOG" \
  || fail "operator never logged controller startup"
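# The poll above gives the operator up to 15s (30 × 0.5s) to log its
# controller-startup line before we bail out.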

# ---- explore banner (before the load run so the user can start watching) ----

print_banner() {
  cat <<EOF

$(printf '\033[1;32m[load-test]\033[0m stack ready. In another terminal:')

$(printf '\033[1mPoint kubectl at the k3d cluster:\033[0m')
  export KUBECONFIG=$KUBECONFIG_FILE

$(printf '\033[1mWatch CRs as they update:\033[0m')
  kubectl -n $NAMESPACE get deployments.iot.nationtech.io -w

$(printf '\033[1mSnapshot aggregate columns:\033[0m')
  kubectl -n $NAMESPACE get deployments.iot.nationtech.io \\
    -o custom-columns=NAME:.metadata.name,SUCCEEDED:.status.aggregate.succeeded,FAILED:.status.aggregate.failed,PENDING:.status.aggregate.pending,LAST_ERR:.status.aggregate.lastError.message

$(printf '\033[1mFull CR status JSON for one CR (first group):\033[0m')
  kubectl -n $NAMESPACE get deployments.iot.nationtech.io/load-group-00 -o jsonpath='{.status.aggregate}' | jq

$(printf '\033[1mOperator log:\033[0m')
  tail -F $OPERATOR_LOG

$(printf '\033[1mPeek at NATS KV directly (natsbox):\033[0m')
  alias natsbox='podman run --rm docker.io/natsio/nats-box:latest nats --server nats://host.containers.internal:$NATS_NODE_PORT'
  natsbox kv ls device-state
  natsbox kv get device-state 'state.load-dev-00001.load-group-00' --raw
  natsbox kv ls device-heartbeat
  natsbox kv get device-heartbeat 'heartbeat.load-dev-00001' --raw

EOF
}

print_banner

# ---- phase 5: load test ------------------------------------------------------

log "phase 5: run iot_load_test (devices=$DEVICES, tick=${TICK_MS}ms, duration=${DURATION}s)"
(
  cd "$REPO_ROOT"
  cargo build -q --release -p example_iot_load_test
)

# `--keep` keeps the CRs + KV entries around after the run so you can
# inspect steady-state aggregate numbers after the duration elapses.
LOAD_ARGS=(
  --nats-url "nats://localhost:$NATS_NODE_PORT"
  --namespace "$NAMESPACE"
  --groups "$GROUP_SIZES"
  --tick-ms "$TICK_MS"
  --duration-s "$DURATION"
)
if [[ "$HOLD" == "1" ]]; then
  LOAD_ARGS+=(--keep)
fi

RUST_LOG="info" "$REPO_ROOT/target/release/iot_load_test" "${LOAD_ARGS[@]}"
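# With the defaults above, this expands to roughly:
#   RUST_LOG=info target/release/iot_load_test \
#     --nats-url nats://localhost:4222 --namespace iot-load \
#     --groups 55,5,5,5,5,5,5,5,5,5 --tick-ms 1000 --duration-s 60
# plus --keep when HOLD=1.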

# ---- phase 6: operator log stats --------------------------------------------

log "phase 6: operator log summary"
# `grep -c` still prints "0" when it finds nothing (it just exits nonzero),
# so only fall back to 0 when the log file itself is missing.
count_log() { grep -c "$1" "$OPERATOR_LOG" 2>/dev/null || true; }
patches="$(count_log "aggregator: status patched")"; patches="${patches:-0}"
warnings="$(count_log " WARN ")"; warnings="${warnings:-0}"
errors="$(count_log " ERROR ")"; errors="${errors:-0}"
log " CR status patches logged (DEBUG-level; use DEBUG=1 to surface): $patches"
log " operator warnings: $warnings errors: $errors"
if [[ "$errors" -gt 0 ]]; then
  echo "----- operator error lines -----"
  grep " ERROR " "$OPERATOR_LOG" | tail -20
fi
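# Warnings can be eyeballed the same way:
#   grep " WARN " "$OPERATOR_LOG" | tail -20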

# ---- hold open (optional) ---------------------------------------------------

if [[ "$HOLD" == "1" ]]; then
  print_banner
  log "HOLD=1 — stack is still running. Ctrl-C to tear down."
  # Block until user interrupts; cleanup trap does the teardown.
  while true; do sleep 60; done
fi

log "PASS"