Files
harmony/fleet/scripts/load-test.sh
Jean-Gabriel Gill-Couture 22eed9b533
Some checks failed
Run Check Script / check (pull_request) Failing after 59s
Merge branch 'feat/iot-walking-skeleton' into feat/deploy_fleet_server_side
2026-05-05 10:32:51 -04:00

292 lines
12 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Load-test harness for the Harmony fleet operator's fleet_aggregator.
#
# Brings up the minimum stack (k3d + in-cluster NATS + CRD + operator)
# with no VM or real agent, then runs the `fleet_load_test` binary
# which simulates N devices pushing DeploymentState to NATS.
#
# All stable paths live under $WORK_DIR (default /tmp/fleet-load-test), so
# you can point kubectl / tail at them while the test is running.
#
# Quick usage:
# fleet/scripts/load-test.sh # 100-device default (55 + 9×5)
# HOLD=1 fleet/scripts/load-test.sh # leave stack running for exploration
# DEVICES=10000 GROUP_SIZES=5500,500,500,500,500,500,500,500,500,500 \
# DURATION=90 fleet/scripts/load-test.sh
#
# While it's running, in another terminal:
# export KUBECONFIG=/tmp/fleet-load-test/kubeconfig
# kubectl get deployments.fleet.nationtech.io -A -w
# kubectl get deployments.fleet.nationtech.io -A \
# -o custom-columns=NAME:.metadata.name,RUN:.status.aggregate.succeeded,FAIL:.status.aggregate.failed,PEND:.status.aggregate.pending
# tail -f /tmp/fleet-load-test/operator.log
#
# Set DEBUG=1 to bump RUST_LOG so the operator logs every status patch.
# Strict mode: abort on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that holds this script.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
# Repository root: two directory levels above the script directory.
REPO_ROOT="$(
  cd "${SCRIPT_DIR}/../.." && pwd
)"
# ---- config -----------------------------------------------------------------
# Every setting below can be overridden from the environment; the value
# after `:=` is applied only when the variable is unset or empty.
: "${K3D_BIN:=$HOME/.local/share/harmony/k3d/k3d}"
: "${CLUSTER_NAME:=fleet-load}"
: "${NATS_NAMESPACE:=fleet-system}"
: "${NATS_NAME:=fleet-nats}"
: "${NATS_NODE_PORT:=4222}"
: "${NATS_IMAGE:=docker.io/library/nats:2.10-alpine}"
: "${DEVICES:=100}"
: "${GROUP_SIZES:=55,5,5,5,5,5,5,5,5,5}"
: "${TICK_MS:=1000}"
: "${DURATION:=60}"
: "${NAMESPACE:=fleet-load}"
# HOLD=1 keeps the stack alive once the test completes so CRs + NATS can
# be inspected interactively; Ctrl-C then tears everything down.
: "${HOLD:=0}"
# A fixed working directory keeps kubectl + tail targets predictable.
: "${WORK_DIR:=/tmp/fleet-load-test}"
mkdir -p "$WORK_DIR"
KUBECONFIG_FILE="${WORK_DIR}/kubeconfig"
OPERATOR_LOG="${WORK_DIR}/operator.log"
: "${OPERATOR_IMAGE:=localhost/harmony-fleet-operator:latest}"
: "${OPERATOR_NAMESPACE:=fleet-system}"
: "${OPERATOR_RELEASE:=harmony-fleet-operator}"
# Not used by the helm path; retained so older trap-cleanup logic that
# reads it does not choke.
OPERATOR_PID=""
# Print a progress message on stdout, prefixed with a bold-blue
# "[load-test]" tag. All arguments are joined into a single message.
log() {
  local message="$*"
  printf '\033[1;34m[load-test]\033[0m %s\n' "$message"
}
# Print an error message on stderr, prefixed with a bold-red
# "[load-test FAIL]" tag, then terminate the shell with status 1.
fail() {
  local reason="$*"
  printf '\033[1;31m[load-test FAIL]\033[0m %s\n' "$reason" >&2
  exit 1
}
#######################################
# Snapshot the last 1000 lines of the operator pod log into $OPERATOR_LOG.
# Best effort: always returns 0.
# Globals:   KUBECONFIG, OPERATOR_NAMESPACE, OPERATOR_RELEASE, OPERATOR_LOG
#            (all read)
# Outputs:   replaces $OPERATOR_LOG only when `kubectl logs` succeeds
#######################################
dump_operator_log() {
  # KUBECONFIG is exported only once the cluster exists. The `:-` default
  # keeps `set -u` from aborting when this runs earlier, e.g. from the
  # exit trap after a preflight failure.
  local kubeconfig="${KUBECONFIG:-}"
  [[ -n "$kubeconfig" && -f "$kubeconfig" ]] || return 0

  # Fetch into a side file first: redirecting straight into $OPERATOR_LOG
  # would truncate the previous snapshot even when kubectl cannot reach
  # the cluster, destroying the very log we want to keep.
  local snapshot="${OPERATOR_LOG}.partial"
  if kubectl -n "$OPERATOR_NAMESPACE" logs "deployment/$OPERATOR_RELEASE" \
    --tail=1000 >"$snapshot" 2>/dev/null; then
    mv -f -- "$snapshot" "$OPERATOR_LOG" 2>/dev/null || true
  else
    rm -f -- "$snapshot"
  fi
  return 0
}
#######################################
# Trap handler: tear the k3d cluster down and surface the operator log.
# Intended to be installed for EXIT, INT and TERM.
# Globals:   K3D_BIN, CLUSTER_NAME, OPERATOR_LOG (read)
# Outputs:   progress lines; on failure, the last 60 operator log lines
# Returns:   never returns; exits with the status that was current when
#            the handler was entered
#######################################
cleanup() {
  local rc=$?
  # Disarm the traps first. This handler ends in `exit`, which would
  # otherwise re-enter it through the EXIT trap whenever it was reached
  # via INT/TERM, running the whole teardown a second time against a
  # cluster that is already gone.
  trap - EXIT INT TERM
  log "cleanup…"
  # Capture the operator's in-cluster log before the cluster is deleted,
  # so the tail-on-failure output below has something to show.
  dump_operator_log
  "$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
  if [[ $rc -ne 0 && -s "$OPERATOR_LOG" ]]; then
    log "operator log at $OPERATOR_LOG (kept for inspection)"
    echo "----- operator log tail -----"
    tail -n 60 "$OPERATOR_LOG" 2>/dev/null || true
  elif [[ -s "$OPERATOR_LOG" ]]; then
    log "operator log at $OPERATOR_LOG"
  fi
  exit "$rc"
}
# Run cleanup whenever the script exits, and also when it receives
# SIGINT (Ctrl-C) or SIGTERM.
trap cleanup EXIT INT TERM
# Abort through fail() unless the command named by $1 can be found.
# Arguments: $1 - name of the required tool
require() {
  local tool="$1"
  if ! command -v "$tool" >/dev/null 2>&1; then
    fail "missing required tool: $tool"
  fi
}
# Preflight: every external tool the harness shells out to must be on
# PATH, and the k3d binary must be executable.
for required_tool in cargo kubectl podman docker helm; do
  require "$required_tool"
done
unset required_tool
if [[ ! -x "$K3D_BIN" ]]; then
  fail "k3d binary not executable at $K3D_BIN"
fi
# ---- phase 1: k3d cluster ---------------------------------------------------
log "phase 1: create k3d cluster '$CLUSTER_NAME' (host port $NATS_NODE_PORT → loadbalancer)"
# Remove any cluster of the same name first; a failed delete is ignored.
"$K3D_BIN" cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
# Creation options: wait (90s timeout) and publish the NATS port on the
# host through the k3d loadbalancer.
k3d_create_args=(
  --wait --timeout 90s
  -p "${NATS_NODE_PORT}:${NATS_NODE_PORT}@loadbalancer"
)
"$K3D_BIN" cluster create "$CLUSTER_NAME" "${k3d_create_args[@]}" >/dev/null
# Write the kubeconfig to its stable path, then make it the default for
# every later kubectl invocation.
"$K3D_BIN" kubeconfig get "$CLUSTER_NAME" >"$KUBECONFIG_FILE"
KUBECONFIG="$KUBECONFIG_FILE"
export KUBECONFIG
# ---- phase 2: NATS in-cluster ------------------------------------------------
log "phase 2a: sideload NATS image ($NATS_IMAGE)"
# When docker does not already hold the image, obtain it through podman
# (pulling only if podman lacks it too) and copy it across as a tarball.
# NOTE(review): presumably `k3d image import` reads from docker's image
# store, which is why the copy is needed — confirm.
if docker image inspect "$NATS_IMAGE" >/dev/null 2>&1; then
  : # docker already has the image; nothing to copy
else
  podman image inspect "$NATS_IMAGE" >/dev/null 2>&1 \
    || podman pull "$NATS_IMAGE" >/dev/null \
    || fail "podman pull $NATS_IMAGE failed"
  nats_tar="$(mktemp -t nats-image.XXXXXX.tar)"
  podman save "$NATS_IMAGE" -o "$nats_tar" >/dev/null
  docker load -i "$nats_tar" >/dev/null
  rm -f "$nats_tar"
fi
"$K3D_BIN" image import "$NATS_IMAGE" -c "$CLUSTER_NAME" >/dev/null
log "phase 2b: install NATS via NatsBasicScore"
nats_install_args=(
  --namespace "$NATS_NAMESPACE"
  --name "$NATS_NAME"
  --expose load-balancer
)
# Run from the repo root in a subshell so the caller's cwd is untouched.
(cd "$REPO_ROOT" && cargo run -q --release -p example_fleet_nats_install -- "${nats_install_args[@]}")
# The upstream nats/nats helm chart creates a StatefulSet rather than a
# Deployment. Waiting on pods selected by label works for either shape,
# so no workload kind is hardcoded here.
kubectl -n "$NATS_NAMESPACE" wait pod \
  -l "app.kubernetes.io/name=nats" \
  --for=condition=Ready --timeout=180s >/dev/null
log "probing nats://localhost:$NATS_NODE_PORT end-to-end"
# Try to open a TCP connection to the published port up to 60 times,
# one second apart, stopping at the first success.
probe_attempts_left=60
while ((probe_attempts_left > 0)); do
  if (echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null; then
    break
  fi
  sleep 1
  probe_attempts_left=$((probe_attempts_left - 1))
done
# One final probe decides the outcome, whether or not the loop succeeded.
if ! (echo >"/dev/tcp/127.0.0.1/$NATS_NODE_PORT") 2>/dev/null; then
  fail "TCP localhost:$NATS_NODE_PORT never came up"
fi
# ---- phase 3: operator container image + helm install ---------------------
log "phase 3a: build container image $OPERATOR_IMAGE"
# The Dockerfile is multi-stage and runs `cargo build` itself inside a
# pinned rust image, hence the workspace root as build context.
operator_build_args=(
  -q
  -f "$REPO_ROOT/fleet/harmony-fleet-operator/Dockerfile"
  -t "$OPERATOR_IMAGE"
)
podman build "${operator_build_args[@]}" "$REPO_ROOT" >/dev/null
log "phase 3b: sideload operator image into k3d cluster"
# Same route as the NATS image: podman tarball -> docker -> k3d import.
operator_tar="$(mktemp -t harmony-fleet-operator-image.XXXXXX.tar)"
podman save "$OPERATOR_IMAGE" -o "$operator_tar" >/dev/null
docker load -i "$operator_tar" >/dev/null
rm -f "$operator_tar"
"$K3D_BIN" image import "$OPERATOR_IMAGE" -c "$CLUSTER_NAME" >/dev/null
log "phase 3c: install operator (CRDs + RBAC + Deployment) via FleetServerScore"
# Choose the operator's log filter. DEBUG=1 raises verbosity so that
# `kubectl logs` shows every status patch + transition.
case "${DEBUG:-0}" in
  1) OPERATOR_RUST_LOG="debug,async_nats=warn,hyper=warn,rustls=warn,kube=info" ;;
  *) OPERATOR_RUST_LOG="info,kube_runtime=warn" ;;
esac
# FleetServerScore composes NatsBasicScore + FleetOperatorScore. NATS is
# already installed (phase 2b), so HelmChartScore::find_installed_release
# short-circuits the NATS branch and the operator install is the only
# meaningful work done by this invocation.
server_install_args=(
  --nats-namespace "$NATS_NAMESPACE"
  --nats-name "$NATS_NAME"
  --nats-expose load-balancer
  --operator-namespace "$OPERATOR_NAMESPACE"
  --operator-release "$OPERATOR_RELEASE"
  --operator-image "$OPERATOR_IMAGE"
  --operator-image-pull-policy IfNotPresent
  --log-level "$OPERATOR_RUST_LOG"
)
(cd "$REPO_ROOT" && cargo run -q --release -p example_fleet_server_install -- "${server_install_args[@]}")
# Sanity assertions: the Score reported success, but confirm that what it
# installed actually came up. Cheap, and useful for failure triage.
for crd_plural in deployments devices; do
  kubectl wait --for=condition=Established \
    "crd/${crd_plural}.fleet.nationtech.io" --timeout=30s >/dev/null
done
unset crd_plural
kubectl -n "$OPERATOR_NAMESPACE" wait --for=condition=Available \
  "deployment/$OPERATOR_RELEASE" --timeout=120s >/dev/null
# Seed the operator log file from the pod so the banner's `tail` target
# and the final summary both have content; cleanup dumps it again.
dump_operator_log
# ---- explore banner (before the load run so the user can start watching) ----
# Print copy-pasteable commands for exploring the running stack.
# Globals: KUBECONFIG_FILE, NAMESPACE, OPERATOR_NAMESPACE,
#          OPERATOR_RELEASE, OPERATOR_LOG, NATS_NODE_PORT (all read)
# Outputs: the banner on stdout; headings use ANSI bold / green.
print_banner() {
  local green=$'\033[1;32m'
  local bold=$'\033[1m'
  local reset=$'\033[0m'
  # The here-doc body stays at column 0 so the emitted text is unchanged.
  cat <<EOF
${green}[load-test]${reset} stack ready. In another terminal:
${bold}Point kubectl at the k3d cluster:${reset}
export KUBECONFIG=$KUBECONFIG_FILE
${bold}Watch CRs as they update:${reset}
kubectl -n $NAMESPACE get deployments.fleet.nationtech.io -w
${bold}Snapshot aggregate columns:${reset}
kubectl -n $NAMESPACE get deployments.fleet.nationtech.io \\
-o custom-columns=NAME:.metadata.name,MATCHED:.status.aggregate.matchedDeviceCount,OK:.status.aggregate.succeeded,FAIL:.status.aggregate.failed,PEND:.status.aggregate.pending,LAST_ERR:.status.aggregate.lastError.message
${bold}Inspect a Deployment spec (no device list — selector only):${reset}
kubectl -n $NAMESPACE get deployments.fleet.nationtech.io/load-group-00 -o jsonpath='{.spec}' | jq
${bold}Full CR status JSON for one CR:${reset}
kubectl -n $NAMESPACE get deployments.fleet.nationtech.io/load-group-00 -o jsonpath='{.status.aggregate}' | jq
${bold}List Devices + filter by label:${reset}
kubectl get devices.fleet.nationtech.io | head -20
kubectl get devices.fleet.nationtech.io -l group=load-group-00 | head -10
kubectl get device.fleet.nationtech.io load-dev-00001 -o yaml
${bold}Operator log (in-cluster pod):${reset}
kubectl -n $OPERATOR_NAMESPACE logs -f deployment/$OPERATOR_RELEASE
# or the last snapshot dumped by the harness:
tail -F $OPERATOR_LOG
${bold}Peek at NATS KV directly (natsbox):${reset}
alias natsbox='podman run --rm docker.io/natsio/nats-box:latest nats --server nats://host.containers.internal:$NATS_NODE_PORT'
natsbox kv ls device-state
natsbox kv get device-state 'state.load-dev-00001.load-group-00' --raw
natsbox kv ls device-heartbeat
natsbox kv get device-heartbeat 'heartbeat.load-dev-00001' --raw
EOF
}
# Show the exploration banner before the load run starts, so the user can
# begin watching right away. No `natsbox` alias is defined here: bash does
# not expand aliases in a non-interactive script, nothing in the script
# calls it, and the banner already prints an alias line (built from
# $NATS_NODE_PORT) for the user to paste into their own shell.
print_banner
# ---- phase 5: load test ------------------------------------------------------
log "phase 5: run fleet_load_test (devices=$DEVICES, tick=${TICK_MS}ms, duration=${DURATION}s)"
# NOTE(review): DEVICES only appears in the log line above; the simulator
# is sized through --groups (GROUP_SIZES). Confirm callers are expected to
# keep the two in sync.
(cd "$REPO_ROOT" && cargo build -q --release -p example_fleet_load_test)
# Arguments for the simulator binary, assembled one option at a time.
LOAD_ARGS=(--nats-url "nats://localhost:$NATS_NODE_PORT")
LOAD_ARGS+=(--namespace "$NAMESPACE")
LOAD_ARGS+=(--groups "$GROUP_SIZES")
LOAD_ARGS+=(--tick-ms "$TICK_MS")
LOAD_ARGS+=(--duration-s "$DURATION")
# HOLD=1 adds `--keep`; presumably this leaves the CRs + KV entries in
# place after the run so steady-state aggregate numbers can be inspected
# once the duration elapses — confirm against the binary's flags.
case "$HOLD" in
  1) LOAD_ARGS+=(--keep) ;;
esac
RUST_LOG="info" "$REPO_ROOT/target/release/fleet_load_test" "${LOAD_ARGS[@]}"
# ---- phase 6: operator log stats --------------------------------------------
#######################################
# Count the lines of a file that contain a fixed string.
# Arguments: $1 - literal text to look for
#            $2 - file to search
# Outputs:   exactly one integer on stdout (0 for a missing/unreadable file)
# Returns:   0 always
#######################################
count_log_matches() {
  local needle="$1" file="$2" count
  # `grep -c` already prints "0" (and exits 1) when nothing matches, so
  # its status is swallowed here. Answering it with `|| echo 0` would
  # yield the two-line value "0\n0", which is not a number.
  count="$(grep -cF -- "$needle" "$file" 2>/dev/null || true)"
  printf '%s\n' "${count:-0}"
}
log "phase 6: operator log summary"
dump_operator_log
patches="$(count_log_matches "aggregator: status patched" "$OPERATOR_LOG")"
warnings="$(count_log_matches " WARN " "$OPERATOR_LOG")"
errors="$(count_log_matches " ERROR " "$OPERATOR_LOG")"
log " CR status patches logged (DEBUG-level; use DEBUG=1 to surface): $patches"
log " operator warnings: $warnings errors: $errors"
if ((errors > 0)); then
  echo "----- operator error lines -----"
  grep -F -- " ERROR " "$OPERATOR_LOG" | tail -20
fi
# ---- hold open (optional) ---------------------------------------------------
case "$HOLD" in
  1)
    print_banner
    log "HOLD=1 — stack is still running. Ctrl-C to tear down."
    # Sleep forever; the user's interrupt fires the cleanup trap, which
    # performs the teardown.
    while :; do
      sleep 60
    done
    ;;
esac
log "PASS"