#!/usr/bin/env bash
# gpumod-h6gs benchmark driver: runs gemma4-12b-q4 then gemma4-12b-q5
# sequentially. Each step starts the service, waits for /health, runs the
# 15-iter v2 coding benchmark, stops the service, and waits past the
# quiesce window before moving on.
#
# Launch inside a tmux session ('bench') and pair with a 'monitor' session
# per .claude/CLAUDE.md "Running Long Benchmarks".
set -o errexit -o nounset -o pipefail
# Every relative path below is resolved from three directories above the
# directory that holds this script.
cd "$(dirname "$0")/../../.."

# Seconds to pause after each service stop, and the directory that receives
# results, logs and artifacts.
QUIESCE_SECS=20
OUT="docs/benchmarks/20260603_gemma4_12b_vs_qwen36_35b_a3b_mtp"

# VRAM isolation per .claude/CLAUDE.md "Running Long Benchmarks": switch
# gpumod to blank mode so all gpumod-tracked services are stopped before the
# first model start. Idempotent — if mode is already blank, this is a no-op.
printf '=== %s ensuring blank mode (VRAM isolation) ===\n' "$(date -Iseconds)"
uv run gpumod mode switch blank

archive_prior() {
    # gpumod-t84m: move any existing result/log/artifacts for a model into
    # the next free .runN slot before launching a fresh run, so re-launching
    # this driver never overwrites (or appends to) earlier data. Mirrors the
    # rerun_q8.sh / rerun_26b_a4b.sh pattern.
    #
    # Globals:   OUT (read) — benchmark output directory.
    # Arguments: $1 — model name used in the result/log/artifact names.
    # Outputs:   one progress line on stdout when something is archived.
    # Returns:   0 when nothing needed archiving or every move succeeded;
    #            non-zero as soon as a move fails.
    local model="$1"
    local result="$OUT/result_${model}.json"
    local log="$OUT/run_${model}.log"
    local artifacts="$OUT/artifacts/${model}"

    # Fresh-state launch: nothing on disk for this model. All three items are
    # checked (not only the result JSON) so leftovers from a run that died
    # before writing its result are archived too instead of being mixed into
    # the next run's log/artifacts.
    if [ ! -f "$result" ] && [ ! -f "$log" ] && [ ! -d "$artifacts" ]; then
        return 0
    fi

    # A slot is free only if none of the three archive names is taken, so the
    # result, log and artifacts of one run always share the same N.
    local n=1
    while [ -e "$OUT/result_${model}.run${n}.json" ] \
       || [ -e "$OUT/run_${model}.run${n}.log" ] \
       || [ -e "$OUT/artifacts/${model}.run${n}" ]; do
        n=$((n + 1))
    done
    echo "=== $(date -Iseconds) archiving current ${model} data → run${n} ==="

    # if/fi instead of `[ … ] && mv …`: as the last command of the function a
    # false test would make archive_prior return 1, and the caller's `set -e`
    # would then abort the whole driver even though nothing went wrong.
    if [ -f "$result" ]; then
        mv "$result" "$OUT/result_${model}.run${n}.json" || return
    fi
    if [ -f "$log" ]; then
        mv "$log" "$OUT/run_${model}.run${n}.log" || return
    fi
    if [ -d "$artifacts" ]; then
        mv "$artifacts" "$OUT/artifacts/${model}.run${n}" || return
    fi
    return 0
}

run_one() {
    # Benchmark one model end to end: archive prior data, start the gpumod
    # service, wait for its /health endpoint, run the benchmark script with
    # output tee'd to the per-model log, stop the service, then pause for
    # the quiesce window.
    #
    # Globals:   OUT (read), QUIESCE_SECS (read).
    # Arguments: $1 — gpumod service/model name.
    #            $2 — local port whose /health endpoint is polled.
    # Exits:     1 if /health does not answer within ~5 minutes;
    #            the benchmark pipeline's status if the benchmark fails.
    #            In both cases the service is stopped (best effort) first so
    #            a failed run does not leave the model loaded.
    local model="$1" port="$2"
    local started waited=0 bench_rc=0
    archive_prior "$model"
    echo "=== $(date -Iseconds) starting service $model on port $port ==="
    uv run gpumod service start "$model"
    echo "=== waiting for /health on $port ==="
    # Wall-clock deadline via bash's SECONDS, plus a per-request cap on curl:
    # a port that accepts the connection but never answers would otherwise
    # block curl forever and the 5-minute limit would never fire.
    started=$SECONDS
    until curl -fsS --max-time 5 "http://127.0.0.1:$port/health" >/dev/null 2>&1; do
        sleep 3
        waited=$((SECONDS - started))
        if [ "$waited" -gt 300 ]; then
            echo "!!! $model failed to come up in 5min — aborting"
            uv run gpumod service stop "$model" || true
            exit 1
        fi
    done
    waited=$((SECONDS - started))
    echo "=== health OK after ${waited}s, launching benchmark ==="
    # Capture the pipeline status instead of letting `set -e` abort here, so
    # the service can be stopped before the script exits on failure.
    uv run python scripts/run_qwen36_benchmark.py \
        --model "$model" \
        --output-dir "$OUT" \
        2>&1 | tee -a "$OUT/run_${model}.log" || bench_rc=$?
    if [ "$bench_rc" -ne 0 ]; then
        echo "!!! $(date -Iseconds) benchmark $model failed (exit $bench_rc) — stopping service and aborting"
        uv run gpumod service stop "$model" || true
        exit "$bench_rc"
    fi
    echo "=== $(date -Iseconds) benchmark $model done, stopping service ==="
    uv run gpumod service stop "$model"
    echo "=== quiescing ${QUIESCE_SECS}s before next model ==="
    sleep "$QUIESCE_SECS"
}

# Models are benchmarked strictly one after another; each entry is
# "<service name>:<health port>".
for spec in gemma4-12b-q4:7106 gemma4-12b-q5:7107; do
    run_one "${spec%%:*}" "${spec##*:}"
done

printf '=== %s ALL BENCHMARKS COMPLETE ===\n' "$(date -Iseconds)"
