diff --git a/.github/workflows/bench-abba.yml b/.github/workflows/bench-abba.yml
new file mode 100644
index 000000000..200982d17
--- /dev/null
+++ b/.github/workflows/bench-abba.yml
@@ -0,0 +1,122 @@
+name: Bench ABBA tiebreaker
+
+# Drift-free paired (A/B/B/A) prover benchmark for resolving small (~1%) deltas the
+# cheap PR benchmark can't confirm. It builds both binaries and runs ~20 interleaved
+# pairs, so it OCCUPIES THE SINGLE BENCH SERVER FOR ~30-40 MIN. For that reason it
+# NEVER auto-triggers -- it runs only on an explicit `/bench-abba` comment on a PR.
+on:
+  issue_comment:
+    types: [created]
+
+# One ABBA run per PR: only a re-trigger cancels the stale run (other comments get a
+# unique group). The single self-hosted bench runner serializes across PRs on its own.
+concurrency:
+  group: bench-abba-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, '/bench-abba') && contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) && 'cmd' || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+
+jobs:
+  abba:
+    # Manual-only: needs a PR comment starting with "/bench-abba" from a MEMBER/OWNER/COLLABORATOR. Never auto.
+    if: >-
+      github.event.issue.pull_request &&
+      startsWith(github.event.comment.body, '/bench-abba') &&
+      contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)
+    runs-on: [self-hosted, bench]
+    # Generous ceiling so a hang/OOM can't strand the single bench runner; the
+    # workload itself is ~30-40 min at the default 20 pairs (clamped to <=40).
+    timeout-minutes: 120
+    steps:
+      - name: Acknowledge (react + occupancy notice)
+        uses: actions/github-script@v7
+        with:
+          script: |
+            // React to the triggering comment, then tell the PR the bench server is busy.
+            const { owner, repo } = context.repo;
+            await github.rest.reactions.createForIssueComment({
+              owner, repo, comment_id: context.payload.comment.id, content: 'eyes'
+            });
+            await github.rest.issues.createComment({
+              owner, repo, issue_number: context.issue.number,
+              body: '⏳ **ABBA tiebreaker started** on the bench server (~30–40 min). The bench server is occupied until it finishes.'
+            });
+
+      - name: Resolve PR head + pair count
+        id: cfg
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: |
+          # Resolve the head SHA (not the branch name): pinning the commit works for
+          # fork PRs too (the branch lives in the fork, not origin/) and avoids a
+          # force-push race mid-run.
+          HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "head_sha=$HEAD_SHA" >> "$GITHUB_OUTPUT"
+          # Optional pair count on the command (first) line, e.g. "/bench-abba 32"; default 20.
+          # Whitelist [2,40] by pattern: a numeric test errors out (and passes) on huge values.
+          N=$(printf '%s\n' "$COMMENT_BODY" | sed -n '1s|^/bench-abba[[:space:]]*\([0-9][0-9]*\).*|\1|p')
+          N=${N:-20}
+          case "$N" in
+            [2-9]|[1-3][0-9]|40) ;;
+            *) echo "::warning::pair count $N out of range [2,40]; using 20"; N=20 ;;
+          esac
+          echo "pairs=$N" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout (full history for ref resolution)
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 = full history rather than a shallow clone
+
+      - name: Fetch PR head commit (works for fork PRs)
+        env:
+          PR_NUM: ${{ github.event.issue.number }}
+        run: git fetch origin "pull/$PR_NUM/head" --quiet  # pull/<n>/head is served by origin even for fork PRs
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"  # entries written here reach PATH in the following steps
+
+      - name: Run ABBA tiebreaker
+        id: run
+        env:
+          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
+          PAIRS: ${{ steps.cfg.outputs.pairs }}
+        run: |
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          set -o pipefail  # otherwise the pipeline status is tee's and a failed benchmark looks green
+          # bench_abba.sh builds the cli at both refs (isolated worktree), runs the
+          # interleaved pairs, and prints the paired-t CI + exact Wilcoxon test.
+          # Pass the head SHA (pinned above) so fork PRs resolve.
+          scripts/bench_abba.sh "$HEAD_SHA" origin/main "$PAIRS" 2>&1 | tee /tmp/abba_out.txt
+          sed -n '/=== ABBA paired result/,$p' /tmp/abba_out.txt > /tmp/abba_result.txt  # keep only the final report
+
+      - name: Post result
+        if: always()
+        uses: actions/github-script@v7
+        env:
+          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
+          PAIRS: ${{ steps.cfg.outputs.pairs }}
+          OUTCOME: ${{ steps.run.outcome }}
+        with:
+          script: |
+            const fs = require('fs');
+            const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
+            const head = (process.env.HEAD_SHA || 'unknown').slice(0, 10), pairs = process.env.PAIRS || '?', outcome = process.env.OUTCOME;
+            let body = `## ABBA tiebreaker — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
+            if (outcome === 'success') {
+              const res = read('/tmp/abba_result.txt') || read('/tmp/abba_out.txt');
+              body += '```\n' + res + '\n```\n';
+              body += '\n<sub>Drift-free interleaved A/B/B/A measurement. + = PR faster. Trust the verdict when paired-t and Wilcoxon agree.</sub>\n';
+            } else if (outcome === 'failure' || outcome === 'cancelled') {  // step ran, so /tmp/abba_out.txt is THIS run's log
+              const tail = read('/tmp/abba_out.txt').split('\n').slice(-30).join('\n');
+              body += `❌ Run ${outcome === 'failure' ? 'failed' : 'was cancelled'}. Last log lines:\n\n` + '```\n' + tail + '\n```\n';
+            } else {
+              body += '❌ The benchmark step never started (an earlier step failed or the job was cancelled); see the workflow run log.\n';
+            }
+            await github.rest.issues.createComment({
+              owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, body
+            });
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
index ca66bf9a7..57169967d 100644
--- a/.github/workflows/benchmark-pr.yml
+++ b/.github/workflows/benchmark-pr.yml
@@ -38,7 +38,11 @@ env:
   ELF: executor/program_artifacts/rust/ethrex.elf
   INPUT: executor/tests/ethrex_bench_20.bin
   BENCH_RUNS_PR: 3
-  BENCH_RUNS_BASELINE: 3
+  # Cheap-tier screen: catches regressions down to ~1.5% on its own and leaves
+  # smaller/ambiguous deltas to the manual drift-free ABBA tiebreaker. Pushing either
+  # side past 5 buys little here (the cached comparison can't beat the ~1% drift wall),
+  # so the per-PR run count is also capped at 5 (clamp below).
+  BENCH_RUNS_BASELINE: 5
   # Memory-scaling sweep: same ELF, different N-transfer inputs. GROWTH_PROGRAMS
   # are the generated (gitignored) fixture basenames in executor/tests/; GROWTH_STEPS
   # the matching transfer counts (x-axis; slope is MB per transfer).
@@ -55,6 +59,7 @@ jobs:
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        startsWith(github.event.comment.body, '/bench') &&
+       !startsWith(github.event.comment.body, '/bench-abba') &&
        contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
     steps:
       - name: React to comment
@@ -136,9 +141,10 @@ jobs:
             RUNS=$BENCH_RUNS_PR
           fi
 
-          # Clamp to 1-10
-          if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
-            echo "::warning::Run count $RUNS out of range [1,10], defaulting to $BENCH_RUNS_PR"
+          # Clamp to 1-5. Beyond 5 the single-session cached comparison barely improves
+          # (it can't beat the ~1% drift wall); use the ABBA tiebreaker for finer deltas.
+          if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 5 ] 2>/dev/null; then
+            echo "::warning::Run count $RUNS out of range [1,5], defaulting to $BENCH_RUNS_PR"
             RUNS=$BENCH_RUNS_PR
           fi
 
@@ -702,6 +708,18 @@ jobs:
               body += `> ✅ No significant change.\n`;
             }
 
+            // Tier-1 -> Tier-2 escalation: a small time speedup the cheap 3-5 run CI
+            // can't confirm (it catches >=~1.5% on its own). Point at the drift-free
+            // ABBA tiebreaker, which the user runs on demand via `/bench-abba`.
+            const tp = parseFloat(timePct);
+            if (tp < 0 && tp > -1.5) {
+              body += `>\n`;
+              body += `> 🔬 **Looks like a small speedup (${fmt(timePct)}%) — below what ${runs} runs can confirm.** `;
+              body += `Comment \`/bench-abba\` to run the drift-free ABBA tiebreaker (paired-t CI + exact Wilcoxon). `;
+              body += `Note: it occupies the bench server for ~30–40 min.\n`;
+              body += `> Optional pair count: \`/bench-abba 32\` (20 resolves ~1%, 32 for ~0.6%).\n`;
+            }
+
             // Spread warnings
             const prWarnings = [];
             const baseWarnings = [];
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
new file mode 100755
index 000000000..79bfddf27
--- /dev/null
+++ b/scripts/bench_abba.sh
@@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+#
+# bench_abba.sh — interleaved A/B/B/A paired prover benchmark.
+#
+# WHY: comparing a PR against a separately-recorded (cached) baseline conflates the
+# code delta with machine drift between the two measurement sessions. For small
+# (~1%) prover changes that drift is the dominant error. Measuring both binaries
+# *interleaved on the same machine in the same session* cancels the drift (it hits
+# both sides equally), and a paired analysis over the A/B pairs is far more powerful
+# than an unpaired two-sample test.
+#
+# WHAT IT DOES:
+#   1. Builds the ethrex guest ELF + 20-transfer fixture once (identical for both
+#      sides — a prover-only change doesn't touch the guest).
+#   2. Builds the `cli` prover at REF_A and REF_B (skips the build and reuses the cached
+#      binaries only if they exist for the same resolved SHAs; set REBUILD=1 to force).
+#   3. Runs N_PAIRS interleaved pairs in A B B A ... order (alternating which side
+#      runs first each pair, to cancel linear drift). Use an EVEN N_PAIRS.
+#   4. Reports BOTH a paired-t 95% CI (sensitive to outliers) AND a robust
+#      median + Wilcoxon signed-rank result (shrugs off transient slow runs).
+#
+# CONVENTION: every reported number is an IMPROVEMENT, positive = PR FASTER.
+#
+# USAGE:
+#   scripts/bench_abba.sh REF_A [REF_B] [N_PAIRS]
+#     REF_A    REQUIRED — ref or SHA to evaluate (the PR side)
+#     REF_B    baseline   (default: origin/main)
+#     N_PAIRS  pairs      (default: 20 -> 40 runs, ~33 min on ethrex)
+#   Env: REBUILD=1 forces a rebuild even if cached binaries exist.
+#
+#   Sizing (ethrex pair-noise sd ~1.2%, 80% power): ~12 pairs for a 1% effect,
+#   ~18 for 0.8%, ~32 for 0.6%. Default 20 -> solid on 0.8-1%, ~60% power at 0.6%
+#   (if a 20-pair run straddles 0 on a ~0.6%-looking effect, extend to 32).
+#
+#   scripts/bench_abba.sh origin/my-pr-branch                # vs main, 20 pairs
+#   scripts/bench_abba.sh origin/my-pr-branch origin/main 32 # 32 pairs (~0.6%)
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+  echo "usage: bench_abba.sh REF_A [REF_B=origin/main] [N_PAIRS=20]" >&2
+  echo "  REF_A: ref or SHA to evaluate (the PR side)" >&2
+  exit 2
+fi
+REF_A="$1"
+REF_B="${2:-origin/main}"
+N_PAIRS="${3:-20}"
+[[ "$N_PAIRS" =~ ^[1-9][0-9]*$ ]] || { echo "ERROR: N_PAIRS must be a positive integer (got '$N_PAIRS')." >&2; exit 2; }
+
+ELF_REL="executor/program_artifacts/rust/ethrex.elf"
+INPUT_REL="executor/tests/ethrex_bench_20.bin"
+WORK="/tmp/abba_run"
+WT="/tmp/abba_wt"
+PROOF="/tmp/abba_proof.bin"
+
+ROOT="$(git rev-parse --show-toplevel)"
+cd "$ROOT"
+
+# Fail fast on the toolchain the final stats step needs, before the ~30-min build.
+command -v python3 >/dev/null 2>&1 || { echo "ERROR: python3 is required (final stats step)." >&2; exit 1; }
+
+echo "==> Refs"
+git fetch origin --quiet || echo "WARNING: 'git fetch origin' failed -- resolving against possibly-stale local refs." >&2
+# --verify + ^{commit}: accept exactly one commit (plain rev-parse echoes stray args such as paths).
+SHA_A="$(git rev-parse --verify "$REF_A^{commit}")"
+SHA_B="$(git rev-parse --verify "$REF_B^{commit}")"
+echo "   A (PR)       $REF_A  -> ${SHA_A:0:10}"
+echo "   B (baseline) $REF_B  -> ${SHA_B:0:10}"
+[ $((N_PAIRS % 2)) -eq 0 ] || echo "   WARNING: N_PAIRS=$N_PAIRS is odd; use an even count so AB/BA orders balance."
+echo "   pairs=$N_PAIRS  (=$((N_PAIRS * 2)) prove runs)"
+
+mkdir -p "$WORK"
+
+# --- 1. Guest ELF + fixture (identical for both sides; build once if missing) ---
+if [ ! -f "$ELF_REL" ]; then
+  echo "==> Building ethrex guest ELF (missing)"
+  export SYSROOT_DIR="${SYSROOT_DIR:-$HOME/.lambda-vm-sysroot}"  # keep a caller-provided value, else this default
+  make "$ELF_REL"
+fi
+if [ ! -f "$INPUT_REL" ]; then
+  echo "==> Generating ethrex 20-transfer fixture (missing)"
+  ( cd tooling/ethrex-fixtures && cargo build --release )  # subshell: the cd does not leak out
+  tooling/ethrex-fixtures/target/release/ethrex-fixtures 20 "$INPUT_REL" distinct
+fi
+ELF="$(cd "$(dirname "$ELF_REL")" && pwd)/$(basename "$ELF_REL")"  # absolute path
+INPUT="$(cd "$(dirname "$INPUT_REL")" && pwd)/$(basename "$INPUT_REL")"  # absolute path
+
+# --- 2. Build (or reuse) both prover binaries ---
+need_build=0
+if [ "${REBUILD:-0}" = "1" ] || [ ! -x "$WORK/cli_A" ] || [ ! -x "$WORK/cli_B" ]; then
+  need_build=1
+elif [ "$(cat "$WORK/cli_A.sha" 2>/dev/null)" != "$SHA_A" ] || [ "$(cat "$WORK/cli_B.sha" 2>/dev/null)" != "$SHA_B" ]; then
+  # Cache persists on the self-hosted runner; rebuild if it's for different refs
+  # (a different PR, or main advanced) so we never benchmark stale binaries.
+  echo "==> Cached binaries are for different refs; rebuilding."
+  need_build=1
+fi
+if [ "$need_build" = "1" ]; then
+  cleanup() { git worktree remove --force "$WT" 2>/dev/null || true; }
+  trap cleanup EXIT
+  git worktree remove --force "$WT" 2>/dev/null || { rm -rf "$WT"; git worktree prune; }  # also clear leftovers git no longer tracks
+  echo "==> Building both prover binaries in isolated worktree $WT"
+  git worktree add --detach "$WT" "$SHA_B" >/dev/null
+  build_cli() {  # $1=sha $2=out (shared target dir -> 2nd build is incremental)
+    echo "==> Building cli @ ${1:0:10} -> $2"
+    git -C "$WT" checkout --quiet "$1"
+    if ! ( cd "$WT" && CARGO_TARGET_DIR="$WT/target" cargo build --release -p cli --features jemalloc-stats >"$WORK/build_$2.log" 2>&1 ); then
+      echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2
+      tail -40 "$WORK/build_$2.log" >&2
+      exit 1
+    fi
+    cp "$WT/target/release/cli" "$WORK/$2"  # target dir is pinned above, so an inherited CARGO_TARGET_DIR can't move it
+    echo "$1" > "$WORK/$2.sha"
+  }
+  build_cli "$SHA_B" cli_B
+  build_cli "$SHA_A" cli_A
+  cleanup
+  trap - EXIT
+else
+  echo "==> Reusing cached binaries (SHAs match requested refs; REBUILD=1 to force):"
+  echo "     cli_A=${SHA_A:0:10}  cli_B=${SHA_B:0:10}"
+fi
+
+# --- 3. Interleaved A/B/B/A measurement (fresh CSV -- pre-committed batch) ---
+run_prove() {  # $1=binary -> echoes proving time (s); a failed prove aborts (its timing would be meaningless)
+  local out t="" rc=0
+  out="$("$1" prove "$ELF" --private-input "$INPUT" -o "$PROOF" --time 2>&1)" || rc=$?
+  rm -f "$PROOF"
+  if [ "$rc" -eq 0 ]; then t="$(printf '%s\n' "$out" | grep -o 'Proving time: [0-9.]*' | awk 'NR==1 {print $3}')" || true; fi
+  if [ -z "$t" ]; then
+    echo "ERROR: prover exited with status $rc or printed no parsable 'Proving time'; cli output:" >&2
+    printf '%s\n' "$out" >&2
+    exit 1
+  fi
+  echo "$t"
+}
+
+echo "==> Running $N_PAIRS interleaved pairs  (improvement: + = PR faster)"
+printf 'pair,a_time,b_time\n' > "$WORK/pairs.csv"
+for i in $(seq 1 "$N_PAIRS"); do
+  # ABBA ordering: odd-numbered pairs run A first, even-numbered pairs run B first.
+  case $((i % 2)) in
+    1) a="$(run_prove "$WORK/cli_A")"; b="$(run_prove "$WORK/cli_B")" ;;
+    *) b="$(run_prove "$WORK/cli_B")"; a="$(run_prove "$WORK/cli_A")" ;;
+  esac
+  printf '%d,%s,%s\n' "$i" "$a" "$b" >> "$WORK/pairs.csv"
+  printf '   pair %2d/%d   A=%ss  B=%ss   PR %+.2f%% (+=faster)\n' \
+    "$i" "$N_PAIRS" "$a" "$b" "$(awk -v a="$a" -v b="$b" 'BEGIN { print (b - a) / b * 100 }')"
+done
+
+# --- 4. Paired t-test + robust median/Wilcoxon ---
+python3 - "$WORK/pairs.csv" <<'PY'
+import sys, csv, math
+
+rows = list(csv.DictReader(open(sys.argv[1])))
+A = [float(r['a_time']) for r in rows]   # PR
+B = [float(r['b_time']) for r in rows]   # baseline
+n = len(A)
+if n == 0:                               # nothing measured: every statistic below divides by n
+    sys.exit(f"ERROR: no pairs recorded in {sys.argv[1]}")
+d = [(b - a) / b * 100.0 for a, b in zip(A, B)]   # per-pair improvement %: positive => PR (A) faster
+
+# ---- parametric: paired t ----
+mean = sum(d) / n
+sd = math.sqrt(sum((x - mean) ** 2 for x in d) / (n - 1)) if n > 1 else 0.0   # sample sd
+se = sd / math.sqrt(n)
+TT = {1:12.706,2:4.303,3:3.182,4:2.776,5:2.571,6:2.447,7:2.365,8:2.306,9:2.262,
+      10:2.228,11:2.201,12:2.179,13:2.160,14:2.145,15:2.131,16:2.120,17:2.110,
+      18:2.101,19:2.093,20:2.086,21:2.080,22:2.074,23:2.069,24:2.064,25:2.060,
+      26:2.056,27:2.052,28:2.048,29:2.045,30:2.042,35:2.030,40:2.021,50:2.009,
+      60:2.000,80:1.990,120:1.980}
+df = n - 1
+tc = 1.96 if df > 120 else TT[max((k for k in TT if k <= df), default=1)]   # untabulated df rounds DOWN: CI never understated
+lo, hi = mean - tc * se, mean + tc * se
+
+# ---- robust: median + Wilcoxon signed-rank (tie-averaged ranks, EXACT p, pure stdlib) ----
+def median(xs):   # middle value of the sorted data; mean of the two middle values when the length is even
+    s = sorted(xs); mid, odd = divmod(len(s), 2)
+    return s[mid] if odd else (s[mid - 1] + s[mid]) / 2
+
+nz = [x for x in d if x != 0.0]                # zero differences carry no sign and are dropped
+m = len(nz)
+order = sorted(range(m), key=lambda i: abs(nz[i]))
+ranks = [0.0] * m
+i = 0
+while i < m:                                   # average ranks within ties on |d|
+    j = i
+    while j + 1 < m and abs(nz[order[j + 1]]) == abs(nz[order[i]]):
+        j += 1
+    avg = (i + 1 + j + 1) / 2.0
+    for k in range(i, j + 1):
+        ranks[order[k]] = avg
+    i = j + 1
+Wp = sum(r for r, x in zip(ranks, nz) if x > 0)
+Wn = sum(r for r, x in zip(ranks, nz) if x < 0)
+mu = m * (m + 1) / 4.0
+sig = math.sqrt(m * (m + 1) * (2 * m + 1) / 24.0) if m else 0.0
+z = (Wp - mu - (0.5 if Wp > mu else -0.5 if Wp < mu else 0.0)) / sig if sig else 0.0   # normal approx (display only); no correction when Wp == mu
+# EXACT two-sided p: enumerate the signed-rank null distribution. Each rank is +/- with
+# prob 1/2, so the count of assignments giving W+=v is the coeff of x^v in prod(1 + x^rank)
+# -- build it with a generating-function DP. Double the ranks so tie-averaged (half-integer)
+# ranks become integers. No scipy; exact even at small n where the normal approx is loose.
+if m:
+    ir = [int(round(2 * r)) for r in ranks]
+    poly = [1]
+    for r in ir:
+        nxt = [0] * (len(poly) + r)
+        for v, c in enumerate(poly):
+            if c:
+                nxt[v] += c          # this rank negative -> adds 0 to W+
+                nxt[v + r] += c      # this rank positive -> adds r to W+
+        poly = nxt
+    Wp2 = int(round(2 * Wp))
+    p = min(1.0, 2.0 * min(sum(poly[:Wp2 + 1]), sum(poly[Wp2:])) / (1 << m))
+else:
+    p = 1.0
+med = median(d)
+
+# ---- server stability (byproduct): run-to-run jitter + within-session drift ----
+def cv(xs):   # coefficient of variation in %: sample sd (n-1 denominator) over the mean
+    mm = sum(xs) / len(xs)
+    s = math.sqrt(sum((x - mm) ** 2 for x in xs) / (len(xs) - 1)) if len(xs) > 1 else 0.0
+    return (s / mm * 100.0) if mm else 0.0
+mA, mB = sum(A) / n, sum(B) / n
+cvA, cvB = cv(A), cv(B)
+# Rebuild the execution order (odd pair: A,B ; even pair: B,A) and express each run as a
+# % deviation from its own binary's mean, so the A/B offset drops out of the drift estimate.
+seq = []
+for i in range(n):
+    seq += ([('A', A[i]), ('B', B[i])] if (i + 1) % 2 else [('B', B[i]), ('A', A[i])])
+nrm = [(t / (mA if lbl == 'A' else mB) - 1) * 100 for lbl, t in seq]
+N = len(nrm); mi = (N - 1) / 2.0; mn = sum(nrm) / N   # N runs; mean run index; mean deviation
+denom = sum((i - mi) ** 2 for i in range(N))
+slope = (sum((i - mi) * (nrm[i] - mn) for i in range(N)) / denom) if denom else 0.0   # least-squares slope, % per run
+half = N // 2
+drift_shift = sum(nrm[half:]) / (N - half) - sum(nrm[:half]) / half   # 2nd-half mean minus 1st-half mean
+
+print("\n=== ABBA paired result  (improvement: + = PR faster) ===")
+print(f"  pairs: {n}   mean A (PR): {sum(A)/n:.3f}s   mean B (base): {sum(B)/n:.3f}s")
+print()
+print(f"  [parametric] paired-t   mean {mean:+.2f}%   sd {sd:.2f}%   se {se:.2f}%")
+print(f"               95% CI: [{lo:+.2f}%, {hi:+.2f}%]   (t df={df} = {tc})")
+pstr = f"{p:.4f}" if p >= 1e-4 else f"{p:.1e}"
+print(f"  [robust]     median {med:+.2f}%   Wilcoxon W+={Wp:.0f} W-={Wn:.0f}  p(exact)={pstr}  (z={z:+.2f})")
+print()
+print("  --- server stability (this run; compare across servers) ---")
+print(f"  run-to-run jitter:    A CV {cvA:.2f}%   B CV {cvB:.2f}%        (lower = steadier)")
+print(f"  within-session drift: {slope * N:+.2f}% over the run, 1st->2nd half {drift_shift:+.2f}%")
+print(f"    (jitter -> Tier-1 cached gate floor; drift -> whether the cached baseline can be trusted)")
+print()
+if lo > 0 and p < 0.05:
+    print(f"  VERDICT: REAL IMPROVEMENT - PR faster by ~{mean:.2f}% (t-CI and Wilcoxon agree)")
+elif hi < 0 and p < 0.05:
+    print(f"  VERDICT: REAL REGRESSION - PR slower by ~{-mean:.2f}% (t-CI and Wilcoxon agree)")
+elif (lo > 0 or hi < 0) != (p < 0.05):   # exactly one test is significant, in EITHER direction
+    print(f"  VERDICT: BORDERLINE - parametric and robust disagree; suspect outlier pair(s).")
+    print(f"           Trust the median ({med:+.2f}%); add pairs or inspect the per-pair list.")
+else:
+    print(f"  VERDICT: INCONCLUSIVE - effect not separable from 0 at n={n}.")
+    print(f"           Point estimate ~{med:+.2f}% (median). Need more pairs to resolve.")
+print(f"\n  raw pairs: {sys.argv[1]}")
+PY