diff --git a/.github/workflows/bench-abba.yml b/.github/workflows/bench-abba.yml
new file mode 100644
index 000000000..200982d17
--- /dev/null
+++ b/.github/workflows/bench-abba.yml
@@ -0,0 +1,140 @@
+name: Bench ABBA tiebreaker
+
+# Drift-free paired (A/B/B/A) prover benchmark for resolving small (~1%) deltas the
+# cheap PR benchmark can't confirm. It builds both binaries and runs ~20 interleaved
+# pairs, so it OCCUPIES THE SINGLE BENCH SERVER FOR ~30-40 MIN. For that reason it
+# NEVER auto-triggers -- it runs only on an explicit `/bench-abba` comment on a PR.
+on:
+  issue_comment:
+    types: [created]
+
+# One ABBA run per PR; a re-trigger cancels the stale one. (The single self-hosted
+# bench runner serializes across PRs on its own.)
+#
+# Workflow-level concurrency is applied to every run this trigger creates, before the
+# job-level `if` below is evaluated. With a plain per-PR group, ANY new comment on the
+# PR would therefore cancel a benchmark in progress. Only an authorized `/bench-abba`
+# comment joins the shared per-PR group; every other comment falls back to a unique
+# (run_id) group and cancels nothing.
+concurrency:
+  group: >-
+    bench-abba-${{ github.event.issue.number }}-${{
+    github.event.issue.pull_request &&
+    startsWith(github.event.comment.body, '/bench-abba') &&
+    contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
+    'cmd' || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+
+jobs:
+  abba:
+    # Manual-only: a "/bench-abba" comment on a PR, from a repo member. Never auto.
+    if: >-
+      github.event.issue.pull_request &&
+      startsWith(github.event.comment.body, '/bench-abba') &&
+      contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)
+    runs-on: [self-hosted, bench]
+    # Generous ceiling so a hang/OOM can't strand the single bench runner; the
+    # workload itself is ~30-40 min at the default 20 pairs (clamped to <=40).
+    timeout-minutes: 120
+    steps:
+      - name: Acknowledge (react + occupancy notice)
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner, repo: context.repo.repo,
+              comment_id: context.payload.comment.id, content: 'eyes'
+            });
+            await github.rest.issues.createComment({
+              owner: context.repo.owner, repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: '⏳ **ABBA tiebreaker started** on the bench server (~30–40 min). The bench server is occupied until it finishes.'
+            });
+
+      - name: Resolve PR head + pair count
+        id: cfg
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: |
+          # Resolve the head SHA (not the branch name): pinning the commit works for
+          # fork PRs too (the branch lives in the fork, not origin/) and avoids a
+          # force-push race mid-run.
+          HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "head_sha=$HEAD_SHA" >> "$GITHUB_OUTPUT"
+          # Optional pair count, e.g. "/bench-abba 32"; default 20. Clamp to [2,40]
+          # so a "/bench-abba 10000" can't monopolize the single bench server.
+          # The range tests are negated (`! -ge` / `! -le`) on purpose: when `[` cannot
+          # evaluate the value at all (e.g. a 30-digit count that overflows) it returns
+          # an error status, which the negation turns into "out of range".
+          N=$(echo "$COMMENT_BODY" | sed -n 's|^/bench-abba[[:space:]]*\([0-9]\+\).*|\1|p')
+          N=${N:-20}
+          if ! [ "$N" -ge 2 ] 2>/dev/null || ! [ "$N" -le 40 ] 2>/dev/null; then
+            echo "::warning::pair count $N out of range [2,40]; using 20"
+            N=20
+          fi
+          echo "pairs=$N" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout (full history for ref resolution)
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch PR head commit (works for fork PRs)
+        env:
+          PR_NUM: ${{ github.event.issue.number }}
+        run: git fetch origin "pull/$PR_NUM/head" --quiet
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run ABBA tiebreaker
+        id: run
+        env:
+          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
+          PAIRS: ${{ steps.cfg.outputs.pairs }}
+        run: |
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          set -o pipefail
+          # bench_abba.sh builds the cli at both refs (isolated worktree), runs the
+          # interleaved pairs, and prints the paired-t CI + exact Wilcoxon test.
+          # Pass the head SHA (pinned above) so fork PRs resolve.
+          # Logs go to $RUNNER_TEMP (emptied for each job), not a fixed /tmp path: this
+          # runner is persistent, so /tmp could still hold a previous run's output.
+          scripts/bench_abba.sh "$HEAD_SHA" origin/main "$PAIRS" 2>&1 | tee "$RUNNER_TEMP/abba_out.txt"
+          sed -n '/=== ABBA paired result/,$p' "$RUNNER_TEMP/abba_out.txt" > "$RUNNER_TEMP/abba_result.txt"
+
+      - name: Post result
+        if: always()
+        uses: actions/github-script@v7
+        env:
+          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
+          PAIRS: ${{ steps.cfg.outputs.pairs }}
+          OUTCOME: ${{ steps.run.outcome }}
+        with:
+          script: |
+            const fs = require('fs');
+            const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
+            // Same per-job temp dir the run step writes to, so a skipped/failed run
+            const tmp = process.env.RUNNER_TEMP; // can never surface another run's log.
+            const head = (process.env.HEAD_SHA || '').slice(0, 10), pairs = process.env.PAIRS;
+            let body = `## ABBA tiebreaker — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
+            if (process.env.OUTCOME === 'success') {
+              const res = read(`${tmp}/abba_result.txt`) || read(`${tmp}/abba_out.txt`);
+              body += '```\n' + res + '\n```\n';
+              body += '\nDrift-free interleaved A/B/B/A measurement. + = PR faster. ';
+              body += 'Trust the verdict when paired-t and Wilcoxon agree.\n';
+            } else {
+              const tail = read(`${tmp}/abba_out.txt`).split('\n').slice(-30).join('\n');
+              body += `❌ Run failed. Last log lines:\n\n` + '```\n' + tail + '\n```\n';
+            }
+            await github.rest.issues.createComment({
+              owner: context.repo.owner, repo: context.repo.repo,
+              issue_number: context.issue.number, body
+            });
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
index ca66bf9a7..57169967d 100644
--- a/.github/workflows/benchmark-pr.yml
+++ b/.github/workflows/benchmark-pr.yml
@@ -38,7 +38,11 @@ env:
   ELF: executor/program_artifacts/rust/ethrex.elf
   INPUT: executor/tests/ethrex_bench_20.bin
   BENCH_RUNS_PR: 3
-  BENCH_RUNS_BASELINE: 3
+  # Cheap-tier screen: catches regressions down to ~1.5% on its own and leaves
+  # smaller/ambiguous deltas to the manual drift-free ABBA tiebreaker. Pushing either
+  # side past 5 buys little here (the cached comparison can't beat the ~1% drift wall),
+  # so the per-PR run count is also capped at 5 (clamp below).
+  BENCH_RUNS_BASELINE: 5
   # Memory-scaling sweep: same ELF, different N-transfer inputs. GROWTH_PROGRAMS
   # are the generated (gitignored) fixture basenames in executor/tests/; GROWTH_STEPS
   # the matching transfer counts (x-axis; slope is MB per transfer).
@@ -55,6 +59,7 @@ jobs:
       (github.event_name == 'issue_comment' &&
       github.event.issue.pull_request &&
       startsWith(github.event.comment.body, '/bench') &&
+      !startsWith(github.event.comment.body, '/bench-abba') &&
       contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
     steps:
       - name: React to comment
@@ -136,9 +141,12 @@ jobs:
             RUNS=$BENCH_RUNS_PR
           fi
 
-          # Clamp to 1-10
-          if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
-            echo "::warning::Run count $RUNS out of range [1,10], defaulting to $BENCH_RUNS_PR"
+          # Clamp to 1-5. Beyond 5 the single-session cached comparison barely improves
+          # (it can't beat the ~1% drift wall); use the ABBA tiebreaker for finer deltas.
+          # The tests are negated so that a value `[` cannot evaluate (non-numeric or
+          # overflowing) counts as out of range instead of slipping past both comparisons.
+          if ! [ "$RUNS" -ge 1 ] 2>/dev/null || ! [ "$RUNS" -le 5 ] 2>/dev/null; then
+            echo "::warning::Run count $RUNS out of range [1,5], defaulting to $BENCH_RUNS_PR"
             RUNS=$BENCH_RUNS_PR
           fi
 
@@ -702,6 +710,18 @@ jobs:
               body += `> ✅ No significant change.\n`;
             }
 
+            // Tier-1 -> Tier-2 escalation: a small time speedup the cheap 3-5 run CI
+            // can't confirm (it catches >=~1.5% on its own). Point at the drift-free
+            // ABBA tiebreaker, which the user runs on demand via `/bench-abba`.
+            const tp = parseFloat(timePct);
+            if (tp < 0 && tp > -1.5) {
+              body += `>\n`;
+              body += `> 🔬 **Looks like a small speedup (${fmt(timePct)}%) — below what ${runs} runs can confirm.** `;
+              body += `Comment \`/bench-abba\` to run the drift-free ABBA tiebreaker (paired-t CI + exact Wilcoxon). `;
+              body += `Note: it occupies the bench server for ~30–40 min.\n`;
+              body += `> Optional pair count: \`/bench-abba 32\` (20 resolves ~1%, 32 for ~0.6%).\n`;
+            }
+
             // Spread warnings
             const prWarnings = [];
             const baseWarnings = [];
diff --git a/scripts/bench_abba.sh b/scripts/bench_abba.sh
new file mode 100755
index 000000000..79bfddf27
--- /dev/null
+++ b/scripts/bench_abba.sh
@@ -0,0 +1,262 @@
+#!/usr/bin/env bash
+#
+# bench_abba.sh — interleaved A/B/B/A paired prover benchmark.
+#
+# WHY: comparing a PR against a separately-recorded (cached) baseline conflates the
+# code delta with machine drift between the two measurement sessions. For small
+# (~1%) prover changes that drift is the dominant error. Measuring both binaries
+# *interleaved on the same machine in the same session* cancels the drift (it hits
+# both sides equally), and a paired analysis over the A/B pairs is far more powerful
+# than an unpaired two-sample test.
+#
+# WHAT IT DOES:
+#   1. Builds the ethrex guest ELF + 20-transfer fixture once (identical for both
+#      sides — a prover-only change doesn't touch the guest).
+#   2. Builds the `cli` prover at REF_A and REF_B (skips the build and reuses cached
+#      binaries only when they were built from these exact SHAs; REBUILD=1 forces).
+#   3. Runs N_PAIRS interleaved pairs in A B B A ... order (alternating which side
+#      runs first each pair, to cancel linear drift). Use an EVEN N_PAIRS.
+#   4. Reports BOTH a paired-t 95% CI (sensitive to outliers) AND a robust
+#      median + Wilcoxon signed-rank result (shrugs off transient slow runs).
+#
+# CONVENTION: every reported number is an IMPROVEMENT, positive = PR FASTER.
+#
+# USAGE:
+#   scripts/bench_abba.sh REF_A [REF_B] [N_PAIRS]
+#     REF_A REQUIRED — ref or SHA to evaluate (the PR side)
+#     REF_B baseline (default: origin/main)
+#     N_PAIRS pairs (default: 20 -> 40 runs, ~33 min on ethrex)
+#   Env: REBUILD=1 forces a rebuild even if cached binaries exist.
+#
+# Sizing (ethrex pair-noise sd ~1.2%, 80% power): ~12 pairs for a 1% effect,
+# ~18 for 0.8%, ~32 for 0.6%. Default 20 -> solid on 0.8-1%, ~60% power at 0.6%
+# (if a 20-pair run straddles 0 on a ~0.6%-looking effect, extend to 32).
+#
+#   scripts/bench_abba.sh origin/my-pr-branch # vs main, 20 pairs
+#   scripts/bench_abba.sh origin/my-pr-branch origin/main 32 # 32 pairs (~0.6%)
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+  echo "usage: bench_abba.sh REF_A [REF_B=origin/main] [N_PAIRS=20]" >&2
+  echo " REF_A: ref or SHA to evaluate (the PR side)" >&2
+  exit 2
+fi
+REF_A="$1"
+REF_B="${2:-origin/main}"
+N_PAIRS="${3:-20}"
+
+ELF_REL="executor/program_artifacts/rust/ethrex.elf"
+INPUT_REL="executor/tests/ethrex_bench_20.bin"
+WORK="/tmp/abba_run"
+WT="/tmp/abba_wt"
+PROOF="/tmp/abba_proof.bin"
+
+ROOT="$(git rev-parse --show-toplevel)"
+cd "$ROOT"
+
+# Fail fast on the toolchain the final stats step needs, before the ~30-min build.
+command -v python3 >/dev/null 2>&1 || { echo "ERROR: python3 is required (final stats step)." >&2; exit 1; }
+
+echo "==> Refs"
+git fetch origin --quiet || echo "WARNING: 'git fetch origin' failed -- resolving against possibly-stale local refs." >&2
+SHA_A="$(git rev-parse "$REF_A")"
+SHA_B="$(git rev-parse "$REF_B")"
+echo " A (PR) $REF_A -> ${SHA_A:0:10}"
+echo " B (baseline) $REF_B -> ${SHA_B:0:10}"
+if [ $((N_PAIRS % 2)) -ne 0 ]; then
+  echo " WARNING: N_PAIRS=$N_PAIRS is odd; use an even count so AB/BA orders balance."
+fi
+echo " pairs=$N_PAIRS (=$((N_PAIRS * 2)) prove runs)"
+
+mkdir -p "$WORK"
+
+# --- 1. Guest ELF + fixture (identical for both sides; build once if missing) ---
+if [ ! -f "$ELF_REL" ]; then
+  echo "==> Building ethrex guest ELF (missing)"
+  export SYSROOT_DIR="${SYSROOT_DIR:-$HOME/.lambda-vm-sysroot}"
+  make "$ELF_REL"
+fi
+if [ ! -f "$INPUT_REL" ]; then
+  echo "==> Generating ethrex 20-transfer fixture (missing)"
+  ( cd tooling/ethrex-fixtures && cargo build --release )
+  tooling/ethrex-fixtures/target/release/ethrex-fixtures 20 "$INPUT_REL" distinct
+fi
+ELF="$(cd "$(dirname "$ELF_REL")" && pwd)/$(basename "$ELF_REL")"
+INPUT="$(cd "$(dirname "$INPUT_REL")" && pwd)/$(basename "$INPUT_REL")"
+
+# --- 2. Build (or reuse) both prover binaries ---
+need_build=0
+if [ "${REBUILD:-0}" = "1" ] || [ ! -x "$WORK/cli_A" ] || [ ! -x "$WORK/cli_B" ]; then
+  need_build=1
+elif [ "$(cat "$WORK/cli_A.sha" 2>/dev/null)" != "$SHA_A" ] || [ "$(cat "$WORK/cli_B.sha" 2>/dev/null)" != "$SHA_B" ]; then
+  # Cache persists on the self-hosted runner; rebuild if it's for different refs
+  # (a different PR, or main advanced) so we never benchmark stale binaries.
+  echo "==> Cached binaries are for different refs; rebuilding."
+  need_build=1
+fi
+if [ "$need_build" = "1" ]; then
+  cleanup() { git worktree remove --force "$WT" 2>/dev/null || true; }
+  trap cleanup EXIT
+  git worktree remove --force "$WT" 2>/dev/null || true
+  echo "==> Building both prover binaries in isolated worktree $WT"
+  git worktree add --detach "$WT" "$SHA_B" >/dev/null
+  build_cli() { # $1=sha $2=out (shared target dir -> 2nd build is incremental)
+    echo "==> Building cli @ ${1:0:10} -> $2"
+    git -C "$WT" checkout --quiet "$1"
+    if ! ( cd "$WT" && cargo build --release -p cli --features jemalloc-stats >"$WORK/build_$2.log" 2>&1 ); then
+      echo "ERROR: cargo build failed for $2 (@ ${1:0:10}). Tail of $WORK/build_$2.log:" >&2
+      tail -40 "$WORK/build_$2.log" >&2
+      exit 1
+    fi
+    cp "$WT/target/release/cli" "$WORK/$2"
+    echo "$1" > "$WORK/$2.sha"
+  }
+  build_cli "$SHA_B" cli_B
+  build_cli "$SHA_A" cli_A
+  cleanup
+  trap - EXIT
+else
+  echo "==> Reusing cached binaries (SHAs match requested refs; REBUILD=1 to force):"
+  echo " cli_A=${SHA_A:0:10} cli_B=${SHA_B:0:10}"
+fi
+
+# --- 3. Interleaved A/B/B/A measurement (fresh CSV -- pre-committed batch) ---
+run_prove() { # $1=binary -> echoes proving time (s)
+  local out t
+  out="$("$1" prove "$ELF" --private-input "$INPUT" -o "$PROOF" --time 2>&1)"
+  rm -f "$PROOF"
+  t="$(printf '%s\n' "$out" | grep -o 'Proving time: [0-9.]*' | awk '{print $3}')"
+  if [ -z "$t" ]; then
+    echo "ERROR: could not parse 'Proving time' from cli output:" >&2
+    printf '%s\n' "$out" >&2
+    exit 1
+  fi
+  echo "$t"
+}
+
+echo "==> Running $N_PAIRS interleaved pairs (improvement: + = PR faster)"
+printf 'pair,a_time,b_time\n' > "$WORK/pairs.csv"
+for i in $(seq 1 "$N_PAIRS"); do
+  if [ $((i % 2)) -eq 1 ]; then # odd pair: A then B
+    a="$(run_prove "$WORK/cli_A")"; b="$(run_prove "$WORK/cli_B")"
+  else # even pair: B then A (ABBA pattern)
+    b="$(run_prove "$WORK/cli_B")"; a="$(run_prove "$WORK/cli_A")"
+  fi
+  printf '%d,%s,%s\n' "$i" "$a" "$b" >> "$WORK/pairs.csv"
+  printf ' pair %2d/%d A=%ss B=%ss PR %+.2f%% (+=faster)\n' \
+    "$i" "$N_PAIRS" "$a" "$b" "$(awk "BEGIN{print ($b-$a)/$b*100}")"
+done
+
+# --- 4. Paired t-test + robust median/Wilcoxon ---
+python3 - "$WORK/pairs.csv" <<'PY'
+import sys, csv, math
+
+rows = list(csv.DictReader(open(sys.argv[1])))
+A = [float(r['a_time']) for r in rows]  # PR
+B = [float(r['b_time']) for r in rows]  # baseline
+n = len(A)
+# per-pair improvement: positive => PR (A) faster than baseline (B)
+d = [(b - a) / b * 100.0 for a, b in zip(A, B)]
+
+# ---- parametric: paired t ----
+mean = sum(d) / n
+var = sum((x - mean) ** 2 for x in d) / (n - 1) if n > 1 else 0.0
+sd = math.sqrt(var)
+se = sd / math.sqrt(n) if n else float('inf')
+TT = {1:12.706,2:4.303,3:3.182,4:2.776,5:2.571,6:2.447,7:2.365,8:2.306,9:2.262,
+      10:2.228,11:2.201,12:2.179,13:2.160,14:2.145,15:2.131,16:2.120,17:2.110,
+      18:2.101,19:2.093,20:2.086,21:2.080,22:2.074,23:2.069,24:2.064,25:2.060,
+      26:2.056,27:2.052,28:2.048,29:2.045,30:2.042,35:2.030,40:2.021,50:2.009,
+      60:2.000,80:1.990,120:1.980}
+df = n - 1
+tc = TT.get(df) or (1.96 if df > 120 else TT[min(TT, key=lambda k: abs(k - df))])
+lo, hi = mean - tc * se, mean + tc * se
+
+# ---- robust: median + Wilcoxon signed-rank (tie-averaged ranks, EXACT p, pure stdlib) ----
+def median(xs):
+    s = sorted(xs); m = len(s)
+    return s[m // 2] if m % 2 else (s[m // 2 - 1] + s[m // 2]) / 2
+
+nz = [x for x in d if x != 0.0]
+m = len(nz)
+order = sorted(range(m), key=lambda i: abs(nz[i]))
+ranks = [0.0] * m
+i = 0
+while i < m:  # average ranks within ties on |d|
+    j = i
+    while j + 1 < m and abs(nz[order[j + 1]]) == abs(nz[order[i]]):
+        j += 1
+    avg = (i + 1 + j + 1) / 2.0
+    for k in range(i, j + 1):
+        ranks[order[k]] = avg
+    i = j + 1
+Wp = sum(r for r, x in zip(ranks, nz) if x > 0)
+Wn = sum(r for r, x in zip(ranks, nz) if x < 0)
+mu = m * (m + 1) / 4.0
+sig = math.sqrt(m * (m + 1) * (2 * m + 1) / 24.0) if m else 0.0
+z = (Wp - mu - (0.5 if Wp > mu else -0.5)) / sig if sig and Wp != mu else 0.0  # normal approx (display only); 0 when W+ equals its null mean
+# EXACT two-sided p: enumerate the signed-rank null distribution. Each rank is +/- with
+# prob 1/2, so the count of assignments giving W+=v is the coeff of x^v in prod(1 + x^rank)
+# -- build it with a generating-function DP. Double the ranks so tie-averaged (half-integer)
+# ranks become integers. No scipy; exact even at small n where the normal approx is loose.
+if m:
+    ir = [int(round(2 * r)) for r in ranks]
+    poly = [1]
+    for r in ir:
+        nxt = [0] * (len(poly) + r)
+        for v, c in enumerate(poly):
+            if c:
+                nxt[v] += c  # this rank negative -> adds 0 to W+
+                nxt[v + r] += c  # this rank positive -> adds r to W+
+        poly = nxt
+    Wp2 = int(round(2 * Wp))
+    p = min(1.0, 2.0 * min(sum(poly[:Wp2 + 1]), sum(poly[Wp2:])) / (1 << m))
+else:
+    p = 1.0
+med = median(d)
+
+# ---- server stability (byproduct): run-to-run jitter + within-session drift ----
+def cv(xs):
+    mm = sum(xs) / len(xs)
+    s = math.sqrt(sum((x - mm) ** 2 for x in xs) / (len(xs) - 1)) if len(xs) > 1 else 0.0
+    return (s / mm * 100.0) if mm else 0.0
+mA, mB = sum(A) / n, sum(B) / n
+cvA, cvB = cv(A), cv(B)
+# reconstruct execution order (odd pair: A,B ; even pair: B,A) and normalize each
+# run by its binary's mean so the A/B offset drops out, leaving pure machine drift.
+seq = []
+for i in range(n):
+    seq += ([('A', A[i]), ('B', B[i])] if (i + 1) % 2 else [('B', B[i]), ('A', A[i])])
+nrm = [(t / (mA if lbl == 'A' else mB) - 1) * 100 for lbl, t in seq]
+N = len(nrm); mi = (N - 1) / 2.0; mn = sum(nrm) / N
+denom = sum((i - mi) ** 2 for i in range(N))
+slope = (sum((i - mi) * (nrm[i] - mn) for i in range(N)) / denom) if denom else 0.0
+half = N // 2
+drift_shift = sum(nrm[half:]) / (N - half) - sum(nrm[:half]) / half
+
+print("\n=== ABBA paired result (improvement: + = PR faster) ===")
+print(f" pairs: {n} mean A (PR): {sum(A)/n:.3f}s mean B (base): {sum(B)/n:.3f}s")
+print()
+print(f" [parametric] paired-t mean {mean:+.2f}% sd {sd:.2f}% se {se:.2f}%")
+print(f" 95% CI: [{lo:+.2f}%, {hi:+.2f}%] (t df={df} = {tc})")
+pstr = f"{p:.4f}" if p >= 1e-4 else f"{p:.1e}"
+print(f" [robust] median {med:+.2f}% Wilcoxon W+={Wp:.0f} W-={Wn:.0f} p(exact)={pstr} (z={z:+.2f})")
+print()
+print(" --- server stability (this run; compare across servers) ---")
+print(f" run-to-run jitter: A CV {cvA:.2f}% B CV {cvB:.2f}% (lower = steadier)")
+print(f" within-session drift: {slope * N:+.2f}% over the run, 1st->2nd half {drift_shift:+.2f}%")
+print(f" (jitter -> Tier-1 cached gate floor; drift -> whether the cached baseline can be trusted)")
+print()
+if lo > 0 and p < 0.05:
+    print(f" VERDICT: REAL IMPROVEMENT - PR faster by ~{mean:.2f}% (t-CI and Wilcoxon agree)")
+elif hi < 0 and p < 0.05:
+    print(f" VERDICT: REAL REGRESSION - PR slower by ~{-mean:.2f}% (t-CI and Wilcoxon agree)")
+elif (lo > 0 or hi < 0) != (p < 0.05):  # t-CI excludes 0 on EITHER side, but Wilcoxon disagrees (or vice versa)
+    print(f" VERDICT: BORDERLINE - parametric and robust disagree; suspect outlier pair(s).")
+    print(f" Trust the median ({med:+.2f}%); add pairs or inspect the per-pair list.")
+else:
+    print(f" VERDICT: INCONCLUSIVE - effect not separable from 0 at n={n}.")
+    print(f" Point estimate ~{med:+.2f}% (median). Need more pairs to resolve.")
+print(f"\n raw pairs: {sys.argv[1]}")
+PY