yetanotherco · MauroToscano · Jun 26, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.github/workflows/bench-abba.yml b/.github/workflows/bench-abba.yml
@@ -0,0 +1,122 @@
+name: Bench ABBA tiebreaker
+
+# Drift-free paired (A/B/B/A) prover benchmark for resolving small (~1%) deltas the
+# cheap PR benchmark can't confirm. It builds both binaries and runs ~20 interleaved
+# pairs, so it OCCUPIES THE SINGLE BENCH SERVER FOR ~30-40 MIN. For that reason it
+# NEVER auto-triggers -- it runs only on an explicit `/bench-abba` comment on a PR.
+on:
+  # Fires for newly created comments only; the job-level `if` decides whether a
+  # given comment is actually a `/bench-abba` request.
+  issue_comment:
+    types:
+      - created
+
+# One ABBA run per PR; a re-trigger cancels the stale one. (The single self-hosted
+# bench runner serializes across PRs on its own.)
+#
+# Workflow-level concurrency is evaluated for EVERY issue_comment event, before the
+# job-level `if`. If the group were keyed on the PR number alone, any unrelated
+# comment on the PR would start a (skipped) run in the same group and cancel an
+# in-flight 30-40 min benchmark. So only authorized `/bench-abba` comments share
+# the per-PR group; every other comment gets a unique per-run group and therefore
+# cancels nothing.
+concurrency:
+  group: >-
+    bench-abba-${{ github.event.issue.number }}-${{ (startsWith(github.event.comment.body, '/bench-abba') && contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)) && 'trigger' || github.run_id }}
+  cancel-in-progress: true
+
+# Token scopes for this workflow: read access to the repository contents for the
+# checkout step, and write access to issues / pull requests for the github-script
+# steps that add a reaction and post comments on the PR thread.
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+
+jobs:
+  abba:
+    # Gate (manual-only, never automatic). All three must hold:
+    #   1. the comment is on a pull request (not a plain issue),
+    #   2. the comment body begins with "/bench-abba",
+    #   3. the commenter is a MEMBER, OWNER or COLLABORATOR of the repo.
+    if: >-
+      github.event.issue.pull_request &&
+      startsWith(github.event.comment.body, '/bench-abba') &&
+      contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)
+    runs-on:
+      - self-hosted
+      - bench
+    # Hard upper bound so a hung or OOM-ing run cannot hold the single bench runner
+    # forever. The expected workload is ~30-40 min at the default 20 pairs, and the
+    # pair count is clamped to <=40.
+    timeout-minutes: 120
+    steps:
+      - name: Acknowledge (react + occupancy notice)
+        uses: actions/github-script@v7
+        with:
+          script: |
+            // Both API calls below target the repository this workflow runs in.
+            const { owner, repo } = context.repo;
+
+            // 1) Put an "eyes" reaction on the comment that triggered the run.
+            await github.rest.reactions.createForIssueComment({
+              owner,
+              repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes',
+            });
+
+            // 2) Tell the PR thread that the bench server is busy from now on.
+            const notice = '⏳ **ABBA tiebreaker started** on the bench server (~30–40 min). The bench server is occupied until it finishes.';
+            await github.rest.issues.createComment({
+              owner,
+              repo,
+              issue_number: context.issue.number,
+              body: notice,
+            });
+
+      - name: Resolve PR head + pair count
+        id: cfg
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: |
+          # Outputs: head_sha (the PR's head commit) and pairs (integer in [2,40]).
+          #
+          # Resolve the head SHA (not the branch name): pinning the commit works for
+          # fork PRs too (the branch lives in the fork, not origin/) and avoids a
+          # force-push race mid-run.
+          HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          if [ -z "$HEAD_SHA" ]; then
+            echo "::error::could not resolve the head commit of PR #$PR_NUM"
+            exit 1
+          fi
+          echo "head_sha=$HEAD_SHA" >> "$GITHUB_OUTPUT"
+          # Optional pair count, e.g. "/bench-abba 32"; default 20. Clamp to [2,40]
+          # so a "/bench-abba 10000" can't monopolize the single bench server.
+          # - printf instead of echo: the comment body is untrusted text.
+          # - `0*` drops leading zeros, and `[0-9][0-9]*` replaces the GNU-only `\+`.
+          # - head -n 1: a multi-line comment may match more than once; keep the first
+          #   match so N is always a single line.
+          N=$(printf '%s\n' "$COMMENT_BODY" \
+            | sed -n 's|^/bench-abba[[:space:]]*0*\([0-9][0-9]*\).*|\1|p' \
+            | head -n 1)
+          N=${N:-20}
+          # N is now digits only with no leading zeros. Reject anything longer than two
+          # digits BEFORE comparing numerically: on a huge value `[ ... -lt ... ]`
+          # fails with an error (which counts as "false"), and that would silently
+          # skip the clamp.
+          if [ "${#N}" -gt 2 ] || [ "$N" -lt 2 ] || [ "$N" -gt 40 ]; then
+            echo "::warning::pair count $N out of range [2,40]; using 20"
+            N=20
+          fi
+          echo "pairs=$N" >> "$GITHUB_OUTPUT"
+
+      # Clone with complete history (fetch-depth: 0) so the refs handed to the
+      # benchmark script later (origin/main and the PR head SHA) can be resolved.
+      # No `ref` is given, so the action checks out its default ref for this event.
+      - name: Checkout (full history for ref resolution)
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch PR head commit (works for fork PRs)
+        env:
+          PR_NUM: ${{ github.event.issue.number }}
+        # Fetch through the base repository's pull/<N>/head ref so the PR's head commit
+        # is available locally even when its branch lives in a fork.
+        run: |
+          git fetch --quiet origin "pull/$PR_NUM/head"
+
+      - name: Add cargo to PATH
+        # Append the user's cargo bin directory to the file named by $GITHUB_PATH.
+        run: |
+          printf '%s\n' "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run ABBA tiebreaker
+        id: run
+        env:
+          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
+          PAIRS: ${{ steps.cfg.outputs.pairs }}
+        run: |
+          # Without pipefail the pipeline's status would be tee's, masking a failed
+          # benchmark.
+          set -o pipefail
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          full_log=/tmp/abba_out.txt
+          summary=/tmp/abba_result.txt
+          # bench_abba.sh builds the cli at both refs (isolated worktree), runs the
+          # interleaved pairs, and prints the paired-t CI + exact Wilcoxon test.
+          # The first argument is the pinned head SHA so fork PRs resolve; the
+          # baseline is origin/main.
+          scripts/bench_abba.sh "$HEAD_SHA" origin/main "$PAIRS" 2>&1 | tee "$full_log"
+          # Keep everything from the result banner to the end of the log as the
+          # summary that gets posted back to the PR.
+          sed -n '/=== ABBA paired result/,$p' "$full_log" > "$summary"
+
+      - name: Post result
+        # Runs even when earlier steps failed, so the PR always gets a final comment.
+        if: always()
+        uses: actions/github-script@v7
+        env:
+          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
+          PAIRS: ${{ steps.cfg.outputs.pairs }}
+          OUTCOME: ${{ steps.run.outcome }}
+        with:
+          script: |
+            const fs = require('fs');
+            // Best-effort read: a missing or unreadable file yields '' instead of throwing.
+            const read = (p) => { try { return fs.readFileSync(p, 'utf8').trim(); } catch { return ''; } };
+            // GitHub rejects comment bodies longer than 65536 characters. Keep only the
+            // newest part of an oversized log so the comment (and the verdict at the end
+            // of the log) still posts.
+            const MAX_LOG_CHARS = 60000;
+            const clip = (s) => (s.length > MAX_LOG_CHARS ? '[...truncated...]\n' + s.slice(-MAX_LOG_CHARS) : s);
+            // Fallbacks keep the header readable when the cfg step itself failed.
+            const head = (process.env.HEAD_SHA || '').slice(0, 10) || 'unknown';
+            const pairs = process.env.PAIRS || '?';
+            const outcome = process.env.OUTCOME;
+            let body = `## ABBA tiebreaker — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
+            if (outcome === 'success') {
+              // Prefer the extracted summary; fall back to the full log if the result
+              // banner was not found.
+              const res = read('/tmp/abba_result.txt') || read('/tmp/abba_out.txt');
+              body += '```\n' + clip(res) + '\n```\n';
+              body += '\n<sub>Drift-free interleaved A/B/B/A measurement. + = PR faster. ';
+              body += 'Trust the verdict when paired-t and Wilcoxon agree.</sub>\n';
+            } else if (outcome === 'failure' || outcome === 'cancelled') {
+              // The benchmark step started, so /tmp/abba_out.txt belongs to this run.
+              const tail = read('/tmp/abba_out.txt').split('\n').slice(-30).join('\n');
+              body += `❌ Run failed. Last log lines:\n\n` + '```\n' + clip(tail) + '\n```\n';
+            } else {
+              // The benchmark step never started (an earlier step failed, or the job
+              // was cancelled first). Any /tmp/abba_*.txt on this persistent
+              // self-hosted runner is left over from a previous job, so do not quote it.
+              const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
+              body += `❌ The benchmark did not run (an earlier step failed or the job was cancelled first). `;
+              body += `See the [workflow run](${runUrl}) for details.\n`;
+            }
+            await github.rest.issues.createComment({
+              owner: context.repo.owner, repo: context.repo.repo,
+              issue_number: context.issue.number, body
+            });
diff --git a/.github/workflows/benchmark-pr.yml b/.github/workflows/benchmark-pr.yml
@@ -38,7 +38,11 @@ env:
   ELF: executor/program_artifacts/rust/ethrex.elf
   INPUT: executor/tests/ethrex_bench_20.bin
   BENCH_RUNS_PR: 3
-  BENCH_RUNS_BASELINE: 3
+  # Cheap-tier screen: catches regressions down to ~1.5% on its own and leaves
+  # smaller/ambiguous deltas to the manual drift-free ABBA tiebreaker. Pushing either
+  # side past 5 buys little here (the cached comparison can't beat the ~1% drift wall),
+  # so the per-PR run count is also capped at 5 (clamp below).
+  BENCH_RUNS_BASELINE: 5
   # Memory-scaling sweep: same ELF, different N-transfer inputs. GROWTH_PROGRAMS
   # are the generated (gitignored) fixture basenames in executor/tests/; GROWTH_STEPS
   # the matching transfer counts (x-axis; slope is MB per transfer).
@@ -55,6 +59,7 @@ jobs:
       (github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        startsWith(github.event.comment.body, '/bench') &&
+       !startsWith(github.event.comment.body, '/bench-abba') &&
        contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
     steps:
       - name: React to comment
@@ -136,9 +141,10 @@ jobs:
             RUNS=$BENCH_RUNS_PR
           fi
 
-          # Clamp to 1-10
-          if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
-            echo "::warning::Run count $RUNS out of range [1,10], defaulting to $BENCH_RUNS_PR"
+          # Clamp to 1-5. Beyond 5 the single-session cached comparison barely improves
+          # (it can't beat the ~1% drift wall); use the ABBA tiebreaker for finer deltas.
+          # Check the shape first (one or two digits, whole string). On an empty,
+          # non-numeric or overflowing value `[ ... -lt/-gt ... ]` fails with an error,
+          # which counts as "false" and would silently skip the clamp.
+          case "$RUNS" in
+            [0-9]|[0-9][0-9]) RUNS_IS_SMALL_INT=1 ;;
+            *) RUNS_IS_SMALL_INT=0 ;;
+          esac
+          if [ "$RUNS_IS_SMALL_INT" -eq 0 ] || [ "$RUNS" -lt 1 ] || [ "$RUNS" -gt 5 ]; then
+            echo "::warning::Run count $RUNS out of range [1,5], defaulting to $BENCH_RUNS_PR"
             RUNS=$BENCH_RUNS_PR
           fi
 
@@ -702,6 +708,18 @@ jobs:
               body += `> ✅ No significant change.\n`;
             }
 
+            // Tier-1 -> Tier-2 hand-off. When the measured time delta is negative but
+            // smaller in magnitude than 1.5%, the cheap 3-5 run comparison cannot
+            // confirm it, so the report points at the on-demand `/bench-abba`
+            // tiebreaker instead.
+            const tp = parseFloat(timePct);
+            if (tp > -1.5 && tp < 0) {
+              body += [
+                `>\n`,
+                `> 🔬 **Looks like a small speedup (${fmt(timePct)}%) — below what ${runs} runs can confirm.** `,
+                `Comment \`/bench-abba\` to run the drift-free ABBA tiebreaker (paired-t CI + exact Wilcoxon). `,
+                `Note: it occupies the bench server for ~30–40 min.\n`,
+                `> Optional pair count: \`/bench-abba 32\` (20 resolves ~1%, 32 for ~0.6%).\n`,
+              ].join('');
+            }
+
             // Spread warnings
             const prWarnings = [];
             const baseWarnings = [];