Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions .github/workflows/bench-abba.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
name: Bench ABBA tiebreaker

# Drift-free paired (A/B/B/A) prover benchmark for resolving small (~1%) deltas the
# cheap PR benchmark can't confirm. It builds both binaries and runs ~20 interleaved
# pairs, so it OCCUPIES THE SINGLE BENCH SERVER FOR ~30-40 MIN. For that reason it
# NEVER auto-triggers -- it runs only on an explicit `/bench-abba` comment on a PR.
on:
  issue_comment:
    types: [created]

# One ABBA run per PR; a re-trigger cancels the stale one. (The single self-hosted
# bench runner serializes across PRs on its own.)
#
# `issue_comment` starts a workflow run for EVERY new comment, and workflow-level
# concurrency is resolved before the job-level `if:` gets a chance to skip the run.
# With a group keyed on the PR number alone, any unrelated comment on the PR (or a
# `/bench-abba` from a non-member) would cancel a benchmark that is already
# 30 minutes in. So only comments that satisfy the same conditions as the job's
# `if:` share the per-PR group; every other run gets a unique group (its run id)
# and therefore can never cancel anything.
concurrency:
  group: >-
    bench-abba-${{ github.event.issue.number }}-${{
    github.event.issue.pull_request &&
    startsWith(github.event.comment.body, '/bench-abba') &&
    contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association) &&
    'run' || github.run_id }}
  cancel-in-progress: true

# Least privilege: read the repo, write PR/issue comments and reactions.
permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  abba:
    # Manual-only: a "/bench-abba" comment on a PR, from a repo member. Never auto.
    if: >-
      github.event.issue.pull_request &&
      startsWith(github.event.comment.body, '/bench-abba') &&
      contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association)
    runs-on: [self-hosted, bench]
    # Generous ceiling so a hang/OOM can't strand the single bench runner; the
    # workload itself is ~30-40 min at the default 20 pairs (clamped to <=40).
    timeout-minutes: 120
    steps:
      # React to the triggering comment and announce that the bench server is busy.
      - name: Acknowledge (react + occupancy notice)
        uses: actions/github-script@v7
        with:
          script: |
            await github.rest.reactions.createForIssueComment({
              owner: context.repo.owner, repo: context.repo.repo,
              comment_id: context.payload.comment.id, content: 'eyes'
            });
            await github.rest.issues.createComment({
              owner: context.repo.owner, repo: context.repo.repo,
              issue_number: context.issue.number,
              body: '⏳ **ABBA tiebreaker started** on the bench server (~30–40 min). The bench server is occupied until it finishes.'
            });

      # Outputs: head_sha (PR head commit) and pairs (validated pair count).
      - name: Resolve PR head + pair count
        id: cfg
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUM: ${{ github.event.issue.number }}
          # Passed through env (never interpolated into the script) so the comment
          # text cannot inject shell.
          COMMENT_BODY: ${{ github.event.comment.body }}
        run: |
          # Resolve the head SHA (not the branch name): pinning the commit works for
          # fork PRs too (the branch lives in the fork, not origin/) and avoids a
          # force-push race mid-run.
          HEAD_SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
          echo "head_sha=$HEAD_SHA" >> "$GITHUB_OUTPUT"
          # Optional pair count, e.g. "/bench-abba 32"; default 20. Only the first
          # line of the comment is parsed (that is where the command must be), so a
          # multi-line comment can never yield a multi-line value. Leading zeros are
          # dropped so "08" reads as 8.
          N=$(printf '%s\n' "$COMMENT_BODY" | sed -n '1s|^/bench-abba[[:space:]]*0*\([0-9][0-9]*\).*|\1|p')
          N=${N:-20}
          # Clamp to [2,40] so a "/bench-abba 10000" can't monopolize the single
          # bench server. Validate by pattern, not with `[ -lt ]`/`[ -gt ]`: those
          # error out (and so read as "in range") on values too large for a shell
          # integer, which would let an absurd count through unclamped.
          case "$N" in
            [2-9] | [1-3][0-9] | 40) ;;
            *)
              echo "::warning::pair count $N out of range [2,40]; using 20"
              N=20
              ;;
          esac
          echo "pairs=$N" >> "$GITHUB_OUTPUT"

      # No `ref:` is given, so this is the workflow's own checkout, not the PR head;
      # the PR commit is fetched separately in the next step.
      - name: Checkout (full history for ref resolution)
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Fetch PR head commit (works for fork PRs)
        env:
          PR_NUM: ${{ github.event.issue.number }}
        run: git fetch origin "pull/$PR_NUM/head" --quiet

      - name: Add cargo to PATH
        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Run ABBA tiebreaker
        id: run
        env:
          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
          PAIRS: ${{ steps.cfg.outputs.pairs }}
        run: |
          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
          # Without pipefail the pipeline's status would be tee's, hiding a failed run.
          set -o pipefail
          # Logs go to $RUNNER_TEMP (emptied for every job) instead of fixed /tmp
          # paths: /tmp persists on the self-hosted runner, so a later job whose run
          # step never started would otherwise post a previous run's output.
          OUT="$RUNNER_TEMP/abba_out.txt"
          # bench_abba.sh builds the cli at both refs (isolated worktree), runs the
          # interleaved pairs, and prints the paired-t CI + exact Wilcoxon test.
          # Pass the head SHA (pinned above) so fork PRs resolve.
          scripts/bench_abba.sh "$HEAD_SHA" origin/main "$PAIRS" 2>&1 | tee "$OUT"
          # Keep only the final summary section for the PR comment.
          sed -n '/=== ABBA paired result/,$p' "$OUT" > "$RUNNER_TEMP/abba_result.txt"

      # Always report back, including on failure, so the PR shows the bench
      # server is free again.
      - name: Post result
        if: always()
        uses: actions/github-script@v7
        env:
          HEAD_SHA: ${{ steps.cfg.outputs.head_sha }}
          PAIRS: ${{ steps.cfg.outputs.pairs }}
          OUTCOME: ${{ steps.run.outcome }}
        with:
          script: |
            const fs = require('fs');
            const path = require('path');
            // Same per-job temp dir the run step wrote to; a missing file reads as ''.
            const tmp = process.env.RUNNER_TEMP || '';
            const read = (name) => {
              try { return fs.readFileSync(path.join(tmp, name), 'utf8').trim(); } catch { return ''; }
            };
            // Keep the tail of long logs: comment bodies are capped at 65536
            // characters, and an oversized body would make this step fail.
            const clip = (s) => (s.length > 60000 ? s.slice(-60000) : s);
            const head = (process.env.HEAD_SHA || '').slice(0, 10), pairs = process.env.PAIRS;
            let body = `## ABBA tiebreaker — \`${head}\` vs \`main\` (${pairs} pairs)\n\n`;
            if (process.env.OUTCOME === 'success') {
              // Prefer the extracted summary; fall back to the full log if the
              // summary marker was not found.
              const res = clip(read('abba_result.txt') || read('abba_out.txt'));
              body += '```\n' + res + '\n```\n';
              body += '\n<sub>Drift-free interleaved A/B/B/A measurement. + = PR faster. ';
              body += 'Trust the verdict when paired-t and Wilcoxon agree.</sub>\n';
            } else {
              const tail = clip(read('abba_out.txt').split('\n').slice(-30).join('\n'));
              body += `❌ Run failed. Last log lines:\n\n` + '```\n' + tail + '\n```\n';
            }
            await github.rest.issues.createComment({
              owner: context.repo.owner, repo: context.repo.repo,
              issue_number: context.issue.number, body
            });
26 changes: 22 additions & 4 deletions .github/workflows/benchmark-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ env:
ELF: executor/program_artifacts/rust/ethrex.elf
INPUT: executor/tests/ethrex_bench_20.bin
BENCH_RUNS_PR: 3
BENCH_RUNS_BASELINE: 3
# Cheap-tier screen: catches regressions down to ~1.5% on its own and leaves
# smaller/ambiguous deltas to the manual drift-free ABBA tiebreaker. Pushing either
# side past 5 buys little here (the cached comparison can't beat the ~1% drift wall),
# so the per-PR run count is also capped at 5 (clamp below).
BENCH_RUNS_BASELINE: 5
# Memory-scaling sweep: same ELF, different N-transfer inputs. GROWTH_PROGRAMS
# are the generated (gitignored) fixture basenames in executor/tests/; GROWTH_STEPS
# the matching transfer counts (x-axis; slope is MB per transfer).
Expand All @@ -55,6 +59,7 @@ jobs:
(github.event_name == 'issue_comment' &&
github.event.issue.pull_request &&
startsWith(github.event.comment.body, '/bench') &&
!startsWith(github.event.comment.body, '/bench-abba') &&
contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
steps:
- name: React to comment
Expand Down Expand Up @@ -136,9 +141,10 @@ jobs:
RUNS=$BENCH_RUNS_PR
fi

# Clamp to 1-10
if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 10 ] 2>/dev/null; then
echo "::warning::Run count $RUNS out of range [1,10], defaulting to $BENCH_RUNS_PR"
# Clamp to 1-5. Beyond 5 the single-session cached comparison barely improves
# (it can't beat the ~1% drift wall); use the ABBA tiebreaker for finer deltas.
if [ "$RUNS" -lt 1 ] 2>/dev/null || [ "$RUNS" -gt 5 ] 2>/dev/null; then
echo "::warning::Run count $RUNS out of range [1,5], defaulting to $BENCH_RUNS_PR"
RUNS=$BENCH_RUNS_PR
fi

Expand Down Expand Up @@ -702,6 +708,18 @@ jobs:
body += `> ✅ No significant change.\n`;
}

// Tier-1 -> Tier-2 escalation: a small time speedup the cheap 3-5 run CI
// can't confirm (it catches >=~1.5% on its own). Point at the drift-free
// ABBA tiebreaker, which the user runs on demand via `/bench-abba`.
const tp = parseFloat(timePct);
if (tp < 0 && tp > -1.5) {
body += `>\n`;
body += `> 🔬 **Looks like a small speedup (${fmt(timePct)}%) — below what ${runs} runs can confirm.** `;
body += `Comment \`/bench-abba\` to run the drift-free ABBA tiebreaker (paired-t CI + exact Wilcoxon). `;
body += `Note: it occupies the bench server for ~30–40 min.\n`;
body += `> Optional pair count: \`/bench-abba 32\` (20 resolves ~1%, 32 for ~0.6%).\n`;
}

// Spread warnings
const prWarnings = [];
const baseWarnings = [];
Expand Down
Loading
Loading