Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:
# exercise Flask routes via app.test_client(). Only listed files — not
# `pytest tests/` — to avoid re-collecting unittest.TestCase classes above.
# -o addopts= avoids inheriting benchmark-only options from pyproject.toml.
run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py -v --tb=short -o addopts=
run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py tests/test_reduce_baselines.py -v --tb=short -o addopts=

# ── PyInstaller desktop build (Windows only, once per workflow) ────────
# Closes #44. Builds the onedir bundle and smoke-tests --help so the
Expand Down Expand Up @@ -215,7 +215,7 @@ jobs:
--redact \
--exit-code 1

# ── Performance benchmarks: summary cache (issue #115) ─────────────────────
# ── Performance benchmarks: unified suite (issues #115, #110) ──────────────
benchmarks:
name: Performance benchmarks (gated)
needs: [unittest]
Expand All @@ -236,7 +236,7 @@ jobs:
python -m pip install -r requirements-lock.txt
python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0'

- name: Run summary-cache benchmarks
- name: Run benchmark suite
run: >
python -m pytest tests/benchmarks/
--benchmark-only
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ coverage.xml
.hypothesis/
benchmark-results.json
benchmarks/_raw.json
benchmarks/_merged.json
benchmarks/_ci/
21 changes: 21 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts

# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI.
# Prefer downloading benchmark-results.json from a CI artifact, then:
# python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
#
# Step 1 always runs the benchmark suite and writes benchmarks/_raw.json.
# Step 2 runs scripts/reduce_baselines.py (slack 1.5, source "local") into
# benchmarks/baselines.json only when FORCE=1 is present in the environment;
# otherwise it just prints a reminder and leaves baselines.json untouched.
seed-baselines-local:
	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
	python -c "import os, subprocess, sys; \
	cmd = [sys.executable, 'scripts/reduce_baselines.py', 'benchmarks/_raw.json', 'benchmarks/baselines.json', '--slack', '1.5', '--source', 'local']; \
	(subprocess.run(cmd, check=True), print('Updated benchmarks/baselines.json', file=sys.stderr)) if os.environ.get('FORCE') == '1' else print('Wrote benchmarks/_raw.json only. Set FORCE=1 to overwrite benchmarks/baselines.json.', file=sys.stderr)"

# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
update-baselines: seed-baselines-local

# Run the benchmark suite into benchmark-results.json, then compare those
# results against benchmarks/baselines.json with the regression checker.
check-benchmarks:
	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json

# Delete benchmarks/_raw.json and benchmark-results.json; files that do not
# exist are ignored (missing_ok=True), so the target is safe to re-run.
clean-benchmark-artifacts:
	python -c "import pathlib; [p.unlink(missing_ok=True) for p in (pathlib.Path('benchmarks/_raw.json'), pathlib.Path('benchmark-results.json'))]"
70 changes: 70 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Performance benchmarks

Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate.

Repeatable local measurements for workspace listing, export, search, and summary-cache hot paths.

## Run locally

```bash
pip install -r requirements-lock.txt
pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0'
pytest tests/benchmarks/ --benchmark-only -o addopts= -v
```

## Scenarios

| Group | What |
|-------|------|
| parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers |
| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora (capped at 50 for CI runtime; parse goes to 200) |
| search | `GET /api/search` over a 50-composer corpus — **live-scan** (`test_search_full_corpus_live_scan`, `NO_SEARCH_INDEX=1`) and **FTS index** (`test_search_full_corpus_indexed`, pre-built index) |
| summary-cache | projects lookup (hit/miss), composer-map lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup |

Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency.

### Adding a benchmark group

Every `@pytest.mark.benchmark(group="...")` name must appear in `GATED_GROUPS` inside `scripts/reduce_baselines.py`. Otherwise `reduce_baselines.py` fails at refresh time with an unknown-group error. Update both the test marker and `GATED_GROUPS` when introducing a new group.

## CI gate

The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`.

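- **Fail** when a gated mean exceeds its baseline by **>20%** (ratio above `--threshold`, default `1.20`). The stored baseline already includes the `--slack` multiplier applied by `reduce_baselines.py`, so the 20% is measured on top of that slack.
- **Fail** when a gated mean is **<50%** of baseline (ratio below `--stale-floor`, default `0.50`). This means the baseline is stale — refresh it after intentional speedups.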
- **Fail** when a gated baseline name has no current result
- **Warn** for benchmarks without a baseline entry
- All benchmarks listed in `baselines.json` are gated unless named in `EXCLUDED_FROM_GATE` in `scripts/check_benchmark_regression.py`

Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`.

Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, raise `--slack` for those entries or add targeted exclusions in `EXCLUDED_FROM_GATE`.

`test_summary_cache_round_trip` is intentionally excluded from the gate: it calls `set_cached_projects` (file write) + `get_cached_projects` (file read) each round, so OS page-cache state on shared runners causes 3–5x variation between consecutive CI runs. The baseline entry is kept for observation only.

## Refresh baselines

After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible:

```bash
python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 --source ubuntu-latest-ci
```

For a quick local snapshot only (may not match CI timings):

```bash
make seed-baselines-local
# writes benchmarks/_raw.json only; does not overwrite benchmarks/baselines.json
make seed-baselines-local FORCE=1 # also runs reduce_baselines into benchmarks/baselines.json
```

`make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew.

## Makefile targets

| Target | Purpose |
|--------|---------|
| `make check-benchmarks` | Run suite + regression gate locally |
| `make seed-baselines-local` | Capture local timings to `benchmarks/_raw.json` (use `FORCE=1` to update `baselines.json`) |
| `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` |
33 changes: 25 additions & 8 deletions benchmarks/baselines.json
Original file line number Diff line number Diff line change
@@ -1,15 +1,32 @@
{
"_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). Refresh after intentional perf changes: download benchmark-results.json from the CI artifacts job, then `python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json` (re-seed with reduce_baselines or edit means). Local capture: `pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=` on ubuntu-latest.",
"updated": "2026-06-24T19:20:27Z",
"_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Excluded from gate (recorded for reference): test_summary_cache_round_trip. Refresh after intentional speedups via reduce_baselines.py.",
"updated": "2026-06-25T23:36:11Z",
"machine": "Linux",
"groups": {
"parse": {
"test_list_workspace_projects_nocache[composers-10]": 0.016421750017237738,
"test_list_workspace_projects_nocache[composers-50]": 0.07185380692856874,
"test_list_workspace_projects_nocache[composers-200]": 0.2388664538571439
},
"export": {
"test_post_export_zip[composers-10]": 0.010621589857140498,
"test_post_export_zip[composers-50]": 0.03968703356250458
},
"search": {
"test_search_full_corpus_live_scan": 0.04461661563157736,
"test_search_full_corpus_indexed": 0.05512249660713918
},
"summary-cache": {
"test_summary_cache_hit": 6.3e-05,
"test_summary_cache_miss": 6.3e-05,
"test_fingerprint_workspace_entries[10]": 0.001844,
"test_fingerprint_workspace_entries[50]": 0.007759,
"test_fingerprint_workspace_entries[200]": 0.022231,
"test_summary_cache_round_trip": 0.000351
"test_summary_cache_lookup[hit]": 7.249851343825762e-05,
"test_summary_cache_lookup[miss]": 7.193702095574013e-05,
"test_composer_map_cache_lookup[hit]": 7.151645086519804e-05,
"test_composer_map_cache_lookup[miss]": 7.112598943352091e-05,
"test_fingerprint_workspace_entries[10]": 0.0024127972424549185,
"test_fingerprint_workspace_entries[50]": 0.010196820941858245,
"test_fingerprint_workspace_entries[200]": 0.029070524094341035,
"test_summary_cache_round_trip": 0.0004703680658560554,
"test_tab_summary_cache_lookup[hit]": 7.844850562859133e-05,
"test_tab_summary_cache_lookup[miss]": 7.843399021512e-05
}
}
}
57 changes: 53 additions & 4 deletions scripts/check_benchmark_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,24 @@

import argparse
import json
import math
import sys
from pathlib import Path

# Default gate ratios, both expressed as current mean / baseline mean.
# THRESHOLD is the upper bound (regression) and STALE_FLOOR the lower bound
# (baseline has gone stale); each can be overridden from the command line.
THRESHOLD = 1.2
STALE_FLOOR = 0.5

# Names that stay in baselines.json for reference but are never gated.
# Keep this short: it is meant only for benchmarks whose timing is inherently
# noisy between CI runs (for example file I/O that depends on the OS page cache).
EXCLUDED_FROM_GATE: frozenset[str] = frozenset(
    (
        # Every round of round_trip performs a set_cached_projects file write
        # plus a get_cached_projects file read. Page-cache state on shared
        # runners swings it 3–5x between consecutive CI runs, so no reasonable
        # slack makes it gateable.
        "test_summary_cache_round_trip",
    )
)


class BenchmarkDataError(ValueError):
Expand Down Expand Up @@ -97,19 +111,35 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
return means


def _validate_gate_ratios(threshold: float, stale_floor: float) -> None:
    """Validate the two ratios used by the regression gate.

    ``threshold`` must be a finite number strictly greater than 1 and
    ``stale_floor`` a finite number strictly between 0 and 1. The checks run
    in that order, so a bad ``threshold`` is reported before a bad
    ``stale_floor``.

    Raises:
        BenchmarkDataError: if either ratio is non-finite or out of range.
    """
    if not math.isfinite(threshold):
        problem = "threshold must be finite"
    elif threshold <= 1:
        problem = "threshold must be greater than 1"
    elif not math.isfinite(stale_floor):
        problem = "stale_floor must be finite"
    elif stale_floor <= 0 or stale_floor >= 1:
        # Equivalent to `not 0 < stale_floor < 1`; NaN was rejected above.
        problem = "stale_floor must be between 0 and 1 (exclusive)"
    else:
        return
    raise BenchmarkDataError(problem)


def check_regression(
results_path: str | Path,
baselines_path: str | Path,
*,
threshold: float = THRESHOLD,
stale_floor: float = STALE_FLOOR,
) -> int:
"""Return 0 when within threshold; 1 when any gated benchmark regresses."""
"""Return 0 when within threshold; 1 when any gated benchmark regresses or is stale."""
_validate_gate_ratios(threshold, stale_floor)
flat = load_results(results_path)
baseline_means = load_baseline_means(baselines_path)

failures: list[str] = []
stale: list[str] = []
missing: list[str] = []
for name, base in baseline_means.items():
if name in EXCLUDED_FROM_GATE:
continue
cur = flat.get(name)
if cur is None:
print(f"FAIL: no current result for gated baseline {name!r}")
Expand All @@ -119,20 +149,32 @@ def check_regression(
print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
continue
ratio = cur / base
tag = "FAIL" if ratio > threshold else "ok"
print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
if ratio > threshold:
tag = "FAIL"
failures.append(name)
elif ratio < stale_floor:
tag = "STALE"
stale.append(name)
else:
tag = "ok"
print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")

for name in flat:
if name in EXCLUDED_FROM_GATE:
continue
if name not in baseline_means:
print(f"WARN: {name!r} has no baseline yet; not gated")

if failures:
print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
if stale:
print(
f"\nSTALE: {len(stale)} benchmark(s) are faster than {stale_floor:.0%} of baseline "
"(refresh baselines after intentional speedups)"
)
if missing:
print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results")
if failures or missing:
if failures or stale or missing:
return 1
return 0

Expand All @@ -147,12 +189,19 @@ def main(argv: list[str] | None = None) -> int:
default=THRESHOLD,
help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
)
parser.add_argument(
"--stale-floor",
type=float,
default=STALE_FLOOR,
help="fail when current mean is below this fraction of baseline (default: 0.50)",
)
args = parser.parse_args(argv)
try:
return check_regression(
args.results_path,
args.baselines_path,
threshold=args.threshold,
stale_floor=args.stale_floor,
)
except BenchmarkDataError as exc:
print(f"ERROR: {exc}", file=sys.stderr)
Expand Down
Loading
Loading