cppalliance · wpak-ai · Jun 26, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -115,7 +115,7 @@ jobs:
         # exercise Flask routes via app.test_client(). Only listed files — not
         # `pytest tests/` — to avoid re-collecting unittest.TestCase classes above.
         # -o addopts= avoids inheriting benchmark-only options from pyproject.toml.
-        run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py -v --tb=short -o addopts=
+        run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py tests/test_reduce_baselines.py -v --tb=short -o addopts=
 
       # ── PyInstaller desktop build (Windows only, once per workflow) ────────
       # Closes #44. Builds the onedir bundle and smoke-tests --help so the
@@ -215,7 +215,7 @@ jobs:
             --redact \
             --exit-code 1
 
-  # ── Performance benchmarks: summary cache (issue #115) ─────────────────────
+  # ── Performance benchmarks: unified suite (issues #115, #110) ──────────────
   benchmarks:
     name: Performance benchmarks (gated)
     needs: [unittest]
@@ -236,7 +236,7 @@ jobs:
           python -m pip install -r requirements-lock.txt
           python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0'
 
-      - name: Run summary-cache benchmarks
+      - name: Run benchmark suite
         run: >
           python -m pytest tests/benchmarks/
           --benchmark-only

diff --git a/.gitignore b/.gitignore
@@ -46,3 +46,5 @@ coverage.xml
 .hypothesis/
 benchmark-results.json
 benchmarks/_raw.json
+benchmarks/_merged.json
+benchmarks/_ci/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,21 @@
+.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts
+
+# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI.
+# Prefer downloading benchmark-results.json from a CI artifact, then:
+#   python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 --source ubuntu-latest-ci
+seed-baselines-local:
+	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
+	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	python -c "import os, subprocess, sys; \
+	  cmd = [sys.executable, 'scripts/reduce_baselines.py', 'benchmarks/_raw.json', 'benchmarks/baselines.json', '--slack', '1.5', '--source', 'local']; \
+	  (subprocess.run(cmd, check=True), print('Updated benchmarks/baselines.json', file=sys.stderr)) if os.environ.get('FORCE') == '1' else print('Wrote benchmarks/_raw.json only. Set FORCE=1 to overwrite benchmarks/baselines.json.', file=sys.stderr)"
+
+# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
+update-baselines: seed-baselines-local
+
+check-benchmarks:
+	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
+clean-benchmark-artifacts:
+	python -c "import pathlib; [p.unlink(missing_ok=True) for p in (pathlib.Path('benchmarks/_raw.json'), pathlib.Path('benchmark-results.json'))]"
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,70 @@
+# Performance benchmarks
+
+Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate.
+
+The benchmarks provide repeatable local measurements of the workspace-listing, export, search, and summary-cache hot paths.
+
+## Run locally
+
+```bash
+pip install -r requirements-lock.txt
+pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0'
+pytest tests/benchmarks/ --benchmark-only -o addopts= -v
+```
+
+## Scenarios
+
+| Group | What |
+|-------|------|
+| parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers |
+| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora (capped at 50 for CI runtime; parse goes to 200) |
+| search | `GET /api/search` over a 50-composer corpus — **live-scan** (`test_search_full_corpus_live_scan`, `NO_SEARCH_INDEX=1`) and **FTS index** (`test_search_full_corpus_indexed`, pre-built index) |
+| summary-cache | projects lookup (hit/miss), composer-map lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup |
+
+Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency.
+
+### Adding a benchmark group
+
+Every `@pytest.mark.benchmark(group="...")` name must appear in `GATED_GROUPS` inside `scripts/reduce_baselines.py`. Otherwise `reduce_baselines.py` fails at refresh time with an unknown-group error. Update both the test marker and `GATED_GROUPS` when introducing a new group.
+
+## CI gate
+
+The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`.
+
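+- **Fail** when a gated mean exceeds its baseline by **>20%** (baselines already include the `--slack` multiplier, so with 1.5x slack this is about 1.8x the captured mean)
+- **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups; with 1.5x slack this is about 0.75x the captured mean)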
+- **Fail** when a gated baseline name has no current result
+- **Warn** for benchmarks without a baseline entry
+- All benchmarks listed in `baselines.json` are gated unless named in `EXCLUDED_FROM_GATE` in `scripts/check_benchmark_regression.py`
+
+Runner: `ubuntu-latest` (a rolling label, not a fixed image — refresh baselines when GitHub repoints it), `--benchmark-min-rounds=5`.
+
+Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, regenerate the baselines with a larger `--slack` or add the affected benches to `EXCLUDED_FROM_GATE`.
+
+`test_summary_cache_round_trip` is intentionally excluded from the gate: it calls `set_cached_projects` (file write) + `get_cached_projects` (file read) each round, so OS page-cache state on shared runners causes 3–5x variation between consecutive CI runs. The baseline entry is kept for reference only; the gate script skips it without printing a comparison.
+
+## Refresh baselines
+
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible:
+
+```bash
+python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 --source ubuntu-latest-ci
+```
+
+For a quick local snapshot only (may not match CI timings):
+
+```bash
+make seed-baselines-local
+# writes benchmarks/_raw.json only; does not overwrite benchmarks/baselines.json
+make seed-baselines-local FORCE=1   # also runs reduce_baselines into benchmarks/baselines.json
+```
+
+`make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew.
+
+## Makefile targets
+
+| Target | Purpose |
+|--------|---------|
+| `make check-benchmarks` | Run suite + regression gate locally |
+| `make seed-baselines-local` | Capture local timings to `benchmarks/_raw.json` (use `FORCE=1` to update `baselines.json`) |
+| `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` |
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
@@ -1,15 +1,32 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). Refresh after intentional perf changes: download benchmark-results.json from the CI artifacts job, then `python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json` (re-seed with reduce_baselines or edit means). Local capture: `pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=` on ubuntu-latest.",
-  "updated": "2026-06-24T19:20:27Z",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Excluded from gate (recorded for reference): test_summary_cache_round_trip. Refresh after intentional speedups via reduce_baselines.py.",
+  "updated": "2026-06-25T23:36:11Z",
   "machine": "Linux",
   "groups": {
+    "parse": {
+      "test_list_workspace_projects_nocache[composers-10]": 0.016421750017237738,
+      "test_list_workspace_projects_nocache[composers-50]": 0.07185380692856874,
+      "test_list_workspace_projects_nocache[composers-200]": 0.2388664538571439
+    },
+    "export": {
+      "test_post_export_zip[composers-10]": 0.010621589857140498,
+      "test_post_export_zip[composers-50]": 0.03968703356250458
+    },
+    "search": {
+      "test_search_full_corpus_live_scan": 0.04461661563157736,
+      "test_search_full_corpus_indexed": 0.05512249660713918
+    },
     "summary-cache": {
-      "test_summary_cache_hit": 6.3e-05,
-      "test_summary_cache_miss": 6.3e-05,
-      "test_fingerprint_workspace_entries[10]": 0.001844,
-      "test_fingerprint_workspace_entries[50]": 0.007759,
-      "test_fingerprint_workspace_entries[200]": 0.022231,
-      "test_summary_cache_round_trip": 0.000351
+      "test_summary_cache_lookup[hit]": 7.249851343825762e-05,
+      "test_summary_cache_lookup[miss]": 7.193702095574013e-05,
+      "test_composer_map_cache_lookup[hit]": 7.151645086519804e-05,
+      "test_composer_map_cache_lookup[miss]": 7.112598943352091e-05,
+      "test_fingerprint_workspace_entries[10]": 0.0024127972424549185,
+      "test_fingerprint_workspace_entries[50]": 0.010196820941858245,
+      "test_fingerprint_workspace_entries[200]": 0.029070524094341035,
+      "test_summary_cache_round_trip": 0.0004703680658560554,
+      "test_tab_summary_cache_lookup[hit]": 7.844850562859133e-05,
+      "test_tab_summary_cache_lookup[miss]": 7.843399021512e-05
     }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
@@ -4,10 +4,24 @@
 
 import argparse
 import json
+import math
 import sys
 from pathlib import Path
 
 THRESHOLD = 1.20
+STALE_FLOOR = 0.50
+
+# Benchmarks recorded in baselines.json but excluded from the regression gate.
+# Use sparingly — only for benches whose timing is inherently noisy across CI runs
+# (e.g. file I/O operations that depend on OS page-cache state).
+EXCLUDED_FROM_GATE: frozenset[str] = frozenset(
+    {
+        # round_trip calls set_cached_projects (file write) + get_cached_projects (file read)
+        # each round. OS page-cache state on shared runners causes 3–5x variation between
+        # consecutive CI runs, making this ungatable with any reasonable slack.
+        "test_summary_cache_round_trip",
+    }
+)
 
 
 class BenchmarkDataError(ValueError):
@@ -97,19 +111,35 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
     return means
 
 
+def _validate_gate_ratios(threshold: float, stale_floor: float) -> None:
+    if not math.isfinite(threshold):
+        raise BenchmarkDataError("threshold must be finite")
+    if threshold <= 1:
+        raise BenchmarkDataError("threshold must be greater than 1")
+    if not math.isfinite(stale_floor):
+        raise BenchmarkDataError("stale_floor must be finite")
+    if not 0 < stale_floor < 1:
+        raise BenchmarkDataError("stale_floor must be between 0 and 1 (exclusive)")
+
+
 def check_regression(
     results_path: str | Path,
     baselines_path: str | Path,
     *,
     threshold: float = THRESHOLD,
+    stale_floor: float = STALE_FLOOR,
 ) -> int:
-    """Return 0 when within threshold; 1 when any gated benchmark regresses."""
+    """Return 0 when within threshold; 1 when any gated benchmark regresses or is stale."""
+    _validate_gate_ratios(threshold, stale_floor)
     flat = load_results(results_path)
     baseline_means = load_baseline_means(baselines_path)
 
     failures: list[str] = []
+    stale: list[str] = []
     missing: list[str] = []
     for name, base in baseline_means.items():
+        if name in EXCLUDED_FROM_GATE:
+            continue
         cur = flat.get(name)
         if cur is None:
             print(f"FAIL: no current result for gated baseline {name!r}")
@@ -119,20 +149,32 @@ def check_regression(
             print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
             continue
         ratio = cur / base
-        tag = "FAIL" if ratio > threshold else "ok"
-        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
         if ratio > threshold:
+            tag = "FAIL"
             failures.append(name)
+        elif ratio < stale_floor:
+            tag = "STALE"
+            stale.append(name)
+        else:
+            tag = "ok"
+        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
 
     for name in flat:
+        if name in EXCLUDED_FROM_GATE:
+            continue
         if name not in baseline_means:
             print(f"WARN: {name!r} has no baseline yet; not gated")
 
     if failures:
         print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
+    if stale:
+        print(
+            f"\nSTALE: {len(stale)} benchmark(s) are faster than {stale_floor:.0%} of baseline "
+            "(refresh baselines after intentional speedups)"
+        )
     if missing:
         print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results")
-    if failures or missing:
+    if failures or stale or missing:
         return 1
     return 0
 
@@ -147,12 +189,19 @@ def main(argv: list[str] | None = None) -> int:
         default=THRESHOLD,
         help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
     )
+    parser.add_argument(
+        "--stale-floor",
+        type=float,
+        default=STALE_FLOOR,
+        help="fail when current mean is below this fraction of baseline (default: 0.50)",
+    )
     args = parser.parse_args(argv)
     try:
         return check_regression(
             args.results_path,
             args.baselines_path,
             threshold=args.threshold,
+            stale_floor=args.stale_floor,
         )
     except BenchmarkDataError as exc:
         print(f"ERROR: {exc}", file=sys.stderr)