From 5eef89730aefca71d09a6bcc6f76b8ead187fc2b Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 04:45:48 +0800
Subject: [PATCH 1/7] ci: unified benchmark suite with full baselines and
 regression gate

---
 .github/workflows/tests.yml                  |   4 +-
 Makefile                                     |  19 +++
 benchmarks/README.md                         |  60 ++++++++
 benchmarks/baselines.json                    |  32 ++--
 scripts/check_benchmark_regression.py        |  40 ++++-
 scripts/reduce_baselines.py                  | 112 ++++++++++++++
 tests/benchmarks/conftest.py                 | 146 ++++++++++++++++++-
 tests/benchmarks/test_export_bench.py        |  30 ++++
 tests/benchmarks/test_parse_bench.py         |  27 ++++
 tests/benchmarks/test_search_bench.py        |  26 ++++
 tests/benchmarks/test_summary_cache_bench.py |  45 +++---
 tests/test_check_benchmark_regression.py     |  50 ++++++-
 12 files changed, 544 insertions(+), 47 deletions(-)
 create mode 100644 Makefile
 create mode 100644 benchmarks/README.md
 create mode 100644 scripts/reduce_baselines.py
 create mode 100644 tests/benchmarks/test_export_bench.py
 create mode 100644 tests/benchmarks/test_parse_bench.py
 create mode 100644 tests/benchmarks/test_search_bench.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 158b598..3562801 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -215,7 +215,7 @@ jobs:
             --redact \
             --exit-code 1
 
-  # ── Performance benchmarks: summary cache (issue #115) ─────────────────────
+  # ── Performance benchmarks: unified suite (issues #115, #110) ──────────────
   benchmarks:
     name: Performance benchmarks (gated)
     needs: [unittest]
@@ -236,7 +236,7 @@ jobs:
           python -m pip install -r requirements-lock.txt
           python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0'
 
-      - name: Run summary-cache benchmarks
+      - name: Run benchmark suite
         run: >
           python -m pytest tests/benchmarks/
           --benchmark-only
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2a27405
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts
+
+# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI.
+# Prefer downloading benchmark-results.json from a CI artifact, then:
+#   python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+seed-baselines-local:
+	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
+	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5
+
+# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
+update-baselines: seed-baselines-local
+
+check-benchmarks:
+	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
+clean-benchmark-artifacts:
+	rm -f benchmarks/_raw.json benchmark-results.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..0bd1725
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,60 @@
+# Performance benchmarks
+
+Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate.
+
+Repeatable local measurements for workspace listing, export, search, and summary-cache hot paths.
+
+## Run locally
+
+```bash
+pip install -r requirements-lock.txt
+pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0'
+pytest tests/benchmarks/ --benchmark-only -o addopts= -v
+```
+
+## Scenarios
+
+| Group | What |
+|-------|------|
+| parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers |
+| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora |
+| search | `GET /api/search` over a 50-composer synthetic corpus |
+| summary-cache | cache lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup |
+
+Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency.
+
+## CI gate
+
+The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`.
+
+- **Fail** when a gated mean exceeds its baseline by **>20%**
+- **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups)
+- **Fail** when a gated baseline name has no current result
+- **Warn** for benchmarks without a baseline entry
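+- **Skip gate** for `EXCLUDED_FROM_GATE` names (smallest parse corpus, full-corpus search — too noisy on shared CI runners for a fixed 20% gate); their means are still recorded in `baselines.json` for reference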
+
+Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`.
+
+## Refresh baselines
+
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible:
+
+```bash
+python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+```
+
+For a quick local snapshot only (may not match CI timings):
+
+```bash
+make seed-baselines-local
+```
+
+`make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew.
+
+## Makefile targets
+
+| Target | Purpose |
+|--------|---------|
+| `make check-benchmarks` | Run suite + regression gate locally |
+| `make seed-baselines-local` | Capture local timings into `benchmarks/baselines.json` (with slack) |
+| `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` |
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 131b638..e8a35fa 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,15 +1,29 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). Refresh after intentional perf changes: download benchmark-results.json from the CI artifacts job, then `python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json` (re-seed with reduce_baselines or edit means). Local capture: `pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=` on ubuntu-latest.",
-  "updated": "2026-06-24T19:20:27Z",
-  "machine": "Linux",
+  "_note": "Gated means seeded locally (Windows, 1.5× slack) — refresh from ubuntu-latest CI benchmark-results.json artifact before merge. Excluded from gate: test_list_workspace_projects_nocache[composers-10], test_search_full_corpus.",
+  "updated": "2026-06-25T20:34:07Z",
+  "machine": "Windows",
   "groups": {
+    "parse": {
+      "test_list_workspace_projects_nocache[composers-10]": 0.01313006085768828,
+      "test_list_workspace_projects_nocache[composers-50]": 0.04705098008271307,
+      "test_list_workspace_projects_nocache[composers-200]": 0.19944224995560944
+    },
+    "export": {
+      "test_post_export_zip[composers-10]": 0.0170322916819714,
+      "test_post_export_zip[composers-50]": 0.040990050032269215
+    },
+    "search": {
+      "test_search_full_corpus": 0.057670830062124874
+    },
     "summary-cache": {
-      "test_summary_cache_hit": 6.3e-05,
-      "test_summary_cache_miss": 6.3e-05,
-      "test_fingerprint_workspace_entries[10]": 0.001844,
-      "test_fingerprint_workspace_entries[50]": 0.007759,
-      "test_fingerprint_workspace_entries[200]": 0.022231,
-      "test_summary_cache_round_trip": 0.000351
+      "test_summary_cache_lookup[hit]": 0.00014543285277406022,
+      "test_summary_cache_lookup[miss]": 0.0001437347241805802,
+      "test_fingerprint_workspace_entries[10]": 0.001866654586096193,
+      "test_fingerprint_workspace_entries[50]": 0.00636450619807407,
+      "test_fingerprint_workspace_entries[200]": 0.020523441289855247,
+      "test_summary_cache_round_trip": 0.0019650292328056915,
+      "test_tab_summary_cache_lookup[hit]": 0.00015344636292124477,
+      "test_tab_summary_cache_lookup[miss]": 0.00012440098537902896
     }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index d2fc79c..b2d8d53 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -8,6 +8,15 @@
 from pathlib import Path
 
 THRESHOLD = 1.20
+STALE_FLOOR = 0.50
+
+# These benchmarks are too noisy on shared ubuntu CI runners for a fixed 20% gate.
+EXCLUDED_FROM_GATE = frozenset(
+    {
+        "test_list_workspace_projects_nocache[composers-10]",
+        "test_search_full_corpus",
+    }
+)
 
 
 class BenchmarkDataError(ValueError):
@@ -102,14 +111,18 @@ def check_regression(
     baselines_path: str | Path,
     *,
     threshold: float = THRESHOLD,
+    stale_floor: float = STALE_FLOOR,
 ) -> int:
-    """Return 0 when within threshold; 1 when any gated benchmark regresses."""
+    """Return 0 when within threshold; 1 when any gated benchmark regresses or is stale."""
     flat = load_results(results_path)
     baseline_means = load_baseline_means(baselines_path)
 
     failures: list[str] = []
+    stale: list[str] = []
     missing: list[str] = []
     for name, base in baseline_means.items():
+        if name in EXCLUDED_FROM_GATE:
+            continue
         cur = flat.get(name)
         if cur is None:
             print(f"FAIL: no current result for gated baseline {name!r}")
@@ -119,20 +132,32 @@ def check_regression(
             print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
             continue
         ratio = cur / base
-        tag = "FAIL" if ratio > threshold else "ok"
-        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
         if ratio > threshold:
+            tag = "FAIL"
             failures.append(name)
+        elif ratio < stale_floor:
+            tag = "STALE"
+            stale.append(name)
+        else:
+            tag = "ok"
+        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
 
     for name in flat:
+        if name in EXCLUDED_FROM_GATE:
+            continue
         if name not in baseline_means:
             print(f"WARN: {name!r} has no baseline yet; not gated")
 
     if failures:
         print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
+    if stale:
+        print(
+            f"\nSTALE: {len(stale)} benchmark(s) are faster than {stale_floor:.0%} of baseline "
+            "(refresh baselines after intentional speedups)"
+        )
     if missing:
         print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results")
-    if failures or missing:
+    if failures or stale or missing:
         return 1
     return 0
 
@@ -147,12 +172,19 @@ def main(argv: list[str] | None = None) -> int:
         default=THRESHOLD,
         help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
     )
+    parser.add_argument(
+        "--stale-floor",
+        type=float,
+        default=STALE_FLOOR,
+        help="fail when current mean is below this fraction of baseline (default: 0.50)",
+    )
     args = parser.parse_args(argv)
     try:
         return check_regression(
             args.results_path,
             args.baselines_path,
             threshold=args.threshold,
+            stale_floor=args.stale_floor,
         )
     except BenchmarkDataError as exc:
         print(f"ERROR: {exc}", file=sys.stderr)
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
new file mode 100644
index 0000000..76d2560
--- /dev/null
+++ b/scripts/reduce_baselines.py
@@ -0,0 +1,112 @@
+"""Reduce pytest-benchmark JSON into benchmarks/baselines.json."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from scripts.check_benchmark_regression import (
+    EXCLUDED_FROM_GATE,
+    BenchmarkDataError,
+    normalize_benchmark_name,
+)
+
+GATED_GROUPS = ("parse", "export", "search", "summary-cache")
+
+
+def _positive_float(value: str) -> float:
+    parsed = float(value)
+    if parsed <= 0:
+        raise argparse.ArgumentTypeError("slack must be greater than zero")
+    return parsed
+
+
+def reduce_baselines(
+    raw_path: str | Path,
+    out_path: str | Path,
+    *,
+    slack: float = 1.0,
+) -> dict[str, object]:
+    path = Path(raw_path)
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+
+    try:
+        entries = raw["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(entries, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
+    groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS}
+    for index, entry in enumerate(entries):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            raw_name = entry["name"]
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        bench_name = normalize_benchmark_name(str(raw_name))
+        group = entry.get("group")
+        if group not in GATED_GROUPS:
+            continue
+        groups[group][bench_name] = mean * slack
+
+    excluded = ", ".join(sorted(EXCLUDED_FROM_GATE))
+    slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
+    machine_info = raw.get("machine_info")
+    machine = machine_info.get("system") if isinstance(machine_info, dict) else None
+    output: dict[str, object] = {
+        "_note": (
+            "Gated means from ubuntu-latest CI benchmark-results.json."
+            f"{slack_note} "
+            f"Excluded from gate (recorded for reference): {excluded}. "
+            "Refresh after intentional speedups via reduce_baselines.py."
+        ),
+        "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "machine": machine,
+        "groups": groups,
+    }
+    out = Path(out_path)
+    try:
+        out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc
+    return output
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("out_path", help="destination baselines.json path")
+    parser.add_argument(
+        "--slack",
+        type=_positive_float,
+        default=1.0,
+        help="multiply means by this factor (must be > 0)",
+    )
+    args = parser.parse_args(argv)
+    try:
+        reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
index e3e17e2..ed16f09 100644
--- a/tests/benchmarks/conftest.py
+++ b/tests/benchmarks/conftest.py
@@ -1,15 +1,22 @@
-"""Synthetic workspace trees for summary-cache performance benchmarks."""
+"""Shared synthetic fixtures for pytest-benchmark hot paths."""
 
 from __future__ import annotations
 
+import contextlib
+import json
+import sqlite3
 from pathlib import Path
 from typing import Any
 
 import pytest
+from flask.testing import FlaskClient
 
+from app import create_app
 from services import summary_cache
 from services.summary_cache import fingerprint_workspace_storage
 
+BENCH_SEARCH_TERM = "bench-search-token"
+
 
 def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]:
     """Build *count* synthetic workspace entries with on-disk state files."""
@@ -30,14 +37,89 @@ def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, A
     return entries
 
 
+def _composer_ids(count: int) -> list[tuple[str, str, str]]:
+    return [(f"ws_{i:04d}", f"cmp_{i:04d}", f"bub_{i:04d}") for i in range(count)]
+
+
+def build_bench_storage(root: Path, composer_count: int) -> dict[str, str]:
+    """Create workspaceStorage, globalStorage, and cli_chats trees for *composer_count* composers."""
+    ws_root = root / "workspaceStorage"
+    global_root = root / "globalStorage"
+    cli_root = root / "cli_chats"
+    projects_root = root / "projects"
+    ws_root.mkdir(parents=True)
+    global_root.mkdir(parents=True)
+    cli_root.mkdir(parents=True)
+    projects_root.mkdir(parents=True)
+
+    global_db_path = global_root / "state.vscdb"
+    with contextlib.closing(sqlite3.connect(global_db_path)) as conn:
+        conn.execute("CREATE TABLE cursorDiskKV ([key] TEXT PRIMARY KEY, value TEXT)")
+        base_ts = 1_715_000_000_000
+        for i, (workspace_id, composer_id, bubble_id) in enumerate(_composer_ids(composer_count)):
+            project_folder = projects_root / f"proj_{i:04d}"
+            project_folder.mkdir(parents=True, exist_ok=True)
+
+            ws_dir = ws_root / workspace_id
+            ws_dir.mkdir(parents=True, exist_ok=True)
+            (ws_dir / "workspace.json").write_text(
+                json.dumps({"folder": str(project_folder)}),
+                encoding="utf-8",
+            )
+            with contextlib.closing(sqlite3.connect(ws_dir / "state.vscdb")) as ws_conn:
+                ws_conn.execute("CREATE TABLE ItemTable ([key] TEXT PRIMARY KEY, value TEXT)")
+                ws_conn.execute(
+                    "INSERT INTO ItemTable ([key], value) VALUES (?, ?)",
+                    (
+                        "composer.composerData",
+                        json.dumps({"allComposers": [{"composerId": composer_id}]}),
+                    ),
+                )
+                ws_conn.commit()
+
+            created_at = base_ts + i * 1_000
+            conn.execute(
+                "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)",
+                (
+                    f"composerData:{composer_id}",
+                    json.dumps(
+                        {
+                            "name": f"Bench chat {i:04d}",
+                            "createdAt": created_at,
+                            "lastUpdatedAt": created_at + 500,
+                            "fullConversationHeadersOnly": [
+                                {"bubbleId": bubble_id, "type": 1},
+                            ],
+                            "modelConfig": {"modelName": "gpt-4o"},
+                        }
+                    ),
+                ),
+            )
+            conn.execute(
+                "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)",
+                (
+                    f"bubbleId:{composer_id}:{bubble_id}",
+                    json.dumps(
+                        {
+                            "text": f"find {BENCH_SEARCH_TERM} in composer {i:04d}",
+                            "type": "user",
+                            "createdAt": created_at + 400,
+                        }
+                    ),
+                ),
+            )
+        conn.commit()
+
+    return {
+        "workspace_path": str(ws_root),
+        "cli_chats_path": str(cli_root),
+        "storage_root": str(root),
+    }
+
+
 @pytest.fixture
 def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
-    """Redirect summary-cache files to an isolated temp directory.
-
-    Patches ``CACHE_DIR`` (also used by tab-summary paths via ``_tab_summaries_path``)
-    plus the projects/composer-map file constants used by current benchmarks.
-    Tab-summary cache benchmarks are deferred to issue #110 (unified benchmark suite).
-    """
+    """Redirect summary-cache files to an isolated temp directory."""
     cache_dir = tmp_path / "cache"
     cache_dir.mkdir()
     monkeypatch.setattr(summary_cache, "CACHE_DIR", cache_dir)
@@ -87,3 +169,53 @@ def workspace_fingerprint(synthetic_workspace: tuple[str, list[dict[str, Any]]])
 def stale_fingerprint(workspace_fingerprint: dict[str, Any]) -> dict[str, Any]:
     """Return a fingerprint guaranteed to differ from the stored one."""
     return {**workspace_fingerprint, "rules_digest": "deadbeefdeadbeef"}
+
+
+@pytest.fixture
+def bench_storage(tmp_path: Path, request: pytest.FixtureRequest) -> dict[str, str]:
+    """On-disk Cursor layout with N composers (N = indirect ``bench_storage`` param; default 10)."""
+    count = getattr(request, "param", 10)
+    return build_bench_storage(tmp_path / "storage", count)
+
+
+@pytest.fixture
+def bench_env(
+    bench_storage: dict[str, str],
+    monkeypatch: pytest.MonkeyPatch,
+) -> dict[str, str]:
+    """Set WORKSPACE_PATH / CLI_CHATS_PATH for the synthetic storage tree."""
+    monkeypatch.setenv("WORKSPACE_PATH", bench_storage["workspace_path"])
+    monkeypatch.setenv("CLI_CHATS_PATH", bench_storage["cli_chats_path"])
+    monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1")
+    return bench_storage
+
+
+@pytest.fixture
+def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient:
+    """Flask test client bound to synthetic bench storage."""
+    state_dir = tmp_path / ".cursor-chat-browser"
+    state_dir.mkdir()
+    monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir))
+    app = create_app()
+    app.config["TESTING"] = True
+    app.config["EXCLUSION_RULES"] = []
+    return app.test_client()
+
+
+@pytest.fixture
+def bench_client_search_corpus(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> FlaskClient:
+    """Flask client over a fixed 50-composer corpus for search benchmarks."""
+    storage = build_bench_storage(tmp_path / "search_storage", 50)
+    monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"])
+    monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"])
+    monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1")
+    state_dir = tmp_path / ".cursor-chat-browser-search"
+    state_dir.mkdir()
+    monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir))
+    app = create_app()
+    app.config["TESTING"] = True
+    app.config["EXCLUSION_RULES"] = []
+    return app.test_client()
diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py
new file mode 100644
index 0000000..c5e3051
--- /dev/null
+++ b/tests/benchmarks/test_export_bench.py
@@ -0,0 +1,30 @@
+"""Benchmark POST /api/export (ZIP) over synthetic workspace + global DB."""
+
+from __future__ import annotations
+
+import pytest
+from flask.testing import FlaskClient
+
+
+@pytest.mark.benchmark(group="export")
+@pytest.mark.parametrize(
+    "bench_storage",
+    [10, 50],
+    indirect=True,
+    ids=["composers-10", "composers-50"],
+)
+def test_post_export_zip(
+    benchmark,
+    bench_client: FlaskClient,
+) -> None:
+    def _run() -> object:
+        return bench_client.post(
+            "/api/export",
+            json={},
+            content_type="application/json",
+        )
+
+    response = benchmark(_run)
+    assert response.status_code == 200
+    assert response.content_type.startswith("application/zip")
+    assert int(response.headers.get("X-Export-Count", "0")) >= 1
diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py
new file mode 100644
index 0000000..d3814d9
--- /dev/null
+++ b/tests/benchmarks/test_parse_bench.py
@@ -0,0 +1,27 @@
+"""Benchmark list_workspace_projects (nocache) over synthetic composer corpora."""
+
+from __future__ import annotations
+
+import pytest
+
+from services.workspace_listing import list_workspace_projects
+
+
+@pytest.mark.benchmark(group="parse")
+@pytest.mark.parametrize(
+    "bench_storage",
+    [10, 50, 200],
+    indirect=True,
+    ids=["composers-10", "composers-50", "composers-200"],
+)
+def test_list_workspace_projects_nocache(
+    benchmark,
+    bench_env: dict[str, str],
+) -> None:
+    workspace_path = bench_env["workspace_path"]
+
+    def _run() -> object:
+        return list_workspace_projects(workspace_path, [], nocache=True)
+
+    projects, _warnings = benchmark(_run)
+    assert isinstance(projects, list) and len(projects) > 0
diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py
new file mode 100644
index 0000000..1606f24
--- /dev/null
+++ b/tests/benchmarks/test_search_bench.py
@@ -0,0 +1,26 @@
+"""Benchmark GET /api/search over a 50-composer synthetic corpus."""
+
+from __future__ import annotations
+
+import pytest
+from flask.testing import FlaskClient
+
+from tests.benchmarks.conftest import BENCH_SEARCH_TERM
+
+
+@pytest.mark.benchmark(group="search")
+def test_search_full_corpus(
+    benchmark,
+    bench_client_search_corpus: FlaskClient,
+) -> None:
+    def _run() -> object:
+        return bench_client_search_corpus.get(
+            f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1",
+        )
+
+    response = benchmark(_run)
+    assert response.status_code == 200
+    body = response.get_json()
+    assert isinstance(body, dict)
+    results = body.get("results")
+    assert isinstance(results, list) and len(results) > 0
diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py
index dad4a15..70f127b 100644
--- a/tests/benchmarks/test_summary_cache_bench.py
+++ b/tests/benchmarks/test_summary_cache_bench.py
@@ -1,44 +1,35 @@
-"""pytest-benchmark coverage for services/summary_cache.py hot paths.
-
-``test_summary_cache_hit`` and ``test_summary_cache_miss`` both time ``get_cached_projects``
-only. Miss means fingerprint mismatch (cache not used), not a full cache rebuild.
-"""
+"""pytest-benchmark coverage for services/summary_cache.py hot paths."""
 
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Any
+from typing import Any, Literal
 
 import pytest
 
 from services.summary_cache import (
     fingerprint_workspace_storage,
     get_cached_projects,
+    get_cached_tab_summaries,
     set_cached_projects,
+    set_cached_tab_summaries,
 )
 
 
 @pytest.mark.benchmark(group="summary-cache")
-def test_summary_cache_hit(
-    benchmark,
-    summary_cache_dir: Path,
-    workspace_fingerprint: dict[str, Any],
-    sample_projects: list[dict[str, Any]],
-) -> None:
-    set_cached_projects(workspace_fingerprint, sample_projects, [])
-    benchmark(get_cached_projects, workspace_fingerprint)
-
-
-@pytest.mark.benchmark(group="summary-cache")
-def test_summary_cache_miss(
+@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"])
+def test_summary_cache_lookup(
     benchmark,
+    mode: Literal["hit", "miss"],
     summary_cache_dir: Path,
     workspace_fingerprint: dict[str, Any],
     stale_fingerprint: dict[str, Any],
     sample_projects: list[dict[str, Any]],
 ) -> None:
+    """Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild."""
     set_cached_projects(workspace_fingerprint, sample_projects, [])
-    benchmark(get_cached_projects, stale_fingerprint)
+    lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
+    benchmark(get_cached_projects, lookup_fp)
 
 
 @pytest.mark.benchmark(group="summary-cache")
@@ -76,3 +67,19 @@ def _run() -> None:
         get_cached_projects(fp)
 
     benchmark(_run)
+
+
+@pytest.mark.benchmark(group="summary-cache")
+@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"])
+def test_tab_summary_cache_lookup(
+    benchmark,
+    mode: Literal["hit", "miss"],
+    summary_cache_dir: Path,
+    workspace_fingerprint: dict[str, Any],
+    stale_fingerprint: dict[str, Any],
+) -> None:
+    workspace_id = "ws_0000"
+    payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]}
+    set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200)
+    lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
+    benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index 8de10a8..e15118f 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -14,7 +14,7 @@
     normalize_benchmark_name,
 )
 
-GATED_BENCH = "test_summary_cache_hit"
+GATED_BENCH = "test_summary_cache_lookup[hit]"
 
 
 def _write_results(path, benchmarks: list[dict]) -> None:
@@ -32,9 +32,9 @@ def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None:
 
 
 def test_normalize_benchmark_name_strips_module_prefix() -> None:
-    full = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit"
-    assert normalize_benchmark_name(full) == "test_summary_cache_hit"
-    assert normalize_benchmark_name("test_summary_cache_hit") == "test_summary_cache_hit"
+    full = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]"
+    assert normalize_benchmark_name(full) == "test_summary_cache_lookup[hit]"
+    assert normalize_benchmark_name("test_summary_cache_lookup[hit]") == "test_summary_cache_lookup[hit]"
 
 
 def test_normalize_benchmark_name_preserves_colons_in_param_values() -> None:
@@ -50,13 +50,13 @@ def test_load_results_normalizes_full_node_id(tmp_path) -> None:
         path,
         [
             {
-                "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit",
+                "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]",
                 "stats": {"mean": 0.0001},
             }
         ],
     )
 
-    assert load_results(path)["test_summary_cache_hit"] == pytest.approx(0.0001)
+    assert load_results(path)["test_summary_cache_lookup[hit]"] == pytest.approx(0.0001)
 
 
 def test_missing_baseline_warns_without_failing(
@@ -213,3 +213,41 @@ def test_load_baseline_means_rejects_non_dict_group(tmp_path) -> None:
 
     with pytest.raises(BenchmarkDataError, match="must be an object"):
         load_baseline_means(baselines)
+
+
+def test_stale_baseline_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.00005}}],
+    )
+    _write_baselines(
+        baselines,
+        {"summary-cache": {GATED_BENCH: 0.0002}},
+    )
+
+    assert check_regression(results, baselines) == 1
+    out = capsys.readouterr().out
+    assert "STALE" in out
+
+
+def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE
+
+    excluded = next(iter(EXCLUDED_FROM_GATE))
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": excluded, "stats": {"mean": 1.0}}],
+    )
+    _write_baselines(
+        baselines,
+        {"search": {excluded: 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    out = capsys.readouterr().out
+    assert "REGRESSION" not in out
+    assert "STALE" not in out

From fcda8edcb005d712602d20f79cdfdb98d722dcbf Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 05:21:11 +0800
Subject: [PATCH 2/7] fix(bench): address review feedback and seed ubuntu
 baselines

---
 .gitignore                                   |  1 +
 benchmarks/baselines.json                    | 34 ++++++++++----------
 scripts/check_benchmark_regression.py        |  6 ++++
 scripts/reduce_baselines.py                  | 15 +++++++--
 tests/benchmarks/test_parse_bench.py         |  3 +-
 tests/benchmarks/test_summary_cache_bench.py | 18 +++++++++--
 tests/test_check_benchmark_regression.py     | 24 ++++++++++++++
 7 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index f204306..0712397 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@ coverage.xml
 .hypothesis/
 benchmark-results.json
 benchmarks/_raw.json
+benchmarks/_ci/
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index e8a35fa..959c82d 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,29 +1,29 @@
 {
-  "_note": "Gated means seeded locally (Windows, 1.5× slack) — refresh from ubuntu-latest CI benchmark-results.json artifact before merge. Excluded from gate: test_list_workspace_projects_nocache[composers-10], test_search_full_corpus.",
-  "updated": "2026-06-25T20:34:07Z",
-  "machine": "Windows",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Excluded from gate (recorded for reference): test_list_workspace_projects_nocache[composers-10], test_search_full_corpus. Refresh after intentional speedups via reduce_baselines.py.",
+  "updated": "2026-06-25T21:14:16Z",
+  "machine": "Linux",
   "groups": {
     "parse": {
-      "test_list_workspace_projects_nocache[composers-10]": 0.01313006085768828,
-      "test_list_workspace_projects_nocache[composers-50]": 0.04705098008271307,
-      "test_list_workspace_projects_nocache[composers-200]": 0.19944224995560944
+      "test_list_workspace_projects_nocache[composers-10]": 0.01702312019643009,
+      "test_list_workspace_projects_nocache[composers-50]": 0.07538331990000699,
+      "test_list_workspace_projects_nocache[composers-200]": 0.251991555999993
     },
     "export": {
-      "test_post_export_zip[composers-10]": 0.0170322916819714,
-      "test_post_export_zip[composers-50]": 0.040990050032269215
+      "test_post_export_zip[composers-10]": 0.0112034034344294,
+      "test_post_export_zip[composers-50]": 0.04482855966665985
     },
     "search": {
-      "test_search_full_corpus": 0.057670830062124874
+      "test_search_full_corpus": 0.047164217833331655
     },
     "summary-cache": {
-      "test_summary_cache_lookup[hit]": 0.00014543285277406022,
-      "test_summary_cache_lookup[miss]": 0.0001437347241805802,
-      "test_fingerprint_workspace_entries[10]": 0.001866654586096193,
-      "test_fingerprint_workspace_entries[50]": 0.00636450619807407,
-      "test_fingerprint_workspace_entries[200]": 0.020523441289855247,
-      "test_summary_cache_round_trip": 0.0019650292328056915,
-      "test_tab_summary_cache_lookup[hit]": 0.00015344636292124477,
-      "test_tab_summary_cache_lookup[miss]": 0.00012440098537902896
+      "test_summary_cache_lookup[hit]": 9.224067718099102e-05,
+      "test_summary_cache_lookup[miss]": 9.128770315496628e-05,
+      "test_fingerprint_workspace_entries[10]": 0.0024789120309553535,
+      "test_fingerprint_workspace_entries[50]": 0.010901568931818675,
+      "test_fingerprint_workspace_entries[200]": 0.03069810573000666,
+      "test_summary_cache_round_trip": 0.0004966099535917549,
+      "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405,
+      "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601
     }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index b2d8d53..394d047 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -179,6 +179,12 @@ def main(argv: list[str] | None = None) -> int:
         help="fail when current mean is below this fraction of baseline (default: 0.50)",
     )
     args = parser.parse_args(argv)
+    if args.threshold <= 1:
+        print("ERROR: --threshold must be greater than 1", file=sys.stderr)
+        return 2
+    if not 0 < args.stale_floor < 1:
+        print("ERROR: --stale-floor must be between 0 and 1 (exclusive)", file=sys.stderr)
+        return 2
     try:
         return check_regression(
             args.results_path,
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index 76d2560..82e6562 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -33,6 +33,7 @@ def reduce_baselines(
     out_path: str | Path,
     *,
     slack: float = 1.0,
+    source: str = "local",
 ) -> dict[str, object]:
     path = Path(raw_path)
     try:
@@ -70,9 +71,14 @@ def reduce_baselines(
     slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
+    source_labels = {
+        "ubuntu-latest-ci": "ubuntu-latest CI benchmark-results.json",
+        "local": "local benchmark-results.json",
+    }
+    source_label = source_labels.get(source, source)
     output: dict[str, object] = {
         "_note": (
-            "Gated means from ubuntu-latest CI benchmark-results.json."
+            f"Gated means from {source_label}."
             f"{slack_note} "
             f"Excluded from gate (recorded for reference): {excluded}. "
             "Refresh after intentional speedups via reduce_baselines.py."
@@ -99,9 +105,14 @@ def main(argv: list[str] | None = None) -> int:
         default=1.0,
         help="multiply means by this factor (must be > 0)",
     )
+    parser.add_argument(
+        "--source",
+        default="local",
+        help="provenance label for _note (e.g. ubuntu-latest-ci, local)",
+    )
     args = parser.parse_args(argv)
     try:
-        reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+        reduce_baselines(args.raw_path, args.out_path, slack=args.slack, source=args.source)
     except BenchmarkDataError as exc:
         print(f"ERROR: {exc}", file=sys.stderr)
         return 2
diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py
index d3814d9..9f23872 100644
--- a/tests/benchmarks/test_parse_bench.py
+++ b/tests/benchmarks/test_parse_bench.py
@@ -23,5 +23,6 @@ def test_list_workspace_projects_nocache(
     def _run() -> object:
         return list_workspace_projects(workspace_path, [], nocache=True)
 
-    projects, _warnings = benchmark(_run)
+    projects, warnings = benchmark(_run)
     assert isinstance(projects, list) and len(projects) > 0
+    assert warnings == []
diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py
index 70f127b..c9b1608 100644
--- a/tests/benchmarks/test_summary_cache_bench.py
+++ b/tests/benchmarks/test_summary_cache_bench.py
@@ -29,7 +29,14 @@ def test_summary_cache_lookup(
     """Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild."""
     set_cached_projects(workspace_fingerprint, sample_projects, [])
     lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
-    benchmark(get_cached_projects, lookup_fp)
+    result = benchmark(get_cached_projects, lookup_fp)
+    if mode == "hit":
+        assert result is not None
+        projects, warnings = result
+        assert projects == sample_projects
+        assert warnings == []
+    else:
+        assert result is None
 
 
 @pytest.mark.benchmark(group="summary-cache")
@@ -82,4 +89,11 @@ def test_tab_summary_cache_lookup(
     payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]}
     set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200)
     lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
-    benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
+    result = benchmark(get_cached_tab_summaries, lookup_fp, workspace_id)
+    if mode == "hit":
+        assert result is not None
+        cached_payload, status = result
+        assert status == 200
+        assert cached_payload == payload
+    else:
+        assert result is None
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index e15118f..38ddcae 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -251,3 +251,27 @@ def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[st
     out = capsys.readouterr().out
     assert "REGRESSION" not in out
     assert "STALE" not in out
+
+
+def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
+
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
+
+    assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2
+    assert "--threshold must be greater than 1" in capsys.readouterr().err
+
+
+def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
+
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
+
+    assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2
+    assert "--stale-floor must be between 0 and 1" in capsys.readouterr().err

From 87a747cb4e7a6eb281c1554056827f3afee853f2 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 05:49:49 +0800
Subject: [PATCH 3/7] fix(bench): gate all baseline benchmarks and validate
 finite ratios

---
 .github/workflows/tests.yml                  |   2 +-
 Makefile                                     |   4 +-
 benchmarks/README.md                         |   2 +-
 benchmarks/baselines.json                    |   4 +-
 scripts/check_benchmark_regression.py        |  28 ++--
 scripts/reduce_baselines.py                  |  20 ++-
 tests/benchmarks/conftest.py                 |  47 ++++---
 tests/benchmarks/constants.py                |   3 +
 tests/benchmarks/test_search_bench.py        |   2 +-
 tests/benchmarks/test_summary_cache_bench.py |   5 +
 tests/test_check_benchmark_regression.py     |  53 +++++---
 tests/test_reduce_baselines.py               | 132 +++++++++++++++++++
 12 files changed, 239 insertions(+), 63 deletions(-)
 create mode 100644 tests/benchmarks/constants.py
 create mode 100644 tests/test_reduce_baselines.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3562801..3a34c3d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -115,7 +115,7 @@ jobs:
         # exercise Flask routes via app.test_client(). Only listed files — not
         # `pytest tests/` — to avoid re-collecting unittest.TestCase classes above.
         # -o addopts= avoids inheriting benchmark-only options from pyproject.toml.
-        run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py -v --tb=short -o addopts=
+        run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py tests/test_reduce_baselines.py -v --tb=short -o addopts=
 
       # ── PyInstaller desktop build (Windows only, once per workflow) ────────
       # Closes #44. Builds the onedir bundle and smoke-tests --help so the
diff --git a/Makefile b/Makefile
index 2a27405..a1f7607 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@
 seed-baselines-local:
 	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
 	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
-	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5
+	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 --source local
 
 # Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
 update-baselines: seed-baselines-local
@@ -16,4 +16,4 @@ check-benchmarks:
 	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
 
 clean-benchmark-artifacts:
-	rm -f benchmarks/_raw.json benchmark-results.json
+	python -c "import pathlib; [p.unlink(missing_ok=True) for p in (pathlib.Path('benchmarks/_raw.json'), pathlib.Path('benchmark-results.json'))]"
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0bd1725..0f4e23a 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -31,7 +31,7 @@ The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suit
 - **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups)
 - **Fail** when a gated baseline name has no current result
 - **Warn** for benchmarks without a baseline entry
-- **Skip gate** for `EXCLUDED_FROM_GATE` names (smallest parse corpus, full-corpus search — sub-ms CI noise)
+- All benchmarks listed in `baselines.json` are gated (no exclusion list)
 
 Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`.
 
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 959c82d..9302877 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,6 +1,6 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Excluded from gate (recorded for reference): test_list_workspace_projects_nocache[composers-10], test_search_full_corpus. Refresh after intentional speedups via reduce_baselines.py.",
-  "updated": "2026-06-25T21:14:16Z",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Refresh after intentional speedups via reduce_baselines.py.",
+  "updated": "2026-06-25T21:48:35Z",
   "machine": "Linux",
   "groups": {
     "parse": {
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 394d047..e30bee1 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -4,19 +4,15 @@
 
 import argparse
 import json
+import math
 import sys
 from pathlib import Path
 
 THRESHOLD = 1.20
 STALE_FLOOR = 0.50
 
-# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI.
-EXCLUDED_FROM_GATE = frozenset(
-    {
-        "test_list_workspace_projects_nocache[composers-10]",
-        "test_search_full_corpus",
-    }
-)
+# Benchmarks gated via baselines.json; empty set means all baseline entries are checked.
+EXCLUDED_FROM_GATE: frozenset[str] = frozenset()
 
 
 class BenchmarkDataError(ValueError):
@@ -106,6 +102,17 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
     return means
 
 
+def _validate_gate_ratios(threshold: float, stale_floor: float) -> None:
+    if not math.isfinite(threshold):
+        raise BenchmarkDataError("threshold must be finite")
+    if threshold <= 1:
+        raise BenchmarkDataError("threshold must be greater than 1")
+    if not math.isfinite(stale_floor):
+        raise BenchmarkDataError("stale_floor must be finite")
+    if not 0 < stale_floor < 1:
+        raise BenchmarkDataError("stale_floor must be between 0 and 1 (exclusive)")
+
+
 def check_regression(
     results_path: str | Path,
     baselines_path: str | Path,
@@ -114,6 +121,7 @@ def check_regression(
     stale_floor: float = STALE_FLOOR,
 ) -> int:
     """Return 0 when within threshold; 1 when any gated benchmark regresses or is stale."""
+    _validate_gate_ratios(threshold, stale_floor)
     flat = load_results(results_path)
     baseline_means = load_baseline_means(baselines_path)
 
@@ -179,12 +187,6 @@ def main(argv: list[str] | None = None) -> int:
         help="fail when current mean is below this fraction of baseline (default: 0.50)",
     )
     args = parser.parse_args(argv)
-    if args.threshold <= 1:
-        print("ERROR: --threshold must be greater than 1", file=sys.stderr)
-        return 2
-    if not 0 < args.stale_floor < 1:
-        print("ERROR: --stale-floor must be between 0 and 1 (exclusive)", file=sys.stderr)
-        return 2
     try:
         return check_regression(
             args.results_path,
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index 82e6562..6264018 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -4,6 +4,7 @@
 
 import argparse
 import json
+import math
 import sys
 from datetime import UTC, datetime
 from pathlib import Path
@@ -23,6 +24,8 @@
 
 def _positive_float(value: str) -> float:
     parsed = float(value)
+    if not math.isfinite(parsed):
+        raise argparse.ArgumentTypeError("slack must be a finite number")
     if parsed <= 0:
         raise argparse.ArgumentTypeError("slack must be greater than zero")
     return parsed
@@ -63,11 +66,23 @@ def reduce_baselines(
             ) from exc
         bench_name = normalize_benchmark_name(str(raw_name))
         group = entry.get("group")
+        if group is None:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] ({bench_name!r}) missing required 'group'"
+            )
         if group not in GATED_GROUPS:
-            continue
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] ({bench_name!r}) has unknown group {group!r}; "
+                f"expected one of {GATED_GROUPS}"
+            )
         groups[group][bench_name] = mean * slack
 
     excluded = ", ".join(sorted(EXCLUDED_FROM_GATE))
+    excluded_note = (
+        f" Excluded from gate (recorded for reference): {excluded}."
+        if excluded
+        else ""
+    )
     slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
@@ -79,8 +94,7 @@ def reduce_baselines(
     output: dict[str, object] = {
         "_note": (
             f"Gated means from {source_label}."
-            f"{slack_note} "
-            f"Excluded from gate (recorded for reference): {excluded}. "
+            f"{slack_note}{excluded_note} "
             "Refresh after intentional speedups via reduce_baselines.py."
         ),
         "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
index ed16f09..1cbf3b6 100644
--- a/tests/benchmarks/conftest.py
+++ b/tests/benchmarks/conftest.py
@@ -14,8 +14,7 @@
 from app import create_app
 from services import summary_cache
 from services.summary_cache import fingerprint_workspace_storage
-
-BENCH_SEARCH_TERM = "bench-search-token"
+from tests.benchmarks.constants import BENCH_SEARCH_TERM
 
 
 def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]:
@@ -117,6 +116,26 @@ def build_bench_storage(root: Path, composer_count: int) -> dict[str, str]:
     }
 
 
+def _make_bench_flask_client(
+    storage: dict[str, str],
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+    *,
+    state_subdir: str = ".cursor-chat-browser",
+) -> FlaskClient:
+    """Flask test client with env + export state patched for synthetic storage."""
+    monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"])
+    monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"])
+    monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1")
+    state_dir = tmp_path / state_subdir
+    state_dir.mkdir()
+    monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir))
+    app = create_app()
+    app.config["TESTING"] = True
+    app.config["EXCLUSION_RULES"] = []
+    return app.test_client()
+
+
 @pytest.fixture
 def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
     """Redirect summary-cache files to an isolated temp directory."""
@@ -193,13 +212,7 @@ def bench_env(
 @pytest.fixture
 def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient:
     """Flask test client bound to synthetic bench storage."""
-    state_dir = tmp_path / ".cursor-chat-browser"
-    state_dir.mkdir()
-    monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir))
-    app = create_app()
-    app.config["TESTING"] = True
-    app.config["EXCLUSION_RULES"] = []
-    return app.test_client()
+    return _make_bench_flask_client(bench_env, tmp_path, monkeypatch)
 
 
 @pytest.fixture
@@ -209,13 +222,9 @@ def bench_client_search_corpus(
 ) -> FlaskClient:
     """Flask client over a fixed 50-composer corpus for search benchmarks."""
     storage = build_bench_storage(tmp_path / "search_storage", 50)
-    monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"])
-    monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"])
-    monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1")
-    state_dir = tmp_path / ".cursor-chat-browser-search"
-    state_dir.mkdir()
-    monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir))
-    app = create_app()
-    app.config["TESTING"] = True
-    app.config["EXCLUSION_RULES"] = []
-    return app.test_client()
+    return _make_bench_flask_client(
+        storage,
+        tmp_path,
+        monkeypatch,
+        state_subdir=".cursor-chat-browser-search",
+    )
diff --git a/tests/benchmarks/constants.py b/tests/benchmarks/constants.py
new file mode 100644
index 0000000..ab682d3
--- /dev/null
+++ b/tests/benchmarks/constants.py
@@ -0,0 +1,3 @@
+"""Shared constants for benchmark corpora (importable outside conftest)."""
+
+BENCH_SEARCH_TERM = "bench-search-token"
diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py
index 1606f24..e7df914 100644
--- a/tests/benchmarks/test_search_bench.py
+++ b/tests/benchmarks/test_search_bench.py
@@ -5,7 +5,7 @@
 import pytest
 from flask.testing import FlaskClient
 
-from tests.benchmarks.conftest import BENCH_SEARCH_TERM
+from tests.benchmarks.constants import BENCH_SEARCH_TERM
 
 
 @pytest.mark.benchmark(group="search")
diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py
index c9b1608..4a259c8 100644
--- a/tests/benchmarks/test_summary_cache_bench.py
+++ b/tests/benchmarks/test_summary_cache_bench.py
@@ -74,6 +74,11 @@ def _run() -> None:
         get_cached_projects(fp)
 
     benchmark(_run)
+    cached = get_cached_projects(fp)
+    assert cached is not None
+    cached_projects, cached_warnings = cached
+    assert cached_projects == projects
+    assert cached_warnings == []
 
 
 @pytest.mark.benchmark(group="summary-cache")
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index 38ddcae..873d68a 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -232,28 +232,19 @@ def test_stale_baseline_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> N
     assert "STALE" in out
 
 
-def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
-    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE
+def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
 
-    excluded = next(iter(EXCLUDED_FROM_GATE))
     results = tmp_path / "results.json"
     baselines = tmp_path / "baselines.json"
-    _write_results(
-        results,
-        [{"name": excluded, "stats": {"mean": 1.0}}],
-    )
-    _write_baselines(
-        baselines,
-        {"search": {excluded: 0.0001}},
-    )
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
 
-    assert check_regression(results, baselines) == 0
-    out = capsys.readouterr().out
-    assert "REGRESSION" not in out
-    assert "STALE" not in out
+    assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2
+    assert "threshold must be greater than 1" in capsys.readouterr().err
 
 
-def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
     from scripts.check_benchmark_regression import main
 
     results = tmp_path / "results.json"
@@ -261,11 +252,31 @@ def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[
     _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
     _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
 
-    assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2
-    assert "--threshold must be greater than 1" in capsys.readouterr().err
+    assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2
+    assert "stale_floor must be between 0 and 1" in capsys.readouterr().err
 
 
-def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+def test_check_regression_rejects_invalid_threshold(tmp_path) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
+
+    with pytest.raises(BenchmarkDataError, match="threshold must be greater than 1"):
+        check_regression(results, baselines, threshold=1.0)
+
+
+def test_check_regression_rejects_non_finite_threshold(tmp_path) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
+    _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
+
+    with pytest.raises(BenchmarkDataError, match="threshold must be finite"):
+        check_regression(results, baselines, threshold=float("nan"))
+
+
+def test_main_rejects_non_finite_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
     from scripts.check_benchmark_regression import main
 
     results = tmp_path / "results.json"
@@ -273,5 +284,5 @@ def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixtur
     _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}])
     _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}})
 
-    assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2
-    assert "--stale-floor must be between 0 and 1" in capsys.readouterr().err
+    assert main([str(results), str(baselines), "--threshold", "inf"]) == 2
+    assert "threshold must be finite" in capsys.readouterr().err
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
new file mode 100644
index 0000000..6f5ab89
--- /dev/null
+++ b/tests/test_reduce_baselines.py
@@ -0,0 +1,132 @@
+"""Tests for scripts/reduce_baselines.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.reduce_baselines import reduce_baselines
+from scripts.check_benchmark_regression import BenchmarkDataError
+
+
+def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None:
+    path.write_text(
+        json.dumps(
+            {
+                "machine_info": {"system": machine},
+                "benchmarks": benchmarks,
+            },
+            indent=2,
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_reduce_baselines_groups_and_slack(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {
+                "name": "test_list_workspace_projects_nocache[composers-50]",
+                "group": "parse",
+                "stats": {"mean": 0.05},
+            },
+            {
+                "name": "test_post_export_zip[composers-10]",
+                "group": "export",
+                "stats": {"mean": 0.01},
+            },
+            {
+                "name": "test_search_full_corpus",
+                "group": "search",
+                "stats": {"mean": 0.04},
+            },
+            {
+                "name": "test_summary_cache_lookup[hit]",
+                "group": "summary-cache",
+                "stats": {"mean": 0.0001},
+            },
+        ],
+    )
+
+    output = reduce_baselines(raw, out, slack=1.5, source="ubuntu-latest-ci")
+    data = json.loads(out.read_text(encoding="utf-8"))
+    groups = data["groups"]
+
+    assert groups["parse"]["test_list_workspace_projects_nocache[composers-50]"] == pytest.approx(0.075)
+    assert groups["export"]["test_post_export_zip[composers-10]"] == pytest.approx(0.015)
+    assert groups["search"]["test_search_full_corpus"] == pytest.approx(0.06)
+    assert data["machine"] == "Linux"
+    assert "ubuntu-latest CI benchmark-results.json" in data["_note"]
+    assert "1.5× slack" in data["_note"]
+    assert output["groups"] == groups
+
+
+def test_reduce_baselines_local_source_note(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {
+                "name": "test_summary_cache_lookup[hit]",
+                "group": "summary-cache",
+                "stats": {"mean": 0.0001},
+            },
+        ],
+        machine="Windows",
+    )
+
+    reduce_baselines(raw, out, source="local")
+    data = json.loads(out.read_text(encoding="utf-8"))
+    assert "local benchmark-results.json" in data["_note"]
+    assert data["machine"] == "Windows"
+
+
+def test_reduce_baselines_rejects_unknown_group(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {
+                "name": "test_cache_only",
+                "group": "cache",
+                "stats": {"mean": 0.001},
+            },
+        ],
+    )
+
+    with pytest.raises(BenchmarkDataError, match="unknown group 'cache'"):
+        reduce_baselines(raw, out)
+
+
+def test_reduce_baselines_rejects_missing_group(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {
+                "name": "test_no_group",
+                "stats": {"mean": 0.001},
+            },
+        ],
+    )
+
+    with pytest.raises(BenchmarkDataError, match="missing required 'group'"):
+        reduce_baselines(raw, out)
+
+
+def test_positive_float_rejects_non_finite() -> None:
+    import argparse
+
+    from scripts.reduce_baselines import _positive_float
+
+    with pytest.raises(argparse.ArgumentTypeError, match="finite"):
+        _positive_float("nan")
+    with pytest.raises(argparse.ArgumentTypeError, match="finite"):
+        _positive_float("inf")

From 29ad5a7c475e24126b7cbb6948730bf64c3fe451 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 05:57:11 +0800
Subject: [PATCH 4/7] fix(bench): harden reduce_baselines and fix Python 3.10
 CI

---
 benchmarks/baselines.json      |  2 +-
 scripts/reduce_baselines.py    | 11 ++++++++---
 tests/test_reduce_baselines.py | 26 +++++++++++++++++++++++++-
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 9302877..3a4d413 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,5 +1,5 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Refresh after intentional speedups via reduce_baselines.py.",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.",
   "updated": "2026-06-25T21:48:35Z",
   "machine": "Linux",
   "groups": {
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index 6264018..78bfbd1 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -6,7 +6,7 @@
 import json
 import math
 import sys
-from datetime import UTC, datetime
+from datetime import datetime, timezone
 from pathlib import Path
 
 _REPO_ROOT = Path(__file__).resolve().parent.parent
@@ -75,6 +75,11 @@ def reduce_baselines(
                 f"{path} benchmarks[{index}] ({bench_name!r}) has unknown group {group!r}; "
                 f"expected one of {GATED_GROUPS}"
             )
+        if bench_name in groups[group]:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] ({raw_name!r}) duplicates normalized "
+                f"benchmark {group!r}/{bench_name!r}"
+            )
         groups[group][bench_name] = mean * slack
 
     excluded = ", ".join(sorted(EXCLUDED_FROM_GATE))
@@ -83,7 +88,7 @@ def reduce_baselines(
         if excluded
         else ""
     )
-    slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else ""
+    slack_note = f" Values multiplied by {slack}x slack at generation time." if slack != 1.0 else ""
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
     source_labels = {
@@ -97,7 +102,7 @@ def reduce_baselines(
             f"{slack_note}{excluded_note} "
             "Refresh after intentional speedups via reduce_baselines.py."
         ),
-        "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "updated": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
         "machine": machine,
         "groups": groups,
     }
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
index 6f5ab89..9cc24e4 100644
--- a/tests/test_reduce_baselines.py
+++ b/tests/test_reduce_baselines.py
@@ -59,9 +59,10 @@ def test_reduce_baselines_groups_and_slack(tmp_path) -> None:
     assert groups["parse"]["test_list_workspace_projects_nocache[composers-50]"] == pytest.approx(0.075)
     assert groups["export"]["test_post_export_zip[composers-10]"] == pytest.approx(0.015)
     assert groups["search"]["test_search_full_corpus"] == pytest.approx(0.06)
+    assert groups["summary-cache"]["test_summary_cache_lookup[hit]"] == pytest.approx(0.00015)
     assert data["machine"] == "Linux"
     assert "ubuntu-latest CI benchmark-results.json" in data["_note"]
-    assert "1.5× slack" in data["_note"]
+    assert "1.5x slack" in data["_note"]
     assert output["groups"] == groups
 
 
@@ -121,6 +122,29 @@ def test_reduce_baselines_rejects_missing_group(tmp_path) -> None:
         reduce_baselines(raw, out)
 
 
+def test_reduce_baselines_rejects_duplicate_normalized_name(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {
+                "name": "test_summary_cache_lookup[hit]",
+                "group": "summary-cache",
+                "stats": {"mean": 0.0001},
+            },
+            {
+                "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]",
+                "group": "summary-cache",
+                "stats": {"mean": 0.0002},
+            },
+        ],
+    )
+
+    with pytest.raises(BenchmarkDataError, match="duplicates normalized"):
+        reduce_baselines(raw, out)
+
+
 def test_positive_float_rejects_non_finite() -> None:
     import argparse
 

From c5c2066c775dacc87b11f7615270d6203cc3857b Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 07:11:19 +0800
Subject: [PATCH 5/7] fix(bench): address bradjin8 review on search, cache, and
 baselines

---
 .gitignore                                   |  1 +
 Makefile                                     |  4 +-
 benchmarks/README.md                         | 18 ++++--
 benchmarks/baselines.json                    |  9 ++-
 tests/benchmarks/conftest.py                 | 60 ++++++++++++++++++--
 tests/benchmarks/test_search_bench.py        | 40 ++++++++++---
 tests/benchmarks/test_summary_cache_bench.py | 22 +++++++
 7 files changed, 131 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0712397..0f8d574 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,4 +46,5 @@ coverage.xml
 .hypothesis/
 benchmark-results.json
 benchmarks/_raw.json
+benchmarks/_merged.json
 benchmarks/_ci/
diff --git a/Makefile b/Makefile
index a1f7607..599d5a1 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,9 @@
 seed-baselines-local:
 	@echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2
 	python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
-	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 --source local
+	python -c "import os, subprocess, sys; \
+	  cmd = [sys.executable, 'scripts/reduce_baselines.py', 'benchmarks/_raw.json', 'benchmarks/baselines.json', '--slack', '1.5', '--source', 'local']; \
+	  (subprocess.run(cmd, check=True), print('Updated benchmarks/baselines.json', file=sys.stderr)) if os.environ.get('FORCE') == '1' else print('Wrote benchmarks/_raw.json only. Set FORCE=1 to overwrite benchmarks/baselines.json.', file=sys.stderr)"
 
 # Deprecated alias — kept for muscle memory; see seed-baselines-local warning above.
 update-baselines: seed-baselines-local
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0f4e23a..79fce2a 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -17,12 +17,16 @@ pytest tests/benchmarks/ --benchmark-only -o addopts= -v
 | Group | What |
 |-------|------|
 | parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers |
-| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora |
-| search | `GET /api/search` over a 50-composer synthetic corpus |
-| summary-cache | cache lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup |
+| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora (capped at 50 for CI runtime; parse goes to 200) |
+| search | `GET /api/search` over a 50-composer corpus — **live-scan** (`test_search_full_corpus_live_scan`, `CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1`) and **FTS index** (`test_search_full_corpus_indexed`, pre-built index) |
+| summary-cache | projects lookup (hit/miss), composer-map lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup |
 
 Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency.
 
+### Adding a benchmark group
+
+Every `@pytest.mark.benchmark(group="...")` name must appear in `GATED_GROUPS` inside `scripts/reduce_baselines.py`. Otherwise `reduce_baselines.py` fails at refresh time with an unknown-group error. Update both the test marker and `GATED_GROUPS` when introducing a new group.
+
 ## CI gate
 
 The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`.
@@ -35,18 +39,22 @@ The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suit
 
 Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`.
 
+Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, raise `--slack` for those entries or reintroduce targeted exclusions in `EXCLUDED_FROM_GATE`.
+
 ## Refresh baselines
 
 After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible:
 
 ```bash
-python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5
+python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 --source ubuntu-latest-ci
 ```
 
 For a quick local snapshot only (may not match CI timings):
 
 ```bash
 make seed-baselines-local
+# writes benchmarks/_raw.json only; does not overwrite benchmarks/baselines.json
+make seed-baselines-local FORCE=1   # also runs reduce_baselines into benchmarks/baselines.json
 ```
 
 `make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew.
@@ -56,5 +64,5 @@ make seed-baselines-local
 | Target | Purpose |
 |--------|---------|
 | `make check-benchmarks` | Run suite + regression gate locally |
-| `make seed-baselines-local` | Capture local timings into `benchmarks/baselines.json` (with slack) |
+| `make seed-baselines-local` | Capture local timings to `benchmarks/_raw.json` (use `FORCE=1` to update `baselines.json`) |
 | `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` |
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 3a4d413..9afa18d 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,6 +1,6 @@
 {
   "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.",
-  "updated": "2026-06-25T21:48:35Z",
+  "updated": "2026-06-25T22:57:33Z",
   "machine": "Linux",
   "groups": {
     "parse": {
@@ -13,7 +13,8 @@
       "test_post_export_zip[composers-50]": 0.04482855966665985
     },
     "search": {
-      "test_search_full_corpus": 0.047164217833331655
+      "test_search_full_corpus_live_scan": 0.047164217833331655,
+      "test_search_full_corpus_indexed": 0.05494209932945618
     },
     "summary-cache": {
       "test_summary_cache_lookup[hit]": 9.224067718099102e-05,
@@ -23,7 +24,9 @@
       "test_fingerprint_workspace_entries[200]": 0.03069810573000666,
       "test_summary_cache_round_trip": 0.0004966099535917549,
       "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405,
-      "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601
+      "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601,
+      "test_composer_map_cache_lookup[hit]": 8.074544668606364e-05,
+      "test_composer_map_cache_lookup[miss]": 9.495690246481993e-05
     }
   }
 }
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
index 1cbf3b6..cfc133f 100644
--- a/tests/benchmarks/conftest.py
+++ b/tests/benchmarks/conftest.py
@@ -122,11 +122,21 @@ def _make_bench_flask_client(
     monkeypatch: pytest.MonkeyPatch,
     *,
     state_subdir: str = ".cursor-chat-browser",
+    live_scan_search: bool = False,
 ) -> FlaskClient:
-    """Flask test client with env + export state patched for synthetic storage."""
+    """Flask test client with env + export state patched for synthetic storage.
+
+    When *live_scan_search* is True, set ``CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1`` so
+    ``/api/search`` measures the live-scan fallback. Otherwise the FTS index path
+    from #113 may be used when an index is built (see indexed search fixtures).
+    """
     monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"])
     monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"])
-    monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1")
+    if live_scan_search:
+        monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1")
+    else:
+        monkeypatch.delenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", raising=False)
+        monkeypatch.delenv("CURSOR_CHAT_BROWSER_NOCACHE", raising=False)
     state_dir = tmp_path / state_subdir
     state_dir.mkdir()
     monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir))
@@ -138,7 +148,12 @@ def _make_bench_flask_client(
 
 @pytest.fixture
 def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
-    """Redirect summary-cache files to an isolated temp directory."""
+    """Redirect summary-cache files to an isolated temp directory.
+
+    Tab-summary files use ``CACHE_DIR`` + hashed filenames only (see
+    ``summary_cache._tab_summaries_path``); they do not use
+    ``PROJECTS_CACHE_FILE`` or ``COMPOSER_MAP_CACHE_FILE``.
+    """
     cache_dir = tmp_path / "cache"
     cache_dir.mkdir()
     monkeypatch.setattr(summary_cache, "CACHE_DIR", cache_dir)
@@ -212,7 +227,7 @@ def bench_env(
 @pytest.fixture
 def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient:
     """Flask test client bound to synthetic bench storage."""
-    return _make_bench_flask_client(bench_env, tmp_path, monkeypatch)
+    return _make_bench_flask_client(bench_env, tmp_path, monkeypatch, live_scan_search=True)
 
 
 @pytest.fixture
@@ -220,11 +235,46 @@ def bench_client_search_corpus(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,
 ) -> FlaskClient:
-    """Flask client over a fixed 50-composer corpus for search benchmarks."""
+    """Flask client over a fixed 50-composer corpus (live-scan search path)."""
     storage = build_bench_storage(tmp_path / "search_storage", 50)
     return _make_bench_flask_client(
         storage,
         tmp_path,
         monkeypatch,
         state_subdir=".cursor-chat-browser-search",
+        live_scan_search=True,
+    )
+
+
+@pytest.fixture
+def bench_client_search_corpus_indexed(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> FlaskClient:
+    """Flask client with FTS index built for the 50-composer search corpus."""
+    from services.search_index import build_search_index
+
+    monkeypatch.delenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", raising=False)
+    monkeypatch.delenv("CURSOR_CHAT_BROWSER_NOCACHE", raising=False)
+
+    storage = build_bench_storage(tmp_path / "search_indexed_storage", 50)
+    cache_dir = tmp_path / "search_index_cache"
+    cache_dir.mkdir()
+    monkeypatch.setattr("services.search_index.CACHE_DIR", cache_dir)
+    monkeypatch.setattr(
+        "services.search_index.SEARCH_INDEX_POINTER_FILE",
+        cache_dir / "search_index.active",
+    )
+    monkeypatch.setattr(
+        "services.search_index.SEARCH_INDEX_FILE",
+        cache_dir / "search_index.sqlite",
+    )
+    built = build_search_index(storage["workspace_path"], [], force=True)
+    assert built is True
+    return _make_bench_flask_client(
+        storage,
+        tmp_path,
+        monkeypatch,
+        state_subdir=".cursor-chat-browser-search-indexed",
+        live_scan_search=False,
     )
diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py
index e7df914..33eee5b 100644
--- a/tests/benchmarks/test_search_bench.py
+++ b/tests/benchmarks/test_search_bench.py
@@ -8,19 +8,41 @@
 from tests.benchmarks.constants import BENCH_SEARCH_TERM
 
 
+def _search_url() -> str:
+    return f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1"
+
+
+def _assert_search_response(response: object) -> None:
+    assert response.status_code == 200  # type: ignore[attr-defined]
+    body = response.get_json()  # type: ignore[attr-defined]
+    assert isinstance(body, dict)
+    results = body.get("results")
+    assert isinstance(results, list) and len(results) > 0
+
+
 @pytest.mark.benchmark(group="search")
-def test_search_full_corpus(
+def test_search_full_corpus_live_scan(
     benchmark,
     bench_client_search_corpus: FlaskClient,
 ) -> None:
+    """Live-scan fallback only (``CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1``)."""
+
     def _run() -> object:
-        return bench_client_search_corpus.get(
-            f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1",
-        )
+        return bench_client_search_corpus.get(_search_url())
 
     response = benchmark(_run)
-    assert response.status_code == 200
-    body = response.get_json()
-    assert isinstance(body, dict)
-    results = body.get("results")
-    assert isinstance(results, list) and len(results) > 0
+    _assert_search_response(response)
+
+
+@pytest.mark.benchmark(group="search")
+def test_search_full_corpus_indexed(
+    benchmark,
+    bench_client_search_corpus_indexed: FlaskClient,
+) -> None:
+    """FTS index path (#113) with pre-built ``search_index.sqlite``."""
+
+    def _run() -> object:
+        return bench_client_search_corpus_indexed.get(_search_url())
+
+    response = benchmark(_run)
+    _assert_search_response(response)
diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py
index 4a259c8..16552d2 100644
--- a/tests/benchmarks/test_summary_cache_bench.py
+++ b/tests/benchmarks/test_summary_cache_bench.py
@@ -9,8 +9,10 @@
 
 from services.summary_cache import (
     fingerprint_workspace_storage,
+    get_cached_composer_id_to_ws,
     get_cached_projects,
     get_cached_tab_summaries,
+    set_cached_composer_id_to_ws,
     set_cached_projects,
     set_cached_tab_summaries,
 )
@@ -39,6 +41,26 @@ def test_summary_cache_lookup(
         assert result is None
 
 
+@pytest.mark.benchmark(group="summary-cache")
+@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"])
+def test_composer_map_cache_lookup(
+    benchmark,
+    mode: Literal["hit", "miss"],
+    summary_cache_dir: Path,
+    workspace_fingerprint: dict[str, Any],
+    stale_fingerprint: dict[str, Any],
+) -> None:
+    """Time ``get_cached_composer_id_to_ws`` hit/miss (fingerprint mismatch on miss)."""
+    mapping = {"cmp_0000": "ws_0000"}
+    set_cached_composer_id_to_ws(workspace_fingerprint, mapping)
+    lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint
+    result = benchmark(get_cached_composer_id_to_ws, lookup_fp)
+    if mode == "hit":
+        assert result == mapping
+    else:
+        assert result is None
+
+
 @pytest.mark.benchmark(group="summary-cache")
 @pytest.mark.parametrize(
     "synthetic_workspace",

From 5a51d7005bf8bcf79230954902b096199d128675 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 07:26:29 +0800
Subject: [PATCH 6/7] chore(bench): refresh baselines from ubuntu CI run
 28206463463

---
 benchmarks/baselines.json | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 9afa18d..8472691 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,32 +1,32 @@
 {
   "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.",
-  "updated": "2026-06-25T22:57:33Z",
+  "updated": "2026-06-25T23:21:29Z",
   "machine": "Linux",
   "groups": {
     "parse": {
-      "test_list_workspace_projects_nocache[composers-10]": 0.01702312019643009,
-      "test_list_workspace_projects_nocache[composers-50]": 0.07538331990000699,
-      "test_list_workspace_projects_nocache[composers-200]": 0.251991555999993
+      "test_list_workspace_projects_nocache[composers-10]": 0.012957852608107466,
+      "test_list_workspace_projects_nocache[composers-50]": 0.05577718626923036,
+      "test_list_workspace_projects_nocache[composers-200]": 0.1878804727500003
     },
     "export": {
-      "test_post_export_zip[composers-10]": 0.0112034034344294,
-      "test_post_export_zip[composers-50]": 0.04482855966665985
+      "test_post_export_zip[composers-10]": 0.009724031427631593,
+      "test_post_export_zip[composers-50]": 0.041050375020001154
     },
     "search": {
-      "test_search_full_corpus_live_scan": 0.047164217833331655,
-      "test_search_full_corpus_indexed": 0.05494209932945618
+      "test_search_full_corpus_live_scan": 0.03440949781249936,
+      "test_search_full_corpus_indexed": 0.04301802726315884
     },
     "summary-cache": {
-      "test_summary_cache_lookup[hit]": 9.224067718099102e-05,
-      "test_summary_cache_lookup[miss]": 9.128770315496628e-05,
-      "test_fingerprint_workspace_entries[10]": 0.0024789120309553535,
-      "test_fingerprint_workspace_entries[50]": 0.010901568931818675,
-      "test_fingerprint_workspace_entries[200]": 0.03069810573000666,
-      "test_summary_cache_round_trip": 0.0004966099535917549,
-      "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405,
-      "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601,
-      "test_composer_map_cache_lookup[hit]": 8.074544668606364e-05,
-      "test_composer_map_cache_lookup[miss]": 9.495690246481993e-05
+      "test_summary_cache_lookup[hit]": 5.7807392057047035e-05,
+      "test_summary_cache_lookup[miss]": 5.6223937183791584e-05,
+      "test_composer_map_cache_lookup[hit]": 5.551344090189019e-05,
+      "test_composer_map_cache_lookup[miss]": 5.490079494266499e-05,
+      "test_fingerprint_workspace_entries[10]": 0.0019021180608754708,
+      "test_fingerprint_workspace_entries[50]": 0.008100319212766178,
+      "test_fingerprint_workspace_entries[200]": 0.0235079150476191,
+      "test_summary_cache_round_trip": 0.001704988098923577,
+      "test_tab_summary_cache_lookup[hit]": 6.058533512024974e-05,
+      "test_tab_summary_cache_lookup[miss]": 6.130049047937722e-05
     }
   }
 }

From 6fed47ed1a58aa29ba4816b1cc920fc5ba5e4100 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Fri, 26 Jun 2026 07:37:18 +0800
Subject: [PATCH 7/7] bench: exclude round_trip from gate; refresh baselines
 from latest CI

test_summary_cache_round_trip calls set/get each round; OS page-cache
state causes 3-5x variation between CI runs (0.000314s vs 0.001137s).
Add to EXCLUDED_FROM_GATE with comment; baseline kept for observation.

Regenerate baselines.json from run 28206913751 (ubuntu-latest, 1.5x
slack). Update README to document the exclusion and rationale.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 benchmarks/README.md                  |  6 +++--
 benchmarks/baselines.json             | 38 +++++++++++++--------------
 scripts/check_benchmark_regression.py | 13 +++++++--
 3 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 79fce2a..e2e0064 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -35,11 +35,13 @@ The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suit
 - **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups)
 - **Fail** when a gated baseline name has no current result
 - **Warn** for benchmarks without a baseline entry
-- All benchmarks listed in `baselines.json` are gated (no exclusion list)
+- All benchmarks listed in `baselines.json` are gated unless named in `EXCLUDED_FROM_GATE` in `scripts/check_benchmark_regression.py`
 
 Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`.
 
-Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, raise `--slack` for those entries or reintroduce targeted exclusions in `EXCLUDED_FROM_GATE`.
+Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, regenerate the baselines with a higher `--slack` (the multiplier applies to every entry) or add targeted exclusions in `EXCLUDED_FROM_GATE`.
+
+`test_summary_cache_round_trip` is intentionally excluded from the gate: it calls `set_cached_projects` (file write) + `get_cached_projects` (file read) each round, so OS page-cache state on shared runners causes 3–5x variation between consecutive CI runs. The baseline entry is kept for observation only.
 
 ## Refresh baselines
 
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 8472691..1f3a5c0 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,32 +1,32 @@
 {
-  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.",
-  "updated": "2026-06-25T23:21:29Z",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Excluded from gate (recorded for reference): test_summary_cache_round_trip. Refresh after intentional speedups via reduce_baselines.py.",
+  "updated": "2026-06-25T23:36:11Z",
   "machine": "Linux",
   "groups": {
     "parse": {
-      "test_list_workspace_projects_nocache[composers-10]": 0.012957852608107466,
-      "test_list_workspace_projects_nocache[composers-50]": 0.05577718626923036,
-      "test_list_workspace_projects_nocache[composers-200]": 0.1878804727500003
+      "test_list_workspace_projects_nocache[composers-10]": 0.016421750017237738,
+      "test_list_workspace_projects_nocache[composers-50]": 0.07185380692856874,
+      "test_list_workspace_projects_nocache[composers-200]": 0.2388664538571439
     },
     "export": {
-      "test_post_export_zip[composers-10]": 0.009724031427631593,
-      "test_post_export_zip[composers-50]": 0.041050375020001154
+      "test_post_export_zip[composers-10]": 0.010621589857140498,
+      "test_post_export_zip[composers-50]": 0.03968703356250458
     },
     "search": {
-      "test_search_full_corpus_live_scan": 0.03440949781249936,
-      "test_search_full_corpus_indexed": 0.04301802726315884
+      "test_search_full_corpus_live_scan": 0.04461661563157736,
+      "test_search_full_corpus_indexed": 0.05512249660713918
     },
     "summary-cache": {
-      "test_summary_cache_lookup[hit]": 5.7807392057047035e-05,
-      "test_summary_cache_lookup[miss]": 5.6223937183791584e-05,
-      "test_composer_map_cache_lookup[hit]": 5.551344090189019e-05,
-      "test_composer_map_cache_lookup[miss]": 5.490079494266499e-05,
-      "test_fingerprint_workspace_entries[10]": 0.0019021180608754708,
-      "test_fingerprint_workspace_entries[50]": 0.008100319212766178,
-      "test_fingerprint_workspace_entries[200]": 0.0235079150476191,
-      "test_summary_cache_round_trip": 0.001704988098923577,
-      "test_tab_summary_cache_lookup[hit]": 6.058533512024974e-05,
-      "test_tab_summary_cache_lookup[miss]": 6.130049047937722e-05
+      "test_summary_cache_lookup[hit]": 7.249851343825762e-05,
+      "test_summary_cache_lookup[miss]": 7.193702095574013e-05,
+      "test_composer_map_cache_lookup[hit]": 7.151645086519804e-05,
+      "test_composer_map_cache_lookup[miss]": 7.112598943352091e-05,
+      "test_fingerprint_workspace_entries[10]": 0.0024127972424549185,
+      "test_fingerprint_workspace_entries[50]": 0.010196820941858245,
+      "test_fingerprint_workspace_entries[200]": 0.029070524094341035,
+      "test_summary_cache_round_trip": 0.0004703680658560554,
+      "test_tab_summary_cache_lookup[hit]": 7.844850562859133e-05,
+      "test_tab_summary_cache_lookup[miss]": 7.843399021512e-05
     }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index e30bee1..6655460 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -11,8 +11,17 @@
 THRESHOLD = 1.20
 STALE_FLOOR = 0.50
 
-# Benchmarks gated via baselines.json; empty set means all baseline entries are checked.
-EXCLUDED_FROM_GATE: frozenset[str] = frozenset()
+# Benchmarks recorded in baselines.json but excluded from the regression gate.
+# Use sparingly — only for benches whose timing is inherently noisy across CI runs
+# (e.g. file I/O operations that depend on OS page-cache state).
+EXCLUDED_FROM_GATE: frozenset[str] = frozenset(
+    {
+        # round_trip calls set_cached_projects (file write) + get_cached_projects (file read)
+        # each round. OS page-cache state on shared runners causes 3–5x variation between
+        # consecutive CI runs, making this ungatable with any reasonable slack.
+        "test_summary_cache_round_trip",
+    }
+)
 
 
 class BenchmarkDataError(ValueError):