From 5eef89730aefca71d09a6bcc6f76b8ead187fc2b Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 04:45:48 +0800 Subject: [PATCH 1/7] ci: unified benchmark suite with full baselines and regression gate --- .github/workflows/tests.yml | 4 +- Makefile | 19 +++ benchmarks/README.md | 60 ++++++++ benchmarks/baselines.json | 32 ++-- scripts/check_benchmark_regression.py | 40 ++++- scripts/reduce_baselines.py | 112 ++++++++++++++ tests/benchmarks/conftest.py | 146 ++++++++++++++++++- tests/benchmarks/test_export_bench.py | 30 ++++ tests/benchmarks/test_parse_bench.py | 27 ++++ tests/benchmarks/test_search_bench.py | 26 ++++ tests/benchmarks/test_summary_cache_bench.py | 45 +++--- tests/test_check_benchmark_regression.py | 50 ++++++- 12 files changed, 544 insertions(+), 47 deletions(-) create mode 100644 Makefile create mode 100644 benchmarks/README.md create mode 100644 scripts/reduce_baselines.py create mode 100644 tests/benchmarks/test_export_bench.py create mode 100644 tests/benchmarks/test_parse_bench.py create mode 100644 tests/benchmarks/test_search_bench.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 158b598..3562801 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -215,7 +215,7 @@ jobs: --redact \ --exit-code 1 - # ── Performance benchmarks: summary cache (issue #115) ───────────────────── + # ── Performance benchmarks: unified suite (issues #115, #110) ────────────── benchmarks: name: Performance benchmarks (gated) needs: [unittest] @@ -236,7 +236,7 @@ jobs: python -m pip install -r requirements-lock.txt python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' - - name: Run summary-cache benchmarks + - name: Run benchmark suite run: > python -m pytest tests/benchmarks/ --benchmark-only diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2a27405 --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +.PHONY: seed-baselines-local update-baselines check-benchmarks clean-benchmark-artifacts + +# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI. +# Prefer downloading benchmark-results.json from a CI artifact, then: +# python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 +seed-baselines-local: + @echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2 + python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= + python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 + +# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above. +update-baselines: seed-baselines-local + +check-benchmarks: + python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= + python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + +clean-benchmark-artifacts: + rm -f benchmarks/_raw.json benchmark-results.json diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..0bd1725 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,60 @@ +# Performance benchmarks + +Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate. + +Repeatable local measurements for workspace listing, export, search, and summary-cache hot paths. + +## Run locally + +```bash +pip install -r requirements-lock.txt +pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' +pytest tests/benchmarks/ --benchmark-only -o addopts= -v +``` + +## Scenarios + +| Group | What | +|-------|------| +| parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers | +| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora | +| search | `GET /api/search` over a 50-composer synthetic corpus | +| summary-cache | cache lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup | + +Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency. + +## CI gate + +The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. + +- **Fail** when a gated mean exceeds its baseline by **>20%** +- **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups) +- **Fail** when a gated baseline name has no current result +- **Warn** for benchmarks without a baseline entry +- **Skip gate** for `EXCLUDED_FROM_GATE` names (smallest parse corpus, full-corpus search — sub-ms CI noise) + +Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`. + +## Refresh baselines + +After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible: + +```bash +python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 +``` + +For a quick local snapshot only (may not match CI timings): + +```bash +make seed-baselines-local +``` + +`make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew. + +## Makefile targets + +| Target | Purpose | +|--------|---------| +| `make check-benchmarks` | Run suite + regression gate locally | +| `make seed-baselines-local` | Capture local timings into `benchmarks/baselines.json` (with slack) | +| `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` | diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 131b638..e8a35fa 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,15 +1,29 @@ { - "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). Refresh after intentional perf changes: download benchmark-results.json from the CI artifacts job, then `python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json` (re-seed with reduce_baselines or edit means). Local capture: `pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=` on ubuntu-latest.", - "updated": "2026-06-24T19:20:27Z", - "machine": "Linux", + "_note": "Gated means seeded locally (Windows, 1.5× slack) — refresh from ubuntu-latest CI benchmark-results.json artifact before merge. Excluded from gate: test_list_workspace_projects_nocache[composers-10], test_search_full_corpus.", + "updated": "2026-06-25T20:34:07Z", + "machine": "Windows", "groups": { + "parse": { + "test_list_workspace_projects_nocache[composers-10]": 0.01313006085768828, + "test_list_workspace_projects_nocache[composers-50]": 0.04705098008271307, + "test_list_workspace_projects_nocache[composers-200]": 0.19944224995560944 + }, + "export": { + "test_post_export_zip[composers-10]": 0.0170322916819714, + "test_post_export_zip[composers-50]": 0.040990050032269215 + }, + "search": { + "test_search_full_corpus": 0.057670830062124874 + }, "summary-cache": { - "test_summary_cache_hit": 6.3e-05, - "test_summary_cache_miss": 6.3e-05, - "test_fingerprint_workspace_entries[10]": 0.001844, - "test_fingerprint_workspace_entries[50]": 0.007759, - "test_fingerprint_workspace_entries[200]": 0.022231, - "test_summary_cache_round_trip": 0.000351 + "test_summary_cache_lookup[hit]": 0.00014543285277406022, + "test_summary_cache_lookup[miss]": 0.0001437347241805802, + "test_fingerprint_workspace_entries[10]": 0.001866654586096193, + "test_fingerprint_workspace_entries[50]": 0.00636450619807407, + "test_fingerprint_workspace_entries[200]": 0.020523441289855247, + "test_summary_cache_round_trip": 0.0019650292328056915, + "test_tab_summary_cache_lookup[hit]": 0.00015344636292124477, + "test_tab_summary_cache_lookup[miss]": 0.00012440098537902896 } } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index d2fc79c..b2d8d53 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -8,6 +8,15 @@ from pathlib import Path THRESHOLD = 1.20 +STALE_FLOOR = 0.50 + +# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI. +EXCLUDED_FROM_GATE = frozenset( + { + "test_list_workspace_projects_nocache[composers-10]", + "test_search_full_corpus", + } +) class BenchmarkDataError(ValueError): @@ -102,14 +111,18 @@ def check_regression( baselines_path: str | Path, *, threshold: float = THRESHOLD, + stale_floor: float = STALE_FLOOR, ) -> int: - """Return 0 when within threshold; 1 when any gated benchmark regresses.""" + """Return 0 when within threshold; 1 when any gated benchmark regresses or is stale.""" flat = load_results(results_path) baseline_means = load_baseline_means(baselines_path) failures: list[str] = [] + stale: list[str] = [] missing: list[str] = [] for name, base in baseline_means.items(): + if name in EXCLUDED_FROM_GATE: + continue cur = flat.get(name) if cur is None: print(f"FAIL: no current result for gated baseline {name!r}") @@ -119,20 +132,32 @@ def check_regression( print(f"WARN: baseline for {name!r} is zero; skipping ratio check") continue ratio = cur / base - tag = "FAIL" if ratio > threshold else "ok" - print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") if ratio > threshold: + tag = "FAIL" failures.append(name) + elif ratio < stale_floor: + tag = "STALE" + stale.append(name) + else: + tag = "ok" + print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") for name in flat: + if name in EXCLUDED_FROM_GATE: + continue if name not in baseline_means: print(f"WARN: {name!r} has no baseline yet; not gated") if failures: print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}") + if stale: + print( + f"\nSTALE: {len(stale)} benchmark(s) are faster than {stale_floor:.0%} of baseline " + "(refresh baselines after intentional speedups)" + ) if missing: print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results") - if failures or missing: + if failures or stale or missing: return 1 return 0 @@ -147,12 +172,19 @@ def main(argv: list[str] | None = None) -> int: default=THRESHOLD, help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)", ) + parser.add_argument( + "--stale-floor", + type=float, + default=STALE_FLOOR, + help="fail when current mean is below this fraction of baseline (default: 0.50)", + ) args = parser.parse_args(argv) try: return check_regression( args.results_path, args.baselines_path, threshold=args.threshold, + stale_floor=args.stale_floor, ) except BenchmarkDataError as exc: print(f"ERROR: {exc}", file=sys.stderr) diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py new file mode 100644 index 0000000..76d2560 --- /dev/null +++ b/scripts/reduce_baselines.py @@ -0,0 +1,112 @@ +"""Reduce pytest-benchmark JSON into benchmarks/baselines.json.""" + +from __future__ import annotations + +import argparse +import json +import sys +from datetime import UTC, datetime +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parent.parent +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from scripts.check_benchmark_regression import ( + EXCLUDED_FROM_GATE, + BenchmarkDataError, + normalize_benchmark_name, +) + +GATED_GROUPS = ("parse", "export", "search", "summary-cache") + + +def _positive_float(value: str) -> float: + parsed = float(value) + if parsed <= 0: + raise argparse.ArgumentTypeError("slack must be greater than zero") + return parsed + + +def reduce_baselines( + raw_path: str | Path, + out_path: str | Path, + *, + slack: float = 1.0, +) -> dict[str, object]: + path = Path(raw_path) + try: + raw = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc + except OSError as exc: + raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc + + try: + entries = raw["benchmarks"] + except (KeyError, TypeError) as exc: + raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc + if not isinstance(entries, list): + raise BenchmarkDataError(f"{path} 'benchmarks' must be an array") + + groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS} + for index, entry in enumerate(entries): + if not isinstance(entry, dict): + raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") + try: + raw_name = entry["name"] + mean = float(entry["stats"]["mean"]) + except (KeyError, TypeError, ValueError) as exc: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" + ) from exc + bench_name = normalize_benchmark_name(str(raw_name)) + group = entry.get("group") + if group not in GATED_GROUPS: + continue + groups[group][bench_name] = mean * slack + + excluded = ", ".join(sorted(EXCLUDED_FROM_GATE)) + slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else "" + machine_info = raw.get("machine_info") + machine = machine_info.get("system") if isinstance(machine_info, dict) else None + output: dict[str, object] = { + "_note": ( + "Gated means from ubuntu-latest CI benchmark-results.json." + f"{slack_note} " + f"Excluded from gate (recorded for reference): {excluded}. " + "Refresh after intentional speedups via reduce_baselines.py." + ), + "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), + "machine": machine, + "groups": groups, + } + out = Path(out_path) + try: + out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") + except OSError as exc: + raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc + return output + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output") + parser.add_argument("out_path", help="destination baselines.json path") + parser.add_argument( + "--slack", + type=_positive_float, + default=1.0, + help="multiply means by this factor (must be > 0)", + ) + args = parser.parse_args(argv) + try: + reduce_baselines(args.raw_path, args.out_path, slack=args.slack) + except BenchmarkDataError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 2 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index e3e17e2..ed16f09 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -1,15 +1,22 @@ -"""Synthetic workspace trees for summary-cache performance benchmarks.""" +"""Shared synthetic fixtures for pytest-benchmark hot paths.""" from __future__ import annotations +import contextlib +import json +import sqlite3 from pathlib import Path from typing import Any import pytest +from flask.testing import FlaskClient +from app import create_app from services import summary_cache from services.summary_cache import fingerprint_workspace_storage +BENCH_SEARCH_TERM = "bench-search-token" + def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]: """Build *count* synthetic workspace entries with on-disk state files.""" @@ -30,14 +37,89 @@ def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, A return entries +def _composer_ids(count: int) -> list[tuple[str, str, str]]: + return [(f"ws_{i:04d}", f"cmp_{i:04d}", f"bub_{i:04d}") for i in range(count)] + + +def build_bench_storage(root: Path, composer_count: int) -> dict[str, str]: + """Create workspaceStorage, globalStorage, and cli_chats trees for *composer_count* composers.""" + ws_root = root / "workspaceStorage" + global_root = root / "globalStorage" + cli_root = root / "cli_chats" + projects_root = root / "projects" + ws_root.mkdir(parents=True) + global_root.mkdir(parents=True) + cli_root.mkdir(parents=True) + projects_root.mkdir(parents=True) + + global_db_path = global_root / "state.vscdb" + with contextlib.closing(sqlite3.connect(global_db_path)) as conn: + conn.execute("CREATE TABLE cursorDiskKV ([key] TEXT PRIMARY KEY, value TEXT)") + base_ts = 1_715_000_000_000 + for i, (workspace_id, composer_id, bubble_id) in enumerate(_composer_ids(composer_count)): + project_folder = projects_root / f"proj_{i:04d}" + project_folder.mkdir(parents=True, exist_ok=True) + + ws_dir = ws_root / workspace_id + ws_dir.mkdir(parents=True, exist_ok=True) + (ws_dir / "workspace.json").write_text( + json.dumps({"folder": str(project_folder)}), + encoding="utf-8", + ) + with contextlib.closing(sqlite3.connect(ws_dir / "state.vscdb")) as ws_conn: + ws_conn.execute("CREATE TABLE ItemTable ([key] TEXT PRIMARY KEY, value TEXT)") + ws_conn.execute( + "INSERT INTO ItemTable ([key], value) VALUES (?, ?)", + ( + "composer.composerData", + json.dumps({"allComposers": [{"composerId": composer_id}]}), + ), + ) + ws_conn.commit() + + created_at = base_ts + i * 1_000 + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + ( + f"composerData:{composer_id}", + json.dumps( + { + "name": f"Bench chat {i:04d}", + "createdAt": created_at, + "lastUpdatedAt": created_at + 500, + "fullConversationHeadersOnly": [ + {"bubbleId": bubble_id, "type": 1}, + ], + "modelConfig": {"modelName": "gpt-4o"}, + } + ), + ), + ) + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + ( + f"bubbleId:{composer_id}:{bubble_id}", + json.dumps( + { + "text": f"find {BENCH_SEARCH_TERM} in composer {i:04d}", + "type": "user", + "createdAt": created_at + 400, + } + ), + ), + ) + conn.commit() + + return { + "workspace_path": str(ws_root), + "cli_chats_path": str(cli_root), + "storage_root": str(root), + } + + @pytest.fixture def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: - """Redirect summary-cache files to an isolated temp directory. - - Patches ``CACHE_DIR`` (also used by tab-summary paths via ``_tab_summaries_path``) - plus the projects/composer-map file constants used by current benchmarks. - Tab-summary cache benchmarks are deferred to issue #110 (unified benchmark suite). - """ + """Redirect summary-cache files to an isolated temp directory.""" cache_dir = tmp_path / "cache" cache_dir.mkdir() monkeypatch.setattr(summary_cache, "CACHE_DIR", cache_dir) @@ -87,3 +169,53 @@ def workspace_fingerprint(synthetic_workspace: tuple[str, list[dict[str, Any]]]) def stale_fingerprint(workspace_fingerprint: dict[str, Any]) -> dict[str, Any]: """Return a fingerprint guaranteed to differ from the stored one.""" return {**workspace_fingerprint, "rules_digest": "deadbeefdeadbeef"} + + +@pytest.fixture +def bench_storage(tmp_path: Path, request: pytest.FixtureRequest) -> dict[str, str]: + """On-disk Cursor layout with N composers (indirect ``composer_count`` param).""" + count = getattr(request, "param", 10) + return build_bench_storage(tmp_path / "storage", count) + + +@pytest.fixture +def bench_env( + bench_storage: dict[str, str], + monkeypatch: pytest.MonkeyPatch, +) -> dict[str, str]: + """Set WORKSPACE_PATH / CLI_CHATS_PATH for the synthetic storage tree.""" + monkeypatch.setenv("WORKSPACE_PATH", bench_storage["workspace_path"]) + monkeypatch.setenv("CLI_CHATS_PATH", bench_storage["cli_chats_path"]) + monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + return bench_storage + + +@pytest.fixture +def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient: + """Flask test client bound to synthetic bench storage.""" + state_dir = tmp_path / ".cursor-chat-browser" + state_dir.mkdir() + monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + return app.test_client() + + +@pytest.fixture +def bench_client_search_corpus( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> FlaskClient: + """Flask client over a fixed 50-composer corpus for search benchmarks.""" + storage = build_bench_storage(tmp_path / "search_storage", 50) + monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"]) + monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"]) + monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + state_dir = tmp_path / ".cursor-chat-browser-search" + state_dir.mkdir() + monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + return app.test_client() diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py new file mode 100644 index 0000000..c5e3051 --- /dev/null +++ b/tests/benchmarks/test_export_bench.py @@ -0,0 +1,30 @@ +"""Benchmark POST /api/export (ZIP) over synthetic workspace + global DB.""" + +from __future__ import annotations + +import pytest +from flask.testing import FlaskClient + + +@pytest.mark.benchmark(group="export") +@pytest.mark.parametrize( + "bench_storage", + [10, 50], + indirect=True, + ids=["composers-10", "composers-50"], +) +def test_post_export_zip( + benchmark, + bench_client: FlaskClient, +) -> None: + def _run() -> object: + return bench_client.post( + "/api/export", + json={}, + content_type="application/json", + ) + + response = benchmark(_run) + assert response.status_code == 200 + assert response.content_type.startswith("application/zip") + assert int(response.headers.get("X-Export-Count", "0")) >= 1 diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py new file mode 100644 index 0000000..d3814d9 --- /dev/null +++ b/tests/benchmarks/test_parse_bench.py @@ -0,0 +1,27 @@ +"""Benchmark list_workspace_projects (nocache) over synthetic composer corpora.""" + +from __future__ import annotations + +import pytest + +from services.workspace_listing import list_workspace_projects + + +@pytest.mark.benchmark(group="parse") +@pytest.mark.parametrize( + "bench_storage", + [10, 50, 200], + indirect=True, + ids=["composers-10", "composers-50", "composers-200"], +) +def test_list_workspace_projects_nocache( + benchmark, + bench_env: dict[str, str], +) -> None: + workspace_path = bench_env["workspace_path"] + + def _run() -> object: + return list_workspace_projects(workspace_path, [], nocache=True) + + projects, _warnings = benchmark(_run) + assert isinstance(projects, list) and len(projects) > 0 diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py new file mode 100644 index 0000000..1606f24 --- /dev/null +++ b/tests/benchmarks/test_search_bench.py @@ -0,0 +1,26 @@ +"""Benchmark GET /api/search over a 50-composer synthetic corpus.""" + +from __future__ import annotations + +import pytest +from flask.testing import FlaskClient + +from tests.benchmarks.conftest import BENCH_SEARCH_TERM + + +@pytest.mark.benchmark(group="search") +def test_search_full_corpus( + benchmark, + bench_client_search_corpus: FlaskClient, +) -> None: + def _run() -> object: + return bench_client_search_corpus.get( + f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1", + ) + + response = benchmark(_run) + assert response.status_code == 200 + body = response.get_json() + assert isinstance(body, dict) + results = body.get("results") + assert isinstance(results, list) and len(results) > 0 diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py index dad4a15..70f127b 100644 --- a/tests/benchmarks/test_summary_cache_bench.py +++ b/tests/benchmarks/test_summary_cache_bench.py @@ -1,44 +1,35 @@ -"""pytest-benchmark coverage for services/summary_cache.py hot paths. - -``test_summary_cache_hit`` and ``test_summary_cache_miss`` both time ``get_cached_projects`` -only. Miss means fingerprint mismatch (cache not used), not a full cache rebuild. -""" +"""pytest-benchmark coverage for services/summary_cache.py hot paths.""" from __future__ import annotations from pathlib import Path -from typing import Any +from typing import Any, Literal import pytest from services.summary_cache import ( fingerprint_workspace_storage, get_cached_projects, + get_cached_tab_summaries, set_cached_projects, + set_cached_tab_summaries, ) @pytest.mark.benchmark(group="summary-cache") -def test_summary_cache_hit( - benchmark, - summary_cache_dir: Path, - workspace_fingerprint: dict[str, Any], - sample_projects: list[dict[str, Any]], -) -> None: - set_cached_projects(workspace_fingerprint, sample_projects, []) - benchmark(get_cached_projects, workspace_fingerprint) - - -@pytest.mark.benchmark(group="summary-cache") -def test_summary_cache_miss( +@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"]) +def test_summary_cache_lookup( benchmark, + mode: Literal["hit", "miss"], summary_cache_dir: Path, workspace_fingerprint: dict[str, Any], stale_fingerprint: dict[str, Any], sample_projects: list[dict[str, Any]], ) -> None: + """Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild.""" set_cached_projects(workspace_fingerprint, sample_projects, []) - benchmark(get_cached_projects, stale_fingerprint) + lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint + benchmark(get_cached_projects, lookup_fp) @pytest.mark.benchmark(group="summary-cache") @@ -76,3 +67,19 @@ def _run() -> None: get_cached_projects(fp) benchmark(_run) + + +@pytest.mark.benchmark(group="summary-cache") +@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"]) +def test_tab_summary_cache_lookup( + benchmark, + mode: Literal["hit", "miss"], + summary_cache_dir: Path, + workspace_fingerprint: dict[str, Any], + stale_fingerprint: dict[str, Any], +) -> None: + workspace_id = "ws_0000" + payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]} + set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200) + lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint + benchmark(get_cached_tab_summaries, lookup_fp, workspace_id) diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 8de10a8..e15118f 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -14,7 +14,7 @@ normalize_benchmark_name, ) -GATED_BENCH = "test_summary_cache_hit" +GATED_BENCH = "test_summary_cache_lookup[hit]" def _write_results(path, benchmarks: list[dict]) -> None: @@ -32,9 +32,9 @@ def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None: def test_normalize_benchmark_name_strips_module_prefix() -> None: - full = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit" - assert normalize_benchmark_name(full) == "test_summary_cache_hit" - assert normalize_benchmark_name("test_summary_cache_hit") == "test_summary_cache_hit" + full = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]" + assert normalize_benchmark_name(full) == "test_summary_cache_lookup[hit]" + assert normalize_benchmark_name("test_summary_cache_lookup[hit]") == "test_summary_cache_lookup[hit]" def test_normalize_benchmark_name_preserves_colons_in_param_values() -> None: @@ -50,13 +50,13 @@ def test_load_results_normalizes_full_node_id(tmp_path) -> None: path, [ { - "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit", + "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]", "stats": {"mean": 0.0001}, } ], ) - assert load_results(path)["test_summary_cache_hit"] == pytest.approx(0.0001) + assert load_results(path)["test_summary_cache_lookup[hit]"] == pytest.approx(0.0001) def test_missing_baseline_warns_without_failing( @@ -213,3 +213,41 @@ def test_load_baseline_means_rejects_non_dict_group(tmp_path) -> None: with pytest.raises(BenchmarkDataError, match="must be an object"): load_baseline_means(baselines) + + +def test_stale_baseline_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": GATED_BENCH, "stats": {"mean": 0.00005}}], + ) + _write_baselines( + baselines, + {"summary-cache": {GATED_BENCH: 0.0002}}, + ) + + assert check_regression(results, baselines) == 1 + out = capsys.readouterr().out + assert "STALE" in out + + +def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE + + excluded = next(iter(EXCLUDED_FROM_GATE)) + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": excluded, "stats": {"mean": 1.0}}], + ) + _write_baselines( + baselines, + {"search": {excluded: 0.0001}}, + ) + + assert check_regression(results, baselines) == 0 + out = capsys.readouterr().out + assert "REGRESSION" not in out + assert "STALE" not in out From fcda8edcb005d712602d20f79cdfdb98d722dcbf Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 05:21:11 +0800 Subject: [PATCH 2/7] fix(bench): address review feedback and seed ubuntu baselines --- .gitignore | 1 + benchmarks/baselines.json | 34 ++++++++++---------- scripts/check_benchmark_regression.py | 6 ++++ scripts/reduce_baselines.py | 15 +++++++-- tests/benchmarks/test_parse_bench.py | 3 +- tests/benchmarks/test_summary_cache_bench.py | 18 +++++++++-- tests/test_check_benchmark_regression.py | 24 ++++++++++++++ 7 files changed, 79 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index f204306..0712397 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ coverage.xml .hypothesis/ benchmark-results.json benchmarks/_raw.json +benchmarks/_ci/ diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index e8a35fa..959c82d 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,29 +1,29 @@ { - "_note": "Gated means seeded locally (Windows, 1.5× slack) — refresh from ubuntu-latest CI benchmark-results.json artifact before merge. Excluded from gate: test_list_workspace_projects_nocache[composers-10], test_search_full_corpus.", - "updated": "2026-06-25T20:34:07Z", - "machine": "Windows", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Excluded from gate (recorded for reference): test_list_workspace_projects_nocache[composers-10], test_search_full_corpus. Refresh after intentional speedups via reduce_baselines.py.", + "updated": "2026-06-25T21:14:16Z", + "machine": "Linux", "groups": { "parse": { - "test_list_workspace_projects_nocache[composers-10]": 0.01313006085768828, - "test_list_workspace_projects_nocache[composers-50]": 0.04705098008271307, - "test_list_workspace_projects_nocache[composers-200]": 0.19944224995560944 + "test_list_workspace_projects_nocache[composers-10]": 0.01702312019643009, + "test_list_workspace_projects_nocache[composers-50]": 0.07538331990000699, + "test_list_workspace_projects_nocache[composers-200]": 0.251991555999993 }, "export": { - "test_post_export_zip[composers-10]": 0.0170322916819714, - "test_post_export_zip[composers-50]": 0.040990050032269215 + "test_post_export_zip[composers-10]": 0.0112034034344294, + "test_post_export_zip[composers-50]": 0.04482855966665985 }, "search": { - "test_search_full_corpus": 0.057670830062124874 + "test_search_full_corpus": 0.047164217833331655 }, "summary-cache": { - "test_summary_cache_lookup[hit]": 0.00014543285277406022, - "test_summary_cache_lookup[miss]": 0.0001437347241805802, - "test_fingerprint_workspace_entries[10]": 0.001866654586096193, - "test_fingerprint_workspace_entries[50]": 0.00636450619807407, - "test_fingerprint_workspace_entries[200]": 0.020523441289855247, - "test_summary_cache_round_trip": 0.0019650292328056915, - "test_tab_summary_cache_lookup[hit]": 0.00015344636292124477, - "test_tab_summary_cache_lookup[miss]": 0.00012440098537902896 + "test_summary_cache_lookup[hit]": 9.224067718099102e-05, + "test_summary_cache_lookup[miss]": 9.128770315496628e-05, + "test_fingerprint_workspace_entries[10]": 0.0024789120309553535, + "test_fingerprint_workspace_entries[50]": 0.010901568931818675, + "test_fingerprint_workspace_entries[200]": 0.03069810573000666, + "test_summary_cache_round_trip": 0.0004966099535917549, + "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405, + "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601 } } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index b2d8d53..394d047 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -179,6 +179,12 @@ def main(argv: list[str] | None = None) -> int: help="fail when current mean is below this fraction of baseline (default: 0.50)", ) args = parser.parse_args(argv) + if args.threshold <= 1: + print("ERROR: --threshold must be greater than 1", file=sys.stderr) + return 2 + if not 0 < args.stale_floor < 1: + print("ERROR: --stale-floor must be between 0 and 1 (exclusive)", file=sys.stderr) + return 2 try: return check_regression( args.results_path, diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index 76d2560..82e6562 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -33,6 +33,7 @@ def reduce_baselines( out_path: str | Path, *, slack: float = 1.0, + source: str = "local", ) -> dict[str, object]: path = Path(raw_path) try: @@ -70,9 +71,14 @@ def reduce_baselines( slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else "" machine_info = raw.get("machine_info") machine = machine_info.get("system") if isinstance(machine_info, dict) else None + source_labels = { + "ubuntu-latest-ci": "ubuntu-latest CI benchmark-results.json", + "local": "local benchmark-results.json", + } + source_label = source_labels.get(source, source) output: dict[str, object] = { "_note": ( - "Gated means from ubuntu-latest CI benchmark-results.json." + f"Gated means from {source_label}." f"{slack_note} " f"Excluded from gate (recorded for reference): {excluded}. " "Refresh after intentional speedups via reduce_baselines.py." @@ -99,9 +105,14 @@ def main(argv: list[str] | None = None) -> int: default=1.0, help="multiply means by this factor (must be > 0)", ) + parser.add_argument( + "--source", + default="local", + help="provenance label for _note (e.g. ubuntu-latest-ci, local)", + ) args = parser.parse_args(argv) try: - reduce_baselines(args.raw_path, args.out_path, slack=args.slack) + reduce_baselines(args.raw_path, args.out_path, slack=args.slack, source=args.source) except BenchmarkDataError as exc: print(f"ERROR: {exc}", file=sys.stderr) return 2 diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py index d3814d9..9f23872 100644 --- a/tests/benchmarks/test_parse_bench.py +++ b/tests/benchmarks/test_parse_bench.py @@ -23,5 +23,6 @@ def test_list_workspace_projects_nocache( def _run() -> object: return list_workspace_projects(workspace_path, [], nocache=True) - projects, _warnings = benchmark(_run) + projects, warnings = benchmark(_run) assert isinstance(projects, list) and len(projects) > 0 + assert warnings == [] diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py index 70f127b..c9b1608 100644 --- a/tests/benchmarks/test_summary_cache_bench.py +++ b/tests/benchmarks/test_summary_cache_bench.py @@ -29,7 +29,14 @@ def test_summary_cache_lookup( """Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild.""" set_cached_projects(workspace_fingerprint, sample_projects, []) lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint - benchmark(get_cached_projects, lookup_fp) + result = benchmark(get_cached_projects, lookup_fp) + if mode == "hit": + assert result is not None + projects, warnings = result + assert projects == sample_projects + assert warnings == [] + else: + assert result is None @pytest.mark.benchmark(group="summary-cache") @@ -82,4 +89,11 @@ def test_tab_summary_cache_lookup( payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]} set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200) lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint - benchmark(get_cached_tab_summaries, lookup_fp, workspace_id) + result = benchmark(get_cached_tab_summaries, lookup_fp, workspace_id) + if mode == "hit": + assert result is not None + cached_payload, status = result + assert status == 200 + assert cached_payload == payload + else: + assert result is None diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index e15118f..38ddcae 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -251,3 +251,27 @@ def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[st out = capsys.readouterr().out assert "REGRESSION" not in out assert "STALE" not in out + + +def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main + + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2 + assert "--threshold must be greater than 1" in capsys.readouterr().err + + +def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main + + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2 + assert "--stale-floor must be between 0 and 1" in capsys.readouterr().err From 87a747cb4e7a6eb281c1554056827f3afee853f2 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 05:49:49 +0800 Subject: [PATCH 3/7] fix(bench): gate all baseline benchmarks and validate finite ratios --- .github/workflows/tests.yml | 2 +- Makefile | 4 +- benchmarks/README.md | 2 +- benchmarks/baselines.json | 4 +- scripts/check_benchmark_regression.py | 28 ++-- scripts/reduce_baselines.py | 20 ++- tests/benchmarks/conftest.py | 47 ++++--- tests/benchmarks/constants.py | 3 + tests/benchmarks/test_search_bench.py | 2 +- tests/benchmarks/test_summary_cache_bench.py | 5 + tests/test_check_benchmark_regression.py | 53 +++++--- tests/test_reduce_baselines.py | 132 +++++++++++++++++++ 12 files changed, 239 insertions(+), 63 deletions(-) create mode 100644 tests/benchmarks/constants.py create mode 100644 tests/test_reduce_baselines.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3562801..3a34c3d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -115,7 +115,7 @@ jobs: # exercise Flask routes via app.test_client(). Only listed files — not # `pytest tests/` — to avoid re-collecting unittest.TestCase classes above. # -o addopts= avoids inheriting benchmark-only options from pyproject.toml. - run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py -v --tb=short -o addopts= + run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py tests/test_reduce_baselines.py -v --tb=short -o addopts= # ── PyInstaller desktop build (Windows only, once per workflow) ──────── # Closes #44. Builds the onedir bundle and smoke-tests --help so the diff --git a/Makefile b/Makefile index 2a27405..a1f7607 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ seed-baselines-local: @echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2 python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= - python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 + python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 --source local # Deprecated alias — kept for muscle memory; see seed-baselines-local warning above. update-baselines: seed-baselines-local @@ -16,4 +16,4 @@ check-benchmarks: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json clean-benchmark-artifacts: - rm -f benchmarks/_raw.json benchmark-results.json + python -c "import pathlib; [p.unlink(missing_ok=True) for p in (pathlib.Path('benchmarks/_raw.json'), pathlib.Path('benchmark-results.json'))]" diff --git a/benchmarks/README.md b/benchmarks/README.md index 0bd1725..0f4e23a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -31,7 +31,7 @@ The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suit - **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups) - **Fail** when a gated baseline name has no current result - **Warn** for benchmarks without a baseline entry -- **Skip gate** for `EXCLUDED_FROM_GATE` names (smallest parse corpus, full-corpus search — sub-ms CI noise) +- All benchmarks listed in `baselines.json` are gated (no exclusion list) Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`. diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 959c82d..9302877 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,6 +1,6 @@ { - "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Excluded from gate (recorded for reference): test_list_workspace_projects_nocache[composers-10], test_search_full_corpus. Refresh after intentional speedups via reduce_baselines.py.", - "updated": "2026-06-25T21:14:16Z", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Refresh after intentional speedups via reduce_baselines.py.", + "updated": "2026-06-25T21:48:35Z", "machine": "Linux", "groups": { "parse": { diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 394d047..e30bee1 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -4,19 +4,15 @@ import argparse import json +import math import sys from pathlib import Path THRESHOLD = 1.20 STALE_FLOOR = 0.50 -# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI. -EXCLUDED_FROM_GATE = frozenset( - { - "test_list_workspace_projects_nocache[composers-10]", - "test_search_full_corpus", - } -) +# Benchmarks gated via baselines.json; empty set means all baseline entries are checked. +EXCLUDED_FROM_GATE: frozenset[str] = frozenset() class BenchmarkDataError(ValueError): @@ -106,6 +102,17 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: return means +def _validate_gate_ratios(threshold: float, stale_floor: float) -> None: + if not math.isfinite(threshold): + raise BenchmarkDataError("threshold must be finite") + if threshold <= 1: + raise BenchmarkDataError("threshold must be greater than 1") + if not math.isfinite(stale_floor): + raise BenchmarkDataError("stale_floor must be finite") + if not 0 < stale_floor < 1: + raise BenchmarkDataError("stale_floor must be between 0 and 1 (exclusive)") + + def check_regression( results_path: str | Path, baselines_path: str | Path, @@ -114,6 +121,7 @@ def check_regression( stale_floor: float = STALE_FLOOR, ) -> int: """Return 0 when within threshold; 1 when any gated benchmark regresses or is stale.""" + _validate_gate_ratios(threshold, stale_floor) flat = load_results(results_path) baseline_means = load_baseline_means(baselines_path) @@ -179,12 +187,6 @@ def main(argv: list[str] | None = None) -> int: help="fail when current mean is below this fraction of baseline (default: 0.50)", ) args = parser.parse_args(argv) - if args.threshold <= 1: - print("ERROR: --threshold must be greater than 1", file=sys.stderr) - return 2 - if not 0 < args.stale_floor < 1: - print("ERROR: --stale-floor must be between 0 and 1 (exclusive)", file=sys.stderr) - return 2 try: return check_regression( args.results_path, diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index 82e6562..6264018 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -4,6 +4,7 @@ import argparse import json +import math import sys from datetime import UTC, datetime from pathlib import Path @@ -23,6 +24,8 @@ def _positive_float(value: str) -> float: parsed = float(value) + if not math.isfinite(parsed): + raise argparse.ArgumentTypeError("slack must be a finite number") if parsed <= 0: raise argparse.ArgumentTypeError("slack must be greater than zero") return parsed @@ -63,11 +66,23 @@ def reduce_baselines( ) from exc bench_name = normalize_benchmark_name(str(raw_name)) group = entry.get("group") + if group is None: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] ({bench_name!r}) missing required 'group'" + ) if group not in GATED_GROUPS: - continue + raise BenchmarkDataError( + f"{path} benchmarks[{index}] ({bench_name!r}) has unknown group {group!r}; " + f"expected one of {GATED_GROUPS}" + ) groups[group][bench_name] = mean * slack excluded = ", ".join(sorted(EXCLUDED_FROM_GATE)) + excluded_note = ( + f" Excluded from gate (recorded for reference): {excluded}." + if excluded + else "" + ) slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else "" machine_info = raw.get("machine_info") machine = machine_info.get("system") if isinstance(machine_info, dict) else None @@ -79,8 +94,7 @@ def reduce_baselines( output: dict[str, object] = { "_note": ( f"Gated means from {source_label}." - f"{slack_note} " - f"Excluded from gate (recorded for reference): {excluded}. " + f"{slack_note}{excluded_note} " "Refresh after intentional speedups via reduce_baselines.py." ), "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index ed16f09..1cbf3b6 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -14,8 +14,7 @@ from app import create_app from services import summary_cache from services.summary_cache import fingerprint_workspace_storage - -BENCH_SEARCH_TERM = "bench-search-token" +from tests.benchmarks.constants import BENCH_SEARCH_TERM def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]: @@ -117,6 +116,26 @@ def build_bench_storage(root: Path, composer_count: int) -> dict[str, str]: } +def _make_bench_flask_client( + storage: dict[str, str], + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + *, + state_subdir: str = ".cursor-chat-browser", +) -> FlaskClient: + """Flask test client with env + export state patched for synthetic storage.""" + monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"]) + monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"]) + monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + state_dir = tmp_path / state_subdir + state_dir.mkdir() + monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + return app.test_client() + + @pytest.fixture def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: """Redirect summary-cache files to an isolated temp directory.""" @@ -193,13 +212,7 @@ def bench_env( @pytest.fixture def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient: """Flask test client bound to synthetic bench storage.""" - state_dir = tmp_path / ".cursor-chat-browser" - state_dir.mkdir() - monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) - app = create_app() - app.config["TESTING"] = True - app.config["EXCLUSION_RULES"] = [] - return app.test_client() + return _make_bench_flask_client(bench_env, tmp_path, monkeypatch) @pytest.fixture @@ -209,13 +222,9 @@ def bench_client_search_corpus( ) -> FlaskClient: """Flask client over a fixed 50-composer corpus for search benchmarks.""" storage = build_bench_storage(tmp_path / "search_storage", 50) - monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"]) - monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"]) - monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") - state_dir = tmp_path / ".cursor-chat-browser-search" - state_dir.mkdir() - monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) - app = create_app() - app.config["TESTING"] = True - app.config["EXCLUSION_RULES"] = [] - return app.test_client() + return _make_bench_flask_client( + storage, + tmp_path, + monkeypatch, + state_subdir=".cursor-chat-browser-search", + ) diff --git a/tests/benchmarks/constants.py b/tests/benchmarks/constants.py new file mode 100644 index 0000000..ab682d3 --- /dev/null +++ b/tests/benchmarks/constants.py @@ -0,0 +1,3 @@ +"""Shared constants for benchmark corpora (importable outside conftest).""" + +BENCH_SEARCH_TERM = "bench-search-token" diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py index 1606f24..e7df914 100644 --- a/tests/benchmarks/test_search_bench.py +++ b/tests/benchmarks/test_search_bench.py @@ -5,7 +5,7 @@ import pytest from flask.testing import FlaskClient -from tests.benchmarks.conftest import BENCH_SEARCH_TERM +from tests.benchmarks.constants import BENCH_SEARCH_TERM @pytest.mark.benchmark(group="search") diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py index c9b1608..4a259c8 100644 --- a/tests/benchmarks/test_summary_cache_bench.py +++ b/tests/benchmarks/test_summary_cache_bench.py @@ -74,6 +74,11 @@ def _run() -> None: get_cached_projects(fp) benchmark(_run) + cached = get_cached_projects(fp) + assert cached is not None + cached_projects, cached_warnings = cached + assert cached_projects == projects + assert cached_warnings == [] @pytest.mark.benchmark(group="summary-cache") diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 38ddcae..873d68a 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -232,28 +232,19 @@ def test_stale_baseline_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> N assert "STALE" in out -def test_excluded_benchmark_not_gated(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: - from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE +def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main - excluded = next(iter(EXCLUDED_FROM_GATE)) results = tmp_path / "results.json" baselines = tmp_path / "baselines.json" - _write_results( - results, - [{"name": excluded, "stats": {"mean": 1.0}}], - ) - _write_baselines( - baselines, - {"search": {excluded: 0.0001}}, - ) + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) - assert check_regression(results, baselines) == 0 - out = capsys.readouterr().out - assert "REGRESSION" not in out - assert "STALE" not in out + assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2 + assert "threshold must be greater than 1" in capsys.readouterr().err -def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: +def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: from scripts.check_benchmark_regression import main results = tmp_path / "results.json" @@ -261,11 +252,31 @@ def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[ _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) - assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2 - assert "--threshold must be greater than 1" in capsys.readouterr().err + assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2 + assert "stale_floor must be between 0 and 1" in capsys.readouterr().err -def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: +def test_check_regression_rejects_invalid_threshold(tmp_path) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + with pytest.raises(BenchmarkDataError, match="threshold must be greater than 1"): + check_regression(results, baselines, threshold=1.0) + + +def test_check_regression_rejects_non_finite_threshold(tmp_path) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + with pytest.raises(BenchmarkDataError, match="threshold must be finite"): + check_regression(results, baselines, threshold=float("nan")) + + +def test_main_rejects_non_finite_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: from scripts.check_benchmark_regression import main results = tmp_path / "results.json" @@ -273,5 +284,5 @@ def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixtur _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) - assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2 - assert "--stale-floor must be between 0 and 1" in capsys.readouterr().err + assert main([str(results), str(baselines), "--threshold", "inf"]) == 2 + assert "threshold must be finite" in capsys.readouterr().err diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py new file mode 100644 index 0000000..6f5ab89 --- /dev/null +++ b/tests/test_reduce_baselines.py @@ -0,0 +1,132 @@ +"""Tests for scripts/reduce_baselines.py.""" + +from __future__ import annotations + +import json + +import pytest + +from scripts.reduce_baselines import reduce_baselines +from scripts.check_benchmark_regression import BenchmarkDataError + + +def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None: + path.write_text( + json.dumps( + { + "machine_info": {"system": machine}, + "benchmarks": benchmarks, + }, + indent=2, + ), + encoding="utf-8", + ) + + +def test_reduce_baselines_groups_and_slack(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_list_workspace_projects_nocache[composers-50]", + "group": "parse", + "stats": {"mean": 0.05}, + }, + { + "name": "test_post_export_zip[composers-10]", + "group": "export", + "stats": {"mean": 0.01}, + }, + { + "name": "test_search_full_corpus", + "group": "search", + "stats": {"mean": 0.04}, + }, + { + "name": "test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0001}, + }, + ], + ) + + output = reduce_baselines(raw, out, slack=1.5, source="ubuntu-latest-ci") + data = json.loads(out.read_text(encoding="utf-8")) + groups = data["groups"] + + assert groups["parse"]["test_list_workspace_projects_nocache[composers-50]"] == pytest.approx(0.075) + assert groups["export"]["test_post_export_zip[composers-10]"] == pytest.approx(0.015) + assert groups["search"]["test_search_full_corpus"] == pytest.approx(0.06) + assert data["machine"] == "Linux" + assert "ubuntu-latest CI benchmark-results.json" in data["_note"] + assert "1.5× slack" in data["_note"] + assert output["groups"] == groups + + +def test_reduce_baselines_local_source_note(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0001}, + }, + ], + machine="Windows", + ) + + reduce_baselines(raw, out, source="local") + data = json.loads(out.read_text(encoding="utf-8")) + assert "local benchmark-results.json" in data["_note"] + assert data["machine"] == "Windows" + + +def test_reduce_baselines_rejects_unknown_group(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_cache_only", + "group": "cache", + "stats": {"mean": 0.001}, + }, + ], + ) + + with pytest.raises(BenchmarkDataError, match="unknown group 'cache'"): + reduce_baselines(raw, out) + + +def test_reduce_baselines_rejects_missing_group(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_no_group", + "stats": {"mean": 0.001}, + }, + ], + ) + + with pytest.raises(BenchmarkDataError, match="missing required 'group'"): + reduce_baselines(raw, out) + + +def test_positive_float_rejects_non_finite() -> None: + import argparse + + from scripts.reduce_baselines import _positive_float + + with pytest.raises(argparse.ArgumentTypeError, match="finite"): + _positive_float("nan") + with pytest.raises(argparse.ArgumentTypeError, match="finite"): + _positive_float("inf") From 29ad5a7c475e24126b7cbb6948730bf64c3fe451 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 05:57:11 +0800 Subject: [PATCH 4/7] fix(bench): harden reduce_baselines and fix Python 3.10 CI --- benchmarks/baselines.json | 2 +- scripts/reduce_baselines.py | 11 ++++++++--- tests/test_reduce_baselines.py | 26 +++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 9302877..3a4d413 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,5 +1,5 @@ { - "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5\u00d7 slack at generation time. Refresh after intentional speedups via reduce_baselines.py.", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.", "updated": "2026-06-25T21:48:35Z", "machine": "Linux", "groups": { diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index 6264018..78bfbd1 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -6,7 +6,7 @@ import json import math import sys -from datetime import UTC, datetime +from datetime import datetime, timezone from pathlib import Path _REPO_ROOT = Path(__file__).resolve().parent.parent @@ -75,6 +75,11 @@ def reduce_baselines( f"{path} benchmarks[{index}] ({bench_name!r}) has unknown group {group!r}; " f"expected one of {GATED_GROUPS}" ) + if bench_name in groups[group]: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] ({raw_name!r}) duplicates normalized " + f"benchmark {group!r}/{bench_name!r}" + ) groups[group][bench_name] = mean * slack excluded = ", ".join(sorted(EXCLUDED_FROM_GATE)) @@ -83,7 +88,7 @@ def reduce_baselines( if excluded else "" ) - slack_note = f" Values multiplied by {slack}× slack at generation time." if slack != 1.0 else "" + slack_note = f" Values multiplied by {slack}x slack at generation time." if slack != 1.0 else "" machine_info = raw.get("machine_info") machine = machine_info.get("system") if isinstance(machine_info, dict) else None source_labels = { @@ -97,7 +102,7 @@ def reduce_baselines( f"{slack_note}{excluded_note} " "Refresh after intentional speedups via reduce_baselines.py." ), - "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), + "updated": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), "machine": machine, "groups": groups, } diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py index 6f5ab89..9cc24e4 100644 --- a/tests/test_reduce_baselines.py +++ b/tests/test_reduce_baselines.py @@ -59,9 +59,10 @@ def test_reduce_baselines_groups_and_slack(tmp_path) -> None: assert groups["parse"]["test_list_workspace_projects_nocache[composers-50]"] == pytest.approx(0.075) assert groups["export"]["test_post_export_zip[composers-10]"] == pytest.approx(0.015) assert groups["search"]["test_search_full_corpus"] == pytest.approx(0.06) + assert groups["summary-cache"]["test_summary_cache_lookup[hit]"] == pytest.approx(0.00015) assert data["machine"] == "Linux" assert "ubuntu-latest CI benchmark-results.json" in data["_note"] - assert "1.5× slack" in data["_note"] + assert "1.5x slack" in data["_note"] assert output["groups"] == groups @@ -121,6 +122,29 @@ def test_reduce_baselines_rejects_missing_group(tmp_path) -> None: reduce_baselines(raw, out) +def test_reduce_baselines_rejects_duplicate_normalized_name(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0001}, + }, + { + "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0002}, + }, + ], + ) + + with pytest.raises(BenchmarkDataError, match="duplicates normalized"): + reduce_baselines(raw, out) + + def test_positive_float_rejects_non_finite() -> None: import argparse From c5c2066c775dacc87b11f7615270d6203cc3857b Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 07:11:19 +0800 Subject: [PATCH 5/7] fix(bench): address bradjin8 review on search, cache, and baselines --- .gitignore | 1 + Makefile | 4 +- benchmarks/README.md | 18 ++++-- benchmarks/baselines.json | 9 ++- tests/benchmarks/conftest.py | 60 ++++++++++++++++++-- tests/benchmarks/test_search_bench.py | 40 ++++++++++--- tests/benchmarks/test_summary_cache_bench.py | 22 +++++++ 7 files changed, 131 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 0712397..0f8d574 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,5 @@ coverage.xml .hypothesis/ benchmark-results.json benchmarks/_raw.json +benchmarks/_merged.json benchmarks/_ci/ diff --git a/Makefile b/Makefile index a1f7607..599d5a1 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,9 @@ seed-baselines-local: @echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2 python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= - python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json --slack 1.5 --source local + python -c "import os, subprocess, sys; \ + cmd = [sys.executable, 'scripts/reduce_baselines.py', 'benchmarks/_raw.json', 'benchmarks/baselines.json', '--slack', '1.5', '--source', 'local']; \ + (subprocess.run(cmd, check=True), print('Updated benchmarks/baselines.json', file=sys.stderr)) if os.environ.get('FORCE') == '1' else print('Wrote benchmarks/_raw.json only. Set FORCE=1 to overwrite benchmarks/baselines.json.', file=sys.stderr)" # Deprecated alias — kept for muscle memory; see seed-baselines-local warning above. update-baselines: seed-baselines-local diff --git a/benchmarks/README.md b/benchmarks/README.md index 0f4e23a..79fce2a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -17,12 +17,16 @@ pytest tests/benchmarks/ --benchmark-only -o addopts= -v | Group | What | |-------|------| | parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers | -| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora | -| search | `GET /api/search` over a 50-composer synthetic corpus | -| summary-cache | cache lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup | +| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora (capped at 50 for CI runtime; parse goes to 200) | +| search | `GET /api/search` over a 50-composer corpus — **live-scan** (`test_search_full_corpus_live_scan`, `NO_SEARCH_INDEX=1`) and **FTS index** (`test_search_full_corpus_indexed`, pre-built index) | +| summary-cache | projects lookup (hit/miss), composer-map lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup | Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency. +### Adding a benchmark group + +Every `@pytest.mark.benchmark(group="...")` name must appear in `GATED_GROUPS` inside `scripts/reduce_baselines.py`. Otherwise `reduce_baselines.py` fails at refresh time with an unknown-group error. Update both the test marker and `GATED_GROUPS` when introducing a new group. + ## CI gate The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. @@ -35,18 +39,22 @@ The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suit Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`. +Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, raise `--slack` for those entries or reintroduce targeted exclusions in `EXCLUDED_FROM_GATE`. + ## Refresh baselines After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible: ```bash -python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 +python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 --source ubuntu-latest-ci ``` For a quick local snapshot only (may not match CI timings): ```bash make seed-baselines-local +# writes benchmarks/_raw.json only; does not overwrite benchmarks/baselines.json +make seed-baselines-local FORCE=1 # also runs reduce_baselines into benchmarks/baselines.json ``` `make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew. @@ -56,5 +64,5 @@ make seed-baselines-local | Target | Purpose | |--------|---------| | `make check-benchmarks` | Run suite + regression gate locally | -| `make seed-baselines-local` | Capture local timings into `benchmarks/baselines.json` (with slack) | +| `make seed-baselines-local` | Capture local timings to `benchmarks/_raw.json` (use `FORCE=1` to update `baselines.json`) | | `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` | diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 3a4d413..9afa18d 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,6 +1,6 @@ { "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.", - "updated": "2026-06-25T21:48:35Z", + "updated": "2026-06-25T22:57:33Z", "machine": "Linux", "groups": { "parse": { @@ -13,7 +13,8 @@ "test_post_export_zip[composers-50]": 0.04482855966665985 }, "search": { - "test_search_full_corpus": 0.047164217833331655 + "test_search_full_corpus_live_scan": 0.047164217833331655, + "test_search_full_corpus_indexed": 0.05494209932945618 }, "summary-cache": { "test_summary_cache_lookup[hit]": 9.224067718099102e-05, @@ -23,7 +24,9 @@ "test_fingerprint_workspace_entries[200]": 0.03069810573000666, "test_summary_cache_round_trip": 0.0004966099535917549, "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405, - "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601 + "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601, + "test_composer_map_cache_lookup[hit]": 8.074544668606364e-05, + "test_composer_map_cache_lookup[miss]": 9.495690246481993e-05 } } } diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index 1cbf3b6..cfc133f 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -122,11 +122,21 @@ def _make_bench_flask_client( monkeypatch: pytest.MonkeyPatch, *, state_subdir: str = ".cursor-chat-browser", + live_scan_search: bool = False, ) -> FlaskClient: - """Flask test client with env + export state patched for synthetic storage.""" + """Flask test client with env + export state patched for synthetic storage. + + When *live_scan_search* is True, set ``CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1`` so + ``/api/search`` measures the live-scan fallback. Otherwise the FTS index path + from #113 may be used when an index is built (see indexed search fixtures). + """ monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"]) monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"]) - monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + if live_scan_search: + monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + else: + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", raising=False) + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NOCACHE", raising=False) state_dir = tmp_path / state_subdir state_dir.mkdir() monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) @@ -138,7 +148,12 @@ def _make_bench_flask_client( @pytest.fixture def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: - """Redirect summary-cache files to an isolated temp directory.""" + """Redirect summary-cache files to an isolated temp directory. + + Tab-summary files use ``CACHE_DIR`` + hashed filenames only (see + ``summary_cache._tab_summaries_path``); they do not use + ``PROJECTS_CACHE_FILE`` or ``COMPOSER_MAP_CACHE_FILE``. + """ cache_dir = tmp_path / "cache" cache_dir.mkdir() monkeypatch.setattr(summary_cache, "CACHE_DIR", cache_dir) @@ -212,7 +227,7 @@ def bench_env( @pytest.fixture def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient: """Flask test client bound to synthetic bench storage.""" - return _make_bench_flask_client(bench_env, tmp_path, monkeypatch) + return _make_bench_flask_client(bench_env, tmp_path, monkeypatch, live_scan_search=True) @pytest.fixture @@ -220,11 +235,46 @@ def bench_client_search_corpus( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> FlaskClient: - """Flask client over a fixed 50-composer corpus for search benchmarks.""" + """Flask client over a fixed 50-composer corpus (live-scan search path).""" storage = build_bench_storage(tmp_path / "search_storage", 50) return _make_bench_flask_client( storage, tmp_path, monkeypatch, state_subdir=".cursor-chat-browser-search", + live_scan_search=True, + ) + + +@pytest.fixture +def bench_client_search_corpus_indexed( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> FlaskClient: + """Flask client with FTS index built for the 50-composer search corpus.""" + from services.search_index import build_search_index + + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", raising=False) + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NOCACHE", raising=False) + + storage = build_bench_storage(tmp_path / "search_indexed_storage", 50) + cache_dir = tmp_path / "search_index_cache" + cache_dir.mkdir() + monkeypatch.setattr("services.search_index.CACHE_DIR", cache_dir) + monkeypatch.setattr( + "services.search_index.SEARCH_INDEX_POINTER_FILE", + cache_dir / "search_index.active", + ) + monkeypatch.setattr( + "services.search_index.SEARCH_INDEX_FILE", + cache_dir / "search_index.sqlite", + ) + built = build_search_index(storage["workspace_path"], [], force=True) + assert built is True + return _make_bench_flask_client( + storage, + tmp_path, + monkeypatch, + state_subdir=".cursor-chat-browser-search-indexed", + live_scan_search=False, ) diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py index e7df914..33eee5b 100644 --- a/tests/benchmarks/test_search_bench.py +++ b/tests/benchmarks/test_search_bench.py @@ -8,19 +8,41 @@ from tests.benchmarks.constants import BENCH_SEARCH_TERM +def _search_url() -> str: + return f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1" + + +def _assert_search_response(response: object) -> None: + assert response.status_code == 200 # type: ignore[attr-defined] + body = response.get_json() # type: ignore[attr-defined] + assert isinstance(body, dict) + results = body.get("results") + assert isinstance(results, list) and len(results) > 0 + + @pytest.mark.benchmark(group="search") -def test_search_full_corpus( +def test_search_full_corpus_live_scan( benchmark, bench_client_search_corpus: FlaskClient, ) -> None: + """Live-scan fallback only (``CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1``).""" + def _run() -> object: - return bench_client_search_corpus.get( - f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1", - ) + return bench_client_search_corpus.get(_search_url()) response = benchmark(_run) - assert response.status_code == 200 - body = response.get_json() - assert isinstance(body, dict) - results = body.get("results") - assert isinstance(results, list) and len(results) > 0 + _assert_search_response(response) + + +@pytest.mark.benchmark(group="search") +def test_search_full_corpus_indexed( + benchmark, + bench_client_search_corpus_indexed: FlaskClient, +) -> None: + """FTS index path (#113) with pre-built ``search_index.sqlite``.""" + + def _run() -> object: + return bench_client_search_corpus_indexed.get(_search_url()) + + response = benchmark(_run) + _assert_search_response(response) diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py index 4a259c8..16552d2 100644 --- a/tests/benchmarks/test_summary_cache_bench.py +++ b/tests/benchmarks/test_summary_cache_bench.py @@ -9,8 +9,10 @@ from services.summary_cache import ( fingerprint_workspace_storage, + get_cached_composer_id_to_ws, get_cached_projects, get_cached_tab_summaries, + set_cached_composer_id_to_ws, set_cached_projects, set_cached_tab_summaries, ) @@ -39,6 +41,26 @@ def test_summary_cache_lookup( assert result is None +@pytest.mark.benchmark(group="summary-cache") +@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"]) +def test_composer_map_cache_lookup( + benchmark, + mode: Literal["hit", "miss"], + summary_cache_dir: Path, + workspace_fingerprint: dict[str, Any], + stale_fingerprint: dict[str, Any], +) -> None: + """Time ``get_cached_composer_id_to_ws`` hit/miss (fingerprint mismatch on miss).""" + mapping = {"cmp_0000": "ws_0000"} + set_cached_composer_id_to_ws(workspace_fingerprint, mapping) + lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint + result = benchmark(get_cached_composer_id_to_ws, lookup_fp) + if mode == "hit": + assert result == mapping + else: + assert result is None + + @pytest.mark.benchmark(group="summary-cache") @pytest.mark.parametrize( "synthetic_workspace", From 5a51d7005bf8bcf79230954902b096199d128675 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 07:26:29 +0800 Subject: [PATCH 6/7] chore(bench): refresh baselines from ubuntu CI run 28206463463 --- benchmarks/baselines.json | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 9afa18d..8472691 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,32 +1,32 @@ { "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.", - "updated": "2026-06-25T22:57:33Z", + "updated": "2026-06-25T23:21:29Z", "machine": "Linux", "groups": { "parse": { - "test_list_workspace_projects_nocache[composers-10]": 0.01702312019643009, - "test_list_workspace_projects_nocache[composers-50]": 0.07538331990000699, - "test_list_workspace_projects_nocache[composers-200]": 0.251991555999993 + "test_list_workspace_projects_nocache[composers-10]": 0.012957852608107466, + "test_list_workspace_projects_nocache[composers-50]": 0.05577718626923036, + "test_list_workspace_projects_nocache[composers-200]": 0.1878804727500003 }, "export": { - "test_post_export_zip[composers-10]": 0.0112034034344294, - "test_post_export_zip[composers-50]": 0.04482855966665985 + "test_post_export_zip[composers-10]": 0.009724031427631593, + "test_post_export_zip[composers-50]": 0.041050375020001154 }, "search": { - "test_search_full_corpus_live_scan": 0.047164217833331655, - "test_search_full_corpus_indexed": 0.05494209932945618 + "test_search_full_corpus_live_scan": 0.03440949781249936, + "test_search_full_corpus_indexed": 0.04301802726315884 }, "summary-cache": { - "test_summary_cache_lookup[hit]": 9.224067718099102e-05, - "test_summary_cache_lookup[miss]": 9.128770315496628e-05, - "test_fingerprint_workspace_entries[10]": 0.0024789120309553535, - "test_fingerprint_workspace_entries[50]": 0.010901568931818675, - "test_fingerprint_workspace_entries[200]": 0.03069810573000666, - "test_summary_cache_round_trip": 0.0004966099535917549, - "test_tab_summary_cache_lookup[hit]": 0.00010487297799045405, - "test_tab_summary_cache_lookup[miss]": 0.00010309520517204601, - "test_composer_map_cache_lookup[hit]": 8.074544668606364e-05, - "test_composer_map_cache_lookup[miss]": 9.495690246481993e-05 + "test_summary_cache_lookup[hit]": 5.7807392057047035e-05, + "test_summary_cache_lookup[miss]": 5.6223937183791584e-05, + "test_composer_map_cache_lookup[hit]": 5.551344090189019e-05, + "test_composer_map_cache_lookup[miss]": 5.490079494266499e-05, + "test_fingerprint_workspace_entries[10]": 0.0019021180608754708, + "test_fingerprint_workspace_entries[50]": 0.008100319212766178, + "test_fingerprint_workspace_entries[200]": 0.0235079150476191, + "test_summary_cache_round_trip": 0.001704988098923577, + "test_tab_summary_cache_lookup[hit]": 6.058533512024974e-05, + "test_tab_summary_cache_lookup[miss]": 6.130049047937722e-05 } } } From 6fed47ed1a58aa29ba4816b1cc920fc5ba5e4100 Mon Sep 17 00:00:00 2001 From: chen Date: Fri, 26 Jun 2026 07:37:18 +0800 Subject: [PATCH 7/7] bench: exclude round_trip from gate; refresh baselines from latest CI test_summary_cache_round_trip calls set/get each round; OS page-cache state causes 3-5x variation between CI runs (0.000314s vs 0.001137s). Add to EXCLUDED_FROM_GATE with comment; baseline kept for observation. Regenerate baselines.json from run 28206913751 (ubuntu-latest, 1.5x slack). Update README to document the exclusion and rationale. Co-authored-by: Cursor --- benchmarks/README.md | 6 +++-- benchmarks/baselines.json | 38 +++++++++++++-------------- scripts/check_benchmark_regression.py | 13 +++++++-- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 79fce2a..e2e0064 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -35,11 +35,13 @@ The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suit - **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups) - **Fail** when a gated baseline name has no current result - **Warn** for benchmarks without a baseline entry -- All benchmarks listed in `baselines.json` are gated (no exclusion list) +- All benchmarks listed in `baselines.json` are gated unless named in `EXCLUDED_FROM_GATE` in `scripts/check_benchmark_regression.py` Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`. -Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, raise `--slack` for those entries or reintroduce targeted exclusions in `EXCLUDED_FROM_GATE`. +Sub-millisecond benches (e.g. `test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, raise `--slack` for those entries or add targeted exclusions in `EXCLUDED_FROM_GATE`. + +`test_summary_cache_round_trip` is intentionally excluded from the gate: it calls `set_cached_projects` (file write) + `get_cached_projects` (file read) each round, so OS page-cache state on shared runners causes 3–5x variation between consecutive CI runs. The baseline entry is kept for observation only. ## Refresh baselines diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 8472691..1f3a5c0 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,32 +1,32 @@ { - "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Refresh after intentional speedups via reduce_baselines.py.", - "updated": "2026-06-25T23:21:29Z", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Excluded from gate (recorded for reference): test_summary_cache_round_trip. Refresh after intentional speedups via reduce_baselines.py.", + "updated": "2026-06-25T23:36:11Z", "machine": "Linux", "groups": { "parse": { - "test_list_workspace_projects_nocache[composers-10]": 0.012957852608107466, - "test_list_workspace_projects_nocache[composers-50]": 0.05577718626923036, - "test_list_workspace_projects_nocache[composers-200]": 0.1878804727500003 + "test_list_workspace_projects_nocache[composers-10]": 0.016421750017237738, + "test_list_workspace_projects_nocache[composers-50]": 0.07185380692856874, + "test_list_workspace_projects_nocache[composers-200]": 0.2388664538571439 }, "export": { - "test_post_export_zip[composers-10]": 0.009724031427631593, - "test_post_export_zip[composers-50]": 0.041050375020001154 + "test_post_export_zip[composers-10]": 0.010621589857140498, + "test_post_export_zip[composers-50]": 0.03968703356250458 }, "search": { - "test_search_full_corpus_live_scan": 0.03440949781249936, - "test_search_full_corpus_indexed": 0.04301802726315884 + "test_search_full_corpus_live_scan": 0.04461661563157736, + "test_search_full_corpus_indexed": 0.05512249660713918 }, "summary-cache": { - "test_summary_cache_lookup[hit]": 5.7807392057047035e-05, - "test_summary_cache_lookup[miss]": 5.6223937183791584e-05, - "test_composer_map_cache_lookup[hit]": 5.551344090189019e-05, - "test_composer_map_cache_lookup[miss]": 5.490079494266499e-05, - "test_fingerprint_workspace_entries[10]": 0.0019021180608754708, - "test_fingerprint_workspace_entries[50]": 0.008100319212766178, - "test_fingerprint_workspace_entries[200]": 0.0235079150476191, - "test_summary_cache_round_trip": 0.001704988098923577, - "test_tab_summary_cache_lookup[hit]": 6.058533512024974e-05, - "test_tab_summary_cache_lookup[miss]": 6.130049047937722e-05 + "test_summary_cache_lookup[hit]": 7.249851343825762e-05, + "test_summary_cache_lookup[miss]": 7.193702095574013e-05, + "test_composer_map_cache_lookup[hit]": 7.151645086519804e-05, + "test_composer_map_cache_lookup[miss]": 7.112598943352091e-05, + "test_fingerprint_workspace_entries[10]": 0.0024127972424549185, + "test_fingerprint_workspace_entries[50]": 0.010196820941858245, + "test_fingerprint_workspace_entries[200]": 0.029070524094341035, + "test_summary_cache_round_trip": 0.0004703680658560554, + "test_tab_summary_cache_lookup[hit]": 7.844850562859133e-05, + "test_tab_summary_cache_lookup[miss]": 7.843399021512e-05 } } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index e30bee1..6655460 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -11,8 +11,17 @@ THRESHOLD = 1.20 STALE_FLOOR = 0.50 -# Benchmarks gated via baselines.json; empty set means all baseline entries are checked. -EXCLUDED_FROM_GATE: frozenset[str] = frozenset() +# Benchmarks recorded in baselines.json but excluded from the regression gate. +# Use sparingly — only for benches whose timing is inherently noisy across CI runs +# (e.g. file I/O operations that depend on OS page-cache state). +EXCLUDED_FROM_GATE: frozenset[str] = frozenset( + { + # round_trip calls set_cached_projects (file write) + get_cached_projects (file read) + # each round. OS page-cache state on shared runners causes 3–5x variation between + # consecutive CI runs, making this ungatable with any reasonable slack. + "test_summary_cache_round_trip", + } +) class BenchmarkDataError(ValueError):