diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 158b598..3a34c3d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -115,7 +115,7 @@ jobs: # exercise Flask routes via app.test_client(). Only listed files — not # `pytest tests/` — to avoid re-collecting unittest.TestCase classes above. # -o addopts= avoids inheriting benchmark-only options from pyproject.toml. - run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py -v --tb=short -o addopts= + run: python -m pytest tests/test_api_search.py tests/test_api_workspaces.py tests/test_api_export.py tests/test_pdf_export.py tests/test_search_helpers.py tests/test_check_benchmark_regression.py tests/test_reduce_baselines.py -v --tb=short -o addopts= # ── PyInstaller desktop build (Windows only, once per workflow) ──────── # Closes #44. Builds the onedir bundle and smoke-tests --help so the @@ -215,7 +215,7 @@ jobs: --redact \ --exit-code 1 - # ── Performance benchmarks: summary cache (issue #115) ───────────────────── + # ── Performance benchmarks: unified suite (issues #115, #110) ────────────── benchmarks: name: Performance benchmarks (gated) needs: [unittest] @@ -236,7 +236,7 @@ jobs: python -m pip install -r requirements-lock.txt python -m pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' - - name: Run summary-cache benchmarks + - name: Run benchmark suite run: > python -m pytest tests/benchmarks/ --benchmark-only diff --git a/.gitignore b/.gitignore index f204306..0f8d574 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,5 @@ coverage.xml .hypothesis/ benchmark-results.json benchmarks/_raw.json +benchmarks/_merged.json +benchmarks/_ci/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..599d5a1 --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ +.PHONY: seed-baselines-local update-baselines check-benchmarks 
clean-benchmark-artifacts + +# WARNING: captures timings on THIS machine. Production baselines must match ubuntu-latest CI. +# Prefer downloading benchmark-results.json from a CI artifact, then: +# python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 +seed-baselines-local: + @echo "WARNING: seed-baselines-local uses this host's timings; CI gates on ubuntu-latest." >&2 + python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= + python -c "import os, subprocess, sys; \ + cmd = [sys.executable, 'scripts/reduce_baselines.py', 'benchmarks/_raw.json', 'benchmarks/baselines.json', '--slack', '1.5', '--source', 'local']; \ + (subprocess.run(cmd, check=True), print('Updated benchmarks/baselines.json', file=sys.stderr)) if os.environ.get('FORCE') == '1' else print('Wrote benchmarks/_raw.json only. Set FORCE=1 to overwrite benchmarks/baselines.json.', file=sys.stderr)" + +# Deprecated alias — kept for muscle memory; see seed-baselines-local warning above. +update-baselines: seed-baselines-local + +check-benchmarks: + python -m pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= + python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + +clean-benchmark-artifacts: + python -c "import pathlib; [p.unlink(missing_ok=True) for p in (pathlib.Path('benchmarks/_raw.json'), pathlib.Path('benchmark-results.json'))]" diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..e2e0064 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,70 @@ +# Performance benchmarks + +Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate. + +Repeatable local measurements for workspace listing, export, search, and summary-cache hot paths. 
+ +## Run locally + +```bash +pip install -r requirements-lock.txt +pip install 'pytest>=8,<9' 'pytest-benchmark==4.0.0' +pytest tests/benchmarks/ --benchmark-only -o addopts= -v +``` + +## Scenarios + +| Group | What | +|-------|------| +| parse | `list_workspace_projects(..., nocache=True)` over 10 / 50 / 200 synthetic composers | +| export | `POST /api/export` (ZIP) over 10 / 50 composer corpora (capped at 50 for CI runtime; parse goes to 200) | +| search | `GET /api/search` over a 50-composer corpus — **live-scan** (`test_search_full_corpus_live_scan`, `NO_SEARCH_INDEX=1`) and **FTS index** (`test_search_full_corpus_indexed`, pre-built index) | +| summary-cache | projects lookup (hit/miss), composer-map lookup (hit/miss), fingerprint (10/50/200), round-trip, tab-summary lookup | + +Synthetic corpora are built in `tests/benchmarks/conftest.py` — no real Cursor storage dependency. + +### Adding a benchmark group + +Every `@pytest.mark.benchmark(group="...")` name must appear in `GATED_GROUPS` inside `scripts/reduce_baselines.py`. Otherwise `reduce_baselines.py` fails at refresh time with an unknown-group error. Update both the test marker and `GATED_GROUPS` when introducing a new group. + +## CI gate + +The `benchmarks` job on **ubuntu-latest** runs the full `tests/benchmarks/` suite (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. + +- **Fail** when a gated mean exceeds its baseline by **>20%** +- **Fail** when a gated mean is **<50%** of baseline (stale — refresh after intentional speedups) +- **Fail** when a gated baseline name has no current result +- **Warn** for benchmarks without a baseline entry +- All benchmarks listed in `baselines.json` are gated unless named in `EXCLUDED_FROM_GATE` in `scripts/check_benchmark_regression.py` + +Pinned runner: `ubuntu-latest`, `--benchmark-min-rounds=5`. + +Sub-millisecond benches (e.g. 
`test_summary_cache_lookup`, `test_composer_map_cache_lookup`) can be high-variance on shared runners. If the gate becomes flaky, re-seed with a larger `--slack` (a single multiplier that `reduce_baselines.py` applies to every baseline entry), hand-edit the affected entries in `baselines.json`, or add targeted exclusions in `EXCLUDED_FROM_GATE`. + +`test_summary_cache_round_trip` is intentionally excluded from the gate: it calls `set_cached_projects` (file write) + `get_cached_projects` (file read) each round, so OS page-cache state on shared runners causes 3–5x variation between consecutive CI runs. The baseline entry is kept for observation only. + +## Refresh baselines + +After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job). Download `benchmark-results.json` from a CI artifact when possible: + +```bash +python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json --slack 1.5 --source ubuntu-latest-ci +``` + +For a quick local snapshot only (may not match CI timings): + +```bash +make seed-baselines-local +# writes benchmarks/_raw.json only; does not overwrite benchmarks/baselines.json +make seed-baselines-local FORCE=1 # also runs reduce_baselines into benchmarks/baselines.json +``` + +`make update-baselines` is a deprecated alias for `seed-baselines-local`. Do not commit baselines from macOS/Windows unless you accept cross-OS gate skew. + +## Makefile targets + +| Target | Purpose | +|--------|---------| +| `make check-benchmarks` | Run suite + regression gate locally | +| `make seed-baselines-local` | Capture local timings to `benchmarks/_raw.json` (use `FORCE=1` to update `baselines.json`) | +| `make clean-benchmark-artifacts` | Remove `benchmark-results.json` and `benchmarks/_raw.json` | diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 131b638..1f3a5c0 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,15 +1,32 @@ { - "_note": "Gated means from ubuntu-latest CI benchmark-results.json (PR #120, run 28123677675). 
Refresh after intentional perf changes: download benchmark-results.json from the CI artifacts job, then `python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json` (re-seed with reduce_baselines or edit means). Local capture: `pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=` on ubuntu-latest.", - "updated": "2026-06-24T19:20:27Z", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json. Values multiplied by 1.5x slack at generation time. Excluded from gate (recorded for reference): test_summary_cache_round_trip. Refresh after intentional speedups via reduce_baselines.py.", + "updated": "2026-06-25T23:36:11Z", "machine": "Linux", "groups": { + "parse": { + "test_list_workspace_projects_nocache[composers-10]": 0.016421750017237738, + "test_list_workspace_projects_nocache[composers-50]": 0.07185380692856874, + "test_list_workspace_projects_nocache[composers-200]": 0.2388664538571439 + }, + "export": { + "test_post_export_zip[composers-10]": 0.010621589857140498, + "test_post_export_zip[composers-50]": 0.03968703356250458 + }, + "search": { + "test_search_full_corpus_live_scan": 0.04461661563157736, + "test_search_full_corpus_indexed": 0.05512249660713918 + }, "summary-cache": { - "test_summary_cache_hit": 6.3e-05, - "test_summary_cache_miss": 6.3e-05, - "test_fingerprint_workspace_entries[10]": 0.001844, - "test_fingerprint_workspace_entries[50]": 0.007759, - "test_fingerprint_workspace_entries[200]": 0.022231, - "test_summary_cache_round_trip": 0.000351 + "test_summary_cache_lookup[hit]": 7.249851343825762e-05, + "test_summary_cache_lookup[miss]": 7.193702095574013e-05, + "test_composer_map_cache_lookup[hit]": 7.151645086519804e-05, + "test_composer_map_cache_lookup[miss]": 7.112598943352091e-05, + "test_fingerprint_workspace_entries[10]": 0.0024127972424549185, + "test_fingerprint_workspace_entries[50]": 0.010196820941858245, + "test_fingerprint_workspace_entries[200]": 
0.029070524094341035, + "test_summary_cache_round_trip": 0.0004703680658560554, + "test_tab_summary_cache_lookup[hit]": 7.844850562859133e-05, + "test_tab_summary_cache_lookup[miss]": 7.843399021512e-05 } } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index d2fc79c..6655460 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -4,10 +4,24 @@ import argparse import json +import math import sys from pathlib import Path THRESHOLD = 1.20 +STALE_FLOOR = 0.50 + +# Benchmarks recorded in baselines.json but excluded from the regression gate. +# Use sparingly — only for benches whose timing is inherently noisy across CI runs +# (e.g. file I/O operations that depend on OS page-cache state). +EXCLUDED_FROM_GATE: frozenset[str] = frozenset( + { + # round_trip calls set_cached_projects (file write) + get_cached_projects (file read) + # each round. OS page-cache state on shared runners causes 3–5x variation between + # consecutive CI runs, making this ungatable with any reasonable slack. 
+ "test_summary_cache_round_trip", + } +) class BenchmarkDataError(ValueError): @@ -97,19 +111,35 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: return means +def _validate_gate_ratios(threshold: float, stale_floor: float) -> None: + if not math.isfinite(threshold): + raise BenchmarkDataError("threshold must be finite") + if threshold <= 1: + raise BenchmarkDataError("threshold must be greater than 1") + if not math.isfinite(stale_floor): + raise BenchmarkDataError("stale_floor must be finite") + if not 0 < stale_floor < 1: + raise BenchmarkDataError("stale_floor must be between 0 and 1 (exclusive)") + + def check_regression( results_path: str | Path, baselines_path: str | Path, *, threshold: float = THRESHOLD, + stale_floor: float = STALE_FLOOR, ) -> int: - """Return 0 when within threshold; 1 when any gated benchmark regresses.""" + """Return 0 when within threshold; 1 when any gated benchmark regresses or is stale.""" + _validate_gate_ratios(threshold, stale_floor) flat = load_results(results_path) baseline_means = load_baseline_means(baselines_path) failures: list[str] = [] + stale: list[str] = [] missing: list[str] = [] for name, base in baseline_means.items(): + if name in EXCLUDED_FROM_GATE: + continue cur = flat.get(name) if cur is None: print(f"FAIL: no current result for gated baseline {name!r}") @@ -119,20 +149,32 @@ def check_regression( print(f"WARN: baseline for {name!r} is zero; skipping ratio check") continue ratio = cur / base - tag = "FAIL" if ratio > threshold else "ok" - print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") if ratio > threshold: + tag = "FAIL" failures.append(name) + elif ratio < stale_floor: + tag = "STALE" + stale.append(name) + else: + tag = "ok" + print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") for name in flat: + if name in EXCLUDED_FROM_GATE: + continue if name not in baseline_means: print(f"WARN: {name!r} has no baseline yet; not gated") if failures: 
print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}") + if stale: + print( + f"\nSTALE: {len(stale)} benchmark(s) are faster than {stale_floor:.0%} of baseline " + "(refresh baselines after intentional speedups)" + ) if missing: print(f"\nMISSING: {len(missing)} gated benchmark(s) absent from current results") - if failures or missing: + if failures or stale or missing: return 1 return 0 @@ -147,12 +189,19 @@ def main(argv: list[str] | None = None) -> int: default=THRESHOLD, help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)", ) + parser.add_argument( + "--stale-floor", + type=float, + default=STALE_FLOOR, + help="fail when current mean is below this fraction of baseline (default: 0.50)", + ) args = parser.parse_args(argv) try: return check_regression( args.results_path, args.baselines_path, threshold=args.threshold, + stale_floor=args.stale_floor, ) except BenchmarkDataError as exc: print(f"ERROR: {exc}", file=sys.stderr) diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py new file mode 100644 index 0000000..78bfbd1 --- /dev/null +++ b/scripts/reduce_baselines.py @@ -0,0 +1,142 @@ +"""Reduce pytest-benchmark JSON into benchmarks/baselines.json.""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +from datetime import datetime, timezone +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parent.parent +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +from scripts.check_benchmark_regression import ( + EXCLUDED_FROM_GATE, + BenchmarkDataError, + normalize_benchmark_name, +) + +GATED_GROUPS = ("parse", "export", "search", "summary-cache") + + +def _positive_float(value: str) -> float: + parsed = float(value) + if not math.isfinite(parsed): + raise argparse.ArgumentTypeError("slack must be a finite number") + if parsed <= 0: + raise argparse.ArgumentTypeError("slack must be greater than zero") + return 
parsed + + +def reduce_baselines( + raw_path: str | Path, + out_path: str | Path, + *, + slack: float = 1.0, + source: str = "local", +) -> dict[str, object]: + path = Path(raw_path) + try: + raw = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc + except OSError as exc: + raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc + + try: + entries = raw["benchmarks"] + except (KeyError, TypeError) as exc: + raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc + if not isinstance(entries, list): + raise BenchmarkDataError(f"{path} 'benchmarks' must be an array") + + groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS} + for index, entry in enumerate(entries): + if not isinstance(entry, dict): + raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") + try: + raw_name = entry["name"] + mean = float(entry["stats"]["mean"]) + except (KeyError, TypeError, ValueError) as exc: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" + ) from exc + bench_name = normalize_benchmark_name(str(raw_name)) + group = entry.get("group") + if group is None: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] ({bench_name!r}) missing required 'group'" + ) + if group not in GATED_GROUPS: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] ({bench_name!r}) has unknown group {group!r}; " + f"expected one of {GATED_GROUPS}" + ) + if bench_name in groups[group]: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] ({raw_name!r}) duplicates normalized " + f"benchmark {group!r}/{bench_name!r}" + ) + groups[group][bench_name] = mean * slack + + excluded = ", ".join(sorted(EXCLUDED_FROM_GATE)) + excluded_note = ( + f" Excluded from gate (recorded for reference): {excluded}." 
+ if excluded + else "" + ) + slack_note = f" Values multiplied by {slack}x slack at generation time." if slack != 1.0 else "" + machine_info = raw.get("machine_info") + machine = machine_info.get("system") if isinstance(machine_info, dict) else None + source_labels = { + "ubuntu-latest-ci": "ubuntu-latest CI benchmark-results.json", + "local": "local benchmark-results.json", + } + source_label = source_labels.get(source, source) + output: dict[str, object] = { + "_note": ( + f"Gated means from {source_label}." + f"{slack_note}{excluded_note} " + "Refresh after intentional speedups via reduce_baselines.py." + ), + "updated": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "machine": machine, + "groups": groups, + } + out = Path(out_path) + try: + out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") + except OSError as exc: + raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc + return output + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output") + parser.add_argument("out_path", help="destination baselines.json path") + parser.add_argument( + "--slack", + type=_positive_float, + default=1.0, + help="multiply means by this factor (must be > 0)", + ) + parser.add_argument( + "--source", + default="local", + help="provenance label for _note (e.g. 
ubuntu-latest-ci, local)", + ) + args = parser.parse_args(argv) + try: + reduce_baselines(args.raw_path, args.out_path, slack=args.slack, source=args.source) + except BenchmarkDataError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 2 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py index e3e17e2..cfc133f 100644 --- a/tests/benchmarks/conftest.py +++ b/tests/benchmarks/conftest.py @@ -1,14 +1,20 @@ -"""Synthetic workspace trees for summary-cache performance benchmarks.""" +"""Shared synthetic fixtures for pytest-benchmark hot paths.""" from __future__ import annotations +import contextlib +import json +import sqlite3 from pathlib import Path from typing import Any import pytest +from flask.testing import FlaskClient +from app import create_app from services import summary_cache from services.summary_cache import fingerprint_workspace_storage +from tests.benchmarks.constants import BENCH_SEARCH_TERM def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, Any]]: @@ -30,13 +36,123 @@ def make_workspace_entries(workspace_root: Path, count: int) -> list[dict[str, A return entries +def _composer_ids(count: int) -> list[tuple[str, str, str]]: + return [(f"ws_{i:04d}", f"cmp_{i:04d}", f"bub_{i:04d}") for i in range(count)] + + +def build_bench_storage(root: Path, composer_count: int) -> dict[str, str]: + """Create workspaceStorage, globalStorage, and cli_chats trees for *composer_count* composers.""" + ws_root = root / "workspaceStorage" + global_root = root / "globalStorage" + cli_root = root / "cli_chats" + projects_root = root / "projects" + ws_root.mkdir(parents=True) + global_root.mkdir(parents=True) + cli_root.mkdir(parents=True) + projects_root.mkdir(parents=True) + + global_db_path = global_root / "state.vscdb" + with contextlib.closing(sqlite3.connect(global_db_path)) as conn: + conn.execute("CREATE TABLE cursorDiskKV ([key] TEXT PRIMARY KEY, 
value TEXT)") + base_ts = 1_715_000_000_000 + for i, (workspace_id, composer_id, bubble_id) in enumerate(_composer_ids(composer_count)): + project_folder = projects_root / f"proj_{i:04d}" + project_folder.mkdir(parents=True, exist_ok=True) + + ws_dir = ws_root / workspace_id + ws_dir.mkdir(parents=True, exist_ok=True) + (ws_dir / "workspace.json").write_text( + json.dumps({"folder": str(project_folder)}), + encoding="utf-8", + ) + with contextlib.closing(sqlite3.connect(ws_dir / "state.vscdb")) as ws_conn: + ws_conn.execute("CREATE TABLE ItemTable ([key] TEXT PRIMARY KEY, value TEXT)") + ws_conn.execute( + "INSERT INTO ItemTable ([key], value) VALUES (?, ?)", + ( + "composer.composerData", + json.dumps({"allComposers": [{"composerId": composer_id}]}), + ), + ) + ws_conn.commit() + + created_at = base_ts + i * 1_000 + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + ( + f"composerData:{composer_id}", + json.dumps( + { + "name": f"Bench chat {i:04d}", + "createdAt": created_at, + "lastUpdatedAt": created_at + 500, + "fullConversationHeadersOnly": [ + {"bubbleId": bubble_id, "type": 1}, + ], + "modelConfig": {"modelName": "gpt-4o"}, + } + ), + ), + ) + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + ( + f"bubbleId:{composer_id}:{bubble_id}", + json.dumps( + { + "text": f"find {BENCH_SEARCH_TERM} in composer {i:04d}", + "type": "user", + "createdAt": created_at + 400, + } + ), + ), + ) + conn.commit() + + return { + "workspace_path": str(ws_root), + "cli_chats_path": str(cli_root), + "storage_root": str(root), + } + + +def _make_bench_flask_client( + storage: dict[str, str], + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + *, + state_subdir: str = ".cursor-chat-browser", + live_scan_search: bool = False, +) -> FlaskClient: + """Flask test client with env + export state patched for synthetic storage. 
+ + When *live_scan_search* is True, set ``CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1`` so + ``/api/search`` measures the live-scan fallback. Otherwise the FTS index path + from #113 may be used when an index is built (see indexed search fixtures). + """ + monkeypatch.setenv("WORKSPACE_PATH", storage["workspace_path"]) + monkeypatch.setenv("CLI_CHATS_PATH", storage["cli_chats_path"]) + if live_scan_search: + monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + else: + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", raising=False) + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NOCACHE", raising=False) + state_dir = tmp_path / state_subdir + state_dir.mkdir() + monkeypatch.setattr("api.export_api._get_state_dir", lambda: str(state_dir)) + app = create_app() + app.config["TESTING"] = True + app.config["EXCLUSION_RULES"] = [] + return app.test_client() + + @pytest.fixture def summary_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: """Redirect summary-cache files to an isolated temp directory. - Patches ``CACHE_DIR`` (also used by tab-summary paths via ``_tab_summaries_path``) - plus the projects/composer-map file constants used by current benchmarks. - Tab-summary cache benchmarks are deferred to issue #110 (unified benchmark suite). + Tab-summary files use ``CACHE_DIR`` + hashed filenames only (see + ``summary_cache._tab_summaries_path``); they do not use + ``PROJECTS_CACHE_FILE`` or ``COMPOSER_MAP_CACHE_FILE``. 
""" cache_dir = tmp_path / "cache" cache_dir.mkdir() @@ -87,3 +203,78 @@ def workspace_fingerprint(synthetic_workspace: tuple[str, list[dict[str, Any]]]) def stale_fingerprint(workspace_fingerprint: dict[str, Any]) -> dict[str, Any]: """Return a fingerprint guaranteed to differ from the stored one.""" return {**workspace_fingerprint, "rules_digest": "deadbeefdeadbeef"} + + +@pytest.fixture +def bench_storage(tmp_path: Path, request: pytest.FixtureRequest) -> dict[str, str]: + """On-disk Cursor layout with N composers (indirect ``composer_count`` param).""" + count = getattr(request, "param", 10) + return build_bench_storage(tmp_path / "storage", count) + + +@pytest.fixture +def bench_env( + bench_storage: dict[str, str], + monkeypatch: pytest.MonkeyPatch, +) -> dict[str, str]: + """Set WORKSPACE_PATH / CLI_CHATS_PATH for the synthetic storage tree.""" + monkeypatch.setenv("WORKSPACE_PATH", bench_storage["workspace_path"]) + monkeypatch.setenv("CLI_CHATS_PATH", bench_storage["cli_chats_path"]) + monkeypatch.setenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", "1") + return bench_storage + + +@pytest.fixture +def bench_client(bench_env: dict[str, str], tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> FlaskClient: + """Flask test client bound to synthetic bench storage.""" + return _make_bench_flask_client(bench_env, tmp_path, monkeypatch, live_scan_search=True) + + +@pytest.fixture +def bench_client_search_corpus( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> FlaskClient: + """Flask client over a fixed 50-composer corpus (live-scan search path).""" + storage = build_bench_storage(tmp_path / "search_storage", 50) + return _make_bench_flask_client( + storage, + tmp_path, + monkeypatch, + state_subdir=".cursor-chat-browser-search", + live_scan_search=True, + ) + + +@pytest.fixture +def bench_client_search_corpus_indexed( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> FlaskClient: + """Flask client with FTS index built for the 50-composer search 
corpus.""" + from services.search_index import build_search_index + + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX", raising=False) + monkeypatch.delenv("CURSOR_CHAT_BROWSER_NOCACHE", raising=False) + + storage = build_bench_storage(tmp_path / "search_indexed_storage", 50) + cache_dir = tmp_path / "search_index_cache" + cache_dir.mkdir() + monkeypatch.setattr("services.search_index.CACHE_DIR", cache_dir) + monkeypatch.setattr( + "services.search_index.SEARCH_INDEX_POINTER_FILE", + cache_dir / "search_index.active", + ) + monkeypatch.setattr( + "services.search_index.SEARCH_INDEX_FILE", + cache_dir / "search_index.sqlite", + ) + built = build_search_index(storage["workspace_path"], [], force=True) + assert built is True + return _make_bench_flask_client( + storage, + tmp_path, + monkeypatch, + state_subdir=".cursor-chat-browser-search-indexed", + live_scan_search=False, + ) diff --git a/tests/benchmarks/constants.py b/tests/benchmarks/constants.py new file mode 100644 index 0000000..ab682d3 --- /dev/null +++ b/tests/benchmarks/constants.py @@ -0,0 +1,3 @@ +"""Shared constants for benchmark corpora (importable outside conftest).""" + +BENCH_SEARCH_TERM = "bench-search-token" diff --git a/tests/benchmarks/test_export_bench.py b/tests/benchmarks/test_export_bench.py new file mode 100644 index 0000000..c5e3051 --- /dev/null +++ b/tests/benchmarks/test_export_bench.py @@ -0,0 +1,30 @@ +"""Benchmark POST /api/export (ZIP) over synthetic workspace + global DB.""" + +from __future__ import annotations + +import pytest +from flask.testing import FlaskClient + + +@pytest.mark.benchmark(group="export") +@pytest.mark.parametrize( + "bench_storage", + [10, 50], + indirect=True, + ids=["composers-10", "composers-50"], +) +def test_post_export_zip( + benchmark, + bench_client: FlaskClient, +) -> None: + def _run() -> object: + return bench_client.post( + "/api/export", + json={}, + content_type="application/json", + ) + + response = benchmark(_run) + assert 
response.status_code == 200 + assert response.content_type.startswith("application/zip") + assert int(response.headers.get("X-Export-Count", "0")) >= 1 diff --git a/tests/benchmarks/test_parse_bench.py b/tests/benchmarks/test_parse_bench.py new file mode 100644 index 0000000..9f23872 --- /dev/null +++ b/tests/benchmarks/test_parse_bench.py @@ -0,0 +1,28 @@ +"""Benchmark list_workspace_projects (nocache) over synthetic composer corpora.""" + +from __future__ import annotations + +import pytest + +from services.workspace_listing import list_workspace_projects + + +@pytest.mark.benchmark(group="parse") +@pytest.mark.parametrize( + "bench_storage", + [10, 50, 200], + indirect=True, + ids=["composers-10", "composers-50", "composers-200"], +) +def test_list_workspace_projects_nocache( + benchmark, + bench_env: dict[str, str], +) -> None: + workspace_path = bench_env["workspace_path"] + + def _run() -> object: + return list_workspace_projects(workspace_path, [], nocache=True) + + projects, warnings = benchmark(_run) + assert isinstance(projects, list) and len(projects) > 0 + assert warnings == [] diff --git a/tests/benchmarks/test_search_bench.py b/tests/benchmarks/test_search_bench.py new file mode 100644 index 0000000..33eee5b --- /dev/null +++ b/tests/benchmarks/test_search_bench.py @@ -0,0 +1,48 @@ +"""Benchmark GET /api/search over a 50-composer synthetic corpus.""" + +from __future__ import annotations + +import pytest +from flask.testing import FlaskClient + +from tests.benchmarks.constants import BENCH_SEARCH_TERM + + +def _search_url() -> str: + return f"/api/search?q={BENCH_SEARCH_TERM}&all_history=1" + + +def _assert_search_response(response: object) -> None: + assert response.status_code == 200 # type: ignore[attr-defined] + body = response.get_json() # type: ignore[attr-defined] + assert isinstance(body, dict) + results = body.get("results") + assert isinstance(results, list) and len(results) > 0 + + +@pytest.mark.benchmark(group="search") +def 
test_search_full_corpus_live_scan( + benchmark, + bench_client_search_corpus: FlaskClient, +) -> None: + """Live-scan fallback only (``CURSOR_CHAT_BROWSER_NO_SEARCH_INDEX=1``).""" + + def _run() -> object: + return bench_client_search_corpus.get(_search_url()) + + response = benchmark(_run) + _assert_search_response(response) + + +@pytest.mark.benchmark(group="search") +def test_search_full_corpus_indexed( + benchmark, + bench_client_search_corpus_indexed: FlaskClient, +) -> None: + """FTS index path (#113) with pre-built ``search_index.sqlite``.""" + + def _run() -> object: + return bench_client_search_corpus_indexed.get(_search_url()) + + response = benchmark(_run) + _assert_search_response(response) diff --git a/tests/benchmarks/test_summary_cache_bench.py b/tests/benchmarks/test_summary_cache_bench.py index dad4a15..16552d2 100644 --- a/tests/benchmarks/test_summary_cache_bench.py +++ b/tests/benchmarks/test_summary_cache_bench.py @@ -1,44 +1,64 @@ -"""pytest-benchmark coverage for services/summary_cache.py hot paths. - -``test_summary_cache_hit`` and ``test_summary_cache_miss`` both time ``get_cached_projects`` -only. Miss means fingerprint mismatch (cache not used), not a full cache rebuild. 
-""" +"""pytest-benchmark coverage for services/summary_cache.py hot paths.""" from __future__ import annotations from pathlib import Path -from typing import Any +from typing import Any, Literal import pytest from services.summary_cache import ( fingerprint_workspace_storage, + get_cached_composer_id_to_ws, get_cached_projects, + get_cached_tab_summaries, + set_cached_composer_id_to_ws, set_cached_projects, + set_cached_tab_summaries, ) @pytest.mark.benchmark(group="summary-cache") -def test_summary_cache_hit( +@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"]) +def test_summary_cache_lookup( benchmark, + mode: Literal["hit", "miss"], summary_cache_dir: Path, workspace_fingerprint: dict[str, Any], + stale_fingerprint: dict[str, Any], sample_projects: list[dict[str, Any]], ) -> None: + """Time ``get_cached_projects`` only; miss = fingerprint mismatch, not rebuild.""" set_cached_projects(workspace_fingerprint, sample_projects, []) - benchmark(get_cached_projects, workspace_fingerprint) + lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint + result = benchmark(get_cached_projects, lookup_fp) + if mode == "hit": + assert result is not None + projects, warnings = result + assert projects == sample_projects + assert warnings == [] + else: + assert result is None @pytest.mark.benchmark(group="summary-cache") -def test_summary_cache_miss( +@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"]) +def test_composer_map_cache_lookup( benchmark, + mode: Literal["hit", "miss"], summary_cache_dir: Path, workspace_fingerprint: dict[str, Any], stale_fingerprint: dict[str, Any], - sample_projects: list[dict[str, Any]], ) -> None: - set_cached_projects(workspace_fingerprint, sample_projects, []) - benchmark(get_cached_projects, stale_fingerprint) + """Time ``get_cached_composer_id_to_ws`` hit/miss (fingerprint mismatch on miss).""" + mapping = {"cmp_0000": "ws_0000"} + set_cached_composer_id_to_ws(workspace_fingerprint, 
mapping) + lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint + result = benchmark(get_cached_composer_id_to_ws, lookup_fp) + if mode == "hit": + assert result == mapping + else: + assert result is None @pytest.mark.benchmark(group="summary-cache") @@ -76,3 +96,31 @@ def _run() -> None: get_cached_projects(fp) benchmark(_run) + cached = get_cached_projects(fp) + assert cached is not None + cached_projects, cached_warnings = cached + assert cached_projects == projects + assert cached_warnings == [] + + +@pytest.mark.benchmark(group="summary-cache") +@pytest.mark.parametrize("mode", ["hit", "miss"], ids=["hit", "miss"]) +def test_tab_summary_cache_lookup( + benchmark, + mode: Literal["hit", "miss"], + summary_cache_dir: Path, + workspace_fingerprint: dict[str, Any], + stale_fingerprint: dict[str, Any], +) -> None: + workspace_id = "ws_0000" + payload = {"tabs": [{"id": "cmp_0000", "title": "Bench"}]} + set_cached_tab_summaries(workspace_fingerprint, workspace_id, payload, 200) + lookup_fp = workspace_fingerprint if mode == "hit" else stale_fingerprint + result = benchmark(get_cached_tab_summaries, lookup_fp, workspace_id) + if mode == "hit": + assert result is not None + cached_payload, status = result + assert status == 200 + assert cached_payload == payload + else: + assert result is None diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 8de10a8..873d68a 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -14,7 +14,7 @@ normalize_benchmark_name, ) -GATED_BENCH = "test_summary_cache_hit" +GATED_BENCH = "test_summary_cache_lookup[hit]" def _write_results(path, benchmarks: list[dict]) -> None: @@ -32,9 +32,9 @@ def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None: def test_normalize_benchmark_name_strips_module_prefix() -> None: - full = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit" - assert 
normalize_benchmark_name(full) == "test_summary_cache_hit" - assert normalize_benchmark_name("test_summary_cache_hit") == "test_summary_cache_hit" + full = "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]" + assert normalize_benchmark_name(full) == "test_summary_cache_lookup[hit]" + assert normalize_benchmark_name("test_summary_cache_lookup[hit]") == "test_summary_cache_lookup[hit]" def test_normalize_benchmark_name_preserves_colons_in_param_values() -> None: @@ -50,13 +50,13 @@ def test_load_results_normalizes_full_node_id(tmp_path) -> None: path, [ { - "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_hit", + "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]", "stats": {"mean": 0.0001}, } ], ) - assert load_results(path)["test_summary_cache_hit"] == pytest.approx(0.0001) + assert load_results(path)["test_summary_cache_lookup[hit]"] == pytest.approx(0.0001) def test_missing_baseline_warns_without_failing( @@ -213,3 +213,76 @@ def test_load_baseline_means_rejects_non_dict_group(tmp_path) -> None: with pytest.raises(BenchmarkDataError, match="must be an object"): load_baseline_means(baselines) + + +def test_stale_baseline_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": GATED_BENCH, "stats": {"mean": 0.00005}}], + ) + _write_baselines( + baselines, + {"summary-cache": {GATED_BENCH: 0.0002}}, + ) + + assert check_regression(results, baselines) == 1 + out = capsys.readouterr().out + assert "STALE" in out + + +def test_main_rejects_invalid_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main + + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, 
{"summary-cache": {GATED_BENCH: 0.0002}}) + + assert main([str(results), str(baselines), "--threshold", "1.0"]) == 2 + assert "threshold must be greater than 1" in capsys.readouterr().err + + +def test_main_rejects_invalid_stale_floor(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main + + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + assert main([str(results), str(baselines), "--stale-floor", "1.5"]) == 2 + assert "stale_floor must be between 0 and 1" in capsys.readouterr().err + + +def test_check_regression_rejects_invalid_threshold(tmp_path) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + with pytest.raises(BenchmarkDataError, match="threshold must be greater than 1"): + check_regression(results, baselines, threshold=1.0) + + +def test_check_regression_rejects_non_finite_threshold(tmp_path) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": {GATED_BENCH: 0.0002}}) + + with pytest.raises(BenchmarkDataError, match="threshold must be finite"): + check_regression(results, baselines, threshold=float("nan")) + + +def test_main_rejects_non_finite_threshold(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main + + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, [{"name": GATED_BENCH, "stats": {"mean": 0.0001}}]) + _write_baselines(baselines, {"summary-cache": 
{GATED_BENCH: 0.0002}}) + + assert main([str(results), str(baselines), "--threshold", "inf"]) == 2 + assert "threshold must be finite" in capsys.readouterr().err diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py new file mode 100644 index 0000000..9cc24e4 --- /dev/null +++ b/tests/test_reduce_baselines.py @@ -0,0 +1,156 @@ +"""Tests for scripts/reduce_baselines.py.""" + +from __future__ import annotations + +import json + +import pytest + +from scripts.reduce_baselines import reduce_baselines +from scripts.check_benchmark_regression import BenchmarkDataError + + +def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None: + path.write_text( + json.dumps( + { + "machine_info": {"system": machine}, + "benchmarks": benchmarks, + }, + indent=2, + ), + encoding="utf-8", + ) + + +def test_reduce_baselines_groups_and_slack(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_list_workspace_projects_nocache[composers-50]", + "group": "parse", + "stats": {"mean": 0.05}, + }, + { + "name": "test_post_export_zip[composers-10]", + "group": "export", + "stats": {"mean": 0.01}, + }, + { + "name": "test_search_full_corpus", + "group": "search", + "stats": {"mean": 0.04}, + }, + { + "name": "test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0001}, + }, + ], + ) + + output = reduce_baselines(raw, out, slack=1.5, source="ubuntu-latest-ci") + data = json.loads(out.read_text(encoding="utf-8")) + groups = data["groups"] + + assert groups["parse"]["test_list_workspace_projects_nocache[composers-50]"] == pytest.approx(0.075) + assert groups["export"]["test_post_export_zip[composers-10]"] == pytest.approx(0.015) + assert groups["search"]["test_search_full_corpus"] == pytest.approx(0.06) + assert groups["summary-cache"]["test_summary_cache_lookup[hit]"] == pytest.approx(0.00015) + assert data["machine"] == "Linux" + assert 
"ubuntu-latest CI benchmark-results.json" in data["_note"] + assert "1.5x slack" in data["_note"] + assert output["groups"] == groups + + +def test_reduce_baselines_local_source_note(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0001}, + }, + ], + machine="Windows", + ) + + reduce_baselines(raw, out, source="local") + data = json.loads(out.read_text(encoding="utf-8")) + assert "local benchmark-results.json" in data["_note"] + assert data["machine"] == "Windows" + + +def test_reduce_baselines_rejects_unknown_group(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_cache_only", + "group": "cache", + "stats": {"mean": 0.001}, + }, + ], + ) + + with pytest.raises(BenchmarkDataError, match="unknown group 'cache'"): + reduce_baselines(raw, out) + + +def test_reduce_baselines_rejects_missing_group(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_no_group", + "stats": {"mean": 0.001}, + }, + ], + ) + + with pytest.raises(BenchmarkDataError, match="missing required 'group'"): + reduce_baselines(raw, out) + + +def test_reduce_baselines_rejects_duplicate_normalized_name(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + { + "name": "test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0001}, + }, + { + "name": "tests/benchmarks/test_summary_cache_bench.py::test_summary_cache_lookup[hit]", + "group": "summary-cache", + "stats": {"mean": 0.0002}, + }, + ], + ) + + with pytest.raises(BenchmarkDataError, match="duplicates normalized"): + reduce_baselines(raw, out) + + +def test_positive_float_rejects_non_finite() -> None: + import argparse + + from scripts.reduce_baselines 
import _positive_float + + with pytest.raises(argparse.ArgumentTypeError, match="finite"): + _positive_float("nan") + with pytest.raises(argparse.ArgumentTypeError, match="finite"): + _positive_float("inf")