From c63d1c8397a7b990bfdabc2d25fac29d7d828234 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 3 Jul 2026 13:45:17 +0200 Subject: [PATCH 1/3] fix(results): publish runs at results branch root --- .../src/lib/project-sync-status.test.ts | 8 +- apps/dashboard/src/lib/project-sync-status.ts | 12 +- .../content/docs/docs/next/tools/results.mdx | 2 +- .../docs/docs/v4.42.4/tools/results.mdx | 18 +- .../src/evaluation/results-repo-cache.test.ts | 18 +- packages/core/src/evaluation/results-repo.ts | 299 +++---- .../core/test/evaluation/results-repo.test.ts | 768 +++--------------- 7 files changed, 228 insertions(+), 897 deletions(-) diff --git a/apps/dashboard/src/lib/project-sync-status.test.ts b/apps/dashboard/src/lib/project-sync-status.test.ts index d152ccc29..6975bff52 100644 --- a/apps/dashboard/src/lib/project-sync-status.test.ts +++ b/apps/dashboard/src/lib/project-sync-status.test.ts @@ -18,19 +18,19 @@ describe('getProjectSyncView', () => { }); }); - it('surfaces dirty metadata as syncable without reset language', () => { + it('surfaces dirty result artifacts as syncable without reset language', () => { const view = getProjectSyncView({ configured: true, available: true, sync_status: 'dirty', - dirty_paths: ['metadata/runs/demo/tags.json'], + dirty_paths: ['demo-run/summary.json'], auto_push: false, }); expect(view).toMatchObject({ state: 'dirty', label: 'Dirty', - actionLabel: 'Sync Metadata', + actionLabel: 'Sync Results', canSync: true, }); expect(view.nextAction).toContain('no reset'); @@ -182,7 +182,7 @@ describe('buildProjectSyncFeedback', () => { expect(feedback.kind).toBe('success'); expect(feedback.message).toContain( - 'Sync completed: committed pending metadata, pulled remote results, pushed local results.', + 'Sync completed: committed pending results, pulled remote results, pushed local results.', ); }); diff --git a/apps/dashboard/src/lib/project-sync-status.ts b/apps/dashboard/src/lib/project-sync-status.ts index e6e45c229..467be30e3 
100644 --- a/apps/dashboard/src/lib/project-sync-status.ts +++ b/apps/dashboard/src/lib/project-sync-status.ts @@ -218,13 +218,13 @@ export function getProjectSyncView( return { state: 'dirty', label: 'Dirty', - actionLabel: 'Sync Metadata', + actionLabel: 'Sync Results', tone: 'warn', - summary: status.block_reason ?? 'Local result metadata has pending edits.', + summary: status.block_reason ?? 'Local result artifacts have pending edits.', nextAction: status.auto_push === true - ? 'Sync Project will commit safe result metadata changes before syncing.' - : 'Review or commit the pending result metadata; no reset will be performed.', + ? 'Sync Project will commit safe result artifact changes before syncing.' + : 'Review or commit the pending result artifacts; no reset will be performed.', canSync: true, }; } @@ -261,7 +261,7 @@ export function getProjectSyncView( label: 'Clean', actionLabel: 'Sync Project', tone: 'good', - summary: 'Local and remote result metadata are in sync.', + summary: 'Local and remote results are in sync.', canSync: true, }; } @@ -332,7 +332,7 @@ export function buildProjectSyncFeedback(status: RemoteStatusResponse): { } const actions = [ - status.commit_created ? 'committed pending metadata' : undefined, + status.commit_created ? 'committed pending results' : undefined, status.pull_performed ? 'pulled remote results' : undefined, status.auto_merged_remote ? 'Merged remote (auto)' : undefined, status.push_performed ? 
'pushed local results' : undefined, diff --git a/apps/web/src/content/docs/docs/next/tools/results.mdx b/apps/web/src/content/docs/docs/next/tools/results.mdx index e60ad51e1..64e04be8b 100644 --- a/apps/web/src/content/docs/docs/next/tools/results.mdx +++ b/apps/web/src/content/docs/docs/next/tools/results.mdx @@ -255,7 +255,7 @@ The CLI contract is deliberately narrow: `agentv results` manages local result a Use these supported remote workflows instead: -- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `results.repo` with `results.path` pointing at the source checkout and `results.branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV never adds or rewrites remotes in an existing checkout; that checkout's `origin` must already point at the repository you want to fetch and push. AgentV reserves `agentv/results/v1` for primary results and `agentv/artifacts/v1` for heavy artifact payloads. When `index.jsonl` rows point trace or transcript payloads at `agentv/artifacts/v1`, automatic publishing stores those bytes on that artifact branch in the same remote and publishes pointer keys such as `runs//`. The configured results branch remains the metadata/control plane (`index.jsonl`, `summary.json`, tags, and pointers) instead of duplicating canonical trace/transcript payload bodies. Local pre-publish run workspaces can still contain those files beside the manifest so local tools keep working. Mutable run tags are stored as `tags.json` with a `tag_revision`; there is no tag event log in the normal results layout. `results.path` without `results.repo` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `auto_push: true` to push after publish. 
In CI, use `agentv eval run --results-require-push` when push failures should fail that invocation after local artifacts are written. Non-fast-forward result branch pushes never force-push: AgentV auto-merges concurrent remote writes with artifact-aware Git merge drivers (a union driver for the append-only `index.jsonl`, a JSON-union driver for tag overlays) and pushes the merge as a fast-forward, and routes a genuine overlay conflict to a timestamped `agentv/results-sync/...` branch plus a GitHub compare/PR link for a human merge. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. +- **Automatic publishing:** configure `projects[].results` or top-level `results`; new `agentv eval` and `agentv pipeline bench` runs publish completed artifacts after the run completes. Use `results.repo` with `results.path` pointing at the source checkout and `results.branch: agentv/results/v1` to store primary result records on a dedicated branch of the source repo. AgentV never adds or rewrites remotes in an existing checkout; that checkout's `origin` must already point at the repository you want to fetch and push. AgentV reserves `agentv/results/v1` for primary results and `agentv/artifacts/v1` for heavy artifact payloads. The results branch stores run bundles at `/` with `summary.json` at the run root and machine files such as the per-run JSONL index under `/.internal/`; cross-run derived catalogs live under `.indexes/`. When index rows point trace or transcript payloads at `agentv/artifacts/v1`, automatic publishing stores those bytes on the artifact branch in the same remote and publishes pointer keys such as `/`. Run tags are read from `summary.json` and index rows; there is no mutable `tags.json` overlay in the published results layout. 
`results.path` without `results.repo` means an existing local Git checkout, distinct from `workspace.repos[].repo`, which is a portable repository identity. Set `auto_push: true` to push after publish. In CI, use `agentv eval run --results-require-push` when push failures should fail that invocation after local artifacts are written. Non-fast-forward result branch pushes never force-push: AgentV auto-merges concurrent remote writes with artifact-aware Git merge rules for append-only JSONL indexes and pushes the merge as a fast-forward, routing genuine content conflicts to a timestamped `agentv/results-sync/...` branch plus a GitHub compare/PR link for a human merge. While an eval is still running, [WIP checkpoints](/docs/tools/wip-checkpoints/) can keep partial run output durable on `agentv/wip/...` branches when auto-push is enabled. - **Manual Dashboard sync:** run `agentv dashboard`, open the project, and use **Sync Project**. - **Manual API sync:** while Dashboard is running, call `GET /api/projects/:projectId/remote/status` or `POST /api/projects/:projectId/remote/sync` for project-scoped automation. Single-project sessions also expose `GET /api/remote/status` and `POST /api/remote/sync`. - **Git escape hatch:** for advanced recovery, inspect or repair the configured `projects[].results.path` clone with `git` directly, then sync again. 
diff --git a/apps/web/src/content/docs/docs/v4.42.4/tools/results.mdx b/apps/web/src/content/docs/docs/v4.42.4/tools/results.mdx index 5daf29f01..1c12cb71f 100644 --- a/apps/web/src/content/docs/docs/v4.42.4/tools/results.mdx +++ b/apps/web/src/content/docs/docs/v4.42.4/tools/results.mdx @@ -43,10 +43,10 @@ Examples: ```bash # Generate report.html next to the run manifest -agentv results report .agentv/results/runs/2026-03-14T10-32-00_claude +agentv results report .agentv/results/2026-03-14T10-32-00_claude # Use an explicit output path -agentv results report .agentv/results/runs/2026-03-14T10-32-00_claude/index.jsonl \ +agentv results report .agentv/results/2026-03-14T10-32-00_claude/index.jsonl \ --out ./reports/human-review.html ``` @@ -65,17 +65,17 @@ One minimal publication workflow is: ```bash # 1. Run an eval and sync or copy the run workspace into your public results repo. -agentv eval evals/demo.eval.yaml --output .agentv/results/runs/demo-live +agentv eval evals/demo.eval.yaml --output .agentv/results/demo-live # 2. In the public results repo, render the report into the Pages source directory. -agentv results report .agentv/results/runs/demo-live --out docs/index.html +agentv results report .agentv/results/demo-live --out docs/index.html # 3. Review the generated HTML before publishing. grep -RInE 'sk-[A-Za-z0-9]|Bearer |localhost|127\.0\.0\.1|/home/|/Users/|/tmp/' docs/index.html # 4. Commit the run artifacts and docs/index.html, then enable GitHub Pages # for the repository's docs/ directory or the branch used for Pages. 
-git add .agentv/results/runs/demo-live docs/index.html README.md +git add .agentv/results/demo-live docs/index.html README.md git commit -m "docs(results): publish static AgentV report" git push ``` @@ -116,10 +116,10 @@ Duplicate policy is explicit: For lightweight terminal workflows: ```bash -agentv results summary .agentv/results/runs/ -agentv results failures .agentv/results/runs/ -agentv results show .agentv/results/runs/ --test-id my-case -agentv results validate .agentv/results/runs/ +agentv results summary .agentv/results/ +agentv results failures .agentv/results/ +agentv results show .agentv/results/ --test-id my-case +agentv results validate .agentv/results/ ``` For a review-centric workflow built around these artifacts, see [Human Review Checkpoint](/docs/v4.42.4/guides/human-review/). diff --git a/packages/core/src/evaluation/results-repo-cache.test.ts b/packages/core/src/evaluation/results-repo-cache.test.ts index 38dce44c0..f41c865bf 100644 --- a/packages/core/src/evaluation/results-repo-cache.test.ts +++ b/packages/core/src/evaluation/results-repo-cache.test.ts @@ -34,10 +34,12 @@ function writeRun( score: number, executionStatus = 'ok', ): void { - const runDir = path.join(repoDir, 'runs', timestamp); + const runDir = path.join(repoDir, timestamp); + const internalDir = path.join(runDir, '.internal'); mkdirSync(runDir, { recursive: true }); + mkdirSync(internalDir, { recursive: true }); writeFileSync( - path.join(runDir, 'index.jsonl'), + path.join(internalDir, 'index.jsonl'), `${JSON.stringify({ timestamp, test_id: `${experiment}-case`, @@ -51,7 +53,7 @@ function writeRun( path.join(runDir, 'summary.json'), `${JSON.stringify( { - manifest_path: 'index.jsonl', + index_path: '.internal/index.jsonl', metadata: { display_name: `${experiment} ${timestamp}`, experiment, @@ -82,7 +84,7 @@ function createResultsRepo(tempRoot: string): string { git(repoDir, ['checkout', '--orphan', RESULTS_REF]); rmSync(path.join(repoDir, 'README.md'), { force: true }); 
writeRun(repoDir, 'default', '2026-06-28T00-00-00-000Z', 1); - git(repoDir, ['add', 'runs']); + git(repoDir, ['add', '.']); git(repoDir, ['commit', '-m', 'add first run']); return repoDir; } @@ -114,7 +116,7 @@ describe('git results filesystem index cache', () => { const runs = await listGitRunsCached(repoDir, RESULTS_REF); expect(runs).toHaveLength(1); expect(runs[0]?.run_id).toBe('2026-06-28T00-00-00-000Z'); - expect(runs[0]?.summary_path).toBe('runs/2026-06-28T00-00-00-000Z/summary.json'); + expect(runs[0]?.summary_path).toBe('2026-06-28T00-00-00-000Z/summary.json'); const cacheFile = resolveGitResultsIndexCacheFile({ repoDir, @@ -146,7 +148,7 @@ describe('git results filesystem index cache', () => { run_id: 'sentinel', experiment: 'default', timestamp: '2026-06-28T01-00-00-000Z', - manifest_path: 'runs/sentinel/index.jsonl', + manifest_path: 'sentinel/.internal/index.jsonl', display_name: 'from cache', test_count: 1, avg_score: 0.5, @@ -172,7 +174,7 @@ describe('git results filesystem index cache', () => { const firstCommit = await resolveGitRunsRefCommit(repoDir, RESULTS_REF); writeRun(repoDir, 'experiment-a', '2026-06-28T02-00-00-000Z', 0.25); - git(repoDir, ['add', 'runs']); + git(repoDir, ['add', '.']); git(repoDir, ['commit', '-m', 'add second run']); const secondCommit = await resolveGitRunsRefCommit(repoDir, RESULTS_REF); @@ -211,7 +213,7 @@ describe('git results filesystem index cache', () => { it('preserves execution error counts for remote-only list metadata', async () => { const repoDir = createResultsRepo(tempRoot); writeRun(repoDir, 'error-experiment', '2026-06-28T03-00-00-000Z', 0, 'execution_error'); - git(repoDir, ['add', 'runs']); + git(repoDir, ['add', '.']); git(repoDir, ['commit', '-m', 'add execution error run']); const runs = await listGitRunsCached(repoDir, RESULTS_REF); diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index 5c77d8024..e9e0fbb6a 100644 --- 
a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -30,16 +30,16 @@ const execFileAsync = promisify(execFile); // write runs here. This is NOT the on-branch layout — see RESULTS_REPO_RUNS_DIR. const RESULTS_REPO_RESULTS_DIR = '.agentv/results'; // On-branch / results-repo-clone storage layout. The results branch (e.g. -// agentv/results/v1) already namespaces results, so runs are stored flat at -// runs// and the editable tag overlays at metadata/runs// — -// no redundant `.agentv/results/` prefix. -const RESULTS_REPO_RUNS_DIR = 'runs'; -const RESULTS_REPO_METADATA_DIR = 'metadata'; +// agentv/results/v1) already namespaces results, so run bundles are stored at +// the branch root as / with ADR-0017 internals preserved. +const RESULTS_REPO_RUNS_DIR = '.'; +const RESULTS_REPO_INDEXES_DIR = '.indexes'; +const RESULTS_REPO_CACHE_DIR = '.cache'; // Top-level directories AgentV owns on the results branch. The auto-sync // dirty-commit path stages only these so it never touches unrelated repo files. -const RESULTS_REPO_TRACKED_DIRS = [RESULTS_REPO_RUNS_DIR, RESULTS_REPO_METADATA_DIR] as const; +const RESULTS_REPO_TRACKED_DOT_DIRS = [RESULTS_REPO_INDEXES_DIR, RESULTS_REPO_CACHE_DIR] as const; const GIT_RESULTS_INDEX_CACHE_SCHEMA_VERSION = 'agentv.git_results_index_cache.v1'; -const GIT_RESULTS_INDEX_LAYOUT_VERSION = 'agentv.results_repo_runs.v1'; +const GIT_RESULTS_INDEX_LAYOUT_VERSION = 'agentv.results_repo_branch_root.v1'; const FALLBACK_RESULTS_REPO_COMMIT_EMAIL = 'agentv@results-repo'; const FALLBACK_RESULTS_REPO_COMMIT_NAME = 'AgentV Results'; const GIT_COMMIT_IDENTITY_ENV_KEYS = [ @@ -68,135 +68,15 @@ const RESULTS_REPO_GENESIS_MESSAGE = 'chore(results): initialize AgentV results const RESULTS_REPO_GENESIS_DATE = '@0 +0000'; const RESULT_INDEX_FILENAME = 'index.jsonl'; -// Artifact-aware merge config for the AgentV-owned results checkout. 
These two -// pieces let `git merge` reconcile concurrent result writes automatically so -// results sync never has to force-push (see resolveResultBranchPushConflict): -// - `.gitattributes` (committed on the results branch) maps the append-only -// run index to git's stock `union` driver and the editable JSON overlay to -// our `agentv-json` driver. -// - `merge.agentv-json.driver` (registered once in the checkout's local git -// config) points at a tiny 3-way JSON set/field union script. -// Run bundles under runs//** are uniquely pathed, so a 3-way merge -// never conflicts on them and they need no attribute. +// Artifact-aware merge config for the AgentV-owned results checkout. Concurrent +// writers append to rebuildable cross-run JSONL catalogs and each run's +// per-run JSONL index; git's stock `union` driver can reconcile those appends. +// Run bundles under /** are uniquely pathed, so a 3-way merge rarely +// conflicts on them. const RESULTS_REPO_GITATTRIBUTES_FILE = '.gitattributes'; const RESULTS_REPO_GITATTRIBUTES_CONTENT = `# Managed by AgentV. Artifact-aware merge so results sync never force-pushes. -# Append-only run manifests: union concurrent appends (lines are orthogonal). -index.jsonl merge=union -# Editable run overlay (tags): 3-way JSON set/field union via the -# agentv-json driver; a genuine scalar conflict falls through to a human merge. -metadata/runs/**/*.json merge=agentv-json -`; -const RESULTS_JSON_MERGE_DRIVER_NAME = 'agentv-json'; -// Materialized into the results checkout's git dir and invoked by git as -// `node