From 1d327bef5e44ecceeba15d87b15dcd3e96b9fa56 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 3 Jul 2026 15:07:22 +0200 Subject: [PATCH] feat(results): implement ADR-0017 bundle content --- CONCEPTS.md | 6 +- apps/cli/src/commands/eval/artifact-writer.ts | 1 - apps/cli/src/commands/eval/result-layout.ts | 10 +- apps/cli/src/commands/eval/run-eval.ts | 11 +- apps/cli/src/commands/pipeline/bench.ts | 30 +- apps/cli/src/commands/pipeline/run.ts | 28 +- apps/cli/src/commands/results/export.ts | 25 +- apps/cli/src/commands/results/manifest.ts | 60 +- .../src/commands/results/projection-bundle.ts | 8 +- apps/cli/src/commands/results/remote.ts | 5 +- apps/cli/src/commands/results/report.ts | 11 +- apps/cli/src/commands/results/serve.ts | 51 +- apps/cli/src/commands/results/validate.ts | 23 +- apps/cli/test/commands/eval/aggregate.test.ts | 28 +- .../commands/eval/artifact-writer.test.ts | 326 ++++------ apps/cli/test/commands/eval/bundle.test.ts | 2 +- .../test/commands/eval/pipeline/bench.test.ts | 6 +- .../eval/pipeline/pipeline-e2e.test.ts | 2 +- .../test/commands/eval/result-layout.test.ts | 9 +- apps/cli/test/commands/eval/run-cache.test.ts | 4 +- .../commands/grade/grade-prepared.test.ts | 16 +- .../results/export-e2e-providers.test.ts | 68 +- apps/cli/test/commands/results/export.test.ts | 77 +-- apps/cli/test/commands/results/report.test.ts | 2 +- .../test/commands/results/validate.test.ts | 8 +- apps/cli/test/commands/runs/rerun.test.ts | 5 +- apps/cli/test/eval.integration.test.ts | 37 +- .../docs/next/evaluation/running-evals.mdx | 22 +- .../docs/next/getting-started/quickstart.mdx | 2 +- .../docs/docs/next/guides/autoresearch.mdx | 4 +- .../docs/next/reference/result-artifacts.mdx | 71 +- .../content/docs/docs/next/tools/compare.mdx | 12 +- .../content/docs/docs/next/tools/inspect.mdx | 2 +- .../content/docs/docs/next/tools/results.mdx | 18 +- .../content/docs/docs/next/tools/trend.mdx | 6 +- .../docs/docs/next/tools/wip-checkpoints.mdx | 2 +- 
packages/core/src/evaluation/metrics.ts | 65 +- .../src/evaluation/results-repo-cache.test.ts | 2 +- packages/core/src/evaluation/results-repo.ts | 22 +- packages/core/src/evaluation/run-artifacts.ts | 607 +++++++++++++++--- packages/core/src/evaluation/types.ts | 2 +- .../evaluate-programmatic-api.test.ts | 17 +- .../core/test/evaluation/orchestrator.test.ts | 8 +- .../core/test/evaluation/results-repo.test.ts | 4 +- .../references/breaking-changes.md | 7 + 45 files changed, 1131 insertions(+), 601 deletions(-) diff --git a/CONCEPTS.md b/CONCEPTS.md index 7c2f3cc65..cb3c73a8e 100644 --- a/CONCEPTS.md +++ b/CONCEPTS.md @@ -26,11 +26,11 @@ Shared domain vocabulary for this project — entities, named processes, and sta **Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools. -**Run bundle** — A committed local result directory at `.agentv/results//`. `summary.json` records run metadata such as `run_id` and `experiment`; `index.jsonl` records per-case rows. +**Run bundle** — A committed local result directory at `.agentv/results//`. `summary.json` records run metadata such as `run_id` and `experiment`; `.internal/index.jsonl` records per-case rows. **Run manifest** — The root `summary.json` file in a run bundle. It owns aggregate run metadata and rollups such as `run_id`, `experiment`, timestamps, planned/completed counts, pass rate, score summaries, duration, tokens, and cost. -**Result index** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, and `grading_path`. +**Result index** — The `.internal/index.jsonl` file in a run bundle. 
It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, `grading_path`, and `metrics_path`. **Result source identity** — The stable source identity for a result row: repo-relative `eval_path`, `test_id`, and `target`. `suite` and `name` are display metadata, not storage or routing identity. @@ -38,7 +38,7 @@ Shared domain vocabulary for this project — entities, named processes, and sta **Artifact sidecar** — A file beside or below a result directory that provides evidence for a result, such as `summary.json`, `grading.json`, `result.json`, transcripts, logs, or outputs. Sidecars are evidence, not the primary discovery mechanism for a run. -**Artifact attempt folder** — A per-case `attempt-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries should be represented with explicit sample/retry metadata rather than inferred from `attempt-1`, `attempt-2`, and so on. +**Artifact sample folder** — A per-case `sample-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries are represented with explicit `sample_index` and `retry_index` metadata rather than inferred from folder position. 
## Evaluation Reliability diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 818729287..afb4246b5 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -98,7 +98,6 @@ export function buildIndexArtifactEntry( outputDir: string; resultDir?: string; gradingPath?: string; - timingPath?: string; summaryPath?: string; outputPath?: string; answerPath?: string; diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts index dc9f1efa4..12a826e7e 100644 --- a/apps/cli/src/commands/eval/result-layout.ts +++ b/apps/cli/src/commands/eval/result-layout.ts @@ -3,6 +3,7 @@ import path from 'node:path'; export const RESULT_INDEX_FILENAME = 'index.jsonl'; export const RUN_SUMMARY_FILENAME = 'summary.json'; +export const RUN_INTERNAL_DIRNAME = '.internal'; export const RESULTS_DIRNAME = 'results'; export const DEFAULT_EXPERIMENT_NAME = 'default'; export const RESERVED_RESULTS_NAMESPACES = new Set(['export', 'metadata', 'runs']); @@ -59,11 +60,11 @@ export function buildDefaultRunDir( } export function buildDefaultIndexPath(cwd: string, experiment?: string): string { - return path.join(buildDefaultRunDir(cwd, experiment), RESULT_INDEX_FILENAME); + return resolveRunIndexPath(buildDefaultRunDir(cwd, experiment)); } export function resolveRunIndexPath(runDir: string): string { - return path.join(runDir, RESULT_INDEX_FILENAME); + return path.join(runDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME); } export function isRunManifestPath(filePath: string): boolean { @@ -76,6 +77,11 @@ export function resolveExistingRunPrimaryPath(runDir: string): string | undefine return indexPath; } + const legacyIndexPath = path.join(runDir, RESULT_INDEX_FILENAME); + if (existsSync(legacyIndexPath)) { + return legacyIndexPath; + } + return undefined; } diff --git a/apps/cli/src/commands/eval/run-eval.ts 
b/apps/cli/src/commands/eval/run-eval.ts index f7b22916a..9c25d8dbf 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -74,6 +74,7 @@ import { createRunDirName, discoverRunManifestPaths, normalizeExperimentName, + resolveRunIndexPath, } from './result-layout.js'; import { buildExclusionFilter, @@ -1236,7 +1237,7 @@ class RunOutputWriter implements OutputWriter { private readonly invocationDir: string, private readonly appendMode: boolean, ) { - this.indexPath = path.join(invocationDir, RESULT_INDEX_FILENAME); + this.indexPath = resolveRunIndexPath(invocationDir); } async append(result: EvaluationResult): Promise { @@ -1280,7 +1281,11 @@ async function resolveRerunFailedRunDir(cwd: string, source: string): Promise/grading.json (per-test grading breakdown) - * - index.jsonl (one line per test) + * - .internal/index.jsonl (one line per test) * - summary.json (aggregate statistics) */ import { existsSync } from 'node:fs'; -import { readFile, readdir, writeFile } from 'node:fs/promises'; +import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises'; import { join } from 'node:path'; import { command, positional, string } from 'cmd-ts'; @@ -174,17 +174,17 @@ export const evalBenchCommand = command({ })), })); - // Read execution_status from timing.json (written by pipeline run) + // Read execution_status from metrics.json (written by pipeline run) let executionStatus = 'ok'; - const timingPath = join(testDir, 'timing.json'); - if (existsSync(timingPath)) { + const metricsPath = join(testDir, 'metrics.json'); + if (existsSync(metricsPath)) { try { - const timing = JSON.parse(await readFile(timingPath, 'utf8')); - if (typeof timing.execution_status === 'string') { - executionStatus = timing.execution_status; + const metrics = JSON.parse(await readFile(metricsPath, 'utf8')); + if (typeof metrics.execution?.status === 'string') { + executionStatus = metrics.execution.status; } } catch { - // Fall back to 'ok' if 
timing.json is unreadable + // Fall back to 'ok' if metrics.json is unreadable } } @@ -200,23 +200,21 @@ export const evalBenchCommand = command({ scores, execution_status: executionStatus, grading_path: `${artifactSubdir}/grading.json`, - timing_path: `${artifactSubdir}/timing.json`, + metrics_path: `${artifactSubdir}/metrics.json`, response_path: hasResponse ? `${artifactSubdir}/response.md` : undefined, }), ); } // Write row-level run manifest. - await writeFile( - join(exportDir, RESULT_INDEX_FILENAME), - indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '', - 'utf8', - ); + const indexPath = join(exportDir, '.internal', RESULT_INDEX_FILENAME); + await mkdir(join(exportDir, '.internal'), { recursive: true }); + await writeFile(indexPath, indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '', 'utf8'); // Write summary.json const passRateStats = computeStats(allPassRates); const summary = { - manifest_path: RESULT_INDEX_FILENAME, + index_path: `.internal/${RESULT_INDEX_FILENAME}`, metadata: { eval_file: manifest.eval_file, timestamp: manifest.timestamp, diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts index 70678d7c0..5dc061ea0 100644 --- a/apps/cli/src/commands/pipeline/run.ts +++ b/apps/cli/src/commands/pipeline/run.ts @@ -4,7 +4,7 @@ * * Equivalent to running: * 1. `agentv pipeline input --out ` - * 2. Invoking each CLI target in parallel (writing response.md + timing.json) + * 2. Invoking each CLI target in parallel (writing response.md + metrics.json) * 3. `agentv pipeline grade ` * * For `kind: agent` targets, step 2 is skipped (subagent handles execution). 
@@ -289,10 +289,15 @@ export const evalRunCommand = command({ } await writeFile(join(testDir, 'response.md'), response, 'utf8'); - await writeJson(join(testDir, 'timing.json'), { - duration_ms: durationMs, - total_duration_seconds: Math.round(durationMs / 10) / 100, - execution_status: 'ok', + await writeJson(join(testDir, 'metrics.json'), { + duration: { + total_ms: durationMs, + total_seconds: Math.round(durationMs / 10) / 100, + source: 'provider_reported', + }, + tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' }, + cost: { usd: null, source: 'unavailable' }, + execution: { status: 'ok' }, }); process.stderr.write(`\n ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`); @@ -301,10 +306,15 @@ export const evalRunCommand = command({ const message = error instanceof Error ? error.message : String(error); const response = `ERROR: target failed — ${message}`; await writeFile(join(testDir, 'response.md'), response, 'utf8'); - await writeJson(join(testDir, 'timing.json'), { - duration_ms: durationMs, - total_duration_seconds: Math.round(durationMs / 10) / 100, - execution_status: 'execution_error', + await writeJson(join(testDir, 'metrics.json'), { + duration: { + total_ms: durationMs, + total_seconds: Math.round(durationMs / 10) / 100, + source: 'provider_reported', + }, + tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' }, + cost: { usd: null, source: 'unavailable' }, + execution: { status: 'execution_error' }, }); process.stderr.write( `\n ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`, diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts index 3bda4e2c7..45b137fe3 100644 --- a/apps/cli/src/commands/results/export.ts +++ b/apps/cli/src/commands/results/export.ts @@ -8,9 +8,9 @@ * index.jsonl — per-test manifest with artifact pointers * / * summary.json — per-case aggregate - * attempt-1/result.json — per-attempt result - * 
attempt-1/grading.json — per-attempt grading artifact (assertions, graders) - * attempt-1/metrics.json — per-attempt metrics artifact + * sample-1/result.json — per-sample result + * sample-1/grading.json — per-sample grading artifact (assertions, graders) + * sample-1/metrics.json — per-sample metrics artifact * * This module delegates artifact building to the shared artifact-writer so * that summary/grading/timing schemas stay aligned with `agentv eval`. @@ -36,6 +36,7 @@ import type { import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js'; import { RESULT_INDEX_FILENAME, + RUN_INTERNAL_DIRNAME, isReservedResultsNamespace, isRunManifestPath, } from '../eval/result-layout.js'; @@ -69,7 +70,7 @@ export async function exportResults( duplicatePolicy: options?.duplicatePolicy ?? 'update', additionalArtifacts: createExportBundleArtifactsWriter({ outputDir, - sourceBaseDir: path.dirname(sourceFile), + sourceBaseDir: runRootFromIndexPath(sourceFile), sourceRecordsByResult: buildSourceRecordMap(results, sourceIndexRecords), }), }); @@ -85,7 +86,7 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string { throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`); } - const runDir = path.dirname(sourceFile); + const runDir = runRootFromIndexPath(sourceFile); const segments = path.normalize(runDir).split(path.sep).filter(Boolean); const resultsIndex = segments.lastIndexOf('results'); if (resultsIndex >= 0 && resultsIndex < segments.length - 2) { @@ -104,11 +105,19 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string { export function deriveExportRunId(sourceFile: string): string { if (isRunManifestPath(sourceFile)) { - return path.basename(path.dirname(sourceFile)); + return path.basename(runRootFromIndexPath(sourceFile)); } return path.basename(sourceFile, path.extname(sourceFile)); } +function runRootFromIndexPath(sourceFile: string): string { + const indexDir = 
path.dirname(sourceFile); + if (path.basename(indexDir) === RUN_INTERNAL_DIRNAME) { + return path.dirname(indexDir); + } + return indexDir; +} + export async function loadExportSource( source: string | undefined, cwd: string, @@ -222,7 +231,7 @@ export function buildProjectionBundleFromExportedIndex(options: { readonly includeRawContent?: boolean; readonly duplicatePolicy?: ExportDuplicatePolicy; }): ProjectionBundle { - const indexPath = path.join(options.outputDir, RESULT_INDEX_FILENAME); + const indexPath = path.join(options.outputDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME); const indexRecords = readIndexArtifactEntries(indexPath); const emittedResults = loadManifestResults(indexPath); @@ -327,7 +336,7 @@ export const resultsExportCommand = command({ duplicatePolicy: policy, additionalArtifacts: createExportBundleArtifactsWriter({ outputDir, - sourceBaseDir: path.dirname(sourceFile), + sourceBaseDir: runRootFromIndexPath(sourceFile), sourceRecordsByResult: buildSourceRecordMap(results, indexRecords ?? 
[]), }), }); diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 32b7a8156..c473d9927 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -14,7 +14,7 @@ import { traceEnvelopeToTranscriptMessages, } from '@agentv/core'; -import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js'; +import type { GradingArtifact } from '../eval/artifact-writer.js'; import { isDirectoryPath, isRunManifestPath, @@ -37,6 +37,7 @@ export interface ResultManifestRecord { readonly attempts?: readonly { readonly attempt?: number; readonly attempt_path?: string; + readonly sample_path?: string; readonly run_path?: string; readonly score?: number; readonly verdict?: string; @@ -45,6 +46,7 @@ export interface ResultManifestRecord { readonly trials?: readonly { readonly attempt?: number; readonly attempt_path?: string; + readonly sample_path?: string; readonly run_path?: string; readonly score?: number; readonly verdict?: string; @@ -86,6 +88,34 @@ export interface ResultManifestRecord { readonly metadata?: Record; } +interface MetricsUsageArtifact { + readonly duration?: { + readonly total_ms?: number; + }; + readonly tokens?: { + readonly input?: number; + readonly output?: number; + readonly reasoning?: number; + }; + readonly cost?: { + readonly usd?: number | null; + }; +} + +interface LegacyTimingArtifact { + readonly duration_ms?: number; + readonly token_usage?: { + readonly input?: number; + readonly output?: number; + readonly reasoning?: number; + }; +} + +function manifestBaseDir(indexPath: string): string { + const dir = path.dirname(indexPath); + return path.basename(dir) === '.internal' ? path.dirname(dir) : dir; +} + export interface ManifestHydrationOptions { /** * Defaults to true for report/inspect consumers that need a trace projection. 
@@ -304,7 +334,8 @@ function hydrateManifestRecord( options: ManifestHydrationOptions, ): EvaluationResult { const grading = readOptionalJson(baseDir, record.grading_path); - const timing = readOptionalJson(baseDir, record.timing_path); + const metrics = readOptionalJson(baseDir, record.metrics_path); + const timing = metrics ?? readOptionalJson(baseDir, record.timing_path); const testId = record.test_id ?? 'unknown'; const gradingAssertions = grading ? readGradingAssertionResults(grading as unknown as Record) @@ -334,15 +365,24 @@ function hydrateManifestRecord( // `evaluators` was renamed to `graders` in v4.13 — read both for backwards compat with old artifacts. // TODO: remove `evaluators` fallback once old run directories are no longer in use. gradingScores ?? (record.scores as EvaluationResult['scores']), - tokenUsage: timing?.token_usage + tokenUsage: metrics?.tokens ? { - input: timing.token_usage.input, - output: timing.token_usage.output, - reasoning: timing.token_usage.reasoning, + input: metrics.tokens.input, + output: metrics.tokens.output, + reasoning: metrics.tokens.reasoning, } - : record.token_usage, - durationMs: timing?.duration_ms ?? record.duration_ms, - costUsd: record.cost_usd, + : (timing as LegacyTimingArtifact | undefined)?.token_usage + ? { + input: (timing as LegacyTimingArtifact).token_usage?.input, + output: (timing as LegacyTimingArtifact).token_usage?.output, + reasoning: (timing as LegacyTimingArtifact).token_usage?.reasoning, + } + : record.token_usage, + durationMs: + metrics?.duration?.total_ms ?? + (timing as LegacyTimingArtifact | undefined)?.duration_ms ?? + record.duration_ms, + costUsd: metrics?.cost?.usd ?? record.cost_usd, input: hydrateInput(baseDir, record), output: hydrateOutput(baseDir, record) ?? 
'', trace: hydrateTrace(baseDir, record, options), @@ -369,7 +409,7 @@ export function loadManifestResults( const resolvedSourceFile = resolveRunManifestPath(sourceFile); const content = readFileSync(resolvedSourceFile, 'utf8'); const records = parseResultRows(content, resolvedSourceFile); - const baseDir = path.dirname(resolvedSourceFile); + const baseDir = manifestBaseDir(resolvedSourceFile); return records.map((record) => hydrateManifestRecord(baseDir, record, options)); } diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts index 1d4ee85c7..91ec2a9d6 100644 --- a/apps/cli/src/commands/results/projection-bundle.ts +++ b/apps/cli/src/commands/results/projection-bundle.ts @@ -69,7 +69,7 @@ export interface ProjectionBundleEntry { readonly result_score: number; readonly execution_status?: string; readonly grading_path?: string; - readonly timing_path?: string; + readonly metrics_path?: string; readonly assertion_count: number; readonly scores?: readonly TraceEnvelopeScoreWire[]; }; @@ -88,7 +88,7 @@ export type ProjectionBundleArtifactRefs = Partial< | 'result_dir' | 'summary_path' | 'grading_path' - | 'timing_path' + | 'metrics_path' | 'input_path' | 'output_path' | 'answer_path' @@ -156,7 +156,7 @@ function artifactRefs( ): ProjectionBundleArtifactRefs { const metadataRefs = dropUndefined({ status: options.status, - timing_path: indexEntry.timing_path, + metrics_path: indexEntry.metrics_path, }); if (!options.includeRawContent) { @@ -310,7 +310,7 @@ function buildEntry( result_score: result.score, execution_status: result.executionStatus, grading_path: refs.grading_path, - timing_path: refs.timing_path, + metrics_path: refs.metrics_path, assertion_count: result.assertions?.length ?? 
0, scores, }); diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts index e9a298c0e..caa6c9014 100644 --- a/apps/cli/src/commands/results/remote.ts +++ b/apps/cli/src/commands/results/remote.ts @@ -411,7 +411,10 @@ export async function listMergedResultFiles( raw_filename: r.run_id, source: 'remote' as const, on_remote: true, - path: path.join(config.path, r.manifest_path), + path: path.join( + config.path, + r.index_path ?? (r as { manifest_path?: string }).manifest_path ?? '', + ), ...(r.summary_path && { summaryPath: path.join(config.path, r.summary_path) }), experiment: r.experiment, ...(r.target && { target: r.target }), diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts index 7cac9cfd8..22397aeea 100644 --- a/apps/cli/src/commands/results/report.ts +++ b/apps/cli/src/commands/results/report.ts @@ -5,6 +5,7 @@ import { command, option, optional, string } from 'cmd-ts'; import type { EvaluationResult, RunRuntimeSourceMetadata } from '@agentv/core'; +import { RUN_INTERNAL_DIRNAME } from '../eval/result-layout.js'; import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js'; import { RESULTS_REPORT_TEMPLATE } from './report-template.js'; import { resolveSourceFile, sourceArg } from './shared.js'; @@ -43,7 +44,10 @@ function readSummaryEvalFile(sourceFile: string): string | undefined { } function readSummaryReportMetadata(sourceFile: string): RunSummaryReportMetadata { - const summaryPath = path.join(path.dirname(sourceFile), 'summary.json'); + const sourceDir = path.dirname(sourceFile); + const runDir = + path.basename(sourceDir) === RUN_INTERNAL_DIRNAME ? 
path.dirname(sourceDir) : sourceDir; + const summaryPath = path.join(runDir, 'summary.json'); if (!existsSync(summaryPath)) { return {}; } @@ -67,7 +71,10 @@ function readSummaryReportMetadata(sourceFile: string): RunSummaryReportMetadata } export function deriveReportPath(sourceFile: string): string { - return path.join(path.dirname(sourceFile), 'report.html'); + const sourceDir = path.dirname(sourceFile); + const runDir = + path.basename(sourceDir) === RUN_INTERNAL_DIRNAME ? path.dirname(sourceDir) : sourceDir; + return path.join(runDir, 'report.html'); } function serializeReportResult( diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 01853b465..cea680041 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -777,7 +777,12 @@ function addTrialRunCatalogEntries( : undefined; if (!resultDir) return; for (const trial of record.attempts ?? record.trials ?? []) { - const rawPath = typeof trial.attempt_path === 'string' ? trial.attempt_path : trial.run_path; + const rawPath = + typeof trial.sample_path === 'string' + ? trial.sample_path + : typeof trial.attempt_path === 'string' + ? trial.attempt_path + : trial.run_path; const runPath = rawPath ? 
normalizeArtifactRelativePath(rawPath) : undefined; if (!runPath) continue; const runDir = path.posix.join(resultDir, runPath); @@ -799,12 +804,6 @@ function addTrialRunCatalogEntries( path.posix.join(runDir, 'metrics.json'), 'artifact', ); - addDirectArtifactCatalogEntry( - entries, - seen, - path.posix.join(runDir, 'timing.json'), - 'artifact', - ); } } @@ -824,6 +823,7 @@ function buildResultArtifactCatalog( addDirectArtifactCatalogEntry(entries, seen, record.summary_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.grading_path, 'artifact'); + addDirectArtifactCatalogEntry(entries, seen, record.metrics_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.timing_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.input_path, 'artifact'); addDirectArtifactCatalogEntry(entries, seen, record.output_path, 'artifact'); @@ -1124,7 +1124,12 @@ function buildRepeatTrialReadModels( : undefined; return attempts.map((trial) => { - const rawPath = typeof trial.attempt_path === 'string' ? trial.attempt_path : trial.run_path; + const rawPath = + typeof trial.sample_path === 'string' + ? trial.sample_path + : typeof trial.attempt_path === 'string' + ? trial.attempt_path + : trial.run_path; const runPath = rawPath ? normalizeArtifactRelativePath(rawPath) : undefined; const metricsPath = caseTrialArtifactPath(resultDir, runPath, 'metrics.json'); const timingPath = caseTrialArtifactPath(resultDir, runPath, 'timing.json'); @@ -1137,21 +1142,35 @@ function buildRepeatTrialReadModels( const metrics = readArtifactJsonObject(baseDir, metricsPath); const timing = readArtifactJsonObject(baseDir, timingPath); const toolCalls = objectField(metrics, 'tool_calls'); - const tokenUsage = objectField(timing, 'token_usage'); + const tokenUsage = objectField(metrics, 'tokens') ?? 
objectField(timing, 'token_usage'); + const duration = objectField(metrics, 'duration'); + const cost = objectField(metrics, 'cost'); const transcriptSummary = objectField(trial, 'transcript_summary') ?? objectField(runResult, 'transcript_summary'); return { ...trial, - ...(numberField(timing, 'duration_ms') !== undefined && { - duration_ms: numberField(timing, 'duration_ms'), + ...(numberField(duration, 'total_ms') !== undefined && { + duration_ms: numberField(duration, 'total_ms'), }), - ...(numberField(timing, 'total_tokens') !== undefined && { - total_tokens: numberField(timing, 'total_tokens'), + ...(numberField(duration, 'total_ms') === undefined && + numberField(timing, 'duration_ms') !== undefined && { + duration_ms: numberField(timing, 'duration_ms'), + }), + ...(numberField(tokenUsage, 'total') !== undefined && { + total_tokens: numberField(tokenUsage, 'total'), }), - ...(numberField(timing, 'cost_usd') !== undefined && { - cost_usd: numberField(timing, 'cost_usd'), + ...(numberField(tokenUsage, 'total') === undefined && + numberField(timing, 'total_tokens') !== undefined && { + total_tokens: numberField(timing, 'total_tokens'), + }), + ...(numberField(cost, 'usd') !== undefined && { + cost_usd: numberField(cost, 'usd'), }), + ...(numberField(cost, 'usd') === undefined && + numberField(timing, 'cost_usd') !== undefined && { + cost_usd: numberField(timing, 'cost_usd'), + }), ...(tokenUsage && { token_usage: tokenUsage }), ...(numberField(metrics, 'total_tool_calls') !== undefined && { total_tool_calls: numberField(metrics, 'total_tool_calls'), @@ -1159,7 +1178,7 @@ function buildRepeatTrialReadModels( ...(toolCalls && { tool_calls: toolCalls }), ...(transcriptSummary && { transcript_summary: transcriptSummary }), ...(metricsPath && { metrics_path: metricsPath }), - ...(timingPath && { timing_path: timingPath }), + ...(timing && timingPath && { timing_path: timingPath }), ...(gradingPath && { grading_path: gradingPath }), ...(transcriptPath && { 
transcript_path: transcriptPath }), ...(transcriptRawPath && { transcript_raw_path: transcriptRawPath }), diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts index a98511a30..77e0ee46e 100644 --- a/apps/cli/src/commands/results/validate.ts +++ b/apps/cli/src/commands/results/validate.ts @@ -39,9 +39,18 @@ interface IndexEntry { readonly summary_path?: string; readonly grading_path?: string; readonly timing_path?: string; + readonly metrics_path?: string; readonly result_dir?: string; - readonly attempts?: readonly { readonly attempt_path?: string; readonly run_path?: string }[]; - readonly trials?: readonly { readonly attempt_path?: string; readonly run_path?: string }[]; + readonly attempts?: readonly { + readonly attempt_path?: string; + readonly sample_path?: string; + readonly run_path?: string; + }[]; + readonly trials?: readonly { + readonly attempt_path?: string; + readonly sample_path?: string; + readonly run_path?: string; + }[]; readonly [key: string]: unknown; } @@ -304,13 +313,13 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[] } } - // Check timing.json - if (entry.timing_path) { - const timingPath = path.join(runDir, entry.timing_path); - if (!existsSync(timingPath)) { + // Check metrics.json. Legacy timing_path is tolerated for old bundles. 
+ if (entry.metrics_path) { + const metricsPath = path.join(runDir, entry.metrics_path); + if (!existsSync(metricsPath)) { diagnostics.push({ severity: 'warning', - message: `${testId}: timing.json not found at '${entry.timing_path}'`, + message: `${testId}: metrics.json not found at '${entry.metrics_path}'`, }); } } diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts index ae72d8258..772ecff20 100644 --- a/apps/cli/test/commands/eval/aggregate.test.ts +++ b/apps/cli/test/commands/eval/aggregate.test.ts @@ -52,14 +52,18 @@ function writeJsonlIndex( results: Partial[], filename = RESULT_INDEX_FILENAME, ): string { - const indexPath = path.join(dir, filename); + const indexPath = + filename === RESULT_INDEX_FILENAME + ? path.join(dir, '.internal', filename) + : path.join(dir, filename); + mkdirSync(path.dirname(indexPath), { recursive: true }); const lines = results.map((r) => JSON.stringify(toSnakeCaseDeep(makeResult(r)))).join('\n'); writeFileSync(indexPath, `${lines}\n`); return indexPath; } function readIndexRows(dir: string): Array<{ test_id: string; result_dir: string }> { - const indexPath = path.join(dir, RESULT_INDEX_FILENAME); + const indexPath = path.join(dir, '.internal', RESULT_INDEX_FILENAME); if (!existsSync(indexPath)) { return readdirSync(dir) .filter((entry) => /--[a-f0-9]{12}$/.test(entry)) @@ -217,11 +221,11 @@ describe('aggregateRunDir', () => { expect(result.targetCount).toBe(1); const summary = JSON.parse(readFileSync(result.summaryPath, 'utf8')); - expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME); + expect(summary.index_path).toBe('.internal/index.jsonl'); expect(summary.metadata.tests_run).toContain('a'); expect(summary.metadata.tests_run).toContain('b'); expect(summary.run_summary.x).toBeDefined(); - expect(summary.timing.total_tokens).toBeGreaterThanOrEqual(0); + expect(summary.usage.total_tokens).toBeGreaterThanOrEqual(0); }); it('reads canonical index.jsonl bundles', async () 
=> { @@ -238,7 +242,7 @@ describe('aggregateRunDir', () => { expect(result.testCount).toBe(2); const summary = JSON.parse(readFileSync(result.summaryPath, 'utf8')); - expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME); + expect(summary.index_path).toBe('.internal/index.jsonl'); expect(summary.metadata.tests_run).toEqual(['case-a', 'case-b']); }); @@ -283,23 +287,23 @@ describe('writePerTestArtifacts', () => { rmSync(tmpDir, { recursive: true, force: true }); }); - it('writes grading.json and timing.json for each result', async () => { + it('writes grading.json and metrics.json for each result', async () => { const results = [makeResult({ testId: 'test-1' }), makeResult({ testId: 'test-2' })]; await writePerTestArtifacts(results, tmpDir); const grading1 = JSON.parse( - readFileSync(rowRunPath(tmpDir, 'test-1', 'attempt-1', 'grading.json'), 'utf8'), + readFileSync(rowRunPath(tmpDir, 'test-1', 'sample-1', 'grading.json'), 'utf8'), ); expect(grading1.assertion_results).toHaveLength(1); - const timing1 = JSON.parse( - readFileSync(rowRunPath(tmpDir, 'test-1', 'attempt-1', 'timing.json'), 'utf8'), + const metrics1 = JSON.parse( + readFileSync(rowRunPath(tmpDir, 'test-1', 'sample-1', 'metrics.json'), 'utf8'), ); - expect(timing1.total_tokens).toBeGreaterThanOrEqual(0); + expect(metrics1.tokens.total).toBeGreaterThanOrEqual(0); const grading2 = JSON.parse( - readFileSync(rowRunPath(tmpDir, 'test-2', 'attempt-1', 'grading.json'), 'utf8'), + readFileSync(rowRunPath(tmpDir, 'test-2', 'sample-1', 'grading.json'), 'utf8'), ); expect(grading2.assertion_results).toHaveLength(1); }); @@ -310,7 +314,7 @@ describe('writePerTestArtifacts', () => { await writePerTestArtifacts(results, tmpDir); const answer = readFileSync( - rowRunPath(tmpDir, 'test-1', 'attempt-1', 'outputs', 'answer.md'), + rowRunPath(tmpDir, 'test-1', 'sample-1', 'outputs', 'answer.md'), 'utf8', ); expect(answer).toContain('hello'); diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts 
b/apps/cli/test/commands/eval/artifact-writer.test.ts index 5043cdb68..63ba0233e 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -397,11 +397,11 @@ describe('buildGradingArtifact', () => { }); // --------------------------------------------------------------------------- -// Timing artifact +// Metrics usage artifact // --------------------------------------------------------------------------- describe('buildTimingArtifact', () => { - it('aggregates timing across results', () => { + it('aggregates duration and token usage across results', () => { const results = [ makeResult({ durationMs: 30000, @@ -413,30 +413,26 @@ describe('buildTimingArtifact', () => { } as Partial), ]; - const timing = buildTimingArtifact(results); + const metrics = buildTimingArtifact(results); - expect(timing.total_tokens).toBe(4500); - expect(timing.duration_ms).toBe(90000); - expect(timing.total_duration_seconds).toBe(90); - expect(timing.token_usage).toEqual({ input: 3000, output: 1500, reasoning: 0 }); + expect(metrics.tokens).toMatchObject({ total: 4500, input: 3000, output: 1500, reasoning: 0 }); + expect(metrics.duration).toMatchObject({ total_ms: 90000, total_seconds: 90 }); }); - it('handles results with no timing data', () => { + it('handles results with no usage data', () => { const results = [makeResult({})]; - const timing = buildTimingArtifact(results); + const metrics = buildTimingArtifact(results); - expect(timing.total_tokens).toBe(0); - expect(timing.duration_ms).toBe(0); - expect(timing.total_duration_seconds).toBe(0); - expect(timing.token_usage).toEqual({ input: 0, output: 0, reasoning: 0 }); + expect(metrics.tokens).toMatchObject({ total: 0, input: 0, output: 0, reasoning: 0 }); + expect(metrics.duration).toMatchObject({ total_ms: 0, total_seconds: 0 }); }); it('handles empty results array', () => { - const timing = buildTimingArtifact([]); + const metrics = buildTimingArtifact([]); - 
expect(timing.total_tokens).toBe(0); - expect(timing.duration_ms).toBe(0); - expect(timing.total_duration_seconds).toBe(0); + expect(metrics.tokens.total).toBe(0); + expect(metrics.duration.total_ms).toBe(0); + expect(metrics.duration.total_seconds).toBe(0); }); it('handles partial token usage', () => { @@ -446,9 +442,9 @@ describe('buildTimingArtifact', () => { } as Partial), ]; - const timing = buildTimingArtifact(results); - expect(timing.total_tokens).toBe(500); - expect(timing.token_usage).toEqual({ input: 500, output: 0, reasoning: 0 }); + const metrics = buildTimingArtifact(results); + expect(metrics.tokens.total).toBe(500); + expect(metrics.tokens).toMatchObject({ input: 500, output: 0, reasoning: 0 }); }); }); @@ -561,7 +557,7 @@ describe('buildRunSummaryArtifact', () => { [makeResult({})], 'test.eval.yaml', 'baseline-v2', - 'attempt-1', + 'sample-1', undefined, undefined, undefined, @@ -776,8 +772,8 @@ describe('buildIndexArtifactEntry', () => { }), { outputDir: '/tmp/artifacts', - gradingPath: '/tmp/artifacts/alpha/attempt-1/grading.json', - timingPath: '/tmp/artifacts/alpha/attempt-1/timing.json', + gradingPath: '/tmp/artifacts/alpha/sample-1/grading.json', + metricsPath: '/tmp/artifacts/alpha/sample-1/metrics.json', outputPath: '/tmp/artifacts/alpha/outputs/answer.md', answerPath: '/tmp/artifacts/alpha/outputs/answer.md', }, @@ -805,16 +801,18 @@ describe('buildIndexArtifactEntry', () => { ], }, ], + named_scores: { quality: 0.7 }, + provenance: 'native', execution_status: 'quality_failure', error: 'model drift', - grading_path: 'alpha/attempt-1/grading.json', - timing_path: 'alpha/attempt-1/timing.json', + grading_path: 'alpha/sample-1/grading.json', + metrics_path: 'alpha/sample-1/metrics.json', output_path: 'alpha/outputs/answer.md', answer_path: 'alpha/outputs/answer.md', attempts: [ { attempt: 0, - attempt_path: 'attempt-1', + sample_path: 'sample-1', score: 0.9, verdict: 'fail', scores: [ @@ -875,8 +873,8 @@ describe('buildIndexArtifactEntry', () 
=> { }), { outputDir: '/tmp/artifacts', - gradingPath: '/tmp/artifacts/alpha/attempt-1/grading.json', - timingPath: '/tmp/artifacts/alpha/attempt-1/timing.json', + gradingPath: '/tmp/artifacts/alpha/sample-1/grading.json', + metricsPath: '/tmp/artifacts/alpha/sample-1/metrics.json', }, ); @@ -940,9 +938,9 @@ describe('parseJsonlResults', () => { artifactPointers: { transcript: { ref: 'agentv/artifacts/v1', - key: 'transcripts/pointer-row/attempt-1/transcript-raw.jsonl', + key: 'transcripts/pointer-row/sample-1/transcript-raw.jsonl', object_version: 'sha256:test', - path: 'pointer-row/attempt-1/transcript-raw.jsonl', + path: 'pointer-row/sample-1/transcript-raw.jsonl', sha256: 'test', size: 1, schema_version: 'agentv.transcript.v1', @@ -960,7 +958,7 @@ describe('parseJsonlResults', () => { test_id: 'file-changes-row', target: 'codex', score: 1, - fileChangesPath: 'file-changes-row/attempt-1/outputs/file_changes.diff', + fileChangesPath: 'file-changes-row/sample-1/outputs/file_changes.diff', })}\n`; expect(() => parseJsonlResults(content)).toThrow(/Use "file_changes_path"/); @@ -972,7 +970,7 @@ describe('parseJsonlResults', () => { target: 'codex', score: 1, output: 'done', - raw_provider_log_path: 'raw-log-case/attempt-1/provider.log', + raw_provider_log_path: 'raw-log-case/sample-1/provider.log', })}\n`; const results = parseJsonlResults(content); @@ -1061,15 +1059,16 @@ describe('schema compatibility', () => { expect(typeof grading.summary.pass_rate).toBe('number'); }); - it('timing has total_tokens, duration_ms, total_duration_seconds, token_usage', () => { - const timing = buildTimingArtifact([makeResult({})]); + it('metrics usage has duration, tokens, cost, execution, and trajectory sections', () => { + const metrics = buildTimingArtifact([makeResult({})]); - expect(timing).toHaveProperty('total_tokens'); - expect(timing).toHaveProperty('duration_ms'); - expect(timing).toHaveProperty('total_duration_seconds'); - expect(timing).toHaveProperty('token_usage'); - 
expect(timing.token_usage).toHaveProperty('input'); - expect(timing.token_usage).toHaveProperty('output'); + expect(metrics).toHaveProperty('duration'); + expect(metrics).toHaveProperty('tokens'); + expect(metrics).toHaveProperty('cost'); + expect(metrics).toHaveProperty('execution'); + expect(metrics).toHaveProperty('trajectory'); + expect(metrics.tokens).toHaveProperty('input'); + expect(metrics.tokens).toHaveProperty('output'); }); it('benchmark run_summary has pass_rate/time_seconds/tokens with mean/stddev', () => { @@ -1099,6 +1098,9 @@ describe('writeArtifactsFromResults', () => { afterEach(async () => { await rm(testDir, { recursive: true, force: true }).catch(() => undefined); + await rm(path.join(import.meta.dir, '.indexes'), { recursive: true, force: true }).catch( + () => undefined, + ); }); it('writes summary, index.jsonl, and per-run artifact files', async () => { @@ -1112,7 +1114,7 @@ describe('writeArtifactsFromResults', () => { }); expect(path.basename(paths.indexPath)).toBe('index.jsonl'); - expect(paths.indexPath).toBe(path.join(testDir, 'index.jsonl')); + expect(paths.indexPath).toBe(path.join(testDir, '.internal', 'index.jsonl')); expect(existsSync(paths.indexPath)).toBe(true); const indexLines = await readIndexLines(paths.indexPath); expect(indexLines).toHaveLength(2); @@ -1122,58 +1124,52 @@ describe('writeArtifactsFromResults', () => { // Check per-test artifact directories const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries.sort()).toEqual([ - alphaRowDir, - betaRowDir, - RESULT_INDEX_FILENAME, - 'summary.json', - ]); + expect(artifactEntries.sort()).toEqual(['.internal', alphaRowDir, betaRowDir, 'summary.json']); const rootSummary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); - expect(rootSummary.manifest_path).toBe(RESULT_INDEX_FILENAME); + expect(rootSummary.index_path).toBe('.internal/index.jsonl'); const alphaEntries = await readdir(path.join(paths.testArtifactDir, 
alphaRowDir)); - expect(alphaEntries.sort()).toEqual(['attempt-1', 'summary.json']); + expect(alphaEntries.sort()).toEqual(['sample-1', 'summary.json']); const alphaRunEntries = await readdir( - path.join(paths.testArtifactDir, alphaRowDir, 'attempt-1'), + path.join(paths.testArtifactDir, alphaRowDir, 'sample-1'), ); expect(alphaRunEntries.sort()).toEqual([ 'grading.json', 'metrics.json', 'outputs', 'result.json', - 'timing.json', 'transcript-raw.jsonl', 'transcript.json', ]); const alphaGrading: GradingArtifact = JSON.parse( await readFile( - path.join(paths.testArtifactDir, alphaRowDir, 'attempt-1', 'grading.json'), + path.join(paths.testArtifactDir, alphaRowDir, 'sample-1', 'grading.json'), 'utf8', ), ); expect(alphaGrading.summary).toBeDefined(); expect(alphaGrading).not.toHaveProperty('execution_metrics'); - const alphaTiming: TimingArtifact = JSON.parse( + const alphaMetrics: TimingArtifact = JSON.parse( await readFile( - path.join(paths.testArtifactDir, alphaRowDir, 'attempt-1', 'timing.json'), + path.join(paths.testArtifactDir, alphaRowDir, 'sample-1', 'metrics.json'), 'utf8', ), ); - expect(alphaTiming.duration_ms).toBe(5000); + expect(alphaMetrics.duration.total_ms).toBe(5000); const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); expect(summary.metadata.eval_file).toBe('my-eval.yaml'); expect(summary.metadata.tests_run.sort()).toEqual(['alpha', 'beta']); - expect(summary.timing.duration_ms).toBe(13000); + expect(summary.metrics.duration.total_ms).toBe(13000); expect(indexLines[0]?.summary_path).toBe(`${alphaRowDir}/summary.json`); - expect(indexLines[0]?.grading_path).toBe(`${alphaRowDir}/attempt-1/grading.json`); - expect(indexLines[0]?.timing_path).toBe(`${alphaRowDir}/attempt-1/timing.json`); - expect(indexLines[0]?.metrics_path).toBe(`${alphaRowDir}/attempt-1/metrics.json`); + expect(indexLines[0]?.grading_path).toBe(`${alphaRowDir}/sample-1/grading.json`); + expect(indexLines[0]?.timing_path).toBeUndefined(); + 
expect(indexLines[0]?.metrics_path).toBe(`${alphaRowDir}/sample-1/metrics.json`); }); it('writes optional runtime source metadata to summary and index rows', async () => { @@ -1277,8 +1273,8 @@ describe('writeArtifactsFromResults', () => { const [indexEntry] = await readIndexLines(paths.indexPath); const repeatRowDir = expectRowDir(indexEntry, 'repeat-case'); expect(indexEntry?.attempts).toMatchObject([ - { attempt: 0, attempt_path: 'attempt-1', score: 0.25, verdict: 'fail' }, - { attempt: 1, attempt_path: 'attempt-2', score: 1, verdict: 'pass' }, + { attempt: 0, sample_path: 'sample-1', score: 0.25, verdict: 'fail' }, + { attempt: 1, sample_path: 'sample-2', score: 1, verdict: 'pass' }, ]); expect(indexEntry?.aggregation).toEqual({ strategy: 'confidence_interval', @@ -1297,7 +1293,7 @@ describe('writeArtifactsFromResults', () => { expect(indexEntry?.metrics_path).toBeUndefined(); const repeatEntries = await readdir(path.join(paths.testArtifactDir, repeatRowDir)); - expect(repeatEntries.sort()).toEqual(['attempt-1', 'attempt-2', 'summary.json']); + expect(repeatEntries.sort()).toEqual(['sample-1', 'sample-2', 'summary.json']); const caseSummary = JSON.parse( await readFile(path.join(paths.testArtifactDir, repeatRowDir, 'summary.json'), 'utf8'), @@ -1308,26 +1304,21 @@ describe('writeArtifactsFromResults', () => { pass_rate: '50%', mean_duration_ms: 3000, mean_duration_seconds: 3, - duration_ms: 6000, - total_duration_seconds: 6, - duration_stats: { - count: 2, - mean_ms: 3000, - mean_seconds: 3, - stddev_ms: 1000, - stddev_seconds: 1, - min_ms: 2000, - max_ms: 4000, - }, - total_tokens: 0, - cost_usd: null, - token_usage: { input: 0, output: 0, reasoning: 0 }, - usage_sources: { - token_usage: 'unavailable', - total_tokens: 'unavailable', - duration: 'aggregate', - cost: 'unavailable', + duration: { + total_ms: 6000, + total_seconds: 6, + stats: { + count: 2, + mean_ms: 3000, + mean_seconds: 3, + stddev_ms: 1000, + stddev_seconds: 1, + min_ms: 2000, + max_ms: 4000, 
+ }, }, + tokens: { total: 0, input: 0, output: 0, reasoning: 0 }, + cost: { usd: null }, }); expect(typeof caseSummary.fingerprint).toBe('string'); @@ -1335,14 +1326,13 @@ describe('writeArtifactsFromResults', () => { readFile(path.join(paths.testArtifactDir, repeatRowDir, 'grading.json'), 'utf8'), ).rejects.toThrow(); - for (const runDir of ['attempt-1', 'attempt-2']) { + for (const runDir of ['sample-1', 'sample-2']) { const runEntries = await readdir(path.join(paths.testArtifactDir, repeatRowDir, runDir)); expect(runEntries.sort()).toEqual([ 'grading.json', 'metrics.json', 'outputs', 'result.json', - 'timing.json', 'transcript-raw.jsonl', 'transcript.json', ]); @@ -1350,7 +1340,7 @@ describe('writeArtifactsFromResults', () => { const runOneResult = JSON.parse( await readFile( - path.join(paths.testArtifactDir, repeatRowDir, 'attempt-1', 'result.json'), + path.join(paths.testArtifactDir, repeatRowDir, 'sample-1', 'result.json'), 'utf8', ), ) as Record; @@ -1365,22 +1355,20 @@ describe('writeArtifactsFromResults', () => { transcript_path: './transcript.json', transcript_raw_path: './transcript-raw.jsonl', output_paths: { answer: './outputs/answer.md' }, - timing: { - duration_ms: 2000, - }, }); + expect(runOneResult).not.toHaveProperty('timing'); expect(runOneResult).not.toHaveProperty('status'); expect(indexEntry?.attempts?.[0]?.transcript_summary).toEqual(runOneResult.transcript_summary); const runTwoAnswer = await readFile( - path.join(paths.testArtifactDir, repeatRowDir, 'attempt-2', 'outputs', 'answer.md'), + path.join(paths.testArtifactDir, repeatRowDir, 'sample-2', 'outputs', 'answer.md'), 'utf8', ); expect(runTwoAnswer).toBe('second attempt'); const runTwoResult = JSON.parse( await readFile( - path.join(paths.testArtifactDir, repeatRowDir, 'attempt-2', 'result.json'), + path.join(paths.testArtifactDir, repeatRowDir, 'sample-2', 'result.json'), 'utf8', ), ) as Record; @@ -1391,10 +1379,8 @@ describe('writeArtifactsFromResults', () => { metrics_path: 
'./metrics.json', transcript_path: './transcript.json', transcript_raw_path: './transcript-raw.jsonl', - timing: { - duration_ms: 4000, - }, }); + expect(runTwoResult).not.toHaveProperty('timing'); expect(runTwoResult).not.toHaveProperty('status'); expect(indexEntry?.attempts?.[1]?.transcript_summary).toEqual(runTwoResult.transcript_summary); }); @@ -1428,16 +1414,16 @@ describe('writeArtifactsFromResults', () => { const paths = await writeArtifactsFromResults([], testDir); const artifactEntries = await readdir(paths.testArtifactDir); - expect(artifactEntries.sort()).toEqual([RESULT_INDEX_FILENAME, 'summary.json']); + expect(artifactEntries.sort()).toEqual(['.internal', 'summary.json']); const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); - expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME); + expect(summary.index_path).toBe('.internal/index.jsonl'); expect(summary.notes).toContain('No results to summarize'); - expect(summary.timing.total_tokens).toBe(0); + expect(summary.metrics.tokens.total).toBe(0); expect(await readFile(paths.indexPath, 'utf8')).toBe(''); }); - it('writes grading.json and timing.json inside each test directory', async () => { + it('writes grading.json and metrics.json inside each test directory', async () => { const results = [ makeResult({ testId: 'test-1', @@ -1458,20 +1444,20 @@ describe('writeArtifactsFromResults', () => { const testTwo = indexLines.find((line) => line.test_id === 'test-2'); const gradingOne: GradingArtifact = JSON.parse( - await readFile(runArtifactPath(testDir, testOne, 'attempt-1', 'grading.json'), 'utf8'), + await readFile(runArtifactPath(testDir, testOne, 'sample-1', 'grading.json'), 'utf8'), ); const gradingTwo: GradingArtifact = JSON.parse( - await readFile(runArtifactPath(testDir, testTwo, 'attempt-1', 'grading.json'), 'utf8'), + await readFile(runArtifactPath(testDir, testTwo, 'sample-1', 'grading.json'), 'utf8'), ); - const timingOne: TimingArtifact = JSON.parse( - 
await readFile(runArtifactPath(testDir, testOne, 'attempt-1', 'timing.json'), 'utf8'), + const metricsOne: TimingArtifact = JSON.parse( + await readFile(runArtifactPath(testDir, testOne, 'sample-1', 'metrics.json'), 'utf8'), ); expect(gradingOne.summary.total).toBe(1); expect(gradingOne.summary.passed).toBe(1); expect(gradingTwo.summary.total).toBe(2); expect(gradingTwo.summary.failed).toBe(1); - expect(timingOne.duration_ms).toBe(0); + expect(metricsOne.duration.total_ms).toBe(0); }); it('writes normalized transcript.json plus raw transcript evidence', async () => { @@ -1528,12 +1514,12 @@ describe('writeArtifactsFromResults', () => { const [indexLine] = await readIndexLines(paths.indexPath); const rowDir = expectRowDir(indexLine, 'transcript-case'); - const transcriptPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript.json'); + const transcriptPath = runArtifactPath(testDir, indexLine, 'sample-1', 'transcript.json'); const transcript = JSON.parse(await readFile(transcriptPath, 'utf8')); const rawTranscriptLines = ( await readFile( - runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript-raw.jsonl'), + runArtifactPath(testDir, indexLine, 'sample-1', 'transcript-raw.jsonl'), 'utf8', ) ) @@ -1618,17 +1604,17 @@ describe('writeArtifactsFromResults', () => { role: 'user', }); await expect( - readFile(path.join(testDir, rowDir, 'attempt-1', 'transcript.jsonl'), 'utf8'), + readFile(path.join(testDir, rowDir, 'sample-1', 'transcript.jsonl'), 'utf8'), ).rejects.toThrow(); await expect( - readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'trace.json'), 'utf8'), + readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'trace.json'), 'utf8'), ).rejects.toThrow(); expect(indexLine).not.toHaveProperty('trace_path'); - expect(indexLine?.transcript_path).toBe(`${rowDir}/attempt-1/transcript.json`); - expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/attempt-1/transcript-raw.jsonl`); + 
expect(indexLine?.transcript_path).toBe(`${rowDir}/sample-1/transcript.json`); + expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/sample-1/transcript-raw.jsonl`); expect(indexLine?.transcript_summary).toEqual(transcript.transcript_summary); - expect(indexLine?.metrics_path).toBe(`${rowDir}/attempt-1/metrics.json`); + expect(indexLine?.metrics_path).toBe(`${rowDir}/sample-1/metrics.json`); expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true); expect(indexLine.artifact_pointers).toBeUndefined(); @@ -1731,26 +1717,26 @@ describe('writeArtifactsFromResults', () => { const [indexLine] = await readIndexLines(paths.indexPath); const rowDir = expectRowDir(indexLine, 'summary-case'); - expect(indexLine?.metrics_path).toBe(`${rowDir}/attempt-1/metrics.json`); + expect(indexLine?.metrics_path).toBe(`${rowDir}/sample-1/metrics.json`); expect(indexLine?.file_changes_path).toBe( - `${rowDir}/attempt-1/${CANONICAL_FILE_CHANGES_ARTIFACT_PATH}`, + `${rowDir}/sample-1/${CANONICAL_FILE_CHANGES_ARTIFACT_PATH}`, ); await expect( readFile( - runArtifactPath(testDir, indexLine, 'attempt-1', 'outputs', 'file_changes.diff'), + runArtifactPath(testDir, indexLine, 'sample-1', 'outputs', 'file_changes.diff'), 'utf8', ), ).resolves.toBe(fileChanges); const runResult = JSON.parse( - await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'result.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'result.json'), 'utf8'), ); expect(runResult.file_changes_path).toBe('./outputs/file_changes.diff'); expect(runResult.output_paths.file_changes).toBe('./outputs/file_changes.diff'); const summary = MetricsArtifactWireSchema.parse( JSON.parse( - await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'metrics.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'metrics.json'), 'utf8'), ), ); @@ -1764,12 +1750,11 @@ describe('writeArtifactsFromResults', () => { 
expect(summary.source_artifacts).toMatchObject({ transcript_path: 'transcript.json', grading_path: 'grading.json', - timing_path: 'timing.json', file_changes_path: CANONICAL_FILE_CHANGES_ARTIFACT_PATH, }); expect(summary.source_artifacts).not.toHaveProperty('trace_path'); await expect( - readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'trace.json'), 'utf8'), + readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'trace.json'), 'utf8'), ).rejects.toThrow(); expect(summary.metrics.total_turns).toBe(2); expect(summary.metrics.total_tool_calls).toBe(4); @@ -1842,24 +1827,14 @@ describe('writeArtifactsFromResults', () => { ]); expect(summary).not.toHaveProperty('usage_summary'); - const timing = JSON.parse( - await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'timing.json'), 'utf8'), - ); - expect(timing).toMatchObject({ - total_tokens: 140, - duration_ms: 4200, - cost_usd: 0.25, - token_usage: { input: 100, output: 40, reasoning: 5 }, - usage_sources: { - token_usage: 'provider_reported', - total_tokens: 'provider_reported', - duration: 'provider_reported', - cost: 'provider_reported', - }, + expect(summary).toMatchObject({ + tokens: { total: 140, input: 100, output: 40, reasoning: 5, source: 'provider_reported' }, + duration: { total_ms: 4200, source: 'provider_reported' }, + cost: { usd: 0.25, source: 'provider_reported' }, }); }); - it('distinguishes aggregate, estimated, and unavailable timing usage sources', async () => { + it('distinguishes aggregate, estimated, and unavailable metrics usage sources', async () => { const aggregateOutput = [ { role: 'assistant' as const, @@ -1900,56 +1875,39 @@ describe('writeArtifactsFromResults', () => { const aggregateRow = indexLines.find((line) => line.test_id === 'aggregate-usage'); const estimatedRow = indexLines.find((line) => line.test_id === 'estimated-usage'); - const aggregateTiming = JSON.parse( - await readFile(runArtifactPath(testDir, aggregateRow, 'attempt-1', 'timing.json'), 'utf8'), + 
const aggregateMetrics = JSON.parse( + await readFile(runArtifactPath(testDir, aggregateRow, 'sample-1', 'metrics.json'), 'utf8'), ); - const estimatedTiming = JSON.parse( - await readFile(runArtifactPath(testDir, estimatedRow, 'attempt-1', 'timing.json'), 'utf8'), + const estimatedMetrics = JSON.parse( + await readFile(runArtifactPath(testDir, estimatedRow, 'sample-1', 'metrics.json'), 'utf8'), ); const runSummary = JSON.parse(await readFile(path.join(testDir, 'summary.json'), 'utf8')); MetricsArtifactWireSchema.parse( JSON.parse( - await readFile(runArtifactPath(testDir, aggregateRow, 'attempt-1', 'metrics.json'), 'utf8'), + await readFile(runArtifactPath(testDir, aggregateRow, 'sample-1', 'metrics.json'), 'utf8'), ), ); MetricsArtifactWireSchema.parse( JSON.parse( - await readFile(runArtifactPath(testDir, estimatedRow, 'attempt-1', 'metrics.json'), 'utf8'), + await readFile(runArtifactPath(testDir, estimatedRow, 'sample-1', 'metrics.json'), 'utf8'), ), ); - expect(aggregateTiming).toMatchObject({ - token_usage: { input: 3, output: 4, reasoning: 0 }, - total_tokens: 7, - cost_usd: null, - usage_sources: { - token_usage: 'aggregate', - total_tokens: 'aggregate', - cost: 'unavailable', - duration: 'unavailable', - }, + expect(aggregateMetrics).toMatchObject({ + tokens: { input: 3, output: 4, reasoning: 0, total: 7, source: 'aggregate' }, + cost: { usd: null, source: 'unavailable' }, + duration: { source: 'unavailable' }, }); - expect(estimatedTiming).toMatchObject({ - token_usage: { input: 6, output: 7, reasoning: 0 }, - total_tokens: 13, - cost_usd: 0.002, - usage_sources: { - token_usage: 'token_estimated', - total_tokens: 'token_estimated', - cost: 'token_estimated', - duration: 'unavailable', - }, + expect(estimatedMetrics).toMatchObject({ + tokens: { input: 6, output: 7, reasoning: 0, total: 13, source: 'token_estimated' }, + cost: { usd: 0.002, source: 'token_estimated' }, + duration: { source: 'unavailable' }, }); - expect(runSummary.timing).toMatchObject({ 
- total_tokens: 20, - cost_usd: 0.002, - usage_sources: { - token_usage: 'aggregate', - total_tokens: 'aggregate', - cost: 'aggregate', - duration: 'unavailable', - }, + expect(runSummary.metrics).toMatchObject({ + tokens: { total: 20, source: 'aggregate' }, + cost: { usd: 0.002, source: 'aggregate' }, + duration: { source: 'unavailable' }, }); }); @@ -1976,18 +1934,18 @@ describe('writeArtifactsFromResults', () => { const [indexLine] = await readIndexLines(paths.indexPath); const rowDir = expectRowDir(indexLine, 'raw-log-case'); - const copiedRawLogPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'provider.log'); + const copiedRawLogPath = runArtifactPath(testDir, indexLine, 'sample-1', 'provider.log'); await expect(readFile(copiedRawLogPath, 'utf8')).rejects.toThrow(); - const transcriptPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript-raw.jsonl'); + const transcriptPath = runArtifactPath(testDir, indexLine, 'sample-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog); await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog); await expect( - readFile(path.join(testDir, rowDir, 'attempt-1', 'transcript.jsonl'), 'utf8'), + readFile(path.join(testDir, rowDir, 'sample-1', 'transcript.jsonl'), 'utf8'), ).rejects.toThrow(); const transcript = JSON.parse( - await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'transcript.json'), 'utf8'), ); expect(transcript.turns[0]).toMatchObject({ v: 1, @@ -1997,8 +1955,8 @@ describe('writeArtifactsFromResults', () => { }); expect(indexLine.raw_provider_log_path).toBeUndefined(); - expect(indexLine.transcript_path).toBe(`${rowDir}/attempt-1/transcript.json`); - expect(indexLine.transcript_raw_path).toBe(`${rowDir}/attempt-1/transcript-raw.jsonl`); + expect(indexLine.transcript_path).toBe(`${rowDir}/sample-1/transcript.json`); + 
expect(indexLine.transcript_raw_path).toBe(`${rowDir}/sample-1/transcript-raw.jsonl`); expect(indexLine).not.toHaveProperty('transcript_json_path'); }); @@ -2043,7 +2001,7 @@ describe('writeArtifactsFromResults', () => { expect(JSON.stringify(indexLine)).not.toContain('api_key'); const transcriptJson = await readFile( - runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript.json'), + runArtifactPath(testDir, indexLine, 'sample-1', 'transcript.json'), 'utf8', ); expect(transcriptJson).not.toContain('secret'); @@ -2062,12 +2020,12 @@ describe('writeArtifactsFromResults', () => { const paths = await writeArtifactsFromResults(results, testDir); const [indexLine] = await readIndexLines(paths.indexPath); - const transcriptPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript-raw.jsonl'); + const transcriptPath = runArtifactPath(testDir, indexLine, 'sample-1', 'transcript-raw.jsonl'); await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow(); expect(indexLine).not.toHaveProperty('transcript_path'); expect(indexLine.metrics_path).toBe( - `${expectRowDir(indexLine, 'no-transcript-case')}/attempt-1/metrics.json`, + `${expectRowDir(indexLine, 'no-transcript-case')}/sample-1/metrics.json`, ); expect(indexLine.artifact_pointers).toBeUndefined(); }); @@ -2096,11 +2054,11 @@ describe('writeArtifactsFromResults', () => { const [indexLine] = await readIndexLines(paths.indexPath); const rowDir = expectRowDir(indexLine, 'shared-id'); - expect(indexLine.grading_path).toBe(`${rowDir}/attempt-1/grading.json`); + expect(indexLine.grading_path).toBe(`${rowDir}/sample-1/grading.json`); expect(rowDir).not.toContain('/'); const grading: GradingArtifact = JSON.parse( - await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'grading.json'), 'utf8'), + await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'grading.json'), 'utf8'), ); expect(grading.assertion_results[0].text).toBe('baseline-check'); @@ -2119,11 +2077,11 @@ 
describe('writeArtifactsFromResults', () => { const rowDirs = indexLines.map((line) => expectRowDir(line, 'shared-id')); expect(new Set(rowDirs).size).toBe(2); expect(indexLines.map((line) => line.grading_path)).toEqual( - rowDirs.map((rowDir) => `${rowDir}/attempt-1/grading.json`), + rowDirs.map((rowDir) => `${rowDir}/sample-1/grading.json`), ); const answers = await Promise.all( indexLines.map((line) => - readFile(runArtifactPath(testDir, line, 'attempt-1', 'outputs', 'answer.md'), 'utf8'), + readFile(runArtifactPath(testDir, line, 'sample-1', 'outputs', 'answer.md'), 'utf8'), ), ); expect(answers.sort()).toEqual(['alpha answer', 'beta answer']); @@ -2211,7 +2169,7 @@ describe('writeArtifactsFromResults', () => { id: 'alpha', key: 'alpha', dimensions: { - runId: 'attempt-1', + runId: 'sample-1', suite: 'variant-suite', evalPath: 'evals/variant.eval.yaml', testId: 'shared-id', @@ -2233,7 +2191,7 @@ describe('writeArtifactsFromResults', () => { id: 'beta', key: 'beta', dimensions: { - runId: 'attempt-1', + runId: 'sample-1', suite: 'variant-suite', evalPath: 'evals/variant.eval.yaml', testId: 'shared-id', @@ -2570,11 +2528,11 @@ describe('writeArtifacts (from JSONL file)', () => { const artifactEntries = await readdir(paths.testArtifactDir); const [indexLine] = await readIndexLines(paths.indexPath); expect(artifactEntries).toContain(expectRowDir(indexLine, 'from-file')); - expect(artifactEntries).toContain(RESULT_INDEX_FILENAME); + expect(artifactEntries).toContain('.internal'); const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8')); - expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME); - expect(summary.timing.duration_ms).toBe(12000); - expect(summary.timing.total_tokens).toBe(700); + expect(summary.index_path).toBe('.internal/index.jsonl'); + expect(summary.metrics.duration.total_ms).toBe(12000); + expect(summary.metrics.tokens.total).toBe(700); }); }); diff --git a/apps/cli/test/commands/eval/bundle.test.ts 
b/apps/cli/test/commands/eval/bundle.test.ts index 6687eb02a..9b0291178 100644 --- a/apps/cli/test/commands/eval/bundle.test.ts +++ b/apps/cli/test/commands/eval/bundle.test.ts @@ -166,7 +166,7 @@ tests: ../data/cases.yaml expect(run.exitCode).toBe(0); expect(run.stdout).toContain('RESULT: PASS'); - await expectFileExists(path.join(bundleDir, 'run', 'index.jsonl')); + await expectFileExists(path.join(bundleDir, 'run', '.internal', 'index.jsonl')); }, 60_000); it('preserves inline eval target object definitions in the bundled target graph', async () => { diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts index 766ad20ed..3a2a823ca 100644 --- a/apps/cli/test/commands/eval/pipeline/bench.test.ts +++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts @@ -76,7 +76,7 @@ describe('pipeline bench', () => { expect(grading.assertion_results.length).toBeGreaterThan(0); expect(grading.graders).toHaveLength(2); - const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const indexContent = await readFile(join(OUT_DIR, '.internal', 'index.jsonl'), 'utf8'); const lines = indexContent .trim() .split('\n') @@ -106,7 +106,7 @@ describe('pipeline bench', () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); - const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const indexContent = await readFile(join(OUT_DIR, '.internal', 'index.jsonl'), 'utf8'); const entry = JSON.parse(indexContent.trim().split('\n')[0]); expect(entry.experiment).toBe('without_skills'); @@ -118,7 +118,7 @@ describe('pipeline bench', () => { const { execa } = await import('execa'); await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]); - const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8'); + const indexContent = await readFile(join(OUT_DIR, '.internal', 'index.jsonl'), 'utf8'); const entry = 
JSON.parse(indexContent.trim().split('\n')[0]); expect(entry.experiment).toBeUndefined(); diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts index 2d013b2b9..fc755cc41 100644 --- a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts +++ b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts @@ -61,7 +61,7 @@ describe('eval pipeline e2e', () => { expect(grading.graders).toHaveLength(2); expect(grading.summary.pass_rate).toBeGreaterThan(0); - const indexContent = await readFile(join(outDir, 'index.jsonl'), 'utf8'); + const indexContent = await readFile(join(outDir, '.internal', 'index.jsonl'), 'utf8'); const indexLines = indexContent .trim() .split('\n') diff --git a/apps/cli/test/commands/eval/result-layout.test.ts b/apps/cli/test/commands/eval/result-layout.test.ts index 40a8783b3..355754179 100644 --- a/apps/cli/test/commands/eval/result-layout.test.ts +++ b/apps/cli/test/commands/eval/result-layout.test.ts @@ -5,6 +5,7 @@ import path from 'node:path'; import { RESULT_INDEX_FILENAME, + RUN_INTERNAL_DIRNAME, buildDefaultRunDir, buildDefaultRunDirFromName, discoverRunManifestPaths, @@ -51,7 +52,8 @@ describe('result layout', () => { it('resolves the canonical index.jsonl file in a run directory', () => { const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-layout-test-')); try { - const indexPath = path.join(tempDir, RESULT_INDEX_FILENAME); + const indexPath = path.join(tempDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME); + mkdirSync(path.dirname(indexPath), { recursive: true }); writeFileSync(indexPath, '{"test_id":"case"}\n'); expect(resolveExistingRunPrimaryPath(tempDir)).toBe(indexPath); @@ -76,12 +78,13 @@ describe('result layout', () => { } }); - it('treats the root index.jsonl as authoritative when legacy nested bundles also exist', () => { + it('treats the internal index.jsonl as authoritative when legacy nested bundles also exist', () => { const tempDir = 
mkdtempSync(path.join(tmpdir(), 'agentv-layout-test-')); try { const nestedBundleDir = path.join(tempDir, 'target-a'); mkdirSync(nestedBundleDir, { recursive: true }); - const rootIndexPath = path.join(tempDir, RESULT_INDEX_FILENAME); + const rootIndexPath = path.join(tempDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME); + mkdirSync(path.dirname(rootIndexPath), { recursive: true }); writeFileSync(rootIndexPath, '{"test_id":"root"}\n'); writeFileSync(path.join(nestedBundleDir, RESULT_INDEX_FILENAME), '{"test_id":"legacy"}\n'); diff --git a/apps/cli/test/commands/eval/run-cache.test.ts b/apps/cli/test/commands/eval/run-cache.test.ts index 2c5ccd2ba..470622eff 100644 --- a/apps/cli/test/commands/eval/run-cache.test.ts +++ b/apps/cli/test/commands/eval/run-cache.test.ts @@ -10,7 +10,7 @@ describe('resolveRunCacheFile', () => { timestamp: '', }; expect(resolveRunCacheFile(cache)).toBe( - path.join('/results/2026-03-24T00-00-00-000Z', 'index.jsonl'), + path.join('/results/2026-03-24T00-00-00-000Z', '.internal', 'index.jsonl'), ); }); @@ -29,7 +29,7 @@ describe('resolveRunCacheFile', () => { timestamp: '', }; expect(resolveRunCacheFile(cache)).toBe( - path.join('/results/2026-03-24T00-00-00-000Z', 'index.jsonl'), + path.join('/results/2026-03-24T00-00-00-000Z', '.internal', 'index.jsonl'), ); }); diff --git a/apps/cli/test/commands/grade/grade-prepared.test.ts b/apps/cli/test/commands/grade/grade-prepared.test.ts index 11584a69b..ad74988ad 100644 --- a/apps/cli/test/commands/grade/grade-prepared.test.ts +++ b/apps/cli/test/commands/grade/grade-prepared.test.ts @@ -169,7 +169,7 @@ describe('agentv grade prepared attempts', () => { workspace_path: path.join(preparedDir, 'workspace'), manifest_path: path.join(preparedDir, 'agentv_prepare.json'), output_dir: runDir, - index_path: path.join(runDir, 'index.jsonl'), + index_path: path.join(runDir, '.internal', 'index.jsonl'), }); expect(await exists(targetMarker)).toBe(false); @@ -177,7 +177,9 @@ describe('agentv grade prepared 
attempts', () => { expect(graderPayload.workspace_path).toBe(path.join(preparedDir, 'workspace')); expect(graderPayload.file_changes).toContain('+manual edit'); - const row = JSON.parse((await readFile(path.join(runDir, 'index.jsonl'), 'utf8')).trim()); + const row = JSON.parse( + (await readFile(path.join(runDir, '.internal', 'index.jsonl'), 'utf8')).trim(), + ); expect(row).toMatchObject({ test_id: 'case-1', target: 'codex', @@ -197,7 +199,7 @@ describe('agentv grade prepared attempts', () => { }); expect(typeof row.metadata.prepared_attempt.baseline_commit).toBe('string'); - expect(row.file_changes_path).toMatch(/\/attempt-1\/outputs\/file_changes\.diff$/); + expect(row.file_changes_path).toMatch(/\/sample-1\/outputs\/file_changes\.diff$/); await expect(readFile(path.join(runDir, row.file_changes_path), 'utf8')).resolves.toContain( '+manual edit', ); @@ -275,7 +277,9 @@ describe('agentv grade prepared attempts', () => { ); expect(await exists(targetMarker)).toBe(false); - const row = JSON.parse((await readFile(path.join(runDir, 'index.jsonl'), 'utf8')).trim()); + const row = JSON.parse( + (await readFile(path.join(runDir, '.internal', 'index.jsonl'), 'utf8')).trim(), + ); expect(row.score).toBe(0); expect(row.scores[0]).toMatchObject({ name: 'expected-tool-sequence', @@ -382,7 +386,9 @@ describe('agentv grade prepared attempts', () => { }); expect(await exists(targetMarker)).toBe(false); - const row = JSON.parse((await readFile(path.join(runDir, 'index.jsonl'), 'utf8')).trim()); + const row = JSON.parse( + (await readFile(path.join(runDir, '.internal', 'index.jsonl'), 'utf8')).trim(), + ); const answerPath = row.answer_path ?? row.response_path ?? 
row.output_path; expect(typeof answerPath).toBe('string'); expect((await readFile(path.join(runDir, answerPath), 'utf8')).trim()).toBe('done'); diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts index 574e84c00..89ba73a04 100644 --- a/apps/cli/test/commands/results/export-e2e-providers.test.ts +++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts @@ -213,7 +213,7 @@ function toJsonl(...records: object[]): string { } function readIndex(outputDir: string): IndexArtifactEntry[] { - return readFileSync(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8') + return readFileSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), 'utf8') .trim() .split('\n') .filter(Boolean) @@ -238,7 +238,7 @@ function runArtifactDir( outputDir: string, record: { suite?: string; target?: string; test_id?: string }, ): string { - return path.join(outputDir, findIndexEntry(outputDir, record).result_dir, 'attempt-1'); + return path.join(outputDir, findIndexEntry(outputDir, record).result_dir, 'sample-1'); } describe('export e2e — multi-provider metrics verification', () => { @@ -254,7 +254,7 @@ describe('export e2e — multi-provider metrics verification', () => { // ── Timing artifact tests ────────────────────────────────────────────── - describe('/timing.json — per-test timing', () => { + describe('/metrics.json — per-test timing', () => { it('should include reasoning tokens in token_usage', async () => { const outputDir = path.join(tempDir, 'claude'); const content = toJsonl(CLAUDE_CLI_RESULT); @@ -263,14 +263,14 @@ describe('export e2e — multi-provider metrics verification', () => { const timing: TimingArtifact = JSON.parse( readFileSync( - path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), 'utf8', ), ); - expect(timing.token_usage.input).toBe(2000); - 
expect(timing.token_usage.output).toBe(800); - expect(timing.token_usage.reasoning).toBe(1500); + expect(timing.tokens.input).toBe(2000); + expect(timing.tokens.output).toBe(800); + expect(timing.tokens.reasoning).toBe(1500); }); it('should write independent timing files for multiple providers', async () => { @@ -281,20 +281,20 @@ describe('export e2e — multi-provider metrics verification', () => { const claudeTiming: TimingArtifact = JSON.parse( readFileSync( - path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), 'utf8', ), ); const codexTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'metrics.json'), 'utf8'), ); const copilotTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'), ); - expect(claudeTiming.token_usage.reasoning).toBe(1500); - expect(codexTiming.token_usage.reasoning).toBe(2500); - expect(copilotTiming.token_usage.reasoning).toBe(0); + expect(claudeTiming.tokens.reasoning).toBe(1500); + expect(codexTiming.tokens.reasoning).toBe(2500); + expect(copilotTiming.tokens.reasoning).toBe(0); }); it('should compute total_tokens as input + output (not including reasoning)', async () => { @@ -305,12 +305,12 @@ describe('export e2e — multi-provider metrics verification', () => { const timing: TimingArtifact = JSON.parse( readFileSync( - path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), 'utf8', ), ); - expect(timing.total_tokens).toBe(2800); + expect(timing.tokens.total).toBe(2800); }); it('should preserve duration_ms per test result', async () => { @@ -320,11 
+320,11 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const timing: TimingArtifact = JSON.parse( - readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'metrics.json'), 'utf8'), ); - expect(timing.duration_ms).toBe(12000); - expect(timing.total_duration_seconds).toBe(12); + expect(timing.duration.total_ms).toBe(12000); + expect(timing.duration.total_seconds).toBe(12); }); it('should handle results with no token_usage gracefully', async () => { @@ -334,14 +334,14 @@ describe('export e2e — multi-provider metrics verification', () => { await exportResults('test.jsonl', content, outputDir); const timing: TimingArtifact = JSON.parse( - readFileSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'timing.json'), 'utf8'), + readFileSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'metrics.json'), 'utf8'), ); - expect(timing.total_tokens).toBe(0); - expect(timing.duration_ms).toBe(0); - expect(timing.token_usage.input).toBe(0); - expect(timing.token_usage.output).toBe(0); - expect(timing.token_usage.reasoning).toBe(0); + expect(timing.tokens.total).toBe(0); + expect(timing.duration.total_ms).toBe(0); + expect(timing.tokens.input).toBe(0); + expect(timing.tokens.output).toBe(0); + expect(timing.tokens.reasoning).toBe(0); }); it('should handle providers with and without reasoning tokens', async () => { @@ -352,16 +352,16 @@ describe('export e2e — multi-provider metrics verification', () => { const claudeTiming: TimingArtifact = JSON.parse( readFileSync( - path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'), + path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'), 'utf8', ), ); const copilotTiming: TimingArtifact = JSON.parse( - readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'), + 
readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'), ); - expect(claudeTiming.token_usage.reasoning).toBe(1500); - expect(copilotTiming.token_usage.reasoning).toBe(0); + expect(claudeTiming.tokens.reasoning).toBe(1500); + expect(copilotTiming.tokens.reasoning).toBe(0); }); }); @@ -644,7 +644,7 @@ describe('export e2e — multi-provider metrics verification', () => { // Verify all artifact files exist expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true); - expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false); + expect(existsSync(path.join(outputDir, 'metrics.json'))).toBe(false); // Verify benchmark const benchmark: RunSummaryArtifact = JSON.parse( @@ -713,16 +713,16 @@ describe('export e2e — multi-provider metrics verification', () => { readFileSync( path.join( runArtifactDir(outputDir, { ...record, target: 'mock' as const }), - 'timing.json', + 'metrics.json', ), 'utf8', ), ); - expect(timing.token_usage.input).toBe(100); - expect(timing.token_usage.output).toBe(50); - expect(timing.token_usage.reasoning).toBe(75); - expect(timing.duration_ms).toBe(1000); + expect(timing.tokens.input).toBe(100); + expect(timing.tokens.output).toBe(50); + expect(timing.tokens.reasoning).toBe(75); + expect(timing.duration.total_ms).toBe(1000); }); }); }); diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts index efc93cd3b..6a8014d46 100644 --- a/apps/cli/test/commands/results/export.test.ts +++ b/apps/cli/test/commands/results/export.test.ts @@ -165,7 +165,7 @@ function toJsonl(...records: object[]): string { } function readIndex(outputDir: string): IndexArtifactEntry[] { - return readFileSync(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8') + return readFileSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), 'utf8') .trim() .split('\n') .filter(Boolean) @@ -197,7 +197,7 @@ function runArtifactDir( outputDir: string, record: { suite?: string; 
target?: string; test_id?: string }, ): string { - return path.join(artifactDir(outputDir, record), 'attempt-1'); + return path.join(artifactDir(outputDir, record), 'sample-1'); } function readAnswer( @@ -220,8 +220,8 @@ describe('results export', () => { it('loadExportSource resolves run workspaces to index.jsonl', async () => { const runDir = path.join(tempDir, '2026-03-18T10-00-00-000Z'); - mkdirSync(runDir, { recursive: true }); - const sourceFile = path.join(runDir, RESULT_INDEX_FILENAME); + mkdirSync(path.join(runDir, '.internal'), { recursive: true }); + const sourceFile = path.join(runDir, '.internal/index.jsonl'); writeFileSync(sourceFile, toJsonl(RESULT_FULL)); const { sourceFile: loadedSource, results } = await loadExportSource(runDir, tempDir); @@ -234,7 +234,7 @@ describe('results export', () => { it('deriveOutputDir uses the run directory name for manifest inputs', () => { const outputDir = deriveOutputDir( tempDir, - path.join(tempDir, '2026-03-18T10-00-00-000Z', 'index.jsonl'), + path.join(tempDir, '2026-03-18T10-00-00-000Z', '.internal', 'index.jsonl'), ); expect(outputDir).toBe( path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18T10-00-00-000Z'), @@ -250,6 +250,7 @@ describe('results export', () => { 'results', 'with-skills', '2026-03-18T10-00-00-000Z', + '.internal', RESULT_INDEX_FILENAME, ), ); @@ -299,7 +300,7 @@ describe('results export', () => { }); expect(first.entries[0].artifact_refs).toMatchObject({ status: 'planned_export', - timing_path: expect.stringMatching(/^test-private--[a-f0-9]{12}\/attempt-1\/timing\.json$/), + metrics_path: expect.stringMatching(/^test-private--[a-f0-9]{12}\/sample-1\/metrics\.json$/), }); expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path'); expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path'); @@ -380,20 +381,19 @@ describe('results export', () => { status: 'planned_export', result_dir: resultDir, summary_path: `${resultDir}/summary.json`, - grading_path: 
`${resultDir}/attempt-1/grading.json`, - timing_path: `${resultDir}/attempt-1/timing.json`, - metrics_path: `${resultDir}/attempt-1/metrics.json`, - output_path: `${resultDir}/attempt-1/outputs/answer.md`, - answer_path: `${resultDir}/attempt-1/outputs/answer.md`, - transcript_path: `${resultDir}/attempt-1/transcript.json`, - transcript_raw_path: `${resultDir}/attempt-1/transcript-raw.jsonl`, + grading_path: `${resultDir}/sample-1/grading.json`, + metrics_path: `${resultDir}/sample-1/metrics.json`, + output_path: `${resultDir}/sample-1/outputs/answer.md`, + answer_path: `${resultDir}/sample-1/outputs/answer.md`, + transcript_path: `${resultDir}/sample-1/transcript.json`, + transcript_raw_path: `${resultDir}/sample-1/transcript-raw.jsonl`, }); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('trace_path'); expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path'); expect(bundle.entries[0].trace).not.toHaveProperty('envelope_ref'); expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined(); expect(bundle.entries[0].trace_envelope.artifacts).not.toHaveProperty('trace_path'); - expect(bundle.entries[0].feedback.grading_path).toBe(`${resultDir}/attempt-1/grading.json`); + expect(bundle.entries[0].feedback.grading_path).toBe(`${resultDir}/sample-1/grading.json`); expect(bundle.entries[0].raw_content).toBeDefined(); expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence'); expect(serialized).toContain('SECRET_PROMPT_TEXT'); @@ -413,7 +413,7 @@ describe('results export', () => { expect(existsSync(summaryPath)).toBe(true); const benchmark: RunSummaryArtifact = JSON.parse(readFileSync(summaryPath, 'utf8')); - expect(benchmark.manifest_path).toBe(RESULT_INDEX_FILENAME); + expect(benchmark.index_path).toBe('.internal/index.jsonl'); expect(benchmark.metadata.eval_file).toBe('eval_2026-03-18.jsonl'); expect(benchmark.metadata.timestamp).toBe('2026-03-18T10:00:01.000Z'); // artifact-writer uses string[] for tests_run, not a count @@ 
-437,7 +437,7 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); - const indexPath = path.join(outputDir, RESULT_INDEX_FILENAME); + const indexPath = path.join(outputDir, '.internal', RESULT_INDEX_FILENAME); expect(existsSync(indexPath)).toBe(true); const entries = readFileSync(indexPath, 'utf8') @@ -454,13 +454,12 @@ describe('results export', () => { execution_status: 'ok', result_dir: rowDir, summary_path: `${rowDir}/summary.json`, - grading_path: `${rowDir}/attempt-1/grading.json`, - timing_path: `${rowDir}/attempt-1/timing.json`, - metrics_path: `${rowDir}/attempt-1/metrics.json`, - output_path: `${rowDir}/attempt-1/outputs/answer.md`, - answer_path: `${rowDir}/attempt-1/outputs/answer.md`, - transcript_path: `${rowDir}/attempt-1/transcript.json`, - transcript_raw_path: `${rowDir}/attempt-1/transcript-raw.jsonl`, + grading_path: `${rowDir}/sample-1/grading.json`, + metrics_path: `${rowDir}/sample-1/metrics.json`, + output_path: `${rowDir}/sample-1/outputs/answer.md`, + answer_path: `${rowDir}/sample-1/outputs/answer.md`, + transcript_path: `${rowDir}/sample-1/transcript.json`, + transcript_raw_path: `${rowDir}/sample-1/transcript-raw.jsonl`, }); expect(entries[0]).not.toHaveProperty('input_path'); expect(entries[0].projection_identity).toMatchObject({ @@ -575,12 +574,13 @@ describe('results export', () => { it('exports generated test bundle refs and files from source manifests', async () => { const sourceDir = path.join(tempDir, 'source-run'); mkdirSync(path.join(sourceDir, 'case', 'test'), { recursive: true }); + mkdirSync(path.join(sourceDir, '.internal'), { recursive: true }); writeFileSync( path.join(sourceDir, 'case', 'test', 'EVAL.yaml'), 'tests:\n - id: test-greeting\n', ); writeFileSync(path.join(sourceDir, 'case', 'test', 'targets.yaml'), 'targets: []\n'); - const sourceFile = path.join(sourceDir, RESULT_INDEX_FILENAME); + const sourceFile = path.join(sourceDir, '.internal/index.jsonl'); const outputDir = 
path.join(tempDir, 'output'); const content = toJsonl({ ...RESULT_FULL, @@ -618,12 +618,13 @@ describe('results export', () => { it('exports legacy task_dir bundles as new test_dir artifacts', async () => { const sourceDir = path.join(tempDir, 'legacy-run'); mkdirSync(path.join(sourceDir, 'case', 'task'), { recursive: true }); + mkdirSync(path.join(sourceDir, '.internal'), { recursive: true }); writeFileSync( path.join(sourceDir, 'case', 'task', 'EVAL.yaml'), 'tests:\n - id: test-greeting\n', ); writeFileSync(path.join(sourceDir, 'case', 'task', 'targets.yaml'), 'targets: []\n'); - const sourceFile = path.join(sourceDir, RESULT_INDEX_FILENAME); + const sourceFile = path.join(sourceDir, '.internal/index.jsonl'); const outputDir = path.join(tempDir, 'output'); const content = toJsonl({ ...RESULT_FULL, @@ -647,8 +648,8 @@ describe('results export', () => { it('preserves source bundle refs in dry-run projection inputs', async () => { const sourceDir = path.join(tempDir, 'source-run'); - mkdirSync(sourceDir, { recursive: true }); - const sourceFile = path.join(sourceDir, RESULT_INDEX_FILENAME); + mkdirSync(path.join(sourceDir, '.internal'), { recursive: true }); + const sourceFile = path.join(sourceDir, '.internal/index.jsonl'); writeFileSync( sourceFile, toJsonl({ @@ -694,21 +695,21 @@ describe('results export', () => { expect(readAnswer(outputDir, RESULT_FULL)).toBe('Hello, Alice!'); }); - it('should create per-test timing.json with run timing', async () => { + it('should create per-test metrics.json with run timing', async () => { const outputDir = path.join(tempDir, 'output'); const content = toJsonl(RESULT_FULL, RESULT_PARTIAL); await exportResults('test.jsonl', content, outputDir); - const timingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json'); + const timingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'metrics.json'); expect(existsSync(timingPath)).toBe(true); const timing: TimingArtifact = JSON.parse(readFileSync(timingPath, 
'utf8')); - expect(timing.total_tokens).toBe(1500); - expect(timing.duration_ms).toBe(3500); - expect(timing.token_usage).toHaveProperty('input'); - expect(timing.token_usage).toHaveProperty('output'); - expect(timing.token_usage).toHaveProperty('reasoning'); + expect(timing.tokens.total).toBe(1500); + expect(timing.duration.total_ms).toBe(3500); + expect(timing.tokens).toHaveProperty('input'); + expect(timing.tokens).toHaveProperty('output'); + expect(timing.tokens).toHaveProperty('reasoning'); }); it('should create per-test artifact directories', async () => { @@ -750,7 +751,7 @@ describe('results export', () => { expect(grading.graders?.[0].name).toBe('greeting_quality'); expect(grading.graders?.[0].type).toBe('llm-grader'); - const perTestTimingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json'); + const perTestTimingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'metrics.json'); expect(existsSync(perTestTimingPath)).toBe(true); }); @@ -803,8 +804,8 @@ describe('results export', () => { await exportResults('test.jsonl', content, outputDir); expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true); - expect(existsSync(path.join(outputDir, RESULT_INDEX_FILENAME))).toBe(true); - expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false); + expect(existsSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME))).toBe(true); + expect(existsSync(path.join(outputDir, 'metrics.json'))).toBe(false); expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_FULL), 'grading.json'))).toBe( true, ); @@ -837,7 +838,7 @@ describe('results export', () => { const answerPath = path.join( artifactDir(outputDir, RESULT_DIFFERENT_TARGET), - 'attempt-1', + 'sample-1', 'outputs', 'answer.md', ); diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts index e7c2e0f5d..41e8b4274 100644 --- a/apps/cli/test/commands/results/report.test.ts +++ 
b/apps/cli/test/commands/results/report.test.ts @@ -118,7 +118,7 @@ describe('results report', () => { { evalFile: 'evals/demo.eval.yaml' }, ); - const indexPath = path.join(runDir, RESULT_INDEX_FILENAME); + const indexPath = path.join(runDir, '.internal', RESULT_INDEX_FILENAME); const lines = readFileSync(indexPath, 'utf8') .trim() .split('\n') diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts index 4a47e7cbf..9bdf30406 100644 --- a/apps/cli/test/commands/results/validate.test.ts +++ b/apps/cli/test/commands/results/validate.test.ts @@ -37,7 +37,7 @@ describe('results validate', () => { writeFileSync( path.join(runDir, 'summary.json'), `${JSON.stringify({ - manifest_path: 'index.jsonl', + index_path: 'index.jsonl', schema_version: 1, metadata: { experiment: 'with-skills', @@ -76,12 +76,12 @@ describe('results validate', () => { scores: [{ name: 'quality', type: 'llm', score: 1, verdict: 'pass' }], execution_status: 'ok', summary_path: 'test-greeting/summary.json', - trace_path: 'test-greeting/attempt-1/trace.json', + trace_path: 'test-greeting/sample-1/trace.json', artifact_pointers: { trace: { ref: 'agentv/artifacts/v1', - key: 'traces/test-greeting/attempt-1/trace.json', - path: 'test-greeting/attempt-1/trace.json', + key: 'traces/test-greeting/sample-1/trace.json', + path: 'test-greeting/sample-1/trace.json', }, }, })}\n`, diff --git a/apps/cli/test/commands/runs/rerun.test.ts b/apps/cli/test/commands/runs/rerun.test.ts index facc3c3f1..f91963d53 100644 --- a/apps/cli/test/commands/runs/rerun.test.ts +++ b/apps/cli/test/commands/runs/rerun.test.ts @@ -241,7 +241,10 @@ describe('agentv runs rerun', () => { }, }); - const answerPath = path.join(path.dirname(indexPath), String(rows[0].answer_path)); + const answerPath = path.join( + path.dirname(path.dirname(indexPath)), + String(rows[0].answer_path), + ); const answer = await readFile(answerPath, 'utf8'); expect(answer).toContain('Alpha answer'); 
expect(answer).not.toContain('Captured answer'); diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts index a89fabd8b..9ad40075c 100644 --- a/apps/cli/test/eval.integration.test.ts +++ b/apps/cli/test/eval.integration.test.ts @@ -21,6 +21,15 @@ const __dirname = path.dirname(__filename); const projectRoot = path.resolve(__dirname, '../../..'); const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts'); const MOCK_RUNNER = path.join(projectRoot, 'apps/cli/test/fixtures/mock-run-evaluation.ts'); + +function runIndexPath(runDir: string): string { + return path.join(runDir, '.internal', 'index.jsonl'); +} + +function runDirFromIndexPath(indexPath: string): string { + return path.dirname(path.dirname(indexPath)); +} + async function createFixture(): Promise { const baseDir = await mkdtemp(path.join(tmpdir(), 'agentv-cli-test-')); const suiteDir = path.join(baseDir, 'suite'); @@ -332,7 +341,7 @@ describe('agentv eval CLI', () => { ]); expect(exitCode).toBe(0); - const indexPath = path.join(outputDir, 'index.jsonl'); + const indexPath = runIndexPath(outputDir); expect(extractOutputPath(stdout)).toBe(indexPath); expect(stdout).toContain(`Artifact directory: ${outputDir}`); @@ -343,7 +352,7 @@ describe('agentv eval CLI', () => { const resultDir = row.result_dir as string; expect(resultDir).not.toContain('/'); await expectFileExists(path.join(outputDir, resultDir, 'summary.json')); - await expectFileExists(path.join(outputDir, resultDir, 'attempt-1', 'grading.json')); + await expectFileExists(path.join(outputDir, resultDir, 'sample-1', 'grading.json')); } } finally { await rm(fixture.baseDir, { recursive: true, force: true }); @@ -359,14 +368,14 @@ describe('agentv eval CLI', () => { const outputDir = path.join(fixture.suiteDir, 'configured-results'); expect(exitCode).toBe(0); - const indexPath = path.join(outputDir, 'index.jsonl'); + const indexPath = runIndexPath(outputDir); expect(extractOutputPath(stdout)).toBe(indexPath); 
await expectFileExists(indexPath); await expectFileExists(path.join(outputDir, 'summary.json')); const [firstRow] = (await readJsonLines(indexPath)) as Array>; await expectFileExists(path.join(outputDir, firstRow.result_dir as string, 'summary.json')); await expectFileExists( - path.join(outputDir, firstRow.result_dir as string, 'attempt-1', 'grading.json'), + path.join(outputDir, firstRow.result_dir as string, 'sample-1', 'grading.json'), ); } finally { await rm(fixture.baseDir, { recursive: true, force: true }); @@ -401,7 +410,7 @@ describe('agentv eval CLI', () => { ]); expect(exitCode).toBe(1); - const indexPath = path.join(outputDir, 'index.jsonl'); + const indexPath = runIndexPath(outputDir); expect(extractOutputPath(stdout)).toBe(indexPath); expect(stdout).not.toContain('Export files:'); @@ -409,10 +418,10 @@ describe('agentv eval CLI', () => { expect(canonicalResults).toHaveLength(2); await expectFileExists(path.join(outputDir, 'summary.json')); for (const row of canonicalResults) { - expect(row.transcript_path).toMatch(/attempt-1\/transcript\.json$/); + expect(row.transcript_path).toMatch(/sample-1\/transcript\.json$/); await expectFileExists(path.join(outputDir, row.transcript_path as string)); expect(row.transcript_summary).toBeDefined(); - expect(row.transcript_raw_path).toMatch(/attempt-1\/transcript-raw\.jsonl$/); + expect(row.transcript_raw_path).toMatch(/sample-1\/transcript-raw\.jsonl$/); await expectFileExists(path.join(outputDir, row.transcript_raw_path as string)); } } finally { @@ -664,7 +673,7 @@ describe('agentv eval CLI', () => { expect(exitCode).toBe(0); const outputPath = extractOutputPath(stdout); - expect(path.dirname(path.dirname(outputPath))).toBe( + expect(path.dirname(runDirFromIndexPath(outputPath))).toBe( path.join(fixture.suiteDir, '.agentv', 'results'), ); @@ -685,7 +694,7 @@ describe('agentv eval CLI', () => { }); const benchmark = JSON.parse( - await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'), + await 
readFile(path.join(runDirFromIndexPath(outputPath), 'summary.json'), 'utf8'), ) as { metadata?: Record }; expect(benchmark.metadata?.experiment).toBe('native-exp'); expect(benchmark.metadata?.experiment_config).toMatchObject({ @@ -860,7 +869,7 @@ describe('agentv eval CLI', () => { expect(exitCode).toBe(0); const outputPath = extractOutputPath(stdout); - expect(path.dirname(path.dirname(outputPath))).toBe( + expect(path.dirname(runDirFromIndexPath(outputPath))).toBe( path.join(fixture.suiteDir, '.agentv', 'results'), ); @@ -885,7 +894,7 @@ describe('agentv eval CLI', () => { }); const benchmark = JSON.parse( - await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'), + await readFile(path.join(runDirFromIndexPath(outputPath), 'summary.json'), 'utf8'), ) as { metadata?: Record }; expect(benchmark.metadata?.runtime_source).toMatchObject({ schema_version: 'agentv.runtime_source.v1', @@ -946,7 +955,7 @@ describe('agentv eval CLI', () => { '0.8', ]); expect(first.exitCode).toBe(1); - const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); + const priorIndexPath = runIndexPath(priorRunDir); const priorRows = (await readJsonLines(priorIndexPath)) as Array>; const alphaRow = priorRows.find((row) => row.test_id === 'case-alpha'); const betaRow = priorRows.find((row) => row.test_id === 'case-beta'); @@ -1062,7 +1071,7 @@ tests: ]); expect(first.exitCode).toBe(0); - const priorIndexPath = path.join(priorRunDir, 'index.jsonl'); + const priorIndexPath = runIndexPath(priorRunDir); const priorRows = (await readJsonLines(priorIndexPath)) as Array>; expect(priorRows).toHaveLength(1); const baseRow = priorRows[0]; @@ -1181,7 +1190,7 @@ tests: expect(exitCode).toBe(0); const outputPath = extractOutputPath(stdout); const benchmark = JSON.parse( - await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'), + await readFile(path.join(runDirFromIndexPath(outputPath), 'summary.json'), 'utf8'), ) as { metadata?: Record }; 
expect(benchmark.metadata?.runtime_source).toMatchObject({ schema_version: 'agentv.runtime_source.v1', diff --git a/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx index 896ca40e4..782c6885a 100644 --- a/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx @@ -11,7 +11,7 @@ sidebar: agentv eval evals/my-eval.yaml ``` -Results are written to `.agentv/results//index.jsonl`. Each CLI +Results are written to `.agentv/results//.internal/index.jsonl`. Each CLI invocation writes one run bundle. The experiment label is stored in `summary.json` and row metadata. Each line is a JSON object with one result per test case, and the run workspace also stores the summary and related artifacts. @@ -98,23 +98,23 @@ are unchanged. ### Custom Output Directory -Write all artifacts (index.jsonl, summary.json, per-test grading/timing) to a specific directory: +Write all artifacts (`.internal/index.jsonl`, `summary.json`, per-test grading, metrics, and transcripts) to a specific directory: ```bash agentv eval evals/my-eval.yaml --output ./my-results ``` `--output` is a run directory, not a file path. The canonical manifest is always -`/index.jsonl`; the aggregate summary is +`/.internal/index.jsonl`; the aggregate summary is `/summary.json`. ### Read Results from the Run Manifest -The run directory is the complete artifact boundary. Use `/index.jsonl` for scripts, CI summaries, and downstream tools: +The run directory is the complete artifact boundary. 
Use `/.internal/index.jsonl` for scripts, CI summaries, and downstream tools: ```bash agentv eval evals/my-eval.yaml --output ./my-results -cat ./my-results/index.jsonl +cat ./my-results/.internal/index.jsonl ``` ### Generated Test Bundles @@ -128,15 +128,15 @@ Typical layout: ```text my-results/ - index.jsonl summary.json + .internal/ + index.jsonl / summary.json - attempt-1/ + sample-1/ result.json grading.json metrics.json - timing.json transcript.json transcript-raw.jsonl outputs/answer.md @@ -187,7 +187,7 @@ OpenTelemetry/OpenInference spans directly to that backend during execution. ```bash # Summary-level inspection from the run manifest -agentv inspect stats .agentv/results//index.jsonl +agentv inspect stats .agentv/results//.internal/index.jsonl # Inspect AgentV-owned per-case artifacts and transcript sidecars agentv inspect show .agentv/results//index.jsonl --tree @@ -275,7 +275,7 @@ agentv eval evals/my-eval.yaml --output .agentv/results/ --resume agentv eval evals/my-eval.yaml --rerun-failed # Re-run only execution errors from any prior run by path -agentv eval evals/my-eval.yaml --retry-errors .agentv/results//index.jsonl +agentv eval evals/my-eval.yaml --retry-errors .agentv/results//.internal/index.jsonl ``` After any failing run, the CLI prints the exact `--rerun-failed` command for the run dir that just completed — copy/paste it. If the process or pod disappeared before you could access the local run directory and results auto-push was enabled, recover the partial run from [WIP checkpoints](/docs/tools/wip-checkpoints/) first, then use the same `--resume` flow. @@ -367,7 +367,7 @@ See the [Import tool docs](/docs/tools/import/) for all providers and options. Each result row's `result_dir` is an allocated folder under the timestamped run bundle, usually with a readable test-id prefix plus a short hash suffix. 
It can include `transcript.json`, `transcript-raw.jsonl`, `grading.json`, -`timing.json`, `metrics.json`, and generated outputs under `outputs/`. The run +`metrics.json`, and generated outputs under `outputs/`. The run root does not contain target, model, or `cases/` folders, and it does not contain a mixed transcript artifact; use each index row's `transcript_path` to find the per-result transcript. diff --git a/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx index 34c73acb1..409e0a778 100644 --- a/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx +++ b/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx @@ -65,7 +65,7 @@ tests: agentv eval ./evals/example.yaml ``` -Results appear in `.agentv/results//index.jsonl` with scores, reasoning, and execution traces. +Results appear in `.agentv/results//.internal/index.jsonl` with scores, reasoning, and execution traces. ## Next Steps diff --git a/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx b/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx index cce9887f5..827cea502 100644 --- a/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx +++ b/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx @@ -81,9 +81,9 @@ Each autoresearch session creates a self-contained experiment directory: │ ├── iterations.jsonl # Per-cycle data (score, decision, mutation) │ └── trajectory.html # Live-updating Chart.js visualization ├── 2026-04-15T10-30-00/ # Cycle 1 run artifacts -│ ├── index.jsonl +│ ├── .internal/index.jsonl │ ├── grading.json -│ └── timing.json +│ └── metrics.json ├── 2026-04-15T10-35-00/ # Cycle 2 run artifacts │ └── ... └── ... 
diff --git a/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx index da2e88bc0..6594efaac 100644 --- a/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx +++ b/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx @@ -12,9 +12,9 @@ external adapters. The contract is run-centric: - `summary.json` owns aggregate run facts. -- `index.jsonl` owns row-level discovery and filtering. +- `.internal/index.jsonl` owns per-run row discovery and filtering. - Per-case sidecars own detailed payloads such as grading, metrics, transcripts, - timing, generated files, and raw provider evidence. + generated files, and raw provider evidence. - Dashboard, search, SQLite, HTML reports, and vendor exports are rebuildable projections over the bundle. @@ -24,10 +24,17 @@ The default local layout is: ```text .agentv/results/ + .indexes/ + runs.jsonl # rebuildable cross-run run catalog + cases.jsonl # rebuildable cross-run case catalog + .cache/ / summary.json - index.jsonl - tags.json # optional mutable Dashboard tags + .internal/ + index.jsonl # one row per case/result in this run + progress.json + events.jsonl + bundle.json / summary.json # optional per-case aggregate, especially repeats test/ # optional generated test bundle @@ -35,28 +42,24 @@ The default local layout is: targets.yaml files/ graders/ - attempt-1/ + sample-1/ result.json grading.json metrics.json - timing.json transcript.json transcript-raw.jsonl outputs/ answer.md file_changes.diff - attempt-2/ + sample-2/ result.json grading.json metrics.json - timing.json transcript.json transcript-raw.jsonl outputs/ answer.md file_changes.diff - .indexes/ # reserved rebuildable/local indexes - .cache/ # reserved local cache ``` `` is the only committed run-bundle path identity. It helps AgentV put @@ -85,22 +88,21 @@ reserved for rebuildable local state and are skipped by run discovery. 
| File or field | Owns | Use it for | | --- | --- | --- | | `summary.json` | Aggregate run metadata and rollups: run id, experiment metadata, counts, pass rate, score summaries, duration, token/cost totals, and writer metadata. | Listing runs, CI summaries, quick dashboards, trend cards, and validating that a run is complete enough to inspect. | -| `index.jsonl` | Canonical row index: one row per result, attempt, or case-level aggregate, with identity fields, filter metadata, scores, status, and explicit run-relative paths to sidecars. | Filtering, compare/trend inputs, Dashboard detail routing, rerun/resume lookup, export adapters, and artifact discovery. | +| `.internal/index.jsonl` | Canonical per-run row index: one row per case/result aggregate, with identity fields, filter metadata, scores, status, and explicit run-relative paths to sidecars. | Filtering, compare/trend inputs, Dashboard detail routing, rerun/resume lookup, export adapters, and artifact discovery. | | `result.json` | Compact per-attempt manifest for one attempt directory, including AgentV `execution_status` and `verdict`. | Loading one attempt without scanning the whole run index. | | `grading.json` | Grader outputs, `assertion_results`, rubric evidence, execution-metric grader facts, and scoring provenance. | Explaining why a row passed or failed. | -| `metrics.json` | Derived executor behavior summary, such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, metric-style graders, adapter projections, and lightweight analysis. | +| `metrics.json` | Duration, token usage, cost, execution status, trajectory, and derived executor behavior such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, cost/latency reporting, metric-style graders, adapter projections, and lightweight analysis. 
| | `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and script graders still receive the same full diff through `file_changes`. | -| `timing.json` | Duration, token usage, cost usage, and source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. | Cost/latency reporting and provider-accounting audits. | | `transcript.json` | AgentV-normalized transcript/timeline document with canonical `tool_name` values and `transcript_summary`. | Portable human review, transcript-aware graders, and tool-trajectory analysis. | | `transcript-raw.jsonl` | Native provider or harness evidence when available. | Parser debugging, forensic review, and preserving source bytes without making provider schemas public AgentV fields. | | `test/` | Generated test bundle for the exact eval slice and target settings that produced a row. | Audit, external review, and rerun workflows that should not depend on a mutable source checkout. | | `artifact_pointers` | Offload indirection for large detached payload bytes. | Finding payloads published outside the primary metadata/control-plane branch, such as transcript bytes on `agentv/artifacts/v1`. | -`summary.json` and `index.jsonl` are complementary, not redundant. A run list +`summary.json` and `.internal/index.jsonl` are complementary, not redundant. A run list should not scan every row just to show pass rate or total duration, and a row reader should not parse aggregate summary structures to find one case's grading or transcript. Keep aggregate questions on `summary.json`; keep row and artifact -discovery on `index.jsonl`. +discovery on `.internal/index.jsonl`. 
## Grading Contract @@ -181,17 +183,18 @@ Example row: "test_id": "refund-eligibility", "target": "codex-gpt5", "variant": "skills-v2", - "attempt": 1, + "sample_index": 1, + "retry_index": 0, + "provenance": "native", "execution_status": "ok", "score": 0.92, "duration_ms": 184200, "result_dir": "refund-eligibility--4f9a7c2d1b6e", "summary_path": "refund-eligibility--4f9a7c2d1b6e/summary.json", - "grading_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/grading.json", - "metrics_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/metrics.json", - "timing_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/timing.json", - "transcript_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/transcript.json", - "transcript_raw_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/transcript-raw.jsonl", + "grading_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/grading.json", + "metrics_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/metrics.json", + "transcript_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/transcript.json", + "transcript_raw_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/transcript-raw.jsonl", "transcript_summary": { "total_turns": 4, "tool_calls": { "file_read": 2, "shell": 1, "unknown": 0 }, @@ -202,16 +205,16 @@ Example row: "errors": [], "thinking_blocks": 1 }, - "output_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/outputs/answer.md", - "answer_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/outputs/answer.md", - "file_changes_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/outputs/file_changes.diff", + "output_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/outputs/answer.md", + "answer_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/outputs/answer.md", + "file_changes_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/outputs/file_changes.diff", "test_dir": "refund-eligibility--4f9a7c2d1b6e/test" } ``` Rows can represent repeated attempts, multi-target runs, imported suites, manual `prepare`/`grade` attempts, or imported 
provider sessions. That is why -`experiment`, `eval_path`, `test_id`, `target`, `variant`, `attempt`, and +`experiment`, `eval_path`, `test_id`, `target`, `variant`, `sample_index`, `retry_index`, and source metadata belong in `index.jsonl`: tools can filter dynamically without requiring every run to be pre-split into semantic folders. @@ -221,8 +224,8 @@ each row and as `summary.json.metadata.tags`. Its reserved `experiment` key matches the row `experiment` field, so trend/compare views can group by `tags.experiment`. -Use `repeat` for authoring configuration and `attempts` for produced -executions. The `attempt-1/`, `attempt-2/`, and later folders under a result +Use `repeat` for authoring configuration and `samples` for produced +executions. The `sample-1/`, `sample-2/`, and later folders under a result directory are artifact folders for those produced executions. Do not treat those folder names as the comparison dimension. Repeated stochastic samples should be represented by explicit metadata such as `sample_index` and `sample_count`; @@ -265,7 +268,7 @@ Run an eval and inspect the portable bundle: agentv eval evals/support/refunds.eval.yaml --experiment with_skills ls .agentv/results/ cat .agentv/results//summary.json -cat .agentv/results//index.jsonl +cat .agentv/results//.internal/index.jsonl ``` Find failed rows without loading every sidecar: @@ -273,15 +276,15 @@ Find failed rows without loading every sidecar: ```bash jq -r 'select(.execution_status != "ok" or .score < 0.5) | [.eval_path, .test_id, .target, .grading_path] | @tsv' \ - .agentv/results//index.jsonl + .agentv/results//.internal/index.jsonl ``` Compare two completed runs by their row indexes: ```bash agentv results compare \ - .agentv/results//index.jsonl \ - .agentv/results//index.jsonl + .agentv/results//.internal/index.jsonl \ + .agentv/results//.internal/index.jsonl ``` Generate a shareable report from the same canonical bundle: @@ -302,7 +305,7 @@ import { createInterface } from 
"node:readline"; export async function* rows(runDir: string) { const rl = createInterface({ - input: createReadStream(path.join(runDir, "index.jsonl"), "utf8"), + input: createReadStream(path.join(runDir, ".internal/index.jsonl"), "utf8"), crlfDelay: Infinity, }); @@ -323,8 +326,8 @@ for await (const row of rows(".agentv/results/2026-run")) { Adapter guidance: - Preserve unknown row fields when possible. -- Prefer path fields such as `grading_path`, `metrics_path`, `timing_path`, - `transcript_path`, and `transcript_raw_path` over ad hoc path construction. +- Prefer path fields such as `grading_path`, `metrics_path`, `transcript_path`, + and `transcript_raw_path` over ad hoc path construction. - Use `artifact_pointers` only for detached payload lookup; do not make pointers the discovery path for ordinary sidecars that are present in the run tree. - If you build a database or search index, store enough source metadata to diff --git a/apps/web/src/content/docs/docs/next/tools/compare.mdx b/apps/web/src/content/docs/docs/next/tools/compare.mdx index 348bcc5d4..dfff2ee14 100644 --- a/apps/web/src/content/docs/docs/next/tools/compare.mdx +++ b/apps/web/src/content/docs/docs/next/tools/compare.mdx @@ -15,11 +15,11 @@ Run two evaluations and compare them: agentv eval evals/my-eval.yaml --output .agentv/results/before # ... make changes to your agent ... agentv eval evals/my-eval.yaml --output .agentv/results/after -agentv results compare .agentv/results/before/index.jsonl .agentv/results/after/index.jsonl +agentv results compare .agentv/results/before/.internal/index.jsonl .agentv/results/after/.internal/index.jsonl ``` `index.jsonl` is the canonical row-level result index. New runs live at -`.agentv/results//index.jsonl`. +`.agentv/results//.internal/index.jsonl`. 
## Options @@ -132,7 +132,7 @@ agentv eval evals/*.yaml --target gpt-4 --output .agentv/results/baseline agentv eval evals/*.yaml --target gpt-4o --output .agentv/results/candidate # Compare results -agentv results compare .agentv/results/baseline/index.jsonl .agentv/results/candidate/index.jsonl +agentv results compare .agentv/results/baseline/.internal/index.jsonl .agentv/results/candidate/.internal/index.jsonl ``` ### Prompt Optimization @@ -147,7 +147,7 @@ agentv eval evals/*.yaml --output .agentv/results/before agentv eval evals/*.yaml --output .agentv/results/after # Compare with strict threshold -agentv results compare .agentv/results/before/index.jsonl .agentv/results/after/index.jsonl --threshold 0.05 +agentv results compare .agentv/results/before/.internal/index.jsonl .agentv/results/after/.internal/index.jsonl --threshold 0.05 ``` ### CI Quality Gate @@ -157,8 +157,8 @@ Fail CI if the candidate regresses: ```bash #!/bin/bash agentv results compare \ - .agentv/results/baseline/index.jsonl \ - .agentv/results/candidate/index.jsonl + .agentv/results/baseline/.internal/index.jsonl \ + .agentv/results/candidate/.internal/index.jsonl if [ $? -eq 1 ]; then echo "Regression detected! Candidate performs worse than baseline." 
exit 1 diff --git a/apps/web/src/content/docs/docs/next/tools/inspect.mdx b/apps/web/src/content/docs/docs/next/tools/inspect.mdx index 6b85c0278..ada156314 100644 --- a/apps/web/src/content/docs/docs/next/tools/inspect.mdx +++ b/apps/web/src/content/docs/docs/next/tools/inspect.mdx @@ -96,7 +96,7 @@ agentv inspect show trace.otlp.json --format json \ | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]' # Compare providers -agentv inspect stats .agentv/results//index.jsonl --group-by target --format json \ +agentv inspect stats .agentv/results//.internal/index.jsonl --group-by target --format json \ | jq '.groups[] | {label, score_mean: .metrics.score.mean}' ``` diff --git a/apps/web/src/content/docs/docs/next/tools/results.mdx b/apps/web/src/content/docs/docs/next/tools/results.mdx index 64e04be8b..8636f8cd6 100644 --- a/apps/web/src/content/docs/docs/next/tools/results.mdx +++ b/apps/web/src/content/docs/docs/next/tools/results.mdx @@ -48,7 +48,7 @@ Examples: agentv results report .agentv/results/2026-03-14T10-32-00_claude # Use an explicit output path -agentv results report .agentv/results/2026-03-14T10-32-00_claude/index.jsonl \ +agentv results report .agentv/results/2026-03-14T10-32-00_claude/.internal/index.jsonl \ --out ./reports/human-review.html ``` @@ -128,13 +128,13 @@ and metric-style graders; it is not canonical trace storage and does not carry token/cost usage. Every case uses aggregate `summary.json`, then stores execution artifact details -under `attempt-N/`. Each `attempt-N/` contains a compact per-attempt manifest -`result.json`, `grading.json`, `metrics.json`, `timing.json`, +under `sample-N/`. Each `sample-N/` contains a compact per-attempt manifest +`result.json`, `grading.json`, `metrics.json`, `transcript.json`, `transcript-raw.jsonl`, `outputs/answer.md`, and `outputs/file_changes.diff` when workspace changes were captured. 
The `result.json` file carries AgentV `execution_status` and `verdict` fields plus `grading_path`, `metrics_path`, transcript, output, and `file_changes_path` -paths. Treat `attempt-N/` as an artifact attempt folder, not as a comparison +paths. Treat `sample-N/` as an artifact attempt folder, not as a comparison dimension; stochastic samples and infrastructure retries should be represented with explicit sample/retry metadata rather than inferred from folder names. @@ -151,7 +151,7 @@ lightweight explicit paths such as `transcript_path`, `transcript_raw_path`, detached payload publishing needs them. Dashboard search indexes, SQLite indexes, and other read models are derived projections over these run artifacts, not replacements for `index.jsonl`. -Duration, token, and cost usage remains in `timing.json`, including source +Duration, token, and cost usage is recorded in `metrics.json`, including source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. @@ -182,10 +182,10 @@ Agent Skills eval artifacts map into AgentV like this: | Agent Skills pattern | AgentV field | Artifact location | |----------------------|--------------|-------------------| | Converted Agent Skills cases | AgentV eval cases and test bundle paths | Converted EVAL YAML plus optional `test_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` | -| Per-case answer | Generated target output artifact | `attempt-N/outputs/answer.md` | -| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `attempt-N/transcript.json`, `attempt-N/transcript-raw.jsonl`, `attempt-N/metrics.json` | -| Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `attempt-N/timing.json` | -| Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `attempt-N/grading.json`; summary fields can reference the same trace/result facts | +| Per-case answer | Generated target output 
artifact | `sample-N/outputs/answer.md` | +| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `sample-N/transcript.json`, `sample-N/transcript-raw.jsonl`, `sample-N/metrics.json` | +| Per-sample `metrics.json` | Duration, token totals, cost, execution, trajectory, and usage source labels | `sample-N/metrics.json` | +| Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `sample-N/grading.json`; summary fields can reference the same trace/result facts | | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` | | Transcript/log outlier analysis | Normalized transcript, raw evidence, metrics, and optional external trace link | `transcript.json` for portable review; `transcript-raw.jsonl` for native evidence; `metrics.json` for behavior summaries; `external_trace` for link-out correlation | | Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `summary.json`, result comparisons, and projection bundles | diff --git a/apps/web/src/content/docs/docs/next/tools/trend.mdx b/apps/web/src/content/docs/docs/next/tools/trend.mdx index 3b1c307e7..5f55f7aa4 100644 --- a/apps/web/src/content/docs/docs/next/tools/trend.mdx +++ b/apps/web/src/content/docs/docs/next/tools/trend.mdx @@ -30,7 +30,7 @@ Point directly at run workspaces or `index.jsonl` manifests when you need a spec ```bash agentv results trend \ .agentv/results/2026-03-01T10-00-00-000Z/ \ - .agentv/results/2026-03-08T10-00-00-000Z/index.jsonl \ + .agentv/results/2026-03-08T10-00-00-000Z/.internal/index.jsonl \ .agentv/results/2026-03-15T10-00-00-000Z/ ``` @@ -46,7 +46,7 @@ agentv results trend --last 8 --suite code-review --target claude-sonnet \ `trend` only accepts canonical run workspaces: - `.agentv/results//` -- `.agentv/results//index.jsonl` +- `.agentv/results//.internal/index.jsonl` Legacy flat `results.jsonl` files are rejected. 
The command stays on lightweight `index.jsonl` manifests and does not require per-test artifact @@ -114,7 +114,7 @@ Regression Gate: threshold=0.010 fail_on_degrading=true triggered=true "runs": [ { "label": "2026-03-01T10:00:00.000Z", - "path": "/repo/.agentv/results/2026-03-01T10-00-00-000Z/index.jsonl", + "path": "/repo/.agentv/results/2026-03-01T10-00-00-000Z/.internal/index.jsonl", "timestamp": "2026-03-01T10:00:00.000Z", "matched_test_count": 42, "mean_score": 0.92 diff --git a/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx b/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx index cbb675944..559241984 100644 --- a/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx +++ b/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx @@ -23,7 +23,7 @@ If no results repo is configured, or auto-push is disabled, `agentv eval` still | Location | Path or ref | What it contains | | --- | --- | --- | | Local project | `.agentv/results//summary.json` | A run-start stub with `metadata.run_id`, `metadata.experiment`, `metadata.planned_test_count`, and the eval file path when known. This lets Dashboard recognize incomplete local runs as resumable. | -| Local project | `.agentv/results//index.jsonl` | Result rows appended as test cases finish. Rows use the normal snake_case result JSONL format. | +| Local project | `.agentv/results//.internal/index.jsonl` | Result rows appended as test cases finish. Rows use the normal snake_case result JSONL format. | | Results repo remote | `agentv/wip//` | A forced-updated branch containing the checkpointed run under `.agentv/results//`. | | Results repo storage branch | Configured `results.repo.branch`; local checkout configs default to `agentv/results/v1` | The final published run after `agentv eval` completes and the normal auto-export succeeds. 
| diff --git a/packages/core/src/evaluation/metrics.ts b/packages/core/src/evaluation/metrics.ts index 481616ee5..0099abc94 100644 --- a/packages/core/src/evaluation/metrics.ts +++ b/packages/core/src/evaluation/metrics.ts @@ -4,8 +4,9 @@ * This is a derived per-case executor metrics projection over `EvaluationResult` * and the internal trace envelope. It aligns with AgentV's case-local `metrics.json` * while carrying compact executor observability fields. It is not the - * canonical trace store; portable transcript detail stays in `transcript.json`, and - * duration/token/cost usage stays in `timing.json`. + * canonical trace store; portable transcript detail stays in `transcript.json`. + * Duration, token, and cost usage live in this artifact's top-level + * `duration`, `tokens`, and `cost` sections. */ import { z } from 'zod'; @@ -115,27 +116,42 @@ const ReasoningBlockWireSchema = z }) .strict(); -const MetricsTimingWireSchema = z +const MetricsDurationWireSchema = z .object({ - total_tokens: z.number().int().nonnegative(), - duration_ms: z.number().nonnegative(), - total_duration_seconds: z.number().nonnegative(), - cost_usd: z.number().nonnegative().nullable(), - token_usage: z + total_ms: z.number().nonnegative(), + total_seconds: z.number().nonnegative(), + mean_ms: z.number().nonnegative().optional(), + mean_seconds: z.number().nonnegative().optional(), + source: z.enum(TIMING_SOURCE_VALUES), + stats: z .object({ - input: z.number().int().nonnegative(), - output: z.number().int().nonnegative(), - reasoning: z.number().int().nonnegative(), + count: z.number().int().nonnegative(), + mean_ms: z.number().nonnegative(), + mean_seconds: z.number().nonnegative(), + stddev_ms: z.number().nonnegative(), + stddev_seconds: z.number().nonnegative(), + min_ms: z.number().nonnegative(), + max_ms: z.number().nonnegative(), }) - .strict(), - usage_sources: z - .object({ - token_usage: z.enum(TIMING_SOURCE_VALUES), - total_tokens: z.enum(TIMING_SOURCE_VALUES), - duration: 
z.enum(TIMING_SOURCE_VALUES), - cost: z.enum(TIMING_SOURCE_VALUES), - }) - .strict(), + .strict() + .optional(), + }) + .strict(); + +const MetricsTokensWireSchema = z + .object({ + total: z.number().int().nonnegative(), + input: z.number().int().nonnegative(), + output: z.number().int().nonnegative(), + reasoning: z.number().int().nonnegative(), + source: z.enum(TIMING_SOURCE_VALUES), + }) + .strict(); + +const MetricsCostWireSchema = z + .object({ + usd: z.number().nonnegative().nullable(), + source: z.enum(TIMING_SOURCE_VALUES), }) .strict(); @@ -184,12 +200,15 @@ export const MetricsArtifactWireSchema = z .object({ transcript_path: z.string().optional(), grading_path: z.string().optional(), - timing_path: z.string().optional(), file_changes_path: z.string().optional(), }) .strict(), + duration: MetricsDurationWireSchema.optional(), + tokens: MetricsTokensWireSchema.optional(), + cost: MetricsCostWireSchema.optional(), + execution: z.record(z.string(), z.unknown()).optional(), + trajectory: z.record(z.string(), z.unknown()).optional(), metrics: MetricsWireSchema, - timing: MetricsTimingWireSchema.optional(), }) .strict(); @@ -879,7 +898,6 @@ export function buildMetricsArtifact( options: { transcriptPath?: string; gradingPath?: string; - timingPath?: string; fileChangesPath?: string; generatedAt?: string; } = {}, @@ -902,7 +920,6 @@ export function buildMetricsArtifact( source_artifacts: dropUndefined({ transcript_path: options.transcriptPath, grading_path: options.gradingPath, - timing_path: options.timingPath, file_changes_path: options.fileChangesPath, }), metrics: buildMetrics(result), diff --git a/packages/core/src/evaluation/results-repo-cache.test.ts b/packages/core/src/evaluation/results-repo-cache.test.ts index f41c865bf..ebb7a6e12 100644 --- a/packages/core/src/evaluation/results-repo-cache.test.ts +++ b/packages/core/src/evaluation/results-repo-cache.test.ts @@ -148,7 +148,7 @@ describe('git results filesystem index cache', () => { run_id: 'sentinel', 
experiment: 'default', timestamp: '2026-06-28T01-00-00-000Z', - manifest_path: 'sentinel/.internal/index.jsonl', + index_path: 'sentinel/.internal/index.jsonl', display_name: 'from cache', test_count: 1, avg_score: 0.5, diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index cd7124142..1de620383 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -67,6 +67,7 @@ const GIT_EMPTY_TREE = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'; const RESULTS_REPO_GENESIS_MESSAGE = 'chore(results): initialize AgentV results branch'; const RESULTS_REPO_GENESIS_DATE = '@0 +0000'; const RESULT_INDEX_FILENAME = 'index.jsonl'; +const RUN_INTERNAL_DIRNAME = '.internal'; // Artifact-aware merge config for the AgentV-owned results checkout. Concurrent // writers append to rebuildable cross-run JSONL catalogs and each run's @@ -2924,8 +2925,8 @@ function safeLocalSummaryManifestPath( function resolveLocalResultManifestPath(sourceDir: string): string | undefined { try { const summary = JSON.parse(readFileSync(path.join(sourceDir, 'summary.json'), 'utf8')) as { - manifest_path?: unknown; index_path?: unknown; + manifest_path?: unknown; }; const manifestPath = safeLocalSummaryManifestPath( sourceDir, @@ -2936,15 +2937,14 @@ function resolveLocalResultManifestPath(sourceDir: string): string | undefined { } } catch {} - const internalManifestPath = path.join(sourceDir, '.internal', RESULT_INDEX_FILENAME); - if (existsSync(internalManifestPath)) { - return internalManifestPath; - } - - const manifestPath = path.join(sourceDir, RESULT_INDEX_FILENAME); + const manifestPath = path.join(sourceDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME); if (existsSync(manifestPath)) { return manifestPath; } + const legacyManifestPath = path.join(sourceDir, RESULT_INDEX_FILENAME); + if (existsSync(legacyManifestPath)) { + return legacyManifestPath; + } return undefined; } @@ -3719,7 +3719,7 @@ export 
interface GitListedRun { timestamp: string; pass_rate?: number; target?: string; - manifest_path: string; + index_path: string; summary_path?: string; display_name: string; test_count: number; @@ -3745,8 +3745,8 @@ type GitBatchBlob = { }; type GitRunSummary = { - readonly manifest_path?: string; readonly index_path?: string; + readonly manifest_path?: string; readonly metadata?: { readonly display_name?: string; readonly timestamp?: string; @@ -4216,7 +4216,7 @@ function isGitListedRun(value: unknown): value is GitListedRun { typeof record.run_id === 'string' && typeof record.experiment === 'string' && typeof record.timestamp === 'string' && - typeof record.manifest_path === 'string' && + (typeof record.index_path === 'string' || typeof record.manifest_path === 'string') && typeof record.display_name === 'string' && typeof record.test_count === 'number' && typeof record.avg_score === 'number' && @@ -4443,7 +4443,7 @@ export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise timestamp, ...(passRate !== undefined && { pass_rate: passRate }), ...(targets.length === 1 && targets[0] ? { target: targets[0] } : {}), - manifest_path: manifestPath, + index_path: manifestPath, ...(summaryByPath.has(summaryPath) && { summary_path: summaryPath }), display_name: displayName, test_count: summary?.metadata?.tests_run?.length ?? rowTestIds.length, diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts index e687a0cb5..bb26a6ae4 100644 --- a/packages/core/src/evaluation/run-artifacts.ts +++ b/packages/core/src/evaluation/run-artifacts.ts @@ -2,13 +2,14 @@ * Canonical AgentV run artifact helpers. * * This module owns the shared run-workspace contract used by CLI and - * programmatic evals: `index.jsonl`, run-root `summary.json`, per-case - * `summary.json`, `run-N/result.json`, and transcript projections. Keep wire - * keys in snake_case here so every caller produces the same artifacts. 
+ * programmatic evals: run-root `summary.json`, per-run + * `.internal/index.jsonl`, per-case `summary.json`, `sample-N/result.json`, + * and transcript projections. Keep wire keys in snake_case here so every caller + * produces the same artifacts. */ import { createHash } from 'node:crypto'; -import { copyFile, mkdir, readFile, rm, rmdir, writeFile } from 'node:fs/promises'; +import { copyFile, mkdir, readFile, readdir, rm, rmdir, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -60,6 +61,25 @@ import type { export const RESULT_INDEX_FILENAME = 'index.jsonl'; export const RUN_SUMMARY_FILENAME = 'summary.json'; +export const RUN_INTERNAL_DIRNAME = '.internal'; +export const CROSS_RUN_INDEX_DIRNAME = '.indexes'; +export const CROSS_RUN_RUNS_INDEX_FILENAME = 'runs.jsonl'; +export const CROSS_RUN_CASES_INDEX_FILENAME = 'cases.jsonl'; + +export function runInternalPath(runDir: string, filename: string): string { + return path.join(runDir, RUN_INTERNAL_DIRNAME, filename); +} + +export function runIndexPath(runDir: string): string { + return runInternalPath(runDir, RESULT_INDEX_FILENAME); +} + +function isCanonicalResultsRoot(resultsRoot: string): boolean { + return ( + path.basename(resultsRoot) === 'results' && + path.basename(path.dirname(resultsRoot)) === '.agentv' + ); +} const TIMING_SOURCE_VALUES = [ 'provider_reported', @@ -171,8 +191,7 @@ export async function aggregateRunDir( tags?: Record; }, ): Promise<{ summaryPath: string; testCount: number; targetCount: number }> { - const indexPath = - (await resolveExistingResultManifestPath(runDir)) ?? path.join(runDir, RESULT_INDEX_FILENAME); + const indexPath = (await resolveExistingResultManifestPath(runDir)) ?? 
runIndexPath(runDir); const content = await readFile(indexPath, 'utf8'); const allResults = parseJsonlResults(content); const results = deduplicateByTestIdTarget(allResults); @@ -223,8 +242,8 @@ async function readRunSummaryManifestPath(runDir: string): Promise; + }; +} + +interface DurationStats { + readonly count: number; + readonly mean_ms: number; + readonly mean_seconds: number; + readonly stddev_ms: number; + readonly stddev_seconds: number; + readonly min_ms: number; + readonly max_ms: number; } export interface RunSummaryArtifact { - readonly manifest_path: string; + readonly index_path: string; + readonly run_id?: string; + readonly status: { + readonly passed: { readonly count: number; readonly percentage: number }; + readonly failed: { readonly count: number; readonly percentage: number }; + readonly errored: { readonly count: number; readonly percentage: number }; + readonly skipped: { readonly count: number; readonly percentage: number }; + }; + readonly counts: { + readonly total_cases: number; + readonly total_instances: number; + readonly passed_cases: number; + readonly failed_cases: number; + readonly errored_instances: number; + }; + readonly pass_at_k: { + readonly k: number; + readonly passed_cases: number; + readonly total_cases: number; + readonly rate: number; + }; + readonly usage: { + readonly total_tokens: number; + readonly input_tokens: number; + readonly output_tokens: number; + readonly reasoning_tokens: number; + readonly cost_usd: number | null; + }; + readonly infra_failures: { + readonly total: number; + readonly reasons: readonly { readonly reason: string; readonly count: number }[]; + }; + readonly cases: readonly Record[]; + readonly instances: readonly Record[]; readonly metadata: { readonly run_id?: string; readonly eval_file: string; @@ -458,7 +535,7 @@ export interface RunSummaryArtifact { } >; readonly per_grader_summary?: Record; - readonly timing: TimingArtifact; + readonly metrics: TimingArtifact; readonly notes: 
readonly string[]; } @@ -501,6 +578,9 @@ export interface IndexArtifactEntry { readonly start_time?: string; readonly end_time?: string; readonly scores?: readonly Record[]; + readonly named_scores?: Record; + readonly derived_metrics?: Record; + readonly provenance?: string; readonly attempts?: readonly TrialResultArtifact[]; readonly aggregation?: TrialAggregationArtifact; readonly execution_status?: string; @@ -510,7 +590,6 @@ export interface IndexArtifactEntry { readonly workspace_path?: string; readonly result_dir?: string; readonly grading_path?: string; - readonly timing_path?: string; readonly summary_path?: string; readonly output_path?: string; readonly answer_path?: string; @@ -521,6 +600,8 @@ export interface IndexArtifactEntry { readonly file_changes_path?: string; readonly artifact_pointers?: ResultArtifactPointersWire; readonly runtime_source?: RunRuntimeSourceMetadata; + readonly sample_index?: number; + readonly retry_index?: number; readonly raw_provider_log_path?: string; readonly input_path?: string; readonly test_dir?: string; @@ -565,6 +646,8 @@ export interface AdditionalResultArtifactsContext { export interface AgentVRunResultArtifact { readonly execution_status: EvaluationResult['executionStatus']; readonly verdict: TrialResult['verdict']; + readonly sample_index?: number; + readonly retry_index?: number; readonly duration_ms?: number; readonly duration_seconds: number; readonly model: string; @@ -591,7 +674,6 @@ export interface AgentVRunResultArtifact { readonly file_changes?: string; readonly scripts?: Record; }; - readonly timing?: TimingArtifact; } export interface RepeatCaseSummaryArtifact { @@ -601,13 +683,11 @@ export interface RepeatCaseSummaryArtifact { readonly mean_duration_ms: number; readonly mean_duration_seconds: number; readonly fingerprint: string; - readonly total_tokens: number; - readonly duration_ms: number; - readonly total_duration_seconds: number; - readonly duration_stats?: TimingArtifact['duration_stats']; - 
readonly cost_usd: number | null; - readonly token_usage: TimingArtifact['token_usage']; - readonly usage_sources: TimingArtifact['usage_sources']; + readonly duration: TimingArtifact['duration']; + readonly tokens: TimingArtifact['tokens']; + readonly cost: TimingArtifact['cost']; + readonly execution?: TimingArtifact['execution']; + readonly trajectory?: TimingArtifact['trajectory']; } export type AdditionalResultArtifactsWriter = ( @@ -761,8 +841,31 @@ function toIndexScores(scores: readonly GraderResult[] | undefined): IndexArtifa return scores?.map(toIndexScore) as IndexArtifactEntry['scores']; } -function attemptDirName(attempt: number): string { - return `attempt-${attempt + 1}`; +function collectNamedScores( + scores: readonly GraderResult[] | undefined, + out: Record = {}, +): Record | undefined { + for (const score of scores ?? []) { + if (score.name) { + out[score.name] = score.score; + } + collectNamedScores(score.scores, out); + } + return Object.keys(out).length > 0 ? out : undefined; +} + +function resultDerivedMetrics(result: EvaluationResult): Record | undefined { + const value = result.metadata?.derived_metrics ?? result.metadata?.derivedMetrics; + return isRecord(value) ? value : undefined; +} + +function resultProvenance(result: EvaluationResult): string { + const value = result.metadata?.provenance; + return typeof value === 'string' && value.trim().length > 0 ? value : 'native'; +} + +function sampleDirName(sampleIndex: number): string { + return `sample-${sampleIndex + 1}`; } function hasPersistedTrialRuns(result: EvaluationResult): boolean { @@ -784,7 +887,7 @@ function toTrialArtifacts( } return trials.map((trial) => ({ attempt: trial.attempt, - attempt_path: trial.result ? attemptDirName(trial.attempt) : undefined, + sample_path: trial.result ? 
sampleDirName(trial.attempt) : undefined, score: trial.score, verdict: trial.verdict, scores: toIndexScores(trial.scores), @@ -904,16 +1007,19 @@ function buildRepeatAggregateTimingArtifact(result: EvaluationResult): TimingArt const maxMs = Math.max(...durationsMs); return { ...timing, - mean_duration_ms: stats.mean, - mean_duration_seconds: roundSecondsFromMs(stats.mean), - duration_stats: { - count: durationsMs.length, + duration: { + ...timing.duration, mean_ms: stats.mean, mean_seconds: roundSecondsFromMs(stats.mean), - stddev_ms: stats.stddev, - stddev_seconds: roundSecondsFromMs(stats.stddev), - min_ms: roundMillis(minMs), - max_ms: roundMillis(maxMs), + stats: { + count: durationsMs.length, + mean_ms: stats.mean, + mean_seconds: roundSecondsFromMs(stats.mean), + stddev_ms: stats.stddev, + stddev_seconds: roundSecondsFromMs(stats.stddev), + min_ms: roundMillis(minMs), + max_ms: roundMillis(maxMs), + }, }, }; } @@ -926,6 +1032,10 @@ function formatRepeatPassRate(passedRuns: number, totalRuns: number): string { return `${Number.isInteger(percent) ? percent.toFixed(0) : percent.toFixed(1)}%`; } +function percentage(count: number, total: number): number { + return total > 0 ? Math.round((count / total) * 1000) / 1000 : 0; +} + function fallbackRepeatFingerprint(result: EvaluationResult): string { return createHash('sha256') .update( @@ -952,23 +1062,21 @@ function buildRepeatCaseSummaryArtifact( : resultVerdict(result) === 'pass' ? 1 : 0; - const fallbackMeanMs = totalRuns > 0 ? roundMillis(timing.duration_ms / totalRuns) : 0; - const meanDurationMs = timing.mean_duration_ms ?? fallbackMeanMs; + const fallbackMeanMs = totalRuns > 0 ? roundMillis(timing.duration.total_ms / totalRuns) : 0; + const meanDurationMs = timing.duration.mean_ms ?? fallbackMeanMs; return { total_attempts: totalRuns, passed_attempts: passedRuns, pass_rate: formatRepeatPassRate(passedRuns, totalRuns), mean_duration_ms: meanDurationMs, - mean_duration_seconds: timing.mean_duration_seconds ?? 
roundSecondsFromMs(meanDurationMs), + mean_duration_seconds: timing.duration.mean_seconds ?? roundSecondsFromMs(meanDurationMs), fingerprint: fingerprint ?? fallbackRepeatFingerprint(result), - total_tokens: timing.total_tokens, - duration_ms: timing.duration_ms, - total_duration_seconds: timing.total_duration_seconds, - duration_stats: timing.duration_stats, - cost_usd: timing.cost_usd, - token_usage: timing.token_usage, - usage_sources: timing.usage_sources, + duration: timing.duration, + tokens: timing.tokens, + cost: timing.cost, + execution: timing.execution, + trajectory: timing.trajectory, }; } @@ -996,7 +1104,9 @@ function buildAgentVRunResultArtifact(params: { readonly trial: TrialResult; readonly result: EvaluationResult; readonly metricsArtifact: ReturnType & { - readonly timing?: TimingArtifact; + readonly duration?: TimingArtifact['duration']; + readonly tokens?: TimingArtifact['tokens']; + readonly cost?: TimingArtifact['cost']; }; readonly hasTranscript: boolean; readonly hasOutput: boolean; @@ -1009,6 +1119,8 @@ function buildAgentVRunResultArtifact(params: { return dropUndefined({ execution_status: params.trial.executionStatus ?? params.result.executionStatus, verdict: params.trial.verdict, + sample_index: params.result.sampleIndex, + retry_index: params.result.retryIndex, duration_ms: resultDurationMs(params.result), duration_seconds: resultDurationSeconds(params.result), model: params.result.target ?? 
'unknown', @@ -1039,7 +1151,6 @@ function buildAgentVRunResultArtifact(params: { file_changes: fileChangesPath, }) : undefined, - timing: params.metricsArtifact.timing, }) as unknown as AgentVRunResultArtifact; } @@ -1078,12 +1189,11 @@ async function writeTrialRunArtifacts(params: { return; } - const runDirName = attemptDirName(params.trial.attempt); + const runDirName = sampleDirName(params.trial.attempt); const runDir = path.join(params.parentTestDir, runDirName); const grading = buildGradingArtifact(result, { includeTrials: false }); const timing = buildTimingArtifact([result]); const gradingPath = path.join(runDir, 'grading.json'); - const timingPath = path.join(runDir, 'timing.json'); const metricsPath = path.join(runDir, CANONICAL_METRICS_ARTIFACT_PATH); const outputsDir = path.join(runDir, 'outputs'); const answerOutputPath = @@ -1110,7 +1220,6 @@ async function writeTrialRunArtifacts(params: { await mkdir(runDir, { recursive: true }); await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}\n`, 'utf8'); - await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); await mkdir(outputsDir, { recursive: true }); if (answerOutputPath) { @@ -1129,7 +1238,7 @@ async function writeTrialRunArtifacts(params: { envelope, transcriptArtifactPath: transcriptPath ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined, gradingArtifactPath: 'grading.json', - timingArtifactPath: 'timing.json', + timingArtifactPath: null, fileChangesArtifactPath: fileChangesPath ? 
CANONICAL_FILE_CHANGES_ARTIFACT_PATH : undefined, timing, }); @@ -1364,22 +1473,51 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin const durationSource = combineTimingSources(results, durationSources, hasDuration); const costSource = combineTimingSources(results, costSources, hasCost); + const first = results[0]; + const totalToolCalls = results.reduce((sum, result) => sum + countToolCalls(result).total, 0); return { - total_tokens: totalInput + totalOutput, - duration_ms: totalDurationMs, - total_duration_seconds: Math.round((totalDurationMs / 1000) * 1000) / 1000, - cost_usd: hasCost ? totalCostUsd : null, - token_usage: { + duration: { + total_ms: totalDurationMs, + total_seconds: Math.round((totalDurationMs / 1000) * 1000) / 1000, + source: durationSource, + }, + tokens: { + total: totalInput + totalOutput, input: totalInput, output: totalOutput, reasoning: totalReasoning, + source: tokenUsageSource, }, - usage_sources: { - token_usage: tokenUsageSource, - total_tokens: tokenUsageSource, - duration: durationSource, - cost: costSource, + cost: { + usd: hasCost ? totalCostUsd : null, + source: costSource, }, + execution: first + ? dropUndefined({ + status: first.executionStatus, + failure_stage: first.failureStage, + failure_reason_code: first.failureReasonCode, + }) + : undefined, + trajectory: + results.length > 0 + ? { + total_turns: results.reduce( + (sum, result) => + sum + + (result.trace.llmCallCount ?? + result.trace.messages.filter((message) => message.role === 'assistant').length), + 0, + ), + total_tool_calls: totalToolCalls, + tool_calls: results.reduce>((counts, result) => { + for (const [tool, count] of Object.entries(countToolCalls(result).toolCalls)) { + counts[tool] = (counts[tool] ?? 0) + count; + } + return counts; + }, {}), + } + : undefined, }; } @@ -1479,9 +1617,131 @@ export function buildRunSummaryArtifact( const firstResult = results[0]; const timestamp = firstResult?.timestamp ?? 
new Date().toISOString(); + const runMetrics = buildTimingArtifact(results); + const casesByKey = new Map< + string, + { + test_id: string; + suite?: string; + eval_path?: string; + target: string; + variant?: string; + sample_count: number; + pass_count: number; + status_counts: Record; + samples: Record[]; + } + >(); + const instances = results.flatMap((result) => { + const trials = materializedRunTrials(result); + return trials.map((trial) => { + const sampleIndex = trial.attempt + 1; + const status = trial.executionStatus ?? result.executionStatus; + const verdict = trial.verdict; + const caseKey = buildEvaluationResultTargetKey(result); + const sourceTest = undefined; + const evalPath = sourceEvalPath(result, sourceTest); + const caseSummary = casesByKey.get(caseKey) ?? { + test_id: result.testId ?? 'unknown', + suite: result.suite, + eval_path: evalPath, + target: result.target ?? 'unknown', + variant: result.variant, + sample_count: 0, + pass_count: 0, + status_counts: {}, + samples: [], + }; + caseSummary.sample_count += 1; + if (verdict === 'pass') { + caseSummary.pass_count += 1; + } + caseSummary.status_counts[status ?? 'unknown'] = + (caseSummary.status_counts[status ?? 'unknown'] ?? 0) + 1; + const instance = dropUndefined({ + test_id: result.testId ?? 'unknown', + suite: result.suite, + eval_path: evalPath, + target: result.target ?? 'unknown', + variant: result.variant, + sample_index: sampleIndex, + retry_index: result.metadata?.retry_index, + verdict, + score: trial.score, + execution_status: status, + failure_stage: trial.failureStage, + failure_reason_code: trial.failureReasonCode, + duration_ms: trial.result ? 
resultDurationMs(trial.result) : resultDurationMs(result), + cost_usd: trial.costUsd, + }); + caseSummary.samples.push(instance); + casesByKey.set(caseKey, caseSummary); + return instance; + }); + }); + const caseSummaries = [...casesByKey.values()].map((entry) => ({ + ...entry, + pass_rate: percentage(entry.pass_count, entry.sample_count), + pass_at_1: entry.pass_count > 0, + })); + const passedCases = caseSummaries.filter((entry) => entry.pass_count > 0).length; + const erroredInstances = instances.filter( + (entry) => entry.execution_status === 'execution_error', + ).length; + const failedCases = caseSummaries.length - passedCases; + const infraFailureCounts = new Map(); + for (const instance of instances) { + const reason = + typeof instance.failure_reason_code === 'string' + ? instance.failure_reason_code + : instance.execution_status === 'execution_error' + ? 'execution_error' + : undefined; + if (reason) { + infraFailureCounts.set(reason, (infraFailureCounts.get(reason) ?? 0) + 1); + } + } return { - manifest_path: RESULT_INDEX_FILENAME, + index_path: `${RUN_INTERNAL_DIRNAME}/${RESULT_INDEX_FILENAME}`, + run_id: runId, + status: { + passed: { count: passedCases, percentage: percentage(passedCases, caseSummaries.length) }, + failed: { count: failedCases, percentage: percentage(failedCases, caseSummaries.length) }, + errored: { + count: erroredInstances, + percentage: percentage(erroredInstances, instances.length), + }, + skipped: { count: 0, percentage: 0 }, + }, + counts: { + total_cases: caseSummaries.length, + total_instances: instances.length, + passed_cases: passedCases, + failed_cases: failedCases, + errored_instances: erroredInstances, + }, + pass_at_k: { + k: 1, + passed_cases: passedCases, + total_cases: caseSummaries.length, + rate: percentage(passedCases, caseSummaries.length), + }, + usage: { + total_tokens: runMetrics.tokens.total, + input_tokens: runMetrics.tokens.input, + output_tokens: runMetrics.tokens.output, + reasoning_tokens: 
runMetrics.tokens.reasoning, + cost_usd: runMetrics.cost.usd, + }, + infra_failures: { + total: [...infraFailureCounts.values()].reduce((sum, count) => sum + count, 0), + reasons: [...infraFailureCounts.entries()] + .sort(([a], [b]) => a.localeCompare(b)) + .map(([reason, count]) => ({ reason, count })), + }, + cases: caseSummaries, + instances, metadata: { run_id: runId, eval_file: evalFile, @@ -1497,7 +1757,7 @@ export function buildRunSummaryArtifact( }, run_summary: runSummary, per_grader_summary: perEvaluatorSummary, - timing: buildTimingArtifact(results), + metrics: runMetrics, notes, }; } @@ -1780,7 +2040,6 @@ export function buildIndexArtifactEntry( outputDir: string; resultDir?: string; gradingPath?: string; - timingPath?: string; summaryPath?: string; outputPath?: string; answerPath?: string; @@ -1811,6 +2070,9 @@ export function buildIndexArtifactEntry( start_time: result.startTime, end_time: result.endTime, scores: toIndexScores(result.scores), + named_scores: collectNamedScores(result.scores), + derived_metrics: resultDerivedMetrics(result), + provenance: resultProvenance(result), attempts: toIndexTrialArtifacts(result), aggregation: toTrialAggregationArtifact(result.aggregation), execution_status: result.executionStatus, @@ -1824,9 +2086,6 @@ export function buildIndexArtifactEntry( grading_path: options.gradingPath ? toRelativeArtifactPath(options.outputDir, options.gradingPath) : undefined, - timing_path: options.timingPath - ? toRelativeArtifactPath(options.outputDir, options.timingPath) - : undefined, summary_path: options.summaryPath ? 
toRelativeArtifactPath(options.outputDir, options.summaryPath) : undefined, @@ -1854,6 +2113,8 @@ export function buildIndexArtifactEntry( : undefined, artifact_pointers: options.artifactPointers, runtime_source: options.runtimeSource, + sample_index: result.sampleIndex, + retry_index: result.retryIndex, ...options.extraIndexFields, external_trace: toIndexExternalTrace(result, options.projectionIdentity?.dimensions.runId), projection_identity: options.projectionIdentity @@ -1884,7 +2145,7 @@ export function buildResultIndexArtifact( const hasFileChanges = result.fileChanges !== undefined && result.fileChanges.length > 0; const hasTranscript = resultHasExecutionTraceTranscript(result); const isSingleRun = !hasPersistedTrialRuns(result); - const singleRunDir = path.posix.join(artifactSubdir, attemptDirName(0)); + const singleRunDir = path.posix.join(artifactSubdir, sampleDirName(0)); return { timestamp: result.timestamp, @@ -1901,6 +2162,9 @@ export function buildResultIndexArtifact( start_time: result.startTime, end_time: result.endTime, scores: toIndexScores(result.scores), + named_scores: collectNamedScores(result.scores), + derived_metrics: resultDerivedMetrics(result), + provenance: resultProvenance(result), attempts: toIndexTrialArtifacts(result), aggregation: toTrialAggregationArtifact(result.aggregation), execution_status: result.executionStatus, @@ -1911,7 +2175,6 @@ export function buildResultIndexArtifact( result_dir: artifactSubdir, summary_path: path.posix.join(artifactSubdir, RUN_SUMMARY_FILENAME), grading_path: isSingleRun ? path.posix.join(singleRunDir, 'grading.json') : undefined, - timing_path: isSingleRun ? path.posix.join(singleRunDir, 'timing.json') : undefined, metrics_path: isSingleRun ? path.posix.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH) : undefined, @@ -1935,6 +2198,8 @@ export function buildResultIndexArtifact( isSingleRun && hasTranscript ? 
buildResultTranscriptSummary(result) : undefined, artifact_pointers: options?.artifactPointers, runtime_source: options?.runtimeSource, + sample_index: result.sampleIndex, + retry_index: result.retryIndex, ...extraIndexFields, external_trace: toIndexExternalTrace(result, options?.projectionIdentity?.dimensions.runId), projection_identity: options?.projectionIdentity @@ -2004,17 +2269,28 @@ function buildMetricsArtifactPayload(params: { readonly timingArtifactPath?: string | null; readonly fileChangesArtifactPath?: string; readonly timing?: TimingArtifact; -}): ReturnType & { readonly timing?: TimingArtifact } { +}): ReturnType & { + readonly duration?: TimingArtifact['duration']; + readonly tokens?: TimingArtifact['tokens']; + readonly cost?: TimingArtifact['cost']; +} { const artifact = buildMetricsArtifact(params.result, params.envelope, { transcriptPath: params.transcriptArtifactPath ?? (params.transcriptPath ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined), gradingPath: params.gradingArtifactPath ?? 'grading.json', - timingPath: - params.timingArtifactPath === null ? undefined : (params.timingArtifactPath ?? 'timing.json'), fileChangesPath: params.fileChangesArtifactPath, }); - return params.timing ? { ...artifact, timing: params.timing } : artifact; + return params.timing + ? 
{ + ...artifact, + duration: params.timing.duration, + tokens: params.timing.tokens, + cost: params.timing.cost, + execution: params.timing.execution, + trajectory: params.timing.trajectory, + } + : artifact; } async function writeMetricsArtifact(params: { @@ -2027,7 +2303,13 @@ async function writeMetricsArtifact(params: { readonly timingArtifactPath?: string | null; readonly fileChangesArtifactPath?: string; readonly timing?: TimingArtifact; -}): Promise & { readonly timing?: TimingArtifact }> { +}): Promise< + ReturnType & { + readonly duration?: TimingArtifact['duration']; + readonly tokens?: TimingArtifact['tokens']; + readonly cost?: TimingArtifact['cost']; + } +> { const artifactWithTiming = buildMetricsArtifactPayload(params); await writeFile(params.filePath, `${JSON.stringify(artifactWithTiming, null, 2)}\n`, 'utf8'); return artifactWithTiming; @@ -2183,6 +2465,133 @@ async function rewriteExistingIndexRecords( await writeJsonlFile(indexPath, records); } +async function readJsonFile(filePath: string): Promise { + const text = await readTextIfExists(filePath); + if (text === undefined) { + return undefined; + } + try { + return JSON.parse(text) as unknown; + } catch { + return undefined; + } +} + +function summaryRunId(summary: unknown, fallback: string): string { + if (!isRecord(summary)) { + return fallback; + } + const runId = isRecord(summary.metadata) ? summary.metadata.run_id : undefined; + return typeof runId === 'string' && runId.trim().length > 0 ? runId : fallback; +} + +function summaryTimestamp(summary: unknown): string | undefined { + if (!isRecord(summary) || !isRecord(summary.metadata)) { + return undefined; + } + return typeof summary.metadata.timestamp === 'string' ? summary.metadata.timestamp : undefined; +} + +function summaryTargets(summary: unknown): unknown { + return isRecord(summary) && isRecord(summary.metadata) ? 
summary.metadata.targets : undefined; +} + +function summaryTags(summary: unknown): unknown { + return isRecord(summary) && isRecord(summary.metadata) ? summary.metadata.tags : undefined; +} + +async function readJsonlFile(filePath: string): Promise { + const content = await readTextIfExists(filePath); + if (content === undefined) { + return []; + } + const records: unknown[] = []; + for (const line of content.split('\n')) { + if (line.trim().length === 0) { + continue; + } + try { + records.push(JSON.parse(line) as unknown); + } catch {} + } + return records; +} + +function buildCrossRunRunRecord(params: { + readonly runId: string; + readonly runDirName: string; + readonly summary: unknown; +}): Record { + const summary = isRecord(params.summary) ? params.summary : {}; + return dropUndefined({ + run_id: params.runId, + run_dir: params.runDirName, + summary_path: `${params.runDirName}/${RUN_SUMMARY_FILENAME}`, + index_path: `${params.runDirName}/${RUN_INTERNAL_DIRNAME}/${RESULT_INDEX_FILENAME}`, + timestamp: summaryTimestamp(summary), + targets: summaryTargets(summary), + tags: summaryTags(summary), + status: summary.status, + run_summary: summary.run_summary, + metrics: summary.metrics, + }); +} + +function buildCrossRunCaseRecord(params: { + readonly runId: string; + readonly runDirName: string; + readonly summary: unknown; + readonly caseRecord: unknown; +}): Record | undefined { + if (!isRecord(params.caseRecord)) { + return undefined; + } + return dropUndefined({ + ...params.caseRecord, + run_id: params.runId, + run_dir: params.runDirName, + run_timestamp: summaryTimestamp(params.summary), + run_tags: summaryTags(params.summary), + }); +} + +export async function rebuildCrossRunIndexes(resultsRoot: string): Promise { + const entries = await readdir(resultsRoot, { withFileTypes: true }).catch(() => []); + const runRecords: Record[] = []; + const caseRecords: Record[] = []; + + for (const entry of entries) { + if (!entry.isDirectory() || 
entry.name.startsWith('.')) { + continue; + } + const runDir = path.join(resultsRoot, entry.name); + const summary = await readJsonFile(path.join(runDir, RUN_SUMMARY_FILENAME)); + if (!summary) { + continue; + } + const indexPath = (await resolveExistingResultManifestPath(runDir)) ?? runIndexPath(runDir); + const runId = summaryRunId(summary, entry.name); + runRecords.push(buildCrossRunRunRecord({ runId, runDirName: entry.name, summary })); + const cases = await readJsonlFile(indexPath); + for (const caseRecord of cases) { + const projected = buildCrossRunCaseRecord({ + runId, + runDirName: entry.name, + summary, + caseRecord, + }); + if (projected) { + caseRecords.push(projected); + } + } + } + + const indexesDir = path.join(resultsRoot, CROSS_RUN_INDEX_DIRNAME); + await mkdir(indexesDir, { recursive: true }); + await writeJsonlFile(path.join(indexesDir, CROSS_RUN_RUNS_INDEX_FILENAME), runRecords); + await writeJsonlFile(path.join(indexesDir, CROSS_RUN_CASES_INDEX_FILENAME), caseRecords); +} + type ParsedEvaluationResult = Record & { timestamp: string; testId: string; @@ -2405,7 +2814,7 @@ export async function writePerTestArtifacts( } const isSingleRun = !hasPersistedTrialRuns(result); - const singleRunDir = path.join(testDir, attemptDirName(0)); + const singleRunDir = path.join(testDir, sampleDirName(0)); const singleAnswerPath = isSingleRun && result.output.length > 0 ? path.join(singleRunDir, 'outputs', 'answer.md') @@ -2419,7 +2828,6 @@ export async function writePerTestArtifacts( ? path.join(singleRunDir, 'transcript-raw.jsonl') : undefined; const singleGradingPath = isSingleRun ? path.join(singleRunDir, 'grading.json') : undefined; - const singleTimingPath = isSingleRun ? path.join(singleRunDir, 'timing.json') : undefined; const singleMetricsPath = isSingleRun ? 
path.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH) : undefined; @@ -2442,7 +2850,6 @@ export async function writePerTestArtifacts( resultDir: testDir, summaryPath: caseSummaryPath, gradingPath: singleGradingPath, - timingPath: singleTimingPath, metricsPath: singleMetricsPath, outputPath: singleAnswerPath, answerPath: singleAnswerPath, @@ -2485,7 +2892,7 @@ export async function writeArtifactsFromResults( }> { const testArtifactDir = outputDir; const summaryPath = path.join(outputDir, RUN_SUMMARY_FILENAME); - const indexPath = path.join(outputDir, RESULT_INDEX_FILENAME); + const indexPath = runIndexPath(outputDir); await mkdir(outputDir, { recursive: true }); const duplicatePolicy = options?.duplicatePolicy ?? 'update'; const resolvedTags = @@ -2521,7 +2928,7 @@ export async function writeArtifactsFromResults( const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME); const identityId = projectionIdentity.id; const isSingleRun = !hasPersistedTrialRuns(result); - const singleRunDir = path.join(testDir, attemptDirName(0)); + const singleRunDir = path.join(testDir, sampleDirName(0)); const singleAnswerPath = isSingleRun && result.output.length > 0 ? path.join(singleRunDir, 'outputs', 'answer.md') @@ -2535,7 +2942,6 @@ export async function writeArtifactsFromResults( ? path.join(singleRunDir, 'transcript-raw.jsonl') : undefined; const singleGradingPath = isSingleRun ? path.join(singleRunDir, 'grading.json') : undefined; - const singleTimingPath = isSingleRun ? path.join(singleRunDir, 'timing.json') : undefined; const singleMetricsPath = isSingleRun ? 
path.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH) : undefined; @@ -2553,7 +2959,6 @@ export async function writeArtifactsFromResults( singleTranscriptPath, singleTranscriptRawPath, singleGradingPath, - singleTimingPath, singleMetricsPath, singleFileChangesPath, identityId, @@ -2624,7 +3029,6 @@ export async function writeArtifactsFromResults( resultDir: plan.testDir, summaryPath: plan.caseSummaryPath, gradingPath: plan.singleGradingPath, - timingPath: plan.singleTimingPath, metricsPath: plan.singleMetricsPath, outputPath: plan.singleAnswerPath, answerPath: plan.singleAnswerPath, @@ -2668,7 +3072,12 @@ export async function writeArtifactsFromResults( ); await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8'); + await mkdir(path.dirname(indexPath), { recursive: true }); await writeJsonlFile(indexPath, indexRecords); + const resultsRoot = path.dirname(outputDir); + if (isCanonicalResultsRoot(resultsRoot)) { + await rebuildCrossRunIndexes(resultsRoot); + } return { testArtifactDir, summaryPath, indexPath }; } diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 3aa0de8d4..3d8eddd81 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -1166,7 +1166,7 @@ export interface TrialResult { readonly failureReasonCode?: string; /** * Full per-attempt result used by artifact writers to materialize AgentV - * attempt-N folders. This is intentionally omitted from compact wire summaries. + * sample-N folders. This is intentionally omitted from compact wire summaries. 
*/ readonly result?: EvaluationResult; } diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts index fa347b6b6..a9074a5f8 100644 --- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts +++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts @@ -132,10 +132,15 @@ describe('evaluate() — programmatic API extensions', () => { expect(result.artifacts).toBeDefined(); expect(result.artifacts?.runDir).toBe(outputDir); - expect(result.artifacts?.indexPath).toBe(path.join(outputDir, RESULT_INDEX_FILENAME)); + expect(result.artifacts?.indexPath).toBe( + path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), + ); expect(result.artifacts?.summaryPath).toBe(path.join(outputDir, 'summary.json')); - const indexContent = await readFile(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8'); + const indexContent = await readFile( + path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), + 'utf8', + ); expect(indexContent).toContain('"test_id":"programmatic-artifacts"'); expect(indexContent).toContain('"experiment":"sdk-test"'); const [indexRow] = indexContent @@ -153,20 +158,20 @@ describe('evaluate() — programmatic API extensions', () => { tests_run: string[]; eval_file: string; }; - timing: { duration_ms: number }; + metrics: { duration: { total_ms: number } }; }; expect(summaryArtifact.metadata.run_id).toBe(path.basename(outputDir)); expect(summaryArtifact.metadata.experiment).toBe('sdk-test'); expect(summaryArtifact.metadata.tests_run).toEqual(['programmatic-artifacts']); expect(summaryArtifact.metadata.eval_file).toBe(''); - expect(summaryArtifact.timing.duration_ms).toBeGreaterThanOrEqual(0); + expect(summaryArtifact.metrics.duration.total_ms).toBeGreaterThanOrEqual(0); expect(resultDir).toMatch(/^programmatic-artifacts--[a-f0-9]{12}$/); - expect(existsSync(path.join(outputDir, resultDir ?? 
'', 'attempt-1', 'grading.json'))).toBe( + expect(existsSync(path.join(outputDir, resultDir ?? '', 'sample-1', 'grading.json'))).toBe( true, ); expect( - existsSync(path.join(outputDir, resultDir ?? '', 'attempt-1', 'outputs', 'answer.md')), + existsSync(path.join(outputDir, resultDir ?? '', 'sample-1', 'outputs', 'answer.md')), ).toBe(true); } finally { rmSync(outputDir, { recursive: true, force: true }); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index f57e10d43..760306e80 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -825,13 +825,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, const outputDir = path.join(tempDir, 'artifacts'); await writeArtifactsFromResults([result], outputDir); - const indexRows = readFileSync(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8') + const indexRows = readFileSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), 'utf8') .trim() .split('\n') .map((line) => JSON.parse(line) as Record); const resultDir = indexRows[0]?.result_dir; expect(resultDir).toMatch(/^case-1--[a-f0-9]{12}$/); - const runDir = path.join(outputDir, resultDir ?? '', 'attempt-1'); + const runDir = path.join(outputDir, resultDir ?? 
'', 'sample-1'); const outputsDir = path.join(runDir, 'outputs'); expect(readdirSync(runDir)).not.toContain('provider.log'); expect(readdirSync(runDir)).toContain('transcript-raw.jsonl'); @@ -844,8 +844,8 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, expect(indexRows[0]?.raw_provider_log_path).toBeUndefined(); expect(indexRows[0]?.trace_path).toBeUndefined(); - expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/attempt-1/transcript.json`); - expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/attempt-1/transcript-raw.jsonl`); + expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/sample-1/transcript.json`); + expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/sample-1/transcript-raw.jsonl`); expect(existsSync(rawLogPath)).toBe(false); }); diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index fce1fc1ea..f1a8a758a 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -433,7 +433,7 @@ describe('listGitRuns', () => { experiment: 'with-skills', timestamp: '2026-05-21T11:00:00.000Z', display_name: 'remote friendly run', - manifest_path: '2026-05-21T11-00-00-000Z/.internal/index.jsonl', + index_path: '2026-05-21T11-00-00-000Z/.internal/index.jsonl', summary_path: '2026-05-21T11-00-00-000Z/summary.json', test_count: 3, pass_rate: 0.75, @@ -444,7 +444,7 @@ describe('listGitRuns', () => { experiment: 'default', display_name: '2026-05-20T10-00-00-000Z', target: 'gpt-4o', - manifest_path: '2026-05-20T10-00-00-000Z/.internal/index.jsonl', + index_path: '2026-05-20T10-00-00-000Z/.internal/index.jsonl', test_count: 2, pass_rate: 0.5, }); diff --git a/skills-data/agentv-eval-migrations/references/breaking-changes.md b/skills-data/agentv-eval-migrations/references/breaking-changes.md index 113cdb7b6..0f0f697d6 100644 --- a/skills-data/agentv-eval-migrations/references/breaking-changes.md +++ 
b/skills-data/agentv-eval-migrations/references/breaking-changes.md @@ -911,6 +911,13 @@ v4.42.4 docs described local run workspaces under workspaces under `.agentv/results//`, with experiment metadata stored in `summary.json` / rows rather than inferred from the path. +Within each run bundle, the per-run index is `.internal/index.jsonl` and +`summary.json` points to it with `index_path`. Per-sample execution folders are +named `sample-N`; use row fields such as `sample_index` and `retry_index` for +semantics. New writers emit `metrics.json` for duration, tokens, cost, +execution, and trajectory data; they do not emit `timing.json`, `timing_path`, +or nested `metrics.timing`. + Do not edit eval YAML just to chase result artifact path changes. Migrate only authored fields that the eval parser reads. Use: