Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CONCEPTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,19 @@ Shared domain vocabulary for this project — entities, named processes, and sta

**Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools.

**Run bundle** — A committed local result directory at `.agentv/results/<run_id>/`. `summary.json` records run metadata such as `run_id` and `experiment`; `index.jsonl` records per-case rows.
**Run bundle** — A committed local result directory at `.agentv/results/<run_id>/`. `summary.json` records run metadata such as `run_id` and `experiment`; `.internal/index.jsonl` records per-case rows.

**Run manifest** — The root `summary.json` file in a run bundle. It owns aggregate run metadata and rollups such as `run_id`, `experiment`, timestamps, planned/completed counts, pass rate, score summaries, duration, tokens, and cost.

**Result index** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, and `grading_path`.
**Result index** — The `.internal/index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, `grading_path`, and `metrics_path`.

**Result source identity** — The stable source identity for a result row: repo-relative `eval_path`, `test_id`, and `target`. `suite` and `name` are display metadata, not storage or routing identity.

**Result directory** — The `result_dir` field in an `index.jsonl` row. It is a run-local directory allocation for that row's sidecars and outputs, usually a readable test-id or slug prefix plus a UUID/hash-like suffix. Consumers discover it from `index.jsonl` and must not infer it from suite names, display names, test IDs, targets, models, or folder position.

**Artifact sidecar** — A file beside or below a result directory that provides evidence for a result, such as `summary.json`, `grading.json`, `result.json`, transcripts, logs, or outputs. Sidecars are evidence, not the primary discovery mechanism for a run.

**Artifact attempt folder** — A per-case `attempt-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries should be represented with explicit sample/retry metadata rather than inferred from `attempt-1`, `attempt-2`, and so on.
**Artifact sample folder** — A per-case `sample-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries are represented with explicit `sample_index` and `retry_index` metadata rather than inferred from folder position.

## Evaluation Reliability

Expand Down
1 change: 0 additions & 1 deletion apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ export function buildIndexArtifactEntry(
outputDir: string;
resultDir?: string;
gradingPath?: string;
timingPath?: string;
summaryPath?: string;
outputPath?: string;
answerPath?: string;
Expand Down
10 changes: 8 additions & 2 deletions apps/cli/src/commands/eval/result-layout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import path from 'node:path';

export const RESULT_INDEX_FILENAME = 'index.jsonl';
export const RUN_SUMMARY_FILENAME = 'summary.json';
export const RUN_INTERNAL_DIRNAME = '.internal';
export const RESULTS_DIRNAME = 'results';
export const DEFAULT_EXPERIMENT_NAME = 'default';
export const RESERVED_RESULTS_NAMESPACES = new Set(['export', 'metadata', 'runs']);
Expand Down Expand Up @@ -59,11 +60,11 @@ export function buildDefaultRunDir(
}

export function buildDefaultIndexPath(cwd: string, experiment?: string): string {
return path.join(buildDefaultRunDir(cwd, experiment), RESULT_INDEX_FILENAME);
return resolveRunIndexPath(buildDefaultRunDir(cwd, experiment));
}

export function resolveRunIndexPath(runDir: string): string {
return path.join(runDir, RESULT_INDEX_FILENAME);
return path.join(runDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
}

export function isRunManifestPath(filePath: string): boolean {
Expand All @@ -76,6 +77,11 @@ export function resolveExistingRunPrimaryPath(runDir: string): string | undefine
return indexPath;
}

const legacyIndexPath = path.join(runDir, RESULT_INDEX_FILENAME);
if (existsSync(legacyIndexPath)) {
return legacyIndexPath;
}

return undefined;
}

Expand Down
11 changes: 8 additions & 3 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ import {
createRunDirName,
discoverRunManifestPaths,
normalizeExperimentName,
resolveRunIndexPath,
} from './result-layout.js';
import {
buildExclusionFilter,
Expand Down Expand Up @@ -1236,7 +1237,7 @@ class RunOutputWriter implements OutputWriter {
private readonly invocationDir: string,
private readonly appendMode: boolean,
) {
this.indexPath = path.join(invocationDir, RESULT_INDEX_FILENAME);
this.indexPath = resolveRunIndexPath(invocationDir);
}

async append(result: EvaluationResult): Promise<void> {
Expand Down Expand Up @@ -1280,7 +1281,11 @@ async function resolveRerunFailedRunDir(cwd: string, source: string): Promise<st

const candidate = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
if (existsSync(candidate)) {
return path.basename(candidate) === RESULT_INDEX_FILENAME ? path.dirname(candidate) : candidate;
if (path.basename(candidate) !== RESULT_INDEX_FILENAME) {
return candidate;
}
const manifestDir = path.dirname(candidate);
return path.basename(manifestDir) === '.internal' ? path.dirname(manifestDir) : manifestDir;
}

const runIdCandidate = path.join(cwd, '.agentv', 'results', trimmed);
Expand Down Expand Up @@ -2622,7 +2627,7 @@ export async function runEvalCommand(
runtimeSource: runtimeSourceMetadata,
tags: emittedTags,
});
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
const indexPath = resolveRunIndexPath(runDir);
console.log(`Artifact bundle updated: ${runDir}`);
console.log(` Run manifest: ${indexPath}`);
console.log(` Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
Expand Down
30 changes: 14 additions & 16 deletions apps/cli/src/commands/pipeline/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
*
* Writes:
* - <test-id>/grading.json (per-test grading breakdown)
* - index.jsonl (one line per test)
* - .internal/index.jsonl (one line per test)
* - summary.json (aggregate statistics)
*/
import { existsSync } from 'node:fs';
import { readFile, readdir, writeFile } from 'node:fs/promises';
import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
import { join } from 'node:path';

import { command, positional, string } from 'cmd-ts';
Expand Down Expand Up @@ -174,17 +174,17 @@ export const evalBenchCommand = command({
})),
}));

// Read execution_status from timing.json (written by pipeline run)
// Read execution_status from metrics.json (written by pipeline run)
let executionStatus = 'ok';
const timingPath = join(testDir, 'timing.json');
if (existsSync(timingPath)) {
const metricsPath = join(testDir, 'metrics.json');
if (existsSync(metricsPath)) {
try {
const timing = JSON.parse(await readFile(timingPath, 'utf8'));
if (typeof timing.execution_status === 'string') {
executionStatus = timing.execution_status;
const metrics = JSON.parse(await readFile(metricsPath, 'utf8'));
if (typeof metrics.execution?.status === 'string') {
executionStatus = metrics.execution.status;
}
} catch {
// Fall back to 'ok' if timing.json is unreadable
// Fall back to 'ok' if metrics.json is unreadable
}
}

Expand All @@ -200,23 +200,21 @@ export const evalBenchCommand = command({
scores,
execution_status: executionStatus,
grading_path: `${artifactSubdir}/grading.json`,
timing_path: `${artifactSubdir}/timing.json`,
metrics_path: `${artifactSubdir}/metrics.json`,
response_path: hasResponse ? `${artifactSubdir}/response.md` : undefined,
}),
);
}

// Write row-level run manifest.
await writeFile(
join(exportDir, RESULT_INDEX_FILENAME),
indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '',
'utf8',
);
const indexPath = join(exportDir, '.internal', RESULT_INDEX_FILENAME);
await mkdir(join(exportDir, '.internal'), { recursive: true });
await writeFile(indexPath, indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '', 'utf8');

// Write summary.json
const passRateStats = computeStats(allPassRates);
const summary = {
manifest_path: RESULT_INDEX_FILENAME,
index_path: `.internal/${RESULT_INDEX_FILENAME}`,
metadata: {
eval_file: manifest.eval_file,
timestamp: manifest.timestamp,
Expand Down
28 changes: 19 additions & 9 deletions apps/cli/src/commands/pipeline/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* Equivalent to running:
* 1. `agentv pipeline input <eval> --out <dir>`
* 2. Invoking each CLI target in parallel (writing response.md + timing.json)
* 2. Invoking each CLI target in parallel (writing response.md + metrics.json)
* 3. `agentv pipeline grade <dir>`
*
* For `kind: agent` targets, step 2 is skipped (subagent handles execution).
Expand Down Expand Up @@ -289,10 +289,15 @@ export const evalRunCommand = command({
}

await writeFile(join(testDir, 'response.md'), response, 'utf8');
await writeJson(join(testDir, 'timing.json'), {
duration_ms: durationMs,
total_duration_seconds: Math.round(durationMs / 10) / 100,
execution_status: 'ok',
await writeJson(join(testDir, 'metrics.json'), {
duration: {
total_ms: durationMs,
total_seconds: Math.round(durationMs / 10) / 100,
source: 'provider_reported',
},
tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' },
cost: { usd: null, source: 'unavailable' },
execution: { status: 'ok' },
});

process.stderr.write(`\n ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`);
Expand All @@ -301,10 +306,15 @@ export const evalRunCommand = command({
const message = error instanceof Error ? error.message : String(error);
const response = `ERROR: target failed — ${message}`;
await writeFile(join(testDir, 'response.md'), response, 'utf8');
await writeJson(join(testDir, 'timing.json'), {
duration_ms: durationMs,
total_duration_seconds: Math.round(durationMs / 10) / 100,
execution_status: 'execution_error',
await writeJson(join(testDir, 'metrics.json'), {
duration: {
total_ms: durationMs,
total_seconds: Math.round(durationMs / 10) / 100,
source: 'provider_reported',
},
tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' },
cost: { usd: null, source: 'unavailable' },
execution: { status: 'execution_error' },
});
process.stderr.write(
`\n ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`,
Expand Down
25 changes: 17 additions & 8 deletions apps/cli/src/commands/results/export.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
* .internal/index.jsonl — per-test manifest with artifact pointers
* <test-id>/
* summary.json — per-case aggregate
* attempt-1/result.json — per-attempt result
* attempt-1/grading.json — per-attempt grading artifact (assertions, graders)
* attempt-1/metrics.json — per-attempt metrics artifact
* sample-1/result.json — per-sample result
* sample-1/grading.json — per-sample grading artifact (assertions, graders)
* sample-1/metrics.json — per-sample metrics artifact
*
* This module delegates artifact building to the shared artifact-writer so
* that summary/grading/metrics schemas stay aligned with `agentv eval`.
Expand All @@ -36,6 +36,7 @@ import type {
import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js';
import {
RESULT_INDEX_FILENAME,
RUN_INTERNAL_DIRNAME,
isReservedResultsNamespace,
isRunManifestPath,
} from '../eval/result-layout.js';
Expand Down Expand Up @@ -69,7 +70,7 @@ export async function exportResults(
duplicatePolicy: options?.duplicatePolicy ?? 'update',
additionalArtifacts: createExportBundleArtifactsWriter({
outputDir,
sourceBaseDir: path.dirname(sourceFile),
sourceBaseDir: runRootFromIndexPath(sourceFile),
sourceRecordsByResult: buildSourceRecordMap(results, sourceIndexRecords),
}),
});
Expand All @@ -85,7 +86,7 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
}

const runDir = path.dirname(sourceFile);
const runDir = runRootFromIndexPath(sourceFile);
const segments = path.normalize(runDir).split(path.sep).filter(Boolean);
const resultsIndex = segments.lastIndexOf('results');
if (resultsIndex >= 0 && resultsIndex < segments.length - 2) {
Expand All @@ -104,11 +105,19 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {

export function deriveExportRunId(sourceFile: string): string {
if (isRunManifestPath(sourceFile)) {
return path.basename(path.dirname(sourceFile));
return path.basename(runRootFromIndexPath(sourceFile));
}
return path.basename(sourceFile, path.extname(sourceFile));
}

/**
 * Resolves the run root directory for a given result index file path.
 *
 * When the index file sits inside a directory named `RUN_INTERNAL_DIRNAME`,
 * the run root is that directory's parent. Otherwise the directory that
 * directly contains the index file is returned.
 *
 * @param sourceFile - Path to the result index file.
 * @returns The directory treated as the run root for that index file.
 */
function runRootFromIndexPath(sourceFile: string): string {
  const containingDir = path.dirname(sourceFile);
  const isInsideInternalDir = path.basename(containingDir) === RUN_INTERNAL_DIRNAME;
  // Step up one more level only when the index is nested under the internal directory.
  return isInsideInternalDir ? path.dirname(containingDir) : containingDir;
}

export async function loadExportSource(
source: string | undefined,
cwd: string,
Expand Down Expand Up @@ -222,7 +231,7 @@ export function buildProjectionBundleFromExportedIndex(options: {
readonly includeRawContent?: boolean;
readonly duplicatePolicy?: ExportDuplicatePolicy;
}): ProjectionBundle {
const indexPath = path.join(options.outputDir, RESULT_INDEX_FILENAME);
const indexPath = path.join(options.outputDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
const indexRecords = readIndexArtifactEntries(indexPath);
const emittedResults = loadManifestResults(indexPath);

Expand Down Expand Up @@ -327,7 +336,7 @@ export const resultsExportCommand = command({
duplicatePolicy: policy,
additionalArtifacts: createExportBundleArtifactsWriter({
outputDir,
sourceBaseDir: path.dirname(sourceFile),
sourceBaseDir: runRootFromIndexPath(sourceFile),
sourceRecordsByResult: buildSourceRecordMap(results, indexRecords ?? []),
}),
});
Expand Down
Loading
Loading