EntityProcess · christso · Jul 3, 2026 · Jul 3, 2026
diff --git a/CONCEPTS.md b/CONCEPTS.md
@@ -26,19 +26,19 @@ Shared domain vocabulary for this project — entities, named processes, and sta
 
 **Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools.
 
-**Run bundle** — A committed local result directory at `.agentv/results/<run_id>/`. `summary.json` records run metadata such as `run_id` and `experiment`; `index.jsonl` records per-case rows.
+**Run bundle** — A committed local result directory at `.agentv/results/<run_id>/`. `summary.json` records run metadata such as `run_id` and `experiment`; `.internal/index.jsonl` records per-case rows.
 
 **Run manifest** — The root `summary.json` file in a run bundle. It owns aggregate run metadata and rollups such as `run_id`, `experiment`, timestamps, planned/completed counts, pass rate, score summaries, duration, tokens, and cost.
 
-**Result index** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, and `grading_path`.
+**Result index** — The `.internal/index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, `grading_path`, and `metrics_path`.
 
 **Result source identity** — The stable source identity for a result row: repo-relative `eval_path`, `test_id`, and `target`. `suite` and `name` are display metadata, not storage or routing identity.
 
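-**Result directory** — The `result_dir` field in a `index.jsonl` row. It is a run-local directory allocation for that row's sidecars and outputs, usually a readable test-id or slug prefix plus a UUID/hash-like suffix. Consumers discover it from `index.jsonl` and must not infer it from suite names, display names, test IDs, targets, models, or folder position.
+**Result directory** — The `result_dir` field in a result index (`.internal/index.jsonl`) row. It is a run-local directory allocation for that row's sidecars and outputs, usually a readable test-id or slug prefix plus a UUID/hash-like suffix. Consumers discover it from the result index and must not infer it from suite names, display names, test IDs, targets, models, or folder position.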
 
 **Artifact sidecar** — A file beside or below a result directory that provides evidence for a result, such as `summary.json`, `grading.json`, `result.json`, transcripts, logs, or outputs. Sidecars are evidence, not the primary discovery mechanism for a run.
 
-**Artifact attempt folder** — A per-case `attempt-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries should be represented with explicit sample/retry metadata rather than inferred from `attempt-1`, `attempt-2`, and so on.
+**Artifact sample folder** — A per-case `sample-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries are represented with explicit `sample_index` and `retry_index` metadata rather than inferred from folder position.
 
 ## Evaluation Reliability
 

diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -98,7 +98,6 @@ export function buildIndexArtifactEntry(
     outputDir: string;
     resultDir?: string;
     gradingPath?: string;
-    timingPath?: string;
     summaryPath?: string;
     outputPath?: string;
     answerPath?: string;

diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts
@@ -3,6 +3,7 @@ import path from 'node:path';
 
 export const RESULT_INDEX_FILENAME = 'index.jsonl';
 export const RUN_SUMMARY_FILENAME = 'summary.json';
+export const RUN_INTERNAL_DIRNAME = '.internal';
 export const RESULTS_DIRNAME = 'results';
 export const DEFAULT_EXPERIMENT_NAME = 'default';
 export const RESERVED_RESULTS_NAMESPACES = new Set(['export', 'metadata', 'runs']);
@@ -59,11 +60,11 @@ export function buildDefaultRunDir(
 }
 
 export function buildDefaultIndexPath(cwd: string, experiment?: string): string {
-  return path.join(buildDefaultRunDir(cwd, experiment), RESULT_INDEX_FILENAME);
+  return resolveRunIndexPath(buildDefaultRunDir(cwd, experiment));
 }
 
 export function resolveRunIndexPath(runDir: string): string {
-  return path.join(runDir, RESULT_INDEX_FILENAME);
+  return path.join(runDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
 }
 
 export function isRunManifestPath(filePath: string): boolean {
@@ -76,6 +77,11 @@ export function resolveExistingRunPrimaryPath(runDir: string): string | undefine
     return indexPath;
   }
 
+  const legacyIndexPath = path.join(runDir, RESULT_INDEX_FILENAME);
+  if (existsSync(legacyIndexPath)) {
+    return legacyIndexPath;
+  }
+
   return undefined;
 }
 

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -74,6 +74,7 @@ import {
   createRunDirName,
   discoverRunManifestPaths,
   normalizeExperimentName,
+  resolveRunIndexPath,
 } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -1236,7 +1237,7 @@ class RunOutputWriter implements OutputWriter {
     private readonly invocationDir: string,
     private readonly appendMode: boolean,
   ) {
-    this.indexPath = path.join(invocationDir, RESULT_INDEX_FILENAME);
+    this.indexPath = resolveRunIndexPath(invocationDir);
   }
 
   async append(result: EvaluationResult): Promise<void> {
@@ -1280,7 +1281,11 @@ async function resolveRerunFailedRunDir(cwd: string, source: string): Promise<st
 
   const candidate = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
   if (existsSync(candidate)) {
-    return path.basename(candidate) === RESULT_INDEX_FILENAME ? path.dirname(candidate) : candidate;
+    if (path.basename(candidate) !== RESULT_INDEX_FILENAME) {
+      return candidate;
+    }
+    const manifestDir = path.dirname(candidate);
+    return path.basename(manifestDir) === '.internal' ? path.dirname(manifestDir) : manifestDir;
   }
 
   const runIdCandidate = path.join(cwd, '.agentv', 'results', trimmed);
@@ -2622,7 +2627,7 @@ export async function runEvalCommand(
           runtimeSource: runtimeSourceMetadata,
           tags: emittedTags,
         });
-        const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
+        const indexPath = resolveRunIndexPath(runDir);
         console.log(`Artifact bundle updated: ${runDir}`);
         console.log(`  Run manifest: ${indexPath}`);
         console.log(`  Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);

diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
@@ -6,11 +6,11 @@
  *
  * Writes:
  *   - <test-id>/grading.json  (per-test grading breakdown)
- *   - index.jsonl      (one line per test)
+ *   - .internal/index.jsonl  (one line per test)
  *   - summary.json            (aggregate statistics)
  */
 import { existsSync } from 'node:fs';
-import { readFile, readdir, writeFile } from 'node:fs/promises';
+import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 
 import { command, positional, string } from 'cmd-ts';
@@ -174,17 +174,17 @@ export const evalBenchCommand = command({
         })),
       }));
 
-      // Read execution_status from timing.json (written by pipeline run)
+      // Read execution_status from metrics.json (written by pipeline run)
       let executionStatus = 'ok';
-      const timingPath = join(testDir, 'timing.json');
-      if (existsSync(timingPath)) {
+      const metricsPath = join(testDir, 'metrics.json');
+      if (existsSync(metricsPath)) {
         try {
-          const timing = JSON.parse(await readFile(timingPath, 'utf8'));
-          if (typeof timing.execution_status === 'string') {
-            executionStatus = timing.execution_status;
+          const metrics = JSON.parse(await readFile(metricsPath, 'utf8'));
+          if (typeof metrics.execution?.status === 'string') {
+            executionStatus = metrics.execution.status;
           }
         } catch {
-          // Fall back to 'ok' if timing.json is unreadable
+          // Fall back to 'ok' if metrics.json is unreadable
         }
       }
 
@@ -200,23 +200,21 @@ export const evalBenchCommand = command({
           scores,
           execution_status: executionStatus,
           grading_path: `${artifactSubdir}/grading.json`,
-          timing_path: `${artifactSubdir}/timing.json`,
+          metrics_path: `${artifactSubdir}/metrics.json`,
           response_path: hasResponse ? `${artifactSubdir}/response.md` : undefined,
         }),
       );
     }
 
-    // Write row-level run manifest.
+    // Write the row-level result index (.internal/index.jsonl).
-    await writeFile(
-      join(exportDir, RESULT_INDEX_FILENAME),
-      indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '',
-      'utf8',
-    );
+    const indexPath = join(exportDir, '.internal', RESULT_INDEX_FILENAME);
+    await mkdir(join(exportDir, '.internal'), { recursive: true });
+    await writeFile(indexPath, indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '', 'utf8');
 
     // Write summary.json
     const passRateStats = computeStats(allPassRates);
     const summary = {
-      manifest_path: RESULT_INDEX_FILENAME,
+      index_path: `.internal/${RESULT_INDEX_FILENAME}`,
       metadata: {
         eval_file: manifest.eval_file,
         timestamp: manifest.timestamp,

diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
@@ -4,7 +4,7 @@
  *
  * Equivalent to running:
  *   1. `agentv pipeline input <eval> --out <dir>`
- *   2. Invoking each CLI target in parallel (writing response.md + timing.json)
+ *   2. Invoking each CLI target in parallel (writing response.md + metrics.json)
  *   3. `agentv pipeline grade <dir>`
  *
  * For `kind: agent` targets, step 2 is skipped (subagent handles execution).
@@ -289,10 +289,15 @@ export const evalRunCommand = command({
           }
 
           await writeFile(join(testDir, 'response.md'), response, 'utf8');
-          await writeJson(join(testDir, 'timing.json'), {
-            duration_ms: durationMs,
-            total_duration_seconds: Math.round(durationMs / 10) / 100,
-            execution_status: 'ok',
+          await writeJson(join(testDir, 'metrics.json'), {
+            duration: {
+              total_ms: durationMs,
+              total_seconds: Math.round(durationMs / 10) / 100,
+              source: 'provider_reported',
+            },
+            tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' },
+            cost: { usd: null, source: 'unavailable' },
+            execution: { status: 'ok' },
           });
 
           process.stderr.write(`\n  ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`);
@@ -301,10 +306,15 @@ export const evalRunCommand = command({
           const message = error instanceof Error ? error.message : String(error);
           const response = `ERROR: target failed — ${message}`;
           await writeFile(join(testDir, 'response.md'), response, 'utf8');
-          await writeJson(join(testDir, 'timing.json'), {
-            duration_ms: durationMs,
-            total_duration_seconds: Math.round(durationMs / 10) / 100,
-            execution_status: 'execution_error',
+          await writeJson(join(testDir, 'metrics.json'), {
+            duration: {
+              total_ms: durationMs,
+              total_seconds: Math.round(durationMs / 10) / 100,
+              source: 'provider_reported',
+            },
+            tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' },
+            cost: { usd: null, source: 'unavailable' },
+            execution: { status: 'execution_error' },
           });
           process.stderr.write(
             `\n  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`,

diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
@@ -8,9 +8,9 @@
- *     index.jsonl              — per-test manifest with artifact pointers
+ *     .internal/index.jsonl    — per-test manifest with artifact pointers
  *     <test-id>/
  *       summary.json           — per-case aggregate
- *       attempt-1/result.json  — per-attempt result
- *       attempt-1/grading.json — per-attempt grading artifact (assertions, graders)
- *       attempt-1/metrics.json — per-attempt metrics artifact
+ *       sample-1/result.json   — per-sample result
+ *       sample-1/grading.json  — per-sample grading artifact (assertions, graders)
+ *       sample-1/metrics.json  — per-sample metrics artifact
  *
  * This module delegates artifact building to the shared artifact-writer so
- * that summary/grading/timing schemas stay aligned with `agentv eval`.
+ * that summary/grading/metrics schemas stay aligned with `agentv eval`.
@@ -36,6 +36,7 @@ import type {
 import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js';
 import {
   RESULT_INDEX_FILENAME,
+  RUN_INTERNAL_DIRNAME,
   isReservedResultsNamespace,
   isRunManifestPath,
 } from '../eval/result-layout.js';
@@ -69,7 +70,7 @@ export async function exportResults(
     duplicatePolicy: options?.duplicatePolicy ?? 'update',
     additionalArtifacts: createExportBundleArtifactsWriter({
       outputDir,
-      sourceBaseDir: path.dirname(sourceFile),
+      sourceBaseDir: runRootFromIndexPath(sourceFile),
       sourceRecordsByResult: buildSourceRecordMap(results, sourceIndexRecords),
     }),
   });
@@ -85,7 +86,7 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
     throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
   }
 
-  const runDir = path.dirname(sourceFile);
+  const runDir = runRootFromIndexPath(sourceFile);
   const segments = path.normalize(runDir).split(path.sep).filter(Boolean);
   const resultsIndex = segments.lastIndexOf('results');
   if (resultsIndex >= 0 && resultsIndex < segments.length - 2) {
@@ -104,11 +105,19 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
 
 export function deriveExportRunId(sourceFile: string): string {
   if (isRunManifestPath(sourceFile)) {
-    return path.basename(path.dirname(sourceFile));
+    return path.basename(runRootFromIndexPath(sourceFile));
   }
   return path.basename(sourceFile, path.extname(sourceFile));
 }
 
+function runRootFromIndexPath(sourceFile: string): string {
+  const indexDir = path.dirname(sourceFile);
+  if (path.basename(indexDir) === RUN_INTERNAL_DIRNAME) {
+    return path.dirname(indexDir);
+  }
+  return indexDir;
+}
+
 export async function loadExportSource(
   source: string | undefined,
   cwd: string,
@@ -222,7 +231,7 @@ export function buildProjectionBundleFromExportedIndex(options: {
   readonly includeRawContent?: boolean;
   readonly duplicatePolicy?: ExportDuplicatePolicy;
 }): ProjectionBundle {
-  const indexPath = path.join(options.outputDir, RESULT_INDEX_FILENAME);
+  const indexPath = path.join(options.outputDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
   const indexRecords = readIndexArtifactEntries(indexPath);
   const emittedResults = loadManifestResults(indexPath);
 
@@ -327,7 +336,7 @@ export const resultsExportCommand = command({
         duplicatePolicy: policy,
         additionalArtifacts: createExportBundleArtifactsWriter({
           outputDir,
-          sourceBaseDir: path.dirname(sourceFile),
+          sourceBaseDir: runRootFromIndexPath(sourceFile),
           sourceRecordsByResult: buildSourceRecordMap(results, indexRecords ?? []),
         }),
       });