From 1d327bef5e44ecceeba15d87b15dcd3e96b9fa56 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 3 Jul 2026 15:07:22 +0200
Subject: [PATCH] feat(results): implement ADR-0017 bundle content

---
 CONCEPTS.md                                   |   6 +-
 apps/cli/src/commands/eval/artifact-writer.ts |   1 -
 apps/cli/src/commands/eval/result-layout.ts   |  10 +-
 apps/cli/src/commands/eval/run-eval.ts        |  11 +-
 apps/cli/src/commands/pipeline/bench.ts       |  30 +-
 apps/cli/src/commands/pipeline/run.ts         |  28 +-
 apps/cli/src/commands/results/export.ts       |  25 +-
 apps/cli/src/commands/results/manifest.ts     |  60 +-
 .../src/commands/results/projection-bundle.ts |   8 +-
 apps/cli/src/commands/results/remote.ts       |   5 +-
 apps/cli/src/commands/results/report.ts       |  11 +-
 apps/cli/src/commands/results/serve.ts        |  51 +-
 apps/cli/src/commands/results/validate.ts     |  23 +-
 apps/cli/test/commands/eval/aggregate.test.ts |  28 +-
 .../commands/eval/artifact-writer.test.ts     | 326 ++++------
 apps/cli/test/commands/eval/bundle.test.ts    |   2 +-
 .../test/commands/eval/pipeline/bench.test.ts |   6 +-
 .../eval/pipeline/pipeline-e2e.test.ts        |   2 +-
 .../test/commands/eval/result-layout.test.ts  |   9 +-
 apps/cli/test/commands/eval/run-cache.test.ts |   4 +-
 .../commands/grade/grade-prepared.test.ts     |  16 +-
 .../results/export-e2e-providers.test.ts      |  68 +-
 apps/cli/test/commands/results/export.test.ts |  77 +--
 apps/cli/test/commands/results/report.test.ts |   2 +-
 .../test/commands/results/validate.test.ts    |   8 +-
 apps/cli/test/commands/runs/rerun.test.ts     |   5 +-
 apps/cli/test/eval.integration.test.ts        |  37 +-
 .../docs/next/evaluation/running-evals.mdx    |  22 +-
 .../docs/next/getting-started/quickstart.mdx  |   2 +-
 .../docs/docs/next/guides/autoresearch.mdx    |   4 +-
 .../docs/next/reference/result-artifacts.mdx  |  71 +-
 .../content/docs/docs/next/tools/compare.mdx  |  12 +-
 .../content/docs/docs/next/tools/inspect.mdx  |   2 +-
 .../content/docs/docs/next/tools/results.mdx  |  18 +-
 .../content/docs/docs/next/tools/trend.mdx    |   6 +-
 .../docs/docs/next/tools/wip-checkpoints.mdx  |   2 +-
 packages/core/src/evaluation/metrics.ts       |  65 +-
 .../src/evaluation/results-repo-cache.test.ts |   2 +-
 packages/core/src/evaluation/results-repo.ts  |  22 +-
 packages/core/src/evaluation/run-artifacts.ts | 607 +++++++++++++++---
 packages/core/src/evaluation/types.ts         |   2 +-
 .../evaluate-programmatic-api.test.ts         |  17 +-
 .../core/test/evaluation/orchestrator.test.ts |   8 +-
 .../core/test/evaluation/results-repo.test.ts |   4 +-
 .../references/breaking-changes.md            |   7 +
 45 files changed, 1131 insertions(+), 601 deletions(-)

diff --git a/CONCEPTS.md b/CONCEPTS.md
index 7c2f3cc65..cb3c73a8e 100644
--- a/CONCEPTS.md
+++ b/CONCEPTS.md
@@ -26,11 +26,11 @@ Shared domain vocabulary for this project — entities, named processes, and sta
 
 **Workspace** — The task environment an eval prepares for the agent: repositories, templates, fixture files, and lifecycle hooks. It is not prompt input; use `input` for instructions and `workspace.repos[]` for multi-repo workspaces the agent can inspect or modify through tools.
 
-**Run bundle** — A committed local result directory at `.agentv/results/<run_id>/`. `summary.json` records run metadata such as `run_id` and `experiment`; `index.jsonl` records per-case rows.
+**Run bundle** — A committed local result directory at `.agentv/results/<run_id>/`. `summary.json` records run metadata such as `run_id` and `experiment`; `.internal/index.jsonl` records per-case rows.
 
 **Run manifest** — The root `summary.json` file in a run bundle. It owns aggregate run metadata and rollups such as `run_id`, `experiment`, timestamps, planned/completed counts, pass rate, score summaries, duration, tokens, and cost.
 
-**Result index** — The root `index.jsonl` file in a run bundle. It is the dashboard and tooling loading contract for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, and `grading_path`.
+**Result index** — The `.internal/index.jsonl` file in a run bundle. It is the loading contract that the dashboard and tooling rely on for per-case result rows and artifact locations, including fields such as `result_dir`, `test_dir`, `summary_path`, `grading_path`, and `metrics_path`.
 
 **Result source identity** — The stable source identity for a result row: repo-relative `eval_path`, `test_id`, and `target`. `suite` and `name` are display metadata, not storage or routing identity.
 
@@ -38,7 +38,7 @@ Shared domain vocabulary for this project — entities, named processes, and sta
 
 **Artifact sidecar** — A file beside or below a result directory that provides evidence for a result, such as `summary.json`, `grading.json`, `result.json`, transcripts, logs, or outputs. Sidecars are evidence, not the primary discovery mechanism for a run.
 
-**Artifact attempt folder** — A per-case `attempt-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries should be represented with explicit sample/retry metadata rather than inferred from `attempt-1`, `attempt-2`, and so on.
+**Artifact sample folder** — A per-case `sample-N/` folder under a result directory. It stores one materialized execution's sidecars and outputs. It is not the primary comparison dimension: stochastic samples and infrastructure retries are represented with explicit `sample_index` and `retry_index` metadata rather than inferred from folder position.
 
 ## Evaluation Reliability
 
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 818729287..afb4246b5 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -98,7 +98,6 @@ export function buildIndexArtifactEntry(
     outputDir: string;
     resultDir?: string;
     gradingPath?: string;
-    timingPath?: string;
     summaryPath?: string;
     outputPath?: string;
     answerPath?: string;
diff --git a/apps/cli/src/commands/eval/result-layout.ts b/apps/cli/src/commands/eval/result-layout.ts
index dc9f1efa4..12a826e7e 100644
--- a/apps/cli/src/commands/eval/result-layout.ts
+++ b/apps/cli/src/commands/eval/result-layout.ts
@@ -3,6 +3,7 @@ import path from 'node:path';
 
 export const RESULT_INDEX_FILENAME = 'index.jsonl';
 export const RUN_SUMMARY_FILENAME = 'summary.json';
+export const RUN_INTERNAL_DIRNAME = '.internal';
 export const RESULTS_DIRNAME = 'results';
 export const DEFAULT_EXPERIMENT_NAME = 'default';
 export const RESERVED_RESULTS_NAMESPACES = new Set(['export', 'metadata', 'runs']);
@@ -59,11 +60,11 @@ export function buildDefaultRunDir(
 }
 
 export function buildDefaultIndexPath(cwd: string, experiment?: string): string {
-  return path.join(buildDefaultRunDir(cwd, experiment), RESULT_INDEX_FILENAME);
+  return resolveRunIndexPath(buildDefaultRunDir(cwd, experiment));
 }
 
 export function resolveRunIndexPath(runDir: string): string {
-  return path.join(runDir, RESULT_INDEX_FILENAME);
+  return path.join(runDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
 }
 
 export function isRunManifestPath(filePath: string): boolean {
@@ -76,6 +77,11 @@ export function resolveExistingRunPrimaryPath(runDir: string): string | undefine
     return indexPath;
   }
 
+  const legacyIndexPath = path.join(runDir, RESULT_INDEX_FILENAME);
+  if (existsSync(legacyIndexPath)) {
+    return legacyIndexPath;
+  }
+
   return undefined;
 }
 
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index f7b22916a..9c25d8dbf 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -74,6 +74,7 @@ import {
   createRunDirName,
   discoverRunManifestPaths,
   normalizeExperimentName,
+  resolveRunIndexPath,
 } from './result-layout.js';
 import {
   buildExclusionFilter,
@@ -1236,7 +1237,7 @@ class RunOutputWriter implements OutputWriter {
     private readonly invocationDir: string,
     private readonly appendMode: boolean,
   ) {
-    this.indexPath = path.join(invocationDir, RESULT_INDEX_FILENAME);
+    this.indexPath = resolveRunIndexPath(invocationDir);
   }
 
   async append(result: EvaluationResult): Promise<void> {
@@ -1280,7 +1281,11 @@ async function resolveRerunFailedRunDir(cwd: string, source: string): Promise<st
 
   const candidate = path.isAbsolute(trimmed) ? trimmed : path.resolve(cwd, trimmed);
   if (existsSync(candidate)) {
-    return path.basename(candidate) === RESULT_INDEX_FILENAME ? path.dirname(candidate) : candidate;
+    if (path.basename(candidate) !== RESULT_INDEX_FILENAME) {
+      return candidate;
+    }
+    const manifestDir = path.dirname(candidate);
+    return path.basename(manifestDir) === '.internal' ? path.dirname(manifestDir) : manifestDir;
   }
 
   const runIdCandidate = path.join(cwd, '.agentv', 'results', trimmed);
@@ -2622,7 +2627,7 @@ export async function runEvalCommand(
           runtimeSource: runtimeSourceMetadata,
           tags: emittedTags,
         });
-        const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
+        const indexPath = resolveRunIndexPath(runDir);
         console.log(`Artifact bundle updated: ${runDir}`);
         console.log(`  Run manifest: ${indexPath}`);
         console.log(`  Per-test artifacts: ${runDir} (${allResults.length} new test directories)`);
diff --git a/apps/cli/src/commands/pipeline/bench.ts b/apps/cli/src/commands/pipeline/bench.ts
index e12db1a69..855833b57 100644
--- a/apps/cli/src/commands/pipeline/bench.ts
+++ b/apps/cli/src/commands/pipeline/bench.ts
@@ -6,11 +6,11 @@
  *
  * Writes:
  *   - <test-id>/grading.json  (per-test grading breakdown)
- *   - index.jsonl      (one line per test)
+ *   - .internal/index.jsonl  (one line per test)
  *   - summary.json            (aggregate statistics)
  */
 import { existsSync } from 'node:fs';
-import { readFile, readdir, writeFile } from 'node:fs/promises';
+import { mkdir, readFile, readdir, writeFile } from 'node:fs/promises';
 import { join } from 'node:path';
 
 import { command, positional, string } from 'cmd-ts';
@@ -174,17 +174,17 @@ export const evalBenchCommand = command({
         })),
       }));
 
-      // Read execution_status from timing.json (written by pipeline run)
+      // Read execution.status from metrics.json (written by pipeline run)
       let executionStatus = 'ok';
-      const timingPath = join(testDir, 'timing.json');
-      if (existsSync(timingPath)) {
+      const metricsPath = join(testDir, 'metrics.json');
+      if (existsSync(metricsPath)) {
         try {
-          const timing = JSON.parse(await readFile(timingPath, 'utf8'));
-          if (typeof timing.execution_status === 'string') {
-            executionStatus = timing.execution_status;
+          const metrics = JSON.parse(await readFile(metricsPath, 'utf8'));
+          if (typeof metrics.execution?.status === 'string') {
+            executionStatus = metrics.execution.status;
           }
         } catch {
-          // Fall back to 'ok' if timing.json is unreadable
+          // Fall back to 'ok' if metrics.json is unreadable
         }
       }
 
@@ -200,23 +200,21 @@ export const evalBenchCommand = command({
           scores,
           execution_status: executionStatus,
           grading_path: `${artifactSubdir}/grading.json`,
-          timing_path: `${artifactSubdir}/timing.json`,
+          metrics_path: `${artifactSubdir}/metrics.json`,
           response_path: hasResponse ? `${artifactSubdir}/response.md` : undefined,
         }),
       );
     }
 
     // Write row-level run manifest.
-    await writeFile(
-      join(exportDir, RESULT_INDEX_FILENAME),
-      indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '',
-      'utf8',
-    );
+    const indexPath = join(exportDir, '.internal', RESULT_INDEX_FILENAME);
+    await mkdir(join(exportDir, '.internal'), { recursive: true });
+    await writeFile(indexPath, indexLines.length > 0 ? `${indexLines.join('\n')}\n` : '', 'utf8');
 
     // Write summary.json
     const passRateStats = computeStats(allPassRates);
     const summary = {
-      manifest_path: RESULT_INDEX_FILENAME,
+      index_path: `.internal/${RESULT_INDEX_FILENAME}`,
       metadata: {
         eval_file: manifest.eval_file,
         timestamp: manifest.timestamp,
diff --git a/apps/cli/src/commands/pipeline/run.ts b/apps/cli/src/commands/pipeline/run.ts
index 70678d7c0..5dc061ea0 100644
--- a/apps/cli/src/commands/pipeline/run.ts
+++ b/apps/cli/src/commands/pipeline/run.ts
@@ -4,7 +4,7 @@
  *
  * Equivalent to running:
  *   1. `agentv pipeline input <eval> --out <dir>`
- *   2. Invoking each CLI target in parallel (writing response.md + timing.json)
+ *   2. Invoking each CLI target in parallel (writing response.md + metrics.json)
  *   3. `agentv pipeline grade <dir>`
  *
  * For `kind: agent` targets, step 2 is skipped (subagent handles execution).
@@ -289,10 +289,15 @@ export const evalRunCommand = command({
           }
 
           await writeFile(join(testDir, 'response.md'), response, 'utf8');
-          await writeJson(join(testDir, 'timing.json'), {
-            duration_ms: durationMs,
-            total_duration_seconds: Math.round(durationMs / 10) / 100,
-            execution_status: 'ok',
+          await writeJson(join(testDir, 'metrics.json'), {
+            duration: {
+              total_ms: durationMs,
+              total_seconds: Math.round(durationMs / 10) / 100,
+              source: 'provider_reported',
+            },
+            tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' },
+            cost: { usd: null, source: 'unavailable' },
+            execution: { status: 'ok' },
           });
 
           process.stderr.write(`\n  ${testId}: OK (${durationMs}ms, ${response.length} chars)\n`);
@@ -301,10 +306,15 @@ export const evalRunCommand = command({
           const message = error instanceof Error ? error.message : String(error);
           const response = `ERROR: target failed — ${message}`;
           await writeFile(join(testDir, 'response.md'), response, 'utf8');
-          await writeJson(join(testDir, 'timing.json'), {
-            duration_ms: durationMs,
-            total_duration_seconds: Math.round(durationMs / 10) / 100,
-            execution_status: 'execution_error',
+          await writeJson(join(testDir, 'metrics.json'), {
+            duration: {
+              total_ms: durationMs,
+              total_seconds: Math.round(durationMs / 10) / 100,
+              source: 'provider_reported',
+            },
+            tokens: { total: 0, input: 0, output: 0, reasoning: 0, source: 'unavailable' },
+            cost: { usd: null, source: 'unavailable' },
+            execution: { status: 'execution_error' },
           });
           process.stderr.write(
             `\n  ${testId}: FAILED (${durationMs}ms) — ${message.slice(0, 200)}\n`,
diff --git a/apps/cli/src/commands/results/export.ts b/apps/cli/src/commands/results/export.ts
index 3bda4e2c7..45b137fe3 100644
--- a/apps/cli/src/commands/results/export.ts
+++ b/apps/cli/src/commands/results/export.ts
@@ -8,9 +8,9 @@
  *     index.jsonl              — per-test manifest with artifact pointers
  *     <test-id>/
  *       summary.json           — per-case aggregate
- *       attempt-1/result.json  — per-attempt result
- *       attempt-1/grading.json — per-attempt grading artifact (assertions, graders)
- *       attempt-1/metrics.json — per-attempt metrics artifact
+ *       sample-1/result.json  — per-sample result
+ *       sample-1/grading.json — per-sample grading artifact (assertions, graders)
+ *       sample-1/metrics.json — per-sample metrics artifact
  *
  * This module delegates artifact building to the shared artifact-writer so
  * that summary/grading/timing schemas stay aligned with `agentv eval`.
@@ -36,6 +36,7 @@ import type {
 import { parseJsonlResults, writeArtifactsFromResults } from '../eval/artifact-writer.js';
 import {
   RESULT_INDEX_FILENAME,
+  RUN_INTERNAL_DIRNAME,
   isReservedResultsNamespace,
   isRunManifestPath,
 } from '../eval/result-layout.js';
@@ -69,7 +70,7 @@ export async function exportResults(
     duplicatePolicy: options?.duplicatePolicy ?? 'update',
     additionalArtifacts: createExportBundleArtifactsWriter({
       outputDir,
-      sourceBaseDir: path.dirname(sourceFile),
+      sourceBaseDir: runRootFromIndexPath(sourceFile),
       sourceRecordsByResult: buildSourceRecordMap(results, sourceIndexRecords),
     }),
   });
@@ -85,7 +86,7 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
     throw new Error(`Expected a run manifest named ${RESULT_INDEX_FILENAME}: ${sourceFile}`);
   }
 
-  const runDir = path.dirname(sourceFile);
+  const runDir = runRootFromIndexPath(sourceFile);
   const segments = path.normalize(runDir).split(path.sep).filter(Boolean);
   const resultsIndex = segments.lastIndexOf('results');
   if (resultsIndex >= 0 && resultsIndex < segments.length - 2) {
@@ -104,11 +105,19 @@ export function deriveOutputDir(cwd: string, sourceFile: string): string {
 
 export function deriveExportRunId(sourceFile: string): string {
   if (isRunManifestPath(sourceFile)) {
-    return path.basename(path.dirname(sourceFile));
+    return path.basename(runRootFromIndexPath(sourceFile));
   }
   return path.basename(sourceFile, path.extname(sourceFile));
 }
 
+function runRootFromIndexPath(sourceFile: string): string {
+  // When the index sits inside the RUN_INTERNAL_DIRNAME directory, the run root is
+  // that directory's parent; otherwise the index's own directory is the run root.
+  const parentDir = path.dirname(sourceFile);
+  const isInternalDir = path.basename(parentDir) === RUN_INTERNAL_DIRNAME;
+  return isInternalDir ? path.dirname(parentDir) : parentDir;
+}
+
 export async function loadExportSource(
   source: string | undefined,
   cwd: string,
@@ -222,7 +231,7 @@ export function buildProjectionBundleFromExportedIndex(options: {
   readonly includeRawContent?: boolean;
   readonly duplicatePolicy?: ExportDuplicatePolicy;
 }): ProjectionBundle {
-  const indexPath = path.join(options.outputDir, RESULT_INDEX_FILENAME);
+  const indexPath = path.join(options.outputDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
   const indexRecords = readIndexArtifactEntries(indexPath);
   const emittedResults = loadManifestResults(indexPath);
 
@@ -327,7 +336,7 @@ export const resultsExportCommand = command({
         duplicatePolicy: policy,
         additionalArtifacts: createExportBundleArtifactsWriter({
           outputDir,
-          sourceBaseDir: path.dirname(sourceFile),
+          sourceBaseDir: runRootFromIndexPath(sourceFile),
           sourceRecordsByResult: buildSourceRecordMap(results, indexRecords ?? []),
         }),
       });
diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts
index 32b7a8156..c473d9927 100644
--- a/apps/cli/src/commands/results/manifest.ts
+++ b/apps/cli/src/commands/results/manifest.ts
@@ -14,7 +14,7 @@ import {
   traceEnvelopeToTranscriptMessages,
 } from '@agentv/core';
 
-import type { GradingArtifact, TimingArtifact } from '../eval/artifact-writer.js';
+import type { GradingArtifact } from '../eval/artifact-writer.js';
 import {
   isDirectoryPath,
   isRunManifestPath,
@@ -37,6 +37,7 @@ export interface ResultManifestRecord {
   readonly attempts?: readonly {
     readonly attempt?: number;
     readonly attempt_path?: string;
+    readonly sample_path?: string;
     readonly run_path?: string;
     readonly score?: number;
     readonly verdict?: string;
@@ -45,6 +46,7 @@ export interface ResultManifestRecord {
   readonly trials?: readonly {
     readonly attempt?: number;
     readonly attempt_path?: string;
+    readonly sample_path?: string;
     readonly run_path?: string;
     readonly score?: number;
     readonly verdict?: string;
@@ -86,6 +88,34 @@ export interface ResultManifestRecord {
   readonly metadata?: Record<string, unknown>;
 }
 
+interface MetricsUsageArtifact {
+  readonly duration?: {
+    readonly total_ms?: number;
+  };
+  readonly tokens?: {
+    readonly input?: number;
+    readonly output?: number;
+    readonly reasoning?: number;
+  };
+  readonly cost?: {
+    readonly usd?: number | null;
+  };
+}
+
+interface LegacyTimingArtifact {
+  readonly duration_ms?: number;
+  readonly token_usage?: {
+    readonly input?: number;
+    readonly output?: number;
+    readonly reasoning?: number;
+  };
+}
+
+function manifestBaseDir(indexPath: string): string {
+  const dir = path.dirname(indexPath);
+  return path.basename(dir) === '.internal' ? path.dirname(dir) : dir;
+}
+
 export interface ManifestHydrationOptions {
   /**
    * Defaults to true for report/inspect consumers that need a trace projection.
@@ -304,7 +334,8 @@ function hydrateManifestRecord(
   options: ManifestHydrationOptions,
 ): EvaluationResult {
   const grading = readOptionalJson<GradingArtifact>(baseDir, record.grading_path);
-  const timing = readOptionalJson<TimingArtifact>(baseDir, record.timing_path);
+  const metrics = readOptionalJson<MetricsUsageArtifact>(baseDir, record.metrics_path);
+  const timing = metrics ?? readOptionalJson<LegacyTimingArtifact>(baseDir, record.timing_path);
   const testId = record.test_id ?? 'unknown';
   const gradingAssertions = grading
     ? readGradingAssertionResults(grading as unknown as Record<string, unknown>)
@@ -334,15 +365,24 @@ function hydrateManifestRecord(
       // `evaluators` was renamed to `graders` in v4.13 — read both for backwards compat with old artifacts.
       // TODO: remove `evaluators` fallback once old run directories are no longer in use.
       gradingScores ?? (record.scores as EvaluationResult['scores']),
-    tokenUsage: timing?.token_usage
+    tokenUsage: metrics?.tokens
       ? {
-          input: timing.token_usage.input,
-          output: timing.token_usage.output,
-          reasoning: timing.token_usage.reasoning,
+          input: metrics.tokens.input,
+          output: metrics.tokens.output,
+          reasoning: metrics.tokens.reasoning,
         }
-      : record.token_usage,
-    durationMs: timing?.duration_ms ?? record.duration_ms,
-    costUsd: record.cost_usd,
+      : (timing as LegacyTimingArtifact | undefined)?.token_usage
+        ? {
+            input: (timing as LegacyTimingArtifact).token_usage?.input,
+            output: (timing as LegacyTimingArtifact).token_usage?.output,
+            reasoning: (timing as LegacyTimingArtifact).token_usage?.reasoning,
+          }
+        : record.token_usage,
+    durationMs:
+      metrics?.duration?.total_ms ??
+      (timing as LegacyTimingArtifact | undefined)?.duration_ms ??
+      record.duration_ms,
+    costUsd: metrics?.cost?.usd ?? record.cost_usd,
     input: hydrateInput(baseDir, record),
     output: hydrateOutput(baseDir, record) ?? '',
     trace: hydrateTrace(baseDir, record, options),
@@ -369,7 +409,7 @@ export function loadManifestResults(
   const resolvedSourceFile = resolveRunManifestPath(sourceFile);
   const content = readFileSync(resolvedSourceFile, 'utf8');
   const records = parseResultRows(content, resolvedSourceFile);
-  const baseDir = path.dirname(resolvedSourceFile);
+  const baseDir = manifestBaseDir(resolvedSourceFile);
   return records.map((record) => hydrateManifestRecord(baseDir, record, options));
 }
 
diff --git a/apps/cli/src/commands/results/projection-bundle.ts b/apps/cli/src/commands/results/projection-bundle.ts
index 1d4ee85c7..91ec2a9d6 100644
--- a/apps/cli/src/commands/results/projection-bundle.ts
+++ b/apps/cli/src/commands/results/projection-bundle.ts
@@ -69,7 +69,7 @@ export interface ProjectionBundleEntry {
     readonly result_score: number;
     readonly execution_status?: string;
     readonly grading_path?: string;
-    readonly timing_path?: string;
+    readonly metrics_path?: string;
     readonly assertion_count: number;
     readonly scores?: readonly TraceEnvelopeScoreWire[];
   };
@@ -88,7 +88,7 @@ export type ProjectionBundleArtifactRefs = Partial<
     | 'result_dir'
     | 'summary_path'
     | 'grading_path'
-    | 'timing_path'
+    | 'metrics_path'
     | 'input_path'
     | 'output_path'
     | 'answer_path'
@@ -156,7 +156,7 @@ function artifactRefs(
 ): ProjectionBundleArtifactRefs {
   const metadataRefs = dropUndefined({
     status: options.status,
-    timing_path: indexEntry.timing_path,
+    metrics_path: indexEntry.metrics_path,
   });
 
   if (!options.includeRawContent) {
@@ -310,7 +310,7 @@ function buildEntry(
     result_score: result.score,
     execution_status: result.executionStatus,
     grading_path: refs.grading_path,
-    timing_path: refs.timing_path,
+    metrics_path: refs.metrics_path,
     assertion_count: result.assertions?.length ?? 0,
     scores,
   });
diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts
index e9a298c0e..caa6c9014 100644
--- a/apps/cli/src/commands/results/remote.ts
+++ b/apps/cli/src/commands/results/remote.ts
@@ -411,7 +411,10 @@ export async function listMergedResultFiles(
         raw_filename: r.run_id,
         source: 'remote' as const,
         on_remote: true,
-        path: path.join(config.path, r.manifest_path),
+        path: path.join(
+          config.path,
+          r.index_path ?? (r as { manifest_path?: string }).manifest_path ?? '',
+        ),
         ...(r.summary_path && { summaryPath: path.join(config.path, r.summary_path) }),
         experiment: r.experiment,
         ...(r.target && { target: r.target }),
diff --git a/apps/cli/src/commands/results/report.ts b/apps/cli/src/commands/results/report.ts
index 7cac9cfd8..22397aeea 100644
--- a/apps/cli/src/commands/results/report.ts
+++ b/apps/cli/src/commands/results/report.ts
@@ -5,6 +5,7 @@ import { command, option, optional, string } from 'cmd-ts';
 
 import type { EvaluationResult, RunRuntimeSourceMetadata } from '@agentv/core';
 
+import { RUN_INTERNAL_DIRNAME } from '../eval/result-layout.js';
 import { loadManifestResults, parseResultManifest, resolveResultSourcePath } from './manifest.js';
 import { RESULTS_REPORT_TEMPLATE } from './report-template.js';
 import { resolveSourceFile, sourceArg } from './shared.js';
@@ -43,7 +44,10 @@ function readSummaryEvalFile(sourceFile: string): string | undefined {
 }
 
 function readSummaryReportMetadata(sourceFile: string): RunSummaryReportMetadata {
-  const summaryPath = path.join(path.dirname(sourceFile), 'summary.json');
+  const sourceDir = path.dirname(sourceFile);
+  const runDir =
+    path.basename(sourceDir) === RUN_INTERNAL_DIRNAME ? path.dirname(sourceDir) : sourceDir;
+  const summaryPath = path.join(runDir, 'summary.json');
   if (!existsSync(summaryPath)) {
     return {};
   }
@@ -67,7 +71,10 @@ function readSummaryReportMetadata(sourceFile: string): RunSummaryReportMetadata
 }
 
 export function deriveReportPath(sourceFile: string): string {
-  return path.join(path.dirname(sourceFile), 'report.html');
+  const sourceDir = path.dirname(sourceFile);
+  const runDir =
+    path.basename(sourceDir) === RUN_INTERNAL_DIRNAME ? path.dirname(sourceDir) : sourceDir;
+  return path.join(runDir, 'report.html');
 }
 
 function serializeReportResult(
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index 01853b465..cea680041 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -777,7 +777,12 @@ function addTrialRunCatalogEntries(
     : undefined;
   if (!resultDir) return;
   for (const trial of record.attempts ?? record.trials ?? []) {
-    const rawPath = typeof trial.attempt_path === 'string' ? trial.attempt_path : trial.run_path;
+    const rawPath =
+      typeof trial.sample_path === 'string'
+        ? trial.sample_path
+        : typeof trial.attempt_path === 'string'
+          ? trial.attempt_path
+          : trial.run_path;
     const runPath = rawPath ? normalizeArtifactRelativePath(rawPath) : undefined;
     if (!runPath) continue;
     const runDir = path.posix.join(resultDir, runPath);
@@ -799,12 +804,6 @@ function addTrialRunCatalogEntries(
       path.posix.join(runDir, 'metrics.json'),
       'artifact',
     );
-    addDirectArtifactCatalogEntry(
-      entries,
-      seen,
-      path.posix.join(runDir, 'timing.json'),
-      'artifact',
-    );
   }
 }
 
@@ -824,6 +823,7 @@ function buildResultArtifactCatalog(
 
   addDirectArtifactCatalogEntry(entries, seen, record.summary_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.grading_path, 'artifact');
+  addDirectArtifactCatalogEntry(entries, seen, record.metrics_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.timing_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.input_path, 'artifact');
   addDirectArtifactCatalogEntry(entries, seen, record.output_path, 'artifact');
@@ -1124,7 +1124,12 @@ function buildRepeatTrialReadModels(
     : undefined;
 
   return attempts.map((trial) => {
-    const rawPath = typeof trial.attempt_path === 'string' ? trial.attempt_path : trial.run_path;
+    const rawPath =
+      typeof trial.sample_path === 'string'
+        ? trial.sample_path
+        : typeof trial.attempt_path === 'string'
+          ? trial.attempt_path
+          : trial.run_path;
     const runPath = rawPath ? normalizeArtifactRelativePath(rawPath) : undefined;
     const metricsPath = caseTrialArtifactPath(resultDir, runPath, 'metrics.json');
     const timingPath = caseTrialArtifactPath(resultDir, runPath, 'timing.json');
@@ -1137,21 +1142,35 @@ function buildRepeatTrialReadModels(
     const metrics = readArtifactJsonObject(baseDir, metricsPath);
     const timing = readArtifactJsonObject(baseDir, timingPath);
     const toolCalls = objectField(metrics, 'tool_calls');
-    const tokenUsage = objectField(timing, 'token_usage');
+    const tokenUsage = objectField(metrics, 'tokens') ?? objectField(timing, 'token_usage');
+    const duration = objectField(metrics, 'duration');
+    const cost = objectField(metrics, 'cost');
     const transcriptSummary =
       objectField(trial, 'transcript_summary') ?? objectField(runResult, 'transcript_summary');
 
     return {
       ...trial,
-      ...(numberField(timing, 'duration_ms') !== undefined && {
-        duration_ms: numberField(timing, 'duration_ms'),
+      ...(numberField(duration, 'total_ms') !== undefined && {
+        duration_ms: numberField(duration, 'total_ms'),
       }),
-      ...(numberField(timing, 'total_tokens') !== undefined && {
-        total_tokens: numberField(timing, 'total_tokens'),
+      ...(numberField(duration, 'total_ms') === undefined &&
+        numberField(timing, 'duration_ms') !== undefined && {
+          duration_ms: numberField(timing, 'duration_ms'),
+        }),
+      ...(numberField(tokenUsage, 'total') !== undefined && {
+        total_tokens: numberField(tokenUsage, 'total'),
       }),
-      ...(numberField(timing, 'cost_usd') !== undefined && {
-        cost_usd: numberField(timing, 'cost_usd'),
+      ...(numberField(tokenUsage, 'total') === undefined &&
+        numberField(timing, 'total_tokens') !== undefined && {
+          total_tokens: numberField(timing, 'total_tokens'),
+        }),
+      ...(numberField(cost, 'usd') !== undefined && {
+        cost_usd: numberField(cost, 'usd'),
       }),
+      ...(numberField(cost, 'usd') === undefined &&
+        numberField(timing, 'cost_usd') !== undefined && {
+          cost_usd: numberField(timing, 'cost_usd'),
+        }),
       ...(tokenUsage && { token_usage: tokenUsage }),
       ...(numberField(metrics, 'total_tool_calls') !== undefined && {
         total_tool_calls: numberField(metrics, 'total_tool_calls'),
@@ -1159,7 +1178,7 @@ function buildRepeatTrialReadModels(
       ...(toolCalls && { tool_calls: toolCalls }),
       ...(transcriptSummary && { transcript_summary: transcriptSummary }),
       ...(metricsPath && { metrics_path: metricsPath }),
-      ...(timingPath && { timing_path: timingPath }),
+      ...(timing && timingPath && { timing_path: timingPath }),
       ...(gradingPath && { grading_path: gradingPath }),
       ...(transcriptPath && { transcript_path: transcriptPath }),
       ...(transcriptRawPath && { transcript_raw_path: transcriptRawPath }),
diff --git a/apps/cli/src/commands/results/validate.ts b/apps/cli/src/commands/results/validate.ts
index a98511a30..77e0ee46e 100644
--- a/apps/cli/src/commands/results/validate.ts
+++ b/apps/cli/src/commands/results/validate.ts
@@ -39,9 +39,18 @@ interface IndexEntry {
   readonly summary_path?: string;
   readonly grading_path?: string;
   readonly timing_path?: string;
+  readonly metrics_path?: string;
   readonly result_dir?: string;
-  readonly attempts?: readonly { readonly attempt_path?: string; readonly run_path?: string }[];
-  readonly trials?: readonly { readonly attempt_path?: string; readonly run_path?: string }[];
+  readonly attempts?: readonly {
+    readonly attempt_path?: string;
+    readonly sample_path?: string;
+    readonly run_path?: string;
+  }[];
+  readonly trials?: readonly {
+    readonly attempt_path?: string;
+    readonly sample_path?: string;
+    readonly run_path?: string;
+  }[];
   readonly [key: string]: unknown;
 }
 
@@ -304,13 +313,13 @@ function checkArtifactFiles(runDir: string, entries: IndexEntry[]): Diagnostic[]
       }
     }
 
-    // Check timing.json
-    if (entry.timing_path) {
-      const timingPath = path.join(runDir, entry.timing_path);
-      if (!existsSync(timingPath)) {
+    // Check metrics.json. Legacy timing_path entries (old bundles) are accepted but not checked.
+    if (entry.metrics_path) {
+      const metricsPath = path.join(runDir, entry.metrics_path);
+      if (!existsSync(metricsPath)) {
         diagnostics.push({
           severity: 'warning',
-          message: `${testId}: timing.json not found at '${entry.timing_path}'`,
+          message: `${testId}: metrics.json not found at '${entry.metrics_path}'`,
         });
       }
     }
diff --git a/apps/cli/test/commands/eval/aggregate.test.ts b/apps/cli/test/commands/eval/aggregate.test.ts
index ae72d8258..772ecff20 100644
--- a/apps/cli/test/commands/eval/aggregate.test.ts
+++ b/apps/cli/test/commands/eval/aggregate.test.ts
@@ -52,14 +52,18 @@ function writeJsonlIndex(
   results: Partial<EvaluationResult>[],
   filename = RESULT_INDEX_FILENAME,
 ): string {
-  const indexPath = path.join(dir, filename);
+  const indexPath =
+    filename === RESULT_INDEX_FILENAME
+      ? path.join(dir, '.internal', filename)
+      : path.join(dir, filename);
+  mkdirSync(path.dirname(indexPath), { recursive: true });
   const lines = results.map((r) => JSON.stringify(toSnakeCaseDeep(makeResult(r)))).join('\n');
   writeFileSync(indexPath, `${lines}\n`);
   return indexPath;
 }
 
 function readIndexRows(dir: string): Array<{ test_id: string; result_dir: string }> {
-  const indexPath = path.join(dir, RESULT_INDEX_FILENAME);
+  const indexPath = path.join(dir, '.internal', RESULT_INDEX_FILENAME);
   if (!existsSync(indexPath)) {
     return readdirSync(dir)
       .filter((entry) => /--[a-f0-9]{12}$/.test(entry))
@@ -217,11 +221,11 @@ describe('aggregateRunDir', () => {
     expect(result.targetCount).toBe(1);
 
     const summary = JSON.parse(readFileSync(result.summaryPath, 'utf8'));
-    expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME);
+    expect(summary.index_path).toBe('.internal/index.jsonl');
     expect(summary.metadata.tests_run).toContain('a');
     expect(summary.metadata.tests_run).toContain('b');
     expect(summary.run_summary.x).toBeDefined();
-    expect(summary.timing.total_tokens).toBeGreaterThanOrEqual(0);
+    expect(summary.usage.total_tokens).toBeGreaterThanOrEqual(0);
   });
 
   it('reads canonical index.jsonl bundles', async () => {
@@ -238,7 +242,7 @@ describe('aggregateRunDir', () => {
     expect(result.testCount).toBe(2);
 
     const summary = JSON.parse(readFileSync(result.summaryPath, 'utf8'));
-    expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME);
+    expect(summary.index_path).toBe('.internal/index.jsonl');
     expect(summary.metadata.tests_run).toEqual(['case-a', 'case-b']);
   });
 
@@ -283,23 +287,23 @@ describe('writePerTestArtifacts', () => {
     rmSync(tmpDir, { recursive: true, force: true });
   });
 
-  it('writes grading.json and timing.json for each result', async () => {
+  it('writes grading.json and metrics.json for each result', async () => {
     const results = [makeResult({ testId: 'test-1' }), makeResult({ testId: 'test-2' })];
 
     await writePerTestArtifacts(results, tmpDir);
 
     const grading1 = JSON.parse(
-      readFileSync(rowRunPath(tmpDir, 'test-1', 'attempt-1', 'grading.json'), 'utf8'),
+      readFileSync(rowRunPath(tmpDir, 'test-1', 'sample-1', 'grading.json'), 'utf8'),
     );
     expect(grading1.assertion_results).toHaveLength(1);
 
-    const timing1 = JSON.parse(
-      readFileSync(rowRunPath(tmpDir, 'test-1', 'attempt-1', 'timing.json'), 'utf8'),
+    const metrics1 = JSON.parse(
+      readFileSync(rowRunPath(tmpDir, 'test-1', 'sample-1', 'metrics.json'), 'utf8'),
     );
-    expect(timing1.total_tokens).toBeGreaterThanOrEqual(0);
+    expect(metrics1.tokens.total).toBeGreaterThanOrEqual(0);
 
     const grading2 = JSON.parse(
-      readFileSync(rowRunPath(tmpDir, 'test-2', 'attempt-1', 'grading.json'), 'utf8'),
+      readFileSync(rowRunPath(tmpDir, 'test-2', 'sample-1', 'grading.json'), 'utf8'),
     );
     expect(grading2.assertion_results).toHaveLength(1);
   });
@@ -310,7 +314,7 @@ describe('writePerTestArtifacts', () => {
     await writePerTestArtifacts(results, tmpDir);
 
     const answer = readFileSync(
-      rowRunPath(tmpDir, 'test-1', 'attempt-1', 'outputs', 'answer.md'),
+      rowRunPath(tmpDir, 'test-1', 'sample-1', 'outputs', 'answer.md'),
       'utf8',
     );
     expect(answer).toContain('hello');
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 5043cdb68..63ba0233e 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -397,11 +397,11 @@ describe('buildGradingArtifact', () => {
 });
 
 // ---------------------------------------------------------------------------
-// Timing artifact
+// Metrics usage artifact (built by buildTimingArtifact)
 // ---------------------------------------------------------------------------
 
 describe('buildTimingArtifact', () => {
-  it('aggregates timing across results', () => {
+  it('aggregates duration and token usage across results', () => {
     const results = [
       makeResult({
         durationMs: 30000,
@@ -413,30 +413,26 @@ describe('buildTimingArtifact', () => {
       } as Partial<EvaluationResult>),
     ];
 
-    const timing = buildTimingArtifact(results);
+    const metrics = buildTimingArtifact(results);
 
-    expect(timing.total_tokens).toBe(4500);
-    expect(timing.duration_ms).toBe(90000);
-    expect(timing.total_duration_seconds).toBe(90);
-    expect(timing.token_usage).toEqual({ input: 3000, output: 1500, reasoning: 0 });
+    expect(metrics.tokens).toMatchObject({ total: 4500, input: 3000, output: 1500, reasoning: 0 });
+    expect(metrics.duration).toMatchObject({ total_ms: 90000, total_seconds: 90 });
   });
 
-  it('handles results with no timing data', () => {
+  it('handles results with no usage data', () => {
     const results = [makeResult({})];
-    const timing = buildTimingArtifact(results);
+    const metrics = buildTimingArtifact(results);
 
-    expect(timing.total_tokens).toBe(0);
-    expect(timing.duration_ms).toBe(0);
-    expect(timing.total_duration_seconds).toBe(0);
-    expect(timing.token_usage).toEqual({ input: 0, output: 0, reasoning: 0 });
+    expect(metrics.tokens).toMatchObject({ total: 0, input: 0, output: 0, reasoning: 0 });
+    expect(metrics.duration).toMatchObject({ total_ms: 0, total_seconds: 0 });
   });
 
   it('handles empty results array', () => {
-    const timing = buildTimingArtifact([]);
+    const metrics = buildTimingArtifact([]);
 
-    expect(timing.total_tokens).toBe(0);
-    expect(timing.duration_ms).toBe(0);
-    expect(timing.total_duration_seconds).toBe(0);
+    expect(metrics.tokens.total).toBe(0);
+    expect(metrics.duration.total_ms).toBe(0);
+    expect(metrics.duration.total_seconds).toBe(0);
   });
 
   it('handles partial token usage', () => {
@@ -446,9 +442,9 @@ describe('buildTimingArtifact', () => {
       } as Partial<EvaluationResult>),
     ];
 
-    const timing = buildTimingArtifact(results);
-    expect(timing.total_tokens).toBe(500);
-    expect(timing.token_usage).toEqual({ input: 500, output: 0, reasoning: 0 });
+    const metrics = buildTimingArtifact(results);
+    expect(metrics.tokens.total).toBe(500);
+    expect(metrics.tokens).toMatchObject({ input: 500, output: 0, reasoning: 0 });
   });
 });
 
@@ -561,7 +557,7 @@ describe('buildRunSummaryArtifact', () => {
       [makeResult({})],
       'test.eval.yaml',
       'baseline-v2',
-      'attempt-1',
+      'sample-1',
       undefined,
       undefined,
       undefined,
@@ -776,8 +772,8 @@ describe('buildIndexArtifactEntry', () => {
       }),
       {
         outputDir: '/tmp/artifacts',
-        gradingPath: '/tmp/artifacts/alpha/attempt-1/grading.json',
-        timingPath: '/tmp/artifacts/alpha/attempt-1/timing.json',
+        gradingPath: '/tmp/artifacts/alpha/sample-1/grading.json',
+        metricsPath: '/tmp/artifacts/alpha/sample-1/metrics.json',
         outputPath: '/tmp/artifacts/alpha/outputs/answer.md',
         answerPath: '/tmp/artifacts/alpha/outputs/answer.md',
       },
@@ -805,16 +801,18 @@ describe('buildIndexArtifactEntry', () => {
           ],
         },
       ],
+      named_scores: { quality: 0.7 },
+      provenance: 'native',
       execution_status: 'quality_failure',
       error: 'model drift',
-      grading_path: 'alpha/attempt-1/grading.json',
-      timing_path: 'alpha/attempt-1/timing.json',
+      grading_path: 'alpha/sample-1/grading.json',
+      metrics_path: 'alpha/sample-1/metrics.json',
       output_path: 'alpha/outputs/answer.md',
       answer_path: 'alpha/outputs/answer.md',
       attempts: [
         {
           attempt: 0,
-          attempt_path: 'attempt-1',
+          sample_path: 'sample-1',
           score: 0.9,
           verdict: 'fail',
           scores: [
@@ -875,8 +873,8 @@ describe('buildIndexArtifactEntry', () => {
       }),
       {
         outputDir: '/tmp/artifacts',
-        gradingPath: '/tmp/artifacts/alpha/attempt-1/grading.json',
-        timingPath: '/tmp/artifacts/alpha/attempt-1/timing.json',
+        gradingPath: '/tmp/artifacts/alpha/sample-1/grading.json',
+        metricsPath: '/tmp/artifacts/alpha/sample-1/metrics.json',
       },
     );
 
@@ -940,9 +938,9 @@ describe('parseJsonlResults', () => {
       artifactPointers: {
         transcript: {
           ref: 'agentv/artifacts/v1',
-          key: 'transcripts/pointer-row/attempt-1/transcript-raw.jsonl',
+          key: 'transcripts/pointer-row/sample-1/transcript-raw.jsonl',
           object_version: 'sha256:test',
-          path: 'pointer-row/attempt-1/transcript-raw.jsonl',
+          path: 'pointer-row/sample-1/transcript-raw.jsonl',
           sha256: 'test',
           size: 1,
           schema_version: 'agentv.transcript.v1',
@@ -960,7 +958,7 @@ describe('parseJsonlResults', () => {
       test_id: 'file-changes-row',
       target: 'codex',
       score: 1,
-      fileChangesPath: 'file-changes-row/attempt-1/outputs/file_changes.diff',
+      fileChangesPath: 'file-changes-row/sample-1/outputs/file_changes.diff',
     })}\n`;
 
     expect(() => parseJsonlResults(content)).toThrow(/Use "file_changes_path"/);
@@ -972,7 +970,7 @@ describe('parseJsonlResults', () => {
       target: 'codex',
       score: 1,
       output: 'done',
-      raw_provider_log_path: 'raw-log-case/attempt-1/provider.log',
+      raw_provider_log_path: 'raw-log-case/sample-1/provider.log',
     })}\n`;
 
     const results = parseJsonlResults(content);
@@ -1061,15 +1059,16 @@ describe('schema compatibility', () => {
     expect(typeof grading.summary.pass_rate).toBe('number');
   });
 
-  it('timing has total_tokens, duration_ms, total_duration_seconds, token_usage', () => {
-    const timing = buildTimingArtifact([makeResult({})]);
+  it('metrics usage has duration, tokens, cost, execution, and trajectory sections', () => {
+    const metrics = buildTimingArtifact([makeResult({})]);
 
-    expect(timing).toHaveProperty('total_tokens');
-    expect(timing).toHaveProperty('duration_ms');
-    expect(timing).toHaveProperty('total_duration_seconds');
-    expect(timing).toHaveProperty('token_usage');
-    expect(timing.token_usage).toHaveProperty('input');
-    expect(timing.token_usage).toHaveProperty('output');
+    expect(metrics).toHaveProperty('duration');
+    expect(metrics).toHaveProperty('tokens');
+    expect(metrics).toHaveProperty('cost');
+    expect(metrics).toHaveProperty('execution');
+    expect(metrics).toHaveProperty('trajectory');
+    expect(metrics.tokens).toHaveProperty('input');
+    expect(metrics.tokens).toHaveProperty('output');
   });
 
   it('benchmark run_summary has pass_rate/time_seconds/tokens with mean/stddev', () => {
@@ -1099,6 +1098,9 @@ describe('writeArtifactsFromResults', () => {
 
   afterEach(async () => {
     await rm(testDir, { recursive: true, force: true }).catch(() => undefined);
+    await rm(path.join(import.meta.dir, '.indexes'), { recursive: true, force: true }).catch(
+      () => undefined,
+    );
   });
 
   it('writes summary, index.jsonl, and per-run artifact files', async () => {
@@ -1112,7 +1114,7 @@ describe('writeArtifactsFromResults', () => {
     });
 
     expect(path.basename(paths.indexPath)).toBe('index.jsonl');
-    expect(paths.indexPath).toBe(path.join(testDir, 'index.jsonl'));
+    expect(paths.indexPath).toBe(path.join(testDir, '.internal', 'index.jsonl'));
     expect(existsSync(paths.indexPath)).toBe(true);
     const indexLines = await readIndexLines(paths.indexPath);
     expect(indexLines).toHaveLength(2);
@@ -1122,58 +1124,52 @@ describe('writeArtifactsFromResults', () => {
 
     // Check per-test artifact directories
     const artifactEntries = await readdir(paths.testArtifactDir);
-    expect(artifactEntries.sort()).toEqual([
-      alphaRowDir,
-      betaRowDir,
-      RESULT_INDEX_FILENAME,
-      'summary.json',
-    ]);
+    expect(artifactEntries.sort()).toEqual(['.internal', alphaRowDir, betaRowDir, 'summary.json']);
 
     const rootSummary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
-    expect(rootSummary.manifest_path).toBe(RESULT_INDEX_FILENAME);
+    expect(rootSummary.index_path).toBe('.internal/index.jsonl');
 
     const alphaEntries = await readdir(path.join(paths.testArtifactDir, alphaRowDir));
-    expect(alphaEntries.sort()).toEqual(['attempt-1', 'summary.json']);
+    expect(alphaEntries.sort()).toEqual(['sample-1', 'summary.json']);
 
     const alphaRunEntries = await readdir(
-      path.join(paths.testArtifactDir, alphaRowDir, 'attempt-1'),
+      path.join(paths.testArtifactDir, alphaRowDir, 'sample-1'),
     );
     expect(alphaRunEntries.sort()).toEqual([
       'grading.json',
       'metrics.json',
       'outputs',
       'result.json',
-      'timing.json',
       'transcript-raw.jsonl',
       'transcript.json',
     ]);
 
     const alphaGrading: GradingArtifact = JSON.parse(
       await readFile(
-        path.join(paths.testArtifactDir, alphaRowDir, 'attempt-1', 'grading.json'),
+        path.join(paths.testArtifactDir, alphaRowDir, 'sample-1', 'grading.json'),
         'utf8',
       ),
     );
     expect(alphaGrading.summary).toBeDefined();
     expect(alphaGrading).not.toHaveProperty('execution_metrics');
 
-    const alphaTiming: TimingArtifact = JSON.parse(
+    const alphaMetrics: TimingArtifact = JSON.parse(
       await readFile(
-        path.join(paths.testArtifactDir, alphaRowDir, 'attempt-1', 'timing.json'),
+        path.join(paths.testArtifactDir, alphaRowDir, 'sample-1', 'metrics.json'),
         'utf8',
       ),
     );
-    expect(alphaTiming.duration_ms).toBe(5000);
+    expect(alphaMetrics.duration.total_ms).toBe(5000);
 
     const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
     expect(summary.metadata.eval_file).toBe('my-eval.yaml');
     expect(summary.metadata.tests_run.sort()).toEqual(['alpha', 'beta']);
-    expect(summary.timing.duration_ms).toBe(13000);
+    expect(summary.metrics.duration.total_ms).toBe(13000);
 
     expect(indexLines[0]?.summary_path).toBe(`${alphaRowDir}/summary.json`);
-    expect(indexLines[0]?.grading_path).toBe(`${alphaRowDir}/attempt-1/grading.json`);
-    expect(indexLines[0]?.timing_path).toBe(`${alphaRowDir}/attempt-1/timing.json`);
-    expect(indexLines[0]?.metrics_path).toBe(`${alphaRowDir}/attempt-1/metrics.json`);
+    expect(indexLines[0]?.grading_path).toBe(`${alphaRowDir}/sample-1/grading.json`);
+    expect(indexLines[0]?.timing_path).toBeUndefined();
+    expect(indexLines[0]?.metrics_path).toBe(`${alphaRowDir}/sample-1/metrics.json`);
   });
 
   it('writes optional runtime source metadata to summary and index rows', async () => {
@@ -1277,8 +1273,8 @@ describe('writeArtifactsFromResults', () => {
     const [indexEntry] = await readIndexLines(paths.indexPath);
     const repeatRowDir = expectRowDir(indexEntry, 'repeat-case');
     expect(indexEntry?.attempts).toMatchObject([
-      { attempt: 0, attempt_path: 'attempt-1', score: 0.25, verdict: 'fail' },
-      { attempt: 1, attempt_path: 'attempt-2', score: 1, verdict: 'pass' },
+      { attempt: 0, sample_path: 'sample-1', score: 0.25, verdict: 'fail' },
+      { attempt: 1, sample_path: 'sample-2', score: 1, verdict: 'pass' },
     ]);
     expect(indexEntry?.aggregation).toEqual({
       strategy: 'confidence_interval',
@@ -1297,7 +1293,7 @@ describe('writeArtifactsFromResults', () => {
     expect(indexEntry?.metrics_path).toBeUndefined();
 
     const repeatEntries = await readdir(path.join(paths.testArtifactDir, repeatRowDir));
-    expect(repeatEntries.sort()).toEqual(['attempt-1', 'attempt-2', 'summary.json']);
+    expect(repeatEntries.sort()).toEqual(['sample-1', 'sample-2', 'summary.json']);
 
     const caseSummary = JSON.parse(
       await readFile(path.join(paths.testArtifactDir, repeatRowDir, 'summary.json'), 'utf8'),
@@ -1308,26 +1304,21 @@ describe('writeArtifactsFromResults', () => {
       pass_rate: '50%',
       mean_duration_ms: 3000,
       mean_duration_seconds: 3,
-      duration_ms: 6000,
-      total_duration_seconds: 6,
-      duration_stats: {
-        count: 2,
-        mean_ms: 3000,
-        mean_seconds: 3,
-        stddev_ms: 1000,
-        stddev_seconds: 1,
-        min_ms: 2000,
-        max_ms: 4000,
-      },
-      total_tokens: 0,
-      cost_usd: null,
-      token_usage: { input: 0, output: 0, reasoning: 0 },
-      usage_sources: {
-        token_usage: 'unavailable',
-        total_tokens: 'unavailable',
-        duration: 'aggregate',
-        cost: 'unavailable',
+      duration: {
+        total_ms: 6000,
+        total_seconds: 6,
+        stats: {
+          count: 2,
+          mean_ms: 3000,
+          mean_seconds: 3,
+          stddev_ms: 1000,
+          stddev_seconds: 1,
+          min_ms: 2000,
+          max_ms: 4000,
+        },
       },
+      tokens: { total: 0, input: 0, output: 0, reasoning: 0 },
+      cost: { usd: null },
     });
     expect(typeof caseSummary.fingerprint).toBe('string');
 
@@ -1335,14 +1326,13 @@ describe('writeArtifactsFromResults', () => {
       readFile(path.join(paths.testArtifactDir, repeatRowDir, 'grading.json'), 'utf8'),
     ).rejects.toThrow();
 
-    for (const runDir of ['attempt-1', 'attempt-2']) {
+    for (const runDir of ['sample-1', 'sample-2']) {
       const runEntries = await readdir(path.join(paths.testArtifactDir, repeatRowDir, runDir));
       expect(runEntries.sort()).toEqual([
         'grading.json',
         'metrics.json',
         'outputs',
         'result.json',
-        'timing.json',
         'transcript-raw.jsonl',
         'transcript.json',
       ]);
@@ -1350,7 +1340,7 @@ describe('writeArtifactsFromResults', () => {
 
     const runOneResult = JSON.parse(
       await readFile(
-        path.join(paths.testArtifactDir, repeatRowDir, 'attempt-1', 'result.json'),
+        path.join(paths.testArtifactDir, repeatRowDir, 'sample-1', 'result.json'),
         'utf8',
       ),
     ) as Record<string, unknown>;
@@ -1365,22 +1355,20 @@ describe('writeArtifactsFromResults', () => {
       transcript_path: './transcript.json',
       transcript_raw_path: './transcript-raw.jsonl',
       output_paths: { answer: './outputs/answer.md' },
-      timing: {
-        duration_ms: 2000,
-      },
     });
+    expect(runOneResult).not.toHaveProperty('timing');
     expect(runOneResult).not.toHaveProperty('status');
     expect(indexEntry?.attempts?.[0]?.transcript_summary).toEqual(runOneResult.transcript_summary);
 
     const runTwoAnswer = await readFile(
-      path.join(paths.testArtifactDir, repeatRowDir, 'attempt-2', 'outputs', 'answer.md'),
+      path.join(paths.testArtifactDir, repeatRowDir, 'sample-2', 'outputs', 'answer.md'),
       'utf8',
     );
     expect(runTwoAnswer).toBe('second attempt');
 
     const runTwoResult = JSON.parse(
       await readFile(
-        path.join(paths.testArtifactDir, repeatRowDir, 'attempt-2', 'result.json'),
+        path.join(paths.testArtifactDir, repeatRowDir, 'sample-2', 'result.json'),
         'utf8',
       ),
     ) as Record<string, unknown>;
@@ -1391,10 +1379,8 @@ describe('writeArtifactsFromResults', () => {
       metrics_path: './metrics.json',
       transcript_path: './transcript.json',
       transcript_raw_path: './transcript-raw.jsonl',
-      timing: {
-        duration_ms: 4000,
-      },
     });
+    expect(runTwoResult).not.toHaveProperty('timing');
     expect(runTwoResult).not.toHaveProperty('status');
     expect(indexEntry?.attempts?.[1]?.transcript_summary).toEqual(runTwoResult.transcript_summary);
   });
@@ -1428,16 +1414,16 @@ describe('writeArtifactsFromResults', () => {
     const paths = await writeArtifactsFromResults([], testDir);
 
     const artifactEntries = await readdir(paths.testArtifactDir);
-    expect(artifactEntries.sort()).toEqual([RESULT_INDEX_FILENAME, 'summary.json']);
+    expect(artifactEntries.sort()).toEqual(['.internal', 'summary.json']);
 
     const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
-    expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME);
+    expect(summary.index_path).toBe('.internal/index.jsonl');
     expect(summary.notes).toContain('No results to summarize');
-    expect(summary.timing.total_tokens).toBe(0);
+    expect(summary.metrics.tokens.total).toBe(0);
     expect(await readFile(paths.indexPath, 'utf8')).toBe('');
   });
 
-  it('writes grading.json and timing.json inside each test directory', async () => {
+  it('writes grading.json and metrics.json inside each test directory', async () => {
     const results = [
       makeResult({
         testId: 'test-1',
@@ -1458,20 +1444,20 @@ describe('writeArtifactsFromResults', () => {
     const testTwo = indexLines.find((line) => line.test_id === 'test-2');
 
     const gradingOne: GradingArtifact = JSON.parse(
-      await readFile(runArtifactPath(testDir, testOne, 'attempt-1', 'grading.json'), 'utf8'),
+      await readFile(runArtifactPath(testDir, testOne, 'sample-1', 'grading.json'), 'utf8'),
     );
     const gradingTwo: GradingArtifact = JSON.parse(
-      await readFile(runArtifactPath(testDir, testTwo, 'attempt-1', 'grading.json'), 'utf8'),
+      await readFile(runArtifactPath(testDir, testTwo, 'sample-1', 'grading.json'), 'utf8'),
     );
-    const timingOne: TimingArtifact = JSON.parse(
-      await readFile(runArtifactPath(testDir, testOne, 'attempt-1', 'timing.json'), 'utf8'),
+    const metricsOne: TimingArtifact = JSON.parse(
+      await readFile(runArtifactPath(testDir, testOne, 'sample-1', 'metrics.json'), 'utf8'),
     );
 
     expect(gradingOne.summary.total).toBe(1);
     expect(gradingOne.summary.passed).toBe(1);
     expect(gradingTwo.summary.total).toBe(2);
     expect(gradingTwo.summary.failed).toBe(1);
-    expect(timingOne.duration_ms).toBe(0);
+    expect(metricsOne.duration.total_ms).toBe(0);
   });
 
   it('writes normalized transcript.json plus raw transcript evidence', async () => {
@@ -1528,12 +1514,12 @@ describe('writeArtifactsFromResults', () => {
     const [indexLine] = await readIndexLines(paths.indexPath);
     const rowDir = expectRowDir(indexLine, 'transcript-case');
 
-    const transcriptPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript.json');
+    const transcriptPath = runArtifactPath(testDir, indexLine, 'sample-1', 'transcript.json');
     const transcript = JSON.parse(await readFile(transcriptPath, 'utf8'));
 
     const rawTranscriptLines = (
       await readFile(
-        runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript-raw.jsonl'),
+        runArtifactPath(testDir, indexLine, 'sample-1', 'transcript-raw.jsonl'),
         'utf8',
       )
     )
@@ -1618,17 +1604,17 @@ describe('writeArtifactsFromResults', () => {
       role: 'user',
     });
     await expect(
-      readFile(path.join(testDir, rowDir, 'attempt-1', 'transcript.jsonl'), 'utf8'),
+      readFile(path.join(testDir, rowDir, 'sample-1', 'transcript.jsonl'), 'utf8'),
     ).rejects.toThrow();
     await expect(
-      readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'trace.json'), 'utf8'),
+      readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'trace.json'), 'utf8'),
     ).rejects.toThrow();
 
     expect(indexLine).not.toHaveProperty('trace_path');
-    expect(indexLine?.transcript_path).toBe(`${rowDir}/attempt-1/transcript.json`);
-    expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/attempt-1/transcript-raw.jsonl`);
+    expect(indexLine?.transcript_path).toBe(`${rowDir}/sample-1/transcript.json`);
+    expect(indexLine?.transcript_raw_path).toBe(`${rowDir}/sample-1/transcript-raw.jsonl`);
     expect(indexLine?.transcript_summary).toEqual(transcript.transcript_summary);
-    expect(indexLine?.metrics_path).toBe(`${rowDir}/attempt-1/metrics.json`);
+    expect(indexLine?.metrics_path).toBe(`${rowDir}/sample-1/metrics.json`);
     expect(indexLine.metrics_path.endsWith(CANONICAL_METRICS_ARTIFACT_PATH)).toBe(true);
 
     expect(indexLine.artifact_pointers).toBeUndefined();
@@ -1731,26 +1717,26 @@ describe('writeArtifactsFromResults', () => {
     const [indexLine] = await readIndexLines(paths.indexPath);
     const rowDir = expectRowDir(indexLine, 'summary-case');
 
-    expect(indexLine?.metrics_path).toBe(`${rowDir}/attempt-1/metrics.json`);
+    expect(indexLine?.metrics_path).toBe(`${rowDir}/sample-1/metrics.json`);
     expect(indexLine?.file_changes_path).toBe(
-      `${rowDir}/attempt-1/${CANONICAL_FILE_CHANGES_ARTIFACT_PATH}`,
+      `${rowDir}/sample-1/${CANONICAL_FILE_CHANGES_ARTIFACT_PATH}`,
     );
     await expect(
       readFile(
-        runArtifactPath(testDir, indexLine, 'attempt-1', 'outputs', 'file_changes.diff'),
+        runArtifactPath(testDir, indexLine, 'sample-1', 'outputs', 'file_changes.diff'),
         'utf8',
       ),
     ).resolves.toBe(fileChanges);
 
     const runResult = JSON.parse(
-      await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'result.json'), 'utf8'),
+      await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'result.json'), 'utf8'),
     );
     expect(runResult.file_changes_path).toBe('./outputs/file_changes.diff');
     expect(runResult.output_paths.file_changes).toBe('./outputs/file_changes.diff');
 
     const summary = MetricsArtifactWireSchema.parse(
       JSON.parse(
-        await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'metrics.json'), 'utf8'),
+        await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'metrics.json'), 'utf8'),
       ),
     );
 
@@ -1764,12 +1750,11 @@ describe('writeArtifactsFromResults', () => {
     expect(summary.source_artifacts).toMatchObject({
       transcript_path: 'transcript.json',
       grading_path: 'grading.json',
-      timing_path: 'timing.json',
       file_changes_path: CANONICAL_FILE_CHANGES_ARTIFACT_PATH,
     });
     expect(summary.source_artifacts).not.toHaveProperty('trace_path');
     await expect(
-      readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'trace.json'), 'utf8'),
+      readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'trace.json'), 'utf8'),
     ).rejects.toThrow();
     expect(summary.metrics.total_turns).toBe(2);
     expect(summary.metrics.total_tool_calls).toBe(4);
@@ -1842,24 +1827,14 @@ describe('writeArtifactsFromResults', () => {
     ]);
     expect(summary).not.toHaveProperty('usage_summary');
 
-    const timing = JSON.parse(
-      await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'timing.json'), 'utf8'),
-    );
-    expect(timing).toMatchObject({
-      total_tokens: 140,
-      duration_ms: 4200,
-      cost_usd: 0.25,
-      token_usage: { input: 100, output: 40, reasoning: 5 },
-      usage_sources: {
-        token_usage: 'provider_reported',
-        total_tokens: 'provider_reported',
-        duration: 'provider_reported',
-        cost: 'provider_reported',
-      },
+    expect(summary).toMatchObject({
+      tokens: { total: 140, input: 100, output: 40, reasoning: 5, source: 'provider_reported' },
+      duration: { total_ms: 4200, source: 'provider_reported' },
+      cost: { usd: 0.25, source: 'provider_reported' },
     });
   });
 
-  it('distinguishes aggregate, estimated, and unavailable timing usage sources', async () => {
+  it('distinguishes aggregate, estimated, and unavailable metrics usage sources', async () => {
     const aggregateOutput = [
       {
         role: 'assistant' as const,
@@ -1900,56 +1875,39 @@ describe('writeArtifactsFromResults', () => {
     const aggregateRow = indexLines.find((line) => line.test_id === 'aggregate-usage');
     const estimatedRow = indexLines.find((line) => line.test_id === 'estimated-usage');
 
-    const aggregateTiming = JSON.parse(
-      await readFile(runArtifactPath(testDir, aggregateRow, 'attempt-1', 'timing.json'), 'utf8'),
+    const aggregateMetrics = JSON.parse(
+      await readFile(runArtifactPath(testDir, aggregateRow, 'sample-1', 'metrics.json'), 'utf8'),
     );
-    const estimatedTiming = JSON.parse(
-      await readFile(runArtifactPath(testDir, estimatedRow, 'attempt-1', 'timing.json'), 'utf8'),
+    const estimatedMetrics = JSON.parse(
+      await readFile(runArtifactPath(testDir, estimatedRow, 'sample-1', 'metrics.json'), 'utf8'),
     );
     const runSummary = JSON.parse(await readFile(path.join(testDir, 'summary.json'), 'utf8'));
 
     MetricsArtifactWireSchema.parse(
       JSON.parse(
-        await readFile(runArtifactPath(testDir, aggregateRow, 'attempt-1', 'metrics.json'), 'utf8'),
+        await readFile(runArtifactPath(testDir, aggregateRow, 'sample-1', 'metrics.json'), 'utf8'),
       ),
     );
     MetricsArtifactWireSchema.parse(
       JSON.parse(
-        await readFile(runArtifactPath(testDir, estimatedRow, 'attempt-1', 'metrics.json'), 'utf8'),
+        await readFile(runArtifactPath(testDir, estimatedRow, 'sample-1', 'metrics.json'), 'utf8'),
       ),
     );
 
-    expect(aggregateTiming).toMatchObject({
-      token_usage: { input: 3, output: 4, reasoning: 0 },
-      total_tokens: 7,
-      cost_usd: null,
-      usage_sources: {
-        token_usage: 'aggregate',
-        total_tokens: 'aggregate',
-        cost: 'unavailable',
-        duration: 'unavailable',
-      },
+    expect(aggregateMetrics).toMatchObject({
+      tokens: { input: 3, output: 4, reasoning: 0, total: 7, source: 'aggregate' },
+      cost: { usd: null, source: 'unavailable' },
+      duration: { source: 'unavailable' },
     });
-    expect(estimatedTiming).toMatchObject({
-      token_usage: { input: 6, output: 7, reasoning: 0 },
-      total_tokens: 13,
-      cost_usd: 0.002,
-      usage_sources: {
-        token_usage: 'token_estimated',
-        total_tokens: 'token_estimated',
-        cost: 'token_estimated',
-        duration: 'unavailable',
-      },
+    expect(estimatedMetrics).toMatchObject({
+      tokens: { input: 6, output: 7, reasoning: 0, total: 13, source: 'token_estimated' },
+      cost: { usd: 0.002, source: 'token_estimated' },
+      duration: { source: 'unavailable' },
     });
-    expect(runSummary.timing).toMatchObject({
-      total_tokens: 20,
-      cost_usd: 0.002,
-      usage_sources: {
-        token_usage: 'aggregate',
-        total_tokens: 'aggregate',
-        cost: 'aggregate',
-        duration: 'unavailable',
-      },
+    expect(runSummary.metrics).toMatchObject({
+      tokens: { total: 20, source: 'aggregate' },
+      cost: { usd: 0.002, source: 'aggregate' },
+      duration: { source: 'unavailable' },
     });
   });
 
@@ -1976,18 +1934,18 @@ describe('writeArtifactsFromResults', () => {
     const [indexLine] = await readIndexLines(paths.indexPath);
     const rowDir = expectRowDir(indexLine, 'raw-log-case');
 
-    const copiedRawLogPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'provider.log');
+    const copiedRawLogPath = runArtifactPath(testDir, indexLine, 'sample-1', 'provider.log');
     await expect(readFile(copiedRawLogPath, 'utf8')).rejects.toThrow();
 
-    const transcriptPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript-raw.jsonl');
+    const transcriptPath = runArtifactPath(testDir, indexLine, 'sample-1', 'transcript-raw.jsonl');
     await expect(readFile(transcriptPath, 'utf8')).resolves.toBe(rawLog);
     await expect(readFile(rawLogPath, 'utf8')).resolves.toBe(rawLog);
     await expect(
-      readFile(path.join(testDir, rowDir, 'attempt-1', 'transcript.jsonl'), 'utf8'),
+      readFile(path.join(testDir, rowDir, 'sample-1', 'transcript.jsonl'), 'utf8'),
     ).rejects.toThrow();
 
     const transcript = JSON.parse(
-      await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript.json'), 'utf8'),
+      await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'transcript.json'), 'utf8'),
     );
     expect(transcript.turns[0]).toMatchObject({
       v: 1,
@@ -1997,8 +1955,8 @@ describe('writeArtifactsFromResults', () => {
     });
 
     expect(indexLine.raw_provider_log_path).toBeUndefined();
-    expect(indexLine.transcript_path).toBe(`${rowDir}/attempt-1/transcript.json`);
-    expect(indexLine.transcript_raw_path).toBe(`${rowDir}/attempt-1/transcript-raw.jsonl`);
+    expect(indexLine.transcript_path).toBe(`${rowDir}/sample-1/transcript.json`);
+    expect(indexLine.transcript_raw_path).toBe(`${rowDir}/sample-1/transcript-raw.jsonl`);
     expect(indexLine).not.toHaveProperty('transcript_json_path');
   });
 
@@ -2043,7 +2001,7 @@ describe('writeArtifactsFromResults', () => {
     expect(JSON.stringify(indexLine)).not.toContain('api_key');
 
     const transcriptJson = await readFile(
-      runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript.json'),
+      runArtifactPath(testDir, indexLine, 'sample-1', 'transcript.json'),
       'utf8',
     );
     expect(transcriptJson).not.toContain('secret');
@@ -2062,12 +2020,12 @@ describe('writeArtifactsFromResults', () => {
     const paths = await writeArtifactsFromResults(results, testDir);
     const [indexLine] = await readIndexLines(paths.indexPath);
 
-    const transcriptPath = runArtifactPath(testDir, indexLine, 'attempt-1', 'transcript-raw.jsonl');
+    const transcriptPath = runArtifactPath(testDir, indexLine, 'sample-1', 'transcript-raw.jsonl');
     await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow();
 
     expect(indexLine).not.toHaveProperty('transcript_path');
     expect(indexLine.metrics_path).toBe(
-      `${expectRowDir(indexLine, 'no-transcript-case')}/attempt-1/metrics.json`,
+      `${expectRowDir(indexLine, 'no-transcript-case')}/sample-1/metrics.json`,
     );
     expect(indexLine.artifact_pointers).toBeUndefined();
   });
@@ -2096,11 +2054,11 @@ describe('writeArtifactsFromResults', () => {
     const [indexLine] = await readIndexLines(paths.indexPath);
     const rowDir = expectRowDir(indexLine, 'shared-id');
 
-    expect(indexLine.grading_path).toBe(`${rowDir}/attempt-1/grading.json`);
+    expect(indexLine.grading_path).toBe(`${rowDir}/sample-1/grading.json`);
     expect(rowDir).not.toContain('/');
 
     const grading: GradingArtifact = JSON.parse(
-      await readFile(runArtifactPath(testDir, indexLine, 'attempt-1', 'grading.json'), 'utf8'),
+      await readFile(runArtifactPath(testDir, indexLine, 'sample-1', 'grading.json'), 'utf8'),
     );
 
     expect(grading.assertion_results[0].text).toBe('baseline-check');
@@ -2119,11 +2077,11 @@ describe('writeArtifactsFromResults', () => {
     const rowDirs = indexLines.map((line) => expectRowDir(line, 'shared-id'));
     expect(new Set(rowDirs).size).toBe(2);
     expect(indexLines.map((line) => line.grading_path)).toEqual(
-      rowDirs.map((rowDir) => `${rowDir}/attempt-1/grading.json`),
+      rowDirs.map((rowDir) => `${rowDir}/sample-1/grading.json`),
     );
     const answers = await Promise.all(
       indexLines.map((line) =>
-        readFile(runArtifactPath(testDir, line, 'attempt-1', 'outputs', 'answer.md'), 'utf8'),
+        readFile(runArtifactPath(testDir, line, 'sample-1', 'outputs', 'answer.md'), 'utf8'),
       ),
     );
     expect(answers.sort()).toEqual(['alpha answer', 'beta answer']);
@@ -2211,7 +2169,7 @@ describe('writeArtifactsFromResults', () => {
         id: 'alpha',
         key: 'alpha',
         dimensions: {
-          runId: 'attempt-1',
+          runId: 'sample-1',
           suite: 'variant-suite',
           evalPath: 'evals/variant.eval.yaml',
           testId: 'shared-id',
@@ -2233,7 +2191,7 @@ describe('writeArtifactsFromResults', () => {
         id: 'beta',
         key: 'beta',
         dimensions: {
-          runId: 'attempt-1',
+          runId: 'sample-1',
           suite: 'variant-suite',
           evalPath: 'evals/variant.eval.yaml',
           testId: 'shared-id',
@@ -2570,11 +2528,11 @@ describe('writeArtifacts (from JSONL file)', () => {
     const artifactEntries = await readdir(paths.testArtifactDir);
     const [indexLine] = await readIndexLines(paths.indexPath);
     expect(artifactEntries).toContain(expectRowDir(indexLine, 'from-file'));
-    expect(artifactEntries).toContain(RESULT_INDEX_FILENAME);
+    expect(artifactEntries).toContain('.internal');
 
     const summary: RunSummaryArtifact = JSON.parse(await readFile(paths.summaryPath, 'utf8'));
-    expect(summary.manifest_path).toBe(RESULT_INDEX_FILENAME);
-    expect(summary.timing.duration_ms).toBe(12000);
-    expect(summary.timing.total_tokens).toBe(700);
+    expect(summary.index_path).toBe('.internal/index.jsonl');
+    expect(summary.metrics.duration.total_ms).toBe(12000);
+    expect(summary.metrics.tokens.total).toBe(700);
   });
 });
diff --git a/apps/cli/test/commands/eval/bundle.test.ts b/apps/cli/test/commands/eval/bundle.test.ts
index 6687eb02a..9b0291178 100644
--- a/apps/cli/test/commands/eval/bundle.test.ts
+++ b/apps/cli/test/commands/eval/bundle.test.ts
@@ -166,7 +166,7 @@ tests: ../data/cases.yaml
 
     expect(run.exitCode).toBe(0);
     expect(run.stdout).toContain('RESULT: PASS');
-    await expectFileExists(path.join(bundleDir, 'run', 'index.jsonl'));
+    await expectFileExists(path.join(bundleDir, 'run', '.internal', 'index.jsonl'));
   }, 60_000);
 
   it('preserves inline eval target object definitions in the bundled target graph', async () => {
diff --git a/apps/cli/test/commands/eval/pipeline/bench.test.ts b/apps/cli/test/commands/eval/pipeline/bench.test.ts
index 766ad20ed..3a2a823ca 100644
--- a/apps/cli/test/commands/eval/pipeline/bench.test.ts
+++ b/apps/cli/test/commands/eval/pipeline/bench.test.ts
@@ -76,7 +76,7 @@ describe('pipeline bench', () => {
     expect(grading.assertion_results.length).toBeGreaterThan(0);
     expect(grading.graders).toHaveLength(2);
 
-    const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
+    const indexContent = await readFile(join(OUT_DIR, '.internal', 'index.jsonl'), 'utf8');
     const lines = indexContent
       .trim()
       .split('\n')
@@ -106,7 +106,7 @@ describe('pipeline bench', () => {
     const { execa } = await import('execa');
     await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
-    const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
+    const indexContent = await readFile(join(OUT_DIR, '.internal', 'index.jsonl'), 'utf8');
     const entry = JSON.parse(indexContent.trim().split('\n')[0]);
     expect(entry.experiment).toBe('without_skills');
 
@@ -118,7 +118,7 @@ describe('pipeline bench', () => {
     const { execa } = await import('execa');
     await execa('bun', [CLI_ENTRY, 'pipeline', 'bench', OUT_DIR]);
 
-    const indexContent = await readFile(join(OUT_DIR, 'index.jsonl'), 'utf8');
+    const indexContent = await readFile(join(OUT_DIR, '.internal', 'index.jsonl'), 'utf8');
     const entry = JSON.parse(indexContent.trim().split('\n')[0]);
     expect(entry.experiment).toBeUndefined();
 
diff --git a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
index 2d013b2b9..fc755cc41 100644
--- a/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
+++ b/apps/cli/test/commands/eval/pipeline/pipeline-e2e.test.ts
@@ -61,7 +61,7 @@ describe('eval pipeline e2e', () => {
       expect(grading.graders).toHaveLength(2);
       expect(grading.summary.pass_rate).toBeGreaterThan(0);
 
-      const indexContent = await readFile(join(outDir, 'index.jsonl'), 'utf8');
+      const indexContent = await readFile(join(outDir, '.internal', 'index.jsonl'), 'utf8');
       const indexLines = indexContent
         .trim()
         .split('\n')
diff --git a/apps/cli/test/commands/eval/result-layout.test.ts b/apps/cli/test/commands/eval/result-layout.test.ts
index 40a8783b3..355754179 100644
--- a/apps/cli/test/commands/eval/result-layout.test.ts
+++ b/apps/cli/test/commands/eval/result-layout.test.ts
@@ -5,6 +5,7 @@ import path from 'node:path';
 
 import {
   RESULT_INDEX_FILENAME,
+  RUN_INTERNAL_DIRNAME,
   buildDefaultRunDir,
   buildDefaultRunDirFromName,
   discoverRunManifestPaths,
@@ -51,7 +52,8 @@ describe('result layout', () => {
   it('resolves the canonical index.jsonl file in a run directory', () => {
     const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-layout-test-'));
     try {
-      const indexPath = path.join(tempDir, RESULT_INDEX_FILENAME);
+      const indexPath = path.join(tempDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
+      mkdirSync(path.dirname(indexPath), { recursive: true });
       writeFileSync(indexPath, '{"test_id":"case"}\n');
 
       expect(resolveExistingRunPrimaryPath(tempDir)).toBe(indexPath);
@@ -76,12 +78,13 @@ describe('result layout', () => {
     }
   });
 
-  it('treats the root index.jsonl as authoritative when legacy nested bundles also exist', () => {
+  it('treats the internal index.jsonl as authoritative when legacy nested bundles also exist', () => {
     const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-layout-test-'));
     try {
       const nestedBundleDir = path.join(tempDir, 'target-a');
       mkdirSync(nestedBundleDir, { recursive: true });
-      const rootIndexPath = path.join(tempDir, RESULT_INDEX_FILENAME);
+      const rootIndexPath = path.join(tempDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
+      mkdirSync(path.dirname(rootIndexPath), { recursive: true });
       writeFileSync(rootIndexPath, '{"test_id":"root"}\n');
       writeFileSync(path.join(nestedBundleDir, RESULT_INDEX_FILENAME), '{"test_id":"legacy"}\n');
 
diff --git a/apps/cli/test/commands/eval/run-cache.test.ts b/apps/cli/test/commands/eval/run-cache.test.ts
index 2c5ccd2ba..470622eff 100644
--- a/apps/cli/test/commands/eval/run-cache.test.ts
+++ b/apps/cli/test/commands/eval/run-cache.test.ts
@@ -10,7 +10,7 @@ describe('resolveRunCacheFile', () => {
       timestamp: '',
     };
     expect(resolveRunCacheFile(cache)).toBe(
-      path.join('/results/2026-03-24T00-00-00-000Z', 'index.jsonl'),
+      path.join('/results/2026-03-24T00-00-00-000Z', '.internal', 'index.jsonl'),
     );
   });
 
@@ -29,7 +29,7 @@ describe('resolveRunCacheFile', () => {
       timestamp: '',
     };
     expect(resolveRunCacheFile(cache)).toBe(
-      path.join('/results/2026-03-24T00-00-00-000Z', 'index.jsonl'),
+      path.join('/results/2026-03-24T00-00-00-000Z', '.internal', 'index.jsonl'),
     );
   });
 
diff --git a/apps/cli/test/commands/grade/grade-prepared.test.ts b/apps/cli/test/commands/grade/grade-prepared.test.ts
index 11584a69b..ad74988ad 100644
--- a/apps/cli/test/commands/grade/grade-prepared.test.ts
+++ b/apps/cli/test/commands/grade/grade-prepared.test.ts
@@ -169,7 +169,7 @@ describe('agentv grade prepared attempts', () => {
       workspace_path: path.join(preparedDir, 'workspace'),
       manifest_path: path.join(preparedDir, 'agentv_prepare.json'),
       output_dir: runDir,
-      index_path: path.join(runDir, 'index.jsonl'),
+      index_path: path.join(runDir, '.internal', 'index.jsonl'),
     });
     expect(await exists(targetMarker)).toBe(false);
 
@@ -177,7 +177,9 @@ describe('agentv grade prepared attempts', () => {
     expect(graderPayload.workspace_path).toBe(path.join(preparedDir, 'workspace'));
     expect(graderPayload.file_changes).toContain('+manual edit');
 
-    const row = JSON.parse((await readFile(path.join(runDir, 'index.jsonl'), 'utf8')).trim());
+    const row = JSON.parse(
+      (await readFile(path.join(runDir, '.internal', 'index.jsonl'), 'utf8')).trim(),
+    );
     expect(row).toMatchObject({
       test_id: 'case-1',
       target: 'codex',
@@ -197,7 +199,7 @@ describe('agentv grade prepared attempts', () => {
     });
     expect(typeof row.metadata.prepared_attempt.baseline_commit).toBe('string');
 
-    expect(row.file_changes_path).toMatch(/\/attempt-1\/outputs\/file_changes\.diff$/);
+    expect(row.file_changes_path).toMatch(/\/sample-1\/outputs\/file_changes\.diff$/);
     await expect(readFile(path.join(runDir, row.file_changes_path), 'utf8')).resolves.toContain(
       '+manual edit',
     );
@@ -275,7 +277,9 @@ describe('agentv grade prepared attempts', () => {
     );
 
     expect(await exists(targetMarker)).toBe(false);
-    const row = JSON.parse((await readFile(path.join(runDir, 'index.jsonl'), 'utf8')).trim());
+    const row = JSON.parse(
+      (await readFile(path.join(runDir, '.internal', 'index.jsonl'), 'utf8')).trim(),
+    );
     expect(row.score).toBe(0);
     expect(row.scores[0]).toMatchObject({
       name: 'expected-tool-sequence',
@@ -382,7 +386,9 @@ describe('agentv grade prepared attempts', () => {
     });
     expect(await exists(targetMarker)).toBe(false);
 
-    const row = JSON.parse((await readFile(path.join(runDir, 'index.jsonl'), 'utf8')).trim());
+    const row = JSON.parse(
+      (await readFile(path.join(runDir, '.internal', 'index.jsonl'), 'utf8')).trim(),
+    );
     const answerPath = row.answer_path ?? row.response_path ?? row.output_path;
     expect(typeof answerPath).toBe('string');
     expect((await readFile(path.join(runDir, answerPath), 'utf8')).trim()).toBe('done');
diff --git a/apps/cli/test/commands/results/export-e2e-providers.test.ts b/apps/cli/test/commands/results/export-e2e-providers.test.ts
index 574e84c00..89ba73a04 100644
--- a/apps/cli/test/commands/results/export-e2e-providers.test.ts
+++ b/apps/cli/test/commands/results/export-e2e-providers.test.ts
@@ -213,7 +213,7 @@ function toJsonl(...records: object[]): string {
 }
 
 function readIndex(outputDir: string): IndexArtifactEntry[] {
-  return readFileSync(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8')
+  return readFileSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), 'utf8')
     .trim()
     .split('\n')
     .filter(Boolean)
@@ -238,7 +238,7 @@ function runArtifactDir(
   outputDir: string,
   record: { suite?: string; target?: string; test_id?: string },
 ): string {
-  return path.join(outputDir, findIndexEntry(outputDir, record).result_dir, 'attempt-1');
+  return path.join(outputDir, findIndexEntry(outputDir, record).result_dir, 'sample-1');
 }
 
 describe('export e2e — multi-provider metrics verification', () => {
@@ -254,7 +254,7 @@ describe('export e2e — multi-provider metrics verification', () => {
 
   // ── Timing artifact tests ──────────────────────────────────────────────
 
-  describe('<test-id>/timing.json — per-test timing', () => {
+  describe('<test-id>/metrics.json — per-test timing', () => {
     it('should include reasoning tokens in token_usage', async () => {
       const outputDir = path.join(tempDir, 'claude');
       const content = toJsonl(CLAUDE_CLI_RESULT);
@@ -263,14 +263,14 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       const timing: TimingArtifact = JSON.parse(
         readFileSync(
-          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'),
           'utf8',
         ),
       );
 
-      expect(timing.token_usage.input).toBe(2000);
-      expect(timing.token_usage.output).toBe(800);
-      expect(timing.token_usage.reasoning).toBe(1500);
+      expect(timing.tokens.input).toBe(2000);
+      expect(timing.tokens.output).toBe(800);
+      expect(timing.tokens.reasoning).toBe(1500);
     });
 
     it('should write independent timing files for multiple providers', async () => {
@@ -281,20 +281,20 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       const claudeTiming: TimingArtifact = JSON.parse(
         readFileSync(
-          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'),
           'utf8',
         ),
       );
       const codexTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'metrics.json'), 'utf8'),
       );
       const copilotTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'),
       );
 
-      expect(claudeTiming.token_usage.reasoning).toBe(1500);
-      expect(codexTiming.token_usage.reasoning).toBe(2500);
-      expect(copilotTiming.token_usage.reasoning).toBe(0);
+      expect(claudeTiming.tokens.reasoning).toBe(1500);
+      expect(codexTiming.tokens.reasoning).toBe(2500);
+      expect(copilotTiming.tokens.reasoning).toBe(0);
     });
 
     it('should compute total_tokens as input + output (not including reasoning)', async () => {
@@ -305,12 +305,12 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       const timing: TimingArtifact = JSON.parse(
         readFileSync(
-          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'),
           'utf8',
         ),
       );
 
-      expect(timing.total_tokens).toBe(2800);
+      expect(timing.tokens.total).toBe(2800);
     });
 
     it('should preserve duration_ms per test result', async () => {
@@ -320,11 +320,11 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const timing: TimingArtifact = JSON.parse(
-        readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, CODEX_RESULT), 'metrics.json'), 'utf8'),
       );
 
-      expect(timing.duration_ms).toBe(12000);
-      expect(timing.total_duration_seconds).toBe(12);
+      expect(timing.duration.total_ms).toBe(12000);
+      expect(timing.duration.total_seconds).toBe(12);
     });
 
     it('should handle results with no token_usage gracefully', async () => {
@@ -334,14 +334,14 @@ describe('export e2e — multi-provider metrics verification', () => {
       await exportResults('test.jsonl', content, outputDir);
 
       const timing: TimingArtifact = JSON.parse(
-        readFileSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, MINIMAL_RESULT), 'metrics.json'), 'utf8'),
       );
 
-      expect(timing.total_tokens).toBe(0);
-      expect(timing.duration_ms).toBe(0);
-      expect(timing.token_usage.input).toBe(0);
-      expect(timing.token_usage.output).toBe(0);
-      expect(timing.token_usage.reasoning).toBe(0);
+      expect(timing.tokens.total).toBe(0);
+      expect(timing.duration.total_ms).toBe(0);
+      expect(timing.tokens.input).toBe(0);
+      expect(timing.tokens.output).toBe(0);
+      expect(timing.tokens.reasoning).toBe(0);
     });
 
     it('should handle providers with and without reasoning tokens', async () => {
@@ -352,16 +352,16 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       const claudeTiming: TimingArtifact = JSON.parse(
         readFileSync(
-          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'timing.json'),
+          path.join(runArtifactDir(outputDir, CLAUDE_CLI_RESULT), 'metrics.json'),
           'utf8',
         ),
       );
       const copilotTiming: TimingArtifact = JSON.parse(
-        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'timing.json'), 'utf8'),
+        readFileSync(path.join(runArtifactDir(outputDir, COPILOT_RESULT), 'metrics.json'), 'utf8'),
       );
 
-      expect(claudeTiming.token_usage.reasoning).toBe(1500);
-      expect(copilotTiming.token_usage.reasoning).toBe(0);
+      expect(claudeTiming.tokens.reasoning).toBe(1500);
+      expect(copilotTiming.tokens.reasoning).toBe(0);
     });
   });
 
@@ -644,7 +644,7 @@ describe('export e2e — multi-provider metrics verification', () => {
 
       // Verify all artifact files exist
       expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true);
-      expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false);
+      expect(existsSync(path.join(outputDir, 'metrics.json'))).toBe(false);
 
       // Verify benchmark
       const benchmark: RunSummaryArtifact = JSON.parse(
@@ -713,16 +713,16 @@ describe('export e2e — multi-provider metrics verification', () => {
         readFileSync(
           path.join(
             runArtifactDir(outputDir, { ...record, target: 'mock' as const }),
-            'timing.json',
+            'metrics.json',
           ),
           'utf8',
         ),
       );
 
-      expect(timing.token_usage.input).toBe(100);
-      expect(timing.token_usage.output).toBe(50);
-      expect(timing.token_usage.reasoning).toBe(75);
-      expect(timing.duration_ms).toBe(1000);
+      expect(timing.tokens.input).toBe(100);
+      expect(timing.tokens.output).toBe(50);
+      expect(timing.tokens.reasoning).toBe(75);
+      expect(timing.duration.total_ms).toBe(1000);
     });
   });
 });
diff --git a/apps/cli/test/commands/results/export.test.ts b/apps/cli/test/commands/results/export.test.ts
index efc93cd3b..6a8014d46 100644
--- a/apps/cli/test/commands/results/export.test.ts
+++ b/apps/cli/test/commands/results/export.test.ts
@@ -165,7 +165,7 @@ function toJsonl(...records: object[]): string {
 }
 
 function readIndex(outputDir: string): IndexArtifactEntry[] {
-  return readFileSync(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8')
+  return readFileSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), 'utf8')
     .trim()
     .split('\n')
     .filter(Boolean)
@@ -197,7 +197,7 @@ function runArtifactDir(
   outputDir: string,
   record: { suite?: string; target?: string; test_id?: string },
 ): string {
-  return path.join(artifactDir(outputDir, record), 'attempt-1');
+  return path.join(artifactDir(outputDir, record), 'sample-1');
 }
 
 function readAnswer(
@@ -220,8 +220,8 @@ describe('results export', () => {
 
   it('loadExportSource resolves run workspaces to index.jsonl', async () => {
     const runDir = path.join(tempDir, '2026-03-18T10-00-00-000Z');
-    mkdirSync(runDir, { recursive: true });
-    const sourceFile = path.join(runDir, RESULT_INDEX_FILENAME);
+    mkdirSync(path.join(runDir, '.internal'), { recursive: true });
+    const sourceFile = path.join(runDir, '.internal/index.jsonl');
     writeFileSync(sourceFile, toJsonl(RESULT_FULL));
 
     const { sourceFile: loadedSource, results } = await loadExportSource(runDir, tempDir);
@@ -234,7 +234,7 @@ describe('results export', () => {
   it('deriveOutputDir uses the run directory name for manifest inputs', () => {
     const outputDir = deriveOutputDir(
       tempDir,
-      path.join(tempDir, '2026-03-18T10-00-00-000Z', 'index.jsonl'),
+      path.join(tempDir, '2026-03-18T10-00-00-000Z', '.internal', 'index.jsonl'),
     );
     expect(outputDir).toBe(
       path.join(tempDir, '.agentv', 'results', 'export', '2026-03-18T10-00-00-000Z'),
@@ -250,6 +250,7 @@ describe('results export', () => {
         'results',
         'with-skills',
         '2026-03-18T10-00-00-000Z',
+        '.internal',
         RESULT_INDEX_FILENAME,
       ),
     );
@@ -299,7 +300,7 @@ describe('results export', () => {
     });
     expect(first.entries[0].artifact_refs).toMatchObject({
       status: 'planned_export',
-      timing_path: expect.stringMatching(/^test-private--[a-f0-9]{12}\/attempt-1\/timing\.json$/),
+      metrics_path: expect.stringMatching(/^test-private--[a-f0-9]{12}\/sample-1\/metrics\.json$/),
     });
     expect(first.entries[0].artifact_refs).not.toHaveProperty('input_path');
     expect(first.entries[0].artifact_refs).not.toHaveProperty('output_path');
@@ -380,20 +381,19 @@ describe('results export', () => {
       status: 'planned_export',
       result_dir: resultDir,
       summary_path: `${resultDir}/summary.json`,
-      grading_path: `${resultDir}/attempt-1/grading.json`,
-      timing_path: `${resultDir}/attempt-1/timing.json`,
-      metrics_path: `${resultDir}/attempt-1/metrics.json`,
-      output_path: `${resultDir}/attempt-1/outputs/answer.md`,
-      answer_path: `${resultDir}/attempt-1/outputs/answer.md`,
-      transcript_path: `${resultDir}/attempt-1/transcript.json`,
-      transcript_raw_path: `${resultDir}/attempt-1/transcript-raw.jsonl`,
+      grading_path: `${resultDir}/sample-1/grading.json`,
+      metrics_path: `${resultDir}/sample-1/metrics.json`,
+      output_path: `${resultDir}/sample-1/outputs/answer.md`,
+      answer_path: `${resultDir}/sample-1/outputs/answer.md`,
+      transcript_path: `${resultDir}/sample-1/transcript.json`,
+      transcript_raw_path: `${resultDir}/sample-1/transcript-raw.jsonl`,
     });
     expect(bundle.entries[0].artifact_refs).not.toHaveProperty('trace_path');
     expect(bundle.entries[0].artifact_refs).not.toHaveProperty('input_path');
     expect(bundle.entries[0].trace).not.toHaveProperty('envelope_ref');
     expect(bundle.entries[0].trace_envelope.artifacts).toBeDefined();
     expect(bundle.entries[0].trace_envelope.artifacts).not.toHaveProperty('trace_path');
-    expect(bundle.entries[0].feedback.grading_path).toBe(`${resultDir}/attempt-1/grading.json`);
+    expect(bundle.entries[0].feedback.grading_path).toBe(`${resultDir}/sample-1/grading.json`);
     expect(bundle.entries[0].raw_content).toBeDefined();
     expect(bundle.entries[0].feedback.scores?.[0]).toHaveProperty('evidence');
     expect(serialized).toContain('SECRET_PROMPT_TEXT');
@@ -413,7 +413,7 @@ describe('results export', () => {
     expect(existsSync(summaryPath)).toBe(true);
 
     const benchmark: RunSummaryArtifact = JSON.parse(readFileSync(summaryPath, 'utf8'));
-    expect(benchmark.manifest_path).toBe(RESULT_INDEX_FILENAME);
+    expect(benchmark.index_path).toBe('.internal/index.jsonl');
     expect(benchmark.metadata.eval_file).toBe('eval_2026-03-18.jsonl');
     expect(benchmark.metadata.timestamp).toBe('2026-03-18T10:00:01.000Z');
     // artifact-writer uses string[] for tests_run, not a count
@@ -437,7 +437,7 @@ describe('results export', () => {
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const indexPath = path.join(outputDir, RESULT_INDEX_FILENAME);
+    const indexPath = path.join(outputDir, '.internal', RESULT_INDEX_FILENAME);
     expect(existsSync(indexPath)).toBe(true);
 
     const entries = readFileSync(indexPath, 'utf8')
@@ -454,13 +454,12 @@ describe('results export', () => {
       execution_status: 'ok',
       result_dir: rowDir,
       summary_path: `${rowDir}/summary.json`,
-      grading_path: `${rowDir}/attempt-1/grading.json`,
-      timing_path: `${rowDir}/attempt-1/timing.json`,
-      metrics_path: `${rowDir}/attempt-1/metrics.json`,
-      output_path: `${rowDir}/attempt-1/outputs/answer.md`,
-      answer_path: `${rowDir}/attempt-1/outputs/answer.md`,
-      transcript_path: `${rowDir}/attempt-1/transcript.json`,
-      transcript_raw_path: `${rowDir}/attempt-1/transcript-raw.jsonl`,
+      grading_path: `${rowDir}/sample-1/grading.json`,
+      metrics_path: `${rowDir}/sample-1/metrics.json`,
+      output_path: `${rowDir}/sample-1/outputs/answer.md`,
+      answer_path: `${rowDir}/sample-1/outputs/answer.md`,
+      transcript_path: `${rowDir}/sample-1/transcript.json`,
+      transcript_raw_path: `${rowDir}/sample-1/transcript-raw.jsonl`,
     });
     expect(entries[0]).not.toHaveProperty('input_path');
     expect(entries[0].projection_identity).toMatchObject({
@@ -575,12 +574,13 @@ describe('results export', () => {
   it('exports generated test bundle refs and files from source manifests', async () => {
     const sourceDir = path.join(tempDir, 'source-run');
     mkdirSync(path.join(sourceDir, 'case', 'test'), { recursive: true });
+    mkdirSync(path.join(sourceDir, '.internal'), { recursive: true });
     writeFileSync(
       path.join(sourceDir, 'case', 'test', 'EVAL.yaml'),
       'tests:\n  - id: test-greeting\n',
     );
     writeFileSync(path.join(sourceDir, 'case', 'test', 'targets.yaml'), 'targets: []\n');
-    const sourceFile = path.join(sourceDir, RESULT_INDEX_FILENAME);
+    const sourceFile = path.join(sourceDir, '.internal/index.jsonl');
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl({
       ...RESULT_FULL,
@@ -618,12 +618,13 @@ describe('results export', () => {
   it('exports legacy task_dir bundles as new test_dir artifacts', async () => {
     const sourceDir = path.join(tempDir, 'legacy-run');
     mkdirSync(path.join(sourceDir, 'case', 'task'), { recursive: true });
+    mkdirSync(path.join(sourceDir, '.internal'), { recursive: true });
     writeFileSync(
       path.join(sourceDir, 'case', 'task', 'EVAL.yaml'),
       'tests:\n  - id: test-greeting\n',
     );
     writeFileSync(path.join(sourceDir, 'case', 'task', 'targets.yaml'), 'targets: []\n');
-    const sourceFile = path.join(sourceDir, RESULT_INDEX_FILENAME);
+    const sourceFile = path.join(sourceDir, '.internal/index.jsonl');
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl({
       ...RESULT_FULL,
@@ -647,8 +648,8 @@ describe('results export', () => {
 
   it('preserves source bundle refs in dry-run projection inputs', async () => {
     const sourceDir = path.join(tempDir, 'source-run');
-    mkdirSync(sourceDir, { recursive: true });
-    const sourceFile = path.join(sourceDir, RESULT_INDEX_FILENAME);
+    mkdirSync(path.join(sourceDir, '.internal'), { recursive: true });
+    const sourceFile = path.join(sourceDir, '.internal/index.jsonl');
     writeFileSync(
       sourceFile,
       toJsonl({
@@ -694,21 +695,21 @@ describe('results export', () => {
     expect(readAnswer(outputDir, RESULT_FULL)).toBe('Hello, Alice!');
   });
 
-  it('should create per-test timing.json with run timing', async () => {
+  it('should create per-test metrics.json with run timing', async () => {
     const outputDir = path.join(tempDir, 'output');
     const content = toJsonl(RESULT_FULL, RESULT_PARTIAL);
 
     await exportResults('test.jsonl', content, outputDir);
 
-    const timingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json');
+    const timingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'metrics.json');
     expect(existsSync(timingPath)).toBe(true);
 
     const timing: TimingArtifact = JSON.parse(readFileSync(timingPath, 'utf8'));
-    expect(timing.total_tokens).toBe(1500);
-    expect(timing.duration_ms).toBe(3500);
-    expect(timing.token_usage).toHaveProperty('input');
-    expect(timing.token_usage).toHaveProperty('output');
-    expect(timing.token_usage).toHaveProperty('reasoning');
+    expect(timing.tokens.total).toBe(1500);
+    expect(timing.duration.total_ms).toBe(3500);
+    expect(timing.tokens).toHaveProperty('input');
+    expect(timing.tokens).toHaveProperty('output');
+    expect(timing.tokens).toHaveProperty('reasoning');
   });
 
   it('should create per-test artifact directories', async () => {
@@ -750,7 +751,7 @@ describe('results export', () => {
     expect(grading.graders?.[0].name).toBe('greeting_quality');
     expect(grading.graders?.[0].type).toBe('llm-grader');
 
-    const perTestTimingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'timing.json');
+    const perTestTimingPath = path.join(runArtifactDir(outputDir, RESULT_FULL), 'metrics.json');
     expect(existsSync(perTestTimingPath)).toBe(true);
   });
 
@@ -803,8 +804,8 @@ describe('results export', () => {
     await exportResults('test.jsonl', content, outputDir);
 
     expect(existsSync(path.join(outputDir, 'summary.json'))).toBe(true);
-    expect(existsSync(path.join(outputDir, RESULT_INDEX_FILENAME))).toBe(true);
-    expect(existsSync(path.join(outputDir, 'timing.json'))).toBe(false);
+    expect(existsSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME))).toBe(true);
+    expect(existsSync(path.join(outputDir, 'metrics.json'))).toBe(false);
     expect(existsSync(path.join(runArtifactDir(outputDir, RESULT_FULL), 'grading.json'))).toBe(
       true,
     );
@@ -837,7 +838,7 @@ describe('results export', () => {
 
     const answerPath = path.join(
       artifactDir(outputDir, RESULT_DIFFERENT_TARGET),
-      'attempt-1',
+      'sample-1',
       'outputs',
       'answer.md',
     );
diff --git a/apps/cli/test/commands/results/report.test.ts b/apps/cli/test/commands/results/report.test.ts
index e7c2e0f5d..41e8b4274 100644
--- a/apps/cli/test/commands/results/report.test.ts
+++ b/apps/cli/test/commands/results/report.test.ts
@@ -118,7 +118,7 @@ describe('results report', () => {
       { evalFile: 'evals/demo.eval.yaml' },
     );
 
-    const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
+    const indexPath = path.join(runDir, '.internal', RESULT_INDEX_FILENAME);
     const lines = readFileSync(indexPath, 'utf8')
       .trim()
       .split('\n')
diff --git a/apps/cli/test/commands/results/validate.test.ts b/apps/cli/test/commands/results/validate.test.ts
index 4a47e7cbf..9bdf30406 100644
--- a/apps/cli/test/commands/results/validate.test.ts
+++ b/apps/cli/test/commands/results/validate.test.ts
@@ -37,7 +37,7 @@ describe('results validate', () => {
       writeFileSync(
         path.join(runDir, 'summary.json'),
         `${JSON.stringify({
-          manifest_path: 'index.jsonl',
+          index_path: 'index.jsonl',
           schema_version: 1,
           metadata: {
             experiment: 'with-skills',
@@ -76,12 +76,12 @@ describe('results validate', () => {
           scores: [{ name: 'quality', type: 'llm', score: 1, verdict: 'pass' }],
           execution_status: 'ok',
           summary_path: 'test-greeting/summary.json',
-          trace_path: 'test-greeting/attempt-1/trace.json',
+          trace_path: 'test-greeting/sample-1/trace.json',
           artifact_pointers: {
             trace: {
               ref: 'agentv/artifacts/v1',
-              key: 'traces/test-greeting/attempt-1/trace.json',
-              path: 'test-greeting/attempt-1/trace.json',
+              key: 'traces/test-greeting/sample-1/trace.json',
+              path: 'test-greeting/sample-1/trace.json',
             },
           },
         })}\n`,
diff --git a/apps/cli/test/commands/runs/rerun.test.ts b/apps/cli/test/commands/runs/rerun.test.ts
index facc3c3f1..f91963d53 100644
--- a/apps/cli/test/commands/runs/rerun.test.ts
+++ b/apps/cli/test/commands/runs/rerun.test.ts
@@ -241,7 +241,10 @@ describe('agentv runs rerun', () => {
       },
     });
 
-    const answerPath = path.join(path.dirname(indexPath), String(rows[0].answer_path));
+    const answerPath = path.join(
+      path.dirname(path.dirname(indexPath)),
+      String(rows[0].answer_path),
+    );
     const answer = await readFile(answerPath, 'utf8');
     expect(answer).toContain('Alpha answer');
     expect(answer).not.toContain('Captured answer');
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
index a89fabd8b..9ad40075c 100644
--- a/apps/cli/test/eval.integration.test.ts
+++ b/apps/cli/test/eval.integration.test.ts
@@ -21,6 +21,15 @@ const __dirname = path.dirname(__filename);
 const projectRoot = path.resolve(__dirname, '../../..');
 const CLI_ENTRY = path.join(projectRoot, 'apps/cli/src/cli.ts');
 const MOCK_RUNNER = path.join(projectRoot, 'apps/cli/test/fixtures/mock-run-evaluation.ts');
+
+function runIndexPath(runDir: string): string {
+  return path.join(runDir, '.internal', 'index.jsonl'); // <runDir>/.internal/index.jsonl
+}
+
+function runDirFromIndexPath(indexPath: string): string {
+  return path.dirname(path.dirname(indexPath)); // drop the file name, then its parent dir
+}
+
 async function createFixture(): Promise<EvalFixture> {
   const baseDir = await mkdtemp(path.join(tmpdir(), 'agentv-cli-test-'));
   const suiteDir = path.join(baseDir, 'suite');
@@ -332,7 +341,7 @@ describe('agentv eval CLI', () => {
       ]);
 
       expect(exitCode).toBe(0);
-      const indexPath = path.join(outputDir, 'index.jsonl');
+      const indexPath = runIndexPath(outputDir);
       expect(extractOutputPath(stdout)).toBe(indexPath);
       expect(stdout).toContain(`Artifact directory: ${outputDir}`);
 
@@ -343,7 +352,7 @@ describe('agentv eval CLI', () => {
         const resultDir = row.result_dir as string;
         expect(resultDir).not.toContain('/');
         await expectFileExists(path.join(outputDir, resultDir, 'summary.json'));
-        await expectFileExists(path.join(outputDir, resultDir, 'attempt-1', 'grading.json'));
+        await expectFileExists(path.join(outputDir, resultDir, 'sample-1', 'grading.json'));
       }
     } finally {
       await rm(fixture.baseDir, { recursive: true, force: true });
@@ -359,14 +368,14 @@ describe('agentv eval CLI', () => {
 
       const outputDir = path.join(fixture.suiteDir, 'configured-results');
       expect(exitCode).toBe(0);
-      const indexPath = path.join(outputDir, 'index.jsonl');
+      const indexPath = runIndexPath(outputDir);
       expect(extractOutputPath(stdout)).toBe(indexPath);
       await expectFileExists(indexPath);
       await expectFileExists(path.join(outputDir, 'summary.json'));
       const [firstRow] = (await readJsonLines(indexPath)) as Array<Record<string, unknown>>;
       await expectFileExists(path.join(outputDir, firstRow.result_dir as string, 'summary.json'));
       await expectFileExists(
-        path.join(outputDir, firstRow.result_dir as string, 'attempt-1', 'grading.json'),
+        path.join(outputDir, firstRow.result_dir as string, 'sample-1', 'grading.json'),
       );
     } finally {
       await rm(fixture.baseDir, { recursive: true, force: true });
@@ -401,7 +410,7 @@ describe('agentv eval CLI', () => {
       ]);
 
       expect(exitCode).toBe(1);
-      const indexPath = path.join(outputDir, 'index.jsonl');
+      const indexPath = runIndexPath(outputDir);
       expect(extractOutputPath(stdout)).toBe(indexPath);
       expect(stdout).not.toContain('Export files:');
 
@@ -409,10 +418,10 @@ describe('agentv eval CLI', () => {
       expect(canonicalResults).toHaveLength(2);
       await expectFileExists(path.join(outputDir, 'summary.json'));
       for (const row of canonicalResults) {
-        expect(row.transcript_path).toMatch(/attempt-1\/transcript\.json$/);
+        expect(row.transcript_path).toMatch(/sample-1\/transcript\.json$/);
         await expectFileExists(path.join(outputDir, row.transcript_path as string));
         expect(row.transcript_summary).toBeDefined();
-        expect(row.transcript_raw_path).toMatch(/attempt-1\/transcript-raw\.jsonl$/);
+        expect(row.transcript_raw_path).toMatch(/sample-1\/transcript-raw\.jsonl$/);
         await expectFileExists(path.join(outputDir, row.transcript_raw_path as string));
       }
     } finally {
@@ -664,7 +673,7 @@ describe('agentv eval CLI', () => {
 
       expect(exitCode).toBe(0);
       const outputPath = extractOutputPath(stdout);
-      expect(path.dirname(path.dirname(outputPath))).toBe(
+      expect(path.dirname(runDirFromIndexPath(outputPath))).toBe(
         path.join(fixture.suiteDir, '.agentv', 'results'),
       );
 
@@ -685,7 +694,7 @@ describe('agentv eval CLI', () => {
       });
 
       const benchmark = JSON.parse(
-        await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
+        await readFile(path.join(runDirFromIndexPath(outputPath), 'summary.json'), 'utf8'),
       ) as { metadata?: Record<string, unknown> };
       expect(benchmark.metadata?.experiment).toBe('native-exp');
       expect(benchmark.metadata?.experiment_config).toMatchObject({
@@ -860,7 +869,7 @@ describe('agentv eval CLI', () => {
 
       expect(exitCode).toBe(0);
       const outputPath = extractOutputPath(stdout);
-      expect(path.dirname(path.dirname(outputPath))).toBe(
+      expect(path.dirname(runDirFromIndexPath(outputPath))).toBe(
         path.join(fixture.suiteDir, '.agentv', 'results'),
       );
 
@@ -885,7 +894,7 @@ describe('agentv eval CLI', () => {
       });
 
       const benchmark = JSON.parse(
-        await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
+        await readFile(path.join(runDirFromIndexPath(outputPath), 'summary.json'), 'utf8'),
       ) as { metadata?: Record<string, unknown> };
       expect(benchmark.metadata?.runtime_source).toMatchObject({
         schema_version: 'agentv.runtime_source.v1',
@@ -946,7 +955,7 @@ describe('agentv eval CLI', () => {
         '0.8',
       ]);
       expect(first.exitCode).toBe(1);
-      const priorIndexPath = path.join(priorRunDir, 'index.jsonl');
+      const priorIndexPath = runIndexPath(priorRunDir);
       const priorRows = (await readJsonLines(priorIndexPath)) as Array<Record<string, unknown>>;
       const alphaRow = priorRows.find((row) => row.test_id === 'case-alpha');
       const betaRow = priorRows.find((row) => row.test_id === 'case-beta');
@@ -1062,7 +1071,7 @@ tests:
       ]);
       expect(first.exitCode).toBe(0);
 
-      const priorIndexPath = path.join(priorRunDir, 'index.jsonl');
+      const priorIndexPath = runIndexPath(priorRunDir);
       const priorRows = (await readJsonLines(priorIndexPath)) as Array<Record<string, unknown>>;
       expect(priorRows).toHaveLength(1);
       const baseRow = priorRows[0];
@@ -1181,7 +1190,7 @@ tests:
       expect(exitCode).toBe(0);
       const outputPath = extractOutputPath(stdout);
       const benchmark = JSON.parse(
-        await readFile(path.join(path.dirname(outputPath), 'summary.json'), 'utf8'),
+        await readFile(path.join(runDirFromIndexPath(outputPath), 'summary.json'), 'utf8'),
       ) as { metadata?: Record<string, unknown> };
       expect(benchmark.metadata?.runtime_source).toMatchObject({
         schema_version: 'agentv.runtime_source.v1',
diff --git a/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx
index 896ca40e4..782c6885a 100644
--- a/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx
+++ b/apps/web/src/content/docs/docs/next/evaluation/running-evals.mdx
@@ -11,7 +11,7 @@ sidebar:
 agentv eval evals/my-eval.yaml
 ```
 
-Results are written to `.agentv/results/<run_id>/index.jsonl`. Each CLI
+Results are written to `.agentv/results/<run_id>/.internal/index.jsonl`. Each CLI
 invocation writes one run bundle. The experiment label is stored in
 `summary.json` and row metadata. Each line is a JSON object with one result per
 test case, and the run workspace also stores the summary and related artifacts.
@@ -98,23 +98,23 @@ are unchanged.
 
 ### Custom Output Directory
 
-Write all artifacts (index.jsonl, summary.json, per-test grading/timing) to a specific directory:
+Write all artifacts (`.internal/index.jsonl`, `summary.json`, per-test grading, metrics, and transcripts) to a specific directory:
 
 ```bash
 agentv eval evals/my-eval.yaml --output ./my-results
 ```
 
 `--output` is a run directory, not a file path. The canonical manifest is always
-`<output>/index.jsonl`; the aggregate summary is
+`<output>/.internal/index.jsonl`; the aggregate summary is
 `<output>/summary.json`.
 
 ### Read Results from the Run Manifest
 
-The run directory is the complete artifact boundary. Use `<output>/index.jsonl` for scripts, CI summaries, and downstream tools:
+The run directory is the complete artifact boundary. Use `<output>/.internal/index.jsonl` for scripts, CI summaries, and downstream tools:
 
 ```bash
 agentv eval evals/my-eval.yaml --output ./my-results
-cat ./my-results/index.jsonl
+cat ./my-results/.internal/index.jsonl
 ```
 
 ### Generated Test Bundles
@@ -128,15 +128,15 @@ Typical layout:
 
 ```text
 my-results/
-  index.jsonl
   summary.json
+  .internal/
+    index.jsonl
   <test-id>/
     summary.json
-    attempt-1/
+    sample-1/
       result.json
       grading.json
       metrics.json
-      timing.json
       transcript.json
       transcript-raw.jsonl
       outputs/answer.md
@@ -187,7 +187,7 @@ OpenTelemetry/OpenInference spans directly to that backend during execution.
 
 ```bash
 # Summary-level inspection from the run manifest
-agentv inspect stats .agentv/results/<run_id>/index.jsonl
+agentv inspect stats .agentv/results/<run_id>/.internal/index.jsonl
 
 # Inspect AgentV-owned per-case artifacts and transcript sidecars
-agentv inspect show .agentv/results/<run_id>/index.jsonl --tree
+agentv inspect show .agentv/results/<run_id>/.internal/index.jsonl --tree
@@ -275,7 +275,7 @@ agentv eval evals/my-eval.yaml --output .agentv/results/<run_id> --resume
 agentv eval evals/my-eval.yaml --rerun-failed <run_id>
 
 # Re-run only execution errors from any prior run by path
-agentv eval evals/my-eval.yaml --retry-errors .agentv/results/<run_id>/index.jsonl
+agentv eval evals/my-eval.yaml --retry-errors .agentv/results/<run_id>/.internal/index.jsonl
 ```
 
 After any failing run, the CLI prints the exact `--rerun-failed` command for the run dir that just completed — copy/paste it. If the process or pod disappeared before you could access the local run directory and results auto-push was enabled, recover the partial run from [WIP checkpoints](/docs/tools/wip-checkpoints/) first, then use the same `--resume` flow.
@@ -367,7 +367,7 @@ See the [Import tool docs](/docs/tools/import/) for all providers and options.
 Each result row's `result_dir` is an allocated folder under the timestamped run
 bundle, usually with a readable test-id prefix plus a short hash suffix. It can
 include `transcript.json`, `transcript-raw.jsonl`, `grading.json`,
-`timing.json`, `metrics.json`, and generated outputs under `outputs/`. The run
+`metrics.json`, and generated outputs under `outputs/`. The run
 root does not contain target, model, or `cases/` folders, and it does not contain
 a mixed transcript artifact; use each index row's `transcript_path` to find the
 per-result transcript.
diff --git a/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx
index 34c73acb1..409e0a778 100644
--- a/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx
+++ b/apps/web/src/content/docs/docs/next/getting-started/quickstart.mdx
@@ -65,7 +65,7 @@ tests:
 agentv eval ./evals/example.yaml
 ```
 
-Results appear in `.agentv/results/<run_id>/index.jsonl` with scores, reasoning, and execution traces.
+Results appear in `.agentv/results/<run_id>/.internal/index.jsonl` with scores, reasoning, and execution traces.
 
 ## Next Steps
 
diff --git a/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx b/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx
index cce9887f5..827cea502 100644
--- a/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx
+++ b/apps/web/src/content/docs/docs/next/guides/autoresearch.mdx
@@ -81,9 +81,9 @@ Each autoresearch session creates a self-contained experiment directory:
 │   ├── iterations.jsonl     # Per-cycle data (score, decision, mutation)
 │   └── trajectory.html      # Live-updating Chart.js visualization
 ├── 2026-04-15T10-30-00/     # Cycle 1 run artifacts
-│   ├── index.jsonl
+│   ├── .internal/index.jsonl
 │   ├── grading.json
-│   └── timing.json
+│   └── metrics.json
 ├── 2026-04-15T10-35-00/     # Cycle 2 run artifacts
 │   └── ...
 └── ...
diff --git a/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx b/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx
index da2e88bc0..6594efaac 100644
--- a/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx
+++ b/apps/web/src/content/docs/docs/next/reference/result-artifacts.mdx
@@ -12,9 +12,9 @@ external adapters.
 The contract is run-centric:
 
 - `summary.json` owns aggregate run facts.
-- `index.jsonl` owns row-level discovery and filtering.
+- `.internal/index.jsonl` owns per-run row discovery and filtering.
 - Per-case sidecars own detailed payloads such as grading, metrics, transcripts,
-  timing, generated files, and raw provider evidence.
+  generated files, and raw provider evidence.
 - Dashboard, search, SQLite, HTML reports, and vendor exports are rebuildable
   projections over the bundle.
 
@@ -24,10 +24,17 @@ The default local layout is:
 
 ```text
 .agentv/results/
+  .indexes/
+    runs.jsonl               # rebuildable cross-run run catalog
+    cases.jsonl              # rebuildable cross-run case catalog
+  .cache/
   <run_id>/
     summary.json
-    index.jsonl
-    tags.json                 # optional mutable Dashboard tags
+    .internal/
+      index.jsonl             # one row per case/result in this run
+      progress.json
+      events.jsonl
+      bundle.json
     <case-or-allocation>/
       summary.json            # optional per-case aggregate, especially repeats
       test/                   # optional generated test bundle
@@ -35,28 +42,24 @@ The default local layout is:
         targets.yaml
         files/
         graders/
-      attempt-1/
+      sample-1/
         result.json
         grading.json
         metrics.json
-        timing.json
         transcript.json
         transcript-raw.jsonl
         outputs/
           answer.md
           file_changes.diff
-      attempt-2/
+      sample-2/
         result.json
         grading.json
         metrics.json
-        timing.json
         transcript.json
         transcript-raw.jsonl
         outputs/
           answer.md
           file_changes.diff
-  .indexes/                  # reserved rebuildable/local indexes
-  .cache/                    # reserved local cache
 ```
 
 `<run_id>` is the only committed run-bundle path identity. It helps AgentV put
@@ -85,22 +88,21 @@ reserved for rebuildable local state and are skipped by run discovery.
 | File or field | Owns | Use it for |
 | --- | --- | --- |
 | `summary.json` | Aggregate run metadata and rollups: run id, experiment metadata, counts, pass rate, score summaries, duration, token/cost totals, and writer metadata. | Listing runs, CI summaries, quick dashboards, trend cards, and validating that a run is complete enough to inspect. |
-| `index.jsonl` | Canonical row index: one row per result, attempt, or case-level aggregate, with identity fields, filter metadata, scores, status, and explicit run-relative paths to sidecars. | Filtering, compare/trend inputs, Dashboard detail routing, rerun/resume lookup, export adapters, and artifact discovery. |
+| `.internal/index.jsonl` | Canonical per-run row index: one row per case/result aggregate, with identity fields, filter metadata, scores, status, and explicit run-relative paths to sidecars. | Filtering, compare/trend inputs, Dashboard detail routing, rerun/resume lookup, export adapters, and artifact discovery. |
 | `result.json` | Compact per-attempt manifest for one attempt directory, including AgentV `execution_status` and `verdict`. | Loading one attempt without scanning the whole run index. |
 | `grading.json` | Grader outputs, `assertion_results`, rubric evidence, execution-metric grader facts, and scoring provenance. | Explaining why a row passed or failed. |
-| `metrics.json` | Derived executor behavior summary, such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, metric-style graders, adapter projections, and lightweight analysis. |
+| `metrics.json` | Duration, token usage, cost, execution status, trajectory, and derived executor behavior such as tool calls, files touched, shell commands, errors, turns, and output sizes. | Dashboard behavior views, cost/latency reporting, metric-style graders, adapter projections, and lightweight analysis. |
 | `outputs/file_changes.diff` | Full unified diff of workspace file changes when file changes are captured. | Human review and external artifact inspection; LLM and script graders still receive the same full diff through `file_changes`. |
-| `timing.json` | Duration, token usage, cost usage, and source labels such as `provider_reported`, `token_estimated`, `aggregate`, or `unavailable`. | Cost/latency reporting and provider-accounting audits. |
 | `transcript.json` | AgentV-normalized transcript/timeline document with canonical `tool_name` values and `transcript_summary`. | Portable human review, transcript-aware graders, and tool-trajectory analysis. |
 | `transcript-raw.jsonl` | Native provider or harness evidence when available. | Parser debugging, forensic review, and preserving source bytes without making provider schemas public AgentV fields. |
 | `test/` | Generated test bundle for the exact eval slice and target settings that produced a row. | Audit, external review, and rerun workflows that should not depend on a mutable source checkout. |
 | `artifact_pointers` | Offload indirection for large detached payload bytes. | Finding payloads published outside the primary metadata/control-plane branch, such as transcript bytes on `agentv/artifacts/v1`. |
 
-`summary.json` and `index.jsonl` are complementary, not redundant. A run list
+`summary.json` and `.internal/index.jsonl` are complementary, not redundant. A run list
 should not scan every row just to show pass rate or total duration, and a row
 reader should not parse aggregate summary structures to find one case's grading
 or transcript. Keep aggregate questions on `summary.json`; keep row and artifact
-discovery on `index.jsonl`.
+discovery on `.internal/index.jsonl`.
 
 ## Grading Contract
 
@@ -181,17 +183,18 @@ Example row:
   "test_id": "refund-eligibility",
   "target": "codex-gpt5",
   "variant": "skills-v2",
-  "attempt": 1,
+  "sample_index": 1,
+  "retry_index": 0,
+  "provenance": "native",
   "execution_status": "ok",
   "score": 0.92,
   "duration_ms": 184200,
   "result_dir": "refund-eligibility--4f9a7c2d1b6e",
   "summary_path": "refund-eligibility--4f9a7c2d1b6e/summary.json",
-  "grading_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/grading.json",
-  "metrics_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/metrics.json",
-  "timing_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/timing.json",
-  "transcript_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/transcript.json",
-  "transcript_raw_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/transcript-raw.jsonl",
+  "grading_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/grading.json",
+  "metrics_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/metrics.json",
+  "transcript_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/transcript.json",
+  "transcript_raw_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/transcript-raw.jsonl",
   "transcript_summary": {
     "total_turns": 4,
     "tool_calls": { "file_read": 2, "shell": 1, "unknown": 0 },
@@ -202,16 +205,16 @@ Example row:
     "errors": [],
     "thinking_blocks": 1
   },
-  "output_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/outputs/answer.md",
-  "answer_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/outputs/answer.md",
-  "file_changes_path": "refund-eligibility--4f9a7c2d1b6e/attempt-1/outputs/file_changes.diff",
+  "output_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/outputs/answer.md",
+  "answer_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/outputs/answer.md",
+  "file_changes_path": "refund-eligibility--4f9a7c2d1b6e/sample-1/outputs/file_changes.diff",
   "test_dir": "refund-eligibility--4f9a7c2d1b6e/test"
 }
 ```
 
 Rows can represent repeated attempts, multi-target runs, imported suites,
 manual `prepare`/`grade` attempts, or imported provider sessions. That is why
-`experiment`, `eval_path`, `test_id`, `target`, `variant`, `attempt`, and
+`experiment`, `eval_path`, `test_id`, `target`, `variant`, `sample_index`, `retry_index`, and
-source metadata belong in `index.jsonl`: tools can filter dynamically without
+source metadata belong in `.internal/index.jsonl`: tools can filter dynamically without
 requiring every run to be pre-split into semantic folders.
 
@@ -221,8 +224,8 @@ each row and as `summary.json.metadata.tags`. Its reserved `experiment` key
 matches the row `experiment` field, so trend/compare views can group by
 `tags.experiment`.
 
-Use `repeat` for authoring configuration and `attempts` for produced
-executions. The `attempt-1/`, `attempt-2/`, and later folders under a result
+Use `repeat` for authoring configuration and `samples` for produced
+executions. The `sample-1/`, `sample-2/`, and later folders under a result
 directory are artifact folders for those produced executions. Do not treat those
 folder names as the comparison dimension. Repeated stochastic samples should be
 represented by explicit metadata such as `sample_index` and `sample_count`;
@@ -265,7 +268,7 @@ Run an eval and inspect the portable bundle:
 agentv eval evals/support/refunds.eval.yaml --experiment with_skills
 ls .agentv/results/<run_id>
 cat .agentv/results/<run_id>/summary.json
-cat .agentv/results/<run_id>/index.jsonl
+cat .agentv/results/<run_id>/.internal/index.jsonl
 ```
 
 Find failed rows without loading every sidecar:
@@ -273,15 +276,15 @@ Find failed rows without loading every sidecar:
 ```bash
 jq -r 'select(.execution_status != "ok" or .score < 0.5) |
   [.eval_path, .test_id, .target, .grading_path] | @tsv' \
-  .agentv/results/<run_id>/index.jsonl
+  .agentv/results/<run_id>/.internal/index.jsonl
 ```
 
 Compare two completed runs by their row indexes:
 
 ```bash
 agentv results compare \
-  .agentv/results/<baseline-run-id>/index.jsonl \
-  .agentv/results/<candidate-run-id>/index.jsonl
+  .agentv/results/<baseline-run-id>/.internal/index.jsonl \
+  .agentv/results/<candidate-run-id>/.internal/index.jsonl
 ```
 
 Generate a shareable report from the same canonical bundle:
@@ -302,7 +305,7 @@ import { createInterface } from "node:readline";
 
 export async function* rows(runDir: string) {
   const rl = createInterface({
-    input: createReadStream(path.join(runDir, "index.jsonl"), "utf8"),
+    input: createReadStream(path.join(runDir, ".internal/index.jsonl"), "utf8"),
     crlfDelay: Infinity,
   });
 
@@ -323,8 +326,8 @@ for await (const row of rows(".agentv/results/2026-run")) {
 Adapter guidance:
 
 - Preserve unknown row fields when possible.
-- Prefer path fields such as `grading_path`, `metrics_path`, `timing_path`,
-  `transcript_path`, and `transcript_raw_path` over ad hoc path construction.
+- Prefer path fields such as `grading_path`, `metrics_path`, `transcript_path`,
+  and `transcript_raw_path` over ad hoc path construction.
 - Use `artifact_pointers` only for detached payload lookup; do not make pointers
   the discovery path for ordinary sidecars that are present in the run tree.
 - If you build a database or search index, store enough source metadata to
diff --git a/apps/web/src/content/docs/docs/next/tools/compare.mdx b/apps/web/src/content/docs/docs/next/tools/compare.mdx
index 348bcc5d4..dfff2ee14 100644
--- a/apps/web/src/content/docs/docs/next/tools/compare.mdx
+++ b/apps/web/src/content/docs/docs/next/tools/compare.mdx
@@ -15,11 +15,11 @@ Run two evaluations and compare them:
 agentv eval evals/my-eval.yaml --output .agentv/results/before
 # ... make changes to your agent ...
 agentv eval evals/my-eval.yaml --output .agentv/results/after
-agentv results compare .agentv/results/before/index.jsonl .agentv/results/after/index.jsonl
+agentv results compare .agentv/results/before/.internal/index.jsonl .agentv/results/after/.internal/index.jsonl
 ```
 
 `index.jsonl` is the canonical row-level result index. New runs live at
-`.agentv/results/<run_id>/index.jsonl`.
+`.agentv/results/<run_id>/.internal/index.jsonl`.
 
 ## Options
 
@@ -132,7 +132,7 @@ agentv eval evals/*.yaml --target gpt-4 --output .agentv/results/baseline
 agentv eval evals/*.yaml --target gpt-4o --output .agentv/results/candidate
 
 # Compare results
-agentv results compare .agentv/results/baseline/index.jsonl .agentv/results/candidate/index.jsonl
+agentv results compare .agentv/results/baseline/.internal/index.jsonl .agentv/results/candidate/.internal/index.jsonl
 ```
 
 ### Prompt Optimization
@@ -147,7 +147,7 @@ agentv eval evals/*.yaml --output .agentv/results/before
 agentv eval evals/*.yaml --output .agentv/results/after
 
 # Compare with strict threshold
-agentv results compare .agentv/results/before/index.jsonl .agentv/results/after/index.jsonl --threshold 0.05
+agentv results compare .agentv/results/before/.internal/index.jsonl .agentv/results/after/.internal/index.jsonl --threshold 0.05
 ```
 
 ### CI Quality Gate
@@ -157,8 +157,8 @@ Fail CI if the candidate regresses:
 ```bash
 #!/bin/bash
 agentv results compare \
-  .agentv/results/baseline/index.jsonl \
-  .agentv/results/candidate/index.jsonl
+  .agentv/results/baseline/.internal/index.jsonl \
+  .agentv/results/candidate/.internal/index.jsonl
 if [ $? -eq 1 ]; then
   echo "Regression detected! Candidate performs worse than baseline."
   exit 1
diff --git a/apps/web/src/content/docs/docs/next/tools/inspect.mdx b/apps/web/src/content/docs/docs/next/tools/inspect.mdx
index 6b85c0278..ada156314 100644
--- a/apps/web/src/content/docs/docs/next/tools/inspect.mdx
+++ b/apps/web/src/content/docs/docs/next/tools/inspect.mdx
@@ -96,7 +96,7 @@ agentv inspect show trace.otlp.json --format json \
   | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'
 
 # Compare providers
-agentv inspect stats .agentv/results/<run_id>/index.jsonl --group-by target --format json \
+agentv inspect stats .agentv/results/<run_id>/.internal/index.jsonl --group-by target --format json \
   | jq '.groups[] | {label, score_mean: .metrics.score.mean}'
 ```
 
diff --git a/apps/web/src/content/docs/docs/next/tools/results.mdx b/apps/web/src/content/docs/docs/next/tools/results.mdx
index 64e04be8b..8636f8cd6 100644
--- a/apps/web/src/content/docs/docs/next/tools/results.mdx
+++ b/apps/web/src/content/docs/docs/next/tools/results.mdx
@@ -48,7 +48,7 @@ Examples:
 agentv results report .agentv/results/2026-03-14T10-32-00_claude
 
 # Use an explicit output path
-agentv results report .agentv/results/2026-03-14T10-32-00_claude/index.jsonl \
+agentv results report .agentv/results/2026-03-14T10-32-00_claude/.internal/index.jsonl \
   --out ./reports/human-review.html
 ```
 
@@ -128,13 +128,13 @@ and metric-style graders; it is not canonical trace storage and does not carry
 token/cost usage.
 
 Every case uses aggregate `summary.json`, then stores execution artifact details
-under `attempt-N/`. Each `attempt-N/` contains a compact per-attempt manifest
-`result.json`, `grading.json`, `metrics.json`, `timing.json`,
+under `sample-N/`. Each `sample-N/` contains a compact per-attempt manifest
+`result.json`, `grading.json`, `metrics.json`,
 `transcript.json`, `transcript-raw.jsonl`, `outputs/answer.md`, and
 `outputs/file_changes.diff` when workspace changes were captured. The
 `result.json` file carries AgentV `execution_status` and `verdict` fields plus
 `grading_path`, `metrics_path`, transcript, output, and `file_changes_path`
-paths. Treat `attempt-N/` as an artifact attempt folder, not as a comparison
+paths. Treat `sample-N/` as an artifact attempt folder, not as a comparison
 dimension; stochastic samples and infrastructure retries should be represented
 with explicit sample/retry metadata rather than inferred from folder names.
 
@@ -151,7 +151,7 @@ lightweight explicit paths such as `transcript_path`, `transcript_raw_path`,
 detached payload publishing needs them. Dashboard search indexes, SQLite
 indexes, and other read models are derived projections over these run artifacts,
 not replacements for `index.jsonl`.
-Duration, token, and cost usage remains in `timing.json`, including source
+Duration, token, and cost usage lives in `metrics.json`, including source
 labels such as `provider_reported`, `token_estimated`, `aggregate`, or
 `unavailable`.
 
@@ -182,10 +182,10 @@ Agent Skills eval artifacts map into AgentV like this:
 | Agent Skills pattern | AgentV field | Artifact location |
 |----------------------|--------------|-------------------|
 | Converted Agent Skills cases | AgentV eval cases and test bundle paths | Converted EVAL YAML plus optional `test_dir`, `eval_path`, `targets_path`, `files_path`, and `graders_path` in `index.jsonl` |
-| Per-case answer | Generated target output artifact | `attempt-N/outputs/answer.md` |
-| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `attempt-N/transcript.json`, `attempt-N/transcript-raw.jsonl`, `attempt-N/metrics.json` |
-| Per-attempt `timing.json` | Duration, token totals, cost, and usage source labels | `attempt-N/timing.json` |
-| Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `attempt-N/grading.json`; summary fields can reference the same trace/result facts |
+| Per-case answer | Generated target output artifact | `sample-N/outputs/answer.md` |
+| Per-attempt sidecars | Normalized transcript, metrics, and raw provider evidence | `sample-N/transcript.json`, `sample-N/transcript-raw.jsonl`, `sample-N/metrics.json` |
+| Per-sample `metrics.json` | Duration, token totals, cost, execution, trajectory, and usage source labels | `sample-N/metrics.json` |
+| Per-attempt `grading.json` | Assertions, graders, execution metrics, workspace changes | `sample-N/grading.json`; summary fields can reference the same trace/result facts |
 | Iteration-level `summary.json` | Pass rate, time, tokens, tool calls, cost aggregates | Run-level `summary.json` |
 | Transcript/log outlier analysis | Normalized transcript, raw evidence, metrics, and optional external trace link | `transcript.json` for portable review; `transcript-raw.jsonl` for native evidence; `metrics.json` for behavior summaries; `external_trace` for link-out correlation |
 | Aggregate pass rate/time/tokens/delta | Run summaries and comparison tooling | `summary.json`, result comparisons, and projection bundles |
diff --git a/apps/web/src/content/docs/docs/next/tools/trend.mdx b/apps/web/src/content/docs/docs/next/tools/trend.mdx
index 3b1c307e7..5f55f7aa4 100644
--- a/apps/web/src/content/docs/docs/next/tools/trend.mdx
+++ b/apps/web/src/content/docs/docs/next/tools/trend.mdx
@@ -30,7 +30,7 @@ Point directly at run workspaces or `index.jsonl` manifests when you need a spec
 ```bash
 agentv results trend \
   .agentv/results/2026-03-01T10-00-00-000Z/ \
-  .agentv/results/2026-03-08T10-00-00-000Z/index.jsonl \
+  .agentv/results/2026-03-08T10-00-00-000Z/.internal/index.jsonl \
   .agentv/results/2026-03-15T10-00-00-000Z/
 ```
 
@@ -46,7 +46,7 @@ agentv results trend --last 8 --suite code-review --target claude-sonnet \
 `trend` only accepts canonical run workspaces:
 
 - `.agentv/results/<run_id>/`
-- `.agentv/results/<run_id>/index.jsonl`
+- `.agentv/results/<run_id>/.internal/index.jsonl`
 
 Legacy flat `results.jsonl` files are rejected. The command stays on
 lightweight `index.jsonl` manifests and does not require per-test artifact
@@ -114,7 +114,7 @@ Regression Gate: threshold=0.010 fail_on_degrading=true triggered=true
   "runs": [
     {
       "label": "2026-03-01T10:00:00.000Z",
-      "path": "/repo/.agentv/results/2026-03-01T10-00-00-000Z/index.jsonl",
+      "path": "/repo/.agentv/results/2026-03-01T10-00-00-000Z/.internal/index.jsonl",
       "timestamp": "2026-03-01T10:00:00.000Z",
       "matched_test_count": 42,
       "mean_score": 0.92
diff --git a/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx b/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx
index cbb675944..559241984 100644
--- a/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx
+++ b/apps/web/src/content/docs/docs/next/tools/wip-checkpoints.mdx
@@ -23,7 +23,7 @@ If no results repo is configured, or auto-push is disabled, `agentv eval` still
 | Location | Path or ref | What it contains |
 | --- | --- | --- |
 | Local project | `.agentv/results/<run-id>/summary.json` | A run-start stub with `metadata.run_id`, `metadata.experiment`, `metadata.planned_test_count`, and the eval file path when known. This lets Dashboard recognize incomplete local runs as resumable. |
-| Local project | `.agentv/results/<run-id>/index.jsonl` | Result rows appended as test cases finish. Rows use the normal snake_case result JSONL format. |
+| Local project | `.agentv/results/<run-id>/.internal/index.jsonl` | Result rows appended as test cases finish. Rows use the normal snake_case result JSONL format. |
 | Results repo remote | `agentv/wip/<hostname>/<run-dir-basename>` | A forced-updated branch containing the checkpointed run under `.agentv/results/<same-relative-run-path>/`. |
 | Results repo storage branch | Configured `results.repo.branch`; local checkout configs default to `agentv/results/v1` | The final published run after `agentv eval` completes and the normal auto-export succeeds. |
 
diff --git a/packages/core/src/evaluation/metrics.ts b/packages/core/src/evaluation/metrics.ts
index 481616ee5..0099abc94 100644
--- a/packages/core/src/evaluation/metrics.ts
+++ b/packages/core/src/evaluation/metrics.ts
@@ -4,8 +4,9 @@
  * This is a derived per-case executor metrics projection over `EvaluationResult`
  * and the internal trace envelope. It aligns with AgentV's case-local `metrics.json`
  * while carrying compact executor observability fields. It is not the
- * canonical trace store; portable transcript detail stays in `transcript.json`, and
- * duration/token/cost usage stays in `timing.json`.
+ * canonical trace store; portable transcript detail stays in `transcript.json`.
+ * Duration, token, and cost usage live in this artifact's top-level
+ * `duration`, `tokens`, and `cost` sections.
  */
 
 import { z } from 'zod';
@@ -115,27 +116,42 @@ const ReasoningBlockWireSchema = z
   })
   .strict();
 
-const MetricsTimingWireSchema = z
+const MetricsDurationWireSchema = z
   .object({
-    total_tokens: z.number().int().nonnegative(),
-    duration_ms: z.number().nonnegative(),
-    total_duration_seconds: z.number().nonnegative(),
-    cost_usd: z.number().nonnegative().nullable(),
-    token_usage: z
+    total_ms: z.number().nonnegative(),
+    total_seconds: z.number().nonnegative(),
+    mean_ms: z.number().nonnegative().optional(),
+    mean_seconds: z.number().nonnegative().optional(),
+    source: z.enum(TIMING_SOURCE_VALUES),
+    stats: z
       .object({
-        input: z.number().int().nonnegative(),
-        output: z.number().int().nonnegative(),
-        reasoning: z.number().int().nonnegative(),
+        count: z.number().int().nonnegative(),
+        mean_ms: z.number().nonnegative(),
+        mean_seconds: z.number().nonnegative(),
+        stddev_ms: z.number().nonnegative(),
+        stddev_seconds: z.number().nonnegative(),
+        min_ms: z.number().nonnegative(),
+        max_ms: z.number().nonnegative(),
       })
-      .strict(),
-    usage_sources: z
-      .object({
-        token_usage: z.enum(TIMING_SOURCE_VALUES),
-        total_tokens: z.enum(TIMING_SOURCE_VALUES),
-        duration: z.enum(TIMING_SOURCE_VALUES),
-        cost: z.enum(TIMING_SOURCE_VALUES),
-      })
-      .strict(),
+      .strict()
+      .optional(),
+  })
+  .strict();
+
+const MetricsTokensWireSchema = z
+  .object({
+    total: z.number().int().nonnegative(),
+    input: z.number().int().nonnegative(),
+    output: z.number().int().nonnegative(),
+    reasoning: z.number().int().nonnegative(),
+    source: z.enum(TIMING_SOURCE_VALUES),
+  })
+  .strict();
+
+const MetricsCostWireSchema = z
+  .object({
+    usd: z.number().nonnegative().nullable(),
+    source: z.enum(TIMING_SOURCE_VALUES),
   })
   .strict();
 
@@ -184,12 +200,15 @@ export const MetricsArtifactWireSchema = z
       .object({
         transcript_path: z.string().optional(),
         grading_path: z.string().optional(),
-        timing_path: z.string().optional(),
         file_changes_path: z.string().optional(),
       })
       .strict(),
+    duration: MetricsDurationWireSchema.optional(),
+    tokens: MetricsTokensWireSchema.optional(),
+    cost: MetricsCostWireSchema.optional(),
+    execution: z.record(z.string(), z.unknown()).optional(),
+    trajectory: z.record(z.string(), z.unknown()).optional(),
     metrics: MetricsWireSchema,
-    timing: MetricsTimingWireSchema.optional(),
   })
   .strict();
 
@@ -879,7 +898,6 @@ export function buildMetricsArtifact(
   options: {
     transcriptPath?: string;
     gradingPath?: string;
-    timingPath?: string;
     fileChangesPath?: string;
     generatedAt?: string;
   } = {},
@@ -902,7 +920,6 @@ export function buildMetricsArtifact(
       source_artifacts: dropUndefined({
         transcript_path: options.transcriptPath,
         grading_path: options.gradingPath,
-        timing_path: options.timingPath,
         file_changes_path: options.fileChangesPath,
       }),
       metrics: buildMetrics(result),
diff --git a/packages/core/src/evaluation/results-repo-cache.test.ts b/packages/core/src/evaluation/results-repo-cache.test.ts
index f41c865bf..ebb7a6e12 100644
--- a/packages/core/src/evaluation/results-repo-cache.test.ts
+++ b/packages/core/src/evaluation/results-repo-cache.test.ts
@@ -148,7 +148,7 @@ describe('git results filesystem index cache', () => {
               run_id: 'sentinel',
               experiment: 'default',
               timestamp: '2026-06-28T01-00-00-000Z',
-              manifest_path: 'sentinel/.internal/index.jsonl',
+              index_path: 'sentinel/.internal/index.jsonl',
               display_name: 'from cache',
               test_count: 1,
               avg_score: 0.5,
diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts
index cd7124142..1de620383 100644
--- a/packages/core/src/evaluation/results-repo.ts
+++ b/packages/core/src/evaluation/results-repo.ts
@@ -67,6 +67,7 @@ const GIT_EMPTY_TREE = '4b825dc642cb6eb9a060e54bf8d69288fbee4904';
 const RESULTS_REPO_GENESIS_MESSAGE = 'chore(results): initialize AgentV results branch';
 const RESULTS_REPO_GENESIS_DATE = '@0 +0000';
 const RESULT_INDEX_FILENAME = 'index.jsonl';
+const RUN_INTERNAL_DIRNAME = '.internal';
 
 // Artifact-aware merge config for the AgentV-owned results checkout. Concurrent
 // writers append to rebuildable cross-run JSONL catalogs and each run's
@@ -2924,8 +2925,8 @@ function safeLocalSummaryManifestPath(
 function resolveLocalResultManifestPath(sourceDir: string): string | undefined {
   try {
     const summary = JSON.parse(readFileSync(path.join(sourceDir, 'summary.json'), 'utf8')) as {
-      manifest_path?: unknown;
       index_path?: unknown;
+      manifest_path?: unknown;
     };
     const manifestPath = safeLocalSummaryManifestPath(
       sourceDir,
@@ -2936,15 +2937,14 @@ function resolveLocalResultManifestPath(sourceDir: string): string | undefined {
     }
   } catch {}
 
-  const internalManifestPath = path.join(sourceDir, '.internal', RESULT_INDEX_FILENAME);
-  if (existsSync(internalManifestPath)) {
-    return internalManifestPath;
-  }
-
-  const manifestPath = path.join(sourceDir, RESULT_INDEX_FILENAME);
+  const manifestPath = path.join(sourceDir, RUN_INTERNAL_DIRNAME, RESULT_INDEX_FILENAME);
   if (existsSync(manifestPath)) {
     return manifestPath;
   }
+  const legacyManifestPath = path.join(sourceDir, RESULT_INDEX_FILENAME);
+  if (existsSync(legacyManifestPath)) {
+    return legacyManifestPath;
+  }
   return undefined;
 }
 
@@ -3719,7 +3719,7 @@ export interface GitListedRun {
   timestamp: string;
   pass_rate?: number;
   target?: string;
-  manifest_path: string;
+  index_path: string;
   summary_path?: string;
   display_name: string;
   test_count: number;
@@ -3745,8 +3745,8 @@ type GitBatchBlob = {
 };
 
 type GitRunSummary = {
-  readonly manifest_path?: string;
   readonly index_path?: string;
+  readonly manifest_path?: string;
   readonly metadata?: {
     readonly display_name?: string;
     readonly timestamp?: string;
@@ -4216,7 +4216,7 @@ function isGitListedRun(value: unknown): value is GitListedRun {
     typeof record.run_id === 'string' &&
     typeof record.experiment === 'string' &&
     typeof record.timestamp === 'string' &&
-    typeof record.manifest_path === 'string' &&
+    (typeof record.index_path === 'string' || typeof record.manifest_path === 'string') &&
     typeof record.display_name === 'string' &&
     typeof record.test_count === 'number' &&
     typeof record.avg_score === 'number' &&
@@ -4443,7 +4443,7 @@ export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise
         timestamp,
         ...(passRate !== undefined && { pass_rate: passRate }),
         ...(targets.length === 1 && targets[0] ? { target: targets[0] } : {}),
-        manifest_path: manifestPath,
+        index_path: manifestPath,
         ...(summaryByPath.has(summaryPath) && { summary_path: summaryPath }),
         display_name: displayName,
         test_count: summary?.metadata?.tests_run?.length ?? rowTestIds.length,
diff --git a/packages/core/src/evaluation/run-artifacts.ts b/packages/core/src/evaluation/run-artifacts.ts
index e687a0cb5..bb26a6ae4 100644
--- a/packages/core/src/evaluation/run-artifacts.ts
+++ b/packages/core/src/evaluation/run-artifacts.ts
@@ -2,13 +2,14 @@
  * Canonical AgentV run artifact helpers.
  *
  * This module owns the shared run-workspace contract used by CLI and
- * programmatic evals: `index.jsonl`, run-root `summary.json`, per-case
- * `summary.json`, `run-N/result.json`, and transcript projections. Keep wire
- * keys in snake_case here so every caller produces the same artifacts.
+ * programmatic evals: run-root `summary.json`, per-run
+ * `.internal/index.jsonl`, per-case `summary.json`, `sample-N/result.json`,
+ * and transcript projections. Keep wire keys in snake_case here so every caller
+ * produces the same artifacts.
  */
 
 import { createHash } from 'node:crypto';
-import { copyFile, mkdir, readFile, rm, rmdir, writeFile } from 'node:fs/promises';
+import { copyFile, mkdir, readFile, readdir, rm, rmdir, writeFile } from 'node:fs/promises';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 
@@ -60,6 +61,25 @@ import type {
 
 export const RESULT_INDEX_FILENAME = 'index.jsonl';
 export const RUN_SUMMARY_FILENAME = 'summary.json';
+export const RUN_INTERNAL_DIRNAME = '.internal';
+export const CROSS_RUN_INDEX_DIRNAME = '.indexes';
+export const CROSS_RUN_RUNS_INDEX_FILENAME = 'runs.jsonl';
+export const CROSS_RUN_CASES_INDEX_FILENAME = 'cases.jsonl';
+
+export function runInternalPath(runDir: string, filename: string): string {
+  return path.join(runDir, RUN_INTERNAL_DIRNAME, filename);
+}
+
+export function runIndexPath(runDir: string): string {
+  return runInternalPath(runDir, RESULT_INDEX_FILENAME);
+}
+
+function isCanonicalResultsRoot(resultsRoot: string): boolean {
+  return (
+    path.basename(resultsRoot) === 'results' &&
+    path.basename(path.dirname(resultsRoot)) === '.agentv'
+  );
+}
 
 const TIMING_SOURCE_VALUES = [
   'provider_reported',
@@ -171,8 +191,7 @@ export async function aggregateRunDir(
     tags?: Record<string, string>;
   },
 ): Promise<{ summaryPath: string; testCount: number; targetCount: number }> {
-  const indexPath =
-    (await resolveExistingResultManifestPath(runDir)) ?? path.join(runDir, RESULT_INDEX_FILENAME);
+  const indexPath = (await resolveExistingResultManifestPath(runDir)) ?? runIndexPath(runDir);
   const content = await readFile(indexPath, 'utf8');
   const allResults = parseJsonlResults(content);
   const results = deduplicateByTestIdTarget(allResults);
@@ -223,8 +242,8 @@ async function readRunSummaryManifestPath(runDir: string): Promise<string | unde
     return undefined;
   }
   try {
-    const parsed = JSON.parse(summaryText) as { manifest_path?: unknown };
-    const manifestPath = safeSummaryManifestPath(runDir, parsed.manifest_path);
+    const parsed = JSON.parse(summaryText) as { index_path?: unknown; manifest_path?: unknown };
+    const manifestPath = safeSummaryManifestPath(runDir, parsed.index_path ?? parsed.manifest_path);
     if (manifestPath && (await readTextIfExists(manifestPath)) !== undefined) {
       return manifestPath;
     }
@@ -238,10 +257,14 @@ async function resolveExistingResultManifestPath(runDir: string): Promise<string
     return summaryManifestPath;
   }
 
-  const manifestPath = path.join(runDir, RESULT_INDEX_FILENAME);
+  const manifestPath = runIndexPath(runDir);
   if ((await readTextIfExists(manifestPath)) !== undefined) {
     return manifestPath;
   }
+  const legacyManifestPath = path.join(runDir, RESULT_INDEX_FILENAME);
+  if ((await readTextIfExists(legacyManifestPath)) !== undefined) {
+    return legacyManifestPath;
+  }
   return undefined;
 }
 
@@ -360,7 +383,7 @@ export interface GradingArtifact {
 
 export type TrialResultArtifact = {
   readonly attempt: number;
-  readonly attempt_path?: string;
+  readonly sample_path?: string;
   readonly score: number;
   readonly verdict: string;
   readonly scores?: IndexArtifactEntry['scores'];
@@ -399,36 +422,90 @@ export type TrialAggregationArtifact =
     };
 
 export interface TimingArtifact {
-  readonly total_tokens: number;
-  readonly duration_ms: number;
-  readonly total_duration_seconds: number;
-  readonly mean_duration_ms?: number;
-  readonly mean_duration_seconds?: number;
-  readonly duration_stats?: {
-    readonly count: number;
-    readonly mean_ms: number;
-    readonly mean_seconds: number;
-    readonly stddev_ms: number;
-    readonly stddev_seconds: number;
-    readonly min_ms: number;
-    readonly max_ms: number;
+  readonly duration: {
+    readonly total_ms: number;
+    readonly total_seconds: number;
+    readonly mean_ms?: number;
+    readonly mean_seconds?: number;
+    readonly stats?: {
+      readonly count: number;
+      readonly mean_ms: number;
+      readonly mean_seconds: number;
+      readonly stddev_ms: number;
+      readonly stddev_seconds: number;
+      readonly min_ms: number;
+      readonly max_ms: number;
+    };
+    readonly source: TimingSource;
   };
-  readonly cost_usd: number | null;
-  readonly token_usage: {
+  readonly tokens: {
+    readonly total: number;
     readonly input: number;
     readonly output: number;
     readonly reasoning: number;
+    readonly source: TimingSource;
+  };
+  readonly cost: {
+    readonly usd: number | null;
+    readonly source: TimingSource;
   };
-  readonly usage_sources: {
-    readonly token_usage: TimingSource;
-    readonly total_tokens: TimingSource;
-    readonly duration: TimingSource;
-    readonly cost: TimingSource;
+  readonly execution?: {
+    readonly status?: string;
+    readonly failure_stage?: string;
+    readonly failure_reason_code?: string;
   };
+  readonly trajectory?: {
+    readonly total_turns?: number;
+    readonly total_tool_calls?: number;
+    readonly tool_calls?: Record<string, number>;
+  };
+}
+
+interface DurationStats {
+  readonly count: number;
+  readonly mean_ms: number;
+  readonly mean_seconds: number;
+  readonly stddev_ms: number;
+  readonly stddev_seconds: number;
+  readonly min_ms: number;
+  readonly max_ms: number;
 }
 
 export interface RunSummaryArtifact {
-  readonly manifest_path: string;
+  readonly index_path: string;
+  readonly run_id?: string;
+  readonly status: {
+    readonly passed: { readonly count: number; readonly percentage: number };
+    readonly failed: { readonly count: number; readonly percentage: number };
+    readonly errored: { readonly count: number; readonly percentage: number };
+    readonly skipped: { readonly count: number; readonly percentage: number };
+  };
+  readonly counts: {
+    readonly total_cases: number;
+    readonly total_instances: number;
+    readonly passed_cases: number;
+    readonly failed_cases: number;
+    readonly errored_instances: number;
+  };
+  readonly pass_at_k: {
+    readonly k: number;
+    readonly passed_cases: number;
+    readonly total_cases: number;
+    readonly rate: number;
+  };
+  readonly usage: {
+    readonly total_tokens: number;
+    readonly input_tokens: number;
+    readonly output_tokens: number;
+    readonly reasoning_tokens: number;
+    readonly cost_usd: number | null;
+  };
+  readonly infra_failures: {
+    readonly total: number;
+    readonly reasons: readonly { readonly reason: string; readonly count: number }[];
+  };
+  readonly cases: readonly Record<string, unknown>[];
+  readonly instances: readonly Record<string, unknown>[];
   readonly metadata: {
     readonly run_id?: string;
     readonly eval_file: string;
@@ -458,7 +535,7 @@ export interface RunSummaryArtifact {
     }
   >;
   readonly per_grader_summary?: Record<string, { readonly mean: number; readonly stddev: number }>;
-  readonly timing: TimingArtifact;
+  readonly metrics: TimingArtifact;
   readonly notes: readonly string[];
 }
 
@@ -501,6 +578,9 @@ export interface IndexArtifactEntry {
   readonly start_time?: string;
   readonly end_time?: string;
   readonly scores?: readonly Record<string, unknown>[];
+  readonly named_scores?: Record<string, number>;
+  readonly derived_metrics?: Record<string, unknown>;
+  readonly provenance?: string;
   readonly attempts?: readonly TrialResultArtifact[];
   readonly aggregation?: TrialAggregationArtifact;
   readonly execution_status?: string;
@@ -510,7 +590,6 @@ export interface IndexArtifactEntry {
   readonly workspace_path?: string;
   readonly result_dir?: string;
   readonly grading_path?: string;
-  readonly timing_path?: string;
   readonly summary_path?: string;
   readonly output_path?: string;
   readonly answer_path?: string;
@@ -521,6 +600,8 @@ export interface IndexArtifactEntry {
   readonly file_changes_path?: string;
   readonly artifact_pointers?: ResultArtifactPointersWire;
   readonly runtime_source?: RunRuntimeSourceMetadata;
+  readonly sample_index?: number;
+  readonly retry_index?: number;
   readonly raw_provider_log_path?: string;
   readonly input_path?: string;
   readonly test_dir?: string;
@@ -565,6 +646,8 @@ export interface AdditionalResultArtifactsContext {
 export interface AgentVRunResultArtifact {
   readonly execution_status: EvaluationResult['executionStatus'];
   readonly verdict: TrialResult['verdict'];
+  readonly sample_index?: number;
+  readonly retry_index?: number;
   readonly duration_ms?: number;
   readonly duration_seconds: number;
   readonly model: string;
@@ -591,7 +674,6 @@ export interface AgentVRunResultArtifact {
     readonly file_changes?: string;
     readonly scripts?: Record<string, string>;
   };
-  readonly timing?: TimingArtifact;
 }
 
 export interface RepeatCaseSummaryArtifact {
@@ -601,13 +683,11 @@ export interface RepeatCaseSummaryArtifact {
   readonly mean_duration_ms: number;
   readonly mean_duration_seconds: number;
   readonly fingerprint: string;
-  readonly total_tokens: number;
-  readonly duration_ms: number;
-  readonly total_duration_seconds: number;
-  readonly duration_stats?: TimingArtifact['duration_stats'];
-  readonly cost_usd: number | null;
-  readonly token_usage: TimingArtifact['token_usage'];
-  readonly usage_sources: TimingArtifact['usage_sources'];
+  readonly duration: TimingArtifact['duration'];
+  readonly tokens: TimingArtifact['tokens'];
+  readonly cost: TimingArtifact['cost'];
+  readonly execution?: TimingArtifact['execution'];
+  readonly trajectory?: TimingArtifact['trajectory'];
 }
 
 export type AdditionalResultArtifactsWriter = (
@@ -761,8 +841,31 @@ function toIndexScores(scores: readonly GraderResult[] | undefined): IndexArtifa
   return scores?.map(toIndexScore) as IndexArtifactEntry['scores'];
 }
 
-function attemptDirName(attempt: number): string {
-  return `attempt-${attempt + 1}`;
+function collectNamedScores(
+  scores: readonly GraderResult[] | undefined,
+  out: Record<string, number> = {},
+): Record<string, number> | undefined {
+  for (const score of scores ?? []) {
+    if (score.name) {
+      out[score.name] = score.score;
+    }
+    collectNamedScores(score.scores, out);
+  }
+  return Object.keys(out).length > 0 ? out : undefined;
+}
+
+function resultDerivedMetrics(result: EvaluationResult): Record<string, unknown> | undefined {
+  const value = result.metadata?.derived_metrics ?? result.metadata?.derivedMetrics;
+  return isRecord(value) ? value : undefined;
+}
+
+function resultProvenance(result: EvaluationResult): string {
+  const value = result.metadata?.provenance;
+  return typeof value === 'string' && value.trim().length > 0 ? value : 'native';
+}
+
+function sampleDirName(sampleIndex: number): string {
+  return `sample-${sampleIndex + 1}`;
 }
 
 function hasPersistedTrialRuns(result: EvaluationResult): boolean {
@@ -784,7 +887,7 @@ function toTrialArtifacts(
   }
   return trials.map((trial) => ({
     attempt: trial.attempt,
-    attempt_path: trial.result ? attemptDirName(trial.attempt) : undefined,
+    sample_path: trial.result ? sampleDirName(trial.attempt) : undefined,
     score: trial.score,
     verdict: trial.verdict,
     scores: toIndexScores(trial.scores),
@@ -904,16 +1007,19 @@ function buildRepeatAggregateTimingArtifact(result: EvaluationResult): TimingArt
   const maxMs = Math.max(...durationsMs);
   return {
     ...timing,
-    mean_duration_ms: stats.mean,
-    mean_duration_seconds: roundSecondsFromMs(stats.mean),
-    duration_stats: {
-      count: durationsMs.length,
+    duration: {
+      ...timing.duration,
       mean_ms: stats.mean,
       mean_seconds: roundSecondsFromMs(stats.mean),
-      stddev_ms: stats.stddev,
-      stddev_seconds: roundSecondsFromMs(stats.stddev),
-      min_ms: roundMillis(minMs),
-      max_ms: roundMillis(maxMs),
+      stats: {
+        count: durationsMs.length,
+        mean_ms: stats.mean,
+        mean_seconds: roundSecondsFromMs(stats.mean),
+        stddev_ms: stats.stddev,
+        stddev_seconds: roundSecondsFromMs(stats.stddev),
+        min_ms: roundMillis(minMs),
+        max_ms: roundMillis(maxMs),
+      },
     },
   };
 }
@@ -926,6 +1032,10 @@ function formatRepeatPassRate(passedRuns: number, totalRuns: number): string {
   return `${Number.isInteger(percent) ? percent.toFixed(0) : percent.toFixed(1)}%`;
 }
 
+function percentage(count: number, total: number): number {
+  return total > 0 ? Math.round((count / total) * 1000) / 1000 : 0;
+}
+
 function fallbackRepeatFingerprint(result: EvaluationResult): string {
   return createHash('sha256')
     .update(
@@ -952,23 +1062,21 @@ function buildRepeatCaseSummaryArtifact(
       : resultVerdict(result) === 'pass'
         ? 1
         : 0;
-  const fallbackMeanMs = totalRuns > 0 ? roundMillis(timing.duration_ms / totalRuns) : 0;
-  const meanDurationMs = timing.mean_duration_ms ?? fallbackMeanMs;
+  const fallbackMeanMs = totalRuns > 0 ? roundMillis(timing.duration.total_ms / totalRuns) : 0;
+  const meanDurationMs = timing.duration.mean_ms ?? fallbackMeanMs;
 
   return {
     total_attempts: totalRuns,
     passed_attempts: passedRuns,
     pass_rate: formatRepeatPassRate(passedRuns, totalRuns),
     mean_duration_ms: meanDurationMs,
-    mean_duration_seconds: timing.mean_duration_seconds ?? roundSecondsFromMs(meanDurationMs),
+    mean_duration_seconds: timing.duration.mean_seconds ?? roundSecondsFromMs(meanDurationMs),
     fingerprint: fingerprint ?? fallbackRepeatFingerprint(result),
-    total_tokens: timing.total_tokens,
-    duration_ms: timing.duration_ms,
-    total_duration_seconds: timing.total_duration_seconds,
-    duration_stats: timing.duration_stats,
-    cost_usd: timing.cost_usd,
-    token_usage: timing.token_usage,
-    usage_sources: timing.usage_sources,
+    duration: timing.duration,
+    tokens: timing.tokens,
+    cost: timing.cost,
+    execution: timing.execution,
+    trajectory: timing.trajectory,
   };
 }
 
@@ -996,7 +1104,9 @@ function buildAgentVRunResultArtifact(params: {
   readonly trial: TrialResult;
   readonly result: EvaluationResult;
   readonly metricsArtifact: ReturnType<typeof buildMetricsArtifact> & {
-    readonly timing?: TimingArtifact;
+    readonly duration?: TimingArtifact['duration'];
+    readonly tokens?: TimingArtifact['tokens'];
+    readonly cost?: TimingArtifact['cost'];
   };
   readonly hasTranscript: boolean;
   readonly hasOutput: boolean;
@@ -1009,6 +1119,8 @@ function buildAgentVRunResultArtifact(params: {
   return dropUndefined({
     execution_status: params.trial.executionStatus ?? params.result.executionStatus,
     verdict: params.trial.verdict,
+    sample_index: params.result.sampleIndex,
+    retry_index: params.result.retryIndex,
     duration_ms: resultDurationMs(params.result),
     duration_seconds: resultDurationSeconds(params.result),
     model: params.result.target ?? 'unknown',
@@ -1039,7 +1151,6 @@ function buildAgentVRunResultArtifact(params: {
             file_changes: fileChangesPath,
           })
         : undefined,
-    timing: params.metricsArtifact.timing,
   }) as unknown as AgentVRunResultArtifact;
 }
 
@@ -1078,12 +1189,11 @@ async function writeTrialRunArtifacts(params: {
     return;
   }
 
-  const runDirName = attemptDirName(params.trial.attempt);
+  const runDirName = sampleDirName(params.trial.attempt);
   const runDir = path.join(params.parentTestDir, runDirName);
   const grading = buildGradingArtifact(result, { includeTrials: false });
   const timing = buildTimingArtifact([result]);
   const gradingPath = path.join(runDir, 'grading.json');
-  const timingPath = path.join(runDir, 'timing.json');
   const metricsPath = path.join(runDir, CANONICAL_METRICS_ARTIFACT_PATH);
   const outputsDir = path.join(runDir, 'outputs');
   const answerOutputPath =
@@ -1110,7 +1220,6 @@ async function writeTrialRunArtifacts(params: {
 
   await mkdir(runDir, { recursive: true });
   await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}\n`, 'utf8');
-  await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
 
   await mkdir(outputsDir, { recursive: true });
   if (answerOutputPath) {
@@ -1129,7 +1238,7 @@ async function writeTrialRunArtifacts(params: {
     envelope,
     transcriptArtifactPath: transcriptPath ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined,
     gradingArtifactPath: 'grading.json',
-    timingArtifactPath: 'timing.json',
+    timingArtifactPath: null,
     fileChangesArtifactPath: fileChangesPath ? CANONICAL_FILE_CHANGES_ARTIFACT_PATH : undefined,
     timing,
   });
@@ -1364,22 +1473,51 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin
   const durationSource = combineTimingSources(results, durationSources, hasDuration);
   const costSource = combineTimingSources(results, costSources, hasCost);
 
+  const first = results[0];
+  const totalToolCalls = results.reduce((sum, result) => sum + countToolCalls(result).total, 0);
   return {
-    total_tokens: totalInput + totalOutput,
-    duration_ms: totalDurationMs,
-    total_duration_seconds: Math.round((totalDurationMs / 1000) * 1000) / 1000,
-    cost_usd: hasCost ? totalCostUsd : null,
-    token_usage: {
+    duration: {
+      total_ms: totalDurationMs,
+      total_seconds: Math.round((totalDurationMs / 1000) * 1000) / 1000,
+      source: durationSource,
+    },
+    tokens: {
+      total: totalInput + totalOutput,
       input: totalInput,
       output: totalOutput,
       reasoning: totalReasoning,
+      source: tokenUsageSource,
     },
-    usage_sources: {
-      token_usage: tokenUsageSource,
-      total_tokens: tokenUsageSource,
-      duration: durationSource,
-      cost: costSource,
+    cost: {
+      usd: hasCost ? totalCostUsd : null,
+      source: costSource,
     },
+    execution: first
+      ? dropUndefined({
+          status: first.executionStatus,
+          failure_stage: first.failureStage,
+          failure_reason_code: first.failureReasonCode,
+        })
+      : undefined,
+    trajectory:
+      results.length > 0
+        ? {
+            total_turns: results.reduce(
+              (sum, result) =>
+                sum +
+                (result.trace.llmCallCount ??
+                  result.trace.messages.filter((message) => message.role === 'assistant').length),
+              0,
+            ),
+            total_tool_calls: totalToolCalls,
+            tool_calls: results.reduce<Record<string, number>>((counts, result) => {
+              for (const [tool, count] of Object.entries(countToolCalls(result).toolCalls)) {
+                counts[tool] = (counts[tool] ?? 0) + count;
+              }
+              return counts;
+            }, {}),
+          }
+        : undefined,
   };
 }
 
@@ -1479,9 +1617,131 @@ export function buildRunSummaryArtifact(
 
   const firstResult = results[0];
   const timestamp = firstResult?.timestamp ?? new Date().toISOString();
+  const runMetrics = buildTimingArtifact(results);
+  const casesByKey = new Map<
+    string,
+    {
+      test_id: string;
+      suite?: string;
+      eval_path?: string;
+      target: string;
+      variant?: string;
+      sample_count: number;
+      pass_count: number;
+      status_counts: Record<string, number>;
+      samples: Record<string, unknown>[];
+    }
+  >();
+  const instances = results.flatMap((result) => {
+    const trials = materializedRunTrials(result);
+    return trials.map((trial) => {
+      const sampleIndex = trial.attempt + 1;
+      const status = trial.executionStatus ?? result.executionStatus;
+      const verdict = trial.verdict;
+      const caseKey = buildEvaluationResultTargetKey(result);
+      const sourceTest = undefined;
+      const evalPath = sourceEvalPath(result, sourceTest);
+      const caseSummary = casesByKey.get(caseKey) ?? {
+        test_id: result.testId ?? 'unknown',
+        suite: result.suite,
+        eval_path: evalPath,
+        target: result.target ?? 'unknown',
+        variant: result.variant,
+        sample_count: 0,
+        pass_count: 0,
+        status_counts: {},
+        samples: [],
+      };
+      caseSummary.sample_count += 1;
+      if (verdict === 'pass') {
+        caseSummary.pass_count += 1;
+      }
+      caseSummary.status_counts[status ?? 'unknown'] =
+        (caseSummary.status_counts[status ?? 'unknown'] ?? 0) + 1;
+      const instance = dropUndefined({
+        test_id: result.testId ?? 'unknown',
+        suite: result.suite,
+        eval_path: evalPath,
+        target: result.target ?? 'unknown',
+        variant: result.variant,
+        sample_index: sampleIndex,
+        retry_index: result.metadata?.retry_index,
+        verdict,
+        score: trial.score,
+        execution_status: status,
+        failure_stage: trial.failureStage,
+        failure_reason_code: trial.failureReasonCode,
+        duration_ms: trial.result ? resultDurationMs(trial.result) : resultDurationMs(result),
+        cost_usd: trial.costUsd,
+      });
+      caseSummary.samples.push(instance);
+      casesByKey.set(caseKey, caseSummary);
+      return instance;
+    });
+  });
+  const caseSummaries = [...casesByKey.values()].map((entry) => ({
+    ...entry,
+    pass_rate: percentage(entry.pass_count, entry.sample_count),
+    pass_at_1: entry.pass_count > 0,
+  }));
+  const passedCases = caseSummaries.filter((entry) => entry.pass_count > 0).length;
+  const erroredInstances = instances.filter(
+    (entry) => entry.execution_status === 'execution_error',
+  ).length;
+  const failedCases = caseSummaries.length - passedCases;
+  const infraFailureCounts = new Map<string, number>();
+  for (const instance of instances) {
+    const reason =
+      typeof instance.failure_reason_code === 'string'
+        ? instance.failure_reason_code
+        : instance.execution_status === 'execution_error'
+          ? 'execution_error'
+          : undefined;
+    if (reason) {
+      infraFailureCounts.set(reason, (infraFailureCounts.get(reason) ?? 0) + 1);
+    }
+  }
 
   return {
-    manifest_path: RESULT_INDEX_FILENAME,
+    index_path: `${RUN_INTERNAL_DIRNAME}/${RESULT_INDEX_FILENAME}`,
+    run_id: runId,
+    status: {
+      passed: { count: passedCases, percentage: percentage(passedCases, caseSummaries.length) },
+      failed: { count: failedCases, percentage: percentage(failedCases, caseSummaries.length) },
+      errored: {
+        count: erroredInstances,
+        percentage: percentage(erroredInstances, instances.length),
+      },
+      skipped: { count: 0, percentage: 0 },
+    },
+    counts: {
+      total_cases: caseSummaries.length,
+      total_instances: instances.length,
+      passed_cases: passedCases,
+      failed_cases: failedCases,
+      errored_instances: erroredInstances,
+    },
+    pass_at_k: {
+      k: 1,
+      passed_cases: passedCases,
+      total_cases: caseSummaries.length,
+      rate: percentage(passedCases, caseSummaries.length),
+    },
+    usage: {
+      total_tokens: runMetrics.tokens.total,
+      input_tokens: runMetrics.tokens.input,
+      output_tokens: runMetrics.tokens.output,
+      reasoning_tokens: runMetrics.tokens.reasoning,
+      cost_usd: runMetrics.cost.usd,
+    },
+    infra_failures: {
+      total: [...infraFailureCounts.values()].reduce((sum, count) => sum + count, 0),
+      reasons: [...infraFailureCounts.entries()]
+        .sort(([a], [b]) => a.localeCompare(b))
+        .map(([reason, count]) => ({ reason, count })),
+    },
+    cases: caseSummaries,
+    instances,
     metadata: {
       run_id: runId,
       eval_file: evalFile,
@@ -1497,7 +1757,7 @@ export function buildRunSummaryArtifact(
     },
     run_summary: runSummary,
     per_grader_summary: perEvaluatorSummary,
-    timing: buildTimingArtifact(results),
+    metrics: runMetrics,
     notes,
   };
 }
@@ -1780,7 +2040,6 @@ export function buildIndexArtifactEntry(
     outputDir: string;
     resultDir?: string;
     gradingPath?: string;
-    timingPath?: string;
     summaryPath?: string;
     outputPath?: string;
     answerPath?: string;
@@ -1811,6 +2070,9 @@ export function buildIndexArtifactEntry(
     start_time: result.startTime,
     end_time: result.endTime,
     scores: toIndexScores(result.scores),
+    named_scores: collectNamedScores(result.scores),
+    derived_metrics: resultDerivedMetrics(result),
+    provenance: resultProvenance(result),
     attempts: toIndexTrialArtifacts(result),
     aggregation: toTrialAggregationArtifact(result.aggregation),
     execution_status: result.executionStatus,
@@ -1824,9 +2086,6 @@ export function buildIndexArtifactEntry(
     grading_path: options.gradingPath
       ? toRelativeArtifactPath(options.outputDir, options.gradingPath)
       : undefined,
-    timing_path: options.timingPath
-      ? toRelativeArtifactPath(options.outputDir, options.timingPath)
-      : undefined,
     summary_path: options.summaryPath
       ? toRelativeArtifactPath(options.outputDir, options.summaryPath)
       : undefined,
@@ -1854,6 +2113,8 @@ export function buildIndexArtifactEntry(
       : undefined,
     artifact_pointers: options.artifactPointers,
     runtime_source: options.runtimeSource,
+    sample_index: result.sampleIndex,
+    retry_index: result.retryIndex,
     ...options.extraIndexFields,
     external_trace: toIndexExternalTrace(result, options.projectionIdentity?.dimensions.runId),
     projection_identity: options.projectionIdentity
@@ -1884,7 +2145,7 @@ export function buildResultIndexArtifact(
   const hasFileChanges = result.fileChanges !== undefined && result.fileChanges.length > 0;
   const hasTranscript = resultHasExecutionTraceTranscript(result);
   const isSingleRun = !hasPersistedTrialRuns(result);
-  const singleRunDir = path.posix.join(artifactSubdir, attemptDirName(0));
+  const singleRunDir = path.posix.join(artifactSubdir, sampleDirName(0));
 
   return {
     timestamp: result.timestamp,
@@ -1901,6 +2162,9 @@ export function buildResultIndexArtifact(
     start_time: result.startTime,
     end_time: result.endTime,
     scores: toIndexScores(result.scores),
+    named_scores: collectNamedScores(result.scores),
+    derived_metrics: resultDerivedMetrics(result),
+    provenance: resultProvenance(result),
     attempts: toIndexTrialArtifacts(result),
     aggregation: toTrialAggregationArtifact(result.aggregation),
     execution_status: result.executionStatus,
@@ -1911,7 +2175,6 @@ export function buildResultIndexArtifact(
     result_dir: artifactSubdir,
     summary_path: path.posix.join(artifactSubdir, RUN_SUMMARY_FILENAME),
     grading_path: isSingleRun ? path.posix.join(singleRunDir, 'grading.json') : undefined,
-    timing_path: isSingleRun ? path.posix.join(singleRunDir, 'timing.json') : undefined,
     metrics_path: isSingleRun
       ? path.posix.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH)
       : undefined,
@@ -1935,6 +2198,8 @@ export function buildResultIndexArtifact(
       isSingleRun && hasTranscript ? buildResultTranscriptSummary(result) : undefined,
     artifact_pointers: options?.artifactPointers,
     runtime_source: options?.runtimeSource,
+    sample_index: result.sampleIndex,
+    retry_index: result.retryIndex,
     ...extraIndexFields,
     external_trace: toIndexExternalTrace(result, options?.projectionIdentity?.dimensions.runId),
     projection_identity: options?.projectionIdentity
@@ -2004,17 +2269,28 @@ function buildMetricsArtifactPayload(params: {
   readonly timingArtifactPath?: string | null;
   readonly fileChangesArtifactPath?: string;
   readonly timing?: TimingArtifact;
-}): ReturnType<typeof buildMetricsArtifact> & { readonly timing?: TimingArtifact } {
+}): ReturnType<typeof buildMetricsArtifact> & {
+  readonly duration?: TimingArtifact['duration'];
+  readonly tokens?: TimingArtifact['tokens'];
+  readonly cost?: TimingArtifact['cost'];
+} {
   const artifact = buildMetricsArtifact(params.result, params.envelope, {
     transcriptPath:
       params.transcriptArtifactPath ??
       (params.transcriptPath ? CANONICAL_TRANSCRIPT_ARTIFACT_PATH : undefined),
     gradingPath: params.gradingArtifactPath ?? 'grading.json',
-    timingPath:
-      params.timingArtifactPath === null ? undefined : (params.timingArtifactPath ?? 'timing.json'),
     fileChangesPath: params.fileChangesArtifactPath,
   });
-  return params.timing ? { ...artifact, timing: params.timing } : artifact;
+  return params.timing
+    ? {
+        ...artifact,
+        duration: params.timing.duration,
+        tokens: params.timing.tokens,
+        cost: params.timing.cost,
+        execution: params.timing.execution,
+        trajectory: params.timing.trajectory,
+      }
+    : artifact;
 }
 
 async function writeMetricsArtifact(params: {
@@ -2027,7 +2303,13 @@ async function writeMetricsArtifact(params: {
   readonly timingArtifactPath?: string | null;
   readonly fileChangesArtifactPath?: string;
   readonly timing?: TimingArtifact;
-}): Promise<ReturnType<typeof buildMetricsArtifact> & { readonly timing?: TimingArtifact }> {
+}): Promise<
+  ReturnType<typeof buildMetricsArtifact> & {
+    readonly duration?: TimingArtifact['duration'];
+    readonly tokens?: TimingArtifact['tokens'];
+    readonly cost?: TimingArtifact['cost'];
+  }
+> {
   const artifactWithTiming = buildMetricsArtifactPayload(params);
   await writeFile(params.filePath, `${JSON.stringify(artifactWithTiming, null, 2)}\n`, 'utf8');
   return artifactWithTiming;
@@ -2183,6 +2465,133 @@ async function rewriteExistingIndexRecords(
   await writeJsonlFile(indexPath, records);
 }
 
+/** Loads `filePath` as JSON; undefined when no text is available or it does not parse. */
+async function readJsonFile(filePath: string): Promise<unknown | undefined> {
+  const raw = await readTextIfExists(filePath);
+  let parsed: unknown;
+  try {
+    parsed = raw === undefined ? undefined : (JSON.parse(raw) as unknown);
+  } catch {
+    parsed = undefined;
+  }
+  return parsed;
+}
+
+/** Returns `summary.metadata.run_id` when it is a non-blank string, otherwise `fallback`. */
+function summaryRunId(summary: unknown, fallback: string): string {
+  const metadata = isRecord(summary) ? summary.metadata : undefined;
+  const candidate = isRecord(metadata) ? metadata.run_id : undefined;
+  // Whitespace-only ids are rejected, but an accepted id is returned untrimmed.
+  return typeof candidate === 'string' && candidate.trim().length > 0 ? candidate : fallback;
+}
+
+/** Returns `summary.metadata.timestamp` if it is a string; undefined in every other case. */
+function summaryTimestamp(summary: unknown): string | undefined {
+  const metadata = isRecord(summary) ? summary.metadata : undefined;
+  const timestamp = isRecord(metadata) ? metadata.timestamp : undefined;
+  return typeof timestamp === 'string' ? timestamp : undefined;
+}
+
+function summaryTargets(summary: unknown): unknown { // raw metadata.targets, unvalidated
+  return isRecord(summary) && isRecord(summary.metadata) ? summary.metadata.targets : undefined;
+}
+
+function summaryTags(summary: unknown): unknown { // raw metadata.tags, unvalidated
+  return isRecord(summary) && isRecord(summary.metadata) ? summary.metadata.tags : undefined;
+}
+
+/**
+ * Decodes `filePath` as JSON Lines, returning the parsed rows in file order.
+ * Blank lines and rows that fail to parse are dropped silently; an undefined
+ * result from readTextIfExists produces an empty array.
+ */
+async function readJsonlFile(filePath: string): Promise<unknown[]> {
+  const text = await readTextIfExists(filePath);
+  const lines: string[] = text === undefined ? [] : text.split('\n');
+  const rows: unknown[] = [];
+  for (const raw of lines.filter((candidate) => candidate.trim().length > 0)) {
+    try {
+      rows.push(JSON.parse(raw) as unknown);
+    } catch {}
+  }
+  return rows;
+}
+
+function buildCrossRunRunRecord(params: { // builds one row describing a single run
+  readonly runId: string;
+  readonly runDirName: string; // prefix for the summary_path and index_path values
+  readonly summary: unknown; // parsed summary; only read when it is a record
+}): Record<string, unknown> {
+  const summary = isRecord(params.summary) ? params.summary : {}; // non-records read as {}
+  return dropUndefined({
+    run_id: params.runId,
+    run_dir: params.runDirName,
+    summary_path: `${params.runDirName}/${RUN_SUMMARY_FILENAME}`, // composed, not probed
+    index_path: `${params.runDirName}/${RUN_INTERNAL_DIRNAME}/${RESULT_INDEX_FILENAME}`,
+    timestamp: summaryTimestamp(summary),
+    targets: summaryTargets(summary),
+    tags: summaryTags(summary),
+    status: summary.status, // status, run_summary and metrics are copied without validation
+    run_summary: summary.run_summary,
+    metrics: summary.metrics,
+  });
+}
+
+function buildCrossRunCaseRecord(params: { // annotates one per-run index row with run info
+  readonly runId: string;
+  readonly runDirName: string;
+  readonly summary: unknown; // source of run_timestamp and run_tags
+  readonly caseRecord: unknown; // a parsed index row; anything but a record is rejected
+}): Record<string, unknown> | undefined {
+  if (!isRecord(params.caseRecord)) {
+    return undefined;
+  }
+  return dropUndefined({
+    ...params.caseRecord,
+    run_id: params.runId, // run_* keys follow the spread, so they override same-named row keys
+    run_dir: params.runDirName,
+    run_timestamp: summaryTimestamp(params.summary),
+    run_tags: summaryTags(params.summary),
+  });
+}
+
+/**
+ * Rebuilds the cross-run run and case JSONL indexes beneath `resultsRoot`.
+ * Each non-hidden child directory with a usable summary file adds one run record and one
+ * case record per object row of its per-run index. Directories are visited in name order,
+ * because `readdir` order is unspecified; an unreadable root is treated as having no runs.
+ */
+export async function rebuildCrossRunIndexes(resultsRoot: string): Promise<void> {
+  const entries = await readdir(resultsRoot, { withFileTypes: true }).catch(() => []);
+  const ordered = [...entries].sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));
+  const runRecords: Record<string, unknown>[] = [];
+  const caseRecords: Record<string, unknown>[] = [];
+  for (const entry of ordered) {
+    if (!entry.isDirectory() || entry.name.startsWith('.')) {
+      continue;
+    }
+    const runDirName = entry.name;
+    const runDir = path.join(resultsRoot, runDirName);
+    const summary = await readJsonFile(path.join(runDir, RUN_SUMMARY_FILENAME));
+    if (!summary) {
+      continue;
+    }
+    const indexPath = (await resolveExistingResultManifestPath(runDir)) ?? runIndexPath(runDir);
+    const runId = summaryRunId(summary, runDirName);
+    runRecords.push(buildCrossRunRunRecord({ runId, runDirName, summary }));
+    for (const caseRecord of await readJsonlFile(indexPath)) {
+      const projected = buildCrossRunCaseRecord({ runId, runDirName, summary, caseRecord });
+      if (projected) {
+        caseRecords.push(projected);
+      }
+    }
+  }
+  const indexesDir = path.join(resultsRoot, CROSS_RUN_INDEX_DIRNAME);
+  await mkdir(indexesDir, { recursive: true });
+  await writeJsonlFile(path.join(indexesDir, CROSS_RUN_RUNS_INDEX_FILENAME), runRecords);
+  await writeJsonlFile(path.join(indexesDir, CROSS_RUN_CASES_INDEX_FILENAME), caseRecords);
+}
+
 type ParsedEvaluationResult = Record<string, unknown> & {
   timestamp: string;
   testId: string;
@@ -2405,7 +2814,7 @@ export async function writePerTestArtifacts(
     }
 
     const isSingleRun = !hasPersistedTrialRuns(result);
-    const singleRunDir = path.join(testDir, attemptDirName(0));
+    const singleRunDir = path.join(testDir, sampleDirName(0));
     const singleAnswerPath =
       isSingleRun && result.output.length > 0
         ? path.join(singleRunDir, 'outputs', 'answer.md')
@@ -2419,7 +2828,6 @@ export async function writePerTestArtifacts(
         ? path.join(singleRunDir, 'transcript-raw.jsonl')
         : undefined;
     const singleGradingPath = isSingleRun ? path.join(singleRunDir, 'grading.json') : undefined;
-    const singleTimingPath = isSingleRun ? path.join(singleRunDir, 'timing.json') : undefined;
     const singleMetricsPath = isSingleRun
       ? path.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH)
       : undefined;
@@ -2442,7 +2850,6 @@ export async function writePerTestArtifacts(
         resultDir: testDir,
         summaryPath: caseSummaryPath,
         gradingPath: singleGradingPath,
-        timingPath: singleTimingPath,
         metricsPath: singleMetricsPath,
         outputPath: singleAnswerPath,
         answerPath: singleAnswerPath,
@@ -2485,7 +2892,7 @@ export async function writeArtifactsFromResults(
 }> {
   const testArtifactDir = outputDir;
   const summaryPath = path.join(outputDir, RUN_SUMMARY_FILENAME);
-  const indexPath = path.join(outputDir, RESULT_INDEX_FILENAME);
+  const indexPath = runIndexPath(outputDir);
   await mkdir(outputDir, { recursive: true });
   const duplicatePolicy = options?.duplicatePolicy ?? 'update';
   const resolvedTags =
@@ -2521,7 +2928,7 @@ export async function writeArtifactsFromResults(
     const caseSummaryPath = path.join(testDir, RUN_SUMMARY_FILENAME);
     const identityId = projectionIdentity.id;
     const isSingleRun = !hasPersistedTrialRuns(result);
-    const singleRunDir = path.join(testDir, attemptDirName(0));
+    const singleRunDir = path.join(testDir, sampleDirName(0));
     const singleAnswerPath =
       isSingleRun && result.output.length > 0
         ? path.join(singleRunDir, 'outputs', 'answer.md')
@@ -2535,7 +2942,6 @@ export async function writeArtifactsFromResults(
         ? path.join(singleRunDir, 'transcript-raw.jsonl')
         : undefined;
     const singleGradingPath = isSingleRun ? path.join(singleRunDir, 'grading.json') : undefined;
-    const singleTimingPath = isSingleRun ? path.join(singleRunDir, 'timing.json') : undefined;
     const singleMetricsPath = isSingleRun
       ? path.join(singleRunDir, CANONICAL_METRICS_ARTIFACT_PATH)
       : undefined;
@@ -2553,7 +2959,6 @@ export async function writeArtifactsFromResults(
       singleTranscriptPath,
       singleTranscriptRawPath,
       singleGradingPath,
-      singleTimingPath,
       singleMetricsPath,
       singleFileChangesPath,
       identityId,
@@ -2624,7 +3029,6 @@ export async function writeArtifactsFromResults(
         resultDir: plan.testDir,
         summaryPath: plan.caseSummaryPath,
         gradingPath: plan.singleGradingPath,
-        timingPath: plan.singleTimingPath,
         metricsPath: plan.singleMetricsPath,
         outputPath: plan.singleAnswerPath,
         answerPath: plan.singleAnswerPath,
@@ -2668,7 +3072,12 @@ export async function writeArtifactsFromResults(
   );
   await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
 
+  await mkdir(path.dirname(indexPath), { recursive: true });
   await writeJsonlFile(indexPath, indexRecords);
+  const resultsRoot = path.dirname(outputDir);
+  if (isCanonicalResultsRoot(resultsRoot)) {
+    await rebuildCrossRunIndexes(resultsRoot);
+  }
 
   return { testArtifactDir, summaryPath, indexPath };
 }
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 3aa0de8d4..3d8eddd81 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -1166,7 +1166,7 @@ export interface TrialResult {
   readonly failureReasonCode?: string;
   /**
    * Full per-attempt result used by artifact writers to materialize AgentV
-   * attempt-N folders. This is intentionally omitted from compact wire summaries.
+   * sample-N folders. This is intentionally omitted from compact wire summaries.
    */
   readonly result?: EvaluationResult;
 }
diff --git a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts
index fa347b6b6..a9074a5f8 100644
--- a/packages/core/test/evaluation/evaluate-programmatic-api.test.ts
+++ b/packages/core/test/evaluation/evaluate-programmatic-api.test.ts
@@ -132,10 +132,15 @@ describe('evaluate() — programmatic API extensions', () => {
 
         expect(result.artifacts).toBeDefined();
         expect(result.artifacts?.runDir).toBe(outputDir);
-        expect(result.artifacts?.indexPath).toBe(path.join(outputDir, RESULT_INDEX_FILENAME));
+        expect(result.artifacts?.indexPath).toBe(
+          path.join(outputDir, '.internal', RESULT_INDEX_FILENAME),
+        );
         expect(result.artifacts?.summaryPath).toBe(path.join(outputDir, 'summary.json'));
 
-        const indexContent = await readFile(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8');
+        const indexContent = await readFile(
+          path.join(outputDir, '.internal', RESULT_INDEX_FILENAME),
+          'utf8',
+        );
         expect(indexContent).toContain('"test_id":"programmatic-artifacts"');
         expect(indexContent).toContain('"experiment":"sdk-test"');
         const [indexRow] = indexContent
@@ -153,20 +158,20 @@ describe('evaluate() — programmatic API extensions', () => {
             tests_run: string[];
             eval_file: string;
           };
-          timing: { duration_ms: number };
+          metrics: { duration: { total_ms: number } };
         };
         expect(summaryArtifact.metadata.run_id).toBe(path.basename(outputDir));
         expect(summaryArtifact.metadata.experiment).toBe('sdk-test');
         expect(summaryArtifact.metadata.tests_run).toEqual(['programmatic-artifacts']);
         expect(summaryArtifact.metadata.eval_file).toBe('');
-        expect(summaryArtifact.timing.duration_ms).toBeGreaterThanOrEqual(0);
+        expect(summaryArtifact.metrics.duration.total_ms).toBeGreaterThanOrEqual(0);
 
         expect(resultDir).toMatch(/^programmatic-artifacts--[a-f0-9]{12}$/);
-        expect(existsSync(path.join(outputDir, resultDir ?? '', 'attempt-1', 'grading.json'))).toBe(
+        expect(existsSync(path.join(outputDir, resultDir ?? '', 'sample-1', 'grading.json'))).toBe(
           true,
         );
         expect(
-          existsSync(path.join(outputDir, resultDir ?? '', 'attempt-1', 'outputs', 'answer.md')),
+          existsSync(path.join(outputDir, resultDir ?? '', 'sample-1', 'outputs', 'answer.md')),
         ).toBe(true);
       } finally {
         rmSync(outputDir, { recursive: true, force: true });
diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts
index f57e10d43..760306e80 100644
--- a/packages/core/test/evaluation/orchestrator.test.ts
+++ b/packages/core/test/evaluation/orchestrator.test.ts
@@ -825,13 +825,13 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
     const outputDir = path.join(tempDir, 'artifacts');
     await writeArtifactsFromResults([result], outputDir);
 
-    const indexRows = readFileSync(path.join(outputDir, RESULT_INDEX_FILENAME), 'utf8')
+    const indexRows = readFileSync(path.join(outputDir, '.internal', RESULT_INDEX_FILENAME), 'utf8')
       .trim()
       .split('\n')
       .map((line) => JSON.parse(line) as Record<string, string | undefined>);
     const resultDir = indexRows[0]?.result_dir;
     expect(resultDir).toMatch(/^case-1--[a-f0-9]{12}$/);
-    const runDir = path.join(outputDir, resultDir ?? '', 'attempt-1');
+    const runDir = path.join(outputDir, resultDir ?? '', 'sample-1');
     const outputsDir = path.join(runDir, 'outputs');
     expect(readdirSync(runDir)).not.toContain('provider.log');
     expect(readdirSync(runDir)).toContain('transcript-raw.jsonl');
@@ -844,8 +844,8 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`,
 
     expect(indexRows[0]?.raw_provider_log_path).toBeUndefined();
     expect(indexRows[0]?.trace_path).toBeUndefined();
-    expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/attempt-1/transcript.json`);
-    expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/attempt-1/transcript-raw.jsonl`);
+    expect(indexRows[0]?.transcript_path).toBe(`${resultDir}/sample-1/transcript.json`);
+    expect(indexRows[0]?.transcript_raw_path).toBe(`${resultDir}/sample-1/transcript-raw.jsonl`);
     expect(existsSync(rawLogPath)).toBe(false);
   });
 
diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts
index fce1fc1ea..f1a8a758a 100644
--- a/packages/core/test/evaluation/results-repo.test.ts
+++ b/packages/core/test/evaluation/results-repo.test.ts
@@ -433,7 +433,7 @@ describe('listGitRuns', () => {
       experiment: 'with-skills',
       timestamp: '2026-05-21T11:00:00.000Z',
       display_name: 'remote friendly run',
-      manifest_path: '2026-05-21T11-00-00-000Z/.internal/index.jsonl',
+      index_path: '2026-05-21T11-00-00-000Z/.internal/index.jsonl',
       summary_path: '2026-05-21T11-00-00-000Z/summary.json',
       test_count: 3,
       pass_rate: 0.75,
@@ -444,7 +444,7 @@ describe('listGitRuns', () => {
       experiment: 'default',
       display_name: '2026-05-20T10-00-00-000Z',
       target: 'gpt-4o',
-      manifest_path: '2026-05-20T10-00-00-000Z/.internal/index.jsonl',
+      index_path: '2026-05-20T10-00-00-000Z/.internal/index.jsonl',
       test_count: 2,
       pass_rate: 0.5,
     });
diff --git a/skills-data/agentv-eval-migrations/references/breaking-changes.md b/skills-data/agentv-eval-migrations/references/breaking-changes.md
index 113cdb7b6..0f0f697d6 100644
--- a/skills-data/agentv-eval-migrations/references/breaking-changes.md
+++ b/skills-data/agentv-eval-migrations/references/breaking-changes.md
@@ -911,6 +911,13 @@ v4.42.4 docs described local run workspaces under
 workspaces under `.agentv/results/<run_id>/`, with experiment metadata stored
 in `summary.json` / rows rather than inferred from the path.
 
+Within each run bundle, the per-run index is `.internal/index.jsonl` and
+`summary.json` points to it with `index_path`. Per-sample execution folders are
+named `sample-N`; use row fields such as `sample_index` and `retry_index` for
+semantics. New writers emit `metrics.json` for duration, tokens, cost,
+execution, and trajectory data; they do not emit `timing.json`, `timing_path`,
+or nested `metrics.timing`.
+
 Do not edit eval YAML just to chase result artifact path changes. Migrate only
 authored fields that the eval parser reads. Use: