diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py
new file mode 100755
index 000000000..8a12dc05e
--- /dev/null
+++ b/.github/scripts/aggregate_recursion_histogram.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Format the recursion-guest per-function profile as a Markdown PR comment.
+
+`test_recursion_pc_histogram` prints a per-function summary table (cycles folded
+over each function's PCs, computed across the *full* histogram) followed by a
+per-address detail table. We extract the per-function table — the view that
+shows where the cycles actually go — and render it as Markdown.
+
+    Top 25 functions by cycle count (aggregated over their PCs):
+    rank          cycles        %    cum %    PCs  function (file:line)
+       1         5335072   24.95%   24.95%     72  <...>::visit_seq::<...>
+
+Reads the test's captured output from the positional `log` argument; writes the
+Markdown body to the `-o/--out` path (or stdout when that option is omitted).
+"""
+
+import re
+import sys
+
+# A per-function summary row: rank, cycles, pct%, cum%, pcs, function.
+# Distinguished from the per-PC detail rows by the absence of a 0x<pc> column.
+FN_ROW = re.compile(
+    r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$"
+)
+FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count")
+PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count")
+TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)")
+UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)")
+EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)")
+
+
+def parse(text):
+    """Extract the header stats and per-function rows from captured test output.
+
+    Returns ``(total_cycles, unique_pcs, exec_time, rows)``. Each stat keeps its
+    first occurrence (``None`` if absent); ``rows`` has one dict per table row.
+    """
+    stats = {"total": None, "pcs": None, "time": None}
+    patterns = (
+        ("total", TOTAL_CYCLES, int),
+        ("pcs", UNIQUE_PCS, int),
+        ("time", EXEC_TIME, str),
+    )
+    fn_rows = []
+    collecting = False
+    for raw in text.splitlines():
+        for key, pattern, convert in patterns:
+            if stats[key] is None and (hit := pattern.search(raw)):
+                stats[key] = convert(hit.group(1))
+        if FN_TABLE_START.search(raw):
+            collecting = True
+        elif PC_TABLE_START.search(raw):
+            collecting = False
+        elif collecting and (hit := FN_ROW.match(raw)):
+            cycles, pct, cum, pcs, fn = hit.groups()
+            fn_rows.append(
+                {"cycles": int(cycles), "pct": pct, "cum": cum, "pcs": int(pcs), "fn": fn}
+            )
+    return stats["total"], stats["pcs"], stats["time"], fn_rows
+
+
+def short(name, width=90):  # names longer than `width` are clipped and end in "…"
+    return name[: width - 1] + "…" if len(name) > width else name
+
+
+def render(total_cycles, unique_pcs, exec_time, rows, title="Recursion guest profile"):
+    """Render the parsed profile as a Markdown section headed by ``title``.
+
+    Returns a warning block when ``rows`` is empty. Each header stat is emitted
+    independently; "|" in function names is escaped to keep table cells intact.
+    """
+    if not rows:
+        return (
+            f"### {title}\n\n"
+            "> ⚠️ No per-function rows found in the test output — the run may "
+            "have failed before printing the table. Check the workflow logs.\n"
+        )
+
+    stats = [f"**Total cycles:** {total_cycles:,}"] if total_cycles is not None else []
+    if unique_pcs is not None:
+        stats.append(f"**Unique PCs:** {unique_pcs:,}")
+    if exec_time:
+        stats.append(f"**Exec time:** {exec_time}")
+    parts = [f"### {title}\n\n"]
+    if stats:
+        parts.append(" · ".join(stats) + "\n\n")
+    parts.append(f"#### Top {len(rows)} functions by cycles (folded over their PCs)\n\n")
+    parts.append("| Rank | Cycles | % | Cum % | PCs | Function |\n")
+    parts.append("|-----:|-------:|--:|------:|----:|----------|\n")
+    for i, r in enumerate(rows, 1):
+        fn = short(r["fn"]).replace("|", "\\|")  # a bare "|" would end the table cell early
+        parts.append(f"| {i} | {r['cycles']:,} | {r['pct']}% | {r['cum']}% | {r['pcs']} | `{fn}` |\n")
+    parts.append(
+        f"\n<sub>Each function's cycles are summed over all its program counters "
+        f"across the full histogram; the top {len(rows)} cover {rows[-1]['cum']}% of total "
+        f"cycles. Percentages are of total cycles.</sub>\n"
+    )
+    return "".join(parts)
+
+
+def main():
+    """CLI entry point: parse the log named on the command line and emit Markdown."""
+    import argparse
+
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("log", help="captured test output to parse")
+    ap.add_argument("-o", "--out", help="write Markdown here instead of stdout")
+    title_help = "section heading (e.g. the test/config name)"
+    ap.add_argument("-t", "--title", default="Recursion guest profile", help=title_help)
+    args = ap.parse_args()
+
+    # Pin UTF-8 on both files instead of relying on the locale default: the
+    # rendered body can contain non-ASCII characters ("⚠️", "·", "…", "—"), and a
+    # non-UTF-8 runner locale would otherwise garble or reject them.
+    with open(args.log, "r", encoding="utf-8", errors="replace") as f:
+        text = f.read()
+    body = render(*parse(text), title=args.title)
+    if args.out:
+        with open(args.out, "w", encoding="utf-8") as f:
+            f.write(body)
+    else:
+        sys.stdout.write(body)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml
new file mode 100644
index 000000000..707b9dd6a
--- /dev/null
+++ b/.github/workflows/profile-recursion.yml
@@ -0,0 +1,177 @@
+name: Profile Recursion (PR)
+
+# Runs the recursion-guest PC histogram diagnostics (single-query and
+# multi-query, in parallel via a matrix) and posts a combined per-function
+# profile as a PR comment. Triggered by a `/profile_recursion` comment from a
+# repo member, or manually via workflow_dispatch.
+
+on:
+  workflow_dispatch:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+concurrency:  # issue_comment fires for EVERY comment; only real /profile_recursion requests may share (and cancel within) the per-PR group
+  group: profile-recursion-${{ (github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/profile_recursion') && github.event.issue.number) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # One job per configuration; they run in parallel and each uploads a Markdown
+  # fragment artifact. The `comment` job stitches them into one PR comment.
+  profile:
+    # Skip unless: workflow_dispatch, or "/profile_recursion" comment on a PR by a member.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+       github.event.issue.pull_request &&
+       startsWith(github.event.comment.body, '/profile_recursion') &&
+       contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    runs-on: [self-hosted, bench]
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: single-query
+            test: test_recursion_pc_histogram
+            title: "Single query (blowup=2, 1 query)"
+          - name: multi-query
+            test: test_recursion_pc_histogram_multiquery
+            title: "Multi query (blowup=8, 128-bit)"
+    steps:
+      - name: React to comment
+        if: github.event_name == 'issue_comment' && matrix.name == 'single-query'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes'
+            });
+
+      - name: Get PR head ref
+        id: pr-ref
+        if: github.event_name == 'issue_comment'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.pr-ref.outputs.sha || github.sha }}
+
+      - name: Setup Rust Environment
+        uses: ./.github/actions/setup-rust
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run recursion PC histogram (${{ matrix.name }})
+        env:
+          TEST: ${{ matrix.test }}
+        run: |
+          # Self-provision the RISC-V sysroot in a user-writable dir (the default
+          # /opt path on the bench runner is root-owned); the guest ELF build the
+          # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`.
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          set -o pipefail
+          # The test is #[ignore]d and prints to stderr. Log to the per-job RUNNER_TEMP: /tmp on a self-hosted runner keeps stale logs and is shared by parallel jobs.
+          cargo test --package lambda-vm-prover --lib "$TEST" -- --ignored --nocapture 2>&1 | tee "$RUNNER_TEMP/hist.log"
+
+      - name: Aggregate into a per-function fragment
+        if: always()
+        env:
+          TITLE: ${{ matrix.title }}
+        run: |
+          touch "$RUNNER_TEMP/hist.log"  # keep the parser runnable when an earlier step failed before the log existed
+          python3 .github/scripts/aggregate_recursion_histogram.py \
+            "$RUNNER_TEMP/hist.log" --title "$TITLE" --out "$RUNNER_TEMP/fragment-${{ matrix.name }}.md"
+          cat "$RUNNER_TEMP/fragment-${{ matrix.name }}.md" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload fragment
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: profile-fragment-${{ matrix.name }}
+          path: ${{ runner.temp }}/fragment-${{ matrix.name }}.md
+          retention-days: 7
+
+  # Stitch the matrix fragments into a single PR comment. always() also fires when `profile` was skipped (any unrelated comment), so require that it actually ran.
+  comment:
+    needs: profile
+    if: always() && github.event_name == 'issue_comment' && needs.profile.result != 'skipped'
+    runs-on: [self-hosted, bench]
+    steps:
+      - name: Get PR head ref
+        id: pr-ref
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Download fragments
+        uses: actions/download-artifact@v4
+        with:
+          path: fragments
+          pattern: profile-fragment-*
+          merge-multiple: true
+
+      - name: Assemble comment body
+        env:
+          COMMIT_SHA: ${{ steps.pr-ref.outputs.sha }}
+        run: |
+          {
+            echo "## Recursion guest profile"
+            echo
+            # Fixed order: single-query first, then multi-query (any other fragment is not included).
+            for frag in fragments/fragment-single-query.md \
+                        fragments/fragment-multi-query.md; do
+              [ -f "$frag" ] && { cat "$frag"; echo; }
+            done
+            echo "<sub>Commit: ${COMMIT_SHA:0:8} · Runner: self-hosted bench</sub>"
+          } > /tmp/profile_comment.md
+          cat /tmp/profile_comment.md
+
+      - name: Comment on PR
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('/tmp/profile_comment.md', 'utf8');
+            // Paginate: a plain listComments call returns only the first page, which would miss an older profile comment on busy PRs.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            // Reuse our own comment (recognised by its leading heading) so repeated /profile_recursion runs update in place.
+            const existing = comments.find(c =>
+              c.user?.type === 'Bot' &&
+              (c.body || '').startsWith('## Recursion guest profile')
+            );
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body,
+              });
+            }
diff --git a/Cargo.lock b/Cargo.lock
index da2929c9d..4be069a65 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,28 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "addr2line"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "efe1709241908a54ef1925c6018f41d3f523d0cfe174719761eb39e7b7bf086a"
+dependencies = [
+ "cpp_demangle",
+ "fallible-iterator",
+ "gimli",
+ "memmap2",
+ "object",
+ "rustc-demangle",
+ "smallvec",
+ "typed-arena",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
+
 [[package]]
 name = "ahash"
 version = "0.8.12"
@@ -230,6 +252,15 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+[[package]]
+name = "atomic-polyfill"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4"
+dependencies = [
+ "critical-section",
+]
+
 [[package]]
 name = "atty"
 version = "0.2.14"
@@ -465,7 +496,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
 dependencies = [
  "ciborium-io",
- "half 2.7.1",
+ "half",
 ]
 
 [[package]]
@@ -543,6 +574,15 @@ dependencies = [
  "tikv-jemallocator",
 ]
 
+[[package]]
+name = "cobs"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
+dependencies = [
+ "thiserror",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.4"
@@ -590,6 +630,15 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
+[[package]]
+name = "cpp_demangle"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0667304c32ea56cb4cd6d2d7c0cfe9a2f8041229db8c033af7f8d69492429def"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "cpufeatures"
 version = "0.2.17"
@@ -668,6 +717,12 @@ dependencies = [
  "itertools 0.10.5",
 ]
 
+[[package]]
+name = "critical-section"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
+
 [[package]]
 name = "crossbeam"
 version = "0.8.4"
@@ -934,6 +989,18 @@ dependencies = [
  "zeroize",
 ]
 
+[[package]]
+name = "embedded-io"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced"
+
+[[package]]
+name = "embedded-io"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
+
 [[package]]
 name = "enum-ordinalize"
 version = "4.3.2"
@@ -1046,7 +1113,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sha2",
- "thiserror 2.0.17",
+ "thiserror",
  "tracing",
 ]
 
@@ -1069,7 +1136,7 @@ dependencies = [
  "ripemd",
  "secp256k1",
  "sha2",
- "thiserror 2.0.17",
+ "thiserror",
  "tiny-keccak",
 ]
 
@@ -1089,7 +1156,7 @@ dependencies = [
  "rkyv",
  "serde",
  "serde_with",
- "thiserror 2.0.17",
+ "thiserror",
 ]
 
 [[package]]
@@ -1107,7 +1174,7 @@ dependencies = [
  "secp256k1",
  "serde",
  "serde_with",
- "thiserror 2.0.17",
+ "thiserror",
  "tracing",
 ]
 
@@ -1126,7 +1193,7 @@ dependencies = [
  "rustc-hash",
  "serde",
  "strum",
- "thiserror 2.0.17",
+ "thiserror",
 ]
 
 [[package]]
@@ -1136,7 +1203,7 @@ source = "git+https://github.com/lambdaclass/ethrex.git?rev=156cb8d6a3974f411d71
 dependencies = [
  "bytes",
  "ethereum-types",
- "thiserror 2.0.17",
+ "thiserror",
 ]
 
 [[package]]
@@ -1155,7 +1222,7 @@ dependencies = [
  "rkyv",
  "rustc-hash",
  "serde",
- "thiserror 2.0.17",
+ "thiserror",
 ]
 
 [[package]]
@@ -1173,7 +1240,7 @@ dependencies = [
  "rayon",
  "rustc-hash",
  "serde",
- "thiserror 2.0.17",
+ "thiserror",
  "tracing",
 ]
 
@@ -1183,14 +1250,21 @@ version = "0.1.0"
 dependencies = [
  "ecsm",
  "ethrex-guest-program",
+ "hashbrown 0.14.5",
  "rkyv",
  "rustc-demangle",
  "serde",
  "serde_json",
- "thiserror 1.0.69",
+ "thiserror",
  "tiny-keccak",
 ]
 
+[[package]]
+name = "fallible-iterator"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
+
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -1226,6 +1300,16 @@ dependencies = [
  "static_assertions",
 ]
 
+[[package]]
+name = "flate2"
+version = "1.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
 [[package]]
 name = "fnv"
 version = "1.0.7"
@@ -1286,6 +1370,15 @@ dependencies = [
  "wasip2",
 ]
 
+[[package]]
+name = "gimli"
+version = "0.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1033caf0b349c518623b5396bfb2cf0bddf44f0306d543a250e5743297aafd10"
+dependencies = [
+ "stable_deref_trait",
+]
+
 [[package]]
 name = "group"
 version = "0.13.0"
@@ -1297,12 +1390,6 @@ dependencies = [
  "subtle",
 ]
 
-[[package]]
-name = "half"
-version = "1.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403"
-
 [[package]]
 name = "half"
 version = "2.7.1"
@@ -1314,12 +1401,30 @@ dependencies = [
  "zerocopy",
 ]
 
+[[package]]
+name = "hash32"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
 
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.15.5"
@@ -1347,6 +1452,20 @@ version = "0.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
 
+[[package]]
+name = "heapless"
+version = "0.7.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f"
+dependencies = [
+ "atomic-polyfill",
+ "hash32",
+ "rustc_version",
+ "serde",
+ "spin",
+ "stable_deref_trait",
+]
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -1619,14 +1738,17 @@ dependencies = [
 name = "lambda-vm-prover"
 version = "0.1.0"
 dependencies = [
+ "addr2line",
  "bincode",
  "criterion 0.5.1",
  "crypto",
  "ecsm",
  "env_logger",
  "executor",
+ "hashbrown 0.14.5",
  "log",
  "math",
+ "postcard",
  "rayon",
  "serde",
  "sha3",
@@ -1699,6 +1821,15 @@ version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
 
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
 [[package]]
 name = "log"
 version = "0.4.29"
@@ -1814,6 +1945,32 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "minicbor"
+version = "2.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b7a5041e12946f8b7d3f5a9d96383a19d694b9335457c522be7815b9abafb02"
+
+[[package]]
+name = "minicbor-serde"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "293c7245401f035e2dcc4b12ebdb5c9d8847247fc79fe1b5b0a0d58d7275324c"
+dependencies = [
+ "minicbor",
+ "serde",
+]
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316"
+dependencies = [
+ "adler2",
+ "simd-adler32",
+]
+
 [[package]]
 name = "munge"
 version = "0.4.7"
@@ -1886,6 +2043,17 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "object"
+version = "0.39.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e5a6c098c7a3b6547378093f5cc30bc54fd361ce711e05293a5cc589562739b"
+dependencies = [
+ "flate2",
+ "memchr",
+ "ruzstd",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.21.3"
@@ -2030,6 +2198,19 @@ dependencies = [
  "portable-atomic",
 ]
 
+[[package]]
+name = "postcard"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
+dependencies = [
+ "cobs",
+ "embedded-io 0.4.0",
+ "embedded-io 0.6.1",
+ "heapless",
+ "serde",
+]
+
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -2383,6 +2564,15 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3e75f6a532d0fd9f7f13144f392b6ad56a32696bfcd9c78f797f16bbb6f072d6"
 
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
 [[package]]
 name = "rustix"
 version = "1.1.3"
@@ -2414,6 +2604,15 @@ dependencies = [
  "wait-timeout",
 ]
 
+[[package]]
+name = "ruzstd"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7c1c839d570d835527c9a5e4db7cb2198683a988cb9d7293fc8674e6bd58fc8"
+dependencies = [
+ "twox-hash",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.21"
@@ -2462,6 +2661,12 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
 [[package]]
 name = "sec1"
 version = "0.7.3"
@@ -2496,6 +2701,12 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -2517,16 +2728,6 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "serde_cbor"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5"
-dependencies = [
- "half 1.8.3",
- "serde",
-]
-
 [[package]]
 name = "serde_core"
 version = "1.0.228"
@@ -2637,12 +2838,33 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "simd-adler32"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214"
+
 [[package]]
 name = "simdutf8"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
 
+[[package]]
+name = "smallvec"
+version = "1.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90"
+
+[[package]]
+name = "spin"
+version = "0.9.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]
+
 [[package]]
 name = "spki"
 version = "0.7.3"
@@ -2653,6 +2875,12 @@ dependencies = [
  "der",
 ]
 
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
 [[package]]
 name = "stark"
 version = "0.1.0"
@@ -2661,22 +2889,23 @@ dependencies = [
  "criterion 0.4.0",
  "crypto",
  "env_logger",
+ "hashbrown 0.14.5",
  "itertools 0.11.0",
  "libc",
+ "libm",
  "log",
  "math",
  "math-cuda",
  "memmap2",
+ "minicbor-serde",
  "rand 0.8.5",
  "rand_chacha 0.3.1",
  "rayon",
  "serde",
  "serde-wasm-bindgen",
- "serde_cbor",
  "sha3",
  "tempfile",
  "test-log",
- "thiserror 1.0.69",
  "wasm-bindgen",
  "web-sys",
 ]
@@ -2791,33 +3020,13 @@ version = "0.16.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c13547615a44dc9c452a8a534638acdf07120d4b6847c8178705da06306a3057"
 
-[[package]]
-name = "thiserror"
-version = "1.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
-dependencies = [
- "thiserror-impl 1.0.69",
-]
-
 [[package]]
 name = "thiserror"
 version = "2.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
 dependencies = [
- "thiserror-impl 2.0.17",
-]
-
-[[package]]
-name = "thiserror-impl"
-version = "1.0.69"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
+ "thiserror-impl",
 ]
 
 [[package]]
@@ -3027,6 +3236,18 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "twox-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c"
+
+[[package]]
+name = "typed-arena"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a"
+
 [[package]]
 name = "typenum"
 version = "1.19.0"
diff --git a/bench_vs/build_recursion_elfs.sh b/bench_vs/build_recursion_elfs.sh
new file mode 100755
index 000000000..a529b4bbe
--- /dev/null
+++ b/bench_vs/build_recursion_elfs.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Build the empty, fibonacci, recursion and deserialize-only guest ELFs for the recursion smoke test.
+#
+# Uses the same toolchain + flags as bench_vs/run.sh, plus pins serde to the last
+# pre-`serde_core`-split version (1.0.219) inside each guest's own workspace lock
+# so build-std works on the riscv64im-lambda-vm-elf target.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+ROOT_DIR="$(cd -- "$SCRIPT_DIR/.." &>/dev/null && pwd)"
+TARGET_SPEC="$ROOT_DIR/executor/programs/riscv64im-lambda-vm-elf.json"
+
+TOOLCHAIN="nightly-2026-02-01"
+
+build_one() {  # $1 = guest crate directory name under bench_vs/lambda/
+    local name="$1" dir="$ROOT_DIR/bench_vs/lambda/$1"
+    echo "[recursion-elfs] building $name ..."
+    (
+        cd "$dir"
+        # Recursion/deserialize-only guests pull in lambda-vm-prover and its
+        # serde stack; pin serde to 1.0.219 (pre-`serde_core` split) so
+        # `-Z build-std=core,alloc` works. Best-effort, but no longer silent.
+        if [ "$name" = "recursion" ] || [ "$name" = "deserialize-only" ]; then
+            cargo "+$TOOLCHAIN" update -p serde --precise 1.0.219 2>/dev/null ||
+                echo "[recursion-elfs] warning: could not pin serde to 1.0.219 for $name" >&2
+        fi
+        cargo "+$TOOLCHAIN" build --release \
+            --target "$TARGET_SPEC" \
+            -Z build-std=core,alloc \
+            -Z build-std-features=compiler-builtins-mem \
+            -Z json-target-spec
+    )
+}
+
+build_one empty
+build_one fibonacci
+build_one recursion
+build_one deserialize-only
+
+echo "[recursion-elfs] done"
diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml
new file mode 100644
index 000000000..be730c3ec
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.riscv64im-lambda-vm-elf]
+rustflags = [
+  "-C", "link-arg=-e",
+  "-C", "link-arg=main",
+  "-C", "passes=lower-atomic"
+]
diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml
new file mode 100644
index 000000000..b4a4616f4
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/Cargo.toml
@@ -0,0 +1,13 @@
+[workspace]
+
+[package]
+name = "deserialize-only-bench"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+lambda-vm-prover = { path = "../../../prover", default-features = false }
+embedded-alloc = "0.6"
+riscv = { version = "0.15", features = ["critical-section-single-hart"] }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs
new file mode 100644
index 000000000..e2cecc938
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/src/main.rs
@@ -0,0 +1,94 @@
+//! Deserialize-only counterpart to the recursion guest.
+//!
+//! Reads the same private-input blob as `recursion-bench`, postcard-decodes
+//! `(VmProof, Vec<u8>, ProofOptions, VmVerifyingKey)`, then commits success
+//! and halts — without ever calling `verify_with_options`. The cycle delta
+//! between this guest and `recursion-bench` is the actual cost of the STARK
+//! verifier inside the VM (everything else being equal).
+
+#![no_std]
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+use core::arch::asm;
+use core::panic::PanicInfo;
+
+use embedded_alloc::TlsfHeap as Heap;
+use lambda_vm_prover::{ProofOptions, VmProof, VmVerifyingKey};
+// Required to pull in the riscv crate's critical-section implementation.
+use riscv as _;
+
+const PRIVATE_INPUT_START: usize = 0xFF000000;
+const SYSCALL_COMMIT: u64 = 64;
+const SYSCALL_HALT: u64 = 93;
+const MAX_MEMORY_SIZE: usize = 0xC000_0000;
+
+#[global_allocator]
+static HEAP: Heap = Heap::empty();
+
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! { // crate-wide panic handler; the panic info is ignored
+    loop {} // spins forever — NOTE(review): a failed `expect` in `main` therefore never reaches `halt`; confirm the host bounds execution
+}
+
+fn init_allocator() { // hands the global `HEAP` the address range from `_end` up to MAX_MEMORY_SIZE
+    unsafe extern "C" {
+        static _end: u8; // only its address is used; presumably the linker's end-of-image symbol — TODO confirm
+    }
+    let heap_pos = (&raw const _end) as usize;
+    unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) } // second argument is the remaining byte count up to MAX_MEMORY_SIZE
+}
+
+fn read_private_input() -> &'static [u8] { // returns the payload that follows a 4-byte length prefix at PRIVATE_INPUT_START
+    let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize; // length prefix, read as a u32
+    let data = (PRIVATE_INPUT_START + 4) as *const u8; // payload starts right after the prefix
+    unsafe { core::slice::from_raw_parts(data, len) } // NOTE(review): `len` is trusted as-is; assumes the host mapped that many bytes
+}
+
+fn commit(bytes: &[u8]) { // issues the SYSCALL_COMMIT ecall for `bytes`
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 1u64, // constant first argument — presumably an output-stream/fd id; TODO confirm against the VM's syscall ABI
+            in("a1") bytes.as_ptr(), // start of the buffer
+            in("a2") bytes.len(), // buffer length in bytes
+            in("a7") SYSCALL_COMMIT, // syscall number (64)
+        ); // NOTE(review): no outputs/clobbers are declared; if the VM writes a result into a0 this should be `inlateout`
+    }
+}
+
+fn halt() -> ! { // issues the SYSCALL_HALT ecall and never returns
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 0u64, // constant argument — presumably the exit status; TODO confirm
+            in("a7") SYSCALL_HALT, // syscall number (93)
+            options(noreturn), // gives the asm block type `!`; control must not continue past the ecall
+        );
+    }
+}
+
+#[unsafe(no_mangle)] // keeps the symbol name `main`, which this crate's .cargo/config.toml passes to the linker as the entry point (`-e main`)
+pub fn main() -> ! {
+    init_allocator(); // must come first: the decode below allocates (e.g. the `Vec<u8>`)
+
+    let blob = read_private_input();
+    let decoded: (VmProof, Vec<u8>, ProofOptions, VmVerifyingKey) =
+        postcard::from_bytes(blob).expect("failed to deserialize"); // a decode error panics, and the panic handler loops forever
+
+    // Force the commit byte to depend on the actually-decoded value. Without
+    // this, LLVM at -O3 was eliding the postcard decode entirely — the only
+    // sinks for `decoded` were `black_box(&decoded)` (which only forces the
+    // *reference* to materialize, not the pointee) and `Drop`, neither of
+    // which require the decoded bytes to be real. With the commit byte tied
+    // to a deep field of the decoded value, the decode has to run.
+    let proof_options_byte = decoded.2.blowup_factor;
+    let inner_elf_byte = *decoded.1.first().unwrap_or(&0); // 0 when the byte vector is empty
+    let vkey_byte = decoded.3.bitwise[0];
+    let marker = proof_options_byte ^ inner_elf_byte ^ vkey_byte; // XOR of one byte from each of three decoded fields
+
+    commit(&[marker]); // commits that single marker byte
+    halt()
+}
diff --git a/bench_vs/lambda/empty/.cargo/config.toml b/bench_vs/lambda/empty/.cargo/config.toml
new file mode 100644
index 000000000..be730c3ec
--- /dev/null
+++ b/bench_vs/lambda/empty/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.riscv64im-lambda-vm-elf]
+rustflags = [
+  "-C", "link-arg=-e",
+  "-C", "link-arg=main",
+  "-C", "passes=lower-atomic"
+]
diff --git a/bench_vs/lambda/empty/Cargo.lock b/bench_vs/lambda/empty/Cargo.lock
new file mode 100644
index 000000000..11dcd8cb1
--- /dev/null
+++ b/bench_vs/lambda/empty/Cargo.lock
@@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "empty-bench"
+version = "0.1.0"
diff --git a/bench_vs/lambda/empty/Cargo.toml b/bench_vs/lambda/empty/Cargo.toml
new file mode 100644
index 000000000..a6e4a0530
--- /dev/null
+++ b/bench_vs/lambda/empty/Cargo.toml
@@ -0,0 +1,8 @@
+[workspace]
+
+[package]
+name = "empty-bench"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
diff --git a/bench_vs/lambda/empty/src/main.rs b/bench_vs/lambda/empty/src/main.rs
new file mode 100644
index 000000000..555cae897
--- /dev/null
+++ b/bench_vs/lambda/empty/src/main.rs
@@ -0,0 +1,28 @@
+#![no_std]
+#![no_main]
+
+use core::arch::asm;
+use core::panic::PanicInfo;
+
+const SYSCALL_HALT: u64 = 93;
+
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! { // panic hook for this `no_std` binary; the panic info is discarded
+    loop {} // spins forever — no halt syscall is issued on panic
+}
+
+fn halt() -> ! { // issues the SYSCALL_HALT (93) ecall and never returns
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 0u64, // NOTE(review): presumably the exit status — confirm against the VM syscall ABI
+            in("a7") SYSCALL_HALT, // syscall selector
+            options(noreturn), // tells the compiler control never comes back from this asm block
+        );
+    }
+}
+
+#[unsafe(no_mangle)] // keeps the symbol name `main`, which this crate's linker args (`-e main`) select as the entry point
+pub fn main() -> ! {
+    halt() // the guest does nothing except halt
+}
diff --git a/bench_vs/lambda/recursion/.cargo/config.toml b/bench_vs/lambda/recursion/.cargo/config.toml
new file mode 100644
index 000000000..be730c3ec
--- /dev/null
+++ b/bench_vs/lambda/recursion/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.riscv64im-lambda-vm-elf]
+rustflags = [
+  "-C", "link-arg=-e",
+  "-C", "link-arg=main",
+  "-C", "passes=lower-atomic"
+]
diff --git a/bench_vs/lambda/recursion/Cargo.lock b/bench_vs/lambda/recursion/Cargo.lock
new file mode 100644
index 000000000..c19590031
--- /dev/null
+++ b/bench_vs/lambda/recursion/Cargo.lock
@@ -0,0 +1,796 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "base16ct"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
+
+[[package]]
+name = "base64"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cobs"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
+dependencies = [
+ "thiserror",
+]
+
+[[package]]
+name = "const-default"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b396d1f76d455557e1218ec8066ae14bba60b4b36ecd55577ba979f5db7ecaa"
+
+[[package]]
+name = "const-oid"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "critical-section"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
+
+[[package]]
+name = "crypto"
+version = "0.1.0"
+dependencies = [
+ "digest",
+ "math",
+ "rand",
+ "rand_chacha",
+ "serde",
+ "sha3",
+]
+
+[[package]]
+name = "crypto-bigint"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
+dependencies = [
+ "generic-array",
+ "rand_core",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "der"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
+dependencies = [
+ "const-oid",
+ "zeroize",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+]
+
+[[package]]
+name = "ecsm"
+version = "0.1.0"
+dependencies = [
+ "k256",
+ "num-bigint",
+ "num-traits",
+]
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "elliptic-curve"
+version = "0.13.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
+dependencies = [
+ "base16ct",
+ "crypto-bigint",
+ "ff",
+ "generic-array",
+ "group",
+ "rand_core",
+ "sec1",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "embedded-alloc"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f2de9133f68db0d4627ad69db767726c99ff8585272716708227008d3f1bddd"
+dependencies = [
+ "const-default",
+ "critical-section",
+ "linked_list_allocator",
+ "rlsf",
+]
+
+[[package]]
+name = "embedded-hal"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "361a90feb7004eca4019fb28352a9465666b24f840f5c3cddf0ff13920590b89"
+
+[[package]]
+name = "embedded-io"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced"
+
+[[package]]
+name = "embedded-io"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
+
+[[package]]
+name = "executor"
+version = "0.1.0"
+dependencies = [
+ "ecsm",
+ "hashbrown",
+ "thiserror",
+]
+
+[[package]]
+name = "ff"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393"
+dependencies = [
+ "rand_core",
+ "subtle",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+ "zeroize",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "group"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
+dependencies = [
+ "ff",
+ "rand_core",
+ "subtle",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+dependencies = [
+ "ahash",
+]
+
+[[package]]
+name = "itertools"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.98"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67df7112613f8bfd9150013a0314e196f4800d3201ae742489d999db2f979f08"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "k256"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b"
+dependencies = [
+ "cfg-if",
+ "elliptic-curve",
+]
+
+[[package]]
+name = "keccak"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653"
+dependencies = [
+ "cpufeatures",
+]
+
+[[package]]
+name = "lambda-vm-prover"
+version = "0.1.0"
+dependencies = [
+ "crypto",
+ "ecsm",
+ "executor",
+ "hashbrown",
+ "log",
+ "math",
+ "postcard",
+ "serde",
+ "sha3",
+ "stark",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.186"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
+
+[[package]]
+name = "libm"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+
+[[package]]
+name = "linked_list_allocator"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b23ac50abb8261cb38c6e2a7192d3302e0836dac1628f6a93b82b4fad185897"
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+
+[[package]]
+name = "math"
+version = "0.1.0"
+dependencies = [
+ "getrandom",
+ "num-bigint",
+ "num-traits",
+ "rand",
+ "serde",
+]
+
+[[package]]
+name = "minicbor"
+version = "2.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b7a5041e12946f8b7d3f5a9d96383a19d694b9335457c522be7815b9abafb02"
+
+[[package]]
+name = "minicbor-serde"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "293c7245401f035e2dcc4b12ebdb5c9d8847247fc79fe1b5b0a0d58d7275324c"
+dependencies = [
+ "minicbor",
+ "serde",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
+
+[[package]]
+name = "postcard"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
+dependencies = [
+ "cobs",
+ "embedded-io 0.4.0",
+ "embedded-io 0.6.1",
+ "serde",
+]
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
+[[package]]
+name = "recursion-bench"
+version = "0.1.0"
+dependencies = [
+ "embedded-alloc",
+ "lambda-vm-prover",
+ "postcard",
+ "riscv",
+ "serde",
+]
+
+[[package]]
+name = "riscv"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05cfa3f7b30c84536a9025150d44d26b8e1cc20ddf436448d74cd9591eefb25"
+dependencies = [
+ "critical-section",
+ "embedded-hal",
+ "paste",
+ "riscv-macros",
+ "riscv-pac",
+]
+
+[[package]]
+name = "riscv-macros"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d323d13972c1b104aa036bc692cd08b822c8bbf23d79a27c526095856499799"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "riscv-pac"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8188909339ccc0c68cfb5a04648313f09621e8b87dc03095454f1a11f6c5d436"
+
+[[package]]
+name = "rlsf"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1646a59a9734b8b7a0ac51689388a60fe1625d4b956348e9de07591a1478457a"
+dependencies = [
+ "cfg-if",
+ "const-default",
+ "libc",
+ "rustversion",
+ "svgbobdoc",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "sec1"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
+dependencies = [
+ "base16ct",
+ "der",
+ "generic-array",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "sha3"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77fd7028345d415a4034cf8777cd4f8ab1851274233b45f84e3d955502d93874"
+dependencies = [
+ "digest",
+ "keccak",
+]
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "stark"
+version = "0.1.0"
+dependencies = [
+ "crypto",
+ "hashbrown",
+ "itertools",
+ "libm",
+ "log",
+ "math",
+ "minicbor-serde",
+ "serde",
+ "sha3",
+]
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "svgbobdoc"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2c04b93fc15d79b39c63218f15e3fdffaa4c227830686e3b7c5f41244eb3e50"
+dependencies = [
+ "base64",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "unicode-width",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "typenum"
+version = "1.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.121"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49ace1d07c165b0864824eee619580c4689389afa9dc9ed3a4c75040d82e6790"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.121"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e68e6f4afd367a562002c05637acb8578ff2dea1943df76afb9e83d177c8578"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.121"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d95a9ec35c64b2a7cb35d3fead40c4238d0940c86d107136999567a4703259f2"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.121"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4e0100b01e9f0d03189a92b96772a1fb998639d981193d7dbab487302513441"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e"
diff --git a/bench_vs/lambda/recursion/Cargo.toml b/bench_vs/lambda/recursion/Cargo.toml
new file mode 100644
index 000000000..dab83abf3
--- /dev/null
+++ b/bench_vs/lambda/recursion/Cargo.toml
@@ -0,0 +1,13 @@
+[workspace]
+
+[package]
+name = "recursion-bench"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+lambda-vm-prover = { path = "../../../prover", default-features = false }
+embedded-alloc = "0.6"
+riscv = { version = "0.15", features = ["critical-section-single-hart"] }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/lambda/recursion/src/main.rs b/bench_vs/lambda/recursion/src/main.rs
new file mode 100644
index 000000000..a226ea225
--- /dev/null
+++ b/bench_vs/lambda/recursion/src/main.rs
@@ -0,0 +1,98 @@
+#![no_std]
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+use core::arch::asm;
+use core::panic::PanicInfo;
+
+use embedded_alloc::TlsfHeap as Heap;
+use lambda_vm_prover::{ProofOptions, VmProof, VmVerifyingKey};
+// Required to pull in the riscv crate's critical-section implementation.
+use riscv as _;
+
+const PRIVATE_INPUT_START: usize = 0xFF000000;
+const SYSCALL_COMMIT: u64 = 64;
+const SYSCALL_HALT: u64 = 93;
+const MAX_MEMORY_SIZE: usize = 0xC000_0000;
+
+#[global_allocator]
+static HEAP: Heap = Heap::empty();
+
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! { // panic hook for this `no_std` binary; the panic info is discarded
+    loop {} // spins forever — a failed `expect`/`assert!` in `main` ends up here without committing or halting
+}
+
+fn init_allocator() { // gives the global heap the address range from `_end` up to MAX_MEMORY_SIZE
+    unsafe extern "C" {
+        static _end: u8; // NOTE(review): presumably the linker-provided end-of-image symbol — confirm in the linker script
+    }
+    let heap_pos = (&raw const _end) as usize; // only the symbol's address is taken; its value is never read
+    unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) } // arguments: start address, size in bytes
+}
+
+/// Read the entire private-input region as a byte slice.
+///
+/// Layout (per `syscalls::get_private_input`): 4-byte LE length prefix at
+/// `PRIVATE_INPUT_START`, payload at +4.
+fn read_private_input() -> &'static [u8] { // the slice borrows the region in place; nothing is copied
+    let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize; // length prefix, widened to usize
+    let data = (PRIVATE_INPUT_START + 4) as *const u8; // first payload byte
+    unsafe { core::slice::from_raw_parts(data, len) } // NOTE(review): `len` is trusted as-is — assumes the host maps `len` readable bytes here
+}
+
+fn commit(bytes: &[u8]) { // issues the SYSCALL_COMMIT (64) ecall for `bytes`
+    unsafe {
+        asm!(
+            "ecall", // NOTE(review): all operands are input-only — confirm the VM does not write a result into a0
+            in("a0") 1u64, // NOTE(review): presumably an fd/stream id — confirm against the VM syscall ABI
+            in("a1") bytes.as_ptr(), // address of the first byte
+            in("a2") bytes.len(), // number of bytes
+            in("a7") SYSCALL_COMMIT, // syscall selector
+        );
+    }
+}
+
+fn halt() -> ! { // issues the SYSCALL_HALT (93) ecall and never returns
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 0u64, // NOTE(review): presumably the exit status — confirm against the VM syscall ABI
+            in("a7") SYSCALL_HALT, // syscall selector
+            options(noreturn), // tells the compiler control never comes back from this asm block
+        );
+    }
+}
+
+/// Private input layout (postcard-encoded):
+///   (VmProof, Vec<u8>, ProofOptions, VmVerifyingKey)
+/// where the `Vec<u8>` holds the inner program's ELF bytes, the
+/// `ProofOptions` specifies the parameters the inner prover used, and the
+/// `VmVerifyingKey` carries the host-derived bitwise preprocessed commitment
+/// so the guest can skip the ~87% of verifier cycles that would otherwise be
+/// spent recomputing it from scratch.
+#[unsafe(no_mangle)] // keeps the symbol name `main`, which this crate's linker args (`-e main`) select as the entry point
+pub fn main() -> ! {
+    init_allocator(); // must run before the first allocation (the postcard decode builds `Vec`s)
+
+    let blob = read_private_input();
+    let (vm_proof, inner_elf, options, vkey): (VmProof, Vec<u8>, ProofOptions, VmVerifyingKey) =
+        postcard::from_bytes(blob).expect("failed to deserialize recursion input");
+
+    let ok =
+        lambda_vm_prover::verify_with_options_with_vkey(
+            &vm_proof,
+            &inner_elf,
+            &options,
+            None, // NOTE(review): meaning of this optional argument is not visible here — see the prover API
+            None, // NOTE(review): same as above
+            Some(&vkey), // host-supplied verifying key from the input tuple
+        )
+        .expect("verify errored"); // an `Err` panics, which lands in the spinning panic handler
+    assert!(ok, "inner proof failed verification"); // `Ok(false)` is also treated as fatal
+
+    commit(&[1u8]); // success marker; only reached when verification returned `Ok(true)`
+    halt()
+}
diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml
new file mode 100644
index 000000000..fc24039c2
--- /dev/null
+++ b/bench_vs/sp1/verifier/Cargo.toml
@@ -0,0 +1,3 @@
+[workspace]
+members = ["program", "script"]
+resolver = "2"
diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml
new file mode 100644
index 000000000..7fbc9c5ce
--- /dev/null
+++ b/bench_vs/sp1/verifier/program/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "verifier-program"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-zkvm = "6.0.1"
+lambda-vm-prover = { path = "../../../../prover", default-features = false }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs
new file mode 100644
index 000000000..c63bb67ca
--- /dev/null
+++ b/bench_vs/sp1/verifier/program/src/main.rs
@@ -0,0 +1,34 @@
+//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof.
+//!
+//! Input layout (postcard-encoded `Vec<u8>` written via `SP1Stdin::write_vec`):
+//!   `(VmProof, Vec<u8>, ProofOptions)`
+//! where the inner `Vec<u8>` is the inner program's ELF bytes.
+//!
+//! Output: commits `[1u8]` on successful verify; the guest panics otherwise.
+//!
+//! Caveats:
+//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak
+//!   precompile but it patches `tiny-keccak`, not `keccak`. We don't patch
+//!   here, so Keccak runs as software inside the guest. Cycle counts will be
+//!   inflated by that overhead. Worth keeping in mind when interpreting the
+//!   number relative to lambda-vm's in-VM count.
+
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+
+use lambda_vm_prover::{ProofOptions, VmProof};
+
+sp1_zkvm::entrypoint!(main);
+
+pub fn main() { // SP1 guest entry (registered via `sp1_zkvm::entrypoint!`): decode, verify, commit
+    let blob = sp1_zkvm::io::read_vec(); // raw postcard bytes supplied by the host script
+    let (vm_proof, inner_elf, options): (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(&blob).expect("failed to deserialize input");
+    let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None) // NOTE(review): meaning of the two `None`s is not visible here
+        .expect("verify errored");
+    assert!(ok, "inner proof failed verification"); // `Ok(false)` panics the guest
+    sp1_zkvm::io::commit_slice(&[1u8]); // success marker; only reached when verification returned `Ok(true)`
+}
diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml
new file mode 100644
index 000000000..3198059bd
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "verifier-script"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] }
+lambda-vm-prover = { path = "../../../../prover" }
+stark = { path = "../../../../crypto/stark" }
+postcard = { version = "1.0", features = ["alloc"] }
+
+[build-dependencies]
+sp1-build = "6.0.1"
diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs
new file mode 100644
index 000000000..d6cf925d6
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/build.rs
@@ -0,0 +1,5 @@
+use sp1_build::build_program_with_args;
+
+fn main() { // build script: builds the guest crate in the sibling `program` directory
+    build_program_with_args("../program", Default::default()); // default build arguments; path is relative to this crate
+}
diff --git a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs
new file mode 100644
index 000000000..86e46a710
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/src/main.rs
@@ -0,0 +1,83 @@
+//! Host driver: prove an inner empty program on lambda-vm, then execute the
+//! lambda-vm verifier inside SP1's executor, printing the cycle count.
+//!
+//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed
+//! profile (1 sample = 1 cycle). The output can be opened with
+//! `samply load profiles/verifier.json`.
+
+use std::path::PathBuf;
+
+use sp1_sdk::blocking::{Prover, ProverClient};
+use sp1_sdk::{SP1Stdin, include_elf};
+
+const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program");
+
+fn workspace_root() -> PathBuf { // repository root, derived from this crate's compile-time manifest directory
+    // CARGO_MANIFEST_DIR for this crate is `<root>/bench_vs/sp1/verifier/script`.
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .ancestors() // yields the path itself first, then each parent in turn
+        .nth(4) // 0 = script, 1 = verifier, 2 = sp1, 3 = bench_vs, 4 = <root>
+        .expect("workspace root")
+        .to_path_buf()
+}
+
+fn main() { // prove the empty guest on lambda-vm, then execute the verifier guest in SP1 and print its instruction count
+    sp1_sdk::utils::setup_logger();
+
+    let root = workspace_root();
+    let empty_elf_path = root
+        .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
+    assert!(
+        empty_elf_path.exists(), // fail early with a build hint instead of an opaque read error
+        "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first",
+        empty_elf_path.display(),
+    );
+    let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench");
+
+    let options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ...");
+    let inner_proof = lambda_vm_prover::prove_with_options_and_inputs(
+        &inner_elf,
+        &[], // NOTE(review): presumably the inner program's (empty) input — confirm against the prover API
+        &options,
+        &lambda_vm_prover::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options)) // tuple order matches what the SP1 guest decodes
+        .expect("postcard encode failed");
+    println!("[sp1-verifier] postcard blob: {} bytes", blob.len());
+
+    let client = ProverClient::from_env();
+    let mut stdin = SP1Stdin::new();
+    stdin.write_vec(blob); // the guest reads this back with `sp1_zkvm::io::read_vec`
+
+    println!("[sp1-verifier] executing verifier in SP1 ...");
+    let (_, report) = client // first element of the result is discarded; only the report is used
+        .execute(VERIFIER_ELF.clone(), stdin)
+        .run()
+        .expect("execute failed");
+
+    let cycles = report.total_instruction_count();
+    println!();
+    println!("============================================================");
+    println!("  SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1");
+    println!("============================================================");
+    println!("  Total cycles : {cycles}");
+    println!();
+    println!("  Compare against lambda-vm in-VM count (~40.5B for the same");
+    println!("  proof). Both VMs target riscv64im, so word width is symmetric.");
+    println!("  Main remaining asymmetry: lambda-vm's KeccakPermute precompile");
+    println!("  is patched on its guests but SP1 does not patch `keccak` (only");
+    println!("  `tiny-keccak`), so Keccak rounds run as software in SP1 here.");
+    println!();
+    println!("  If TRACE_FILE was set, the profile was written there.");
+    println!("  Render with: samply load <trace>");
+    println!("============================================================");
+}
diff --git a/crypto/crypto/Cargo.toml b/crypto/crypto/Cargo.toml
index 6e3731beb..6dc2ab50a 100644
--- a/crypto/crypto/Cargo.toml
+++ b/crypto/crypto/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-math = { path = "../math", features = ["alloc"] }
+math = { path = "../math", default-features = false, features = ["alloc"] }
 digest = "0.10.7"
 sha3 = { version = "0.10.8", default-features = false }
 # Optional
diff --git a/crypto/ecsm/Cargo.toml b/crypto/ecsm/Cargo.toml
index 4d2800b2c..57f368f67 100644
--- a/crypto/ecsm/Cargo.toml
+++ b/crypto/ecsm/Cargo.toml
@@ -5,9 +5,16 @@ version = "0.1.0"
 edition = "2024"
 license.workspace = true
 
+[features]
+default = ["std"]
+# `std` links the standard library and turns on the std builds of the numeric
+# deps; the `core::error::Error` impl is unconditional. Guests (executor/prover
+# with no default features) leave it off: `no_std` + `alloc` for riscv64.
+std = ["num-bigint/std", "num-traits/std"]
+
 [dependencies]
-num-bigint = "0.4.6"
-num-traits = "0.2.19"
+num-bigint = { version = "0.4.6", default-features = false }
+num-traits = { version = "0.2.19", default-features = false }
 # Audited secp256k1 arithmetic (host-side witness generation only; never in the
 # constraint system). Used for executor scalar multiplication and for the projective
 # double-and-add replay + batch inversion that builds ECDAS step witnesses efficiently.
diff --git a/crypto/ecsm/src/curve.rs b/crypto/ecsm/src/curve.rs
index 2f2acb0e1..0a1e74f32 100644
--- a/crypto/ecsm/src/curve.rs
+++ b/crypto/ecsm/src/curve.rs
@@ -6,6 +6,9 @@
 //! `k in [1, N)` (see `ecsm.typ` "Point at infinity" / ECDAS soundness argument), so the
 //! affine formulas below are always well defined.
 
+use alloc::vec;
+use alloc::vec::Vec;
+
 use num_bigint::BigUint;
 
 /// An affine curve point. Never the point at infinity.
diff --git a/crypto/ecsm/src/lib.rs b/crypto/ecsm/src/lib.rs
index 3a0a44dff..5feed9883 100644
--- a/crypto/ecsm/src/lib.rs
+++ b/crypto/ecsm/src/lib.rs
@@ -15,6 +15,10 @@
 //!
 //! Curve: secp256k1, `y^2 = x^3 + 7 mod p`, `p = 2^256 - 2^32 - 977`, order `N`.
 
+#![cfg_attr(not(feature = "std"), no_std)]
+
+extern crate alloc;
+
 pub mod curve;
 pub mod witness;
 
@@ -84,7 +88,7 @@ impl core::fmt::Display for EcsmError {
     }
 }
 
-impl std::error::Error for EcsmError {}
+impl core::error::Error for EcsmError {}
 
 /// Converts a `BigUint` to 32 little-endian bytes (zero-padded / truncated to 32).
 pub fn to_le_32(v: &BigUint) -> [u8; 32] {
diff --git a/crypto/ecsm/src/witness.rs b/crypto/ecsm/src/witness.rs
index 9322cba7e..2e1e99f7d 100644
--- a/crypto/ecsm/src/witness.rs
+++ b/crypto/ecsm/src/witness.rs
@@ -16,6 +16,8 @@
 //! negative; the chip range-checks `c_i + offset` as a halfword. We reproduce the exact
 //! integer recurrence here; the prover converts the resulting integers to field elements.
 
+use alloc::vec::Vec;
+
 use num_bigint::{BigInt, BigUint};
 use num_traits::{Signed, Zero};
 
diff --git a/crypto/math/src/fft/bowers_fft.rs b/crypto/math/src/fft/bowers_fft.rs
index 60a15410e..6ed9ec46d 100644
--- a/crypto/math/src/fft/bowers_fft.rs
+++ b/crypto/math/src/fft/bowers_fft.rs
@@ -296,6 +296,7 @@ fn process_fused_block<F, E>(
 /// 2-layer fusion: 8 reads + 8 writes instead of 8+8+8+8 for separate layers.
 #[cfg(feature = "alloc")]
 #[inline]
+#[allow(dead_code)]
 fn process_triple_fused_block<F, E>(
     block: &mut [FieldElement<E>],
     twiddles_l0: &[FieldElement<F>],
@@ -604,6 +605,7 @@ fn process_ifft_fused_block<F, E>(
 /// Process a single block with 3-layer IFFT fusion (DIT radix-8 butterfly).
 #[cfg(feature = "alloc")]
 #[inline]
+#[allow(dead_code)]
 fn process_ifft_triple_fused_block<F, E>(
     block: &mut [FieldElement<E>],
     twiddles_hi: &[FieldElement<F>], // innermost layer (highest index)
diff --git a/crypto/stark/Cargo.toml b/crypto/stark/Cargo.toml
index d0f6a51ef..92015f45f 100644
--- a/crypto/stark/Cargo.toml
+++ b/crypto/stark/Cargo.toml
@@ -9,16 +9,18 @@ crate-type = ["cdylib", "rlib"]
 
 
 [dependencies]
-math = { path = "../math", features = [
-    "std",
+math = { path = "../math", default-features = false, features = [
+    "alloc",
     "lambdaworks-serde-binary",
 ] }
-crypto = { path = "../crypto", features = ["std", "serde"] }
-thiserror = "1.0.38"
-log = "0.4.17"
-sha3 = "0.10.8"
-serde = { version = "1.0", features = ["derive"] }
-itertools = "0.11.0"
+crypto = { path = "../crypto", default-features = false, features = ["serde"] }
+log = { version = "0.4.17", default-features = false }
+sha3 = { version = "0.10.8", default-features = false }
+serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] }
+minicbor-serde = { version = "0.7.0", default-features = false, features = ["alloc"] }
+itertools = { version = "0.11.0", default-features = false, features = ["use_alloc"] }
+hashbrown = { version = "0.14", default-features = false, features = ["inline-more", "ahash"] }
+libm = "0.2"
 
 # Parallelization crates
 rayon = { version = "1.8.0", optional = true }
@@ -34,7 +36,6 @@ math-cuda = { path = "../math-cuda", optional = true }
 wasm-bindgen = { version = "0.2", optional = true }
 serde-wasm-bindgen = { version = "0.5", optional = true }
 web-sys = { version = "0.3.64", features = ['console'], optional = true }
-serde_cbor = { version = "0.11.1" }
 
 [dev-dependencies]
 criterion = { version = "0.4", default-features = false }
@@ -45,14 +46,23 @@ rand = { version = "0.8.5", features = ["std"] }
 rand_chacha = "0.3.1"
 
 [features]
-test-utils = []
+default = ["std", "parallel"]
+std = [
+    "math/std",
+    "crypto/std",
+    "log/std",
+    "sha3/std",
+    "serde/std",
+    "itertools/use_std",
+]
+test-utils = ["std"]
 test_fiat_shamir = []
-instruments = []                                                     # This enables timing prints in prover and verifier
-debug-checks = []                                                    # Enables validate_trace + bus balance report in prover
-parallel = ["dep:rayon", "crypto/parallel"]
+instruments = ["std"]                                                # This enables timing prints in prover and verifier
+debug-checks = ["std"]                                               # Enables validate_trace + bus balance report in prover
+parallel = ["dep:rayon", "crypto/parallel", "math/parallel", "std"]
 cuda = ["dep:math-cuda"]
 test-cuda-faults = ["cuda", "math-cuda/test-faults"]
-wasm = ["dep:wasm-bindgen", "dep:serde-wasm-bindgen", "dep:web-sys"]
+wasm = ["dep:wasm-bindgen", "dep:serde-wasm-bindgen", "dep:web-sys", "std"]
 disk-spill = ["dep:memmap2", "dep:tempfile", "dep:libc", "crypto/disk-spill"]
 
 
diff --git a/crypto/stark/src/constraints/boundary.rs b/crypto/stark/src/constraints/boundary.rs
index b34b6afec..15c546784 100644
--- a/crypto/stark/src/constraints/boundary.rs
+++ b/crypto/stark/src/constraints/boundary.rs
@@ -1,3 +1,5 @@
+use alloc::vec::Vec;
+
 use math::field::{element::FieldElement, traits::IsField};
 
 /// Represents a boundary constraint that must hold in an execution trace:
diff --git a/crypto/stark/src/constraints/evaluator.rs b/crypto/stark/src/constraints/evaluator.rs
index 6e94473b7..e3e608108 100644
--- a/crypto/stark/src/constraints/evaluator.rs
+++ b/crypto/stark/src/constraints/evaluator.rs
@@ -4,6 +4,8 @@ use crate::lookup::{BusPublicInputs, LOGUP_CHALLENGE_ALPHA, PackingShifts, compu
 use crate::trace::LDETraceTable;
 use crate::traits::{AIR, TransitionEvaluationContext, ZerofierEvaluations};
 use crate::{frame::Frame, prover::evaluate_polynomial_on_lde_domain};
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::traits::{IsFFTField, IsField, IsSubFieldOf};
 use math::{fft::errors::FFTError, field::element::FieldElement};
 #[cfg(feature = "parallel")]
@@ -12,7 +14,7 @@ use rayon::{
     prelude::{IntoParallelIterator, ParallelIterator},
 };
 
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 pub struct ConstraintEvaluator<
     Field: IsSubFieldOf<FieldExtension> + IsFFTField + Send + Sync,
diff --git a/crypto/stark/src/constraints/transition.rs b/crypto/stark/src/constraints/transition.rs
index 1fe249c4c..6486c4652 100644
--- a/crypto/stark/src/constraints/transition.rs
+++ b/crypto/stark/src/constraints/transition.rs
@@ -1,3 +1,5 @@
+use alloc::boxed::Box;
+use alloc::vec::Vec;
 use core::ops::Div;
 
 use crate::domain::Domain;
diff --git a/crypto/stark/src/context.rs b/crypto/stark/src/context.rs
index b83b1427b..10d94f30a 100644
--- a/crypto/stark/src/context.rs
+++ b/crypto/stark/src/context.rs
@@ -1,4 +1,5 @@
 use super::proof::options::ProofOptions;
+use alloc::vec::Vec;
 
 #[derive(Clone, Debug)]
 pub struct AirContext {
diff --git a/crypto/stark/src/debug.rs b/crypto/stark/src/debug.rs
index bf1a454a7..7c68fdf63 100644
--- a/crypto/stark/src/debug.rs
+++ b/crypto/stark/src/debug.rs
@@ -4,6 +4,7 @@ use super::trace::TraceTable;
 use super::traits::{AIR, TransitionEvaluationContext};
 use crate::lookup::{LOGUP_CHALLENGE_ALPHA, PackingShifts, compute_alpha_powers};
 use crate::{frame::Frame, trace::LDETraceTable};
+use alloc::vec::Vec;
 use log::{error, info};
 use math::field::traits::IsSubFieldOf;
 use math::{
@@ -91,7 +92,7 @@ pub fn validate_trace<
     // --------- VALIDATE TRANSITION CONSTRAINTS -----------
     let n_transition_constraints = air.context().num_transition_constraints;
     let exemption_steps: Vec<usize> =
-        std::iter::repeat_n(lde_trace.num_steps(), n_transition_constraints)
+        core::iter::repeat_n(lde_trace.num_steps(), n_transition_constraints)
             .zip(air.transition_constraints())
             .map(|(trace_steps, constraint)| trace_steps - constraint.end_exemptions())
             .collect();
diff --git a/crypto/stark/src/domain.rs b/crypto/stark/src/domain.rs
index e858c502c..66d562080 100644
--- a/crypto/stark/src/domain.rs
+++ b/crypto/stark/src/domain.rs
@@ -1,3 +1,4 @@
+use alloc::vec::Vec;
 use math::{
     fft::roots_of_unity::get_powers_of_primitive_root_coset,
     field::{
diff --git a/crypto/stark/src/examples/dummy_air.rs b/crypto/stark/src/examples/dummy_air.rs
index 1409f96ba..f5ff09c90 100644
--- a/crypto/stark/src/examples/dummy_air.rs
+++ b/crypto/stark/src/examples/dummy_air.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/fibonacci_2_cols_shifted.rs b/crypto/stark/src/examples/fibonacci_2_cols_shifted.rs
index 76c8ea11f..afd437e32 100644
--- a/crypto/stark/src/examples/fibonacci_2_cols_shifted.rs
+++ b/crypto/stark/src/examples/fibonacci_2_cols_shifted.rs
@@ -8,11 +8,11 @@ use crate::{
     trace::TraceTable,
     traits::{AIR, TransitionEvaluationContext},
 };
+use core::marker::PhantomData;
 use math::{
     field::{element::FieldElement, traits::IsFFTField},
     traits::AsBytes,
 };
-use std::marker::PhantomData;
 
 #[derive(Clone)]
 struct ShiftedFibTransition1<F: IsFFTField> {
diff --git a/crypto/stark/src/examples/fibonacci_2_columns.rs b/crypto/stark/src/examples/fibonacci_2_columns.rs
index 7662c8f98..725ed541c 100644
--- a/crypto/stark/src/examples/fibonacci_2_columns.rs
+++ b/crypto/stark/src/examples/fibonacci_2_columns.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use super::simple_fibonacci::FibonacciPublicInputs;
 use crate::{
diff --git a/crypto/stark/src/examples/fibonacci_multi_column.rs b/crypto/stark/src/examples/fibonacci_multi_column.rs
index ac6069ece..9e8e8917f 100644
--- a/crypto/stark/src/examples/fibonacci_multi_column.rs
+++ b/crypto/stark/src/examples/fibonacci_multi_column.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/fibonacci_rap.rs b/crypto/stark/src/examples/fibonacci_rap.rs
index 10f1827d2..f6c6b4ce3 100644
--- a/crypto/stark/src/examples/fibonacci_rap.rs
+++ b/crypto/stark/src/examples/fibonacci_rap.rs
@@ -1,4 +1,4 @@
-use std::{marker::PhantomData, ops::Div};
+use core::{marker::PhantomData, ops::Div};
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/quadratic_air.rs b/crypto/stark/src/examples/quadratic_air.rs
index d49b0050d..59bcb753c 100644
--- a/crypto/stark/src/examples/quadratic_air.rs
+++ b/crypto/stark/src/examples/quadratic_air.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/read_only_memory.rs b/crypto/stark/src/examples/read_only_memory.rs
index 8c3e9efac..bffa1702f 100644
--- a/crypto/stark/src/examples/read_only_memory.rs
+++ b/crypto/stark/src/examples/read_only_memory.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/read_only_memory_logup.rs b/crypto/stark/src/examples/read_only_memory_logup.rs
index e4f25c16c..b32a29708 100644
--- a/crypto/stark/src/examples/read_only_memory_logup.rs
+++ b/crypto/stark/src/examples/read_only_memory_logup.rs
@@ -2,7 +2,7 @@
 //! See our blog post for detailed explanation.
 //! <https://blog.lambdaclass.com/logup-lookup-argument-and-its-implementation-using-lambdaworks-for-continuous-read-only-memory/>
 
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/simple_addition.rs b/crypto/stark/src/examples/simple_addition.rs
index 78f938838..9a48741cd 100644
--- a/crypto/stark/src/examples/simple_addition.rs
+++ b/crypto/stark/src/examples/simple_addition.rs
@@ -1,7 +1,7 @@
 //! A minimal AIR with a simple addition constraint: col0 + col1 = col2
 //! This is used to test STARK proving/verification with small traces (1-2 rows).
 
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/examples/simple_fibonacci.rs b/crypto/stark/src/examples/simple_fibonacci.rs
index a39064258..51c537c8e 100644
--- a/crypto/stark/src/examples/simple_fibonacci.rs
+++ b/crypto/stark/src/examples/simple_fibonacci.rs
@@ -8,8 +8,8 @@ use crate::{
     trace::TraceTable,
     traits::{AIR, TransitionEvaluationContext},
 };
+use core::marker::PhantomData;
 use math::field::{element::FieldElement, traits::IsFFTField};
-use std::marker::PhantomData;
 
 #[derive(Clone)]
 struct FibConstraint<F: IsFFTField> {
diff --git a/crypto/stark/src/examples/simple_periodic_cols.rs b/crypto/stark/src/examples/simple_periodic_cols.rs
index 70f5da3b4..02660157e 100644
--- a/crypto/stark/src/examples/simple_periodic_cols.rs
+++ b/crypto/stark/src/examples/simple_periodic_cols.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use core::marker::PhantomData;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/frame.rs b/crypto/stark/src/frame.rs
index 952a3a110..91f2d94cb 100644
--- a/crypto/stark/src/frame.rs
+++ b/crypto/stark/src/frame.rs
@@ -1,4 +1,6 @@
 use crate::{table::TableView, trace::LDETraceTable};
+use alloc::vec;
+use alloc::vec::Vec;
 use itertools::Itertools;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
diff --git a/crypto/stark/src/fri/fri_commitment.rs b/crypto/stark/src/fri/fri_commitment.rs
index 831471761..4fafede22 100644
--- a/crypto/stark/src/fri/fri_commitment.rs
+++ b/crypto/stark/src/fri/fri_commitment.rs
@@ -1,3 +1,4 @@
+use alloc::vec::Vec;
 use crypto::merkle_tree::{merkle::MerkleTree, traits::IsMerkleTreeBackend};
 use math::{
     field::{element::FieldElement, traits::IsField},
diff --git a/crypto/stark/src/fri/fri_decommit.rs b/crypto/stark/src/fri/fri_decommit.rs
index f398096d5..4a1fb272c 100644
--- a/crypto/stark/src/fri/fri_decommit.rs
+++ b/crypto/stark/src/fri/fri_decommit.rs
@@ -1,3 +1,4 @@
+use alloc::vec::Vec;
 use crypto::merkle_tree::proof::Proof;
 use math::field::element::FieldElement;
 use math::field::traits::IsField;
diff --git a/crypto/stark/src/fri/fri_functions.rs b/crypto/stark/src/fri/fri_functions.rs
index 6037da4ec..bd8f79d77 100644
--- a/crypto/stark/src/fri/fri_functions.rs
+++ b/crypto/stark/src/fri/fri_functions.rs
@@ -1,3 +1,4 @@
+use alloc::vec::Vec;
 use math::fft::{
     bit_reversing::in_place_bit_reverse_permute, roots_of_unity::get_powers_of_primitive_root_coset,
 };
diff --git a/crypto/stark/src/fri/mod.rs b/crypto/stark/src/fri/mod.rs
index 60ad2a398..cc72c4a68 100644
--- a/crypto/stark/src/fri/mod.rs
+++ b/crypto/stark/src/fri/mod.rs
@@ -1,3 +1,5 @@
+use alloc::vec;
+use alloc::vec::Vec;
 pub mod fri_commitment;
 pub mod fri_decommit;
 pub(crate) mod fri_functions;
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index e9f6a1cda..e5a756972 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -1,3 +1,7 @@
+#![cfg_attr(not(feature = "std"), no_std)]
+
+extern crate alloc;
+
 // `StorageMode::Disk` uses `memmap2`, which does not build on wasm32.
 // Fail at the crate root rather than as a transitive memmap2 error.
 #[cfg(all(target_arch = "wasm32", feature = "disk-spill"))]
diff --git a/crypto/stark/src/lookup.rs b/crypto/stark/src/lookup.rs
index 745736d4d..4de42d044 100644
--- a/crypto/stark/src/lookup.rs
+++ b/crypto/stark/src/lookup.rs
@@ -1,6 +1,10 @@
+use alloc::boxed::Box;
+use alloc::string::{String, ToString};
+use alloc::vec;
+use alloc::vec::Vec;
+use core::marker::PhantomData;
 #[cfg(feature = "debug-checks")]
-use std::collections::HashMap;
-use std::marker::PhantomData;
+use hashbrown::HashMap;
 
 use crate::{
     constraints::{
diff --git a/crypto/stark/src/par.rs b/crypto/stark/src/par.rs
index a20a452b6..b65e29720 100644
--- a/crypto/stark/src/par.rs
+++ b/crypto/stark/src/par.rs
@@ -1,6 +1,8 @@
 //! Helpers that abstract over `cfg(feature = "parallel")` for patterns
 //! that recur across the prover.
 
+use alloc::vec::Vec;
+
 /// Run `f(i)` for `i in 0..n` and return the unzipped pair of result vecs.
 /// Parallel when `feature = "parallel"`, sequential otherwise.
 pub(crate) fn map_unzip<A, B, F>(n: usize, f: F) -> (Vec<A>, Vec<B>)
diff --git a/crypto/stark/src/proof/options.rs b/crypto/stark/src/proof/options.rs
index 70976b993..8fe3f1e6d 100644
--- a/crypto/stark/src/proof/options.rs
+++ b/crypto/stark/src/proof/options.rs
@@ -101,11 +101,24 @@ impl GoldilocksCubicProofOptions {
             });
         }
 
+        #[cfg(feature = "std")]
+        let (sqrt, log2, ceil) = (
+            f64::sqrt as fn(f64) -> f64,
+            f64::log2 as fn(f64) -> f64,
+            f64::ceil as fn(f64) -> f64,
+        );
+        #[cfg(not(feature = "std"))]
+        let (sqrt, log2, ceil) = (
+            libm::sqrt as fn(f64) -> f64,
+            libm::log2 as fn(f64) -> f64,
+            libm::ceil as fn(f64) -> f64,
+        );
+
         let rate = 1.0 / blowup_factor as f64;
-        let proximity = 1.0 - rate.sqrt() - 1.0 / 300.0;
-        let bits_per_query = -(1.0 - proximity).log2();
+        let proximity = 1.0 - sqrt(rate) - 1.0 / 300.0;
+        let bits_per_query = -log2(1.0 - proximity);
         let fri_number_of_queries =
-            ((security_bits as f64 - grinding_factor as f64) / bits_per_query).ceil() as usize;
+            ceil((security_bits as f64 - grinding_factor as f64) / bits_per_query) as usize;
 
         Ok(ProofOptions {
             blowup_factor,
diff --git a/crypto/stark/src/proof/stark.rs b/crypto/stark/src/proof/stark.rs
index 1751d60fe..302649b29 100644
--- a/crypto/stark/src/proof/stark.rs
+++ b/crypto/stark/src/proof/stark.rs
@@ -1,3 +1,4 @@
+use alloc::vec::Vec;
 use crypto::merkle_tree::proof::Proof;
 use math::field::{
     element::FieldElement,
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 4da57559c..390ed09da 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -1,5 +1,8 @@
-use std::marker::PhantomData;
-use std::sync::Arc;
+use alloc::string::String;
+use alloc::sync::Arc;
+use alloc::vec;
+use alloc::vec::Vec;
+use core::marker::PhantomData;
 #[cfg(feature = "instruments")]
 use std::time::{Duration, Instant};
 
@@ -1687,8 +1690,8 @@ pub trait IsStarkProver<
         // Many tables share the same domain size (e.g., 7+ tables at 2^20).
         // Without dedup, each creates its own Domain (~24 MB) and LdeTwiddles (~32 MB).
         type DomainEntry<F> = (Arc<Domain<F>>, Arc<LdeTwiddles<F>>);
-        let mut domain_cache: std::collections::HashMap<(usize, usize, u64), DomainEntry<Field>> =
-            std::collections::HashMap::new();
+        let mut domain_cache: hashbrown::HashMap<(usize, usize, u64), DomainEntry<Field>> =
+            hashbrown::HashMap::new();
 
         let mut domains = Vec::with_capacity(num_airs);
         let mut twiddle_caches: Vec<Arc<LdeTwiddles<Field>>> = Vec::with_capacity(num_airs);
diff --git a/crypto/stark/src/r4_denoms.rs b/crypto/stark/src/r4_denoms.rs
index 77076ecfe..a79912b74 100644
--- a/crypto/stark/src/r4_denoms.rs
+++ b/crypto/stark/src/r4_denoms.rs
@@ -12,6 +12,8 @@
 //!   - `z_scalars = [z_power, z_shifted[0..]]`, length `1 + z_shifted.len()`
 //!   - `denoms[k * lde_size + i] = x_i - z_scalars[k]` (then inverted)
 
+use alloc::vec::Vec;
+
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 
diff --git a/crypto/stark/src/table.rs b/crypto/stark/src/table.rs
index 58938d5e4..24189cc10 100644
--- a/crypto/stark/src/table.rs
+++ b/crypto/stark/src/table.rs
@@ -1,3 +1,5 @@
+use alloc::vec::Vec;
+
 use crate::frame::Frame;
 #[cfg(feature = "disk-spill")]
 use crypto::mmap_util::spill_slice_to_mmap;
diff --git a/crypto/stark/src/tests/bus_tests/completeness_tests.rs b/crypto/stark/src/tests/bus_tests/completeness_tests.rs
index 83f8ac391..d51f8977e 100644
--- a/crypto/stark/src/tests/bus_tests/completeness_tests.rs
+++ b/crypto/stark/src/tests/bus_tests/completeness_tests.rs
@@ -7,6 +7,7 @@ use math::field::element::FieldElement;
 use math::field::{
     extensions_goldilocks::Degree3GoldilocksExtensionField, goldilocks::GoldilocksField,
 };
+use minicbor_serde;
 
 use crate::examples::multi_table_lookup::{
     new_add_air_with_lookup, new_cpu_air_with_lookup, new_mul_air_with_lookup,
@@ -377,9 +378,9 @@ fn test_serialization_roundtrip() {
         multi_prove_ram(air_trace_pairs, &mut DefaultTranscript::<E>::new(&[])).unwrap();
 
     // Serialize and deserialize
-    let serialized = serde_cbor::to_vec(&multi_proof).expect("serialization failed");
+    let serialized = minicbor_serde::to_vec(&multi_proof).expect("serialization failed");
     let deserialized: crate::proof::stark::MultiProof<F, E, ()> =
-        serde_cbor::from_slice(&serialized).expect("deserialization failed");
+        minicbor_serde::from_slice(&serialized).expect("deserialization failed");
 
     let airs: Vec<&dyn AIR<Field = F, FieldExtension = E, PublicInputs = ()>> =
         vec![&cpu_air, &add_air, &mul_air];
diff --git a/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs b/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs
index 4059ed481..717ff0be6 100644
--- a/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs
+++ b/crypto/stark/src/tests/prove_verify_roundtrip_tests.rs
@@ -8,6 +8,7 @@ use math::field::element::FieldElement;
 use math::field::{
     extensions_goldilocks::Degree3GoldilocksExtensionField, goldilocks::GoldilocksField,
 };
+use minicbor_serde;
 
 use crate::constraints::transition::TransitionConstraintEvaluator;
 use crate::lookup::{
@@ -142,13 +143,13 @@ fn test_verify_serialized_multi_table_proofs() {
     // NETWORK TRANSMISSION - Serialize and deserialize (using CBOR binary format)
     // =========================================================================
 
-    let serialized = serde_cbor::to_vec(&proofs).expect("Failed to serialize proofs");
+    let serialized = minicbor_serde::to_vec(&proofs).expect("Failed to serialize proofs");
 
     // At this point, the prover's data is dropped (out of scope above)
     // The verifier only has the serialized data
 
     let received_proofs: MultiProof<F, E, ()> =
-        serde_cbor::from_slice(&serialized).expect("Failed to deserialize proofs");
+        minicbor_serde::from_slice(&serialized).expect("Failed to deserialize proofs");
 
     // =========================================================================
     // VERIFIER SIDE - Reconstruct AIRs and verify
diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index 405ce89f8..d6fdfe116 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -1,5 +1,7 @@
 use crate::domain::{Domain, DomainConstants};
 use crate::table::Table;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::traits::{IsField, IsSubFieldOf};
 use math::field::{element::FieldElement, traits::IsFFTField};
 use math::polynomial::barycentric_inv_denoms;
diff --git a/crypto/stark/src/traits.rs b/crypto/stark/src/traits.rs
index 06465b659..862dad155 100644
--- a/crypto/stark/src/traits.rs
+++ b/crypto/stark/src/traits.rs
@@ -1,4 +1,7 @@
-use std::collections::HashMap;
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
+use hashbrown::HashMap;
 
 use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
 use math::{
diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 68819c76b..85e3209c1 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -12,6 +12,9 @@ use crate::{
     lookup::{LOGUP_CHALLENGE_ALPHA, LOGUP_NUM_CHALLENGES, PackingShifts, compute_alpha_powers},
     proof::stark::{DeepPolynomialOpening, MultiProof, PolynomialOpenings},
 };
+use alloc::vec;
+use alloc::vec::Vec;
+use core::marker::PhantomData;
 use crypto::{fiat_shamir::is_transcript::IsStarkTranscript, merkle_tree::proof::Proof};
 #[cfg(not(feature = "test_fiat_shamir"))]
 use log::error;
@@ -25,8 +28,7 @@ use math::{
     },
     traits::AsBytes,
 };
-use std::collections::HashMap;
-use std::marker::PhantomData;
+use hashbrown::HashMap;
 #[cfg(feature = "instruments")]
 use std::time::Instant;
 
@@ -314,7 +316,7 @@ pub trait IsStarkVerifier<
         E: IsField,
         Field: IsSubFieldOf<E>,
     {
-        proof.verify::<BatchedMerkleTreeBackend<E>>(root, index, &value.to_owned())
+        proof.verify::<BatchedMerkleTreeBackend<E>>(root, index, &value.to_vec())
     }
 
     /// Verify both (proof, evaluations) and (proof_sym, evaluations_sym) openings
diff --git a/executor/Cargo.toml b/executor/Cargo.toml
index 5d1e4ae49..343735693 100644
--- a/executor/Cargo.toml
+++ b/executor/Cargo.toml
@@ -4,10 +4,19 @@ version = "0.1.0"
 edition = "2024"
 license.workspace = true
 
+[features]
+default = ["std"]
+std = ["thiserror/std", "dep:rustc-demangle", "ecsm/std"]
+
+[[bin]]
+name = "executor"
+required-features = ["std"]
+
 [dependencies]
-thiserror = "1.0.68"
-rustc-demangle = "0.1"
-ecsm = { path = "../crypto/ecsm" }
+thiserror = { version = "2.0", default-features = false }
+rustc-demangle = { version = "0.1", optional = true }
+hashbrown = { version = "0.14", default-features = false, features = ["inline-more", "ahash"] }
+ecsm = { path = "../crypto/ecsm", default-features = false }
 
 [dev-dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/executor/src/constants.rs b/executor/src/constants.rs
new file mode 100644
index 000000000..f84e05a2b
--- /dev/null
+++ b/executor/src/constants.rs
@@ -0,0 +1,58 @@
+//! VM memory layout constants shared between prover and verifier code paths.
+//!
+//! These live outside `vm/` because the verifier needs them even when the full
+//! VM executor is not compiled in (e.g. inside a RISC-V guest verifying a proof).
+
+/// Initial value of the stack pointer register (SP, x2).
+/// Highest 16-byte-aligned `u64` address (`u64::MAX & !0xF`), per the RV64 ABI.
+pub const STACK_TOP: u64 = 0xFFFFFFFFFFFFFFF0;
+
+/// Maximum byte length of the private-input region: 64 MiB (67,108,864 bytes).
+///
+/// Bumped from 6.7 MB to 64 MiB to accommodate serialized STARK proofs as
+/// private input for the naive recursion experiment.
+pub const MAX_PRIVATE_INPUT_SIZE: u64 = 64 * 1024 * 1024;
+
+/// Memory address where the private-input region starts.
+/// Layout: 4-byte LE length prefix at this address, then payload at +4.
+pub const PRIVATE_INPUT_START_INDEX: u64 = 0xFF000000;
+
+/// Syscall number for the Keccak-f[1600] precompile (`0xFFFF_FFFF_FFFF_FFFE`).
+pub const KECCAK_SYSCALL_NUMBER: u64 = u64::MAX - 1;
+
+/// Round constants for Keccak-f[1600], indexed by round number (24 rounds).
+pub const KECCAK_RC: [u64; 24] = [
+    0x0000000000000001,
+    0x0000000000008082,
+    0x800000000000808A,
+    0x8000000080008000,
+    0x000000000000808B,
+    0x0000000080000001,
+    0x8000000080008081,
+    0x8000000000008009,
+    0x000000000000008A,
+    0x0000000000000088,
+    0x0000000080008009,
+    0x000000008000000A,
+    0x000000008000808B,
+    0x800000000000008B,
+    0x8000000000008089,
+    0x8000000000008003,
+    0x8000000000008002,
+    0x8000000000000080,
+    0x000000000000800A,
+    0x800000008000000A,
+    0x8000000080008081,
+    0x8000000000008080,
+    0x0000000080000001,
+    0x8000000080008008,
+];
+
+/// Rotation offsets for the rho step of Keccak-f[1600], indexed as `[x][y]`.
+pub const KECCAK_RHO: [[u32; 5]; 5] = [
+    [0, 36, 3, 41, 18],
+    [1, 44, 10, 45, 2],
+    [62, 6, 43, 15, 61],
+    [28, 55, 25, 21, 56],
+    [27, 20, 39, 8, 14],
+];
diff --git a/executor/src/elf.rs b/executor/src/elf.rs
index ed79fb983..120436efd 100644
--- a/executor/src/elf.rs
+++ b/executor/src/elf.rs
@@ -1,3 +1,5 @@
+use alloc::string::{String, ToString};
+use alloc::vec::Vec;
 const EI_NIDENT: usize = 16;
 // Section header types
 const SHT_SYMTAB: u32 = 2;
@@ -557,4 +559,9 @@ impl SymbolTable {
     pub fn len(&self) -> usize {
         self.functions.len()
     }
+
+    /// Returns all function symbols as a borrowed slice (sorted by address).
+    pub fn functions(&self) -> &[FunctionSymbol] {
+        &self.functions
+    }
 }
diff --git a/executor/src/lib.rs b/executor/src/lib.rs
index d626ca1f4..cb6c99eed 100644
--- a/executor/src/lib.rs
+++ b/executor/src/lib.rs
@@ -1,4 +1,10 @@
+#![cfg_attr(not(feature = "std"), no_std)]
+
+extern crate alloc;
+
+pub mod constants;
 pub mod elf;
+#[cfg(feature = "std")]
 pub mod flamegraph;
 #[cfg(test)]
 pub mod tests;
diff --git a/executor/src/vm/execution.rs b/executor/src/vm/execution.rs
index 614aad649..81762cf19 100644
--- a/executor/src/vm/execution.rs
+++ b/executor/src/vm/execution.rs
@@ -1,4 +1,6 @@
-use std::{cmp::Ordering, fmt::Debug};
+use alloc::vec;
+use alloc::vec::Vec;
+use core::{cmp::Ordering, fmt::Debug};
 
 use crate::{
     elf::Elf,
@@ -103,6 +105,13 @@ impl Executor {
         self.get_return_values()
     }
 
+    /// Returns a shared (read-only) borrow of the executor's memory. Exposed for
+    /// diagnostic tooling that needs to inspect the final memory state (e.g.
+    /// counting distinct 4 KB pages touched) after a streaming `resume()` loop.
+    pub fn memory(&self) -> &Memory {
+        &self.memory
+    }
+
     /// Run to completion and return all logs (consumes executor)
     pub fn run(mut self) -> Result<ExecutionResult, ExecutorError> {
         let mut logs = Vec::with_capacity(CHUNK_SIZE);
diff --git a/executor/src/vm/instruction/execution.rs b/executor/src/vm/instruction/execution.rs
index 148d7f86c..4ad257d70 100644
--- a/executor/src/vm/instruction/execution.rs
+++ b/executor/src/vm/instruction/execution.rs
@@ -1,3 +1,6 @@
+use alloc::borrow::ToOwned;
+use alloc::string::String;
+
 use crate::vm::{
     instruction::decoding::{ArithOp, Comparison, Instruction, LoadStoreWidth},
     logs::Log,
@@ -346,7 +349,11 @@ impl Instruction {
                         let bytes = memory.load_bytes(pointer, len)?;
                         let value =
                             str::from_utf8(&bytes).map_err(|_| ExecutionError::IncorrectMessage)?;
-                        println!("PRINT VM: {}", value);
+                        // No stdout when the executor itself runs inside a guest VM.
+                        #[cfg(feature = "std")]
+                        std::println!("PRINT VM: {}", value);
+                        #[cfg(not(feature = "std"))]
+                        let _ = value;
                     }
                     SyscallNumbers::Panic => {
                         // panic
@@ -617,42 +624,7 @@ pub enum ExecutionError {
 // Keccak-f[1600] permutation
 // =============================================================================
 
-/// Round constants for Keccak-f[1600] (24 rounds).
-pub const KECCAK_RC: [u64; 24] = [
-    0x0000000000000001,
-    0x0000000000008082,
-    0x800000000000808A,
-    0x8000000080008000,
-    0x000000000000808B,
-    0x0000000080000001,
-    0x8000000080008081,
-    0x8000000000008009,
-    0x000000000000008A,
-    0x0000000000000088,
-    0x0000000080008009,
-    0x000000008000000A,
-    0x000000008000808B,
-    0x800000000000008B,
-    0x8000000000008089,
-    0x8000000000008003,
-    0x8000000000008002,
-    0x8000000000000080,
-    0x000000000000800A,
-    0x800000008000000A,
-    0x8000000080008081,
-    0x8000000000008080,
-    0x0000000080000001,
-    0x8000000080008008,
-];
-
-/// Rotation offsets R[x][y] for the rho step of Keccak-f[1600].
-pub const KECCAK_RHO: [[u32; 5]; 5] = [
-    [0, 36, 3, 41, 18],
-    [1, 44, 10, 45, 2],
-    [62, 6, 43, 15, 61],
-    [28, 55, 25, 21, 56],
-    [27, 20, 39, 8, 14],
-];
+pub use crate::constants::{KECCAK_RC, KECCAK_RHO};
 
 /// Apply the Keccak-f[1600] permutation (24 rounds) to a 25-word state.
 ///
diff --git a/executor/src/vm/memory.rs b/executor/src/vm/memory.rs
index ea84e2620..28d156ae2 100644
--- a/executor/src/vm/memory.rs
+++ b/executor/src/vm/memory.rs
@@ -1,5 +1,6 @@
-use std::collections::HashMap;
-use std::hash::{BuildHasher, Hasher};
+use alloc::vec::Vec;
+use core::hash::{BuildHasher, Hasher};
+use hashbrown::HashMap;
 
 /// Fast hasher for u64 keys - uses the key directly as the hash value.
 /// This avoids the overhead of SipHash for integer keys.
@@ -42,13 +43,12 @@ pub type U64HashMap<V> = HashMap<u64, V, U64BuildHasher>;
 /// The COMMIT AIR concatenates calls via the running `x254` index, so this
 /// is enforced as a running-total budget rather than a per-call limit.
 pub const MAX_PUBLIC_OUTPUT_TOTAL_SIZE: u64 = 1024 * 1024;
-/// Maximum size of the private input memory region (in bytes).
-pub const MAX_PRIVATE_INPUT_SIZE: u64 = 6700000;
-/// Fixed high address where private input is mapped. Guest programs can read
-/// directly from this address (ZisK-style memory-mapped input).
-/// Layout: 4-byte LE length prefix at `PRIVATE_INPUT_START_INDEX`, then data at +4.
-/// Must match `PRIVATE_INPUT_START` in `syscalls/src/syscalls.rs`.
-pub const PRIVATE_INPUT_START_INDEX: u64 = 0xFF000000;
+/// Private-input region size cap and mapped base address. Re-exported from
+/// `constants` (the canonical definitions) rather than redeclared here — the
+/// old local `MAX_PRIVATE_INPUT_SIZE` (6,700,000 bytes, ~6.4 MiB) shadowed the 64 MiB constant
+/// and rejected larger recursion blobs (e.g. multi-query / high-blowup inner
+/// proofs) with `PrivateInputSizeExceeded`.
+pub use crate::constants::{MAX_PRIVATE_INPUT_SIZE, PRIVATE_INPUT_START_INDEX};
 
 #[derive(Default, Debug)]
 pub struct Memory {
@@ -204,6 +204,13 @@ impl Memory {
         Ok(self.public_output.clone())
     }
 
+    /// Read-only access to the underlying 4-byte cell map. Exposed for
+    /// diagnostic tooling (e.g. counting the distinct 4 KB memory pages a
+    /// program touches) — not part of the normal execution interface.
+    pub fn cells(&self) -> &U64HashMap<[u8; 4]> {
+        &self.cells
+    }
+
     /// Pre-loads private input bytes at `PRIVATE_INPUT_START_INDEX` as a
     /// 4-byte LE length prefix followed by the raw data. The guest reads these
     /// bytes directly via normal RISC-V loads (ZisK-style memory-mapped input).
@@ -232,7 +239,7 @@ impl Memory {
             let aligned = addr - (addr % 4);
             let bytes = self.cells.get(&aligned).cloned().unwrap_or_default();
             let offset = (addr % 4) as usize;
-            let take = std::cmp::min(4 - offset, (end - addr) as usize);
+            let take = core::cmp::min(4 - offset, (end - addr) as usize);
             result.extend_from_slice(&bytes[offset..offset + take]);
             addr += take as u64;
         }
diff --git a/executor/src/vm/registers.rs b/executor/src/vm/registers.rs
index 61945b732..743b90542 100644
--- a/executor/src/vm/registers.rs
+++ b/executor/src/vm/registers.rs
@@ -1,6 +1,7 @@
-use std::fmt::Display;
+use alloc::vec::Vec;
+use core::fmt::Display;
 
-pub const STACK_TOP: u64 = 0xFFFFFFFFFFFFFFF0; // 64-bit max (Multiple of 16 for RV64 ABI)
+pub use crate::constants::STACK_TOP;
 
 #[derive(Debug)]
 /// Holds the current value of all 32 registers
@@ -48,13 +49,13 @@ impl Registers {
 }
 
 impl Display for Registers {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         const REGISTER_NAMES: [&str; 32] = [
             "zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0", "a1", "a2", "a3",
             "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11",
             "t3", "t4", "t5", "t6",
         ];
-        let values = std::iter::once(0u64).chain(self.0.iter().copied());
+        let values = core::iter::once(0u64).chain(self.0.iter().copied());
 
         for (i, chunk) in REGISTER_NAMES
             .iter()
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
index da9ceb9af..103f70f43 100644
--- a/prover/Cargo.toml
+++ b/prover/Cargo.toml
@@ -5,33 +5,44 @@ edition = "2024"
 license.workspace = true
 
 [features]
-default = ["parallel"]
-parallel = ["stark/parallel", "math/parallel", "crypto/parallel", "dep:rayon"]
+default = ["std", "prove", "parallel"]
+std = ["stark/std", "math/std", "crypto/std", "executor/std", "ecsm/std"]
+prove = []
+parallel = ["stark/parallel", "math/parallel", "crypto/parallel", "dep:rayon", "std"]
 cuda = ["stark/cuda"]
 test-cuda-faults = ["cuda", "stark/test-cuda-faults"]
-debug-checks = ["stark/debug-checks"]
-instruments = ["stark/instruments"]
-disk-spill = ["stark/disk-spill"]
+debug-checks = ["stark/debug-checks", "std"]
+instruments = ["stark/instruments", "std"]
+disk-spill = ["stark/disk-spill", "dep:sysinfo"]
 
 [dependencies]
-stark = { path = "../crypto/stark" }
-crypto = { path = "../crypto/crypto" }
-math = { path = "../crypto/math" }
-executor = { path = "../executor" }
-ecsm = { path = "../crypto/ecsm" }
-serde = { version = "1.0", features = ["derive"] }
+stark = { path = "../crypto/stark", default-features = false }
+crypto = { path = "../crypto/crypto", default-features = false, features = ["serde"] }
+math = { path = "../crypto/math", default-features = false, features = ["alloc", "lambdaworks-serde-binary"] }
+executor = { path = "../executor", default-features = false }
+ecsm = { path = "../crypto/ecsm", default-features = false }
+serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] }
+hashbrown = { version = "0.14", default-features = false, features = ["inline-more", "ahash"] }
 rayon = { version = "1.8.0", optional = true }
-sysinfo = { version = "0.31", default-features = false, features = ["system"] }
+# Only the `disk-spill` auto-storage path queries OS memory. `sysinfo` is a
+# host crate (pulls `std` via `memchr`), so keep it optional and out of the
+# bare-metal guest builds that depend on the prover with no default features.
+sysinfo = { version = "0.31", default-features = false, features = ["system"], optional = true }
 log = "0.4"
 sha3 = { version = "0.10.8", default-features = false }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
 
 [dev-dependencies]
 env_logger = "*"
 criterion = { version = "0.5", default-features = false }
 bincode = "1"
+postcard = { version = "1.0", features = ["alloc"] }
 tikv-jemallocator = "0.6"
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tiny-keccak = { version = "2.0", features = ["keccak"] }
+# Resolve guest PCs to source functions inside the histogram diagnostics
+# (replaces piping the printed addresses through the addr2line binary).
+addr2line = "0.27.0"
 # Enable stark's test-utils so cross-crate tests can reach
 # `compute_precomputed_commitment_for_testing`. Only active under cargo test/bench.
 stark = { path = "../crypto/stark", features = ["test-utils"] }
diff --git a/prover/src/constraints/cpu.rs b/prover/src/constraints/cpu.rs
index facc9e16d..4e3794a96 100644
--- a/prover/src/constraints/cpu.rs
+++ b/prover/src/constraints/cpu.rs
@@ -15,6 +15,9 @@
 //! `JALR` is the `mem_flags` byte read directly: under `BRANCH` only the JALR bit
 //! of `mem_flags` can be set, so `mem_flags ∈ {0,1} = JALR` there.
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/constraints/templates.rs b/prover/src/constraints/templates.rs
index ef5b6c036..ec7177039 100644
--- a/prover/src/constraints/templates.rs
+++ b/prover/src/constraints/templates.rs
@@ -11,6 +11,8 @@
 //!   - lhs, rhs, sum: DWordWL (2 × 32-bit words)
 //!   - Embeds carry constraints inline
 
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::{constraints::transition::TransitionConstraint, table::TableView};
diff --git a/prover/src/instruments.rs b/prover/src/instruments.rs
index f15223e18..ef82f5ad2 100644
--- a/prover/src/instruments.rs
+++ b/prover/src/instruments.rs
@@ -1,3 +1,8 @@
+use alloc::format;
+use alloc::string::{String, ToString};
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
 use std::collections::BTreeMap;
 use std::time::Duration;
 
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 81233d39f..23d95ae23 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -10,6 +10,15 @@
 //! assert!(lambda_vm_prover::verify(&vm_proof, &elf_bytes).unwrap());
 //! ```
 
+#![cfg_attr(not(feature = "std"), no_std)]
+// In guest builds (`prove` feature off) the prove-side helpers — trace generators,
+// executor-typed imports, internal Operation structs, etc. — are unreferenced.
+// They're real code, used by the host build, and there's nothing to fix there.
+// Silence the resulting dead_code / unused_imports noise in the guest build only.
+#![cfg_attr(not(feature = "prove"), allow(dead_code, unused_imports))]
+
+extern crate alloc;
+
 #[cfg(feature = "disk-spill")]
 pub mod auto_storage;
 pub mod constraints;
@@ -22,15 +31,24 @@ pub mod tables;
 pub mod test_utils;
 #[cfg(test)]
 pub mod tests;
+pub mod vkey;
 
-use std::fmt;
+pub use vkey::VmVerifyingKey;
+
+use alloc::format;
+use alloc::string::String;
+use alloc::vec;
+use alloc::vec::Vec;
+use core::fmt;
 
 use crypto::fiat_shamir::default_transcript::DefaultTranscript;
 use crypto::fiat_shamir::is_transcript::IsTranscript;
 use executor::elf::Elf;
+#[cfg(feature = "prove")]
 use executor::vm::execution::Executor;
 use math::field::element::FieldElement;
 use stark::config::Commitment;
+#[cfg(feature = "prove")]
 use stark::prover::{IsStarkProver, Prover};
 #[cfg(feature = "disk-spill")]
 use stark::storage_mode::StorageMode;
@@ -56,7 +74,7 @@ use crate::test_utils::{
     create_register_air, create_shift_air, create_store_air,
 };
 
-use stark::proof::options::{GoldilocksCubicProofOptions, ProofOptions};
+pub use stark::proof::options::{GoldilocksCubicProofOptions, ProofOptions};
 use stark::proof::stark::MultiProof;
 
 /// A run-length encoded range of contiguous zero-initialized 4KB pages.
@@ -201,7 +219,7 @@ impl fmt::Display for Error {
     }
 }
 
-impl std::error::Error for Error {}
+impl core::error::Error for Error {}
 
 /// Type alias for AIR-trace-public-inputs triples used in multi-table proving.
 type AirTracePair<'a> = (
@@ -243,6 +261,7 @@ pub(crate) struct VmAirs {
 
 impl VmAirs {
     /// Build `(air, trace, public_inputs)` triples for [`Prover::multi_prove`].
+    #[cfg(feature = "prove")]
     pub fn air_trace_pairs<'a>(&'a self, traces: &'a mut Traces) -> Vec<AirTracePair<'a>> {
         let mut pairs: Vec<AirTracePair<'a>> = vec![
             (&self.bitwise, &mut traces.bitwise, &()),
@@ -418,6 +437,32 @@ impl VmAirs {
         table_counts: &TableCounts,
         decode_commitment: Option<Commitment>,
         page_commitments: Option<&[(u64, Commitment)]>,
+    ) -> Self {
+        Self::new_with_vkey(
+            elf,
+            proof_options,
+            minimal_bitwise,
+            page_configs,
+            table_counts,
+            decode_commitment,
+            page_commitments,
+            None,
+        )
+    }
+
+    /// Same as [`Self::new`] but accepts a precomputed [`VmVerifyingKey`].
+    /// When `vkey` is `Some`, the bitwise, decode, keccak-RC, register and ELF-page
+    /// commitments come from it (explicit `decode_commitment` / `page_commitments`
+    /// win) — recomputing bitwise alone is ~87% of verifier cycles in the recursion guest.
+    pub fn new_with_vkey(
+        elf: &Elf,
+        proof_options: &ProofOptions,
+        minimal_bitwise: bool,
+        page_configs: &[crate::tables::page::PageConfig],
+        table_counts: &TableCounts,
+        decode_commitment: Option<Commitment>,
+        page_commitments: Option<&[(u64, Commitment)]>,
+        vkey: Option<&VmVerifyingKey>,
     ) -> Self {
         let cpus: Vec<_> = (0..table_counts.cpu)
             .map(|i| create_cpu_air(proof_options).with_name(&format!("CPU[{}]", i)))
@@ -425,10 +470,12 @@ impl VmAirs {
         let bitwise = if minimal_bitwise {
             create_bitwise_air(proof_options)
         } else {
-            create_bitwise_air(proof_options).with_preprocessed(
-                bitwise::preprocessed_commitment(proof_options),
-                bitwise::NUM_PRECOMPUTED_COLS,
-            )
+            let commitment = match vkey {
+                Some(vk) => vk.bitwise,
+                None => bitwise::preprocessed_commitment(proof_options),
+            };
+            create_bitwise_air(proof_options)
+                .with_preprocessed(commitment, bitwise::NUM_PRECOMPUTED_COLS)
         };
         let lts: Vec<_> = (0..table_counts.lt)
             .map(|i| create_lt_air(proof_options).with_name(&format!("LT[{}]", i)))
@@ -445,10 +492,12 @@ impl VmAirs {
         let loads: Vec<_> = (0..table_counts.load)
             .map(|i| create_load_air(proof_options).with_name(&format!("LOAD[{}]", i)))
             .collect();
-        let decode_root = decode_commitment.unwrap_or_else(|| {
-            decode::commitment_from_elf(elf, proof_options)
-                .expect("Failed to compute decode commitment")
-        });
+        let decode_root = decode_commitment
+            .or_else(|| vkey.map(|vk| vk.decode))
+            .unwrap_or_else(|| {
+                decode::commitment_from_elf(elf, proof_options)
+                    .expect("Failed to compute decode commitment")
+            });
         let decode = create_decode_air(proof_options)
             .with_preprocessed(decode_root, decode::NUM_PRECOMPUTED_COLS);
         let muls: Vec<_> = (0..table_counts.mul)
@@ -464,17 +513,21 @@ impl VmAirs {
         let commit = create_commit_air(proof_options);
         let keccak = create_keccak_air(proof_options);
         let keccak_rnd = create_keccak_rnd_air(proof_options);
+        let keccak_rc_commitment = vkey
+            .map(|vk| vk.keccak_rc)
+            .unwrap_or_else(|| tables::keccak_rc::preprocessed_commitment(proof_options));
         let keccak_rc = create_keccak_rc_air(proof_options).with_preprocessed(
-            tables::keccak_rc::preprocessed_commitment(proof_options),
+            keccak_rc_commitment,
             tables::keccak_rc::NUM_PRECOMPUTED_COLS,
         );
         let ecsm = create_ecsm_air(proof_options);
         let ec_scalar = create_ec_scalar_air(proof_options);
         let ecdas = create_ecdas_air(proof_options);
-        let register = create_register_air(proof_options).with_preprocessed(
-            register::preprocessed_commitment(proof_options, elf.entry_point),
-            register::NUM_PREPROCESSED_COLS,
-        );
+        let register_commitment = vkey
+            .map(|vk| vk.register)
+            .unwrap_or_else(|| register::preprocessed_commitment(proof_options, elf.entry_point));
+        let register = create_register_air(proof_options)
+            .with_preprocessed(register_commitment, register::NUM_PREPROCESSED_COLS);
         // Every zero-init page shares one preprocessed commitment: OFFSET is
         // page-relative and INIT is all-zero, so it depends only on
         // (blowup, coset) — all fixed here. Compute it once (static const
@@ -485,7 +538,8 @@ impl VmAirs {
 
         let pages: Vec<_> = page_configs
             .iter()
-            .map(|config| {
+            .enumerate()
+            .map(|(index, config)| {
                 let air = create_page_air(proof_options, config.page_base);
                 if config.is_private_input {
                     // Private-input pages: all columns are main trace (not preprocessed).
@@ -494,16 +548,21 @@ impl VmAirs {
                     air
                 } else if config.init_values.is_none() {
                     // Zero-init pages: the shared commitment computed once above.
+                    // `vkey.pages` caches the same static value for these slots,
+                    // so the local lookup is equivalent and equally cheap.
                     air.with_preprocessed(zero_init_commitment, page::NUM_PREPROCESSED_COLS)
                 } else {
                     // ELF data pages: INIT is program-specific, so the commitment is
                     // per-page. Prefer a caller-supplied `(page_base, commitment)`
-                    // (recursion guest); otherwise recompute from the ELF.
+                    // (recursion guest), then the vkey's cached per-page root
+                    // (indexed parallel to `page_configs`); otherwise recompute
+                    // from the ELF.
                     let commitment = page_commitments
                         .unwrap_or(&[])
                         .iter()
                         .find(|(pb, _)| *pb == config.page_base)
                         .map(|(_, c)| *c)
+                        .or_else(|| vkey.map(|vk| vk.pages[index]))
                         .unwrap_or_else(|| {
                             page::compute_precomputed_commitment(config, proof_options)
                         });
@@ -650,11 +709,13 @@ pub(crate) fn compute_expected_commit_bus_balance(
 // =============================================================================
 
 /// Prove an ELF binary execution. Returns a serializable proof bundle.
+#[cfg(feature = "prove")]
 pub fn prove(elf_bytes: &[u8]) -> Result<VmProof, Error> {
     prove_with_inputs(elf_bytes, &[])
 }
 
 /// Prove an ELF binary execution with private inputs. Returns a serializable proof bundle.
+#[cfg(feature = "prove")]
 pub fn prove_with_inputs(elf_bytes: &[u8], private_inputs: &[u8]) -> Result<VmProof, Error> {
     prove_with_options_and_inputs(
         elf_bytes,
@@ -672,6 +733,7 @@ pub fn prove_with_inputs(elf_bytes: &[u8], private_inputs: &[u8]) -> Result<VmPr
 /// is the sum of `rows × ⌈bus_interactions/2⌉` over all tables — i.e. the number
 /// of committed extension-field columns times rows (LogUp batching packs two
 /// interactions per column).
+#[cfg(feature = "prove")]
 pub fn count_elements(elf_bytes: &[u8], private_inputs: &[u8]) -> Result<(u64, u64), Error> {
     let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?;
     let executor = Executor::new(&program, private_inputs.to_vec())
@@ -694,6 +756,7 @@ pub fn count_elements(elf_bytes: &[u8], private_inputs: &[u8]) -> Result<(u64, u
 }
 
 /// Prove an ELF binary execution with custom proof options and max rows config.
+#[cfg(feature = "prove")]
 pub fn prove_with_options(
     elf_bytes: &[u8],
     proof_options: &ProofOptions,
@@ -704,6 +767,7 @@ pub fn prove_with_options(
 
 /// Prove an ELF binary execution with custom proof options, max rows config,
 /// and explicit private inputs.
+#[cfg(feature = "prove")]
 pub fn prove_with_options_and_inputs(
     elf_bytes: &[u8],
     private_inputs: &[u8],
@@ -883,6 +947,30 @@ pub fn verify_with_options(
     proof_options: &ProofOptions,
     decode_commitment: Option<Commitment>,
     page_commitments: Option<&[(u64, Commitment)]>,
+) -> Result<bool, Error> {
+    verify_with_options_with_vkey(
+        vm_proof,
+        elf_bytes,
+        proof_options,
+        decode_commitment,
+        page_commitments,
+        None,
+    )
+}
+
+/// Same as [`verify_with_options`] but accepts a precomputed [`VmVerifyingKey`].
+/// When `vkey` is `Some`, the preprocessed commitments it caches are used instead
+/// of being recomputed in `VmAirs::new_with_vkey`. A tampered vkey is caught by
+/// Fiat-Shamir: the verifier feeds the supplied commitments into the transcript,
+/// derives different challenges from what the prover used, and the openings stop
+/// matching. NOTE(review): assumes the vkey is not prover-chosen — confirm.
+pub fn verify_with_options_with_vkey(
+    vm_proof: &VmProof,
+    elf_bytes: &[u8],
+    proof_options: &ProofOptions,
+    decode_commitment: Option<Commitment>,
+    page_commitments: Option<&[(u64, Commitment)]>,
+    vkey: Option<&VmVerifyingKey>,
 ) -> Result<bool, Error> {
     // Validate table_counts before constructing AIRs.
     // A malicious prover could set counts to 0, removing entire constraint sets.
@@ -892,7 +980,7 @@ pub fn verify_with_options(
     // MAX_PRIVATE_INPUT_SIZE fits in ~26 pages of DEFAULT_PAGE_SIZE.
     {
         use crate::tables::page::DEFAULT_PAGE_SIZE;
-        use executor::vm::memory::MAX_PRIVATE_INPUT_SIZE;
+        use executor::constants::MAX_PRIVATE_INPUT_SIZE;
         let max_pages = (MAX_PRIVATE_INPUT_SIZE as usize + 4).div_ceil(DEFAULT_PAGE_SIZE) + 1;
         if vm_proof.num_private_input_pages > max_pages {
             return Err(Error::InvalidTableCounts(format!(
@@ -923,7 +1011,7 @@ pub fn verify_with_options(
         )));
     }
 
-    let airs = VmAirs::new(
+    let airs = VmAirs::new_with_vkey(
         &program,
         proof_options,
         false,
@@ -931,6 +1019,7 @@ pub fn verify_with_options(
         &vm_proof.table_counts,
         decode_commitment,
         page_commitments,
+        vkey,
     );
 
     // Recompute the COMMIT output bus offset from VmProof.public_output.
@@ -974,6 +1063,7 @@ pub fn verify_with_options(
 }
 
 /// Prove and verify in one call (convenience).
+#[cfg(feature = "prove")]
 pub fn prove_and_verify(elf_bytes: &[u8]) -> Result<bool, Error> {
     let vm_proof = prove(elf_bytes)?;
     verify(&vm_proof, elf_bytes)
diff --git a/prover/src/tables/bitwise.rs b/prover/src/tables/bitwise.rs
index 468e2a5b2..1ac4eddd0 100644
--- a/prover/src/tables/bitwise.rs
+++ b/prover/src/tables/bitwise.rs
@@ -25,6 +25,9 @@
 //! All lookups are provided as receivers with negative multiplicity,
 //! meaning other tables send to this table.
 
+use alloc::vec;
+use alloc::vec::Vec;
+
 use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
diff --git a/prover/src/tables/branch.rs b/prover/src/tables/branch.rs
index 1680b9edb..4c0b86f62 100644
--- a/prover/src/tables/branch.rs
+++ b/prover/src/tables/branch.rs
@@ -26,6 +26,8 @@
 //! - Sender: IS_HALFWORD (×3 for next_pc_high[0..3])
 //! - Receiver: BRANCH (provides branch targets to CPU)
 
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::TransitionConstraint;
@@ -155,9 +157,11 @@ impl BranchOperation {
 ///
 /// Duplicate operations (same pc, offset, register, jalr) are merged into a single row
 /// with their multiplicities summed. The table is then padded to the next power of 2.
+#[cfg(feature = "prove")]
 pub fn generate_branch_trace(
     operations: &[BranchOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
+    #[cfg(feature = "prove")]
     use std::collections::HashMap;
 
     // Deduplicate operations: (pc, offset, register, jalr) -> multiplicity
diff --git a/prover/src/tables/bytewise.rs b/prover/src/tables/bytewise.rs
index 82d7c8772..0721f3183 100644
--- a/prover/src/tables/bytewise.rs
+++ b/prover/src/tables/bytewise.rs
@@ -16,6 +16,8 @@
 //! - `res`: DWordBL (8 bytes) — output
 //! - `μ`: multiplicity
 
+use alloc::vec;
+use alloc::vec::Vec;
 use stark::lookup::{BusInteraction, BusValue, Multiplicity, Packing};
 use stark::trace::TraceTable;
 
@@ -97,7 +99,7 @@ impl BytewiseOperation {
 pub fn generate_bytewise_trace(
     operations: &[BytewiseOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
-    use std::collections::HashMap;
+    use hashbrown::HashMap;
 
     let mut op_map: HashMap<BytewiseOperation, u64> = HashMap::new();
     for op in operations {
diff --git a/prover/src/tables/commit.rs b/prover/src/tables/commit.rs
index c1663711e..88b0cdb97 100644
--- a/prover/src/tables/commit.rs
+++ b/prover/src/tables/commit.rs
@@ -43,6 +43,9 @@
 //! - `count_decr_carry_0`: SUB template carry_0 for count_decr + 1 = count (degree 2)
 //! - `count_decr_carry_1`: SUB template carry_1 for count_decr + 1 = count (degree 2)
 //!
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/cpu.rs b/prover/src/tables/cpu.rs
index 1752022b9..6a469ce73 100644
--- a/prover/src/tables/cpu.rs
+++ b/prover/src/tables/cpu.rs
@@ -26,6 +26,9 @@
 
 use super::types::{BusId, DecodeEntry, FE, GoldilocksExtension, GoldilocksField, VmTable, alu_op};
 use crate::Error;
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
 use executor::vm::{
     instruction::{decoding::Instruction, execution::SyscallNumbers},
     logs::Log,
@@ -216,6 +219,7 @@ impl CpuOperation {
     }
 
     /// Creates a CpuOperation from an executor Log and a DecodeEntry.
+    #[cfg(feature = "prove")]
     pub fn from_log(log: &Log, timestamp: u64, decode: DecodeEntry) -> Self {
         let f = decode.fields;
         // Real byte length: the column stores half.
@@ -228,8 +232,7 @@ impl CpuOperation {
         } else {
             (0, 0)
         };
-        let ecall_keccak =
-            f.ecall && log.src1_val == executor::vm::instruction::execution::KECCAK_SYSCALL_NUMBER;
+        let ecall_keccak = f.ecall && log.src1_val == executor::constants::KECCAK_SYSCALL_NUMBER;
         let keccak_state_addr = if ecall_keccak { log.src2_val } else { 0 };
         // The ECSM operand addresses (x10/x11/x12) are recovered from the register state
         // in the trace builder.
@@ -377,6 +380,7 @@ impl CpuOperation {
     }
 
     /// Creates a CpuOperation from Log and Instruction (convenience).
+    #[cfg(feature = "prove")]
     pub fn from_log_and_instruction(log: &Log, timestamp: u64, instruction: Instruction) -> Self {
         let decode = DecodeEntry::from_instruction(log.current_pc, instruction, 4);
         Self::from_log(log, timestamp, decode)
@@ -555,6 +559,7 @@ pub fn generate_cpu_trace(
 }
 
 /// Generates the CPU trace table directly from executor logs.
+#[cfg(feature = "prove")]
 pub fn generate_cpu_trace_from_logs(
     logs: &[Log],
     instructions: &U64HashMap<Instruction>,
@@ -582,6 +587,7 @@ pub fn collect_bitwise_ops(operations: &[CpuOperation]) -> Vec<super::bitwise::B
 }
 
 /// Collects all BITWISE lookups from executor logs.
+#[cfg(feature = "prove")]
 pub fn collect_bitwise_ops_from_logs(
     logs: &[Log],
     instructions: &U64HashMap<Instruction>,
diff --git a/prover/src/tables/cpu32.rs b/prover/src/tables/cpu32.rs
index d7dbd5d6f..bd4a4aeea 100644
--- a/prover/src/tables/cpu32.rs
+++ b/prover/src/tables/cpu32.rs
@@ -17,6 +17,9 @@
 //!
 //! Register reads use the cast-to-`DWordWL` encoding.
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
@@ -280,7 +283,7 @@ fn register_dword(lo0: usize, lo1: usize, hi: usize) -> Vec<BusValue> {
             packing: Packing::Direct,
         },
     ];
-    v.extend(std::iter::repeat_n(BusValue::constant(0), 6));
+    v.extend(core::iter::repeat_n(BusValue::constant(0), 6));
     v
 }
 
@@ -349,7 +352,7 @@ fn reg_write(
             packing: Packing::Direct,
         },
     ];
-    values.extend(std::iter::repeat_n(BusValue::constant(0), 6)); // value[2..8]
+    values.extend(core::iter::repeat_n(BusValue::constant(0), 6)); // value[2..8]
     values.extend(timestamp_plus(ts_offset));
     values.push(BusValue::constant(1)); // write2 = 1
     values.push(BusValue::constant(0)); // write4
diff --git a/prover/src/tables/decode.rs b/prover/src/tables/decode.rs
index 6cef6a482..6d8d448d0 100644
--- a/prover/src/tables/decode.rs
+++ b/prover/src/tables/decode.rs
@@ -31,6 +31,8 @@
 //!
 //! - **Receiver**: DECODE bus - receives lookups from CPU table
 
+use alloc::vec;
+use alloc::vec::Vec;
 use executor::elf::Elf;
 use executor::vm::instruction::decoding::{Instruction, InstructionError};
 use executor::vm::memory::U64HashMap;
@@ -85,7 +87,7 @@ pub const NUM_PRECOMPUTED_COLS: usize = 5;
 // Trace generation
 // =========================================================================
 
-use std::collections::HashMap;
+use hashbrown::HashMap;
 
 /// Map from PC to row index in the DECODE trace table.
 pub type PcToRow = HashMap<u64, usize>;
@@ -176,6 +178,7 @@ pub fn generate_decode_trace(
 /// Updates multiplicities in the DECODE trace table.
 ///
 /// For each PC in `lookups`, increments the MU column in the corresponding row.
+#[cfg(feature = "prove")]
 pub fn update_multiplicities(
     trace: &mut TraceTable<GoldilocksField, GoldilocksExtension>,
     pc_to_row: &PcToRow,
@@ -349,6 +352,7 @@ pub fn commitment_from_elf(
 // =========================================================================
 
 /// Result of ELF processing for DECODE table.
+#[cfg(feature = "prove")]
 pub struct ElfTables {
     /// DECODE trace table
     pub decode: TraceTable<GoldilocksField, GoldilocksExtension>,
@@ -364,6 +368,7 @@ pub struct ElfTables {
 /// - `pc_to_row`: Map from PC to row index for DECODE multiplicity updates
 ///
 /// Table has multiplicities initialized to 0.
+#[cfg(feature = "prove")]
 pub fn tables_from_elf(elf: &Elf) -> Result<ElfTables, InstructionError> {
     let mut decode_entries = Vec::new();
     let mut pc_to_row = HashMap::with_capacity(elf.data.iter().map(|s| s.values.len()).sum());
@@ -387,6 +392,7 @@ pub fn tables_from_elf(elf: &Elf) -> Result<ElfTables, InstructionError> {
 }
 
 /// Build DECODE trace table from entries.
+#[cfg(feature = "prove")]
 fn build_decode_table(
     entries: Vec<DecodeEntry>,
     pc_to_row: &mut PcToRow,
diff --git a/prover/src/tables/dvrm.rs b/prover/src/tables/dvrm.rs
index d3adbdc53..6d70b4e26 100644
--- a/prover/src/tables/dvrm.rs
+++ b/prover/src/tables/dvrm.rs
@@ -29,6 +29,9 @@
 //! - Sender: ZERO (×5 for div_by_zero, overflow, NEG template)
 //! - Receiver: DVRM (×2 for quotient and remainder results)
 
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 
 use math::field::element::FieldElement;
@@ -284,6 +287,7 @@ impl DvrmOperation {
 ///
 /// # Arguments
 /// * `operations` - List of (DvrmOperation, wants_remainder) pairs
+#[cfg(feature = "prove")]
 pub fn generate_dvrm_trace(
     operations: &[(DvrmOperation, bool)],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
diff --git a/prover/src/tables/ec_scalar.rs b/prover/src/tables/ec_scalar.rs
index dd8d483a2..7779008c5 100644
--- a/prover/src/tables/ec_scalar.rs
+++ b/prover/src/tables/ec_scalar.rs
@@ -16,6 +16,9 @@
 //!
 //! `limb = Σ 2^i · limb_bits[i]` is virtual (a linear combination, never stored).
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/ecdas.rs b/prover/src/tables/ecdas.rs
index 6d508d363..a6118026b 100644
--- a/prover/src/tables/ecdas.rs
+++ b/prover/src/tables/ecdas.rs
@@ -10,6 +10,9 @@
 //! See `spec/src/ecdas.toml`. Constraints are **unconditional**; padding rows set the quotients
 //! to `r` and `op = 0`, which makes every relation hold with zero carries.
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/ecsm.rs b/prover/src/tables/ecsm.rs
index f8ec0859d..bed4418c6 100644
--- a/prover/src/tables/ecsm.rs
+++ b/prover/src/tables/ecsm.rs
@@ -17,6 +17,9 @@
 //! drops when `µ = 0`). Only that single `µ·b` term is µ-gated. The range checks /
 //! virtual-carry checks remain µ-gated as before.
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use executor::vm::instruction::execution::ECSM_SYSCALL_NUMBER;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
@@ -261,12 +264,12 @@ fn memw_write(
 
 /// The eight bytes of a 256-bit value at `col + 8*chunk` as MEMW value elements.
 fn dword_bytes(col: usize, chunk: usize) -> [BusValue; 8] {
-    std::array::from_fn(|b| packed(col + 8 * chunk + b))
+    core::array::from_fn(|b| packed(col + 8 * chunk + b))
 }
 
 /// A register value `[lo, hi, 0, 0, 0, 0, 0, 0]` as MEMW value elements.
 fn register_value(lo_col: usize, hi_col: usize) -> [BusValue; 8] {
-    let mut v: [BusValue; 8] = std::array::from_fn(|_| BusValue::constant(0));
+    let mut v: [BusValue; 8] = core::array::from_fn(|_| BusValue::constant(0));
     v[0] = packed(lo_col);
     v[1] = packed(hi_col);
     v
@@ -760,7 +763,7 @@ where
     let inv = FieldElement::<F>::from(INV_SHIFT_32);
     let hl = kind.addend_hl_base();
     let bl = kind.sum_bl_base();
-    let mut c: [FieldElement<F>; 8] = std::array::from_fn(|_| FieldElement::zero());
+    let mut c: [FieldElement<F>; 8] = core::array::from_fn(|_| FieldElement::zero());
     let mut prev = FieldElement::<F>::zero();
     for (i, slot) in c.iter_mut().enumerate() {
         // addend1 word i (from halfwords): hl[2i] + 2^16·hl[2i+1]
diff --git a/prover/src/tables/eq.rs b/prover/src/tables/eq.rs
index 453caa928..459beb6da 100644
--- a/prover/src/tables/eq.rs
+++ b/prover/src/tables/eq.rs
@@ -21,6 +21,9 @@
 //! four range-checked halves is `0` iff `diff == 0` iff `a == b`), and
 //! `res = eq XOR invert`.
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
@@ -120,7 +123,7 @@ impl EqOperation {
 pub fn generate_eq_trace(
     operations: &[EqOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
-    use std::collections::HashMap;
+    use hashbrown::HashMap;
 
     let mut op_map: HashMap<EqOperation, u64> = HashMap::new();
     for op in operations {
diff --git a/prover/src/tables/halt.rs b/prover/src/tables/halt.rs
index 44bbf26cb..319653473 100644
--- a/prover/src/tables/halt.rs
+++ b/prover/src/tables/halt.rs
@@ -27,6 +27,8 @@
 //! ## Padding
 //! Single-row table (2^0 = 1), no padding needed.
 
+use alloc::vec;
+use alloc::vec::Vec;
 use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing};
 use stark::trace::TraceTable;
 
diff --git a/prover/src/tables/keccak.rs b/prover/src/tables/keccak.rs
index 0f305255b..3be69d15d 100644
--- a/prover/src/tables/keccak.rs
+++ b/prover/src/tables/keccak.rs
@@ -15,7 +15,10 @@
 //! | state_ptr      |  100 | Per-lane DWordHL addresses [25][4]             |
 //! | mu             |    1 | Multiplicity flag                              |
 
-use executor::vm::instruction::execution::KECCAK_SYSCALL_NUMBER;
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
+use executor::constants::KECCAK_SYSCALL_NUMBER;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/keccak_rc.rs b/prover/src/tables/keccak_rc.rs
index 3575c8ba1..8fafcf45e 100644
--- a/prover/src/tables/keccak_rc.rs
+++ b/prover/src/tables/keccak_rc.rs
@@ -8,6 +8,9 @@
 //! committed via a static lookup table (with recompute as fallback for
 //! `ProofOptions` not covered by the static table).
 
+use alloc::vec;
+use alloc::vec::Vec;
+
 use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
 use stark::config::{BatchedMerkleTree, Commitment};
diff --git a/prover/src/tables/keccak_rnd.rs b/prover/src/tables/keccak_rnd.rs
index 279b5c152..fe231f531 100644
--- a/prover/src/tables/keccak_rnd.rs
+++ b/prover/src/tables/keccak_rnd.rs
@@ -28,7 +28,10 @@
 //! `Cxz_right` is typed `[Bit, 4]` per spec d75944ee — HWSL with shift=1
 //! produces a single-bit carry, range-checked via IS_BIT polynomial constraints.
 
-use executor::vm::instruction::execution::{KECCAK_RC, KECCAK_RHO};
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
+use executor::constants::{KECCAK_RC, KECCAK_RHO};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
 use stark::lookup::{BusInteraction, BusValue, LinearTerm, Multiplicity, Packing};
 use stark::trace::TraceTable;
@@ -40,6 +43,7 @@ use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable, alu
 // =========================================================================
 
 pub mod cols {
+    use executor::constants::KECCAK_RHO;
     pub const TIMESTAMP_0: usize = 0;
     pub const TIMESTAMP_1: usize = 1;
     pub const ROUND: usize = 2;
@@ -159,7 +163,6 @@ pub mod cols {
     /// pair whose sum equals pi[x][y][z]. rbc is compile-time constant.
     #[inline]
     pub fn pi_src_cols(x: usize, y: usize, z: usize) -> (usize, usize) {
-        use executor::vm::instruction::execution::KECCAK_RHO;
         let sx = (x + 3 * y) % 5;
         let sy = x;
         let rho_offset = KECCAK_RHO[sx][sy] as usize;
@@ -239,6 +242,7 @@ fn hwsl(halfword: u16, shift: u8) -> (u16, u16) {
 ///
 /// Each `KeccakRoundOperation` produces 24 rows (one per round). The trace
 /// computes all intermediate values (θ, ρ, π, χ, ι) at byte granularity.
+#[cfg(feature = "prove")]
 pub fn generate_keccak_rnd_trace(
     ops: &[KeccakRoundOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
diff --git a/prover/src/tables/load.rs b/prover/src/tables/load.rs
index 250d565b2..1c56b41df 100644
--- a/prover/src/tables/load.rs
+++ b/prover/src/tables/load.rs
@@ -23,6 +23,9 @@
 //! - Sender: MEMW (to read from memory)
 //! - Sender: MSB8 (for sign bit extraction)
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/lt.rs b/prover/src/tables/lt.rs
index 0b1a57616..5b6359d4d 100644
--- a/prover/src/tables/lt.rs
+++ b/prover/src/tables/lt.rs
@@ -26,6 +26,8 @@
 //! - Receiver: ALU (all less-than lookups — CPU SLT/BLT/BGE dispatch and the
 //!   internal `memw`/`memw_aligned`/`dvrm` timestamp / |r|<|d| checks)
 
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::TransitionConstraint;
@@ -158,9 +160,11 @@ impl LtOperation {
 ///
 /// Duplicate operations (same lhs, rhs, signed) are merged into a single row
 /// with their multiplicities summed. The table is then padded to the next power of 2.
+#[cfg(feature = "prove")]
 pub fn generate_lt_trace(
     operations: &[LtOperation],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
+    #[cfg(feature = "prove")]
     use std::collections::HashMap;
 
     // Deduplicate operations: (lhs, rhs, signed) -> multiplicity
diff --git a/prover/src/tables/memw.rs b/prover/src/tables/memw.rs
index 2b240747c..36b9cbdb1 100644
--- a/prover/src/tables/memw.rs
+++ b/prover/src/tables/memw.rs
@@ -29,6 +29,9 @@
 //!
 //! ## Constraints (11 total: 2 custom + 2 IS_BIT for multiplicities + 7 IS_BIT for carry)
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/memw_aligned.rs b/prover/src/tables/memw_aligned.rs
index 8042d9052..75f17662d 100644
--- a/prover/src/tables/memw_aligned.rs
+++ b/prover/src/tables/memw_aligned.rs
@@ -34,6 +34,9 @@
 //! - IS_HALF[base_address[i]] for i ∈ [0, 1]
 //! - IS_WORD[base_address[2]]
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/memw_register.rs b/prover/src/tables/memw_register.rs
index 14a696cb9..2d5740c1d 100644
--- a/prover/src/tables/memw_register.rs
+++ b/prover/src/tables/memw_register.rs
@@ -38,6 +38,9 @@
 //! - 4 Memory bus tokens (read-old + write-new, per word)
 //! - 2 MEMW output interactions (read + write, from CPU)
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/mul.rs b/prover/src/tables/mul.rs
index ba414dc63..3406c242a 100644
--- a/prover/src/tables/mul.rs
+++ b/prover/src/tables/mul.rs
@@ -30,6 +30,9 @@
 //! - Receiver: ALU (×2 for lo and hi results — every MUL lookup, CPU
 //!   MUL/MULH dispatch and dvrm's internal `d*q` consistency)
 
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 
 use math::field::element::FieldElement;
@@ -292,6 +295,7 @@ impl MulOperation {
 ///
 /// # Arguments
 /// * `operations` - List of (MulOperation, wants_hi) pairs
+#[cfg(feature = "prove")]
 pub fn generate_mul_trace(
     operations: &[(MulOperation, bool)],
 ) -> TraceTable<GoldilocksField, GoldilocksExtension> {
@@ -798,8 +802,8 @@ impl MulConstraint {
 
         // Build sign-extended values
         let sign_fill = FieldElement::<F>::from(SIGN_FILL);
-        let mut lhs_ext: [FieldElement<F>; 8] = std::array::from_fn(|_| FieldElement::zero());
-        let mut rhs_ext: [FieldElement<F>; 8] = std::array::from_fn(|_| FieldElement::zero());
+        let mut lhs_ext: [FieldElement<F>; 8] = core::array::from_fn(|_| FieldElement::zero());
+        let mut rhs_ext: [FieldElement<F>; 8] = core::array::from_fn(|_| FieldElement::zero());
 
         lhs_ext[..4].clone_from_slice(&lhs);
         rhs_ext[..4].clone_from_slice(&rhs);
diff --git a/prover/src/tables/page.rs b/prover/src/tables/page.rs
index 174225ffa..edb9c8f36 100644
--- a/prover/src/tables/page.rs
+++ b/prover/src/tables/page.rs
@@ -30,7 +30,10 @@
 //! | PAGE-C3    | Memory  | `[0, address, 0, init]` | -1 (receiver) |
 //! | PAGE-C4    | Memory  | `[0, address, timestamp, fini]` | 1 (sender) |
 
-use std::collections::HashMap;
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
+use hashbrown::HashMap;
 
 use math::fft::bit_reversing::in_place_bit_reverse_permute;
 use math::polynomial::Polynomial;
@@ -50,7 +53,7 @@ use super::types::{BusId, FE, GoldilocksExtension, GoldilocksField, VmTable};
 pub const DEFAULT_PAGE_SIZE: usize = 1 << 18;
 
 /// Stack top address (where SP starts). Re-exported from executor.
-pub use executor::vm::registers::STACK_TOP;
+pub use executor::constants::STACK_TOP;
 
 // =========================================================================
 // Column indices for PAGE table
@@ -98,6 +101,7 @@ pub struct FinalByteState {
 }
 
 /// Map from byte address to final state.
+#[cfg(feature = "prove")]
 pub type FinalStateMap = HashMap<u64, FinalByteState>;
 
 /// Configuration for a single PAGE table instance.
@@ -163,6 +167,7 @@ impl PageConfig {
 /// ## Returns
 ///
 /// The trace table for this page.
+#[cfg(feature = "prove")]
 pub fn generate_page_trace(
     config: &PageConfig,
     final_state: &FinalStateMap,
@@ -333,6 +338,26 @@ pub fn compute_precomputed_commitment(config: &PageConfig, options: &ProofOption
     tree.root
 }
 
+/// Resolves the preprocessed commitment for a page via the cheapest route.
+///
+/// A page whose `init_values` is `None` is zero-init: its commitment depends
+/// only on `(blowup, coset)`, so it is served by
+/// [`zero_init_preprocessed_commitment`] rather than by rebuilding the FFT +
+/// Merkle tree. A page carrying init data (e.g. an ELF data page) is
+/// program-specific and is delegated to [`compute_precomputed_commitment`].
+/// The same per-page choice is made in `VmAirs::new_with_vkey`, so a vkey
+/// built from this function caches exactly the commitments the verifier
+/// expects.
+///
+/// Private-input pages have no preprocessed commitment; callers must skip
+/// them before calling this.
+pub fn precomputed_commitment_cached(config: &PageConfig, options: &ProofOptions) -> Commitment {
+    if config.init_values.is_none() {
+        return zero_init_preprocessed_commitment(options);
+    }
+    compute_precomputed_commitment(config, options)
+}
+
 /// Returns the zero-init PAGE preprocessed commitment.
 ///
 /// Looks up `blowup_factor` in [`static_zero_page_commitment`] when
diff --git a/prover/src/tables/register.rs b/prover/src/tables/register.rs
index 5a09fb2fa..26431fc16 100644
--- a/prover/src/tables/register.rs
+++ b/prover/src/tables/register.rs
@@ -18,6 +18,9 @@
 //! | fini | Word | Final value after execution |
 //! | timestamp | DWordWL | Final timestamp (1 if never accessed) |
 
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 
 use math::fft::bit_reversing::in_place_bit_reverse_permute;
@@ -91,6 +94,7 @@ pub struct FinalRegisterWordState {
 }
 
 /// Map from register Word address to final state.
+#[cfg(feature = "prove")]
 pub type FinalRegisterStateMap = HashMap<u64, FinalRegisterWordState>;
 
 // =========================================================================
@@ -144,6 +148,7 @@ fn init_value_for_address(word_addr: u64, entry_point: u64) -> u32 {
 /// ## Returns
 ///
 /// The trace table for registers.
+#[cfg(feature = "prove")]
 pub fn generate_register_trace(
     final_state: &FinalRegisterStateMap,
     entry_point: u64,
diff --git a/prover/src/tables/shift.rs b/prover/src/tables/shift.rs
index 3115784f6..453c8736f 100644
--- a/prover/src/tables/shift.rs
+++ b/prover/src/tables/shift.rs
@@ -17,6 +17,8 @@
 //! - Senders: MSB16, BYTE_ALU[AND] (×3), ZERO, HWSL (×5), IS_HALFWORD (×4)
 //! - Receiver: SHIFT (from CPU)
 
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::TransitionConstraint;
diff --git a/prover/src/tables/store.rs b/prover/src/tables/store.rs
index 1cdf0334e..6960d4ed2 100644
--- a/prover/src/tables/store.rs
+++ b/prover/src/tables/store.rs
@@ -19,6 +19,9 @@
 //! - `value`: DWordBL (8 bytes) — value to store
 //! - `μ`: multiplicity
 
+use alloc::boxed::Box;
+use alloc::vec;
+use alloc::vec::Vec;
 use math::field::element::FieldElement;
 use math::field::traits::{IsField, IsSubFieldOf};
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs
index 02371c1a0..42103f266 100644
--- a/prover/src/tables/trace_builder.rs
+++ b/prover/src/tables/trace_builder.rs
@@ -25,13 +25,20 @@
 //! // Use traces.cpus, traces.bitwise, traces.lts, traces.memws, traces.loads
 //! ```
 
+use alloc::format;
+use alloc::vec;
+use alloc::vec::Vec;
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 #[cfg(feature = "disk-spill")]
 use std::collections::HashSet;
 
 use executor::elf::Elf;
+#[cfg(feature = "prove")]
 use executor::vm::instruction::decoding::Instruction;
+#[cfg(feature = "prove")]
 use executor::vm::logs::Log;
+#[cfg(feature = "prove")]
 use executor::vm::memory::U64HashMap;
 #[cfg(feature = "disk-spill")]
 use stark::storage_mode::StorageMode;
@@ -59,12 +66,18 @@ use super::memw::{self, MemwOperation};
 use super::memw_aligned;
 use super::memw_register;
 use super::mul::{self, MulOperation};
-use super::page::{self, FinalByteState, FinalStateMap, PageConfig};
-use super::register::{self, FinalRegisterStateMap, FinalRegisterWordState};
+use super::page::{self, PageConfig};
+#[cfg(feature = "prove")]
+use super::page::{FinalByteState, FinalStateMap};
+#[cfg(feature = "prove")]
+use super::register::FinalRegisterStateMap;
+use super::register::{self, FinalRegisterWordState};
 use super::shift::{self, ShiftOperation};
 use super::store;
 use super::types::{GoldilocksExtension, GoldilocksField};
 use crate::Error;
+#[cfg(feature = "prove")]
+use crate::tables::decode::PcToRow;
 
 // =============================================================================
 // Memory and Register State Tracking
@@ -77,11 +90,13 @@ type MemoryCell = (u8, u64);
 type RegisterCell = (u64, u64);
 
 /// Memory state tracker for generating MEMW/LOAD traces.
+#[cfg(feature = "prove")]
 struct MemoryState {
     /// Map from byte address to (value, timestamp)
     cells: HashMap<u64, MemoryCell>,
 }
 
+#[cfg(feature = "prove")]
 impl MemoryState {
     fn new() -> Self {
         Self {
@@ -128,7 +143,8 @@ impl MemoryState {
         if private_input.is_empty() {
             return;
         }
-        use executor::vm::memory::PRIVATE_INPUT_START_INDEX;
+        #[cfg(feature = "prove")]
+        use executor::constants::PRIVATE_INPUT_START_INDEX;
         let start = PRIVATE_INPUT_START_INDEX;
         for (i, &b) in private_input_bytes(private_input).iter().enumerate() {
             self.cells.insert(start + i as u64, (b, 0));
@@ -167,6 +183,7 @@ impl MemoryState {
 }
 
 /// Register state tracker for generating MEMW register traces.
+#[cfg(feature = "prove")]
 struct RegisterState {
     /// Register file: (value, last_write_timestamp)
     regs: [RegisterCell; 32],
@@ -176,6 +193,7 @@ struct RegisterState {
     pc_register: RegisterCell,
 }
 
+#[cfg(feature = "prove")]
 impl RegisterState {
     fn new(entry_point: u64) -> Self {
         // Per spec/memory.typ: "register initialization happens at timestamp 1"
@@ -296,6 +314,7 @@ impl RegisterState {
 // =============================================================================
 
 /// Get byte count and signed flag from CpuOperation memory flags.
+#[cfg(feature = "prove")]
 fn cpu_op_to_bytes_and_signed(op: &CpuOperation) -> (usize, bool) {
     let f = &op.decode.fields;
     (f.mem_bytes(), f.mem_signed())
@@ -304,6 +323,7 @@ fn cpu_op_to_bytes_and_signed(op: &CpuOperation) -> (usize, bool) {
 /// Pack a 64-bit register value into the MEMW value format.
 ///
 /// For register operations, values are packed as [lo32, hi32, 0, 0, 0, 0, 0, 0].
+#[cfg(feature = "prove")]
 fn pack_register_value(value: u64) -> [u64; 8] {
     [value & 0xFFFF_FFFF, value >> 32, 0, 0, 0, 0, 0, 0]
 }
@@ -315,6 +335,7 @@ fn pack_register_value(value: u64) -> [u64; 8] {
 /// Collects CPU operations from execution logs.
 ///
 /// Returns a vector of CpuOperation, one per log entry.
+#[cfg(feature = "prove")]
 fn collect_cpu_ops(
     logs: &[Log],
     instructions: &U64HashMap<Instruction>,
@@ -356,6 +377,7 @@ fn collect_cpu_ops(
 /// Returns: (memw_ops, load_ops, lt_ops, shift_ops, bitwise_ops, commit_ops, keccak_ops,
 /// cpu32_ops, ecsm_ops, ec_scalar_ops, ecdas_ops)
 #[allow(clippy::type_complexity)]
+#[cfg(feature = "prove")]
 fn collect_ops_from_cpu(
     cpu_ops: &[CpuOperation],
     memory_state: &mut MemoryState,
@@ -534,6 +556,7 @@ fn collect_ops_from_cpu(
 /// Collects a LOAD operation and corresponding MEMW read from CpuOperation.
 ///
 /// Returns: (memw_op, load_op, bitwise_ops)
+#[cfg(feature = "prove")]
 fn collect_load_op_from_cpu(
     op: &CpuOperation,
     memory_state: &mut MemoryState,
@@ -596,6 +619,7 @@ fn collect_load_op_from_cpu(
 /// Collects a STORE operation as a MEMW write from CpuOperation.
 ///
 /// Returns: memw_op
+#[cfg(feature = "prove")]
 fn collect_store_op_from_cpu(op: &CpuOperation, memory_state: &mut MemoryState) -> MemwOperation {
     // res contains the effective address (base + offset)
     let base_address = op.res;
@@ -643,6 +667,7 @@ fn collect_store_op_from_cpu(op: &CpuOperation, memory_state: &mut MemoryState)
 /// `memory_state` / `register_state` (the offline read-old + write-new model), so later
 /// accesses always observe a strictly smaller old timestamp.
 #[allow(clippy::needless_range_loop)]
+#[cfg(feature = "prove")]
 fn collect_ecsm_ops(
     op: &CpuOperation,
     memory_state: &mut MemoryState,
@@ -760,6 +785,7 @@ fn collect_ecsm_ops(
 /// Collects register read/write operations (M1, M3, M5) from CpuOperation.
 ///
 /// Returns: Vec of MEMW operations for register accesses
+#[cfg(feature = "prove")]
 fn collect_register_ops_from_cpu(
     op: &CpuOperation,
     register_state: &mut RegisterState,
@@ -996,6 +1022,7 @@ fn cpu32_chip_op(
 /// Note: x17 (syscall number) is read by CPU's M1 interaction (read_register1=true, rs1=17).
 ///
 /// Returns: Vec of MEMW operations
+#[cfg(feature = "prove")]
 fn collect_commit_memw_ops(
     op: &CpuOperation,
     register_state: &mut RegisterState,
@@ -1092,6 +1119,7 @@ fn collect_commit_memw_ops(
 /// REGISTER final token is set separately by the caller, at the last padding
 /// timestamp). Also updates `register_state` so `to_final_state_map()` reflects
 /// the finalized GP register values.
+#[cfg(feature = "prove")]
 fn collect_halt_ops(register_state: &mut RegisterState) -> Vec<MemwOperation> {
     let mut ops = Vec::with_capacity(32);
     let ts = u64::MAX;
@@ -1147,6 +1175,7 @@ fn collect_halt_ops(register_state: &mut RegisterState) -> Vec<MemwOperation> {
 ///
 /// Generates 25 read operations (input lanes at timestamp) and 25 write
 /// operations (output lanes at timestamp+1). Each operation is 8 bytes wide.
+#[cfg(feature = "prove")]
 fn collect_keccak_memw_ops(
     op: &CpuOperation,
     input: &[u64; 25],
@@ -1215,6 +1244,7 @@ fn collect_keccak_memw_ops(
 /// - MEMW-C4 through MEMW-C7: old_timestamp[i] < timestamp (based on width)
 ///
 /// Returns: Vec of LT operations
+#[cfg(feature = "prove")]
 fn collect_lt_from_memw(memw_ops: &[MemwOperation]) -> Vec<LtOperation> {
     let mut lt_ops = Vec::with_capacity(memw_ops.len() * 8);
 
@@ -1267,6 +1297,7 @@ fn collect_lt_from_memw(memw_ops: &[MemwOperation]) -> Vec<LtOperation> {
 /// Collects LT operations from MEMW_A for timestamp ordering.
 ///
 /// Each aligned operation has a single old_timestamp < timestamp check.
+#[cfg(feature = "prove")]
 fn collect_lt_from_memw_aligned(memw_aligned_ops: &[MemwOperation]) -> Vec<LtOperation> {
     // Address overflow LT checks (R1-R3 in MEMW) are intentionally absent.
     // Alignment guarantees addr + (width-1) never wraps: the largest width-N
@@ -1282,6 +1313,7 @@ fn collect_lt_from_memw_aligned(memw_aligned_ops: &[MemwOperation]) -> Vec<LtOpe
 /// An operation is aligned if:
 /// 1. For width > 1: base_address is aligned to width (low bits are zero)
 /// 2. All accessed bytes share the same old_timestamp
+#[cfg(feature = "prove")]
 fn is_aligned_op(op: &MemwOperation) -> bool {
     let low = (op.base_address & 0xFFFF_FFFF) as u32;
     let width = op.width as u32;
@@ -1308,6 +1340,7 @@ fn is_aligned_op(op: &MemwOperation) -> bool {
 ///
 /// IS_HALF[base_address[i]] for i ∈ [0, 1] and IS_WORD[base_address[2]] are
 /// assumptions — the caller's (CPU's) responsibility.
+#[cfg(feature = "prove")]
 fn collect_bitwise_from_memw_aligned(ops: &[MemwOperation]) -> Vec<BitwiseOperation> {
     let mut bitwise_ops = Vec::with_capacity(ops.len());
 
@@ -1351,6 +1384,7 @@ fn collect_bitwise_from_memw_aligned(ops: &[MemwOperation]) -> Vec<BitwiseOperat
 ///
 /// Width-1 register ops (e.g. COMMIT x254) stay in MEMW, which has
 /// dynamic write flags. MEMW_R hardcodes write2=1.
+#[cfg(feature = "prove")]
 pub(crate) fn is_register_op(op: &MemwOperation) -> bool {
     if !op.is_register || op.width != 2 {
         return false;
@@ -1372,6 +1406,7 @@ pub(crate) fn is_register_op(op: &MemwOperation) -> bool {
 ///
 /// For each register op: checks that `timestamp[0] - old_timestamp_lo - 1` fits
 /// in a halfword (proving the timestamp delta is in range [1, 2^16]).
+#[cfg(feature = "prove")]
 fn collect_bitwise_from_memw_register(ops: &[MemwOperation]) -> Vec<BitwiseOperation> {
     ops.iter()
         .map(|op| {
@@ -1398,6 +1433,7 @@ fn collect_bitwise_from_memw_register(ops: &[MemwOperation]) -> Vec<BitwiseOpera
 /// Collects bitwise lookups from LT operations (MSB16 and IS_HALFWORD).
 ///
 /// Returns: Vec of bitwise lookups
+#[cfg(feature = "prove")]
 fn collect_bitwise_from_lt(lt_ops: &[LtOperation]) -> Vec<BitwiseOperation> {
     let mut bitwise_ops = Vec::with_capacity(lt_ops.len() * 8);
 
@@ -1457,6 +1493,7 @@ fn collect_bitwise_from_lt(lt_ops: &[LtOperation]) -> Vec<BitwiseOperation> {
 /// op that spans two instances is sent twice and must be tallied twice.
 ///
 /// Returns: Vec of bitwise lookups
+#[cfg(feature = "prove")]
 pub(crate) fn collect_bitwise_from_mul(
     mul_ops: &[(MulOperation, bool)],
     max_rows_mul: usize,
@@ -1514,7 +1551,7 @@ pub(crate) fn collect_bitwise_from_mul(
     // MSB16: dedup per chunk — the MUL AIR sends Msb16 once per unique signed row
     // per instance, so the collector must mirror the same chunk boundary.
     for chunk in mul_ops.chunks(max_rows_mul) {
-        let mut msb16_seen = std::collections::HashSet::new();
+        let mut msb16_seen = hashbrown::HashSet::new();
         for (op, _wants_hi) in chunk {
             if msb16_seen.insert((op.lhs, op.lhs_signed, op.rhs, op.rhs_signed)) {
                 if op.lhs_signed {
@@ -1554,6 +1591,7 @@ pub(crate) fn collect_bitwise_from_mul(
 /// chunk, mirroring `chunk_and_generate`.
 ///
 /// Returns: Vec of bitwise lookups
+#[cfg(feature = "prove")]
 pub(crate) fn collect_bitwise_from_dvrm(
     dvrm_ops: &[(DvrmOperation, bool)],
     max_rows_dvrm: usize,
@@ -1640,7 +1678,7 @@ pub(crate) fn collect_bitwise_from_dvrm(
 
     // MSB16: same per-chunk dedup as MUL (Column(SIGNED) is a bit, not a count).
     for chunk in dvrm_ops.chunks(max_rows_dvrm) {
-        let mut msb16_seen = std::collections::HashSet::new();
+        let mut msb16_seen = hashbrown::HashSet::new();
         for (op, _wants_remainder) in chunk {
             if op.signed && msb16_seen.insert(op.clone()) {
                 let r = op.compute_remainder();
@@ -1674,7 +1712,7 @@ pub(crate) fn collect_bitwise_from_dvrm(
 
     // ZERO (NEG template): same — SIGN_R/SIGN_D are bits, dedup per chunk.
     for chunk in dvrm_ops.chunks(max_rows_dvrm) {
-        let mut zero_seen = std::collections::HashSet::new();
+        let mut zero_seen = hashbrown::HashSet::new();
         for (op, _wants_remainder) in chunk {
             if zero_seen.insert(op.clone()) {
                 // C3: NEG for r (when sign_r = 1)
@@ -1724,6 +1762,7 @@ pub(crate) fn collect_bitwise_from_dvrm(
 /// - IS_HALFWORD[next_pc_high[0..3]] - range checks for bits 16-63
 ///
 /// Returns: Vec of bitwise lookups
+#[cfg(feature = "prove")]
 fn collect_bitwise_from_branch(branch_ops: &[BranchOperation]) -> Vec<BitwiseOperation> {
     let mut bitwise_ops = Vec::with_capacity(branch_ops.len() * 5);
 
@@ -1786,6 +1825,7 @@ fn collect_bitwise_from_branch(branch_ops: &[BranchOperation]) -> Vec<BitwiseOpe
 ///
 /// Per padding row: 1 AreBytes(0,0) for RS1+RS2, 1 AreBytes(0) for RD, and
 /// 12 AreBytes(0,0) for ARG1/ARG2/RES byte pairs = 14 ops.
+#[cfg(feature = "prove")]
 fn collect_byte_check_ops_for_padding(num_padding_rows: usize) -> Vec<BitwiseOperation> {
     if num_padding_rows == 0 {
         return Vec::new();
@@ -1832,10 +1872,10 @@ fn private_input_bytes(private_input: &[u8]) -> Vec<u8> {
         .collect()
 }
 
-fn build_init_page_data(elf: &Elf, private_input: &[u8]) -> HashMap<u64, Vec<u8>> {
-    use executor::vm::memory::PRIVATE_INPUT_START_INDEX;
+fn build_init_page_data(elf: &Elf, private_input: &[u8]) -> hashbrown::HashMap<u64, Vec<u8>> {
+    use executor::constants::PRIVATE_INPUT_START_INDEX;
     let page_size = page::DEFAULT_PAGE_SIZE;
-    let mut init_page_data: HashMap<u64, Vec<u8>> = HashMap::new();
+    let mut init_page_data: hashbrown::HashMap<u64, Vec<u8>> = hashbrown::HashMap::new();
     for segment in &elf.data {
         for (i, &word) in segment.values.iter().enumerate() {
             let word_addr = segment.base_addr + (i as u64 * 4);
@@ -1865,11 +1905,13 @@ fn build_init_page_data(elf: &Elf, private_input: &[u8]) -> HashMap<u64, Vec<u8>
     init_page_data
 }
 
+#[cfg(feature = "prove")]
 fn collect_bitwise_from_page(
     elf: &Elf,
     memory_state: &MemoryState,
     private_input: &[u8],
 ) -> Vec<BitwiseOperation> {
+    #[cfg(feature = "prove")]
     use std::collections::BTreeSet;
 
     let page_size = page::DEFAULT_PAGE_SIZE;
@@ -1922,6 +1964,7 @@ fn collect_bitwise_from_page(
 
 /// Expand one Commit ECALL into its per-byte COMMIT rows using the memory state
 /// at the moment the ECALL executes.
+#[cfg(feature = "prove")]
 fn expand_commit_operations_for_ecall(
     ecall: &CpuOperation,
     memory_state: &MemoryState,
@@ -1964,6 +2007,7 @@ fn expand_commit_operations_for_ecall(
 /// - Zero for end detection (1 per real row, mult = mu)
 ///
 /// Note: AreBytes for value is intentionally omitted per spec.
+#[cfg(feature = "prove")]
 fn collect_bitwise_from_commit(commit_ops: &[CommitOperation]) -> Vec<BitwiseOperation> {
     let mut lookups = Vec::new();
 
@@ -2097,6 +2141,7 @@ pub(crate) fn collect_bitwise_from_ecdas(ops: &[ecdas::EcdasOperation]) -> Vec<B
 /// interactions; the keccak core chip sends IS_HALF interactions.
 /// All of these must be registered so the BITWISE table's multiplicities are correct.
 #[allow(clippy::needless_range_loop)]
+#[cfg(feature = "prove")]
 pub(crate) fn collect_bitwise_from_keccak(keccak_ops: &[KeccakOperation]) -> Vec<BitwiseOperation> {
     use executor::vm::instruction::execution::{KECCAK_RC, KECCAK_RHO};
 
@@ -2347,6 +2392,7 @@ pub(crate) fn collect_bitwise_from_keccak(keccak_ops: &[KeccakOperation]) -> Vec
 
 /// every address accessed during execution (ELF init + runtime stores/loads).
 /// ELF pages get their init data from the binary; all others are zero-init.
+#[cfg(feature = "prove")]
 fn generate_page_tables(
     elf: &Elf,
     memory_state: &MemoryState,
@@ -2355,6 +2401,7 @@ fn generate_page_tables(
     Vec<TraceTable<GoldilocksField, GoldilocksExtension>>,
     Vec<PageConfig>,
 ) {
+    #[cfg(feature = "prove")]
     use std::collections::BTreeSet;
 
     // Collect init data from ELF segments + private input region
@@ -2378,14 +2425,14 @@ fn generate_page_tables(
     let mut page_configs = Vec::new();
 
     // Determine which page bases hold private input data.
-    let private_input_page_bases: std::collections::BTreeSet<u64> = if !private_input.is_empty() {
-        use executor::vm::memory::PRIVATE_INPUT_START_INDEX;
+    let private_input_page_bases: alloc::collections::BTreeSet<u64> = if !private_input.is_empty() {
+        use executor::constants::PRIVATE_INPUT_START_INDEX;
         let total_bytes = 4 + private_input.len(); // length prefix + data
         (0..total_bytes)
             .map(|i| page::page_base_for_address(PRIVATE_INPUT_START_INDEX + i as u64))
             .collect()
     } else {
-        std::collections::BTreeSet::new()
+        alloc::collections::BTreeSet::new()
     };
 
     for &page_base in &page_bases {
@@ -2551,6 +2598,7 @@ fn chunk_and_generate<T>(
 /// Takes the raw output of `collect_ops_from_cpu` plus `register_state`
 /// (for HALT finalization), and returns fully-routed ops ready for Phase 3+.
 #[allow(clippy::too_many_arguments)]
+#[cfg(feature = "prove")]
 fn collect_all_ops(
     cpu_ops: Vec<CpuOperation>,
     mut memw_ops: Vec<MemwOperation>,
@@ -2706,13 +2754,14 @@ fn collect_all_ops(
 /// `elf` controls PAGE table generation: `Some(elf)` generates real PAGE tables
 /// and PAGE bitwise lookups; `None` produces empty page tables.
 #[allow(clippy::too_many_arguments)]
+#[cfg(feature = "prove")]
 fn build_traces(
     ops: CollectedOps,
     elf: Option<&Elf>,
     memory_state: &MemoryState,
     entry_point: u64,
     decode_trace: TraceTable<GoldilocksField, GoldilocksExtension>,
-    decode_pc_to_row: HashMap<u64, usize>,
+    decode_pc_to_row: PcToRow,
     mut register_state: RegisterState,
     max_rows: &super::MaxRowsConfig,
     #[cfg(feature = "disk-spill")] storage_mode: StorageMode,
@@ -2927,7 +2976,7 @@ fn build_traces(
     // When CPU is split, each chunk pads independently
     let mut decode = decode_trace;
     let mut decode_lookups: Vec<u64> = cpu_ops.iter().map(|op| op.decode.pc).collect();
-    decode_lookups.extend(std::iter::repeat_n(cpu::CPU_PADDING_PC, num_padding_rows));
+    decode_lookups.extend(core::iter::repeat_n(cpu::CPU_PADDING_PC, num_padding_rows));
     decode::update_multiplicities(&mut decode, &decode_pc_to_row, &decode_lookups);
 
     // Prepare register final state before scope (needs register_state ownership)
@@ -3574,7 +3623,7 @@ impl Traces {
     /// init data populated. Used by the verifier to reconstruct the ELF
     /// portion of the PAGE table layout.
     pub fn page_configs_from_elf(elf: &Elf) -> Vec<PageConfig> {
-        use std::collections::BTreeSet;
+        use alloc::collections::BTreeSet;
 
         let init_page_data = build_init_page_data(elf, &[]);
 
@@ -3617,7 +3666,7 @@ impl Traces {
 
         // Add private-input pages (non-preprocessed, verifier doesn't know init values)
         if num_private_input_pages > 0 {
-            use executor::vm::memory::PRIVATE_INPUT_START_INDEX;
+            use executor::constants::PRIVATE_INPUT_START_INDEX;
             let first_page_base = page::page_base_for_address(PRIVATE_INPUT_START_INDEX);
             for i in 0..num_private_input_pages {
                 configs.push(PageConfig {
@@ -3682,6 +3731,7 @@ impl Traces {
     /// 3. MEMW → LT operations (timestamp ordering)
     /// 4. LT, MEMW, Branch → Bitwise lookups
     /// 5. Generate all traces including PAGE tables
+    #[cfg(feature = "prove")]
     pub fn from_elf_and_logs(
         elf: &Elf,
         logs: &[Log],
@@ -3755,6 +3805,7 @@ impl Traces {
     /// as it generates PAGE tables from ELF data.
     ///
     /// Note: This creates empty PAGE tables since no ELF is provided.
+    #[cfg(feature = "prove")]
     pub fn from_logs(
         logs: &[Log],
         instructions: U64HashMap<Instruction>,
diff --git a/prover/src/test_utils.rs b/prover/src/test_utils.rs
index fd9d9d40c..252920e25 100644
--- a/prover/src/test_utils.rs
+++ b/prover/src/test_utils.rs
@@ -10,13 +10,24 @@
 //! - Minimal trace generation for testing
 //! - AIR creation helpers
 
+use alloc::boxed::Box;
+use alloc::format;
+use alloc::vec;
+use alloc::vec::Vec;
+
+#[cfg(feature = "prove")]
 use std::path::PathBuf;
 
 use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
+#[cfg(feature = "prove")]
 use executor::elf::Elf;
+#[cfg(feature = "prove")]
 use executor::vm::execution::Executor;
+#[cfg(feature = "prove")]
 use executor::vm::instruction::decoding::Instruction;
+#[cfg(feature = "prove")]
 use executor::vm::logs::Log;
+#[cfg(feature = "prove")]
 use executor::vm::memory::U64HashMap;
 use math::field::element::FieldElement;
 use stark::constraints::transition::{TransitionConstraint, TransitionConstraintEvaluator};
@@ -209,6 +220,7 @@ pub fn is_halfword_sender_columns(interactions: &[BusInteraction]) -> Vec<usize>
 // =============================================================================
 
 /// Returns the raw ELF bytes for an assembly test program.
+#[cfg(feature = "prove")]
 pub fn asm_elf_bytes(name: &str) -> Vec<u8> {
     let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
     let workspace_root = manifest_dir
@@ -227,6 +239,7 @@ pub fn asm_elf_bytes(name: &str) -> Vec<u8> {
 /// Helper to run an ELF from the program_artifacts directory.
 ///
 /// Returns the ELF, execution logs, and instruction map.
+#[cfg(feature = "prove")]
 pub fn run_asm_elf(name: &str) -> (Elf, Vec<Log>, U64HashMap<Instruction>) {
     let elf_data = asm_elf_bytes(name);
     let elf = Elf::load(&elf_data).expect("Failed to load ELF");
@@ -240,6 +253,7 @@ pub fn run_asm_elf(name: &str) -> (Elf, Vec<Log>, U64HashMap<Instruction>) {
 // =============================================================================
 
 /// Collect bitwise lookups from executor logs for minimal table generation.
+#[cfg(feature = "prove")]
 pub fn collect_bitwise_ops_from_logs(
     logs: &[Log],
     instructions: &U64HashMap<Instruction>,
@@ -258,10 +272,12 @@ pub fn collect_bitwise_ops_from_logs(
 ///
 /// For each instruction that triggers an SLT or BLT operation, creates an LtOperation
 /// with the arg1, arg2, and signed values.
+#[cfg(feature = "prove")]
 pub fn collect_lt_lookups_from_logs(
     logs: &[Log],
     instructions: &U64HashMap<Instruction>,
 ) -> Vec<LtOperation> {
+    #[cfg(feature = "prove")]
     use executor::vm::instruction::decoding::{ArithOp, Comparison};
 
     let mut lookups = Vec::new();
@@ -357,10 +373,12 @@ pub fn collect_lt_lookups_from_logs(
 /// Collect LOAD operations from executor logs.
 ///
 /// Creates LoadOperation objects for each Load instruction in the logs.
+#[cfg(feature = "prove")]
 pub fn collect_load_ops_from_logs(
     logs: &[Log],
     instructions: &U64HashMap<Instruction>,
 ) -> Vec<crate::tables::load::LoadOperation> {
+    #[cfg(feature = "prove")]
     use executor::vm::instruction::decoding::LoadStoreWidth;
 
     let mut load_ops = Vec::new();
@@ -423,6 +441,7 @@ pub fn collect_load_ops_from_logs(
 /// The LT table sends:
 /// - MSB16 lookups (×2 per row: for lhs_msb and rhs_msb)
 /// - IS_HALFWORD lookups (×6 per row: ×4 for lhs_sub_rhs, ×1 for lhs[1], ×1 for rhs[1])
+#[cfg(feature = "prove")]
 pub fn collect_bitwise_ops_from_lt(lt_ops: &[LtOperation]) -> Vec<BitwiseOperation> {
     let mut lookups = Vec::new();
 
@@ -481,6 +500,7 @@ pub fn collect_bitwise_ops_from_lt(lt_ops: &[LtOperation]) -> Vec<BitwiseOperati
 /// - read2: MSB8[res[1]] -> sign_bit
 /// - read4: MSB8[res[3]] -> sign_bit
 /// - read8: no MSB8 lookup (all 8 bytes are used)
+#[cfg(feature = "prove")]
 pub fn collect_bitwise_ops_from_load(
     load_ops: &[crate::tables::load::LoadOperation],
 ) -> Vec<BitwiseOperation> {
@@ -500,7 +520,9 @@ pub fn collect_bitwise_ops_from_load(
 ///
 /// **WARNING: FOR TESTING/BENCHMARKING ONLY - NOT PRODUCTION SAFE!**
 /// The verifier expects the full deterministic 2^20 row public table.
+#[cfg(feature = "prove")]
 pub fn generate_minimal_bitwise_trace(ops: &[BitwiseOperation]) -> TraceTable<F, E> {
+    #[cfg(feature = "prove")]
     use std::collections::HashMap;
 
     // Collect unique (lo_byte, hi_byte, shift) tuples and count multiplicities per lookup type
diff --git a/prover/src/tests/bitwise_bus_tests.rs b/prover/src/tests/bitwise_bus_tests.rs
index fd3b55cba..1a6a356a1 100644
--- a/prover/src/tests/bitwise_bus_tests.rs
+++ b/prover/src/tests/bitwise_bus_tests.rs
@@ -4,6 +4,7 @@
 //! - Completeness: Valid lookups to BITWISE are accepted
 //! - Soundness: Invalid lookups to BITWISE are rejected
 
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 
 use crypto::fiat_shamir::default_transcript::DefaultTranscript;
diff --git a/prover/src/tests/branch_bus_tests.rs b/prover/src/tests/branch_bus_tests.rs
index 636f6dd34..52e71c693 100644
--- a/prover/src/tests/branch_bus_tests.rs
+++ b/prover/src/tests/branch_bus_tests.rs
@@ -6,6 +6,7 @@
 //! - Padding: Auto-padding to power of 2 works correctly
 //! - Border cases: Edge values (0, MAX, signed boundaries) work
 
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 
 use crypto::fiat_shamir::default_transcript::DefaultTranscript;
diff --git a/prover/src/tests/decode_tests.rs b/prover/src/tests/decode_tests.rs
index 43e6991cf..229ff58b9 100644
--- a/prover/src/tests/decode_tests.rs
+++ b/prover/src/tests/decode_tests.rs
@@ -11,8 +11,11 @@ use crate::tables::types::DecodeEntry;
 use crate::test_utils::asm_elf_bytes;
 use crate::{prove, verify_with_options};
 
+#[cfg(feature = "prove")]
 use executor::elf::Elf;
+#[cfg(feature = "prove")]
 use executor::vm::instruction::decoding::{ArithOp, Comparison, Instruction, LoadStoreWidth};
+#[cfg(feature = "prove")]
 use executor::vm::memory::U64HashMap;
 use stark::proof::options::GoldilocksCubicProofOptions;
 
diff --git a/prover/src/tests/lt_bus_tests.rs b/prover/src/tests/lt_bus_tests.rs
index b6148cfdc..b41b9aab3 100644
--- a/prover/src/tests/lt_bus_tests.rs
+++ b/prover/src/tests/lt_bus_tests.rs
@@ -6,6 +6,7 @@
 //! - Padding: Auto-padding to power of 2 works correctly
 //! - Border cases: Edge values (0, MAX, signed boundaries) work
 
+#[cfg(feature = "prove")]
 use std::collections::HashMap;
 
 use crypto::fiat_shamir::default_transcript::DefaultTranscript;
diff --git a/prover/src/tests/mod.rs b/prover/src/tests/mod.rs
index 4d0ac4477..b253dd543 100644
--- a/prover/src/tests/mod.rs
+++ b/prover/src/tests/mod.rs
@@ -59,6 +59,8 @@ pub mod page_tests;
 #[cfg(test)]
 pub mod prove_elfs_tests;
 #[cfg(test)]
+pub mod recursion_smoke_test;
+#[cfg(test)]
 pub mod register_tests;
 #[cfg(test)]
 pub mod shift_tests;
@@ -74,3 +76,5 @@ pub mod templates_tests;
 pub mod trace_builder_tests;
 #[cfg(test)]
 pub mod trace_test_helpers;
+#[cfg(test)]
+pub mod vkey_tests;
diff --git a/prover/src/tests/prove_elfs_tests.rs b/prover/src/tests/prove_elfs_tests.rs
index a52383341..e0751d3e4 100644
--- a/prover/src/tests/prove_elfs_tests.rs
+++ b/prover/src/tests/prove_elfs_tests.rs
@@ -26,6 +26,7 @@ use crate::tables::MaxRowsConfig;
 use crate::tables::trace_builder::Traces;
 use crate::tables::types::{GoldilocksExtension, GoldilocksField};
 
+#[cfg(feature = "prove")]
 use executor::elf::Elf;
 use executor::vm::execution::Executor;
 
@@ -1440,6 +1441,7 @@ fn test_prove_elfs_all_instructions_64_full() {
 fn test_debug_memory_bus_tokens() {
     use crate::tables::memw::cols as memw_cols;
     use crate::tables::register::cols as reg_cols;
+    #[cfg(feature = "prove")]
     use std::collections::HashMap;
 
     let (_elf, logs, instructions) = run_asm_elf("sub_neg_result");
@@ -1705,6 +1707,7 @@ fn test_debug_memory_tokens_sb_sh() {
     use crate::tables::memw::cols as memw_cols;
     use crate::tables::page::cols as page_cols;
     use crate::tables::register::cols as reg_cols;
+    #[cfg(feature = "prove")]
     use std::collections::HashMap;
 
     let (elf, logs, _instructions) = run_asm_elf("test_sb_sh_8");
diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
new file mode 100644
index 000000000..06f66d3f8
--- /dev/null
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -0,0 +1,1342 @@
+//! End-to-end naive recursion pipeline smoke tests.
+//!
+//! Each test:
+//! 1. Proves an inner program on the host.
+//! 2. Serializes `(VmProof, inner_elf, ProofOptions, VmVerifyingKey)` with postcard.
+//! 3. Hands that as private input to the recursion guest.
+//! 4. Proves the recursion guest's execution.
+//! 5. Verifies the outer proof.
+//!
+//! The ELFs are built on demand by `bench_vs/build_recursion_elfs.sh`.
+//!
+//! Tests are `#[ignore]`d because the outer proof runs the full STARK verifier
+//! inside the VM (minutes per run, large memory footprint).
+
+use std::path::PathBuf;
+use std::process::Command;
+
+/// Workspace root, i.e. the parent of this crate's `CARGO_MANIFEST_DIR`.
+fn workspace_root() -> PathBuf {
+    let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
+    let parent = manifest_dir.parent().expect("workspace root");
+    parent.to_path_buf()
+}
+
+/// Runs the recursion ELF build script with bash; panics on spawn failure or a failed exit.
+fn build_elfs(root: &std::path::Path) {
+    let script = root.join("bench_vs/build_recursion_elfs.sh");
+    let spawned = Command::new("bash").arg(script).status();
+    let status = spawned.expect("failed to spawn build helper");
+    assert!(status.success(), "ELF build script failed");
+}
+
+/// Location of the release guest binary `bin_name` inside the bench_vs/lambda/<name>/ build tree.
+fn guest_elf_path(root: &std::path::Path, name: &str, bin_name: &str) -> PathBuf {
+    let relative =
+        format!("bench_vs/lambda/{name}/target/riscv64im-lambda-vm-elf/release/{bin_name}");
+    root.join(relative)
+}
+
+/// Load the bytes of the guest ELF that `guest_elf_path` locates; panics if it is unreadable.
+fn read_guest_elf(root: &std::path::Path, name: &str, bin_name: &str) -> Vec<u8> {
+    let file = guest_elf_path(root, name, bin_name);
+    std::fs::read(&file).unwrap_or_else(|e| panic!("failed to read {}: {e}", file.display()))
+}
+
+/// Map a guest PC to `function (file:line)` via the ELF's DWARF info — the
+/// same mapping `addr2line -e <elf> -f -i -C 0x<pc>` produces. Only the first
+/// frame the lookup yields (the innermost, most-inlined one) is reported.
+///
+/// Falls back to the demangled symbol-table name when no frame is produced,
+/// and to `<unknown>` when `find_frames` errors or no symbol matches.
+fn resolve_pc(loader: &addr2line::Loader, pc: u64) -> String {
+    let Ok(mut frame_iter) = loader.find_frames(pc) else {
+        return "<unknown>".to_string();
+    };
+    let Ok(Some(innermost)) = frame_iter.next() else {
+        // No DWARF frame; fall back to the symbol table.
+        return match loader.find_symbol(pc) {
+            Some(sym) => addr2line::demangle_auto(sym.into(), None).into_owned(),
+            None => "<unknown>".to_string(),
+        };
+    };
+    // A frame without a demanglable function name still reports its location.
+    let func = match &innermost.function {
+        Some(f) => f.demangle().ok().map(|n| n.into_owned()),
+        None => None,
+    }
+    .unwrap_or_else(|| "<unknown fn>".to_string());
+    // The location suffix needs a file; the line number is optional.
+    let loc = match &innermost.location {
+        Some(l) => match (l.file, l.line) {
+            (Some(file), Some(line)) => format!(" ({file}:{line})"),
+            (Some(file), None) => format!(" ({file})"),
+            (None, _) => String::new(),
+        },
+        None => String::new(),
+    };
+    format!("{func}{loc}")
+}
+
+/// Print a PC histogram as two tables: a per-function summary (the cycles each
+/// resolved function accounts for, folded over all its PCs) followed by the
+/// top-100 per-address detail. `pc_hist` maps program counter → cycle count;
+/// rows sort by descending cycles, with ties broken by PC / function name.
+/// The per-function view is the one that matters: an inlined kernel is spread
+/// across dozens of PCs, so the raw per-address table scatters its true cost.
+fn print_pc_histogram(
+    title: &str,
+    loader: &addr2line::Loader,
+    pc_hist: std::collections::HashMap<u64, u64>,
+    total_cycles: u64,
+    exec_time: std::time::Duration,
+) {
+    let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
+    // HashMap iteration order is arbitrary: break ties by PC for reproducible output.
+    entries.sort_unstable_by_key(|&(pc, count)| (std::cmp::Reverse(count), pc));
+
+    // Aggregate the full histogram by resolved function, resolving each PC once.
+    let mut by_function = std::collections::HashMap::<String, (u64, u64)>::new();
+    for (pc, count) in &entries {
+        let entry = by_function.entry(resolve_pc(loader, *pc)).or_insert((0, 0));
+        entry.0 += *count; // cycles
+        entry.1 += 1; // distinct PCs folded into this function
+    }
+    let mut fn_entries: Vec<(String, (u64, u64))> = by_function.into_iter().collect();
+    // Same reason: equal cycle counts fall back to ordering by function name.
+    fn_entries.sort_unstable_by(|a, b| b.1.0.cmp(&a.1.0).then_with(|| a.0.cmp(&b.0)));
+
+    let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64);
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  {title}");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Unique PCs   : {}", entries.len());
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Top 25 functions by cycle count (aggregated over their PCs):");
+    eprintln!(
+        "  {:>4}  {:>14}  {:>7}  {:>7}  {:>5}  {}",
+        "rank", "cycles", "%", "cum %", "PCs", "function (file:line)"
+    );
+    let mut fn_cumulative: u64 = 0;
+    for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() {
+        fn_cumulative += cycles;
+        eprintln!(
+            "  {:>4}  {:>14}  {:>6.2}%  {:>6.2}%  {:>5}  {}",
+            rank + 1,
+            cycles,
+            pct(*cycles),
+            pct(fn_cumulative),
+            pcs,
+            name,
+        );
+    }
+    eprintln!();
+    eprintln!("  Top 100 PCs by cycle count (per-address detail):");
+    eprintln!(
+        "  {:>4}  {:>18}  {:>14}  {:>7}  {:>7}  {}",
+        "rank", "pc", "cycles", "%", "cum %", "function (file:line)"
+    );
+    let mut cumulative: u64 = 0;
+    for (rank, (pc, count)) in entries.iter().take(100).enumerate() {
+        cumulative += count;
+        eprintln!(
+            "  {:>4}  {:#018x}  {:>14}  {:>6.2}%  {:>6.2}%  {}",
+            rank + 1,
+            pc,
+            count,
+            pct(*count),
+            pct(cumulative),
+            resolve_pc(loader, *pc),
+        );
+    }
+    eprintln!("============================================================");
+}
+
+/// Full recursion round-trip. Proves the inner program with the supplied
+/// options and checks that proof on the host, packs proof + ELF + options +
+/// verifying key into the recursion guest's input, then proves and verifies it.
+fn run_recursion_pipeline_with_options(
+    label: &str,
+    inner_elf_bytes: &[u8],
+    inner_private_input: &[u8],
+    inner_proof_options: stark::proof::options::ProofOptions,
+) {
+    use crate::tables::trace_builder::Traces;
+    use executor::constants::MAX_PRIVATE_INPUT_SIZE;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    // Inner proof: generate it, then make sure the host verifier accepts it.
+    eprintln!(
+        "[{label}] proving inner (blowup={}, fri_queries={}) ...",
+        inner_proof_options.blowup_factor, inner_proof_options.fri_number_of_queries
+    );
+    let max_rows = crate::MaxRowsConfig::default();
+    let inner_proof = crate::prove_with_options_and_inputs(
+        inner_elf_bytes,
+        inner_private_input,
+        &inner_proof_options,
+        &max_rows,
+    )
+    .expect("inner prove should succeed");
+    eprintln!("[{label}] inner proof generated");
+    let inner_ok =
+        crate::verify_with_options(&inner_proof, inner_elf_bytes, &inner_proof_options, None, None)
+            .expect("inner verify errored");
+    assert!(inner_ok, "inner proof must verify on host");
+
+    // Guest input: the verifying key is built from the inner ELF, the options
+    // and the page configs derived from the proof; then all is postcard-encoded.
+    let elf = executor::elf::Elf::load(inner_elf_bytes).expect("ELF load failed");
+    let page_configs = Traces::page_configs_from_elf_and_runtime(
+        &elf,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey =
+        crate::VmVerifyingKey::from_elf_and_options(&elf, &inner_proof_options, &page_configs);
+    let guest_input = (&inner_proof, &inner_elf_bytes, &inner_proof_options, &vkey);
+    let blob = postcard::to_allocvec(&guest_input).expect("postcard encode failed");
+    eprintln!(
+        "[{label}] postcard blob: {} bytes (limit: MAX_PRIVATE_INPUT_SIZE)",
+        blob.len()
+    );
+    assert!(
+        blob.len() < MAX_PRIVATE_INPUT_SIZE as usize,
+        "recursion input exceeds MAX_PRIVATE_INPUT_SIZE"
+    );
+
+    // Outer proof: run the recursion guest over the blob and verify the result.
+    eprintln!("[{label}] proving outer (recursion guest) ...");
+    let outer_result = crate::prove_with_inputs(&recursion_elf_bytes, &blob);
+    let outer_proof = outer_result.expect("outer prove should succeed");
+    eprintln!("[{label}] outer proof generated");
+    let outer_ok = crate::verify(&outer_proof, &recursion_elf_bytes).expect("outer verify errored");
+    assert!(outer_ok, "outer proof must verify on host");
+
+    // Expected public output is the single byte 1, the guest's success marker.
+    let success_marker = vec![1u8];
+    assert_eq!(
+        outer_proof.public_output, success_marker,
+        "guest should commit success marker"
+    );
+}
+
+/// Runs the recursion pipeline with inner-proof options from `with_blowup(8)`,
+/// the setting the existing smoke tests use to keep outer-prove memory tractable.
+fn run_recursion_pipeline(label: &str, inner_elf_bytes: &[u8], inner_private_input: &[u8]) {
+    use stark::proof::options::GoldilocksCubicProofOptions;
+
+    // Blowup 8 is expected to be accepted by `with_blowup`, so a rejection
+    // here is treated as a bug and panics.
+    let blowup_8 = GoldilocksCubicProofOptions::with_blowup(8);
+    let options = blowup_8.expect("blowup=8 is always valid");
+
+    run_recursion_pipeline_with_options(label, inner_elf_bytes, inner_private_input, options);
+}
+
+/// Recursion over the empty inner program (it halts immediately). With no
+/// inner workload, this measures the lambda-vm verifier's intrinsic recursion
+/// overhead: the cost of verifying the smallest possible lambda-vm proof.
+#[test]
+#[ignore = "slow: runs the full STARK verifier inside the VM"]
+fn test_recursion_smoke_empty() {
+    let workspace = workspace_root();
+    build_elfs(&workspace);
+    let inner_elf = read_guest_elf(&workspace, "empty", "empty-bench");
+    run_recursion_pipeline("recursion-empty", inner_elf.as_slice(), &[]);
+}
+
+/// Capacity probe: the empty inner program proved with the absolute-minimum
+/// FRI parameters (blowup=2, **fri_number_of_queries=1**). Answers "can the
+/// pipeline even run end-to-end on a 125 GB box"; security is intentionally
+/// terrible, so never treat these parameters as anything but a probe.
+#[test]
+#[ignore = "slow: runs the full STARK verifier inside the VM"]
+fn test_recursion_smoke_1query() {
+    use stark::proof::options::ProofOptions;
+
+    let workspace = workspace_root();
+    build_elfs(&workspace);
+    let inner_elf = read_guest_elf(&workspace, "empty", "empty-bench");
+
+    // ProofOptions is spelled out field by field so fri_number_of_queries can
+    // be pinned to 1; GoldilocksCubicProofOptions::with_blowup would derive the
+    // query count from a 128-bit security target, far more than wanted here.
+    let single_query = ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    // The inner program is given no private input.
+    let no_private_input: &[u8] = &[];
+    let label = "recursion-1query";
+    run_recursion_pipeline_with_options(label, &inner_elf, no_private_input, single_query);
+}
+
+/// Diagnostic: prove the empty inner program and write the recursion guest's
+/// private-input blob to `/tmp/recursion_input.bin`, where the CLI's
+/// `execute --flamegraph` can consume it.
+///
+/// Usage after running this test:
+/// ```text
+/// cargo run -p cli --release -- execute \
+///     bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \
+///     --private-input /tmp/recursion_input.bin \
+///     --flamegraph /tmp/recursion_folded.txt
+/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg
+/// ```
+#[test]
+#[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"]
+fn test_dump_recursion_input() {
+    use crate::tables::trace_builder::Traces;
+    use stark::proof::options::ProofOptions;
+
+    let workspace = workspace_root();
+    build_elfs(&workspace);
+    let inner_elf_bytes = read_guest_elf(&workspace, "empty", "empty-bench");
+
+    // Deliberately tiny FRI parameters (blowup 2, a single query).
+    let options = ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[dump-input] proving inner ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &inner_elf_bytes,
+        &[],
+        &options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    // Assemble (proof, ELF bytes, options, verifying key) and postcard-encode it.
+    let elf = executor::elf::Elf::load(&inner_elf_bytes).expect("ELF load failed");
+    let page_configs = Traces::page_configs_from_elf_and_runtime(
+        &elf,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(&elf, &options, &page_configs);
+    let guest_input = (&inner_proof, &inner_elf_bytes, &options, &vkey);
+    let blob = postcard::to_allocvec(&guest_input).expect("postcard encode failed");
+
+    let path = "/tmp/recursion_input.bin";
+    std::fs::write(path, &blob).expect("write blob");
+    eprintln!("[dump-input] wrote {} bytes to {path}", blob.len());
+}
+
+/// Diagnostic: build the inner proof + recursion guest input, then **execute
+/// only** the recursion guest (no STARK proving) and report the cycle count
+/// plus rough trace-size estimates derived from it.
+///
+/// This is the cheap way to find out how many RISC-V instructions the
+/// verifier actually executes inside the guest — a much faster signal than
+/// running the full outer prove (which can OOM on a 125 GB machine).
+#[test]
+#[ignore = "diagnostic: runs the executor only, prints cycle counts"]
+fn test_recursion_cycle_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    // Build the inner proof exactly as the smoke test does, with the
+    // absolute-minimum FRI params so the inner is as small as possible.
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[cycle-count] proving inner (empty, blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[cycle-count] postcard blob: {} bytes", blob.len());
+
+    // Execute (NOT prove) the recursion guest. Use `resume()` in a loop and
+    // only count chunk sizes — never accumulate logs in memory. This avoids
+    // the Vec<Log> blow-up that OOMs even a 125 GB server (one Log is 40 B;
+    // a few billion of them is hundreds of GB).
+    eprintln!("[cycle-count] executing recursion guest (streaming counter only) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let start = std::time::Instant::now();
+    let mut cycle_count: usize = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        cycle_count += logs.len();
+        chunks += 1;
+        if chunks.is_multiple_of(50) {
+            eprintln!(
+                "[cycle-count]   ... {chunks} chunks, {cycle_count} cycles, {:?} elapsed",
+                start.elapsed()
+            );
+        }
+    }
+    let exec_time = start.elapsed();
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST EXECUTION SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Cycle count           : {cycle_count}");
+    eprintln!("  Executor wall time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Rough memory estimate for outer prove:");
+    let bytes_per_field = 8usize;
+    let approx_columns = 250usize; // CPU + MEMW + DECODE + bus columns combined
+    let main_trace_bytes = cycle_count * approx_columns * bytes_per_field;
+    let blowup = 2usize;
+    let lde_main_bytes = main_trace_bytes * blowup;
+    eprintln!(
+        "    main trace            : ~{:.2} GB ({} cycles × ~{} cols × 8 B)",
+        main_trace_bytes as f64 / 1e9,
+        cycle_count,
+        approx_columns
+    );
+    eprintln!(
+        "    main LDE (blowup={})   : ~{:.2} GB",
+        blowup,
+        lde_main_bytes as f64 / 1e9
+    );
+    eprintln!("  (aux trace adds roughly 50% more, so peak ≈ 2-3× LDE)");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: count the distinct 4 KB memory pages the recursion guest
+/// touches when verifying a small inner proof.
+///
+/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page
+/// PAGE-table overhead. The number of PAGE tables the prover would build
+/// equals the number of distinct 4 KB pages the executor touches — code,
+/// heap, private input, and stack. This test surfaces that count without
+/// running the prover.
+///
+/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`):
+/// - Code/static: whatever PT_LOAD segments the recursion ELF carries.
+/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters
+///   allocations across this region.
+/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`.
+/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`).
+///
+/// Interpretation (rough):
+/// - <1,000 pages: PAGE-table overhead is not the bottleneck.
+/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator
+///   and re-measure.
+/// - >100k pages: postcard decode dominates; consider streaming decode.
+#[test]
+#[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"]
+fn test_recursion_page_count() {
+    use executor::constants::PRIVATE_INPUT_START_INDEX;
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashSet;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[page-count] proving inner (empty, blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[page-count] postcard blob: {} bytes", blob.len());
+
+    // Precompute the recursion ELF's PT_LOAD ranges (each `values` entry is
+    // counted as one 4-byte word) to bucket code/static pages apart from
+    // heap. `Elf::load` already expands BSS (memsz > filesz) into zero-valued
+    // words, so these ranges cover .text + .rodata + .data + .bss.
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let segment_ranges: Vec<(u64, u64)> = program
+        .data
+        .iter()
+        .map(|seg| (seg.base_addr, seg.base_addr + (seg.values.len() as u64 * 4)))
+        .collect();
+    eprintln!(
+        "[page-count] recursion ELF: {} PT_LOAD segment(s)",
+        segment_ranges.len(),
+    );
+    for (i, (lo, hi)) in segment_ranges.iter().enumerate() {
+        eprintln!(
+            "[page-count]   segment[{i}]: 0x{lo:016x} .. 0x{hi:016x} ({} bytes)",
+            hi - lo,
+        );
+    }
+
+    // Stream through execution — running to completion via `Executor::run`
+    // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only
+    // care about the *final* memory state.
+    eprintln!("[page-count] executing recursion guest (streaming) ...");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let start = std::time::Instant::now();
+    let mut chunks: usize = 0;
+    let mut total_cycles: u64 = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        total_cycles += logs.len() as u64;
+        chunks += 1;
+        if chunks.is_multiple_of(50) {
+            eprintln!(
+                "[page-count]   ... {chunks} chunks, {total_cycles} cycles, {:?} elapsed",
+                start.elapsed()
+            );
+        }
+    }
+    let exec_time = start.elapsed();
+
+    // Collect the set of distinct 4 KB pages from every cell touched during
+    // (a) program loading, (b) private-input loading, (c) execution.
+    const PAGE_MASK: u64 = !0xFFFu64;
+    let cells = executor.memory().cells();
+    let total_cells = cells.len();
+    let pages: HashSet<u64> = cells.keys().map(|&a| a & PAGE_MASK).collect();
+
+    // Bucket by region; the branch order below matters. Any page overlapping
+    // a PT_LOAD segment counts as "code/static". Stack is tested before
+    // private input because stack pages (top of the 64-bit space) also lie
+    // above the private-input base, which sits above the 3 GB heap ceiling.
+    const HEAP_CEILING: u64 = 0xC000_0000;
+    const STACK_FLOOR: u64 = 0xFFFF_FFFF_0000_0000;
+
+    let mut code_pages = 0usize;
+    let mut heap_pages = 0usize;
+    let mut private_input_pages = 0usize;
+    let mut stack_pages = 0usize;
+    let mut other_pages = 0usize;
+
+    for &page in &pages {
+        let page_end = page.saturating_add(0x1000);
+        let in_code = segment_ranges
+            .iter()
+            .any(|&(lo, hi)| page < hi && lo < page_end);
+        if in_code {
+            code_pages += 1;
+        } else if page >= STACK_FLOOR {
+            stack_pages += 1;
+        } else if page >= PRIVATE_INPUT_START_INDEX {
+            private_input_pages += 1;
+        } else if page < HEAP_CEILING {
+            heap_pages += 1;
+        } else {
+            other_pages += 1;
+        }
+    }
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST PAGE-COUNT SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles                  : {total_cycles}");
+    eprintln!("  Executor wall time            : {exec_time:?}");
+    eprintln!("  Memory cells touched (4 B ea) : {total_cells}");
+    eprintln!("  Distinct 4 KB pages touched   : {}", pages.len());
+    eprintln!();
+    eprintln!("  Pages per region:");
+    eprintln!("    code/static (ELF segments)     : {code_pages}");
+    eprintln!("    heap (0..0xC000_0000)          : {heap_pages}");
+    eprintln!("    private input (0xFF000000..)   : {private_input_pages}");
+    eprintln!("    stack (>= 0xFFFFFFFF_00000000) : {stack_pages}");
+    if other_pages > 0 {
+        eprintln!("    other (unclassified)           : {other_pages}");
+    }
+    eprintln!();
+    eprintln!("  Interpretation (PAGE-table overhead):");
+    eprintln!("    <1k pages     → PAGE overhead is not the bottleneck.");
+    eprintln!("    10k-100k      → TLSF heap fragmentation; try a bump alloc.");
+    eprintln!("    >100k         → postcard decode dominates; stream-decode?");
+    eprintln!("============================================================");
+}
+
+/// Build a PC histogram of the recursion guest verifying an `empty`-program
+/// inner proof produced with `inner_proof_options`, and print it via
+/// [`print_pc_histogram`] under `title`.
+///
+/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is
+/// derived from blowup for a fixed security target), so each `#[test]` below is
+/// just this runner with a different `ProofOptions` — a single query at low
+/// blowup, vs. the security-derived multi-query count at a higher blowup.
+///
+/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to
+/// the histogram itself. An `addr2line::Loader` over the recursion ELF (DWARF
+/// read directly — no external tool needed) is handed to `print_pc_histogram`
+/// for symbol resolution. Panics via `expect` if any setup or run step fails.
+fn run_recursion_pc_histogram(title: &str, inner_proof_options: stark::proof::options::ProofOptions) {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashMap;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_path = guest_elf_path(&root, "recursion", "recursion-bench");
+    let recursion_elf_bytes =
+        std::fs::read(&recursion_elf_path).expect("failed to read recursion ELF");
+
+    eprintln!(
+        "[pc-hist] proving inner (empty, blowup={}, fri_queries={}) ...",
+        inner_proof_options.blowup_factor, inner_proof_options.fri_number_of_queries
+    );
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = executor::elf::Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[pc-hist] postcard blob: {} bytes", blob.len());
+
+    eprintln!("[pc-hist] executing recursion guest (building PC histogram) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let start = std::time::Instant::now();
+    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(300_000);
+    let mut total_cycles: u64 = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        for log in logs {
+            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
+        }
+        total_cycles += logs.len() as u64;
+        chunks += 1;
+        if chunks.is_multiple_of(500) {
+            eprintln!(
+                "[pc-hist]   ... {chunks} chunks, {total_cycles} cycles, {} unique PCs, {:?}",
+                pc_hist.len(),
+                start.elapsed()
+            );
+        }
+    }
+    let exec_time = start.elapsed();
+
+    // Build the DWARF loader from the ELF on disk; PC-to-function lookup is
+    // the same mapping `addr2line -e <elf> -f -i -C 0x<pc>` produces.
+    let loader = addr2line::Loader::new(&recursion_elf_path)
+        .expect("failed to load recursion ELF for addr2line");
+    print_pc_histogram(title, &loader, pc_hist, total_cycles, exec_time);
+}
+
+/// Diagnostic: recursion-guest PC histogram, **single** FRI query at blowup=2.
+/// Cheapest run; setup (decode, allocator, postcard) dominates per-query work.
+#[test]
+#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"]
+fn test_recursion_pc_histogram() {
+    let title = "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)";
+    let single_query_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+    run_recursion_pc_histogram(title, single_query_options);
+}
+
+/// Diagnostic: PC histogram of the recursion guest at **128-bit security**:
+/// blowup=8 with the FRI query count derived by the Johnson Bound Regime (tens
+/// of queries). Relative to the single-query runs, weight shifts toward the
+/// verifier's per-query FRI-layer / Merkle-opening and field arithmetic.
+#[test]
+#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"]
+fn test_recursion_pc_histogram_multiquery() {
+    // The query count is chosen by the options helper; read it back for the title.
+    let multi_query_options = crate::GoldilocksCubicProofOptions::with_blowup(8)
+        .expect("blowup=8 is always valid");
+    let query_count = multi_query_options.fri_number_of_queries;
+    let title = format!(
+        "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)",
+        query_count
+    );
+    run_recursion_pc_histogram(&title, multi_query_options);
+}
+
+/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest.
+///
+/// Like `test_recursion_pc_histogram` but groups by full call stack (not PC).
+/// Only every `SAMPLE_RATE`-th log of each chunk is passed to `process_logs`;
+/// skipped logs never update the call stack, so rates > 1 can desync it.
+///
+/// Output is written to `/tmp/recursion_folded_sampled.txt` in
+/// inferno-flamegraph "folded stacks" format. Pipe it through:
+///
+///     cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg
+///
+/// As configured (SAMPLE_RATE=1, CYCLE_BUDGET=5M) expect roughly 5 minutes.
+#[test]
+#[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"]
+fn test_recursion_sampled_flamegraph() {
+    use executor::elf::Elf;
+    use executor::flamegraph::FlamegraphGenerator;
+    use executor::vm::execution::Executor;
+    use std::io::BufWriter;
+
+    /// 1 in N logs is fed to `process_logs`, which both updates the call
+    /// stack and records a sample. At 1, every cycle goes through — the call
+    /// stack stays exactly in sync with execution so frame widths are
+    /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles
+    /// we can cover within a wall-clock budget.
+    ///
+    /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle
+    /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect
+    /// we saw at 1:1000. Use values > 1 only when stack accuracy is
+    /// expendable.
+    const SAMPLE_RATE: usize = 1;
+
+    /// Stop the executor early once we've covered this many cycles.
+    /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1).
+    /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time.
+    const CYCLE_BUDGET: u64 = 5_000_000;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[sampled-fg] proving inner (empty, blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = executor::elf::Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[sampled-fg] postcard blob: {} bytes", blob.len());
+
+    eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",);
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
+    let entry_point = program.entry_point;
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let mut generator = FlamegraphGenerator::new(symbols, entry_point);
+
+    // Path is defined here (not after the loop) so the periodic checkpoint
+    // writes below can target it. The final write at the end still happens.
+    let path = "/tmp/recursion_folded_sampled.txt";
+
+    let start = std::time::Instant::now();
+    let mut total_cycles: u64 = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        // Pull the chunk into an owned Vec so we can use it after dropping the
+        // immutable borrow of `executor`.
+        let (sampled, chunk_len) = {
+            let len = logs.len();
+            // When SAMPLE_RATE == 1, this is the identity filter — `_ % 1 == 0`
+            // is trivially true. clippy::modulo_one is fired so we suppress it
+            // here; the generality of the filter is the point (lets us flip
+            // SAMPLE_RATE without touching the loop body).
+            #[allow(clippy::modulo_one)]
+            let sampled: Vec<_> = logs
+                .iter()
+                .enumerate()
+                .filter(|(i, _)| i % SAMPLE_RATE == 0)
+                .map(|(_, log)| log.clone())
+                .collect();
+            (sampled, len)
+        };
+
+        // Now we can re-borrow executor.instructions immutably for the
+        // flamegraph generator. We feed it only the sampled subset (every Nth
+        // log of this chunk). For SAMPLE_RATE > 1 THIS LOSES STACK ACCURACY on
+        // the skipped logs but is fast; with SAMPLE_RATE == 1 nothing is
+        // skipped and the stack stays exact.
+        generator
+            .process_logs(&sampled, &executor.instructions)
+            .expect("flamegraph process_logs");
+
+        total_cycles += chunk_len as u64;
+        chunks += 1;
+        if chunks.is_multiple_of(500) {
+            eprintln!(
+                "[sampled-fg]   ... {chunks} chunks, {total_cycles} cycles, {:?} elapsed",
+                start.elapsed()
+            );
+            // Checkpoint: re-write the folded file in place so a killed run
+            // still leaves a usable (if partial) flamegraph on disk.
+            let file = std::fs::File::create(path).expect("create output file");
+            let mut writer = BufWriter::new(file);
+            generator
+                .write_folded(&mut writer)
+                .expect("write folded output");
+        }
+
+        // Early exit once we've covered the cycle budget (checked once per
+        // chunk, so the run can overshoot by up to one chunk). The flamegraph
+        // reflects only the processed cycles; the dominant hot kernels are
+        // typically spread across the verifier's runtime, so a partial run
+        // still surfaces them. #[allow] lets CYCLE_BUDGET be const-0 (full).
+        #[allow(clippy::absurd_extreme_comparisons)]
+        if CYCLE_BUDGET > 0 && total_cycles >= CYCLE_BUDGET {
+            eprintln!("[sampled-fg] hit cycle budget ({CYCLE_BUDGET} cycles), stopping early");
+            break;
+        }
+    }
+    let exec_time = start.elapsed();
+
+    let file = std::fs::File::create(path).expect("create output file");
+    let mut writer = BufWriter::new(file);
+    generator
+        .write_folded(&mut writer)
+        .expect("write folded output");
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  SAMPLED FLAMEGRAPH SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Sample rate  : 1 in {SAMPLE_RATE}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!("  Output file  : {path}");
+    eprintln!("============================================================");
+    eprintln!();
+    eprintln!("  To render SVG (requires inferno):");
+    eprintln!("    cat {path} | inferno-flamegraph > /tmp/recursion_flamegraph_sampled.svg");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: host-side per-step timings for the verifier.
+///
+/// Proves the empty guest (blowup=2, 1 FRI query) and then verifies that proof
+/// on the host, asserting it is accepted. Guest ELFs are rebuilt only when
+/// `empty-bench` is missing. When built with `--features stark/instruments`,
+/// the verifier prints `Time spent: ...` for each of the four steps (replay
+/// challenges, composition polynomial, FRI, DEEP openings) plus the
+/// step-1-replay it does before step 2 — the host-side split, no VM involved.
+///
+/// Usage:
+/// ```text
+/// cargo test --release -p lambda-vm-prover --features stark/instruments \
+///   --lib test_host_verify_step_timings -- --ignored --nocapture
+/// ```
+#[test]
+#[ignore = "diagnostic: prints host-side verifier step timings"]
+fn test_host_verify_step_timings() {
+    let root = workspace_root();
+    let empty_path =
+        root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
+    if !empty_path.exists() {
+        build_elfs(&root);
+    }
+    let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench");
+
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    eprintln!("[host-verify] verifying on host (with instruments) ...");
+    let ok =
+        crate::verify_with_options(&inner_proof, &empty_elf_bytes, &inner_proof_options, None, None)
+            .expect("verify errored");
+    assert!(ok, "proof must verify");
+    eprintln!("[host-verify] verified OK");
+}
+
+/// Diagnostic: cycle count for the **deserialize-only** counterpart of the
+/// recursion guest. Same input layout
+/// (`(VmProof, Vec<u8>, ProofOptions, VmVerifyingKey)`) and same proof, but
+/// the guest just postcard-decodes the blob and halts — it never calls
+/// `verify_with_options`.
+///
+/// The cycle delta between this and `test_recursion_cycle_count` is the
+/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle
+/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that
+/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23%
+/// of total; this test re-measures it (the summary quotes no fixed baseline).
+#[test]
+#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"]
+fn test_deserialize_only_cycle_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only", "deserialize-only-bench");
+
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[deser-only] proving inner (empty, blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[deser-only] postcard blob: {} bytes", blob.len());
+
+    eprintln!("[deser-only] executing deserialize-only guest (streaming) ...");
+    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
+    eprintln!(
+        "[deser-only] ELF: {} bytes, entry_point=0x{:x}",
+        deser_elf_bytes.len(),
+        program.entry_point,
+    );
+    assert_ne!(
+        program.entry_point, 0,
+        "deserialize-only ELF has entry_point=0 — build artifact is malformed"
+    );
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let start = std::time::Instant::now();
+    let mut cycle_count: u64 = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        cycle_count += logs.len() as u64;
+        chunks += 1;
+        if chunks.is_multiple_of(50) {
+            eprintln!(
+                "[deser-only]   ... {chunks} chunks, {cycle_count} cycles, {:?} elapsed",
+                start.elapsed()
+            );
+        }
+    }
+    let exec_time = start.elapsed();
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  DESERIALIZE-ONLY GUEST EXECUTION SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Cycle count           : {cycle_count}");
+    eprintln!("  Executor wall time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Compare against test_recursion_cycle_count (run with the");
+    eprintln!("  same proof). Delta = verifier-in-VM cost.");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: PC histogram for the **deserialize-only** guest.
+///
+/// Sibling of `test_recursion_pc_histogram`, but targeting the
+/// deserialize-only control guest so we can locate the hot kernel inside the
+/// 15.7 M-cycle postcard decode itself. Every cycle goes through the
+/// histogram (no sampling), so attribution is exact — the previous sampled
+/// flamegraph at 1:1000 had broken stack reconstruction on skipped
+/// CALL/RETURNs, which made it unreliable for a workload this small.
+///
+/// The histogram is printed by `print_pc_histogram`, which receives an
+/// `addr2line::Loader` over the guest ELF's DWARF (no external tool needed).
+#[test]
+#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"]
+fn test_deserialize_only_pc_histogram() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashMap;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let deser_elf_path = guest_elf_path(&root, "deserialize-only", "deserialize-only-bench");
+    let deser_elf_bytes =
+        std::fs::read(&deser_elf_path).expect("failed to read deserialize-only ELF");
+
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[deser-pc-hist] proving inner (empty, blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[deser-pc-hist] postcard blob: {} bytes", blob.len());
+
+    eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ...");
+    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let start = std::time::Instant::now();
+    // Initial capacity only (the map still grows): ~50k unique PCs is plenty,
+    // since the deserialize-only guest is ~74 KB of ELF (~18k instructions).
+    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(50_000);
+    let mut total_cycles: u64 = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        for log in logs {
+            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
+        }
+        total_cycles += logs.len() as u64;
+        chunks += 1;
+        if chunks.is_multiple_of(50) {
+            eprintln!(
+                "[deser-pc-hist]   ... {chunks} chunks, {total_cycles} cycles, {} unique PCs, {:?}",
+                pc_hist.len(),
+                start.elapsed()
+            );
+        }
+    }
+    let exec_time = start.elapsed();
+
+    // Build the DWARF loader from the ELF on disk; PC-to-function lookup is
+    // the same mapping `addr2line -e <elf> -f -i -C 0x<pc>` produces.
+    let loader = addr2line::Loader::new(&deser_elf_path)
+        .expect("failed to load deserialize-only ELF for addr2line");
+    print_pc_histogram(
+        "DESERIALIZE-ONLY GUEST PC HISTOGRAM",
+        &loader,
+        pc_hist,
+        total_cycles,
+        exec_time,
+    );
+}
+
+/// Diagnostic: bucket the recursion guest's cycles by which verifier step
+/// is currently executing.
+///
+/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four
+/// sub-routines in a fixed order:
+///   1. `replay_rounds_after_round_1`               (recover challenges)
+///   2. `step_2_verify_claimed_composition_polynomial`
+///   3. `step_3_verify_fri`
+///   4. `step_4_verify_trace_and_composition_openings`
+///
+/// We match each PC's symbol name against the step names (the steps may be
+/// inlined, so their entry PCs are unreliable) and run a monotonic state
+/// machine over the execution stream: the active bucket only advances
+/// 0 → 1 → 2 → 3 → 4, so cycles in a step's callees stay attributed to it.
+///
+/// Bucket 0 ("setup") captures everything before step 1 is entered — the
+/// allocator init, postcard decode, and `VmAirs::new` (which contains the
+/// expensive preprocessed-commitment FFTs).
+///
+/// Streams chunks via `Executor::resume()` so memory stays bounded.
+#[test]
+#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"]
+fn test_recursion_step_breakdown() {
+    use executor::elf::{Elf, SymbolTable};
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    build_elfs(&root);
+    let empty_elf_bytes = read_guest_elf(&root, "empty", "empty-bench");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion", "recursion-bench");
+
+    let inner_proof_options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    eprintln!("[step-bkd] proving inner (empty, blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let elf_for_vkey = executor::elf::Elf::load(&empty_elf_bytes).expect("ELF load failed");
+    let page_configs = crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime(
+        &elf_for_vkey,
+        &inner_proof.runtime_page_ranges,
+        inner_proof.num_private_input_pages,
+    );
+    let vkey = crate::VmVerifyingKey::from_elf_and_options(
+        &elf_for_vkey,
+        &inner_proof_options,
+        &page_configs,
+    );
+    let blob =
+        postcard::to_allocvec(&(&inner_proof, &empty_elf_bytes, &inner_proof_options, &vkey))
+            .expect("postcard encode failed");
+    eprintln!("[step-bkd] postcard blob: {} bytes", blob.len());
+
+    // Build a per-step "advance bucket to N" lookup. The verifier's step
+    // functions get inlined by LLVM in release mode, so we can't rely on
+    // matching their entry PCs directly. Instead we anchor on closures the
+    // compiler emits *inside* each step's body — iterator combinators like
+    // `.fold(|...|)` keep the step's method name as a substring in their
+    // mangled symbol. Any PC that resolves to a symbol containing step N's
+    // keyword advances the bucket to N (monotonically).
+    //
+    // If step N has no matching symbol at all (e.g. step 4 is fully inlined
+    // with no closure children of its own), its cycles get attributed to the
+    // previous bucket. We report that explicitly in the summary.
+    let symbols = SymbolTable::parse(&recursion_elf_bytes);
+    assert!(
+        !symbols.is_empty(),
+        "recursion ELF has no symbol table — was it stripped?"
+    );
+
+    let step_keywords = [
+        "replay_rounds_after_round_1",
+        "step_2_verify_claimed_composition_polynomial",
+        "step_3_verify_fri",
+        "step_4_verify_trace_and_composition_openings",
+    ];
+    let step_found: [bool; 4] = std::array::from_fn(|i| {
+        symbols
+            .functions()
+            .iter()
+            .any(|f| f.name.contains(step_keywords[i]))
+    });
+    for (i, found) in step_found.iter().enumerate() {
+        let n_matches = symbols
+            .functions()
+            .iter()
+            .filter(|f| f.name.contains(step_keywords[i]))
+            .count();
+        eprintln!(
+            "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}",
+            i + 1,
+            step_keywords[i],
+            n_matches,
+            if *found {
+                ""
+            } else {
+                "(fully inlined; will merge into the previous bucket)"
+            }
+        );
+    }
+
+    // Monotonic state machine: 0=setup, 1..=4=inside step N (or its callees, or
+    // the cycles of a fully inlined step N+1 that has no symbol to advance on).
+    let mut bucket: u8 = 0;
+    let mut buckets = [0u64; 5];
+
+    eprintln!("[step-bkd] executing recursion guest (streaming) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // Cache the last symbol-table hit so we only do a binary search on
+    // function transitions, not on every cycle. Functions are typically
+    // long-running (>>1 instruction), so this cache hits ~all of the time.
+    let mut last_range: Option<(u64, u64)> = None;
+    let mut last_advance: u8 = 0;
+
+    let start = std::time::Instant::now();
+    let mut total_cycles: u64 = 0;
+    let mut chunks: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        for log in logs {
+            let pc = log.current_pc;
+            let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e);
+            if !in_cached {
+                // Slow path: refresh the cache from the symbol table.
+                if let Some(sym) = symbols.lookup(pc) {
+                    // SymbolTable accepts size=0 symbols as "any address >="; for
+                    // those we'd need the next symbol's start for a real upper
+                    // bound. Cheapest workaround: set a tiny range so we re-resolve
+                    // soon enough that wrong attribution is bounded.
+                    let end = sym.address + sym.size.max(1);
+                    last_range = Some((sym.address, end));
+                    last_advance = 0;
+                    for (i, kw) in step_keywords.iter().enumerate() {
+                        if sym.name.contains(kw) {
+                            last_advance = (i + 1) as u8;
+                        }
+                    }
+                } else {
+                    last_range = None;
+                    last_advance = 0;
+                }
+            }
+            if bucket < last_advance {
+                bucket = last_advance;
+            }
+            buckets[bucket as usize] += 1;
+        }
+        total_cycles += logs.len() as u64;
+        chunks += 1;
+        if chunks.is_multiple_of(500) {
+            eprintln!(
+                "[step-bkd]   ... {chunks} chunks, {total_cycles} cycles, bucket={bucket}, {:?}",
+                start.elapsed()
+            );
+        }
+    }
+    let exec_time = start.elapsed();
+
+    let labels = [
+        "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)",
+        "1. step 1: replay_rounds_after_round_1",
+        "2. step 2: verify_claimed_composition_polynomial",
+        "3. step 3: verify_fri",
+        "4. step 4: verify_trace_and_composition_openings (+ wrap-up)",
+    ];
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST PER-STEP CYCLE BREAKDOWN");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  {:<60}  {:>14}  {:>7}", "bucket", "cycles", "%");
+    for (label, cycles) in labels.iter().zip(buckets.iter()) {
+        let pct = if total_cycles > 0 {
+            100.0 * (*cycles as f64) / (total_cycles as f64)
+        } else {
+            0.0
+        };
+        eprintln!("  {:<60}  {:>14}  {:>6.2}%", label, cycles, pct);
+    }
+    eprintln!("============================================================");
+}
+
+/// Smoke test: runs the recursion pipeline on the `fibonacci` guest with n = 10.
+#[test]
+#[ignore = "slow: runs the full STARK verifier inside the VM"]
+fn test_recursion_smoke() {
+    let workspace = workspace_root();
+    build_elfs(&workspace);
+    let inner_elf = read_guest_elf(&workspace, "fibonacci", "fibonacci-bench");
+
+    // The inner program's private input is `n` as 8 little-endian bytes.
+    let fib_index: u64 = 10;
+    let private_input = fib_index.to_le_bytes().to_vec();
+    run_recursion_pipeline("recursion-smoke", &inner_elf, &private_input);
+}
diff --git a/prover/src/tests/trace_builder_tests.rs b/prover/src/tests/trace_builder_tests.rs
index b3c1e1514..9a5da7bfb 100644
--- a/prover/src/tests/trace_builder_tests.rs
+++ b/prover/src/tests/trace_builder_tests.rs
@@ -6,8 +6,11 @@ use crate::tables::lt;
 use crate::tables::memw_register;
 use crate::tables::trace_builder::Traces;
 use crate::tables::types::FE;
+#[cfg(feature = "prove")]
 use executor::vm::instruction::decoding::{ArithOp, Comparison, Instruction};
+#[cfg(feature = "prove")]
 use executor::vm::logs::Log;
+#[cfg(feature = "prove")]
 use executor::vm::memory::U64HashMap;
 
 fn make_log(pc: u64, rs1_val: u64, rs2_val: u64, dst_val: u64, taken: bool, offset: i32) -> Log {
diff --git a/prover/src/tests/vkey_tests.rs b/prover/src/tests/vkey_tests.rs
new file mode 100644
index 000000000..aba3420d0
--- /dev/null
+++ b/prover/src/tests/vkey_tests.rs
@@ -0,0 +1,180 @@
+//! Tests for [`crate::VmVerifyingKey`] and the vkey-aware verify path.
+
+use executor::elf::Elf;
+use stark::proof::options::{GoldilocksCubicProofOptions, ProofOptions};
+
+use crate::VmVerifyingKey;
+use crate::tables::page::PageConfig;
+use crate::tables::trace_builder::Traces;
+use crate::test_utils::asm_elf_bytes;
+use crate::vkey::VKEY_VERSION;
+use crate::{VmProof, prove};
+
+/// Proof options shared by every test in this file, built with
+/// `GoldilocksCubicProofOptions::with_blowup(2)`.
+fn default_options() -> ProofOptions {
+    GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 is always valid")
+}
+
+/// Rebuild the `page_configs` slice for `vm_proof` from the ELF plus the proof's
+/// `runtime_page_ranges` and `num_private_input_pages`. This mirrors what
+/// `verify_with_options_with_vkey` does internally, lifted into the tests so
+/// the test-side and verifier-side `vkey.pages` indexing line up.
+fn page_configs_from_proof(elf: &Elf, vm_proof: &VmProof) -> Vec<PageConfig> {
+    let runtime_ranges = &vm_proof.runtime_page_ranges;
+    let private_pages = vm_proof.num_private_input_pages;
+    Traces::page_configs_from_elf_and_runtime(elf, runtime_ranges, private_pages)
+}
+
+#[test]
+fn test_vkey_roundtrip() {
+    // Prove the `sub` assembly program, then rebuild the inputs needed to
+    // derive a verifying key for that proof.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+
+    // A freshly derived key carries the current layout version and exactly
+    // one page slot per page config.
+    let first = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+    assert_eq!(first.version, VKEY_VERSION, "version field must be set");
+    let (slots, expected_slots) = (first.pages.len(), configs.len());
+    assert_eq!(slots, expected_slots, "vkey.pages must have one entry per page config");
+    // Digest of the in-memory key; compared after the round-trip below.
+    let reference_digest = first.compute_digest();
+
+    // Deriving a second time from identical inputs has to yield an equal key;
+    // the per-table commitment caches should not change between calls.
+    let second = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+    assert_eq!(first, second, "vkey derivation must be deterministic");
+
+    // Serialize with postcard, decode again, and compare both the struct and
+    // its digest against the originals.
+    let wire = postcard::to_allocvec(&first).expect("postcard encode");
+    let restored: VmVerifyingKey = postcard::from_bytes(&wire).expect("postcard decode");
+    assert_eq!(first, restored, "postcard round-trip must preserve the vkey");
+    let restored_digest = restored.compute_digest();
+    assert_eq!(restored_digest, reference_digest, "digest must be stable across serialization");
+}
+
+#[test]
+fn test_vkey_verify_equivalence() {
+    // Core guarantee of the vkey shortcut: one proof of the `sub` program is
+    // checked through `verify_with_options` and again through
+    // `verify_with_options_with_vkey`; both calls have to return `Ok(true)`.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+    let vkey = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+
+    // Path 1: recompute everything, no verifying key involved.
+    let accepted_plain = crate::verify_with_options(&proof, &program, &opts, None, None)
+        .expect("baseline verify errored");
+    assert!(accepted_plain, "baseline verify must accept the proof");
+
+    // Path 2: same proof, this time with the host-derived `vkey` passed in.
+    let accepted_keyed =
+        crate::verify_with_options_with_vkey(&proof, &program, &opts, None, None, Some(&vkey))
+            .expect("vkey verify errored");
+    assert!(accepted_keyed, "vkey verify must accept the same proof");
+}
+
+#[test]
+fn test_vkey_mismatch_rejects() {
+    // `VmProof` has no explicit `vk_digest` field yet (deferred to a later PR),
+    // so a corrupted `vkey.bitwise` is caught by Fiat-Shamir instead: the
+    // verifier feeds the bad commitment into its transcript, derives challenges
+    // that differ from the prover's, and the proof's openings stop matching.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+    let mut vkey = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+
+    // Flip every bit of the first byte of the BITWISE commitment.
+    vkey.bitwise[0] ^= 0xFF;
+    let verdict =
+        crate::verify_with_options_with_vkey(&proof, &program, &opts, None, None, Some(&vkey))
+            .expect("verify must not return Err — Fiat-Shamir mismatch is Ok(false)");
+    assert!(!verdict, "tampered bitwise commitment must cause rejection");
+}
+
+#[test]
+fn test_vkey_page_mismatch_rejects() {
+    // Page counterpart of `test_vkey_mismatch_rejects`: corrupt the commitment
+    // held in the first non-private-input page slot. The page commitment is in
+    // the verifier's transcript exactly like the bitwise one, so Fiat-Shamir
+    // rejects it the same way.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+    let mut vkey = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+
+    let slot = configs
+        .iter()
+        .position(|cfg| !cfg.is_private_input)
+        .expect("test ELF must produce at least one non-private-input page");
+    vkey.pages[slot][0] ^= 0xFF;
+    let verdict =
+        crate::verify_with_options_with_vkey(&proof, &program, &opts, None, None, Some(&vkey))
+            .expect("verify must not return Err — Fiat-Shamir mismatch is Ok(false)");
+    assert!(!verdict, "tampered page commitment must cause rejection");
+}
+
+#[test]
+fn test_vkey_decode_mismatch_rejects() {
+    // Honest proof of the `sub` program plus the verifying key derived for it.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+    let mut vkey = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+    vkey.decode[0] ^= 0xFF; // flip every bit of the first DECODE commitment byte
+    let verdict =
+        crate::verify_with_options_with_vkey(&proof, &program, &opts, None, None, Some(&vkey))
+            .expect("verify must not return Err — Fiat-Shamir mismatch is Ok(false)");
+    assert!(!verdict, "tampered decode commitment must cause rejection");
+}
+
+#[test]
+fn test_vkey_register_mismatch_rejects() {
+    // Derive a valid key for a proof of `sub`, then damage only its REGISTER entry.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+    let mut vkey = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+    vkey.register[0] ^= 0xFF; // invert the first byte of the REGISTER commitment
+    let verdict =
+        crate::verify_with_options_with_vkey(&proof, &program, &opts, None, None, Some(&vkey))
+            .expect("verify must not return Err — Fiat-Shamir mismatch is Ok(false)");
+    assert!(!verdict, "tampered register commitment must cause rejection");
+}
+
+#[test]
+fn test_vkey_keccak_rc_mismatch_rejects() {
+    // Honest proof of the `sub` program plus the verifying key derived for it.
+    let program = asm_elf_bytes("sub");
+    let proof = prove(&program).expect("inner prove should succeed");
+    let parsed = Elf::load(&program).expect("ELF load failed");
+    let opts = default_options();
+    let configs = page_configs_from_proof(&parsed, &proof);
+    let mut vkey = VmVerifyingKey::from_elf_and_options(&parsed, &opts, &configs);
+
+    // Flip every bit of the first byte of the KECCAK_RC commitment; the
+    // verifier is expected to answer `Ok(false)` rather than error out.
+    vkey.keccak_rc[0] ^= 0xFF;
+    let verdict =
+        crate::verify_with_options_with_vkey(&proof, &program, &opts, None, None, Some(&vkey))
+            .expect("verify must not return Err — Fiat-Shamir mismatch is Ok(false)");
+    assert!(!verdict, "tampered keccak_rc commitment must cause rejection");
+}
diff --git a/prover/src/vkey.rs b/prover/src/vkey.rs
new file mode 100644
index 000000000..a81d31bb3
--- /dev/null
+++ b/prover/src/vkey.rs
@@ -0,0 +1,126 @@
+//! Verifying key for the lambda-vm STARK verifier.
+//!
+//! Caches preprocessed-table Merkle commitments that the verifier would
+//! otherwise recompute on every call. Mirrors the SP1 `MachineVerifyingKey`
+//! pattern (preprocessed commitments derived once at setup, never recomputed
+//! per-proof) and the prover-side companion in
+//! <https://github.com/yetanotherco/lambda_vm/pull/282> (which caches the
+//! same data on the prover side).
+//!
+//! ## Current scope
+//!
+//! All five preprocessed tables — BITWISE, DECODE, REGISTER, KECCAK_RC, and
+//! every non-private-input PAGE — are cached here. `VmAirs::new_with_vkey`
+//! prefers the vkey-supplied commitment over recomputing when a vkey is
+//! provided. The `version` field exists so a vkey serialized against an
+//! older layout produces a different `compute_digest()` and stops
+//! validating.
+//!
+//! ## Security
+//!
+//! For this PR the verifying key is only a performance shortcut. The
+//! verifier still relies on Fiat-Shamir: every preprocessed commitment the
+//! prover used is bound into the proof's challenges, so a verifier that
+//! consumes a tampered `vkey` field derives different challenges, the
+//! openings stop matching, and verification fails. A future PR will
+//! additionally embed `vkey.compute_digest()` in `VmProof` so vkey
+//! substitution surfaces as an explicit error before any STARK work runs.
+
+use alloc::vec::Vec;
+
+use executor::elf::Elf;
+use sha3::{Digest, Keccak256};
+use stark::config::Commitment;
+use stark::proof::options::ProofOptions;
+
+use crate::tables::bitwise;
+use crate::tables::decode;
+use crate::tables::keccak_rc;
+use crate::tables::page::{self, PageConfig};
+use crate::tables::register;
+
+/// Current `VmVerifyingKey` layout version, stored in the key's `version` field.
+/// Bump whenever fields are added, removed, or reordered so that vkeys serialized
+/// against an older layout produce a different `compute_digest()` and stop validating.
+pub const VKEY_VERSION: u32 = 3;
+
+/// All-zero commitment stored in [`VmVerifyingKey::pages`] for private-input
+/// page slots, where there is no preprocessed commitment to cache. The
+/// verifier never reads these slots (private-input pages have no
+/// `with_preprocessed(...)` call in `VmAirs::new`); they only pad the index.
+const PRIVATE_INPUT_PAGE_PLACEHOLDER: Commitment = [0u8; 32];
+
+/// Cached preprocessed-table commitments the verifier would otherwise recompute
+/// on every call. Field order matters: `compute_digest` hashes the postcard bytes.
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct VmVerifyingKey {
+    /// Layout version; set to [`VKEY_VERSION`] by `from_elf_and_options`.
+    pub version: u32,
+    /// Merkle root over the LDE of the bitwise preprocessed columns.
+    /// Program-independent; depends only on `ProofOptions`.
+    pub bitwise: Commitment,
+    /// Merkle root over the LDE of the decode preprocessed columns.
+    /// Program-dependent: derived from the inner ELF's instruction stream.
+    pub decode: Commitment,
+    /// Merkle root over the LDE of the register preprocessed columns.
+    /// Program-dependent via the ELF's entry point.
+    pub register: Commitment,
+    /// Merkle root over the LDE of the keccak round-constants preprocessed
+    /// columns. Program-independent; depends only on `ProofOptions`.
+    pub keccak_rc: Commitment,
+    /// Per-page preprocessed Merkle roots, indexed parallel to the
+    /// `page_configs` slice the verifier reconstructs from the proof via
+    /// [`crate::tables::trace_builder::Traces::page_configs_from_elf_and_runtime`].
+    /// Private-input slots hold a zero placeholder and are never read by the
+    /// verifier — they exist only to keep the index aligned with
+    /// `page_configs`, which interleaves preprocessed and private-input pages.
+    pub pages: Vec<Commitment>,
+}
+
+impl VmVerifyingKey {
+    /// Build the verifying key on the host.
+    ///
+    /// * `elf` — parsed inner program. DECODE is committed through
+    ///   `decode::commitment_from_elf` and REGISTER from `elf.entry_point`.
+    /// * `options` — forwarded unchanged to every commitment helper.
+    /// * `page_configs` — must be exactly what the verifier will rebuild from
+    ///   the proof, i.e. the result of `Traces::page_configs_from_elf_and_runtime(
+    ///   elf, runtime_page_ranges, num_private_input_pages)`. The returned
+    ///   `pages` vector has one entry per config, in the same order.
+    ///
+    /// # Panics
+    /// Panics when the DECODE commitment cannot be computed.
+    pub fn from_elf_and_options(
+        elf: &Elf,
+        options: &ProofOptions,
+        page_configs: &[PageConfig],
+    ) -> Self {
+        // One slot per page config, in order. Private-input pages have nothing
+        // to precompute, so they only receive the zero placeholder.
+        let mut pages = Vec::with_capacity(page_configs.len());
+        for config in page_configs {
+            pages.push(if config.is_private_input {
+                PRIVATE_INPUT_PAGE_PLACEHOLDER
+            } else {
+                page::precomputed_commitment_cached(config, options)
+            });
+        }
+        Self {
+            version: VKEY_VERSION,
+            bitwise: bitwise::preprocessed_commitment(options),
+            decode: decode::commitment_from_elf(elf, options)
+                .expect("decode commitment must compute"),
+            register: register::preprocessed_commitment(options, elf.entry_point),
+            keccak_rc: keccak_rc::preprocessed_commitment(options),
+            pages,
+        }
+    }
+
+    /// Keccak256 hash of this key's postcard encoding. The value only stays
+    /// stable while the field layout (and [`VKEY_VERSION`]) is unchanged.
+    pub fn compute_digest(&self) -> [u8; 32] {
+        let encoded = postcard::to_allocvec(self)
+            .expect("postcard serialization of VmVerifyingKey must succeed");
+        Keccak256::digest(&encoded).into()
+    }
+}