From 0d6ee308a8a79d9bb0509b66e42ffe2a3150fa1a Mon Sep 17 00:00:00 2001 From: Kristaps Karlsons Date: Mon, 15 Jun 2026 23:31:37 +0300 Subject: [PATCH 1/2] feat(oracle): unified tier-driven corpus runner + oracle.yml (C3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the per-language oracle-rust.yml / oracle-kernel.yml demos with one declarative, tier-driven runner over tools/oracle-corpora.toml (#164, C3). - tools/oracle-run.sh: run the oracle for ONE corpus end to end — read its profile, shallow-clone the repo at its pinned rev, run its prepare steps, index it with rag-rat, then `rag-rat oracle report` (which runs the oracle + applies the per-corpus health gate). Exits non-zero on a health violation while still writing the report JSON, so a Δ glue script can consume it. - tools/oracle-corpus.py: stdlib (tomllib) reader the bash runner shells out to for the pre-index fields (repo/rev/prepare/bindings) + tier corpus listing. - tools/oracle-report-bmf.py: report JSON -> Bencher BMF glue for the heavy tier (rag-rat emits JSON only; presentation/Bencher shaping is a glue concern). - .github/workflows/oracle.yml: small tier on PRs + main (GitHub-hosted matrix, per-corpus tool install, report artifact, health gate as the PR gate); heavy tier on release/dispatch (self-hosted bigmem, serial, pushed to Bencher). - Deletes the superseded oracle-rust.yml/oracle-kernel.yml workflows and their rust-scip-oracle.sh/kernel-c-oracle.sh scripts; updates docs/benchmarks.md. Verified end to end locally on the small tier: rust-semver (rust-analyzer, 1056 edges resolved 412->936) and c-cjson (cmake compdb + scip-clang, 3941 edges resolved 2742->3408) both run clean through the runner and pass the gate. 
--- .github/workflows/oracle-kernel.yml | 81 ----------- .github/workflows/oracle-rust.yml | 75 ---------- .github/workflows/oracle.yml | 206 ++++++++++++++++++++++++++++ docs/benchmarks.md | 12 +- tools/kernel-c-oracle.sh | 206 ---------------------------- tools/oracle-corpus.py | 87 ++++++++++++ tools/oracle-report-bmf.py | 64 +++++++++ tools/oracle-run.sh | 110 +++++++++++++++ tools/rust-scip-oracle.sh | 137 ------------------ 9 files changed, 474 insertions(+), 504 deletions(-) delete mode 100644 .github/workflows/oracle-kernel.yml delete mode 100644 .github/workflows/oracle-rust.yml create mode 100644 .github/workflows/oracle.yml delete mode 100755 tools/kernel-c-oracle.sh create mode 100755 tools/oracle-corpus.py create mode 100755 tools/oracle-report-bmf.py create mode 100755 tools/oracle-run.sh delete mode 100755 tools/rust-scip-oracle.sh diff --git a/.github/workflows/oracle-kernel.yml b/.github/workflows/oracle-kernel.yml deleted file mode 100644 index 25d15f8..0000000 --- a/.github/workflows/oracle-kernel.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: oracle-kernel - -# Repeatable C edge-resolution demo: build the Linux kernel, run the scip-clang oracle over the -# compiled subset, and record the heuristic-vs-compiler resolution delta in Bencher (#71, feeds the -# #73 resolved-rate taxonomy). Heavyweight (kernel build + scip-clang), so dispatch-only — never -# per-push. See tools/kernel-c-oracle.sh and docs/bencher.md. 
- -on: - workflow_dispatch: - inputs: - kernel_config: - description: 'make config target for compile_commands.json — defconfig (fast, partial coverage) or allmodconfig (fuller, hours)' - required: false - default: 'defconfig' - -permissions: - contents: read - checks: write - -env: - BENCHER_PROJECT: rag-rat - BENCHER_TESTBED: hetzner-bigmem - BENCHER_API_KEY: ${{ secrets.BENCHER_API_TOKEN }} - KERNEL_TAG: v7.0 - -jobs: - c-oracle: - # Same self-hosted big-memory box as bench-release: the whole-kernel index + a kernel build + - # scip-clang need the RAM and disk, and scip-clang is installed there. Dispatch-only (never - # PRs), so a public-repo self-hosted runner is safe. - runs-on: [self-hosted, bigmem] - timeout-minutes: 360 - steps: - - uses: actions/checkout@v5 - - # Pinned bench environment (#71): scip-clang, rust-analyzer, the kernel-build deps (bc/…), - # and the Rust toolchain all live in the image, so a missing host package (the `bc` failure) - # can't recur and the SCIP indexer versions are reproducible. Layer-cached on the runner. - - name: Build bench image - run: docker build -t rag-rat-bench -f tools/bench.Containerfile . - - - name: Run the scip-clang oracle over the kernel and emit BMF - env: - KERNEL_CONFIG: ${{ github.event.inputs.kernel_config || 'defconfig' }} - # Mount the checkout at /repo (BMF lands here, on the host) and runner.temp at /work (the - # multi-GB kernel checkout + index DB stay on real disk, not the container overlay). Named - # volumes cache cargo + target across runs. rag-rat is built inside the image's pinned - # toolchain, then the script runs. 
- run: | - docker run --rm \ - -v "$PWD":/repo \ - -v "${{ runner.temp }}":/work \ - -v rag-rat-cargo-registry:/usr/local/cargo/registry \ - -v rag-rat-target:/repo/target \ - -e KERNEL_CONFIG="$KERNEL_CONFIG" \ - -e BMF_OUT=/work/kernel_c_oracle_bmf.json \ - -e KERNEL_WORK=/work/coracle-${{ github.run_id }} \ - -e RAG_RAT_BIN=/repo/target/release/rag-rat \ - rag-rat-bench \ - bash -c 'cargo build --release --no-default-features --bin rag-rat && bash tools/kernel-c-oracle.sh' - - - uses: bencherdev/bencher@main - - # No --err: this is a headline resolution signal, not a gate. A regression shows in the plots. - - name: Track C edge resolution (Bencher) - run: | - bencher run \ - --branch main \ - --adapter json \ - --file "${{ runner.temp }}/kernel_c_oracle_bmf.json" \ - --project "$BENCHER_PROJECT" \ - --testbed "$BENCHER_TESTBED" - - - name: Upload oracle report + BMF - if: always() - uses: actions/upload-artifact@v4 - with: - name: kernel-c-oracle - path: | - ${{ runner.temp }}/kernel_c_oracle_bmf.json - ${{ runner.temp }}/coracle-${{ github.run_id }}/oracle-report.json diff --git a/.github/workflows/oracle-rust.yml b/.github/workflows/oracle-rust.yml deleted file mode 100644 index 19ec3eb..0000000 --- a/.github/workflows/oracle-rust.yml +++ /dev/null @@ -1,75 +0,0 @@ -name: oracle-rust - -# Repeatable Rust edge-resolution demo: index rust-lang/cargo, run the rust-analyzer SCIP oracle over -# the whole workspace, and record the heuristic-vs-compiler resolution delta in Bencher (#61, the -# Rust sibling of oracle-kernel.yml's C/scip-clang demo). rust-analyzer's whole-program analysis is -# heavyweight (memory + a few minutes), so dispatch-only — never per-push. See -# tools/rust-scip-oracle.sh and docs/bencher.md. 
- -on: - workflow_dispatch: - -permissions: - contents: read - checks: write - -env: - BENCHER_PROJECT: rag-rat - BENCHER_TESTBED: hetzner-bigmem - BENCHER_API_KEY: ${{ secrets.BENCHER_API_TOKEN }} - CARGO_TAG: 0.97.1 - -jobs: - rust-oracle: - # Same self-hosted big-memory box as bench-release / oracle-kernel: rust-analyzer's whole-crate - # analysis of cargo wants the RAM, and rust-analyzer is installed in the bench image. Dispatch- - # only (never PRs), so a public-repo self-hosted runner is safe. - runs-on: [self-hosted, bigmem] - timeout-minutes: 120 - steps: - - uses: actions/checkout@v5 - - # Pinned bench environment (#71): rust-analyzer + the Rust toolchain live in the image, so the - # SCIP indexer version is reproducible (it's the content-addressed `tool_version` baked into - # every verdict). Layer-cached on the runner. - - name: Build bench image - run: docker build -t rag-rat-bench -f tools/bench.Containerfile . - - - name: Run the rust-analyzer oracle over cargo and emit BMF - # Mount the checkout at /repo (BMF lands here, on the host) and runner.temp at /work (the - # corpus checkout + index DB stay on real disk, not the container overlay). Named volumes - # cache cargo + target across runs. rag-rat is built inside the image's pinned toolchain, - # then the script runs. - run: | - docker run --rm \ - -v "$PWD":/repo \ - -v "${{ runner.temp }}":/work \ - -v rag-rat-cargo-registry:/usr/local/cargo/registry \ - -v rag-rat-target:/repo/target \ - -e CARGO_TAG="$CARGO_TAG" \ - -e BMF_OUT=/work/rust_scip_oracle_bmf.json \ - -e RUST_WORK=/work/rustoracle-${{ github.run_id }} \ - -e RAG_RAT_BIN=/repo/target/release/rag-rat \ - rag-rat-bench \ - bash -c 'cargo build --release --no-default-features --bin rag-rat && bash tools/rust-scip-oracle.sh' - - - uses: bencherdev/bencher@main - - # No --err: this is a headline resolution signal, not a gate. A regression shows in the plots. 
- - name: Track Rust edge resolution (Bencher) - run: | - bencher run \ - --branch main \ - --adapter json \ - --file "${{ runner.temp }}/rust_scip_oracle_bmf.json" \ - --project "$BENCHER_PROJECT" \ - --testbed "$BENCHER_TESTBED" - - - name: Upload oracle report + BMF - if: always() - uses: actions/upload-artifact@v4 - with: - name: rust-scip-oracle - path: | - ${{ runner.temp }}/rust_scip_oracle_bmf.json - ${{ runner.temp }}/rustoracle-${{ github.run_id }}/oracle-report.json diff --git a/.github/workflows/oracle.yml b/.github/workflows/oracle.yml new file mode 100644 index 0000000..daa6c92 --- /dev/null +++ b/.github/workflows/oracle.yml @@ -0,0 +1,206 @@ +name: oracle + +# Unified, tier-driven SCIP-oracle resolution runner (#164, C3). One workflow over the declarative +# corpus profiles in tools/oracle-corpora.toml, replacing the per-language oracle-rust.yml / +# oracle-kernel.yml demos. tools/oracle-run.sh does the work for one corpus (clone @ rev → prepare → +# index → `rag-rat oracle report`); this workflow just selects the tier and fans out. +# +# small — per-PR (and on main): fast corpora on GitHub-hosted runners. The health gate in +# `oracle report` makes a broken/regressed corpus FAIL the job. The Δ-vs-baseline PR +# comment is layered on later (tools/oracle-report-md.py, C5); here the report JSON is +# uploaded as an artifact. +# heavy — release / manual dispatch only: the big corpora on the self-hosted big-memory box, +# pushed to Bencher as the headline resolution series. Never on PRs. 
+
+on:
+  pull_request:
+    paths:
+      - 'crates/**'
+      - 'tools/oracle-corpora.toml'
+      - 'tools/oracle-corpus.py'
+      - 'tools/oracle-run.sh'
+      - 'tools/oracle-report-bmf.py'
+      - '.github/workflows/oracle.yml'
+  push:
+    branches: [main]
+    paths:
+      - 'crates/**'
+      - 'tools/oracle-corpora.toml'
+      - 'tools/oracle-corpus.py'
+      - 'tools/oracle-run.sh'
+      - 'tools/oracle-report-bmf.py'
+      - '.github/workflows/oracle.yml'
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      tier:
+        description: 'Corpus tier to run (small | heavy)'
+        required: false
+        default: 'small'
+        # A choice input, so the dispatch form can only submit a tier the jobs below select on.
+        type: choice
+        options:
+          - small
+          - heavy
+
+# Least privilege: read the repo only. The heavy job's Bencher upload uses its own API token.
+permissions:
+  contents: read
+
+# Group by event as well as ref, and cancel only superseded PR runs: a push to main shares its ref
+# with a heavy dispatch started from main and must not cancel that multi-hour run.
+concurrency:
+  group: oracle-${{ github.event_name }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+env:
+  # SCIP toolchain for the small tier, mirroring tools/bench.Containerfile so a PR number is
+  # produced by the same indexer build the heavy/Bencher tier uses. scip-clang and scip-python are
+  # pinned to a version; rust-analyzer is NOT — its URL follows `releases/latest`, so the binary
+  # can change between runs. TODO: point RUST_ANALYZER_URL at a dated release tag.
+  SCIP_CLANG_VERSION: v0.4.0
+  SCIP_PYTHON_VERSION: 0.6.6
+  RUST_ANALYZER_URL: https://github.com/rust-lang/rust-analyzer/releases/latest/download/rust-analyzer-x86_64-unknown-linux-gnu.gz
+
+jobs:
+  matrix:
+    # Resolve the tier and emit its corpus ids as a JSON array for the fan-out. release → heavy;
+    # dispatch → the chosen input; PR/push → small.
+    runs-on: ubuntu-latest
+    outputs:
+      tier: ${{ steps.select.outputs.tier }}
+      corpora: ${{ steps.select.outputs.corpora }}
+    steps:
+      - uses: actions/checkout@v5
+      - id: select
+        # Context values reach the script as environment variables, never as text expanded into
+        # the script body, so an unexpected input value cannot be executed as shell.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          TIER_INPUT: ${{ github.event.inputs.tier }}
+        run: |
+          set -euo pipefail
+          case "$EVENT_NAME" in
+            release) tier=heavy ;;
+            workflow_dispatch) tier="${TIER_INPUT:-small}" ;;
+            *) tier=small ;;
+          esac
+          # Only these two tiers have a job below; anything else would silently run nothing.
+          case "$tier" in
+            small | heavy) ;;
+            *) echo "unknown tier '$tier' (expected small or heavy)" >&2; exit 1 ;;
+          esac
+          echo "tier=$tier" >> "$GITHUB_OUTPUT"
+          corpora="$(python3 tools/oracle-corpus.py --list-tier "$tier" | jq -R . | jq -cs .)"
+          # An empty list cannot drive a job matrix; fail here with a message that names the cause.
+          if [ "$corpora" = "[]" ]; then
+            echo "tier '$tier' has no corpora in tools/oracle-corpora.toml" >&2
+            exit 1
+          fi
+          echo "corpora=$corpora" >> "$GITHUB_OUTPUT"
+          echo "tier=$tier corpora=$corpora"
+
+  small:
+    needs: matrix
+    if: needs.matrix.outputs.tier == 'small'
+    # 24.04: the pinned scip-clang prebuilt links against a recent GLIBC (see bench.Containerfile).
+    runs-on: ubuntu-24.04
+    strategy:
+      fail-fast: false
+      matrix:
+        corpus: ${{ fromJSON(needs.matrix.outputs.corpora) }}
+    steps:
+      - uses: actions/checkout@v5
+      - uses: dtolnay/rust-toolchain@stable
+      - uses: Swatinem/rust-cache@v2
+
+      - name: Build rag-rat (release, hash embedder — no model download)
+        run: cargo build --release --no-default-features --bin rag-rat
+
+      - name: Install the corpus's SCIP tool
+        # The corpus id comes from a file a PR can edit, so it is passed through the environment
+        # rather than expanded into the script text.
+        env:
+          CORPUS: ${{ matrix.corpus }}
+        run: |
+          set -euo pipefail
+          tool="$(python3 tools/oracle-corpus.py --corpus "$CORPUS" --field tool)"
+          echo "installing $tool for $CORPUS"
+          case "$tool" in
+            rust-analyzer)
+              curl --proto '=https' --tlsv1.2 -sSfL "$RUST_ANALYZER_URL" \
+                | gunzip > /usr/local/bin/rust-analyzer
+              chmod +x /usr/local/bin/rust-analyzer
+              rust-analyzer --version ;;
+            scip-clang)
+              curl --proto '=https' --tlsv1.2 -sSfL \
+                "https://github.com/sourcegraph/scip-clang/releases/download/${SCIP_CLANG_VERSION}/scip-clang-x86_64-linux" \
+                -o /usr/local/bin/scip-clang
+              chmod +x /usr/local/bin/scip-clang
+              scip-clang --version ;;
+            scip-python)
+              npm install -g "@sourcegraph/scip-python@${SCIP_PYTHON_VERSION}"
+              scip-python --version ;;
+            *)
+              echo "unknown tool '$tool'" >&2; exit 1 ;;
+          esac
+
+      - name: Run the oracle for ${{ matrix.corpus }}
+        env:
+          CORPUS: ${{ matrix.corpus }}
+          RAG_RAT_BIN: target/release/rag-rat
+          ORACLE_WORK: ${{ runner.temp }}/oracle-${{ matrix.corpus }}
+          REPORT_OUT: ${{ runner.temp }}/${{ matrix.corpus }}-report.json
+          RAG_RAT_COMMIT: ${{ github.event.pull_request.head.sha || github.sha }}
+        run: bash tools/oracle-run.sh
+
+      - name: Upload resolution report
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: oracle-report-${{ matrix.corpus }}
+          path: ${{ runner.temp }}/${{ matrix.corpus }}-report.json
+          if-no-files-found: warn
+
+  heavy:
+    needs: matrix
+    if: needs.matrix.outputs.tier == 'heavy'
+    # Self-hosted big-memory box (same as bench-release): the heavy corpora (whole cargo workspace,
+    # a kernel build + scip-clang) want the RAM + disk, and the SCIP tools live in the bench image.
+    # Heavy runs on release / dispatch only (never PRs), so a public-repo self-hosted runner is safe.
+    runs-on: [self-hosted, bigmem]
+    timeout-minutes: 360
+    strategy:
+      fail-fast: false
+      # Serial: both corpora would contend for the box's RAM if run at once.
+      max-parallel: 1
+      matrix:
+        corpus: ${{ fromJSON(needs.matrix.outputs.corpora) }}
+    env:
+      BENCHER_PROJECT: rag-rat
+      BENCHER_TESTBED: hetzner-bigmem
+      BENCHER_API_KEY: ${{ secrets.BENCHER_API_TOKEN }}
+    steps:
+      - uses: actions/checkout@v5
+
+      # Pinned bench environment: rust-analyzer + scip-clang + kernel-build deps live in the image,
+      # so the SCIP indexer versions are reproducible (the content-addressed tool_version). Layer-
+      # cached on the runner.
+      - name: Build bench image
+        run: docker build -t rag-rat-bench -f tools/bench.Containerfile .
+ + - name: Run the oracle for ${{ matrix.corpus }} and emit its report + run: | + docker run --rm \ + -v "$PWD":/repo \ + -v "${{ runner.temp }}":/work \ + -v rag-rat-cargo-registry:/usr/local/cargo/registry \ + -v rag-rat-target:/repo/target \ + -e CORPUS='${{ matrix.corpus }}' \ + -e ORACLE_WORK=/work/oracle-${{ matrix.corpus }} \ + -e REPORT_OUT=/work/${{ matrix.corpus }}-report.json \ + -e RAG_RAT_BIN=/repo/target/release/rag-rat \ + -e RAG_RAT_COMMIT='${{ github.sha }}' \ + rag-rat-bench \ + bash -c 'cargo build --release --no-default-features --bin rag-rat && bash tools/oracle-run.sh' + + - name: Convert the report to BMF + run: | + python3 tools/oracle-report-bmf.py \ + "${{ runner.temp }}/${{ matrix.corpus }}-report.json" \ + > "${{ runner.temp }}/${{ matrix.corpus }}-bmf.json" + + - uses: bencherdev/bencher@main + + # No --err: a headline resolution signal, not a gate. A regression shows in the Bencher plots. + - name: Track ${{ matrix.corpus }} edge resolution (Bencher) + run: | + bencher run \ + --branch main \ + --adapter json \ + --file "${{ runner.temp }}/${{ matrix.corpus }}-bmf.json" \ + --project "$BENCHER_PROJECT" \ + --testbed "$BENCHER_TESTBED" + + - name: Upload report + BMF + if: always() + uses: actions/upload-artifact@v4 + with: + name: oracle-heavy-${{ matrix.corpus }} + path: | + ${{ runner.temp }}/${{ matrix.corpus }}-report.json + ${{ runner.temp }}/${{ matrix.corpus }}-bmf.json + if-no-files-found: warn diff --git a/docs/benchmarks.md b/docs/benchmarks.md index f4a0c29..3b6406c 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -92,7 +92,7 @@ speedup, not a coverage regression. The headline resolves 58.7% of edges *syntactically* — by name, no compiler. The SCIP oracle (#61) measures how good that syntactic resolution actually is: it replays a real compilation through `scip-clang` (a clang-based SCIP indexer) and diffs its ground-truth bindings against the heuristic's. 
-Numbers below are one `oracle-kernel.yml` run (`scip-clang 0.4.0`, kernel `defconfig`, containerized +Numbers below are one heavy-tier `scip-clang` oracle run (`oracle.yml`, `scip-clang 0.4.0`, kernel `defconfig`, containerized bench image on the self-hosted bigmem box), so they cover the **compiled subset** — the translation units `defconfig` actually compiles — not the whole 62,903-file tree the headline indexes. Resolution *quality* and tree-wide *coverage* are different populations, reported side by side, not merged. @@ -138,7 +138,7 @@ index spans the whole tree while the compilation database spans `defconfig`.) ## Rust edge resolution: heuristic vs compiler (rust-analyzer SCIP oracle) -The Rust sibling, and the contrast that matters: one `oracle-rust.yml` run over **rust-lang/cargo** +The Rust sibling, and the contrast that matters: one heavy-tier `rust-analyzer` oracle run over **rust-lang/cargo** (tag 0.97.1, the same pinned corpus as the iai/criterion benches), `rust-analyzer 0.3.2929`. Unlike scip-clang's compiled subset, rust-analyzer analyzes the **whole workspace**, so these cover every indexed `.rs` file (no subset caveat). @@ -178,9 +178,11 @@ in-corpus call rate is not a weakness: cargo calls overwhelmingly into `std`/dep the oracle correctly bins **68k** of those as `resolved-external` rather than forcing a wrong in-corpus target. -Run them yourself: `oracle-kernel.yml` / `tools/kernel-c-oracle.sh` (C) and `oracle-rust.yml` / -`tools/rust-scip-oracle.sh` (Rust). Both pin the SCIP indexer via `tools/bench.Containerfile`, so the -`tool_version` baked into every verdict is reproducible. +Run them yourself via the unified runner: `CORPUS=linux-kernel bash tools/oracle-run.sh` (C) and +`CORPUS=rust-cargo bash tools/oracle-run.sh` (Rust), or dispatch `oracle.yml` with `tier=heavy` to +reproduce them on the Bencher box. 
The corpora are declared in `tools/oracle-corpora.toml`; the +heavy tier pins the SCIP indexer via `tools/bench.Containerfile`, so the `tool_version` baked into +every verdict is reproducible. ## Memory profile: where the peak lives diff --git a/tools/kernel-c-oracle.sh b/tools/kernel-c-oracle.sh deleted file mode 100755 index e47e319..0000000 --- a/tools/kernel-c-oracle.sh +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env bash -# Present C edge resolution on the Linux kernel via the scip-clang oracle (#71), repeatably. -# -# Companion to tools/bench-kernel.sh: same pinned kernel + C-target conventions, but it additionally -# BUILDS the kernel to produce a compile_commands.json (scip-clang's input), runs -# `oracle run --tool scip-clang`, and emits the heuristic-vs-compiler resolution delta as a BMF for -# Bencher (benchmark `linux-kernel-/c-oracle`). -# -# COVERAGE CAVEAT (load-bearing): scip-clang only resolves translation units present in -# compile_commands.json, which is exactly the set the chosen KERNEL_CONFIG compiles — `defconfig` -# is a few thousand TUs, `allmodconfig` is most of the tree. Every resolution metric below is over -# that COMPILED SUBSET, not the whole-kernel 62k/67.4% headline that bench-kernel.sh reports. 
-# -# Env: -# KERNEL_TAG / KERNEL_SHA pinned kernel (default v7.0 / 028ef9c9…, matches bench-kernel.sh) -# KERNEL_CONFIG make config target for the compdb (default: defconfig) -# RAG_RAT_BIN release binary (default: target/release/rag-rat) -# KERNEL_WORK working dir (default: a fresh mktemp dir) -# BMF_OUT Bencher Metric Format output path (default: kernel_c_oracle_bmf.json) -set -euo pipefail - -KERNEL_TAG="${KERNEL_TAG:-v7.0}" -KERNEL_SHA="${KERNEL_SHA:-028ef9c96e96197026887c0f092424679298aae8}" -KERNEL_CONFIG="${KERNEL_CONFIG:-defconfig}" -RAG_RAT_BIN="${RAG_RAT_BIN:-target/release/rag-rat}" -WORK="${KERNEL_WORK:-$(mktemp -d)}" -BMF_OUT="${BMF_OUT:-kernel_c_oracle_bmf.json}" -# Resolve to an ABSOLUTE path before any cd — `command -v` can return a relative path (e.g. -# `target/release/rag-rat`), which breaks once the script cd's into the kernel tree, so always -# canonicalize the result. -RAG_RAT_BIN="$(command -v "$RAG_RAT_BIN" || echo "$RAG_RAT_BIN")" -RAG_RAT_BIN="$(readlink -f "$RAG_RAT_BIN")" -BMF_OUT="$(readlink -f "$BMF_OUT" 2>/dev/null || echo "$PWD/$BMF_OUT")" -mkdir -p "$WORK" -DB="$WORK/kernel-index.sqlite" -KDIR="$WORK/linux" - -[ -x "$RAG_RAT_BIN" ] || { echo "kernel-c-oracle: rag-rat not found at '$RAG_RAT_BIN'" >&2; exit 1; } -command -v scip-clang >/dev/null 2>&1 || { - echo "kernel-c-oracle: scip-clang not on PATH (install from github.com/sourcegraph/scip-clang)" >&2 - exit 1 -} - -echo "kernel-c-oracle: fetching Linux ${KERNEL_TAG} (${KERNEL_SHA}, shallow)" >&2 -git init -q "$KDIR" -git -C "$KDIR" remote add origin https://github.com/torvalds/linux.git -git -C "$KDIR" -c protocol.version=2 fetch -q --depth 1 origin "$KERNEL_SHA" -git -C "$KDIR" checkout -q "$KERNEL_SHA" - -# Build the kernel so its compile_commands.json target can read the per-object .cmd files. Quiet -# build; failures in a few TUs don't abort (|| true) — a partial compdb still demonstrates the join. 
-echo "kernel-c-oracle: building ${KERNEL_CONFIG} + compile_commands.json" >&2 -make -C "$KDIR" -s "$KERNEL_CONFIG" -make -C "$KDIR" -s -j"$(nproc)" 2>/dev/null || true -make -C "$KDIR" -s compile_commands.json -TUS="$(python3 -c "import json,sys; print(len(json.load(open('$KDIR/compile_commands.json'))))")" -echo "kernel-c-oracle: compile_commands.json covers $TUS translation units" >&2 - -cat > "$KDIR/rag-rat.toml" <&2 -( cd "$KDIR" && "$RAG_RAT_BIN" index --full >/dev/null ) - -# The scip-clang oracle pass over the compiled subset (stdout = clean JSON report). -echo "kernel-c-oracle: oracle run --tool scip-clang" >&2 -( cd "$KDIR" && "$RAG_RAT_BIN" oracle run --tool scip-clang --json ) > "$WORK/oracle-report.json" - -python3 - "$DB" "$WORK/oracle-report.json" "$TUS" "$BMF_OUT" "$KERNEL_TAG" <<'PY' -import json, sqlite3, sys -db, report_path, tus, bmf_out, tag = sys.argv[1], sys.argv[2], int(sys.argv[3]), sys.argv[4], sys.argv[5] -report = json.load(open(report_path)).get("report", {}) -conn = sqlite3.connect(db) -q = lambda s: conn.execute(s).fetchone()[0] - -# Whole-index heuristic baseline (rag-rat indexes the full tree; the oracle only covers the -# compiled subset, so these two populations differ — reported side by side, honestly). 
-total_calls = q("SELECT COUNT(*) FROM edges WHERE edge_kind='calls_name' AND callee_start_byte IS NOT NULL") -heur_resolved = q("SELECT COUNT(*) FROM edges WHERE edge_kind='calls_name' AND callee_start_byte IS NOT NULL AND to_symbol_id IS NOT NULL") -heur_rate = 100.0 * heur_resolved / total_calls if total_calls else 0.0 - -confirmed = report.get("confirmed", 0) -contradicted = report.get("contradicted", 0) -upgraded = report.get("upgraded", 0) -resolved_external = report.get("resolved_external", 0) -covered = report.get("covered_calls", 0) -oracle_only = report.get("oracle_only_calls", 0) -judged = confirmed + contradicted -precision = 100.0 * confirmed / judged if judged else 0.0 # compiler-confirmed fraction of resolved -recall = 100.0 * covered / (covered + oracle_only) if (covered + oracle_only) else 0.0 - -print(f"\n=== C edge resolution on Linux {tag} (compiled subset: {tus} TUs) ===") -print(f"whole-index heuristic calls_name resolved: {heur_resolved}/{total_calls} ({heur_rate:.1f}%)") -print(f"oracle (compiled subset): confirmed={confirmed} contradicted={contradicted} " - f"upgraded={upgraded} resolved_external={resolved_external}") -print(f"compiler-confirmed precision of heuristic-resolved edges: {precision:.1f}% " - f"(confirm/(confirm+contradict))") -print(f"call recall (oracle-seen calls a calls_name edge covered): {recall:.1f}%") - -# Precision split by edge_kind (#61): the blended `precision` above mixes function calls -# (`calls_name`) with type references (`references_type`) etc. They have very different -# characters — type refs suffer the forward-declaration-vs-definition problem — so the blended -# number under-sells call resolution. Report per-kind precision and surface calls vs types to the -# BMF. 
-print("\n--- compiler precision by edge_kind ---") -print(f"{'edge_kind':<18} {'confirm':>10} {'contra':>10} {'precision':>10}") -kind_prec = {} -for ek, c, x in conn.execute( - "SELECT e.edge_kind, " - "SUM(CASE WHEN o.kind='confirm' THEN 1 ELSE 0 END), " - "SUM(CASE WHEN o.kind='contradict' THEN 1 ELSE 0 END) " - "FROM edge_oracle o JOIN edges e ON e.id = o.edge_id " - "WHERE o.kind IN ('confirm','contradict') " - "GROUP BY e.edge_kind ORDER BY 2 DESC" -).fetchall(): - prec = 100.0 * c / (c + x) if (c + x) else 0.0 - kind_prec[ek] = prec - print(f"{ek or '':<18} {c:>10} {x:>10} {prec:>9.1f}%") -calls_precision = kind_prec.get("calls_name", 0.0) -types_precision = kind_prec.get("references_type", 0.0) - -# Contradiction attribution (#61): which heuristic RESOLUTION PATH produces the disagreements, and -# is the disagreement a same-NAME collision (heuristic bound the right name, wrong definition — -# improvable with linkage / include-scope disambiguation) or a name MISMATCH (call site is a macro -# expansion / function pointer the compiler resolved elsewhere — not syntactically fixable). This is -# the data that decides whether tree-sitter resolution can be pushed further or the oracle is the -# only lever. `edges` is a view exposing `resolution` (the path) and `confidence` (the tier). 
-print("\n--- contradiction attribution by heuristic resolution path ---") -print(f"{'confidence':<10} {'resolution':<22} {'confirm':>9} {'contra':>9} {'precision':>9}") -for conf, res, c, x in conn.execute( - "SELECT e.confidence, e.resolution, " - "SUM(CASE WHEN o.kind='confirm' THEN 1 ELSE 0 END), " - "SUM(CASE WHEN o.kind='contradict' THEN 1 ELSE 0 END) " - "FROM edge_oracle o JOIN edges e ON e.id = o.edge_id " - "WHERE o.kind IN ('confirm','contradict') " - "GROUP BY e.confidence, e.resolution ORDER BY 4 DESC" -).fetchall(): - prec = 100.0 * c / (c + x) if (c + x) else 0.0 - print(f"{conf or '':<10} {res or '':<22} {c:>9} {x:>9} {prec:>8.1f}%") - -same, tot = conn.execute( - "SELECT SUM(CASE WHEN o.scip_symbol LIKE '%'||e.to_name||'%' THEN 1 ELSE 0 END), COUNT(*) " - "FROM edge_oracle o JOIN edges e ON e.id = o.edge_id WHERE o.kind='contradict'" -).fetchone() -print(f"\ncontradictions where the call name appears in the compiler's symbol: {same}/{tot} " - f"({100.0 * same / tot if tot else 0:.1f}%)") -print(" high → same-name collision (improvable: linkage/include scoping); " - "low → macro/fn-pointer (oracle-only)") - -# logical_variant hypothesis (#61): does this path contradict because the heuristic picks a C -# function's prototype DECLARATION (smaller byte span, parsed first) while the compiler resolves to -# the DEFINITION (larger span, has a body) of the SAME path::name? If so, a definition-preference -# tiebreak fixes it. `hs` = heuristic-chosen symbol, `os` = oracle-resolved symbol. 
-lv = conn.execute( - "SELECT " - " SUM(CASE WHEN hs.qualified_name = os.qualified_name THEN 1 ELSE 0 END), " # same file+name - " SUM(CASE WHEN (hs.end_byte-hs.start_byte) < (os.end_byte-os.start_byte) THEN 1 ELSE 0 END), " # heuristic smaller (decl) - " COUNT(*) " - "FROM edge_oracle o JOIN edges e ON e.id=o.edge_id " - "LEFT JOIN symbols hs ON hs.id=e.to_symbol_id " - "LEFT JOIN symbols os ON os.id=o.resolved_symbol_id " - "WHERE o.kind='contradict' AND e.resolution='logical_variant' " - "AND hs.id IS NOT NULL AND os.id IS NOT NULL" -).fetchone() -sq, hsm, lvt = lv -print(f"\n--- logical_variant contradiction shape (n={lvt}) ---") -print(f" heuristic & oracle share qualified_name (same file+name, i.e. decl-vs-def): {sq}/{lvt} " - f"({100.0*sq/lvt if lvt else 0:.1f}%)") -print(f" heuristic span < oracle span (heuristic picked the smaller = declaration): {hsm}/{lvt} " - f"({100.0*hsm/lvt if lvt else 0:.1f}%)") -print(" both high → definition-preference tiebreak among same-path::name candidates is the fix") -print(" sample rows (call | heuristic name@span | oracle name@span):") -for tn, hn, hsp, on, osp in conn.execute( - "SELECT e.to_name, hs.name, hs.end_byte-hs.start_byte, os.name, os.end_byte-os.start_byte " - "FROM edge_oracle o JOIN edges e ON e.id=o.edge_id " - "JOIN symbols hs ON hs.id=e.to_symbol_id JOIN symbols os ON os.id=o.resolved_symbol_id " - "WHERE o.kind='contradict' AND e.resolution='logical_variant' LIMIT 12" -).fetchall(): - print(f" {tn:<24} {hn or '?'}@{hsp} -> {on or '?'}@{osp}") - -bmf = {f"linux-kernel-{tag}/c-oracle": { - "compiled_tus": {"value": tus}, - "heuristic_resolved_rate": {"value": heur_rate}, - "compiler_precision": {"value": precision}, - "compiler_precision_calls": {"value": calls_precision}, - "compiler_precision_types": {"value": types_precision}, - "call_recall": {"value": recall}, - "confirmed": {"value": confirmed}, - "contradicted": {"value": contradicted}, - "upgraded": {"value": upgraded}, - "resolved_external": 
{"value": resolved_external}, -}} -json.dump(bmf, open(bmf_out, "w"), indent=2) -print(f"wrote BMF -> {bmf_out}") -PY - -# Free the multi-GB kernel checkout + index DB (they accumulate per run on the self-hosted box); -# keep the small oracle-report.json in WORK for artifact upload and the BMF at $BMF_OUT. -rm -rf "$KDIR" "$DB" "$DB"-wal "$DB"-shm -echo "kernel-c-oracle: done (report: $WORK/oracle-report.json, BMF: $BMF_OUT)" >&2 diff --git a/tools/oracle-corpus.py b/tools/oracle-corpus.py new file mode 100755 index 0000000..962f0e2 --- /dev/null +++ b/tools/oracle-corpus.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""Read fields out of tools/oracle-corpora.toml for the shell oracle runner. + +The runner (tools/oracle-run.sh) is bash, which can't parse TOML; this is the thin reader it shells +out to. `rag-rat oracle report` consumes the corpus profile (tool, bindings, health) from the same +file directly — this helper only surfaces the few fields the runner needs *before* indexing: the +repo + rev to clone, the prepare steps to run, and the bindings to render into the corpus's +rag-rat.toml. It also lists a tier's corpus ids so a CI matrix can fan out over them. + +Pure stdlib (tomllib, 3.11+). No third-party deps so it runs on a bare CI runner. 
+ +Usage: + oracle-corpus.py --list-tier small # corpus ids in a tier, one per line + oracle-corpus.py --corpus py-requests --field repo # a scalar field + oracle-corpus.py --corpus py-requests --field prepare # one prepare command per line + oracle-corpus.py --corpus py-requests --field bindings_toml # rag-rat.toml [target_bindings] body +""" + +from __future__ import annotations + +import argparse +import sys +import tomllib + +SCALAR_FIELDS = ("repo", "rev", "tool", "tier") + + +def load_corpora(path: str) -> list[dict]: + with open(path, "rb") as fh: + data = tomllib.load(fh) + corpora = data.get("corpus", []) + if not corpora: + sys.exit(f"oracle-corpus: {path} has no [[corpus]] entries (wrong table name?)") + return corpora + + +def find(corpora: list[dict], corpus_id: str) -> dict: + for corpus in corpora: + if corpus.get("corpus_id") == corpus_id: + return corpus + sys.exit(f"oracle-corpus: no corpus '{corpus_id}' in the corpora file") + + +def emit_field(corpus: dict, field: str) -> None: + if field in SCALAR_FIELDS: + print(corpus[field]) + elif field == "timeout_minutes": + print(corpus["health"]["timeout_minutes"]) + elif field == "prepare": + # One command per line; an empty prepare list prints nothing (the runner loops over zero + # lines). Commands may contain spaces — the runner reads them line-by-line, not word-split. + for command in corpus.get("prepare", []): + print(command) + elif field == "bindings_toml": + # Render the `[target_bindings]` body for the corpus's rag-rat.toml: one `lang = ["dir", …]` + # line per language. TOML string-quote each directory so paths with spaces survive. 
+ for lang, dirs in corpus["bindings"].items(): + quoted = ", ".join('"' + d + '"' for d in dirs) + print(f"{lang} = [{quoted}]") + else: + sys.exit(f"oracle-corpus: unknown field '{field}'") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--corpora", default="tools/oracle-corpora.toml") + parser.add_argument("--corpus") + parser.add_argument("--field") + parser.add_argument("--list-tier") + args = parser.parse_args() + + corpora = load_corpora(args.corpora) + + if args.list_tier: + for corpus in corpora: + if corpus.get("tier") == args.list_tier: + print(corpus["corpus_id"]) + return + + if not args.corpus or not args.field: + parser.error("either --list-tier, or both --corpus and --field, are required") + + emit_field(find(corpora, args.corpus), args.field) + + +if __name__ == "__main__": + main() diff --git a/tools/oracle-report-bmf.py b/tools/oracle-report-bmf.py new file mode 100755 index 0000000..113d067 --- /dev/null +++ b/tools/oracle-report-bmf.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Convert a C2 oracle resolution report (JSON) into Bencher Metric Format (BMF). + +rag-rat emits the typed `OracleResolutionReport` as JSON only; turning it into the shape a consumer +wants is a glue concern (see the output-rendering decision). This is the Bencher-headline glue for +the heavy tier: one BMF benchmark `CORPUS_ID/oracle` carrying the resolution rates + oracle +verdict metrics, so the release run tracks how compiler-grade resolution moves over time. + +(The PR Δ-table glue is a separate script, tools/oracle-report-md.py, C5.) + +Usage: + oracle-report-bmf.py REPORT.json [REPORT.json ...] > bmf.json + +Each report becomes one `CORPUS_ID/oracle` benchmark; multiple reports merge into one BMF object, +so a single Bencher upload can carry every heavy corpus from one run. 
+""" + +from __future__ import annotations + +import json +import sys + + +def benchmark(report: dict) -> tuple[str, dict]: + resolution = report["resolution"] + total = resolution["total_edges"] + metrics = report.get("metrics", {}) + + def rate(numerator: int) -> float: + # Mirror the engine's vacuous-1.0 convention for an empty denominator. + return 100.0 if total == 0 else 100.0 * numerator / total + + name = f"{report['corpus_id']}/oracle" + body = { + "total_edges": {"value": resolution["total_edges"]}, + "resolved_rate_before": {"value": rate(resolution["resolved_before"])}, + "resolved_rate_after": {"value": rate(resolution["resolved_after"])}, + "precision": {"value": 100.0 * metrics.get("precision", 0.0)}, + "recall": {"value": 100.0 * metrics.get("recall", 0.0)}, + "name_only_recovery": {"value": 100.0 * metrics.get("name_only_recovery_rate", 0.0)}, + "confirmed": {"value": report.get("confirmed", 0)}, + "contradicted": {"value": report.get("contradicted", 0)}, + "upgraded": {"value": report.get("upgraded", 0)}, + "resolved_external": {"value": report.get("resolved_external", 0)}, + "symbols_with_moniker": {"value": report.get("symbols_with_moniker", 0)}, + } + return name, body + + +def main() -> None: + if len(sys.argv) < 2: + sys.exit("usage: oracle-report-bmf.py [ ...]") + bmf: dict[str, dict] = {} + for path in sys.argv[1:]: + with open(path) as fh: + report = json.load(fh) + name, body = benchmark(report) + bmf[name] = body + json.dump(bmf, sys.stdout, indent=2) + sys.stdout.write("\n") + + +if __name__ == "__main__": + main() diff --git a/tools/oracle-run.sh b/tools/oracle-run.sh new file mode 100755 index 0000000..fed4e1b --- /dev/null +++ b/tools/oracle-run.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# tools/oracle-run.sh — run the SCIP oracle for ONE declared corpus end to end and emit its C2 +# resolution report JSON (#164, C3). 
The single, tier-agnostic runner that replaces the per-language +# rust-scip-oracle.sh / kernel-c-oracle.sh demos: it reads the corpus profile from +# tools/oracle-corpora.toml, clones the repo at its pinned rev, runs the corpus's prepare steps, +# indexes it with rag-rat, then runs `rag-rat oracle report`. +# +# `oracle report` is what runs the oracle (produces the .scip with the corpus's tool, joins it, +# assembles the typed report) AND applies the per-corpus health gate — so a corpus whose run falls +# outside its thresholds makes this script exit non-zero even though every step "succeeded". The +# report JSON is still written in that case, so a Δ glue script (tools/oracle-report-md.py, C5) can +# consume it. rag-rat emits JSON only; markdown/Bencher formatting is a glue concern. +# +# Env: +# CORPUS (required) corpus_id from the corpora file +# CORPORA corpora file (default: oracle-corpora.toml next to this script, i.e. tools/oracle-corpora.toml) +# RAG_RAT_BIN rag-rat binary (default: target/release/rag-rat) +# ORACLE_WORK working dir (default: a fresh mktemp dir) +# REPORT_OUT report JSON output path (default: $ORACLE_WORK/$CORPUS-report.json) +# RAG_RAT_COMMIT provenance stamp baked into the report (default: this repo's HEAD) +# KEEP_CHECKOUT set to 1 to keep the corpus checkout + index DB (default: removed) +set -euo pipefail + +CORPUS="${CORPUS:?set CORPUS to a corpus_id from the corpora file}" + +# Resolve everything to ABSOLUTE paths while the CWD is still this repo, before any cd into the +# corpus tree (a relative path or `command -v` result breaks once we cd away). 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +HELPER="$SCRIPT_DIR/oracle-corpus.py" +CORPORA="${CORPORA:-$SCRIPT_DIR/oracle-corpora.toml}" +CORPORA="$(readlink -f "$CORPORA")" +RAG_RAT_BIN="${RAG_RAT_BIN:-target/release/rag-rat}" +RAG_RAT_BIN="$(command -v "$RAG_RAT_BIN" || echo "$RAG_RAT_BIN")" +RAG_RAT_BIN="$(readlink -f "$RAG_RAT_BIN")" +RAG_RAT_COMMIT="${RAG_RAT_COMMIT:-$(git rev-parse HEAD 2>/dev/null || echo unknown)}" +WORK="${ORACLE_WORK:-$(mktemp -d)}" +mkdir -p "$WORK" +REPORT_OUT="${REPORT_OUT:-$WORK/$CORPUS-report.json}" +REPORT_OUT="$(readlink -f "$REPORT_OUT" 2>/dev/null || echo "$PWD/$REPORT_OUT")" + +[ -x "$RAG_RAT_BIN" ] || { echo "oracle-run: rag-rat not found at '$RAG_RAT_BIN'" >&2; exit 1; } +[ -f "$CORPORA" ] || { echo "oracle-run: corpora file not found at '$CORPORA'" >&2; exit 1; } + +field() { python3 "$HELPER" --corpora "$CORPORA" --corpus "$CORPUS" --field "$1"; } + +REPO="$(field repo)" +REV="$(field rev)" +TOOL="$(field tool)" +TIMEOUT_MINUTES="$(field timeout_minutes)" +CHECKOUT="$WORK/checkout" +DB="$WORK/$CORPUS-index.sqlite" + +echo "oracle-run: corpus=$CORPUS tool=$TOOL repo=$REPO rev=$REV" >&2 + +# Shallow-clone the pinned rev. A tag or branch name resolves via `fetch `; a full SHA fetches +# directly. `--depth 1` keeps the corpus small — the oracle never needs history. +echo "oracle-run: cloning $REPO @ $REV (shallow)" >&2 +git init -q "$CHECKOUT" +git -C "$CHECKOUT" remote add origin "$REPO" +git -C "$CHECKOUT" -c protocol.version=2 fetch -q --depth 1 origin "$REV" +git -C "$CHECKOUT" checkout -q FETCH_HEAD + +# Corpus prepare steps (cargo fetch, cmake compdb, venv install, kernel build, …). Each line is one +# shell command run in the checkout root. A failing prepare step aborts the run (set -e through the +# subshell) — a broken environment must not be reported as a clean resolution number. 
+while IFS= read -r prepare_cmd; do + [ -n "$prepare_cmd" ] || continue + echo "oracle-run: prepare> $prepare_cmd" >&2 + ( cd "$CHECKOUT" && bash -c "$prepare_cmd" ) +done < <(field prepare) + +# Render the corpus's rag-rat.toml: index the checkout into $DB with the declared per-language +# bindings. The oracle report reads the SAME bindings from the corpora file for provenance; this +# file is what `rag-rat index` walks. +{ + echo "[index]" + echo "root = \"$CHECKOUT\"" + echo "database = \"$DB\"" + echo + echo "[target_bindings]" + field bindings_toml +} > "$CHECKOUT/rag-rat.toml" + +echo "oracle-run: rag-rat index --full" >&2 +( cd "$CHECKOUT" && "$RAG_RAT_BIN" index --full >/dev/null ) + +# `oracle report` runs the oracle + assembles the typed report + applies the health gate. Wrap it in +# the corpus's wall-clock budget; a timeout (exit 124) is a failure like any health violation. Keep +# its exit code so the caller (CI) sees the gate result, but always run cleanup + always leave the +# report JSON behind (it's written before the gate fails). +echo "oracle-run: oracle report --corpus $CORPUS (timeout ${TIMEOUT_MINUTES}m)" >&2 +set +e +( cd "$CHECKOUT" && RAG_RAT_COMMIT="$RAG_RAT_COMMIT" \ + timeout "${TIMEOUT_MINUTES}m" \ + "$RAG_RAT_BIN" --json oracle report --corpus "$CORPUS" --corpora "$CORPORA" ) > "$REPORT_OUT" +rc=$? 
+set -e + +if [ "${KEEP_CHECKOUT:-0}" != "1" ]; then + rm -rf "$CHECKOUT" "$DB" "$DB"-wal "$DB"-shm +fi + +if [ "$rc" -eq 0 ]; then + echo "oracle-run: done (healthy) — report: $REPORT_OUT" >&2 +elif [ "$rc" -eq 124 ]; then + echo "oracle-run: TIMED OUT after ${TIMEOUT_MINUTES}m — report: $REPORT_OUT" >&2 +else + echo "oracle-run: health gate FAILED (exit $rc) — report: $REPORT_OUT" >&2 +fi +exit "$rc" diff --git a/tools/rust-scip-oracle.sh b/tools/rust-scip-oracle.sh deleted file mode 100755 index a35b083..0000000 --- a/tools/rust-scip-oracle.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env bash -# Present Rust edge resolution via the rust-analyzer SCIP oracle (#61), repeatably. -# -# The Rust mirror of tools/kernel-c-oracle.sh. Simpler than the C path: rust-analyzer analyzes the -# WHOLE Cargo workspace (no per-TU compilation database, no compiled subset), so the resolution -# metrics cover the entire indexed crate — not a configured subset. Steps: fetch the pinned corpus, -# `rag-rat index --full`, `oracle run --tool rust-analyzer`, emit the heuristic-vs-compiler delta as -# a BMF for Bencher (benchmark `rust-cargo-/rust-oracle`). -# -# Corpus is rust-lang/cargo pinned by SHA — the SAME snapshot the iai/criterion benches use -# (benches/shared/mod.rs), so all bench corpora stay consistent. 
-# -# Env: -# CARGO_TAG / CARGO_SHA pinned corpus (default 0.97.1 / fc1044d6…, matches benches/shared/mod.rs) -# RAG_RAT_BIN release binary (default: target/release/rag-rat) -# RUST_WORK working dir (default: a fresh mktemp dir) -# BMF_OUT Bencher Metric Format output path (default: rust_scip_oracle_bmf.json) -set -euo pipefail - -CARGO_TAG="${CARGO_TAG:-0.97.1}" -CARGO_SHA="${CARGO_SHA:-fc1044d6129608b3a3188566a919dc6126f7cb15}" -RAG_RAT_BIN="${RAG_RAT_BIN:-target/release/rag-rat}" -WORK="${RUST_WORK:-$(mktemp -d)}" -BMF_OUT="${BMF_OUT:-rust_scip_oracle_bmf.json}" -# Resolve to an ABSOLUTE path before any cd — `command -v` can return a relative path, which breaks -# once the script cd's into the corpus tree, so always canonicalize the result. -RAG_RAT_BIN="$(command -v "$RAG_RAT_BIN" || echo "$RAG_RAT_BIN")" -RAG_RAT_BIN="$(readlink -f "$RAG_RAT_BIN")" -BMF_OUT="$(readlink -f "$BMF_OUT" 2>/dev/null || echo "$PWD/$BMF_OUT")" -mkdir -p "$WORK" -DB="$WORK/rust-index.sqlite" -RDIR="$WORK/cargo" - -[ -x "$RAG_RAT_BIN" ] || { echo "rust-scip-oracle: rag-rat not found at '$RAG_RAT_BIN'" >&2; exit 1; } -command -v rust-analyzer >/dev/null 2>&1 || { - echo "rust-scip-oracle: rust-analyzer not on PATH (install from rust-lang/rust-analyzer releases)" >&2 - exit 1 -} - -echo "rust-scip-oracle: fetching rust-lang/cargo ${CARGO_TAG} (${CARGO_SHA}, shallow)" >&2 -git init -q "$RDIR" -git -C "$RDIR" remote add origin https://github.com/rust-lang/cargo.git -git -C "$RDIR" -c protocol.version=2 fetch -q --depth 1 origin "$CARGO_SHA" -git -C "$RDIR" checkout -q "$CARGO_SHA" - -# rust-analyzer loads the workspace via `cargo metadata`; pre-fetch the dependency graph so the SCIP -# pass doesn't race a cold registry. Non-fatal: rust-analyzer still resolves in-workspace symbols -# even if a dep can't be fetched. 
-echo "rust-scip-oracle: cargo fetch (warm the dep graph for rust-analyzer)" >&2 -( cd "$RDIR" && cargo fetch -q 2>/dev/null ) || true - -cat > "$RDIR/rag-rat.toml" <&2 -( cd "$RDIR" && "$RAG_RAT_BIN" index --full >/dev/null ) - -# The rust-analyzer oracle pass over the whole workspace (stdout = clean JSON report). -echo "rust-scip-oracle: oracle run --tool rust-analyzer" >&2 -( cd "$RDIR" && "$RAG_RAT_BIN" oracle run --tool rust-analyzer --json ) > "$WORK/oracle-report.json" - -python3 - "$DB" "$WORK/oracle-report.json" "$BMF_OUT" "$CARGO_TAG" <<'PY' -import json, sqlite3, sys -db, report_path, bmf_out, tag = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] -report = json.load(open(report_path)).get("report", {}) -conn = sqlite3.connect(db) -q = lambda s: conn.execute(s).fetchone()[0] - -rust_files = q("SELECT COUNT(*) FROM files WHERE language='rust'") -total_calls = q("SELECT COUNT(*) FROM edges WHERE edge_kind='calls_name' AND callee_start_byte IS NOT NULL") -heur_resolved = q("SELECT COUNT(*) FROM edges WHERE edge_kind='calls_name' AND callee_start_byte IS NOT NULL AND to_symbol_id IS NOT NULL") -heur_rate = 100.0 * heur_resolved / total_calls if total_calls else 0.0 - -confirmed = report.get("confirmed", 0) -contradicted = report.get("contradicted", 0) -upgraded = report.get("upgraded", 0) -resolved_external = report.get("resolved_external", 0) -covered = report.get("covered_calls", 0) -oracle_only = report.get("oracle_only_calls", 0) -judged = confirmed + contradicted -precision = 100.0 * confirmed / judged if judged else 0.0 # compiler-confirmed fraction of resolved -recall = 100.0 * covered / (covered + oracle_only) if (covered + oracle_only) else 0.0 - -print(f"\n=== Rust edge resolution on rust-lang/cargo {tag} ({rust_files} .rs files) ===") -print(f"heuristic calls_name resolved: {heur_resolved}/{total_calls} ({heur_rate:.1f}%)") -print(f"oracle: confirmed={confirmed} contradicted={contradicted} " - f"upgraded={upgraded} 
resolved_external={resolved_external}") -print(f"compiler-confirmed precision of heuristic-resolved edges: {precision:.1f}% " - f"(confirm/(confirm+contradict))") -print(f"call recall (oracle-seen calls a calls_name edge covered): {recall:.1f}%") - -# Precision split by edge_kind (#61): separate function calls (`calls_name`) from type references -# (`references_type`) so the blended number doesn't hide per-kind differences (the C oracle showed -# 85% calls vs 18% types). Surface calls vs types to the BMF. -print("\n--- compiler precision by edge_kind ---") -print(f"{'edge_kind':<18} {'confirm':>10} {'contra':>10} {'precision':>10}") -kind_prec = {} -for ek, c, x in conn.execute( - "SELECT e.edge_kind, " - "SUM(CASE WHEN o.kind='confirm' THEN 1 ELSE 0 END), " - "SUM(CASE WHEN o.kind='contradict' THEN 1 ELSE 0 END) " - "FROM edge_oracle o JOIN edges e ON e.id = o.edge_id " - "WHERE o.kind IN ('confirm','contradict') " - "GROUP BY e.edge_kind ORDER BY 2 DESC" -).fetchall(): - prec = 100.0 * c / (c + x) if (c + x) else 0.0 - kind_prec[ek] = prec - print(f"{ek or '':<18} {c:>10} {x:>10} {prec:>9.1f}%") -calls_precision = kind_prec.get("calls_name", 0.0) -types_precision = kind_prec.get("references_type", 0.0) - -bmf = {f"rust-cargo-{tag}/rust-oracle": { - "rust_files": {"value": rust_files}, - "heuristic_resolved_rate": {"value": heur_rate}, - "compiler_precision": {"value": precision}, - "compiler_precision_calls": {"value": calls_precision}, - "compiler_precision_types": {"value": types_precision}, - "call_recall": {"value": recall}, - "confirmed": {"value": confirmed}, - "contradicted": {"value": contradicted}, - "upgraded": {"value": upgraded}, - "resolved_external": {"value": resolved_external}, -}} -json.dump(bmf, open(bmf_out, "w"), indent=2) -print(f"wrote BMF -> {bmf_out}") -PY - -# Free the corpus checkout + index DB (they accumulate per run on the self-hosted box); keep the -# small oracle-report.json in WORK for artifact upload and the BMF at $BMF_OUT. 
-rm -rf "$RDIR" "$DB" "$DB"-wal "$DB"-shm -echo "rust-scip-oracle: done (report: $WORK/oracle-report.json, BMF: $BMF_OUT)" >&2 From e9226e3c2e0c1bcb048f85616f58989e940d59cb Mon Sep 17 00:00:00 2001 From: Kristaps Karlsons Date: Tue, 16 Jun 2026 00:10:02 +0300 Subject: [PATCH 2/2] fix(oracle): address Codex review on the unified runner (#177) Six P2s: - Whole-run timeout (oracle-run.sh): the corpus wall-clock budget wrapped only `oracle report`, so a hung clone/prepare/index sat until the Actions/job timeout. The runner now re-execs itself once under `timeout -k 60s ${TIMEOUT_MINUTES}m`, covering clone + prepare + index + report; an EXIT trap still removes the checkout on a timeout/gate-fail. - Preserve the virtualenv (oracle-run.sh): activate a prepare-created `.venv` (VIRTUAL_ENV + PATH) before index/report so scip-python (pyright) resolves against the project's installed deps, not the global interpreter. - Bencher series identity (oracle-report-bmf.py): the benchmark name was keyed only by corpus_id, so a profile/tool-version change (which makes reports incomparable) appended to the old series. Now keyed by corpus_id@PROFILE_HASH_PREFIX+TOOL_VERSION so an incomparable change starts a fresh series. - Bound kernel make (oracle-corpora.toml): `make -j` is unlimited jobs; pinned to `make -j$(nproc)` so the heavy run doesn't oversubscribe the box. Recomputed the golden linux-kernel profile hash. - Trigger on dep changes (oracle.yml): added root Cargo.toml/Cargo.lock to the PR + push path filters so a workspace-dep bump can't skip the gate. - Pin rust-analyzer (oracle.yml): install it as a rustup component (pinned to the stable toolchain) instead of downloading releases/latest each run. 
--- .github/workflows/oracle.yml | 25 +++++++--- .../rag-rat-core/src/index/oracle/corpus.rs | 2 +- tools/oracle-corpora.toml | 8 +-- tools/oracle-report-bmf.py | 16 +++++- tools/oracle-run.sh | 49 ++++++++++++++----- 5 files changed, 77 insertions(+), 23 deletions(-) diff --git a/.github/workflows/oracle.yml b/.github/workflows/oracle.yml index daa6c92..e0a0e4d 100644 --- a/.github/workflows/oracle.yml +++ b/.github/workflows/oracle.yml @@ -16,6 +16,11 @@ on: pull_request: paths: - 'crates/**' + # Root Cargo.toml/Cargo.lock change the built rag-rat binary (workspace deps / pinned + # versions) without touching crates/** — gate on them too so a dependency bump can't merge + # parser/oracle behaviour changes past the resolution health gate. + - 'Cargo.toml' + - 'Cargo.lock' - 'tools/oracle-corpora.toml' - 'tools/oracle-corpus.py' - 'tools/oracle-run.sh' @@ -25,6 +30,8 @@ on: branches: [main] paths: - 'crates/**' + - 'Cargo.toml' + - 'Cargo.lock' - 'tools/oracle-corpora.toml' - 'tools/oracle-corpus.py' - 'tools/oracle-run.sh' @@ -48,11 +55,12 @@ concurrency: cancel-in-progress: true env: - # Pinned SCIP toolchain for the small tier, mirroring tools/bench.Containerfile so a PR number is - # produced by the same indexer build the heavy/Bencher tier uses. + # Pinned SCIP toolchain for the small tier so a PR number isn't perturbed by an unrelated indexer + # release. scip-clang/scip-python are pinned to explicit versions; rust-analyzer is installed as a + # rustup component (below), pinning it to the stable toolchain (a ~6-week cadence) instead of the + # weekly `releases/latest` — so a fresh RA build can't change `rust-semver`'s numbers mid-PR. 
SCIP_CLANG_VERSION: v0.4.0 SCIP_PYTHON_VERSION: 0.6.6 - RUST_ANALYZER_URL: https://github.com/rust-lang/rust-analyzer/releases/latest/download/rust-analyzer-x86_64-unknown-linux-gnu.gz jobs: matrix: @@ -89,6 +97,10 @@ jobs: steps: - uses: actions/checkout@v5 - uses: dtolnay/rust-toolchain@stable + # rust-analyzer as a toolchain component pins the SCIP emitter to the stable release (not the + # weekly `releases/latest`), so the `rust-semver` leg's numbers don't shift under the PR. + with: + components: rust-analyzer - uses: Swatinem/rust-cache@v2 - name: Build rag-rat (release, hash embedder — no model download) @@ -101,9 +113,10 @@ jobs: echo "installing $tool for ${{ matrix.corpus }}" case "$tool" in rust-analyzer) - curl --proto '=https' --tlsv1.2 -sSfL "$RUST_ANALYZER_URL" \ - | gunzip > /usr/local/bin/rust-analyzer - chmod +x /usr/local/bin/rust-analyzer + # Already installed as a rustup component (toolchain step). Resolve the proxy to an + # absolute path on PATH so the oracle's `rust-analyzer scip` probe finds it. 
+ ra="$(rustup which --toolchain stable rust-analyzer)" + install -m 0755 "$ra" /usr/local/bin/rust-analyzer rust-analyzer --version ;; scip-clang) curl --proto '=https' --tlsv1.2 -sSfL \ diff --git a/crates/rag-rat-core/src/index/oracle/corpus.rs b/crates/rag-rat-core/src/index/oracle/corpus.rs index 53b3d99..885b864 100644 --- a/crates/rag-rat-core/src/index/oracle/corpus.rs +++ b/crates/rag-rat-core/src/index/oracle/corpus.rs @@ -194,7 +194,7 @@ health = { expected_min_heuristic_edges = 50000, expected_min_oracle_examined const GOLDEN_RUST_CARGO: &str = "60452736340151a253001bb5c33cc83efa2a4ceabba4d42a227d3188d7761d79"; const GOLDEN_LINUX_KERNEL: &str = - "abf87f3dca38d79ad6239348659c13ca484c70f2197fd36e3a7e1f97e27165ff"; + "9b64c26095bbf672884e8ca3c8d93bab44444f702904a0c8b35cd9feccd80fb6"; fn report_with( total_edges: u64, diff --git a/tools/oracle-corpora.toml b/tools/oracle-corpora.toml index 8557203..e659ef9 100644 --- a/tools/oracle-corpora.toml +++ b/tools/oracle-corpora.toml @@ -61,9 +61,9 @@ tier = "heavy" repo = "https://github.com/torvalds/linux" rev = "v7.0" tool = "scip-clang" -# The compile_commands.json target reads per-object `.cmd` files, so the kernel must be BUILT first -# (mirrors tools/kernel-c-oracle.sh). Without the build the compdb is empty and scip-clang sees -# nothing. -prepare = ["make defconfig", "make -j", "make compile_commands.json"] +# The compile_commands.json target reads per-object `.cmd` files, so the kernel must be BUILT first. +# Without the build the compdb is empty and scip-clang sees nothing. `-j$(nproc)` bounds the build to +# the core count — a bare `make -j` is UNLIMITED jobs and oversubscribes the box's CPU/RAM (#177). 
+prepare = ["make defconfig", "make -j$(nproc)", "make compile_commands.json"] bindings = { c = ["."] } health = { expected_min_heuristic_edges = 50000, expected_min_oracle_examined = 5000, expected_max_skipped_drifted = 0, expected_min_symbols_with_moniker = 1000, timeout_minutes = 120 } diff --git a/tools/oracle-report-bmf.py b/tools/oracle-report-bmf.py index 113d067..74d0fd1 100755 --- a/tools/oracle-report-bmf.py +++ b/tools/oracle-report-bmf.py @@ -18,9 +18,15 @@ from __future__ import annotations import json +import re import sys +def _slug(value: str) -> str: + """Make a tool-version string safe + compact for a benchmark name (drop spaces/punctuation).""" + return re.sub(r"[^A-Za-z0-9._-]+", "_", value).strip("_") + + def benchmark(report: dict) -> tuple[str, dict]: resolution = report["resolution"] total = resolution["total_edges"] @@ -30,7 +36,15 @@ def rate(numerator: int) -> float: # Mirror the engine's vacuous-1.0 convention for an empty denominator. return 100.0 if total == 0 else 100.0 * numerator / total - name = f"{report['corpus_id']}/oracle" + # Key the Bencher series by the FULL comparability identity, not just corpus_id (Codex on #177): + # a profile change (rev/bindings/prepare/threshold/tool) bumps `corpus_profile_hash`, and a SCIP + # indexer bump changes `tool_version` — both make reports incomparable (`comparable_to`), so each + # must start a FRESH series rather than silently append to the old one. Embedding the hash prefix + # + tool version in the benchmark name forces exactly that. 
+ name = ( + f"{report['corpus_id']}@{report['corpus_profile_hash'][:12]}" + f"+{_slug(report['tool_version'])}/oracle" + ) body = { "total_edges": {"value": resolution["total_edges"]}, "resolved_rate_before": {"value": rate(resolution["resolved_before"])}, diff --git a/tools/oracle-run.sh b/tools/oracle-run.sh index fed4e1b..43ea265 100755 --- a/tools/oracle-run.sh +++ b/tools/oracle-run.sh @@ -50,7 +50,29 @@ TIMEOUT_MINUTES="$(field timeout_minutes)" CHECKOUT="$WORK/checkout" DB="$WORK/$CORPUS-index.sqlite" -echo "oracle-run: corpus=$CORPUS tool=$TOOL repo=$REPO rev=$REV" >&2 +# Enforce the corpus's wall-clock budget over the WHOLE run — clone + prepare (cargo fetch / a kernel +# build) + index + report — not just the report step (Codex on #177). Re-exec the script once under +# `timeout` after resolving the budget; the guard env var stops a second wrap, and the resolved paths +# are exported so the re-exec reuses the same work dir / binary rather than re-deriving them. `-k` +# escalates to SIGKILL if a hung tool ignores SIGTERM. A timeout exits 124 (a failure like any gate +# violation); the EXIT trap below still removes the checkout. +if [ -z "${ORACLE_RUN_TIMED:-}" ]; then + export ORACLE_RUN_TIMED=1 ORACLE_WORK="$WORK" CORPORA RAG_RAT_BIN RAG_RAT_COMMIT REPORT_OUT + export CORPUS KEEP_CHECKOUT="${KEEP_CHECKOUT:-0}" + exec timeout -k 60s "${TIMEOUT_MINUTES}m" "$0" +fi + +# Inside the timed re-exec. Remove the checkout + index DB on ANY exit (gate failure, timeout, error) +# while leaving the report JSON; idempotent, so the trap firing on both a signal and the final exit +# is harmless. +cleanup_checkout() { + if [ "${KEEP_CHECKOUT:-0}" != "1" ]; then + rm -rf "$CHECKOUT" "$DB" "$DB"-wal "$DB"-shm + fi +} +trap cleanup_checkout EXIT + +echo "oracle-run: corpus=$CORPUS tool=$TOOL repo=$REPO rev=$REV (budget ${TIMEOUT_MINUTES}m)" >&2 # Shallow-clone the pinned rev. A tag or branch name resolves via `fetch `; a full SHA fetches # directly. 
`--depth 1` keeps the corpus small — the oracle never needs history. @@ -69,6 +91,16 @@ while IFS= read -r prepare_cmd; do ( cd "$CHECKOUT" && bash -c "$prepare_cmd" ) done < <(field prepare) +# Activate a virtualenv the prepare steps created, so the oracle's indexer subprocess resolves +# against the project's installed deps rather than the global interpreter (Codex on #177). scip-python +# (pyright) finds dependency monikers only when its `python` is the venv's — prepare runs in child +# shells whose activation doesn't survive, so the runner re-establishes it for the index/report steps. +if [ -d "$CHECKOUT/.venv/bin" ]; then + echo "oracle-run: activating $CHECKOUT/.venv" >&2 + export VIRTUAL_ENV="$CHECKOUT/.venv" + export PATH="$CHECKOUT/.venv/bin:$PATH" +fi + # Render the corpus's rag-rat.toml: index the checkout into $DB with the declared per-language # bindings. The oracle report reads the SAME bindings from the corpora file for provenance; this # file is what `rag-rat index` walks. @@ -84,22 +116,17 @@ done < <(field prepare) echo "oracle-run: rag-rat index --full" >&2 ( cd "$CHECKOUT" && "$RAG_RAT_BIN" index --full >/dev/null ) -# `oracle report` runs the oracle + assembles the typed report + applies the health gate. Wrap it in -# the corpus's wall-clock budget; a timeout (exit 124) is a failure like any health violation. Keep -# its exit code so the caller (CI) sees the gate result, but always run cleanup + always leave the -# report JSON behind (it's written before the gate fails). -echo "oracle-run: oracle report --corpus $CORPUS (timeout ${TIMEOUT_MINUTES}m)" >&2 +# `oracle report` runs the oracle + assembles the typed report + applies the health gate. The whole +# run is already inside the corpus wall-clock budget (the re-exec `timeout` above). Keep its exit +# code so the caller (CI) sees the gate result; the report JSON is always written (it's emitted +# before the gate fails), and the EXIT trap removes the checkout. 
+echo "oracle-run: oracle report --corpus $CORPUS" >&2 set +e ( cd "$CHECKOUT" && RAG_RAT_COMMIT="$RAG_RAT_COMMIT" \ - timeout "${TIMEOUT_MINUTES}m" \ "$RAG_RAT_BIN" --json oracle report --corpus "$CORPUS" --corpora "$CORPORA" ) > "$REPORT_OUT" rc=$? set -e -if [ "${KEEP_CHECKOUT:-0}" != "1" ]; then - rm -rf "$CHECKOUT" "$DB" "$DB"-wal "$DB"-shm -fi - if [ "$rc" -eq 0 ]; then echo "oracle-run: done (healthy) — report: $REPORT_OUT" >&2 elif [ "$rc" -eq 124 ]; then