AppSprout-dev · jkbennitt · Jun 10, 2026 · Jun 10, 2026
diff --git a/results/benchmark_history.jsonl b/results/benchmark_history.jsonl
@@ -4,3 +4,4 @@
 {"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]}
 {"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]}
 {"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, "ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.7536, "outcome": "timeout", "ticks": 10}]}
+{"timestamp": "2026-06-10T04:12:15.578574+00:00", "scoring_version": "1.1", "git_commit": "98ee8e2", "git_branch": "fix/run-integrity", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8594, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 19000, "total_tokens": 19028, "estimated_cost_usd": 0.95028, "wall_time_s": 151.53, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 96, "errors_by_type": {}, "avg_deliberation_ms": 24304.49, "action_success_rate": 0.7857, "total_tokens": 19028, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8594, "outcome": "timeout", "ticks": 2}]}
diff --git a/scripts/calibrate_baseline.py b/scripts/calibrate_baseline.py
@@ -0,0 +1,115 @@
+"""Phase B1: calibrate a scenario's no-agent baseline.
+
+Runs N seeded --no-agent --until-death runs against the live game, then
+aggregates them into a pinned <scenario>.baseline.json sidecar next to the
+scenario YAML. The baseline is an immutable scenario property — re-run this
+only when the scenario's save or SCORING_VERSION changes.
+
+Usage:
+    python scripts/calibrate_baseline.py crashlanded --seeds 42 43 44 45
+"""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from rle.scenarios.loader import baseline_path, load_baseline, load_scenario
+from rle.scoring.baseline import aggregate_baseline, read_run
+from rle.tracking.metadata import SCORING_VERSION, collect_metadata
+
+DEFINITIONS_DIR = (
+    Path(__file__).parent.parent / "src" / "rle" / "scenarios" / "definitions"
+)
+
+
+def _find_scenario_path(query: str) -> Path:
+    for path in sorted(DEFINITIONS_DIR.glob("*.yaml")):
+        if path.stem.startswith(query) or query in path.stem:
+            return path
+    raise SystemExit(f"No scenario matching {query!r} in {DEFINITIONS_DIR}")
+
+
+def _run_one(
+    scenario_query: str, seed: int, out_dir: Path, tick_interval: float,
+) -> None:
+    cmd = [
+        sys.executable, str(Path(__file__).parent / "run_scenario.py"),
+        scenario_query,
+        "--no-agent", "--until-death", "--no-pause",
+        "--seed", str(seed),
+        "--output", str(out_dir),
+        "--tick-interval", str(tick_interval),
+    ]
+    print(f"[baseline] seed {seed} -> {out_dir}")
+    result = subprocess.run(cmd)
+    if result.returncode != 0:
+        raise SystemExit(
+            f"Baseline run for seed {seed} failed (exit {result.returncode}); "
+            f"aborting calibration — partial baselines must not be written.",
+        )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("scenario", help="Scenario name or number prefix")
+    parser.add_argument(
+        "--seeds", type=int, nargs="+", default=[42, 43, 44, 45],
+        help="One run per seed (default: 42 43 44 45; N>=4 for validity)",
+    )
+    parser.add_argument(
+        "--output", default="results/baseline",
+        help="Directory for per-run artifacts (default: results/baseline)",
+    )
+    parser.add_argument("--tick-interval", type=float, default=30.0)
+    parser.add_argument(
+        "--aggregate-only", action="store_true",
+        help="Skip the runs; aggregate existing per-seed dirs under --output",
+    )
+    args = parser.parse_args()
+
+    scenario_path = _find_scenario_path(args.scenario)
+    scenario = load_scenario(scenario_path)
+    out_root = Path(args.output) / scenario_path.stem
+
+    run_dirs = [out_root / f"seed{seed}" for seed in args.seeds]
+    if not args.aggregate_only:
+        for seed, run_dir in zip(args.seeds, run_dirs):
+            run_dir.mkdir(parents=True, exist_ok=True)
+            _run_one(args.scenario, seed, run_dir, args.tick_interval)
+
+    runs = [read_run(d) for d in run_dirs]
+    metadata = collect_metadata()
+    reference = aggregate_baseline(
+        runs,
+        scenario_name=scenario.name,
+        recorded_on=datetime.now(timezone.utc).isoformat(),
+        scoring_version=SCORING_VERSION,
+        save_sha256=scenario.save_sha256,
+        rimapi_dll_sha256=str(metadata.get("rimapi_dll_sha256") or "") or None,
+        rle_commit=str(metadata.get("git_commit") or "") or None,
+    )
+
+    sidecar = baseline_path(scenario_path)
+    sidecar.write_text(reference.model_dump_json(indent=2), encoding="utf-8")
+    print(f"[baseline] wrote {sidecar}")
+
+    # Round-trip through the strict loader so a bad sidecar fails HERE,
+    # not at the start of someone's benchmark run.
+    loaded = load_baseline(scenario_path, scenario)
+    assert loaded is not None
+    print(
+        f"[baseline] {loaded.scenario_name}: n={loaded.n_runs} "
+        f"seeds={list(loaded.seeds)} outcomes={list(loaded.outcomes)} "
+        f"time_to_end={loaded.time_to_end_days_mean:.1f}d "
+        f"trajectory={len(loaded.score_trajectory)} points",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_scenario.py b/scripts/run_scenario.py
@@ -37,6 +37,10 @@
 
 DEFINITIONS_DIR = Path(__file__).parent.parent / "src" / "rle" / "scenarios" / "definitions"
 
+# Runaway guard for --until-death runs (the evaluator's terminal conditions
+# are the intended stop; this only catches a colony that never dies or wins).
+_UNTIL_DEATH_SAFETY_CAP = 5000
+
 
 def _find_scenario(query: str) -> Path:
     """Find a scenario YAML by name prefix or number."""
@@ -196,7 +200,15 @@ async def main(args: argparse.Namespace) -> None:
                 agent.agent_id, label=display["label"], color=display["color"],
             )
 
-    max_ticks = args.ticks or scenario.max_ticks
+    if args.until_death:
+        # Natural-conclusion mode (Phase B): no scenario tick cap — the run
+        # ends when the evaluator hits a terminal condition (all colonists
+        # dead, or victory). The safety cap only guards against a runaway
+        # loop if the colony somehow never reaches either.
+        scenario = scenario.model_copy(update={"max_ticks": None})
+        max_ticks = args.ticks or _UNTIL_DEATH_SAFETY_CAP
+    else:
+        max_ticks = args.ticks or scenario.max_ticks
 
     # Initialize tracking (optional, when --output is specified)
     event_log: EventLog | None = None
@@ -391,6 +403,12 @@ async def main(args: argparse.Namespace) -> None:
         "--no-agent", action="store_true",
         help="Baseline mode: no agent deliberation, colony runs unmanaged",
     )
+    parser.add_argument(
+        "--until-death", action="store_true",
+        help="Ignore the scenario tick cap; run until the evaluator reaches "
+             "a terminal condition (all colonists dead, or victory). "
+             "Phase B natural-conclusion mode.",
+    )
     parser.add_argument(
         "--no-pause", action="store_true",
         help="Don't pause game during deliberation (SSE-driven, game runs continuously)",

diff --git a/src/rle/scenarios/loader.py b/src/rle/scenarios/loader.py
@@ -6,8 +6,8 @@
 
 import yaml
 
-from rle.scenarios.schema import ScenarioConfig
-from rle.tracking.metadata import file_sha256
+from rle.scenarios.schema import BaselineReference, ScenarioConfig
+from rle.tracking.metadata import SCORING_VERSION, file_sha256
 
 # Canonical save mirror (the same files that get baked into the Docker image).
 # Resolves to <repo_root>/docker/saves/. Live game runs may use a save in
@@ -55,6 +55,51 @@ def load_scenario(
     return scenario
 
 
+class BaselineMismatchError(RuntimeError):
+    """Raised by load_baseline when a .baseline.json sidecar was calibrated
+    against a different save_sha256 than the scenario pins, or a different
+    scoring version than SCORING_VERSION; rerun scripts/calibrate_baseline.py."""
+
+
+def baseline_path(scenario_path: str | Path) -> Path:
+    """Return scenario_path with its last suffix replaced by ".baseline.json"."""
+    return Path(scenario_path).with_suffix(".baseline.json")
+
+
+def load_baseline(
+    scenario_path: str | Path, scenario: ScenarioConfig,
+) -> BaselineReference | None:
+    """Load a scenario's pinned baseline sidecar, if one exists.
+
+    Returns None when no sidecar is present. Fails fast (rather than
+    silently comparing against a stale reference) when the baseline was
+    calibrated against a different save_sha256 or SCORING_VERSION.
+    """
+    path = baseline_path(scenario_path)
+    if not path.is_file():
+        return None
+    ref = BaselineReference.model_validate_json(path.read_text(encoding="utf-8"))
+    if (
+        scenario.save_sha256
+        and ref.save_sha256
+        and ref.save_sha256 != scenario.save_sha256
+    ):
+        raise BaselineMismatchError(
+            f"Baseline {path} was calibrated against save_sha256="
+            f"{ref.save_sha256} but scenario {scenario.name!r} now pins "
+            f"{scenario.save_sha256}. Recharacterize via "
+            f"scripts/calibrate_baseline.py.",
+        )
+    if ref.scoring_version != SCORING_VERSION:
+        raise BaselineMismatchError(
+            f"Baseline {path} was recorded at scoring_version="
+            f"{ref.scoring_version} but the current version is "
+            f"{SCORING_VERSION}. Recharacterize via "
+            f"scripts/calibrate_baseline.py.",
+        )
+    return ref
+
+
 def list_scenarios(
     directory: str | Path | None = None, *, allow_unpinned: bool = False,
 ) -> list[ScenarioConfig]:

diff --git a/src/rle/scenarios/schema.py b/src/rle/scenarios/schema.py
@@ -68,3 +68,42 @@ class ScenarioConfig(BaseModel):
     mismatch unless allow_unpinned=True. Generate via scripts/hash_saves.py."""
     triggered_incidents: list[TriggeredIncident] = []
     setup_commands: list[SetupCommand] = []
+
+
+class BaselinePoint(BaseModel):
+    """One sampled point on a baseline score trajectory (loop-tick indexed)."""
+
+    model_config = ConfigDict(frozen=True)  # instances are immutable
+
+    tick: int  # aggregate_baseline fills this with the row index, starting at 0
+    composite_mean: float  # mean composite over the runs that reached this tick
+    composite_ci95: tuple[float, float] | None = None  # (lower, upper) or None
+    n_runs: int  # number of runs contributing to this point
+    metric_means: dict[str, float] = {}  # metric name -> mean over those runs
+
+
+class BaselineReference(BaseModel):
+    """Pinned no-agent calibration for a scenario (Phase B1).
+
+    Persisted as a .baseline.json sidecar next to the scenario YAML and
+    treated as an immutable scenario property: recharacterize only when the
+    scenario's save_sha256 or SCORING_VERSION changes. Agent runs compute
+    lift against this pinned trajectory instead of re-running baselines.
+    """
+
+    model_config = ConfigDict(frozen=True)
+
+    scenario_name: str
+    n_runs: int  # number of runs aggregated
+    seeds: tuple[int, ...]  # one seed per run, in run order
+    outcomes: tuple[str, ...]
+    """Per-run terminal outcome ("defeat" = natural colony death,
+    "victory" possible if the unmanaged colony outlasts the scenario)."""
+    time_to_end_days_mean: float  # mean of each run's last recorded "day" value
+    time_to_end_days_ci95: tuple[float, float] | None = None  # (lower, upper) or None
+    score_trajectory: tuple[BaselinePoint, ...]  # one point per row index, ascending
+    recorded_on: str  # calibrate_baseline.py writes an ISO-8601 UTC timestamp
+    save_sha256: str | None = None  # load_baseline compares this to the scenario's pin
+    rimapi_dll_sha256: str | None = None  # provenance; not checked by load_baseline
+    rle_commit: str | None = None  # provenance; not checked by load_baseline
+    scoring_version: str  # load_baseline requires this to equal SCORING_VERSION
diff --git a/src/rle/scoring/baseline.py b/src/rle/scoring/baseline.py
@@ -0,0 +1,117 @@
+"""Phase B1 baseline aggregation: N no-agent runs → a pinned BaselineReference."""
+
+from __future__ import annotations
+
+import csv
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from statistics import fmean
+
+from rle.scenarios.schema import BaselinePoint, BaselineReference
+from rle.scoring.bootstrap import bootstrap_ci
+
+# Columns in the per-tick CSV that are not individual metrics.
+_NON_METRIC_COLUMNS = frozenset({"tick", "day", "composite"})
+
+
+@dataclass
+class BaselineRun:
+    """One no-agent run's per-tick series, as read from its artifacts."""
+
+    seed: int  # read_run uses the summary's "random_seed" (0 if missing/falsy)
+    outcome: str  # summary "outcome"; read_run falls back to "unknown"
+    days: list[float] = field(default_factory=list)  # CSV "day" column, one per row
+    composites: list[float] = field(default_factory=list)  # CSV "composite" column
+    metrics: dict[str, list[float]] = field(default_factory=dict)  # per-metric columns
+
+    @property
+    def time_to_end_days(self) -> float:
+        return self.days[-1] if self.days else 0.0  # last recorded day; 0.0 if no rows
+
+
+def read_run(run_dir: Path) -> BaselineRun:
+    """Read one run's CSV + summary JSON into a BaselineRun."""
+    csv_paths = sorted(run_dir.glob("*_survival.csv")) or sorted(run_dir.glob("*.csv"))
+    if not csv_paths:
+        raise FileNotFoundError(f"No per-tick CSV found in {run_dir}")
+    summary_paths = sorted(run_dir.glob("*_summary.json"))
+    if not summary_paths:
+        raise FileNotFoundError(f"No summary JSON found in {run_dir}")
+    summary = json.loads(summary_paths[0].read_text(encoding="utf-8"))
+
+    run = BaselineRun(
+        seed=int(summary.get("random_seed") or 0),
+        outcome=str(summary.get("outcome", "unknown")),
+    )
+    with open(csv_paths[0], encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            run.days.append(float(row["day"]))
+            run.composites.append(float(row["composite"]))
+            for col, value in row.items():
+                if col not in _NON_METRIC_COLUMNS:
+                    run.metrics.setdefault(col, []).append(float(value))
+    return run
+
+
+def aggregate_baseline(
+    runs: list[BaselineRun],
+    *,
+    scenario_name: str,
+    recorded_on: str,
+    scoring_version: str,
+    save_sha256: str | None = None,
+    rimapi_dll_sha256: str | None = None,
+    rle_commit: str | None = None,
+    ci_seed: int = 0,
+) -> BaselineReference:
+    """Aggregate N runs into a pinned BaselineReference.
+
+    Trajectory points are loop-tick indexed; runs end at different ticks, so
+    each point averages over the runs that were still going (n_runs records
+    how many). CIs need at least 2 values and are omitted otherwise.
+    """
+    if not runs:
+        raise ValueError("aggregate_baseline requires at least one run")
+
+    end_days = [r.time_to_end_days for r in runs]
+    end_ci: tuple[float, float] | None = None
+    if len(end_days) >= 2:
+        ci = bootstrap_ci(end_days, seed=ci_seed)
+        end_ci = (ci.ci_lower, ci.ci_upper)
+
+    metric_names = sorted({m for r in runs for m in r.metrics})
+    points: list[BaselinePoint] = []
+    for i in range(max(len(r.composites) for r in runs)):
+        alive = [r for r in runs if len(r.composites) > i]
+        composites = [r.composites[i] for r in alive]
+        composite_ci: tuple[float, float] | None = None
+        if len(composites) >= 2:
+            ci = bootstrap_ci(composites, seed=ci_seed)
+            composite_ci = (ci.ci_lower, ci.ci_upper)
+        points.append(BaselinePoint(
+            tick=i,
+            composite_mean=fmean(composites),
+            composite_ci95=composite_ci,
+            n_runs=len(alive),
+            metric_means={
+                m: fmean(r.metrics[m][i] for r in alive if m in r.metrics)
+                for m in metric_names
+                if any(m in r.metrics for r in alive)
+            },
+        ))
+
+    return BaselineReference(
+        scenario_name=scenario_name,
+        n_runs=len(runs),
+        seeds=tuple(r.seed for r in runs),
+        outcomes=tuple(r.outcome for r in runs),
+        time_to_end_days_mean=fmean(end_days),
+        time_to_end_days_ci95=end_ci,
+        score_trajectory=tuple(points),
+        recorded_on=recorded_on,
+        save_sha256=save_sha256,
+        rimapi_dll_sha256=rimapi_dll_sha256,
+        rle_commit=rle_commit,
+        scoring_version=scoring_version,
+    )