diff --git a/results/benchmark_history.jsonl b/results/benchmark_history.jsonl index e988b70..699a557 100644 --- a/results/benchmark_history.jsonl +++ b/results/benchmark_history.jsonl @@ -4,3 +4,4 @@ {"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]} {"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, 
"random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]} {"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, 
"ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.7536, "outcome": "timeout", "ticks": 10}]} +{"timestamp": "2026-06-10T04:12:15.578574+00:00", "scoring_version": "1.1", "git_commit": "98ee8e2", "git_branch": "fix/run-integrity", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8594, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 19000, "total_tokens": 19028, "estimated_cost_usd": 0.95028, "wall_time_s": 151.53, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 96, "errors_by_type": {}, "avg_deliberation_ms": 24304.49, "action_success_rate": 0.7857, "total_tokens": 19028, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", 
DEFINITIONS_DIR = (
    Path(__file__).parent.parent / "src" / "rle" / "scenarios" / "definitions"
)


def _find_scenario_path(query: str) -> Path:
    """Return the first scenario YAML whose file stem contains *query*.

    Candidates are the ``*.yaml`` files in ``DEFINITIONS_DIR``, scanned in
    sorted filename order; the first match wins.

    Raises:
        SystemExit: if no scenario file matches.
    """
    # A substring test already covers the prefix case, so one check suffices.
    # NOTE(review): run_scenario.py receives the same query string and resolves
    # it on its own — confirm both lookups always select the same file,
    # otherwise the sidecar is written next to the wrong scenario.
    for path in sorted(DEFINITIONS_DIR.glob("*.yaml")):
        if query in path.stem:
            return path
    raise SystemExit(f"No scenario matching {query!r} in {DEFINITIONS_DIR}")


def _run_one(
    scenario_query: str, seed: int, out_dir: Path, tick_interval: float,
) -> None:
    """Run one seeded no-agent, until-death scenario as a subprocess.

    Invokes ``run_scenario.py`` (sibling of this script) with the current
    interpreter and writes its artifacts to *out_dir*.

    Raises:
        SystemExit: if the child process exits non-zero, so that a partial
            baseline is never aggregated.
    """
    cmd = [
        sys.executable, str(Path(__file__).parent / "run_scenario.py"),
        scenario_query,
        "--no-agent", "--until-death", "--no-pause",
        "--seed", str(seed),
        "--output", str(out_dir),
        "--tick-interval", str(tick_interval),
    ]
    print(f"[baseline] seed {seed} -> {out_dir}")
    # check=False on purpose: the exit code is inspected below so the failure
    # message can name the seed that broke.
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        raise SystemExit(
            f"Baseline run for seed {seed} failed (exit {result.returncode}); "
            f"aborting calibration — partial baselines must not be written.",
        )


def main() -> None:
    """Calibrate a scenario baseline and write its ``.baseline.json`` sidecar.

    Runs one no-agent run per seed (unless ``--aggregate-only``), aggregates
    the per-seed artifacts, writes the sidecar next to the scenario YAML and
    reads it back through ``load_baseline`` as a sanity check.

    Raises:
        SystemExit: on an unknown scenario, duplicate seeds, a failed run, or
            a sidecar that cannot be read back.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("scenario", help="Scenario name or number prefix")
    parser.add_argument(
        "--seeds", type=int, nargs="+", default=[42, 43, 44, 45],
        help="One run per seed (default: 42 43 44 45; N>=4 for validity)",
    )
    parser.add_argument(
        "--output", default="results/baseline",
        help="Directory for per-run artifacts (default: results/baseline)",
    )
    parser.add_argument("--tick-interval", type=float, default=30.0)
    parser.add_argument(
        "--aggregate-only", action="store_true",
        help="Skip the runs; aggregate existing per-seed dirs under --output",
    )
    args = parser.parse_args()

    # Each seed maps to its own "seed<N>" directory; a repeated seed would
    # aggregate the same artifacts twice and inflate n_runs.
    if len(set(args.seeds)) != len(args.seeds):
        parser.error(f"--seeds must be unique, got {args.seeds}")

    scenario_path = _find_scenario_path(args.scenario)
    scenario = load_scenario(scenario_path)
    out_root = Path(args.output) / scenario_path.stem

    run_dirs = [out_root / f"seed{seed}" for seed in args.seeds]
    if not args.aggregate_only:
        for seed, run_dir in zip(args.seeds, run_dirs, strict=True):
            run_dir.mkdir(parents=True, exist_ok=True)
            _run_one(args.scenario, seed, run_dir, args.tick_interval)

    runs = [read_run(d) for d in run_dirs]
    for seed, run_dir, run in zip(args.seeds, run_dirs, runs, strict=True):
        # The baseline records the seed found in each run's summary, not the
        # one requested here; surface any disagreement (e.g. stale artifacts
        # picked up by --aggregate-only).
        if run.seed != seed:
            print(
                f"[baseline] WARNING: {run_dir} reports seed {run.seed}, "
                f"expected {seed}",
                file=sys.stderr,
            )

    metadata = collect_metadata()
    reference = aggregate_baseline(
        runs,
        scenario_name=scenario.name,
        recorded_on=datetime.now(timezone.utc).isoformat(),
        scoring_version=SCORING_VERSION,
        save_sha256=scenario.save_sha256,
        rimapi_dll_sha256=str(metadata.get("rimapi_dll_sha256") or "") or None,
        rle_commit=str(metadata.get("git_commit") or "") or None,
    )

    sidecar = baseline_path(scenario_path)
    sidecar.write_text(reference.model_dump_json(indent=2), encoding="utf-8")
    print(f"[baseline] wrote {sidecar}")

    # Round-trip through the strict loader so a bad sidecar fails HERE,
    # not at the start of someone's benchmark run.
    loaded = load_baseline(scenario_path, scenario)
    if loaded is None:
        # Explicit check instead of `assert`, which is stripped under -O.
        raise SystemExit(
            f"Baseline sidecar {sidecar} could not be read back after writing",
        )
    print(
        f"[baseline] {loaded.scenario_name}: n={loaded.n_runs} "
        f"seeds={list(loaded.seeds)} outcomes={list(loaded.outcomes)} "
        f"time_to_end={loaded.time_to_end_days_mean:.1f}d "
        f"trajectory={len(loaded.score_trajectory)} points",
    )


if __name__ == "__main__":
    main()
+ scenario = scenario.model_copy(update={"max_ticks": None}) + max_ticks = args.ticks or _UNTIL_DEATH_SAFETY_CAP + else: + max_ticks = args.ticks or scenario.max_ticks # Initialize tracking (optional, when --output is specified) event_log: EventLog | None = None @@ -391,6 +403,12 @@ async def main(args: argparse.Namespace) -> None: "--no-agent", action="store_true", help="Baseline mode: no agent deliberation, colony runs unmanaged", ) + parser.add_argument( + "--until-death", action="store_true", + help="Ignore the scenario tick cap; run until the evaluator reaches " + "a terminal condition (all colonists dead, or victory). " + "Phase B natural-conclusion mode.", + ) parser.add_argument( "--no-pause", action="store_true", help="Don't pause game during deliberation (SSE-driven, game runs continuously)", diff --git a/src/rle/scenarios/loader.py b/src/rle/scenarios/loader.py index 26a4f74..45cc1f1 100644 --- a/src/rle/scenarios/loader.py +++ b/src/rle/scenarios/loader.py @@ -6,8 +6,8 @@ import yaml -from rle.scenarios.schema import ScenarioConfig -from rle.tracking.metadata import file_sha256 +from rle.scenarios.schema import BaselineReference, ScenarioConfig +from rle.tracking.metadata import SCORING_VERSION, file_sha256 # Canonical save mirror (the same files that get baked into the Docker image). # Resolves to /docker/saves/. 
class BaselineMismatchError(RuntimeError):
    """A scenario's baseline sidecar no longer matches the scenario.

    Raised when the sidecar was calibrated against a different save hash or
    scoring version than the ones currently in force; the fix is to
    recharacterize the baseline with scripts/calibrate_baseline.py.
    """


def baseline_path(scenario_path: str | Path) -> Path:
    """Map a scenario YAML path to its ``.baseline.json`` sidecar path."""
    yaml_path = Path(scenario_path)
    return yaml_path.with_suffix(".baseline.json")


def load_baseline(
    scenario_path: str | Path, scenario: ScenarioConfig,
) -> BaselineReference | None:
    """Read and verify the baseline sidecar that sits next to a scenario.

    Returns:
        The parsed reference, or ``None`` when no sidecar file exists.

    Raises:
        BaselineMismatchError: when both the scenario and the sidecar carry a
            save_sha256 and the two differ, or when the sidecar's
            scoring_version is not the current SCORING_VERSION.
    """
    sidecar = baseline_path(scenario_path)
    if not sidecar.is_file():
        return None

    raw_json = sidecar.read_text(encoding="utf-8")
    reference = BaselineReference.model_validate_json(raw_json)

    # The save check only applies when both sides actually pin a hash.
    hashes_comparable = bool(scenario.save_sha256) and bool(reference.save_sha256)
    if hashes_comparable and reference.save_sha256 != scenario.save_sha256:
        message = (
            f"Baseline {sidecar} was calibrated against save_sha256="
            f"{reference.save_sha256} but scenario {scenario.name!r} now pins "
            f"{scenario.save_sha256}. Recharacterize via "
            f"scripts/calibrate_baseline.py."
        )
        raise BaselineMismatchError(message)

    if reference.scoring_version == SCORING_VERSION:
        return reference

    message = (
        f"Baseline {sidecar} was recorded at scoring_version="
        f"{reference.scoring_version} but the current version is "
        f"{SCORING_VERSION}. Recharacterize via "
        f"scripts/calibrate_baseline.py."
    )
    raise BaselineMismatchError(message)
# Columns in the per-tick CSV that are not individual metrics.
_NON_METRIC_COLUMNS = frozenset({"tick", "day", "composite"})


@dataclass
class BaselineRun:
    """One no-agent run's per-tick series, as read from its artifacts."""

    seed: int
    outcome: str
    days: list[float] = field(default_factory=list)
    composites: list[float] = field(default_factory=list)
    metrics: dict[str, list[float]] = field(default_factory=dict)

    @property
    def time_to_end_days(self) -> float:
        """Day value of the last recorded row, or 0.0 for a run with no rows."""
        return self.days[-1] if self.days else 0.0


def _to_float(raw: object, *, column: object, line_num: int, source: Path) -> float:
    """Convert one CSV cell to float, naming file/row/column on failure."""
    try:
        return float(raw)  # type: ignore[arg-type]
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"{source}: line {line_num}, column {column!r}: "
            f"expected a number, got {raw!r}",
        ) from err


def read_run(run_dir: Path) -> BaselineRun:
    """Read one run's CSV + summary JSON into a BaselineRun.

    The per-tick CSV is the first ``*_survival.csv`` in sorted order, falling
    back to the first ``*.csv``; the summary is the first ``*_summary.json``.
    Every CSV column other than tick/day/composite is collected as a metric
    series.

    Raises:
        FileNotFoundError: if the CSV or the summary JSON is missing.
        KeyError: if the CSV lacks a ``day`` or ``composite`` column.
        ValueError: if any cell that must be numeric is blank or non-numeric.
    """
    run_dir = Path(run_dir)
    csv_paths = sorted(run_dir.glob("*_survival.csv")) or sorted(run_dir.glob("*.csv"))
    if not csv_paths:
        raise FileNotFoundError(f"No per-tick CSV found in {run_dir}")
    summary_paths = sorted(run_dir.glob("*_summary.json"))
    if not summary_paths:
        raise FileNotFoundError(f"No summary JSON found in {run_dir}")
    summary = json.loads(summary_paths[0].read_text(encoding="utf-8"))

    run = BaselineRun(
        seed=int(summary.get("random_seed") or 0),
        outcome=str(summary.get("outcome", "unknown")),
    )
    csv_path = csv_paths[0]
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(csv_path, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            where = {"line_num": reader.line_num, "source": csv_path}
            run.days.append(_to_float(row["day"], column="day", **where))
            run.composites.append(
                _to_float(row["composite"], column="composite", **where),
            )
            for col, value in row.items():
                if col not in _NON_METRIC_COLUMNS:
                    run.metrics.setdefault(col, []).append(
                        _to_float(value, column=col, **where),
                    )
    return run


def _ci95(values: list[float], seed: int) -> tuple[float, float] | None:
    """Bootstrap CI bounds for *values*, or None with fewer than 2 values."""
    if len(values) < 2:
        return None
    ci = bootstrap_ci(values, seed=seed)
    return (ci.ci_lower, ci.ci_upper)


def aggregate_baseline(
    runs: list[BaselineRun],
    *,
    scenario_name: str,
    recorded_on: str,
    scoring_version: str,
    save_sha256: str | None = None,
    rimapi_dll_sha256: str | None = None,
    rle_commit: str | None = None,
    ci_seed: int = 0,
) -> BaselineReference:
    """Aggregate N runs into a pinned BaselineReference.

    Trajectory points are loop-tick indexed; runs end at different ticks, so
    each point averages over the runs that were still going (n_runs records
    how many). CIs need at least 2 values and are omitted otherwise. A metric
    is averaged over the alive runs that have a value for it at that tick and
    is left out of the point when none do.

    Raises:
        ValueError: if *runs* is empty.
    """
    if not runs:
        raise ValueError("aggregate_baseline requires at least one run")

    end_days = [run.time_to_end_days for run in runs]
    metric_names = sorted({name for run in runs for name in run.metrics})
    horizon = max(len(run.composites) for run in runs)

    points: list[BaselinePoint] = []
    for tick in range(horizon):
        alive = [run for run in runs if len(run.composites) > tick]
        composites = [run.composites[tick] for run in alive]

        metric_means: dict[str, float] = {}
        for name in metric_names:
            # Guard on the series length too: a metric series shorter than
            # the composite series must not raise IndexError.
            samples = [
                run.metrics[name][tick]
                for run in alive
                if tick < len(run.metrics.get(name, ()))
            ]
            if samples:
                metric_means[name] = fmean(samples)

        points.append(BaselinePoint(
            tick=tick,
            composite_mean=fmean(composites),
            composite_ci95=_ci95(composites, ci_seed),
            n_runs=len(alive),
            metric_means=metric_means,
        ))

    return BaselineReference(
        scenario_name=scenario_name,
        n_runs=len(runs),
        seeds=tuple(run.seed for run in runs),
        outcomes=tuple(run.outcome for run in runs),
        time_to_end_days_mean=fmean(end_days),
        time_to_end_days_ci95=_ci95(end_days, ci_seed),
        score_trajectory=tuple(points),
        recorded_on=recorded_on,
        save_sha256=save_sha256,
        rimapi_dll_sha256=rimapi_dll_sha256,
        rle_commit=rle_commit,
        scoring_version=scoring_version,
    )
class TestReadRun:
    """read_run: parsing one run directory's CSV and summary JSON."""

    def test_reads_csv_and_summary(self, tmp_path: Path) -> None:
        csv_file = tmp_path / "01_test_survival.csv"
        csv_file.write_text(
            "tick,day,mood,composite\n100,0,0.5,0.9\n200,1,0.4,0.8\n",
            encoding="utf-8",
        )
        summary_file = tmp_path / "01_test_summary.json"
        summary_payload = {"random_seed": 42, "outcome": "defeat"}
        summary_file.write_text(json.dumps(summary_payload), encoding="utf-8")

        parsed = read_run(tmp_path)

        assert parsed.seed == 42
        assert parsed.outcome == "defeat"
        assert parsed.composites == [0.9, 0.8]
        assert parsed.metrics["mood"] == [0.5, 0.4]
        assert parsed.time_to_end_days == pytest.approx(1.0)

    def test_missing_csv_raises(self, tmp_path: Path) -> None:
        # An empty directory has no per-tick CSV at all.
        with pytest.raises(FileNotFoundError, match="CSV"):
            read_run(tmp_path)


class TestLoadBaseline:
    """load_baseline: sidecar discovery and staleness checks."""

    def _write_scenario(self, tmp_path: Path) -> Path:
        """Write the shared scenario YAML into tmp_path and return its path."""
        yaml_file = tmp_path / "01_test.yaml"
        yaml_file.write_text(_SCENARIO_YAML, encoding="utf-8")
        return yaml_file

    def _write_baseline(
        self, scenario_yaml: Path, *, save_sha256: str = "aabbcc",
        scoring_version: str = SCORING_VERSION,
    ) -> None:
        """Write a minimal one-run baseline sidecar next to scenario_yaml."""
        fields = {
            "scenario_name": "Test Scenario",
            "n_runs": 1,
            "seeds": (42,),
            "outcomes": ("defeat",),
            "time_to_end_days_mean": 10.0,
            "score_trajectory": (),
            "recorded_on": "2026-06-10T00:00:00Z",
            "save_sha256": save_sha256,
            "scoring_version": scoring_version,
        }
        serialized = BaselineReference(**fields).model_dump_json()
        baseline_path(scenario_yaml).write_text(serialized, encoding="utf-8")

    def test_no_sidecar_returns_none(self, tmp_path: Path) -> None:
        yaml_file = self._write_scenario(tmp_path)
        config = load_scenario(yaml_file, allow_unpinned=True)
        assert load_baseline(yaml_file, config) is None

    def test_matching_baseline_loads(self, tmp_path: Path) -> None:
        yaml_file = self._write_scenario(tmp_path)
        config = load_scenario(yaml_file, allow_unpinned=True)
        self._write_baseline(yaml_file)
        loaded = load_baseline(yaml_file, config)
        assert loaded is not None
        assert loaded.scenario_name == "Test Scenario"

    def test_save_sha_mismatch_fails_fast(self, tmp_path: Path) -> None:
        yaml_file = self._write_scenario(tmp_path)
        config = load_scenario(yaml_file, allow_unpinned=True)
        self._write_baseline(yaml_file, save_sha256="stale")
        with pytest.raises(BaselineMismatchError, match="save_sha256"):
            load_baseline(yaml_file, config)

    def test_scoring_version_mismatch_fails_fast(self, tmp_path: Path) -> None:
        yaml_file = self._write_scenario(tmp_path)
        config = load_scenario(yaml_file, allow_unpinned=True)
        self._write_baseline(yaml_file, scoring_version="0.9")
        with pytest.raises(BaselineMismatchError, match="scoring_version"):
            load_baseline(yaml_file, config)