Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions results/benchmark_history.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
{"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]}
{"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]}
{"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, "ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.7536, "outcome": "timeout", "ticks": 10}]}
{"timestamp": "2026-06-10T04:12:15.578574+00:00", "scoring_version": "1.1", "git_commit": "98ee8e2", "git_branch": "fix/run-integrity", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8594, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 19000, "total_tokens": 19028, "estimated_cost_usd": 0.95028, "wall_time_s": 151.53, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 96, "errors_by_type": {}, "avg_deliberation_ms": 24304.49, "action_success_rate": 0.7857, "total_tokens": 19028, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8594, "outcome": "timeout", "ticks": 2}]}
115 changes: 115 additions & 0 deletions scripts/calibrate_baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Phase B1: calibrate a scenario's no-agent baseline.

Runs N seeded --no-agent --until-death runs against the live game, then
aggregates them into a pinned <scenario>.baseline.json sidecar next to the
scenario YAML. The baseline is an immutable scenario property — re-run this
only when the scenario's save or SCORING_VERSION changes.

Usage:
python scripts/calibrate_baseline.py crashlanded --seeds 42 43 44 45
"""

from __future__ import annotations

import argparse
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from rle.scenarios.loader import baseline_path, load_baseline, load_scenario
from rle.scoring.baseline import aggregate_baseline, read_run
from rle.tracking.metadata import SCORING_VERSION, collect_metadata

# Directory searched for scenario YAML definitions, resolved relative to the
# directory two levels above this script: <root>/src/rle/scenarios/definitions.
DEFINITIONS_DIR = (
    Path(__file__).parent.parent / "src" / "rle" / "scenarios" / "definitions"
)


def _find_scenario_path(query: str) -> Path:
    """Resolve a scenario query to the first matching YAML in DEFINITIONS_DIR.

    Candidates are visited in sorted order; a file matches when its stem
    starts with, or contains, ``query``. Exits via SystemExit when nothing
    matches.
    """
    candidates = sorted(DEFINITIONS_DIR.glob("*.yaml"))
    match = next(
        (
            candidate
            for candidate in candidates
            if candidate.stem.startswith(query) or query in candidate.stem
        ),
        None,
    )
    if match is None:
        raise SystemExit(f"No scenario matching {query!r} in {DEFINITIONS_DIR}")
    return match


def _run_one(
    scenario_query: str, seed: int, out_dir: Path, tick_interval: float,
) -> None:
    """Run one seeded no-agent baseline through run_scenario.py.

    The sibling script is launched with the current interpreter in
    --no-agent / --until-death / --no-pause mode, writing its artifacts to
    ``out_dir``. Any non-zero exit aborts the whole calibration via
    SystemExit.
    """
    runner = Path(__file__).parent / "run_scenario.py"
    mode_flags = ["--no-agent", "--until-death", "--no-pause"]
    run_options = [
        "--seed", str(seed),
        "--output", str(out_dir),
        "--tick-interval", str(tick_interval),
    ]
    cmd = [sys.executable, str(runner), scenario_query, *mode_flags, *run_options]

    print(f"[baseline] seed {seed} -> {out_dir}")
    exit_code = subprocess.run(cmd).returncode
    if exit_code == 0:
        return
    raise SystemExit(
        f"Baseline run for seed {seed} failed (exit {exit_code}); "
        f"aborting calibration — partial baselines must not be written.",
    )


def main() -> None:
    """Calibrate a scenario's no-agent baseline and pin it as a sidecar.

    Parses the command line, optionally runs one baseline per seed (each into
    its own ``seed<N>`` directory under ``--output/<scenario stem>``), then
    aggregates the per-seed artifacts into a BaselineReference and writes it
    to the scenario's .baseline.json sidecar. The sidecar is finally read
    back through load_baseline so an unusable file is reported immediately.

    Exits with an argparse usage error when --seeds contains duplicates, and
    via SystemExit when a run fails or the written sidecar cannot be loaded.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("scenario", help="Scenario name or number prefix")
    parser.add_argument(
        "--seeds", type=int, nargs="+", default=[42, 43, 44, 45],
        help="One run per seed (default: 42 43 44 45; N>=4 for validity)",
    )
    parser.add_argument(
        "--output", default="results/baseline",
        help="Directory for per-run artifacts (default: results/baseline)",
    )
    parser.add_argument("--tick-interval", type=float, default=30.0)
    parser.add_argument(
        "--aggregate-only", action="store_true",
        help="Skip the runs; aggregate existing per-seed dirs under --output",
    )
    args = parser.parse_args()

    # Every seed maps to its own seed<N> directory. A repeated seed would
    # reuse one directory and feed the same artifacts into the aggregate
    # twice, silently skewing a baseline that is meant to be pinned.
    if len(set(args.seeds)) != len(args.seeds):
        parser.error("--seeds must not contain duplicates")

    scenario_path = _find_scenario_path(args.scenario)
    scenario = load_scenario(scenario_path)
    out_root = Path(args.output) / scenario_path.stem

    run_dirs = [out_root / f"seed{seed}" for seed in args.seeds]
    if not args.aggregate_only:
        for seed, run_dir in zip(args.seeds, run_dirs):
            run_dir.mkdir(parents=True, exist_ok=True)
            _run_one(args.scenario, seed, run_dir, args.tick_interval)

    runs = [read_run(d) for d in run_dirs]
    metadata = collect_metadata()
    reference = aggregate_baseline(
        runs,
        scenario_name=scenario.name,
        recorded_on=datetime.now(timezone.utc).isoformat(),
        scoring_version=SCORING_VERSION,
        save_sha256=scenario.save_sha256,
        # Missing or empty metadata values are stored as None, not "".
        rimapi_dll_sha256=str(metadata.get("rimapi_dll_sha256") or "") or None,
        rle_commit=str(metadata.get("git_commit") or "") or None,
    )

    sidecar = baseline_path(scenario_path)
    sidecar.write_text(reference.model_dump_json(indent=2), encoding="utf-8")
    print(f"[baseline] wrote {sidecar}")

    # Round-trip through the strict loader so a bad sidecar fails HERE,
    # not at the start of someone's benchmark run.
    loaded = load_baseline(scenario_path, scenario)
    # Explicit check rather than `assert`: assertions are stripped under -O,
    # which would turn a missing sidecar into an AttributeError below.
    if loaded is None:
        raise SystemExit(
            f"Baseline sidecar {sidecar} could not be loaded back after writing",
        )
    print(
        f"[baseline] {loaded.scenario_name}: n={loaded.n_runs} "
        f"seeds={list(loaded.seeds)} outcomes={list(loaded.outcomes)} "
        f"time_to_end={loaded.time_to_end_days_mean:.1f}d "
        f"trajectory={len(loaded.score_trajectory)} points",
    )


# Script entry point: run the calibration only when executed directly.
if __name__ == "__main__":
    main()
20 changes: 19 additions & 1 deletion scripts/run_scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@

DEFINITIONS_DIR = Path(__file__).parent.parent / "src" / "rle" / "scenarios" / "definitions"

# Runaway guard for --until-death runs (the evaluator's terminal conditions
# are the intended stop; this only catches a colony that never dies or wins).
# Used as the tick limit for --until-death runs when --ticks is not given.
_UNTIL_DEATH_SAFETY_CAP = 5000


def _find_scenario(query: str) -> Path:
"""Find a scenario YAML by name prefix or number."""
Expand Down Expand Up @@ -196,7 +200,15 @@ async def main(args: argparse.Namespace) -> None:
agent.agent_id, label=display["label"], color=display["color"],
)

max_ticks = args.ticks or scenario.max_ticks
if args.until_death:
# Natural-conclusion mode (Phase B): no scenario tick cap — the run
# ends when the evaluator hits a terminal condition (all colonists
# dead, or victory). The safety cap only guards against a runaway
# loop if the colony somehow never reaches either.
scenario = scenario.model_copy(update={"max_ticks": None})
max_ticks = args.ticks or _UNTIL_DEATH_SAFETY_CAP
else:
max_ticks = args.ticks or scenario.max_ticks

# Initialize tracking (optional, when --output is specified)
event_log: EventLog | None = None
Expand Down Expand Up @@ -391,6 +403,12 @@ async def main(args: argparse.Namespace) -> None:
"--no-agent", action="store_true",
help="Baseline mode: no agent deliberation, colony runs unmanaged",
)
parser.add_argument(
"--until-death", action="store_true",
help="Ignore the scenario tick cap; run until the evaluator reaches "
"a terminal condition (all colonists dead, or victory). "
"Phase B natural-conclusion mode.",
)
parser.add_argument(
"--no-pause", action="store_true",
help="Don't pause game during deliberation (SSE-driven, game runs continuously)",
Expand Down
49 changes: 47 additions & 2 deletions src/rle/scenarios/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

import yaml

from rle.scenarios.schema import ScenarioConfig
from rle.tracking.metadata import file_sha256
from rle.scenarios.schema import BaselineReference, ScenarioConfig
from rle.tracking.metadata import SCORING_VERSION, file_sha256

# Canonical save mirror (the same files that get baked into the Docker image).
# Resolves to <repo_root>/docker/saves/. Live game runs may use a save in
Expand Down Expand Up @@ -55,6 +55,51 @@ def load_scenario(
return scenario


class BaselineMismatchError(RuntimeError):
    """A scenario's pinned baseline no longer matches the scenario.

    Raised when a scenario's .baseline.json was calibrated against a
    different save or scoring version than the scenario currently pins. The
    baseline must then be recharacterized (scripts/calibrate_baseline.py).
    """


def baseline_path(scenario_path: str | Path) -> Path:
"""Sidecar .baseline.json path for a scenario YAML path."""
return Path(scenario_path).with_suffix(".baseline.json")


def load_baseline(
    scenario_path: str | Path, scenario: ScenarioConfig,
) -> BaselineReference | None:
    """Load the pinned baseline sidecar that sits next to a scenario YAML.

    Returns None when the sidecar file does not exist. Otherwise the file is
    validated into a BaselineReference and checked against the scenario:
    a BaselineMismatchError is raised when both sides record a save_sha256
    and they differ, or when the recorded scoring_version is not the current
    SCORING_VERSION — a stale reference is never returned silently.
    """
    sidecar = baseline_path(scenario_path)
    if not sidecar.is_file():
        return None

    raw_json = sidecar.read_text(encoding="utf-8")
    ref = BaselineReference.model_validate_json(raw_json)

    pinned_save = scenario.save_sha256
    recorded_save = ref.save_sha256
    # The save check only applies when both hashes are actually present.
    both_pinned = bool(pinned_save) and bool(recorded_save)
    if both_pinned and recorded_save != pinned_save:
        raise BaselineMismatchError(
            f"Baseline {sidecar} was calibrated against save_sha256="
            f"{recorded_save} but scenario {scenario.name!r} now pins "
            f"{pinned_save}. Recharacterize via "
            f"scripts/calibrate_baseline.py.",
        )

    recorded_version = ref.scoring_version
    if recorded_version != SCORING_VERSION:
        raise BaselineMismatchError(
            f"Baseline {sidecar} was recorded at scoring_version="
            f"{recorded_version} but the current version is "
            f"{SCORING_VERSION}. Recharacterize via "
            f"scripts/calibrate_baseline.py.",
        )
    return ref


def list_scenarios(
directory: str | Path | None = None, *, allow_unpinned: bool = False,
) -> list[ScenarioConfig]:
Expand Down
39 changes: 39 additions & 0 deletions src/rle/scenarios/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,42 @@ class ScenarioConfig(BaseModel):
mismatch unless allow_unpinned=True. Generate via scripts/hash_saves.py."""
triggered_incidents: list[TriggeredIncident] = []
setup_commands: list[SetupCommand] = []


class BaselinePoint(BaseModel):
    """One sampled point on a baseline score trajectory (loop-tick indexed)."""

    # Frozen: instances cannot be mutated after construction.
    model_config = ConfigDict(frozen=True)

    # Loop-tick index of this point within the trajectory.
    tick: int
    # Mean composite score over the runs contributing to this tick.
    composite_mean: float
    # (lower, upper) bounds for the composite mean; None when no interval
    # was computed. Presumably a 95% interval, per the field name.
    composite_ci95: tuple[float, float] | None = None
    # Number of runs that contributed to this point.
    n_runs: int
    # Mean value per individual metric, keyed by metric name.
    metric_means: dict[str, float] = {}


class BaselineReference(BaseModel):
    """Pinned no-agent calibration for a scenario (Phase B1).

    Persisted as a .baseline.json sidecar next to the scenario YAML and
    treated as an immutable scenario property: recharacterize only when the
    scenario's save_sha256 or SCORING_VERSION changes. Agent runs compute
    lift against this pinned trajectory instead of re-running baselines.
    """

    # Frozen: instances cannot be mutated after construction.
    model_config = ConfigDict(frozen=True)

    # Name of the scenario this baseline was calibrated for.
    scenario_name: str
    # Number of baseline runs aggregated into this reference.
    n_runs: int
    # Random seed of each aggregated run, one entry per run.
    seeds: tuple[int, ...]
    outcomes: tuple[str, ...]
    """Per-run terminal outcome ("defeat" = natural colony death,
    "victory" possible if the unmanaged colony outlasts the scenario)."""
    # Mean, across runs, of the in-game day at which each run ended.
    time_to_end_days_mean: float
    # (lower, upper) bounds for the mean above; None when no interval was
    # computed. Presumably a 95% interval, per the field name.
    time_to_end_days_ci95: tuple[float, float] | None = None
    # Aggregated per-tick points, in tick order.
    score_trajectory: tuple[BaselinePoint, ...]
    # When the baseline was recorded, as a string.
    # NOTE(review): looks like an ISO-8601 timestamp — confirm against the writer.
    recorded_on: str
    # Provenance pins; each is None when the value was unavailable.
    save_sha256: str | None = None
    rimapi_dll_sha256: str | None = None
    rle_commit: str | None = None
    # Scoring version the baseline was recorded under.
    scoring_version: str
117 changes: 117 additions & 0 deletions src/rle/scoring/baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Phase B1 baseline aggregation: N no-agent runs → a pinned BaselineReference."""

from __future__ import annotations

import csv
import json
from dataclasses import dataclass, field
from pathlib import Path
from statistics import fmean

from rle.scenarios.schema import BaselinePoint, BaselineReference
from rle.scoring.bootstrap import bootstrap_ci

# Columns in the per-tick CSV that are not individual metrics; every other
# column is read as a per-metric series.
_NON_METRIC_COLUMNS = frozenset({"tick", "day", "composite"})


@dataclass
class BaselineRun:
    """Per-tick series of a single no-agent run, as loaded from its artifacts.

    ``days``, ``composites`` and each list in ``metrics`` hold one value per
    recorded tick, in file order.
    """

    seed: int
    outcome: str
    days: list[float] = field(default_factory=list)
    composites: list[float] = field(default_factory=list)
    metrics: dict[str, list[float]] = field(default_factory=dict)

    @property
    def time_to_end_days(self) -> float:
        """Day value of the last recorded tick, or 0.0 for an empty run."""
        if not self.days:
            return 0.0
        return self.days[-1]


def read_run(run_dir: Path) -> BaselineRun:
    """Read one run's per-tick CSV and summary JSON into a BaselineRun.

    The CSV is the first ``*_survival.csv`` in ``run_dir`` (sorted by path),
    falling back to the first ``*.csv``; the summary is the first
    ``*_summary.json``. The seed and outcome come from the summary, with the
    seed defaulting to 0 and the outcome to "unknown" when absent. Every CSV
    column other than tick/day/composite is collected as a metric series.

    Raises:
        FileNotFoundError: if no CSV or no summary JSON exists in ``run_dir``.
        ValueError: if a day, composite or metric cell is not a valid float.
    """
    csv_paths = sorted(run_dir.glob("*_survival.csv")) or sorted(run_dir.glob("*.csv"))
    if not csv_paths:
        raise FileNotFoundError(f"No per-tick CSV found in {run_dir}")
    summary_paths = sorted(run_dir.glob("*_summary.json"))
    if not summary_paths:
        raise FileNotFoundError(f"No summary JSON found in {run_dir}")
    summary = json.loads(summary_paths[0].read_text(encoding="utf-8"))

    run = BaselineRun(
        seed=int(summary.get("random_seed") or 0),
        outcome=str(summary.get("outcome", "unknown")),
    )
    # newline="" hands raw line endings to the csv module, as its docs
    # require; otherwise newlines inside quoted fields are misread.
    with open(csv_paths[0], encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            run.days.append(float(row["day"]))
            run.composites.append(float(row["composite"]))
            for col, value in row.items():
                if col not in _NON_METRIC_COLUMNS:
                    # One value per row keeps each metric series aligned
                    # index-for-index with days/composites.
                    run.metrics.setdefault(col, []).append(float(value))
    return run


def aggregate_baseline(
    runs: list[BaselineRun],
    *,
    scenario_name: str,
    recorded_on: str,
    scoring_version: str,
    save_sha256: str | None = None,
    rimapi_dll_sha256: str | None = None,
    rle_commit: str | None = None,
    ci_seed: int = 0,
) -> BaselineReference:
    """Fold N baseline runs into one pinned BaselineReference.

    The trajectory is indexed by loop tick. Runs stop at different ticks, so
    every point is averaged over only those runs that still have data at
    that tick, and the point's n_runs says how many that was. Confidence
    intervals are computed with bootstrap_ci (seeded by ``ci_seed``) and are
    left as None whenever fewer than two values are available.

    Raises:
        ValueError: if ``runs`` is empty.
    """
    if not runs:
        raise ValueError("aggregate_baseline requires at least one run")

    def _interval(values: list[float]) -> tuple[float, float] | None:
        # A bootstrap interval needs at least two samples.
        if len(values) < 2:
            return None
        estimate = bootstrap_ci(values, seed=ci_seed)
        return (estimate.ci_lower, estimate.ci_upper)

    durations = [run.time_to_end_days for run in runs]
    duration_ci = _interval(durations)

    all_metrics = sorted({name for run in runs for name in run.metrics})
    longest = max(len(run.composites) for run in runs)

    trajectory: list[BaselinePoint] = []
    for tick in range(longest):
        # Only runs that lasted at least this long contribute to the point.
        active = [run for run in runs if tick < len(run.composites)]
        scores = [run.composites[tick] for run in active]
        score_ci = _interval(scores)

        means: dict[str, float] = {}
        for name in all_metrics:
            samples = [
                run.metrics[name][tick] for run in active if name in run.metrics
            ]
            # Metrics that no active run reports are left out entirely.
            if samples:
                means[name] = fmean(samples)

        trajectory.append(
            BaselinePoint(
                tick=tick,
                composite_mean=fmean(scores),
                composite_ci95=score_ci,
                n_runs=len(active),
                metric_means=means,
            )
        )

    return BaselineReference(
        scenario_name=scenario_name,
        n_runs=len(runs),
        seeds=tuple(run.seed for run in runs),
        outcomes=tuple(run.outcome for run in runs),
        time_to_end_days_mean=fmean(durations),
        time_to_end_days_ci95=duration_ci,
        score_trajectory=tuple(trajectory),
        recorded_on=recorded_on,
        save_sha256=save_sha256,
        rimapi_dll_sha256=rimapi_dll_sha256,
        rle_commit=rle_commit,
        scoring_version=scoring_version,
    )
Loading
Loading