diff --git a/results/benchmark_history.jsonl b/results/benchmark_history.jsonl index 89b17d1..e988b70 100644 --- a/results/benchmark_history.jsonl +++ b/results/benchmark_history.jsonl @@ -3,3 +3,4 @@ {"timestamp": "2026-06-10T00:59:13.107962+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8265, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 4, "total_completion_tokens": 1903, "total_tokens": 1907, "estimated_cost_usd": 0.09519, "wall_time_s": 258.38, "num_calls": 2, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 82, "errors_by_type": {"parse_failure": 12}, "avg_deliberation_ms": 46786.6, "action_success_rate": 1.0, "total_tokens": 1907, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8265, "outcome": "timeout", "ticks": 2}]} {"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": 
"C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]} {"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", 
"final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]} +{"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, "ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded 
def trajectory_chart() -> None:
    """Plot the composite score trajectory and save ``fable5_trajectory.png``.

    Three series are drawn over the ticks returned by ``load_scores()``:
    the composite as scored, the composite with the threat_response penalty
    backed out (``c + (1 - t) * THREAT_WEIGHT``), and a linear decay
    reference starting from the first composite value.

    The first tick whose threat_response is exactly 0.0 is annotated. When
    the run contains no such tick (or the CSV is empty) the annotation is
    skipped instead of aborting the whole script with ``StopIteration``.
    """
    ticks, composite, threat = load_scores()
    # Composite with the threat_response zeroing backed out
    ex_artifact = [c + (1.0 - t) * THREAT_WEIGHT for c, t in zip(composite, threat)]
    decay = [composite[0] - 0.013 * i for i in ticks]

    fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
    ax.plot(ticks, composite, "o-", lw=2.5, color="#d4452c", label="Fable 5 composite (as scored)")
    ax.plot(ticks, ex_artifact, "o--", lw=2, color="#2c7fb8",
            label="Ex-artifact (threat_response bug backed out)")
    ax.plot(ticks, decay, ":", lw=1.5, color="#888888",
            label="Natural decay reference (−0.013/tick)")

    # next() without a default raises StopIteration when nothing matches;
    # a run that never zeroes threat_response should still produce a chart.
    drop_tick = next((i for i, t in enumerate(threat) if t == 0.0), None)
    if drop_tick is not None:
        ax.annotate(
            "threat_response → 0.0\n(no raid ever existed —\n"
            "agent refused to draft, correctly.\nHarness bug, RLE issue #25)",
            xy=(drop_tick, composite[drop_tick]),
            xytext=(drop_tick - 4.4, composite[drop_tick] - 0.018),
            fontsize=9, color="#d4452c",
            arrowprops={"arrowstyle": "->", "color": "#d4452c"},
        )
    ax.set_xlabel("Tick")
    ax.set_ylabel("Composite score")
    ax.set_title("Claude Fable 5 × RimWorld Crashlanded — 10 ticks, 7 agents (RLE, seed 42)")
    ax.legend(loc="lower left", fontsize=9)
    ax.grid(alpha=0.25)
    fig.tight_layout()
    fig.savefig(RUN_DIR / "fable5_trajectory.png")
    # pyplot keeps every figure alive until it is closed explicitly.
    plt.close(fig)
"fable5_wandering_shelter.png") + + +def quote_card() -> None: + """Transcript card for the DefenseCommander tick-9 quote (verbatim).""" + bg, dim, body, hot = "#0d1117", "#8b949e", "#c9d1d9", "#f0883e" + fig = plt.figure(figsize=(7.5, 4.22), dpi=160) + fig.patch.set_facecolor(bg) + + def t(y: float, s: str, color: str, size: int, weight: str = "normal") -> None: + fig.text(0.06, y, s, color=color, fontsize=size, family="monospace", + weight=weight, va="top") + + t(0.93, "RLE · RimWorld Learning Environment — events.jsonl, verbatim", dim, 10) + t(0.86, "agent: defense_commander (claude-fable-5) tick: 9 confidence: 0.92", dim, 10) + + t(0.74, '"Threat assessment: the single \'threat\' entry has', body, 12) + t(0.685, 'enemy_count=0 and threat_level=0.0 — there is no hostile', body, 12) + t(0.63, "presence on the map. The 'ThreatBig' alert in recent events", body, 12) + t(0.575, 'is a MENTAL BREAK warning for Bob (mood 0.22), not a raid."', body, 12) + + t(0.44, '"Drafting Bob in his current state would be', hot, 14, "bold") + t(0.375, ' the single most dangerous \'defensive\'', hot, 14, "bold") + t(0.31, ' action available."', hot, 14, "bold") + + t(0.14, "Our scoring zeroed threat_response for this. 
The agent was right.", dim, 10) + t(0.08, "github.com/AppSprout-dev/RLE", dim, 10) + + fig.savefig(RUN_DIR / "fable5_quote_card.png", facecolor=bg) + + +if __name__ == "__main__": + trajectory_chart() + shelter_chart() + quote_card() + print("wrote", RUN_DIR / "fable5_trajectory.png") + print("wrote", RUN_DIR / "fable5_wandering_shelter.png") + print("wrote", RUN_DIR / "fable5_quote_card.png") diff --git a/src/rle/orchestration/action_executor.py b/src/rle/orchestration/action_executor.py index 4ffb71b..aff34e9 100644 --- a/src/rle/orchestration/action_executor.py +++ b/src/rle/orchestration/action_executor.py @@ -39,6 +39,7 @@ class ActionOutcome(BaseModel): target_colonist_id: str | None = None success: bool error: str | None = None + parameters: dict[str, Any] = {} class ExecutionResult(BaseModel): @@ -64,6 +65,31 @@ def _extract_rimapi_error(detail: str) -> str: return detail +def _normalize_work_priorities(params: dict[str, Any]) -> dict[str, int]: + """Accept the parameter shapes models actually emit for work_priority. + + Documented shape is flat ``{"": <1-4>}``, but frontier models + also emit ``{"work_type": "Research", "priority": 2}`` and + ``{"work_priorities": {"Growing": 1, ...}}`` (issue #27). Passing those + through verbatim posted garbage like work="work_type" to RIMAPI. + """ + nested = params.get("work_priorities") + if isinstance(nested, dict): + return {str(work): int(pri) for work, pri in nested.items()} + if "work_type" in params: + return {str(params["work_type"]): int(params.get("priority", 1))} + flat = { + str(work): int(pri) for work, pri in params.items() + if isinstance(pri, int) and not isinstance(pri, bool) + } + if not flat: + raise ValueError( + 'work_priority requires {"": <1-4>} parameters ' + "(e.g. {\"Growing\": 1})" + ) + return flat + + class ActionExecutor: """Dispatches agent actions to RIMAPI write endpoints. 
@@ -93,6 +119,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult: endpoint=endpoint, target_colonist_id=action.target_colonist_id, success=True, + parameters=action.parameters, )) except RimAPIResponseError as exc: logger.warning("Action %s failed: %s", endpoint, exc.detail) @@ -103,6 +130,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult: target_colonist_id=action.target_colonist_id, success=False, error=_extract_rimapi_error(exc.detail), + parameters=action.parameters, )) except Exception as exc: logger.warning("Action %s failed", endpoint, exc_info=True) @@ -113,6 +141,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult: target_colonist_id=action.target_colonist_id, success=False, error=str(exc) or type(exc).__name__, + parameters=action.parameters, )) return ExecutionResult( executed=executed, @@ -149,7 +178,9 @@ async def _dispatch(self, action: Action, endpoint: str) -> None: # -- Specialized handlers (parameter mapping for complex DTOs) ----------- async def _h_work_priority(self, cid: str, params: dict[str, Any]) -> None: - await self._client.set_work_priorities(cid, params) + await self._client.set_work_priorities( + cid, _normalize_work_priorities(params), + ) async def _h_draft(self, cid: str, params: dict[str, Any]) -> None: is_drafted = params.get("is_drafted", True) @@ -159,11 +190,21 @@ async def _h_move(self, cid: str, params: dict[str, Any]) -> None: await self._client.move_colonist(cid, params.get("x", 0), params.get("z", 0)) async def _h_job_assign(self, cid: str, params: dict[str, Any]) -> None: + # Models emit "job" as often as the documented "job_def" (issue #27); + # posting an empty JobDef is a guaranteed RIMAPI error. + job = params.get("job_def") or params.get("job") or "" + if not job: + raise ValueError( + 'job_assign requires a "job_def" parameter (e.g. 
"Sow", "Mine")' + ) + target_position = params.get("target_position") + if target_position is None and "x" in params and "z" in params: + target_position = (int(params["x"]), int(params["z"])) await self._client.set_colonist_job( cid, - job=params.get("job_def", ""), + job=str(job), target_thing_id=params.get("target_thing_id"), - target_position=params.get("target_position"), + target_position=target_position, ) async def _h_time_assignment(self, cid: str, params: dict[str, Any]) -> None: diff --git a/src/rle/orchestration/game_loop.py b/src/rle/orchestration/game_loop.py index 35369b0..723f3cc 100644 --- a/src/rle/orchestration/game_loop.py +++ b/src/rle/orchestration/game_loop.py @@ -13,7 +13,7 @@ from felix_agent_sdk.visualization import HelixVisualizer from pydantic import BaseModel, ConfigDict -from rle.agents.actions import ActionPlan, ActionPlanParseError +from rle.agents.actions import ActionPlan, ActionPlanParseError, resolve_endpoint from rle.agents.base_role import RimWorldRoleAgent from rle.config import RLEConfig from rle.orchestration.action_executor import ActionExecutor, ExecutionResult @@ -387,14 +387,46 @@ def _export_tick_json( json.dumps(data, indent=2), ) - def _update_metric_context(self, result: TickResult, state: GameState) -> None: + def _update_metric_context( + self, result: TickResult, state: GameState, tick_num: int, + ) -> None: """Append tick data to metric context for scoring history.""" - self._metric_context.tick_results.append(result) - self._metric_context.state_history.append(state) + ctx = self._metric_context + ctx.tick_results.append(result) + ctx.state_history.append(state) + already_drafted = any(c.is_drafted for c in state.colonists) + seen_ids = {t.threat_id for t in ctx.threats_seen} for threat in state.threats: - seen_ids = {t.threat_id for t in self._metric_context.threats_seen} - if threat.threat_id not in seen_ids: - self._metric_context.threats_seen.append(threat) + # Null incident placeholders (the /incidents 
endpoint emits them) + # are not threats — counting one made threat_response unwinnable + # in a run with zero hostiles (issue #25). + if threat.enemy_count <= 0 and threat.threat_level <= 0.0: + continue + if threat.threat_id in seen_ids: + continue + ctx.threats_seen.append(threat) + ctx.threat_seen_tick[threat.threat_id] = tick_num + if already_drafted: + ctx.first_draft_tick[threat.threat_id] = 0 + + def _record_draft_response( + self, exec_result: ExecutionResult, tick_num: int, + ) -> None: + """Record per-threat response delay once a draft action executes (#25).""" + drafted = any( + o.success + and resolve_endpoint(o.action_type) == "draft" + and o.parameters.get("is_drafted", True) is not False + for o in exec_result.outcomes + ) + if not drafted: + return + ctx = self._metric_context + for threat in ctx.threats_seen: + seen = ctx.threat_seen_tick.get(threat.threat_id, tick_num) + ctx.first_draft_tick.setdefault( + threat.threat_id, max(0, tick_num - seen), + ) def _broadcast_phase_if_changed(self, current_time: float) -> None: """Broadcast PHASE_ANNOUNCE when macro_time crosses a phase boundary.""" @@ -573,8 +605,12 @@ async def run_tick(self) -> TickResult: target=outcome.target_colonist_id, success=outcome.success, error=outcome.error, + parameters=outcome.parameters, ) + # 7a. Track draft responses for the threat_response metric (issue #25) + self._record_draft_response(exec_result, tick_num) + # 7b. Surface per-action errors back to agents so they can avoid # re-proposing the same invalid action next tick (e.g. researching # an already-finished project, setting priority for a disabled work type). @@ -658,7 +694,7 @@ async def run_tick(self) -> TickResult: self._tick_results.append(result) # 9. 
Update metric context and evaluate scenario - self._update_metric_context(result, state) + self._update_metric_context(result, state, tick_num) if self._evaluator: eval_result = self._evaluator.evaluate( state, self._metric_context, tick_count=len(self._tick_results), diff --git a/src/rle/rimapi/client.py b/src/rle/rimapi/client.py index bc8bd54..3759b8f 100644 --- a/src/rle/rimapi/client.py +++ b/src/rle/rimapi/client.py @@ -64,6 +64,7 @@ class RimAPIClient: def __init__(self, base_url: str = "http://localhost:8765") -> None: self._base_url = base_url.rstrip("/") self._client: httpx.AsyncClient | None = None + self._terrain_summary_pin: TerrainSummary | None = None async def __aenter__(self) -> RimAPIClient: self._client = httpx.AsyncClient(base_url=self._base_url, timeout=10.0) @@ -686,7 +687,14 @@ async def get_terrain_summary( Decodes the RLE terrain grid, classifies tiles, and finds the best areas for building, farming, and stockpiling near the colony center. + + The first successful summary is pinned for the client's lifetime: + terrain is static, and re-deriving the colony center from live pawn + positions every tick made the recommended sites chase the builders — + 10 shelter blueprints at 10 locations, none completed (issue #26). 
""" + if self._terrain_summary_pin is not None: + return self._terrain_summary_pin try: data = await self._get(f"/api/v1/map/terrain?map_id={map_id}") if not isinstance(data, dict): @@ -803,13 +811,15 @@ def _find_clear_rect( # Stockpile: buildable 5x5 near center stockpile = _find_clear_rect(cx, cz, 5, _is_buildable) - return TerrainSummary( + summary = TerrainSummary( colony_center=(cx, cz), water_areas=water_areas, recommended_shelter=shelter, recommended_farm=farm, recommended_stockpile=stockpile, ) + self._terrain_summary_pin = summary + return summary except (RimAPIResponseError, RimAPIConnectionError): return None diff --git a/src/rle/scoring/metrics.py b/src/rle/scoring/metrics.py index 7185c88..4dad794 100644 --- a/src/rle/scoring/metrics.py +++ b/src/rle/scoring/metrics.py @@ -19,6 +19,9 @@ class MetricContext: tick_results: list[TickResult] = field(default_factory=list) state_history: list[GameState] = field(default_factory=list) threats_seen: list[ThreatData] = field(default_factory=list) + # Loop tick at which each threat was first observed (issue #25) + threat_seen_tick: dict[str, int] = field(default_factory=dict) + # Response delay in loop ticks per threat, recorded when a draft executes first_draft_tick: dict[str, int] = field(default_factory=dict) initial_wealth: float = 0.0 # Process metrics (populated by game loop after conflict resolution) diff --git a/src/rle/tracking/metadata.py b/src/rle/tracking/metadata.py index f750e33..7e85d6d 100644 --- a/src/rle/tracking/metadata.py +++ b/src/rle/tracking/metadata.py @@ -15,10 +15,13 @@ # metric implementations, or composite math change in a way that makes scores # from older runs not directly comparable. The leaderboard re-scores artifacts # at the current version on render; mismatches are surfaced, not silently -# elided. Current scope (1.0): the original 10-metric composite as wired by -# PR #16 / #18, with coordination + communication_efficiency at the broken +# elided. 
1.1 (issue #25): threat_response now tracks actual draft responses +# (first_draft_tick wired, was permanently 0.0 once any threat registered) +# and null incident placeholders (enemy_count=0, threat_level=0.0) no longer +# count as threats — 1.0 scores with non-empty threats_seen are not +# comparable. coordination + communication_efficiency remain the broken # implementations to be repaired in Phase C. -SCORING_VERSION = "1.0" +SCORING_VERSION = "1.1" # Conventional install path for the RIMAPI Workshop mod we deploy our fork DLL # over. Best-effort — if Steam lives elsewhere set the RIMAPI_DLL_PATH env var. diff --git a/tests/unit/test_action_executor.py b/tests/unit/test_action_executor.py index f6307fd..6983989 100644 --- a/tests/unit/test_action_executor.py +++ b/tests/unit/test_action_executor.py @@ -306,3 +306,103 @@ async def test_destroy_rect(self) -> None: client.destroy_rect.assert_awaited_once_with( map_id=0, x1=5, z1=5, x2=15, z2=15, ) + + +class TestWorkPriorityNormalization: + """Issue #27: accept the parameter shapes models actually emit.""" + + async def test_work_type_shape_normalized(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="181", + parameters={"work_type": "Research", "priority": 2}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_work_priorities.assert_awaited_once_with("181", {"Research": 2}) + + async def test_nested_work_priorities_shape_normalized(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="184", + parameters={"work_priorities": {"Growing": 1, "Hauling": 2}}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_work_priorities.assert_awaited_once_with( + "184", {"Growing": 1, "Hauling": 2}, + ) + + async def 
test_flat_documented_shape_passes_through(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="187", + parameters={"Cooking": 1, "Hauling": 1}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_work_priorities.assert_awaited_once_with( + "187", {"Cooking": 1, "Hauling": 1}, + ) + + async def test_garbage_params_fail_visibly(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="181", + parameters={"note": "make Bob research"}, + )) + result = await executor.execute(plan) + assert result.failed == 1 + assert result.outcomes[0].success is False + assert "WorkType" in (result.outcomes[0].error or "") + client.set_work_priorities.assert_not_awaited() + + +class TestJobAssignNormalization: + """Issue #27: 'job' alias accepted, empty JobDef fails visibly.""" + + async def test_job_alias_accepted_with_xz_position(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="job_assign", + target_colonist_id="184", + parameters={"job": "Sow", "x": 136, "z": 142}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_colonist_job.assert_awaited_once_with( + "184", job="Sow", target_thing_id=None, target_position=(136, 142), + ) + + async def test_empty_job_fails_visibly(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="job_assign", + target_colonist_id="184", + parameters={"x": 136, "z": 142}, + )) + result = await executor.execute(plan) + assert result.failed == 1 + assert "job_def" in (result.outcomes[0].error or "") + client.set_colonist_job.assert_not_awaited() + + async def test_outcomes_carry_parameters(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + 
plan = _make_plan(Action( + action_type="job_assign", + target_colonist_id="184", + parameters={"job_def": "Mine", "x": 204, "z": 6}, + )) + result = await executor.execute(plan) + assert result.outcomes[0].parameters == {"job_def": "Mine", "x": 204, "z": 6} diff --git a/tests/unit/test_metadata.py b/tests/unit/test_metadata.py index 88ae028..c6a1fba 100644 --- a/tests/unit/test_metadata.py +++ b/tests/unit/test_metadata.py @@ -17,7 +17,7 @@ def test_scoring_version_pins_a_string() -> None: that requires this test (and the dataset card) to be updated.""" assert isinstance(SCORING_VERSION, str) assert SCORING_VERSION - assert SCORING_VERSION == "1.0" + assert SCORING_VERSION == "1.1" def test_file_sha256_returns_none_for_missing_path() -> None: diff --git a/tests/unit/test_rimapi_client.py b/tests/unit/test_rimapi_client.py index 2cab9a5..45b95f3 100644 --- a/tests/unit/test_rimapi_client.py +++ b/tests/unit/test_rimapi_client.py @@ -737,3 +737,39 @@ def handler(request: httpx.Request) -> httpx.Response: assert call_count == 1, ( f"power_info should be fetched once per tick, got {call_count}" ) + + +class TestTerrainSummaryPinning: + """Issue #26: the first terrain summary is pinned for the client lifetime.""" + + _TERRAIN = { + "width": 10, "height": 10, + "palette": ["Soil", "WaterMovingShallow", "SoilRich", "Granite_Rough"], + "grid": [100, 0], + "floor_palette": [], "floor_grid": [100, 0], + } + + async def test_summary_pinned_across_center_changes(self) -> None: + from unittest.mock import AsyncMock + + client = RimAPIClient("http://test") + client._get = AsyncMock(return_value=dict(self._TERRAIN)) # type: ignore[method-assign] + + first = await client.get_terrain_summary(colonist_positions=[(2, 2)]) + second = await client.get_terrain_summary(colonist_positions=[(8, 8)]) + + assert first is not None + assert second is first # pinned object, not recomputed + client._get.assert_awaited_once() # terrain fetched exactly once + + async def 
test_failed_fetch_does_not_pin(self) -> None: + from unittest.mock import AsyncMock + + client = RimAPIClient("http://test") + client._get = AsyncMock( # type: ignore[method-assign] + side_effect=RimAPIConnectionError("down"), + ) + assert await client.get_terrain_summary() is None + + client._get = AsyncMock(return_value=dict(self._TERRAIN)) # type: ignore[method-assign] + assert await client.get_terrain_summary() is not None diff --git a/tests/unit/test_threat_tracking.py b/tests/unit/test_threat_tracking.py new file mode 100644 index 0000000..6fe0565 --- /dev/null +++ b/tests/unit/test_threat_tracking.py @@ -0,0 +1,133 @@ +"""Tests for issue #25: threat filtering and draft-response tracking.""" + +from __future__ import annotations + +import pytest + +from rle.agents.actions import ActionPlan +from rle.orchestration.action_executor import ActionOutcome, ExecutionResult +from rle.orchestration.game_loop import RLEGameLoop, TickResult +from rle.rimapi.schemas import GameState, ThreatData +from rle.scoring.metrics import MetricContext, threat_response + + +def _loop_with_context() -> tuple[RLEGameLoop, MetricContext]: + """A bare game loop carrying only the metric context (unit scope).""" + loop = object.__new__(RLEGameLoop) + ctx = MetricContext() + loop._metric_context = ctx + return loop, ctx + + +def _threat(tid: str, enemies: int = 3, level: float = 0.5) -> ThreatData: + return ThreatData( + threat_id=tid, threat_type="raid", faction="pirates", + enemy_count=enemies, threat_level=level, + ) + + +def _tick_result(state: GameState) -> TickResult: + return TickResult( + tick=state.colony.tick, day=state.colony.day, macro_time=0.1, + plan=ActionPlan(role="test", tick=1, actions=[]), + execution=ExecutionResult(executed=0, failed=0, total=0), + score=None, + ) + + +def _draft_outcome(success: bool = True, is_drafted: bool = True) -> ActionOutcome: + return ActionOutcome( + action_type="draft", endpoint="draft", target_colonist_id="181", + success=success, 
parameters={"is_drafted": is_drafted}, + ) + + +class TestThreatFiltering: + def test_null_placeholder_not_counted( + self, sample_game_state: GameState, + ) -> None: + loop, ctx = _loop_with_context() + state = sample_game_state.model_copy(update={ + "threats": (_threat("phantom", enemies=0, level=0.0),), + }) + loop._update_metric_context(_tick_result(state), state, tick_num=3) + assert ctx.threats_seen == [] + assert threat_response(state, ctx) == 1.0 + + def test_real_threat_recorded_with_seen_tick( + self, sample_game_state: GameState, + ) -> None: + loop, ctx = _loop_with_context() + state = sample_game_state.model_copy(update={"threats": (_threat("raid-1"),)}) + loop._update_metric_context(_tick_result(state), state, tick_num=4) + assert [t.threat_id for t in ctx.threats_seen] == ["raid-1"] + assert ctx.threat_seen_tick == {"raid-1": 4} + + def test_threat_while_already_drafted_is_instant_response( + self, sample_game_state: GameState, + ) -> None: + loop, ctx = _loop_with_context() + colonists = tuple( + c.model_copy(update={"is_drafted": True}) + for c in sample_game_state.colonists + ) + state = sample_game_state.model_copy(update={ + "threats": (_threat("raid-2"),), "colonists": colonists, + }) + loop._update_metric_context(_tick_result(state), state, tick_num=5) + assert ctx.first_draft_tick == {"raid-2": 0} + assert threat_response(state, ctx) == 1.0 + + +class TestDraftResponseRecording: + def test_draft_records_delay_per_threat(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + exec_result = ExecutionResult( + executed=1, failed=0, total=1, outcomes=(_draft_outcome(),), + ) + loop._record_draft_response(exec_result, tick_num=6) + assert ctx.first_draft_tick == {"raid-1": 2} + + def test_undraft_does_not_count(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + exec_result = 
ExecutionResult( + executed=1, failed=0, total=1, + outcomes=(_draft_outcome(is_drafted=False),), + ) + loop._record_draft_response(exec_result, tick_num=6) + assert ctx.first_draft_tick == {} + + def test_failed_draft_does_not_count(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + exec_result = ExecutionResult( + executed=0, failed=1, total=1, + outcomes=(_draft_outcome(success=False),), + ) + loop._record_draft_response(exec_result, tick_num=6) + assert ctx.first_draft_tick == {} + + def test_first_response_is_not_overwritten(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + ctx.first_draft_tick["raid-1"] = 1 + exec_result = ExecutionResult( + executed=1, failed=0, total=1, outcomes=(_draft_outcome(),), + ) + loop._record_draft_response(exec_result, tick_num=9) + assert ctx.first_draft_tick == {"raid-1": 1} + + def test_metric_rewards_fast_response( + self, sample_game_state: GameState, + ) -> None: + _, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.first_draft_tick["raid-1"] = 2 + # 2-tick response → 1 - 2/10 = 0.8 + assert threat_response(sample_game_state, ctx) == pytest.approx(0.8)