From 98ee8e2d4ea25a0870ac828ae9ebe5b7dee88266 Mon Sep 17 00:00:00 2001 From: jkbennitt Date: Tue, 9 Jun 2026 23:56:39 -0400 Subject: [PATCH] Fix run-integrity issues #25/#26/#27 from the Fable 5 live run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #25 — threat_response was unwinnable: first_draft_tick was never written anywhere, so the metric zeroed permanently once any threat registered, and null incident placeholders (enemy_count=0, threat_level=0.0) counted as threats. Draft-action execution now records per-threat response delays (instant if already drafted when the threat appears), placeholders are filtered, and SCORING_VERSION bumps to 1.1 — 1.0 scores with non-empty threats_seen are not comparable. #26 — MAP_SUMMARY sites chased the builders: the colony center was re-derived from live pawn positions every tick, invalidating the previous shelter blueprint (10 sites in 10 ticks, none built). The first successful terrain summary is now pinned for the client's lifetime; terrain is static. #27 — pawn-targeted writes were 1/13: work_priority params passed verbatim posted garbage (work="work_type", priority="Research") and job_assign defaulted a missing job_def to an empty string. Both handlers now normalize the shapes models actually emit, and unnormalizable params fail visibly through ActionOutcome instead of posting junk. Observability: ActionOutcome (and ACTION_EXEC events) now carry the action parameters — debugging #27 required digging payloads out of raw LLM output because events omitted them. Also includes the launch-chart/quote-card generation script for the Fable 5 run artifacts. 
Co-Authored-By: Claude Fable 5 --- results/benchmark_history.jsonl | 1 + scripts/analysis/fable5_launch_charts.py | 140 +++++++++++++++++++++++ src/rle/orchestration/action_executor.py | 47 +++++++- src/rle/orchestration/game_loop.py | 52 +++++++-- src/rle/rimapi/client.py | 12 +- src/rle/scoring/metrics.py | 3 + src/rle/tracking/metadata.py | 9 +- tests/unit/test_action_executor.py | 100 ++++++++++++++++ tests/unit/test_metadata.py | 2 +- tests/unit/test_rimapi_client.py | 36 ++++++ tests/unit/test_threat_tracking.py | 133 +++++++++++++++++++++ 11 files changed, 519 insertions(+), 16 deletions(-) create mode 100644 scripts/analysis/fable5_launch_charts.py create mode 100644 tests/unit/test_threat_tracking.py diff --git a/results/benchmark_history.jsonl b/results/benchmark_history.jsonl index 89b17d1..e988b70 100644 --- a/results/benchmark_history.jsonl +++ b/results/benchmark_history.jsonl @@ -3,3 +3,4 @@ {"timestamp": "2026-06-10T00:59:13.107962+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8265, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 4, "total_completion_tokens": 1903, "total_tokens": 1907, "estimated_cost_usd": 0.09519, "wall_time_s": 258.38, "num_calls": 2, "prompt_price_per_token": 1e-05, 
"completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 82, "errors_by_type": {"parse_failure": 12}, "avg_deliberation_ms": 46786.6, "action_success_rate": 1.0, "total_tokens": 1907, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8265, "outcome": "timeout", "ticks": 2}]} {"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]} {"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": 
"295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]} +{"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": 
"claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, "ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.7536, "outcome": "timeout", "ticks": 10}]} diff --git a/scripts/analysis/fable5_launch_charts.py b/scripts/analysis/fable5_launch_charts.py new file mode 100644 index 0000000..fa438c1 --- /dev/null +++ b/scripts/analysis/fable5_launch_charts.py @@ -0,0 +1,140 @@ +"""One-off chart generation for the Fable 5 launch thread. + +Reads results/fable5-live-N1 artifacts and writes two PNGs next to them: +score trajectory annotated with the threat_response artifact, and the +wandering-shelter blueprint map (issue #26). 
+""" + +from __future__ import annotations + +import csv +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle + +RUN_DIR = Path("results/fable5-live-N1") +THREAT_WEIGHT = 0.08 # crashlanded scenario override + +# Blueprint rectangle origins per tick (from events.jsonl, issue #26) +SHELTER_SITES = [ + (133, 139), (133, 145), (140, 134), (147, 135), (152, 139), + (153, 127), (156, 119), (144, 128), (145, 135), (145, 138), +] +WATER = (93, 105, 132, 185) # approx, per DefenseCommander terrain reads + + +def load_scores() -> tuple[list[int], list[float], list[float]]: + ticks, composite, threat = [], [], [] + with open(RUN_DIR / "01_crashlanded_survival.csv", encoding="utf-8") as f: + for i, row in enumerate(csv.DictReader(f)): + ticks.append(i) + composite.append(float(row["composite"])) + threat.append(float(row["threat_response"])) + return ticks, composite, threat + + +def trajectory_chart() -> None: + ticks, composite, threat = load_scores() + # Composite with the threat_response zeroing backed out + ex_artifact = [c + (1.0 - t) * THREAT_WEIGHT for c, t in zip(composite, threat)] + decay = [composite[0] - 0.013 * i for i in ticks] + + fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160) + ax.plot(ticks, composite, "o-", lw=2.5, color="#d4452c", label="Fable 5 composite (as scored)") + ax.plot(ticks, ex_artifact, "o--", lw=2, color="#2c7fb8", + label="Ex-artifact (threat_response bug backed out)") + ax.plot(ticks, decay, ":", lw=1.5, color="#888888", + label="Natural decay reference (−0.013/tick)") + + drop_tick = next(i for i, t in enumerate(threat) if t == 0.0) + ax.annotate( + "threat_response → 0.0\n(no raid ever existed —\n" + "agent refused to draft, correctly.\nHarness bug, RLE issue #25)", + xy=(drop_tick, composite[drop_tick]), + xytext=(drop_tick - 4.4, composite[drop_tick] - 0.018), + fontsize=9, color="#d4452c", + arrowprops={"arrowstyle": "->", "color": 
"#d4452c"}, + ) + ax.set_xlabel("Tick") + ax.set_ylabel("Composite score") + ax.set_title("Claude Fable 5 × RimWorld Crashlanded — 10 ticks, 7 agents (RLE, seed 42)") + ax.legend(loc="lower left", fontsize=9) + ax.grid(alpha=0.25) + fig.tight_layout() + fig.savefig(RUN_DIR / "fable5_trajectory.png") + + +def shelter_chart() -> None: + fig, ax = plt.subplots(figsize=(7.5, 7.5), dpi=160) + wx1, wz1, wx2, wz2 = WATER + ax.add_patch(Rectangle((wx1, wz1), wx2 - wx1, wz2 - wz1, + facecolor="#a6cee3", edgecolor="none", alpha=0.6)) + ax.text((wx1 + wx2) / 2, (wz1 + wz2) / 2, "water", ha="center", + color="#1f6090", fontsize=10, style="italic") + + xs = [s[0] for s in SHELTER_SITES] + zs = [s[1] for s in SHELTER_SITES] + for i, (x, z) in enumerate(SHELTER_SITES): + ax.add_patch(Rectangle((x, z), 7, 7, facecolor="none", + edgecolor="#d4452c", lw=1.4, alpha=0.85)) + ax.annotate(str(i), (x + 3.5, z + 3.5), ha="center", va="center", + fontsize=9, color="#d4452c", weight="bold") + ax.plot( + [x + 3.5 for x in xs], [z + 3.5 for z in zs], + "--", color="#555555", lw=1, alpha=0.7, + ) + + ax.set_xlim(85, 175) + ax.set_ylim(100, 195) + ax.set_aspect("equal") + ax.set_xlabel("map x") + ax.set_ylabel("map z") + ax.set_title( + "The wandering shelter — one blueprint per tick, ticks 0–9\n" + "10 sites, 10 successful placements, 0 shelters built (RLE issue #26)", + fontsize=11, + ) + ax.grid(alpha=0.2) + fig.tight_layout() + fig.savefig(RUN_DIR / "fable5_wandering_shelter.png") + + +def quote_card() -> None: + """Transcript card for the DefenseCommander tick-9 quote (verbatim).""" + bg, dim, body, hot = "#0d1117", "#8b949e", "#c9d1d9", "#f0883e" + fig = plt.figure(figsize=(7.5, 4.22), dpi=160) + fig.patch.set_facecolor(bg) + + def t(y: float, s: str, color: str, size: int, weight: str = "normal") -> None: + fig.text(0.06, y, s, color=color, fontsize=size, family="monospace", + weight=weight, va="top") + + t(0.93, "RLE · RimWorld Learning Environment — events.jsonl, verbatim", dim, 10) + 
def _normalize_work_priorities(params: dict[str, Any]) -> dict[str, int]:
    """Normalize the parameter shapes models emit for ``work_priority``.

    Three shapes are accepted, checked in this order:

    1. Nested: ``{"work_priorities": {"Growing": 1, ...}}``
    2. Keyed: ``{"work_type": "Research", "priority": 2}`` (``priority``
       defaults to 1 when omitted)
    3. Flat (the documented shape): ``{"<WorkType>": <1-4>}``. Only entries
       whose value is a real ``int`` are kept, so free-text extras such as
       ``"reason"`` are ignored rather than posted as work types.

    Passing model output through verbatim posted garbage like
    work="work_type" to RIMAPI (issue #27), so anything that cannot be
    normalized raises instead of being forwarded.

    Args:
        params: Raw action parameters as emitted by the model.

    Returns:
        Mapping of work-type name to integer priority; never empty.

    Raises:
        ValueError: no usable work type/priority pair was found, the
            ``work_type`` is not a non-empty string, or a priority is a
            bool or not convertible to ``int``.
    """

    def _priority(work: Any, raw: Any) -> int:
        # bool is an int subclass: int(True) == 1 would silently turn a
        # flag into a top priority, so reject it explicitly.
        if isinstance(raw, bool):
            raise ValueError(
                f"work_priority for {work!r} must be an integer, got {raw!r}"
            )
        try:
            return int(raw)
        except (TypeError, ValueError):
            raise ValueError(
                f"work_priority for {work!r} must be an integer, got {raw!r}"
            ) from None

    nested = params.get("work_priorities")
    if isinstance(nested, dict):
        normalized = {
            str(work): _priority(work, pri) for work, pri in nested.items()
        }
    elif "work_type" in params:
        work_type = params["work_type"]
        if not isinstance(work_type, str) or not work_type.strip():
            # str(None) == "None" would be posted as a work type otherwise.
            raise ValueError(
                'work_priority "work_type" must be a non-empty string, '
                f"got {work_type!r}"
            )
        normalized = {
            work_type: _priority(work_type, params.get("priority", 1)),
        }
    else:
        normalized = {
            str(work): pri for work, pri in params.items()
            if isinstance(pri, int) and not isinstance(pri, bool)
        }

    # An empty result (including an empty nested dict) must fail visibly
    # instead of posting a no-op update that is reported as a success.
    if not normalized:
        raise ValueError(
            'work_priority requires {"<WorkType>": <1-4>} parameters '
            "(e.g. {\"Growing\": 1})"
        )
    return normalized
def _update_metric_context(
    self, result: TickResult, state: GameState, tick_num: int,
) -> None:
    """Append tick data to the metric context and register new threats.

    Args:
        result: The finished tick; appended to ``tick_results``. Its
            ``execution.outcomes`` are inspected for a successful draft.
        state: The game state scored this tick; appended to
            ``state_history`` and scanned for threats.
        tick_num: Loop tick recorded as the first-seen tick of new threats.

    Each real threat is registered once in ``threats_seen`` /
    ``threat_seen_tick``. When colonists are already drafted, or a draft
    succeeded during this same tick, the threat gets a response delay of 0
    in ``first_draft_tick``.
    """
    ctx = self._metric_context
    ctx.tick_results.append(result)
    ctx.state_history.append(state)

    already_drafted = any(c.is_drafted for c in state.colonists)

    # run_tick records draft responses (step 7a) before this method
    # registers the tick's new threats (step 9), so a draft executed on the
    # very tick a threat first appears is not credited there. Credit it here
    # as an instant response; otherwise the fastest possible reaction would
    # leave no first_draft_tick entry at all.
    # NOTE(review): assumes result.execution is the ExecutionResult of this
    # tick's actions — confirm against run_tick.
    execution = getattr(result, "execution", None)
    outcomes = getattr(execution, "outcomes", None) or ()
    drafted_this_tick = any(
        o.success
        and resolve_endpoint(o.action_type) == "draft"
        and o.parameters.get("is_drafted", True) is not False
        for o in outcomes
    )

    seen_ids = {t.threat_id for t in ctx.threats_seen}
    for threat in state.threats:
        # Null incident placeholders (the /incidents endpoint emits them)
        # are not threats — counting one made threat_response unwinnable
        # in a run with zero hostiles (issue #25).
        if threat.enemy_count <= 0 and threat.threat_level <= 0.0:
            continue
        if threat.threat_id in seen_ids:
            continue
        # Keep the set current so a threat id repeated within this same
        # snapshot is registered only once.
        seen_ids.add(threat.threat_id)
        ctx.threats_seen.append(threat)
        ctx.threat_seen_tick[threat.threat_id] = tick_num
        if already_drafted or drafted_this_tick:
            ctx.first_draft_tick[threat.threat_id] = 0
researching # an already-finished project, setting priority for a disabled work type). @@ -658,7 +694,7 @@ async def run_tick(self) -> TickResult: self._tick_results.append(result) # 9. Update metric context and evaluate scenario - self._update_metric_context(result, state) + self._update_metric_context(result, state, tick_num) if self._evaluator: eval_result = self._evaluator.evaluate( state, self._metric_context, tick_count=len(self._tick_results), diff --git a/src/rle/rimapi/client.py b/src/rle/rimapi/client.py index bc8bd54..3759b8f 100644 --- a/src/rle/rimapi/client.py +++ b/src/rle/rimapi/client.py @@ -64,6 +64,7 @@ class RimAPIClient: def __init__(self, base_url: str = "http://localhost:8765") -> None: self._base_url = base_url.rstrip("/") self._client: httpx.AsyncClient | None = None + self._terrain_summary_pin: TerrainSummary | None = None async def __aenter__(self) -> RimAPIClient: self._client = httpx.AsyncClient(base_url=self._base_url, timeout=10.0) @@ -686,7 +687,14 @@ async def get_terrain_summary( Decodes the RLE terrain grid, classifies tiles, and finds the best areas for building, farming, and stockpiling near the colony center. + + The first successful summary is pinned for the client's lifetime: + terrain is static, and re-deriving the colony center from live pawn + positions every tick made the recommended sites chase the builders — + 10 shelter blueprints at 10 locations, none completed (issue #26). 
""" + if self._terrain_summary_pin is not None: + return self._terrain_summary_pin try: data = await self._get(f"/api/v1/map/terrain?map_id={map_id}") if not isinstance(data, dict): @@ -803,13 +811,15 @@ def _find_clear_rect( # Stockpile: buildable 5x5 near center stockpile = _find_clear_rect(cx, cz, 5, _is_buildable) - return TerrainSummary( + summary = TerrainSummary( colony_center=(cx, cz), water_areas=water_areas, recommended_shelter=shelter, recommended_farm=farm, recommended_stockpile=stockpile, ) + self._terrain_summary_pin = summary + return summary except (RimAPIResponseError, RimAPIConnectionError): return None diff --git a/src/rle/scoring/metrics.py b/src/rle/scoring/metrics.py index 7185c88..4dad794 100644 --- a/src/rle/scoring/metrics.py +++ b/src/rle/scoring/metrics.py @@ -19,6 +19,9 @@ class MetricContext: tick_results: list[TickResult] = field(default_factory=list) state_history: list[GameState] = field(default_factory=list) threats_seen: list[ThreatData] = field(default_factory=list) + # Loop tick at which each threat was first observed (issue #25) + threat_seen_tick: dict[str, int] = field(default_factory=dict) + # Response delay in loop ticks per threat, recorded when a draft executes first_draft_tick: dict[str, int] = field(default_factory=dict) initial_wealth: float = 0.0 # Process metrics (populated by game loop after conflict resolution) diff --git a/src/rle/tracking/metadata.py b/src/rle/tracking/metadata.py index f750e33..7e85d6d 100644 --- a/src/rle/tracking/metadata.py +++ b/src/rle/tracking/metadata.py @@ -15,10 +15,13 @@ # metric implementations, or composite math change in a way that makes scores # from older runs not directly comparable. The leaderboard re-scores artifacts # at the current version on render; mismatches are surfaced, not silently -# elided. Current scope (1.0): the original 10-metric composite as wired by -# PR #16 / #18, with coordination + communication_efficiency at the broken +# elided. 
1.1 (issue #25): threat_response now tracks actual draft responses +# (first_draft_tick wired, was permanently 0.0 once any threat registered) +# and null incident placeholders (enemy_count=0, threat_level=0.0) no longer +# count as threats — 1.0 scores with non-empty threats_seen are not +# comparable. coordination + communication_efficiency remain the broken # implementations to be repaired in Phase C. -SCORING_VERSION = "1.0" +SCORING_VERSION = "1.1" # Conventional install path for the RIMAPI Workshop mod we deploy our fork DLL # over. Best-effort — if Steam lives elsewhere set the RIMAPI_DLL_PATH env var. diff --git a/tests/unit/test_action_executor.py b/tests/unit/test_action_executor.py index f6307fd..6983989 100644 --- a/tests/unit/test_action_executor.py +++ b/tests/unit/test_action_executor.py @@ -306,3 +306,103 @@ async def test_destroy_rect(self) -> None: client.destroy_rect.assert_awaited_once_with( map_id=0, x1=5, z1=5, x2=15, z2=15, ) + + +class TestWorkPriorityNormalization: + """Issue #27: accept the parameter shapes models actually emit.""" + + async def test_work_type_shape_normalized(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="181", + parameters={"work_type": "Research", "priority": 2}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_work_priorities.assert_awaited_once_with("181", {"Research": 2}) + + async def test_nested_work_priorities_shape_normalized(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="184", + parameters={"work_priorities": {"Growing": 1, "Hauling": 2}}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_work_priorities.assert_awaited_once_with( + "184", {"Growing": 1, "Hauling": 2}, + ) + + async def 
test_flat_documented_shape_passes_through(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="187", + parameters={"Cooking": 1, "Hauling": 1}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_work_priorities.assert_awaited_once_with( + "187", {"Cooking": 1, "Hauling": 1}, + ) + + async def test_garbage_params_fail_visibly(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="work_priority", + target_colonist_id="181", + parameters={"note": "make Bob research"}, + )) + result = await executor.execute(plan) + assert result.failed == 1 + assert result.outcomes[0].success is False + assert "WorkType" in (result.outcomes[0].error or "") + client.set_work_priorities.assert_not_awaited() + + +class TestJobAssignNormalization: + """Issue #27: 'job' alias accepted, empty JobDef fails visibly.""" + + async def test_job_alias_accepted_with_xz_position(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="job_assign", + target_colonist_id="184", + parameters={"job": "Sow", "x": 136, "z": 142}, + )) + result = await executor.execute(plan) + assert result.executed == 1 + client.set_colonist_job.assert_awaited_once_with( + "184", job="Sow", target_thing_id=None, target_position=(136, 142), + ) + + async def test_empty_job_fails_visibly(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + plan = _make_plan(Action( + action_type="job_assign", + target_colonist_id="184", + parameters={"x": 136, "z": 142}, + )) + result = await executor.execute(plan) + assert result.failed == 1 + assert "job_def" in (result.outcomes[0].error or "") + client.set_colonist_job.assert_not_awaited() + + async def test_outcomes_carry_parameters(self) -> None: + client = AsyncMock() + executor = ActionExecutor(client) + 
plan = _make_plan(Action( + action_type="job_assign", + target_colonist_id="184", + parameters={"job_def": "Mine", "x": 204, "z": 6}, + )) + result = await executor.execute(plan) + assert result.outcomes[0].parameters == {"job_def": "Mine", "x": 204, "z": 6} diff --git a/tests/unit/test_metadata.py b/tests/unit/test_metadata.py index 88ae028..c6a1fba 100644 --- a/tests/unit/test_metadata.py +++ b/tests/unit/test_metadata.py @@ -17,7 +17,7 @@ def test_scoring_version_pins_a_string() -> None: that requires this test (and the dataset card) to be updated.""" assert isinstance(SCORING_VERSION, str) assert SCORING_VERSION - assert SCORING_VERSION == "1.0" + assert SCORING_VERSION == "1.1" def test_file_sha256_returns_none_for_missing_path() -> None: diff --git a/tests/unit/test_rimapi_client.py b/tests/unit/test_rimapi_client.py index 2cab9a5..45b95f3 100644 --- a/tests/unit/test_rimapi_client.py +++ b/tests/unit/test_rimapi_client.py @@ -737,3 +737,39 @@ def handler(request: httpx.Request) -> httpx.Response: assert call_count == 1, ( f"power_info should be fetched once per tick, got {call_count}" ) + + +class TestTerrainSummaryPinning: + """Issue #26: the first terrain summary is pinned for the client lifetime.""" + + _TERRAIN = { + "width": 10, "height": 10, + "palette": ["Soil", "WaterMovingShallow", "SoilRich", "Granite_Rough"], + "grid": [100, 0], + "floor_palette": [], "floor_grid": [100, 0], + } + + async def test_summary_pinned_across_center_changes(self) -> None: + from unittest.mock import AsyncMock + + client = RimAPIClient("http://test") + client._get = AsyncMock(return_value=dict(self._TERRAIN)) # type: ignore[method-assign] + + first = await client.get_terrain_summary(colonist_positions=[(2, 2)]) + second = await client.get_terrain_summary(colonist_positions=[(8, 8)]) + + assert first is not None + assert second is first # pinned object, not recomputed + client._get.assert_awaited_once() # terrain fetched exactly once + + async def 
test_failed_fetch_does_not_pin(self) -> None: + from unittest.mock import AsyncMock + + client = RimAPIClient("http://test") + client._get = AsyncMock( # type: ignore[method-assign] + side_effect=RimAPIConnectionError("down"), + ) + assert await client.get_terrain_summary() is None + + client._get = AsyncMock(return_value=dict(self._TERRAIN)) # type: ignore[method-assign] + assert await client.get_terrain_summary() is not None diff --git a/tests/unit/test_threat_tracking.py b/tests/unit/test_threat_tracking.py new file mode 100644 index 0000000..6fe0565 --- /dev/null +++ b/tests/unit/test_threat_tracking.py @@ -0,0 +1,133 @@ +"""Tests for issue #25: threat filtering and draft-response tracking.""" + +from __future__ import annotations + +import pytest + +from rle.agents.actions import ActionPlan +from rle.orchestration.action_executor import ActionOutcome, ExecutionResult +from rle.orchestration.game_loop import RLEGameLoop, TickResult +from rle.rimapi.schemas import GameState, ThreatData +from rle.scoring.metrics import MetricContext, threat_response + + +def _loop_with_context() -> tuple[RLEGameLoop, MetricContext]: + """A bare game loop carrying only the metric context (unit scope).""" + loop = object.__new__(RLEGameLoop) + ctx = MetricContext() + loop._metric_context = ctx + return loop, ctx + + +def _threat(tid: str, enemies: int = 3, level: float = 0.5) -> ThreatData: + return ThreatData( + threat_id=tid, threat_type="raid", faction="pirates", + enemy_count=enemies, threat_level=level, + ) + + +def _tick_result(state: GameState) -> TickResult: + return TickResult( + tick=state.colony.tick, day=state.colony.day, macro_time=0.1, + plan=ActionPlan(role="test", tick=1, actions=[]), + execution=ExecutionResult(executed=0, failed=0, total=0), + score=None, + ) + + +def _draft_outcome(success: bool = True, is_drafted: bool = True) -> ActionOutcome: + return ActionOutcome( + action_type="draft", endpoint="draft", target_colonist_id="181", + success=success, 
parameters={"is_drafted": is_drafted}, + ) + + +class TestThreatFiltering: + def test_null_placeholder_not_counted( + self, sample_game_state: GameState, + ) -> None: + loop, ctx = _loop_with_context() + state = sample_game_state.model_copy(update={ + "threats": (_threat("phantom", enemies=0, level=0.0),), + }) + loop._update_metric_context(_tick_result(state), state, tick_num=3) + assert ctx.threats_seen == [] + assert threat_response(state, ctx) == 1.0 + + def test_real_threat_recorded_with_seen_tick( + self, sample_game_state: GameState, + ) -> None: + loop, ctx = _loop_with_context() + state = sample_game_state.model_copy(update={"threats": (_threat("raid-1"),)}) + loop._update_metric_context(_tick_result(state), state, tick_num=4) + assert [t.threat_id for t in ctx.threats_seen] == ["raid-1"] + assert ctx.threat_seen_tick == {"raid-1": 4} + + def test_threat_while_already_drafted_is_instant_response( + self, sample_game_state: GameState, + ) -> None: + loop, ctx = _loop_with_context() + colonists = tuple( + c.model_copy(update={"is_drafted": True}) + for c in sample_game_state.colonists + ) + state = sample_game_state.model_copy(update={ + "threats": (_threat("raid-2"),), "colonists": colonists, + }) + loop._update_metric_context(_tick_result(state), state, tick_num=5) + assert ctx.first_draft_tick == {"raid-2": 0} + assert threat_response(state, ctx) == 1.0 + + +class TestDraftResponseRecording: + def test_draft_records_delay_per_threat(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + exec_result = ExecutionResult( + executed=1, failed=0, total=1, outcomes=(_draft_outcome(),), + ) + loop._record_draft_response(exec_result, tick_num=6) + assert ctx.first_draft_tick == {"raid-1": 2} + + def test_undraft_does_not_count(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + exec_result = 
ExecutionResult( + executed=1, failed=0, total=1, + outcomes=(_draft_outcome(is_drafted=False),), + ) + loop._record_draft_response(exec_result, tick_num=6) + assert ctx.first_draft_tick == {} + + def test_failed_draft_does_not_count(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + exec_result = ExecutionResult( + executed=0, failed=1, total=1, + outcomes=(_draft_outcome(success=False),), + ) + loop._record_draft_response(exec_result, tick_num=6) + assert ctx.first_draft_tick == {} + + def test_first_response_is_not_overwritten(self) -> None: + loop, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.threat_seen_tick["raid-1"] = 4 + ctx.first_draft_tick["raid-1"] = 1 + exec_result = ExecutionResult( + executed=1, failed=0, total=1, outcomes=(_draft_outcome(),), + ) + loop._record_draft_response(exec_result, tick_num=9) + assert ctx.first_draft_tick == {"raid-1": 1} + + def test_metric_rewards_fast_response( + self, sample_game_state: GameState, + ) -> None: + _, ctx = _loop_with_context() + ctx.threats_seen.append(_threat("raid-1")) + ctx.first_draft_tick["raid-1"] = 2 + # 2-tick response → 1 - 2/10 = 0.8 + assert threat_response(sample_game_state, ctx) == pytest.approx(0.8)