AppSprout-dev · jkbennitt · Jun 10, 2026 · Jun 10, 2026
diff --git a/results/benchmark_history.jsonl b/results/benchmark_history.jsonl
@@ -3,3 +3,4 @@
 {"timestamp": "2026-06-10T00:59:13.107962+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8265, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 4, "total_completion_tokens": 1903, "total_tokens": 1907, "estimated_cost_usd": 0.09519, "wall_time_s": 258.38, "num_calls": 2, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 82, "errors_by_type": {"parse_failure": 12}, "avg_deliberation_ms": 46786.6, "action_success_rate": 1.0, "total_tokens": 1907, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8265, "outcome": "timeout", "ticks": 2}]}
 {"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]}
 {"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]}
+{"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, "ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.7536, "outcome": "timeout", "ticks": 10}]}
diff --git a/scripts/analysis/fable5_launch_charts.py b/scripts/analysis/fable5_launch_charts.py
@@ -0,0 +1,140 @@
+"""One-off chart generation for the Fable 5 launch thread.
+
+Reads results/fable5-live-N1 artifacts and writes three PNGs next to them:
+the score trajectory annotated with the threat_response artifact, the
+wandering-shelter blueprint map (issue #26), and the tick-9 quote card.
+"""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
+
+# Directory the score CSV is read from; the generated PNGs are saved here too.
+RUN_DIR = Path("results/fable5-live-N1")
+# Weight applied to threat_response when backing its zeroing out of the
+# composite score (crashlanded scenario override).
+THREAT_WEIGHT = 0.08  # crashlanded scenario override
+
+# Blueprint rectangle origins per tick (from events.jsonl, issue #26).
+# Each entry is an (x, z) map coordinate; list position is the tick number.
+SHELTER_SITES = [
+    (133, 139), (133, 145), (140, 134), (147, 135), (152, 139),
+    (153, 127), (156, 119), (144, 128), (145, 135), (145, 138),
+]
+# Water rectangle as (x1, z1, x2, z2) opposite corners.
+WATER = (93, 105, 132, 185)  # approx, per DefenseCommander terrain reads
+
+
+def load_scores() -> tuple[list[int], list[float], list[float]]:
+    """Load the per-tick score columns from the run's crashlanded CSV.
+
+    Returns:
+        ``(ticks, composite, threat)`` as three parallel lists: the 0-based
+        index of each CSV record, its ``composite`` value and its
+        ``threat_response`` value.
+
+    Raises:
+        FileNotFoundError: if the CSV is not present under ``RUN_DIR``.
+        KeyError: if a record lacks the ``composite`` or ``threat_response``
+            column.
+        ValueError: if a score cell cannot be parsed as a float.
+    """
+    ticks: list[int] = []
+    composite: list[float] = []
+    threat: list[float] = []
+    # The csv module asks for newline="" so that it does its own newline
+    # handling; without it, quoted fields containing line breaks are mangled
+    # on platforms that translate line endings.
+    with open(
+        RUN_DIR / "01_crashlanded_survival.csv", encoding="utf-8", newline="",
+    ) as f:
+        for i, row in enumerate(csv.DictReader(f)):
+            ticks.append(i)
+            composite.append(float(row["composite"]))
+            threat.append(float(row["threat_response"]))
+    return ticks, composite, threat
+
+
+def trajectory_chart() -> None:
+    """Plot the composite score per tick and save ``fable5_trajectory.png``.
+
+    Three series are drawn: the composite as scored, the composite with the
+    threat_response zeroing backed out, and a straight-line decay reference
+    that starts at the first composite value. The first tick whose
+    threat_response is exactly 0.0 is annotated, when there is one.
+
+    Raises:
+        ValueError: if the score CSV contains no rows.
+    """
+    ticks, composite, threat = load_scores()
+    if not ticks:
+        # composite[0] below would otherwise fail with a bare IndexError.
+        raise ValueError("score CSV has no rows; nothing to plot")
+    # Composite with the threat_response zeroing backed out
+    ex_artifact = [c + (1.0 - t) * THREAT_WEIGHT for c, t in zip(composite, threat)]
+    decay = [composite[0] - 0.013 * i for i in ticks]
+
+    fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
+    ax.plot(ticks, composite, "o-", lw=2.5, color="#d4452c", label="Fable 5 composite (as scored)")
+    ax.plot(ticks, ex_artifact, "o--", lw=2, color="#2c7fb8",
+            label="Ex-artifact (threat_response bug backed out)")
+    ax.plot(ticks, decay, ":", lw=1.5, color="#888888",
+            label="Natural decay reference (−0.013/tick)")
+
+    # A run in which threat_response never reaches 0.0 has nothing to
+    # annotate; a bare next() would abort the script with StopIteration.
+    drop_tick = next((i for i, t in enumerate(threat) if t == 0.0), None)
+    if drop_tick is not None:
+        ax.annotate(
+            "threat_response → 0.0\n(no raid ever existed —\n"
+            "agent refused to draft, correctly.\nHarness bug, RLE issue #25)",
+            xy=(drop_tick, composite[drop_tick]),
+            xytext=(drop_tick - 4.4, composite[drop_tick] - 0.018),
+            fontsize=9, color="#d4452c",
+            arrowprops={"arrowstyle": "->", "color": "#d4452c"},
+        )
+    ax.set_xlabel("Tick")
+    ax.set_ylabel("Composite score")
+    ax.set_title("Claude Fable 5 × RimWorld Crashlanded — 10 ticks, 7 agents (RLE, seed 42)")
+    ax.legend(loc="lower left", fontsize=9)
+    ax.grid(alpha=0.25)
+    fig.tight_layout()
+    fig.savefig(RUN_DIR / "fable5_trajectory.png")
+
+
+def shelter_chart() -> None:
+    """Draw the per-tick shelter blueprint sites and save the PNG.
+
+    The water rectangle is shaded first, then every entry of
+    ``SHELTER_SITES`` is outlined as a 7x7 square labelled with its list
+    index, and the square centres are joined by a dashed path in list order.
+    The figure is written to ``fable5_wandering_shelter.png`` in ``RUN_DIR``.
+    """
+    side = 7
+    half = 3.5
+    left, bottom, right, top = WATER
+    centres = [(x + half, z + half) for x, z in SHELTER_SITES]
+
+    fig, ax = plt.subplots(figsize=(7.5, 7.5), dpi=160)
+
+    water_patch = Rectangle(
+        (left, bottom), right - left, top - bottom,
+        facecolor="#a6cee3", edgecolor="none", alpha=0.6,
+    )
+    ax.add_patch(water_patch)
+    ax.text(
+        (left + right) / 2, (bottom + top) / 2, "water",
+        ha="center", color="#1f6090", fontsize=10, style="italic",
+    )
+
+    # One outlined square plus its tick label per blueprint site.
+    for tick, (origin, centre) in enumerate(zip(SHELTER_SITES, centres)):
+        outline = Rectangle(
+            origin, side, side,
+            facecolor="none", edgecolor="#d4452c", lw=1.4, alpha=0.85,
+        )
+        ax.add_patch(outline)
+        ax.annotate(
+            str(tick), centre,
+            ha="center", va="center", fontsize=9, color="#d4452c", weight="bold",
+        )
+
+    # Dashed path through the centres, in tick order.
+    path_x = [cx for cx, _ in centres]
+    path_z = [cz for _, cz in centres]
+    ax.plot(path_x, path_z, "--", color="#555555", lw=1, alpha=0.7)
+
+    ax.set_xlim(85, 175)
+    ax.set_ylim(100, 195)
+    ax.set_aspect("equal")
+    ax.set_xlabel("map x")
+    ax.set_ylabel("map z")
+    title = (
+        "The wandering shelter — one blueprint per tick, ticks 0–9\n"
+        "10 sites, 10 successful placements, 0 shelters built (RLE issue #26)"
+    )
+    ax.set_title(title, fontsize=11)
+    ax.grid(alpha=0.2)
+    fig.tight_layout()
+    fig.savefig(RUN_DIR / "fable5_wandering_shelter.png")
+
+
+def quote_card() -> None:
+    """Transcript card for the DefenseCommander tick-9 quote (verbatim).
+
+    Every line of the card is monospace text anchored at x=0.06 with its top
+    edge at the listed y position. The card is saved as
+    ``fable5_quote_card.png`` in ``RUN_DIR`` on a dark background.
+    """
+    background, muted, plain, accent = "#0d1117", "#8b949e", "#c9d1d9", "#f0883e"
+
+    # (y, text, colour, font size, font weight), listed top to bottom.
+    rows: list[tuple[float, str, str, int, str]] = [
+        (0.93, "RLE · RimWorld Learning Environment — events.jsonl, verbatim", muted, 10, "normal"),
+        (0.86, "agent: defense_commander (claude-fable-5)   tick: 9   confidence: 0.92",
+         muted, 10, "normal"),
+        (0.74, '"Threat assessment: the single \'threat\' entry has', plain, 12, "normal"),
+        (0.685, 'enemy_count=0 and threat_level=0.0 — there is no hostile', plain, 12, "normal"),
+        (0.63, "presence on the map. The 'ThreatBig' alert in recent events", plain, 12, "normal"),
+        (0.575, 'is a MENTAL BREAK warning for Bob (mood 0.22), not a raid."', plain, 12, "normal"),
+        (0.44, '"Drafting Bob in his current state would be', accent, 14, "bold"),
+        (0.375, ' the single most dangerous \'defensive\'', accent, 14, "bold"),
+        (0.31, ' action available."', accent, 14, "bold"),
+        (0.14, "Our scoring zeroed threat_response for this. The agent was right.",
+         muted, 10, "normal"),
+        (0.08, "github.com/AppSprout-dev/RLE", muted, 10, "normal"),
+    ]
+
+    fig = plt.figure(figsize=(7.5, 4.22), dpi=160)
+    fig.patch.set_facecolor(background)
+    for y, line, colour, size, weight in rows:
+        fig.text(0.06, y, line, color=colour, fontsize=size, family="monospace",
+                 weight=weight, va="top")
+
+    fig.savefig(RUN_DIR / "fable5_quote_card.png", facecolor=background)
+
+
+if __name__ == "__main__":
+    # Render every chart first, then report the files that were written.
+    for render in (trajectory_chart, shelter_chart, quote_card):
+        render()
+    for png_name in (
+        "fable5_trajectory.png",
+        "fable5_wandering_shelter.png",
+        "fable5_quote_card.png",
+    ):
+        print("wrote", RUN_DIR / png_name)
diff --git a/src/rle/orchestration/action_executor.py b/src/rle/orchestration/action_executor.py
@@ -39,6 +39,7 @@ class ActionOutcome(BaseModel):
     target_colonist_id: str | None = None
     success: bool
     error: str | None = None
+    parameters: dict[str, Any] = {}
 
 
 class ExecutionResult(BaseModel):
@@ -64,6 +65,31 @@ def _extract_rimapi_error(detail: str) -> str:
     return detail
 
 
+def _normalize_work_priorities(params: dict[str, Any]) -> dict[str, int]:
+    """Accept the parameter shapes models actually emit for work_priority.
+
+    Documented shape is flat ``{"<WorkType>": <1-4>}``, but frontier models
+    also emit ``{"work_type": "Research", "priority": 2}`` and
+    ``{"work_priorities": {"Growing": 1, ...}}`` (issue #27). Passing those
+    through verbatim posted garbage like work="work_type" to RIMAPI.
+
+    Args:
+        params: Raw action parameters in any of the three shapes above.
+
+    Returns:
+        A non-empty flat ``{work type: priority}`` mapping.
+
+    Raises:
+        ValueError: if no work type/priority pair can be extracted (empty
+            mapping in any shape, or a missing/blank ``work_type``), or if a
+            priority value cannot be converted to ``int``.
+        TypeError: if a priority value is of a type ``int()`` rejects.
+    """
+    usage = (
+        'work_priority requires {"<WorkType>": <1-4>} parameters '
+        "(e.g. {\"Growing\": 1})"
+    )
+    nested = params.get("work_priorities")
+    if isinstance(nested, dict):
+        normalized = {str(work): int(pri) for work, pri in nested.items()}
+    elif "work_type" in params:
+        work_type = params["work_type"]
+        # str(None) / "" would be posted as a bogus work type name, which is
+        # exactly the garbage this function exists to keep away from RIMAPI.
+        if not isinstance(work_type, str) or not work_type.strip():
+            raise ValueError(usage)
+        normalized = {work_type: int(params.get("priority", 1))}
+    else:
+        # Flat shape: keep integer-valued entries only; bools are ints in
+        # Python but are never a priority.
+        normalized = {
+            str(work): int(pri) for work, pri in params.items()
+            if isinstance(pri, int) and not isinstance(pri, bool)
+        }
+    # Applies to every shape, so an empty nested mapping is rejected too
+    # instead of being posted as a no-op.
+    if not normalized:
+        raise ValueError(usage)
+    return normalized
+
+
 class ActionExecutor:
     """Dispatches agent actions to RIMAPI write endpoints.
 
@@ -93,6 +119,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult:
                     endpoint=endpoint,
                     target_colonist_id=action.target_colonist_id,
                     success=True,
+                    parameters=action.parameters,
                 ))
             except RimAPIResponseError as exc:
                 logger.warning("Action %s failed: %s", endpoint, exc.detail)
@@ -103,6 +130,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult:
                     target_colonist_id=action.target_colonist_id,
                     success=False,
                     error=_extract_rimapi_error(exc.detail),
+                    parameters=action.parameters,
                 ))
             except Exception as exc:
                 logger.warning("Action %s failed", endpoint, exc_info=True)
@@ -113,6 +141,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult:
                     target_colonist_id=action.target_colonist_id,
                     success=False,
                     error=str(exc) or type(exc).__name__,
+                    parameters=action.parameters,
                 ))
         return ExecutionResult(
             executed=executed,
@@ -149,7 +178,9 @@ async def _dispatch(self, action: Action, endpoint: str) -> None:
     # -- Specialized handlers (parameter mapping for complex DTOs) -----------
 
     async def _h_work_priority(self, cid: str, params: dict[str, Any]) -> None:
+        # Parameters go through _normalize_work_priorities before being sent,
+        # so the client receives a flat {work type: priority} mapping; the
+        # normalizer raises ValueError when none can be extracted.
-        await self._client.set_work_priorities(cid, params)
+        await self._client.set_work_priorities(
+            cid, _normalize_work_priorities(params),
+        )
 
     async def _h_draft(self, cid: str, params: dict[str, Any]) -> None:
         is_drafted = params.get("is_drafted", True)
@@ -159,11 +190,21 @@ async def _h_move(self, cid: str, params: dict[str, Any]) -> None:
         await self._client.move_colonist(cid, params.get("x", 0), params.get("z", 0))
 
     async def _h_job_assign(self, cid: str, params: dict[str, Any]) -> None:
+        # Reads the job name from "job_def", falling back to "job", and raises
+        # ValueError when both are missing or empty. When "target_position" is
+        # absent but both "x" and "z" are present, they are converted to an
+        # (int, int) tuple and used as the target position.
+        # Models emit "job" as often as the documented "job_def" (issue #27);
+        # posting an empty JobDef is a guaranteed RIMAPI error.
+        job = params.get("job_def") or params.get("job") or ""
+        if not job:
+            raise ValueError(
+                'job_assign requires a "job_def" parameter (e.g. "Sow", "Mine")'
+            )
+        target_position = params.get("target_position")
+        if target_position is None and "x" in params and "z" in params:
+            target_position = (int(params["x"]), int(params["z"]))
         await self._client.set_colonist_job(
             cid,
-            job=params.get("job_def", ""),
+            job=str(job),
             target_thing_id=params.get("target_thing_id"),
-            target_position=params.get("target_position"),
+            target_position=target_position,
         )
 
     async def _h_time_assignment(self, cid: str, params: dict[str, Any]) -> None:

diff --git a/src/rle/orchestration/game_loop.py b/src/rle/orchestration/game_loop.py
@@ -13,7 +13,7 @@
 from felix_agent_sdk.visualization import HelixVisualizer
 from pydantic import BaseModel, ConfigDict
 
-from rle.agents.actions import ActionPlan, ActionPlanParseError
+from rle.agents.actions import ActionPlan, ActionPlanParseError, resolve_endpoint
 from rle.agents.base_role import RimWorldRoleAgent
 from rle.config import RLEConfig
 from rle.orchestration.action_executor import ActionExecutor, ExecutionResult
@@ -387,14 +387,46 @@ def _export_tick_json(
             json.dumps(data, indent=2),
         )
 
-    def _update_metric_context(self, result: TickResult, state: GameState) -> None:
+    def _update_metric_context(
+        self, result: TickResult, state: GameState, tick_num: int,
+    ) -> None:
         """Append tick data to metric context for scoring history."""
-        self._metric_context.tick_results.append(result)
-        self._metric_context.state_history.append(state)
+        ctx = self._metric_context
+        ctx.tick_results.append(result)
+        ctx.state_history.append(state)
+        # run_tick calls _record_draft_response (step 7a) before this method
+        # (step 9), so a threat that first shows up this tick is not yet in
+        # threats_seen when the draft is recorded. Credit a draft executed
+        # this tick here, otherwise the fastest possible response would never
+        # get a first_draft_tick entry.
+        # NOTE(review): assumes `state` is the observation taken before this
+        # tick's actions ran, so is_drafted does not yet reflect them — confirm.
+        drafted_this_tick = getattr(self, "_last_draft_tick", None) == tick_num
+        already_drafted = drafted_this_tick or any(
+            c.is_drafted for c in state.colonists
+        )
+        seen_ids = {t.threat_id for t in ctx.threats_seen}
         for threat in state.threats:
-            seen_ids = {t.threat_id for t in self._metric_context.threats_seen}
-            if threat.threat_id not in seen_ids:
-                self._metric_context.threats_seen.append(threat)
+            # Null incident placeholders (the /incidents endpoint emits them)
+            # are not threats — counting one made threat_response unwinnable
+            # in a run with zero hostiles (issue #25).
+            if threat.enemy_count <= 0 and threat.threat_level <= 0.0:
+                continue
+            if threat.threat_id in seen_ids:
+                continue
+            ctx.threats_seen.append(threat)
+            # Keep the snapshot current so a threat_id repeated within this
+            # tick's state.threats is not appended a second time.
+            seen_ids.add(threat.threat_id)
+            ctx.threat_seen_tick[threat.threat_id] = tick_num
+            if already_drafted:
+                # Zero-tick response delay: colonists were drafted no later
+                # than the tick the threat was first observed.
+                ctx.first_draft_tick[threat.threat_id] = 0
+
+    def _record_draft_response(
+        self, exec_result: ExecutionResult, tick_num: int,
+    ) -> None:
+        """Record per-threat response delay once a draft action executes (#25).
+
+        A successful outcome whose action type resolves to the "draft"
+        endpoint and whose ``is_drafted`` parameter is not ``False`` counts as
+        a draft. Each threat already in ``threats_seen`` that has no recorded
+        delay gets ``tick_num`` minus its first-seen tick, floored at 0.
+        """
+        drafted = any(
+            o.success
+            and resolve_endpoint(o.action_type) == "draft"
+            and o.parameters.get("is_drafted", True) is not False
+            for o in exec_result.outcomes
+        )
+        if not drafted:
+            return
+        # Remembered so _update_metric_context can credit threats that are
+        # only registered later in this same tick.
+        self._last_draft_tick = tick_num
+        ctx = self._metric_context
+        for threat in ctx.threats_seen:
+            seen = ctx.threat_seen_tick.get(threat.threat_id, tick_num)
+            ctx.first_draft_tick.setdefault(
+                threat.threat_id, max(0, tick_num - seen),
+            )
 
     def _broadcast_phase_if_changed(self, current_time: float) -> None:
         """Broadcast PHASE_ANNOUNCE when macro_time crosses a phase boundary."""
@@ -573,8 +605,12 @@ async def run_tick(self) -> TickResult:
                 target=outcome.target_colonist_id,
                 success=outcome.success,
                 error=outcome.error,
+                parameters=outcome.parameters,
             )
 
+        # 7a. Track draft responses for the threat_response metric (issue #25)
+        self._record_draft_response(exec_result, tick_num)
+
         # 7b. Surface per-action errors back to agents so they can avoid
         # re-proposing the same invalid action next tick (e.g. researching
         # an already-finished project, setting priority for a disabled work type).
@@ -658,7 +694,7 @@ async def run_tick(self) -> TickResult:
         self._tick_results.append(result)
 
         # 9. Update metric context and evaluate scenario
-        self._update_metric_context(result, state)
+        self._update_metric_context(result, state, tick_num)
         if self._evaluator:
             eval_result = self._evaluator.evaluate(
                 state, self._metric_context, tick_count=len(self._tick_results),