Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions results/benchmark_history.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
{"timestamp": "2026-06-10T00:59:13.107962+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8265, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 4, "total_completion_tokens": 1903, "total_tokens": 1907, "estimated_cost_usd": 0.09519, "wall_time_s": 258.38, "num_calls": 2, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 82, "errors_by_type": {"parse_failure": 12}, "avg_deliberation_ms": 46786.6, "action_success_rate": 1.0, "total_tokens": 1907, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8265, "outcome": "timeout", "ticks": 2}]}
{"timestamp": "2026-06-10T01:09:11.973836+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8677, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 26, "total_completion_tokens": 21596, "total_tokens": 21622, "estimated_cost_usd": 1.08006, "wall_time_s": 319.28, "num_calls": 13, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 128, "errors_by_type": {"deliberation_timeout": 1, "provider_error": 1}, "avg_deliberation_ms": 27602.18, "action_success_rate": 1.0, "total_tokens": 21622, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8677, "outcome": "timeout", "ticks": 2}]}
{"timestamp": "2026-06-10T01:14:38.139936+00:00", "scoring_version": "1.0", "git_commit": "295f478", "git_branch": "fix/strict-action-schema", "git_dirty": true, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 2, "outcome": "timeout", "final_score": 0.8598, "ticks_run": 2, "cost_snapshot": {"total_prompt_tokens": 28, "total_completion_tokens": 20578, "total_tokens": 20606, "estimated_cost_usd": 1.02918, "wall_time_s": 163.26, "num_calls": 14, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 99, "errors_by_type": {}, "avg_deliberation_ms": 25524.75, "action_success_rate": 0.7273, "total_tokens": 20606, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.8598, "outcome": "timeout", "ticks": 2}]}
{"timestamp": "2026-06-10T01:32:30.462265+00:00", "scoring_version": "1.0", "git_commit": "5d361d0", "git_branch": "master", "git_dirty": false, "rle_version": "0.1.0", "felix_sdk_version": "0.2.2", "platform": "win32", "python_version": "3.14.0", "docker_mode": false, "random_seed": 42, "rimapi_dll_path": "C:\\Steam\\steamapps\\workshop\\content\\294100\\3593423732\\1.6\\Assemblies\\RIMAPI.dll", "rimapi_dll_sha256": "8b26c3820e37bb21ae7227094a7e84fce52ef1099c913899c39a6c78c8f0f4e1", "rimapi_fork_commit": "", "scenario": "Crashlanded Survival", "scenario_save_name": "rle_crashlanded_v1", "model": "claude-fable-5", "provider": "claude-code", "base_url": null, "no_think": false, "parallel": true, "no_agent": false, "no_pause": false, "tick_interval": 30.0, "max_ticks": 10, "outcome": "timeout", "final_score": 0.7536, "ticks_run": 10, "cost_snapshot": {"total_prompt_tokens": 140, "total_completion_tokens": 102045, "total_tokens": 102185, "estimated_cost_usd": 5.10365, "wall_time_s": 897.04, "num_calls": 70, "prompt_price_per_token": 1e-05, "completion_price_per_token": 5e-05, "pricing_source": "override"}, "event_summary": {"total_events": 633, "errors_by_type": {}, "avg_deliberation_ms": 26282.04, "action_success_rate": 0.7167, "total_tokens": 102185, "estimated_cost_usd": 0.0}, "run_type": "scenario", "scenarios": [{"name": "Crashlanded Survival", "difficulty": "easy", "score": 0.7536, "outcome": "timeout", "ticks": 10}]}
140 changes: 140 additions & 0 deletions scripts/analysis/fable5_launch_charts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""One-off chart generation for the Fable 5 launch thread.

Reads results/fable5-live-N1 artifacts and writes three PNGs next to them:
the score trajectory annotated with the threat_response artifact, the
wandering-shelter blueprint map (issue #26), and the DefenseCommander
quote card.
"""

from __future__ import annotations

import csv
from pathlib import Path

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Directory the score CSV is read from and the generated PNGs are written to.
RUN_DIR = Path("results/fable5-live-N1")
# Factor applied to the threat_response shortfall when backing it out of the
# composite score (crashlanded scenario override).
THREAT_WEIGHT = 0.08 # crashlanded scenario override

# Blueprint rectangle origins per tick (from events.jsonl, issue #26).
# Each entry is an (x, z) map coordinate; list position is used as the label.
SHELTER_SITES = [
(133, 139), (133, 145), (140, 134), (147, 135), (152, 139),
(153, 127), (156, 119), (144, 128), (145, 135), (145, 138),
]
# Water region as (x1, z1, x2, z2) corner coordinates.
WATER = (93, 105, 132, 185) # approx, per DefenseCommander terrain reads


def load_scores() -> tuple[list[int], list[float], list[float]]:
    """Load the per-tick score columns for the crashlanded run.

    Reads ``01_crashlanded_survival.csv`` from ``RUN_DIR`` and returns three
    parallel lists: the 0-based row index (used as the tick number), the
    ``composite`` column and the ``threat_response`` column, both as floats.

    Raises:
        OSError: if the CSV cannot be opened.
        KeyError: if a required column is missing from a row.
        ValueError: if a score cell is not a valid float.
    """
    ticks: list[int] = []
    composite: list[float] = []
    threat: list[float] = []
    csv_path = RUN_DIR / "01_crashlanded_survival.csv"
    # newline="" hands newline handling to the csv module, as its
    # documentation requires for file objects given to a reader; without it
    # quoted fields containing line breaks can be mis-parsed.
    with open(csv_path, encoding="utf-8", newline="") as f:
        for i, row in enumerate(csv.DictReader(f)):
            ticks.append(i)
            composite.append(float(row["composite"]))
            threat.append(float(row["threat_response"]))
    return ticks, composite, threat


def trajectory_chart() -> None:
    """Plot the per-tick composite score and save ``fable5_trajectory.png``.

    Draws three series: the composite as scored, the composite with the
    ``threat_response`` shortfall added back (scaled by ``THREAT_WEIGHT``),
    and a straight-line decay reference starting at the first composite
    value. The first tick whose ``threat_response`` is exactly 0.0 is
    annotated; when no such tick exists the annotation is skipped.

    Raises:
        ValueError: if the score CSV contains no rows.
    """
    ticks, composite, threat = load_scores()
    if not composite:
        # Fail with a clear message instead of an IndexError on composite[0].
        raise ValueError("no score rows to plot")

    # Composite with the threat_response zeroing backed out
    ex_artifact = [c + (1.0 - t) * THREAT_WEIGHT for c, t in zip(composite, threat)]
    decay = [composite[0] - 0.013 * i for i in ticks]

    fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
    try:
        ax.plot(ticks, composite, "o-", lw=2.5, color="#d4452c",
                label="Fable 5 composite (as scored)")
        ax.plot(ticks, ex_artifact, "o--", lw=2, color="#2c7fb8",
                label="Ex-artifact (threat_response bug backed out)")
        ax.plot(ticks, decay, ":", lw=1.5, color="#888888",
                label="Natural decay reference (−0.013/tick)")

        # A default keeps next() from raising StopIteration when
        # threat_response never drops to zero in the loaded run.
        drop_tick = next((i for i, t in enumerate(threat) if t == 0.0), None)
        if drop_tick is not None:
            ax.annotate(
                "threat_response → 0.0\n(no raid ever existed —\n"
                "agent refused to draft, correctly.\nHarness bug, RLE issue #25)",
                xy=(drop_tick, composite[drop_tick]),
                xytext=(drop_tick - 4.4, composite[drop_tick] - 0.018),
                fontsize=9, color="#d4452c",
                arrowprops={"arrowstyle": "->", "color": "#d4452c"},
            )
        ax.set_xlabel("Tick")
        ax.set_ylabel("Composite score")
        ax.set_title("Claude Fable 5 × RimWorld Crashlanded — 10 ticks, 7 agents (RLE, seed 42)")
        ax.legend(loc="lower left", fontsize=9)
        ax.grid(alpha=0.25)
        fig.tight_layout()
        fig.savefig(RUN_DIR / "fable5_trajectory.png")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)


def shelter_chart() -> None:
    """Draw the blueprint sites on the map and save the wandering-shelter PNG.

    Shades the ``WATER`` region, outlines a 7x7 square at every entry of
    ``SHELTER_SITES`` labelled with its list index, and joins the square
    centres with a dashed line in list order. The result is written to
    ``RUN_DIR / "fable5_wandering_shelter.png"``.
    """
    side = 7
    half = side / 2

    fig, ax = plt.subplots(figsize=(7.5, 7.5), dpi=160)
    try:
        wx1, wz1, wx2, wz2 = WATER
        ax.add_patch(Rectangle((wx1, wz1), wx2 - wx1, wz2 - wz1,
                               facecolor="#a6cee3", edgecolor="none", alpha=0.6))
        ax.text((wx1 + wx2) / 2, (wz1 + wz2) / 2, "water", ha="center",
                color="#1f6090", fontsize=10, style="italic")

        # Compute each square's centre once; it anchors both the index label
        # and the dashed path between consecutive sites.
        centers = [(x + half, z + half) for x, z in SHELTER_SITES]
        for i, ((x, z), center) in enumerate(zip(SHELTER_SITES, centers)):
            ax.add_patch(Rectangle((x, z), side, side, facecolor="none",
                                   edgecolor="#d4452c", lw=1.4, alpha=0.85))
            ax.annotate(str(i), center, ha="center", va="center",
                        fontsize=9, color="#d4452c", weight="bold")
        ax.plot(
            [cx for cx, _ in centers], [cz for _, cz in centers],
            "--", color="#555555", lw=1, alpha=0.7,
        )

        ax.set_xlim(85, 175)
        ax.set_ylim(100, 195)
        ax.set_aspect("equal")
        ax.set_xlabel("map x")
        ax.set_ylabel("map z")
        ax.set_title(
            "The wandering shelter — one blueprint per tick, ticks 0–9\n"
            "10 sites, 10 successful placements, 0 shelters built (RLE issue #26)",
            fontsize=11,
        )
        ax.grid(alpha=0.2)
        fig.tight_layout()
        fig.savefig(RUN_DIR / "fable5_wandering_shelter.png")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)


def quote_card() -> None:
    """Transcript card for the DefenseCommander tick-9 quote (verbatim).

    Renders fixed monospace text lines onto a dark figure and saves it as
    ``RUN_DIR / "fable5_quote_card.png"``. Each line is placed at a figure
    x-fraction of 0.06 and top-aligned at its own y-fraction.
    """
    bg, dim, body, hot = "#0d1117", "#8b949e", "#c9d1d9", "#f0883e"

    # (y, text, colour, font size, weight) for every line, top to bottom.
    lines = [
        (0.93, "RLE · RimWorld Learning Environment — events.jsonl, verbatim", dim, 10, "normal"),
        (0.86, "agent: defense_commander (claude-fable-5) tick: 9 confidence: 0.92", dim, 10, "normal"),
        (0.74, '"Threat assessment: the single \'threat\' entry has', body, 12, "normal"),
        (0.685, 'enemy_count=0 and threat_level=0.0 — there is no hostile', body, 12, "normal"),
        (0.63, "presence on the map. The 'ThreatBig' alert in recent events", body, 12, "normal"),
        (0.575, 'is a MENTAL BREAK warning for Bob (mood 0.22), not a raid."', body, 12, "normal"),
        (0.44, '"Drafting Bob in his current state would be', hot, 14, "bold"),
        (0.375, ' the single most dangerous \'defensive\'', hot, 14, "bold"),
        (0.31, ' action available."', hot, 14, "bold"),
        (0.14, "Our scoring zeroed threat_response for this. The agent was right.", dim, 10, "normal"),
        (0.08, "github.com/AppSprout-dev/RLE", dim, 10, "normal"),
    ]

    fig = plt.figure(figsize=(7.5, 4.22), dpi=160)
    try:
        fig.patch.set_facecolor(bg)
        for y, text, color, size, weight in lines:
            fig.text(0.06, y, text, color=color, fontsize=size, family="monospace",
                     weight=weight, va="top")
        # facecolor is passed again so the saved PNG uses the dark background.
        fig.savefig(RUN_DIR / "fable5_quote_card.png", facecolor=bg)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)


if __name__ == "__main__":
    # Render all three charts first, then report each output path in the
    # same order the charts were generated.
    for render in (trajectory_chart, shelter_chart, quote_card):
        render()
    for png_name in (
        "fable5_trajectory.png",
        "fable5_wandering_shelter.png",
        "fable5_quote_card.png",
    ):
        print("wrote", RUN_DIR / png_name)
47 changes: 44 additions & 3 deletions src/rle/orchestration/action_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class ActionOutcome(BaseModel):
target_colonist_id: str | None = None
success: bool
error: str | None = None
parameters: dict[str, Any] = {}


class ExecutionResult(BaseModel):
Expand All @@ -64,6 +65,31 @@ def _extract_rimapi_error(detail: str) -> str:
return detail


def _normalize_work_priorities(params: dict[str, Any]) -> dict[str, int]:
    """Accept the parameter shapes models actually emit for work_priority.

    Documented shape is flat ``{"<WorkType>": <1-4>}``, but frontier models
    also emit ``{"work_type": "Research", "priority": 2}`` and
    ``{"work_priorities": {"Growing": 1, ...}}`` (issue #27). Passing those
    through verbatim posted garbage like work="work_type" to RIMAPI.

    Shapes are tried in this order: nested ``work_priorities`` dict, then
    ``work_type`` (``priority`` defaults to 1 when absent), then the flat
    shape, where only non-bool ``int`` values are kept and everything else
    is ignored.

    Returns:
        A non-empty ``{work type name: priority}`` mapping.

    Raises:
        ValueError: if no priority can be extracted from ``params``, or if a
            priority in the nested / ``work_type`` shapes is not convertible
            to an integer.
    """

    def _as_priority(work: object, raw: object) -> int:
        # Turn int()'s generic conversion errors into a message that names
        # the offending work type, since the text is reported back as the
        # action's error.
        try:
            return int(raw)  # type: ignore[call-overload]
        except (TypeError, ValueError, OverflowError):
            raise ValueError(
                f"work_priority for {str(work)!r} must be an integer (1-4), "
                f"got {raw!r}"
            ) from None

    nested = params.get("work_priorities")
    if isinstance(nested, dict):
        normalized = {
            str(work): _as_priority(work, pri) for work, pri in nested.items()
        }
    elif "work_type" in params:
        work = params["work_type"]
        normalized = {str(work): _as_priority(work, params.get("priority", 1))}
    else:
        normalized = {
            str(work): int(pri) for work, pri in params.items()
            if isinstance(pri, int) and not isinstance(pri, bool)
        }
    # Applies to every shape: an empty nested mapping used to slip through
    # and produce a no-op request that was still reported as a success.
    if not normalized:
        raise ValueError(
            'work_priority requires {"<WorkType>": <1-4>} parameters '
            "(e.g. {\"Growing\": 1})"
        )
    return normalized


class ActionExecutor:
"""Dispatches agent actions to RIMAPI write endpoints.

Expand Down Expand Up @@ -93,6 +119,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult:
endpoint=endpoint,
target_colonist_id=action.target_colonist_id,
success=True,
parameters=action.parameters,
))
except RimAPIResponseError as exc:
logger.warning("Action %s failed: %s", endpoint, exc.detail)
Expand All @@ -103,6 +130,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult:
target_colonist_id=action.target_colonist_id,
success=False,
error=_extract_rimapi_error(exc.detail),
parameters=action.parameters,
))
except Exception as exc:
logger.warning("Action %s failed", endpoint, exc_info=True)
Expand All @@ -113,6 +141,7 @@ async def execute(self, plan: ActionPlan) -> ExecutionResult:
target_colonist_id=action.target_colonist_id,
success=False,
error=str(exc) or type(exc).__name__,
parameters=action.parameters,
))
return ExecutionResult(
executed=executed,
Expand Down Expand Up @@ -149,7 +178,9 @@ async def _dispatch(self, action: Action, endpoint: str) -> None:
# -- Specialized handlers (parameter mapping for complex DTOs) -----------

async def _h_work_priority(self, cid: str, params: dict[str, Any]) -> None:
    """Set work priorities for colonist ``cid``.

    ``params`` is first converted by ``_normalize_work_priorities`` so only
    a ``{work type: priority}`` mapping reaches the client; the raw model
    parameters are never sent. A ``ValueError`` raised by the normalizer
    propagates to the caller.
    """
    priorities = _normalize_work_priorities(params)
    await self._client.set_work_priorities(cid, priorities)

async def _h_draft(self, cid: str, params: dict[str, Any]) -> None:
is_drafted = params.get("is_drafted", True)
Expand All @@ -159,11 +190,21 @@ async def _h_move(self, cid: str, params: dict[str, Any]) -> None:
await self._client.move_colonist(cid, params.get("x", 0), params.get("z", 0))

async def _h_job_assign(self, cid: str, params: dict[str, Any]) -> None:
    """Assign a job to colonist ``cid`` through the client.

    The job name is read from ``job_def`` or, failing that, ``job``. The
    target position is ``target_position`` when given, otherwise it is
    built from integer-converted ``x`` and ``z`` when both are present.

    Raises:
        ValueError: if neither ``job_def`` nor ``job`` holds a non-empty value.
    """
    # Models emit "job" as often as the documented "job_def" (issue #27);
    # posting an empty JobDef is a guaranteed RIMAPI error.
    job = params.get("job_def") or params.get("job")
    if not job:
        raise ValueError(
            'job_assign requires a "job_def" parameter (e.g. "Sow", "Mine")'
        )
    target_position = params.get("target_position")
    if target_position is None and "x" in params and "z" in params:
        target_position = (int(params["x"]), int(params["z"]))
    await self._client.set_colonist_job(
        cid,
        job=str(job),
        target_thing_id=params.get("target_thing_id"),
        target_position=target_position,
    )

async def _h_time_assignment(self, cid: str, params: dict[str, Any]) -> None:
Expand Down
52 changes: 44 additions & 8 deletions src/rle/orchestration/game_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from felix_agent_sdk.visualization import HelixVisualizer
from pydantic import BaseModel, ConfigDict

from rle.agents.actions import ActionPlan, ActionPlanParseError
from rle.agents.actions import ActionPlan, ActionPlanParseError, resolve_endpoint
from rle.agents.base_role import RimWorldRoleAgent
from rle.config import RLEConfig
from rle.orchestration.action_executor import ActionExecutor, ExecutionResult
Expand Down Expand Up @@ -387,14 +387,46 @@ def _export_tick_json(
json.dumps(data, indent=2),
)

def _update_metric_context(
    self, result: TickResult, state: GameState, tick_num: int,
) -> None:
    """Append tick data to metric context for scoring history.

    Records ``result`` and ``state``, then registers each threat in
    ``state.threats`` whose id has not been seen before, stamping it with
    ``tick_num``. When any colonist in ``state`` is already drafted, the
    new threat's response delay is recorded as 0.
    """
    ctx = self._metric_context
    ctx.tick_results.append(result)
    ctx.state_history.append(state)
    already_drafted = any(c.is_drafted for c in state.colonists)
    seen_ids = {t.threat_id for t in ctx.threats_seen}
    for threat in state.threats:
        # Null incident placeholders (the /incidents endpoint emits them)
        # are not threats — counting one made threat_response unwinnable
        # in a run with zero hostiles (issue #25).
        if threat.enemy_count <= 0 and threat.threat_level <= 0.0:
            continue
        if threat.threat_id in seen_ids:
            continue
        # Keep the lookup set current so an id repeated within this same
        # state is registered only once.
        seen_ids.add(threat.threat_id)
        ctx.threats_seen.append(threat)
        ctx.threat_seen_tick[threat.threat_id] = tick_num
        if already_drafted:
            ctx.first_draft_tick[threat.threat_id] = 0

def _record_draft_response(
    self, exec_result: ExecutionResult, tick_num: int,
) -> None:
    """Record per-threat response delay once a draft action executes (#25).

    Does nothing unless at least one outcome in ``exec_result`` succeeded,
    resolves to the ``"draft"`` endpoint and did not pass
    ``is_drafted=False``. Otherwise every known threat without a recorded
    delay gets ``tick_num`` minus the tick it was first seen, floored at 0.
    """
    for outcome in exec_result.outcomes:
        if not outcome.success:
            continue
        if resolve_endpoint(outcome.action_type) != "draft":
            continue
        if outcome.parameters.get("is_drafted", True) is False:
            # An explicit undraft does not count as a response.
            continue
        break
    else:
        # No successful draft this tick, so there is nothing to record.
        return

    # NOTE(review): run_tick calls this (step 7a) before
    # _update_metric_context (step 9), so threats first reported in the
    # current tick are presumably not in threats_seen yet — confirm.
    metrics = self._metric_context
    for threat in metrics.threats_seen:
        first_seen = metrics.threat_seen_tick.get(threat.threat_id, tick_num)
        delay = max(0, tick_num - first_seen)
        # setdefault keeps the earliest recorded delay for each threat.
        metrics.first_draft_tick.setdefault(threat.threat_id, delay)

def _broadcast_phase_if_changed(self, current_time: float) -> None:
"""Broadcast PHASE_ANNOUNCE when macro_time crosses a phase boundary."""
Expand Down Expand Up @@ -573,8 +605,12 @@ async def run_tick(self) -> TickResult:
target=outcome.target_colonist_id,
success=outcome.success,
error=outcome.error,
parameters=outcome.parameters,
)

# 7a. Track draft responses for the threat_response metric (issue #25)
self._record_draft_response(exec_result, tick_num)

# 7b. Surface per-action errors back to agents so they can avoid
# re-proposing the same invalid action next tick (e.g. researching
# an already-finished project, setting priority for a disabled work type).
Expand Down Expand Up @@ -658,7 +694,7 @@ async def run_tick(self) -> TickResult:
self._tick_results.append(result)

# 9. Update metric context and evaluate scenario
self._update_metric_context(result, state)
self._update_metric_context(result, state, tick_num)
if self._evaluator:
eval_result = self._evaluator.evaluate(
state, self._metric_context, tick_count=len(self._tick_results),
Expand Down
Loading
Loading