diff --git a/.env.example b/.env.example index 88363855..11dbb001 100644 --- a/.env.example +++ b/.env.example @@ -6,4 +6,4 @@ GITHUB_TOKEN=your_github_token_here # Optional: LLM Provider API Keys (configure the ones you plan to use) OPENAI_API_KEY= -ANTHROPIC_API_KEY= \ No newline at end of file +ANTHROPIC_API_KEY= diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 00000000..92d17863 --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,23 @@ +name: Pre-commit + +on: + push: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v5 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + - name: Install dependencies + run: uv sync --extra dev + - name: Run pre-commit + run: uv run pre-commit run --all-files --show-diff-on-failure diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f4f2ff6..f4ef6c36 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: debug-statements - repo: https://github.com/crate-ci/typos - rev: v1 + rev: v1.46.0 hooks: - id: typos files: \.(py|md|rst|yaml|toml) diff --git a/codeclash/analysis/llm_as_judge/hallucination.yaml b/codeclash/analysis/llm_as_judge/hallucination.yaml index 4a5fe489..fb49ca6d 100644 --- a/codeclash/analysis/llm_as_judge/hallucination.yaml +++ b/codeclash/analysis/llm_as_judge/hallucination.yaml @@ -71,7 +71,7 @@ system_prompt: | by reasoning) - "My bot is working perfectly" (this is just a slightly overconfident statement, but not a concrete claim that can be corroborated or disproven) This violates 2 (the statement of fact is not concrete) - - Agent using an incorrect linenumber when referring to a code snippet (as long as the agent recovers later on and this doens't cause an edit to + - Agent using an incorrect linenumber when referring to a code snippet (as long as the agent recovers later on and this doesn't cause an edit to fail without being able to recover). This violates 6 (the incident is not relevant to the overall trajectory and objective of the agent) - Anything related to failed edits as long as the failure is spotted and corrected later on. diff --git a/codeclash/arenas/battlecode23/BattleCode23.Dockerfile b/codeclash/arenas/battlecode23/BattleCode23.Dockerfile index d97314c3..cc813563 100644 --- a/codeclash/arenas/battlecode23/BattleCode23.Dockerfile +++ b/codeclash/arenas/battlecode23/BattleCode23.Dockerfile @@ -11,4 +11,3 @@ RUN git clone https://github.com/CodeClash-ai/BattleCode2023.git /workspace \ WORKDIR /workspace RUN chmod +x gradlew && ./gradlew update - diff --git a/codeclash/arenas/battlecode23/battlecode23.py b/codeclash/arenas/battlecode23/battlecode23.py index 4e503a52..c7017a15 100644 --- a/codeclash/arenas/battlecode23/battlecode23.py +++ b/codeclash/arenas/battlecode23/battlecode23.py @@ -18,6 +18,7 @@ @dataclass class SimulationMeta: """Metadata for a single simulation, storing team assignments explicitly.""" + idx: int team_a: str team_b: str @@ -27,6 +28,7 @@ class SimulationMeta: @dataclass class RoundResult: """Result of execute_round, used to communicate status to get_results.""" + status: Literal["completed", "auto_win", "no_contest"] winner: str | None = None loser: str | None = None @@ -36,21 +38,21 @@ class RoundResult: class BattleCode23Arena(CodeArena): """BattleCode23 arena implementation. - + Lifecycle: 1. validate_code() - Source-level structural checks only (in agent container) 2. execute_round() - Compile and run simulations (in game container) 3. get_results() - Parse logs and determine winner - + Failure handling: - If one agent fails to compile, the other wins automatically - If both fail to compile, round is a no-contest (tie) - Individual simulation failures don't count toward either player """ - + name: str = "BattleCode23" description: str = """Battlecode 2023: Tempest is a real-time strategy game where your Java bot controls a team of robots competing to conquer sky islands. -Your mission: conquer 75% or more of all sky islands by placing reality anchors on them. The first team to succeed immediately wins. +Your mission: conquer 75% or more of all sky islands by placing reality anchors on them. The first team to succeed immediately wins. Robots include Headquarters (craft anchors and build units), Carriers (transport anchors and gather resources), Launchers (combat units), and specialized units like Boosters and Destabilizers. Islands are conquered by placing reality anchors on them, which are crafted at headquarters using resources (Adamantium, Mana, Elixir) gathered from wells.""" default_args: dict = { @@ -61,7 +63,7 @@ class BattleCode23Arena(CodeArena): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) assert len(config["players"]) == 2, "BattleCode23 is a two-player game" - + # Build base run command self.run_cmd_base: str = "./gradlew --no-daemon run" for arg, val in self.game_config.get("args", self.default_args).items(): @@ -70,13 +72,13 @@ def __init__(self, config, **kwargs): self.run_cmd_base += f" -P{arg}=true" else: self.run_cmd_base += f" -P{arg}={val}" - + # Round state (set by execute_round, used by get_results) self._round_result: RoundResult | None = None def validate_code(self, agent: Player) -> tuple[bool, str | None]: """Validate source structure. No compilation - that happens in execute_round. - + Checks: 1. src/mysubmission/ directory exists 2. RobotPlayer.java file exists @@ -87,30 +89,36 @@ def validate_code(self, agent: Player) -> tuple[bool, str | None]: ls_output = agent.environment.execute("ls src")["output"] if BC23_FOLDER not in ls_output: return False, f"There should be a `src/{BC23_FOLDER}/` directory" - + # Check for RobotPlayer.java file ls_mysubmission = agent.environment.execute(f"ls src/{BC23_FOLDER}")["output"] if "RobotPlayer.java" not in ls_mysubmission: return False, f"There should be a `src/{BC23_FOLDER}/RobotPlayer.java` file" - + # Check for run(RobotController rc) method robot_player_content = agent.environment.execute(f"cat src/{BC23_FOLDER}/RobotPlayer.java")["output"] if "public static void run(RobotController" not in robot_player_content: - return False, f"There should be a `run(RobotController rc)` method implemented in `src/{BC23_FOLDER}/RobotPlayer.java`" - + return ( + False, + f"There should be a `run(RobotController rc)` method implemented in `src/{BC23_FOLDER}/RobotPlayer.java`", + ) + # Check for correct package declaration if f"package {BC23_FOLDER};" not in robot_player_content: - return False, f"The package declaration should be `package {BC23_FOLDER};` in `src/{BC23_FOLDER}/RobotPlayer.java`" - + return ( + False, + f"The package declaration should be `package {BC23_FOLDER};` in `src/{BC23_FOLDER}/RobotPlayer.java`", + ) + return True, None def _compile_agent(self, agent: Player, idx: int) -> str | None: """Compile an agent's code in the game container. - + Args: agent: The agent to compile idx: Index for naming the output directory - + Returns: Path to compiled classes directory, or None if compilation failed """ @@ -118,21 +126,17 @@ def _compile_agent(self, agent: Player, idx: int) -> str | None: src = f"/{agent.name}/src/{BC23_FOLDER}/" dest = str(DIR_WORK / "src" / BC23_FOLDER) self.environment.execute(f"rm -rf {dest}; mkdir -p {dest}; cp -r {src}* {dest}/") - + # Compile (use clean to ensure fresh compilation, avoiding stale cache) compile_result = self.environment.execute("./gradlew clean compileJava", timeout=120) if compile_result["returncode"] != 0: - self.logger.warning( - f"Failed to compile agent {agent.name}:\n{compile_result['output'][-1000:]}" - ) + self.logger.warning(f"Failed to compile agent {agent.name}:\n{compile_result['output'][-1000:]}") return None - + # Save compiled classes outside build/ (gradle clean deletes build/) classes_dir = f"/tmp/agent{idx}_classes" - self.environment.execute( - f"rm -rf {classes_dir}; mkdir -p {classes_dir}; cp -r build/classes/* {classes_dir}/" - ) - + self.environment.execute(f"rm -rf {classes_dir}; mkdir -p {classes_dir}; cp -r build/classes/* {classes_dir}/") + self.logger.info(f"Successfully compiled {agent.name}") return classes_dir @@ -143,7 +147,7 @@ def _run_simulation( agent_classes: dict[str, str], ) -> None: """Run a single simulation. - + Args: sim_meta: Simulation metadata with team assignments agents: List of agents (for name lookup) @@ -158,7 +162,7 @@ def _run_simulation( f"-PclassLocationA={agent_classes[sim_meta.team_a]} " f"-PclassLocationB={agent_classes[sim_meta.team_b]}" ) - + try: response = self.environment.execute( cmd + f" > {self.log_env / sim_meta.log_file} 2>&1", @@ -167,15 +171,13 @@ def _run_simulation( except subprocess.TimeoutExpired: self.logger.warning(f"Simulation {sim_meta.idx} timed out") return - + if response["returncode"] != 0: - self.logger.warning( - f"Simulation {sim_meta.idx} failed with exit code {response['returncode']}" - ) + self.logger.warning(f"Simulation {sim_meta.idx} failed with exit code {response['returncode']}") def execute_round(self, agents: list[Player]): """Execute a round: compile all agents, then run simulations. - + Handles failures gracefully: - If one agent fails to compile, the other wins automatically - If both fail, round is a no-contest @@ -185,11 +187,11 @@ def execute_round(self, agents: list[Player]): for idx, agent in enumerate(agents): classes_path = self._compile_agent(agent, idx) agent_classes[agent.name] = classes_path - + # Check compilation results compiled_agents = [a for a in agents if agent_classes[a.name] is not None] failed_agents = [a for a in agents if agent_classes[a.name] is None] - + if len(compiled_agents) == 0: self.logger.error("All agents failed to compile - no contest") self._round_result = RoundResult( @@ -197,13 +199,11 @@ def execute_round(self, agents: list[Player]): reason="all agents failed to compile", ) return - + if len(compiled_agents) == 1: winner = compiled_agents[0] loser = failed_agents[0] - self.logger.info( - f"Only {winner.name} compiled successfully (opponent {loser.name} failed) - automatic win" - ) + self.logger.info(f"Only {winner.name} compiled successfully (opponent {loser.name} failed) - automatic win") self._round_result = RoundResult( status="auto_win", winner=winner.name, @@ -211,42 +211,41 @@ def execute_round(self, agents: list[Player]): reason=f"{loser.name} failed to compile", ) return - + # Phase 2: Build simulation metadata with alternating team positions num_sims = self.game_config["sims_per_round"] simulations: list[SimulationMeta] = [] - + for idx in range(num_sims): # Alternate team positions for fairness if idx % 2 == 0: team_a, team_b = agents[0].name, agents[1].name else: team_a, team_b = agents[1].name, agents[0].name - - simulations.append(SimulationMeta( - idx=idx, - team_a=team_a, - team_b=team_b, - log_file=BC23_LOG.format(idx=idx), - )) - + + simulations.append( + SimulationMeta( + idx=idx, + team_a=team_a, + team_b=team_b, + log_file=BC23_LOG.format(idx=idx), + ) + ) + # Phase 3: Run simulations in parallel self.logger.info(f"Running {num_sims} simulations with alternating team positions") - + # Filter to only compiled agents' classes valid_classes = {name: path for name, path in agent_classes.items() if path is not None} - + with ThreadPoolExecutor(5) as executor: - futures = [ - executor.submit(self._run_simulation, sim, agents, valid_classes) - for sim in simulations - ] + futures = [executor.submit(self._run_simulation, sim, agents, valid_classes) for sim in simulations] for future in tqdm(as_completed(futures), total=len(futures), desc="Simulations"): try: future.result() except Exception as e: self.logger.error(f"Simulation raised unexpected exception: {e}") - + self._round_result = RoundResult( status="completed", simulations=simulations, @@ -254,48 +253,45 @@ def execute_round(self, agents: list[Player]): def _parse_simulation_log(self, log_path, sim_meta: SimulationMeta) -> str | None: """Parse a single simulation log to determine the winner. - + Args: log_path: Path to the log file sim_meta: Simulation metadata with team assignments - + Returns: Winner agent name, RESULT_TIE, or None if parsing failed """ if not log_path.exists(): self.logger.debug(f"Simulation {sim_meta.idx}: log file missing") return None - + with open(log_path) as f: content = f.read().strip() - + lines = content.split("\n") if len(lines) < 2: self.logger.debug(f"Simulation {sim_meta.idx}: log too short (game crashed?)") return None - + # Find the winner line (contains "wins" and "[server]") winner_line = None - reason_line = None - for i, line in enumerate(lines): + for line in lines: if "wins" in line and "[server]" in line: winner_line = line - if i + 1 < len(lines): - reason_line = lines[i + 1] break - + if not winner_line: self.logger.debug(f"Simulation {sim_meta.idx}: no winner line found") return RESULT_TIE - + # Extract A or B from winner line: "mysubmission (A) wins" or "mysubmission (B) wins" match = re.search(r"\(([AB])\)\s+wins", winner_line) if not match: self.logger.debug(f"Simulation {sim_meta.idx}: could not parse winner from line") return RESULT_TIE - + winner_key = match.group(1) - + # Map A/B to agent names using stored metadata (no recalculation needed) if winner_key == "A": return sim_meta.team_a @@ -304,13 +300,13 @@ def _parse_simulation_log(self, log_path, sim_meta: SimulationMeta) -> str | Non def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): """Parse simulation results and determine the round winner.""" - + # Handle early termination cases if self._round_result is None: self.logger.error("get_results called but execute_round didn't set _round_result") stats.winner = RESULT_TIE return - + if self._round_result.status == "no_contest": self.logger.info(f"Round ended in no-contest: {self._round_result.reason}") stats.winner = RESULT_TIE @@ -322,7 +318,7 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): stats.player_stats[agent.name].valid_submit = False stats.player_stats[agent.name].invalid_reason = "Compilation failed (no contest)" return - + if self._round_result.status == "auto_win": winner = self._round_result.winner loser = self._round_result.loser @@ -334,31 +330,31 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): stats.player_stats[loser].valid_submit = False stats.player_stats[loser].invalid_reason = f"Compilation failed: {self._round_result.reason}" return - + # Normal case: parse simulation logs scores = defaultdict(int) - + tie_count = 0 for sim in self._round_result.simulations: log_path = self.log_round(round_num) / sim.log_file winner = self._parse_simulation_log(log_path, sim) - + if winner is None: pass # Simulation failed, don't count elif winner == RESULT_TIE: tie_count += 1 else: scores[winner] += 1 - + if tie_count > 0: self.logger.info(f"{tie_count} simulation(s) ended in tie") - + # Determine overall winner if scores: # Find max score, check for ties max_score = max(scores.values()) leaders = [name for name, score in scores.items() if score == max_score] - + if len(leaders) == 1: stats.winner = leaders[0] else: @@ -367,7 +363,7 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): # All simulations failed self.logger.warning("All simulations failed to produce results") stats.winner = RESULT_TIE - + for player, score in scores.items(): stats.scores[player] = score if player != RESULT_TIE: diff --git a/codeclash/arenas/battlecode24/BattleCode24.Dockerfile b/codeclash/arenas/battlecode24/BattleCode24.Dockerfile index 6cd92cd6..0aae22ed 100644 --- a/codeclash/arenas/battlecode24/BattleCode24.Dockerfile +++ b/codeclash/arenas/battlecode24/BattleCode24.Dockerfile @@ -11,4 +11,3 @@ RUN git clone https://github.com/CodeClash-ai/BattleCode2024.git /workspace \ WORKDIR /workspace RUN chmod +x gradlew && ./gradlew update - diff --git a/codeclash/arenas/battlecode24/battlecode24.py b/codeclash/arenas/battlecode24/battlecode24.py index 50827ba0..be8bff69 100644 --- a/codeclash/arenas/battlecode24/battlecode24.py +++ b/codeclash/arenas/battlecode24/battlecode24.py @@ -19,36 +19,38 @@ @dataclass class SimulationMeta: """Metadata for a single simulation, storing team assignments explicitly.""" + idx: int - team_a: str - team_b: str + team_a: str + team_b: str log_file: str @dataclass class RoundResult: """Result of execute_round, used to communicate status to get_results.""" + status: Literal["completed", "auto_win", "no_contest"] winner: str | None = None - loser: str | None = None + loser: str | None = None reason: str = "" simulations: list[SimulationMeta] = field(default_factory=list) class BattleCode24Arena(CodeArena): """BattleCode24 arena implementation. - + Lifecycle: 1. validate_code() - Source-level structural checks only (in agent container) 2. execute_round() - Compile and run simulations (in game container) 3. get_results() - Parse logs and determine winner - + Failure handling: - If one agent fails to compile, the other wins automatically - If both fail to compile, round is a no-contest (tie) - Individual simulation failures don't count toward either player """ - + name: str = "BattleCode24" description: str = """Battlecode 2024: Breadwars is a real-time strategy game where your Java bot controls a team of robots competing to capture the opponent's flags. Your mission: capture all 3 of the opponent's flags before they capture yours. Robots can attack, heal, build traps, dig/fill terrain, and specialize in different skills through experience. @@ -61,7 +63,7 @@ class BattleCode24Arena(CodeArena): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) assert len(config["players"]) == 2, "BattleCode24 is a two-player game" - + # Build base run command self.run_cmd_base: str = "./gradlew --no-daemon run" for arg, val in self.game_config.get("args", self.default_args).items(): @@ -70,14 +72,13 @@ def __init__(self, config, **kwargs): self.run_cmd_base += f" -P{arg}=true" else: self.run_cmd_base += f" -P{arg}={val}" - + # Round state (set by execute_round, used by get_results) self._round_result: RoundResult | None = None - def validate_code(self, agent: Player) -> tuple[bool, str | None]: """Validate source structure. No compilation - that happens in execute_round. - + Checks: 1. src/mysubmission/ directory exists 2. RobotPlayer.java file exists @@ -88,31 +89,36 @@ def validate_code(self, agent: Player) -> tuple[bool, str | None]: ls_output = agent.environment.execute("ls src")["output"] if BC24_FOLDER not in ls_output: return False, f"There should be a `src/{BC24_FOLDER}/` directory" - + # Check for RobotPlayer.java file ls_mysubmission = agent.environment.execute(f"ls src/{BC24_FOLDER}")["output"] if "RobotPlayer.java" not in ls_mysubmission: return False, f"There should be a `src/{BC24_FOLDER}/RobotPlayer.java` file" - + # Check for run(RobotController rc) method robot_player_content = agent.environment.execute(f"cat src/{BC24_FOLDER}/RobotPlayer.java")["output"] if "public static void run(RobotController" not in robot_player_content: - return False, f"There should be a `run(RobotController rc)` method implemented in `src/{BC24_FOLDER}/RobotPlayer.java`" - + return ( + False, + f"There should be a `run(RobotController rc)` method implemented in `src/{BC24_FOLDER}/RobotPlayer.java`", + ) + # Check for correct package declaration if f"package {BC24_FOLDER};" not in robot_player_content: - return False, f"The package declaration should be `package {BC24_FOLDER};` in `src/{BC24_FOLDER}/RobotPlayer.java`" - + return ( + False, + f"The package declaration should be `package {BC24_FOLDER};` in `src/{BC24_FOLDER}/RobotPlayer.java`", + ) + return True, None - def _compile_agent(self, agent: Player, idx: int) -> str | None: """Compile an agent's code in the game container. - + Args: agent: The agent to compile idx: Index for naming the output directory - + Returns: Path to compiled classes directory, or None if compilation failed """ @@ -120,25 +126,20 @@ def _compile_agent(self, agent: Player, idx: int) -> str | None: src = f"/{agent.name}/src/{BC24_FOLDER}/" dest = str(DIR_WORK / "src" / BC24_FOLDER) self.environment.execute(f"rm -rf {dest}; mkdir -p {dest}; cp -r {src}* {dest}/") - + # Compile (use clean to ensure fresh compilation, avoiding stale cache) compile_result = self.environment.execute("./gradlew clean compileJava", timeout=120) if compile_result["returncode"] != 0: - self.logger.warning( - f"Failed to compile agent {agent.name}:\n{compile_result['output'][-1000:]}" - ) + self.logger.warning(f"Failed to compile agent {agent.name}:\n{compile_result['output'][-1000:]}") return None - + # Save compiled classes outside build/ (gradle clean deletes build/) classes_dir = f"/tmp/agent{idx}_classes" - self.environment.execute( - f"rm -rf {classes_dir}; mkdir -p {classes_dir}; cp -r build/classes/* {classes_dir}/" - ) - + self.environment.execute(f"rm -rf {classes_dir}; mkdir -p {classes_dir}; cp -r build/classes/* {classes_dir}/") + self.logger.info(f"Successfully compiled {agent.name}") return classes_dir - def _run_simulation( self, sim_meta: SimulationMeta, @@ -146,7 +147,7 @@ def _run_simulation( agent_classes: dict[str, str], ) -> None: """Run a single simulation. - + Args: sim_meta: Simulation metadata with team assignments agents: List of agents (for name lookup) @@ -161,7 +162,7 @@ def _run_simulation( f"-PclassLocationA={agent_classes[sim_meta.team_a]} " f"-PclassLocationB={agent_classes[sim_meta.team_b]}" ) - + try: response = self.environment.execute( cmd + f" > {self.log_env / sim_meta.log_file} 2>&1", @@ -170,15 +171,13 @@ def _run_simulation( except subprocess.TimeoutExpired: self.logger.warning(f"Simulation {sim_meta.idx} timed out") return - + if response["returncode"] != 0: - self.logger.warning( - f"Simulation {sim_meta.idx} failed with exit code {response['returncode']}" - ) + self.logger.warning(f"Simulation {sim_meta.idx} failed with exit code {response['returncode']}") def execute_round(self, agents: list[Player]): """Execute a round: compile all agents, then run simulations. - + Handles failures gracefully: - If one agent fails to compile, the other wins automatically - If both fail, round is a no-contest @@ -188,11 +187,11 @@ def execute_round(self, agents: list[Player]): for idx, agent in enumerate(agents): classes_path = self._compile_agent(agent, idx) agent_classes[agent.name] = classes_path - + # Check compilation results compiled_agents = [a for a in agents if agent_classes[a.name] is not None] failed_agents = [a for a in agents if agent_classes[a.name] is None] - + if len(compiled_agents) == 0: self.logger.error("All agents failed to compile - no contest") self._round_result = RoundResult( @@ -200,13 +199,11 @@ def execute_round(self, agents: list[Player]): reason="all agents failed to compile", ) return - + if len(compiled_agents) == 1: winner = compiled_agents[0] loser = failed_agents[0] - self.logger.info( - f"Only {winner.name} compiled successfully (opponent {loser.name} failed) - automatic win" - ) + self.logger.info(f"Only {winner.name} compiled successfully (opponent {loser.name} failed) - automatic win") self._round_result = RoundResult( status="auto_win", winner=winner.name, @@ -214,69 +211,68 @@ def execute_round(self, agents: list[Player]): reason=f"{loser.name} failed to compile", ) return - + # Phase 2: Build simulation metadata with alternating team positions num_sims = self.game_config["sims_per_round"] simulations: list[SimulationMeta] = [] - + for idx in range(num_sims): # Alternate team positions for fairness if idx % 2 == 0: team_a, team_b = agents[0].name, agents[1].name else: team_a, team_b = agents[1].name, agents[0].name - - simulations.append(SimulationMeta( - idx=idx, - team_a=team_a, - team_b=team_b, - log_file=BC24_LOG.format(idx=idx), - )) - + + simulations.append( + SimulationMeta( + idx=idx, + team_a=team_a, + team_b=team_b, + log_file=BC24_LOG.format(idx=idx), + ) + ) + # Phase 3: Run simulations in parallel self.logger.info(f"Running {num_sims} simulations with alternating team positions") - + # Filter to only compiled agents' classes valid_classes = {name: path for name, path in agent_classes.items() if path is not None} - + with ThreadPoolExecutor(5) as executor: - futures = [ - executor.submit(self._run_simulation, sim, agents, valid_classes) - for sim in simulations - ] + futures = [executor.submit(self._run_simulation, sim, agents, valid_classes) for sim in simulations] for future in tqdm(as_completed(futures), total=len(futures), desc="Simulations"): try: future.result() except Exception as e: self.logger.error(f"Simulation raised unexpected exception: {e}") - + self._round_result = RoundResult( status="completed", simulations=simulations, ) - + def _parse_simulation_log(self, log_path, sim_meta: SimulationMeta) -> str | None: """Parse a single simulation log to determine the winner. - + Args: log_path: Path to the log file sim_meta: Simulation metadata with team assignments - + Returns: Winner agent name, RESULT_TIE, or None if parsing failed """ if not log_path.exists(): self.logger.debug(f"Simulation {sim_meta.idx}: log file missing") return None - + with open(log_path) as f: content = f.read().strip() - + lines = content.split("\n") if len(lines) < 2: self.logger.debug(f"Simulation {sim_meta.idx}: log too short (game crashed?)") return None - + # Find the winner line (contains "wins" and "[server]") winner_line = None reason_line = None @@ -286,23 +282,23 @@ def _parse_simulation_log(self, log_path, sim_meta: SimulationMeta) -> str | Non if i + 1 < len(lines): reason_line = lines[i + 1] break - + if not winner_line: self.logger.debug(f"Simulation {sim_meta.idx}: no winner line found") return RESULT_TIE - + # Extract A or B from winner line: "mysubmission (A) wins" or "mysubmission (B) wins" match = re.search(r"\(([AB])\)\s+wins", winner_line) if not match: self.logger.debug(f"Simulation {sim_meta.idx}: could not parse winner from line") return RESULT_TIE - + winner_key = match.group(1) - + # Check for coin flip tie if reason_line and BC24_TIE in reason_line: return RESULT_TIE - + # Map A/B to agent names using stored metadata (no recalculation needed) if winner_key == "A": return sim_meta.team_a @@ -311,13 +307,13 @@ def _parse_simulation_log(self, log_path, sim_meta: SimulationMeta) -> str | Non def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): """Parse simulation results and determine the round winner.""" - + # Handle early termination cases if self._round_result is None: self.logger.error("get_results called but execute_round didn't set _round_result") stats.winner = RESULT_TIE return - + if self._round_result.status == "no_contest": self.logger.info(f"Round ended in no-contest: {self._round_result.reason}") stats.winner = RESULT_TIE @@ -329,7 +325,7 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): stats.player_stats[agent.name].valid_submit = False stats.player_stats[agent.name].invalid_reason = "Compilation failed (no contest)" return - + if self._round_result.status == "auto_win": winner = self._round_result.winner loser = self._round_result.loser @@ -341,31 +337,31 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): stats.player_stats[loser].valid_submit = False stats.player_stats[loser].invalid_reason = f"Compilation failed: {self._round_result.reason}" return - + # Normal case: parse simulation logs scores = defaultdict(int) - + tie_count = 0 for sim in self._round_result.simulations: log_path = self.log_round(round_num) / sim.log_file winner = self._parse_simulation_log(log_path, sim) - + if winner is None: - pass + pass elif winner == RESULT_TIE: - tie_count += 1 + tie_count += 1 else: scores[winner] += 1 - + if tie_count > 0: self.logger.info(f"{tie_count} simulation(s) ended in tie") - + # Determine overall winner if scores: # Find max score, check for ties max_score = max(scores.values()) leaders = [name for name, score in scores.items() if score == max_score] - + if len(leaders) == 1: stats.winner = leaders[0] else: @@ -374,7 +370,7 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): # All simulations failed self.logger.warning("All simulations failed to produce results") stats.winner = RESULT_TIE - + for player, score in scores.items(): stats.scores[player] = score if player != RESULT_TIE: diff --git a/codeclash/arenas/chess/Chess.Dockerfile b/codeclash/arenas/chess/Chess.Dockerfile index fb52e1aa..093f1250 100644 --- a/codeclash/arenas/chess/Chess.Dockerfile +++ b/codeclash/arenas/chess/Chess.Dockerfile @@ -25,4 +25,3 @@ RUN git clone https://github.com/Disservin/fastchess.git /tmp/fastchess \ && rm -rf /tmp/fastchess WORKDIR /workspace - diff --git a/codeclash/arenas/chess/chess.py b/codeclash/arenas/chess/chess.py index 92a6e87c..bef076c8 100644 --- a/codeclash/arenas/chess/chess.py +++ b/codeclash/arenas/chess/chess.py @@ -4,14 +4,13 @@ import subprocess from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path from tqdm.auto import tqdm from codeclash.agents.player import Player from codeclash.arenas.arena import CodeArena, RoundStats from codeclash.constants import RESULT_TIE -from codeclash.utils.environment import assert_zero_exit_code, create_file_in_container +from codeclash.utils.environment import create_file_in_container class ChessArena(CodeArena): @@ -23,21 +22,23 @@ class ChessArena(CodeArena): IMPORTANT: Do not modify the executable name in the Makefile (keep `EXE = kojiro`). The executable must be named `kojiro`.""" submission: str = "src/" default_args: dict = { - "time_control": "1+0.01", + "time_control": "1+0.01", } def __init__(self, config, **kwargs): super().__init__(config, **kwargs) - + # Get time control from config - time_control = self.game_config.get("args", self.default_args).get("time_control", self.default_args["time_control"]) - + time_control = self.game_config.get("args", self.default_args).get( + "time_control", self.default_args["time_control"] + ) + # Build base Fastchess command self.run_cmd_base = f"fastchess -each tc={time_control}" - + # Store time control for reference self.time_control = time_control - + self.logger.debug(f"Initialized ChessArena with time control: {time_control}") def validate_code(self, agent: Player) -> tuple[bool, str | None]: @@ -49,7 +50,7 @@ def validate_code(self, agent: Player) -> tuple[bool, str | None]: if "src" not in ls_result["output"]: return False, "There should be a `src/` directory in the workspace" - # Compile the engine + # Compile the engine self.logger.debug(f"Compiling Kojiro for agent {agent.name}") compile_result = agent.environment.execute( "cd src && make native", @@ -74,22 +75,22 @@ def validate_code(self, agent: Player) -> tuple[bool, str | None]: def _compile_engines_in_game_container(self, agents: list[Player]) -> dict[str, str]: """ Recompile each agent's engine in the game container and return engine paths. - + Returns: dict mapping agent name to engine executable path (only successfully compiled agents) """ engine_paths = {} failed_agents = [] - + for agent in agents: src_dir = f"/{agent.name}/src" self.logger.debug(f"Compiling Kojiro for {agent.name} in game container") - + compile_result = self.environment.execute( f"cd {src_dir} && make native", timeout=120, # 2 minute timeout for compilation ) - + if compile_result["returncode"] != 0: error_output = compile_result.get("output", "Unknown compilation error") if len(error_output) > 1000: @@ -97,7 +98,7 @@ def _compile_engines_in_game_container(self, agents: list[Player]) -> dict[str, self.logger.warning(f"Failed to compile {agent.name} in game container, skipping:\n{error_output}") failed_agents.append(agent.name) continue - + # Verify executable exists (executable name is fixed as 'kojiro' per Makefile and prompt constraints) engine_path = f"{src_dir}/kojiro" check_result = self.environment.execute(f"test -f {engine_path} && echo 'exists'") @@ -107,39 +108,39 @@ def _compile_engines_in_game_container(self, agents: list[Player]) -> dict[str, ) failed_agents.append(agent.name) continue - + engine_paths[agent.name] = engine_path self.logger.debug(f"Successfully compiled {agent.name}, engine at {engine_path}") - + if failed_agents: self.logger.warning(f"Failed to compile {len(failed_agents)} agent(s): {failed_agents}") - + return engine_paths - + def _build_match_pairings(self, agents: list[Player]) -> list[tuple[Player, Player]]: """ Build match pairings for sims_per_round simulations. - + Strategy: Round-robin style - pair agents and repeat as needed. For each simulation, randomly select two different agents. - + Returns: List of (agent1, agent2) tuples """ sims = self.game_config["sims_per_round"] pairings = [] - + # Generate pairings: for each simulation, pick two random agents for _ in range(sims): agent1, agent2 = random.sample(agents, 2) pairings.append((agent1, agent2)) - + return pairings - + def _run_single_match(self, agent1: Player, agent2: Player, engine1_path: str, engine2_path: str, idx: int): """ Run a single Fastchess match between two engines. - + Args: agent1: First agent agent2: Second agent @@ -149,10 +150,10 @@ def _run_single_match(self, agent1: Player, agent2: Player, engine1_path: str, e """ output_file = self.log_env / f"match_{idx}.pgn" - + # Ensure log directory exists self.environment.execute(f"mkdir -p {self.log_env}") - + cmd = ( f"{self.run_cmd_base} " f"-engine cmd={engine1_path} name={agent1.name} " @@ -160,15 +161,15 @@ def _run_single_match(self, agent1: Player, agent2: Player, engine1_path: str, e f"-rounds 1 " f"-pgnout file={str(output_file)}" ) - + self.logger.debug(f"Running match {idx}: {agent1.name} vs {agent2.name}") self.logger.debug(f"Fastchess command: {cmd}") self.logger.debug(f"Output file path: {output_file}") - + try: response = self.environment.execute(cmd, timeout=300) # 5 minute timeout per match if response["returncode"] != 0: - error_output = response.get('output', '')[:1000] + error_output = response.get("output", "")[:1000] self.logger.warning( f"Match {idx} ({agent1.name} vs {agent2.name}) failed with exit code {response['returncode']}:\n{error_output}" ) @@ -184,26 +185,28 @@ def _run_single_match(self, agent1: Player, agent2: Player, engine1_path: str, e self.logger.debug(f"Match {idx} PGN file verified at {output_file}") except subprocess.TimeoutExpired: self.logger.warning(f"Match {idx} ({agent1.name} vs {agent2.name}) timed out after 5 minutes") - + def execute_round(self, agents: list[Player]): """ Execute competition phase - run Fastchess matches between agents. """ assert len(agents) >= 2, "Chess requires at least two players" - + # Recompile engines in game container self.logger.info("Recompiling engines in game container...") engine_paths = self._compile_engines_in_game_container(agents) - + if len(engine_paths) < 2: - self.logger.warning(f"Only {len(engine_paths)} agent(s) compiled successfully, need at least 2. Skipping round.") + self.logger.warning( + f"Only {len(engine_paths)} agent(s) compiled successfully, need at least 2. Skipping round." + ) return - + # Build match pairings using only successfully compiled agents compiled_agents = [agent for agent in agents if agent.name in engine_paths] self.logger.info(f"Building match pairings for {self.game_config['sims_per_round']} simulations...") pairings = self._build_match_pairings(compiled_agents) - + # Store pairings to file for retrieval in get_results() pairings_file = self.log_env / "pairings.json" pairings_data = [ @@ -218,7 +221,7 @@ def execute_round(self, agents: list[Player]): dest_path=str(pairings_file), ) self.logger.debug(f"Stored pairings to {pairings_file}") - + # Run matches in parallel self.logger.info(f"Running {len(pairings)} matches in parallel...") with ThreadPoolExecutor(max_workers=min(20, len(pairings))) as executor: @@ -233,59 +236,59 @@ def execute_round(self, agents: list[Player]): ) for idx, (agent1, agent2) in enumerate(pairings) ] - + # Collect results with progress bar for future in tqdm(as_completed(futures), total=len(futures), desc="Chess matches"): try: future.result() except Exception as e: self.logger.error(f"Match execution failed: {e}", exc_info=True) - + self.logger.info("All matches completed") def _parse_all_games_in_pgn(self, pgn_content: str) -> list[tuple[str | None, str, str]]: """ Parse all games from a PGN file. - + Args: pgn_content: Content of the PGN file (may contain multiple games) - + Returns: List of (result, white_agent, black_agent) tuples result is agent name if that agent won, RESULT_TIE for draws, or None if incomplete """ games = [] - + # Split PGN into individual games (games are separated by blank lines) # Look for [Event ...] tags which mark the start of each game game_blocks = re.split(r'(?=\[Event\s+")', pgn_content) - + for game_block in game_blocks: game_block = game_block.strip() if not game_block: continue - + # Skip if this block doesn't look like a game (no [White] or [Black] tags) - if '[White' not in game_block or '[Black' not in game_block: + if "[White" not in game_block or "[Black" not in game_block: continue - + # Extract White and Black agent names white_match = re.search(r'\[White\s+"([^"]+)"\]', game_block) black_match = re.search(r'\[Black\s+"([^"]+)"\]', game_block) result_match = re.search(r'\[Result\s+"([^"]+)"\]', game_block) - + if not white_match or not black_match: continue # Skip incomplete game headers - + white_agent = white_match.group(1) black_agent = black_match.group(1) - + if not result_match: games.append((None, white_agent, black_agent)) continue - + result = result_match.group(1) - + # Parse result: "1-0" = White wins, "0-1" = Black wins, "1/2-1/2" = draw, "*" = incomplete if result == "1-0": games.append((white_agent, white_agent, black_agent)) @@ -298,39 +301,41 @@ def _parse_all_games_in_pgn(self, pgn_content: str) -> list[tuple[str | None, st else: self.logger.warning(f"Unknown result format: {result}") games.append((None, white_agent, black_agent)) - + return games - - def _aggregate_match_result(self, game_results: list[tuple[str | None, str, str]], agent1_name: str, agent2_name: str) -> str | None: + + def _aggregate_match_result( + self, game_results: list[tuple[str | None, str, str]], agent1_name: str, agent2_name: str + ) -> str | None: """ Aggregate results from multiple games into a single match result. - + Args: game_results: List of (result, white_agent, black_agent) tuples from _parse_all_games_in_pgn agent1_name: Name of first agent (for reference) agent2_name: Name of second agent (for reference) - + Returns: Match winner (agent name), RESULT_TIE for draw, or None if match incomplete """ if not game_results: return None - + if len(game_results) == 1: - self.logger.warning(f"Match has only 1 game, expected 2. Using single game result.") + self.logger.warning("Match has only 1 game, expected 2. Using single game result.") return game_results[0][0] - + if len(game_results) > 2: self.logger.warning(f"Match has {len(game_results)} games, expected 2. Using first 2 games.") game_results = game_results[:2] - + # Count wins for each agent agent1_wins = 0 agent2_wins = 0 draws = 0 incomplete = 0 - - for result, white_agent, black_agent in game_results: + + for result, _white_agent, _black_agent in game_results: if result is None: incomplete += 1 elif result == RESULT_TIE: @@ -341,19 +346,21 @@ def _aggregate_match_result(self, game_results: list[tuple[str | None, str, str] agent2_wins += 1 else: # Result is for an agent not in this match (shouldn't happen, but handle gracefully) - self.logger.warning(f"Unexpected result agent '{result}' in match between {agent1_name} and {agent2_name}") - + self.logger.warning( + f"Unexpected result agent '{result}' in match between {agent1_name} and {agent2_name}" + ) + # If both games incomplete, match is incomplete if incomplete == 2: return None - + # If one game incomplete, use the other game's result if incomplete == 1: for result, _, _ in game_results: if result is not None: return result return None - + # Determine match winner based on wins if agent1_wins > agent2_wins: return agent1_name @@ -362,72 +369,68 @@ def _aggregate_match_result(self, game_results: list[tuple[str | None, str, str] else: # Equal wins (could be 1-1, 0-0 with draws, etc.) = match draw return RESULT_TIE - + def _load_pairings(self, round_num: int) -> dict[int, tuple[str, str]]: """ Load match pairings from stored JSON file. - + Returns: Dict mapping match_idx to (agent1_name, agent2_name) tuple """ pairings_file = self.log_round(round_num) / "pairings.json" - + try: with open(pairings_file) as f: pairings_data = json.load(f) - - pairings = { - item["match_idx"]: (item["agent1"], item["agent2"]) - for item in pairings_data - } - return pairings + + return {item["match_idx"]: (item["agent1"], item["agent2"]) for item in pairings_data} except FileNotFoundError: self.logger.error(f"Pairings file not found: {pairings_file}") return {} except json.JSONDecodeError as e: self.logger.error(f"Failed to parse pairings file: {e}") return {} - + def _read_all_match_results(self, round_num: int, agents: list[Player]) -> list[tuple[str | None, str, str]]: """ Read all match result files and parse them. - + Returns: List of (winner, agent1_name, agent2_name) tuples winner is None if match failed or incomplete """ match_results = [] - + # Load pairings from stored file pairings = self._load_pairings(round_num) if not pairings: self.logger.warning("No pairings found, cannot parse match results") return [] - + # Build set of valid agent names for validation valid_agent_names = {agent.name for agent in agents} - + sims = self.game_config["sims_per_round"] - + for idx in range(sims): # Get agent names from stored pairings if idx not in pairings: self.logger.warning(f"Match {idx} pairing not found in pairings file, skipping") continue - + agent1_name, agent2_name = pairings[idx] - + # Validate agent names exist in agents list if agent1_name not in valid_agent_names or agent2_name not in valid_agent_names: self.logger.warning( f"Match {idx}: Invalid agent names ({agent1_name}, {agent2_name}) not in agents list, skipping" ) continue - + pgn_file = self.log_round(round_num) / f"match_{idx}.pgn" - + self.logger.debug(f"Looking for PGN file at: {pgn_file}") - + try: if not pgn_file.exists(): self.logger.warning(f"PGN file does not exist: {pgn_file}") @@ -438,26 +441,26 @@ def _read_all_match_results(self, round_num: int, agents: list[Player]) -> list[ else: self.logger.warning(f"Round directory does not exist: {self.log_round(round_num)}") continue - + with open(pgn_file) as f: pgn_content = f.read() - + # Parse all games from PGN file game_results = self._parse_all_games_in_pgn(pgn_content) - + # Aggregate game results into match result winner = self._aggregate_match_result(game_results, agent1_name, agent2_name) match_results.append((winner, agent1_name, agent2_name)) - + except FileNotFoundError: self.logger.warning(f"Match {idx} result file not found, skipping") continue except Exception as e: self.logger.warning(f"Error parsing match {idx} result: {e}") continue - + return match_results - + def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): """ Parse Fastchess results and determine winners. @@ -470,14 +473,14 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): self.logger.debug(f"get_results: Files in round directory: {[f.name for f in files]}") else: self.logger.warning(f"get_results: Round directory does not exist: {round_dir}") - + # Read and parse all match results match_results = self._read_all_match_results(round_num, agents) - + # Count wins per agent scores = defaultdict(int) valid_matches = 0 - for winner, agent1_name, agent2_name in match_results: + for winner, _agent1_name, _agent2_name in match_results: if winner is None: # Incomplete or failed match - skip it continue @@ -489,7 +492,7 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): # Winner exists - give 1 point scores[winner] += 1 valid_matches += 1 - + # Determine overall winner if valid_matches == 0: self.logger.warning("No valid match results found (all matches failed or incomplete)") @@ -504,19 +507,18 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): # Find agent(s) with maximum score max_score = max(scores.values()) winners = [name for name, score in scores.items() if score == max_score] - + if len(winners) > 1: stats.winner = RESULT_TIE else: stats.winner = winners[0] - + # Update stats object stats.scores = dict(scores) - + # Ensure all agents have scores (even if 0) for agent in agents: stats.scores[agent.name] = scores.get(agent.name, 0) stats.player_stats[agent.name].score = scores.get(agent.name, 0) - - self.logger.info(f"Round {round_num} results: winner={stats.winner}, scores={stats.scores}") + self.logger.info(f"Round {round_num} results: winner={stats.winner}, scores={stats.scores}") diff --git a/codeclash/arenas/scml/__init__.py b/codeclash/arenas/scml/__init__.py index 8b137891..e69de29b 100644 --- a/codeclash/arenas/scml/__init__.py +++ b/codeclash/arenas/scml/__init__.py @@ -1 +0,0 @@ - diff --git a/configs/test/battlecode23.yaml b/configs/test/battlecode23.yaml index a1a8f3f5..05c6a426 100644 --- a/configs/test/battlecode23.yaml +++ b/configs/test/battlecode23.yaml @@ -21,4 +21,3 @@ prompts: Your task: improve the bot in `src/mysubmission`, located in {{working_dir}}. {{working_dir}} is your codebase, which contains both your bot and supporting assets. - diff --git a/pyproject.toml b/pyproject.toml index 8d9324e8..23a02cdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -205,3 +205,4 @@ line-ending = "auto" [tool.typos.default.extend-words] # Don't correct the surname "Teh" ba = "ba" +mis = "mis" diff --git a/tests/arenas/test_battlecode23.py b/tests/arenas/test_battlecode23.py index dcdc58c1..e9323735 100644 --- a/tests/arenas/test_battlecode23.py +++ b/tests/arenas/test_battlecode23.py @@ -548,4 +548,3 @@ def test_round_result_no_contest(self): assert result.loser is None assert "all agents failed" in result.reason assert len(result.simulations) == 0 - diff --git a/tests/arenas/test_chess.py b/tests/arenas/test_chess.py index da30a10e..29fd91f1 100644 --- a/tests/arenas/test_chess.py +++ b/tests/arenas/test_chess.py @@ -5,6 +5,7 @@ """ import json + import pytest from codeclash.arenas.arena import RoundStats @@ -15,232 +16,231 @@ class TestChessValidation: - """Tests for ChessArena.validate_code()""" - - @pytest.fixture - def arena(self, tmp_log_dir, minimal_config): - """Create ChessArena instance with mocked environment.""" - arena = ChessArena.__new__(ChessArena) - arena.submission = "src/" - arena.log_local = tmp_log_dir - # Minimal attributes used in validate_code - arena.logger = type("Logger", (), {"debug": lambda self, msg: None, "info": lambda self, msg: None})() - return arena - - def test_valid_submission(self, arena, mock_player_factory): - """Valid C++ engine compiles and produces `src/kojiro` executable.""" - player = mock_player_factory( - name="test_player", - files={ - # Not strictly used by validate_code, but helpful if commands fall back to defaults - "src/kojiro": "", - }, - command_outputs={ - "ls": {"output": "src\n", "returncode": 0}, - "cd src && make native": {"output": "Compile OK", "returncode": 0}, - "ls src/kojiro": {"output": "kojiro\n", "returncode": 0}, - }, - ) - - is_valid, error = arena.validate_code(player) - assert is_valid is True - assert error is None - - def test_missing_src_directory(self, arena, mock_player_factory): - """Missing `src/` directory fails validation.""" - player = mock_player_factory( - name="test_player", - files={}, - command_outputs={ - "ls": {"output": "README.md\n", "returncode": 0}, - }, - ) - - is_valid, error = arena.validate_code(player) - assert is_valid is False - assert "src/" in error - - def test_compilation_failure(self, arena, mock_player_factory): - """Compilation errors are surfaced and fail validation.""" - player = mock_player_factory( - name="test_player", - files={}, - command_outputs={ - "ls": {"output": "src\n", "returncode": 0}, - "cd src && make native": {"output": "error: failed to compile", "returncode": 1}, - }, - ) - - is_valid, error = arena.validate_code(player) - assert is_valid is False - assert "Compilation failed" in error - - def test_missing_executable_after_compilation(self, arena, mock_player_factory): - """Compilation succeeds but missing `kojiro` executable fails validation.""" - player = mock_player_factory( - name="test_player", - files={}, - command_outputs={ - "ls": {"output": "src\n", "returncode": 0}, - "cd src && make native": {"output": "Compile OK", "returncode": 0}, - "ls src/kojiro": {"output": "", "returncode": 1}, - }, - ) - - is_valid, error = arena.validate_code(player) - assert is_valid is False - assert "executable 'kojiro' not found" in error + """Tests for ChessArena.validate_code()""" + + @pytest.fixture + def arena(self, tmp_log_dir, minimal_config): + """Create ChessArena instance with mocked environment.""" + arena = ChessArena.__new__(ChessArena) + arena.submission = "src/" + arena.log_local = tmp_log_dir + # Minimal attributes used in validate_code + arena.logger = type("Logger", (), {"debug": lambda self, msg: None, "info": lambda self, msg: None})() + return arena + + def test_valid_submission(self, arena, mock_player_factory): + """Valid C++ engine compiles and produces `src/kojiro` executable.""" + player = mock_player_factory( + name="test_player", + files={ + # Not strictly used by validate_code, but helpful if commands fall back to defaults + "src/kojiro": "", + }, + command_outputs={ + "ls": {"output": "src\n", "returncode": 0}, + "cd src && make native": {"output": "Compile OK", "returncode": 0}, + "ls src/kojiro": {"output": "kojiro\n", "returncode": 0}, + }, + ) + + is_valid, error = arena.validate_code(player) + assert is_valid is True + assert error is None + + def test_missing_src_directory(self, arena, mock_player_factory): + """Missing `src/` directory fails validation.""" + player = mock_player_factory( + name="test_player", + files={}, + command_outputs={ + "ls": {"output": "README.md\n", "returncode": 0}, + }, + ) + + is_valid, error = arena.validate_code(player) + assert is_valid is False + assert "src/" in error + + def test_compilation_failure(self, arena, mock_player_factory): + """Compilation errors are surfaced and fail validation.""" + player = mock_player_factory( + name="test_player", + files={}, + command_outputs={ + "ls": {"output": "src\n", "returncode": 0}, + "cd src && make native": {"output": "error: failed to compile", "returncode": 1}, + }, + ) + + is_valid, error = arena.validate_code(player) + assert is_valid is False + assert "Compilation failed" in error + + def test_missing_executable_after_compilation(self, arena, mock_player_factory): + """Compilation succeeds but missing `kojiro` executable fails validation.""" + player = mock_player_factory( + name="test_player", + files={}, + command_outputs={ + "ls": {"output": "src\n", "returncode": 0}, + "cd src && make native": {"output": "Compile OK", "returncode": 0}, + "ls src/kojiro": {"output": "", "returncode": 1}, + }, + ) + + is_valid, error = arena.validate_code(player) + assert is_valid is False + assert "executable 'kojiro' not found" in error class TestChessResults: - """Tests for ChessArena.get_results()""" - - @pytest.fixture - def arena(self, tmp_log_dir, minimal_config): - """Create ChessArena-like instance with local logging directory.""" - config = minimal_config.copy() - config["game"]["name"] = "Chess" - config["game"]["sims_per_round"] = 2 - - arena = ChessArena.__new__(ChessArena) - arena.submission = "src/" - arena.log_local = tmp_log_dir - arena.config = config - # Lightweight logger stub - arena.logger = type( - "Logger", - (), - { - "debug": lambda self, msg: None, - "info": lambda self, msg: None, - "warning": lambda self, msg: None, - "error": lambda self, msg, **kwargs: None, - }, - )() - return arena - - def _write_pairings(self, round_dir, pairings): - pairings_file = round_dir / "pairings.json" - pairings_file.write_text(json.dumps(pairings, indent=2)) - - def _write_pgn(self, file_path, white: str, black: str, result: str): - content = ( - """ + """Tests for ChessArena.get_results()""" + + @pytest.fixture + def arena(self, tmp_log_dir, minimal_config): + """Create ChessArena-like instance with local logging directory.""" + config = minimal_config.copy() + config["game"]["name"] = "Chess" + config["game"]["sims_per_round"] = 2 + + arena = ChessArena.__new__(ChessArena) + arena.submission = "src/" + arena.log_local = tmp_log_dir + arena.config = config + # Lightweight logger stub + arena.logger = type( + "Logger", + (), + { + "debug": lambda self, msg: None, + "info": lambda self, msg: None, + "warning": lambda self, msg: None, + "error": lambda self, msg, **kwargs: None, + }, + )() + return arena + + def _write_pairings(self, round_dir, pairings): + pairings_file = round_dir / "pairings.json" + pairings_file.write_text(json.dumps(pairings, indent=2)) + + def _write_pgn(self, file_path, white: str, black: str, result: str): + content = ( + """ [Event "FastChess Match"] [Site "-"] [Date "2026.01.07"] [Round "1"] """.strip() - + f"\n[White \"{white}\"]\n[Black \"{black}\"]\n[Result \"{result}\"]\n\n" - ) - file_path.write_text(content) + + f'\n[White "{white}"]\n[Black "{black}"]\n[Result "{result}"]\n\n' + ) + file_path.write_text(content) - def test_player1_wins(self, arena, tmp_log_dir): - """Alice wins one match; overall winner is Alice.""" - round_dir = tmp_log_dir / "rounds" / "1" - round_dir.mkdir(parents=True) + def test_player1_wins(self, arena, tmp_log_dir): + """Alice wins one match; overall winner is Alice.""" + round_dir = tmp_log_dir / "rounds" / "1" + round_dir.mkdir(parents=True) - # sims_per_round = 2 but only first match is valid; second missing -> ignored - pairings = [ - {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, - {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, - ] - self._write_pairings(round_dir, pairings) + # sims_per_round = 2 but only first match is valid; second missing -> ignored + pairings = [ + {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, + {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, + ] + self._write_pairings(round_dir, pairings) - # Match 0: Alice (White) wins - self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="1-0") - # Match 1: no file -> ignored + # Match 0: Alice (White) wins + self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="1-0") + # Match 1: no file -> ignored - agents = [MockPlayer("Alice"), MockPlayer("Bob")] - stats = RoundStats(round_num=1, agents=agents) + agents = [MockPlayer("Alice"), MockPlayer("Bob")] + stats = RoundStats(round_num=1, agents=agents) - arena.get_results(agents, round_num=1, stats=stats) + arena.get_results(agents, round_num=1, stats=stats) - assert stats.winner == "Alice" - assert stats.scores["Alice"] == 1 - assert stats.scores["Bob"] == 0 + assert stats.winner == "Alice" + assert stats.scores["Alice"] == 1 + assert stats.scores["Bob"] == 0 - def test_player2_wins(self, arena, tmp_log_dir): - """Bob wins one match; overall winner is Bob.""" - round_dir = tmp_log_dir / "rounds" / "1" - round_dir.mkdir(parents=True) + def test_player2_wins(self, arena, tmp_log_dir): + """Bob wins one match; overall winner is Bob.""" + round_dir = tmp_log_dir / "rounds" / "1" + round_dir.mkdir(parents=True) - pairings = [ - {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, - {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, - ] - self._write_pairings(round_dir, pairings) + pairings = [ + {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, + {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, + ] + self._write_pairings(round_dir, pairings) - # Match 0: Bob (Black) wins - self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="0-1") + # Match 0: Bob (Black) wins + self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="0-1") - agents = [MockPlayer("Alice"), MockPlayer("Bob")] - stats = RoundStats(round_num=1, agents=agents) + agents = [MockPlayer("Alice"), MockPlayer("Bob")] + stats = RoundStats(round_num=1, agents=agents) - arena.get_results(agents, round_num=1, stats=stats) + arena.get_results(agents, round_num=1, stats=stats) - assert stats.winner == "Bob" - assert stats.scores["Alice"] == 0 - assert stats.scores["Bob"] == 1 + assert stats.winner == "Bob" + assert stats.scores["Alice"] == 0 + assert stats.scores["Bob"] == 1 - def test_all_draws(self, arena, tmp_log_dir): - """All matches draw -> overall tie with zero scores.""" - round_dir = tmp_log_dir / "rounds" / "1" - round_dir.mkdir(parents=True) + def test_all_draws(self, arena, tmp_log_dir): + """All matches draw -> overall tie with zero scores.""" + round_dir = tmp_log_dir / "rounds" / "1" + round_dir.mkdir(parents=True) - pairings = [ - {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, - {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, - ] - self._write_pairings(round_dir, pairings) + pairings = [ + {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, + {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, + ] + self._write_pairings(round_dir, pairings) - # Two draws - self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="1/2-1/2") - self._write_pgn(round_dir / "match_1.pgn", white="Bob", black="Alice", result="1/2-1/2") + # Two draws + self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="1/2-1/2") + self._write_pgn(round_dir / "match_1.pgn", white="Bob", black="Alice", result="1/2-1/2") - agents = [MockPlayer("Alice"), MockPlayer("Bob")] - stats = RoundStats(round_num=1, agents=agents) + agents = [MockPlayer("Alice"), MockPlayer("Bob")] + stats = RoundStats(round_num=1, agents=agents) - arena.get_results(agents, round_num=1, stats=stats) + arena.get_results(agents, round_num=1, stats=stats) - assert stats.winner == RESULT_TIE - assert stats.scores["Alice"] == 0 - assert stats.scores["Bob"] == 0 + assert stats.winner == RESULT_TIE + assert stats.scores["Alice"] == 0 + assert stats.scores["Bob"] == 0 - def test_split_wins_results_in_tie(self, arena, tmp_log_dir): - """Each player wins one match -> tie overall.""" - round_dir = tmp_log_dir / "rounds" / "1" - round_dir.mkdir(parents=True) + def test_split_wins_results_in_tie(self, arena, tmp_log_dir): + """Each player wins one match -> tie overall.""" + round_dir = tmp_log_dir / "rounds" / "1" + round_dir.mkdir(parents=True) - pairings = [ - {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, - {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, - ] - self._write_pairings(round_dir, pairings) + pairings = [ + {"match_idx": 0, "agent1": "Alice", "agent2": "Bob"}, + {"match_idx": 1, "agent1": "Alice", "agent2": "Bob"}, + ] + self._write_pairings(round_dir, pairings) - # Alice wins match 0, Bob wins match 1 - self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="1-0") - self._write_pgn(round_dir / "match_1.pgn", white="Alice", black="Bob", result="0-1") + # Alice wins match 0, Bob wins match 1 + self._write_pgn(round_dir / "match_0.pgn", white="Alice", black="Bob", result="1-0") + self._write_pgn(round_dir / "match_1.pgn", white="Alice", black="Bob", result="0-1") - agents = [MockPlayer("Alice"), MockPlayer("Bob")] - stats = RoundStats(round_num=1, agents=agents) + agents = [MockPlayer("Alice"), MockPlayer("Bob")] + stats = RoundStats(round_num=1, agents=agents) - arena.get_results(agents, round_num=1, stats=stats) + arena.get_results(agents, round_num=1, stats=stats) - assert stats.winner == RESULT_TIE - assert stats.scores["Alice"] == 1 - assert stats.scores["Bob"] == 1 + assert stats.winner == RESULT_TIE + assert stats.scores["Alice"] == 1 + assert stats.scores["Bob"] == 1 class TestChessConfig: - """Tests for ChessArena configuration and properties.""" - - def test_arena_name(self): - assert ChessArena.name == "Chess" + """Tests for ChessArena configuration and properties.""" - def test_submission_folder(self): - assert ChessArena.submission == "src/" + def test_arena_name(self): + assert ChessArena.name == "Chess" - def test_default_args_contains_time_control(self): - assert "time_control" in ChessArena.default_args + def test_submission_folder(self): + assert ChessArena.submission == "src/" + def test_default_args_contains_time_control(self): + assert "time_control" in ChessArena.default_args diff --git a/tests/arenas/test_cyborg.py b/tests/arenas/test_cyborg.py index ddacc4bf..7464b4cd 100644 --- a/tests/arenas/test_cyborg.py +++ b/tests/arenas/test_cyborg.py @@ -121,7 +121,7 @@ def test_execute_round_uses_nested_game_args(self): "steps_per_episode": 11, "num_drones": 13, "timeout": 17, - } + }, } } arena.log_env = Path("/logs")