From 439af9dfe91628a6f2b47e46d01ab39b4c37c464 Mon Sep 17 00:00:00 2001 From: Joseph Mattiello Date: Fri, 1 May 2026 19:51:42 -0400 Subject: [PATCH 1/3] perf: profile data + decision doc -- redirect to GPU/DSP RISC dynarec Captured baseline FPS + sample-based hot-function breakdown across 6 ROMs (yarc, jagniccc, Iron Soldier 1/2, Doom, Skyhammer) on Apple Silicon. Result contradicts the assumptions in spikes #123 (OP) and #124 (blitter): yarc.j64 -> GPUExec ~74% of frame time Iron Soldier -> DSPExec ~67% of frame time Skyhammer fast -> GPU+DSP ~57% of frame time, blitter <2% Skyhammer acc -> blitter ~21%, but only on opt-in accurate OP -> 1% on Iron Soldier, doesn't crack top-15 elsewhere 68K -> 0.7-2.6% everywhere; JIT not worth the licensing The right next perf target is GPU/DSP RISC dynarec or cached IR (half of issue #122). Both share the same Tom RISC ISA, so a single dispatcher attacks both. docs/op-perf-profile.md captures the methodology, the ROM-by-ROM numbers, and the recommendation. Drive-by: quote $(BENCH_ROM) in the Makefile benchmark recipe so ROM paths with spaces / parens (every commercial ROM filename) work. Co-Authored-By: Claude Opus 4.7 --- Makefile | 2 +- docs/op-perf-profile.md | 121 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 docs/op-perf-profile.md diff --git a/Makefile b/Makefile index 086e3db6..064b5e16 100644 --- a/Makefile +++ b/Makefile @@ -878,7 +878,7 @@ benchmark: $(TARGET) $(CC) -O2 -Wall -std=c99 $(INCFLAGS) \ -o test/tools/test_benchmark test/tools/test_benchmark.c \ $(if $(filter Linux,$(shell uname -s)),-ldl) - ./test/tools/test_benchmark ./$(TARGET) $(BENCH_ROM) $(BENCH_FRAMES) \ + ./test/tools/test_benchmark ./$(TARGET) "$(BENCH_ROM)" $(BENCH_FRAMES) \ --warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER) print-%: diff --git a/docs/op-perf-profile.md b/docs/op-perf-profile.md new file mode 100644 index 00000000..d771eb40 --- /dev/null +++ b/docs/op-perf-profile.md @@ -0,0 +1,121 @@ +# Performance baseline + profile: OP vs blitter vs GPU/DSP + +Captured 2026-05-01 on Apple Silicon (M-series Mac), arm64 build, default flags + `RELEASE_DEBUG_INFO=1`. See `make benchmark` and `docs/profiling.md` for the methodology. + +## TL;DR + +**OP is not the bottleneck. Blitter is rarely the bottleneck either. GPU and DSP RISC interpretation dominate frame time across the board.** + +This redirects the next-up perf work from issue #123 (OP) to a different target — see "Recommendation" at the bottom. + +## Baseline FPS + +`make benchmark` 600 frames after 60 warmup, headless, no video presentation, no audio output. Same Mac, same thermal state. + +| ROM | Fast blitter | Accurate blitter | Slowdown | +|---|---:|---:|---:| +| `yarc.j64` (raycasting demo) | 280 FPS | 229 FPS | 1.22× | +| `jagniccc.j64` (NICCC compo demo) | 355 FPS | 258 FPS | 1.37× | +| `Iron Soldier (1994).jag` | 312 FPS | 258 FPS | 1.21× | +| `Iron Soldier 2 (World).j64` | 313 FPS | 266 FPS | 1.18× | +| `Doom - Evil Unleashed (1994).jag` | 306 FPS | 261 FPS | 1.17× | +| `Skyhammer_(1999).jag` | 339 FPS | 215 FPS | **1.58×** | + +All ROMs run well above 60 FPS on M-series. The "where does it hurt" question is meaningful for slower hosts (Pi, mobile) — same ratios, lower absolute numbers. + +## Profile breakdown (`/usr/bin/sample`, 8s @ 6000-frame run) + +Captured to `/tmp/op-baseline/sample-*-*.txt`. Aggregated top-of-stack by symbol: + +### `yarc.j64` (demo, fast blitter) + +| Function | Samples | % of frame | +|---|---:|---:| +| `GPUExec` (per-instruction dispatch in `src/tom/gpu.c`) | ~5018 | **~74%** | +| `blitter_generic` | ~649 | 10% | +| `HalflineCallback` | 79 | 1.2% | +| (`OPProcessFixedBitmap` etc. didn't make top-15) | – | <1% | + +**Demo content with heavy GPU programs. This pattern would generalize to anything that uses the GPU's RISC for software rendering.** + +### `Iron Soldier (1994).jag` (commercial, fast blitter) + +| Function | Samples | % of frame | +|---|---:|---:| +| `DSPExec` (per-instruction dispatch in `src/jerry/dsp.c`) | 3456 | **~51%** | +| `dsp_opcode_jr` | 818 | 12% | +| `dsp_opcode_*` (load_r15_indexed, jump, load) | ~250 | 4% | +| **DSP TOTAL** | **~4524** | **~67%** | +| `HalflineCallback` / `JaguarReadLong` / `JaguarExecuteNew` | ~250 | 4% | +| `OPProcessFixedBitmap` | 68 | **1%** | +| `m68k_execute` | 49 | 0.7% | +| `blitter_generic` | 48 | 0.7% | + +**OP at 1%. Blitter at 0.7%. 68K at 0.7%. Two-thirds of frame time in DSP interpretation.** + +### `Skyhammer_(1999).jag` (commercial, fast blitter) + +| Function | Samples | % of frame | +|---|---:|---:| +| `DSPExec` | 2260 | **~33%** | +| `GPUExec` | ~1600 | **~24%** | +| `m68k_execute` | 177 | 2.6% | +| `blitter_generic` | 112 | 1.6% | +| `HalflineCallback` | 106 | 1.6% | + +**GPU + DSP = 57% of frame time. Blitter is irrelevant here on the fast path.** + +### `Skyhammer_(1999).jag` (commercial, accurate blitter) + +| Function | Samples | % of frame | +|---|---:|---:| +| `DSPExec` | 1525 | **~22%** | +| `BlitterMidsummer2` + `DATA` + `ADDARRAY` | ~1430 | **~21%** | +| `GPUExec` | ~720 | **~11%** | +| `m68k_execute` | 97 | 1.4% | + +**The one case where the blitter genuinely matters — and only because the user opted into accurate mode. Even here, DSP is comparable.** + +## What this changes + +The original `[spike] OP performance audit` (#123) and `[spike] Blitter performance audit` (#124) both assumed those subsystems dominate. They don't. + +| Target | Hypothesis going in | Reality | ROI | +|---|---|---|---| +| **OP** (#123) | "dominates on heavy-OP scenes (Wolf3D, T2K, Iron Soldier)" | 1% on Iron Soldier; doesn't make top-15 elsewhere | Low — even a 10× OP speedup buys ~1% of frame time | +| **Blitter** (#124) | "fast vs accurate matters for some games" | Fast is <2% everywhere except where accurate is opted-in (then ~21% on Skyhammer) | Medium — only for users running accurate blitter on specific titles | +| **GPU/DSP RISC** (#122 sub-component) | "JIT might help on mobile/Pi" | **24-74% of frame time, every ROM** | **High** — single dynarec helps both because they share an ISA | +| **68K** (#122 sub-component) | "wrap UAE JIT or Cyclone68k" | 0.7-2.6% — barely visible in any profile | Low | + +## Recommendation + +**Redirect to #122 (JIT / dynarec / cached IR), specifically the GPU/DSP RISC half.** Both Tom GPU and Jerry DSP share the same ISA (~64 opcodes, fixed 16-bit encoding, no MMU). A single basic-block dynarec or cached-IR dispatcher covers both, and the profile data says it would attack the actual hot path on every game tested: + +- Demos (yarc/jagniccc) → mostly GPU. +- Commercial games (Iron Soldier, Doom, Skyhammer) → mostly DSP, sometimes both. + +Closing #123 (OP) and #124 (blitter) as **wontfix-for-now** based on profile data is the honest call. Cheap wins documented in those spikes (e.g., the OP `O(N²)` discovery bug, fast-blitter SIMD widening) can still land opportunistically — they're correct fixes, just won't move the headline number. + +## Why we DIDN'T touch OP/blitter as planned + +- The OP spike (#123) called out an `O(N²)` linear scan in `OPObjectExists`. That's still a real bug worth fixing for code-quality reasons. But it costs ~1% of frame time, not the ~30% the spike speculated. Not a perf win. +- The blitter accurate-mode wins (#124) are real for users who deliberately enable that mode on a specific title. But the default user experience runs the fast blitter, which is single-digit% of frame time. + +## What 68K JIT would buy us + +Per profile: 0.7-2.6% of frame time across all ROMs. Even a 10× speedup of the 68K interpreter → at most 2-3% wall-clock improvement. **Not worth the GPLv2 / GPLv3 license dance with UAE-JIT or the ARM-only constraint of Cyclone68k.** + +The profile result also explains why the standalone Virtual Jaguar interpreter has been "fast enough" for years on desktop — modern CPUs eat the 68K interpreter for breakfast. The GPU/DSP RISC don't eat as well because of the call-overhead-per-instruction pattern. + +## Next steps + +1. Update issues #122, #123, #124 with this profile data. Keep #122 open and re-scope to GPU/DSP-only. Close #123 and #124 with the linked profile evidence. +2. New spike: feasibility of a Tom RISC cached-IR / threaded-code dispatcher — the lowest-cost approach, works on JIT-restricted platforms (iOS, Switch). Block JIT comes later if the cached IR proves the model. +3. Profile data + this doc lives at `docs/op-perf-profile.md` (this file). Re-run periodically as a same-host commit-to-commit delta. + +## Files / commands + +- Baseline benchmarks: `make benchmark BENCH_ROM= BENCH_BLITTER={fast,accurate}` — 600 frames + 60 warmup default +- Profile capture: `./test/tools/test_benchmark 6000 --warmup 60 --blitter &` then `sample $! 8 -file /tmp/sample.txt` +- Test ROMs in tree: `test/roms/yarc.j64`, `test/roms/jagniccc.j64` +- Commercial ROMs (private): `test/roms/private/Iron Soldier (1994).jag` and friends From 79a65d05e4c65a9579efbec1867cf6784ac8e623 Mon Sep 17 00:00:00 2001 From: Joseph Mattiello Date: Fri, 1 May 2026 20:05:06 -0400 Subject: [PATCH 2/3] perf: profile gameplay-state too (AvP), correct the blitter conclusion Original profile only sampled headless boot/menu state, which doesn't exercise the blitter much. Added --load-state to test_benchmark (handles RetroArch RASTATE container by extracting the MEM chunk), loaded the user's AvP state6 save (active gameplay), and re-profiled. Result: the spike reports' hypothesis about the blitter was right -- it just didn't show up in boot profiles. AvP gameplay, fast blitter: DSP ~63% (same as before), blitter 5%, GPU 4%, OP 1% AvP gameplay, accurate blitter: BlitterMidsummer2+ADDARRAY+DATA ~34% (matches docs/spike #124), DSP ~31%, GPU small OP stays at ~1% even on gameplay -- closing #123 as wontfix-for-now is still the honest call. Updated recommendation: two genuine targets, suggested order: 1. Accurate-blitter SIMD widening (#124) -- smaller surface, lower risk, immediate visible win for AvP-style accurate-blitter slowdown. ADDARRAY (src/tom/blitter.c:2090-2094) is the obvious first target. 2. RISC dynarec / cached IR (#122 RISC half) -- bigger lift, helps every game on every blitter mode. Co-Authored-By: Claude Opus 4.7 --- docs/op-perf-profile.md | 62 ++++++++++++++++++++++++---- test/tools/test_benchmark.c | 82 ++++++++++++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 9 deletions(-) diff --git a/docs/op-perf-profile.md b/docs/op-perf-profile.md index d771eb40..4e6615d4 100644 --- a/docs/op-perf-profile.md +++ b/docs/op-perf-profile.md @@ -4,9 +4,9 @@ Captured 2026-05-01 on Apple Silicon (M-series Mac), arm64 build, default flags ## TL;DR -**OP is not the bottleneck. Blitter is rarely the bottleneck either. GPU and DSP RISC interpretation dominate frame time across the board.** +**OP is not the bottleneck. GPU/DSP RISC interpretation is the biggest single target across the board. The accurate blitter is genuinely hot during gameplay (not just at boot) — comparable in cost to DSP for users who enable that mode.** -This redirects the next-up perf work from issue #123 (OP) to a different target — see "Recommendation" at the bottom. +> **Methodology fix mid-investigation:** the first round of this doc profiled the headless boot/menu state and concluded blitter was always small. That was misleading — boot screens don't draw much. Re-profiled with a save state loaded on top of the live ROM (Alien vs Predator at an in-game state6 save) and the picture changes substantially for the accurate-blitter case. Both data sets are below. ## Baseline FPS @@ -76,6 +76,33 @@ Captured to `/tmp/op-baseline/sample-*-*.txt`. Aggregated top-of-stack by symbo **The one case where the blitter genuinely matters — and only because the user opted into accurate mode. Even here, DSP is comparable.** +### `Alien vs Predator (1994)` at gameplay state6 (fast blitter) + +| Function | Samples | % of frame | +|---|---:|---:| +| `DSPExec` | 3303 | **~49%** | +| `dsp_opcode_jr` | 977 | 14% | +| **DSP TOTAL** | **~4280** | **~63%** | +| `blitter_generic` | ~340 | 5% | +| `GPUExec` | ~254 | 4% | +| `HalflineCallback` / `JaguarExecuteNew` / `JaguarReadLong` | ~310 | 5% | +| `OPProcessFixedBitmap` | 72 | 1% | + +**Same DSP-dominated picture as the boot profiles.** Blitter is meaningfully present (5%) since the scene is actively drawing, but still single-digit. + +### `Alien vs Predator (1994)` at gameplay state6 (accurate blitter) + +| Function | Samples | % of frame | +|---|---:|---:| +| **`BlitterMidsummer2` family** (`+ ADDARRAY 2090-2094, DATA, BlitterMidsummer2 itself`) | ~2330 | **~34%** | +| `DSPExec` | 1652 | **~24%** | +| `dsp_opcode_jr` | 443 | 6.5% | +| **DSP TOTAL** | **~2095** | **~31%** | + +**Accurate blitter is co-equal with DSP during gameplay.** The hot inner code is `ADDARRAY` (lines 2090-2094 in `src/tom/blitter.c`) — the per-cycle FDSYNC cascade the spike report (#124) flagged — plus `DATA()` (line 2514). Confirms the spike's hypothesis was right; the boot-only profile just didn't expose it. + +This matches the user-reported behavior: AvP slows down noticeably when the character moves, and the slowdown is worse with the accurate blitter selected. + ## What this changes The original `[spike] OP performance audit` (#123) and `[spike] Blitter performance audit` (#124) both assumed those subsystems dominate. They don't. @@ -83,18 +110,37 @@ The original `[spike] OP performance audit` (#123) and `[spike] Blitter performa | Target | Hypothesis going in | Reality | ROI | |---|---|---|---| | **OP** (#123) | "dominates on heavy-OP scenes (Wolf3D, T2K, Iron Soldier)" | 1% on Iron Soldier; doesn't make top-15 elsewhere | Low — even a 10× OP speedup buys ~1% of frame time | -| **Blitter** (#124) | "fast vs accurate matters for some games" | Fast is <2% everywhere except where accurate is opted-in (then ~21% on Skyhammer) | Medium — only for users running accurate blitter on specific titles | -| **GPU/DSP RISC** (#122 sub-component) | "JIT might help on mobile/Pi" | **24-74% of frame time, every ROM** | **High** — single dynarec helps both because they share an ISA | +| **Blitter — fast** (#124) | "default path matters" | <2% on boot, 5% on AvP gameplay — small everywhere | Low | +| **Blitter — accurate** (#124) | "matters for some games" | **~34% on AvP gameplay**, 21% on Skyhammer (boot); ADDARRAY cycle cascade dominates | **High for accurate-blitter users** | +| **GPU/DSP RISC** (#122 sub-component) | "JIT might help on mobile/Pi" | **24-74% on fast blitter, ~31% even with accurate** | **High** — single dynarec covers both (shared ISA) | | **68K** (#122 sub-component) | "wrap UAE JIT or Cyclone68k" | 0.7-2.6% — barely visible in any profile | Low | ## Recommendation -**Redirect to #122 (JIT / dynarec / cached IR), specifically the GPU/DSP RISC half.** Both Tom GPU and Jerry DSP share the same ISA (~64 opcodes, fixed 16-bit encoding, no MMU). A single basic-block dynarec or cached-IR dispatcher covers both, and the profile data says it would attack the actual hot path on every game tested: +Two genuine wins, depending on which user we optimize for: + +### Path A — RISC dynarec / cached IR (#122, RISC half only) + +Helps **everyone** (every ROM tested, every blitter mode): +- Fast-blitter users: attacks the dominant 50-75% slice +- Accurate-blitter users: attacks the ~31% DSP slice (the other ~34% is blitter) +- Cross-platform reach: cached IR / threaded code works on JIT-restricted hosts (iOS without entitlement, Switch) +- Both Tom GPU and Jerry DSP share the same ISA → single dispatcher covers both + +### Path B — accurate-blitter SIMD widening (#124, narrow scope) + +Helps **users who enable accurate blitter** specifically: +- ADDARRAY (`src/tom/blitter.c:2090-2094`, the per-cycle FDSYNC cascade) is the biggest single function — ~15% of frame time on AvP gameplay +- `BlitterMidsummer2` itself + `DATA()` add another ~19% +- Existing `test_blitter_compare` infrastructure already gates bit-exactness regressions +- Lower risk per change than dynarec — can be done in small, mergeable batches + +### Suggested order -- Demos (yarc/jagniccc) → mostly GPU. -- Commercial games (Iron Soldier, Doom, Skyhammer) → mostly DSP, sometimes both. +1. **Start with Path B (accurate-blitter perf)** — smaller surface, lower risk, immediate visible win for users hitting the AvP-style slowdown. ADDARRAY is the obvious first target. +2. **Then Path A (RISC dynarec)** — bigger lift, bigger payoff, helps everyone including accurate-blitter users still bottlenecked on DSP after step 1. -Closing #123 (OP) and #124 (blitter) as **wontfix-for-now** based on profile data is the honest call. Cheap wins documented in those spikes (e.g., the OP `O(N²)` discovery bug, fast-blitter SIMD widening) can still land opportunistically — they're correct fixes, just won't move the headline number. +Closing #123 (OP) as **wontfix-for-now** is still the honest call — even on gameplay it's <2%. The cheap wins documented in #123 (OP `O(N²)` discovery scan, hoist transparency check) remain valid as code-quality fixes but won't move the headline number. ## Why we DIDN'T touch OP/blitter as planned diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c index fc6b1992..d06f47c9 100644 --- a/test/tools/test_benchmark.c +++ b/test/tools/test_benchmark.c @@ -34,6 +34,8 @@ static void (*pretro_run)(void); static void (*pretro_unload_game)(void); static void *(*pretro_get_memory_data)(unsigned); static size_t (*pretro_get_memory_size)(unsigned); +static size_t (*pretro_serialize_size)(void); +static bool (*pretro_unserialize)(const void *, size_t); /* Options state */ static int bios_option_set = 0; @@ -181,13 +183,17 @@ static void print_usage(const char *progname) fprintf(stderr, "Usage: %s [num_frames]\n" " [--blitter fast|accurate] [--warmup N] [--load-srm file]\n" + " [--load-state file]\n" "\n" "Options:\n" " num_frames Number of frames to benchmark (default: 300)\n" " --blitter fast Use fast blitter (default)\n" " --blitter accurate Use accurate (Midsummer2) blitter\n" " --warmup N Run N warmup frames before timing\n" - " --load-srm file Load EEPROM save data from file\n", + " --load-srm file Load EEPROM save data from file\n" + " --load-state file Load a save state into the core after retro_load_game.\n" + " Accepts raw retro_serialize() payloads or RetroArch\n" + " RASTATE container files (the MEM chunk is extracted).\n", progname); } @@ -197,6 +203,7 @@ int main(int argc, char **argv) const char *core_path; const char *rom_path; const char *srm_load_path = NULL; + const char *state_load_path = NULL; struct retro_game_info info; FILE *f; long fsize; @@ -235,6 +242,8 @@ int main(int argc, char **argv) warmup_frames = atoi(argv[++i]); else if (strcmp(argv[i], "--load-srm") == 0 && i + 1 < argc) srm_load_path = argv[++i]; + else if (strcmp(argv[i], "--load-state") == 0 && i + 1 < argc) + state_load_path = argv[++i]; else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); @@ -298,6 +307,8 @@ int main(int argc, char **argv) LOAD_SYM(retro_unload_game); LOAD_SYM(retro_get_memory_data); LOAD_SYM(retro_get_memory_size); + LOAD_SYM(retro_serialize_size); + LOAD_SYM(retro_unserialize); pretro_set_environment(environment_cb); pretro_set_video_refresh(video_refresh); @@ -355,6 +366,75 @@ int main(int argc, char **argv) fprintf(stderr, "WARNING: Core reports no SAVE_RAM area\n"); } + /* Load save state if provided. Accepts both raw retro_serialize() + * payloads and RetroArch RASTATE container files (extracts the + * MEM chunk). See https://github.com/libretro/RetroArch on the + * RASTATE format. */ + if (state_load_path) + { + FILE *stf = fopen(state_load_path, "rb"); + if (!stf) + { + fprintf(stderr, "ERROR: cannot open state file: %s\n", state_load_path); + return 1; + } + { + long st_total; + uint8_t *st_buf; + const uint8_t *payload; + size_t payload_size; + size_t expected; + fseek(stf, 0, SEEK_END); + st_total = ftell(stf); + fseek(stf, 0, SEEK_SET); + st_buf = (uint8_t *)malloc(st_total); + if (fread(st_buf, 1, st_total, stf) != (size_t)st_total) + { + fprintf(stderr, "ERROR: short read on state file\n"); + free(st_buf); fclose(stf); return 1; + } + fclose(stf); + payload = st_buf; + payload_size = (size_t)st_total; + /* RASTATE container? "RASTATE" + 1 version byte, then chunks. */ + if (st_total >= 16 && memcmp(st_buf, "RASTATE", 7) == 0) + { + const uint8_t *p = st_buf + 8; /* past magic+version */ + const uint8_t *end = st_buf + st_total; + int found = 0; + while (p + 8 <= end) + { + uint32_t chunk_size = (uint32_t)p[4] | ((uint32_t)p[5] << 8) + | ((uint32_t)p[6] << 16) | ((uint32_t)p[7] << 24); + if (memcmp(p, "MEM ", 4) == 0) + { + payload = p + 8; + payload_size = chunk_size; + found = 1; + break; + } + p += 8 + chunk_size; + } + if (!found) + { + fprintf(stderr, "ERROR: no MEM chunk in RASTATE file\n"); + free(st_buf); return 1; + } + fprintf(stderr, "--- RASTATE: extracted MEM chunk (%zu bytes) ---\n", payload_size); + } + expected = pretro_serialize_size(); + fprintf(stderr, "--- State payload: %zu bytes (core expects %zu) ---\n", + payload_size, expected); + if (!pretro_unserialize(payload, payload_size)) + { + fprintf(stderr, "ERROR: retro_unserialize failed\n"); + free(st_buf); return 1; + } + fprintf(stderr, "--- State loaded from %s ---\n", state_load_path); + free(st_buf); + } + } + /* Run warmup frames (not timed) */ if (warmup_frames > 0) { From 2be427811caf8375fd5093ee73e136206a0ade22 Mon Sep 17 00:00:00 2001 From: Joseph Mattiello Date: Fri, 1 May 2026 22:37:41 -0400 Subject: [PATCH 3/3] perf: address Copilot review on test_benchmark --load-state Three concerns flagged by automated review on PR #128: * Validate fseek/ftell/malloc/fread when reading the state file (negative size, NULL malloc, short read) instead of trusting them. * Bounds-check RASTATE chunk_size against the buffer end before treating any bytes past the chunk header as payload, so a truncated/corrupt container can't push retro_unserialize past the allocation. * Route every state-load failure through a single cleanup label that closes the FILE, frees the buffer, and tears down retro_load_game / retro_init / dlopen before returning, matching the cleanup in the normal exit path. No functional change for valid state files. Smoke-tested with make benchmark (no --load-state) -- still reports FPS as before. Co-Authored-By: Claude Opus 4.7 --- test/tools/test_benchmark.c | 139 +++++++++++++++++++++++------------- 1 file changed, 91 insertions(+), 48 deletions(-) diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c index d06f47c9..2ce00cb8 100644 --- a/test/tools/test_benchmark.c +++ b/test/tools/test_benchmark.c @@ -372,66 +372,109 @@ int main(int argc, char **argv) * RASTATE format. */ if (state_load_path) { - FILE *stf = fopen(state_load_path, "rb"); + FILE *stf = NULL; + long st_total = 0; + uint8_t *st_buf = NULL; + const uint8_t *payload = NULL; + size_t payload_size = 0; + size_t expected; + const char *state_err = NULL; + + stf = fopen(state_load_path, "rb"); if (!stf) { - fprintf(stderr, "ERROR: cannot open state file: %s\n", state_load_path); - return 1; + state_err = "cannot open state file"; + goto state_fail; } + if (fseek(stf, 0, SEEK_END) != 0) { - long st_total; - uint8_t *st_buf; - const uint8_t *payload; - size_t payload_size; - size_t expected; - fseek(stf, 0, SEEK_END); - st_total = ftell(stf); - fseek(stf, 0, SEEK_SET); - st_buf = (uint8_t *)malloc(st_total); - if (fread(st_buf, 1, st_total, stf) != (size_t)st_total) - { - fprintf(stderr, "ERROR: short read on state file\n"); - free(st_buf); fclose(stf); return 1; - } - fclose(stf); - payload = st_buf; - payload_size = (size_t)st_total; - /* RASTATE container? "RASTATE" + 1 version byte, then chunks. */ - if (st_total >= 16 && memcmp(st_buf, "RASTATE", 7) == 0) + state_err = "fseek to end failed"; + goto state_fail; + } + st_total = ftell(stf); + if (st_total <= 0) + { + state_err = "ftell failed or empty state file"; + goto state_fail; + } + if (fseek(stf, 0, SEEK_SET) != 0) + { + state_err = "fseek to start failed"; + goto state_fail; + } + st_buf = (uint8_t *)malloc((size_t)st_total); + if (!st_buf) + { + state_err = "malloc failed for state buffer"; + goto state_fail; + } + if (fread(st_buf, 1, (size_t)st_total, stf) != (size_t)st_total) + { + state_err = "short read on state file"; + goto state_fail; + } + fclose(stf); + stf = NULL; + payload = st_buf; + payload_size = (size_t)st_total; + /* RASTATE container? "RASTATE" + 1 version byte, then chunks. */ + if (st_total >= 16 && memcmp(st_buf, "RASTATE", 7) == 0) + { + const uint8_t *p = st_buf + 8; /* past magic+version */ + const uint8_t *end = st_buf + st_total; + int found = 0; + /* Each chunk: 4-byte type + 4-byte LE size + payload. */ + while (p + 8 <= end) { - const uint8_t *p = st_buf + 8; /* past magic+version */ - const uint8_t *end = st_buf + st_total; - int found = 0; - while (p + 8 <= end) + uint32_t chunk_size = (uint32_t)p[4] | ((uint32_t)p[5] << 8) + | ((uint32_t)p[6] << 16) | ((uint32_t)p[7] << 24); + /* Bounds-check the declared chunk size against the buffer. */ + if (chunk_size > (uint32_t)(end - (p + 8))) { - uint32_t chunk_size = (uint32_t)p[4] | ((uint32_t)p[5] << 8) - | ((uint32_t)p[6] << 16) | ((uint32_t)p[7] << 24); - if (memcmp(p, "MEM ", 4) == 0) - { - payload = p + 8; - payload_size = chunk_size; - found = 1; - break; - } - p += 8 + chunk_size; + state_err = "RASTATE chunk size overruns buffer"; + goto state_fail; } - if (!found) + if (memcmp(p, "MEM ", 4) == 0) { - fprintf(stderr, "ERROR: no MEM chunk in RASTATE file\n"); - free(st_buf); return 1; + payload = p + 8; + payload_size = chunk_size; + found = 1; + break; } - fprintf(stderr, "--- RASTATE: extracted MEM chunk (%zu bytes) ---\n", payload_size); + p += 8 + chunk_size; } - expected = pretro_serialize_size(); - fprintf(stderr, "--- State payload: %zu bytes (core expects %zu) ---\n", - payload_size, expected); - if (!pretro_unserialize(payload, payload_size)) + if (!found) { - fprintf(stderr, "ERROR: retro_unserialize failed\n"); - free(st_buf); return 1; + state_err = "no MEM chunk in RASTATE file"; + goto state_fail; } - fprintf(stderr, "--- State loaded from %s ---\n", state_load_path); - free(st_buf); + fprintf(stderr, "--- RASTATE: extracted MEM chunk (%zu bytes) ---\n", payload_size); + } + expected = pretro_serialize_size(); + fprintf(stderr, "--- State payload: %zu bytes (core expects %zu) ---\n", + payload_size, expected); + if (!pretro_unserialize(payload, payload_size)) + { + state_err = "retro_unserialize failed"; + goto state_fail; + } + fprintf(stderr, "--- State loaded from %s ---\n", state_load_path); + free(st_buf); + st_buf = NULL; + + if (0) + { +state_fail: + fprintf(stderr, "ERROR: %s: %s\n", + state_err ? state_err : "state load error", + state_load_path); + if (stf) fclose(stf); + if (st_buf) free(st_buf); + pretro_unload_game(); + pretro_deinit(); + free((void *)info.data); + dlclose(handle); + return 1; } }