Skip to content

Commit 25278cb

Browse files
JoeMattclaude
andcommitted
bench: per-frame variance + blitter trace + perf counter lookup
The +15% inlining win shipped in the previous commit didn't fix the real-world AvP audio stutter, because audio dropouts are governed by *worst-case* frame time, not average. This commit lands the diagnostic infrastructure that found the actual cost driver. * test_benchmark now records per-frame timing and reports p50 / p99 / p999 / max ms-per-frame plus the count of frames that blew the 16.67 ms (60 Hz) budget. * perf_counters.h gains `perf_counters_find(name)` so the harness can snapshot a counter before/after each retro_run() call and report per-frame deltas. * test_benchmark uses the new lookup to print the slowest frames along with their blitter call count and inner-iteration count, and the slow-vs-avg ratio. * Makefile gains BLITTER_TRACE=1 toggle that wires -DBLITTER_TRACE; src/tom/blitter.c grows a per-blit elapsed-ms dump that fires for any single BlitterMidsummer2 call exceeding a threshold, so we can tell whether worst-case frame spikes are one giant blit or many small ones. What this revealed for AvP gameplay (state7, accurate blitter, 1200 frames after 120 warmup, M-series host): avg 5.1 ms, p99 17.0 ms, p999 18.2 ms, max 18.4 ms 25 / 1200 frames over 16.67 ms (2%) Avg per frame: 403 blits, 49,774 inner iters Slow per frame: 2009 blits, 247,508 inner iters (5x) The slow frames are perfectly periodic and consistent (5x the work on every spike, every spike same exact count). No single blit is slow -- the BLITTER_TRACE dump prints zero lines even at a 0.3 ms threshold. AvP is just doing 5x more blitter work on roughly every 24th frame. That's the audio dropout root cause: the inlining helped average throughput but the periodic 5x spikes still blow the budget. Co-Authored-By: Claude Opus 4.7 <[email protected]>
1 parent 930bcca commit 25278cb

7 files changed

Lines changed: 197 additions & 24 deletions

File tree

Makefile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,15 @@ ifeq ($(BENCH_PROFILE),1)
6060
CFLAGS += -DBENCH_PROFILE
6161
endif
6262

63+
# Per-blit slow-path tracing in BlitterMidsummer2.
64+
# `make BLITTER_TRACE=1` enables an stderr dump of any single blit
65+
# whose wall time exceeds ~1.5 ms (configurable via the threshold in
66+
# src/tom/blitter.c). Useful for finding pathological blit commands
67+
# that dominate frame-time variance. macOS-only (uses mach_*).
68+
ifeq ($(BLITTER_TRACE),1)
69+
CFLAGS += -DBLITTER_TRACE
70+
endif
71+
6372
# Symbol export gating.
6473
#
6574
# GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:

exports-test.list

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,4 @@ _vjs
3636
_perf_counters_dump
3737
_perf_counters_reset
3838
_perf_counters_register
39+
_perf_counters_find

link-test.T

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,6 @@
3939
perf_counters_dump;
4040
perf_counters_reset;
4141
perf_counters_register;
42+
perf_counters_find;
4243
local: *;
4344
};

src/core/perf_counters.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
77
* calls perf_counters_register, so the registry stays empty.
88
*/
9+
#include <string.h>
910
#include "perf_counters.h"
1011

1112
#ifdef BENCH_PROFILE
@@ -50,3 +51,17 @@ void perf_counters_dump(FILE *out)
5051
(void)out;
5152
#endif
5253
}
54+
55+
unsigned long long *perf_counters_find(const char *name)
56+
{
57+
#ifdef BENCH_PROFILE
58+
perf_counter_entry_t *e;
59+
if (!name) return (unsigned long long *)0;
60+
for (e = perf_head; e; e = e->next)
61+
if (e->name && strcmp(e->name, name) == 0)
62+
return e->value;
63+
#else
64+
(void)name;
65+
#endif
66+
return (unsigned long long *)0;
67+
}

src/core/perf_counters.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ typedef struct perf_counter_entry
4848
void perf_counters_register(perf_counter_entry_t *entry);
4949
void perf_counters_dump(FILE *out);
5050
void perf_counters_reset(void);
51+
/* Return a pointer to the named counter's value, or NULL if unknown.
52+
* Lets harnesses snapshot a counter before/after retro_run for
53+
* per-frame deltas without exporting individual symbols. */
54+
unsigned long long *perf_counters_find(const char *name);
5155

5256
#ifdef BENCH_PROFILE
5357

src/tom/blitter.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1676,9 +1676,19 @@ Unused[0] := DUMMY (unused[0]);
16761676
END;*/
16771677
}
16781678

1679+
#ifdef BLITTER_TRACE
1680+
#include <mach/mach_time.h>
1681+
#include <stdio.h>
1682+
static double bm2_trace_threshold_ms = 0.3; /* dump any blit slower than this */
1683+
static uint64_t bm2_trace_t0;
1684+
#endif
1685+
16791686
void BlitterMidsummer2(void)
16801687
{
16811688
uint32_t cmd = (PERF_INC(blitter_calls), GET32(blitter_ram, COMMAND));
1689+
#ifdef BLITTER_TRACE
1690+
bm2_trace_t0 = mach_absolute_time();
1691+
#endif
16821692

16831693

16841694
// Line states passed in via the command register
@@ -2685,6 +2695,29 @@ A1_outside := OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
26852695
SET16(blitter_ram, A2_PIXEL + 2, a2_x);
26862696
SET16(blitter_ram, A2_PIXEL + 0, a2_y);
26872697

2698+
#ifdef BLITTER_TRACE
2699+
{
2700+
static mach_timebase_info_data_t tb;
2701+
uint64_t t1 = mach_absolute_time();
2702+
double ms;
2703+
if (tb.denom == 0) mach_timebase_info(&tb);
2704+
ms = (double)(t1 - bm2_trace_t0) * (double)tb.numer / (double)tb.denom / 1e6;
2705+
if (ms >= bm2_trace_threshold_ms) {
2706+
uint16_t pcount = GET16(blitter_ram, PIXLINECOUNTER + 2);
2707+
uint16_t lcount = GET16(blitter_ram, PIXLINECOUNTER);
2708+
uint8_t pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
2709+
fprintf(stderr,
2710+
"[BLITTER_TRACE] %.2f ms cmd=%08x pixsize=%u inner=%u outer=%u "
2711+
"src(en=%d enx=%d enz=%d) dst(en=%d enz=%d wrz=%d) "
2712+
"gourd=%d gourz=%d srcshade=%d bcompen=%d dcompen=%d\n",
2713+
ms, cmd, pixsize, pcount, lcount,
2714+
(int)srcen, (int)srcenx, (int)srcenz,
2715+
(int)dsten, (int)dstenz, (int)dstwrz,
2716+
(int)gourd, (int)gourz, (int)srcshade,
2717+
(int)bcompen, (int)dcompen);
2718+
}
2719+
}
2720+
#endif
26882721
}
26892722

26902723
// Various pieces of the blitter puzzle are teased out here...

test/tools/test_benchmark.c

Lines changed: 134 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ static size_t (*pretro_serialize_size)(void);
3838
static bool (*pretro_unserialize)(const void *, size_t);
3939
/* Optional: only present when the core was built with BENCH_PROFILE=1. */
4040
static void (*pperf_counters_dump)(FILE *);
41+
static unsigned long long *(*pperf_counters_find)(const char *);
4142

4243
/* Options state */
4344
static int bios_option_set = 0;
@@ -312,8 +313,9 @@ int main(int argc, char **argv)
312313
LOAD_SYM(retro_serialize_size);
313314
LOAD_SYM(retro_unserialize);
314315

315-
/* Optional perf-counter dump; absent unless built with BENCH_PROFILE=1. */
316+
/* Optional perf-counter access; absent unless built with BENCH_PROFILE=1. */
316317
pperf_counters_dump = dlsym(handle, "perf_counters_dump");
318+
pperf_counters_find = dlsym(handle, "perf_counters_find");
317319

318320
pretro_set_environment(environment_cb);
319321
pretro_set_video_refresh(video_refresh);
@@ -492,29 +494,137 @@ int main(int argc, char **argv)
492494
fprintf(stderr, "--- Warmup complete ---\n");
493495
}
494496

495-
/* Timed run */
496-
fprintf(stderr, "--- Benchmarking %d frames ---\n", num_frames);
497-
t_start = timer_now();
498-
499-
for (i = 0; i < num_frames; i++)
500-
pretro_run();
501-
502-
t_end = timer_now();
503-
504-
elapsed = timer_elapsed_sec(t_start, t_end);
505-
fps = (double)num_frames / elapsed;
506-
ms_per_frame = (elapsed * 1000.0) / (double)num_frames;
507-
508-
/* Print results */
509-
printf("\n=== BENCHMARK RESULTS ===\n");
510-
printf("Blitter mode: %s\n",
511-
strcmp(blitter_value, "enabled") == 0 ? "fast" : "accurate");
512-
printf("Frames measured: %d\n", num_frames);
513-
printf("Warmup frames: %d\n", warmup_frames);
514-
printf("Total time: %.3f s\n", elapsed);
515-
printf("Frames/sec: %.2f\n", fps);
516-
printf("Time/frame: %.3f ms\n", ms_per_frame);
517-
printf("=========================\n");
497+
/* Timed run with per-frame samples to expose variance. Audio
498+
* dropouts in real frontends are caused by *worst-case* frames
499+
* exceeding the 16.6 ms (60 Hz) budget, not by the average. */
500+
{
501+
double *frame_ms = (double *)malloc((size_t)num_frames * sizeof(double));
502+
unsigned long long *blit_calls_at_frame = (unsigned long long *)malloc((size_t)num_frames * sizeof(unsigned long long));
503+
unsigned long long *blit_inner_at_frame = (unsigned long long *)malloc((size_t)num_frames * sizeof(unsigned long long));
504+
double frame_budget_ms = 1000.0 / 60.0;
505+
int over_budget = 0;
506+
double max_ms = 0.0;
507+
double p50_ms = 0.0, p99_ms = 0.0, p999_ms = 0.0;
508+
unsigned long long *blit_calls_ctr = pperf_counters_find ? pperf_counters_find("blitter_calls") : NULL;
509+
unsigned long long *blit_inner_ctr = pperf_counters_find ? pperf_counters_find("blitter_inner") : NULL;
510+
unsigned long long blit_calls_prev = blit_calls_ctr ? *blit_calls_ctr : 0;
511+
unsigned long long blit_inner_prev = blit_inner_ctr ? *blit_inner_ctr : 0;
512+
513+
if (!frame_ms || !blit_calls_at_frame || !blit_inner_at_frame)
514+
{
515+
fprintf(stderr, "ERROR: malloc failed for per-frame timing\n");
516+
pretro_unload_game(); pretro_deinit();
517+
free((void *)info.data); dlclose(handle);
518+
return 1;
519+
}
520+
521+
fprintf(stderr, "--- Benchmarking %d frames ---\n", num_frames);
522+
t_start = timer_now();
523+
524+
for (i = 0; i < num_frames; i++)
525+
{
526+
uint64_t f0 = timer_now();
527+
uint64_t f1;
528+
pretro_run();
529+
f1 = timer_now();
530+
frame_ms[i] = timer_elapsed_sec(f0, f1) * 1000.0;
531+
if (blit_calls_ctr) {
532+
blit_calls_at_frame[i] = *blit_calls_ctr - blit_calls_prev;
533+
blit_calls_prev = *blit_calls_ctr;
534+
} else blit_calls_at_frame[i] = 0;
535+
if (blit_inner_ctr) {
536+
blit_inner_at_frame[i] = *blit_inner_ctr - blit_inner_prev;
537+
blit_inner_prev = *blit_inner_ctr;
538+
} else blit_inner_at_frame[i] = 0;
539+
}
540+
541+
t_end = timer_now();
542+
543+
elapsed = timer_elapsed_sec(t_start, t_end);
544+
fps = (double)num_frames / elapsed;
545+
ms_per_frame = (elapsed * 1000.0) / (double)num_frames;
546+
547+
/* Quicksort copy so the original order is preserved for any
548+
* later analysis (currently we don't print it, but cheap). */
549+
{
550+
double *sorted = (double *)malloc((size_t)num_frames * sizeof(double));
551+
int j;
552+
if (sorted)
553+
{
554+
memcpy(sorted, frame_ms, (size_t)num_frames * sizeof(double));
555+
/* Insertion sort (small N typical). */
556+
for (i = 1; i < num_frames; i++)
557+
{
558+
double key = sorted[i];
559+
j = i - 1;
560+
while (j >= 0 && sorted[j] > key) { sorted[j + 1] = sorted[j]; j--; }
561+
sorted[j + 1] = key;
562+
}
563+
p50_ms = sorted[(int)((double)num_frames * 0.50)];
564+
p99_ms = sorted[(int)((double)num_frames * 0.99)];
565+
p999_ms = sorted[(int)((double)num_frames * 0.999)];
566+
max_ms = sorted[num_frames - 1];
567+
free(sorted);
568+
}
569+
}
570+
for (i = 0; i < num_frames; i++)
571+
if (frame_ms[i] > frame_budget_ms) over_budget++;
572+
573+
/* Print results */
574+
printf("\n=== BENCHMARK RESULTS ===\n");
575+
printf("Blitter mode: %s\n",
576+
strcmp(blitter_value, "enabled") == 0 ? "fast" : "accurate");
577+
printf("Frames measured: %d\n", num_frames);
578+
printf("Warmup frames: %d\n", warmup_frames);
579+
printf("Total time: %.3f s\n", elapsed);
580+
printf("Frames/sec: %.2f\n", fps);
581+
printf("Time/frame avg: %.3f ms\n", ms_per_frame);
582+
printf("Time/frame p50: %.3f ms\n", p50_ms);
583+
printf("Time/frame p99: %.3f ms\n", p99_ms);
584+
printf("Time/frame p999: %.3f ms\n", p999_ms);
585+
printf("Time/frame max: %.3f ms\n", max_ms);
586+
printf("Over 16.67 ms: %d / %d frames (%.2f%%)\n",
587+
over_budget, num_frames, 100.0 * over_budget / num_frames);
588+
printf("=========================\n");
589+
590+
/* If we have per-frame blitter counters, dump the slowest frames
591+
* so we can correlate blit volume with frame-time spikes. */
592+
if (over_budget > 0 && blit_calls_ctr) {
593+
int j;
594+
double avg_calls = 0.0, avg_inner = 0.0;
595+
double slow_calls = 0.0, slow_inner = 0.0;
596+
int slow_n = 0;
597+
printf("\n--- Worst frames (>16.67ms) -----------------------------\n");
598+
printf(" idx frame_ms blit_calls blit_inner_iter\n");
599+
for (j = 0; j < num_frames; j++) {
600+
avg_calls += blit_calls_at_frame[j];
601+
avg_inner += blit_inner_at_frame[j];
602+
if (frame_ms[j] > frame_budget_ms) {
603+
slow_calls += blit_calls_at_frame[j];
604+
slow_inner += blit_inner_at_frame[j];
605+
slow_n++;
606+
if (slow_n <= 12)
607+
printf(" %4d %7.2f %10llu %15llu\n",
608+
j, frame_ms[j],
609+
blit_calls_at_frame[j],
610+
blit_inner_at_frame[j]);
611+
}
612+
}
613+
printf("---\n");
614+
printf("Avg per frame (all): blits=%.0f inner_iter=%.0f\n",
615+
avg_calls / num_frames, avg_inner / num_frames);
616+
if (slow_n > 0)
617+
printf("Avg per frame (slow): blits=%.0f inner_iter=%.0f (%dx, %dx vs avg)\n",
618+
slow_calls / slow_n, slow_inner / slow_n,
619+
(int)((slow_calls / slow_n) / (avg_calls / num_frames + 1e-9)),
620+
(int)((slow_inner / slow_n) / (avg_inner / num_frames + 1e-9)));
621+
printf("=========================================================\n");
622+
}
623+
624+
free(frame_ms);
625+
free(blit_calls_at_frame);
626+
free(blit_inner_at_frame);
627+
}
518628

519629
if (pperf_counters_dump)
520630
pperf_counters_dump(stderr);

0 commit comments

Comments
 (0)