Merge pull request #129 from libretro/feature/blitter-perf-pass1

JoeMatt · web-flow · commit 484f205e5b65 · 2026-05-02T15:57:31.000-04:00
perf(blitter): +15% AvP gameplay accurate via inlining ADDARRAY/DATA/COMP_CTRL
diff --git a/Makefile b/Makefile
@@ -53,6 +53,22 @@ ifeq ($(DEBUG),1)
    CFLAGS += -DBUILD_TIMESTAMP="\"debug $(shell date -u +%Y-%m-%dT%H:%M:%SZ)\""
 endif
 
+# BENCH_PROFILE=1 turns on the opt-in counters from src/core/perf_counters.h:
+# -DBENCH_PROFILE makes PERF_COUNTER/PERF_INC expand to real code, whereas
+# without it every counter macro compiles to a no-op.
+ifeq (1,$(BENCH_PROFILE))
+   CFLAGS += -DBENCH_PROFILE
+endif
+
+# BLITTER_TRACE=1 turns on per-blit slow-path tracing in BlitterMidsummer2:
+# any single blit whose wall time exceeds ~1.5 ms is dumped to stderr (the
+# threshold is configurable in src/tom/blitter.c).  Handy for tracking down
+# pathological blit commands that dominate frame-time variance.
+# macOS-only (uses mach_*).
+ifeq (1,$(BLITTER_TRACE))
+   CFLAGS += -DBLITTER_TRACE
+endif
+
 # Symbol export gating.
 #
 #   GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
@@ -869,17 +885,25 @@ BENCH_ROM     ?= test/roms/yarc.j64
 BENCH_FRAMES  ?= 600
 BENCH_WARMUP  ?= 60
 BENCH_BLITTER ?= fast
-benchmark: $(TARGET)
-	@# Build the harness inline so this works whether or not TEST_EXPORTS=1
-	@# was used for $(TARGET); the harness only uses retro_* exports.
-	@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc
-	@# (and Apple's clang silently accepts -ldl as a no-op, but other
-	@# linkers may not).
+# BENCH_PROFILE=1 builds the core with the src/core/perf_counters.h counters
+# and TEST_EXPORTS=1 (wide-export ABI) so test_benchmark can dlsym `perf_counters_dump`.
+ifeq ($(BENCH_PROFILE),1)
+BENCH_TEST_EXPORTS := TEST_EXPORTS=1
+else
+BENCH_TEST_EXPORTS :=
+endif
+benchmark:
+	@# Re-invoke make so BENCH_PROFILE / TEST_EXPORTS take effect on the .so/.dylib.
+	$(MAKE) $(BENCH_TEST_EXPORTS) BENCH_PROFILE=$(BENCH_PROFILE) -j$(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
+	@# Build the harness inline; it dlopens the core, so it only needs the retro_* ABI
+	@# (plus the optional perf_counters_dump symbol when BENCH_PROFILE=1).
+	@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc.
 	$(CC) -O2 -Wall -std=c99 $(INCFLAGS) \
 		-o test/tools/test_benchmark test/tools/test_benchmark.c \
 		$(if $(filter Linux,$(shell uname -s)),-ldl)
 	./test/tools/test_benchmark ./$(TARGET) "$(BENCH_ROM)" $(BENCH_FRAMES) \
-		--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER)
+		--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER) \
+		$(if $(BENCH_STATE),--load-state "$(BENCH_STATE)")
 
 print-%:
 	@echo '$*=$($*)'
diff --git a/Makefile.common b/Makefile.common
@@ -33,6 +33,7 @@ SOURCES_C :=  \
 	$(CORE_DIR)/src/cd/cdrom.c \
 	$(CORE_DIR)/src/core/cheat.c \
 	$(CORE_DIR)/src/core/crc32.c \
+	$(CORE_DIR)/src/core/perf_counters.c \
 	$(CORE_DIR)/src/core/event.c \
 	$(CORE_DIR)/src/jerry/eeprom.c \
 	$(CORE_DIR)/src/core/filedb.c \
diff --git a/docs/profiling.md b/docs/profiling.md
@@ -23,11 +23,21 @@ Reports `Frames/sec`, `Time/frame`, total wall time.  Boots the core via `dlopen
 
 **Instruments (Time Profiler)** is the easiest way to get a flame graph on macOS.
 
+The wrapper at `scripts/profile-mac.sh` builds the core, runs the benchmark
+under `xctrace`, and writes a `.trace` bundle you can open in Instruments:
+
+```bash
+scripts/profile-mac.sh                                    # default: Time Profiler, accurate blitter
+scripts/profile-mac.sh --template "CPU Counters"          # PMU: cycles, instructions, branch misses
+scripts/profile-mac.sh --rom test/roms/yarc.j64 --open    # auto-open the trace
+```
+
+Manual invocation if you'd rather attach to a running process:
+
 ```bash
 make benchmark BENCH_FRAMES=6000 BENCH_WARMUP=120 &
 BENCH_PID=$!
 
-# Sample for 30 seconds, output to .trace bundle
 xcrun xctrace record --template "Time Profiler" --attach $BENCH_PID --output bench.trace --time-limit 30s
 open bench.trace
 ```
@@ -41,6 +51,57 @@ sample $BENCH_PID 5 -file /tmp/sample.txt
 # 5-second sample.  Read /tmp/sample.txt for collapsed call stacks.
 ```
 
+## Bespoke counters — `BENCH_PROFILE=1`
+
+Sampling profilers tell you *where* time goes; counters tell you *how often*
+something happens.  When you want exact iteration counts (e.g., "did my
+fast-path actually skip the inner loop?"), use the `perf_counters` system in
+`src/core/perf_counters.h`.
+
+```bash
+make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate BENCH_FRAMES=300
+# ...
+# [perf] counter dump:
+# [perf]   blitter_phrase_writes                    3034993
+# [perf]   blitter_phrase_reads                     931821
+# [perf]   blitter_inner_io                         3966814
+# [perf]   blitter_inner                            4131151
+# [perf]   blitter_outer                            337722
+# [perf]   blitter_calls                            131628
+```
+
+The macros are zero-overhead when `BENCH_PROFILE` is undefined (the default
+build): every `PERF_INC` becomes `((void)0)` and every `PERF_COUNTER`
+becomes a typedef.  Use them freely in hot paths to test hypotheses
+about how often a code path actually runs.
+
+Adding a counter:
+
+```c
+#include "perf_counters.h"
+
+PERF_COUNTER(my_event);             /* file scope */
+
+void hot(void) {
+    PERF_INC(my_event);             /* in-loop */
+    PERF_ADD(my_event, n);          /* batch */
+}
+```
+
+The harness (`test/tools/test_benchmark.c`) calls
+`perf_counters_dump(stderr)` at exit; counter values appear right
+before the `BENCHMARK RESULTS` block.
+
+When to reach for this vs. Time Profiler:
+
+| Question | Tool |
+|---|---|
+| "Where are we spending cycles?" | `xctrace` Time Profiler |
+| "How many times does the inner loop run per frame?" | `BENCH_PROFILE=1` |
+| "What fraction of inner iterations are no-ops?" | `BENCH_PROFILE=1` |
+| "Are we hitting L1 / branch-mispredicting?" | `xctrace` CPU Counters |
+| "Did this optimization change behavior, not just timing?" | `BENCH_PROFILE=1` (deltas in counts) |
+
 ## Linux — `perf` + flamegraph
 
 ```bash
diff --git a/exports-test.list b/exports-test.list
@@ -33,3 +33,7 @@ _sclk
 _smode
 _lowerField
 _vjs
+_perf_counters_dump
+_perf_counters_reset
+_perf_counters_register
+_perf_counters_find
diff --git a/link-test.T b/link-test.T
@@ -36,5 +36,9 @@
       smode;
       lowerField;
       vjs;
+      perf_counters_dump;
+      perf_counters_reset;
+      perf_counters_register;
+      perf_counters_find;
    local: *;
 };
diff --git a/scripts/profile-mac.sh b/scripts/profile-mac.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+#
+# profile-mac.sh -- Run test_benchmark under Xcode Instruments on Apple Silicon
+# (or any Mac with Xcode CLT).
+#
+# Usage:
+#   scripts/profile-mac.sh [--template NAME] [--frames N] [--warmup N]
+#                          [--blitter fast|accurate] [--rom PATH] [--open]
+#
+# Defaults:
+#   template = "Time Profiler"
+#   frames   = 600  warmup = 60  blitter = accurate
+#   rom      = test/roms/yarc.j64
+#   --open   = open the .trace bundle in Instruments when finished
+#
+# Common templates:
+#   "Time Profiler"   -- where time is being spent (call tree / flame)
+#   "CPU Counters"    -- Apple Silicon PMU (cycles, instr, branches, misses)
+#   "System Trace"    -- syscalls, scheduler, VM events
+#
+set -euo pipefail
+
+TEMPLATE="Time Profiler"
+FRAMES=600
+WARMUP=60
+BLITTER=accurate
+ROM="test/roms/yarc.j64"
+OPEN_TRACE=0
+
+# Value-taking options use ${2?...} so that a missing argument reports which
+# option was incomplete instead of `set -u`'s bare "unbound variable".
+while [ $# -gt 0 ]; do
+   case "$1" in
+      --template) TEMPLATE="${2?--template requires a value}"; shift 2 ;;
+      --frames)   FRAMES="${2?--frames requires a value}"; shift 2 ;;
+      --warmup)   WARMUP="${2?--warmup requires a value}"; shift 2 ;;
+      --blitter)  BLITTER="${2?--blitter requires a value}"; shift 2 ;;
+      --rom)      ROM="${2?--rom requires a value}"; shift 2 ;;
+      --open)     OPEN_TRACE=1; shift ;;
+      -h|--help)
+         sed -n '2,20p' "$0"
+         exit 0 ;;
+      *)
+         echo "Unknown arg: $1" >&2
+         exit 2 ;;
+   esac
+done
+
+if ! command -v xctrace >/dev/null 2>&1; then
+   echo "xctrace not found. Install Xcode Command Line Tools: xcode-select --install" >&2
+   exit 1
+fi
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+# Fail fast on a missing ROM, before spending time on the build and on
+# launching xctrace.  Relative paths are resolved from the repo root.
+if [ ! -f "$ROM" ]; then
+   echo "ROM not found: $ROM (relative paths are resolved from $ROOT)" >&2
+   exit 1
+fi
+
+mkdir -p build
+TRACE="build/profile-$(date +%Y%m%d-%H%M%S).trace"
+
+# Make sure the core + harness are built (no BENCH_PROFILE; profiling
+# instrumentation skews sampling results).
+make -j"$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)" >/dev/null
+cc -O2 -Wall -std=c99 -I. -I./libretro-common/include \
+   -o test/tools/test_benchmark test/tools/test_benchmark.c
+
+CORE="./virtualjaguar_libretro.dylib"
+HARNESS="./test/tools/test_benchmark"
+
+echo ">>> xctrace template:   $TEMPLATE"
+echo ">>> trace output:       $TRACE"
+echo ">>> rom / blitter:      $ROM / $BLITTER"
+echo ">>> frames (+warmup):   $FRAMES (+$WARMUP)"
+
+xctrace record \
+   --template "$TEMPLATE" \
+   --output "$TRACE" \
+   --launch -- "$HARNESS" "$CORE" "$ROM" "$FRAMES" \
+                          --warmup "$WARMUP" --blitter "$BLITTER"
+
+echo ">>> trace written to $TRACE"
+if [ "$OPEN_TRACE" = "1" ]; then
+   open "$TRACE"
+fi
diff --git a/src/core/perf_counters.c b/src/core/perf_counters.c
@@ -0,0 +1,96 @@
+/*
+ * perf_counters.c - registry + dump for opt-in instrumentation counters.
+ *
+ * The register/dump/reset functions are *always* defined so they can be
+ * exported through the test ABI without conditional linker scripts.
+ * In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
+ * calls perf_counters_register, so the registry stays empty.
+ */
+#include <string.h>
+#include "perf_counters.h"
+
+#ifdef BENCH_PROFILE
+/* Head of the singly linked registry; new entries are pushed at the front. */
+static perf_counter_entry_t *perf_head = (perf_counter_entry_t *)0;
+#endif
+
+/*
+ * Link `entry` at the head of the registry.
+ *
+ * NULL and already-registered entries are ignored.  A non-NULL `next` is
+ * not a sufficient "already linked" test on its own: the first entry ever
+ * registered is the list tail and keeps next == NULL, so registering it a
+ * second time would push it again and close the list into a cycle.  The
+ * walk below catches that case.
+ */
+void perf_counters_register(perf_counter_entry_t *entry)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!entry || entry->next)
+      return; /* NULL, or already linked ahead of the tail */
+   for (e = perf_head; e; e = e->next)
+      if (e == entry)
+         return; /* already linked as the tail */
+   entry->next = perf_head;
+   perf_head = entry;
+#else
+   (void)entry;
+#endif
+}
+
+/* Zero every registered counter; the registry itself is left untouched. */
+void perf_counters_reset(void)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   for (e = perf_head; e; e = e->next)
+      *e->value = 0;
+#endif
+}
+
+/*
+ * Print a "[perf]" header line followed by one name/value line per counter
+ * to `out` (stderr when `out` is NULL), walking from the list head, i.e.
+ * most recently registered first.  Prints nothing in !BENCH_PROFILE builds.
+ */
+void perf_counters_dump(FILE *out)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!out)
+      out = stderr;
+   if (!perf_head) {
+      fprintf(out, "[perf] no counters registered\n");
+      return;
+   }
+   fprintf(out, "[perf] counter dump:\n");
+   for (e = perf_head; e; e = e->next) {
+      /* perf_counters_find tolerates a NULL name; do the same here
+       * rather than hand NULL to %s. */
+      fprintf(out, "[perf]   %-40s %llu\n",
+              e->name ? e->name : "(unnamed)", *e->value);
+   }
+#else
+   (void)out;
+#endif
+}
+
+/*
+ * Return a pointer to the value of the counter registered under `name`, or
+ * NULL when `name` is NULL, no such counter is registered, or the build
+ * lacks BENCH_PROFILE.
+ */
+unsigned long long *perf_counters_find(const char *name)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!name) return (unsigned long long *)0;
+   for (e = perf_head; e; e = e->next)
+      if (e->name && strcmp(e->name, name) == 0)
+         return e->value;
+#else
+   (void)name;
+#endif
+   return (unsigned long long *)0;
+}
diff --git a/src/core/perf_counters.h b/src/core/perf_counters.h
diff --git a/src/tom/blitter.c b/src/tom/blitter.c
diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c