Skip to content

Commit 484f205

Browse files
authored
Merge pull request #129 from libretro/feature/blitter-perf-pass1
perf(blitter): +15% AvP gameplay accurate via inlining ADDARRAY/DATA/COMP_CTRL
2 parents 662b13f + 57741b4 commit 484f205

10 files changed

Lines changed: 2290 additions & 1736 deletions

File tree

Makefile

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,22 @@ ifeq ($(DEBUG),1)
5353
CFLAGS += -DBUILD_TIMESTAMP="\"debug $(shell date -u +%Y-%m-%dT%H:%M:%SZ)\""
5454
endif
5555

56+
# Opt-in instrumentation counters (src/core/perf_counters.h).
57+
# `make BENCH_PROFILE=1` defines the macro so PERF_COUNTER/PERF_INC
58+
# emit real code; otherwise every counter macro is a no-op.
59+
ifeq ($(BENCH_PROFILE),1)
60+
CFLAGS += -DBENCH_PROFILE
61+
endif
62+
63+
# Per-blit slow-path tracing in BlitterMidsummer2.
64+
# `make BLITTER_TRACE=1` enables an stderr dump of any single blit
65+
# whose wall time exceeds ~1.5 ms (configurable via the threshold in
66+
# src/tom/blitter.c). Useful for finding pathological blit commands
67+
# that dominate frame-time variance. macOS-only (uses mach_*).
68+
ifeq ($(BLITTER_TRACE),1)
69+
CFLAGS += -DBLITTER_TRACE
70+
endif
71+
5672
# Symbol export gating.
5773
#
5874
# GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
@@ -869,17 +885,25 @@ BENCH_ROM ?= test/roms/yarc.j64
869885
BENCH_FRAMES ?= 600
870886
BENCH_WARMUP ?= 60
871887
BENCH_BLITTER ?= fast
872-
benchmark: $(TARGET)
873-
@# Build the harness inline so this works whether or not TEST_EXPORTS=1
874-
@# was used for $(TARGET); the harness only uses retro_* exports.
875-
@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc
876-
@# (and Apple's clang silently accepts -ldl as a no-op, but other
877-
@# linkers may not).
888+
# BENCH_PROFILE=1 enables src/core/perf_counters.h instrumentation and
889+
# wide-export ABI so test_benchmark can dlsym `perf_counters_dump`.
890+
ifeq ($(BENCH_PROFILE),1)
891+
BENCH_TEST_EXPORTS := TEST_EXPORTS=1
892+
else
893+
BENCH_TEST_EXPORTS :=
894+
endif
895+
benchmark:
896+
@# Re-invoke make so BENCH_PROFILE / TEST_EXPORTS take effect on the .so/.dylib.
897+
$(MAKE) $(BENCH_TEST_EXPORTS) BENCH_PROFILE=$(BENCH_PROFILE) -j$(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
898+
@# Build the harness inline; it dlopens the core, so it only needs the retro_* ABI
899+
@# (plus the optional perf_counters_dump symbol when BENCH_PROFILE=1).
900+
@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc.
878901
$(CC) -O2 -Wall -std=c99 $(INCFLAGS) \
879902
-o test/tools/test_benchmark test/tools/test_benchmark.c \
880903
$(if $(filter Linux,$(shell uname -s)),-ldl)
881904
./test/tools/test_benchmark ./$(TARGET) "$(BENCH_ROM)" $(BENCH_FRAMES) \
882-
--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER)
905+
--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER) \
906+
$(if $(BENCH_STATE),--load-state "$(BENCH_STATE)")
883907

884908
print-%:
885909
@echo '$*=$($*)'

Makefile.common

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ SOURCES_C := \
3333
$(CORE_DIR)/src/cd/cdrom.c \
3434
$(CORE_DIR)/src/core/cheat.c \
3535
$(CORE_DIR)/src/core/crc32.c \
36+
$(CORE_DIR)/src/core/perf_counters.c \
3637
$(CORE_DIR)/src/core/event.c \
3738
$(CORE_DIR)/src/jerry/eeprom.c \
3839
$(CORE_DIR)/src/core/filedb.c \

docs/profiling.md

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,21 @@ Reports `Frames/sec`, `Time/frame`, total wall time. Boots the core via `dlopen
2323

2424
**Instruments (Time Profiler)** is the easiest way to get a flame graph on macOS.
2525

26+
The wrapper at `scripts/profile-mac.sh` builds the core, runs the benchmark
27+
under `xctrace`, and writes a `.trace` bundle you can open in Instruments:
28+
29+
```bash
30+
scripts/profile-mac.sh # default: Time Profiler, accurate blitter
31+
scripts/profile-mac.sh --template "CPU Counters" # PMU: cycles, instructions, branch misses
32+
scripts/profile-mac.sh --rom test/roms/yarc.j64 --open # auto-open the trace
33+
```
34+
35+
Manual invocation if you'd rather attach to a running process:
36+
2637
```bash
2738
make benchmark BENCH_FRAMES=6000 BENCH_WARMUP=120 &
2839
BENCH_PID=$!
2940

30-
# Sample for 30 seconds, output to .trace bundle
3141
xcrun xctrace record --template "Time Profiler" --attach $BENCH_PID --output bench.trace --time-limit 30s
3242
open bench.trace
3343
```
@@ -41,6 +51,57 @@ sample $BENCH_PID 5 -file /tmp/sample.txt
4151
# 5-second sample. Read /tmp/sample.txt for collapsed call stacks.
4252
```
4353

54+
## Bespoke counters — `BENCH_PROFILE=1`
55+
56+
Sampling profilers tell you *where* time goes; counters tell you *how often*
57+
something happens. When you want exact iteration counts (e.g., "did my
58+
fast-path actually skip the inner loop?"), use the `perf_counters` system in
59+
`src/core/perf_counters.h`.
60+
61+
```bash
62+
make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate BENCH_FRAMES=300
63+
# ...
64+
# [perf] counter dump:
65+
# [perf] blitter_phrase_writes 3034993
66+
# [perf] blitter_phrase_reads 931821
67+
# [perf] blitter_inner_io 3966814
68+
# [perf] blitter_inner 4131151
69+
# [perf] blitter_outer 337722
70+
# [perf] blitter_calls 131628
71+
```
72+
73+
The macros are zero-overhead when `BENCH_PROFILE` is undefined (default
74+
build) — every `PERF_INC` becomes `((void)0)`, every `PERF_COUNTER`
75+
becomes a typedef. Use them freely in hot paths to instrument
76+
hypotheses.
77+
78+
Adding a counter:
79+
80+
```c
81+
#include "perf_counters.h"
82+
83+
PERF_COUNTER(my_event); /* file scope */
84+
85+
void hot(void) {
86+
PERF_INC(my_event); /* in-loop */
87+
PERF_ADD(my_event, n); /* batch */
88+
}
89+
```
90+
91+
The harness (`test/tools/test_benchmark.c`) calls
92+
`perf_counters_dump(stderr)` at exit; counter values appear right
93+
before the `BENCHMARK RESULTS` block.
94+
95+
When to reach for this vs. Time Profiler:
96+
97+
| Question | Tool |
98+
|---|---|
99+
| "Where are we spending cycles?" | `xctrace` Time Profiler |
100+
| "How many times does the inner loop run per frame?" | `BENCH_PROFILE=1` |
101+
| "What fraction of inner iterations are no-ops?" | `BENCH_PROFILE=1` |
102+
| "Are we hitting L1 / branch-mispredicting?" | `xctrace` CPU Counters |
103+
| "Did this optimization change behavior, not just timing?" | `BENCH_PROFILE=1` (deltas in counts) |
104+
44105
## Linux — `perf` + flamegraph
45106
46107
```bash

exports-test.list

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,7 @@ _sclk
3333
_smode
3434
_lowerField
3535
_vjs
36+
_perf_counters_dump
37+
_perf_counters_reset
38+
_perf_counters_register
39+
_perf_counters_find

link-test.T

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,9 @@
3636
smode;
3737
lowerField;
3838
vjs;
39+
perf_counters_dump;
40+
perf_counters_reset;
41+
perf_counters_register;
42+
perf_counters_find;
3943
local: *;
4044
};

scripts/profile-mac.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env bash
#
# profile-mac.sh -- Run test_benchmark under Xcode Instruments on Apple Silicon
# (or any Mac with Xcode CLT).
#
# Usage:
# scripts/profile-mac.sh [--template NAME] [--frames N] [--warmup N]
# [--blitter fast|accurate] [--rom PATH] [--open]
#
# Defaults:
# template = "Time Profiler"
# frames = 600 warmup = 60 blitter = accurate
# rom = test/roms/yarc.j64
# --open = open the .trace bundle in Instruments when finished
#
# Common templates:
# "Time Profiler" -- where time is being spent (call tree / flame)
# "CPU Counters" -- Apple Silicon PMU (cycles, instr, branches, misses)
# "System Trace" -- syscalls, scheduler, VM events
#
set -euo pipefail

# NOTE: --help prints lines 2-20 of this file verbatim, so the header
# comment above must keep occupying exactly those lines.

TEMPLATE="Time Profiler"
FRAMES=600
WARMUP=60
BLITTER=accurate
ROM="test/roms/yarc.j64"
OPEN_TRACE=0

# require_value OPTION [VALUE ...]
# Exit with a usage error (status 2) when OPTION is the last argument.
# Without this check `set -u` trips on "$2" and the user only sees an
# opaque "unbound variable" message.
require_value() {
    if [ $# -lt 2 ]; then
        echo "Missing value for $1" >&2
        exit 2
    fi
}

while [ $# -gt 0 ]; do
    case "$1" in
        --template) require_value "$@"; TEMPLATE="$2"; shift 2 ;;
        --frames)   require_value "$@"; FRAMES="$2"; shift 2 ;;
        --warmup)   require_value "$@"; WARMUP="$2"; shift 2 ;;
        --blitter)  require_value "$@"; BLITTER="$2"; shift 2 ;;
        --rom)      require_value "$@"; ROM="$2"; shift 2 ;;
        --open)     OPEN_TRACE=1; shift ;;
        -h|--help)
            sed -n '2,20p' "$0"
            exit 0 ;;
        *)
            echo "Unknown arg: $1" >&2
            exit 2 ;;
    esac
done

if ! command -v xctrace >/dev/null 2>&1; then
    echo "xctrace not found. Install Xcode Command Line Tools: xcode-select --install" >&2
    exit 1
fi

# Run everything from the repository root so the relative paths below
# (build/, test/tools/, the core dylib) resolve regardless of the caller's cwd.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

mkdir -p build
TRACE="build/profile-$(date +%Y%m%d-%H%M%S).trace"

# Make sure the core + harness are built (no BENCH_PROFILE; profiling
# instrumentation skews sampling results).
make -j"$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)" >/dev/null
cc -O2 -Wall -std=c99 -I. -I./libretro-common/include \
    -o test/tools/test_benchmark test/tools/test_benchmark.c

CORE="./virtualjaguar_libretro.dylib"
HARNESS="./test/tools/test_benchmark"

echo ">>> xctrace template: $TEMPLATE"
echo ">>> trace output: $TRACE"
echo ">>> rom / blitter: $ROM / $BLITTER"
echo ">>> frames (+warmup): $FRAMES (+$WARMUP)"

xctrace record \
    --template "$TEMPLATE" \
    --output "$TRACE" \
    --launch -- "$HARNESS" "$CORE" "$ROM" "$FRAMES" \
    --warmup "$WARMUP" --blitter "$BLITTER"

echo ">>> trace written to $TRACE"
if [ "$OPEN_TRACE" = "1" ]; then
    open "$TRACE"
fi

src/core/perf_counters.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* perf_counters.c - registry + dump for opt-in instrumentation counters.
3+
*
4+
* The register/dump/reset functions are *always* defined so they can be
5+
* exported through the test ABI without conditional linker scripts.
6+
* In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
7+
* calls perf_counters_register, so the registry stays empty.
8+
*/
9+
#include <string.h>
10+
#include "perf_counters.h"
11+
12+
#ifdef BENCH_PROFILE
/* Head of the singly linked counter registry.  A null head means no counter
 * has been registered yet.  Only present in BENCH_PROFILE builds. */
static perf_counter_entry_t *perf_head = NULL;
#endif
15+
16+
/*
 * perf_counters_register - link `entry` into the counter registry.
 *
 * New entries are pushed onto the front of the list.  Passing a null
 * pointer or an entry that is already linked is a no-op, so the call is
 * idempotent.  In builds without BENCH_PROFILE the function does nothing.
 */
void perf_counters_register(perf_counter_entry_t *entry)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *e;

    if (!entry || entry->next)
        return; /* null, or already linked ahead of the tail */

    /* The tail of the list is linked but still has next == NULL, so the
     * check above cannot see it.  Linking it a second time would make the
     * list circular and hang every traversal, so look for it explicitly. */
    for (e = perf_head; e; e = e->next)
        if (e == entry)
            return;

    entry->next = perf_head;
    perf_head = entry;
#else
    (void)entry;
#endif
}
27+
28+
/*
 * perf_counters_reset - set the value of every registered counter to zero.
 *
 * The registry itself is left untouched; only the counter values are
 * cleared.  Compiles to an empty function without BENCH_PROFILE.
 */
void perf_counters_reset(void)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *node = perf_head;

    while (node) {
        *node->value = 0;
        node = node->next;
    }
#endif
}
36+
37+
/*
 * perf_counters_dump - print every registered counter, one per line.
 *
 * out: destination stream; a null pointer selects stderr.
 *
 * Counters are printed in registry order (most recently registered first).
 * When nothing is registered a single "no counters registered" line is
 * written instead.  Compiles to an empty function without BENCH_PROFILE.
 */
void perf_counters_dump(FILE *out)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *e;

    if (!out)
        out = stderr;
    if (!perf_head) {
        fprintf(out, "[perf] no counters registered\n");
        return;
    }
    fprintf(out, "[perf] counter dump:\n");
    for (e = perf_head; e; e = e->next) {
        /* perf_counters_find() tolerates entries without a name; do the
         * same here, because passing a null pointer to %s is undefined. */
        const char *label = e->name ? e->name : "(unnamed)";
        fprintf(out, "[perf] %-40s %llu\n", label, *e->value);
    }
#else
    (void)out;
#endif
}
54+
55+
/*
 * perf_counters_find - look up a registered counter by name.
 *
 * name: NUL-terminated counter name; a null pointer never matches.
 *
 * Returns a pointer to the value of the first registry entry whose name
 * compares equal to `name`, or a null pointer when there is no match.
 * Entries without a name are skipped.  Without BENCH_PROFILE the result
 * is always a null pointer.
 */
unsigned long long *perf_counters_find(const char *name)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *node;

    if (name) {
        for (node = perf_head; node; node = node->next) {
            if (!node->name)
                continue;
            if (strcmp(node->name, name) == 0)
                return node->value;
        }
    }
#else
    (void)name;
#endif
    return NULL;
}

0 commit comments

Comments
 (0)