Skip to content

Commit 28b56ce

Browse files
JoeMattclaude
andcommitted
perf: add reusable perf_counters.h instrumentation system
Generalizes the ad-hoc BLITTER_PROFILE pattern into a reusable, zero-overhead-when-off counter system any subsystem can use. * src/core/perf_counters.{h,c} - PERF_COUNTER / PERF_INC / PERF_ADD macros backed by a constructor-registered linked list. When BENCH_PROFILE is undefined every macro expands to (0) so there is no runtime, code-size, or symbol cost in shipped builds. * src/tom/blitter.c - migrate the existing BLITTER_PROFILE counters in BlitterMidsummer2 onto the new system. Counters are embedded in existing initializers via the comma operator so the file stays C89-clean (no statements before declarations). * Makefile - `make BENCH_PROFILE=1` defines the macro globally. `make benchmark BENCH_PROFILE=1` re-invokes with TEST_EXPORTS=1 so test_benchmark can dlsym `perf_counters_dump` and print all registered counters next to the FPS report. * test/tools/test_benchmark.c - dlsym the optional dump symbol; if present (BENCH_PROFILE build), call it before the BENCHMARK RESULTS block. No effect on default builds. * exports-test.list / link-test.T - add perf_counters_{dump,reset, register} so harnesses can reach them under TEST_EXPORTS=1. * scripts/profile-mac.sh - one-line wrapper around `xctrace record`. Defaults to Time Profiler; --template "CPU Counters" for PMU events on Apple Silicon. Builds the core + harness, runs the benchmark under instrumentation, writes a .trace bundle, and can auto-open it with --open. * docs/profiling.md - new sections covering BENCH_PROFILE counters (when to use vs sampling profilers) and the profile-mac.sh wrapper. Validated end-to-end with `make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate`: counters populate (blitter_inner=283994 over 120 frames of yarc), and default builds remain unchanged. Co-Authored-By: Claude Opus 4.7 <[email protected]>
1 parent 662b13f commit 28b56ce

10 files changed

Lines changed: 349 additions & 10 deletions

File tree

Makefile

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ ifeq ($(DEBUG),1)
5353
CFLAGS += -DBUILD_TIMESTAMP="\"debug $(shell date -u +%Y-%m-%dT%H:%M:%SZ)\""
5454
endif
5555

56+
# Opt-in instrumentation counters (src/core/perf_counters.h).
57+
# `make BENCH_PROFILE=1` defines the macro so PERF_COUNTER/PERF_INC
58+
# emit real code; otherwise every counter macro is a no-op.
59+
ifeq ($(BENCH_PROFILE),1)
60+
CFLAGS += -DBENCH_PROFILE
61+
endif
62+
5663
# Symbol export gating.
5764
#
5865
# GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
@@ -869,12 +876,19 @@ BENCH_ROM ?= test/roms/yarc.j64
869876
BENCH_FRAMES ?= 600
870877
BENCH_WARMUP ?= 60
871878
BENCH_BLITTER ?= fast
872-
benchmark: $(TARGET)
873-
@# Build the harness inline so this works whether or not TEST_EXPORTS=1
874-
@# was used for $(TARGET); the harness only uses retro_* exports.
875-
@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc
876-
@# (and Apple's clang silently accepts -ldl as a no-op, but other
877-
@# linkers may not).
879+
# BENCH_PROFILE=1 enables src/core/perf_counters.h instrumentation and
880+
# wide-export ABI so test_benchmark can dlsym `perf_counters_dump`.
881+
ifeq ($(BENCH_PROFILE),1)
882+
BENCH_TEST_EXPORTS := TEST_EXPORTS=1
883+
else
884+
BENCH_TEST_EXPORTS :=
885+
endif
886+
benchmark:
887+
@# Re-invoke make so BENCH_PROFILE / TEST_EXPORTS take effect on the .so/.dylib.
888+
$(MAKE) $(BENCH_TEST_EXPORTS) BENCH_PROFILE=$(BENCH_PROFILE) -j$(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
889+
@# Build the harness inline; it dlopens the core, so it only needs the retro_* ABI
890+
@# (plus the optional perf_counters_dump symbol when BENCH_PROFILE=1).
891+
@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc.
878892
$(CC) -O2 -Wall -std=c99 $(INCFLAGS) \
879893
-o test/tools/test_benchmark test/tools/test_benchmark.c \
880894
$(if $(filter Linux,$(shell uname -s)),-ldl)

Makefile.common

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ SOURCES_C := \
3333
$(CORE_DIR)/src/cd/cdrom.c \
3434
$(CORE_DIR)/src/core/cheat.c \
3535
$(CORE_DIR)/src/core/crc32.c \
36+
$(CORE_DIR)/src/core/perf_counters.c \
3637
$(CORE_DIR)/src/core/event.c \
3738
$(CORE_DIR)/src/jerry/eeprom.c \
3839
$(CORE_DIR)/src/core/filedb.c \

docs/profiling.md

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,21 @@ Reports `Frames/sec`, `Time/frame`, total wall time. Boots the core via `dlopen
2323

2424
**Instruments (Time Profiler)** is the easiest way to get a flame graph on macOS.
2525

26+
The wrapper at `scripts/profile-mac.sh` builds the core, runs the benchmark
27+
under `xctrace`, and writes a `.trace` bundle you can open in Instruments:
28+
29+
```bash
30+
scripts/profile-mac.sh # default: Time Profiler, accurate blitter
31+
scripts/profile-mac.sh --template "CPU Counters" # PMU: cycles, instructions, branch misses
32+
scripts/profile-mac.sh --rom test/roms/yarc.j64 --open # auto-open the trace
33+
```
34+
35+
Manual invocation if you'd rather attach to a running process:
36+
2637
```bash
2738
make benchmark BENCH_FRAMES=6000 BENCH_WARMUP=120 &
2839
BENCH_PID=$!
2940

30-
# Sample for 30 seconds, output to .trace bundle
3141
xcrun xctrace record --template "Time Profiler" --attach $BENCH_PID --output bench.trace --time-limit 30s
3242
open bench.trace
3343
```
@@ -41,6 +51,57 @@ sample $BENCH_PID 5 -file /tmp/sample.txt
4151
# 5-second sample. Read /tmp/sample.txt for collapsed call stacks.
4252
```
4353

54+
## Bespoke counters — `BENCH_PROFILE=1`
55+
56+
Sampling profilers tell you *where* time goes; counters tell you *how often*
57+
something happens. When you want exact iteration counts (e.g., "did my
58+
fast-path actually skip the inner loop?"), use the `perf_counters` system in
59+
`src/core/perf_counters.h`.
60+
61+
```bash
62+
make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate BENCH_FRAMES=300
63+
# ...
64+
# [perf] counter dump:
65+
# [perf] blitter_phrase_writes 3034993
66+
# [perf] blitter_phrase_reads 931821
67+
# [perf] blitter_inner_io 3966814
68+
# [perf] blitter_inner 4131151
69+
# [perf] blitter_outer 337722
70+
# [perf] blitter_calls 131628
71+
```
72+
73+
The macros are zero-overhead when `BENCH_PROFILE` is undefined (default
74+
build) — every `PERF_INC` becomes the constant `(0)`, every `PERF_COUNTER`
75+
becomes a typedef. Use them freely in hot paths to instrument
76+
hypotheses.
77+
78+
Adding a counter:
79+
80+
```c
81+
#include "perf_counters.h"
82+
83+
PERF_COUNTER(my_event); /* file scope */
84+
85+
void hot(void) {
86+
PERF_INC(my_event); /* in-loop */
87+
PERF_ADD(my_event, n); /* batch */
88+
}
89+
```
90+
91+
The harness (`test/tools/test_benchmark.c`) looks up `perf_counters_dump` via
92+
`dlsym` and, when the symbol is present, calls `perf_counters_dump(stderr)`
93+
after the run; counter values appear right before the `BENCHMARK RESULTS` block.
94+
95+
When to reach for this vs. Time Profiler:
96+
97+
| Question | Tool |
98+
|---|---|
99+
| "Where are we spending cycles?" | `xctrace` Time Profiler |
100+
| "How many times does the inner loop run per frame?" | `BENCH_PROFILE=1` |
101+
| "What fraction of inner iterations are no-ops?" | `BENCH_PROFILE=1` |
102+
| "Are we hitting L1 / branch-mispredicting?" | `xctrace` CPU Counters |
103+
| "Did this optimization change behavior, not just timing?" | `BENCH_PROFILE=1` (deltas in counts) |
104+
44105
## Linux — `perf` + flamegraph
45106
46107
```bash

exports-test.list

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,6 @@ _sclk
3333
_smode
3434
_lowerField
3535
_vjs
36+
_perf_counters_dump
37+
_perf_counters_reset
38+
_perf_counters_register

link-test.T

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,8 @@
3636
smode;
3737
lowerField;
3838
vjs;
39+
perf_counters_dump;
40+
perf_counters_reset;
41+
perf_counters_register;
3942
local: *;
4043
};

scripts/profile-mac.sh

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env bash
2+
#
3+
# profile-mac.sh -- Run test_benchmark under Xcode Instruments on Apple Silicon
4+
# (or any Mac with Xcode CLT).
5+
#
6+
# Usage:
7+
# scripts/profile-mac.sh [--template NAME] [--frames N] [--warmup N]
8+
# [--blitter fast|accurate] [--rom PATH] [--open]
9+
#
10+
# Defaults:
11+
# template = "Time Profiler"
12+
# frames = 600 warmup = 60 blitter = accurate
13+
# rom = test/roms/yarc.j64
14+
# --open = open the .trace bundle in Instruments when finished
15+
#
16+
# Common templates:
17+
# "Time Profiler" -- where time is being spent (call tree / flame)
18+
# "CPU Counters" -- Apple Silicon PMU (cycles, instr, branches, misses)
19+
# "System Trace" -- syscalls, scheduler, VM events
20+
#
21+
set -euo pipefail
22+
23+
TEMPLATE="Time Profiler"
24+
FRAMES=600
25+
WARMUP=60
26+
BLITTER=accurate
27+
ROM="test/roms/yarc.j64"
28+
OPEN_TRACE=0
29+
30+
while [ $# -gt 0 ]; do
31+
case "$1" in
32+
--template) TEMPLATE="$2"; shift 2 ;;
33+
--frames) FRAMES="$2"; shift 2 ;;
34+
--warmup) WARMUP="$2"; shift 2 ;;
35+
--blitter) BLITTER="$2"; shift 2 ;;
36+
--rom) ROM="$2"; shift 2 ;;
37+
--open) OPEN_TRACE=1; shift ;;
38+
-h|--help)
39+
sed -n '2,20p' "$0"
40+
exit 0 ;;
41+
*)
42+
echo "Unknown arg: $1" >&2
43+
exit 2 ;;
44+
esac
45+
done
46+
47+
if ! command -v xctrace >/dev/null 2>&1; then
48+
echo "xctrace not found. Install Xcode Command Line Tools: xcode-select --install" >&2
49+
exit 1
50+
fi
51+
52+
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
53+
cd "$ROOT"
54+
55+
mkdir -p build
56+
TRACE="build/profile-$(date +%Y%m%d-%H%M%S).trace"
57+
58+
# Make sure the core + harness are built (no BENCH_PROFILE; profiling
59+
# instrumentation skews sampling results).
60+
make -j"$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)" >/dev/null
61+
cc -O2 -Wall -std=c99 -I. -I./libretro-common/include \
62+
-o test/tools/test_benchmark test/tools/test_benchmark.c
63+
64+
CORE="./virtualjaguar_libretro.dylib"
65+
HARNESS="./test/tools/test_benchmark"
66+
67+
echo ">>> xctrace template: $TEMPLATE"
68+
echo ">>> trace output: $TRACE"
69+
echo ">>> rom / blitter: $ROM / $BLITTER"
70+
echo ">>> frames (+warmup): $FRAMES (+$WARMUP)"
71+
72+
xctrace record \
73+
--template "$TEMPLATE" \
74+
--output "$TRACE" \
75+
--launch -- "$HARNESS" "$CORE" "$ROM" "$FRAMES" \
76+
--warmup "$WARMUP" --blitter "$BLITTER"
77+
78+
echo ">>> trace written to $TRACE"
79+
if [ "$OPEN_TRACE" = "1" ]; then
80+
open "$TRACE"
81+
fi

src/core/perf_counters.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* perf_counters.c - registry + dump for opt-in instrumentation counters.
3+
* This file is always part of the build, but its body is only compiled when
4+
* BENCH_PROFILE is defined; the header provides no-op stubs otherwise.
5+
*/
6+
#include "perf_counters.h"
7+
8+
#ifdef BENCH_PROFILE
9+
10+
static perf_counter_entry_t *perf_head = (perf_counter_entry_t *)0;
11+
12+
/*
 * perf_counters_register - push a counter entry onto the global registry.
 *
 * entry: node to link (normally the static node emitted by PERF_COUNTER and
 *        passed in from its constructor function). NULL is ignored.
 *
 * Registering the same entry more than once is a no-op. Checking
 * entry->next alone is not sufficient for that: the list tail (the first
 * counter ever registered) legitimately has next == NULL, so a second
 * registration of it would relink it in front of the head and create a
 * cycle that makes perf_counters_dump()/perf_counters_reset() spin forever.
 */
void perf_counters_register(perf_counter_entry_t *entry)
{
    perf_counter_entry_t *e;

    if (!entry || entry->next)
        return; /* NULL, or already linked somewhere before the tail */

    /* entry->next == NULL here, so if it is already linked it can only be
     * the tail; walk the list to rule that out before pushing. */
    for (e = perf_head; e; e = e->next) {
        if (e == entry)
            return; /* already linked (as the tail) */
    }

    entry->next = perf_head;
    perf_head = entry;
}
19+
20+
void perf_counters_reset(void)
21+
{
22+
perf_counter_entry_t *e;
23+
for (e = perf_head; e; e = e->next)
24+
*e->value = 0;
25+
}
26+
27+
void perf_counters_dump(FILE *out)
28+
{
29+
perf_counter_entry_t *e;
30+
if (!out)
31+
out = stderr;
32+
if (!perf_head) {
33+
fprintf(out, "[perf] no counters registered\n");
34+
return;
35+
}
36+
fprintf(out, "[perf] counter dump:\n");
37+
for (e = perf_head; e; e = e->next)
38+
fprintf(out, "[perf] %-40s %llu\n", e->name, *e->value);
39+
}
40+
41+
#endif /* BENCH_PROFILE */

src/core/perf_counters.h

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* perf_counters.h - lightweight, opt-in instrumentation counters.
3+
*
4+
* Define BENCH_PROFILE at compile time to enable. Otherwise PERF_INC/PERF_ADD
5+
* reduce to the integer constant 0 and PERF_COUNTER to an unused typedef, so there is no runtime, code-size, or symbol cost.
6+
*
7+
* Usage:
8+
*
9+
* #include "perf_counters.h"
10+
*
11+
* PERF_COUNTER(blitter_inner);
12+
* PERF_COUNTER(blitter_phrase_reads);
13+
*
14+
* void hot(void) {
15+
* PERF_INC(blitter_inner);
16+
* PERF_ADD(blitter_phrase_reads, 2);
17+
* }
18+
*
19+
* // Somewhere at shutdown (e.g., test harness atexit):
20+
* perf_counters_dump(stderr);
21+
*
22+
* Counters self-register via constructor functions, so PERF_COUNTER must
23+
* appear at file scope. Only one definition per name across the program.
24+
*
25+
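* C89-style: no designated initializers, no mid-block declarations. The enabled path does rely on `unsigned long long` and the GCC/Clang `__attribute__((constructor))` extension.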
26+
*/
27+
#ifndef VJ_PERF_COUNTERS_H
28+
#define VJ_PERF_COUNTERS_H
29+
30+
#include <stdio.h>
31+
32+
#ifdef __cplusplus
33+
extern "C" {
34+
#endif
35+
36+
#ifdef BENCH_PROFILE
37+
38+
typedef struct perf_counter_entry
39+
{
40+
const char *name;
41+
unsigned long long *value;
42+
struct perf_counter_entry *next;
43+
} perf_counter_entry_t;
44+
45+
void perf_counters_register(perf_counter_entry_t *entry);
46+
void perf_counters_dump(FILE *out);
47+
void perf_counters_reset(void);
48+
49+
#define PERF_COUNTER(name) \
50+
static unsigned long long perf_##name = 0; \
51+
static perf_counter_entry_t perf_entry_##name = \
52+
{ #name, &perf_##name, (perf_counter_entry_t *)0 }; \
53+
__attribute__((constructor)) \
54+
static void perf_register_##name(void) { \
55+
perf_counters_register(&perf_entry_##name); \
56+
} \
57+
typedef int perf_##name##_decl_semicolon_eater
58+
59+
/* PERF_INC / PERF_ADD are expressions of integer type (not statements),
60+
* so they can be embedded in declaration initializers via the comma
61+
* operator without violating C89's no-decl-after-statement rule:
62+
* uint32_t cmd = (PERF_INC(my_event), real_value());
63+
*/
64+
#define PERF_INC(name) (++perf_##name)
65+
#define PERF_ADD(name, n) (perf_##name += (unsigned long long)(n))
66+
67+
#else /* !BENCH_PROFILE */
68+
69+
#define PERF_COUNTER(name) typedef int perf_##name##_unused
70+
/* No-op forms remain expressions of integer type (not void) so callers
71+
* can use them inside comma operators without code changes. */
72+
#define PERF_INC(name) (0)
73+
#define PERF_ADD(name, n) ((void)(n), 0)
74+
75+
/* Stubs so callers don't need their own #ifdef around dump/reset. */
76+
static __inline void perf_counters_dump(FILE *out) { (void)out; }
77+
static __inline void perf_counters_reset(void) { }
78+
79+
#endif /* BENCH_PROFILE */
80+
81+
#ifdef __cplusplus
82+
}
83+
#endif
84+
85+
#endif /* VJ_PERF_COUNTERS_H */

0 commit comments

Comments
 (0)