Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,22 @@ ifeq ($(DEBUG),1)
CFLAGS += -DBUILD_TIMESTAMP="\"debug $(shell date -u +%Y-%m-%dT%H:%M:%SZ)\""
endif

# Opt-in instrumentation counters (src/core/perf_counters.h).
# `make BENCH_PROFILE=1` adds -DBENCH_PROFILE to CFLAGS so PERF_COUNTER/PERF_INC
# emit real code; otherwise every counter macro is a no-op.
# Only the exact value `1` enables it; any other value (or leaving it unset)
# keeps the default, uninstrumented build.
# NOTE(review): objects presumably do not depend on CFLAGS, so flipping this
# on an already-built tree may need a clean rebuild -- confirm.
ifeq ($(BENCH_PROFILE),1)
CFLAGS += -DBENCH_PROFILE
endif

# Per-blit slow-path tracing in BlitterMidsummer2.
# `make BLITTER_TRACE=1` adds -DBLITTER_TRACE to CFLAGS, which enables an
# stderr dump of any single blit whose wall time exceeds ~1.5 ms
# (configurable via the threshold in src/tom/blitter.c). Useful for finding
# pathological blit commands that dominate frame-time variance.
# macOS-only (uses mach_*). Only the exact value `1` enables it.
# NOTE(review): nothing here restricts the flag to macOS hosts, so enabling
# it elsewhere presumably fails to compile -- confirm.
ifeq ($(BLITTER_TRACE),1)
CFLAGS += -DBLITTER_TRACE
endif

# Symbol export gating.
#
# GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
Expand Down Expand Up @@ -869,17 +885,25 @@ BENCH_ROM ?= test/roms/yarc.j64
BENCH_FRAMES ?= 600
BENCH_WARMUP ?= 60
BENCH_BLITTER ?= fast
benchmark: $(TARGET)
@# Build the harness inline so this works whether or not TEST_EXPORTS=1
@# was used for $(TARGET); the harness only uses retro_* exports.
@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc
@# (and Apple's clang silently accepts -ldl as a no-op, but other
@# linkers may not).
# BENCH_PROFILE=1 enables src/core/perf_counters.h instrumentation and
# wide-export ABI so test_benchmark can dlsym `perf_counters_dump`.
#
# BENCH_TEST_EXPORTS holds a ready-made command-line assignment
# (`TEST_EXPORTS=1`) that the `benchmark` recipe splices into its recursive
# $(MAKE) invocation; it is empty when BENCH_PROFILE is anything other than
# the exact value `1`, so the sub-make then gets no TEST_EXPORTS override.
ifeq ($(BENCH_PROFILE),1)
BENCH_TEST_EXPORTS := TEST_EXPORTS=1
else
BENCH_TEST_EXPORTS :=
endif
benchmark:
@# Re-invoke make so BENCH_PROFILE / TEST_EXPORTS take effect on the .so/.dylib.
$(MAKE) $(BENCH_TEST_EXPORTS) BENCH_PROFILE=$(BENCH_PROFILE) -j$(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
@# Build the harness inline; it dlopens the core, so it only needs the retro_* ABI
@# (plus the optional perf_counters_dump symbol when BENCH_PROFILE=1).
@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc.
$(CC) -O2 -Wall -std=c99 $(INCFLAGS) \
-o test/tools/test_benchmark test/tools/test_benchmark.c \
$(if $(filter Linux,$(shell uname -s)),-ldl)
./test/tools/test_benchmark ./$(TARGET) "$(BENCH_ROM)" $(BENCH_FRAMES) \
--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER)
--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER) \
$(if $(BENCH_STATE),--load-state "$(BENCH_STATE)")

print-%:
@echo '$*=$($*)'
Expand Down
1 change: 1 addition & 0 deletions Makefile.common
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ SOURCES_C := \
$(CORE_DIR)/src/cd/cdrom.c \
$(CORE_DIR)/src/core/cheat.c \
$(CORE_DIR)/src/core/crc32.c \
$(CORE_DIR)/src/core/perf_counters.c \
$(CORE_DIR)/src/core/event.c \
$(CORE_DIR)/src/jerry/eeprom.c \
$(CORE_DIR)/src/core/filedb.c \
Expand Down
63 changes: 62 additions & 1 deletion docs/profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,21 @@ Reports `Frames/sec`, `Time/frame`, total wall time. Boots the core via `dlopen

**Instruments (Time Profiler)** is the easiest way to get a flame graph on macOS.

The wrapper at `scripts/profile-mac.sh` builds the core, runs the benchmark
under `xctrace`, and writes a `.trace` bundle you can open in Instruments:

```bash
scripts/profile-mac.sh # default: Time Profiler, accurate blitter
scripts/profile-mac.sh --template "CPU Counters" # PMU: cycles, instructions, branch misses
scripts/profile-mac.sh --rom test/roms/yarc.j64 --open # auto-open the trace
```

Manual invocation if you'd rather attach to a running process:

```bash
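make benchmark BENCH_FRAMES=6000 BENCH_WARMUP=120 &
# `$!` is the PID of `make`, not of the harness it launches, so wait for
# test_benchmark to start and look up its PID instead.
until BENCH_PID=$(pgrep -n -x test_benchmark); do sleep 0.2; done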

# Sample for 30 seconds, output to .trace bundle
xcrun xctrace record --template "Time Profiler" --attach $BENCH_PID --output bench.trace --time-limit 30s
open bench.trace
```
Expand All @@ -41,6 +51,57 @@ sample $BENCH_PID 5 -file /tmp/sample.txt
# 5-second sample. Read /tmp/sample.txt for collapsed call stacks.
```

## Bespoke counters — `BENCH_PROFILE=1`

Sampling profilers tell you *where* time goes; counters tell you *how often*
something happens. When you want exact iteration counts (e.g., "did my
fast-path actually skip the inner loop?"), use the `perf_counters` system in
`src/core/perf_counters.h`.

```bash
make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate BENCH_FRAMES=300
# ...
# [perf] counter dump:
# [perf] blitter_phrase_writes 3034993
# [perf] blitter_phrase_reads 931821
# [perf] blitter_inner_io 3966814
# [perf] blitter_inner 4131151
# [perf] blitter_outer 337722
# [perf] blitter_calls 131628
```

The macros are zero-overhead when `BENCH_PROFILE` is undefined (default
build) — every `PERF_INC` becomes `((void)0)`, every `PERF_COUNTER`
becomes a typedef. Use them freely in hot paths to instrument
hypotheses.

Adding a counter:

```c
#include "perf_counters.h"

PERF_COUNTER(my_event); /* file scope */

void hot(void) {
PERF_INC(my_event); /* in-loop */
PERF_ADD(my_event, n); /* batch */
}
```

The harness (`test/tools/test_benchmark.c`) calls
`perf_counters_dump(stderr)` at exit; counter values appear right
before the `BENCHMARK RESULTS` block.

When to reach for this vs. Time Profiler:

| Question | Tool |
|---|---|
| "Where are we spending cycles?" | `xctrace` Time Profiler |
| "How many times does the inner loop run per frame?" | `BENCH_PROFILE=1` |
| "What fraction of inner iterations are no-ops?" | `BENCH_PROFILE=1` |
| "Are we hitting L1 / branch-mispredicting?" | `xctrace` CPU Counters |
| "Did this optimization change behavior, not just timing?" | `BENCH_PROFILE=1` (deltas in counts) |

## Linux — `perf` + flamegraph

```bash
Expand Down
4 changes: 4 additions & 0 deletions exports-test.list
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@ _sclk
_smode
_lowerField
_vjs
_perf_counters_dump
_perf_counters_reset
_perf_counters_register
_perf_counters_find
4 changes: 4 additions & 0 deletions link-test.T
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,9 @@
smode;
lowerField;
vjs;
perf_counters_dump;
perf_counters_reset;
perf_counters_register;
perf_counters_find;
local: *;
};
81 changes: 81 additions & 0 deletions scripts/profile-mac.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env bash
#
# profile-mac.sh -- Run test_benchmark under Xcode Instruments on Apple Silicon
#                   (or any Mac with Xcode CLT).
#
# Usage:
#   scripts/profile-mac.sh [--template NAME] [--frames N] [--warmup N]
#                          [--blitter fast|accurate] [--rom PATH] [--open]
#
# Defaults:
#   template = "Time Profiler"
#   frames   = 600     warmup = 60     blitter = accurate
#   rom      = test/roms/yarc.j64
#   --open   = off; pass it to open the .trace bundle in Instruments when finished
#
# Common templates:
#   "Time Profiler"   -- where time is being spent (call tree / flame)
#   "CPU Counters"    -- Apple Silicon PMU (cycles, instr, branches, misses)
#   "System Trace"    -- syscalls, scheduler, VM events
#
set -euo pipefail

TEMPLATE="Time Profiler"
FRAMES=600
WARMUP=60
BLITTER=accurate
ROM="test/roms/yarc.j64"
OPEN_TRACE=0

# require_value OPTION ARGC
# Abort with a readable message when a value-taking option is the last word
# on the command line. Without this check, `set -u` makes the bare "$2"
# expansion die with a cryptic "unbound variable" error.
require_value() {
    if [ "$2" -lt 2 ]; then
        echo "Missing value for $1" >&2
        exit 2
    fi
}

while [ $# -gt 0 ]; do
    case "$1" in
        --template) require_value "$1" "$#"; TEMPLATE="$2"; shift 2 ;;
        --frames)   require_value "$1" "$#"; FRAMES="$2";   shift 2 ;;
        --warmup)   require_value "$1" "$#"; WARMUP="$2";   shift 2 ;;
        --blitter)  require_value "$1" "$#"; BLITTER="$2";  shift 2 ;;
        --rom)      require_value "$1" "$#"; ROM="$2";      shift 2 ;;
        --open)     OPEN_TRACE=1; shift ;;
        -h|--help)
            # Prints the header comment above; keep that block on lines 2-20
            # of this file or adjust the range here.
            sed -n '2,20p' "$0"
            exit 0 ;;
        *)
            echo "Unknown arg: $1" >&2
            exit 2 ;;
    esac
done

if ! command -v xctrace >/dev/null 2>&1; then
    echo "xctrace not found. Install Xcode Command Line Tools: xcode-select --install" >&2
    exit 1
fi

# Run everything from the repository root (the parent of scripts/), so the
# relative paths below -- including the default ROM -- resolve regardless of
# the caller's working directory.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

mkdir -p build
TRACE="build/profile-$(date +%Y%m%d-%H%M%S).trace"

# Make sure the core + harness are built (no BENCH_PROFILE; profiling
# instrumentation skews sampling results).
make -j"$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)" >/dev/null
cc -O2 -Wall -std=c99 -I. -I./libretro-common/include \
    -o test/tools/test_benchmark test/tools/test_benchmark.c

CORE="./virtualjaguar_libretro.dylib"
HARNESS="./test/tools/test_benchmark"

echo ">>> xctrace template: $TEMPLATE"
echo ">>> trace output: $TRACE"
echo ">>> rom / blitter: $ROM / $BLITTER"
echo ">>> frames (+warmup): $FRAMES (+$WARMUP)"

# Everything after `--` is the harness command line that xctrace launches.
xctrace record \
    --template "$TEMPLATE" \
    --output "$TRACE" \
    --launch -- "$HARNESS" "$CORE" "$ROM" "$FRAMES" \
    --warmup "$WARMUP" --blitter "$BLITTER"

echo ">>> trace written to $TRACE"
if [ "$OPEN_TRACE" = "1" ]; then
    open "$TRACE"
fi
67 changes: 67 additions & 0 deletions src/core/perf_counters.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* perf_counters.c - registry + dump for opt-in instrumentation counters.
*
* The register/dump/reset functions are *always* defined so they can be
* exported through the test ABI without conditional linker scripts.
* In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
* calls perf_counters_register, so the registry stays empty.
*/
#include <string.h>
#include "perf_counters.h"

#ifdef BENCH_PROFILE
/* Head of the singly linked registry; new counters are pushed on the front. */
static perf_counter_entry_t *perf_head = (perf_counter_entry_t *)0;
/* Last node of the registry. Registration only ever prepends, so this is the
 * first entry that was registered and it never changes afterwards. */
static perf_counter_entry_t *perf_tail = (perf_counter_entry_t *)0;
#endif

/*
 * perf_counters_register - link ENTRY into the global counter registry.
 *
 * A NULL entry is ignored. Registering an entry that is already linked is a
 * no-op: an entry counts as linked when its `next` pointer is set, or when it
 * is the registry tail. The tail check matters because the tail's `next` is
 * NULL, so a `next`-only test would relink it and turn the list into a cycle.
 *
 * Callers are expected to hand in entries whose `next` starts out NULL.
 * In !BENCH_PROFILE builds this function does nothing.
 */
void perf_counters_register(perf_counter_entry_t *entry)
{
#ifdef BENCH_PROFILE
    if (!entry)
        return;
    if (entry->next || entry == perf_tail)
        return; /* already linked */
    if (!perf_head)
        perf_tail = entry; /* first registration becomes the permanent tail */
    entry->next = perf_head;
    perf_head = entry;
#else
    (void)entry;
#endif
}

/*
 * perf_counters_reset - zero the value of every registered counter.
 *
 * Walks the registry from its head and stores 0 through each entry's value
 * pointer. In !BENCH_PROFILE builds this function does nothing.
 */
void perf_counters_reset(void)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *node = perf_head;

    while (node) {
        *node->value = 0;
        node = node->next;
    }
#endif
}

/*
 * perf_counters_dump - print every registered counter to OUT.
 *
 * OUT may be NULL, in which case stderr is used. When the registry is empty a
 * single "no counters registered" line is printed. Otherwise a header line is
 * followed by one "name value" line per counter, in registry order (most
 * recently registered first, because registration prepends).
 *
 * An entry with a NULL name is printed as "(unnamed)" rather than handing a
 * NULL pointer to %s, matching the NULL-name tolerance of
 * perf_counters_find(). In !BENCH_PROFILE builds this function does nothing.
 */
void perf_counters_dump(FILE *out)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *e;
    if (!out)
        out = stderr;
    if (!perf_head) {
        fprintf(out, "[perf] no counters registered\n");
        return;
    }
    fprintf(out, "[perf] counter dump:\n");
    for (e = perf_head; e; e = e->next)
        fprintf(out, "[perf] %-40s %llu\n",
                e->name ? e->name : "(unnamed)", *e->value);
#else
    (void)out;
#endif
}

/*
 * perf_counters_find - look up a registered counter by name.
 *
 * Returns the value pointer of the first registry entry whose name compares
 * equal to NAME, or NULL when NAME is NULL or no entry matches. Entries with
 * a NULL name are skipped. In !BENCH_PROFILE builds the result is always
 * NULL.
 */
unsigned long long *perf_counters_find(const char *name)
{
#ifdef BENCH_PROFILE
    perf_counter_entry_t *node;

    if (name) {
        for (node = perf_head; node; node = node->next) {
            if (!node->name)
                continue;
            if (strcmp(node->name, name) == 0)
                return node->value;
        }
    }
#else
    (void)name;
#endif
    return (unsigned long long *)0;
}
Loading
Loading