Skip to content

Commit 0bc0441

Browse files
JoeMatt and claude committed
Address PR #101 review: portable timer, cross-compile override, cleaner intrinsics
- Makefile.common: add BLITTER_SIMD=scalar|sse2|neon override for cross-compilation
- blitter_simd.h: clarify byte_merge mask bit semantics in comment
- blitter_simd_sse2.c: replace stack array + memcpy with direct bit ops in byte_merge
- blitter_simd_neon.c: replace static const array with inline vcreate_u8
- test_blitter_simd.c: portable TIMER_DECL/START/STOP/NS macros (POSIX + Windows), fix build instructions (link one impl, not both)

Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent e46f527 commit 0bc0441

5 files changed

Lines changed: 99 additions & 73 deletions

File tree

Makefile.common

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,30 @@ SOURCES_C := \
4949
$(CORE_DIR)/src/wavetable.c
5050

5151
# SIMD-accelerated blitter operations: select arch-specific implementation.
52+
# Override with BLITTER_SIMD=scalar|sse2|neon for cross-compilation.
5253
# Default to portable scalar fallback.
5354
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
5455

55-
# x86/x64: use SSE2 (baseline for all x86_64, available since Pentium 4)
56-
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
57-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
58-
endif
59-
# MSYS2/MinGW on x86_64
60-
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
61-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
56+
ifdef BLITTER_SIMD
57+
# Explicit override for cross-compilation or custom builds
58+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_$(BLITTER_SIMD).c
59+
else
60+
# Auto-detect from host architecture (native builds only)
61+
# x86/x64: use SSE2 (baseline for all x86_64, available since Pentium 4)
62+
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
63+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
64+
endif
65+
# MSYS2/MinGW on x86_64
66+
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
67+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
68+
endif
69+
# ARM64 (AArch64): NEON is always available
70+
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
71+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
72+
endif
6273
endif
6374

64-
# ARM64 (AArch64): NEON is always available
65-
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
66-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
67-
endif
68-
# iOS/tvOS ARM64 cross-compilation
75+
# Platform-based overrides (cross-compilation targets)
6976
ifneq (,$(filter ios-arm64 tvos-arm64,$(platform)))
7077
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
7178
endif

src/blitter_simd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ typedef struct
3030
uint8_t (*zcomp)(uint64_t srcz, uint64_t dstz, uint8_t zmode);
3131

3232
/* Byte Mask Merge: select bytes from src or dst based on 16-bit mask.
33-
* Bit 0 controls byte 0 (per-bit within byte 0), bits 8-14 control bytes 1-7.
33+
* Bits 0-7 control byte 0 (per-bit blend within the lowest byte).
34+
* Bits 8-14 control bytes 1-7 (whole-byte select, one bit each).
3435
* Used for both pixel data (ddat/dstd) and Z data (srcz/dstz). */
3536
uint64_t (*byte_merge)(uint64_t src, uint64_t dst, uint16_t mask);
3637
} blitter_simd_ops_t;

src/blitter_simd_neon.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,15 @@ static uint8_t neon_dcomp(uint64_t patd, uint64_t srcd, uint64_t dstd, bool cmpd
5454
uint8x8_t vzero = vdup_n_u8(0);
5555
uint8x8_t vcmp = vceq_u8(vxor, vzero); /* 0xFF where equal, 0 otherwise */
5656

57-
/* Extract one bit per byte.
58-
* Multiply each lane by a power-of-2 weight and horizontal add.
59-
* Weights: 1, 2, 4, 8, 16, 32, 64, 128 */
60-
static const uint8_t weights[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
61-
uint8x8_t vw = vld1_u8(weights);
62-
63-
/* AND with weights (0xFF & weight = weight, 0 & weight = 0) */
57+
/* Extract one bit per byte using power-of-2 weights.
58+
* vcreate avoids a static const load on every call. */
59+
uint8x8_t vw = vcreate_u8(0x8040201008040201ULL);
6460
uint8x8_t vbits = vand_u8(vcmp, vw);
6561

66-
/* Pairwise add to collapse 8 bytes -> 4 -> 2 -> 1 */
67-
uint8x8_t sum1 = vpadd_u8(vbits, vbits); /* 4 sums */
68-
uint8x8_t sum2 = vpadd_u8(sum1, sum1); /* 2 sums */
69-
uint8x8_t sum3 = vpadd_u8(sum2, sum2); /* 1 sum */
62+
/* Pairwise horizontal add: 8 -> 4 -> 2 -> 1 */
63+
uint8x8_t sum1 = vpadd_u8(vbits, vbits);
64+
uint8x8_t sum2 = vpadd_u8(sum1, sum1);
65+
uint8x8_t sum3 = vpadd_u8(sum2, sum2);
7066

7167
return vget_lane_u8(sum3, 0);
7268
}

src/blitter_simd_sse2.c

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -124,21 +124,18 @@ static uint8_t sse2_zcomp(uint64_t srcz, uint64_t dstz, uint8_t zmode)
124124
*/
125125
static uint64_t sse2_byte_merge(uint64_t src, uint64_t dst, uint16_t mask)
126126
{
127-
/* Build an 8-byte selection mask:
128-
* byte 0 = low 8 bits of mask (per-bit)
129-
* bytes 1-7 = 0xFF or 0x00 based on mask bits 8-14 */
130-
uint8_t sel[8];
131-
sel[0] = (uint8_t)(mask & 0xFF);
132-
sel[1] = (mask & 0x0100) ? 0xFF : 0x00;
133-
sel[2] = (mask & 0x0200) ? 0xFF : 0x00;
134-
sel[3] = (mask & 0x0400) ? 0xFF : 0x00;
135-
sel[4] = (mask & 0x0800) ? 0xFF : 0x00;
136-
sel[5] = (mask & 0x1000) ? 0xFF : 0x00;
137-
sel[6] = (mask & 0x2000) ? 0xFF : 0x00;
138-
sel[7] = (mask & 0x4000) ? 0xFF : 0x00;
139-
140-
uint64_t sel64;
141-
__builtin_memcpy(&sel64, sel, 8);
127+
/* Build an 8-byte selection mask directly via bit arithmetic.
128+
* Byte 0 = low 8 bits of mask (per-bit blend).
129+
* Bytes 1-7 = 0xFF or 0x00 from mask bits 8-14 (whole-byte select).
130+
* We expand each bit to a full 0xFF byte using sign-extension. */
131+
uint64_t sel64 = (uint64_t)(mask & 0xFF); /* byte 0: per-bit */
132+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 8) & 1))) << 8;
133+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 9) & 1))) << 16;
134+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 10) & 1))) << 24;
135+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 11) & 1))) << 32;
136+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 12) & 1))) << 40;
137+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 13) & 1))) << 48;
138+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 14) & 1))) << 56;
142139

143140
__m128i vmask = _mm_set_epi64x(0, (int64_t)sel64);
144141
__m128i vsrc = _mm_set_epi64x(0, (int64_t)src);

test/test_blitter_simd.c

Lines changed: 59 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
/*
22
* Bit-exactness and performance test for blitter SIMD operations.
33
*
4-
* Build (from repo root):
4+
* Build (from repo root — link exactly one SIMD implementation):
55
* # On macOS ARM64 (NEON):
66
* cc -O2 -o test/test_blitter_simd test/test_blitter_simd.c \
7-
* src/blitter_simd_neon.c src/blitter_simd_scalar.c
7+
* src/blitter_simd_neon.c
88
*
99
* # On x86_64 (SSE2):
1010
* cc -O2 -msse2 -o test/test_blitter_simd test/test_blitter_simd.c \
11-
* src/blitter_simd_sse2.c src/blitter_simd_scalar.c
11+
* src/blitter_simd_sse2.c
1212
*
1313
* # Scalar-only (any platform):
1414
* cc -O2 -o test/test_blitter_simd test/test_blitter_simd.c \
@@ -297,31 +297,51 @@ static void test_byte_merge(void)
297297

298298
#define BENCH_ITERS 1000000
299299

300-
static double elapsed_ns(struct timespec start, struct timespec end)
300+
/* Portable high-resolution timer.
301+
* Uses clock_gettime on POSIX, QueryPerformanceCounter on Windows.
302+
* Declare TIMER_DECL() once per scope, then use START/STOP/NS freely. */
303+
#ifdef _WIN32
304+
#include <windows.h>
305+
static double get_time_ns(void)
301306
{
302-
return (double)(end.tv_sec - start.tv_sec) * 1e9
303-
+ (double)(end.tv_nsec - start.tv_nsec);
307+
static LARGE_INTEGER freq = {0};
308+
LARGE_INTEGER count;
309+
if (freq.QuadPart == 0)
310+
QueryPerformanceFrequency(&freq);
311+
QueryPerformanceCounter(&count);
312+
return (double)count.QuadPart / (double)freq.QuadPart * 1e9;
304313
}
314+
#define TIMER_DECL() double _timer_t0, _timer_t1
315+
#define TIMER_START() (_timer_t0 = get_time_ns())
316+
#define TIMER_STOP() (_timer_t1 = get_time_ns())
317+
#define TIMER_NS() (_timer_t1 - _timer_t0)
318+
#else
319+
#define TIMER_DECL() struct timespec _timer_ts0, _timer_ts1
320+
#define TIMER_START() clock_gettime(CLOCK_MONOTONIC, &_timer_ts0)
321+
#define TIMER_STOP() clock_gettime(CLOCK_MONOTONIC, &_timer_ts1)
322+
#define TIMER_NS() (((double)(_timer_ts1.tv_sec - _timer_ts0.tv_sec) * 1e9) + (double)(_timer_ts1.tv_nsec - _timer_ts0.tv_nsec))
323+
#endif
305324

306325
static void bench_lfu(void)
307326
{
308-
struct timespec t0, t1;
327+
TIMER_DECL();
309328
volatile uint64_t sink = 0;
310329
int i;
330+
double ref_ns, simd_ns;
311331

312332
/* Ref */
313-
clock_gettime(CLOCK_MONOTONIC, &t0);
333+
TIMER_START();
314334
for (i = 0; i < BENCH_ITERS; i++)
315335
sink += ref_lfu(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint8_t)(i & 0x0F));
316-
clock_gettime(CLOCK_MONOTONIC, &t1);
317-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
336+
TIMER_STOP();
337+
ref_ns = TIMER_NS() / BENCH_ITERS;
318338

319339
/* SIMD */
320-
clock_gettime(CLOCK_MONOTONIC, &t0);
340+
TIMER_START();
321341
for (i = 0; i < BENCH_ITERS; i++)
322342
sink += blitter_simd_ops.lfu(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint8_t)(i & 0x0F));
323-
clock_gettime(CLOCK_MONOTONIC, &t1);
324-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
343+
TIMER_STOP();
344+
simd_ns = TIMER_NS() / BENCH_ITERS;
325345

326346
printf(" LFU: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
327347
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -330,21 +350,22 @@ static void bench_lfu(void)
330350

331351
static void bench_dcomp(void)
332352
{
333-
struct timespec t0, t1;
353+
TIMER_DECL();
334354
volatile uint8_t sink = 0;
335355
int i;
356+
double ref_ns, simd_ns;
336357

337-
clock_gettime(CLOCK_MONOTONIC, &t0);
358+
TIMER_START();
338359
for (i = 0; i < BENCH_ITERS; i++)
339360
sink += ref_dcomp(0x0102030405060708ULL, (uint64_t)i, 0, false);
340-
clock_gettime(CLOCK_MONOTONIC, &t1);
341-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
361+
TIMER_STOP();
362+
ref_ns = TIMER_NS() / BENCH_ITERS;
342363

343-
clock_gettime(CLOCK_MONOTONIC, &t0);
364+
TIMER_START();
344365
for (i = 0; i < BENCH_ITERS; i++)
345366
sink += blitter_simd_ops.dcomp(0x0102030405060708ULL, (uint64_t)i, 0, false);
346-
clock_gettime(CLOCK_MONOTONIC, &t1);
347-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
367+
TIMER_STOP();
368+
simd_ns = TIMER_NS() / BENCH_ITERS;
348369

349370
printf(" DCOMP: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
350371
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -353,21 +374,22 @@ static void bench_dcomp(void)
353374

354375
static void bench_zcomp(void)
355376
{
356-
struct timespec t0, t1;
377+
TIMER_DECL();
357378
volatile uint8_t sink = 0;
358379
int i;
380+
double ref_ns, simd_ns;
359381

360-
clock_gettime(CLOCK_MONOTONIC, &t0);
382+
TIMER_START();
361383
for (i = 0; i < BENCH_ITERS; i++)
362384
sink += ref_zcomp(0x0001000200030004ULL, 0x0002000200020002ULL, (uint8_t)(i & 0x07));
363-
clock_gettime(CLOCK_MONOTONIC, &t1);
364-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
385+
TIMER_STOP();
386+
ref_ns = TIMER_NS() / BENCH_ITERS;
365387

366-
clock_gettime(CLOCK_MONOTONIC, &t0);
388+
TIMER_START();
367389
for (i = 0; i < BENCH_ITERS; i++)
368390
sink += blitter_simd_ops.zcomp(0x0001000200030004ULL, 0x0002000200020002ULL, (uint8_t)(i & 0x07));
369-
clock_gettime(CLOCK_MONOTONIC, &t1);
370-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
391+
TIMER_STOP();
392+
simd_ns = TIMER_NS() / BENCH_ITERS;
371393

372394
printf(" ZCOMP: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
373395
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -376,21 +398,22 @@ static void bench_zcomp(void)
376398

377399
static void bench_byte_merge(void)
378400
{
379-
struct timespec t0, t1;
401+
TIMER_DECL();
380402
volatile uint64_t sink = 0;
381403
int i;
404+
double ref_ns, simd_ns;
382405

383-
clock_gettime(CLOCK_MONOTONIC, &t0);
406+
TIMER_START();
384407
for (i = 0; i < BENCH_ITERS; i++)
385408
sink += ref_byte_merge(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint16_t)(i & 0x7FFF));
386-
clock_gettime(CLOCK_MONOTONIC, &t1);
387-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
409+
TIMER_STOP();
410+
ref_ns = TIMER_NS() / BENCH_ITERS;
388411

389-
clock_gettime(CLOCK_MONOTONIC, &t0);
412+
TIMER_START();
390413
for (i = 0; i < BENCH_ITERS; i++)
391414
sink += blitter_simd_ops.byte_merge(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint16_t)(i & 0x7FFF));
392-
clock_gettime(CLOCK_MONOTONIC, &t1);
393-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
415+
TIMER_STOP();
416+
simd_ns = TIMER_NS() / BENCH_ITERS;
394417

395418
printf(" byte_merge: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
396419
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -424,3 +447,5 @@ int main(int argc, char *argv[])
424447

425448
return failures > 0 ? 1 : 0;
426449
}
450+
? 1 : 0;
451+
}

0 commit comments

Comments (0)