Skip to content

Commit 0bc0441

Browse files
JoeMatt and claude committed
Address PR #101 review: portable timer, cross-compile override, cleaner intrinsics
- Makefile.common: add BLITTER_SIMD=scalar|sse2|neon override for cross-compilation
- blitter_simd.h: clarify byte_merge mask bit semantics in comment
- blitter_simd_sse2.c: replace stack array + memcpy with direct bit ops in byte_merge
- blitter_simd_neon.c: replace static const array with inline vcreate_u8
- test_blitter_simd.c: portable TIMER_DECL/START/STOP/NS macros (POSIX + Windows), fix build instructions (link one impl, not both)

Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent e46f527 commit 0bc0441

5 files changed

Lines changed: 99 additions & 73 deletions

File tree

Makefile.common

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,30 @@ SOURCES_C := \
4949
$(CORE_DIR)/src/wavetable.c
5050

5151
# SIMD-accelerated blitter operations: select arch-specific implementation.
52+
# Override with BLITTER_SIMD=scalar|sse2|neon for cross-compilation.
5253
# Default to portable scalar fallback.
5354
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
5455

55-
# x86/x64: use SSE2 (baseline for all x86_64, available since Pentium 4)
56-
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
57-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
58-
endif
59-
# MSYS2/MinGW on x86_64
60-
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
61-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
56+
ifdef BLITTER_SIMD
57+
# Explicit override for cross-compilation or custom builds
58+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_$(BLITTER_SIMD).c
59+
else
60+
# Auto-detect from host architecture (native builds only)
61+
# x86/x64: use SSE2 (baseline for all x86_64, available since Pentium 4)
62+
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
63+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
64+
endif
65+
# MSYS2/MinGW on x86_64
66+
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
67+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
68+
endif
69+
# ARM64 (AArch64): NEON is always available
70+
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
71+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
72+
endif
6273
endif
6374

64-
# ARM64 (AArch64): NEON is always available
65-
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
66-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
67-
endif
68-
# iOS/tvOS ARM64 cross-compilation
75+
# Platform-based overrides (cross-compilation targets)
6976
ifneq (,$(filter ios-arm64 tvos-arm64,$(platform)))
7077
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
7178
endif

src/blitter_simd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ typedef struct
3030
uint8_t (*zcomp)(uint64_t srcz, uint64_t dstz, uint8_t zmode);
3131

3232
/* Byte Mask Merge: select bytes from src or dst based on 16-bit mask.
33-
* Bit 0 controls byte 0 (per-bit within byte 0), bits 8-14 control bytes 1-7.
33+
* Bits 0-7 control byte 0 (per-bit blend within the lowest byte).
34+
* Bits 8-14 control bytes 1-7 (whole-byte select, one bit each).
3435
* Used for both pixel data (ddat/dstd) and Z data (srcz/dstz). */
3536
uint64_t (*byte_merge)(uint64_t src, uint64_t dst, uint16_t mask);
3637
} blitter_simd_ops_t;

src/blitter_simd_neon.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,15 @@ static uint8_t neon_dcomp(uint64_t patd, uint64_t srcd, uint64_t dstd, bool cmpd
5454
uint8x8_t vzero = vdup_n_u8(0);
5555
uint8x8_t vcmp = vceq_u8(vxor, vzero); /* 0xFF where equal, 0 otherwise */
5656

57-
/* Extract one bit per byte.
58-
* Multiply each lane by a power-of-2 weight and horizontal add.
59-
* Weights: 1, 2, 4, 8, 16, 32, 64, 128 */
60-
static const uint8_t weights[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
61-
uint8x8_t vw = vld1_u8(weights);
62-
63-
/* AND with weights (0xFF & weight = weight, 0 & weight = 0) */
57+
/* Extract one bit per byte using power-of-2 weights.
58+
* vcreate avoids a static const load on every call. */
59+
uint8x8_t vw = vcreate_u8(0x8040201008040201ULL);
6460
uint8x8_t vbits = vand_u8(vcmp, vw);
6561

66-
/* Pairwise add to collapse 8 bytes -> 4 -> 2 -> 1 */
67-
uint8x8_t sum1 = vpadd_u8(vbits, vbits); /* 4 sums */
68-
uint8x8_t sum2 = vpadd_u8(sum1, sum1); /* 2 sums */
69-
uint8x8_t sum3 = vpadd_u8(sum2, sum2); /* 1 sum */
62+
/* Pairwise horizontal add: 8 -> 4 -> 2 -> 1 */
63+
uint8x8_t sum1 = vpadd_u8(vbits, vbits);
64+
uint8x8_t sum2 = vpadd_u8(sum1, sum1);
65+
uint8x8_t sum3 = vpadd_u8(sum2, sum2);
7066

7167
return vget_lane_u8(sum3, 0);
7268
}

src/blitter_simd_sse2.c

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -124,21 +124,18 @@ static uint8_t sse2_zcomp(uint64_t srcz, uint64_t dstz, uint8_t zmode)
124124
*/
125125
static uint64_t sse2_byte_merge(uint64_t src, uint64_t dst, uint16_t mask)
126126
{
127-
/* Build an 8-byte selection mask:
128-
* byte 0 = low 8 bits of mask (per-bit)
129-
* bytes 1-7 = 0xFF or 0x00 based on mask bits 8-14 */
130-
uint8_t sel[8];
131-
sel[0] = (uint8_t)(mask & 0xFF);
132-
sel[1] = (mask & 0x0100) ? 0xFF : 0x00;
133-
sel[2] = (mask & 0x0200) ? 0xFF : 0x00;
134-
sel[3] = (mask & 0x0400) ? 0xFF : 0x00;
135-
sel[4] = (mask & 0x0800) ? 0xFF : 0x00;
136-
sel[5] = (mask & 0x1000) ? 0xFF : 0x00;
137-
sel[6] = (mask & 0x2000) ? 0xFF : 0x00;
138-
sel[7] = (mask & 0x4000) ? 0xFF : 0x00;
139-
140-
uint64_t sel64;
141-
__builtin_memcpy(&sel64, sel, 8);
127+
/* Build an 8-byte selection mask directly via bit arithmetic.
128+
* Byte 0 = low 8 bits of mask (per-bit blend).
129+
* Bytes 1-7 = 0xFF or 0x00 from mask bits 8-14 (whole-byte select).
130+
* We expand each bit to a full 0xFF byte using sign-extension. */
131+
uint64_t sel64 = (uint64_t)(mask & 0xFF); /* byte 0: per-bit */
132+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 8) & 1))) << 8;
133+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 9) & 1))) << 16;
134+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 10) & 1))) << 24;
135+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 11) & 1))) << 32;
136+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 12) & 1))) << 40;
137+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 13) & 1))) << 48;
138+
sel64 |= (uint64_t)((uint8_t)(-(int8_t)((mask >> 14) & 1))) << 56;
142139

143140
__m128i vmask = _mm_set_epi64x(0, (int64_t)sel64);
144141
__m128i vsrc = _mm_set_epi64x(0, (int64_t)src);

test/test_blitter_simd.c

Lines changed: 59 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
/*
22
* Bit-exactness and performance test for blitter SIMD operations.
33
*
4-
* Build (from repo root):
4+
* Build (from repo root — link exactly one SIMD implementation):
55
* # On macOS ARM64 (NEON):
66
* cc -O2 -o test/test_blitter_simd test/test_blitter_simd.c \
7-
* src/blitter_simd_neon.c src/blitter_simd_scalar.c
7+
* src/blitter_simd_neon.c
88
*
99
* # On x86_64 (SSE2):
1010
* cc -O2 -msse2 -o test/test_blitter_simd test/test_blitter_simd.c \
11-
* src/blitter_simd_sse2.c src/blitter_simd_scalar.c
11+
* src/blitter_simd_sse2.c
1212
*
1313
* # Scalar-only (any platform):
1414
* cc -O2 -o test/test_blitter_simd test/test_blitter_simd.c \
@@ -297,31 +297,51 @@ static void test_byte_merge(void)
297297

298298
#define BENCH_ITERS 1000000
299299

300-
static double elapsed_ns(struct timespec start, struct timespec end)
300+
/* Portable high-resolution timer.
301+
* Uses clock_gettime on POSIX, QueryPerformanceCounter on Windows.
302+
* Declare TIMER_DECL() once per scope, then use START/STOP/NS freely. */
303+
#ifdef _WIN32
304+
#include <windows.h>
305+
static double get_time_ns(void)
301306
{
302-
return (double)(end.tv_sec - start.tv_sec) * 1e9
303-
+ (double)(end.tv_nsec - start.tv_nsec);
307+
static LARGE_INTEGER freq = {0};
308+
LARGE_INTEGER count;
309+
if (freq.QuadPart == 0)
310+
QueryPerformanceFrequency(&freq);
311+
QueryPerformanceCounter(&count);
312+
return (double)count.QuadPart / (double)freq.QuadPart * 1e9;
304313
}
314+
#define TIMER_DECL() double _timer_t0, _timer_t1
315+
#define TIMER_START() (_timer_t0 = get_time_ns())
316+
#define TIMER_STOP() (_timer_t1 = get_time_ns())
317+
#define TIMER_NS() (_timer_t1 - _timer_t0)
318+
#else
319+
#define TIMER_DECL() struct timespec _timer_ts0, _timer_ts1
320+
#define TIMER_START() clock_gettime(CLOCK_MONOTONIC, &_timer_ts0)
321+
#define TIMER_STOP() clock_gettime(CLOCK_MONOTONIC, &_timer_ts1)
322+
#define TIMER_NS() (((double)(_timer_ts1.tv_sec - _timer_ts0.tv_sec) * 1e9) + (double)(_timer_ts1.tv_nsec - _timer_ts0.tv_nsec))
323+
#endif
305324

306325
static void bench_lfu(void)
307326
{
308-
struct timespec t0, t1;
327+
TIMER_DECL();
309328
volatile uint64_t sink = 0;
310329
int i;
330+
double ref_ns, simd_ns;
311331

312332
/* Ref */
313-
clock_gettime(CLOCK_MONOTONIC, &t0);
333+
TIMER_START();
314334
for (i = 0; i < BENCH_ITERS; i++)
315335
sink += ref_lfu(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint8_t)(i & 0x0F));
316-
clock_gettime(CLOCK_MONOTONIC, &t1);
317-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
336+
TIMER_STOP();
337+
ref_ns = TIMER_NS() / BENCH_ITERS;
318338

319339
/* SIMD */
320-
clock_gettime(CLOCK_MONOTONIC, &t0);
340+
TIMER_START();
321341
for (i = 0; i < BENCH_ITERS; i++)
322342
sink += blitter_simd_ops.lfu(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint8_t)(i & 0x0F));
323-
clock_gettime(CLOCK_MONOTONIC, &t1);
324-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
343+
TIMER_STOP();
344+
simd_ns = TIMER_NS() / BENCH_ITERS;
325345

326346
printf(" LFU: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
327347
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -330,21 +350,22 @@ static void bench_lfu(void)
330350

331351
static void bench_dcomp(void)
332352
{
333-
struct timespec t0, t1;
353+
TIMER_DECL();
334354
volatile uint8_t sink = 0;
335355
int i;
356+
double ref_ns, simd_ns;
336357

337-
clock_gettime(CLOCK_MONOTONIC, &t0);
358+
TIMER_START();
338359
for (i = 0; i < BENCH_ITERS; i++)
339360
sink += ref_dcomp(0x0102030405060708ULL, (uint64_t)i, 0, false);
340-
clock_gettime(CLOCK_MONOTONIC, &t1);
341-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
361+
TIMER_STOP();
362+
ref_ns = TIMER_NS() / BENCH_ITERS;
342363

343-
clock_gettime(CLOCK_MONOTONIC, &t0);
364+
TIMER_START();
344365
for (i = 0; i < BENCH_ITERS; i++)
345366
sink += blitter_simd_ops.dcomp(0x0102030405060708ULL, (uint64_t)i, 0, false);
346-
clock_gettime(CLOCK_MONOTONIC, &t1);
347-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
367+
TIMER_STOP();
368+
simd_ns = TIMER_NS() / BENCH_ITERS;
348369

349370
printf(" DCOMP: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
350371
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -353,21 +374,22 @@ static void bench_dcomp(void)
353374

354375
static void bench_zcomp(void)
355376
{
356-
struct timespec t0, t1;
377+
TIMER_DECL();
357378
volatile uint8_t sink = 0;
358379
int i;
380+
double ref_ns, simd_ns;
359381

360-
clock_gettime(CLOCK_MONOTONIC, &t0);
382+
TIMER_START();
361383
for (i = 0; i < BENCH_ITERS; i++)
362384
sink += ref_zcomp(0x0001000200030004ULL, 0x0002000200020002ULL, (uint8_t)(i & 0x07));
363-
clock_gettime(CLOCK_MONOTONIC, &t1);
364-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
385+
TIMER_STOP();
386+
ref_ns = TIMER_NS() / BENCH_ITERS;
365387

366-
clock_gettime(CLOCK_MONOTONIC, &t0);
388+
TIMER_START();
367389
for (i = 0; i < BENCH_ITERS; i++)
368390
sink += blitter_simd_ops.zcomp(0x0001000200030004ULL, 0x0002000200020002ULL, (uint8_t)(i & 0x07));
369-
clock_gettime(CLOCK_MONOTONIC, &t1);
370-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
391+
TIMER_STOP();
392+
simd_ns = TIMER_NS() / BENCH_ITERS;
371393

372394
printf(" ZCOMP: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
373395
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -376,21 +398,22 @@ static void bench_zcomp(void)
376398

377399
static void bench_byte_merge(void)
378400
{
379-
struct timespec t0, t1;
401+
TIMER_DECL();
380402
volatile uint64_t sink = 0;
381403
int i;
404+
double ref_ns, simd_ns;
382405

383-
clock_gettime(CLOCK_MONOTONIC, &t0);
406+
TIMER_START();
384407
for (i = 0; i < BENCH_ITERS; i++)
385408
sink += ref_byte_merge(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint16_t)(i & 0x7FFF));
386-
clock_gettime(CLOCK_MONOTONIC, &t1);
387-
double ref_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
409+
TIMER_STOP();
410+
ref_ns = TIMER_NS() / BENCH_ITERS;
388411

389-
clock_gettime(CLOCK_MONOTONIC, &t0);
412+
TIMER_START();
390413
for (i = 0; i < BENCH_ITERS; i++)
391414
sink += blitter_simd_ops.byte_merge(0xAAAAAAAAAAAAAAAAULL, 0x5555555555555555ULL, (uint16_t)(i & 0x7FFF));
392-
clock_gettime(CLOCK_MONOTONIC, &t1);
393-
double simd_ns = elapsed_ns(t0, t1) / BENCH_ITERS;
415+
TIMER_STOP();
416+
simd_ns = TIMER_NS() / BENCH_ITERS;
394417

395418
printf(" byte_merge: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n",
396419
ref_ns, simd_ns, ref_ns / simd_ns);
@@ -424,3 +447,5 @@ int main(int argc, char *argv[])
424447

425448
return failures > 0 ? 1 : 0;
426449
}
450+
? 1 : 0;
451+
}

0 commit comments

Comments (0)