Skip to content

Commit 5514b46

Browse files
JoeMattclaude
andcommitted
Address remaining PR #101 review: improved Makefile detection, lighter NEON dcomp
- Makefile.common: validate BLITTER_SIMD value, detect via $(platform)/$(ARCH) variables for cross-compilation, keep uname -m as native fallback only - blitter_simd_neon.c: replace vcreate_u8 weights + vpadd chain with vshr_n_u8 + lane extraction for fewer instructions in dcomp - test_blitter_simd.c: fix duplicate lines from rebase Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent 0bc0441 commit 5514b46

3 files changed

Lines changed: 55 additions & 35 deletions

File tree

Makefile.common

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -49,38 +49,57 @@ SOURCES_C := \
4949
$(CORE_DIR)/src/wavetable.c
5050

5151
# SIMD-accelerated blitter operations: select arch-specific implementation.
52-
# Override with BLITTER_SIMD=scalar|sse2|neon for cross-compilation.
53-
# Default to portable scalar fallback.
52+
# BLITTER_SIMD may be set explicitly to one of: scalar, sse2, neon.
5453
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
5554

56-
ifdef BLITTER_SIMD
57-
# Explicit override for cross-compilation or custom builds
58-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_$(BLITTER_SIMD).c
59-
else
60-
# Auto-detect from host architecture (native builds only)
61-
# x86/x64: use SSE2 (baseline for all x86_64, available since Pentium 4)
62-
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
63-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
64-
endif
65-
# MSYS2/MinGW on x86_64
66-
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
67-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
68-
endif
69-
# ARM64 (AArch64): NEON is always available
70-
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
71-
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
72-
endif
55+
ifneq ($(BLITTER_SIMD),)
56+
ifeq (,$(filter scalar sse2 neon,$(BLITTER_SIMD)))
57+
$(error Unsupported BLITTER_SIMD '$(BLITTER_SIMD)'; expected one of: scalar sse2 neon)
58+
endif
7359
endif
7460

75-
# Platform-based overrides (cross-compilation targets)
61+
ifeq ($(BLITTER_SIMD),sse2)
62+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
63+
else ifeq ($(BLITTER_SIMD),neon)
64+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
65+
else ifeq ($(BLITTER_SIMD),scalar)
66+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
67+
else
68+
# ARM targets: prefer NEON when the target configuration says it is available.
69+
ifeq ($(HAVE_NEON), 1)
70+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
71+
endif
7672
ifneq (,$(filter ios-arm64 tvos-arm64,$(platform)))
7773
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
7874
endif
79-
# ARMv7 with NEON (set by Makefile for classic_armv7 targets)
80-
ifeq ($(HAVE_NEON), 1)
75+
ifneq (,$(filter aarch64 arm64 arm,$(ARCH)))
76+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
77+
endif
78+
ifneq (,$(filter arm64 armv8% armv7% armhf arm,$(platform)))
8179
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
8280
endif
8381

82+
# x86/x64 targets: use SSE2.
83+
ifneq (,$(filter x86_64 x86 i686 i386,$(ARCH)))
84+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
85+
endif
86+
ifneq (,$(filter x86_64 x86 i686 i386 win-x64 win32,$(platform)))
87+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
88+
endif
89+
# MSYS2/MinGW on x86_64
90+
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
91+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
92+
endif
93+
94+
# Native build fallback: auto-detect from host architecture
95+
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
96+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
97+
endif
98+
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
99+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
100+
endif
101+
endif
102+
84103
SOURCES_C += $(BLITTER_SIMD_SRC)
85104

86105
ifneq ($(STATIC_LINKING), 1)

src/blitter_simd_neon.c

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,20 @@ static uint8_t neon_dcomp(uint64_t patd, uint64_t srcd, uint64_t dstd, bool cmpd
5454
uint8x8_t vzero = vdup_n_u8(0);
5555
uint8x8_t vcmp = vceq_u8(vxor, vzero); /* 0xFF where equal, 0 otherwise */
5656

57-
/* Extract one bit per byte using power-of-2 weights.
58-
* vcreate avoids a static const load on every call. */
59-
uint8x8_t vw = vcreate_u8(0x8040201008040201ULL);
60-
uint8x8_t vbits = vand_u8(vcmp, vw);
61-
62-
/* Pairwise horizontal add: 8 -> 4 -> 2 -> 1 */
63-
uint8x8_t sum1 = vpadd_u8(vbits, vbits);
64-
uint8x8_t sum2 = vpadd_u8(sum1, sum1);
65-
uint8x8_t sum3 = vpadd_u8(sum2, sum2);
66-
67-
return vget_lane_u8(sum3, 0);
57+
/* Convert compare lanes from 0xFF/0x00 to 1/0, then pack into
58+
* the result byte via lane extraction — avoids weights load
59+
* and vpadd chain. */
60+
uint8x8_t vbits = vshr_n_u8(vcmp, 7);
61+
62+
return (uint8_t)(
63+
(vget_lane_u8(vbits, 0) << 0) |
64+
(vget_lane_u8(vbits, 1) << 1) |
65+
(vget_lane_u8(vbits, 2) << 2) |
66+
(vget_lane_u8(vbits, 3) << 3) |
67+
(vget_lane_u8(vbits, 4) << 4) |
68+
(vget_lane_u8(vbits, 5) << 5) |
69+
(vget_lane_u8(vbits, 6) << 6) |
70+
(vget_lane_u8(vbits, 7) << 7));
6871
}
6972

7073
/* Z-buffer Comparator — NEON

test/test_blitter_simd.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -447,5 +447,3 @@ int main(int argc, char *argv[])
447447

448448
return failures > 0 ? 1 : 0;
449449
}
450-
? 1 : 0;
451-
}

0 commit comments

Comments
 (0)