Skip to content

Commit 2c6160f

Browse files
JoeMattclaude
andcommitted
Address review: safer cross-compile detection, optimize NEON byte_merge
Makefile.common: - Restrict armv7/armhf NEON auto-selection to HAVE_NEON=1 only - Guard uname -m fallback to native-build platforms (unix/osx/win) - Add -msse2 flag consistently for all 32-bit x86 selection paths blitter_simd_neon.c: - Build neon_byte_merge mask in registers via vcreate_u64 instead of stack array + vld1_u8 memory load Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent b51b63f commit 2c6160f

2 files changed

Lines changed: 35 additions & 25 deletions

File tree

Makefile.common

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -65,49 +65,57 @@ else ifeq ($(BLITTER_SIMD),neon)
6565
else ifeq ($(BLITTER_SIMD),scalar)
6666
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
6767
else
68-
# ARM targets: prefer NEON when the target configuration says it is available.
68+
# ARM targets: prefer NEON when guaranteed or explicitly enabled.
6969
ifeq ($(HAVE_NEON), 1)
7070
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
7171
endif
7272
ifneq (,$(filter ios-arm64 tvos-arm64,$(platform)))
7373
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
7474
endif
75-
# Only aarch64/arm64 guaranteed to have NEON; plain 'arm' may lack it
75+
# aarch64/arm64 always have NEON; plain 'arm' may lack it.
7676
ifneq (,$(filter aarch64 arm64,$(ARCH)))
7777
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
7878
endif
79-
ifneq (,$(filter arm64 armv8% armv7% armhf,$(platform)))
79+
# armv8+ implies NEON; armv7/armhf only use NEON if HAVE_NEON was set above.
80+
ifneq (,$(filter arm64 armv8%,$(platform)))
8081
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
8182
endif
8283

8384
# x86/x64 targets: use SSE2.
8485
ifneq (,$(filter x86_64 x86 i686 i386,$(ARCH)))
8586
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
86-
# 32-bit x86 may need explicit SSE2 enable
87-
ifneq (,$(filter i686 i386 x86,$(ARCH)))
88-
CFLAGS += -msse2
89-
endif
9087
endif
9188
ifneq (,$(filter x86_64 x86 i686 i386 win-x64 win32,$(platform)))
9289
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
9390
endif
94-
# MSYS2/MinGW on x86_64
95-
ifneq (,$(filter MINGW64%,$(MSYSTEM)))
91+
# MSYS2/MinGW
92+
ifneq (,$(filter MINGW64% MINGW32%,$(MSYSTEM)))
9693
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
9794
endif
95+
# 32-bit x86 needs explicit -msse2 (x86_64 has it baseline).
96+
ifeq ($(BLITTER_SIMD_SRC),$(CORE_DIR)/src/blitter_simd_sse2.c)
97+
ifneq (,$(filter i686 i386 x86 win32,$(ARCH) $(platform)))
98+
CFLAGS += -msse2
99+
endif
100+
ifneq (,$(filter MINGW32%,$(MSYSTEM)))
101+
CFLAGS += -msse2
102+
endif
103+
endif
98104

99-
# Native build fallback: auto-detect from host when no SIMD was selected above.
105+
# Native build fallback: auto-detect from host architecture, but only for
106+
# native-build platforms (unix/osx/win). Cross-compile targets (vita, ps3,
107+
# libnx, etc.) set platform explicitly and should not use host detection.
100108
ifeq ($(BLITTER_SIMD_SRC),)
101-
ifneq (,$(filter x86_64 x86 i686 i386,$(shell uname -m 2>/dev/null)))
109+
ifneq (,$(filter unix osx win,$(platform)))
110+
ifneq (,$(filter x86_64 i686 i386,$(shell uname -m 2>/dev/null)))
102111
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
103112
endif
104-
endif
105-
ifeq ($(BLITTER_SIMD_SRC),)
106113
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
107114
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
108115
endif
109116
endif
110117
endif
118+
endif
111119

112120
# Fall back to scalar if no SIMD was selected (e.g., exotic platforms)
113121
ifeq ($(BLITTER_SIMD_SRC),)

src/blitter_simd_neon.c

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -108,18 +108,20 @@ static uint8_t neon_zcomp(uint64_t srcz, uint64_t dstz, uint8_t zmode)
108108
*/
109109
static uint64_t neon_byte_merge(uint64_t src, uint64_t dst, uint16_t mask)
110110
{
111-
/* Build 8-byte selection mask */
112-
uint8_t sel[8];
113-
sel[0] = (uint8_t)(mask & 0xFF);
114-
sel[1] = (mask & 0x0100) ? 0xFF : 0x00;
115-
sel[2] = (mask & 0x0200) ? 0xFF : 0x00;
116-
sel[3] = (mask & 0x0400) ? 0xFF : 0x00;
117-
sel[4] = (mask & 0x0800) ? 0xFF : 0x00;
118-
sel[5] = (mask & 0x1000) ? 0xFF : 0x00;
119-
sel[6] = (mask & 0x2000) ? 0xFF : 0x00;
120-
sel[7] = (mask & 0x4000) ? 0xFF : 0x00;
121-
122-
uint8x8_t vmask = vld1_u8(sel);
111+
/* Build 8-byte selection mask entirely in registers.
112+
* Byte 0 uses the low 8 bits directly; bytes 1-7 are all-ones or
113+
* all-zero based on mask bits 8-14. */
114+
uint64_t sel =
115+
((uint64_t)(mask & 0x00FF)) |
116+
((uint64_t)((mask & 0x0100) ? 0xFF : 0x00) << 8) |
117+
((uint64_t)((mask & 0x0200) ? 0xFF : 0x00) << 16) |
118+
((uint64_t)((mask & 0x0400) ? 0xFF : 0x00) << 24) |
119+
((uint64_t)((mask & 0x0800) ? 0xFF : 0x00) << 32) |
120+
((uint64_t)((mask & 0x1000) ? 0xFF : 0x00) << 40) |
121+
((uint64_t)((mask & 0x2000) ? 0xFF : 0x00) << 48) |
122+
((uint64_t)((mask & 0x4000) ? 0xFF : 0x00) << 56);
123+
124+
uint8x8_t vmask = vreinterpret_u8_u64(vcreate_u64(sel));
123125
uint8x8_t vsrc = vreinterpret_u8_u64(vcreate_u64(src));
124126
uint8x8_t vdst = vreinterpret_u8_u64(vcreate_u64(dst));
125127

0 commit comments

Comments
 (0)