Skip to content

Commit dd44259

Browse files
authored
Merge pull request #101 from Provenance-Emu/libretro/feature/simd-blitter
SIMD-accelerated blitter operations (SSE2, NEON)
2 parents d1869d4 + 896012c commit dd44259

9 files changed

Lines changed: 1004 additions & 67 deletions

File tree

.github/workflows/c-cpp.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,26 @@ jobs:
6969
- name: Build
7070
run: make -j4 CC=${{ matrix.config.cc }} CXX=${{ matrix.config.cxx }}
7171

72+
- name: Run SIMD blitter tests
73+
run: |
74+
# Detect which SIMD impl to test based on runner architecture
75+
ARCH=$(uname -m)
76+
case "$ARCH" in
77+
x86_64|i686|i386) SIMD_SRC=src/blitter_simd_sse2.c; EXTRA="-msse2" ;;
78+
aarch64|arm64) SIMD_SRC=src/blitter_simd_neon.c; EXTRA="" ;;
79+
*) SIMD_SRC=src/blitter_simd_scalar.c; EXTRA="" ;;
80+
esac
81+
82+
echo "==> Testing ${SIMD_SRC}..."
83+
${{ matrix.config.cc }} -O2 -Wall ${EXTRA} -I src \
84+
-o test_blitter_simd test/test_blitter_simd.c ${SIMD_SRC}
85+
./test_blitter_simd
86+
87+
echo "==> Cross-checking against scalar..."
88+
${{ matrix.config.cc }} -O2 -Wall -I src \
89+
-o test_blitter_scalar test/test_blitter_simd.c src/blitter_simd_scalar.c
90+
./test_blitter_scalar
91+
7292
- name: Upload artifact
7393
uses: actions/upload-artifact@v4
7494
with:

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
.DS_Store
99
.build
1010
/.claude
11+
test/test_blitter_simd

Makefile.common

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,82 @@ SOURCES_C := \
4848
$(CORE_DIR)/src/universalhdr.c \
4949
$(CORE_DIR)/src/wavetable.c
5050

51+
# SIMD-accelerated blitter operations: select arch-specific implementation.
52+
# BLITTER_SIMD may be set explicitly to one of: scalar, sse2, neon; any other non-empty value is a build error. When unset, the implementation is auto-detected below.
53+
BLITTER_SIMD_SRC :=
54+
55+
ifneq ($(BLITTER_SIMD),)
56+
ifeq (,$(filter scalar sse2 neon,$(BLITTER_SIMD)))
57+
$(error Unsupported BLITTER_SIMD '$(BLITTER_SIMD)'; expected one of: scalar sse2 neon)
58+
endif
59+
endif
60+
61+
ifeq ($(BLITTER_SIMD),sse2)
62+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
63+
else ifeq ($(BLITTER_SIMD),neon)
64+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
65+
else ifeq ($(BLITTER_SIMD),scalar)
66+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
67+
else
68+
# ARM targets: prefer NEON when guaranteed or explicitly enabled.
69+
ifeq ($(HAVE_NEON), 1)
70+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
71+
endif
72+
ifneq (,$(filter ios-arm64 tvos-arm64,$(platform)))
73+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
74+
endif
75+
# aarch64/arm64 always have NEON; plain 'arm' may lack it.
76+
ifneq (,$(filter aarch64 arm64,$(ARCH)))
77+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
78+
endif
79+
# armv8+ implies NEON; armv7/armhf only use NEON if HAVE_NEON was set above.
80+
ifneq (,$(filter arm64 armv8%,$(platform)))
81+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
82+
endif
83+
84+
# x86/x64 targets: use SSE2. These checks run after the ARM checks, so an x86 match overrides an earlier NEON selection.
85+
ifneq (,$(filter x86_64 x86 i686 i386,$(ARCH)))
86+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
87+
endif
88+
ifneq (,$(filter x86_64 x86 i686 i386 win-x64 win32,$(platform)))
89+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
90+
endif
91+
# MSYS2/MinGW
92+
ifneq (,$(filter MINGW64% MINGW32%,$(MSYSTEM)))
93+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
94+
endif
95+
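# 32-bit x86 needs explicit -msse2 (x86_64 has it baseline). NOTE(review): the flag is appended to CFLAGS rather than scoped to the blitter source -- confirm that enabling SSE2 for every C file is intended.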
96+
ifeq ($(BLITTER_SIMD_SRC),$(CORE_DIR)/src/blitter_simd_sse2.c)
97+
ifneq (,$(filter i686 i386 x86 win32,$(ARCH) $(platform)))
98+
CFLAGS += -msse2
99+
endif
100+
ifneq (,$(filter MINGW32%,$(MSYSTEM)))
101+
CFLAGS += -msse2
102+
endif
103+
endif
104+
105+
# Native build fallback: auto-detect from host architecture, but only for
106+
# native-build platforms (unix/osx/win). Cross-compile targets (vita, ps3,
107+
# libnx, etc.) set platform explicitly and should not use host detection.
108+
ifeq ($(BLITTER_SIMD_SRC),)
109+
ifneq (,$(filter unix osx win,$(platform)))
110+
ifneq (,$(filter x86_64 i686 i386,$(shell uname -m 2>/dev/null)))
111+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_sse2.c
112+
endif
113+
ifneq (,$(filter aarch64 arm64,$(shell uname -m 2>/dev/null)))
114+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_neon.c
115+
endif
116+
endif
117+
endif
118+
endif
119+
120+
# Fall back to scalar if no SIMD was selected (e.g., exotic platforms)
121+
ifeq ($(BLITTER_SIMD_SRC),)
122+
BLITTER_SIMD_SRC := $(CORE_DIR)/src/blitter_simd_scalar.c
123+
endif
124+
125+
SOURCES_C += $(BLITTER_SIMD_SRC)
126+
51127
ifneq ($(STATIC_LINKING), 1)
52128
SOURCES_C += \
53129
$(LIBRETRO_COMM_DIR)/compat/compat_strcasestr.c \

src/blitter.c

Lines changed: 6 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
//
2222

2323
#include "blitter.h"
24+
#include "blitter_simd.h"
2425

2526
#include <stdlib.h>
2627
#include <string.h>
@@ -2833,12 +2834,7 @@ Patdhi := JOIN (patdhi, patd[32..63]);*/
28332834

28342835
/*Lfu := LFU (lfu[0..1], srcdlo, srcdhi, dstdlo, dstdhi, lfu_func[0..3]);*/
28352836
////////////////////////////////////// C++ CODE //////////////////////////////////////
2836-
uint64_t funcmask[2] = { 0, 0xFFFFFFFFFFFFFFFFLL };
2837-
uint64_t func0 = funcmask[lfu_func & 0x01];
2838-
uint64_t func1 = funcmask[(lfu_func >> 1) & 0x01];
2839-
uint64_t func2 = funcmask[(lfu_func >> 2) & 0x01];
2840-
uint64_t func3 = funcmask[(lfu_func >> 3) & 0x01];
2841-
uint64_t lfu = (~srcd & ~dstd & func0) | (~srcd & dstd & func1) | (srcd & ~dstd & func2) | (srcd & dstd & func3);
2837+
uint64_t lfu = blitter_simd_ops.lfu(srcd, dstd, lfu_func);
28422838
bool mir_bit, mir_byte;
28432839
uint16_t masku;
28442840
uint8_t e_coarse, e_fine;
@@ -2850,14 +2846,12 @@ Patdhi := JOIN (patdhi, patd[32..63]);*/
28502846
uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 },
28512847
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } };
28522848
int en;
2853-
uint64_t cmpd;
28542849
uint8_t dbinht;
28552850
uint16_t addq[4];
28562851
uint8_t initcin[4] = { 0, 0, 0, 0 };
28572852
uint16_t mask;
28582853
uint64_t dmux[4];
28592854
uint64_t ddat;
2860-
uint64_t zwdata;
28612855
//////////////////////////////////////////////////////////////////////////////////////
28622856

28632857
// Increment and Step Registers
@@ -2873,25 +2867,7 @@ Zstep := JOIN (zstep, zstep[0..31]);*/
28732867

28742868
/*Datacomp := DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/
28752869
////////////////////////////////////// C++ CODE //////////////////////////////////////
2876-
*dcomp = 0;
2877-
cmpd = *patd ^ (cmpdst ? dstd : srcd);
2878-
2879-
if ((cmpd & 0x00000000000000FFLL) == 0)
2880-
*dcomp |= 0x01u;
2881-
if ((cmpd & 0x000000000000FF00LL) == 0)
2882-
*dcomp |= 0x02u;
2883-
if ((cmpd & 0x0000000000FF0000LL) == 0)
2884-
*dcomp |= 0x04u;
2885-
if ((cmpd & 0x00000000FF000000LL) == 0)
2886-
*dcomp |= 0x08u;
2887-
if ((cmpd & 0x000000FF00000000LL) == 0)
2888-
*dcomp |= 0x10u;
2889-
if ((cmpd & 0x0000FF0000000000LL) == 0)
2890-
*dcomp |= 0x20u;
2891-
if ((cmpd & 0x00FF000000000000LL) == 0)
2892-
*dcomp |= 0x40u;
2893-
if ((cmpd & 0xFF00000000000000LL) == 0)
2894-
*dcomp |= 0x80u;
2870+
*dcomp = blitter_simd_ops.dcomp(*patd, srcd, dstd, cmpdst);
28952871
//////////////////////////////////////////////////////////////////////////////////////
28962872

28972873
// Zed comparator for Z-buffer operations
@@ -2907,27 +2883,7 @@ Zstep := JOIN (zstep, zstep[0..31]);*/
29072883
with srcshift bits 4 & 5 selecting the start position
29082884
*/
29092885
//So... basically what we have here is:
2910-
*zcomp = 0;
2911-
2912-
if ((((*srcz & 0x000000000000FFFFLL) < (dstz & 0x000000000000FFFFLL)) && (zmode & 0x01))
2913-
|| (((*srcz & 0x000000000000FFFFLL) == (dstz & 0x000000000000FFFFLL)) && (zmode & 0x02))
2914-
|| (((*srcz & 0x000000000000FFFFLL) > (dstz & 0x000000000000FFFFLL)) && (zmode & 0x04)))
2915-
*zcomp |= 0x01u;
2916-
2917-
if ((((*srcz & 0x00000000FFFF0000LL) < (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x01))
2918-
|| (((*srcz & 0x00000000FFFF0000LL) == (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x02))
2919-
|| (((*srcz & 0x00000000FFFF0000LL) > (dstz & 0x00000000FFFF0000LL)) && (zmode & 0x04)))
2920-
*zcomp |= 0x02u;
2921-
2922-
if ((((*srcz & 0x0000FFFF00000000LL) < (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x01))
2923-
|| (((*srcz & 0x0000FFFF00000000LL) == (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x02))
2924-
|| (((*srcz & 0x0000FFFF00000000LL) > (dstz & 0x0000FFFF00000000LL)) && (zmode & 0x04)))
2925-
*zcomp |= 0x04u;
2926-
2927-
if ((((*srcz & 0xFFFF000000000000LL) < (dstz & 0xFFFF000000000000LL)) && (zmode & 0x01))
2928-
|| (((*srcz & 0xFFFF000000000000LL) == (dstz & 0xFFFF000000000000LL)) && (zmode & 0x02))
2929-
|| (((*srcz & 0xFFFF000000000000LL) > (dstz & 0xFFFF000000000000LL)) && (zmode & 0x04)))
2930-
*zcomp |= 0x08u;
2886+
*zcomp = blitter_simd_ops.zcomp(*srcz, dstz, zmode);
29312887

29322888
//TEMP, TO TEST IF ZCOMP IS THE CULPRIT...
29332889
//Nope, this is NOT the problem...
@@ -3159,25 +3115,8 @@ Dat[40-47] := MX4 (dat[40-47], dstdhi{8-15}, ddathi{8-15}, dstzhi{8-15}, srcz
31593115
Dat[48-55] := MX4 (dat[48-55], dstdhi{16-23}, ddathi{16-23}, dstzhi{16-23}, srczhi{16-23}, mask[13], zed_selb[1]);
31603116
Dat[56-63] := MX4 (dat[56-63], dstdhi{24-31}, ddathi{24-31}, dstzhi{24-31}, srczhi{24-31}, mask[14], zed_selb[1]);*/
31613117
////////////////////////////////////// C++ CODE //////////////////////////////////////
3162-
*wdata = ((ddat & mask) | (dstd & ~mask)) & 0x00000000000000FFLL;
3163-
*wdata |= ((mask & 0x0100) ? ddat : dstd) & 0x000000000000FF00LL;
3164-
*wdata |= ((mask & 0x0200) ? ddat : dstd) & 0x0000000000FF0000LL;
3165-
*wdata |= ((mask & 0x0400) ? ddat : dstd) & 0x00000000FF000000LL;
3166-
*wdata |= ((mask & 0x0800) ? ddat : dstd) & 0x000000FF00000000LL;
3167-
*wdata |= ((mask & 0x1000) ? ddat : dstd) & 0x0000FF0000000000LL;
3168-
*wdata |= ((mask & 0x2000) ? ddat : dstd) & 0x00FF000000000000LL;
3169-
*wdata |= ((mask & 0x4000) ? ddat : dstd) & 0xFF00000000000000LL;
3170-
3171-
//This is a crappy way of handling this, but it should work for now...
3172-
zwdata = ((*srcz & mask) | (dstz & ~mask)) & 0x00000000000000FFLL;
3173-
zwdata |= ((mask & 0x0100) ? *srcz : dstz) & 0x000000000000FF00LL;
3174-
zwdata |= ((mask & 0x0200) ? *srcz : dstz) & 0x0000000000FF0000LL;
3175-
zwdata |= ((mask & 0x0400) ? *srcz : dstz) & 0x00000000FF000000LL;
3176-
zwdata |= ((mask & 0x0800) ? *srcz : dstz) & 0x000000FF00000000LL;
3177-
zwdata |= ((mask & 0x1000) ? *srcz : dstz) & 0x0000FF0000000000LL;
3178-
zwdata |= ((mask & 0x2000) ? *srcz : dstz) & 0x00FF000000000000LL;
3179-
zwdata |= ((mask & 0x4000) ? *srcz : dstz) & 0xFF00000000000000LL;
3180-
*srcz = zwdata;
3118+
*wdata = blitter_simd_ops.byte_merge(ddat, dstd, mask);
3119+
*srcz = blitter_simd_ops.byte_merge(*srcz, dstz, mask);
31813120
//////////////////////////////////////////////////////////////////////////////////////
31823121

31833122
/*Data_enab[0-1] := BUF8 (data_enab[0-1], data_ena);

src/blitter_simd.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* SIMD-accelerated blitter operations for Virtual Jaguar
3+
*
4+
* Provides architecture-specific implementations of the blitter's
5+
* hottest data-path operations. Only one implementation file is
6+
* compiled per build (selected in Makefile.common).
7+
*
8+
* Each arch file defines:
9+
* const blitter_simd_ops_t blitter_simd_ops = { ... };
10+
*/
11+
12+
#ifndef BLITTER_SIMD_H
13+
#define BLITTER_SIMD_H
14+
15+
#include <stdint.h>
16+
#include <stdbool.h>
17+
18+
typedef struct
19+
{
20+
/* Logic Function Unit: 64-bit truth table over srcd/dstd.
21+
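* lfu_func is a 4-bit selector (0-15): bit 0 enables ~srcd & ~dstd, bit 1 ~srcd & dstd, bit 2 srcd & ~dstd, bit 3 srcd & dstd; the enabled terms are ORed together. */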
22+
uint64_t (*lfu)(uint64_t srcd, uint64_t dstd, uint8_t lfu_func);
23+
24+
/* Data Comparator: per-byte equality of patd vs (cmpdst ? dstd : srcd).
25+
* Returns 8-bit mask: bit n is set when byte n matches (bit 0 = least-significant byte). */
26+
uint8_t (*dcomp)(uint64_t patd, uint64_t srcd, uint64_t dstd, bool cmpdst);
27+
28+
/* Z-buffer Comparator: 4 independent 16-bit comparisons.
29+
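* zmode bits: 0=LT, 1=EQ, 2=GT (each srcz lane compared against the matching dstz lane). Returns 4-bit mask: bit n is set when 16-bit lane n (bit 0 = least-significant lane) satisfies any enabled comparison. */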
30+
uint8_t (*zcomp)(uint64_t srcz, uint64_t dstz, uint8_t zmode);
31+
32+
/* Byte Mask Merge: select bytes from src or dst based on 16-bit mask.
33+
* Bits 0-7 control byte 0 (per-bit blend within the lowest byte).
34+
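* Bits 8-14 control bytes 1-7 (whole-byte select, one bit each): a set bit takes src, a clear bit keeps dst. Bit 15 is presumably ignored -- confirm against the implementations.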
35+
* Used for both pixel data (ddat/dstd) and Z data (srcz/dstz). */
36+
uint64_t (*byte_merge)(uint64_t src, uint64_t dst, uint16_t mask);
37+
} blitter_simd_ops_t;
38+
39+
extern const blitter_simd_ops_t blitter_simd_ops;
40+
41+
#endif /* BLITTER_SIMD_H */

0 commit comments

Comments
 (0)