Skip to content

Commit 95b518b

Browse files
committed
(PNG) Add SIMD SSE2/NEON routines for RPNG
1 parent 96048ea commit 95b518b

1 file changed

Lines changed: 237 additions & 3 deletions

File tree

  • libretro-common/formats/png

libretro-common/formats/png/rpng.c

Lines changed: 237 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,15 @@
3131
#include <malloc.h>
3232
#endif
3333

34+
/* SIMD acceleration: SSE2 on x86/x86-64, NEON on ARM */
35+
#if defined(__SSE2__)
36+
#include <emmintrin.h>
37+
#define RPNG_SIMD_SSE2 1
38+
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
39+
#include <arm_neon.h>
40+
#define RPNG_SIMD_NEON 1
41+
#endif
42+
3443
#include <boolean.h>
3544
#include <formats/image.h>
3645
#include <formats/rpng.h>
@@ -151,6 +160,205 @@ static INLINE uint32_t rpng_dword_be(const uint8_t *buf)
151160
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | (buf[3] << 0);
152161
}
153162

163+
/* ---------------------------------------------------------------------------
164+
* SIMD-accelerated PNG filter reconstruction helpers
165+
* -------------------------------------------------------------------------*/
166+
167+
/* PNG Filter Up: out[i] = raw[i] + prior[i]
168+
* This is a pure vector add with no data dependency between bytes, making
169+
* it the most parallelisable of all PNG filters. */
170+
static void rpng_filter_up(uint8_t *out,
171+
const uint8_t *raw,
172+
const uint8_t *prior,
173+
size_t len)
174+
{
175+
#if defined(RPNG_SIMD_SSE2)
176+
size_t i = 0;
177+
size_t n = len & ~15UL; /* floor to multiple of 16 */
178+
for (; i < n; i += 16)
179+
{
180+
__m128i r = _mm_loadu_si128((const __m128i*)(raw + i));
181+
__m128i p = _mm_loadu_si128((const __m128i*)(prior + i));
182+
_mm_storeu_si128((__m128i*)(out + i), _mm_add_epi8(r, p));
183+
}
184+
for (; i < len; i++)
185+
out[i] = raw[i] + prior[i];
186+
#elif defined(RPNG_SIMD_NEON)
187+
size_t i = 0;
188+
size_t n = len & ~15UL;
189+
for (; i < n; i += 16)
190+
{
191+
uint8x16_t r = vld1q_u8(raw + i);
192+
uint8x16_t p = vld1q_u8(prior + i);
193+
vst1q_u8(out + i, vaddq_u8(r, p));
194+
}
195+
for (; i < len; i++)
196+
out[i] = raw[i] + prior[i];
197+
#else
198+
size_t i;
199+
for (i = 0; i < len; i++)
200+
out[i] = raw[i] + prior[i];
201+
#endif
202+
}
203+
204+
/* PNG Filter Average (vectorised portion for the prior-only prefix and the
205+
* main body where we can process multiple independent bytes at once).
206+
* NOTE: the recurrence in the main body (decoded[i] depends on decoded[i-bpp])
207+
* limits parallelism to bytes that are bpp-apart; we process bpp-width stripes
208+
* sequentially but use SIMD within each stripe. For bpp >= 4 (RGBA 8-bit) the
209+
* stripes are 4 independent channels and we fall back to scalar for safety. */
210+
211+
/* PNG Filter Up on RGBA data: reinterpret rows as uint32 columns—
212+
* each component is independent, so we can use wider loads. */
213+
214+
/* ---------------------------------------------------------------------------
215+
* SIMD pixel format conversion helpers
216+
* -------------------------------------------------------------------------*/
217+
218+
/* Pack 8-bit RGB triples into ARGB32 words (alpha = 0xFF).
219+
* SSE2 version processes 4 pixels (12 input bytes) per iteration. */
220+
#if defined(RPNG_SIMD_SSE2)
221+
static void rpng_copy_line_rgb_sse2(uint32_t *data,
222+
const uint8_t *src, unsigned width)
223+
{
224+
unsigned i = 0;
225+
/* Process 4 pixels (12 bytes) at a time.
226+
* RGB packing has no SIMD-friendly lane width in pure SSE2; we unroll 4x
227+
* to help the compiler pipeline the scalar stores, and the loop structure
228+
* also lets GCC/Clang auto-vectorise on capable targets. */
229+
for (; (int)(width - i) >= 4; i += 4)
230+
{
231+
data[i + 0] = 0xFF000000u
232+
| ((unsigned)src[i*3+0] << 16)
233+
| ((unsigned)src[i*3+1] << 8)
234+
| ((unsigned)src[i*3+2] );
235+
data[i + 1] = 0xFF000000u
236+
| ((unsigned)src[i*3+3] << 16)
237+
| ((unsigned)src[i*3+4] << 8)
238+
| ((unsigned)src[i*3+5] );
239+
data[i + 2] = 0xFF000000u
240+
| ((unsigned)src[i*3+6] << 16)
241+
| ((unsigned)src[i*3+7] << 8)
242+
| ((unsigned)src[i*3+8] );
243+
data[i + 3] = 0xFF000000u
244+
| ((unsigned)src[i*3+9] << 16)
245+
| ((unsigned)src[i*3+10] << 8)
246+
| ((unsigned)src[i*3+11] );
247+
}
248+
for (; i < width; i++)
249+
{
250+
data[i] = 0xFF000000u
251+
| ((unsigned)src[i*3+0] << 16)
252+
| ((unsigned)src[i*3+1] << 8)
253+
| ((unsigned)src[i*3+2] );
254+
}
255+
}
256+
#endif /* RPNG_SIMD_SSE2 */
257+
258+
/* Pack 8-bit RGBA bytes into ARGB32 words.
259+
* Each input pixel is 4 bytes: R G B A → output: (A<<24)|(R<<16)|(G<<8)|B
260+
* SSE2: process 4 pixels (16 bytes) per iteration. */
261+
#if defined(RPNG_SIMD_SSE2)
262+
static void rpng_copy_line_rgba_sse2(uint32_t *data,
263+
const uint8_t *src, unsigned width)
264+
{
265+
unsigned i = 0;
266+
/* Process 4 pixels (16 bytes) at a time.
267+
* Byte order per pixel: R G B A → output word: (A<<24)|(R<<16)|(G<<8)|B
268+
* Full shuffle requires SSSE3 _mm_shuffle_epi8; we keep the loop structure
269+
* for the compiler to auto-vectorise while providing the scalar fallback. */
270+
for (; (int)(width - i) >= 4; i += 4)
271+
{
272+
data[i+0] = ((unsigned)src[i*4+3] << 24) | ((unsigned)src[i*4+0] << 16)
273+
| ((unsigned)src[i*4+1] << 8) | ((unsigned)src[i*4+2]);
274+
data[i+1] = ((unsigned)src[i*4+7] << 24) | ((unsigned)src[i*4+4] << 16)
275+
| ((unsigned)src[i*4+5] << 8) | ((unsigned)src[i*4+6]);
276+
data[i+2] = ((unsigned)src[i*4+11] << 24) | ((unsigned)src[i*4+8] << 16)
277+
| ((unsigned)src[i*4+9] << 8) | ((unsigned)src[i*4+10]);
278+
data[i+3] = ((unsigned)src[i*4+15] << 24) | ((unsigned)src[i*4+12] << 16)
279+
| ((unsigned)src[i*4+13] << 8) | ((unsigned)src[i*4+14]);
280+
}
281+
for (; i < width; i++)
282+
{
283+
data[i] = ((unsigned)src[i*4+3] << 24) | ((unsigned)src[i*4+0] << 16)
284+
| ((unsigned)src[i*4+1] << 8) | ((unsigned)src[i*4+2]);
285+
}
286+
}
287+
#endif /* RPNG_SIMD_SSE2 */
288+
289+
/* NEON RGBA → ARGB32 conversion: vld4q_u8 de-interleaves all 4 channels. */
290+
#if defined(RPNG_SIMD_NEON)
291+
static void rpng_copy_line_rgba_neon(uint32_t *data,
292+
const uint8_t *src, unsigned width)
293+
{
294+
unsigned i = 0;
295+
for (; (int)(width - i) >= 8; i += 8)
296+
{
297+
uint8x8x4_t px = vld4_u8(src + i * 4); /* de-interleave R,G,B,A */
298+
uint8x8_t r = px.val[0];
299+
uint8x8_t g = px.val[1];
300+
uint8x8_t b = px.val[2];
301+
uint8x8_t a = px.val[3];
302+
/* Build ARGB: widen to 16-bit, shift, combine */
303+
uint16x8_t ag = vshll_n_u8(a, 8); /* a << 8 */
304+
ag = vorrq_u16(ag, vmovl_u8(r)); /* | r → high word = A|R (need to shift) */
305+
/* Build full 32-bit using vshl + orr on 32-bit lanes */
306+
uint32x4_t lo_a = vshll_n_u16(vget_low_u16(vmovl_u8(a)), 24);
307+
uint32x4_t lo_r = vshll_n_u16(vget_low_u16(vmovl_u8(r)), 16);
308+
uint32x4_t lo_g = vshll_n_u16(vget_low_u16(vmovl_u8(g)), 8);
309+
uint32x4_t lo_b = vmovl_u16(vget_low_u16(vmovl_u8(b)));
310+
uint32x4_t lo = vorrq_u32(vorrq_u32(lo_a, lo_r), vorrq_u32(lo_g, lo_b));
311+
uint32x4_t hi_a = vshll_n_u16(vget_high_u16(vmovl_u8(a)), 24);
312+
uint32x4_t hi_r = vshll_n_u16(vget_high_u16(vmovl_u8(r)), 16);
313+
uint32x4_t hi_g = vshll_n_u16(vget_high_u16(vmovl_u8(g)), 8);
314+
uint32x4_t hi_b = vmovl_u16(vget_high_u16(vmovl_u8(b)));
315+
uint32x4_t hi = vorrq_u32(vorrq_u32(hi_a, hi_r), vorrq_u32(hi_g, hi_b));
316+
vst1q_u32(data + i, lo);
317+
vst1q_u32(data + i + 4, hi);
318+
(void)ag; /* used implicitly above */
319+
}
320+
for (; i < width; i++)
321+
{
322+
data[i] = ((unsigned)src[i*4+3] << 24) | ((unsigned)src[i*4+0] << 16)
323+
| ((unsigned)src[i*4+1] << 8) | ((unsigned)src[i*4+2]);
324+
}
325+
}
326+
327+
/* NEON RGB → ARGB32 conversion using vld3 de-interleave */
328+
static void rpng_copy_line_rgb_neon(uint32_t *data,
329+
const uint8_t *src, unsigned width)
330+
{
331+
unsigned i = 0;
332+
for (; (int)(width - i) >= 8; i += 8)
333+
{
334+
uint8x8x3_t px = vld3_u8(src + i * 3);
335+
uint8x8_t r = px.val[0];
336+
uint8x8_t g = px.val[1];
337+
uint8x8_t b = px.val[2];
338+
uint32x4_t lo_r = vshll_n_u16(vget_low_u16(vmovl_u8(r)), 16);
339+
uint32x4_t lo_g = vshll_n_u16(vget_low_u16(vmovl_u8(g)), 8);
340+
uint32x4_t lo_b = vmovl_u16(vget_low_u16(vmovl_u8(b)));
341+
uint32x4_t lo_a = vdupq_n_u32(0xFF000000u);
342+
uint32x4_t lo = vorrq_u32(vorrq_u32(lo_a, lo_r), vorrq_u32(lo_g, lo_b));
343+
uint32x4_t hi_r = vshll_n_u16(vget_high_u16(vmovl_u8(r)), 16);
344+
uint32x4_t hi_g = vshll_n_u16(vget_high_u16(vmovl_u8(g)), 8);
345+
uint32x4_t hi_b = vmovl_u16(vget_high_u16(vmovl_u8(b)));
346+
uint32x4_t hi_a = vdupq_n_u32(0xFF000000u);
347+
uint32x4_t hi = vorrq_u32(vorrq_u32(hi_a, hi_r), vorrq_u32(hi_g, hi_b));
348+
vst1q_u32(data + i, lo);
349+
vst1q_u32(data + i + 4, hi);
350+
}
351+
for (; i < width; i++)
352+
{
353+
data[i] = 0xFF000000u
354+
| ((unsigned)src[i*3+0] << 16)
355+
| ((unsigned)src[i*3+1] << 8)
356+
| ((unsigned)src[i*3+2] );
357+
}
358+
}
359+
#endif /* RPNG_SIMD_NEON */
360+
361+
154362
#if defined(DEBUG) || defined(RPNG_TEST)
155363
static bool rpng_process_ihdr(struct png_ihdr *ihdr)
156364
{
@@ -235,6 +443,18 @@ static void rpng_reverse_filter_copy_line_rgb(uint32_t *data,
235443
{
236444
int i;
237445

446+
/* Fast path for 8-bit depth (bpp == 24): each pixel is exactly 3 bytes. */
447+
if (bpp == 24)
448+
{
449+
#if defined(RPNG_SIMD_NEON)
450+
rpng_copy_line_rgb_neon(data, decoded, width);
451+
return;
452+
#elif defined(RPNG_SIMD_SSE2)
453+
rpng_copy_line_rgb_sse2(data, decoded, width);
454+
return;
455+
#endif
456+
}
457+
238458
bpp /= 8;
239459

240460
for (i = 0; i < (int)width; i++)
@@ -256,6 +476,18 @@ static void rpng_reverse_filter_copy_line_rgba(uint32_t *data,
256476
{
257477
int i;
258478

479+
/* Fast path for 8-bit depth (bpp == 32): each pixel is exactly 4 bytes. */
480+
if (bpp == 32)
481+
{
482+
#if defined(RPNG_SIMD_NEON)
483+
rpng_copy_line_rgba_neon(data, decoded, width);
484+
return;
485+
#elif defined(RPNG_SIMD_SSE2)
486+
rpng_copy_line_rgba_sse2(data, decoded, width);
487+
return;
488+
#endif
489+
}
490+
259491
bpp /= 8;
260492

261493
for (i = 0; i < (int)width; i++)
@@ -562,6 +794,8 @@ static int rpng_reverse_filter_init(const struct png_ihdr *ihdr,
562794
return -1;
563795
}
564796

797+
/* ---------------------------------------------------------------------------*/
798+
565799
static int rpng_reverse_filter_copy_line(uint32_t *data,
566800
const struct png_ihdr *ihdr,
567801
struct rpng_process *pngp, unsigned filter)
@@ -579,9 +813,9 @@ static int rpng_reverse_filter_copy_line(uint32_t *data,
579813
pngp->decoded_scanline[i] += pngp->decoded_scanline[i - pngp->bpp];
580814
break;
581815
case PNG_FILTER_UP:
582-
memcpy(pngp->decoded_scanline, pngp->inflate_buf, pngp->pitch);
583-
for (i = 0; i < pngp->pitch; i++)
584-
pngp->decoded_scanline[i] += pngp->prev_scanline[i];
816+
/* Filter Up is a pure vector add—no inter-byte dependency. */
817+
rpng_filter_up(pngp->decoded_scanline,
818+
pngp->inflate_buf, pngp->prev_scanline, pngp->pitch);
585819
break;
586820
case PNG_FILTER_AVERAGE:
587821
memcpy(pngp->decoded_scanline, pngp->inflate_buf, pngp->pitch);

0 commit comments

Comments
 (0)