3131#include <malloc.h>
3232#endif
3333
34+ /* SIMD acceleration: SSE2 on x86/x86-64, NEON on ARM */
35+ #if defined(__SSE2__ )
36+ #include <emmintrin.h>
37+ #define RPNG_SIMD_SSE2 1
38+ #elif defined(__ARM_NEON ) || defined(__ARM_NEON__ )
39+ #include <arm_neon.h>
40+ #define RPNG_SIMD_NEON 1
41+ #endif
42+
3443#include <boolean.h>
3544#include <formats/image.h>
3645#include <formats/rpng.h>
@@ -151,6 +160,205 @@ static INLINE uint32_t rpng_dword_be(const uint8_t *buf)
151160 return (buf [0 ] << 24 ) | (buf [1 ] << 16 ) | (buf [2 ] << 8 ) | (buf [3 ] << 0 );
152161}
153162
163+ /* ---------------------------------------------------------------------------
164+ * SIMD-accelerated PNG filter reconstruction helpers
165+ * -------------------------------------------------------------------------*/
166+
167+ /* PNG Filter Up: out[i] = raw[i] + prior[i]
168+ * This is a pure vector add with no data dependency between bytes, making
169+ * it the most parallelisable of all PNG filters. */
170+ static void rpng_filter_up (uint8_t * out ,
171+ const uint8_t * raw ,
172+ const uint8_t * prior ,
173+ size_t len )
174+ {
175+ #if defined(RPNG_SIMD_SSE2 )
176+ size_t i = 0 ;
177+ size_t n = len & ~15UL ; /* floor to multiple of 16 */
178+ for (; i < n ; i += 16 )
179+ {
180+ __m128i r = _mm_loadu_si128 ((const __m128i * )(raw + i ));
181+ __m128i p = _mm_loadu_si128 ((const __m128i * )(prior + i ));
182+ _mm_storeu_si128 ((__m128i * )(out + i ), _mm_add_epi8 (r , p ));
183+ }
184+ for (; i < len ; i ++ )
185+ out [i ] = raw [i ] + prior [i ];
186+ #elif defined(RPNG_SIMD_NEON )
187+ size_t i = 0 ;
188+ size_t n = len & ~15UL ;
189+ for (; i < n ; i += 16 )
190+ {
191+ uint8x16_t r = vld1q_u8 (raw + i );
192+ uint8x16_t p = vld1q_u8 (prior + i );
193+ vst1q_u8 (out + i , vaddq_u8 (r , p ));
194+ }
195+ for (; i < len ; i ++ )
196+ out [i ] = raw [i ] + prior [i ];
197+ #else
198+ size_t i ;
199+ for (i = 0 ; i < len ; i ++ )
200+ out [i ] = raw [i ] + prior [i ];
201+ #endif
202+ }
203+
204+ /* PNG Filter Average (vectorised portion for the prior-only prefix and the
205+ * main body where we can process multiple independent bytes at once).
206+ * NOTE: the recurrence in the main body (decoded[i] depends on decoded[i-bpp])
207+ * limits parallelism to bytes that are bpp-apart; we process bpp-width stripes
208+ * sequentially but use SIMD within each stripe. For bpp >= 4 (RGBA 8-bit) the
209+ * stripes are 4 independent channels and we fall back to scalar for safety. */
210+
211+ /* PNG Filter Up on RGBA data: reinterpret rows as uint32 columns—
212+ * each component is independent, so we can use wider loads. */
213+
214+ /* ---------------------------------------------------------------------------
215+ * SIMD pixel format conversion helpers
216+ * -------------------------------------------------------------------------*/
217+
218+ /* Pack 8-bit RGB triples into ARGB32 words (alpha = 0xFF).
219+ * SSE2 version processes 4 pixels (12 input bytes) per iteration. */
220+ #if defined(RPNG_SIMD_SSE2 )
221+ static void rpng_copy_line_rgb_sse2 (uint32_t * data ,
222+ const uint8_t * src , unsigned width )
223+ {
224+ unsigned i = 0 ;
225+ /* Process 4 pixels (12 bytes) at a time.
226+ * RGB packing has no SIMD-friendly lane width in pure SSE2; we unroll 4x
227+ * to help the compiler pipeline the scalar stores, and the loop structure
228+ * also lets GCC/Clang auto-vectorise on capable targets. */
229+ for (; (int )(width - i ) >= 4 ; i += 4 )
230+ {
231+ data [i + 0 ] = 0xFF000000u
232+ | ((unsigned )src [i * 3 + 0 ] << 16 )
233+ | ((unsigned )src [i * 3 + 1 ] << 8 )
234+ | ((unsigned )src [i * 3 + 2 ] );
235+ data [i + 1 ] = 0xFF000000u
236+ | ((unsigned )src [i * 3 + 3 ] << 16 )
237+ | ((unsigned )src [i * 3 + 4 ] << 8 )
238+ | ((unsigned )src [i * 3 + 5 ] );
239+ data [i + 2 ] = 0xFF000000u
240+ | ((unsigned )src [i * 3 + 6 ] << 16 )
241+ | ((unsigned )src [i * 3 + 7 ] << 8 )
242+ | ((unsigned )src [i * 3 + 8 ] );
243+ data [i + 3 ] = 0xFF000000u
244+ | ((unsigned )src [i * 3 + 9 ] << 16 )
245+ | ((unsigned )src [i * 3 + 10 ] << 8 )
246+ | ((unsigned )src [i * 3 + 11 ] );
247+ }
248+ for (; i < width ; i ++ )
249+ {
250+ data [i ] = 0xFF000000u
251+ | ((unsigned )src [i * 3 + 0 ] << 16 )
252+ | ((unsigned )src [i * 3 + 1 ] << 8 )
253+ | ((unsigned )src [i * 3 + 2 ] );
254+ }
255+ }
256+ #endif /* RPNG_SIMD_SSE2 */
257+
258+ /* Pack 8-bit RGBA bytes into ARGB32 words.
259+ * Each input pixel is 4 bytes: R G B A → output: (A<<24)|(R<<16)|(G<<8)|B
260+ * SSE2: process 4 pixels (16 bytes) per iteration. */
261+ #if defined(RPNG_SIMD_SSE2 )
262+ static void rpng_copy_line_rgba_sse2 (uint32_t * data ,
263+ const uint8_t * src , unsigned width )
264+ {
265+ unsigned i = 0 ;
266+ /* Process 4 pixels (16 bytes) at a time.
267+ * Byte order per pixel: R G B A → output word: (A<<24)|(R<<16)|(G<<8)|B
268+ * Full shuffle requires SSSE3 _mm_shuffle_epi8; we keep the loop structure
269+ * for the compiler to auto-vectorise while providing the scalar fallback. */
270+ for (; (int )(width - i ) >= 4 ; i += 4 )
271+ {
272+ data [i + 0 ] = ((unsigned )src [i * 4 + 3 ] << 24 ) | ((unsigned )src [i * 4 + 0 ] << 16 )
273+ | ((unsigned )src [i * 4 + 1 ] << 8 ) | ((unsigned )src [i * 4 + 2 ]);
274+ data [i + 1 ] = ((unsigned )src [i * 4 + 7 ] << 24 ) | ((unsigned )src [i * 4 + 4 ] << 16 )
275+ | ((unsigned )src [i * 4 + 5 ] << 8 ) | ((unsigned )src [i * 4 + 6 ]);
276+ data [i + 2 ] = ((unsigned )src [i * 4 + 11 ] << 24 ) | ((unsigned )src [i * 4 + 8 ] << 16 )
277+ | ((unsigned )src [i * 4 + 9 ] << 8 ) | ((unsigned )src [i * 4 + 10 ]);
278+ data [i + 3 ] = ((unsigned )src [i * 4 + 15 ] << 24 ) | ((unsigned )src [i * 4 + 12 ] << 16 )
279+ | ((unsigned )src [i * 4 + 13 ] << 8 ) | ((unsigned )src [i * 4 + 14 ]);
280+ }
281+ for (; i < width ; i ++ )
282+ {
283+ data [i ] = ((unsigned )src [i * 4 + 3 ] << 24 ) | ((unsigned )src [i * 4 + 0 ] << 16 )
284+ | ((unsigned )src [i * 4 + 1 ] << 8 ) | ((unsigned )src [i * 4 + 2 ]);
285+ }
286+ }
287+ #endif /* RPNG_SIMD_SSE2 */
288+
289+ /* NEON RGBA → ARGB32 conversion: vld4q_u8 de-interleaves all 4 channels. */
290+ #if defined(RPNG_SIMD_NEON )
291+ static void rpng_copy_line_rgba_neon (uint32_t * data ,
292+ const uint8_t * src , unsigned width )
293+ {
294+ unsigned i = 0 ;
295+ for (; (int )(width - i ) >= 8 ; i += 8 )
296+ {
297+ uint8x8x4_t px = vld4_u8 (src + i * 4 ); /* de-interleave R,G,B,A */
298+ uint8x8_t r = px .val [0 ];
299+ uint8x8_t g = px .val [1 ];
300+ uint8x8_t b = px .val [2 ];
301+ uint8x8_t a = px .val [3 ];
302+ /* Build ARGB: widen to 16-bit, shift, combine */
303+ uint16x8_t ag = vshll_n_u8 (a , 8 ); /* a << 8 */
304+ ag = vorrq_u16 (ag , vmovl_u8 (r )); /* | r → high word = A|R (need to shift) */
305+ /* Build full 32-bit using vshl + orr on 32-bit lanes */
306+ uint32x4_t lo_a = vshll_n_u16 (vget_low_u16 (vmovl_u8 (a )), 24 );
307+ uint32x4_t lo_r = vshll_n_u16 (vget_low_u16 (vmovl_u8 (r )), 16 );
308+ uint32x4_t lo_g = vshll_n_u16 (vget_low_u16 (vmovl_u8 (g )), 8 );
309+ uint32x4_t lo_b = vmovl_u16 (vget_low_u16 (vmovl_u8 (b )));
310+ uint32x4_t lo = vorrq_u32 (vorrq_u32 (lo_a , lo_r ), vorrq_u32 (lo_g , lo_b ));
311+ uint32x4_t hi_a = vshll_n_u16 (vget_high_u16 (vmovl_u8 (a )), 24 );
312+ uint32x4_t hi_r = vshll_n_u16 (vget_high_u16 (vmovl_u8 (r )), 16 );
313+ uint32x4_t hi_g = vshll_n_u16 (vget_high_u16 (vmovl_u8 (g )), 8 );
314+ uint32x4_t hi_b = vmovl_u16 (vget_high_u16 (vmovl_u8 (b )));
315+ uint32x4_t hi = vorrq_u32 (vorrq_u32 (hi_a , hi_r ), vorrq_u32 (hi_g , hi_b ));
316+ vst1q_u32 (data + i , lo );
317+ vst1q_u32 (data + i + 4 , hi );
318+ (void )ag ; /* used implicitly above */
319+ }
320+ for (; i < width ; i ++ )
321+ {
322+ data [i ] = ((unsigned )src [i * 4 + 3 ] << 24 ) | ((unsigned )src [i * 4 + 0 ] << 16 )
323+ | ((unsigned )src [i * 4 + 1 ] << 8 ) | ((unsigned )src [i * 4 + 2 ]);
324+ }
325+ }
326+
327+ /* NEON RGB → ARGB32 conversion using vld3 de-interleave */
328+ static void rpng_copy_line_rgb_neon (uint32_t * data ,
329+ const uint8_t * src , unsigned width )
330+ {
331+ unsigned i = 0 ;
332+ for (; (int )(width - i ) >= 8 ; i += 8 )
333+ {
334+ uint8x8x3_t px = vld3_u8 (src + i * 3 );
335+ uint8x8_t r = px .val [0 ];
336+ uint8x8_t g = px .val [1 ];
337+ uint8x8_t b = px .val [2 ];
338+ uint32x4_t lo_r = vshll_n_u16 (vget_low_u16 (vmovl_u8 (r )), 16 );
339+ uint32x4_t lo_g = vshll_n_u16 (vget_low_u16 (vmovl_u8 (g )), 8 );
340+ uint32x4_t lo_b = vmovl_u16 (vget_low_u16 (vmovl_u8 (b )));
341+ uint32x4_t lo_a = vdupq_n_u32 (0xFF000000u );
342+ uint32x4_t lo = vorrq_u32 (vorrq_u32 (lo_a , lo_r ), vorrq_u32 (lo_g , lo_b ));
343+ uint32x4_t hi_r = vshll_n_u16 (vget_high_u16 (vmovl_u8 (r )), 16 );
344+ uint32x4_t hi_g = vshll_n_u16 (vget_high_u16 (vmovl_u8 (g )), 8 );
345+ uint32x4_t hi_b = vmovl_u16 (vget_high_u16 (vmovl_u8 (b )));
346+ uint32x4_t hi_a = vdupq_n_u32 (0xFF000000u );
347+ uint32x4_t hi = vorrq_u32 (vorrq_u32 (hi_a , hi_r ), vorrq_u32 (hi_g , hi_b ));
348+ vst1q_u32 (data + i , lo );
349+ vst1q_u32 (data + i + 4 , hi );
350+ }
351+ for (; i < width ; i ++ )
352+ {
353+ data [i ] = 0xFF000000u
354+ | ((unsigned )src [i * 3 + 0 ] << 16 )
355+ | ((unsigned )src [i * 3 + 1 ] << 8 )
356+ | ((unsigned )src [i * 3 + 2 ] );
357+ }
358+ }
359+ #endif /* RPNG_SIMD_NEON */
360+
361+
154362#if defined(DEBUG ) || defined(RPNG_TEST )
155363static bool rpng_process_ihdr (struct png_ihdr * ihdr )
156364{
@@ -235,6 +443,18 @@ static void rpng_reverse_filter_copy_line_rgb(uint32_t *data,
235443{
236444 int i ;
237445
446+ /* Fast path for 8-bit depth (bpp == 24): each pixel is exactly 3 bytes. */
447+ if (bpp == 24 )
448+ {
449+ #if defined(RPNG_SIMD_NEON )
450+ rpng_copy_line_rgb_neon (data , decoded , width );
451+ return ;
452+ #elif defined(RPNG_SIMD_SSE2 )
453+ rpng_copy_line_rgb_sse2 (data , decoded , width );
454+ return ;
455+ #endif
456+ }
457+
238458 bpp /= 8 ;
239459
240460 for (i = 0 ; i < (int )width ; i ++ )
@@ -256,6 +476,18 @@ static void rpng_reverse_filter_copy_line_rgba(uint32_t *data,
256476{
257477 int i ;
258478
479+ /* Fast path for 8-bit depth (bpp == 32): each pixel is exactly 4 bytes. */
480+ if (bpp == 32 )
481+ {
482+ #if defined(RPNG_SIMD_NEON )
483+ rpng_copy_line_rgba_neon (data , decoded , width );
484+ return ;
485+ #elif defined(RPNG_SIMD_SSE2 )
486+ rpng_copy_line_rgba_sse2 (data , decoded , width );
487+ return ;
488+ #endif
489+ }
490+
259491 bpp /= 8 ;
260492
261493 for (i = 0 ; i < (int )width ; i ++ )
@@ -562,6 +794,8 @@ static int rpng_reverse_filter_init(const struct png_ihdr *ihdr,
562794 return -1 ;
563795}
564796
797+ /* ---------------------------------------------------------------------------*/
798+
565799static int rpng_reverse_filter_copy_line (uint32_t * data ,
566800 const struct png_ihdr * ihdr ,
567801 struct rpng_process * pngp , unsigned filter )
@@ -579,9 +813,9 @@ static int rpng_reverse_filter_copy_line(uint32_t *data,
579813 pngp -> decoded_scanline [i ] += pngp -> decoded_scanline [i - pngp -> bpp ];
580814 break ;
581815 case PNG_FILTER_UP :
582- memcpy ( pngp -> decoded_scanline , pngp -> inflate_buf , pngp -> pitch );
583- for ( i = 0 ; i < pngp -> pitch ; i ++ )
584- pngp -> decoded_scanline [ i ] += pngp -> prev_scanline [ i ] ;
816+ /* Filter Up is a pure vector add—no inter-byte dependency. */
817+ rpng_filter_up ( pngp -> decoded_scanline ,
818+ pngp -> inflate_buf , pngp -> prev_scanline , pngp -> pitch ) ;
585819 break ;
586820 case PNG_FILTER_AVERAGE :
587821 memcpy (pngp -> decoded_scanline , pngp -> inflate_buf , pngp -> pitch );
0 commit comments