11/*
22 * Bit-exactness and performance test for blitter SIMD operations.
33 *
4- * Build (from repo root):
4+ * Build (from repo root — link exactly one SIMD implementation ):
55 * # On macOS ARM64 (NEON):
66 * cc -O2 -o test/test_blitter_simd test/test_blitter_simd.c \
7- * src/blitter_simd_neon.c src/blitter_simd_scalar.c
7+ * src/blitter_simd_neon.c
88 *
99 * # On x86_64 (SSE2):
1010 * cc -O2 -msse2 -o test/test_blitter_simd test/test_blitter_simd.c \
11- * src/blitter_simd_sse2.c src/blitter_simd_scalar.c
11+ * src/blitter_simd_sse2.c
1212 *
1313 * # Scalar-only (any platform):
1414 * cc -O2 -o test/test_blitter_simd test/test_blitter_simd.c \
@@ -297,31 +297,51 @@ static void test_byte_merge(void)
297297
298298#define BENCH_ITERS 1000000
299299
/* Portable high-resolution timer for the benchmarks below.
 *
 * POSIX builds sample clock_gettime(CLOCK_MONOTONIC); Windows builds sample
 * QueryPerformanceCounter.  Usage: place TIMER_DECL(); among a scope's
 * declarations, bracket the measured code with TIMER_START() and
 * TIMER_STOP(), then read the elapsed time in nanoseconds (a double) with
 * TIMER_NS().  START/STOP may be repeated within the same scope.
 *
 * In each #define the macro name must be followed immediately by "(":
 * with a space in between, the macro becomes object-like and every
 * TIMER_xxx() use fails to compile. */
#ifdef _WIN32
#include <windows.h>

/* Current QueryPerformanceCounter reading, converted to nanoseconds. */
static double get_time_ns(void)
{
    /* Counter frequency; zero means "not queried yet", so the query
     * happens on the first call only. */
    static LARGE_INTEGER freq = {0};
    LARGE_INTEGER count;

    if (freq.QuadPart == 0)
        QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&count);
    return (double)count.QuadPart / (double)freq.QuadPart * 1e9;
}

#define TIMER_DECL()  double _timer_t0, _timer_t1
#define TIMER_START() (_timer_t0 = get_time_ns())
#define TIMER_STOP()  (_timer_t1 = get_time_ns())
#define TIMER_NS()    (_timer_t1 - _timer_t0)
#else
/* Nanoseconds elapsed from *start to *end.
 * The seconds and nanoseconds fields are each subtracted as integers first
 * and only the (small) differences are converted to double; a negative
 * tv_nsec difference is compensated by the seconds term. */
static double timer_elapsed_ns(const struct timespec *start,
                               const struct timespec *end)
{
    return (double)(end->tv_sec - start->tv_sec) * 1e9
         + (double)(end->tv_nsec - start->tv_nsec);
}

#define TIMER_DECL()  struct timespec _timer_ts0, _timer_ts1
#define TIMER_START() clock_gettime(CLOCK_MONOTONIC, &_timer_ts0)
#define TIMER_STOP()  clock_gettime(CLOCK_MONOTONIC, &_timer_ts1)
#define TIMER_NS()    timer_elapsed_ns(&_timer_ts0, &_timer_ts1)
#endif
305324
306325static void bench_lfu (void )
307326{
308- struct timespec t0 , t1 ;
327+ TIMER_DECL () ;
309328 volatile uint64_t sink = 0 ;
310329 int i ;
330+ double ref_ns , simd_ns ;
311331
312332 /* Ref */
313- clock_gettime ( CLOCK_MONOTONIC , & t0 );
333+ TIMER_START ( );
314334 for (i = 0 ; i < BENCH_ITERS ; i ++ )
315335 sink += ref_lfu (0xAAAAAAAAAAAAAAAAULL , 0x5555555555555555ULL , (uint8_t )(i & 0x0F ));
316- clock_gettime ( CLOCK_MONOTONIC , & t1 );
317- double ref_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
336+ TIMER_STOP ( );
337+ ref_ns = TIMER_NS ( ) / BENCH_ITERS ;
318338
319339 /* SIMD */
320- clock_gettime ( CLOCK_MONOTONIC , & t0 );
340+ TIMER_START ( );
321341 for (i = 0 ; i < BENCH_ITERS ; i ++ )
322342 sink += blitter_simd_ops .lfu (0xAAAAAAAAAAAAAAAAULL , 0x5555555555555555ULL , (uint8_t )(i & 0x0F ));
323- clock_gettime ( CLOCK_MONOTONIC , & t1 );
324- double simd_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
343+ TIMER_STOP ( );
344+ simd_ns = TIMER_NS ( ) / BENCH_ITERS ;
325345
326346 printf (" LFU: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n" ,
327347 ref_ns , simd_ns , ref_ns / simd_ns );
@@ -330,21 +350,22 @@ static void bench_lfu(void)
330350
331351static void bench_dcomp (void )
332352{
333- struct timespec t0 , t1 ;
353+ TIMER_DECL () ;
334354 volatile uint8_t sink = 0 ;
335355 int i ;
356+ double ref_ns , simd_ns ;
336357
337- clock_gettime ( CLOCK_MONOTONIC , & t0 );
358+ TIMER_START ( );
338359 for (i = 0 ; i < BENCH_ITERS ; i ++ )
339360 sink += ref_dcomp (0x0102030405060708ULL , (uint64_t )i , 0 , false);
340- clock_gettime ( CLOCK_MONOTONIC , & t1 );
341- double ref_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
361+ TIMER_STOP ( );
362+ ref_ns = TIMER_NS ( ) / BENCH_ITERS ;
342363
343- clock_gettime ( CLOCK_MONOTONIC , & t0 );
364+ TIMER_START ( );
344365 for (i = 0 ; i < BENCH_ITERS ; i ++ )
345366 sink += blitter_simd_ops .dcomp (0x0102030405060708ULL , (uint64_t )i , 0 , false);
346- clock_gettime ( CLOCK_MONOTONIC , & t1 );
347- double simd_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
367+ TIMER_STOP ( );
368+ simd_ns = TIMER_NS ( ) / BENCH_ITERS ;
348369
349370 printf (" DCOMP: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n" ,
350371 ref_ns , simd_ns , ref_ns / simd_ns );
@@ -353,21 +374,22 @@ static void bench_dcomp(void)
353374
354375static void bench_zcomp (void )
355376{
356- struct timespec t0 , t1 ;
377+ TIMER_DECL () ;
357378 volatile uint8_t sink = 0 ;
358379 int i ;
380+ double ref_ns , simd_ns ;
359381
360- clock_gettime ( CLOCK_MONOTONIC , & t0 );
382+ TIMER_START ( );
361383 for (i = 0 ; i < BENCH_ITERS ; i ++ )
362384 sink += ref_zcomp (0x0001000200030004ULL , 0x0002000200020002ULL , (uint8_t )(i & 0x07 ));
363- clock_gettime ( CLOCK_MONOTONIC , & t1 );
364- double ref_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
385+ TIMER_STOP ( );
386+ ref_ns = TIMER_NS ( ) / BENCH_ITERS ;
365387
366- clock_gettime ( CLOCK_MONOTONIC , & t0 );
388+ TIMER_START ( );
367389 for (i = 0 ; i < BENCH_ITERS ; i ++ )
368390 sink += blitter_simd_ops .zcomp (0x0001000200030004ULL , 0x0002000200020002ULL , (uint8_t )(i & 0x07 ));
369- clock_gettime ( CLOCK_MONOTONIC , & t1 );
370- double simd_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
391+ TIMER_STOP ( );
392+ simd_ns = TIMER_NS ( ) / BENCH_ITERS ;
371393
372394 printf (" ZCOMP: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n" ,
373395 ref_ns , simd_ns , ref_ns / simd_ns );
@@ -376,21 +398,22 @@ static void bench_zcomp(void)
376398
377399static void bench_byte_merge (void )
378400{
379- struct timespec t0 , t1 ;
401+ TIMER_DECL () ;
380402 volatile uint64_t sink = 0 ;
381403 int i ;
404+ double ref_ns , simd_ns ;
382405
383- clock_gettime ( CLOCK_MONOTONIC , & t0 );
406+ TIMER_START ( );
384407 for (i = 0 ; i < BENCH_ITERS ; i ++ )
385408 sink += ref_byte_merge (0xAAAAAAAAAAAAAAAAULL , 0x5555555555555555ULL , (uint16_t )(i & 0x7FFF ));
386- clock_gettime ( CLOCK_MONOTONIC , & t1 );
387- double ref_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
409+ TIMER_STOP ( );
410+ ref_ns = TIMER_NS ( ) / BENCH_ITERS ;
388411
389- clock_gettime ( CLOCK_MONOTONIC , & t0 );
412+ TIMER_START ( );
390413 for (i = 0 ; i < BENCH_ITERS ; i ++ )
391414 sink += blitter_simd_ops .byte_merge (0xAAAAAAAAAAAAAAAAULL , 0x5555555555555555ULL , (uint16_t )(i & 0x7FFF ));
392- clock_gettime ( CLOCK_MONOTONIC , & t1 );
393- double simd_ns = elapsed_ns ( t0 , t1 ) / BENCH_ITERS ;
415+ TIMER_STOP ( );
416+ simd_ns = TIMER_NS ( ) / BENCH_ITERS ;
394417
395418 printf (" byte_merge: ref=%6.1f ns/op simd=%6.1f ns/op speedup=%.2fx\n" ,
396419 ref_ns , simd_ns , ref_ns / simd_ns );
@@ -424,3 +447,5 @@ int main(int argc, char *argv[])
424447
425448 return failures > 0 ? 1 : 0 ;
426449}
450+ ? 1 : 0 ;
451+ }
0 commit comments