@@ -38,16 +38,8 @@ macro_rules! adler32_chunk8 {
3838 } ;
3939}
4040
41- // Optimization: Manual loop unrolling for Adler32 tail processing.
42- // This processes 4 bytes per iteration to reduce loop overhead and improve instruction pipelining.
43- // It is significantly faster than a scalar iterator loop for small tails (e.g., 15-63 bytes).
44- //
45- // Safety:
46- // * `$ptr` must be valid for reads of `$len` bytes.
47- // * `$s1` and `$s2` must not overflow u32 before modulo (guaranteed by BLOCK_SIZE check in caller).
4841macro_rules! adler32_tail {
4942 ( $s1: expr, $s2: expr, $ptr: expr, $len: expr) => {
50- // We know len < 16 here because larger chunks are handled by SIMD or unrolled loops before calling this macro.
5143 if $len > 0 {
5244 if $len >= 8 {
5345 adler32_chunk8!( $s1, $s2, $ptr, $len) ;
@@ -66,7 +58,6 @@ macro_rules! adler32_tail {
6658 $len -= 4 ;
6759 }
6860
69- // Remaining 0-3 bytes.
7061 match $len {
7162 3 => {
7263 let v = ( $ptr as * const u16 ) . read_unaligned( ) as u32 ;
@@ -120,8 +111,7 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
120111 let mut v_byte_sums_d = _mm_setzero_si128 ( ) ;
121112
122113 let mut chunk_n = n;
123- // Optimization: Unroll loop to process 64 bytes per iteration (two 32-byte chunks).
124- // This amortizes loop overhead and allows better pipelining of the `sad` and `unpack` operations.
114+
125115 while chunk_n >= 128 {
126116 let data_a_1 = _mm_loadu_si128 ( data. as_ptr ( ) as * const __m128i ) ;
127117 let data_b_1 = _mm_loadu_si128 ( data. as_ptr ( ) . add ( 16 ) as * const __m128i ) ;
@@ -132,7 +122,6 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
132122 let data_a_4 = _mm_loadu_si128 ( data. as_ptr ( ) . add ( 96 ) as * const __m128i ) ;
133123 let data_b_4 = _mm_loadu_si128 ( data. as_ptr ( ) . add ( 112 ) as * const __m128i ) ;
134124
135- // Accumulate byte sums
136125 v_byte_sums_a = _mm_add_epi16 ( v_byte_sums_a, _mm_unpacklo_epi8 ( data_a_1, v_zero) ) ;
137126 v_byte_sums_b = _mm_add_epi16 ( v_byte_sums_b, _mm_unpackhi_epi8 ( data_a_1, v_zero) ) ;
138127 v_byte_sums_c = _mm_add_epi16 ( v_byte_sums_c, _mm_unpacklo_epi8 ( data_b_1, v_zero) ) ;
@@ -153,7 +142,6 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
153142 v_byte_sums_c = _mm_add_epi16 ( v_byte_sums_c, _mm_unpacklo_epi8 ( data_b_4, v_zero) ) ;
154143 v_byte_sums_d = _mm_add_epi16 ( v_byte_sums_d, _mm_unpackhi_epi8 ( data_b_4, v_zero) ) ;
155144
156- // SAD calculation
157145 let sad_1 = _mm_add_epi32 (
158146 _mm_sad_epu8 ( data_a_1, v_zero) ,
159147 _mm_sad_epu8 ( data_b_1, v_zero) ,
@@ -171,16 +159,13 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
171159 _mm_sad_epu8 ( data_b_4, v_zero) ,
172160 ) ;
173161
174- // Update v_s1_sums
175- // v_s1_sums += 4 * v_s1 (initial) + 3*sad_1 + 2*sad_2 + 1*sad_3
176162 let s1_x4 = _mm_slli_epi32 ( v_s1, 2 ) ;
177163 let inc_1 = _mm_add_epi32 (
178- _mm_add_epi32 ( sad_1, _mm_add_epi32 ( sad_1, sad_1) ) , // 3*sad_1
179- _mm_add_epi32 ( _mm_add_epi32 ( sad_2, sad_2) , sad_3) , // 2*sad_2 + sad_3
164+ _mm_add_epi32 ( sad_1, _mm_add_epi32 ( sad_1, sad_1) ) ,
165+ _mm_add_epi32 ( _mm_add_epi32 ( sad_2, sad_2) , sad_3) ,
180166 ) ;
181167 v_s1_sums = _mm_add_epi32 ( v_s1_sums, _mm_add_epi32 ( s1_x4, inc_1) ) ;
182168
183- // Update v_s1
184169 let total_sad = _mm_add_epi32 ( _mm_add_epi32 ( sad_1, sad_2) , _mm_add_epi32 ( sad_3, sad_4) ) ;
185170 v_s1 = _mm_add_epi32 ( v_s1, total_sad) ;
186171
@@ -321,7 +306,6 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
321306 }
322307 }
323308
324- // Optimization: Hoist vector constants out of the main loop to avoid redundant loads.
325309 let weights = _mm256_set_epi8 (
326310 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 ,
327311 26 , 27 , 28 , 29 , 30 , 31 , 32 ,
@@ -346,8 +330,6 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
346330 let mut v_s2_c = _mm256_setzero_si256 ( ) ;
347331 let mut v_s2_d = _mm256_setzero_si256 ( ) ;
348332
349- // Optimization: For chunks >= 256 bytes, use an unrolled loop with 8 independent accumulators.
350- // This increases instruction-level parallelism to hide the latency of multiply-adds.
351333 if chunk_n >= 256 {
352334 let mut v_s2_e = _mm256_setzero_si256 ( ) ;
353335 let mut v_s2_f = _mm256_setzero_si256 ( ) ;
@@ -421,9 +403,6 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
421403 let s_h = _mm256_madd_epi16 ( p8, ones_i16) ;
422404 v_s2_h = _mm256_add_epi32 ( v_s2_h, s_h) ;
423405
424- // Update v_s1 and v_inc accumulators
425- // For the first 128 bytes: v_s1 contributes to the next 128 bytes as well.
426- // v_s1_acc accumulates v_s1 twice (once for first 128, once for second).
427406 v_s1_acc = _mm256_add_epi32 ( v_s1_acc, v_s1) ;
428407 v_inc_acc_a = _mm256_add_epi32 ( v_inc_acc_a, inc_part_1) ;
429408 v_s1 = _mm256_add_epi32 ( v_s1, sum_sads_1) ;
@@ -503,24 +482,15 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
503482 let data_a = _mm256_loadu_si256 ( ptr as * const __m256i ) ;
504483 let data_b = _mm256_loadu_si256 ( ptr. add ( 32 ) as * const __m256i ) ;
505484
506- // Optimization: Parallelize SAD calculation and s1/s2 updates to reduce dependency chains.
507- // By computing sad_a and sad_b in parallel, we can accumulate s1 sums and s1 in larger steps.
508485 let sad_a = _mm256_sad_epu8 ( data_a, v_zero) ;
509486 let sad_b = _mm256_sad_epu8 ( data_b, v_zero) ;
510487
511- // Update v_s1_sums:
512- // The contribution of the current v_s1 to the sums over the next 64 bytes is:
513- // - For the first 32 bytes: v_s1 * 32
514- // - For the second 32 bytes: (v_s1 + sad_a) * 32
515- // Total: v_s1 * 64 + sad_a * 32
516488 let v_s1_x64 = _mm256_slli_epi32 ( v_s1, 6 ) ;
517489 let sad_a_x32 = _mm256_slli_epi32 ( sad_a, 5 ) ;
518490 v_s1_sums = _mm256_add_epi32 ( v_s1_sums, _mm256_add_epi32 ( v_s1_x64, sad_a_x32) ) ;
519491
520- // Update v_s1: v_s1 += sad_a + sad_b
521492 v_s1 = _mm256_add_epi32 ( v_s1, _mm256_add_epi32 ( sad_a, sad_b) ) ;
522493
523- // Update v_s2: Calculate partial s2 contributions in parallel
524494 let p1 = _mm256_maddubs_epi16 ( data_a, weights) ;
525495 let s_a = _mm256_madd_epi16 ( p1, ones_i16) ;
526496 let p2 = _mm256_maddubs_epi16 ( data_b, weights) ;
@@ -538,16 +508,12 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
538508 while chunk_n >= 32 {
539509 let d = _mm256_loadu_si256 ( ptr as * const __m256i ) ;
540510
541- // Update v_s1_sums: v_s1 contributes to next 32 bytes.
542- // v_s1_sums += v_s1 * 32
543511 let v_s1_x32 = _mm256_slli_epi32 ( v_s1, 5 ) ;
544512 v_s1_sums = _mm256_add_epi32 ( v_s1_sums, v_s1_x32) ;
545513
546- // Update v_s1: v_s1 += sad
547514 let sad = _mm256_sad_epu8 ( d, v_zero) ;
548515 v_s1 = _mm256_add_epi32 ( v_s1, sad) ;
549516
550- // Update v_s2: v_s2 += weighted_sum
551517 let p = _mm256_maddubs_epi16 ( d, weights) ;
552518 let s = _mm256_madd_epi16 ( p, ones_i16) ;
553519 v_s2 = _mm256_add_epi32 ( v_s2, s) ;
@@ -648,9 +614,6 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
648614
649615 let mut chunk_n = n;
650616
651- // Optimization: For chunks >= 256 bytes, use an unrolled loop with 8 independent accumulators.
652- // This increases instruction-level parallelism to hide the latency of `vpdpbusd` (5 cycles on Golden Cove).
653- // We merge the global `v_s2` into `v_s2_a` to save a register, keeping total usage within AVX2 limits (16 YMMs).
654617 if chunk_n >= 256 {
655618 let mut ptr = data. as_ptr ( ) ;
656619 let mut v_s2_a = v_s2;
@@ -663,7 +626,6 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
663626 let mut v_s2_h = _mm256_setzero_si256 ( ) ;
664627
665628 while chunk_n >= 256 {
666- // Block 1 (0..128)
667629 let d1 = _mm256_loadu_si256 ( ptr as * const __m256i ) ;
668630 v_s2_a = _mm256_dpbusd_avx_epi32 ( v_s2_a, d1, mults) ;
669631 let u1 = _mm256_dpbusd_avx_epi32 ( _mm256_setzero_si256 ( ) , d1, ones) ;
@@ -692,7 +654,6 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
692654 v_s1_sums = _mm256_add_epi32 ( v_s1_sums, inc_a) ;
693655 v_s1 = _mm256_add_epi32 ( v_s1, total_u_a) ;
694656
695- // Block 2 (128..256)
696657 let d5 = _mm256_loadu_si256 ( ptr. add ( 128 ) as * const __m256i ) ;
697658 v_s2_e = _mm256_dpbusd_avx_epi32 ( v_s2_e, d5, mults) ;
698659 let u5 = _mm256_dpbusd_avx_epi32 ( _mm256_setzero_si256 ( ) , d5, ones) ;
0 commit comments