@@ -405,6 +405,98 @@ unsafe fn match_len_avx2(a: *const u8, b: *const u8, max_len: usize) -> usize {
405405
406406 let v_zero = _mm256_setzero_si256 ( ) ;
407407
408+ while len + 256 <= max_len {
409+ let v1 = _mm256_loadu_si256 ( a. add ( len) as * const __m256i ) ;
410+ let v2 = _mm256_loadu_si256 ( b. add ( len) as * const __m256i ) ;
411+ let xor1 = _mm256_xor_si256 ( v1, v2) ;
412+
413+ let v3 = _mm256_loadu_si256 ( a. add ( len + 32 ) as * const __m256i ) ;
414+ let v4 = _mm256_loadu_si256 ( b. add ( len + 32 ) as * const __m256i ) ;
415+ let xor2 = _mm256_xor_si256 ( v3, v4) ;
416+
417+ let v5 = _mm256_loadu_si256 ( a. add ( len + 64 ) as * const __m256i ) ;
418+ let v6 = _mm256_loadu_si256 ( b. add ( len + 64 ) as * const __m256i ) ;
419+ let xor3 = _mm256_xor_si256 ( v5, v6) ;
420+
421+ let v7 = _mm256_loadu_si256 ( a. add ( len + 96 ) as * const __m256i ) ;
422+ let v8 = _mm256_loadu_si256 ( b. add ( len + 96 ) as * const __m256i ) ;
423+ let xor4 = _mm256_xor_si256 ( v7, v8) ;
424+
425+ let or1 = _mm256_or_si256 ( xor1, xor2) ;
426+ let or2 = _mm256_or_si256 ( xor3, xor4) ;
427+ let or_all_1 = _mm256_or_si256 ( or1, or2) ;
428+
429+ let v9 = _mm256_loadu_si256 ( a. add ( len + 128 ) as * const __m256i ) ;
430+ let v10 = _mm256_loadu_si256 ( b. add ( len + 128 ) as * const __m256i ) ;
431+ let xor5 = _mm256_xor_si256 ( v9, v10) ;
432+
433+ let v11 = _mm256_loadu_si256 ( a. add ( len + 160 ) as * const __m256i ) ;
434+ let v12 = _mm256_loadu_si256 ( b. add ( len + 160 ) as * const __m256i ) ;
435+ let xor6 = _mm256_xor_si256 ( v11, v12) ;
436+
437+ let v13 = _mm256_loadu_si256 ( a. add ( len + 192 ) as * const __m256i ) ;
438+ let v14 = _mm256_loadu_si256 ( b. add ( len + 192 ) as * const __m256i ) ;
439+ let xor7 = _mm256_xor_si256 ( v13, v14) ;
440+
441+ let v15 = _mm256_loadu_si256 ( a. add ( len + 224 ) as * const __m256i ) ;
442+ let v16 = _mm256_loadu_si256 ( b. add ( len + 224 ) as * const __m256i ) ;
443+ let xor8 = _mm256_xor_si256 ( v15, v16) ;
444+
445+ let or3 = _mm256_or_si256 ( xor5, xor6) ;
446+ let or4 = _mm256_or_si256 ( xor7, xor8) ;
447+ let or_all_2 = _mm256_or_si256 ( or3, or4) ;
448+
449+ let or_final = _mm256_or_si256 ( or_all_1, or_all_2) ;
450+
451+ if _mm256_testz_si256 ( or_final, or_final) == 1 {
452+ len += 256 ;
453+ continue ;
454+ }
455+
456+ if _mm256_testz_si256 ( or_all_1, or_all_1) == 0 {
457+ if _mm256_testz_si256 ( or1, or1) == 0 {
458+ if _mm256_testz_si256 ( xor1, xor1) == 0 {
459+ let cmp = _mm256_cmpeq_epi8 ( xor1, v_zero) ;
460+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
461+ return len + ( !mask) . trailing_zeros ( ) as usize ;
462+ } else {
463+ let cmp = _mm256_cmpeq_epi8 ( xor2, v_zero) ;
464+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
465+ return len + 32 + ( !mask) . trailing_zeros ( ) as usize ;
466+ }
467+ } else if _mm256_testz_si256 ( xor3, xor3) == 0 {
468+ let cmp = _mm256_cmpeq_epi8 ( xor3, v_zero) ;
469+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
470+ return len + 64 + ( !mask) . trailing_zeros ( ) as usize ;
471+ } else {
472+ let cmp = _mm256_cmpeq_epi8 ( xor4, v_zero) ;
473+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
474+ return len + 96 + ( !mask) . trailing_zeros ( ) as usize ;
475+ }
476+ } else {
477+ len += 128 ;
478+ if _mm256_testz_si256 ( or3, or3) == 0 {
479+ if _mm256_testz_si256 ( xor5, xor5) == 0 {
480+ let cmp = _mm256_cmpeq_epi8 ( xor5, v_zero) ;
481+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
482+ return len + ( !mask) . trailing_zeros ( ) as usize ;
483+ } else {
484+ let cmp = _mm256_cmpeq_epi8 ( xor6, v_zero) ;
485+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
486+ return len + 32 + ( !mask) . trailing_zeros ( ) as usize ;
487+ }
488+ } else if _mm256_testz_si256 ( xor7, xor7) == 0 {
489+ let cmp = _mm256_cmpeq_epi8 ( xor7, v_zero) ;
490+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
491+ return len + 64 + ( !mask) . trailing_zeros ( ) as usize ;
492+ } else {
493+ let cmp = _mm256_cmpeq_epi8 ( xor8, v_zero) ;
494+ let mask = _mm256_movemask_epi8 ( cmp) as u32 ;
495+ return len + 96 + ( !mask) . trailing_zeros ( ) as usize ;
496+ }
497+ }
498+ }
499+
408500 while len + 128 <= max_len {
409501 let v1 = _mm256_loadu_si256 ( a. add ( len) as * const __m256i ) ;
410502 let v2 = _mm256_loadu_si256 ( b. add ( len) as * const __m256i ) ;
0 commit comments