Skip to content

Commit f044cd5

Browse files
⚡ Bolt: Unroll AVX2 match length comparison to 256 bytes
Co-authored-by: 404Setup <[email protected]>
1 parent ebca788 commit f044cd5

1 file changed

Lines changed: 92 additions & 0 deletions

File tree

src/compress/matchfinder.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,98 @@ unsafe fn match_len_avx2(a: *const u8, b: *const u8, max_len: usize) -> usize {
405405

406406
let v_zero = _mm256_setzero_si256();
407407

408+
while len + 256 <= max_len {
409+
let v1 = _mm256_loadu_si256(a.add(len) as *const __m256i);
410+
let v2 = _mm256_loadu_si256(b.add(len) as *const __m256i);
411+
let xor1 = _mm256_xor_si256(v1, v2);
412+
413+
let v3 = _mm256_loadu_si256(a.add(len + 32) as *const __m256i);
414+
let v4 = _mm256_loadu_si256(b.add(len + 32) as *const __m256i);
415+
let xor2 = _mm256_xor_si256(v3, v4);
416+
417+
let v5 = _mm256_loadu_si256(a.add(len + 64) as *const __m256i);
418+
let v6 = _mm256_loadu_si256(b.add(len + 64) as *const __m256i);
419+
let xor3 = _mm256_xor_si256(v5, v6);
420+
421+
let v7 = _mm256_loadu_si256(a.add(len + 96) as *const __m256i);
422+
let v8 = _mm256_loadu_si256(b.add(len + 96) as *const __m256i);
423+
let xor4 = _mm256_xor_si256(v7, v8);
424+
425+
let or1 = _mm256_or_si256(xor1, xor2);
426+
let or2 = _mm256_or_si256(xor3, xor4);
427+
let or_all_1 = _mm256_or_si256(or1, or2);
428+
429+
let v9 = _mm256_loadu_si256(a.add(len + 128) as *const __m256i);
430+
let v10 = _mm256_loadu_si256(b.add(len + 128) as *const __m256i);
431+
let xor5 = _mm256_xor_si256(v9, v10);
432+
433+
let v11 = _mm256_loadu_si256(a.add(len + 160) as *const __m256i);
434+
let v12 = _mm256_loadu_si256(b.add(len + 160) as *const __m256i);
435+
let xor6 = _mm256_xor_si256(v11, v12);
436+
437+
let v13 = _mm256_loadu_si256(a.add(len + 192) as *const __m256i);
438+
let v14 = _mm256_loadu_si256(b.add(len + 192) as *const __m256i);
439+
let xor7 = _mm256_xor_si256(v13, v14);
440+
441+
let v15 = _mm256_loadu_si256(a.add(len + 224) as *const __m256i);
442+
let v16 = _mm256_loadu_si256(b.add(len + 224) as *const __m256i);
443+
let xor8 = _mm256_xor_si256(v15, v16);
444+
445+
let or3 = _mm256_or_si256(xor5, xor6);
446+
let or4 = _mm256_or_si256(xor7, xor8);
447+
let or_all_2 = _mm256_or_si256(or3, or4);
448+
449+
let or_final = _mm256_or_si256(or_all_1, or_all_2);
450+
451+
if _mm256_testz_si256(or_final, or_final) == 1 {
452+
len += 256;
453+
continue;
454+
}
455+
456+
if _mm256_testz_si256(or_all_1, or_all_1) == 0 {
457+
if _mm256_testz_si256(or1, or1) == 0 {
458+
if _mm256_testz_si256(xor1, xor1) == 0 {
459+
let cmp = _mm256_cmpeq_epi8(xor1, v_zero);
460+
let mask = _mm256_movemask_epi8(cmp) as u32;
461+
return len + (!mask).trailing_zeros() as usize;
462+
} else {
463+
let cmp = _mm256_cmpeq_epi8(xor2, v_zero);
464+
let mask = _mm256_movemask_epi8(cmp) as u32;
465+
return len + 32 + (!mask).trailing_zeros() as usize;
466+
}
467+
} else if _mm256_testz_si256(xor3, xor3) == 0 {
468+
let cmp = _mm256_cmpeq_epi8(xor3, v_zero);
469+
let mask = _mm256_movemask_epi8(cmp) as u32;
470+
return len + 64 + (!mask).trailing_zeros() as usize;
471+
} else {
472+
let cmp = _mm256_cmpeq_epi8(xor4, v_zero);
473+
let mask = _mm256_movemask_epi8(cmp) as u32;
474+
return len + 96 + (!mask).trailing_zeros() as usize;
475+
}
476+
} else {
477+
len += 128;
478+
if _mm256_testz_si256(or3, or3) == 0 {
479+
if _mm256_testz_si256(xor5, xor5) == 0 {
480+
let cmp = _mm256_cmpeq_epi8(xor5, v_zero);
481+
let mask = _mm256_movemask_epi8(cmp) as u32;
482+
return len + (!mask).trailing_zeros() as usize;
483+
} else {
484+
let cmp = _mm256_cmpeq_epi8(xor6, v_zero);
485+
let mask = _mm256_movemask_epi8(cmp) as u32;
486+
return len + 32 + (!mask).trailing_zeros() as usize;
487+
}
488+
} else if _mm256_testz_si256(xor7, xor7) == 0 {
489+
let cmp = _mm256_cmpeq_epi8(xor7, v_zero);
490+
let mask = _mm256_movemask_epi8(cmp) as u32;
491+
return len + 64 + (!mask).trailing_zeros() as usize;
492+
} else {
493+
let cmp = _mm256_cmpeq_epi8(xor8, v_zero);
494+
let mask = _mm256_movemask_epi8(cmp) as u32;
495+
return len + 96 + (!mask).trailing_zeros() as usize;
496+
}
497+
}
498+
}
499+
408500
while len + 128 <= max_len {
409501
let v1 = _mm256_loadu_si256(a.add(len) as *const __m256i);
410502
let v2 = _mm256_loadu_si256(b.add(len) as *const __m256i);

0 commit comments

Comments
 (0)