Skip to content

Commit 0b92618

Browse files
feat(decompress): optimize offset 13 loop unrolling
Unrolls the `decompress_offset_13` loop to 8 stores (104 bytes per iteration) instead of 4 stores (52 bytes). This reduces loop overhead for long matches and improves throughput by ~4.1%.

Benchmark (Decompress offset13/libdeflate-rs offset13):
- Before: ~8.03 GiB/s
- After: ~8.36 GiB/s
- Change: +4.1%

Co-authored-by: 404Setup <[email protected]>
1 parent 15c4d16 commit 0b92618

1 file changed

Lines changed: 17 additions & 0 deletions

File tree

src/decompress/x86.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,23 @@ decompress_offset_simple! {
963963
_mm_shuffle_epi8(v_raw, mask)
964964
},
965965
unrolled_loops: {
966+
// Unroll loop 8x for offset 13 (8 * 13 = 104 bytes per iteration).
967+
// This reduces loop overhead for long matches.
968+
// Safety: The last write is at offset 91 (7 * 13).
969+
// A 16-byte write at 91 requires 91 + 16 = 107 bytes.
970+
// The loop guard requires 120 bytes (13 more than the 107 strictly needed) to be safe and consistent with other offsets.
971+
while copied + 120 <= length {
972+
_mm_storeu_si128(out_next.add(copied) as *mut __m128i, v_pat);
973+
_mm_storeu_si128(out_next.add(copied + 13) as *mut __m128i, v_pat);
974+
_mm_storeu_si128(out_next.add(copied + 26) as *mut __m128i, v_pat);
975+
_mm_storeu_si128(out_next.add(copied + 39) as *mut __m128i, v_pat);
976+
_mm_storeu_si128(out_next.add(copied + 52) as *mut __m128i, v_pat);
977+
_mm_storeu_si128(out_next.add(copied + 65) as *mut __m128i, v_pat);
978+
_mm_storeu_si128(out_next.add(copied + 78) as *mut __m128i, v_pat);
979+
_mm_storeu_si128(out_next.add(copied + 91) as *mut __m128i, v_pat);
980+
copied += 104;
981+
}
982+
966983
while copied + 64 <= length {
967984
_mm_storeu_si128(out_next.add(copied) as *mut __m128i, v_pat);
968985
_mm_storeu_si128(out_next.add(copied + 13) as *mut __m128i, v_pat);

0 commit comments

Comments
 (0)