Skip to content

Commit 2e57be5

Browse files
perf: Optimize MatchFinder::skip_positions loop unrolling (#410)
Refactored the `skip_positions` method to use an `#[inline(always)]` generic helper method, `skip_positions_generic<M: MatchLen>`. This abstraction lets the Rust compiler monomorphize the loop per SIMD strategy (e.g., `Sse2Strategy`, `NeonStrategy`), enabling it to fully inline the inner `advance_one_byte_generic` call and unroll the loop effectively. Also includes minor logic cleanups flagged by `cargo clippy`. Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 889f910 commit 2e57be5

2 files changed

Lines changed: 46 additions & 78 deletions

File tree

src/compress/matchfinder.rs

Lines changed: 41 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -1605,98 +1605,72 @@ impl BtMatchFinder {
16051605
}
16061606
}
16071607

1608-
pub fn skip_positions(
1608+
#[inline(always)]
1609+
unsafe fn skip_positions_generic<M: MatchLen>(
16091610
&mut self,
16101611
data: &[u8],
16111612
mut pos: usize,
16121613
count: usize,
16131614
max_depth: usize,
16141615
nice_len: usize,
1616+
) {
1617+
for _ in 0..count {
1618+
self.advance_one_byte_generic::<M, _>(
1619+
data,
1620+
pos,
1621+
DEFLATE_MAX_MATCH_LEN,
1622+
nice_len,
1623+
max_depth,
1624+
NoOpVisitor,
1625+
);
1626+
pos += 1;
1627+
}
1628+
}
1629+
1630+
pub fn skip_positions(
1631+
&mut self,
1632+
data: &[u8],
1633+
pos: usize,
1634+
count: usize,
1635+
max_depth: usize,
1636+
nice_len: usize,
16151637
) {
16161638
unsafe {
16171639
match self.match_len {
16181640
MatchLenStrategy::Scalar => {
1619-
for _ in 0..count {
1620-
self.advance_one_byte_generic::<ScalarStrategy, _>(
1621-
data,
1622-
pos,
1623-
DEFLATE_MAX_MATCH_LEN,
1624-
nice_len,
1625-
max_depth,
1626-
NoOpVisitor,
1627-
);
1628-
pos += 1;
1629-
}
1641+
self.skip_positions_generic::<ScalarStrategy>(
1642+
data, pos, count, max_depth, nice_len,
1643+
);
16301644
}
16311645
#[cfg(target_arch = "x86_64")]
16321646
MatchLenStrategy::Sse2 => {
1633-
for _ in 0..count {
1634-
self.advance_one_byte_generic::<Sse2Strategy, _>(
1635-
data,
1636-
pos,
1637-
DEFLATE_MAX_MATCH_LEN,
1638-
nice_len,
1639-
max_depth,
1640-
NoOpVisitor,
1641-
);
1642-
pos += 1;
1643-
}
1647+
self.skip_positions_generic::<Sse2Strategy>(
1648+
data, pos, count, max_depth, nice_len,
1649+
);
16441650
}
16451651
#[cfg(target_arch = "x86_64")]
16461652
MatchLenStrategy::Avx2 => {
1647-
for _ in 0..count {
1648-
self.advance_one_byte_generic::<Avx2Strategy, _>(
1649-
data,
1650-
pos,
1651-
DEFLATE_MAX_MATCH_LEN,
1652-
nice_len,
1653-
max_depth,
1654-
NoOpVisitor,
1655-
);
1656-
pos += 1;
1657-
}
1653+
self.skip_positions_generic::<Avx2Strategy>(
1654+
data, pos, count, max_depth, nice_len,
1655+
);
16581656
}
16591657
#[cfg(target_arch = "x86_64")]
16601658
MatchLenStrategy::Avx512 => {
1661-
for _ in 0..count {
1662-
self.advance_one_byte_generic::<Avx512Strategy, _>(
1663-
data,
1664-
pos,
1665-
DEFLATE_MAX_MATCH_LEN,
1666-
nice_len,
1667-
max_depth,
1668-
NoOpVisitor,
1669-
);
1670-
pos += 1;
1671-
}
1659+
self.skip_positions_generic::<Avx512Strategy>(
1660+
data, pos, count, max_depth, nice_len,
1661+
);
16721662
}
16731663
#[cfg(target_arch = "x86_64")]
16741664
MatchLenStrategy::Avx10 => {
1675-
for _ in 0..count {
1676-
self.advance_one_byte_generic::<Avx10Strategy, _>(
1677-
data,
1678-
pos,
1679-
DEFLATE_MAX_MATCH_LEN,
1680-
nice_len,
1681-
max_depth,
1682-
NoOpVisitor,
1683-
);
1684-
pos += 1;
1685-
}
1665+
self.skip_positions_generic::<Avx10Strategy>(
1666+
data, pos, count, max_depth, nice_len,
1667+
);
16861668
}
16871669
#[cfg(target_arch = "aarch64")]
16881670
MatchLenStrategy::Neon => {
1689-
for _ in 0..count {
1690-
self.advance_one_byte_generic::<NeonStrategy, _>(
1691-
data,
1692-
pos,
1693-
DEFLATE_MAX_MATCH_LEN,
1694-
nice_len,
1695-
max_depth,
1696-
NoOpVisitor,
1697-
);
1698-
pos += 1;
1699-
}
1671+
self.skip_positions_generic::<NeonStrategy>(
1672+
data, pos, count, max_depth, nice_len,
1673+
);
17001674
}
17011675
}
17021676
}

src/compress/mod.rs

Lines changed: 5 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1916,13 +1916,7 @@ impl Compressor {
19161916
}
19171917

19181918
#[inline(always)]
1919-
unsafe fn get_literals_4_code(
1920-
&self,
1921-
lit1: u8,
1922-
lit2: u8,
1923-
lit3: u8,
1924-
lit4: u8,
1925-
) -> (u64, u32) {
1919+
unsafe fn get_literals_4_code(&self, lit1: u8, lit2: u8, lit3: u8, lit4: u8) -> (u64, u32) {
19261920
let entry1 = *self.litlen_table.get_unchecked(lit1 as usize);
19271921
let entry2 = *self.litlen_table.get_unchecked(lit2 as usize);
19281922
let entry3 = *self.litlen_table.get_unchecked(lit3 as usize);
@@ -2013,7 +2007,7 @@ impl Compressor {
20132007
);
20142008
let new_bitcount = bitcount + len;
20152009
if new_bitcount >= 32 {
2016-
let buf = bitbuf | ((code as u64) << bitcount);
2010+
let buf = bitbuf | (code << bitcount);
20172011
std::ptr::write_unaligned(
20182012
out_ptr.add(out_idx) as *mut u32,
20192013
(buf as u32).to_le(),
@@ -2022,7 +2016,7 @@ impl Compressor {
20222016
bitbuf = buf >> 32;
20232017
bitcount = new_bitcount - 32;
20242018
} else {
2025-
bitbuf |= (code as u64) << bitcount;
2019+
bitbuf |= code << bitcount;
20262020
bitcount = new_bitcount;
20272021
}
20282022
}
@@ -2034,7 +2028,7 @@ impl Compressor {
20342028
let (code, len) = self.get_literal_code(*input.get_unchecked(in_pos));
20352029
let new_bitcount = bitcount + len;
20362030
if new_bitcount >= 32 {
2037-
let buf = bitbuf | ((code as u64) << bitcount);
2031+
let buf = bitbuf | (code << bitcount);
20382032
std::ptr::write_unaligned(
20392033
out_ptr.add(out_idx) as *mut u32,
20402034
(buf as u32).to_le(),
@@ -2043,7 +2037,7 @@ impl Compressor {
20432037
bitbuf = buf >> 32;
20442038
bitcount = new_bitcount - 32;
20452039
} else {
2046-
bitbuf |= (code as u64) << bitcount;
2040+
bitbuf |= code << bitcount;
20472041
bitcount = new_bitcount;
20482042
}
20492043
}

0 commit comments

Comments (0)