Skip to content

Commit 42e7161

Browse files
Optimize decompress_bmi2 for offset 3 using SIMD
This commit optimizes the decompression of `offset == 3` matches in `decompress_bmi2` (x86-64). Previously, this case fell back to a slow byte-by-byte scalar copy loop.

The optimization uses `_mm_shuffle_epi8` with precomputed cyclic masks (`OFFSET3_MASKS`) to construct 16-byte vectors containing the repeating 3-byte pattern from the first 16 bytes loaded from `src` (safely masking out garbage). The copy loop is unrolled to process 48 bytes (the LCM of 3 and 16) per iteration.

Performance impact:
- `Decompress offset3` throughput improved by ~540% (~1.44 GiB/s -> ~9.22 GiB/s).
- `Decompress offset3 small` throughput improved by ~1.4%.
- `Decompress offset30` throughput improved by ~4%.
- `Decompress offset31` throughput improved by ~9%.
- `Decompress offset32` throughput improved by ~1.8%.

This aligns the performance of offset 3 with other small offsets that are already optimized.

Co-authored-by: 404Setup <[email protected]>
1 parent 6502d7b commit 42e7161

2 files changed

Lines changed: 67 additions & 1 deletion

File tree

.jules/bolt.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,7 @@
2727
## 2026-06-04 - [Vector Precomputation vs Alignr Chain]
2828
**Learning:** For overlapping patterns where offset is a multiple of 8 (e.g., offset 24), breaking the `alignr` dependency chain by precomputing all vectors in the cycle (LCM of offset and vector size) allowed for effective loop unrolling. This yielded a 32% throughput improvement (7.7 GiB/s -> 10.2 GiB/s) by increasing ILP compared to the serial dependency of iterative `alignr`.
2929
**Action:** When optimizing decompression loops for specific offsets, determine if the pattern cycle is short enough to precompute fully. If so, prefer storing precomputed vectors in an unrolled loop over calculating the next vector from the previous one.
30+
31+
## 2026-06-04 - [Offset 3 Optimization]
32+
**Learning:** Decompressing `offset == 3` using a byte-by-byte scalar loop is extremely slow (~1.44 GiB/s). By using precomputed shuffle masks (3 vectors for the 48-byte cycle, LCM(3, 16)) and loading 16 bytes from `src` (safely masking out garbage), we can process 48 bytes per iteration using SIMD. This yielded a ~540% throughput improvement (~1.44 GiB/s -> ~9.22 GiB/s).
33+
**Action:** For small offsets (like 3) that don't fit power-of-2 optimizations, use `pshufb` with precomputed cyclic masks to construct the pattern vectors.

src/decompress/x86.rs

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ static OFFSET12_MASKS: [u8; 48] = [
4949
6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5050
];
5151

52+
// Shuffle masks for reconstructing an offset-3 repeating pattern with
// `_mm_shuffle_epi8`. LCM(3, 16) = 48, so three consecutive 16-byte mask
// vectors cover one full cycle; every mask byte is simply its position
// modulo 3, selecting one of the first three source bytes.
static OFFSET3_MASKS: [u8; 48] = [
    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
    1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1,
    2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2,
];
57+
5258
// LCM(10, 16) = 80. 5 vectors.
5359
static OFFSET10_MASKS: [u8; 80] = [
5460
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1,
@@ -931,8 +937,64 @@ pub unsafe fn decompress_bmi2(
931937
*out_next.add(i) = (pattern >> ((i & 7) * 8)) as u8;
932938
i += 1;
933939
}
940+
} else if offset == 3 {
941+
let dest_ptr = out_next;
942+
let src_ptr = src;
943+
let mut copied = 0;
944+
945+
if length >= 16 {
946+
let v_src = _mm_loadu_si128(src_ptr as *const __m128i);
947+
let masks_ptr =
948+
OFFSET3_MASKS.as_ptr() as *const __m128i;
949+
let v_base =
950+
_mm_shuffle_epi8(v_src, _mm_loadu_si128(masks_ptr));
951+
952+
while copied + 48 <= length {
953+
_mm_storeu_si128(
954+
dest_ptr.add(copied) as *mut __m128i,
955+
v_base,
956+
);
957+
_mm_storeu_si128(
958+
dest_ptr.add(copied + 16) as *mut __m128i,
959+
_mm_shuffle_epi8(
960+
v_src,
961+
_mm_loadu_si128(masks_ptr.add(1)),
962+
),
963+
);
964+
_mm_storeu_si128(
965+
dest_ptr.add(copied + 32) as *mut __m128i,
966+
_mm_shuffle_epi8(
967+
v_src,
968+
_mm_loadu_si128(masks_ptr.add(2)),
969+
),
970+
);
971+
copied += 48;
972+
}
973+
974+
while copied + 16 <= length {
975+
let idx = (copied % 48) / 16;
976+
let v = if idx == 0 {
977+
v_base
978+
} else {
979+
_mm_shuffle_epi8(
980+
v_src,
981+
_mm_loadu_si128(masks_ptr.add(idx)),
982+
)
983+
};
984+
_mm_storeu_si128(
985+
dest_ptr.add(copied) as *mut __m128i,
986+
v,
987+
);
988+
copied += 16;
989+
}
990+
}
991+
992+
while copied < length {
993+
*dest_ptr.add(copied) = *src_ptr.add(copied);
994+
copied += 1;
995+
}
934996
} else {
935-
// Simple loop for offsets 3, 5, 6, 7
997+
// Simple loop for offsets 5, 6, 7
936998
let mut copied = 0;
937999
while copied < length {
9381000
*out_next.add(copied) = *src.add(copied);

0 commit comments

Comments
 (0)