Skip to content

Commit 288d3a7

Browse files
authored
Merge pull request #227 from 404Setup/bolt-optimize-decompress-offset3-3204733861974539570
⚡ Bolt: Optimize offset 3 decompression with AVX2
2 parents b75332a + 20c8d07 commit 288d3a7

1 file changed

Lines changed: 45 additions & 1 deletion

File tree

src/decompress/x86.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,45 @@ const OFFSET13_MASKS: [u8; 208] = [
122122
12,
123123
];
124124

125+
#[cfg(target_arch = "x86_64")]
126+
#[target_feature(enable = "avx2")]
127+
unsafe fn copy_match_offset3_avx2(dest: *mut u8, val: u32, length: usize) -> usize {
128+
let v_pat = _mm256_set1_epi32(val as i32);
129+
let masks_ptr = OFFSET3_MASKS.as_ptr() as *const __m128i;
130+
let m0 = _mm_loadu_si128(masks_ptr);
131+
let m1 = _mm_loadu_si128(masks_ptr.add(1));
132+
let m2 = _mm_loadu_si128(masks_ptr.add(2));
133+
134+
// Construct 256-bit shuffle masks by combining 128-bit masks.
135+
// The pattern repeats every 3 bytes.
136+
// m0 covers bytes 0..16. m1 covers 16..32. m2 covers 32..48.
137+
//
138+
// Iteration 1 (0..32): Uses [m0, m1].
139+
// Iteration 2 (32..64): Starts at offset 32 (32 % 3 = 2). Needs pattern starting with 2 (m2).
140+
// Lane 1 starts at offset 48 (48 % 3 = 0). Needs pattern starting with 0 (m0).
141+
// Uses [m2, m0].
142+
// Iteration 3 (64..96): Starts at offset 64 (64 % 3 = 1). Needs pattern starting with 1 (m1).
143+
// Lane 1 starts at offset 80 (80 % 3 = 2). Needs pattern starting with 2 (m2).
144+
// Uses [m1, m2].
145+
let mask_a = _mm256_inserti128_si256(_mm256_castsi128_si256(m0), m1, 1);
146+
let mask_b = _mm256_inserti128_si256(_mm256_castsi128_si256(m2), m0, 1);
147+
let mask_c = _mm256_inserti128_si256(_mm256_castsi128_si256(m1), m2, 1);
148+
149+
let mut copied = 0;
150+
while copied + 96 <= length {
151+
let v_a = _mm256_shuffle_epi8(v_pat, mask_a);
152+
let v_b = _mm256_shuffle_epi8(v_pat, mask_b);
153+
let v_c = _mm256_shuffle_epi8(v_pat, mask_c);
154+
155+
_mm256_storeu_si256(dest.add(copied) as *mut __m256i, v_a);
156+
_mm256_storeu_si256(dest.add(copied + 32) as *mut __m256i, v_b);
157+
_mm256_storeu_si256(dest.add(copied + 64) as *mut __m256i, v_c);
158+
159+
copied += 96;
160+
}
161+
copied
162+
}
163+
125164
#[cfg(target_arch = "x86_64")]
126165
#[target_feature(enable = "bmi2,ssse3")]
127166
pub unsafe fn decompress_bmi2(
@@ -981,12 +1020,17 @@ pub unsafe fn decompress_bmi2(
9811020
let v1 = std::ptr::read_unaligned(src.add(1) as *const u16)
9821021
as u32;
9831022
let val = v0 | (v1 << 8);
1023+
1024+
let mut copied = 0;
1025+
if is_x86_feature_detected!("avx2") {
1026+
copied = copy_match_offset3_avx2(dest_ptr, val, length);
1027+
}
1028+
9841029
let v_pat = _mm_cvtsi32_si128(val as i32);
9851030
let masks_ptr = OFFSET3_MASKS.as_ptr() as *const __m128i;
9861031
let v_base =
9871032
_mm_shuffle_epi8(v_pat, _mm_loadu_si128(masks_ptr));
9881033

989-
let mut copied = 0;
9901034
while copied + 48 <= length {
9911035
_mm_storeu_si128(
9921036
dest_ptr.add(copied) as *mut __m128i,

0 commit comments

Comments
 (0)