@@ -122,6 +122,45 @@ const OFFSET13_MASKS: [u8; 208] = [
122122 12 ,
123123] ;
124124
125+ #[ cfg( target_arch = "x86_64" ) ]
126+ #[ target_feature( enable = "avx2" ) ]
127+ unsafe fn copy_match_offset3_avx2 ( dest : * mut u8 , val : u32 , length : usize ) -> usize {
128+ let v_pat = _mm256_set1_epi32 ( val as i32 ) ;
129+ let masks_ptr = OFFSET3_MASKS . as_ptr ( ) as * const __m128i ;
130+ let m0 = _mm_loadu_si128 ( masks_ptr) ;
131+ let m1 = _mm_loadu_si128 ( masks_ptr. add ( 1 ) ) ;
132+ let m2 = _mm_loadu_si128 ( masks_ptr. add ( 2 ) ) ;
133+
134+ // Construct 256-bit shuffle masks by combining 128-bit masks.
135+ // The pattern repeats every 3 bytes.
136+ // m0 covers bytes 0..16. m1 covers 16..32. m2 covers 32..48.
137+ //
138+ // Iteration 1 (0..32): Uses [m0, m1].
139+ // Iteration 2 (32..64): Starts at offset 32 (32 % 3 = 2). Needs pattern starting with 2 (m2).
140+ // Lane 1 starts at offset 48 (48 % 3 = 0). Needs pattern starting with 0 (m0).
141+ // Uses [m2, m0].
142+ // Iteration 3 (64..96): Starts at offset 64 (64 % 3 = 1). Needs pattern starting with 1 (m1).
143+ // Lane 1 starts at offset 80 (80 % 3 = 2). Needs pattern starting with 2 (m2).
144+ // Uses [m1, m2].
145+ let mask_a = _mm256_inserti128_si256 ( _mm256_castsi128_si256 ( m0) , m1, 1 ) ;
146+ let mask_b = _mm256_inserti128_si256 ( _mm256_castsi128_si256 ( m2) , m0, 1 ) ;
147+ let mask_c = _mm256_inserti128_si256 ( _mm256_castsi128_si256 ( m1) , m2, 1 ) ;
148+
149+ let mut copied = 0 ;
150+ while copied + 96 <= length {
151+ let v_a = _mm256_shuffle_epi8 ( v_pat, mask_a) ;
152+ let v_b = _mm256_shuffle_epi8 ( v_pat, mask_b) ;
153+ let v_c = _mm256_shuffle_epi8 ( v_pat, mask_c) ;
154+
155+ _mm256_storeu_si256 ( dest. add ( copied) as * mut __m256i , v_a) ;
156+ _mm256_storeu_si256 ( dest. add ( copied + 32 ) as * mut __m256i , v_b) ;
157+ _mm256_storeu_si256 ( dest. add ( copied + 64 ) as * mut __m256i , v_c) ;
158+
159+ copied += 96 ;
160+ }
161+ copied
162+ }
163+
125164#[ cfg( target_arch = "x86_64" ) ]
126165#[ target_feature( enable = "bmi2,ssse3" ) ]
127166pub unsafe fn decompress_bmi2 (
@@ -981,12 +1020,17 @@ pub unsafe fn decompress_bmi2(
9811020 let v1 = std:: ptr:: read_unaligned ( src. add ( 1 ) as * const u16 )
9821021 as u32 ;
9831022 let val = v0 | ( v1 << 8 ) ;
1023+
1024+ let mut copied = 0 ;
1025+ if is_x86_feature_detected ! ( "avx2" ) {
1026+ copied = copy_match_offset3_avx2 ( dest_ptr, val, length) ;
1027+ }
1028+
9841029 let v_pat = _mm_cvtsi32_si128 ( val as i32 ) ;
9851030 let masks_ptr = OFFSET3_MASKS . as_ptr ( ) as * const __m128i ;
9861031 let v_base =
9871032 _mm_shuffle_epi8 ( v_pat, _mm_loadu_si128 ( masks_ptr) ) ;
9881033
989- let mut copied = 0 ;
9901034 while copied + 48 <= length {
9911035 _mm_storeu_si128 (
9921036 dest_ptr. add ( copied) as * mut __m128i ,
0 commit comments