@@ -55,6 +55,12 @@ static OFFSET3_MASKS: [u8; 48] = [
5555 2 , 0 , 1 , 2 , 0 , 1 , 2 , 0 , 1 , 2 , 0 , 1 , 2 , 0 , 1 , 2 ,
5656] ;
5757
58+ // Byte-shuffle masks for a period-6 pattern: entry i is i % 6, so every index stays within the first 6 source bytes. LCM(6, 16) = 48 bytes, i.e. 3 vectors of 16 bytes, after which the pattern realigns with a vector boundary.
59+ static OFFSET6_MASKS : [ u8 ; 48 ] = [
60+ 0 , 1 , 2 , 3 , 4 , 5 , 0 , 1 , 2 , 3 , 4 , 5 , 0 , 1 , 2 , 3 , 4 , 5 , 0 , 1 , 2 , 3 , 4 , 5 , 0 , 1 , 2 , 3 , 4 , 5 , 0 , 1 ,
61+ 2 , 3 , 4 , 5 , 0 , 1 , 2 , 3 , 4 , 5 , 0 , 1 , 2 , 3 , 4 , 5 ,
62+ ] ;
63+
5864// LCM(10, 16) = 80. 5 vectors.
5965static OFFSET10_MASKS : [ u8 ; 80 ] = [
6066 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 0 , 1 ,
@@ -989,12 +995,67 @@ pub unsafe fn decompress_bmi2(
989995 }
990996 }
991997
998+ while copied < length {
999+ * dest_ptr. add ( copied) = * src_ptr. add ( copied) ;
1000+ copied += 1 ;
1001+ }
1002+ } else if offset == 6 {
1003+ let dest_ptr = out_next;
1004+ let src_ptr = src;
1005+ let mut copied = 0 ;
1006+
1007+ if length >= 16 {
1008+ let v_src = _mm_loadu_si128 ( src_ptr as * const __m128i ) ;
1009+ let masks_ptr = OFFSET6_MASKS . as_ptr ( ) as * const __m128i ;
1010+ let v_base =
1011+ _mm_shuffle_epi8 ( v_src, _mm_loadu_si128 ( masks_ptr) ) ;
1012+
1013+ while copied + 48 <= length {
1014+ _mm_storeu_si128 (
1015+ dest_ptr. add ( copied) as * mut __m128i ,
1016+ v_base,
1017+ ) ;
1018+ _mm_storeu_si128 (
1019+ dest_ptr. add ( copied + 16 ) as * mut __m128i ,
1020+ _mm_shuffle_epi8 (
1021+ v_src,
1022+ _mm_loadu_si128 ( masks_ptr. add ( 1 ) ) ,
1023+ ) ,
1024+ ) ;
1025+ _mm_storeu_si128 (
1026+ dest_ptr. add ( copied + 32 ) as * mut __m128i ,
1027+ _mm_shuffle_epi8 (
1028+ v_src,
1029+ _mm_loadu_si128 ( masks_ptr. add ( 2 ) ) ,
1030+ ) ,
1031+ ) ;
1032+ copied += 48 ;
1033+ }
1034+
1035+ while copied + 16 <= length {
1036+ let idx = ( copied % 48 ) / 16 ;
1037+ let v = if idx == 0 {
1038+ v_base
1039+ } else {
1040+ _mm_shuffle_epi8 (
1041+ v_src,
1042+ _mm_loadu_si128 ( masks_ptr. add ( idx) ) ,
1043+ )
1044+ } ;
1045+ _mm_storeu_si128 (
1046+ dest_ptr. add ( copied) as * mut __m128i ,
1047+ v,
1048+ ) ;
1049+ copied += 16 ;
1050+ }
1051+ }
1052+
9921053 while copied < length {
9931054 * dest_ptr. add ( copied) = * src_ptr. add ( copied) ;
9941055 copied += 1 ;
9951056 }
9961057 } else {
997- // Simple loop for offsets 5, 6, 7
1058+ // Simple loop for offsets 5, 7
9981059 let mut copied = 0 ;
9991060 while copied < length {
10001061 * out_next. add ( copied) = * src. add ( copied) ;
0 commit comments