Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions src/decompress/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,28 @@ unsafe fn decompress_offset_alignr_cycle<const SHIFT: i32>(
_mm_storeu_si128(out_next.add(copied + 64) as *mut __m128i, v_next4);
_mm_storeu_si128(out_next.add(copied + 80) as *mut __m128i, v_next5);

v_prev = v_next5;
v_align = v_next4;
copied += 96;
}
} else if SHIFT == 7 {
// Optimization for Offset 25 (Shift 7): Unroll loop to write 96 bytes (6 vectors) per iteration.
while copied + 96 <= length {
let v_next0 = _mm_alignr_epi8::<7>(v_prev, v_align);
let v_next1 = _mm_alignr_epi8::<7>(v_next0, v_prev);

let v_next2 = _mm_alignr_epi8::<14>(v_prev, v_align);
let v_next3 = _mm_alignr_epi8::<14>(v_next0, v_prev);
let v_next4 = _mm_alignr_epi8::<14>(v_next1, v_next0);
let v_next5 = _mm_alignr_epi8::<14>(v_next2, v_next1);

_mm_storeu_si128(out_next.add(copied) as *mut __m128i, v_next0);
_mm_storeu_si128(out_next.add(copied + 16) as *mut __m128i, v_next1);
_mm_storeu_si128(out_next.add(copied + 32) as *mut __m128i, v_next2);
_mm_storeu_si128(out_next.add(copied + 48) as *mut __m128i, v_next3);
_mm_storeu_si128(out_next.add(copied + 64) as *mut __m128i, v_next4);
_mm_storeu_si128(out_next.add(copied + 80) as *mut __m128i, v_next5);

v_prev = v_next5;
v_align = v_next4;
copied += 96;
Expand Down