Skip to content

Commit 183fc18

Browse files
Optimize decompress_offset_alignr_cycle for Offset 25 (Shift 7) (#381)
Optimized the decompression hot path for Offset 25 by specializing `decompress_offset_alignr_cycle` for `SHIFT=7`. The loop was unrolled to a stride of 96 bytes (6 vectors). The serial dependency chain of `alignr` instructions was shortened by computing vectors `v_next2`, `v_next4`, and `v_next5` with accumulated shift constants (e.g., applying shift 14 to `v_prev` and `v_align` directly instead of depending on `v_next1`). This reduces the dependency depth and increases instruction-level parallelism.

Performance impact:
- Throughput for `Decompress offset25` improved by ~1.4% (from ~10.07 GiB/s to ~10.23 GiB/s).
- Correctness verified with `cargo test`.

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 3bd7ff7 commit 183fc18

1 file changed

Lines changed: 22 additions & 0 deletions

File tree

src/decompress/x86.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,28 @@ unsafe fn decompress_offset_alignr_cycle<const SHIFT: i32>(
442442
_mm_storeu_si128(out_next.add(copied + 64) as *mut __m128i, v_next4);
443443
_mm_storeu_si128(out_next.add(copied + 80) as *mut __m128i, v_next5);
444444

445+
v_prev = v_next5;
446+
v_align = v_next4;
447+
copied += 96;
448+
}
449+
} else if SHIFT == 7 {
450+
// Optimization for Offset 25 (Shift 7): Unroll loop to write 96 bytes (6 vectors) per iteration.
451+
while copied + 96 <= length {
452+
let v_next0 = _mm_alignr_epi8::<7>(v_prev, v_align);
453+
let v_next1 = _mm_alignr_epi8::<7>(v_next0, v_prev);
454+
455+
let v_next2 = _mm_alignr_epi8::<14>(v_prev, v_align);
456+
let v_next3 = _mm_alignr_epi8::<14>(v_next0, v_prev);
457+
let v_next4 = _mm_alignr_epi8::<14>(v_next1, v_next0);
458+
let v_next5 = _mm_alignr_epi8::<14>(v_next2, v_next1);
459+
460+
_mm_storeu_si128(out_next.add(copied) as *mut __m128i, v_next0);
461+
_mm_storeu_si128(out_next.add(copied + 16) as *mut __m128i, v_next1);
462+
_mm_storeu_si128(out_next.add(copied + 32) as *mut __m128i, v_next2);
463+
_mm_storeu_si128(out_next.add(copied + 48) as *mut __m128i, v_next3);
464+
_mm_storeu_si128(out_next.add(copied + 64) as *mut __m128i, v_next4);
465+
_mm_storeu_si128(out_next.add(copied + 80) as *mut __m128i, v_next5);
466+
445467
v_prev = v_next5;
446468
v_align = v_next4;
447469
copied += 96;

0 commit comments

Comments (0)