From 10c615bfceb8e5dfaa37d237da7b3ddd947b7bb8 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 24 Feb 2026 13:27:21 +0000
Subject: [PATCH] Optimize `decompress_offset_alignr_cycle` for Offset 25
 (Shift 7)

Optimized the decompression hot path for Offset 25 by specializing
`decompress_offset_alignr_cycle` for `SHIFT=7`. The loop was unrolled to a
stride of 96 bytes (6 vectors).

The serial dependency chain of `alignr` instructions was shortened by
computing `v_next2` through `v_next5` with the accumulated shift constant
14 (e.g., `v_next2` is derived from `v_prev` and `v_align` directly
instead of chaining through `v_next1`). This caps the critical path at
three dependent `alignr` instructions instead of six and increases
instruction-level parallelism.

Performance Impact:
- Throughput for `Decompress offset25` improved by ~1.4%
  (from ~10.07 GiB/s to ~10.23 GiB/s).
- Verified correctness with `cargo test`.

Co-authored-by: 404Setup <153366651+404Setup@users.noreply.github.com>
---
 src/decompress/x86.rs | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/decompress/x86.rs b/src/decompress/x86.rs
index 2529aa5..5ace5cc 100644
--- a/src/decompress/x86.rs
+++ b/src/decompress/x86.rs
@@ -442,6 +442,28 @@ unsafe fn decompress_offset_alignr_cycle(
             _mm_storeu_si128(out_next.add(copied + 64) as *mut __m128i, v_next4);
             _mm_storeu_si128(out_next.add(copied + 80) as *mut __m128i, v_next5);
 
+            v_prev = v_next5;
+            v_align = v_next4;
+            copied += 96;
+        }
+    } else if SHIFT == 7 {
+        // Optimization for Offset 25 (Shift 7): Unroll loop to write 96 bytes (6 vectors) per iteration.
+        while copied + 96 <= length {
+            let v_next0 = _mm_alignr_epi8::<7>(v_prev, v_align);
+            let v_next1 = _mm_alignr_epi8::<7>(v_next0, v_prev);
+
+            let v_next2 = _mm_alignr_epi8::<14>(v_prev, v_align);
+            let v_next3 = _mm_alignr_epi8::<14>(v_next0, v_prev);
+            let v_next4 = _mm_alignr_epi8::<14>(v_next1, v_next0);
+            let v_next5 = _mm_alignr_epi8::<14>(v_next2, v_next1);
+
+            _mm_storeu_si128(out_next.add(copied) as *mut __m128i, v_next0);
+            _mm_storeu_si128(out_next.add(copied + 16) as *mut __m128i, v_next1);
+            _mm_storeu_si128(out_next.add(copied + 32) as *mut __m128i, v_next2);
+            _mm_storeu_si128(out_next.add(copied + 48) as *mut __m128i, v_next3);
+            _mm_storeu_si128(out_next.add(copied + 64) as *mut __m128i, v_next4);
+            _mm_storeu_si128(out_next.add(copied + 80) as *mut __m128i, v_next5);
+
             v_prev = v_next5;
             v_align = v_next4;
             copied += 96;
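
Note on why the accumulated shifts are legal: `alignr` telescopes. If
v_next0 = alignr<7>(v_prev, v_align) and v_next1 = alignr<7>(v_next0, v_prev),
then alignr<7>(v_next1, v_next0) == alignr<14>(v_prev, v_align) for arbitrary
inputs, because the two 7-byte shifts accumulate and 14 <= 16 keeps the result
inside the 32-byte concatenation window. Below is a minimal, portable sketch
of that identity using a scalar stand-in for `_mm_alignr_epi8`; the helper,
`main`, and the byte values are illustrative assumptions, not code from
src/decompress/x86.rs:

    /// Scalar model of `_mm_alignr_epi8::<N>(hi, lo)`: concatenate hi:lo
    /// (lo in bytes 0..16, hi in bytes 16..32), shift the 32-byte temporary
    /// right by N bytes, and keep the low 16 bytes.
    fn alignr<const N: usize>(hi: [u8; 16], lo: [u8; 16]) -> [u8; 16] {
        let mut concat = [0u8; 32];
        concat[..16].copy_from_slice(&lo);
        concat[16..].copy_from_slice(&hi);
        let mut out = [0u8; 16];
        out.copy_from_slice(&concat[N..N + 16]);
        out
    }

    fn main() {
        // Arbitrary distinct bytes; the identity does not depend on the data.
        let mut v_align = [0u8; 16];
        let mut v_prev = [0u8; 16];
        for i in 0..16 {
            v_align[i] = i as u8;
            v_prev[i] = 16 + i as u8;
        }

        // Serial chain: each alignr waits on the previous result (depth 3).
        let v_next0 = alignr::<7>(v_prev, v_align);
        let v_next1 = alignr::<7>(v_next0, v_prev);
        let chained = alignr::<7>(v_next1, v_next0);

        // Accumulated shift (7 + 7 = 14) on the original inputs (depth 1):
        // no dependency on v_next0/v_next1, so it can issue alongside them.
        let accumulated = alignr::<14>(v_prev, v_align);

        assert_eq!(chained, accumulated);
    }

The same telescoping step justifies v_next3 = alignr<14>(v_next0, v_prev),
v_next4 = alignr<14>(v_next1, v_next0), and v_next5 = alignr<14>(v_next2,
v_next1): each replaces two chained 7-byte shifts with one 14-byte shift on
vectors computed two steps earlier, which is what caps the critical path at
three `alignr` instructions.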