
Commit 9f24881

clean code

1 parent abeb9fd commit 9f24881

16 files changed: 32 additions & 597 deletions


src/adler32/x86.rs

Lines changed: 3 additions & 42 deletions
@@ -38,16 +38,8 @@ macro_rules! adler32_chunk8 {
     };
 }
 
-// Optimization: Manual loop unrolling for Adler32 tail processing.
-// This processes 4 bytes per iteration to reduce loop overhead and improve instruction pipelining.
-// It is significantly faster than a scalar iterator loop for small tails (e.g., 15-63 bytes).
-//
-// Safety:
-// * `$ptr` must be valid for reads of `$len` bytes.
-// * `$s1` and `$s2` must not overflow u32 before modulo (guaranteed by BLOCK_SIZE check in caller).
 macro_rules! adler32_tail {
     ($s1:expr, $s2:expr, $ptr:expr, $len:expr) => {
-        // We know len < 16 here because larger chunks are handled by SIMD or unrolled loops before calling this macro.
         if $len > 0 {
             if $len >= 8 {
                 adler32_chunk8!($s1, $s2, $ptr, $len);
@@ -66,7 +58,6 @@ macro_rules! adler32_tail {
                 $len -= 4;
             }
 
-            // Remaining 0-3 bytes.
             match $len {
                 3 => {
                     let v = ($ptr as *const u16).read_unaligned() as u32;
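Editor's note: the comments stripped from `adler32_tail` above were the only place its contract was written down (the caller guarantees `$len < 16` and that `$s1`/`$s2` cannot overflow a `u32` before the modulo, via its block-size check). For readers of the remaining code, a minimal scalar Adler-32 that all of these SIMD and tail paths must agree with looks roughly like the sketch below; `MOD_ADLER`, `NMAX`, and the function name are illustrative, not identifiers from this crate.

// Scalar reference: s1 is the running byte sum, s2 the running sum of s1
// after every byte; both are reduced modulo 65521.
const MOD_ADLER: u32 = 65_521;
// Largest block for which s2 cannot overflow a u32 before reduction
// (the same invariant the removed "Safety" comment relied on).
const NMAX: usize = 5552;

fn adler32_scalar(adler: u32, data: &[u8]) -> u32 {
    let mut s1 = adler & 0xffff;
    let mut s2 = adler >> 16;
    for block in data.chunks(NMAX) {
        for &b in block {
            s1 += b as u32;
            s2 += s1;
        }
        s1 %= MOD_ADLER;
        s2 %= MOD_ADLER;
    }
    (s2 << 16) | s1
}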
@@ -120,8 +111,7 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
     let mut v_byte_sums_d = _mm_setzero_si128();
 
     let mut chunk_n = n;
-    // Optimization: Unroll loop to process 64 bytes per iteration (two 32-byte chunks).
-    // This amortizes loop overhead and allows better pipelining of the `sad` and `unpack` operations.
+
     while chunk_n >= 128 {
         let data_a_1 = _mm_loadu_si128(data.as_ptr() as *const __m128i);
         let data_b_1 = _mm_loadu_si128(data.as_ptr().add(16) as *const __m128i);
@@ -132,7 +122,6 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
         let data_a_4 = _mm_loadu_si128(data.as_ptr().add(96) as *const __m128i);
         let data_b_4 = _mm_loadu_si128(data.as_ptr().add(112) as *const __m128i);
 
-        // Accumulate byte sums
         v_byte_sums_a = _mm_add_epi16(v_byte_sums_a, _mm_unpacklo_epi8(data_a_1, v_zero));
         v_byte_sums_b = _mm_add_epi16(v_byte_sums_b, _mm_unpackhi_epi8(data_a_1, v_zero));
         v_byte_sums_c = _mm_add_epi16(v_byte_sums_c, _mm_unpacklo_epi8(data_b_1, v_zero));
@@ -153,7 +142,6 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
         v_byte_sums_c = _mm_add_epi16(v_byte_sums_c, _mm_unpacklo_epi8(data_b_4, v_zero));
         v_byte_sums_d = _mm_add_epi16(v_byte_sums_d, _mm_unpackhi_epi8(data_b_4, v_zero));
 
-        // SAD calculation
         let sad_1 = _mm_add_epi32(
             _mm_sad_epu8(data_a_1, v_zero),
             _mm_sad_epu8(data_b_1, v_zero),
@@ -171,16 +159,13 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
             _mm_sad_epu8(data_b_4, v_zero),
         );
 
-        // Update v_s1_sums
-        // v_s1_sums += 4 * v_s1 (initial) + 3*sad_1 + 2*sad_2 + 1*sad_3
         let s1_x4 = _mm_slli_epi32(v_s1, 2);
         let inc_1 = _mm_add_epi32(
-            _mm_add_epi32(sad_1, _mm_add_epi32(sad_1, sad_1)), // 3*sad_1
-            _mm_add_epi32(_mm_add_epi32(sad_2, sad_2), sad_3), // 2*sad_2 + sad_3
+            _mm_add_epi32(sad_1, _mm_add_epi32(sad_1, sad_1)),
+            _mm_add_epi32(_mm_add_epi32(sad_2, sad_2), sad_3),
         );
         v_s1_sums = _mm_add_epi32(v_s1_sums, _mm_add_epi32(s1_x4, inc_1));
 
-        // Update v_s1
         let total_sad = _mm_add_epi32(_mm_add_epi32(sad_1, sad_2), _mm_add_epi32(sad_3, sad_4));
         v_s1 = _mm_add_epi32(v_s1, total_sad);
 
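Editor's note: the `v_s1_sums` comment deleted in this hunk carried the only derivation of the 4/3/2/1 weighting. When four 32-byte blocks are folded in per iteration, the s1 value at the start of block k is added into s2 once for every block that still follows, which is exactly `4*v_s1 + 3*sad_1 + 2*sad_2 + 1*sad_3`. A scalar model of that bookkeeping (function name is illustrative):

// Model of the s1-side bookkeeping for one unrolled SSE2 iteration.
// sads[k] is the plain byte sum of 32-byte block k, i.e. what
// _mm_sad_epu8 against zero produces.
fn sse2_s1_bookkeeping(s1: &mut u32, s1_sums: &mut u32, sads: [u32; 4]) {
    let [sad_1, sad_2, sad_3, sad_4] = sads;
    // s1 at loop entry precedes all four blocks (weight 4); sad_1 is folded
    // into s1 before blocks 2..4 (weight 3), and so on down to sad_4 (weight 0).
    *s1_sums += 4 * *s1 + 3 * sad_1 + 2 * sad_2 + sad_3;
    *s1 += sad_1 + sad_2 + sad_3 + sad_4;
}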

@@ -321,7 +306,6 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
         }
     }
 
-    // Optimization: Hoist vector constants out of the main loop to avoid redundant loads.
     let weights = _mm256_set_epi8(
         1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
         26, 27, 28, 29, 30, 31, 32,
@@ -346,8 +330,6 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
     let mut v_s2_c = _mm256_setzero_si256();
     let mut v_s2_d = _mm256_setzero_si256();
 
-    // Optimization: For chunks >= 256 bytes, use an unrolled loop with 8 independent accumulators.
-    // This increases instruction-level parallelism to hide the latency of multiply-adds.
     if chunk_n >= 256 {
         let mut v_s2_e = _mm256_setzero_si256();
         let mut v_s2_f = _mm256_setzero_si256();
@@ -421,9 +403,6 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
             let s_h = _mm256_madd_epi16(p8, ones_i16);
             v_s2_h = _mm256_add_epi32(v_s2_h, s_h);
 
-            // Update v_s1 and v_inc accumulators
-            // For the first 128 bytes: v_s1 contributes to the next 128 bytes as well.
-            // v_s1_acc accumulates v_s1 twice (once for first 128, once for second).
             v_s1_acc = _mm256_add_epi32(v_s1_acc, v_s1);
             v_inc_acc_a = _mm256_add_epi32(v_inc_acc_a, inc_part_1);
             v_s1 = _mm256_add_epi32(v_s1, sum_sads_1);
@@ -503,24 +482,15 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
         let data_a = _mm256_loadu_si256(ptr as *const __m256i);
         let data_b = _mm256_loadu_si256(ptr.add(32) as *const __m256i);
 
-        // Optimization: Parallelize SAD calculation and s1/s2 updates to reduce dependency chains.
-        // By computing sad_a and sad_b in parallel, we can accumulate s1 sums and s1 in larger steps.
         let sad_a = _mm256_sad_epu8(data_a, v_zero);
         let sad_b = _mm256_sad_epu8(data_b, v_zero);
 
-        // Update v_s1_sums:
-        // The contribution of the current v_s1 to the sums over the next 64 bytes is:
-        // - For the first 32 bytes: v_s1 * 32
-        // - For the second 32 bytes: (v_s1 + sad_a) * 32
-        // Total: v_s1 * 64 + sad_a * 32
         let v_s1_x64 = _mm256_slli_epi32(v_s1, 6);
         let sad_a_x32 = _mm256_slli_epi32(sad_a, 5);
         v_s1_sums = _mm256_add_epi32(v_s1_sums, _mm256_add_epi32(v_s1_x64, sad_a_x32));
 
-        // Update v_s1: v_s1 += sad_a + sad_b
         v_s1 = _mm256_add_epi32(v_s1, _mm256_add_epi32(sad_a, sad_b));
 
-        // Update v_s2: Calculate partial s2 contributions in parallel
         let p1 = _mm256_maddubs_epi16(data_a, weights);
         let s_a = _mm256_madd_epi16(p1, ones_i16);
         let p2 = _mm256_maddubs_epi16(data_b, weights);
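Editor's note: the longest block of comments removed here derived the `v_s1_x64` / `sad_a_x32` step. In scalar terms, over the next 64 bytes the entering s1 counts 32 times for the first half and another 32 times (together with `sad_a`, the byte sum of that first half) for the second, so the pre-scaled contribution is `s1 * 64 + sad_a * 32`, done with shifts in the vector code. A minimal model under those assumptions (function name is illustrative):

// Model of the s1-side bookkeeping for one 64-byte AVX2 iteration.
// sad_a and sad_b are the plain byte sums of the two 32-byte halves.
fn avx2_s1_bookkeeping(s1: &mut u32, s1_sums: &mut u32, sad_a: u32, sad_b: u32) {
    // First half: s1 * 32; second half: (s1 + sad_a) * 32.
    *s1_sums += (*s1 << 6) + (sad_a << 5);
    *s1 += sad_a + sad_b;
}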
@@ -538,16 +508,12 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
     while chunk_n >= 32 {
         let d = _mm256_loadu_si256(ptr as *const __m256i);
 
-        // Update v_s1_sums: v_s1 contributes to next 32 bytes.
-        // v_s1_sums += v_s1 * 32
         let v_s1_x32 = _mm256_slli_epi32(v_s1, 5);
         v_s1_sums = _mm256_add_epi32(v_s1_sums, v_s1_x32);
 
-        // Update v_s1: v_s1 += sad
         let sad = _mm256_sad_epu8(d, v_zero);
         v_s1 = _mm256_add_epi32(v_s1, sad);
 
-        // Update v_s2: v_s2 += weighted_sum
         let p = _mm256_maddubs_epi16(d, weights);
         let s = _mm256_madd_epi16(p, ones_i16);
         v_s2 = _mm256_add_epi32(v_s2, s);
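Editor's note: the `// Update v_s2` comments dropped from this loop described the one non-obvious intrinsic pairing left in it. `_mm256_maddubs_epi16(d, weights)` multiplies each unsigned data byte by its signed weight (32 for the first byte of the chunk down to 1 for the last) and sums adjacent pairs into i16 lanes, and `_mm256_madd_epi16(p, ones_i16)` widens those pairs into i32 partial sums. Per 32-byte chunk the pair therefore computes the weighted sum sketched below, which is the per-chunk term s2 needs on top of `32 * s1`; this is a scalar model, not code from the crate.

// Scalar model of the maddubs/madd pair used for the s2 update:
// earlier bytes in the chunk carry larger weights.
fn weighted_chunk_sum(chunk: &[u8; 32]) -> u32 {
    chunk
        .iter()
        .enumerate()
        .map(|(i, &b)| (32 - i as u32) * u32::from(b))
        .sum()
}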
@@ -648,9 +614,6 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
 
     let mut chunk_n = n;
 
-    // Optimization: For chunks >= 256 bytes, use an unrolled loop with 8 independent accumulators.
-    // This increases instruction-level parallelism to hide the latency of `vpdpbusd` (5 cycles on Golden Cove).
-    // We merge the global `v_s2` into `v_s2_a` to save a register, keeping total usage within AVX2 limits (16 YMMs).
     if chunk_n >= 256 {
         let mut ptr = data.as_ptr();
         let mut v_s2_a = v_s2;
@@ -663,7 +626,6 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
         let mut v_s2_h = _mm256_setzero_si256();
 
         while chunk_n >= 256 {
-            // Block 1 (0..128)
             let d1 = _mm256_loadu_si256(ptr as *const __m256i);
             v_s2_a = _mm256_dpbusd_avx_epi32(v_s2_a, d1, mults);
             let u1 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), d1, ones);
@@ -692,7 +654,6 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
             v_s1_sums = _mm256_add_epi32(v_s1_sums, inc_a);
             v_s1 = _mm256_add_epi32(v_s1, total_u_a);
 
-            // Block 2 (128..256)
             let d5 = _mm256_loadu_si256(ptr.add(128) as *const __m256i);
             v_s2_e = _mm256_dpbusd_avx_epi32(v_s2_e, d5, mults);
             let u5 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), d5, ones);
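Editor's note: the rationale removed at the top of this function (eight independent accumulators to hide `vpdpbusd` latency, with the global `v_s2` merged into `v_s2_a` to stay inside the 16 YMM registers available to AVX2 code) no longer appears anywhere in the source. Functionally, each `_mm256_dpbusd_avx_epi32(acc, d, m)` behaves per 32-bit lane like the scalar sketch below; with `m = ones` it degenerates to a plain byte sum, which is how the `u1`/`u5` values are produced.

// Scalar model of one 32-bit lane of vpdpbusd: a u8 x i8 dot product over
// four byte pairs, accumulated into the i32 lane.
fn dpbusd_lane(acc: i32, data: [u8; 4], mult: [i8; 4]) -> i32 {
    acc + data
        .iter()
        .zip(mult.iter())
        .map(|(&d, &m)| i32::from(d) * i32::from(m))
        .sum::<i32>()
}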

src/api.rs

Lines changed: 1 addition & 9 deletions
@@ -79,9 +79,6 @@ impl Compressor {
         let mut output = Vec::new();
         output.try_reserve_exact(bound).map_err(io::Error::other)?;
 
-        // Use spare_capacity_mut to avoid zero-initialization.
-        // Since len is 0, this returns the entire capacity as MaybeUninit.
-        // We slice it to 'bound' to match the requested size.
         let out_uninit = output.spare_capacity_mut();
         let out_uninit = &mut out_uninit[..bound];
 
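Editor's note: the deleted comments documented why the buffer is not zero-filled: on a zero-length `Vec`, `spare_capacity_mut()` returns the entire reserved capacity as `&mut [MaybeUninit<u8>]`, which is then sliced to `bound` and handed to the backend. A self-contained sketch of the pattern, assuming a hypothetical `fill_buffer` writer in place of the real compression call (the matching `set_len` lives outside this hunk):

use std::io;
use std::mem::MaybeUninit;

// Sketch: write into reserved-but-uninitialized capacity instead of
// zero-initializing `bound` bytes up front.
fn write_without_zeroing(bound: usize) -> io::Result<Vec<u8>> {
    let mut output: Vec<u8> = Vec::new();
    output.try_reserve_exact(bound).map_err(io::Error::other)?;

    // len() == 0, so this is the whole reserved capacity as MaybeUninit.
    let spare = output.spare_capacity_mut();
    let out_uninit = &mut spare[..bound];

    let written = fill_buffer(out_uninit);

    // Safety: the first `written` bytes were initialized by fill_buffer.
    unsafe { output.set_len(written) };
    Ok(output)
}

// Stand-in for the compressor writing into the buffer.
fn fill_buffer(buf: &mut [MaybeUninit<u8>]) -> usize {
    for (i, slot) in buf.iter_mut().enumerate() {
        slot.write(i as u8);
    }
    buf.len()
}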

@@ -210,9 +207,6 @@ impl Decompressor {
             &mut [std::mem::MaybeUninit<u8>],
         ) -> (crate::decompress::DecompressResult, usize, usize),
     {
-        // Security check: prevent massive allocations for small inputs (Zip bomb prevention)
-        // Max compression ratio for Deflate is ~1032:1. We use a generous limit of 2000:1 + overhead.
-        // This prevents allocating GBs of memory for small inputs.
         let limit = data
             .len()
             .saturating_mul(self.limit_ratio)
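Editor's note: the comment removed here stated the threat model behind `limit_ratio`: Deflate cannot expand past roughly 1032:1, so capping the allocation at a ratio around 2000:1 plus a fixed overhead stops a tiny input from declaring a multi-gigabyte output. The hunk cuts off mid-expression, so the sketch below completes the idea with hypothetical names (`base_overhead`, `expected_size`) rather than the crate's actual ones:

use std::io;

// Sketch of an allocation guard for declared decompressed sizes.
fn checked_output_size(
    input_len: usize,
    limit_ratio: usize,   // e.g. 2000, comfortably above Deflate's ~1032:1
    base_overhead: usize, // slack so tiny inputs still get a usable buffer
    expected_size: usize, // output size the caller wants to allocate
) -> io::Result<usize> {
    let limit = input_len
        .saturating_mul(limit_ratio)
        .saturating_add(base_overhead);
    if expected_size > limit {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "declared output size exceeds allocation limit",
        ));
    }
    Ok(expected_size)
}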
@@ -243,9 +237,7 @@ impl Decompressor {
             .try_reserve_exact(expected_size)
             .map_err(io::Error::other)?;
 
-        // Use spare_capacity_mut to avoid zero-initialization.
         let out_uninit = output.spare_capacity_mut();
-        // Ensure we only use the expected size
         let out_uninit = &mut out_uninit[..expected_size];
 
         let (res, _, size) = f(&mut self.inner, data, out_uninit);
@@ -281,7 +273,7 @@ impl Decompressor {
                 "Input and output buffers overlap",
             ));
         }
-        // Safe to cast &mut [u8] to &mut [MaybeUninit<u8>]
+
         let out_uninit = unsafe {
             std::slice::from_raw_parts_mut(
                 output.as_mut_ptr() as *mut std::mem::MaybeUninit<u8>,

src/batch.rs

Lines changed: 1 addition & 20 deletions
@@ -4,33 +4,14 @@ use rayon::prelude::*;
 
 pub struct BatchCompressor {
     level: usize,
-    #[cfg(feature = "cuda")]
-    cuda_compressor: std::sync::OnceLock<Option<crate::batch_cuda::CudaBatchCompressor>>,
 }
 
 impl BatchCompressor {
     pub fn new(level: usize) -> Self {
-        Self {
-            level,
-            #[cfg(feature = "cuda")]
-            cuda_compressor: std::sync::OnceLock::new(),
-        }
+        Self { level }
     }
 
     pub fn compress_batch(&self, inputs: &[&[u8]]) -> Vec<Vec<u8>> {
-        #[cfg(feature = "cuda")]
-        {
-            if let Some(cuda_impl) = self.cuda_compressor.get_or_init(|| {
-                std::panic::catch_unwind(|| crate::batch_cuda::CudaBatchCompressor::new(self.level))
-                    .ok()
-                    .and_then(|res| res.ok())
-            }) {
-                if let Ok(res) = cuda_impl.compress_batch(inputs) {
-                    return res;
-                }
-            }
-        }
-
         inputs
             .par_iter()
             .map_init(
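Editor's note: with the CUDA fast path gone, `compress_batch` is now only the rayon path shown above. `map_init` keeps one compressor per rayon worker/split rather than constructing one per input, which is the point of using it over `map`. A minimal sketch of that shape, with a hypothetical `Encoder` standing in for the crate's real per-thread state:

use rayon::prelude::*;

// Placeholder per-thread compressor state; the real crate's type differs.
struct Encoder {
    level: usize,
}

impl Encoder {
    fn new(level: usize) -> Self {
        Self { level }
    }

    fn compress(&mut self, input: &[u8]) -> Vec<u8> {
        // Stand-in body: the real implementation runs deflate at self.level.
        let mut out = Vec::with_capacity(input.len() / 2 + self.level);
        out.extend_from_slice(&input[..input.len().min(8)]);
        out
    }
}

fn compress_batch(level: usize, inputs: &[&[u8]]) -> Vec<Vec<u8>> {
    inputs
        .par_iter()
        .map_init(|| Encoder::new(level), |enc, input| enc.compress(input))
        .collect()
}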

src/batch_cuda.rs

Lines changed: 0 additions & 151 deletions
This file was deleted.
