diff --git a/src/adler32/arm.rs b/src/adler32/arm.rs index e2507c6..6b18822 100644 --- a/src/adler32/arm.rs +++ b/src/adler32/arm.rs @@ -28,7 +28,7 @@ pub unsafe fn adler32_arm_neon(adler: u32, p: &[u8]) -> u32 { ]; while data.len() > 0 { - let n = std::cmp::min(data.len(), 5504) & !63; + let n = std::cmp::min(data.len(), 4032) & !63; if n == 0 { break; } @@ -118,7 +118,7 @@ pub unsafe fn adler32_arm_neon_dotprod(adler: u32, p: &[u8]) -> u32 { let ones = vdupq_n_u8(1); while data.len() > 0 { - let n = std::cmp::min(data.len(), 5504) & !63; + let n = std::cmp::min(data.len(), 4032) & !63; if n == 0 { break; } diff --git a/src/adler32/mod.rs b/src/adler32/mod.rs index 6354381..2a89fd6 100644 --- a/src/adler32/mod.rs +++ b/src/adler32/mod.rs @@ -2,7 +2,7 @@ use std::cmp::min; use std::sync::OnceLock; const DIVISOR: u32 = 65521; -const MAX_CHUNK_LEN: usize = 4096; +const MAX_CHUNK_LEN: usize = 4032; #[inline] fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { @@ -30,7 +30,7 @@ fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { let b14 = unsafe { *ptr.add(14) as u32 }; let b15 = unsafe { *ptr.add(15) as u32 }; - s2_local += (s1_local << 4) + s2_local += (s1_local * 16) + (b0 * 16) + (b1 * 15) + (b2 * 14) @@ -57,21 +57,6 @@ fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { len -= 16; } - while len >= 4 { - let b0 = unsafe { *ptr.add(0) as u32 }; - let b1 = unsafe { *ptr.add(1) as u32 }; - let b2 = unsafe { *ptr.add(2) as u32 }; - let b3 = unsafe { *ptr.add(3) as u32 }; - - s2_local += (s1_local << 2) + (b0 * 4) + (b1 * 3) + (b2 * 2) + b3; - s1_local += b0 + b1 + b2 + b3; - - unsafe { - ptr = ptr.add(4); - } - len -= 4; - } - while len > 0 { let b = unsafe { *ptr as u32 }; s1_local += b; @@ -104,10 +89,10 @@ pub fn adler32_generic(adler: u32, mut buffer: &[u8]) -> u32 { } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -mod x86; +pub mod x86; #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] -mod arm; +pub mod arm; type Adler32Fn = unsafe fn(u32, &[u8]) -> u32; diff --git a/src/adler32/x86.rs b/src/adler32/x86.rs index 34edd40..3dd2935 100644 --- a/src/adler32/x86.rs +++ b/src/adler32/x86.rs @@ -4,7 +4,7 @@ use core::arch::x86::*; use core::arch::x86_64::*; const DIVISOR: u32 = 65521; -const BLOCK_SIZE: usize = 4096; +const BLOCK_SIZE: usize = 4032; macro_rules! adler32_chunk8 { ($s1:expr, $s2:expr, $ptr:expr, $len:expr) => { @@ -576,7 +576,7 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 { s1 %= DIVISOR; s2 %= DIVISOR; - (s2 << 16) | s1 + (s2 % DIVISOR) << 16 | (s1 % DIVISOR) } #[target_feature(enable = "avxvnni")] @@ -850,7 +850,7 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 { s1 %= DIVISOR; s2 %= DIVISOR; - (s2 << 16) | s1 + (s2 % DIVISOR) << 16 | (s1 % DIVISOR) } #[cfg(target_arch = "x86_64")] @@ -1103,5 +1103,5 @@ pub unsafe fn adler32_x86_avx512_vnni(adler: u32, p: &[u8]) -> u32 { s1 %= DIVISOR; s2 %= DIVISOR; - (s2 << 16) | s1 + (s2 % DIVISOR) << 16 | (s1 % DIVISOR) } diff --git a/src/batch.rs b/src/batch.rs index b7d4d58..98f3e6d 100644 --- a/src/batch.rs +++ b/src/batch.rs @@ -63,15 +63,18 @@ impl BatchDecompressor { .map_init( || (Decompressor::new(), Vec::new()), |(decompressor, buffer), (&input, &max_size)| { + buffer.clear(); if buffer.capacity() < max_size { - buffer.reserve(max_size.saturating_sub(buffer.len())); - } - unsafe { - buffer.set_len(max_size); + buffer.reserve(max_size); } + let buf_uninit = &mut buffer.spare_capacity_mut()[..max_size]; - let (res, _, size) = decompressor.decompress(input, buffer); + let (res, _, size) = unsafe { decompressor.decompress_uninit(input, buf_uninit) }; + buffer.clear(); if res == DecompressResult::Success { + unsafe { + buffer.set_len(size); + } Some(buffer[..size].to_vec()) } else { None diff --git a/src/compress/mod.rs b/src/compress/mod.rs index 11567e8..f824624 100644 --- a/src/compress/mod.rs +++ b/src/compress/mod.rs @@ -683,28 +683,28 @@ impl Compressor { let mode = if is_last { flush_mode } else { FlushMode::Sync }; let bound = Self::deflate_compress_bound(chunk.len()); + buf.clear(); if buf.capacity() < bound { - buf.reserve(bound - buf.len()); - } - unsafe { - buf.set_len(bound); + buf.reserve(bound); } - let buf_uninit = slice_as_uninit_mut(buf); + let buf_uninit = &mut buf.spare_capacity_mut()[..bound]; let (res, size, _) = compressor.compress(chunk, buf_uninit, mode); if res == CompressResult::Success { unsafe { buf.set_len(size); } - if size < buf.capacity() / 2 { + let result = if size < buf.capacity() / 2 { Ok(buf.to_vec()) } else { Ok(std::mem::replace( buf, Vec::with_capacity(chunk_size + chunk_size / 2), )) - } + }; + buf.clear(); + result } else { Err(io::Error::other("Compression failed")) } @@ -920,12 +920,7 @@ impl Compressor { self.dp_costs[0] = 0; self.dp_path.clear(); - if self.dp_path.capacity() < processed + 1 { - self.dp_path.reserve(processed + 1 - self.dp_path.len()); - } - unsafe { - self.dp_path.set_len(processed + 1); - } + self.dp_path.resize(processed + 1, 0); mf.reset(); let mut pos = 0; @@ -1717,12 +1712,7 @@ impl Compressor { self.dp_costs[0] = 0; self.dp_path.clear(); - if self.dp_path.capacity() < processed + 1 { - self.dp_path.reserve(processed + 1 - self.dp_path.len()); - } - unsafe { - self.dp_path.set_len(processed + 1); - } + self.dp_path.resize(processed + 1, 0); mf.reset(); let mut pos = 0; diff --git a/src/stream.rs b/src/stream.rs index 36c8fce..c31777f 100644 --- a/src/stream.rs +++ b/src/stream.rs @@ -65,23 +65,24 @@ impl DeflateEncoder { if !final_block { bound += 5; } - if output.len() < bound { - output - .try_reserve(bound - output.len()) - .map_err(io::Error::other)?; - unsafe { - output.set_len(bound); - } - } + + output.clear(); + output + .try_reserve(bound) + .map_err(io::Error::other)?; let mode = if final_block { crate::compress::FlushMode::Finish } else { crate::compress::FlushMode::Sync }; - let out_uninit = crate::common::slice_as_uninit_mut(output); + let out_uninit = &mut output.spare_capacity_mut()[..bound]; let (res, size, _) = compressor.compress(chunk, out_uninit, mode); + output.clear(); if res == CompressResult::Success { + unsafe { + output.set_len(size); + } if let Some(writer) = &mut self.writer { writer.write_all(&output[..size])?; } @@ -99,23 +100,24 @@ impl DeflateEncoder { if !(final_block && i == num_chunks - 1) { bound += 5; } - if output.len() < bound { - output - .try_reserve(bound - output.len()) - .map_err(io::Error::other)?; - unsafe { - output.set_len(bound); - } - } + + output.clear(); + output + .try_reserve(bound) + .map_err(io::Error::other)?; let mode = if final_block && i == num_chunks - 1 { crate::compress::FlushMode::Finish } else { crate::compress::FlushMode::Sync }; - let out_uninit = crate::common::slice_as_uninit_mut(output); + let out_uninit = &mut output.spare_capacity_mut()[..bound]; let (res, size, _) = compressor.compress(chunk, out_uninit, mode); + output.clear(); if res == CompressResult::Success { + unsafe { + output.set_len(size); + } Ok(size) } else { Err(io::Error::other("Compression failed")) @@ -144,23 +146,24 @@ impl DeflateEncoder { if !final_block { bound += 5; } - if output.len() < bound { - output - .try_reserve(bound - output.len()) - .map_err(io::Error::other)?; - unsafe { - output.set_len(bound); - } - } + + output.clear(); + output + .try_reserve(bound) + .map_err(io::Error::other)?; let mode = if final_block { crate::compress::FlushMode::Finish } else { crate::compress::FlushMode::Sync }; - let out_uninit = crate::common::slice_as_uninit_mut(output); + let out_uninit = &mut output.spare_capacity_mut()[..bound]; let (res, size, _) = compressor.compress(&self.buffer, out_uninit, mode); + output.clear(); if res == CompressResult::Success { + unsafe { + output.set_len(size); + } if let Some(writer) = &mut self.writer { writer.write_all(&output[..size])?; } diff --git a/test_adler_all b/test_adler_all new file mode 100755 index 0000000..1e54b04 Binary files /dev/null and b/test_adler_all differ diff --git a/test_adler_all.rs b/test_adler_all.rs new file mode 100644 index 0000000..1a83a54 --- /dev/null +++ b/test_adler_all.rs @@ -0,0 +1,36 @@ +use std::cmp::min; + +const DIVISOR: u32 = 65521; +const MAX_CHUNK_LEN: usize = 4032; + +fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { + let mut s1_local = *s1; + let mut s2_local = *s2; + for &b in p { + s1_local += b as u32; + s2_local += s1_local; + } + *s1 = s1_local % DIVISOR; + *s2 = s2_local % DIVISOR; +} + +fn adler32_generic(adler: u32, mut buffer: &[u8]) -> u32 { + let mut s1 = adler & 0xFFFF; + let mut s2 = adler >> 16; + let mut len = buffer.len(); + while len > 0 { + let n = min(len, MAX_CHUNK_LEN); + let (chunk, rest) = buffer.split_at(n); + buffer = rest; + len -= n; + adler32_chunk(&mut s1, &mut s2, chunk); + } + (s2 % DIVISOR) << 16 | (s1 % DIVISOR) +} + +fn main() { + let size = 100000; + let data = vec![0xFF; size]; + let expected = adler32_generic(1, &data); + println!("Expected: {}", expected); +} diff --git a/test_adler_all_v2 b/test_adler_all_v2 new file mode 100755 index 0000000..8dc4971 Binary files /dev/null and b/test_adler_all_v2 differ diff --git a/test_adler_all_v2.rs b/test_adler_all_v2.rs new file mode 100644 index 0000000..429b843 --- /dev/null +++ b/test_adler_all_v2.rs @@ -0,0 +1,67 @@ +use std::cmp::min; + +const DIVISOR: u32 = 65521; +const MAX_CHUNK_LEN: usize = 4032; + +fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { + let mut s1_local = *s1; + let mut s2_local = *s2; + let mut ptr = p.as_ptr(); + let mut len = p.len(); + while len >= 16 { + let b0 = unsafe { *ptr.add(0) as u32 }; + let b1 = unsafe { *ptr.add(1) as u32 }; + let b2 = unsafe { *ptr.add(2) as u32 }; + let b3 = unsafe { *ptr.add(3) as u32 }; + let b4 = unsafe { *ptr.add(4) as u32 }; + let b5 = unsafe { *ptr.add(5) as u32 }; + let b6 = unsafe { *ptr.add(6) as u32 }; + let b7 = unsafe { *ptr.add(7) as u32 }; + let b8 = unsafe { *ptr.add(8) as u32 }; + let b9 = unsafe { *ptr.add(9) as u32 }; + let b10 = unsafe { *ptr.add(10) as u32 }; + let b11 = unsafe { *ptr.add(11) as u32 }; + let b12 = unsafe { *ptr.add(12) as u32 }; + let b13 = unsafe { *ptr.add(13) as u32 }; + let b14 = unsafe { *ptr.add(14) as u32 }; + let b15 = unsafe { *ptr.add(15) as u32 }; + s2_local += (s1_local * 16) + + (b0 * 16) + (b1 * 15) + (b2 * 14) + (b3 * 13) + + (b4 * 12) + (b5 * 11) + (b6 * 10) + (b7 * 9) + + (b8 * 8) + (b9 * 7) + (b10 * 6) + (b11 * 5) + + (b12 * 4) + (b13 * 3) + (b14 * 2) + b15; + s1_local += b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7 + b8 + b9 + b10 + b11 + b12 + b13 + b14 + b15; + unsafe { ptr = ptr.add(16); } + len -= 16; + } + while len > 0 { + let b = unsafe { *ptr as u32 }; + s1_local += b; + s2_local += s1_local; + unsafe { ptr = ptr.add(1); } + len -= 1; + } + *s1 = s1_local % DIVISOR; + *s2 = s2_local % DIVISOR; +} + +fn adler32_generic(adler: u32, mut buffer: &[u8]) -> u32 { + let mut s1 = adler & 0xFFFF; + let mut s2 = adler >> 16; + let mut len = buffer.len(); + while len > 0 { + let n = min(len, MAX_CHUNK_LEN); + let (chunk, rest) = buffer.split_at(n); + buffer = rest; + len -= n; + adler32_chunk(&mut s1, &mut s2, chunk); + } + (s2 % DIVISOR) << 16 | (s1 % DIVISOR) +} + +fn main() { + let size = 100000; + let data = vec![0xFF; size]; + let actual = adler32_generic(1, &data); + println!("Actual: {}", actual); +} diff --git a/test_final b/test_final new file mode 100755 index 0000000..2e12bf2 Binary files /dev/null and b/test_final differ diff --git a/test_final.rs b/test_final.rs new file mode 100644 index 0000000..dddcecb --- /dev/null +++ b/test_final.rs @@ -0,0 +1,27 @@ +const DIVISOR: u32 = 65521; +fn main() { + let mut s1: u32 = 65520; + let mut s2: u32 = 65520; + + // Process 64 bytes of 0xFF + for _ in 0..64 { + s1 += 255; + s2 += s1; + } + println!("Ref s1: {}, s2: {}", s1 % DIVISOR, s2 % DIVISOR); + + // SSE2 scalar tail part: + let mut s1_sse2: u32 = 65520; + let mut s2_sse2: u32 = 65520; + let n = 64; + // s2 = ((s2 as u64 + s1 as u64 * n as u64) % DIVISOR as u64) as u32; + s2_sse2 = ((s2_sse2 as u64 + s1_sse2 as u64 * n as u64) % DIVISOR as u64) as u32; + // sum_s2 = sum( (n-i+1)*bi ) = sum( (64-i+1)*255 ) + let sum_s2: u64 = (1..=64).map(|i| (64-i+1)*255).sum(); + s2_sse2 = ((s2_sse2 as u64 + sum_s2) % DIVISOR as u64) as u32; + + let sum_s1: u64 = 64 * 255; + s1_sse2 = ((s1_sse2 as u64 + sum_s1) % DIVISOR as u64) as u32; + + println!("SSE2 s1: {}, s2: {}", s1_sse2, s2_sse2); +} diff --git a/test_final_v2 b/test_final_v2 new file mode 100755 index 0000000..127c1ba Binary files /dev/null and b/test_final_v2 differ diff --git a/test_final_v2.rs b/test_final_v2.rs new file mode 100644 index 0000000..42a81de --- /dev/null +++ b/test_final_v2.rs @@ -0,0 +1,21 @@ +const DIVISOR: u32 = 65521; +fn main() { + let mut s1: u32 = 65520; + let mut s2: u32 = 65520; + for _ in 0..32 { + s1 += 255; + s2 += s1; + } + println!("Ref s1: {}, s2: {}", s1 % DIVISOR, s2 % DIVISOR); + + let mut s1_avx2: u32 = 65520; + let mut s2_avx2: u32 = 65520; + let n = 32; + s2_avx2 = ((s2_avx2 as u64 + s1_avx2 as u64 * n as u64) % DIVISOR as u64) as u32; + let sum_s2: u64 = (1..=32).map(|i| (32-i+1)*255).sum(); + s2_avx2 = ((s2_avx2 as u64 + sum_s2) % DIVISOR as u64) as u32; + let sum_s1: u64 = 32 * 255; + s1_avx2 = ((s1_avx2 as u64 + sum_s1) % DIVISOR as u64) as u32; + + println!("AVX2 s1: {}, s2: {}", s1_avx2, s2_avx2); +} diff --git a/test_scalar_s2_sse2 b/test_scalar_s2_sse2 new file mode 100755 index 0000000..5564d99 Binary files /dev/null and b/test_scalar_s2_sse2 differ diff --git a/test_scalar_s2_sse2.rs b/test_scalar_s2_sse2.rs new file mode 100644 index 0000000..137b15c --- /dev/null +++ b/test_scalar_s2_sse2.rs @@ -0,0 +1,14 @@ +const DIVISOR: u32 = 65521; +fn main() { + let mut s1: u32 = 65520; + let mut s2: u32 = 65520; + let n = 16; + let mut sum_s2: u32 = 2080 * 255; // Placeholder for SIMD sum_s2 + + // Equivalent of s2 = ((s2 as u64 + s1 as u64 * 16) % DIVISOR as u64) as u32; + s2 = ((s2 as u64 + s1 as u64 * 16) % DIVISOR as u64) as u32; + // Equivalent of s2 = ((s2 as u64 + sum_s2 as u64) % DIVISOR as u64) as u32; + s2 = ((s2 as u64 + sum_s2 as u64) % DIVISOR as u64) as u32; + + println!("s2: {}", s2); +} diff --git a/test_simd_overflow_5552_v2 b/test_simd_overflow_5552_v2 new file mode 100755 index 0000000..3033cf2 Binary files /dev/null and b/test_simd_overflow_5552_v2 differ diff --git a/test_simd_overflow_5552_v2.rs b/test_simd_overflow_5552_v2.rs new file mode 100644 index 0000000..0754b18 --- /dev/null +++ b/test_simd_overflow_5552_v2.rs @@ -0,0 +1,9 @@ +fn main() { + let mut s1: u32 = 65520; // Max s1 before modulo + let mut s2: u32 = 0; + for _ in 0..5552 { + s1 += 255; + s2 += s1; + } + println!("s1: {}, s2: {}", s1, s2); +} diff --git a/test_simd_overflow_5552_v3 b/test_simd_overflow_5552_v3 new file mode 100755 index 0000000..d184c1b Binary files /dev/null and b/test_simd_overflow_5552_v3 differ diff --git a/test_simd_overflow_5552_v3.rs b/test_simd_overflow_5552_v3.rs new file mode 100644 index 0000000..0f06a9d --- /dev/null +++ b/test_simd_overflow_5552_v3.rs @@ -0,0 +1,9 @@ +fn main() { + let mut s1: u32 = 65520; + let mut s2: u32 = 65520; // Initial s2 + for _ in 0..5552 { + s1 += 255; + s2 += s1; + } + println!("s1: {}, s2: {}", s1, s2); +} diff --git a/test_simd_overflow_5552_v4 b/test_simd_overflow_5552_v4 new file mode 100755 index 0000000..9295a46 Binary files /dev/null and b/test_simd_overflow_5552_v4 differ diff --git a/test_simd_overflow_5552_v4.rs b/test_simd_overflow_5552_v4.rs new file mode 100644 index 0000000..63f9fa9 --- /dev/null +++ b/test_simd_overflow_5552_v4.rs @@ -0,0 +1,9 @@ +fn main() { + let mut s1: u32 = 65520; + let mut s2: u32 = 65520; + for _ in 0..5553 { // One more byte + s1 += 255; + s2 += s1; + } + println!("s1: {}, s2: {}", s1, s2); +} diff --git a/test_u32_max b/test_u32_max new file mode 100755 index 0000000..0ca9e4d Binary files /dev/null and b/test_u32_max differ diff --git a/test_u32_max.rs b/test_u32_max.rs new file mode 100644 index 0000000..9e4811a --- /dev/null +++ b/test_u32_max.rs @@ -0,0 +1,3 @@ +fn main() { + println!("{}", u32::MAX); +}