diff --git a/Cargo.toml b/Cargo.toml index 14525a8..566a520 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,3 +57,7 @@ libdeflater = "1.25.0" [[bench]] name = "bench_main" harness = false + +[[bench]] +name = "encoder_perf" +harness = false diff --git a/benches/encoder_perf.rs b/benches/encoder_perf.rs new file mode 100644 index 0000000..121e97b --- /dev/null +++ b/benches/encoder_perf.rs @@ -0,0 +1,28 @@ +use criterion::{Criterion, Throughput, criterion_group, criterion_main}; +use libdeflate::stream::DeflateEncoder; +use std::io::Write; + +fn bench_encoder_parallel(c: &mut Criterion) { + let size = 10 * 1024 * 1024; // 10MB + let mut data = Vec::with_capacity(size); + for i in 0..size { + data.push((i % 256) as u8); + } + + let mut group = c.benchmark_group("DeflateEncoder Parallel"); + group.throughput(Throughput::Bytes(size as u64)); + + group.bench_function("write_all 10MB", |b| { + b.iter(|| { + let sink = std::io::sink(); + let mut encoder = DeflateEncoder::new(sink, 6); // Default 1MB buffer + encoder.write_all(&data).unwrap(); + encoder.finish().unwrap(); + }); + }); + + group.finish(); +} + +criterion_group!(benches, bench_encoder_parallel); +criterion_main!(benches); diff --git a/src/adler32/mod.rs b/src/adler32/mod.rs index 0734c62..b7e13bc 100644 --- a/src/adler32/mod.rs +++ b/src/adler32/mod.rs @@ -46,7 +46,7 @@ fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { + (b12 * 4) + (b13 * 3) + (b14 * 2) - + (b15 * 1); + + b15; s1_local += b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7 + b8 + b9 + b10 + b11 + b12 + b13 + b14 + b15; @@ -63,7 +63,7 @@ fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) { let b2 = unsafe { *ptr.add(2) as u32 }; let b3 = unsafe { *ptr.add(3) as u32 }; - s2_local += (s1_local << 2) + (b0 * 4) + (b1 * 3) + (b2 * 2) + (b3 * 1); + s2_local += (s1_local << 2) + (b0 * 4) + (b1 * 3) + (b2 * 2) + b3; s1_local += b0 + b1 + b2 + b3; unsafe { diff --git a/src/api.rs b/src/api.rs index 4da83ae..7579769 100644 --- a/src/api.rs +++ b/src/api.rs @@ -8,7 +8,7 @@ pub struct Compressor { impl Compressor { pub fn new(level: i32) -> io::Result { - if level < 0 || level > 12 { + if !(0..=12).contains(&level) { return Err(io::Error::new( io::ErrorKind::InvalidInput, "Compression level must be between 0 and 12", @@ -79,7 +79,7 @@ impl Compressor { let mut output = Vec::new(); output .try_reserve_exact(bound) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + .map_err(io::Error::other)?; // Use spare_capacity_mut to avoid zero-initialization. // Since len is 0, this returns the entire capacity as MaybeUninit. @@ -96,7 +96,7 @@ impl Compressor { Ok(output) } CompressResult::InsufficientSpace => { - Err(io::Error::new(io::ErrorKind::Other, "Insufficient space")) + Err(io::Error::other("Insufficient space")) } } } @@ -131,7 +131,7 @@ impl Compressor { if res == CompressResult::Success { Ok(size) } else { - Err(io::Error::new(io::ErrorKind::Other, error_msg)) + Err(io::Error::other(error_msg)) } } } @@ -142,6 +142,12 @@ pub struct Decompressor { limit_ratio: usize, } +impl Default for Decompressor { + fn default() -> Self { + Self::new() + } +} + impl Decompressor { pub fn new() -> Self { Self { @@ -239,7 +245,7 @@ impl Decompressor { let mut output = Vec::new(); output .try_reserve_exact(expected_size) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + .map_err(io::Error::other)?; // Use spare_capacity_mut to avoid zero-initialization. let out_uninit = output.spare_capacity_mut(); @@ -304,8 +310,8 @@ fn is_overlapping(s1: &[u8], s2: &[u8]) -> bool { let p2 = s2.as_ptr() as usize; let len2 = s2.len(); - let end1 = p1.checked_add(len1).unwrap_or(usize::MAX); - let end2 = p2.checked_add(len2).unwrap_or(usize::MAX); + let end1 = p1.saturating_add(len1); + let end2 = p2.saturating_add(len2); use std::cmp::{max, min}; max(p1, p2) < min(end1, end2) diff --git a/src/batch.rs b/src/batch.rs index 0d51c6d..52090d4 100644 --- a/src/batch.rs +++ b/src/batch.rs @@ -60,6 +60,12 @@ impl BatchCompressor { pub struct BatchDecompressor; +impl Default for BatchDecompressor { + fn default() -> Self { + Self::new() + } +} + impl BatchDecompressor { pub fn new() -> Self { Self diff --git a/src/compress/huffman_comp.rs b/src/compress/huffman_comp.rs index 9b9ca20..947c4e4 100644 --- a/src/compress/huffman_comp.rs +++ b/src/compress/huffman_comp.rs @@ -42,7 +42,7 @@ fn build_tree(a: &mut [u32], sym_count: usize) { let mut e = 0; while e < last_idx { let new_freq; - if i + 1 <= last_idx && (b == e || (a[i + 1] & FREQ_MASK) <= (a[b] & FREQ_MASK)) { + if i < last_idx && (b == e || (a[i + 1] & FREQ_MASK) <= (a[b] & FREQ_MASK)) { new_freq = (a[i] & FREQ_MASK) + (a[i + 1] & FREQ_MASK); i += 2; } else if b + 2 <= e && (i > last_idx || (a[b + 1] & FREQ_MASK) < (a[i] & FREQ_MASK)) { diff --git a/src/compress/matchfinder.rs b/src/compress/matchfinder.rs index 9eb34a8..096c44c 100644 --- a/src/compress/matchfinder.rs +++ b/src/compress/matchfinder.rs @@ -441,16 +441,14 @@ unsafe fn match_len_avx2(a: *const u8, b: *const u8, max_len: usize) -> usize { let mask = _mm256_movemask_epi8(cmp) as u32; return len + 32 + (!mask).trailing_zeros() as usize; } + } else if _mm256_testz_si256(xor3, xor3) == 0 { + let cmp = _mm256_cmpeq_epi8(xor3, v_zero); + let mask = _mm256_movemask_epi8(cmp) as u32; + return len + 64 + (!mask).trailing_zeros() as usize; } else { - if _mm256_testz_si256(xor3, xor3) == 0 { - let cmp = _mm256_cmpeq_epi8(xor3, v_zero); - let mask = _mm256_movemask_epi8(cmp) as u32; - return len + 64 + (!mask).trailing_zeros() as usize; - } else { - let cmp = _mm256_cmpeq_epi8(xor4, v_zero); - let mask = _mm256_movemask_epi8(cmp) as u32; - return len + 96 + (!mask).trailing_zeros() as usize; - } + let cmp = _mm256_cmpeq_epi8(xor4, v_zero); + let mask = _mm256_movemask_epi8(cmp) as u32; + return len + 96 + (!mask).trailing_zeros() as usize; } } @@ -691,7 +689,7 @@ impl MatchFinder { where F: FnMut(usize, usize), { - if pos.checked_add(3).map_or(true, |end| end > data.len()) { + if pos.checked_add(3).is_none_or(|end| end > data.len()) { return (0, 0); } @@ -705,7 +703,7 @@ impl MatchFinder { src_val_4 = (src as *const u32).read_unaligned(); src_val = src_val_4 & 0xFFFFFF; } else { - src_val = ((src.read() as u32) << 0) + src_val = (src.read() as u32) | ((src.add(1).read() as u32) << 8) | ((src.add(2).read() as u32) << 16); } @@ -756,11 +754,10 @@ impl MatchFinder { } let mut match_ok = true; - if best_len >= 3 { - if *match_ptr.add(best_len) != *src.add(best_len) { + if best_len >= 3 + && *match_ptr.add(best_len) != *src.add(best_len) { match_ok = false; } - } if match_ok { if safe_to_read_u32 { @@ -787,7 +784,7 @@ impl MatchFinder { if p_rel + 4 <= data.len() { match_val = (match_ptr as *const u32).read_unaligned() & 0xFFFFFF; } else { - match_val = ((match_ptr.read() as u32) << 0) + match_val = (match_ptr.read() as u32) | ((match_ptr.add(1).read() as u32) << 8) | ((match_ptr.add(2).read() as u32) << 16); } @@ -949,7 +946,7 @@ impl MatchFinder { } } pub fn skip_match(&mut self, data: &[u8], pos: usize) { - if pos.checked_add(3).map_or(true, |end| end > data.len()) { + if pos.checked_add(3).is_none_or(|end| end > data.len()) { return; } unsafe { @@ -958,7 +955,7 @@ impl MatchFinder { if pos + 4 <= data.len() { src_val = (src as *const u32).read_unaligned() & 0xFFFFFF; } else { - src_val = ((src.read() as u32) << 0) + src_val = (src.read() as u32) | ((src.add(1).read() as u32) << 8) | ((src.add(2).read() as u32) << 16); } @@ -993,7 +990,7 @@ impl MatchFinder { } if pos .checked_add(count + 3) - .map_or(true, |end| end > data.len()) + .is_none_or(|end| end > data.len()) { for i in 0..count { self.skip_match(data, pos + i); @@ -1068,7 +1065,7 @@ impl HtMatchFinder { } pub fn find_match(&mut self, data: &[u8], pos: usize) -> (usize, usize) { - if pos.checked_add(3).map_or(true, |end| end > data.len()) { + if pos.checked_add(3).is_none_or(|end| end > data.len()) { return (0, 0); } @@ -1081,7 +1078,7 @@ impl HtMatchFinder { if safe_to_read_u32 { src_val = (src as *const u32).read_unaligned() & 0xFFFFFF; } else { - src_val = ((src.read() as u32) << 0) + src_val = (src.read() as u32) | ((src.add(1).read() as u32) << 8) | ((src.add(2).read() as u32) << 16); } @@ -1112,7 +1109,7 @@ impl HtMatchFinder { } else if p_rel + 4 <= data.len() { match_val = (match_ptr as *const u32).read_unaligned() & 0xFFFFFF; } else { - match_val = ((match_ptr.read() as u32) << 0) + match_val = (match_ptr.read() as u32) | ((match_ptr.add(1).read() as u32) << 8) | ((match_ptr.add(2).read() as u32) << 16); } @@ -1139,7 +1136,7 @@ impl HtMatchFinder { } pub fn skip_match(&mut self, data: &[u8], pos: usize) { - if pos.checked_add(3).map_or(true, |end| end > data.len()) { + if pos.checked_add(3).is_none_or(|end| end > data.len()) { return; } unsafe { @@ -1148,7 +1145,7 @@ impl HtMatchFinder { if pos + 4 <= data.len() { src_val = (src as *const u32).read_unaligned() & 0xFFFFFF; } else { - src_val = ((src.read() as u32) << 0) + src_val = (src.read() as u32) | ((src.add(1).read() as u32) << 8) | ((src.add(2).read() as u32) << 16); } @@ -1281,7 +1278,7 @@ impl BtMatchFinder { max_depth: usize, mut visitor: V, ) -> V { - if pos.checked_add(4).map_or(true, |end| end > data.len()) { + if pos.checked_add(4).is_none_or(|end| end > data.len()) { return visitor; } diff --git a/src/compress/mod.rs b/src/compress/mod.rs index 814a1dc..61dd8c2 100644 --- a/src/compress/mod.rs +++ b/src/compress/mod.rs @@ -738,7 +738,7 @@ impl Compressor { )) } } else { - Err(io::Error::new(io::ErrorKind::Other, "Compression failed")) + Err(io::Error::other("Compression failed")) } }, ) @@ -808,7 +808,7 @@ impl Compressor { } mf.advance(input.len()); - (total_bits + 7) / 8 + total_bits.div_ceil(8) } fn calculate_block_size_fast( @@ -972,7 +972,7 @@ impl Compressor { let lit_cost = self.litlen_lens[block_input[pos] as usize] as u32; if cur_cost + lit_cost < self.dp_costs[pos + 1] { self.dp_costs[pos + 1] = cur_cost + lit_cost; - self.dp_path[pos + 1] = (1 as u32) | (0 as u32) << 16; + self.dp_path[pos + 1] = 1_u32; } mf.find_matches( @@ -1069,7 +1069,7 @@ impl Compressor { pub fn compress_to_size(&mut self, input: &[u8], final_block: bool) -> usize { if self.compression_level == 0 { let num_blocks = input.len() / 65535 - + if input.len() % 65535 != 0 || (input.len() == 0 && final_block) { + + if !input.len().is_multiple_of(65535) || (input.is_empty() && final_block) { 1 } else { 0 @@ -1094,7 +1094,7 @@ impl Compressor { mf: &mut T, input: &[u8], start_pos: usize, - lazy_depth: u32, + _lazy_depth: u32, ) -> usize { self.split_stats.reset(); let mut in_idx = start_pos; @@ -1201,14 +1201,12 @@ impl Compressor { precode_freqs[17] += 1; run -= min(run, 10); } - } else { - if run >= 4 { - precode_freqs[len as usize] += 1; - run -= 1; - while run >= 3 { - precode_freqs[16] += 1; - run -= min(run, 6); - } + } else if run >= 4 { + precode_freqs[len as usize] += 1; + run -= 1; + while run >= 3 { + precode_freqs[16] += 1; + run -= min(run, 6); } } while run > 0 { @@ -1769,7 +1767,7 @@ impl Compressor { let lit_cost = self.litlen_lens[block_input[pos] as usize] as u32; if cur_cost + lit_cost < self.dp_costs[pos + 1] { self.dp_costs[pos + 1] = cur_cost + lit_cost; - self.dp_path[pos + 1] = (1 as u32) | (0 as u32) << 16; + self.dp_path[pos + 1] = 1_u32; } mf.find_matches( @@ -1910,19 +1908,17 @@ impl Compressor { precode_freqs[17] += 1; run -= c; } - } else { - if run >= 4 { - precode_items[num_precode_items] = (len as u16) << 8; + } else if run >= 4 { + precode_items[num_precode_items] = (len as u16) << 8; + num_precode_items += 1; + precode_freqs[len as usize] += 1; + run -= 1; + while run >= 3 { + let c = min(run, 6); + precode_items[num_precode_items] = (16 << 8) | ((c - 3) as u16); num_precode_items += 1; - precode_freqs[len as usize] += 1; - run -= 1; - while run >= 3 { - let c = min(run, 6); - precode_items[num_precode_items] = (16 << 8) | ((c - 3) as u16); - num_precode_items += 1; - precode_freqs[16] += 1; - run -= c; - } + precode_freqs[16] += 1; + run -= c; } } while run > 0 { @@ -1971,11 +1967,10 @@ impl Compressor { if !bs.write_bits(extra, 3) { return false; } - } else if sym == 18 { - if !bs.write_bits(extra, 7) { + } else if sym == 18 + && !bs.write_bits(extra, 7) { return false; } - } } true } @@ -1988,7 +1983,8 @@ impl Compressor { self.offset_lens.copy_from_slice(&tables.offset_lens); self.litlen_table.copy_from_slice(&tables.litlen_table); self.offset_table.copy_from_slice(&tables.offset_table); - self.match_len_table.copy_from_slice(&tables.match_len_table); + self.match_len_table + .copy_from_slice(&tables.match_len_table); } #[inline(always)] @@ -2107,7 +2103,7 @@ impl Compressor { for len in 3..=DEFLATE_MAX_MATCH_LEN { let len_info = unsafe { *LENGTH_WRITE_TABLE.get_unchecked(len) }; let len_slot = (len_info >> 24) as usize; - let len_extra_bits = ((len_info >> 16) & 0xFF) as u32; + let len_extra_bits = (len_info >> 16) & 0xFF ; let len_cost = unsafe { *self.litlen_lens.get_unchecked(257 + len_slot) } as u32 + len_extra_bits; diff --git a/src/crc32/mod.rs b/src/crc32/mod.rs index 5c5bf26..704ff47 100644 --- a/src/crc32/mod.rs +++ b/src/crc32/mod.rs @@ -57,22 +57,22 @@ pub fn crc32_slice8(mut crc: u32, p: &[u8]) -> u32 { let t4 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx4) }; let t5 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx5) }; let t6 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx6) }; - let t7 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx7) }; + let t7 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx7) }; let t12 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx12) }; let t13 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx13) }; let t14 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx14) }; - let t15 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx15) }; + let t15 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx15) }; let t20 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx20) }; let t21 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx21) }; let t22 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx22) }; - let t23 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx23) }; + let t23 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx23) }; let t28 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx28) }; let t29 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx29) }; let t30 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx30) }; - let t31 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx31) }; + let t31 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx31) }; // Dependent chain // Chunk A @@ -156,13 +156,13 @@ pub fn crc32_slice8(mut crc: u32, p: &[u8]) -> u32 { let t4 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx4) }; let t5 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx5) }; let t6 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx6) }; - let t7 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx7) }; + let t7 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx7) }; // Start independent lookups for the second chunk early let t12 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx12) }; let t13 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx13) }; let t14 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx14) }; - let t15 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx15) }; + let t15 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx15) }; crc = ((t0 ^ t1) ^ (t2 ^ t3)) ^ ((t4 ^ t5) ^ (t6 ^ t7)); @@ -205,7 +205,7 @@ pub fn crc32_slice8(mut crc: u32, p: &[u8]) -> u32 { let t4 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + idx4) }; let t5 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx5) }; let t6 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx6) }; - let t7 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx7) }; + let t7 = unsafe { *CRC32_SLICE8_TABLE.get_unchecked(idx7) }; // Optimization: Use tree-based XOR reduction to break dependency chains and increase ILP. crc = ((t0 ^ t1) ^ (t2 ^ t3)) ^ ((t4 ^ t5) ^ (t6 ^ t7)); @@ -222,7 +222,7 @@ pub fn crc32_slice8(mut crc: u32, p: &[u8]) -> u32 { *CRC32_SLICE8_TABLE.get_unchecked(0x300 + (crc as u8) as usize) ^ *CRC32_SLICE8_TABLE.get_unchecked(0x200 + ((crc >> 8) as u8) as usize) ^ *CRC32_SLICE8_TABLE.get_unchecked(0x100 + ((crc >> 16) as u8) as usize) - ^ *CRC32_SLICE8_TABLE.get_unchecked(0x000 + ((crc >> 24) as u8) as usize) + ^ *CRC32_SLICE8_TABLE.get_unchecked(((crc >> 24) as u8) as usize ) }; unsafe { ptr = ptr.add(4); @@ -245,7 +245,7 @@ pub fn crc32_slice8(mut crc: u32, p: &[u8]) -> u32 { (crc >> 24) ^ *CRC32_SLICE8_TABLE.get_unchecked(0x200 + idx0 as usize) ^ *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx1 as usize) - ^ *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx2 as usize) + ^ *CRC32_SLICE8_TABLE.get_unchecked(idx2 as usize ) }; } 2 => { @@ -259,7 +259,7 @@ pub fn crc32_slice8(mut crc: u32, p: &[u8]) -> u32 { crc = unsafe { (crc >> 16) ^ *CRC32_SLICE8_TABLE.get_unchecked(0x100 + idx0 as usize) - ^ *CRC32_SLICE8_TABLE.get_unchecked(0x000 + idx1 as usize) + ^ *CRC32_SLICE8_TABLE.get_unchecked(idx1 as usize ) }; } 1 => { diff --git a/src/crc32/x86.rs b/src/crc32/x86.rs index 0fe07b4..498b803 100644 --- a/src/crc32/x86.rs +++ b/src/crc32/x86.rs @@ -148,24 +148,22 @@ pub unsafe fn crc32_x86_pclmulqdq(mut crc: u32, p: &[u8]) -> u32 { let x0_new = fold_vec128(x0, x2, mults_256b); let x1_new = fold_vec128(x1, x3, mults_256b); x0 = fold_vec128(x0_new, x1_new, mults_128b); - } else { - if len >= 48 { - let v0 = _mm_loadu_si128(data.as_ptr() as *const __m128i); - let v1 = _mm_loadu_si128(data.as_ptr().add(16) as *const __m128i); - let v2 = _mm_loadu_si128(data.as_ptr().add(32) as *const __m128i); - x0 = _mm_xor_si128(x0, v0); + } else if len >= 48 { + let v0 = _mm_loadu_si128(data.as_ptr() as *const __m128i); + let v1 = _mm_loadu_si128(data.as_ptr().add(16) as *const __m128i); + let v2 = _mm_loadu_si128(data.as_ptr().add(32) as *const __m128i); + x0 = _mm_xor_si128(x0, v0); - let t1 = fold_vec128(x0, v2, mults_256b); - x0 = fold_vec128(v1, t1, mults_128b); + let t1 = fold_vec128(x0, v2, mults_256b); + x0 = fold_vec128(v1, t1, mults_128b); - data = &data[48..]; - len -= 48; - } else { - let v0 = _mm_loadu_si128(data.as_ptr() as *const __m128i); - x0 = _mm_xor_si128(x0, v0); - data = &data[16..]; - len -= 16; - } + data = &data[48..]; + len -= 48; + } else { + let v0 = _mm_loadu_si128(data.as_ptr() as *const __m128i); + x0 = _mm_xor_si128(x0, v0); + data = &data[16..]; + len -= 16; } if len >= 32 { diff --git a/src/decompress/mod.rs b/src/decompress/mod.rs index 7dc2d25..fe75bb2 100644 --- a/src/decompress/mod.rs +++ b/src/decompress/mod.rs @@ -84,6 +84,12 @@ pub enum DecompressResult { ShortInput, } +impl Default for Decompressor { + fn default() -> Self { + Self::new() + } +} + impl Decompressor { pub fn new() -> Self { Self { @@ -738,26 +744,24 @@ impl Decompressor { i += 1; } } + } else if offset >= length { + std::ptr::copy_nonoverlapping(src, out_next, length); } else { - if offset >= length { - std::ptr::copy_nonoverlapping(src, out_next, length); - } else { - // Optimization: Use u64 copy loop for overlapping case with offset >= 8. - // This avoids function call overhead of copy_nonoverlapping for small chunks. - // Since offset >= 8, we can read 8 bytes and write 8 bytes safely - // (the read source is at least 8 bytes behind the write destination). - let src_ptr = src; - let dest_ptr = out_next; - let mut i = 0; - while i + 8 <= length { - let val = (src_ptr.add(i) as *const u64).read_unaligned(); - (dest_ptr.add(i) as *mut u64).write_unaligned(val); - i += 8; - } - while i < length { - *dest_ptr.add(i) = *src_ptr.add(i); - i += 1; - } + // Optimization: Use u64 copy loop for overlapping case with offset >= 8. + // This avoids function call overhead of copy_nonoverlapping for small chunks. + // Since offset >= 8, we can read 8 bytes and write 8 bytes safely + // (the read source is at least 8 bytes behind the write destination). + let src_ptr = src; + let dest_ptr = out_next; + let mut i = 0; + while i + 8 <= length { + let val = (src_ptr.add(i) as *const u64).read_unaligned(); + (dest_ptr.add(i) as *mut u64).write_unaligned(val); + i += 8; + } + while i < length { + *dest_ptr.add(i) = *src_ptr.add(i); + i += 1; } } out_next = out_next.add(length); @@ -1077,7 +1081,7 @@ impl Decompressor { } let hdr = u16::from_be_bytes([input[0], input[1]]); - if hdr % 31 != 0 { + if !hdr.is_multiple_of(31) { return (DecompressResult::BadData, 0, 0); } if ((hdr >> 8) & 0xF) as u8 != ZLIB_CM_DEFLATE { diff --git a/src/decompress/x86.rs b/src/decompress/x86.rs index 648be7d..6e81552 100644 --- a/src/decompress/x86.rs +++ b/src/decompress/x86.rs @@ -2266,9 +2266,9 @@ pub unsafe fn decompress_bmi2_ptr( 41 => decompress_offset_cycle3::<7>( out_next, src, v, length, ), - 42 => decompress_offset_42( - out_next, src, v, length, - ), + 42 => { + decompress_offset_42(out_next, src, v, length) + } 43 => decompress_offset_cycle3::<5>( out_next, src, v, length, ), diff --git a/src/stream.rs b/src/stream.rs index 47c07f2..e1d2fb4 100644 --- a/src/stream.rs +++ b/src/stream.rs @@ -66,7 +66,7 @@ impl DeflateEncoder { if output.len() < bound { output .try_reserve(bound - output.len()) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + .map_err(io::Error::other)?; // SAFETY: We just reserved sufficient capacity. The compressor writes to // the buffer using `MaybeUninit` pointers, so uninitialized memory is fine. unsafe { @@ -91,7 +91,7 @@ impl DeflateEncoder { writer.write_all(&output[..size])?; } } else { - return Err(io::Error::new(io::ErrorKind::Other, "Compression failed")); + return Err(io::Error::other("Compression failed")); } } else { let compressed_chunks: Vec> = chunks @@ -104,7 +104,7 @@ impl DeflateEncoder { if output.len() < bound { output .try_reserve(bound - output.len()) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + .map_err(io::Error::other)?; // SAFETY: We just reserved sufficient capacity. The compressor writes to // the buffer using `MaybeUninit` pointers, so uninitialized memory is fine. unsafe { @@ -127,7 +127,7 @@ impl DeflateEncoder { if res == CompressResult::Success { Ok(size) } else { - Err(io::Error::new(io::ErrorKind::Other, "Compression failed")) + Err(io::Error::other("Compression failed")) } }) .collect(); @@ -153,7 +153,7 @@ impl DeflateEncoder { if output.len() < bound { output .try_reserve(bound - output.len()) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + .map_err(io::Error::other)?; // SAFETY: We just reserved sufficient capacity. The compressor writes to // the buffer using `MaybeUninit` pointers, so uninitialized memory is fine. unsafe { @@ -178,7 +178,7 @@ impl DeflateEncoder { writer.write_all(&output[..size])?; } } else { - return Err(io::Error::new(io::ErrorKind::Other, "Compression failed")); + return Err(io::Error::other("Compression failed")); } } @@ -265,8 +265,8 @@ impl Read for DeflateDecoder { } loop { - if self.write_pos >= 64 * 1024 { - if self.read_pos >= 32 * 1024 { + if self.write_pos >= 64 * 1024 + && self.read_pos >= 32 * 1024 { self.window.copy_within( self.read_pos - 32 * 1024..self.write_pos, 32 * 1024 - (self.read_pos - 32 * 1024), @@ -278,7 +278,6 @@ impl Read for DeflateDecoder { self.write_pos = amount_to_keep; self.read_pos -= shift; } - } let mut output_full = false; if self.input_pos < self.input_cap { @@ -340,7 +339,7 @@ impl Read for DeflateDecoder { if self.input_buffer.len() < 1024 * 1024 { self.input_buffer.resize(self.input_buffer.len() * 2, 0); } else { - return Err(io::Error::new(io::ErrorKind::Other, "input buffer full")); + return Err(io::Error::other("input buffer full")); } }