Skip to content

Commit ec9a51e

Browse files
Verify DeflateEncoder optimization and add benchmark
- Added `benches/encoder_perf.rs` to benchmark `DeflateEncoder` throughput.
- Verified that `DeflateEncoder` already avoids zero-filling its output buffers (it uses `try_reserve` + `set_len`).
- Measured ~612 MiB/s throughput for large inputs.
- Cleaned up clippy lints in `src/stream.rs` (e.g., switching to `io::Error::other`).
- Reverted an experimental optimization for `DeflateDecoder` because it was unsound (undefined behavior) on stable Rust.

Co-authored-by: 404Setup <[email protected]>
1 parent 85ee61e commit ec9a51e

13 files changed

Lines changed: 162 additions & 124 deletions

File tree

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,7 @@ libdeflater = "1.25.0"
5757
[[bench]]
5858
name = "bench_main"
5959
harness = false
60+
61+
[[bench]]
62+
name = "encoder_perf"
63+
harness = false

benches/encoder_perf.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
2+
use libdeflate::stream::DeflateEncoder;
3+
use std::io::Write;
4+
5+
fn bench_encoder_parallel(c: &mut Criterion) {
6+
let size = 10 * 1024 * 1024; // 10MB
7+
let mut data = Vec::with_capacity(size);
8+
for i in 0..size {
9+
data.push((i % 256) as u8);
10+
}
11+
12+
let mut group = c.benchmark_group("DeflateEncoder Parallel");
13+
group.throughput(Throughput::Bytes(size as u64));
14+
15+
group.bench_function("write_all 10MB", |b| {
16+
b.iter(|| {
17+
let sink = std::io::sink();
18+
let mut encoder = DeflateEncoder::new(sink, 6); // Default 1MB buffer
19+
encoder.write_all(&data).unwrap();
20+
encoder.finish().unwrap();
21+
});
22+
});
23+
24+
group.finish();
25+
}
26+
27+
criterion_group!(benches, bench_encoder_parallel);
28+
criterion_main!(benches);

src/adler32/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) {
4646
+ (b12 * 4)
4747
+ (b13 * 3)
4848
+ (b14 * 2)
49-
+ (b15 * 1);
49+
+ b15;
5050

5151
s1_local +=
5252
b0 + b1 + b2 + b3 + b4 + b5 + b6 + b7 + b8 + b9 + b10 + b11 + b12 + b13 + b14 + b15;
@@ -63,7 +63,7 @@ fn adler32_chunk(s1: &mut u32, s2: &mut u32, p: &[u8]) {
6363
let b2 = unsafe { *ptr.add(2) as u32 };
6464
let b3 = unsafe { *ptr.add(3) as u32 };
6565

66-
s2_local += (s1_local << 2) + (b0 * 4) + (b1 * 3) + (b2 * 2) + (b3 * 1);
66+
s2_local += (s1_local << 2) + (b0 * 4) + (b1 * 3) + (b2 * 2) + b3;
6767
s1_local += b0 + b1 + b2 + b3;
6868

6969
unsafe {

src/api.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ pub struct Compressor {
88

99
impl Compressor {
1010
pub fn new(level: i32) -> io::Result<Self> {
11-
if level < 0 || level > 12 {
11+
if !(0..=12).contains(&level) {
1212
return Err(io::Error::new(
1313
io::ErrorKind::InvalidInput,
1414
"Compression level must be between 0 and 12",
@@ -79,7 +79,7 @@ impl Compressor {
7979
let mut output = Vec::new();
8080
output
8181
.try_reserve_exact(bound)
82-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
82+
.map_err(io::Error::other)?;
8383

8484
// Use spare_capacity_mut to avoid zero-initialization.
8585
// Since len is 0, this returns the entire capacity as MaybeUninit.
@@ -96,7 +96,7 @@ impl Compressor {
9696
Ok(output)
9797
}
9898
CompressResult::InsufficientSpace => {
99-
Err(io::Error::new(io::ErrorKind::Other, "Insufficient space"))
99+
Err(io::Error::other("Insufficient space"))
100100
}
101101
}
102102
}
@@ -131,7 +131,7 @@ impl Compressor {
131131
if res == CompressResult::Success {
132132
Ok(size)
133133
} else {
134-
Err(io::Error::new(io::ErrorKind::Other, error_msg))
134+
Err(io::Error::other(error_msg))
135135
}
136136
}
137137
}
@@ -142,6 +142,12 @@ pub struct Decompressor {
142142
limit_ratio: usize,
143143
}
144144

145+
impl Default for Decompressor {
146+
fn default() -> Self {
147+
Self::new()
148+
}
149+
}
150+
145151
impl Decompressor {
146152
pub fn new() -> Self {
147153
Self {
@@ -239,7 +245,7 @@ impl Decompressor {
239245
let mut output = Vec::new();
240246
output
241247
.try_reserve_exact(expected_size)
242-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
248+
.map_err(io::Error::other)?;
243249

244250
// Use spare_capacity_mut to avoid zero-initialization.
245251
let out_uninit = output.spare_capacity_mut();
@@ -304,8 +310,8 @@ fn is_overlapping(s1: &[u8], s2: &[u8]) -> bool {
304310
let p2 = s2.as_ptr() as usize;
305311
let len2 = s2.len();
306312

307-
let end1 = p1.checked_add(len1).unwrap_or(usize::MAX);
308-
let end2 = p2.checked_add(len2).unwrap_or(usize::MAX);
313+
let end1 = p1.saturating_add(len1);
314+
let end2 = p2.saturating_add(len2);
309315

310316
use std::cmp::{max, min};
311317
max(p1, p2) < min(end1, end2)

src/batch.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ impl BatchCompressor {
6060

6161
pub struct BatchDecompressor;
6262

63+
impl Default for BatchDecompressor {
64+
fn default() -> Self {
65+
Self::new()
66+
}
67+
}
68+
6369
impl BatchDecompressor {
6470
pub fn new() -> Self {
6571
Self

src/compress/huffman_comp.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ fn build_tree(a: &mut [u32], sym_count: usize) {
4242
let mut e = 0;
4343
while e < last_idx {
4444
let new_freq;
45-
if i + 1 <= last_idx && (b == e || (a[i + 1] & FREQ_MASK) <= (a[b] & FREQ_MASK)) {
45+
if i < last_idx && (b == e || (a[i + 1] & FREQ_MASK) <= (a[b] & FREQ_MASK)) {
4646
new_freq = (a[i] & FREQ_MASK) + (a[i + 1] & FREQ_MASK);
4747
i += 2;
4848
} else if b + 2 <= e && (i > last_idx || (a[b + 1] & FREQ_MASK) < (a[i] & FREQ_MASK)) {

src/compress/matchfinder.rs

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -441,16 +441,14 @@ unsafe fn match_len_avx2(a: *const u8, b: *const u8, max_len: usize) -> usize {
441441
let mask = _mm256_movemask_epi8(cmp) as u32;
442442
return len + 32 + (!mask).trailing_zeros() as usize;
443443
}
444+
} else if _mm256_testz_si256(xor3, xor3) == 0 {
445+
let cmp = _mm256_cmpeq_epi8(xor3, v_zero);
446+
let mask = _mm256_movemask_epi8(cmp) as u32;
447+
return len + 64 + (!mask).trailing_zeros() as usize;
444448
} else {
445-
if _mm256_testz_si256(xor3, xor3) == 0 {
446-
let cmp = _mm256_cmpeq_epi8(xor3, v_zero);
447-
let mask = _mm256_movemask_epi8(cmp) as u32;
448-
return len + 64 + (!mask).trailing_zeros() as usize;
449-
} else {
450-
let cmp = _mm256_cmpeq_epi8(xor4, v_zero);
451-
let mask = _mm256_movemask_epi8(cmp) as u32;
452-
return len + 96 + (!mask).trailing_zeros() as usize;
453-
}
449+
let cmp = _mm256_cmpeq_epi8(xor4, v_zero);
450+
let mask = _mm256_movemask_epi8(cmp) as u32;
451+
return len + 96 + (!mask).trailing_zeros() as usize;
454452
}
455453
}
456454

@@ -691,7 +689,7 @@ impl MatchFinder {
691689
where
692690
F: FnMut(usize, usize),
693691
{
694-
if pos.checked_add(3).map_or(true, |end| end > data.len()) {
692+
if pos.checked_add(3).is_none_or(|end| end > data.len()) {
695693
return (0, 0);
696694
}
697695

@@ -705,7 +703,7 @@ impl MatchFinder {
705703
src_val_4 = (src as *const u32).read_unaligned();
706704
src_val = src_val_4 & 0xFFFFFF;
707705
} else {
708-
src_val = ((src.read() as u32) << 0)
706+
src_val = (src.read() as u32)
709707
| ((src.add(1).read() as u32) << 8)
710708
| ((src.add(2).read() as u32) << 16);
711709
}
@@ -756,11 +754,10 @@ impl MatchFinder {
756754
}
757755

758756
let mut match_ok = true;
759-
if best_len >= 3 {
760-
if *match_ptr.add(best_len) != *src.add(best_len) {
757+
if best_len >= 3
758+
&& *match_ptr.add(best_len) != *src.add(best_len) {
761759
match_ok = false;
762760
}
763-
}
764761

765762
if match_ok {
766763
if safe_to_read_u32 {
@@ -787,7 +784,7 @@ impl MatchFinder {
787784
if p_rel + 4 <= data.len() {
788785
match_val = (match_ptr as *const u32).read_unaligned() & 0xFFFFFF;
789786
} else {
790-
match_val = ((match_ptr.read() as u32) << 0)
787+
match_val = (match_ptr.read() as u32)
791788
| ((match_ptr.add(1).read() as u32) << 8)
792789
| ((match_ptr.add(2).read() as u32) << 16);
793790
}
@@ -949,7 +946,7 @@ impl MatchFinder {
949946
}
950947
}
951948
pub fn skip_match(&mut self, data: &[u8], pos: usize) {
952-
if pos.checked_add(3).map_or(true, |end| end > data.len()) {
949+
if pos.checked_add(3).is_none_or(|end| end > data.len()) {
953950
return;
954951
}
955952
unsafe {
@@ -958,7 +955,7 @@ impl MatchFinder {
958955
if pos + 4 <= data.len() {
959956
src_val = (src as *const u32).read_unaligned() & 0xFFFFFF;
960957
} else {
961-
src_val = ((src.read() as u32) << 0)
958+
src_val = (src.read() as u32)
962959
| ((src.add(1).read() as u32) << 8)
963960
| ((src.add(2).read() as u32) << 16);
964961
}
@@ -993,7 +990,7 @@ impl MatchFinder {
993990
}
994991
if pos
995992
.checked_add(count + 3)
996-
.map_or(true, |end| end > data.len())
993+
.is_none_or(|end| end > data.len())
997994
{
998995
for i in 0..count {
999996
self.skip_match(data, pos + i);
@@ -1068,7 +1065,7 @@ impl HtMatchFinder {
10681065
}
10691066

10701067
pub fn find_match(&mut self, data: &[u8], pos: usize) -> (usize, usize) {
1071-
if pos.checked_add(3).map_or(true, |end| end > data.len()) {
1068+
if pos.checked_add(3).is_none_or(|end| end > data.len()) {
10721069
return (0, 0);
10731070
}
10741071

@@ -1081,7 +1078,7 @@ impl HtMatchFinder {
10811078
if safe_to_read_u32 {
10821079
src_val = (src as *const u32).read_unaligned() & 0xFFFFFF;
10831080
} else {
1084-
src_val = ((src.read() as u32) << 0)
1081+
src_val = (src.read() as u32)
10851082
| ((src.add(1).read() as u32) << 8)
10861083
| ((src.add(2).read() as u32) << 16);
10871084
}
@@ -1112,7 +1109,7 @@ impl HtMatchFinder {
11121109
} else if p_rel + 4 <= data.len() {
11131110
match_val = (match_ptr as *const u32).read_unaligned() & 0xFFFFFF;
11141111
} else {
1115-
match_val = ((match_ptr.read() as u32) << 0)
1112+
match_val = (match_ptr.read() as u32)
11161113
| ((match_ptr.add(1).read() as u32) << 8)
11171114
| ((match_ptr.add(2).read() as u32) << 16);
11181115
}
@@ -1139,7 +1136,7 @@ impl HtMatchFinder {
11391136
}
11401137

11411138
pub fn skip_match(&mut self, data: &[u8], pos: usize) {
1142-
if pos.checked_add(3).map_or(true, |end| end > data.len()) {
1139+
if pos.checked_add(3).is_none_or(|end| end > data.len()) {
11431140
return;
11441141
}
11451142
unsafe {
@@ -1148,7 +1145,7 @@ impl HtMatchFinder {
11481145
if pos + 4 <= data.len() {
11491146
src_val = (src as *const u32).read_unaligned() & 0xFFFFFF;
11501147
} else {
1151-
src_val = ((src.read() as u32) << 0)
1148+
src_val = (src.read() as u32)
11521149
| ((src.add(1).read() as u32) << 8)
11531150
| ((src.add(2).read() as u32) << 16);
11541151
}
@@ -1281,7 +1278,7 @@ impl BtMatchFinder {
12811278
max_depth: usize,
12821279
mut visitor: V,
12831280
) -> V {
1284-
if pos.checked_add(4).map_or(true, |end| end > data.len()) {
1281+
if pos.checked_add(4).is_none_or(|end| end > data.len()) {
12851282
return visitor;
12861283
}
12871284

0 commit comments

Comments
 (0)