From 961cda584ee06b48d6cad521d3c34728028e613e Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 18 Feb 2026 23:46:56 +0000 Subject: [PATCH] Optimize Sequence struct packing to reduce cache pressure during compression Packed the offset slot into the unused high bits of the length field in the `Sequence` struct. This allows the offset slot to be pre-calculated during the match finding phase, avoiding a random access to the 32KB `OFFSET_SLOT_TABLE` during the critical bitstream writing phase. - Modified `Sequence` struct to provide helper methods for packing/unpacking. - Updated `decide_greedy_sequences`, `compress_near_optimal_block`, and `compress_greedy_block` to pack the offset slot. - Updated `write_match_fast` and `write_match` to use the pre-calculated offset slot. Co-authored-by: 404Setup <153366651+404Setup@users.noreply.github.com> --- bench_result.txt | 34 ----------- examples/manual_bench.rs | 32 +++++++++++ src/compress/mod.rs | 121 +++++++++++++++++++++++---------------- 3 files changed, 103 insertions(+), 84 deletions(-) delete mode 100644 bench_result.txt create mode 100644 examples/manual_bench.rs diff --git a/bench_result.txt b/bench_result.txt deleted file mode 100644 index f36cc98..0000000 --- a/bench_result.txt +++ /dev/null @@ -1,34 +0,0 @@ -warning: unused import: `HUFFDEC_END_OF_BLOCK` - --> src/decompress/x86.rs:3:5 - | -3 | HUFFDEC_END_OF_BLOCK, HUFFDEC_EXCEPTIONAL, HUFFDEC_LITERAL, HUFFDEC_SUBTABLE_POINTER, - | ^^^^^^^^^^^^^^^^^^^^ - | - = note: `#[warn(unused_imports)]` (part of `#[warn(unused)]`) on by default - -warning: constant `OFFSET12_MASKS` is never used - --> src/decompress/x86.rs:70:7 - | -70 | const OFFSET12_MASKS: [u8; 48] = [ - | ^^^^^^^^^^^^^^ - | - = note: `#[warn(dead_code)]` (part of `#[warn(unused)]`) on by default - -warning: constant `OFFSET10_MASKS` is never used - --> src/decompress/x86.rs:76:7 - | -76 | const OFFSET10_MASKS: [u8; 80] = [ - | ^^^^^^^^^^^^^^ - -warning: constant `OFFSET14_MASKS` is never used - --> src/decompress/x86.rs:106:7 - | -106 | const OFFSET14_MASKS: [u8; 112] = [ - | ^^^^^^^^^^^^^^ - -warning: `libdeflate` (lib) generated 4 warnings (run `cargo fix --lib -p libdeflate` to apply 1 suggestion) - Finished `bench` profile [optimized] target(s) in 0.08s - Running benches/bench_main.rs (target/release/deps/bench_main-c40c548458fc7066) -Gnuplot not found, using plotters backend -Benchmarking Decompress offset5/libdeflate-rs offset5 -Benchmarking Decompress offset5/libdeflate-rs offset5: Warming up for 3.0000 s diff --git a/examples/manual_bench.rs b/examples/manual_bench.rs new file mode 100644 index 0000000..8ec0ffc --- /dev/null +++ b/examples/manual_bench.rs @@ -0,0 +1,32 @@ +use libdeflate::Compressor; +use std::time::Instant; + +fn main() { + let size = 256 * 1024; + let mut data = Vec::with_capacity(size); + for i in 0..size { + data.push((i % 256) as u8); + } + + let mut compressor = Compressor::new(6).unwrap(); + let bound = compressor.deflate_compress_bound(size); + let mut out_buf = vec![0u8; bound]; + + // Warmup + for _ in 0..10 { + compressor.compress_deflate_into(&data, &mut out_buf).unwrap(); + } + + let start = Instant::now(); + let iterations = 2000; + let mut total_bytes = 0; + for _ in 0..iterations { + let size = compressor.compress_deflate_into(&data, &mut out_buf).unwrap(); + total_bytes += size; + } + let duration = start.elapsed(); + + println!("Compressed {} iterations of {} bytes", iterations, size); + println!("Total time: {:?}", duration); + println!("Throughput: {:.2} GiB/s", (iterations as f64 * size as f64) / duration.as_secs_f64() / 1024.0 / 1024.0 / 1024.0); +} diff --git a/src/compress/mod.rs b/src/compress/mod.rs index 96bd120..7d95c7c 100644 --- a/src/compress/mod.rs +++ b/src/compress/mod.rs @@ -100,6 +100,27 @@ struct Sequence { offset: u16, } +impl Sequence { + #[inline(always)] + fn new(litrunlen: u32, len: u16, offset: u16, off_slot: u8) -> Self { + Self { + litrunlen, + length: len | ((off_slot as u16) << 9), + offset, + } + } + + #[inline(always)] + fn len(&self) -> u16 { + self.length & 0x1FF + } + + #[inline(always)] + fn off_slot(&self) -> usize { + (self.length >> 9) as usize + } +} + #[derive(Clone, Copy)] struct DPNode { cost: u32, @@ -1067,14 +1088,16 @@ impl Compressor { } } - self.sequences.push(Sequence { + let off_slot = self.get_offset_slot(offset); + self.sequences.push(Sequence::new( litrunlen, - length: len as u16, - offset: offset as u16, - }); + len as u16, + offset as u16, + off_slot as u8, + )); self.split_stats.observe_match(len, offset); self.litlen_freqs[257 + self.get_length_slot(len)] += 1; - self.offset_freqs[self.get_offset_slot(offset)] += 1; + self.offset_freqs[off_slot] += 1; litrunlen = 0; if len - 1 > skipped { mf.skip_positions( @@ -1092,11 +1115,7 @@ impl Compressor { in_idx += 1; } } - self.sequences.push(Sequence { - litrunlen, - length: 0, - offset: 0, - }); + self.sequences.push(Sequence::new(litrunlen, 0, 0, 0)); self.litlen_freqs[256] += 1; in_idx - start_pos } @@ -1149,15 +1168,18 @@ impl Compressor { } } } - if seq.length >= 3 { + let len = seq.len(); + if len >= 3 { + let offset = seq.offset as usize; + let off_slot = seq.off_slot(); if bs.out_idx + 16 < bs.output.len() { unsafe { - self.write_match_fast(bs, seq.length as usize, seq.offset as usize); + self.write_match_fast(bs, len as usize, offset, off_slot); } - } else if !self.write_match(bs, seq.length as usize, seq.offset as usize) { + } else if !self.write_match(bs, len as usize, offset, off_slot) { return false; } - in_pos += seq.length as usize; + in_pos += len as usize; } } if !self.write_sym(bs, 256) { @@ -1275,11 +1297,12 @@ impl Compressor { while in_idx < input.len() { let (len, offset) = mf.find_match(input, in_idx, self.max_search_depth); if len >= 3 { - self.sequences.push(Sequence { + self.sequences.push(Sequence::new( litrunlen, - length: len as u16, - offset: offset as u16, - }); + len as u16, + offset as u16, + self.get_offset_slot(offset) as u8, + )); litrunlen = 0; mf.skip_positions(input, in_idx + 1, len - 1, self.max_search_depth); in_idx += len; @@ -1300,11 +1323,12 @@ impl Compressor { let (len, offset) = mf.find_match(input, in_idx, self.max_search_depth); if len >= 3 { self.split_stats.observe_match(len, offset); - self.sequences.push(Sequence { + self.sequences.push(Sequence::new( litrunlen, - length: len as u16, - offset: offset as u16, - }); + len as u16, + offset as u16, + self.get_offset_slot(offset) as u8, + )); litrunlen = 0; mf.skip_positions(input, in_idx + 1, len - 1, self.max_search_depth); in_idx += len; @@ -1315,11 +1339,7 @@ impl Compressor { } } } - self.sequences.push(Sequence { - litrunlen, - length: 0, - offset: 0, - }); + self.sequences.push(Sequence::new(litrunlen, 0, 0, 0)); let processed = in_idx - start_pos; let is_final = (start_pos + processed >= input.len()) && final_block; @@ -1353,15 +1373,18 @@ impl Compressor { } } } - if seq.length >= 3 { + let len = seq.len(); + if len >= 3 { + let offset = seq.offset as usize; + let off_slot = seq.off_slot(); if bs.out_idx + 16 < bs.output.len() { unsafe { - self.write_match_fast(bs, seq.length as usize, seq.offset as usize); + self.write_match_fast(bs, len as usize, offset, off_slot); } - } else if !self.write_match(bs, seq.length as usize, seq.offset as usize) { + } else if !self.write_match(bs, len as usize, offset, off_slot) { return 0; } - in_pos += seq.length as usize; + in_pos += len as usize; } } if !self.write_sym(bs, 256) { @@ -1415,13 +1438,15 @@ impl Compressor { while cur_in_idx < block_input.len() { let (len, offset) = mf.find_match(block_input, cur_in_idx, self.max_search_depth); if len >= 3 { - self.sequences.push(Sequence { + let off_slot = self.get_offset_slot(offset); + self.sequences.push(Sequence::new( litrunlen, - length: len as u16, - offset: offset as u16, - }); + len as u16, + offset as u16, + off_slot as u8, + )); self.litlen_freqs[257 + self.get_length_slot(len)] += 1; - self.offset_freqs[self.get_offset_slot(offset)] += 1; + self.offset_freqs[off_slot] += 1; litrunlen = 0; cur_in_idx += len; for i in 1..len { @@ -1519,22 +1544,20 @@ impl Compressor { litrunlen += 1; cur_pos += 1; } else { - self.sequences.push(Sequence { + let off_slot = self.get_offset_slot(node.offset as usize); + self.sequences.push(Sequence::new( litrunlen, - length: node.length, - offset: node.offset, - }); + node.length, + node.offset, + off_slot as u8, + )); self.litlen_freqs[257 + self.get_length_slot(node.length as usize)] += 1; - self.offset_freqs[self.get_offset_slot(node.offset as usize)] += 1; + self.offset_freqs[off_slot] += 1; litrunlen = 0; cur_pos += node.length as usize; } } - self.sequences.push(Sequence { - litrunlen, - length: 0, - offset: 0, - }); + self.sequences.push(Sequence::new(litrunlen, 0, 0, 0)); make_huffman_code( DEFLATE_NUM_LITLEN_SYMS, @@ -1756,7 +1779,7 @@ impl Compressor { } #[inline(always)] - unsafe fn write_match_fast(&self, bs: &mut Bitstream, len: usize, offset: usize) { + unsafe fn write_match_fast(&self, bs: &mut Bitstream, len: usize, offset: usize, off_slot: usize) { let entry = *self.match_len_table.get_unchecked(len); let code = entry as u16 as u32; let huff_len = (entry >> 16) as u8 as u32; @@ -1768,7 +1791,6 @@ impl Compressor { bs.write_bits_unchecked_fast(len_val, len_len); - let off_slot = self.get_offset_slot(offset); let entry = *self.offset_table.get_unchecked(off_slot); let off_code = entry as u32; let off_len = (entry >> 32) as u8 as u32; @@ -1781,7 +1803,7 @@ impl Compressor { bs.write_bits_unchecked_fast(off_val, off_len_total); } - fn write_match(&self, bs: &mut Bitstream, len: usize, offset: usize) -> bool { + fn write_match(&self, bs: &mut Bitstream, len: usize, offset: usize, off_slot: usize) -> bool { let entry = unsafe { *self.match_len_table.get_unchecked(len) }; let code = entry as u16 as u32; let huff_len = (entry >> 16) as u8 as u32; @@ -1795,7 +1817,6 @@ impl Compressor { return false; } - let off_slot = self.get_offset_slot(offset); let entry = unsafe { *self.offset_table.get_unchecked(off_slot) }; let off_code = entry as u32; let off_len = (entry >> 32) as u8 as u32;