Skip to content

Commit 961cda5

Browse files
Optimize Sequence struct packing to reduce cache pressure during compression
Packed the offset slot into the unused high bits of the length field in the `Sequence` struct. This allows the offset slot to be pre-calculated during the match-finding phase, avoiding a random access to the 32KB `OFFSET_SLOT_TABLE` during the critical bitstream-writing phase.

- Modified the `Sequence` struct to provide helper methods for packing/unpacking.
- Updated `decide_greedy_sequences`, `compress_near_optimal_block`, and `compress_greedy_block` to pack the offset slot.
- Updated `write_match_fast` and `write_match` to use the pre-calculated offset slot.

Co-authored-by: 404Setup <[email protected]>
1 parent cbc39e8 commit 961cda5

3 files changed

Lines changed: 103 additions & 84 deletions

File tree

bench_result.txt

Lines changed: 0 additions & 34 deletions
This file was deleted.

examples/manual_bench.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use libdeflate::Compressor;
2+
use std::time::Instant;
3+
4+
fn main() {
5+
let size = 256 * 1024;
6+
let mut data = Vec::with_capacity(size);
7+
for i in 0..size {
8+
data.push((i % 256) as u8);
9+
}
10+
11+
let mut compressor = Compressor::new(6).unwrap();
12+
let bound = compressor.deflate_compress_bound(size);
13+
let mut out_buf = vec![0u8; bound];
14+
15+
// Warmup
16+
for _ in 0..10 {
17+
compressor.compress_deflate_into(&data, &mut out_buf).unwrap();
18+
}
19+
20+
let start = Instant::now();
21+
let iterations = 2000;
22+
let mut total_bytes = 0;
23+
for _ in 0..iterations {
24+
let size = compressor.compress_deflate_into(&data, &mut out_buf).unwrap();
25+
total_bytes += size;
26+
}
27+
let duration = start.elapsed();
28+
29+
println!("Compressed {} iterations of {} bytes", iterations, size);
30+
println!("Total time: {:?}", duration);
31+
println!("Throughput: {:.2} GiB/s", (iterations as f64 * size as f64) / duration.as_secs_f64() / 1024.0 / 1024.0 / 1024.0);
32+
}

src/compress/mod.rs

Lines changed: 71 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,27 @@ struct Sequence {
100100
offset: u16,
101101
}
102102

103+
impl Sequence {
104+
#[inline(always)]
105+
fn new(litrunlen: u32, len: u16, offset: u16, off_slot: u8) -> Self {
106+
Self {
107+
litrunlen,
108+
length: len | ((off_slot as u16) << 9),
109+
offset,
110+
}
111+
}
112+
113+
#[inline(always)]
114+
fn len(&self) -> u16 {
115+
self.length & 0x1FF
116+
}
117+
118+
#[inline(always)]
119+
fn off_slot(&self) -> usize {
120+
(self.length >> 9) as usize
121+
}
122+
}
123+
103124
#[derive(Clone, Copy)]
104125
struct DPNode {
105126
cost: u32,
@@ -1067,14 +1088,16 @@ impl Compressor {
10671088
}
10681089
}
10691090

1070-
self.sequences.push(Sequence {
1091+
let off_slot = self.get_offset_slot(offset);
1092+
self.sequences.push(Sequence::new(
10711093
litrunlen,
1072-
length: len as u16,
1073-
offset: offset as u16,
1074-
});
1094+
len as u16,
1095+
offset as u16,
1096+
off_slot as u8,
1097+
));
10751098
self.split_stats.observe_match(len, offset);
10761099
self.litlen_freqs[257 + self.get_length_slot(len)] += 1;
1077-
self.offset_freqs[self.get_offset_slot(offset)] += 1;
1100+
self.offset_freqs[off_slot] += 1;
10781101
litrunlen = 0;
10791102
if len - 1 > skipped {
10801103
mf.skip_positions(
@@ -1092,11 +1115,7 @@ impl Compressor {
10921115
in_idx += 1;
10931116
}
10941117
}
1095-
self.sequences.push(Sequence {
1096-
litrunlen,
1097-
length: 0,
1098-
offset: 0,
1099-
});
1118+
self.sequences.push(Sequence::new(litrunlen, 0, 0, 0));
11001119
self.litlen_freqs[256] += 1;
11011120
in_idx - start_pos
11021121
}
@@ -1149,15 +1168,18 @@ impl Compressor {
11491168
}
11501169
}
11511170
}
1152-
if seq.length >= 3 {
1171+
let len = seq.len();
1172+
if len >= 3 {
1173+
let offset = seq.offset as usize;
1174+
let off_slot = seq.off_slot();
11531175
if bs.out_idx + 16 < bs.output.len() {
11541176
unsafe {
1155-
self.write_match_fast(bs, seq.length as usize, seq.offset as usize);
1177+
self.write_match_fast(bs, len as usize, offset, off_slot);
11561178
}
1157-
} else if !self.write_match(bs, seq.length as usize, seq.offset as usize) {
1179+
} else if !self.write_match(bs, len as usize, offset, off_slot) {
11581180
return false;
11591181
}
1160-
in_pos += seq.length as usize;
1182+
in_pos += len as usize;
11611183
}
11621184
}
11631185
if !self.write_sym(bs, 256) {
@@ -1275,11 +1297,12 @@ impl Compressor {
12751297
while in_idx < input.len() {
12761298
let (len, offset) = mf.find_match(input, in_idx, self.max_search_depth);
12771299
if len >= 3 {
1278-
self.sequences.push(Sequence {
1300+
self.sequences.push(Sequence::new(
12791301
litrunlen,
1280-
length: len as u16,
1281-
offset: offset as u16,
1282-
});
1302+
len as u16,
1303+
offset as u16,
1304+
self.get_offset_slot(offset) as u8,
1305+
));
12831306
litrunlen = 0;
12841307
mf.skip_positions(input, in_idx + 1, len - 1, self.max_search_depth);
12851308
in_idx += len;
@@ -1300,11 +1323,12 @@ impl Compressor {
13001323
let (len, offset) = mf.find_match(input, in_idx, self.max_search_depth);
13011324
if len >= 3 {
13021325
self.split_stats.observe_match(len, offset);
1303-
self.sequences.push(Sequence {
1326+
self.sequences.push(Sequence::new(
13041327
litrunlen,
1305-
length: len as u16,
1306-
offset: offset as u16,
1307-
});
1328+
len as u16,
1329+
offset as u16,
1330+
self.get_offset_slot(offset) as u8,
1331+
));
13081332
litrunlen = 0;
13091333
mf.skip_positions(input, in_idx + 1, len - 1, self.max_search_depth);
13101334
in_idx += len;
@@ -1315,11 +1339,7 @@ impl Compressor {
13151339
}
13161340
}
13171341
}
1318-
self.sequences.push(Sequence {
1319-
litrunlen,
1320-
length: 0,
1321-
offset: 0,
1322-
});
1342+
self.sequences.push(Sequence::new(litrunlen, 0, 0, 0));
13231343

13241344
let processed = in_idx - start_pos;
13251345
let is_final = (start_pos + processed >= input.len()) && final_block;
@@ -1353,15 +1373,18 @@ impl Compressor {
13531373
}
13541374
}
13551375
}
1356-
if seq.length >= 3 {
1376+
let len = seq.len();
1377+
if len >= 3 {
1378+
let offset = seq.offset as usize;
1379+
let off_slot = seq.off_slot();
13571380
if bs.out_idx + 16 < bs.output.len() {
13581381
unsafe {
1359-
self.write_match_fast(bs, seq.length as usize, seq.offset as usize);
1382+
self.write_match_fast(bs, len as usize, offset, off_slot);
13601383
}
1361-
} else if !self.write_match(bs, seq.length as usize, seq.offset as usize) {
1384+
} else if !self.write_match(bs, len as usize, offset, off_slot) {
13621385
return 0;
13631386
}
1364-
in_pos += seq.length as usize;
1387+
in_pos += len as usize;
13651388
}
13661389
}
13671390
if !self.write_sym(bs, 256) {
@@ -1415,13 +1438,15 @@ impl Compressor {
14151438
while cur_in_idx < block_input.len() {
14161439
let (len, offset) = mf.find_match(block_input, cur_in_idx, self.max_search_depth);
14171440
if len >= 3 {
1418-
self.sequences.push(Sequence {
1441+
let off_slot = self.get_offset_slot(offset);
1442+
self.sequences.push(Sequence::new(
14191443
litrunlen,
1420-
length: len as u16,
1421-
offset: offset as u16,
1422-
});
1444+
len as u16,
1445+
offset as u16,
1446+
off_slot as u8,
1447+
));
14231448
self.litlen_freqs[257 + self.get_length_slot(len)] += 1;
1424-
self.offset_freqs[self.get_offset_slot(offset)] += 1;
1449+
self.offset_freqs[off_slot] += 1;
14251450
litrunlen = 0;
14261451
cur_in_idx += len;
14271452
for i in 1..len {
@@ -1519,22 +1544,20 @@ impl Compressor {
15191544
litrunlen += 1;
15201545
cur_pos += 1;
15211546
} else {
1522-
self.sequences.push(Sequence {
1547+
let off_slot = self.get_offset_slot(node.offset as usize);
1548+
self.sequences.push(Sequence::new(
15231549
litrunlen,
1524-
length: node.length,
1525-
offset: node.offset,
1526-
});
1550+
node.length,
1551+
node.offset,
1552+
off_slot as u8,
1553+
));
15271554
self.litlen_freqs[257 + self.get_length_slot(node.length as usize)] += 1;
1528-
self.offset_freqs[self.get_offset_slot(node.offset as usize)] += 1;
1555+
self.offset_freqs[off_slot] += 1;
15291556
litrunlen = 0;
15301557
cur_pos += node.length as usize;
15311558
}
15321559
}
1533-
self.sequences.push(Sequence {
1534-
litrunlen,
1535-
length: 0,
1536-
offset: 0,
1537-
});
1560+
self.sequences.push(Sequence::new(litrunlen, 0, 0, 0));
15381561

15391562
make_huffman_code(
15401563
DEFLATE_NUM_LITLEN_SYMS,
@@ -1756,7 +1779,7 @@ impl Compressor {
17561779
}
17571780

17581781
#[inline(always)]
1759-
unsafe fn write_match_fast(&self, bs: &mut Bitstream, len: usize, offset: usize) {
1782+
unsafe fn write_match_fast(&self, bs: &mut Bitstream, len: usize, offset: usize, off_slot: usize) {
17601783
let entry = *self.match_len_table.get_unchecked(len);
17611784
let code = entry as u16 as u32;
17621785
let huff_len = (entry >> 16) as u8 as u32;
@@ -1768,7 +1791,6 @@ impl Compressor {
17681791

17691792
bs.write_bits_unchecked_fast(len_val, len_len);
17701793

1771-
let off_slot = self.get_offset_slot(offset);
17721794
let entry = *self.offset_table.get_unchecked(off_slot);
17731795
let off_code = entry as u32;
17741796
let off_len = (entry >> 32) as u8 as u32;
@@ -1781,7 +1803,7 @@ impl Compressor {
17811803
bs.write_bits_unchecked_fast(off_val, off_len_total);
17821804
}
17831805

1784-
fn write_match(&self, bs: &mut Bitstream, len: usize, offset: usize) -> bool {
1806+
fn write_match(&self, bs: &mut Bitstream, len: usize, offset: usize, off_slot: usize) -> bool {
17851807
let entry = unsafe { *self.match_len_table.get_unchecked(len) };
17861808
let code = entry as u16 as u32;
17871809
let huff_len = (entry >> 16) as u8 as u32;
@@ -1795,7 +1817,6 @@ impl Compressor {
17951817
return false;
17961818
}
17971819

1798-
let off_slot = self.get_offset_slot(offset);
17991820
let entry = unsafe { *self.offset_table.get_unchecked(off_slot) };
18001821
let off_code = entry as u32;
18011822
let off_len = (entry >> 32) as u8 as u32;

0 commit comments

Comments
 (0)