Skip to content

Commit bb9bcc7

Browse files
authored
Merge pull request #234 from 404Setup/optimize-sequence-packing-16937576356832658656
Optimize Sequence packing to avoid table lookup in write loop
2 parents cbc39e8 + 961cda5 commit bb9bcc7

3 files changed

Lines changed: 103 additions & 84 deletions

File tree

bench_result.txt

Lines changed: 0 additions & 34 deletions
This file was deleted.

examples/manual_bench.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use libdeflate::Compressor;
2+
use std::time::Instant;
3+
4+
fn main() {
5+
let size = 256 * 1024;
6+
let mut data = Vec::with_capacity(size);
7+
for i in 0..size {
8+
data.push((i % 256) as u8);
9+
}
10+
11+
let mut compressor = Compressor::new(6).unwrap();
12+
let bound = compressor.deflate_compress_bound(size);
13+
let mut out_buf = vec![0u8; bound];
14+
15+
// Warmup
16+
for _ in 0..10 {
17+
compressor.compress_deflate_into(&data, &mut out_buf).unwrap();
18+
}
19+
20+
let start = Instant::now();
21+
let iterations = 2000;
22+
let mut total_bytes = 0;
23+
for _ in 0..iterations {
24+
let size = compressor.compress_deflate_into(&data, &mut out_buf).unwrap();
25+
total_bytes += size;
26+
}
27+
let duration = start.elapsed();
28+
29+
println!("Compressed {} iterations of {} bytes", iterations, size);
30+
println!("Total time: {:?}", duration);
31+
println!("Throughput: {:.2} GiB/s", (iterations as f64 * size as f64) / duration.as_secs_f64() / 1024.0 / 1024.0 / 1024.0);
32+
}

src/compress/mod.rs

Lines changed: 71 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,27 @@ struct Sequence {
100100
offset: u16,
101101
}
102102

103+
impl Sequence {
104+
#[inline(always)]
105+
fn new(litrunlen: u32, len: u16, offset: u16, off_slot: u8) -> Self {
106+
Self {
107+
litrunlen,
108+
length: len | ((off_slot as u16) << 9),
109+
offset,
110+
}
111+
}
112+
113+
#[inline(always)]
114+
fn len(&self) -> u16 {
115+
self.length & 0x1FF
116+
}
117+
118+
#[inline(always)]
119+
fn off_slot(&self) -> usize {
120+
(self.length >> 9) as usize
121+
}
122+
}
123+
103124
#[derive(Clone, Copy)]
104125
struct DPNode {
105126
cost: u32,
@@ -1067,14 +1088,16 @@ impl Compressor {
10671088
}
10681089
}
10691090

1070-
self.sequences.push(Sequence {
1091+
let off_slot = self.get_offset_slot(offset);
1092+
self.sequences.push(Sequence::new(
10711093
litrunlen,
1072-
length: len as u16,
1073-
offset: offset as u16,
1074-
});
1094+
len as u16,
1095+
offset as u16,
1096+
off_slot as u8,
1097+
));
10751098
self.split_stats.observe_match(len, offset);
10761099
self.litlen_freqs[257 + self.get_length_slot(len)] += 1;
1077-
self.offset_freqs[self.get_offset_slot(offset)] += 1;
1100+
self.offset_freqs[off_slot] += 1;
10781101
litrunlen = 0;
10791102
if len - 1 > skipped {
10801103
mf.skip_positions(
@@ -1092,11 +1115,7 @@ impl Compressor {
10921115
in_idx += 1;
10931116
}
10941117
}
1095-
self.sequences.push(Sequence {
1096-
litrunlen,
1097-
length: 0,
1098-
offset: 0,
1099-
});
1118+
self.sequences.push(Sequence::new(litrunlen, 0, 0, 0));
11001119
self.litlen_freqs[256] += 1;
11011120
in_idx - start_pos
11021121
}
@@ -1149,15 +1168,18 @@ impl Compressor {
11491168
}
11501169
}
11511170
}
1152-
if seq.length >= 3 {
1171+
let len = seq.len();
1172+
if len >= 3 {
1173+
let offset = seq.offset as usize;
1174+
let off_slot = seq.off_slot();
11531175
if bs.out_idx + 16 < bs.output.len() {
11541176
unsafe {
1155-
self.write_match_fast(bs, seq.length as usize, seq.offset as usize);
1177+
self.write_match_fast(bs, len as usize, offset, off_slot);
11561178
}
1157-
} else if !self.write_match(bs, seq.length as usize, seq.offset as usize) {
1179+
} else if !self.write_match(bs, len as usize, offset, off_slot) {
11581180
return false;
11591181
}
1160-
in_pos += seq.length as usize;
1182+
in_pos += len as usize;
11611183
}
11621184
}
11631185
if !self.write_sym(bs, 256) {
@@ -1275,11 +1297,12 @@ impl Compressor {
12751297
while in_idx < input.len() {
12761298
let (len, offset) = mf.find_match(input, in_idx, self.max_search_depth);
12771299
if len >= 3 {
1278-
self.sequences.push(Sequence {
1300+
self.sequences.push(Sequence::new(
12791301
litrunlen,
1280-
length: len as u16,
1281-
offset: offset as u16,
1282-
});
1302+
len as u16,
1303+
offset as u16,
1304+
self.get_offset_slot(offset) as u8,
1305+
));
12831306
litrunlen = 0;
12841307
mf.skip_positions(input, in_idx + 1, len - 1, self.max_search_depth);
12851308
in_idx += len;
@@ -1300,11 +1323,12 @@ impl Compressor {
13001323
let (len, offset) = mf.find_match(input, in_idx, self.max_search_depth);
13011324
if len >= 3 {
13021325
self.split_stats.observe_match(len, offset);
1303-
self.sequences.push(Sequence {
1326+
self.sequences.push(Sequence::new(
13041327
litrunlen,
1305-
length: len as u16,
1306-
offset: offset as u16,
1307-
});
1328+
len as u16,
1329+
offset as u16,
1330+
self.get_offset_slot(offset) as u8,
1331+
));
13081332
litrunlen = 0;
13091333
mf.skip_positions(input, in_idx + 1, len - 1, self.max_search_depth);
13101334
in_idx += len;
@@ -1315,11 +1339,7 @@ impl Compressor {
13151339
}
13161340
}
13171341
}
1318-
self.sequences.push(Sequence {
1319-
litrunlen,
1320-
length: 0,
1321-
offset: 0,
1322-
});
1342+
self.sequences.push(Sequence::new(litrunlen, 0, 0, 0));
13231343

13241344
let processed = in_idx - start_pos;
13251345
let is_final = (start_pos + processed >= input.len()) && final_block;
@@ -1353,15 +1373,18 @@ impl Compressor {
13531373
}
13541374
}
13551375
}
1356-
if seq.length >= 3 {
1376+
let len = seq.len();
1377+
if len >= 3 {
1378+
let offset = seq.offset as usize;
1379+
let off_slot = seq.off_slot();
13571380
if bs.out_idx + 16 < bs.output.len() {
13581381
unsafe {
1359-
self.write_match_fast(bs, seq.length as usize, seq.offset as usize);
1382+
self.write_match_fast(bs, len as usize, offset, off_slot);
13601383
}
1361-
} else if !self.write_match(bs, seq.length as usize, seq.offset as usize) {
1384+
} else if !self.write_match(bs, len as usize, offset, off_slot) {
13621385
return 0;
13631386
}
1364-
in_pos += seq.length as usize;
1387+
in_pos += len as usize;
13651388
}
13661389
}
13671390
if !self.write_sym(bs, 256) {
@@ -1415,13 +1438,15 @@ impl Compressor {
14151438
while cur_in_idx < block_input.len() {
14161439
let (len, offset) = mf.find_match(block_input, cur_in_idx, self.max_search_depth);
14171440
if len >= 3 {
1418-
self.sequences.push(Sequence {
1441+
let off_slot = self.get_offset_slot(offset);
1442+
self.sequences.push(Sequence::new(
14191443
litrunlen,
1420-
length: len as u16,
1421-
offset: offset as u16,
1422-
});
1444+
len as u16,
1445+
offset as u16,
1446+
off_slot as u8,
1447+
));
14231448
self.litlen_freqs[257 + self.get_length_slot(len)] += 1;
1424-
self.offset_freqs[self.get_offset_slot(offset)] += 1;
1449+
self.offset_freqs[off_slot] += 1;
14251450
litrunlen = 0;
14261451
cur_in_idx += len;
14271452
for i in 1..len {
@@ -1519,22 +1544,20 @@ impl Compressor {
15191544
litrunlen += 1;
15201545
cur_pos += 1;
15211546
} else {
1522-
self.sequences.push(Sequence {
1547+
let off_slot = self.get_offset_slot(node.offset as usize);
1548+
self.sequences.push(Sequence::new(
15231549
litrunlen,
1524-
length: node.length,
1525-
offset: node.offset,
1526-
});
1550+
node.length,
1551+
node.offset,
1552+
off_slot as u8,
1553+
));
15271554
self.litlen_freqs[257 + self.get_length_slot(node.length as usize)] += 1;
1528-
self.offset_freqs[self.get_offset_slot(node.offset as usize)] += 1;
1555+
self.offset_freqs[off_slot] += 1;
15291556
litrunlen = 0;
15301557
cur_pos += node.length as usize;
15311558
}
15321559
}
1533-
self.sequences.push(Sequence {
1534-
litrunlen,
1535-
length: 0,
1536-
offset: 0,
1537-
});
1560+
self.sequences.push(Sequence::new(litrunlen, 0, 0, 0));
15381561

15391562
make_huffman_code(
15401563
DEFLATE_NUM_LITLEN_SYMS,
@@ -1756,7 +1779,7 @@ impl Compressor {
17561779
}
17571780

17581781
#[inline(always)]
1759-
unsafe fn write_match_fast(&self, bs: &mut Bitstream, len: usize, offset: usize) {
1782+
unsafe fn write_match_fast(&self, bs: &mut Bitstream, len: usize, offset: usize, off_slot: usize) {
17601783
let entry = *self.match_len_table.get_unchecked(len);
17611784
let code = entry as u16 as u32;
17621785
let huff_len = (entry >> 16) as u8 as u32;
@@ -1768,7 +1791,6 @@ impl Compressor {
17681791

17691792
bs.write_bits_unchecked_fast(len_val, len_len);
17701793

1771-
let off_slot = self.get_offset_slot(offset);
17721794
let entry = *self.offset_table.get_unchecked(off_slot);
17731795
let off_code = entry as u32;
17741796
let off_len = (entry >> 32) as u8 as u32;
@@ -1781,7 +1803,7 @@ impl Compressor {
17811803
bs.write_bits_unchecked_fast(off_val, off_len_total);
17821804
}
17831805

1784-
fn write_match(&self, bs: &mut Bitstream, len: usize, offset: usize) -> bool {
1806+
fn write_match(&self, bs: &mut Bitstream, len: usize, offset: usize, off_slot: usize) -> bool {
17851807
let entry = unsafe { *self.match_len_table.get_unchecked(len) };
17861808
let code = entry as u16 as u32;
17871809
let huff_len = (entry >> 16) as u8 as u32;
@@ -1795,7 +1817,6 @@ impl Compressor {
17951817
return false;
17961818
}
17971819

1798-
let off_slot = self.get_offset_slot(offset);
17991820
let entry = unsafe { *self.offset_table.get_unchecked(off_slot) };
18001821
let off_code = entry as u32;
18011822
let off_len = (entry >> 32) as u8 as u32;

0 commit comments

Comments
 (0)