Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions benches/bench_main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,7 @@ criterion_group!(
bench_decompress_offset3_small,
bench_decompress_offset9_small,
bench_decompress_offset5,
bench_decompress_offset6_micro,
bench_decompress_offset1,
bench_decompress_offset2,
bench_decompress_offset4,
Expand Down Expand Up @@ -1680,3 +1681,35 @@ fn bench_crc32_slice8_tail(c: &mut Criterion) {

group.finish();
}

/// Benchmarks raw-DEFLATE decompression of 1 MiB made of the repeating
/// 6-byte pattern `123456`.
///
/// The data is compressed once up front; only the call to
/// `decompress_deflate_into` is timed. Throughput is reported in bytes of
/// decompressed output.
fn bench_decompress_offset6_micro(c: &mut Criterion) {
    let size = 1024 * 1024; // 1 MiB of uncompressed payload
    let pattern = b"123456"; // period of 6 bytes

    // Repeat the pattern up to exactly `size` bytes. 6 does not divide 1 MiB,
    // so the last repetition is cut short; `take` handles that without
    // over-allocating and truncating afterwards.
    let original_data: Vec<u8> = pattern.iter().copied().cycle().take(size).collect();

    // NOTE(review): `6` is presumably the compression level — confirm against
    // `Compressor::new`.
    let mut compressor = Compressor::new(6).unwrap();
    // Output buffer is deliberately larger than the input so compression
    // cannot run out of room.
    let mut compressed_data = vec![0u8; size + size / 2 + 1024];
    let compressed_size = compressor
        .compress_deflate_into(&original_data, &mut compressed_data)
        .unwrap();

    let mut out_buf = vec![0u8; size];

    // Untimed sanity check: the timed loop below swallows errors with
    // `unwrap_or(0)`, so without this a failing or incorrect decompress would
    // still produce a (misleading) throughput figure.
    {
        let mut decompressor = Decompressor::new();
        let _ = decompressor
            .decompress_deflate_into(&compressed_data[..compressed_size], &mut out_buf);
        assert!(
            out_buf == original_data,
            "offset6 micro benchmark: decompressed output does not match the original input"
        );
    }

    let mut group = c.benchmark_group("Decompress offset6 Micro");
    group.throughput(Throughput::Bytes(size as u64));

    group.bench_with_input("libdeflate-rs offset6 micro", &size, |b, &_size| {
        let mut decompressor = Decompressor::new();
        b.iter(|| {
            // Errors are mapped to 0 rather than unwrapped to keep a panic
            // path out of the measured closure; correctness was verified above.
            decompressor
                .decompress_deflate_into(&compressed_data[..compressed_size], &mut out_buf)
                .unwrap_or(0)
        });
    });

    group.finish();
}
63 changes: 62 additions & 1 deletion src/decompress/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ static OFFSET3_MASKS: [u8; 48] = [
2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2,
];

// Byte-shuffle indices for replicating a 6-byte pattern.
// LCM(6, 16) = 48, so the index sequence 0..=5 realigns with a 16-byte
// vector boundary after 48 entries, i.e. 3 vectors.
// Entry `i` is `i % 6`; the table is generated at compile time instead of
// being spelled out by hand.
static OFFSET6_MASKS: [u8; 48] = {
    let mut table = [0u8; 48];
    let mut pos = 0;
    while pos < table.len() {
        // `pos % 6` is always < 6, so the narrowing cast cannot truncate.
        table[pos] = (pos % 6) as u8;
        pos += 1;
    }
    table
};

// LCM(10, 16) = 80. 5 vectors.
static OFFSET10_MASKS: [u8; 80] = [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1,
Expand Down Expand Up @@ -989,12 +995,67 @@ pub unsafe fn decompress_bmi2(
}
}

while copied < length {
*dest_ptr.add(copied) = *src_ptr.add(copied);
copied += 1;
}
} else if offset == 6 {
let dest_ptr = out_next;
let src_ptr = src;
let mut copied = 0;

if length >= 16 {
let v_src = _mm_loadu_si128(src_ptr as *const __m128i);
let masks_ptr = OFFSET6_MASKS.as_ptr() as *const __m128i;
let v_base =
_mm_shuffle_epi8(v_src, _mm_loadu_si128(masks_ptr));

while copied + 48 <= length {
_mm_storeu_si128(
dest_ptr.add(copied) as *mut __m128i,
v_base,
);
_mm_storeu_si128(
dest_ptr.add(copied + 16) as *mut __m128i,
_mm_shuffle_epi8(
v_src,
_mm_loadu_si128(masks_ptr.add(1)),
),
);
_mm_storeu_si128(
dest_ptr.add(copied + 32) as *mut __m128i,
_mm_shuffle_epi8(
v_src,
_mm_loadu_si128(masks_ptr.add(2)),
),
);
copied += 48;
}

while copied + 16 <= length {
let idx = (copied % 48) / 16;
let v = if idx == 0 {
v_base
} else {
_mm_shuffle_epi8(
v_src,
_mm_loadu_si128(masks_ptr.add(idx)),
)
};
_mm_storeu_si128(
dest_ptr.add(copied) as *mut __m128i,
v,
);
copied += 16;
}
}

while copied < length {
*dest_ptr.add(copied) = *src_ptr.add(copied);
copied += 1;
}
} else {
// Simple loop for offsets 5, 6, 7
// Simple loop for offsets 5, 7
let mut copied = 0;
while copied < length {
*out_next.add(copied) = *src.add(copied);
Expand Down