Commit 5e02cad

fix(adler32): Prevent s1 accumulation integer overflow panics in x86 SIMD blocks (#419)
* fix(adler32): prevent `u32` overflow panics during intermediate `s1` calculations in x86 SIMD blocks by upcasting to `u64`

  In `adler32_x86_sse2`, `adler32_x86_avx2`, `adler32_x86_avx2_vnni`, and `adler32_x86_avx512_vnni`, several assignments aggregate values into the `s1` (`u32`) accumulator. On large volumes of dense input (e.g. 100,000 bytes of 0xFF), the sums extracted from the SIMD registers, added to the running `s1` total, could exceed the 32-bit boundary, triggering Rust's overflow panics in debug builds and producing incorrect checksums in release builds. These assignments now cast their components to `u64`, perform the addition, and cast the result back to `u32`; the `s2` logic already followed this pattern. The `_mm_cvtsi128_si32` extraction casts were also updated to zero-extend the inherently signed values to unsigned 64-bit before addition, via `as u32 as u64`.

* fix(adler32): use safe upcasting instead of premature modulo for intermediate `s1` values

  In the x86 `adler32` loops, `s1` accumulates several intermediate values that can exceed `u32::MAX`, tripping Rust's overflow checks in debug builds (and silently wrapping to incorrect results in release builds, depending on input size). My previous fix reduced these intermediate sums with `% DIVISOR`. However, the `s2` chunk accumulation relies on the unreduced `s1` value throughout each pass over an unaligned segment or SIMD block, so reducing `s1` early broke the `s2` totals on some targets (such as MSVC) once the pointer-unaligning paths ran. This commit instead suppresses the overflow with a widening cast, `s1 = (s1 as u64 + val as u64) as u32`. Since `s1`'s overall value is mathematically guaranteed to stay within `u64` range inside these bounded blocks, the cast prevents the arithmetic bug while preserving the raw sums the `s2` logic needs before the final per-loop `% DIVISOR` is applied.

* Delete patch.rs

* Delete patch.diff

* fix(adler32): remove accidental `% DIVISOR` on `s1` in `adler32_x86_avx2`

  A previous commit left a stray `% DIVISOR` inside the multi-line chunk-accumulation statement in `adler32_x86_avx2` (the `s1_buf` addition). Because `s1` is folded into `s2` repeatedly across large segments, reducing `s1` prematurely breaks the Adler-32 summation invariant across unaligned chunks; this was caught by CI on Windows/MSVC paths. The fix removes the leftover modulo and uses the safe 64-bit widening cast, mirroring the rest of the file.
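
The core pattern of the fix, as a minimal standalone sketch (the names `s1` and `DIVISOR` match the diff; the free function and the values are illustrative, not the library's actual SIMD loop):

    fn main() {
        // Hypothetical state: s1 sits near the 32-bit boundary after many
        // dense chunks; chunk_sum is the byte sum of one 16-byte block of 0xFF.
        let s1: u32 = u32::MAX - 1_000;
        let chunk_sum: u32 = 16 * 0xFF;

        // Old form -- `s1 += chunk_sum;` panics here in a debug build,
        // because the true sum exceeds u32::MAX.

        // New form from the commit: widen to u64, add, narrow back to u32.
        // The addition itself can no longer trip Rust's overflow check, and
        // the per-loop `% DIVISOR` reduction still happens afterwards.
        let s1 = (s1 as u64 + chunk_sum as u64) as u32;
        println!("widened add completed: s1 = {s1}");
    }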
1 parent 8cfe415 · commit 5e02cad

5 files changed: 68 additions & 40 deletions

benches/bench_arm_adler32.rs

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, black_box, criterion_group, criterion_main};
 
 #[cfg(target_arch = "aarch64")]
 use libdeflate::adler32::adler32;

src/adler32/x86.rs

Lines changed: 15 additions & 14 deletions
@@ -245,9 +245,9 @@ pub unsafe fn adler32_x86_sse2(adler: u32, p: &[u8]) -> u32 {
         _mm_storeu_si128(s1_buf.as_mut_ptr() as *mut __m128i, v_s1);
         _mm_storeu_si128(s2_buf.as_mut_ptr() as *mut __m128i, v_s2);
 
-        s1 += s1_buf[0] + s1_buf[2];
+        s1 = (s1 as u64 + s1_buf[0] as u64 + s1_buf[2] as u64) as u32;
         let s2_sum = s2_buf[0] as u64 + s2_buf[1] as u64 + s2_buf[2] as u64 + s2_buf[3] as u64;
-        s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
+        s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
 
         s1 %= DIVISOR;
         s2 %= DIVISOR;

@@ -258,7 +258,7 @@ s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
         let sad = _mm_sad_epu8(d, v_zero);
         let sum_s1 = _mm_cvtsi128_si32(_mm_add_epi32(sad, _mm_srli_si128(sad, 8))) as u32;
         s2 = ((s2 as u64 + s1 as u64 * 16) % DIVISOR as u64) as u32;
-        s1 += sum_s1 as u32;
+        s1 = (s1 as u64 + sum_s1 as u32 as u64) as u32;
 
         let d_lo = _mm_unpacklo_epi8(d, v_zero);
         let d_hi = _mm_unpackhi_epi8(d, v_zero);

@@ -540,9 +540,10 @@ pub unsafe fn adler32_x86_avx2(adler: u32, p: &[u8]) -> u32 {
         _mm_storeu_si128(s1_buf.as_mut_ptr() as *mut __m128i, v_s1_128);
         _mm_storeu_si128(s2_buf.as_mut_ptr() as *mut __m128i, v_s2_128);
 
-        s1 += s1_buf[0] + s1_buf[1] + s1_buf[2] + s1_buf[3];
+        s1 = (s1 as u64 + s1_buf[0] as u64 + s1_buf[1] as u64 + s1_buf[2] as u64 + s1_buf[3] as u64)
+            as u32;
         let s2_sum = s2_buf[0] as u64 + s2_buf[1] as u64 + s2_buf[2] as u64 + s2_buf[3] as u64;
-        s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
+        s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
 
         s1 %= DIVISOR;
         s2 %= DIVISOR;

@@ -556,7 +557,7 @@ s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
         let s1_part = _mm_cvtsi128_si32(_mm_add_epi32(sad, _mm_unpackhi_epi64(sad, sad))) as u32;
 
         s2 = ((s2 as u64 + s1 as u64 * 16) % DIVISOR as u64) as u32;
-        s1 += s1_part;
+        s1 = (s1 as u64 + s1_part as u64) as u32;
 
         let w_16 = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
         let p = _mm_maddubs_epi16(d, w_16);

@@ -565,7 +566,7 @@ s2 = ((s2 as u64 + s2_sum) % DIVISOR as u64) as u32;
         let s_sum = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4E));
         let s_sum = _mm_add_epi32(s_sum, _mm_shuffle_epi32(s_sum, 0xB1));
 
-        s2 = ((s2 as u64 + (_mm_cvtsi128_si32(s_sum) as u32) as u64) % DIVISOR as u64) as u32;
+        s2 = ((s2 as u64 + _mm_cvtsi128_si32(s_sum) as u32 as u64) % DIVISOR as u64) as u32;
 
         ptr = ptr.add(16);
         len -= 16;

@@ -813,8 +814,8 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
         let v_s2_sum = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
         let v_s2_sum = _mm_add_epi32(v_s2_sum, _mm_shuffle_epi32(v_s2_sum, 0x02));
 
-        s1 += _mm_cvtsi128_si32(v_s1_sum) as u32;
-        s2 = ((s2 as u64 + (_mm_cvtsi128_si32(v_s2_sum) as u32) as u64) % DIVISOR as u64) as u32;
+        s1 = (s1 as u64 + _mm_cvtsi128_si32(v_s1_sum) as u32 as u64) as u32;
+        s2 = ((s2 as u64 + _mm_cvtsi128_si32(v_s2_sum) as u32 as u64) % DIVISOR as u64) as u32;
 
         s1 %= DIVISOR;
         s2 %= DIVISOR;

@@ -828,7 +829,7 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
         let s1_part = _mm_cvtsi128_si32(_mm_add_epi32(sad, _mm_unpackhi_epi64(sad, sad))) as u32;
 
         s2 = ((s2 as u64 + s1 as u64 * 16) % DIVISOR as u64) as u32;
-        s1 += s1_part;
+        s1 = (s1 as u64 + s1_part as u64) as u32;
 
         let w_16 = _mm_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
         let p = _mm_maddubs_epi16(d, w_16);

@@ -837,7 +838,7 @@ pub unsafe fn adler32_x86_avx2_vnni(adler: u32, p: &[u8]) -> u32 {
         let s_sum = _mm_add_epi32(s, _mm_shuffle_epi32(s, 0x4E));
         let s_sum = _mm_add_epi32(s_sum, _mm_shuffle_epi32(s_sum, 0xB1));
 
-        s2 = ((s2 as u64 + (_mm_cvtsi128_si32(s_sum) as u32) as u64) % DIVISOR as u64) as u32;
+        s2 = ((s2 as u64 + _mm_cvtsi128_si32(s_sum) as u32 as u64) % DIVISOR as u64) as u32;
 
         let processed = 16;
         data = &data[processed..];

@@ -1069,8 +1070,8 @@ pub unsafe fn adler32_x86_avx512_vnni(adler: u32, p: &[u8]) -> u32 {
         let v_s2_sum = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
         let v_s2_sum = _mm_add_epi32(v_s2_sum, _mm_shuffle_epi32(v_s2_sum, 0x02));
 
-        s1 += _mm_cvtsi128_si32(v_s1_sum) as u32;
-        s2 = ((s2 as u64 + (_mm_cvtsi128_si32(v_s2_sum) as u32) as u64) % DIVISOR as u64) as u32;
+        s1 = (s1 as u64 + _mm_cvtsi128_si32(v_s1_sum) as u32 as u64) as u32;
+        s2 = ((s2 as u64 + _mm_cvtsi128_si32(v_s2_sum) as u32 as u64) % DIVISOR as u64) as u32;
 
         s1 %= DIVISOR;
         s2 %= DIVISOR;

@@ -1091,7 +1092,7 @@ pub unsafe fn adler32_x86_avx512_vnni(adler: u32, p: &[u8]) -> u32 {
         let sum_p = hsum_epi32_avx256(p);
 
         s2 = ((s2 as u64 + s1 as u64 * 32 + sum_p as u64) % DIVISOR as u64) as u32;
-        s1 += sum_u;
+        s1 = (s1 as u64 + sum_u as u64) as u32;
 
         data = &data[32..];
     }
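
A regression test in this shape exercises the failure the commit message describes (a sketch only: `libdeflate::adler32::adler32` is the dispatching entry point imported by the benchmark above, assumed to share the `(adler, bytes) -> u32` shape of the SIMD kernels; the scalar oracle below is the textbook Adler-32 definition, not code from this repository):

    const DIVISOR: u32 = 65521;

    /// Textbook scalar Adler-32, used as a reference oracle.
    fn adler32_reference(adler: u32, data: &[u8]) -> u32 {
        let (mut s1, mut s2) = (adler & 0xFFFF, adler >> 16);
        for &b in data {
            s1 = (s1 + b as u32) % DIVISOR;
            s2 = (s2 + s1) % DIVISOR;
        }
        (s2 << 16) | s1
    }

    #[test]
    fn dense_input_does_not_overflow() {
        // 100,000 bytes of 0xFF: the input the commit message cites as pushing
        // the s1 accumulator past the 32-bit boundary in the SIMD kernels.
        let data = vec![0xFFu8; 100_000];
        let expected = adler32_reference(1, &data);
        // Before this fix, the call below panicked in debug builds; in release
        // builds it could return a checksum disagreeing with the oracle.
        assert_eq!(libdeflate::adler32::adler32(1, &data), expected);
    }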

src/compress/mod.rs

Lines changed: 14 additions & 10 deletions
@@ -1398,7 +1398,11 @@ impl Compressor {
 
         while remain > 0 {
             let block_len = std::cmp::min(remain, 65535);
-            let bfinal = if is_final && block_len == remain { 1 } else { 0 };
+            let bfinal = if is_final && block_len == remain {
+                1
+            } else {
+                0
+            };
 
             if !bs.write_bits(bfinal, 1) || !bs.write_bits(0, 2) {
                 return false;

@@ -1498,7 +1502,8 @@ impl Compressor {
         );
         self.update_huffman_tables();
 
-        let dynamic_cost = self.calculate_dynamic_header_size() + self.calculate_block_data_size() + 3; // +3 for block header
+        let dynamic_cost =
+            self.calculate_dynamic_header_size() + self.calculate_block_data_size() + 3; // +3 for block header
 
         // To be safe against exact alignment overhead for uncompressed block, we allow max 7 bits padding per 65535 block bytes.
         let uncompressed_cost = (processed * 8) + (processed / 65535 + 1) * 40 + 7;

@@ -1606,7 +1611,8 @@ impl Compressor {
             }
             let len_slot = self.get_length_slot(length_to_slot);
             let off_slot = self.get_offset_slot(seq.offset as usize);
-            static_bits += if len_slot < 24 { 7 } else { 8 } + LENGTH_EXTRA_BITS_TABLE[len_slot] as usize;
+            static_bits +=
+                if len_slot < 24 { 7 } else { 8 } + LENGTH_EXTRA_BITS_TABLE[len_slot] as usize;
             static_bits += 5 + OFFSET_EXTRA_BITS_TABLE[off_slot] as usize;
             curr_in += actual_len;
         }

@@ -1623,14 +1629,16 @@ impl Compressor {
         if !bs.write_bits(if is_final { 1 } else { 0 }, 1) {
             return 0;
         }
-        if !bs.write_bits(1, 2) { // static block
+        if !bs.write_bits(1, 2) {
+            // static block
             return 0;
         }
 
         if !self.write_sequences_to_bitstream(bs, input, start_pos) {
             return 0;
         }
-        if !self.write_sym(bs, 256) { // EOF
+        if !self.write_sym(bs, 256) {
+            // EOF
             return 0;
         }
     }

@@ -2277,11 +2285,7 @@ impl Compressor {
 pub fn deflate_compress_bound(size: usize) -> usize {
     let max_blocks = size.saturating_add(8191) / 8192;
     let bound = size.saturating_add(14usize.saturating_mul(max_blocks));
-    if bound < 30 {
-        30
-    } else {
-        bound
-    }
+    if bound < 30 { 30 } else { bound }
 }
 
 pub fn zlib_compress_bound(size: usize) -> usize {
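
As reformatted, `deflate_compress_bound` doubles as a compact worked example of the bound formula; the assertions below (with sizes chosen here for illustration) follow directly from the code in the hunk above:

    // Mirror of the function shown above: allow 14 bytes of overhead per
    // (up to) 8192-byte block, with a floor of 30 bytes.
    fn deflate_compress_bound(size: usize) -> usize {
        let max_blocks = size.saturating_add(8191) / 8192;
        let bound = size.saturating_add(14usize.saturating_mul(max_blocks));
        if bound < 30 { 30 } else { bound }
    }

    fn main() {
        assert_eq!(deflate_compress_bound(0), 30);      // bound = 0, floor applies
        assert_eq!(deflate_compress_bound(100), 114);   // 100 + 14 * 1 block
        assert_eq!(deflate_compress_bound(8192), 8206); // 8192 + 14 * 1 block
        assert_eq!(deflate_compress_bound(8193), 8221); // 8193 + 14 * 2 blocks
    }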

src/decompress/x86.rs

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ use crate::decompress::tables::{
     OFFSET_TABLEBITS,
 };
 use crate::decompress::{
-    DecompressResult, Decompressor,
-    DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, DEFLATE_BLOCKTYPE_UNCOMPRESSED,
+    DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, DEFLATE_BLOCKTYPE_STATIC_HUFFMAN,
+    DEFLATE_BLOCKTYPE_UNCOMPRESSED, DecompressResult, Decompressor,
 };
 
 #[cfg(target_arch = "x86_64")]

tests/interop_test.rs

Lines changed: 36 additions & 13 deletions
@@ -36,31 +36,40 @@ fn test_deflate_interop() {
     for &pattern in &patterns {
         for level in 1..=9 {
             let mut our_compressor = libdeflate::api::Compressor::new(level).unwrap();
-            let mut their_compressor = libdeflater::Compressor::new(libdeflater::CompressionLvl::new(level).unwrap());
+            let mut their_compressor =
+                libdeflater::Compressor::new(libdeflater::CompressionLvl::new(level).unwrap());
             let mut their_decompressor = libdeflater::Decompressor::new();
 
             let data = generate_test_data(size, pattern);
 
             // 1. our compress -> their decompress
             let bound = our_compressor.deflate_compress_bound(data.len());
             let mut comp1 = vec![0u8; bound];
-            let comp1_sz = our_compressor.compress_deflate_into(&data, &mut comp1).unwrap();
+            let comp1_sz = our_compressor
+                .compress_deflate_into(&data, &mut comp1)
+                .unwrap();
             comp1.truncate(comp1_sz);
 
             let mut decomp1 = vec![0u8; data.len()];
             if size > 0 {
-                let decomp_sz = their_decompressor.deflate_decompress(&comp1, &mut decomp1).unwrap();
+                let decomp_sz = their_decompressor
+                    .deflate_decompress(&comp1, &mut decomp1)
+                    .unwrap();
                 assert_eq!(decomp_sz, data.len());
                 assert_eq!(decomp1, data);
             }
 
             // 2. their compress -> our decompress
             let their_bound = their_compressor.deflate_compress_bound(data.len());
             let mut comp2 = vec![0u8; their_bound];
-            let comp2_sz = their_compressor.deflate_compress(&data, &mut comp2).unwrap();
+            let comp2_sz = their_compressor
+                .deflate_compress(&data, &mut comp2)
+                .unwrap();
             comp2.truncate(comp2_sz);
 
-            let decomp2 = our_decompressor.decompress_deflate(&comp2, data.len()).unwrap();
+            let decomp2 = our_decompressor
+                .decompress_deflate(&comp2, data.len())
+                .unwrap();
             assert_eq!(decomp2, data);
         }
     }

@@ -78,20 +87,25 @@ fn test_zlib_interop() {
     for &pattern in &patterns {
         for level in 1..=9 {
             let mut our_compressor = libdeflate::api::Compressor::new(level).unwrap();
-            let mut their_compressor = libdeflater::Compressor::new(libdeflater::CompressionLvl::new(level).unwrap());
+            let mut their_compressor =
+                libdeflater::Compressor::new(libdeflater::CompressionLvl::new(level).unwrap());
             let mut their_decompressor = libdeflater::Decompressor::new();
 
             let data = generate_test_data(size, pattern);
 
             // 1. our compress -> their decompress
             let bound = our_compressor.zlib_compress_bound(data.len());
             let mut comp1 = vec![0u8; bound];
-            let comp1_sz = our_compressor.compress_zlib_into(&data, &mut comp1).unwrap();
+            let comp1_sz = our_compressor
+                .compress_zlib_into(&data, &mut comp1)
+                .unwrap();
             comp1.truncate(comp1_sz);
 
             let mut decomp1 = vec![0u8; data.len()];
             if size > 0 {
-                let decomp_sz = their_decompressor.zlib_decompress(&comp1, &mut decomp1).unwrap();
+                let decomp_sz = their_decompressor
+                    .zlib_decompress(&comp1, &mut decomp1)
+                    .unwrap();
                 assert_eq!(decomp_sz, data.len());
                 assert_eq!(decomp1, data);
             }

@@ -102,7 +116,9 @@ fn test_zlib_interop() {
             let comp2_sz = their_compressor.zlib_compress(&data, &mut comp2).unwrap();
             comp2.truncate(comp2_sz);
 
-            let decomp2 = our_decompressor.decompress_zlib(&comp2, data.len()).unwrap();
+            let decomp2 = our_decompressor
+                .decompress_zlib(&comp2, data.len())
+                .unwrap();
             assert_eq!(decomp2, data);
         }
     }

@@ -120,20 +136,25 @@ fn test_gzip_interop() {
     for &pattern in &patterns {
         for level in 1..=9 {
            let mut our_compressor = libdeflate::api::Compressor::new(level).unwrap();
-            let mut their_compressor = libdeflater::Compressor::new(libdeflater::CompressionLvl::new(level).unwrap());
+            let mut their_compressor =
+                libdeflater::Compressor::new(libdeflater::CompressionLvl::new(level).unwrap());
             let mut their_decompressor = libdeflater::Decompressor::new();
 
             let data = generate_test_data(size, pattern);
 
             // 1. our compress -> their decompress
             let bound = our_compressor.gzip_compress_bound(data.len());
             let mut comp1 = vec![0u8; bound];
-            let comp1_sz = our_compressor.compress_gzip_into(&data, &mut comp1).unwrap();
+            let comp1_sz = our_compressor
+                .compress_gzip_into(&data, &mut comp1)
+                .unwrap();
             comp1.truncate(comp1_sz);
 
             let mut decomp1 = vec![0u8; data.len()];
             if size > 0 {
-                let decomp_sz = their_decompressor.gzip_decompress(&comp1, &mut decomp1).unwrap();
+                let decomp_sz = their_decompressor
+                    .gzip_decompress(&comp1, &mut decomp1)
+                    .unwrap();
                 assert_eq!(decomp_sz, data.len());
                 assert_eq!(decomp1, data);
             }

@@ -144,7 +165,9 @@ fn test_gzip_interop() {
             let comp2_sz = their_compressor.gzip_compress(&data, &mut comp2).unwrap();
             comp2.truncate(comp2_sz);
 
-            let decomp2 = our_decompressor.decompress_gzip(&comp2, data.len()).unwrap();
+            let decomp2 = our_decompressor
+                .decompress_gzip(&comp2, data.len())
+                .unwrap();
             assert_eq!(decomp2, data);
         }
     }
