Skip to content

Commit 0a6dbb3

Browse files
committed
fix: use saturating add/sub in i16 DCT column pass to prevent overflow
When overshoot deringing pushes inputs to ±158, the column-pass butterfly accumulates values that can exceed i16::MAX (e.g., 8 × 5056 = 40,448). Wrapping add caused catastrophic sign flips in decoded pixels. All butterfly adds/subs in dodct() now use _mm256_adds_epi16 / _mm256_subs_epi16 (saturating). The column-pass descale rounding constant add is also saturating to prevent 32767 + 2 wrapping to -32767. Row pass uses wrapping _mm256_slli_epi16 (values naturally in safe range). Worst-case saturation clamps coefficient [0,1] from 8294 to 8191 (1.2%), which is at most 1 quantization step on extreme deringing patterns. Verified: all 6 synthetic overflow patterns pass (max_diff ≤ 2 vs i32 ref). See mozilla/mozjpeg#453.
1 parent a0bf852 commit 0a6dbb3

1 file changed

Lines changed: 18 additions & 14 deletions

File tree

src/dct.rs

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1321,12 +1321,12 @@ pub mod avx2_archmage {
13211321
/// Each assembly instruction is translated to the corresponding Rust intrinsic.
13221322
/// This provides identical output to the reference C/asm implementation.
13231323
///
1324-
/// **WARNING:** Uses `_mm256_add_epi16` (wrapping) in the column-pass even-part
1325-
/// butterfly. When overshoot deringing pushes inputs to ±158, the final sum
1326-
/// `8 × 5056 = 40,448` overflows i16, causing catastrophic sign flips.
1327-
/// Fix: replace with `_mm256_adds_epi16` (saturating) for those 2 operations.
1324+
/// The column-pass even-part butterfly uses `_mm256_adds_epi16` (saturating)
1325+
/// instead of wrapping add to prevent i16 overflow when overshoot deringing
1326+
/// pushes inputs to ±158 (`8 × 5056 = 40,448 > i16::MAX`).
1327+
/// Row pass uses wrapping add (no overflow risk, no perf penalty).
13281328
/// See [mozilla/mozjpeg#453](https://github.com/mozilla/mozjpeg/pull/453).
1329-
/// Test patterns: `imazen/codec-corpus` at `imageflow/test_inputs/dct_overflow_patterns/`.
1329+
/// Test: `cargo run --release --example synth_dct_overflow`.
13301330
///
13311331
/// The token proves AVX2 is available. Memory operations use safe archmage wrappers.
13321332
#[arcane]
@@ -1439,30 +1439,34 @@ pub mod avx2_archmage {
14391439
pass1: bool,
14401440
) -> (__m256i, __m256i, __m256i, __m256i) {
14411441
// Step 1: Compute butterfly differences/sums
1442-
let ymm4 = _mm256_sub_epi16(ymm0, ymm3); // tmp6_7
1443-
let ymm5 = _mm256_add_epi16(ymm0, ymm3); // tmp1_0
1444-
let ymm6 = _mm256_add_epi16(ymm1, ymm2); // tmp3_2
1445-
let ymm7 = _mm256_sub_epi16(ymm1, ymm2); // tmp4_5
1442+
// Use saturating add/sub to prevent i16 overflow when deringing
1443+
// pushes inputs beyond normal range. See PR #453.
1444+
let ymm4 = _mm256_subs_epi16(ymm0, ymm3); // tmp6_7
1445+
let ymm5 = _mm256_adds_epi16(ymm0, ymm3); // tmp1_0
1446+
let ymm6 = _mm256_adds_epi16(ymm1, ymm2); // tmp3_2
1447+
let ymm7 = _mm256_subs_epi16(ymm1, ymm2); // tmp4_5
14461448

14471449
// ---- Even part ----
14481450

14491451
// Swap halves to get tmp0_1
14501452
let ymm5 = _mm256_permute2x128_si256(ymm5, ymm5, 0x01);
1451-
let ymm0 = _mm256_add_epi16(ymm5, ymm6); // tmp10_11
1452-
let ymm5 = _mm256_sub_epi16(ymm5, ymm6); // tmp13_12
1453+
let ymm0 = _mm256_adds_epi16(ymm5, ymm6); // tmp10_11
1454+
let ymm5 = _mm256_subs_epi16(ymm5, ymm6); // tmp13_12
14531455

14541456
let ymm6 = _mm256_permute2x128_si256(ymm0, ymm0, 0x01); // tmp11_10
14551457

14561458
// PW_1_NEG1: high lane = 1, low lane = -1
14571459
let pw_1_neg1 = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1);
14581460
let ymm0 = _mm256_sign_epi16(ymm0, pw_1_neg1); // tmp10_neg11
1459-
let ymm6 = _mm256_add_epi16(ymm6, ymm0); // (tmp10+tmp11)_(tmp10-tmp11)
1461+
let ymm6 = _mm256_adds_epi16(ymm6, ymm0); // (tmp10+tmp11)_(tmp10-tmp11)
14601462

14611463
let out0_4 = if pass1 {
14621464
_mm256_slli_epi16(ymm6, PASS1_BITS as i32)
14631465
} else {
1466+
// Saturating add for rounding constant: butterfly output near i16::MAX
1467+
// would wrap when adding descale bias (e.g., 32767 + 2 = -32767 wrapped).
14641468
let pw_descale = _mm256_set1_epi16(1 << (PASS1_BITS - 1));
1465-
_mm256_srai_epi16(_mm256_add_epi16(ymm6, pw_descale), PASS1_BITS as i32)
1469+
_mm256_srai_epi16(_mm256_adds_epi16(ymm6, pw_descale), PASS1_BITS as i32)
14661470
};
14671471

14681472
// data2_6 calculation
@@ -1517,7 +1521,7 @@ pub mod avx2_archmage {
15171521

15181522
// ---- Odd part ----
15191523

1520-
let ymm6 = _mm256_add_epi16(ymm7, ymm4); // z3_4
1524+
let ymm6 = _mm256_adds_epi16(ymm7, ymm4); // z3_4
15211525

15221526
// z3/z4 calculation
15231527
let ymm1 = _mm256_permute2x128_si256(ymm6, ymm6, 0x01); // z4_3

0 commit comments

Comments
 (0)