diff --git a/CMakeLists.txt b/CMakeLists.txt index a2a09da26..0d243cb52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1756,6 +1756,21 @@ foreach(libtype ${TEST_LIBTYPES}) set_tests_properties(example-${sample_bits}bit-${libtype}-decompress-cmp PROPERTIES DEPENDS example-${sample_bits}bit-${libtype}-decompress) + # Regression test for issue #444: overshoot deringing + SIMD DCT overflow. + # An 8x8 half-black/half-white image triggers maximum overshoot, which + # overflows the 16-bit column pass butterfly in the SIMD forward DCT. + # Without the fix, the block's brightness is completely inverted. + # NOTE: no -revert flag, so mozjpeg extensions (deringing) are active. + if(sample_bits EQUAL 8) + add_test(NAME ${cjpeg}-${libtype}-issue444 + COMMAND cjpeg${suffix} -dct int -quality 25 -sample 1x1 + -outfile ${testout}_issue444.jpg ${TESTIMAGES}/issue444.ppm) + add_bittest(${djpeg} issue444 "-dct;int;-ppm" + ${testout}_issue444.ppm ${testout}_issue444.jpg + ca2f28ac51471eba0e0d30bc7492901c + ${cjpeg}-${libtype}-issue444) + endif() + endforeach() endforeach() diff --git a/jcdctmgr.c b/jcdctmgr.c index 32bd25745..35fe3a1ca 100644 --- a/jcdctmgr.c +++ b/jcdctmgr.c @@ -440,7 +440,9 @@ preprocess_deringing(DCTELEM *data, const JQUANT_TBL *quantization_table) return; } - /* Too much overshoot is not good: increased amplitude will cost bits, and the cost is proportional to quantization (here using DC quant as a rough guide). */ + /* Too much overshoot is not good: increased amplitude will cost bits, and + the cost is proportional to quantization (here using DC quant as a rough + guide). */ maxovershoot = maxsample + MIN(MIN(31, 2*quantization_table->quantval[0]), (maxsample * size - sum) / maxsample_count); n = 0; diff --git a/simd/arm/jfdctint-neon.c b/simd/arm/jfdctint-neon.c index ccfc07b15..6f0af1bee 100644 --- a/simd/arm/jfdctint-neon.c +++ b/simd/arm/jfdctint-neon.c @@ -274,8 +274,12 @@ void jsimd_fdct_islow_neon(DCTELEM *data) tmp11 = vaddq_s16(tmp1, tmp2); tmp12 = vsubq_s16(tmp1, tmp2); - row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS); - row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS); + /* Use saturating arithmetic for the final butterfly to prevent 16-bit + * overflow when overshoot deringing produces large uniform column values. + * See https://github.com/mozilla/mozjpeg/issues/444 + */ + row0 = vrshrq_n_s16(vqaddq_s16(tmp10, tmp11), PASS1_BITS); + row4 = vrshrq_n_s16(vqsubq_s16(tmp10, tmp11), PASS1_BITS); tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13); z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2); diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm index 05ea86548..6d8da5b4c 100644 --- a/simd/i386/jfdctint-avx2.asm +++ b/simd/i386/jfdctint-avx2.asm @@ -118,7 +118,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10 vpsignw %1, %1, [GOTOFF(ebx, PW_1_NEG1)] ; %1=tmp10_neg11 - vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) + vpaddsw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) %if %9 == 1 vpsllw %1, %7, PASS1_BITS ; %1=data0_4 %else diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm index 7d4c61cd7..6e1c3febe 100644 --- a/simd/i386/jfdctint-mmx.asm +++ b/simd/i386/jfdctint-mmx.asm @@ -438,8 +438,8 @@ EXTN(jsimd_fdct_islow_mmx): psubw mm6, mm4 ; mm6=tmp12 movq mm7, mm5 - paddw mm5, mm0 ; mm5=tmp10+tmp11 - psubw mm7, mm0 ; mm7=tmp10-tmp11 + paddsw mm5, mm0 ; mm5=tmp10+tmp11 + psubsw mm7, mm0 ; mm7=tmp10-tmp11 paddw mm5, [GOTOFF(ebx,PW_DESCALE_P2X)] paddw mm7, [GOTOFF(ebx,PW_DESCALE_P2X)] diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm index 7ed5c9501..a5c7f17e1 100644 --- a/simd/i386/jfdctint-sse2.asm +++ b/simd/i386/jfdctint-sse2.asm @@ -457,8 +457,8 @@ EXTN(jsimd_fdct_islow_sse2): psubw xmm6, xmm4 ; xmm6=tmp12 movdqa xmm5, xmm7 - paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + paddsw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubsw xmm5, xmm2 ; xmm5=tmp10-tmp11 paddw xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)] paddw xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)] diff --git a/simd/mips64/jfdctint-mmi.c b/simd/mips64/jfdctint-mmi.c index 7f4dfe912..fbbaccccb 100644 --- a/simd/mips64/jfdctint-mmi.c +++ b/simd/mips64/jfdctint-mmi.c @@ -356,8 +356,11 @@ static uint64_t const_value[] = { tmp11 = _mm_add_pi16(tmp1, tmp2); /* tmp11=tmp1+tmp2 */ \ tmp12 = _mm_sub_pi16(tmp1, tmp2); /* tmp12=tmp1-tmp2 */ \ \ - out0 = _mm_add_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ - out4 = _mm_sub_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ + /* Use saturating add/sub to prevent 16-bit overflow when overshoot \ + * deringing produces large uniform column values (issue #444). \ + */ \ + out0 = _mm_adds_pi16(tmp10, tmp11); /* out0=tmp10+tmp11 */ \ + out4 = _mm_subs_pi16(tmp10, tmp11); /* out4=tmp10-tmp11 */ \ \ out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \ out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \ diff --git a/simd/powerpc/jfdctint-altivec.c b/simd/powerpc/jfdctint-altivec.c index 3d4f01710..bea87aca5 100644 --- a/simd/powerpc/jfdctint-altivec.c +++ b/simd/powerpc/jfdctint-altivec.c @@ -168,10 +168,13 @@ tmp11 = vec_add(tmp1, tmp2); \ tmp12 = vec_sub(tmp1, tmp2); \ \ - out0 = vec_add(tmp10, tmp11); \ + /* Use saturating add/sub to prevent 16-bit overflow when overshoot \ + * deringing produces large uniform column values (issue #444). \ + */ \ + out0 = vec_adds(tmp10, tmp11); \ out0 = vec_add(out0, pw_descale_p2x); \ out0 = vec_sra(out0, pass1_bits); \ - out4 = vec_sub(tmp10, tmp11); \ + out4 = vec_subs(tmp10, tmp11); \ out4 = vec_add(out4, pw_descale_p2x); \ out4 = vec_sra(out4, pass1_bits); \ \ diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm index 0c4528612..58884c11f 100644 --- a/simd/x86_64/jfdctint-avx2.asm +++ b/simd/x86_64/jfdctint-avx2.asm @@ -118,7 +118,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026) vperm2i128 %7, %1, %1, 0x01 ; %7=tmp11_10 vpsignw %1, %1, [rel PW_1_NEG1] ; %1=tmp10_neg11 - vpaddw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) + vpaddsw %7, %7, %1 ; %7=(tmp10+tmp11)_(tmp10-tmp11) %if %9 == 1 vpsllw %1, %7, PASS1_BITS ; %1=data0_4 %else diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm index 3a6be020c..e934bf2c4 100644 --- a/simd/x86_64/jfdctint-sse2.asm +++ b/simd/x86_64/jfdctint-sse2.asm @@ -449,8 +449,8 @@ EXTN(jsimd_fdct_islow_sse2): psubw xmm6, xmm4 ; xmm6=tmp12 movdqa xmm5, xmm7 - paddw xmm7, xmm2 ; xmm7=tmp10+tmp11 - psubw xmm5, xmm2 ; xmm5=tmp10-tmp11 + paddsw xmm7, xmm2 ; xmm7=tmp10+tmp11 + psubsw xmm5, xmm2 ; xmm5=tmp10-tmp11 paddw xmm7, [rel PW_DESCALE_P2X] paddw xmm5, [rel PW_DESCALE_P2X] diff --git a/testimages/issue444.ppm b/testimages/issue444.ppm new file mode 100644 index 000000000..a7795c392 Binary files /dev/null and b/testimages/issue444.ppm differ