diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2a09da26..0d243cb52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1756,6 +1756,21 @@ foreach(libtype ${TEST_LIBTYPES})
     set_tests_properties(example-${sample_bits}bit-${libtype}-decompress-cmp
       PROPERTIES DEPENDS example-${sample_bits}bit-${libtype}-decompress)
 
+    # Regression test for issue #444: overshoot deringing + SIMD DCT overflow.
+    # An 8x8 half-black/half-white image triggers maximum overshoot, which
+    # overflows the 16-bit column pass butterfly in the SIMD forward DCT.
+    # Without the fix, the block's brightness is completely inverted.
+    # NOTE: no -revert flag, so mozjpeg extensions (deringing) are active.
+    if(sample_bits EQUAL 8)
+      add_test(NAME ${cjpeg}-${libtype}-issue444
+        COMMAND cjpeg${suffix} -dct int -quality 25 -sample 1x1
+          -outfile ${testout}_issue444.jpg ${TESTIMAGES}/issue444.ppm)
+      add_bittest(${djpeg} issue444 "-dct;int;-ppm"
+        ${testout}_issue444.ppm ${testout}_issue444.jpg
+        ca2f28ac51471eba0e0d30bc7492901c
+        ${cjpeg}-${libtype}-issue444)
+    endif()
+
   endforeach()
 
 endforeach()
diff --git a/jcdctmgr.c b/jcdctmgr.c
index 32bd25745..35fe3a1ca 100644
--- a/jcdctmgr.c
+++ b/jcdctmgr.c
@@ -440,7 +440,9 @@ preprocess_deringing(DCTELEM *data, const JQUANT_TBL *quantization_table)
     return;
   }
 
-  /* Too much overshoot is not good: increased amplitude will cost bits, and the cost is proportional to quantization (here using DC quant as a rough guide). */
+  /* Too much overshoot is not good: increased amplitude will cost bits, and
+     the cost is proportional to quantization (here using DC quant as a rough
+     guide). */
   maxovershoot = maxsample + MIN(MIN(31, 2*quantization_table->quantval[0]), (maxsample * size - sum) / maxsample_count);
 
   n = 0;
diff --git a/simd/arm/jfdctint-neon.c b/simd/arm/jfdctint-neon.c
index ccfc07b15..6f0af1bee 100644
--- a/simd/arm/jfdctint-neon.c
+++ b/simd/arm/jfdctint-neon.c
@@ -274,8 +274,12 @@ void jsimd_fdct_islow_neon(DCTELEM *data)
   tmp11 = vaddq_s16(tmp1, tmp2);
   tmp12 = vsubq_s16(tmp1, tmp2);
 
-  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
-  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+  /* Use saturating arithmetic for the final butterfly to prevent 16-bit
+   * overflow when overshoot deringing produces large uniform column values.
+   * See https://github.com/mozilla/mozjpeg/issues/444
+   */
+  row0 = vrshrq_n_s16(vqaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vqsubq_s16(tmp10, tmp11), PASS1_BITS);
 
   tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
   z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
diff --git a/simd/i386/jfdctint-avx2.asm b/simd/i386/jfdctint-avx2.asm
index 05ea86548..6d8da5b4c 100644
--- a/simd/i386/jfdctint-avx2.asm
+++ b/simd/i386/jfdctint-avx2.asm
@@ -118,7 +118,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 
     vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
     vpsignw     %1, %1, [GOTOFF(ebx, PW_1_NEG1)]  ; %1=tmp10_neg11
-    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+    vpaddsw     %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
 %if %9 == 1
     vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
 %else
diff --git a/simd/i386/jfdctint-mmx.asm b/simd/i386/jfdctint-mmx.asm
index 7d4c61cd7..6e1c3febe 100644
--- a/simd/i386/jfdctint-mmx.asm
+++ b/simd/i386/jfdctint-mmx.asm
@@ -438,8 +438,8 @@ EXTN(jsimd_fdct_islow_mmx):
     psubw       mm6, mm4                ; mm6=tmp12
 
     movq        mm7, mm5
-    paddw       mm5, mm0                ; mm5=tmp10+tmp11
-    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+    paddsw      mm5, mm0                ; mm5=tmp10+tmp11
+    psubsw      mm7, mm0                ; mm7=tmp10-tmp11
 
     paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
     paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
diff --git a/simd/i386/jfdctint-sse2.asm b/simd/i386/jfdctint-sse2.asm
index 7ed5c9501..a5c7f17e1 100644
--- a/simd/i386/jfdctint-sse2.asm
+++ b/simd/i386/jfdctint-sse2.asm
@@ -457,8 +457,8 @@ EXTN(jsimd_fdct_islow_sse2):
     psubw       xmm6, xmm4              ; xmm6=tmp12
 
     movdqa      xmm5, xmm7
-    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
-    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+    paddsw      xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubsw      xmm5, xmm2              ; xmm5=tmp10-tmp11
 
     paddw       xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
     paddw       xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
diff --git a/simd/mips64/jfdctint-mmi.c b/simd/mips64/jfdctint-mmi.c
index 7f4dfe912..fbbaccccb 100644
--- a/simd/mips64/jfdctint-mmi.c
+++ b/simd/mips64/jfdctint-mmi.c
@@ -356,8 +356,11 @@ static uint64_t const_value[] = {
   tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
   tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
   \
-  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
-  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  /* Use saturating add/sub to prevent 16-bit overflow when overshoot \
+   * deringing produces large uniform column values (issue #444).     \
+   */ \
+  out0 = _mm_adds_pi16(tmp10, tmp11);         /* out0=tmp10+tmp11 */ \
+  out4 = _mm_subs_pi16(tmp10, tmp11);         /* out4=tmp10-tmp11 */ \
   \
   out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
   out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
diff --git a/simd/powerpc/jfdctint-altivec.c b/simd/powerpc/jfdctint-altivec.c
index 3d4f01710..bea87aca5 100644
--- a/simd/powerpc/jfdctint-altivec.c
+++ b/simd/powerpc/jfdctint-altivec.c
@@ -168,10 +168,13 @@
   tmp11 = vec_add(tmp1, tmp2); \
   tmp12 = vec_sub(tmp1, tmp2); \
   \
-  out0  = vec_add(tmp10, tmp11); \
+  /* Use saturating add/sub to prevent 16-bit overflow when overshoot \
+   * deringing produces large uniform column values (issue #444).     \
+   */ \
+  out0  = vec_adds(tmp10, tmp11); \
   out0  = vec_add(out0, pw_descale_p2x); \
   out0  = vec_sra(out0, pass1_bits); \
-  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_subs(tmp10, tmp11); \
   out4  = vec_add(out4, pw_descale_p2x); \
   out4  = vec_sra(out4, pass1_bits); \
   \
diff --git a/simd/x86_64/jfdctint-avx2.asm b/simd/x86_64/jfdctint-avx2.asm
index 0c4528612..58884c11f 100644
--- a/simd/x86_64/jfdctint-avx2.asm
+++ b/simd/x86_64/jfdctint-avx2.asm
@@ -118,7 +118,7 @@ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
 
     vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
     vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=tmp10_neg11
-    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+    vpaddsw     %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
 %if %9 == 1
     vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
 %else
diff --git a/simd/x86_64/jfdctint-sse2.asm b/simd/x86_64/jfdctint-sse2.asm
index 3a6be020c..e934bf2c4 100644
--- a/simd/x86_64/jfdctint-sse2.asm
+++ b/simd/x86_64/jfdctint-sse2.asm
@@ -449,8 +449,8 @@ EXTN(jsimd_fdct_islow_sse2):
     psubw       xmm6, xmm4              ; xmm6=tmp12
 
     movdqa      xmm5, xmm7
-    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
-    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+    paddsw      xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubsw      xmm5, xmm2              ; xmm5=tmp10-tmp11
 
     paddw       xmm7, [rel PW_DESCALE_P2X]
     paddw       xmm5, [rel PW_DESCALE_P2X]
diff --git a/testimages/issue444.ppm b/testimages/issue444.ppm
new file mode 100644
index 000000000..a7795c392
Binary files /dev/null and b/testimages/issue444.ppm differ