diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0cc2b4ec..b1afaecb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 -----------
 
 
+### Unreleased
+
+#### Bug fixes and improvements
+- Decode TIFF predictor=2 sample-wise so multi-byte integer files written by libtiff/GDAL read back exactly
+
+
 ### Version 0.9.9 - 2026-05-05
 
 #### Bug fixes and improvements
diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py
index 087ebbe9..cd7b06cd 100644
--- a/xrspatial/geotiff/_compression.py
+++ b/xrspatial/geotiff/_compression.py
@@ -333,80 +333,164 @@ def lzw_compress(data: bytes) -> bytes:
 
 
 # -- Horizontal predictor (Numba) --------------------------------------------
+#
+# TIFF predictor=2 (horizontal differencing) operates on whole *samples*
+# (per-pixel-component values), not on individual bytes.  For multi-byte
+# integer samples the difference is computed across full sample values, so
+# carries between the low and high bytes are honoured.  The byte stream that
+# lands on disk is the difference samples re-encoded in the file's byte
+# order (always little-endian for files this writer emits, but readers must
+# also handle big-endian sources).
+#
+# The previous implementation here did byte-wise cumulative sum at a sample
+# stride, which produced the right answer for 8-bit data but silently
+# scrambled multi-byte integers whenever the difference between adjacent
+# samples crossed a byte boundary (i.e. needed a carry).  Both halves were
+# wrong in matching ways, so files this codebase wrote could be re-read,
+# but cross-tool reads of files written by libtiff/GDAL produced incorrect
+# pixel values.  See accuracy sweep finding (2026-05-06).
 
 @ngjit
-def _predictor_decode(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing predictor (TIFF predictor=2).
+def _predictor_decode_u8(data, row_len, height, stride):
+    """Undo horizontal differencing on a byte stream (in-place, mod 256).
 
-    Operates in-place on the flat byte array, performing cumulative sum
-    per row at the sample level.
+    ``row_len`` is the full row length in bytes; ``stride`` is the number
+    of bytes between adjacent same-channel samples (e.g. samples_per_pixel
+    for chunky 8-bit multi-band data).
     """
-    row_bytes = width * bytes_per_sample
     for row in range(height):
-        row_start = row * row_bytes
-        for col in range(bytes_per_sample, row_bytes):
+        row_start = row * row_len
+        for col in range(stride, row_len):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) + np.int32(data[idx - stride])) & 0xFF)
 
 
 @ngjit
-def _predictor_encode(data, width, height, bytes_per_sample):
-    """Apply horizontal differencing predictor (TIFF predictor=2).
-
-    Operates in-place, converting absolute values to differences.
-    Process right-to-left to avoid overwriting values we still need.
-    """
-    row_bytes = width * bytes_per_sample
+def _predictor_encode_u8(data, row_len, height, stride):
+    """Apply horizontal differencing on a byte stream (in-place, mod 256)."""
     for row in range(height):
-        row_start = row * row_bytes
-        for col in range(row_bytes - 1, bytes_per_sample - 1, -1):
+        row_start = row * row_len
+        for col in range(row_len - 1, stride - 1, -1):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) - np.int32(data[idx - stride])) & 0xFF)
+
+
+def _predictor_decode_samples(data, image_width, height, dtype, samples=1):
+    """Sample-wise decode for multi-byte sample dtypes.
+
+    Modifies *data* (a flat uint8 buffer) in place.  ``dtype`` carries the
+    file's byte order so that the buffer is interpreted with the right
+    endianness; the result is written back in the same byte order.
+
+    For chunky multi-band data the cumulative sum runs per-channel so that
+    each channel's value is differenced against the previous same-channel
+    pixel (TIFF Technical Note 1, predictor=2).
+    """
+    if samples == 1:
+        view = data.view(dtype).reshape(height, image_width)
+    else:
+        view = data.view(dtype).reshape(height, image_width, samples)
+    # In-place cumsum in the dtype's modular arithmetic.  The ``out`` cast
+    # keeps the result wrapping inside the sample width for fixed-width
+    # integers.  Float dtypes never reach this branch (the public
+    # ``predictor_decode`` routes them to the byte-wise legacy kernel)
+    # because predictor=2 differencing on floats does not invert exactly.
+    np.cumsum(view, axis=1, dtype=dtype, out=view)
+
+
+def _predictor_encode_samples(data, image_width, height, dtype, samples=1):
+    """Sample-wise encode for multi-byte sample dtypes (in-place)."""
+    if samples == 1:
+        view = data.view(dtype).reshape(height, image_width)
+    else:
+        view = data.view(dtype).reshape(height, image_width, samples)
+    view[:, 1:] = np.diff(view, axis=1)
 
 
 def predictor_decode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int,
+                     dtype: np.dtype | None = None,
+                     samples: int = 1) -> np.ndarray:
     """Undo horizontal differencing predictor (predictor=2).
 
     Parameters
     ----------
     data : np.ndarray
         Flat uint8 array of decompressed pixel data (modified in-place).
-    width, height : int
-        Image dimensions.
+    width : int
+        Image width in pixels.  For chunky multi-band data the number of
+        samples per row is ``width * samples``.
+    height : int
+        Number of rows.
     bytes_per_sample : int
-        Bytes per sample (e.g. 1 for uint8, 4 for float32).
+        Bytes per sample (1, 2, 4, or 8).
+    dtype : np.dtype, optional
+        Sample dtype including the file's byte order.  Required when
+        ``bytes_per_sample > 1`` so the byte stream can be interpreted
+        as whole sample values; per TIFF spec, differencing is
+        sample-level (carries propagate within each sample's bytes).
+    samples : int
+        Samples per pixel for chunky planar config.
 
     Returns
     -------
     np.ndarray
         Same array, modified in-place.
+
+    Notes
+    -----
+    Predictor=2 differences each sample against the previous same-channel
+    sample (TIFF Tech Note 1).  For chunky multi-band data the byte
+    stride between same-channel samples is ``samples * bytes_per_sample``.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_decode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        # Byte-wise math is exact for 8-bit samples; stride = channels.
+        _predictor_decode_u8(buf, width * samples, height, samples)
+    elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
+        _predictor_decode_samples(buf, width, height, np.dtype(dtype),
+                                  samples=samples)
+    elif dtype is not None and np.dtype(dtype).kind == 'f':
+        # Predictor=2 on floats is outside the TIFF spec (which restricts
+        # this predictor to integers).  xrspatial historically allowed it
+        # with byte-wise differencing, which round-trips because the byte
+        # sequence is restored exactly even though the per-sample
+        # arithmetic is meaningless.  Preserve that behaviour here so
+        # files written by older versions still read back unchanged.
+        _predictor_decode_u8(buf, width * samples * bytes_per_sample,
+                             height, samples * bytes_per_sample)
+    else:
+        # Legacy path: byte-wise with caller-supplied stride.  Only correct
+        # for round-trips with a matching byte-wise encoder; do not feed
+        # this path libtiff/GDAL files for multi-byte sample data.
+        _predictor_decode_u8(buf, width * bytes_per_sample, height,
+                             bytes_per_sample)
     return buf
 
 
 def predictor_encode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int,
+                     dtype: np.dtype | None = None,
+                     samples: int = 1) -> np.ndarray:
     """Apply horizontal differencing predictor (predictor=2).
 
-    Parameters
-    ----------
-    data : np.ndarray
-        Flat uint8 array of pixel data (modified in-place).
-    width, height : int
-        Image dimensions.
-    bytes_per_sample : int
-        Bytes per sample.
-
-    Returns
-    -------
-    np.ndarray
-        Same array, modified in-place.
+    Parameters mirror :func:`predictor_decode`.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_encode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        _predictor_encode_u8(buf, width * samples, height, samples)
+    elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
+        _predictor_encode_samples(buf, width, height, np.dtype(dtype),
+                                  samples=samples)
+    elif dtype is not None and np.dtype(dtype).kind == 'f':
+        # See ``predictor_decode`` for why floats stay byte-wise.
+        _predictor_encode_u8(buf, width * samples * bytes_per_sample,
+                             height, samples * bytes_per_sample)
+    else:
+        _predictor_encode_u8(buf, width * bytes_per_sample, height,
+                             bytes_per_sample)
     return buf
 
 
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 67c36829..2dd5c82a 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -629,7 +629,12 @@ def _inflate_tiles_kernel(
 
 @cuda.jit
 def _predictor_decode_kernel(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing (predictor=2), one thread per row."""
+    """Undo horizontal differencing (predictor=2), one thread per row.
+
+    Byte-wise variant retained for 8-bit samples; multi-byte samples must
+    use :func:`_predictor_decode_kernel_samples` so carries between bytes
+    of the same sample propagate correctly.
+    """
     row = cuda.grid(1)
     if row >= height:
         return
@@ -643,6 +648,27 @@ def _predictor_decode_kernel(data, width, height, bytes_per_sample):
             (numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF)
 
 
+@cuda.jit
+def _predictor_decode_kernel_samples(data_view, samples_per_row, height,
+                                     stride):
+    """Sample-wise horizontal-differencing decode, one thread per row.
+
+    *data_view* is a 1-D device array typed as the file's sample dtype
+    (e.g. uint16 / int32 / float32) so that ``+=`` between elements
+    preserves carries between bytes within the same sample.  *stride* is
+    the number of typed samples between adjacent same-channel samples
+    (1 for single-band, samples_per_pixel for chunky multi-band) per
+    TIFF Tech Note 1.  Operates in place.
+    """
+    row = cuda.grid(1)
+    if row >= height:
+        return
+    row_start = row * samples_per_row
+    for col in range(stride, samples_per_row):
+        idx = row_start + col
+        data_view[idx] = data_view[idx] + data_view[idx - stride]
+
+
 @cuda.jit
 def _fp_predictor_decode_kernel(data, tmp, width, height, bps):
     """Undo floating-point predictor (predictor=3), one thread per row.
@@ -1383,11 +1409,21 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples). Matches CPU
-        # call at _reader.py _apply_predictor(..., bytes_per_sample * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
+        # TIFF predictor=2 differences whole samples (per channel).  For
+        # 8-bit samples (and for floats, where the spec disallows
+        # predictor=2 but xrspatial historically wrote byte-wise diffs
+        # that round-trip via the same byte-wise decoder) the byte-wise
+        # kernel runs.  Multi-byte integer samples view the buffer as the
+        # sample dtype so carries propagate within each sample.
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_decode_kernel[bpg, tpb](
+                d_decomp, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_decomp.view(dtype=cupy.dtype(dtype))
+            _predictor_decode_kernel_samples[bpg, tpb](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
     elif predictor == 3:
         total_rows = n_tiles * tile_height
@@ -1629,15 +1665,22 @@ def gpu_decode_tiles(
 
     # Apply predictor on GPU
     if predictor == 2:
-        # Horizontal differencing: one thread per row across all tiles
+        # Horizontal differencing: one thread per row across all tiles.
+        # Multi-byte integer samples use the sample-wise kernel so carries
+        # propagate; 8-bit samples and floats stay byte-wise (see comment
+        # in _apply_predictor_and_assemble for why floats are byte-wise).
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Reshape so each tile's rows are contiguous (they already are).
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_decode_kernel[bpg, tpb](
+                d_decomp, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_decomp.view(dtype=cupy.dtype(dtype))
+            _predictor_decode_kernel_samples[bpg, tpb](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
 
     elif predictor == 3:
@@ -1723,6 +1766,8 @@ def _extract_tiles_kernel(
 def _predictor_encode_kernel(data, width, height, bytes_per_sample):
     """Apply horizontal differencing (predictor=2), one thread per row.
     Process right-to-left to avoid overwriting values we still need.
+
+    Byte-wise variant retained for 8-bit samples.
     """
     row = cuda.grid(1)
     if row >= height:
@@ -1737,6 +1782,24 @@ def _predictor_encode_kernel(data, width, height, bytes_per_sample):
             (numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF)
 
 
+@cuda.jit
+def _predictor_encode_kernel_samples(data_view, samples_per_row, height,
+                                     stride):
+    """Sample-wise horizontal-differencing encode, one thread per row.
+
+    Counterpart to :func:`_predictor_decode_kernel_samples` -- *stride* is
+    the per-channel sample stride (1 for single-band, samples_per_pixel
+    for chunky multi-band).  Operates right-to-left, in place.
+    """
+    row = cuda.grid(1)
+    if row >= height:
+        return
+    row_start = row * samples_per_row
+    for col in range(samples_per_row - 1, stride - 1, -1):
+        idx = row_start + col
+        data_view[idx] = data_view[idx] - data_view[idx - stride]
+
+
 @cuda.jit
 def _fp_predictor_encode_kernel(data, tmp, width, height, bps):
     """Apply floating-point predictor (predictor=3), one thread per row."""
@@ -2307,8 +2370,19 @@ def gpu_compress_tiles(d_image, tile_width, tile_height,
     if predictor == 2:
         tpb_r = min(256, total_rows)
         bpg_r = math.ceil(total_rows / tpb_r)
-        _predictor_encode_kernel[bpg_r, tpb_r](
-            d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples)
+        # Sample-wise encode for multi-byte integer dtypes; byte-wise for
+        # 8-bit samples and floats (predictor=2 on floats is outside spec
+        # but xrspatial historically used byte-wise diffs, which the
+        # decoder still expects).
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_encode_kernel[bpg_r, tpb_r](
+                d_tile_buf, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_tile_buf.view(dtype=cupy.dtype(dtype))
+            _predictor_encode_kernel_samples[bpg_r, tpb_r](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
     elif predictor == 3:
         tpb_r = min(256, total_rows)
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index feb59574..72d0649e 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -341,15 +341,18 @@ def _open_source(source: str):
 
 def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
                      height: int, bytes_per_sample: int,
-                     samples: int = 1) -> np.ndarray:
+                     samples: int = 1,
+                     dtype: np.dtype | None = None) -> np.ndarray:
     """Apply the appropriate predictor decode to decompressed data.
 
     ``width``, ``height``, ``bytes_per_sample``, and ``samples`` describe
     the raw pixel layout before predictor inversion: ``width * samples``
     samples per row, each ``bytes_per_sample`` bytes wide.
 
-    Predictor=2 (horizontal differencing) works byte-wise on a stride of
-    ``bytes_per_sample * samples``.
+    Predictor=2 (horizontal differencing) is sample-wise per TIFF spec,
+    using ``dtype`` (with the file's byte order) to interpret the byte
+    stream as whole sample values.  An 8-bit fast path keeps the legacy
+    byte-wise loop.
 
     Predictor=3 (floating-point) byte-swizzles each row into
     ``bytes_per_sample`` interleaved lanes of length ``width * samples``,
@@ -359,7 +362,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
     """
     if pred == 2:
         return predictor_decode(chunk, width, height,
-                                bytes_per_sample * samples)
+                                bytes_per_sample, dtype=dtype,
+                                samples=samples)
     elif pred == 3:
         return fp_predictor_decode(chunk, width * samples, height,
                                    bytes_per_sample)
@@ -412,8 +416,14 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples,
     if pred in (2, 3) and not is_sub_byte:
         if not chunk.flags.writeable:
             chunk = chunk.copy()
+        # Predictor=2 needs the sample dtype with the file's byte order so
+        # that multi-byte samples can be decoded sample-wise (carries
+        # propagate within each sample).  Predictor=3 only operates on the
+        # byte stream itself so dtype is unused there.
+        file_sample_dtype = dtype.newbyteorder(byte_order)
         chunk = _apply_predictor(chunk, pred, width, height,
-                                 bytes_per_sample, samples=samples)
+                                 bytes_per_sample, samples=samples,
+                                 dtype=file_sample_dtype)
 
     if is_sub_byte:
         pixels = unpack_bits(chunk, bps, pixel_count)
diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index 8f0d7197..1ff998b5 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -93,11 +93,18 @@ def normalize_predictor(predictor, dtype, compression: int) -> int:
 
 def _apply_predictor_encode(buf: np.ndarray, predictor: int,
                             width: int, height: int,
-                            bytes_per_sample: int, samples: int) -> np.ndarray:
-    """Apply the chosen predictor to a flat uint8 buffer."""
+                            bytes_per_sample: int, samples: int,
+                            dtype: np.dtype | None = None) -> np.ndarray:
+    """Apply the chosen predictor to a flat uint8 buffer.
+
+    For predictor=2 with multi-byte samples the buffer is interpreted as
+    samples in *dtype*'s byte order (the file's byte order, always
+    little-endian for files this writer emits).
+    """
     if predictor == 2:
         return predictor_encode(buf, width, height,
-                                bytes_per_sample * samples)
+                                bytes_per_sample, dtype=dtype,
+                                samples=samples)
     if predictor == 3:
         return fp_predictor_encode(buf, width * samples, height,
                                    bytes_per_sample)
@@ -380,7 +387,8 @@ def _write_stripped(data: np.ndarray, compression: int, predictor: int,
             strip_arr = np.ascontiguousarray(data[r0:r1])
             buf = strip_arr.view(np.uint8).ravel().copy()
             buf = _apply_predictor_encode(
-                buf, predictor, width, strip_rows, bytes_per_sample, samples)
+                buf, predictor, width, strip_rows, bytes_per_sample, samples,
+                dtype=np.dtype(dtype).newbyteorder(BO))
             strip_data = buf.tobytes()
             if compression_level is None:
                 compressed = compress(strip_data, compression)
@@ -447,7 +455,8 @@ def _prepare_tile(data, tr, tc, th, tw, height, width, samples, dtype,
     elif predictor != 1 and compression != COMPRESSION_NONE:
         buf = tile_arr.view(np.uint8).ravel().copy()
         buf = _apply_predictor_encode(
-            buf, predictor, tw, th, bytes_per_sample, samples)
+            buf, predictor, tw, th, bytes_per_sample, samples,
+            dtype=np.dtype(dtype).newbyteorder(BO))
         tile_data = buf.tobytes()
     else:
         tile_data = tile_arr.tobytes()
@@ -1089,7 +1098,8 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample,
     if predictor != 1 and compression != COMPRESSION_NONE:
         buf = arr.view(np.uint8).ravel().copy()
         buf = _apply_predictor_encode(
-            buf, predictor, block_w, block_h, bytes_per_sample, samples)
+            buf, predictor, block_w, block_h, bytes_per_sample, samples,
+            dtype=np.dtype(dtype).newbyteorder(BO))
         raw_data = buf.tobytes()
     else:
         raw_data = arr.tobytes()
diff --git a/xrspatial/geotiff/tests/test_predictor2_samplewise.py b/xrspatial/geotiff/tests/test_predictor2_samplewise.py
new file mode 100644
index 00000000..449e000a
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_predictor2_samplewise.py
@@ -0,0 +1,190 @@
+"""Predictor=2 sample-wise round-trip tests against libtiff/GDAL.
+
+The historical implementation differenced the byte stream at a sample
+stride (so it lost carries between bytes inside a single multi-byte
+sample).  Files xrspatial wrote could be read back because both halves
+were wrong in matching ways, but reading multi-byte integer files
+written by libtiff/GDAL produced silently-wrong pixel values whenever a
+difference between adjacent samples crossed a byte boundary.
+
+These tests use rasterio (libtiff) and tifffile/imagecodecs as reference
+encoders and check that xrspatial reads them back exactly.
+"""
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff._reader import read_to_array
+
+rasterio = pytest.importorskip("rasterio")
+tifffile = pytest.importorskip("tifffile")
+pytest.importorskip("imagecodecs")  # required by tifffile for predictor
+
+
+def _carry_array(dtype):
+    """Pixel grid that crosses a byte boundary between adjacent samples."""
+    return np.array(
+        [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8],
+            [256, 257, 258, 259],
+            [1000, 2000, 3000, 4000],
+        ],
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("dtype", ["uint16", "int16", "uint32", "int32"])
+def test_libtiff_predictor2_le_round_trip(tmp_path, dtype):
+    """xrspatial reads libtiff-encoded LE predictor=2 files exactly."""
+    arr = _carry_array(dtype)
+    path = tmp_path / f"libtiff_pred2_le_{dtype}.tif"
+    transform = rasterio.transform.from_origin(0, 4, 1, 1)
+    with rasterio.open(
+        str(path),
+        "w",
+        driver="GTiff",
+        height=4,
+        width=4,
+        count=1,
+        dtype=dtype,
+        compress="deflate",
+        predictor=2,
+        transform=transform,
+    ) as dst:
+        dst.write(arr, 1)
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+@pytest.mark.parametrize("dtype", ["uint16", "int16", "uint32", "int32"])
+def test_tifffile_predictor2_be_round_trip(tmp_path, dtype):
+    """xrspatial reads big-endian predictor=2 files exactly."""
+    arr = _carry_array(dtype)
+    path = tmp_path / f"tifffile_pred2_be_{dtype}.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2, compression="deflate")
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_libtiff_multiband_predictor2_chunky(tmp_path):
+    """Chunky multi-band uint16 predictor=2 (libtiff per-channel diffs)."""
+    arr = np.array(
+        [
+            [[1, 100], [2, 200]],
+            [[3, 300], [4, 400]],
+        ],
+        dtype="uint16",
+    )
+    arr_band_first = np.moveaxis(arr, -1, 0)  # rasterio expects (band, y, x)
+
+    path = tmp_path / "libtiff_pred2_multiband.tif"
+    transform = rasterio.transform.from_origin(0, 2, 1, 1)
+    with rasterio.open(
+        str(path),
+        "w",
+        driver="GTiff",
+        height=2,
+        width=2,
+        count=2,
+        dtype="uint16",
+        compress="deflate",
+        predictor=2,
+        interleave="pixel",
+        transform=transform,
+    ) as dst:
+        dst.write(arr_band_first)
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_xrspatial_round_trip_uint16_predictor2(tmp_path):
+    """xrspatial-written predictor=2 uint16 still round-trips."""
+    from xrspatial.geotiff import open_geotiff, to_geotiff
+    import xarray as xr
+
+    arr = _carry_array("uint16")
+    da = xr.DataArray(
+        arr,
+        dims=("y", "x"),
+        coords={
+            "x": np.arange(4, dtype=np.float64),
+            "y": np.arange(4, dtype=np.float64),
+        },
+    )
+    path = tmp_path / "self_pred2_u16.tif"
+    to_geotiff(da, str(path), compression="deflate", predictor=2)
+
+    out = open_geotiff(str(path))
+    np.testing.assert_array_equal(out.values, arr)
+
+
+def test_xrspatial_uint16_predictor2_readable_by_rasterio(tmp_path):
+    """xrspatial-written predictor=2 uint16 is readable by libtiff."""
+    from xrspatial.geotiff import to_geotiff
+    import xarray as xr
+
+    arr = _carry_array("uint16")
+    da = xr.DataArray(
+        arr,
+        dims=("y", "x"),
+        coords={
+            "x": np.arange(4, dtype=np.float64),
+            "y": np.arange(4, dtype=np.float64),
+        },
+    )
+    path = tmp_path / "interop_pred2_u16.tif"
+    to_geotiff(da, str(path), compression="deflate", predictor=2)
+
+    with rasterio.open(str(path)) as src:
+        rio_arr = src.read(1)
+    np.testing.assert_array_equal(rio_arr, arr)
+
+
+def _gpu_to_numpy(da):
+    """Materialize a possibly cupy-backed DataArray as a numpy ndarray."""
+    arr = da.data
+    if hasattr(arr, "get"):
+        return arr.get()
+    return np.asarray(arr)
+
+
+@pytest.mark.parametrize("dtype", ["uint16", "int32"])
+def test_gpu_predictor2_multibyte_matches_libtiff(tmp_path, dtype):
+    """The GPU path also decodes libtiff predictor=2 multi-byte files."""
+    cupy = pytest.importorskip("cupy")
+    if not cupy.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    from xrspatial.geotiff import open_geotiff
+
+    arr = _carry_array(dtype)
+    path = tmp_path / f"gpu_pred2_{dtype}.tif"
+    transform = rasterio.transform.from_origin(0, 4, 1, 1)
+    with rasterio.open(
+        str(path),
+        "w",
+        driver="GTiff",
+        height=4,
+        width=4,
+        count=1,
+        dtype=dtype,
+        compress="deflate",
+        predictor=2,
+        tiled=True,
+        blockxsize=16,
+        blockysize=16,
+        transform=transform,
+    ) as dst:
+        dst.write(arr, 1)
+
+    cpu_arr = open_geotiff(str(path)).values
+    np.testing.assert_array_equal(cpu_arr, arr)
+
+    gpu_arr = _gpu_to_numpy(open_geotiff(str(path), gpu=True))
+    np.testing.assert_array_equal(gpu_arr, arr)