From 207b9debe3263ae51c390f13f11cc4dee6aa29cc Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 6 May 2026 05:56:24 -0700
Subject: [PATCH] Decode TIFF predictor=2 sample-wise so multi-byte integer
 files round-trip with libtiff

The horizontal-differencing predictor was implemented as a byte-stream
cumulative sum at a sample stride. That works for 8-bit data but loses
carries between bytes inside the same multi-byte sample, so any difference
between adjacent samples that crossed a byte boundary came out wrong. Files
written by xrspatial read back fine because both halves were wrong in
matching ways, but TIFFs from libtiff/GDAL with predictor=2 and uint16,
int16, uint32 or int32 samples decoded silently to incorrect pixel values.

Now the decode operates on whole sample values, viewed as the file's
sample dtype with the file's byte order, and per-channel for chunky
multi-band data. The 8-bit byte-wise fast path stays. Predictor=2 on
floats stays byte-wise to keep the historical xrspatial round-trip
working (the spec restricts predictor=2 to integers; floats are out of
spec for this predictor).

The GPU decode and encode kernels gained sample-wise variants and the
encode dispatch was also fixed: it had been called with row_bytes
multiplied by samples-per-pixel twice, which only happened to be inert
because the byte-wise math was wrong on both sides.

New regression tests in test_predictor2_samplewise.py read libtiff- and
tifffile-written files (LE and BE; uint16, int16, uint32, int32; chunky
multi-band) and confirm exact equality on both CPU and GPU paths.
---
 CHANGELOG.md                                  |   6 +
 xrspatial/geotiff/_compression.py             | 160 +++++++++++----
 xrspatial/geotiff/_gpu_decode.py              | 102 ++++++++--
 xrspatial/geotiff/_reader.py                  |  20 +-
 xrspatial/geotiff/_writer.py                  |  22 +-
 .../tests/test_predictor2_samplewise.py       | 190 ++++++++++++++++++
 6 files changed, 437 insertions(+), 63 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_predictor2_samplewise.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0cc2b4ec..b1afaecb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 -----------
 
 
+### Unreleased
+
+#### Bug fixes and improvements
+- Decode TIFF predictor=2 sample-wise so multi-byte integer files written by libtiff/GDAL read back exactly
+
+
 ### Version 0.9.9 - 2026-05-05
 
 #### Bug fixes and improvements
diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py
index 087ebbe9..cd7b06cd 100644
--- a/xrspatial/geotiff/_compression.py
+++ b/xrspatial/geotiff/_compression.py
@@ -333,80 +333,164 @@ def lzw_compress(data: bytes) -> bytes:
 
 
 # -- Horizontal predictor (Numba) --------------------------------------------
+#
+# TIFF predictor=2 (horizontal differencing) operates on whole *samples*
+# (per-pixel-component values), not on individual bytes.  For multi-byte
+# integer samples the difference is computed across full sample values, so
+# carries between the low and high bytes are honoured.  The byte stream that
+# lands on disk is the difference samples re-encoded in the file's byte
+# order (always little-endian for files this writer emits, but readers must
+# also handle big-endian sources).
+#
+# The previous implementation here did byte-wise cumulative sum at a sample
+# stride, which produced the right answer for 8-bit data but silently
+# scrambled multi-byte integers whenever the difference between adjacent
+# samples crossed a byte boundary (i.e. needed a carry).  Both halves were
+# wrong in matching ways, so files this codebase wrote could be re-read,
+# but cross-tool reads of files written by libtiff/GDAL produced incorrect
+# pixel values.  See accuracy sweep finding (2026-05-06).
 
 @ngjit
-def _predictor_decode(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing predictor (TIFF predictor=2).
+def _predictor_decode_u8(data, row_len, height, stride):
+    """Undo horizontal differencing on a byte stream (in-place, mod 256).
 
-    Operates in-place on the flat byte array, performing cumulative sum
-    per row at the sample level.
+    ``row_len`` is the full row length in bytes; ``stride`` is the number
+    of bytes between adjacent same-channel samples (e.g. samples_per_pixel
+    for chunky 8-bit multi-band data).
     """
-    row_bytes = width * bytes_per_sample
     for row in range(height):
-        row_start = row * row_bytes
-        for col in range(bytes_per_sample, row_bytes):
+        row_start = row * row_len
+        for col in range(stride, row_len):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) + np.int32(data[idx - stride])) & 0xFF)
 
 
 @ngjit
-def _predictor_encode(data, width, height, bytes_per_sample):
-    """Apply horizontal differencing predictor (TIFF predictor=2).
-
-    Operates in-place, converting absolute values to differences.
-    Process right-to-left to avoid overwriting values we still need.
-    """
-    row_bytes = width * bytes_per_sample
+def _predictor_encode_u8(data, row_len, height, stride):
+    """Apply horizontal differencing on a byte stream (in-place, mod 256)."""
     for row in range(height):
-        row_start = row * row_bytes
-        for col in range(row_bytes - 1, bytes_per_sample - 1, -1):
+        row_start = row * row_len
+        for col in range(row_len - 1, stride - 1, -1):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) - np.int32(data[idx - stride])) & 0xFF)
+
+
+def _predictor_decode_samples(data, image_width, height, dtype, samples=1):
+    """Sample-wise decode for multi-byte sample dtypes.
+
+    Modifies *data* (a flat uint8 buffer) in place.  ``dtype`` carries the
+    file's byte order so that the buffer is interpreted with the right
+    endianness; the result is written back in the same byte order.
+
+    For chunky multi-band data the cumulative sum runs per-channel so that
+    each channel's value is differenced against the previous same-channel
+    pixel (TIFF Technical Note 1, predictor=2).
+    """
+    if samples == 1:
+        view = data.view(dtype).reshape(height, image_width)
+    else:
+        view = data.view(dtype).reshape(height, image_width, samples)
+    # In-place cumsum in the dtype's modular arithmetic.  The ``out`` cast
+    # keeps the result wrapping inside the sample width for fixed-width
+    # integers.  Float dtypes never reach this branch (the public
+    # ``predictor_decode`` routes them to the byte-wise legacy kernel)
+    # because predictor=2 differencing on floats does not invert exactly.
+    np.cumsum(view, axis=1, dtype=dtype, out=view)
+
+
+def _predictor_encode_samples(data, image_width, height, dtype, samples=1):
+    """Sample-wise encode for multi-byte sample dtypes (in-place)."""
+    if samples == 1:
+        view = data.view(dtype).reshape(height, image_width)
+    else:
+        view = data.view(dtype).reshape(height, image_width, samples)
+    view[:, 1:] = np.diff(view, axis=1)
 
 
 def predictor_decode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int,
+                     dtype: np.dtype | None = None,
+                     samples: int = 1) -> np.ndarray:
     """Undo horizontal differencing predictor (predictor=2).
 
     Parameters
     ----------
     data : np.ndarray
         Flat uint8 array of decompressed pixel data (modified in-place).
-    width, height : int
-        Image dimensions.
+    width : int
+        Image width in pixels.  For chunky multi-band data the number of
+        samples per row is ``width * samples``.
+    height : int
+        Number of rows.
     bytes_per_sample : int
-        Bytes per sample (e.g. 1 for uint8, 4 for float32).
+        Bytes per sample (1, 2, 4, or 8).
+    dtype : np.dtype, optional
+        Sample dtype including the file's byte order.  Required when
+        ``bytes_per_sample > 1`` so the byte stream can be interpreted
+        as whole sample values; per TIFF spec, differencing is
+        sample-level (carries propagate within each sample's bytes).
+    samples : int
+        Samples per pixel for chunky planar config.
 
     Returns
     -------
     np.ndarray
         Same array, modified in-place.
+
+    Notes
+    -----
+    Predictor=2 differences each sample against the previous same-channel
+    sample (TIFF Tech Note 1).  For chunky multi-band data the byte
+    stride between same-channel samples is ``samples * bytes_per_sample``.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_decode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        # Byte-wise math is exact for 8-bit samples; stride = channels.
+        _predictor_decode_u8(buf, width * samples, height, samples)
+    elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
+        _predictor_decode_samples(buf, width, height, np.dtype(dtype),
+                                  samples=samples)
+    elif dtype is not None and np.dtype(dtype).kind == 'f':
+        # Predictor=2 on floats is outside the TIFF spec (which restricts
+        # this predictor to integers).  xrspatial historically allowed it
+        # with byte-wise differencing, which round-trips because the byte
+        # sequence is restored exactly even though the per-sample
+        # arithmetic is meaningless.  Preserve that behaviour here so
+        # files written by older versions still read back unchanged.
+        _predictor_decode_u8(buf, width * samples * bytes_per_sample,
+                             height, samples * bytes_per_sample)
+    else:
+        # Legacy path: byte-wise with caller-supplied stride.  Only correct
+        # for round-trips with a matching byte-wise encoder; do not feed
+        # this path libtiff/GDAL files for multi-byte sample data.
+        _predictor_decode_u8(buf, width * bytes_per_sample, height,
+                             bytes_per_sample)
     return buf
 
 
 def predictor_encode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int,
+                     dtype: np.dtype | None = None,
+                     samples: int = 1) -> np.ndarray:
     """Apply horizontal differencing predictor (predictor=2).
 
-    Parameters
-    ----------
-    data : np.ndarray
-        Flat uint8 array of pixel data (modified in-place).
-    width, height : int
-        Image dimensions.
-    bytes_per_sample : int
-        Bytes per sample.
-
-    Returns
-    -------
-    np.ndarray
-        Same array, modified in-place.
+    Parameters mirror :func:`predictor_decode`.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_encode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        _predictor_encode_u8(buf, width * samples, height, samples)
+    elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
+        _predictor_encode_samples(buf, width, height, np.dtype(dtype),
+                                  samples=samples)
+    elif dtype is not None and np.dtype(dtype).kind == 'f':
+        # See ``predictor_decode`` for why floats stay byte-wise.
+        _predictor_encode_u8(buf, width * samples * bytes_per_sample,
+                             height, samples * bytes_per_sample)
+    else:
+        _predictor_encode_u8(buf, width * bytes_per_sample, height,
+                             bytes_per_sample)
     return buf
 
 
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 67c36829..2dd5c82a 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -629,7 +629,12 @@ def _inflate_tiles_kernel(
 
 @cuda.jit
 def _predictor_decode_kernel(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing (predictor=2), one thread per row."""
+    """Undo horizontal differencing (predictor=2), one thread per row.
+
+    Byte-wise variant retained for 8-bit samples; multi-byte samples must
+    use :func:`_predictor_decode_kernel_samples` so carries between bytes
+    of the same sample propagate correctly.
+    """
     row = cuda.grid(1)
     if row >= height:
         return
@@ -643,6 +648,27 @@ def _predictor_decode_kernel(data, width, height, bytes_per_sample):
             (numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF)
 
 
+@cuda.jit
+def _predictor_decode_kernel_samples(data_view, samples_per_row, height,
+                                     stride):
+    """Sample-wise horizontal-differencing decode, one thread per row.
+
+    *data_view* is a 1-D device array typed as the file's sample dtype
+    (e.g. uint16 / int32 / float32) so that ``+=`` between elements
+    preserves carries between bytes within the same sample.  *stride* is
+    the number of typed samples between adjacent same-channel samples
+    (1 for single-band, samples_per_pixel for chunky multi-band) per
+    TIFF Tech Note 1.  Operates in place.
+    """
+    row = cuda.grid(1)
+    if row >= height:
+        return
+    row_start = row * samples_per_row
+    for col in range(stride, samples_per_row):
+        idx = row_start + col
+        data_view[idx] = data_view[idx] + data_view[idx - stride]
+
+
 @cuda.jit
 def _fp_predictor_decode_kernel(data, tmp, width, height, bps):
     """Undo floating-point predictor (predictor=3), one thread per row.
@@ -1383,11 +1409,21 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples). Matches CPU
-        # call at _reader.py _apply_predictor(..., bytes_per_sample * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
+        # TIFF predictor=2 differences whole samples (per channel).  For
+        # 8-bit samples (and for floats, where the spec disallows
+        # predictor=2 but xrspatial historically wrote byte-wise diffs
+        # that round-trip via the same byte-wise decoder) the byte-wise
+        # kernel runs.  Multi-byte integer samples view the buffer as the
+        # sample dtype so carries propagate within each sample.
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_decode_kernel[bpg, tpb](
+                d_decomp, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_decomp.view(dtype=cupy.dtype(dtype))
+            _predictor_decode_kernel_samples[bpg, tpb](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
     elif predictor == 3:
         total_rows = n_tiles * tile_height
@@ -1629,15 +1665,22 @@ def gpu_decode_tiles(
 
     # Apply predictor on GPU
     if predictor == 2:
-        # Horizontal differencing: one thread per row across all tiles
+        # Horizontal differencing: one thread per row across all tiles.
+        # Multi-byte integer samples use the sample-wise kernel so carries
+        # propagate; 8-bit samples and floats stay byte-wise (see comment
+        # in _apply_predictor_and_assemble for why floats are byte-wise).
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Reshape so each tile's rows are contiguous (they already are).
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_decode_kernel[bpg, tpb](
+                d_decomp, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_decomp.view(dtype=cupy.dtype(dtype))
+            _predictor_decode_kernel_samples[bpg, tpb](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
 
     elif predictor == 3:
@@ -1723,6 +1766,8 @@ def _extract_tiles_kernel(
 def _predictor_encode_kernel(data, width, height, bytes_per_sample):
     """Apply horizontal differencing (predictor=2), one thread per row.
     Process right-to-left to avoid overwriting values we still need.
+
+    Byte-wise variant retained for 8-bit samples.
     """
     row = cuda.grid(1)
     if row >= height:
@@ -1737,6 +1782,24 @@ def _predictor_encode_kernel(data, width, height, bytes_per_sample):
             (numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF)
 
 
+@cuda.jit
+def _predictor_encode_kernel_samples(data_view, samples_per_row, height,
+                                     stride):
+    """Sample-wise horizontal-differencing encode, one thread per row.
+
+    Counterpart to :func:`_predictor_decode_kernel_samples` -- *stride* is
+    the per-channel sample stride (1 for single-band, samples_per_pixel
+    for chunky multi-band).  Operates right-to-left, in place.
+    """
+    row = cuda.grid(1)
+    if row >= height:
+        return
+    row_start = row * samples_per_row
+    for col in range(samples_per_row - 1, stride - 1, -1):
+        idx = row_start + col
+        data_view[idx] = data_view[idx] - data_view[idx - stride]
+
+
 @cuda.jit
 def _fp_predictor_encode_kernel(data, tmp, width, height, bps):
     """Apply floating-point predictor (predictor=3), one thread per row."""
@@ -2307,8 +2370,19 @@ def gpu_compress_tiles(d_image, tile_width, tile_height,
     if predictor == 2:
         tpb_r = min(256, total_rows)
         bpg_r = math.ceil(total_rows / tpb_r)
-        _predictor_encode_kernel[bpg_r, tpb_r](
-            d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples)
+        # Sample-wise encode for multi-byte integer dtypes; byte-wise for
+        # 8-bit samples and floats (predictor=2 on floats is outside spec
+        # but xrspatial historically used byte-wise diffs, which the
+        # decoder still expects).
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_encode_kernel[bpg_r, tpb_r](
+                d_tile_buf, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_tile_buf.view(dtype=cupy.dtype(dtype))
+            _predictor_encode_kernel_samples[bpg_r, tpb_r](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
     elif predictor == 3:
         tpb_r = min(256, total_rows)
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index feb59574..72d0649e 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -341,15 +341,18 @@ def _open_source(source: str):
 
 def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
                      height: int, bytes_per_sample: int,
-                     samples: int = 1) -> np.ndarray:
+                     samples: int = 1,
+                     dtype: np.dtype | None = None) -> np.ndarray:
     """Apply the appropriate predictor decode to decompressed data.
 
     ``width``, ``height``, ``bytes_per_sample``, and ``samples`` describe
     the raw pixel layout before predictor inversion: ``width * samples``
     samples per row, each ``bytes_per_sample`` bytes wide.
 
-    Predictor=2 (horizontal differencing) works byte-wise on a stride of
-    ``bytes_per_sample * samples``.
+    Predictor=2 (horizontal differencing) is sample-wise per TIFF spec,
+    using ``dtype`` (with the file's byte order) to interpret the byte
+    stream as whole sample values.  An 8-bit fast path keeps the legacy
+    byte-wise loop.
 
     Predictor=3 (floating-point) byte-swizzles each row into
     ``bytes_per_sample`` interleaved lanes of length ``width * samples``,
@@ -359,7 +362,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int,
     """
     if pred == 2:
         return predictor_decode(chunk, width, height,
-                                bytes_per_sample * samples)
+                                bytes_per_sample, dtype=dtype,
+                                samples=samples)
     elif pred == 3:
         return fp_predictor_decode(chunk, width * samples, height,
                                    bytes_per_sample)
@@ -412,8 +416,14 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples,
     if pred in (2, 3) and not is_sub_byte:
         if not chunk.flags.writeable:
             chunk = chunk.copy()
+        # Predictor=2 needs the sample dtype with the file's byte order so
+        # that multi-byte samples can be decoded sample-wise (carries
+        # propagate within each sample).  Predictor=3 only operates on the
+        # byte stream itself so dtype is unused there.
+        file_sample_dtype = dtype.newbyteorder(byte_order)
         chunk = _apply_predictor(chunk, pred, width, height,
-                                 bytes_per_sample, samples=samples)
+                                 bytes_per_sample, samples=samples,
+                                 dtype=file_sample_dtype)
 
     if is_sub_byte:
         pixels = unpack_bits(chunk, bps, pixel_count)
diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index 8f0d7197..1ff998b5 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -93,11 +93,18 @@ def normalize_predictor(predictor, dtype, compression: int) -> int:
 
 def _apply_predictor_encode(buf: np.ndarray, predictor: int,
                             width: int, height: int,
-                            bytes_per_sample: int, samples: int) -> np.ndarray:
-    """Apply the chosen predictor to a flat uint8 buffer."""
+                            bytes_per_sample: int, samples: int,
+                            dtype: np.dtype | None = None) -> np.ndarray:
+    """Apply the chosen predictor to a flat uint8 buffer.
+
+    For predictor=2 with multi-byte samples the buffer is interpreted as
+    samples in *dtype*'s byte order (the file's byte order, always
+    little-endian for files this writer emits).
+    """
     if predictor == 2:
         return predictor_encode(buf, width, height,
-                                bytes_per_sample * samples)
+                                bytes_per_sample, dtype=dtype,
+                                samples=samples)
     if predictor == 3:
         return fp_predictor_encode(buf, width * samples, height,
                                    bytes_per_sample)
@@ -380,7 +387,8 @@ def _write_stripped(data: np.ndarray, compression: int, predictor: int,
             strip_arr = np.ascontiguousarray(data[r0:r1])
             buf = strip_arr.view(np.uint8).ravel().copy()
             buf = _apply_predictor_encode(
-                buf, predictor, width, strip_rows, bytes_per_sample, samples)
+                buf, predictor, width, strip_rows, bytes_per_sample, samples,
+                dtype=np.dtype(dtype).newbyteorder(BO))
             strip_data = buf.tobytes()
             if compression_level is None:
                 compressed = compress(strip_data, compression)
@@ -447,7 +455,8 @@ def _prepare_tile(data, tr, tc, th, tw, height, width, samples, dtype,
     elif predictor != 1 and compression != COMPRESSION_NONE:
         buf = tile_arr.view(np.uint8).ravel().copy()
         buf = _apply_predictor_encode(
-            buf, predictor, tw, th, bytes_per_sample, samples)
+            buf, predictor, tw, th, bytes_per_sample, samples,
+            dtype=np.dtype(dtype).newbyteorder(BO))
         tile_data = buf.tobytes()
     else:
         tile_data = tile_arr.tobytes()
@@ -1089,7 +1098,8 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample,
     if predictor != 1 and compression != COMPRESSION_NONE:
         buf = arr.view(np.uint8).ravel().copy()
         buf = _apply_predictor_encode(
-            buf, predictor, block_w, block_h, bytes_per_sample, samples)
+            buf, predictor, block_w, block_h, bytes_per_sample, samples,
+            dtype=np.dtype(dtype).newbyteorder(BO))
         raw_data = buf.tobytes()
     else:
         raw_data = arr.tobytes()
diff --git a/xrspatial/geotiff/tests/test_predictor2_samplewise.py b/xrspatial/geotiff/tests/test_predictor2_samplewise.py
new file mode 100644
index 00000000..449e000a
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_predictor2_samplewise.py
@@ -0,0 +1,190 @@
+"""Predictor=2 sample-wise round-trip tests against libtiff/GDAL.
+
+The historical implementation differenced the byte stream at a sample
+stride (so it lost carries between bytes inside a single multi-byte
+sample).  Files xrspatial wrote could be read back because both halves
+were wrong in matching ways, but reading multi-byte integer files
+written by libtiff/GDAL produced silently-wrong pixel values whenever a
+difference between adjacent samples crossed a byte boundary.
+
+These tests use rasterio (libtiff) and tifffile/imagecodecs as reference
+encoders and check that xrspatial reads them back exactly.
+"""
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff._reader import read_to_array
+
+rasterio = pytest.importorskip("rasterio")
+tifffile = pytest.importorskip("tifffile")
+pytest.importorskip("imagecodecs")  # required by tifffile for predictor
+
+
+def _carry_array(dtype):
+    """Pixel grid that crosses a byte boundary between adjacent samples."""
+    return np.array(
+        [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8],
+            [256, 257, 258, 259],
+            [1000, 2000, 3000, 4000],
+        ],
+        dtype=dtype,
+    )
+
+
+@pytest.mark.parametrize("dtype", ["uint16", "int16", "uint32", "int32"])
+def test_libtiff_predictor2_le_round_trip(tmp_path, dtype):
+    """xrspatial reads libtiff-encoded LE predictor=2 files exactly."""
+    arr = _carry_array(dtype)
+    path = tmp_path / f"libtiff_pred2_le_{dtype}.tif"
+    transform = rasterio.transform.from_origin(0, 4, 1, 1)
+    with rasterio.open(
+        str(path),
+        "w",
+        driver="GTiff",
+        height=4,
+        width=4,
+        count=1,
+        dtype=dtype,
+        compress="deflate",
+        predictor=2,
+        transform=transform,
+    ) as dst:
+        dst.write(arr, 1)
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+@pytest.mark.parametrize("dtype", ["uint16", "int16", "uint32", "int32"])
+def test_tifffile_predictor2_be_round_trip(tmp_path, dtype):
+    """xrspatial reads big-endian predictor=2 files exactly."""
+    arr = _carry_array(dtype)
+    path = tmp_path / f"tifffile_pred2_be_{dtype}.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2, compression="deflate")
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_libtiff_multiband_predictor2_chunky(tmp_path):
+    """Chunky multi-band uint16 predictor=2 (libtiff per-channel diffs)."""
+    arr = np.array(
+        [
+            [[1, 100], [2, 200]],
+            [[3, 300], [4, 400]],
+        ],
+        dtype="uint16",
+    )
+    arr_band_first = np.moveaxis(arr, -1, 0)  # rasterio expects (band, y, x)
+
+    path = tmp_path / "libtiff_pred2_multiband.tif"
+    transform = rasterio.transform.from_origin(0, 2, 1, 1)
+    with rasterio.open(
+        str(path),
+        "w",
+        driver="GTiff",
+        height=2,
+        width=2,
+        count=2,
+        dtype="uint16",
+        compress="deflate",
+        predictor=2,
+        interleave="pixel",
+        transform=transform,
+    ) as dst:
+        dst.write(arr_band_first)
+
+    out, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(out, arr)
+
+
+def test_xrspatial_round_trip_uint16_predictor2(tmp_path):
+    """xrspatial-written predictor=2 uint16 still round-trips."""
+    from xrspatial.geotiff import open_geotiff, to_geotiff
+    import xarray as xr
+
+    arr = _carry_array("uint16")
+    da = xr.DataArray(
+        arr,
+        dims=("y", "x"),
+        coords={
+            "x": np.arange(4, dtype=np.float64),
+            "y": np.arange(4, dtype=np.float64),
+        },
+    )
+    path = tmp_path / "self_pred2_u16.tif"
+    to_geotiff(da, str(path), compression="deflate", predictor=2)
+
+    out = open_geotiff(str(path))
+    np.testing.assert_array_equal(out.values, arr)
+
+
+def test_xrspatial_uint16_predictor2_readable_by_rasterio(tmp_path):
+    """xrspatial-written predictor=2 uint16 is readable by libtiff."""
+    from xrspatial.geotiff import to_geotiff
+    import xarray as xr
+
+    arr = _carry_array("uint16")
+    da = xr.DataArray(
+        arr,
+        dims=("y", "x"),
+        coords={
+            "x": np.arange(4, dtype=np.float64),
+            "y": np.arange(4, dtype=np.float64),
+        },
+    )
+    path = tmp_path / "interop_pred2_u16.tif"
+    to_geotiff(da, str(path), compression="deflate", predictor=2)
+
+    with rasterio.open(str(path)) as src:
+        rio_arr = src.read(1)
+    np.testing.assert_array_equal(rio_arr, arr)
+
+
+def _gpu_to_numpy(da):
+    """Materialize a possibly cupy-backed DataArray as a numpy ndarray."""
+    arr = da.data
+    if hasattr(arr, "get"):
+        return arr.get()
+    return np.asarray(arr)
+
+
+@pytest.mark.parametrize("dtype", ["uint16", "int32"])
+def test_gpu_predictor2_multibyte_matches_libtiff(tmp_path, dtype):
+    """The GPU path also decodes libtiff predictor=2 multi-byte files."""
+    cupy = pytest.importorskip("cupy")
+    if not cupy.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    from xrspatial.geotiff import open_geotiff
+
+    arr = _carry_array(dtype)
+    path = tmp_path / f"gpu_pred2_{dtype}.tif"
+    transform = rasterio.transform.from_origin(0, 4, 1, 1)
+    with rasterio.open(
+        str(path),
+        "w",
+        driver="GTiff",
+        height=4,
+        width=4,
+        count=1,
+        dtype=dtype,
+        compress="deflate",
+        predictor=2,
+        tiled=True,
+        blockxsize=16,
+        blockysize=16,
+        transform=transform,
+    ) as dst:
+        dst.write(arr, 1)
+
+    cpu_arr = open_geotiff(str(path)).values
+    np.testing.assert_array_equal(cpu_arr, arr)
+
+    gpu_arr = _gpu_to_numpy(open_geotiff(str(path), gpu=True))
+    np.testing.assert_array_equal(gpu_arr, arr)