diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cc2b4ec..b1afaecb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ----------- +### Unreleased + +#### Bug fixes and improvements +- Decode TIFF predictor=2 sample-wise so multi-byte integer files written by libtiff/GDAL read back exactly + + ### Version 0.9.9 - 2026-05-05 #### Bug fixes and improvements diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py index 087ebbe9..cd7b06cd 100644 --- a/xrspatial/geotiff/_compression.py +++ b/xrspatial/geotiff/_compression.py @@ -333,80 +333,164 @@ def lzw_compress(data: bytes) -> bytes: # -- Horizontal predictor (Numba) -------------------------------------------- +# +# TIFF predictor=2 (horizontal differencing) operates on whole *samples* +# (per-pixel-component values), not on individual bytes. For multi-byte +# integer samples the difference is computed across full sample values, so +# carries between the low and high bytes are honoured. The byte stream that +# lands on disk is the difference samples re-encoded in the file's byte +# order (always little-endian for files this writer emits, but readers must +# also handle big-endian sources). +# +# The previous implementation here did byte-wise cumulative sum at a sample +# stride, which produced the right answer for 8-bit data but silently +# scrambled multi-byte integers whenever the difference between adjacent +# samples crossed a byte boundary (i.e. needed a carry). Both halves were +# wrong in matching ways, so files this codebase wrote could be re-read, +# but cross-tool reads of files written by libtiff/GDAL produced incorrect +# pixel values. See accuracy sweep finding (2026-05-06). @ngjit -def _predictor_decode(data, width, height, bytes_per_sample): - """Undo horizontal differencing predictor (TIFF predictor=2). +def _predictor_decode_u8(data, row_len, height, stride): + """Undo horizontal differencing on a byte stream (in-place, mod 256). - Operates in-place on the flat byte array, performing cumulative sum - per row at the sample level. + ``row_len`` is the full row length in bytes; ``stride`` is the number + of bytes between adjacent same-channel samples (e.g. samples_per_pixel + for chunky 8-bit multi-band data). """ - row_bytes = width * bytes_per_sample for row in range(height): - row_start = row * row_bytes - for col in range(bytes_per_sample, row_bytes): + row_start = row * row_len + for col in range(stride, row_len): idx = row_start + col - data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF) + data[idx] = np.uint8( + (np.int32(data[idx]) + np.int32(data[idx - stride])) & 0xFF) @ngjit -def _predictor_encode(data, width, height, bytes_per_sample): - """Apply horizontal differencing predictor (TIFF predictor=2). - - Operates in-place, converting absolute values to differences. - Process right-to-left to avoid overwriting values we still need. - """ - row_bytes = width * bytes_per_sample +def _predictor_encode_u8(data, row_len, height, stride): + """Apply horizontal differencing on a byte stream (in-place, mod 256).""" for row in range(height): - row_start = row * row_bytes - for col in range(row_bytes - 1, bytes_per_sample - 1, -1): + row_start = row * row_len + for col in range(row_len - 1, stride - 1, -1): idx = row_start + col - data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF) + data[idx] = np.uint8( + (np.int32(data[idx]) - np.int32(data[idx - stride])) & 0xFF) + + +def _predictor_decode_samples(data, image_width, height, dtype, samples=1): + """Sample-wise decode for multi-byte sample dtypes. + + Modifies *data* (a flat uint8 buffer) in place. ``dtype`` carries the + file's byte order so that the buffer is interpreted with the right + endianness; the result is written back in the same byte order. + + For chunky multi-band data the cumulative sum runs per-channel so that + each channel's value is differenced against the previous same-channel + pixel (TIFF Technical Note 1, predictor=2). + """ + if samples == 1: + view = data.view(dtype).reshape(height, image_width) + else: + view = data.view(dtype).reshape(height, image_width, samples) + # In-place cumsum in the dtype's modular arithmetic. The ``out`` cast + # keeps the result wrapping inside the sample width for fixed-width + # integers. Float dtypes never reach this branch (the public + # ``predictor_decode`` routes them to the byte-wise legacy kernel) + # because predictor=2 differencing on floats does not invert exactly. + np.cumsum(view, axis=1, dtype=dtype, out=view) + + +def _predictor_encode_samples(data, image_width, height, dtype, samples=1): + """Sample-wise encode for multi-byte sample dtypes (in-place).""" + if samples == 1: + view = data.view(dtype).reshape(height, image_width) + else: + view = data.view(dtype).reshape(height, image_width, samples) + view[:, 1:] = np.diff(view, axis=1) def predictor_decode(data: np.ndarray, width: int, height: int, - bytes_per_sample: int) -> np.ndarray: + bytes_per_sample: int, + dtype: np.dtype | None = None, + samples: int = 1) -> np.ndarray: """Undo horizontal differencing predictor (predictor=2). Parameters ---------- data : np.ndarray Flat uint8 array of decompressed pixel data (modified in-place). - width, height : int - Image dimensions. + width : int + Image width in pixels. For chunky multi-band data the number of + samples per row is ``width * samples``. + height : int + Number of rows. bytes_per_sample : int - Bytes per sample (e.g. 1 for uint8, 4 for float32). + Bytes per sample (1, 2, 4, or 8). + dtype : np.dtype, optional + Sample dtype including the file's byte order. Required when + ``bytes_per_sample > 1`` so the byte stream can be interpreted + as whole sample values; per TIFF spec, differencing is + sample-level (carries propagate within each sample's bytes). + samples : int + Samples per pixel for chunky planar config. Returns ------- np.ndarray Same array, modified in-place. + + Notes + ----- + Predictor=2 differences each sample against the previous same-channel + sample (TIFF Tech Note 1). For chunky multi-band data the byte + stride between same-channel samples is ``samples * bytes_per_sample``. """ buf = np.ascontiguousarray(data) - _predictor_decode(buf, width, height, bytes_per_sample) + if bytes_per_sample == 1: + # Byte-wise math is exact for 8-bit samples; stride = channels. + _predictor_decode_u8(buf, width * samples, height, samples) + elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'): + _predictor_decode_samples(buf, width, height, np.dtype(dtype), + samples=samples) + elif dtype is not None and np.dtype(dtype).kind == 'f': + # Predictor=2 on floats is outside the TIFF spec (which restricts + # this predictor to integers). xrspatial historically allowed it + # with byte-wise differencing, which round-trips because the byte + # sequence is restored exactly even though the per-sample + # arithmetic is meaningless. Preserve that behaviour here so + # files written by older versions still read back unchanged. + _predictor_decode_u8(buf, width * samples * bytes_per_sample, + height, samples * bytes_per_sample) + else: + # Legacy path: byte-wise with caller-supplied stride. Only correct + # for round-trips with a matching byte-wise encoder; do not feed + # this path libtiff/GDAL files for multi-byte sample data. + _predictor_decode_u8(buf, width * bytes_per_sample, height, + bytes_per_sample) return buf def predictor_encode(data: np.ndarray, width: int, height: int, - bytes_per_sample: int) -> np.ndarray: + bytes_per_sample: int, + dtype: np.dtype | None = None, + samples: int = 1) -> np.ndarray: """Apply horizontal differencing predictor (predictor=2). - Parameters - ---------- - data : np.ndarray - Flat uint8 array of pixel data (modified in-place). - width, height : int - Image dimensions. - bytes_per_sample : int - Bytes per sample. - - Returns - ------- - np.ndarray - Same array, modified in-place. + Parameters mirror :func:`predictor_decode`. """ buf = np.ascontiguousarray(data) - _predictor_encode(buf, width, height, bytes_per_sample) + if bytes_per_sample == 1: + _predictor_encode_u8(buf, width * samples, height, samples) + elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'): + _predictor_encode_samples(buf, width, height, np.dtype(dtype), + samples=samples) + elif dtype is not None and np.dtype(dtype).kind == 'f': + # See ``predictor_decode`` for why floats stay byte-wise. + _predictor_encode_u8(buf, width * samples * bytes_per_sample, + height, samples * bytes_per_sample) + else: + _predictor_encode_u8(buf, width * bytes_per_sample, height, + bytes_per_sample) return buf diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 67c36829..2dd5c82a 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -629,7 +629,12 @@ def _inflate_tiles_kernel( @cuda.jit def _predictor_decode_kernel(data, width, height, bytes_per_sample): - """Undo horizontal differencing (predictor=2), one thread per row.""" + """Undo horizontal differencing (predictor=2), one thread per row. + + Byte-wise variant retained for 8-bit samples; multi-byte samples must + use :func:`_predictor_decode_kernel_samples` so carries between bytes + of the same sample propagate correctly. + """ row = cuda.grid(1) if row >= height: return @@ -643,6 +648,27 @@ def _predictor_decode_kernel(data, width, height, bytes_per_sample): (numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF) +@cuda.jit +def _predictor_decode_kernel_samples(data_view, samples_per_row, height, + stride): + """Sample-wise horizontal-differencing decode, one thread per row. + + *data_view* is a 1-D device array typed as the file's sample dtype + (e.g. uint16 / int32 / float32) so that ``+=`` between elements + preserves carries between bytes within the same sample. *stride* is + the number of typed samples between adjacent same-channel samples + (1 for single-band, samples_per_pixel for chunky multi-band) per + TIFF Tech Note 1. Operates in place. + """ + row = cuda.grid(1) + if row >= height: + return + row_start = row * samples_per_row + for col in range(stride, samples_per_row): + idx = row_start + col + data_view[idx] = data_view[idx] + data_view[idx - stride] + + @cuda.jit def _fp_predictor_decode_kernel(data, tmp, width, height, bps): """Undo floating-point predictor (predictor=3), one thread per row. @@ -1383,11 +1409,21 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, total_rows = n_tiles * tile_height tpb = min(256, total_rows) bpg = math.ceil(total_rows / tpb) - # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel - # width and full per-pixel size (itemsize * samples). Matches CPU - # call at _reader.py _apply_predictor(..., bytes_per_sample * samples). - _predictor_decode_kernel[bpg, tpb]( - d_decomp, tile_width, total_rows, dtype.itemsize * samples) + # TIFF predictor=2 differences whole samples (per channel). For + # 8-bit samples (and for floats, where the spec disallows + # predictor=2 but xrspatial historically wrote byte-wise diffs + # that round-trip via the same byte-wise decoder) the byte-wise + # kernel runs. Multi-byte integer samples view the buffer as the + # sample dtype so carries propagate within each sample. + sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u')) + if not sample_wise: + _predictor_decode_kernel[bpg, tpb]( + d_decomp, tile_width, total_rows, samples * dtype.itemsize) + else: + samples_per_row = tile_width * samples + view = d_decomp.view(dtype=cupy.dtype(dtype)) + _predictor_decode_kernel_samples[bpg, tpb]( + view, samples_per_row, total_rows, samples) cuda.synchronize() elif predictor == 3: total_rows = n_tiles * tile_height @@ -1629,15 +1665,22 @@ def gpu_decode_tiles( # Apply predictor on GPU if predictor == 2: - # Horizontal differencing: one thread per row across all tiles + # Horizontal differencing: one thread per row across all tiles. + # Multi-byte integer samples use the sample-wise kernel so carries + # propagate; 8-bit samples and floats stay byte-wise (see comment + # in _apply_predictor_and_assemble for why floats are byte-wise). total_rows = n_tiles * tile_height tpb = min(256, total_rows) bpg = math.ceil(total_rows / tpb) - # Reshape so each tile's rows are contiguous (they already are). - # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel - # width and full per-pixel size (itemsize * samples). - _predictor_decode_kernel[bpg, tpb]( - d_decomp, tile_width, total_rows, dtype.itemsize * samples) + sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u')) + if not sample_wise: + _predictor_decode_kernel[bpg, tpb]( + d_decomp, tile_width, total_rows, samples * dtype.itemsize) + else: + samples_per_row = tile_width * samples + view = d_decomp.view(dtype=cupy.dtype(dtype)) + _predictor_decode_kernel_samples[bpg, tpb]( + view, samples_per_row, total_rows, samples) cuda.synchronize() elif predictor == 3: @@ -1723,6 +1766,8 @@ def _extract_tiles_kernel( def _predictor_encode_kernel(data, width, height, bytes_per_sample): """Apply horizontal differencing (predictor=2), one thread per row. Process right-to-left to avoid overwriting values we still need. + + Byte-wise variant retained for 8-bit samples. """ row = cuda.grid(1) if row >= height: @@ -1737,6 +1782,24 @@ def _predictor_encode_kernel(data, width, height, bytes_per_sample): (numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF) +@cuda.jit +def _predictor_encode_kernel_samples(data_view, samples_per_row, height, + stride): + """Sample-wise horizontal-differencing encode, one thread per row. + + Counterpart to :func:`_predictor_decode_kernel_samples` -- *stride* is + the per-channel sample stride (1 for single-band, samples_per_pixel + for chunky multi-band). Operates right-to-left, in place. + """ + row = cuda.grid(1) + if row >= height: + return + row_start = row * samples_per_row + for col in range(samples_per_row - 1, stride - 1, -1): + idx = row_start + col + data_view[idx] = data_view[idx] - data_view[idx - stride] + + @cuda.jit def _fp_predictor_encode_kernel(data, tmp, width, height, bps): """Apply floating-point predictor (predictor=3), one thread per row.""" @@ -2307,8 +2370,19 @@ def gpu_compress_tiles(d_image, tile_width, tile_height, if predictor == 2: tpb_r = min(256, total_rows) bpg_r = math.ceil(total_rows / tpb_r) - _predictor_encode_kernel[bpg_r, tpb_r]( - d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples) + # Sample-wise encode for multi-byte integer dtypes; byte-wise for + # 8-bit samples and floats (predictor=2 on floats is outside spec + # but xrspatial historically used byte-wise diffs, which the + # decoder still expects). + sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u')) + if not sample_wise: + _predictor_encode_kernel[bpg_r, tpb_r]( + d_tile_buf, tile_width, total_rows, samples * dtype.itemsize) + else: + samples_per_row = tile_width * samples + view = d_tile_buf.view(dtype=cupy.dtype(dtype)) + _predictor_encode_kernel_samples[bpg_r, tpb_r]( + view, samples_per_row, total_rows, samples) cuda.synchronize() elif predictor == 3: tpb_r = min(256, total_rows) diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index feb59574..72d0649e 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -341,15 +341,18 @@ def _open_source(source: str): def _apply_predictor(chunk: np.ndarray, pred: int, width: int, height: int, bytes_per_sample: int, - samples: int = 1) -> np.ndarray: + samples: int = 1, + dtype: np.dtype | None = None) -> np.ndarray: """Apply the appropriate predictor decode to decompressed data. ``width``, ``height``, ``bytes_per_sample``, and ``samples`` describe the raw pixel layout before predictor inversion: ``width * samples`` samples per row, each ``bytes_per_sample`` bytes wide. - Predictor=2 (horizontal differencing) works byte-wise on a stride of - ``bytes_per_sample * samples``. + Predictor=2 (horizontal differencing) is sample-wise per TIFF spec, + using ``dtype`` (with the file's byte order) to interpret the byte + stream as whole sample values. An 8-bit fast path keeps the legacy + byte-wise loop. Predictor=3 (floating-point) byte-swizzles each row into ``bytes_per_sample`` interleaved lanes of length ``width * samples``, @@ -359,7 +362,8 @@ def _apply_predictor(chunk: np.ndarray, pred: int, width: int, """ if pred == 2: return predictor_decode(chunk, width, height, - bytes_per_sample * samples) + bytes_per_sample, dtype=dtype, + samples=samples) elif pred == 3: return fp_predictor_decode(chunk, width * samples, height, bytes_per_sample) @@ -412,8 +416,14 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples, if pred in (2, 3) and not is_sub_byte: if not chunk.flags.writeable: chunk = chunk.copy() + # Predictor=2 needs the sample dtype with the file's byte order so + # that multi-byte samples can be decoded sample-wise (carries + # propagate within each sample). Predictor=3 only operates on the + # byte stream itself so dtype is unused there. + file_sample_dtype = dtype.newbyteorder(byte_order) chunk = _apply_predictor(chunk, pred, width, height, - bytes_per_sample, samples=samples) + bytes_per_sample, samples=samples, + dtype=file_sample_dtype) if is_sub_byte: pixels = unpack_bits(chunk, bps, pixel_count) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 8f0d7197..1ff998b5 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -93,11 +93,18 @@ def normalize_predictor(predictor, dtype, compression: int) -> int: def _apply_predictor_encode(buf: np.ndarray, predictor: int, width: int, height: int, - bytes_per_sample: int, samples: int) -> np.ndarray: - """Apply the chosen predictor to a flat uint8 buffer.""" + bytes_per_sample: int, samples: int, + dtype: np.dtype | None = None) -> np.ndarray: + """Apply the chosen predictor to a flat uint8 buffer. + + For predictor=2 with multi-byte samples the buffer is interpreted as + samples in *dtype*'s byte order (the file's byte order, always + little-endian for files this writer emits). + """ if predictor == 2: return predictor_encode(buf, width, height, - bytes_per_sample * samples) + bytes_per_sample, dtype=dtype, + samples=samples) if predictor == 3: return fp_predictor_encode(buf, width * samples, height, bytes_per_sample) @@ -380,7 +387,8 @@ def _write_stripped(data: np.ndarray, compression: int, predictor: int, strip_arr = np.ascontiguousarray(data[r0:r1]) buf = strip_arr.view(np.uint8).ravel().copy() buf = _apply_predictor_encode( - buf, predictor, width, strip_rows, bytes_per_sample, samples) + buf, predictor, width, strip_rows, bytes_per_sample, samples, + dtype=np.dtype(dtype).newbyteorder(BO)) strip_data = buf.tobytes() if compression_level is None: compressed = compress(strip_data, compression) @@ -447,7 +455,8 @@ def _prepare_tile(data, tr, tc, th, tw, height, width, samples, dtype, elif predictor != 1 and compression != COMPRESSION_NONE: buf = tile_arr.view(np.uint8).ravel().copy() buf = _apply_predictor_encode( - buf, predictor, tw, th, bytes_per_sample, samples) + buf, predictor, tw, th, bytes_per_sample, samples, + dtype=np.dtype(dtype).newbyteorder(BO)) tile_data = buf.tobytes() else: tile_data = tile_arr.tobytes() @@ -1089,7 +1098,8 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample, if predictor != 1 and compression != COMPRESSION_NONE: buf = arr.view(np.uint8).ravel().copy() buf = _apply_predictor_encode( - buf, predictor, block_w, block_h, bytes_per_sample, samples) + buf, predictor, block_w, block_h, bytes_per_sample, samples, + dtype=np.dtype(dtype).newbyteorder(BO)) raw_data = buf.tobytes() else: raw_data = arr.tobytes() diff --git a/xrspatial/geotiff/tests/test_predictor2_samplewise.py b/xrspatial/geotiff/tests/test_predictor2_samplewise.py new file mode 100644 index 00000000..449e000a --- /dev/null +++ b/xrspatial/geotiff/tests/test_predictor2_samplewise.py @@ -0,0 +1,190 @@ +"""Predictor=2 sample-wise round-trip tests against libtiff/GDAL. + +The historical implementation differenced the byte stream at a sample +stride (so it lost carries between bytes inside a single multi-byte +sample). Files xrspatial wrote could be read back because both halves +were wrong in matching ways, but reading multi-byte integer files +written by libtiff/GDAL produced silently-wrong pixel values whenever a +difference between adjacent samples crossed a byte boundary. + +These tests use rasterio (libtiff) and tifffile/imagecodecs as reference +encoders and check that xrspatial reads them back exactly. +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import read_to_array + +rasterio = pytest.importorskip("rasterio") +tifffile = pytest.importorskip("tifffile") +pytest.importorskip("imagecodecs") # required by tifffile for predictor + + +def _carry_array(dtype): + """Pixel grid that crosses a byte boundary between adjacent samples.""" + return np.array( + [ + [1, 2, 3, 4], + [5, 6, 7, 8], + [256, 257, 258, 259], + [1000, 2000, 3000, 4000], + ], + dtype=dtype, + ) + + +@pytest.mark.parametrize("dtype", ["uint16", "int16", "uint32", "int32"]) +def test_libtiff_predictor2_le_round_trip(tmp_path, dtype): + """xrspatial reads libtiff-encoded LE predictor=2 files exactly.""" + arr = _carry_array(dtype) + path = tmp_path / f"libtiff_pred2_le_{dtype}.tif" + transform = rasterio.transform.from_origin(0, 4, 1, 1) + with rasterio.open( + str(path), + "w", + driver="GTiff", + height=4, + width=4, + count=1, + dtype=dtype, + compress="deflate", + predictor=2, + transform=transform, + ) as dst: + dst.write(arr, 1) + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +@pytest.mark.parametrize("dtype", ["uint16", "int16", "uint32", "int32"]) +def test_tifffile_predictor2_be_round_trip(tmp_path, dtype): + """xrspatial reads big-endian predictor=2 files exactly.""" + arr = _carry_array(dtype) + path = tmp_path / f"tifffile_pred2_be_{dtype}.tif" + tifffile.imwrite( + str(path), arr, byteorder=">", predictor=2, compression="deflate") + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_libtiff_multiband_predictor2_chunky(tmp_path): + """Chunky multi-band uint16 predictor=2 (libtiff per-channel diffs).""" + arr = np.array( + [ + [[1, 100], [2, 200]], + [[3, 300], [4, 400]], + ], + dtype="uint16", + ) + arr_band_first = np.moveaxis(arr, -1, 0) # rasterio expects (band, y, x) + + path = tmp_path / "libtiff_pred2_multiband.tif" + transform = rasterio.transform.from_origin(0, 2, 1, 1) + with rasterio.open( + str(path), + "w", + driver="GTiff", + height=2, + width=2, + count=2, + dtype="uint16", + compress="deflate", + predictor=2, + interleave="pixel", + transform=transform, + ) as dst: + dst.write(arr_band_first) + + out, _ = read_to_array(str(path)) + np.testing.assert_array_equal(out, arr) + + +def test_xrspatial_round_trip_uint16_predictor2(tmp_path): + """xrspatial-written predictor=2 uint16 still round-trips.""" + from xrspatial.geotiff import open_geotiff, to_geotiff + import xarray as xr + + arr = _carry_array("uint16") + da = xr.DataArray( + arr, + dims=("y", "x"), + coords={ + "x": np.arange(4, dtype=np.float64), + "y": np.arange(4, dtype=np.float64), + }, + ) + path = tmp_path / "self_pred2_u16.tif" + to_geotiff(da, str(path), compression="deflate", predictor=2) + + out = open_geotiff(str(path)) + np.testing.assert_array_equal(out.values, arr) + + +def test_xrspatial_uint16_predictor2_readable_by_rasterio(tmp_path): + """xrspatial-written predictor=2 uint16 is readable by libtiff.""" + from xrspatial.geotiff import to_geotiff + import xarray as xr + + arr = _carry_array("uint16") + da = xr.DataArray( + arr, + dims=("y", "x"), + coords={ + "x": np.arange(4, dtype=np.float64), + "y": np.arange(4, dtype=np.float64), + }, + ) + path = tmp_path / "interop_pred2_u16.tif" + to_geotiff(da, str(path), compression="deflate", predictor=2) + + with rasterio.open(str(path)) as src: + rio_arr = src.read(1) + np.testing.assert_array_equal(rio_arr, arr) + + +def _gpu_to_numpy(da): + """Materialize a possibly cupy-backed DataArray as a numpy ndarray.""" + arr = da.data + if hasattr(arr, "get"): + return arr.get() + return np.asarray(arr) + + +@pytest.mark.parametrize("dtype", ["uint16", "int32"]) +def test_gpu_predictor2_multibyte_matches_libtiff(tmp_path, dtype): + """The GPU path also decodes libtiff predictor=2 multi-byte files.""" + cupy = pytest.importorskip("cupy") + if not cupy.cuda.is_available(): + pytest.skip("CUDA not available") + + from xrspatial.geotiff import open_geotiff + + arr = _carry_array(dtype) + path = tmp_path / f"gpu_pred2_{dtype}.tif" + transform = rasterio.transform.from_origin(0, 4, 1, 1) + with rasterio.open( + str(path), + "w", + driver="GTiff", + height=4, + width=4, + count=1, + dtype=dtype, + compress="deflate", + predictor=2, + tiled=True, + blockxsize=16, + blockysize=16, + transform=transform, + ) as dst: + dst.write(arr, 1) + + cpu_arr = open_geotiff(str(path)).values + np.testing.assert_array_equal(cpu_arr, arr) + + gpu_arr = _gpu_to_numpy(open_geotiff(str(path), gpu=True)) + np.testing.assert_array_equal(gpu_arr, arr)