Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions xrspatial/geotiff/_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,13 @@ def samples_per_pixel(self) -> int:
def sample_format(self) -> int:
    """Return the SampleFormat tag value, defaulting to 1 (unsigned int).

    Tuple-valued tags collapse to their first entry. A malformed TIFF can
    carry a SampleFormat tag whose count field was corrupted to zero
    (single-byte bit-rot has been observed doing this); an empty tuple
    therefore falls back to the default instead of raising IndexError, so
    the caller can either proceed with a sensible dtype or fail later with
    a typed ValueError.
    """
    value = self.get_value(TAG_SAMPLE_FORMAT, 1)
    if not isinstance(value, tuple):
        return value
    # Empty tuple == corrupted count field: use the TIFF default (1).
    return value[0] if value else 1

Expand Down
17 changes: 17 additions & 0 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,12 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
if offsets is None or byte_counts is None:
raise ValueError("Missing strip offsets or byte counts")

# A corrupt header can report RowsPerStrip=0, which would divide by zero
# below. Reject it as a typed parse error rather than letting the
# ZeroDivisionError leak out to the caller.
if rps is None or rps <= 0:
raise ValueError(f"Invalid RowsPerStrip: {rps!r}")

planar = ifd.planar_config # 1=chunky (interleaved), 2=planar (separate)

# Determine output region
Expand All @@ -940,6 +946,17 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,

_check_dimensions(out_w, out_h, samples, max_pixels)

# StripByteCounts must have at least one entry per strip; a corrupt count
# field can shrink it. Detect the mismatch after the dimension safety
# check so an oversized header raises the safety-limit error first, then
# raise a typed ValueError here instead of IndexError when the loop
# indexes past the end.
n_strips_expected = (height + rps - 1) // rps
if len(offsets) < n_strips_expected or len(byte_counts) < n_strips_expected:
raise ValueError(
f"Strip table truncated: expected {n_strips_expected} entries, "
f"got offsets={len(offsets)}, byte_counts={len(byte_counts)}")

# Sparse strips (StripByteCounts == 0) must materialise as nodata or 0
# rather than be decoded. Pre-fill the result so any skipped strips
# land on a known fill value.
Expand Down
323 changes: 323 additions & 0 deletions xrspatial/geotiff/tests/test_fuzz_hypothesis_1661.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
"""Hypothesis property and fuzz tests for the geotiff module (#1661).

Three property groups:

1. Round-trip: random valid (dtype, compression, tiled, predictor, nodata) ->
write with ``to_geotiff`` -> read with ``open_geotiff`` -> assert array
equality and attrs preservation.

2. IFD layout permutations via ``make_minimal_tiff``: assert ``open_geotiff``
returns a valid array, or raises ``ValueError`` / ``TypeError`` from the
geotiff module. Never bare ``IndexError`` / ``struct.error`` /
``UnicodeDecodeError``.

3. Single-byte mutation: flip one byte in a valid TIFF at a Hypothesis-chosen
offset. Reader must either parse consistently or raise a typed exception.

The whole file is skipped if ``hypothesis`` is not installed -- it is not a
hard test dep yet (see issue #1661 unresolved questions). Each test bounds
example count and disables Hypothesis's deadline so CI variance doesn't
flake.
"""
from __future__ import annotations

import io
import struct

import numpy as np
import pytest
import xarray as xr

hypothesis = pytest.importorskip("hypothesis")
from hypothesis import HealthCheck, example, given, settings # noqa: E402
from hypothesis import strategies as st # noqa: E402

from xrspatial.geotiff import open_geotiff, to_geotiff # noqa: E402

from .conftest import make_minimal_tiff # noqa: E402


# Exception classes the geotiff module may legitimately raise when handed
# invalid input. Anything outside this tuple signals an undocumented
# failure mode: either the strategy produced input we should reject
# explicitly, or there is a genuine parser bug.
ALLOWED_PARSE_EXCEPTIONS = (ValueError, TypeError)

# Codecs that round-trip losslessly for every dtype in our strategy.
# 'jpeg' is rejected on write (see _VALID_COMPRESSIONS docstring), while
# 'lerc' and 'jpeg2000' are lossy or dtype-restricted and would require
# narrower strategies of their own, so neither group appears here.
LOSSLESS_CODECS = ['none', 'deflate', 'lzw', 'packbits', 'zstd', 'lz4']

# Deliberately small dtype set to keep CI fast: float and int, signed and
# unsigned representatives.
ROUND_TRIP_DTYPES = ['uint8', 'uint16', 'int16', 'int32', 'float32', 'float64']


# --- Strategies ---

@st.composite
def round_trip_inputs(draw):
    """Generate (DataArray, compression, tiled, predictor) for round-trip."""
    w = draw(st.integers(min_value=1, max_value=32))
    h = draw(st.integers(min_value=1, max_value=32))
    dtype_name = draw(st.sampled_from(ROUND_TRIP_DTYPES))
    compression = draw(st.sampled_from(LOSSLESS_CODECS))
    tiled = draw(st.booleans())

    np_dtype = np.dtype(dtype_name)
    if np_dtype.kind == 'f':
        # Predictor 3 is the floating-point predictor; False means none.
        predictor = draw(st.sampled_from([False, 3]))
        seed = draw(st.integers(min_value=0, max_value=1_000_000))
        arr = np.random.default_rng(seed).standard_normal((h, w)).astype(np_dtype)
    else:
        # Predictor 2 (horizontal differencing) suits integer data.
        predictor = draw(st.sampled_from([False, 2]))
        seed = draw(st.integers(min_value=0, max_value=1_000_000))
        info = np.iinfo(np_dtype)
        # Stay away from the extreme edges of the type range; some codecs
        # reserve sentinel values there.
        low = info.min // 2 if info.min < 0 else 0
        arr = np.random.default_rng(seed).integers(
            low=low,
            high=info.max // 2,
            size=(h, w),
            dtype=np_dtype,
        )

    return xr.DataArray(arr, dims=('y', 'x')), compression, tiled, predictor


@st.composite
def ifd_layout_inputs(draw):
    """Generate a valid (or borderline) make_minimal_tiff invocation."""
    # Dict literal evaluates top-to-bottom, so the Hypothesis draw order
    # matches the field order below.
    return {
        'width': draw(st.integers(min_value=1, max_value=16)),
        'height': draw(st.integers(min_value=1, max_value=16)),
        'dtype': np.dtype(draw(st.sampled_from(
            ['uint8', 'uint16', 'int16', 'float32']))),
        # make_minimal_tiff only supports uncompressed (compression type 1).
        'compression': 1,
        'tiled': draw(st.booleans()),
        'tile_size': draw(st.sampled_from([4, 8, 16])),
        'big_endian': draw(st.booleans()),
        'with_geo': draw(st.booleans()),
    }


# --- Group 1: round-trip property ---

@given(inputs=round_trip_inputs())
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
)
def test_round_trip_property(tmp_path_factory, inputs):
    """to_geotiff -> open_geotiff preserves array values bitwise."""
    da, compression, tiled, predictor = inputs

    out_path = str(tmp_path_factory.mktemp("fuzz_1661_rt") / "rt.tif")
    to_geotiff(
        da,
        out_path,
        compression=compression,
        tiled=tiled,
        predictor=predictor,
    )

    result = open_geotiff(out_path, dtype=str(da.dtype))

    # The reader may prepend a band axis; drop it for the 2D comparison.
    values = result.values
    if values.ndim == 3 and values.shape[0] == 1:
        values = values[0]

    np.testing.assert_array_equal(values, da.values)


# --- Group 2: IFD layout permutations ---

@given(spec=ifd_layout_inputs())
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow],
)
def test_ifd_layout_typed_errors_only(spec):
    """make_minimal_tiff variations parse cleanly or raise a typed exception.

    The reader may refuse any particular combination with ValueError /
    TypeError; what it must never do is leak a bare IndexError,
    struct.error, UnicodeDecodeError, or anything else suggesting an
    unchecked walk off the end of the byte buffer.
    """
    if spec['with_geo']:
        geo_transform = (-120.0, 45.0, 0.001, -0.001)
        epsg = 4326
    else:
        geo_transform = None
        epsg = None

    tiff_bytes = make_minimal_tiff(
        width=spec['width'],
        height=spec['height'],
        dtype=spec['dtype'],
        compression=spec['compression'],
        tiled=spec['tiled'],
        tile_size=spec['tile_size'],
        big_endian=spec['big_endian'],
        geo_transform=geo_transform,
        epsg=epsg,
    )

    try:
        da = open_geotiff(io.BytesIO(tiff_bytes))
    except ALLOWED_PARSE_EXCEPTIONS:
        return  # Typed refusal -- acceptable.
    except Exception as exc:
        pytest.fail(
            f"open_geotiff raised non-typed {type(exc).__name__} on a "
            f"valid-by-construction TIFF: {spec!r} -> {exc!r}"
        )

    # Parsed: the last two dims must match the requested shape (the reader
    # may add a leading band axis when samples == 1).
    assert da.shape[-2:] == (spec['height'], spec['width']), (
        f"shape mismatch: got {da.shape}, expected last dims "
        f"({spec['height']}, {spec['width']}) for {spec!r}"
    )


# --- Group 3: byte-level mutation fuzz ---

# Hold a single corpus TIFF and let Hypothesis pick a byte offset + new byte
# value to splice in. Using a fixed corpus keeps the strategy fast (no
# nested TIFF generation per example) and concentrates the search on the
# parser's response to bit-rot.
_CORPUS_SPECS = [
# (kwargs to make_minimal_tiff, label)
(dict(width=4, height=4, dtype=np.dtype('float32')), 'le_strip_f32'),
(dict(width=4, height=4, dtype=np.dtype('uint16'), big_endian=True), 'be_strip_u16'),
(dict(width=8, height=8, dtype=np.dtype('float32'), tiled=True, tile_size=4),
'le_tiled_f32'),
(dict(width=4, height=4, dtype=np.dtype('float32'),
geo_transform=(-120.0, 45.0, 0.001, -0.001), epsg=4326),
'le_geo_f32'),
]
_CORPUS = [(label, make_minimal_tiff(**kw)) for kw, label in _CORPUS_SPECS]


@pytest.mark.parametrize("label,base_tiff", _CORPUS, ids=[name for name, _ in _CORPUS])
# Regression seeds for bugs surfaced by the initial Hypothesis run on the
# le_strip_f32 corpus member (4x4 float32, 198 bytes total):
#   offset 102, byte 0x00 -> ZeroDivisionError in _read_strips (rps=0)
#   offset 110, byte 0x00 -> IndexError in _read_strips (StripByteCounts trunc)
#   offset 122, byte 0x00 -> IndexError in sample_format (empty tuple)
# The offsets are specific to the le_strip_f32 layout; the other corpus
# members exercise the same code at different offsets, which is fine -- the
# @example lines only guarantee the known regressions run every time.
@example(offset_frac=102 / 198, new_byte=0x00)
@example(offset_frac=110 / 198, new_byte=0x00)
@example(offset_frac=122 / 198, new_byte=0x00)
@given(
    offset_frac=st.floats(min_value=0.0, max_value=0.999),
    new_byte=st.integers(min_value=0, max_value=255),
)
@settings(
    max_examples=50,
    deadline=None,
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
)
def test_single_byte_mutation_typed_errors(label, base_tiff, offset_frac, new_byte):
    """Flip one byte of a valid TIFF; reader must parse or raise typed exc.

    A mutated file may still parse (the flipped byte can land in pixel
    data, where any value is legal for the dtype). What is unacceptable is
    a bare ``IndexError`` / ``struct.error`` from reading past the buffer,
    or a segfault from the GPU/dask paths -- those stay out of this test
    because only the eager numpy path is used.
    """
    buf = bytearray(base_tiff)
    offset = int(offset_frac * len(buf))
    # Guarantee an actual flip, not a no-op write.
    if buf[offset] == new_byte:
        new_byte = (new_byte + 1) & 0xFF
    buf[offset] = new_byte

    try:
        da = open_geotiff(io.BytesIO(bytes(buf)))
    except ALLOWED_PARSE_EXCEPTIONS:
        return
    except (MemoryError, OverflowError):
        # A header field may decode to an absurd dimension/offset. Treat
        # these as acceptable refusals: the user gets a clear failure
        # instead of silently wrong data.
        return
    except Exception as exc:
        pytest.fail(
            f"[{label}] single-byte mutation at offset {offset} -> {new_byte:#x} "
            f"raised non-typed {type(exc).__name__}: {exc!r}"
        )

    # A successful parse must yield a real DataArray whose values actually
    # materialise -- forcing the realisation catches lazy decode errors.
    assert isinstance(da, xr.DataArray)
    _ = np.asarray(da.values)


# --- Smoke test that the module wired itself up ---

def test_corpus_baseline_parses():
    """Sanity check: every corpus TIFF parses without mutation."""
    for name, raw in _CORPUS:
        parsed = open_geotiff(io.BytesIO(raw))
        assert isinstance(parsed, xr.DataArray), name
        assert parsed.size > 0, name


# --- Targeted regressions for bugs found by the property tests above ---
# These three were caught by the byte-mutation property on first run and
# fixed alongside this PR. They live here (not in a separate file) so the
# regression context stays next to the harness that found them.

def test_regression_rows_per_strip_zero_is_typed_error():
    """rps=0 must raise ValueError, not ZeroDivisionError."""
    corrupted = bytearray(make_minimal_tiff(4, 4, np.dtype('float32')))
    corrupted[102] = 0  # Zeroes the RowsPerStrip value in this layout.
    with pytest.raises(ValueError):
        open_geotiff(io.BytesIO(bytes(corrupted)))


def test_regression_strip_table_truncated_is_typed_error():
    """StripByteCounts shorter than strip count must raise ValueError."""
    corrupted = bytearray(make_minimal_tiff(4, 4, np.dtype('float32')))
    corrupted[110] = 0  # Truncates the strip table count in this layout.
    with pytest.raises(ValueError):
        open_geotiff(io.BytesIO(bytes(corrupted)))


def test_regression_empty_sample_format_tuple_does_not_indexerror():
    """SampleFormat tag with count=0 must fall back, not IndexError."""
    corrupted = bytearray(make_minimal_tiff(4, 4, np.dtype('float32')))
    corrupted[122] = 0  # Zeroes the SampleFormat count field in this layout.
    # Acceptable outcomes: parse with the default sample_format (1 =
    # unsigned int) into a DataArray, or fail downstream with a typed
    # ValueError. IndexError is the regression this guards against.
    try:
        da = open_geotiff(io.BytesIO(bytes(corrupted)))
    except ValueError:
        pass
    else:
        assert isinstance(da, xr.DataArray)
Loading