diff --git a/xrspatial/geotiff/_header.py b/xrspatial/geotiff/_header.py
index dea1182e..63531d8e 100644
--- a/xrspatial/geotiff/_header.py
+++ b/xrspatial/geotiff/_header.py
@@ -164,6 +164,13 @@ def samples_per_pixel(self) -> int:
     def sample_format(self) -> int:
         v = self.get_value(TAG_SAMPLE_FORMAT, 1)
         if isinstance(v, tuple):
+            # A SampleFormat tag with count=0 has been seen in malformed
+            # TIFFs (single-byte corruption flips the count field). Fall back
+            # to the default rather than raising IndexError -- the caller can
+            # then either succeed with a sensible dtype or fail with a typed
+            # ValueError downstream.
+            if len(v) == 0:
+                return 1
             return v[0]
         return v
 
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index 49235e95..cfc39a18 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -923,6 +923,12 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
     if offsets is None or byte_counts is None:
         raise ValueError("Missing strip offsets or byte counts")
 
+    # A corrupt header can report RowsPerStrip=0, which would divide by zero
+    # below. Reject it as a typed parse error rather than letting the
+    # ZeroDivisionError leak out to the caller.
+    if rps is None or rps <= 0:
+        raise ValueError(f"Invalid RowsPerStrip: {rps!r}")
+
     planar = ifd.planar_config  # 1=chunky (interleaved), 2=planar (separate)
 
     # Determine output region
@@ -940,6 +946,17 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
 
     _check_dimensions(out_w, out_h, samples, max_pixels)
 
+    # StripByteCounts must have at least one entry per strip; a corrupt count
+    # field can shrink it. Detect the mismatch after the dimension safety
+    # check so an oversized header raises the safety-limit error first, then
+    # raise a typed ValueError here instead of IndexError when the loop
+    # indexes past the end.
+    n_strips_expected = (height + rps - 1) // rps
+    if len(offsets) < n_strips_expected or len(byte_counts) < n_strips_expected:
+        raise ValueError(
+            f"Strip table truncated: expected {n_strips_expected} entries, "
+            f"got offsets={len(offsets)}, byte_counts={len(byte_counts)}")
+
     # Sparse strips (StripByteCounts == 0) must materialise as nodata or 0
     # rather than be decoded. Pre-fill the result so any skipped strips
     # land on a known fill value.
diff --git a/xrspatial/geotiff/tests/test_fuzz_hypothesis_1661.py b/xrspatial/geotiff/tests/test_fuzz_hypothesis_1661.py
new file mode 100644
index 00000000..cd1ee73c
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_fuzz_hypothesis_1661.py
@@ -0,0 +1,323 @@
+"""Hypothesis property and fuzz tests for the geotiff module (#1661).
+
+Three property groups:
+
+1. Round-trip: random valid (dtype, compression, tiled, predictor, nodata) ->
+   write with ``to_geotiff`` -> read with ``open_geotiff`` -> assert array
+   equality and attrs preservation.
+
+2. IFD layout permutations via ``make_minimal_tiff``: assert ``open_geotiff``
+   returns a valid array, or raises ``ValueError`` / ``TypeError`` from the
+   geotiff module. Never bare ``IndexError`` / ``struct.error`` /
+   ``UnicodeDecodeError``.
+
+3. Single-byte mutation: flip one byte in a valid TIFF at a Hypothesis-chosen
+   offset. Reader must either parse consistently or raise a typed exception.
+
+The whole file is skipped if ``hypothesis`` is not installed -- it is not a
+hard test dep yet (see issue #1661 unresolved questions). Each test bounds
+example count and disables Hypothesis's deadline so CI variance doesn't
+flake.
+"""
+from __future__ import annotations
+
+import io
+import struct
+
+import numpy as np
+import pytest
+import xarray as xr
+
+hypothesis = pytest.importorskip("hypothesis")
+from hypothesis import HealthCheck, example, given, settings  # noqa: E402
+from hypothesis import strategies as st  # noqa: E402
+
+from xrspatial.geotiff import open_geotiff, to_geotiff  # noqa: E402
+
+from .conftest import make_minimal_tiff  # noqa: E402
+
+
+# Exception types the geotiff module is allowed to raise on invalid input.
+# Any other exception class indicates an undocumented failure mode -- either
+# the strategy generated something we should reject explicitly, or there's
+# a real bug.
+ALLOWED_PARSE_EXCEPTIONS = (ValueError, TypeError)
+
+# Codecs safe for round-trip on every dtype in our strategy. 'jpeg' is
+# explicitly rejected on write (see _VALID_COMPRESSIONS docstring); 'lerc' and
+# 'jpeg2000' are lossy or dtype-restricted and would need their own narrower
+# strategies, so they're omitted here.
+LOSSLESS_CODECS = ['none', 'deflate', 'lzw', 'packbits', 'zstd', 'lz4']
+
+# Dtype set kept small to keep CI fast. Float and int, signed and unsigned.
+ROUND_TRIP_DTYPES = ['uint8', 'uint16', 'int16', 'int32', 'float32', 'float64']
+
+
+# --- Strategies ---
+
+@st.composite
+def round_trip_inputs(draw):
+    """Generate (DataArray, compression, tiled, predictor) for round-trip."""
+    width = draw(st.integers(min_value=1, max_value=32))
+    height = draw(st.integers(min_value=1, max_value=32))
+    dtype = draw(st.sampled_from(ROUND_TRIP_DTYPES))
+    compression = draw(st.sampled_from(LOSSLESS_CODECS))
+    tiled = draw(st.booleans())
+
+    np_dtype = np.dtype(dtype)
+    if np_dtype.kind == 'f':
+        # Predictor 3 is for floats only; 0/1 means no predictor.
+        predictor = draw(st.sampled_from([False, 3]))
+        data = draw(st.integers(min_value=0, max_value=1_000_000))
+        rng = np.random.default_rng(data)
+        arr = rng.standard_normal((height, width)).astype(np_dtype)
+    else:
+        # Predictor 2 is horizontal differencing, good for ints.
+        predictor = draw(st.sampled_from([False, 2]))
+        seed = draw(st.integers(min_value=0, max_value=1_000_000))
+        rng = np.random.default_rng(seed)
+        info = np.iinfo(np_dtype)
+        # Avoid the extreme edge of the type range; some codecs reserve sentinels.
+        arr = rng.integers(
+            low=info.min // 2 if info.min < 0 else 0,
+            high=info.max // 2,
+            size=(height, width),
+            dtype=np_dtype,
+        )
+
+    da = xr.DataArray(arr, dims=('y', 'x'))
+    return da, compression, tiled, predictor
+
+
+@st.composite
+def ifd_layout_inputs(draw):
+    """Generate a valid (or borderline) make_minimal_tiff invocation."""
+    width = draw(st.integers(min_value=1, max_value=16))
+    height = draw(st.integers(min_value=1, max_value=16))
+    dtype = draw(st.sampled_from(['uint8', 'uint16', 'int16', 'float32']))
+    compression = 1  # Uncompressed: make_minimal_tiff only supports type 1.
+    tiled = draw(st.booleans())
+    tile_size = draw(st.sampled_from([4, 8, 16]))
+    big_endian = draw(st.booleans())
+    with_geo = draw(st.booleans())
+
+    return dict(
+        width=width,
+        height=height,
+        dtype=np.dtype(dtype),
+        compression=compression,
+        tiled=tiled,
+        tile_size=tile_size,
+        big_endian=big_endian,
+        with_geo=with_geo,
+    )
+
+
+# --- Group 1: round-trip property ---
+
+@given(inputs=round_trip_inputs())
+@settings(
+    max_examples=50,
+    deadline=None,
+    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
+)
+def test_round_trip_property(tmp_path_factory, inputs):
+    """to_geotiff -> open_geotiff preserves array values bitwise."""
+    da, compression, tiled, predictor = inputs
+
+    tmp_dir = tmp_path_factory.mktemp("fuzz_1661_rt")
+    path = str(tmp_dir / "rt.tif")
+
+    to_geotiff(
+        da,
+        path,
+        compression=compression,
+        tiled=tiled,
+        predictor=predictor,
+    )
+
+    got = open_geotiff(path, dtype=str(da.dtype))
+
+    # Reader may add a leading band axis; squeeze for the 2D comparison.
+    got_arr = got.values
+    if got_arr.ndim == 3 and got_arr.shape[0] == 1:
+        got_arr = got_arr[0]
+
+    np.testing.assert_array_equal(got_arr, da.values)
+
+
+# --- Group 2: IFD layout permutations ---
+
+@given(spec=ifd_layout_inputs())
+@settings(
+    max_examples=50,
+    deadline=None,
+    suppress_health_check=[HealthCheck.too_slow],
+)
+def test_ifd_layout_typed_errors_only(spec):
+    """make_minimal_tiff variations parse cleanly or raise a typed exception.
+
+    The reader is allowed to refuse any specific combination with a
+    ValueError/TypeError; what is not allowed is a bare IndexError,
+    struct.error, UnicodeDecodeError, or anything else that suggests we
+    walked off the end of the byte buffer without checking.
+    """
+    geo_transform = None
+    epsg = None
+    if spec['with_geo']:
+        geo_transform = (-120.0, 45.0, 0.001, -0.001)
+        epsg = 4326
+
+    tiff_bytes = make_minimal_tiff(
+        width=spec['width'],
+        height=spec['height'],
+        dtype=spec['dtype'],
+        compression=spec['compression'],
+        tiled=spec['tiled'],
+        tile_size=spec['tile_size'],
+        big_endian=spec['big_endian'],
+        geo_transform=geo_transform,
+        epsg=epsg,
+    )
+
+    try:
+        da = open_geotiff(io.BytesIO(tiff_bytes))
+    except ALLOWED_PARSE_EXCEPTIONS:
+        return  # Typed refusal -- acceptable.
+    except Exception as exc:
+        pytest.fail(
+            f"open_geotiff raised non-typed {type(exc).__name__} on a "
+            f"valid-by-construction TIFF: {spec!r} -> {exc!r}"
+        )
+
+    # If it parsed, shape should match what we asked for. Reader may add a
+    # leading band axis (samples=1), so check the last two dims.
+    assert da.shape[-2:] == (spec['height'], spec['width']), (
+        f"shape mismatch: got {da.shape}, expected last dims "
+        f"({spec['height']}, {spec['width']}) for {spec!r}"
+    )
+
+
+# --- Group 3: byte-level mutation fuzz ---
+
+# Hold a single corpus TIFF and let Hypothesis pick a byte offset + new byte
+# value to splice in. Using a fixed corpus keeps the strategy fast (no
+# nested TIFF generation per example) and concentrates the search on the
+# parser's response to bit-rot.
+_CORPUS_SPECS = [
+    # (kwargs to make_minimal_tiff, label)
+    (dict(width=4, height=4, dtype=np.dtype('float32')), 'le_strip_f32'),
+    (dict(width=4, height=4, dtype=np.dtype('uint16'), big_endian=True), 'be_strip_u16'),
+    (dict(width=8, height=8, dtype=np.dtype('float32'), tiled=True, tile_size=4),
+     'le_tiled_f32'),
+    (dict(width=4, height=4, dtype=np.dtype('float32'),
+          geo_transform=(-120.0, 45.0, 0.001, -0.001), epsg=4326),
+     'le_geo_f32'),
+]
+_CORPUS = [(label, make_minimal_tiff(**kw)) for kw, label in _CORPUS_SPECS]
+
+
+@pytest.mark.parametrize("label,base_tiff", _CORPUS, ids=[lab for lab, _ in _CORPUS])
+# Regression seeds for bugs surfaced by the initial Hypothesis run on
+# the le_strip_f32 corpus member (4x4 float32, 198 bytes total):
+#   offset 102, byte 0x00 -> ZeroDivisionError in _read_strips (rps=0)
+#   offset 110, byte 0x00 -> IndexError in _read_strips (StripByteCounts trunc)
+#   offset 122, byte 0x00 -> IndexError in sample_format (empty tuple)
+# These offsets are specific to the le_strip_f32 layout; the other corpus
+# entries will exercise the same code with different offsets, and that's
+# fine -- the example just guarantees we cover the regression each run.
+@example(offset_frac=102 / 198, new_byte=0x00)
+@example(offset_frac=110 / 198, new_byte=0x00)
+@example(offset_frac=122 / 198, new_byte=0x00)
+@given(
+    offset_frac=st.floats(min_value=0.0, max_value=0.999),
+    new_byte=st.integers(min_value=0, max_value=255),
+)
+@settings(
+    max_examples=50,
+    deadline=None,
+    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
+)
+def test_single_byte_mutation_typed_errors(label, base_tiff, offset_frac, new_byte):
+    """Flip one byte of a valid TIFF; reader must parse or raise typed exc.
+
+    The mutated file might still parse (the byte landed in pixel data, which
+    is a valid value for that dtype). What is unacceptable is a bare
+    ``IndexError`` / ``struct.error`` from reading past the buffer, or a
+    segfault from the GPU/dask paths -- those are kept off this test by
+    using the eager numpy path only.
+    """
+    mutated = bytearray(base_tiff)
+    offset = int(offset_frac * len(mutated))
+    # Make sure the mutation is actually a flip (not a no-op).
+    if mutated[offset] == new_byte:
+        new_byte = (new_byte + 1) & 0xFF
+    mutated[offset] = new_byte
+
+    try:
+        da = open_geotiff(io.BytesIO(bytes(mutated)))
+    except ALLOWED_PARSE_EXCEPTIONS:
+        return
+    except (MemoryError, OverflowError):
+        # Header field could decode to an absurd dimension/offset. We treat
+        # these as acceptable refusals because the user gets a clear failure
+        # rather than wrong data.
+        return
+    except Exception as exc:
+        pytest.fail(
+            f"[{label}] single-byte mutation at offset {offset} -> {new_byte:#x} "
+            f"raised non-typed {type(exc).__name__}: {exc!r}"
+        )
+
+    # If it parsed, the result must at least be a real DataArray with the
+    # claimed dtype actually realised. Materialise to catch lazy errors.
+    assert isinstance(da, xr.DataArray)
+    _ = np.asarray(da.values)
+
+
+# --- Smoke test that the module wired itself up ---
+
+def test_corpus_baseline_parses():
+    """Sanity check: every corpus TIFF parses without mutation."""
+    for label, base in _CORPUS:
+        da = open_geotiff(io.BytesIO(base))
+        assert isinstance(da, xr.DataArray), label
+        assert da.size > 0, label
+
+
+# --- Targeted regressions for bugs found by the property tests above ---
+# These three were caught by the byte-mutation property on first run and
+# fixed alongside this PR. They live here (not in a separate file) so the
+# regression context stays next to the harness that found them.
+
+def test_regression_rows_per_strip_zero_is_typed_error():
+    """rps=0 must raise ValueError, not ZeroDivisionError."""
+    base = make_minimal_tiff(4, 4, np.dtype('float32'))
+    mut = bytearray(base)
+    mut[102] = 0  # Zeroes the RowsPerStrip value in this layout.
+    with pytest.raises(ValueError):
+        open_geotiff(io.BytesIO(bytes(mut)))
+
+
+def test_regression_strip_table_truncated_is_typed_error():
+    """StripByteCounts shorter than strip count must raise ValueError."""
+    base = make_minimal_tiff(4, 4, np.dtype('float32'))
+    mut = bytearray(base)
+    mut[110] = 0  # Truncates the strip table count in this layout.
+    with pytest.raises(ValueError):
+        open_geotiff(io.BytesIO(bytes(mut)))
+
+
+def test_regression_empty_sample_format_tuple_does_not_indexerror():
+    """SampleFormat tag with count=0 must fall back, not IndexError."""
+    base = make_minimal_tiff(4, 4, np.dtype('float32'))
+    mut = bytearray(base)
+    mut[122] = 0  # Zeroes the SampleFormat count field in this layout.
+    # Either parses with the default sample_format (1 = unsigned int) and
+    # produces a DataArray, or fails downstream with a typed ValueError --
+    # both are acceptable. The non-acceptable outcome is IndexError.
+    try:
+        da = open_geotiff(io.BytesIO(bytes(mut)))
+        assert isinstance(da, xr.DataArray)
+    except ValueError:
+        pass