diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index abe22bf3..6297cd05 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -1562,6 +1562,77 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample,
 # Streaming writer (dask -> monolithic TIFF without full materialisation)
 # ---------------------------------------------------------------------------
 
+def _compute_classic_ifd_overhead(tags: list) -> int:
+    """Return the on-disk size of the classic-TIFF IFD for ``tags``.
+
+    Sums the fixed IFD block (entry count + 12 bytes per entry + next-IFD
+    pointer) and the variable overflow heap (values whose serialised size
+    exceeds the 4-byte inline limit, including ASCII strings such as
+    ``gdal_metadata`` and user-supplied ``extra_tags``).
+
+    The heap size is recovered by building the IFD with
+    ``_build_ifd(tags, overflow_base=0, bigtiff=False)`` and measuring the
+    returned overflow buffer; this matches the bytes the streaming writer
+    will actually emit, with no fudge constant.
+    """
+    num_tags = len(tags)
+    # classic IFD: 2-byte count + 12-byte entries + 4-byte next-IFD pointer
+    ifd_block_size = 2 + 12 * num_tags + 4
+    _, overflow_bytes = _build_ifd(tags, overflow_base=0, bigtiff=False)
+    return ifd_block_size + len(overflow_bytes)
+
+
+def _should_use_bigtiff_streaming(uncompressed_bytes: int,
+                                  n_entries: int,
+                                  ifd_overhead_bytes: int,
+                                  header_size_classic: int = 8) -> bool:
+    """Decide whether the streaming writer must emit BigTIFF.
+
+    Classic TIFF stores offsets as uint32, so the file size addressable
+    via classic offsets is at most ``UINT32_MAX`` bytes (a file of
+    ``UINT32_MAX`` bytes has its last byte at offset ``UINT32_MAX - 1``).
+    The streaming writer appends pixel data after the header and IFD, so
+    the final file size is
+    ``header + ifd + overflow + strip_table + uncompressed_bytes``.
+
+    The comparison is ``> UINT32_MAX`` to match the eager
+    ``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``):
+    a file that is exactly ``UINT32_MAX`` bytes still fits classic.
+
+    See issue #1785 and the Copilot review on PR #1787: the previous
+    helper applied a 200-byte fudge for IFD overhead, which silently
+    underestimated when ``gdal_metadata_xml`` or large ``extra_tags``
+    pushed the actual overflow heap well past that constant.
+
+    Parameters
+    ----------
+    uncompressed_bytes : int
+        Total pixel-data bytes that will be written after the IFD.
+    n_entries : int
+        Number of strip or tile entries; each contributes a LONG offset
+        (4 bytes) plus a LONG byte-count (4 bytes) to the overflow heap.
+        Pass ``0`` if ``ifd_overhead_bytes`` already covers the strip
+        table (the streaming-writer caller does this by passing the
+        actual tag list through ``_compute_classic_ifd_overhead``).
+    ifd_overhead_bytes : int
+        Classic-TIFF IFD size: fixed entry block plus variable overflow
+        heap (ASCII metadata, geo tags, strip/tile offset arrays, etc.).
+        Computed via ``_compute_classic_ifd_overhead(tags)``.
+    header_size_classic : int, optional
+        Classic-TIFF header size (8 bytes).
+    """
+    # strip/tile-table overhead is 8 bytes per entry (LONG offset + LONG
+    # byte count). If the caller already accounted for the offset arrays
+    # inside ``ifd_overhead_bytes`` they should pass n_entries=0.
+    strip_table_overhead = n_entries * 8
+    reserved_overhead = (
+        header_size_classic + ifd_overhead_bytes + strip_table_overhead
+    )
+    UINT32_MAX = 0xFFFFFFFF
+    # ``> UINT32_MAX`` matches the eager path's
+    # ``estimated_file_size > UINT32_MAX`` check in ``_assemble_tiff``.
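+    # Worked example (illustrative numbers, not taken from the PR):
+    # with uncompressed_bytes = UINT32_MAX - 4096 and a 20-tag IFD over
+    # 256 strips, reserved_overhead is roughly
+    # 8 + (2 + 12 * 20 + 4) + 256 * 8 = 2302 bytes, so the total stays
+    # classic; a 2 KiB gdal_metadata string on top of that pushes the
+    # sum past UINT32_MAX and promotes.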
+    return uncompressed_bytes + reserved_overhead > UINT32_MAX
+
+
 def write_streaming(dask_data, path: str, *,
                     geo_transform: 'GeoTransform | None' = None,
                     crs_epsg: int | None = None,
@@ -1649,17 +1720,18 @@ def write_streaming(dask_data, path: str, *,
     rows_per_strip = min(256, height)
     n_entries = math.ceil(height / rows_per_strip)
 
-    # BigTIFF detection (use uncompressed size as conservative estimate)
+    # BigTIFF detection has to wait until the full tag list is built so
+    # that variable-length payloads (gdal_metadata, geo tags, user
+    # extra_tags) feed into the IFD-overhead calculation. Build the tag
+    # list assuming classic offsets first, then decide BigTIFF, then
+    # promote the strip/tile offset arrays to LONG8 if needed. See
+    # issue #1785 and the Copilot review on PR #1787.
     uncompressed_bytes = height * width * bytes_per_sample * samples
-    UINT32_MAX = 0xFFFFFFFF
-    if bigtiff is not None:
-        use_bigtiff = bigtiff
-    else:
-        use_bigtiff = uncompressed_bytes > UINT32_MAX
-
-    header_size = 16 if use_bigtiff else 8
 
     # ---- Build tag list (mirrors _assemble_tiff for level 0) ----
+    # Start with classic offset types; the offset arrays are promoted to
+    # LONG8 below once BigTIFF is chosen.
+    use_bigtiff = bool(bigtiff) if bigtiff is not None else False
     tags = []
     tags.append((TAG_IMAGE_WIDTH, LONG, 1, width))
     tags.append((TAG_IMAGE_LENGTH, LONG, 1, height))
@@ -1714,10 +1786,11 @@ def write_streaming(dask_data, path: str, *,
     if resolution_unit is not None:
         tags.append((TAG_RESOLUTION_UNIT, SHORT, 1, resolution_unit))
 
-    # Layout tags with placeholder offsets / byte-counts. BigTIFF
-    # needs 64-bit offsets (LONG8) since strip/tile positions can
-    # exceed 4 GB; classic TIFF uses LONG (uint32).
-    offset_type = LONG8 if use_bigtiff else LONG
+    # Layout tags with placeholder offsets / byte-counts. Use classic
+    # LONG (uint32) here; if the auto-BigTIFF decision below promotes
+    # the file, ``_promote_offsets_to_long8`` retypes these to LONG8.
+    # A caller-forced ``bigtiff=True`` is also resolved at that point.
+    offset_type = LONG
     placeholder = [0] * n_entries
     if tiled:
         tags.append((TAG_TILE_WIDTH, SHORT, 1, tile_size))
@@ -1769,6 +1842,31 @@ def write_streaming(dask_data, path: str, *,
                 and etag_id not in _DANGEROUS_EXTRA_TAG_IDS):
             tags.append((etag_id, etype_id, ecount, evalue))
 
+    # ---- BigTIFF decision (auto path) ----
+    # Compute the real classic-TIFF IFD overhead from the actual tag
+    # list, including overflow heap (gdal_metadata, geo ascii params,
+    # strip/tile offset arrays, user extra_tags). This replaces the
+    # 200-byte fudge constant the original PR used; with metadata-heavy
+    # writes that constant silently underestimated overhead and let
+    # sub-4 GiB rasters overflow classic offsets late in the write.
+    # See issue #1785 and the Copilot review on PR #1787.
+    if bigtiff is None:
+        ifd_overhead_bytes = _compute_classic_ifd_overhead(tags)
+        # n_entries=0 because the strip/tile offset arrays are already
+        # inside ``tags`` and therefore in ``ifd_overhead_bytes``.
+        use_bigtiff = _should_use_bigtiff_streaming(
+            uncompressed_bytes,
+            n_entries=0,
+            ifd_overhead_bytes=ifd_overhead_bytes,
+            header_size_classic=8,
+        )
+
+    header_size = 16 if use_bigtiff else 8
+
+    # Promote the strip/tile offset arrays to LONG8 once BigTIFF is set.
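+    # (Illustrative shape of the retype, assuming the helper preserves
+    # values: (TAG_STRIP_OFFSETS, LONG, n, [...]) becomes
+    # (TAG_STRIP_OFFSETS, LONG8, n, [...]); only the serialised width
+    # changes, which is safe now that the file uses 64-bit offsets.)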
+    if use_bigtiff:
+        tags = _promote_offsets_to_long8(tags)
+
     # ---- Pre-compute IFD reservation size ----
     sorted_tags = sorted(tags, key=lambda t: t[0])
     entry_size = 20 if use_bigtiff else 12
diff --git a/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py
new file mode 100644
index 00000000..24d1638f
--- /dev/null
+++ b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py
@@ -0,0 +1,262 @@
+"""Regression tests for issue #1785.
+
+The streaming writer's auto-BigTIFF decision used to compare only the
+uncompressed pixel-data size against ``UINT32_MAX``. For rasters just
+under 4 GiB the IFD plus the strip/tile-offset table pushed the actual
+file past the classic-TIFF uint32 offset ceiling, and the write failed
+late with ``struct.error``.
+
+These tests pin the corrected decision:
+
+* The helper takes an actual ``ifd_overhead_bytes`` value (computed from
+  the real tag list via ``_compute_classic_ifd_overhead``) rather than a
+  200-byte fudge constant; large ``gdal_metadata_xml`` or ``extra_tags``
+  payloads must not silently undercount overhead. See the Copilot review
+  on PR #1787.
+* The comparison is ``> UINT32_MAX``, matching the eager
+  ``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``). A
+  file that is exactly ``UINT32_MAX`` bytes still fits classic.
+* The explicit ``bigtiff=True``/``False`` user override still wins.
+"""
+from __future__ import annotations
+
+import struct
+
+import dask.array as da
+import numpy as np
+import pytest
+import xarray as xr
+
+from xrspatial.geotiff import to_geotiff
+from xrspatial.geotiff._dtypes import ASCII, LONG, SHORT
+from xrspatial.geotiff._header import (
+    TAG_BITS_PER_SAMPLE,
+    TAG_COMPRESSION,
+    TAG_GDAL_METADATA,
+    TAG_IMAGE_LENGTH,
+    TAG_IMAGE_WIDTH,
+    TAG_PHOTOMETRIC,
+    TAG_SAMPLE_FORMAT,
+    TAG_SAMPLES_PER_PIXEL,
+    TAG_STRIP_BYTE_COUNTS,
+    TAG_STRIP_OFFSETS,
+)
+from xrspatial.geotiff._writer import (
+    _compute_classic_ifd_overhead,
+    _should_use_bigtiff_streaming,
+)
+
+
+UINT32_MAX = 0xFFFFFFFF
+
+
+def _minimal_tag_list(n_entries: int, gdal_metadata_size: int = 0) -> list:
+    """Build a representative classic-TIFF tag list for sizing.
+
+    Mirrors the streaming writer's tag set for a stripped uint8 raster
+    so ``_compute_classic_ifd_overhead`` returns a realistic byte count.
+    ``gdal_metadata_size`` injects an ASCII overflow payload to model
+    metadata-heavy writes.
+    """
+    tags = [
+        (TAG_IMAGE_WIDTH, LONG, 1, 1),
+        (TAG_IMAGE_LENGTH, LONG, 1, 1),
+        (TAG_BITS_PER_SAMPLE, SHORT, 1, 8),
+        (TAG_COMPRESSION, SHORT, 1, 1),
+        (TAG_PHOTOMETRIC, SHORT, 1, 1),
+        (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
+        (TAG_SAMPLE_FORMAT, SHORT, 1, 1),
+        (TAG_STRIP_OFFSETS, LONG, n_entries, [0] * n_entries),
+        (TAG_STRIP_BYTE_COUNTS, LONG, n_entries, [0] * n_entries),
+    ]
+    if gdal_metadata_size > 0:
+        blob = 'x' * gdal_metadata_size
+        tags.append((TAG_GDAL_METADATA, ASCII, len(blob) + 1, blob))
+    return tags
+
+
+# -- Helper-level unit tests ------------------------------------------------
+
+class TestShouldUseBigTIFFStreaming:
+    def test_just_under_uint32_max_promotes(self):
+        """uncompressed = UINT32_MAX - 50 with non-trivial overhead promotes.
+
+        The ~50 bytes of slack disappear once IFD + strip-table overhead
+        is added, so this case must promote to BigTIFF.
+        """
+        # 1024 entries: strip table contributes 8 * 1024 = 8 KiB.
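+        # With the nine tags from _minimal_tag_list, the fixed block is
+        # 2 + 12 * 9 + 4 = 114 bytes and both 1024-entry LONG arrays
+        # overflow (4096 bytes each), so overhead lands near 8.3 KB --
+        # far more than the 50-byte slack left under UINT32_MAX.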
+        tags = _minimal_tag_list(n_entries=1024)
+        overhead = _compute_classic_ifd_overhead(tags)
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=UINT32_MAX - 50,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is True
+
+    def test_half_uint32_max_stays_classic(self):
+        """uncompressed = UINT32_MAX // 2 stays classic."""
+        tags = _minimal_tag_list(n_entries=1024)
+        overhead = _compute_classic_ifd_overhead(tags)
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=UINT32_MAX // 2,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is False
+
+    def test_exactly_uint32_max_stays_classic(self):
+        """Boundary: total file size == UINT32_MAX bytes still fits classic.
+
+        Eager ``_assemble_tiff`` uses ``estimated_file_size > UINT32_MAX``;
+        the streaming helper must match. A file of exactly ``UINT32_MAX``
+        bytes has its last byte at offset ``UINT32_MAX - 1``, which is a
+        valid classic-TIFF offset.
+        """
+        # Construct uncompressed_bytes so total = exactly UINT32_MAX.
+        tags = _minimal_tag_list(n_entries=1)
+        overhead = _compute_classic_ifd_overhead(tags)
+        header = 8
+        uncompressed = UINT32_MAX - header - overhead
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=uncompressed,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is False
+        # One byte over and we must promote.
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=uncompressed + 1,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is True
+
+    def test_small_raster_no_overhead_stays_classic(self):
+        """Small rasters with one strip stay classic."""
+        tags = _minimal_tag_list(n_entries=1)
+        overhead = _compute_classic_ifd_overhead(tags)
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=1024,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is False
+
+    def test_large_strip_table_alone_can_promote(self):
+        """Even a small pixel payload can need BigTIFF if n_entries is huge.
+
+        This documents the strip-table contribution: ~536 M entries puts
+        the table itself near 4 GiB. Not realistic in practice, but it
+        proves the overhead is wired through.
+        """
+        n_entries = (UINT32_MAX // 8) + 1
+        tags = _minimal_tag_list(n_entries=n_entries)
+        overhead = _compute_classic_ifd_overhead(tags)
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=0,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is True
+
+    def test_overhead_pushes_just_under_threshold_over(self):
+        """Regression: a payload that fits classic by raw bytes but not
+        once header + IFD + strip table is added must promote.
+        """
+        n_entries = 100_000  # ~800 KB strip table
+        tags = _minimal_tag_list(n_entries=n_entries)
+        overhead = _compute_classic_ifd_overhead(tags)
+        # Choose uncompressed so the total equals exactly UINT32_MAX + 1.
+        header = 8
+        uncompressed = UINT32_MAX + 1 - header - overhead
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=uncompressed,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is True
+        # One byte less and we stay classic (boundary is now ``>``).
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=uncompressed - 1,
+            n_entries=0,
+            ifd_overhead_bytes=overhead,
+        ) is False
+
+    def test_large_gdal_metadata_flips_decision(self):
+        """A 5000-byte gdal_metadata blob must flip a borderline case.
+
+        Under the old 200-byte fudge, ``uncompressed + 200 < UINT32_MAX``
+        could stay classic even when a multi-KB gdal_metadata overflow
+        pushed real overhead well past 200 bytes. With the actual
+        overhead computed from the tag list, the decision flips.
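+
+        (The ASCII count is ``len(blob) + 1`` for the NUL terminator,
+        so the computed overhead grows by at least 5001 bytes; the
+        ``meta_overhead - plain_overhead >= big_blob`` assertion below
+        pins that.)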
+ """ + n_entries = 1024 + big_blob = 5000 # ASCII overflow heap entry + plain_tags = _minimal_tag_list(n_entries=n_entries) + meta_tags = _minimal_tag_list( + n_entries=n_entries, gdal_metadata_size=big_blob) + + plain_overhead = _compute_classic_ifd_overhead(plain_tags) + meta_overhead = _compute_classic_ifd_overhead(meta_tags) + # Metadata blob really does increase computed overhead. + assert meta_overhead - plain_overhead >= big_blob + + # Pick uncompressed so plain_overhead path stays classic but + # the metadata path tips over. + header = 8 + uncompressed = UINT32_MAX - header - plain_overhead + # Plain: total == UINT32_MAX -> classic. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=plain_overhead, + ) is False + # With the large metadata blob folded into the real overhead, + # the total now exceeds UINT32_MAX and we must promote. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=meta_overhead, + ) is True + + +# -- Integration tests against the writer ------------------------------------ + +def _read_tiff_magic(path: str) -> int: + """Return the TIFF version field: 42 (0x002A) classic, 43 (0x002B) BigTIFF.""" + with open(path, 'rb') as f: + head = f.read(4) + byte_order = head[:2] + if byte_order == b'II': + fmt = '