From 4ab0f7175d629752652ace47a791330f71be4ba6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 06:34:21 -0700 Subject: [PATCH 1/2] geotiff: account for IFD overhead when picking BigTIFF in streaming writer (#1785) The streaming writer's auto-BigTIFF decision compared only the uncompressed pixel-data size against UINT32_MAX. For rasters just under 4 GiB the IFD plus the strip/tile-offset table pushed the actual file past the classic-TIFF uint32 offset ceiling, and the write failed late with struct.error during offset patching. Factor the decision into _should_use_bigtiff_streaming() and include header (8 bytes), an IFD estimate (~200 bytes), and the strip/tile table (n_entries * 8). Compare with >= because UINT32_MAX itself is not a referenceable offset in classic TIFF. The explicit bigtiff=True/False user override is unchanged. Closes #1785 --- xrspatial/geotiff/_writer.py | 35 +++- ...eotiff_streaming_bigtiff_threshold_1785.py | 149 ++++++++++++++++++ 2 files changed, 181 insertions(+), 3 deletions(-) create mode 100644 xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index abe22bf3..fd3714e7 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -1562,6 +1562,33 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample, # Streaming writer (dask -> monolithic TIFF without full materialisation) # --------------------------------------------------------------------------- +def _should_use_bigtiff_streaming(uncompressed_bytes: int, + n_entries: int, + header_size_classic: int = 8, + ifd_overhead_estimate: int = 200) -> bool: + """Decide whether the streaming writer must emit BigTIFF. + + Classic TIFF stores offsets as uint32, so any offset referenced from + the IFD must satisfy ``offset < UINT32_MAX``. The streaming writer + appends pixel data after the header and IFD, so the last byte of + pixel data sits at approximately + ``header + ifd + strip_table + uncompressed_bytes``. This helper + promotes to BigTIFF if that total reaches ``UINT32_MAX``. + + See issue #1785: the previous check only compared ``uncompressed_bytes`` + against ``UINT32_MAX`` and missed the IFD/strip-table overhead, which + pushed sub-4 GiB rasters over the classic-TIFF limit late in the write. + """ + # 8 bytes per strip/tile entry: 4 for offset (LONG) + 4 for byte count. + strip_table_overhead = n_entries * 8 + reserved_overhead = ( + header_size_classic + ifd_overhead_estimate + strip_table_overhead + ) + UINT32_MAX = 0xFFFFFFFF + # Use >= because UINT32_MAX itself is not a referenceable offset. + return uncompressed_bytes + reserved_overhead >= UINT32_MAX + + def write_streaming(dask_data, path: str, *, geo_transform: 'GeoTransform | None' = None, crs_epsg: int | None = None, @@ -1649,13 +1676,15 @@ def write_streaming(dask_data, path: str, *, rows_per_strip = min(256, height) n_entries = math.ceil(height / rows_per_strip) - # BigTIFF detection (use uncompressed size as conservative estimate) + # BigTIFF detection. Account for IFD + strip/tile-table overhead so + # rasters just under 4 GiB do not silently overflow classic-TIFF + # uint32 offsets late in the write. See issue #1785. uncompressed_bytes = height * width * bytes_per_sample * samples - UINT32_MAX = 0xFFFFFFFF if bigtiff is not None: use_bigtiff = bigtiff else: - use_bigtiff = uncompressed_bytes > UINT32_MAX + use_bigtiff = _should_use_bigtiff_streaming( + uncompressed_bytes, n_entries, header_size_classic=8) header_size = 16 if use_bigtiff else 8 diff --git a/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py new file mode 100644 index 00000000..6e856c67 --- /dev/null +++ b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py @@ -0,0 +1,149 @@ +"""Regression tests for issue #1785. + +The streaming writer's auto-BigTIFF decision used to compare only the +uncompressed pixel-data size against ``UINT32_MAX``. For rasters just +under 4 GiB the IFD plus the strip/tile-offset table pushed the actual +file past the classic-TIFF uint32 offset ceiling, and the write failed +late with ``struct.error``. + +These tests pin the corrected decision: the helper accounts for IFD and +strip-table overhead, and the comparison is ``>=`` because +``UINT32_MAX`` itself is not a referenceable offset in classic TIFF. +The explicit ``bigtiff=True``/``False`` user override must still win. +""" +from __future__ import annotations + +import struct + +import dask.array as da +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import to_geotiff +from xrspatial.geotiff._writer import _should_use_bigtiff_streaming + + +UINT32_MAX = 0xFFFFFFFF + + +# -- Helper-level unit tests ------------------------------------------------ + +class TestShouldUseBigTIFFStreaming: + def test_just_under_uint32_max_promotes(self): + """uncompressed = UINT32_MAX - 50 with non-trivial overhead promotes. + + Even ~50 bytes of slack disappears once IFD + strip-table overhead + is added, so this case must promote to BigTIFF. + """ + # 1024 entries => 8 KiB strip-table overhead, well above 50 bytes. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=UINT32_MAX - 50, + n_entries=1024, + ) is True + + def test_half_uint32_max_stays_classic(self): + """uncompressed = UINT32_MAX // 2 stays classic.""" + assert _should_use_bigtiff_streaming( + uncompressed_bytes=UINT32_MAX // 2, + n_entries=1024, + ) is False + + def test_exactly_uint32_max_promotes(self): + """Boundary: uncompressed == UINT32_MAX promotes because of ``>=``. + + Classic TIFF cannot reference an offset equal to UINT32_MAX (the + last valid offset is UINT32_MAX - 1), so equality must promote. + """ + assert _should_use_bigtiff_streaming( + uncompressed_bytes=UINT32_MAX, + n_entries=1, + ) is True + + def test_small_raster_no_overhead_stays_classic(self): + """Small rasters with one strip stay classic.""" + assert _should_use_bigtiff_streaming( + uncompressed_bytes=1024, + n_entries=1, + ) is False + + def test_large_strip_table_alone_can_promote(self): + """Even a small pixel payload can need BigTIFF if n_entries is huge. + + This documents the strip-table contribution: ~536 M entries puts + the table itself near 4 GiB. Not realistic in practice, but it + proves the overhead is wired through. + """ + # Choose n_entries so that 8 * n_entries alone is close to UINT32_MAX. + n_entries = (UINT32_MAX // 8) + 1 + assert _should_use_bigtiff_streaming( + uncompressed_bytes=0, + n_entries=n_entries, + ) is True + + def test_overhead_pushes_just_under_threshold_over(self): + """Regression: a payload that fits classic by raw bytes but not + once header + IFD + strip table is added must promote. + """ + # Pick overhead components large enough that the sum crosses + # UINT32_MAX even though uncompressed_bytes < UINT32_MAX. + n_entries = 100_000 # 800 KB strip table + uncompressed = UINT32_MAX - (8 + 200 + n_entries * 8) + # Sum equals exactly UINT32_MAX -> promote (>=). + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=n_entries, + ) is True + # One byte less and we stay classic. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed - 1, + n_entries=n_entries, + ) is False + + +# -- Integration tests against the writer ------------------------------------ + +def _read_tiff_magic(path: str) -> int: + """Return the TIFF version field: 42 (0x002A) classic, 43 (0x002B) BigTIFF.""" + with open(path, 'rb') as f: + head = f.read(4) + byte_order = head[:2] + if byte_order == b'II': + fmt = ' Date: Wed, 13 May 2026 06:50:18 -0700 Subject: [PATCH 2/2] geotiff: compute streaming-writer IFD overhead from real tags, fix BigTIFF boundary Address Copilot review feedback on PR #1787: * The 200-byte ifd_overhead_estimate constant in _should_use_bigtiff_streaming was not guaranteed conservative. Variable-length payloads (gdal_metadata_xml ASCII overflow, user extra_tags, geo ascii params) can push real IFD overhead well past 200 bytes and let sub-4 GiB rasters overflow classic-TIFF uint32 offsets late in the write. Replace the fudge with _compute_classic_ifd_overhead, which reuses _build_ifd to measure the actual entry block plus overflow heap from the tag list. write_streaming now builds tags first, measures, decides BigTIFF, then promotes offset arrays via _promote_offsets_to_long8. * The streaming helper used >= UINT32_MAX while the eager _assemble_tiff path uses > UINT32_MAX. Align both: a file of exactly UINT32_MAX bytes still fits classic (last byte sits at offset UINT32_MAX - 1, a valid uint32). Update the corresponding boundary tests so equality stays classic and the +1 case promotes. Tests cover the boundary flip and a 5000-byte gdal_metadata blob that the old 200-byte fudge would have ignored. --- xrspatial/geotiff/_writer.py | 127 ++++++++++---- ...eotiff_streaming_bigtiff_threshold_1785.py | 163 +++++++++++++++--- 2 files changed, 236 insertions(+), 54 deletions(-) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index fd3714e7..6297cd05 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -1562,31 +1562,75 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample, # Streaming writer (dask -> monolithic TIFF without full materialisation) # --------------------------------------------------------------------------- +def _compute_classic_ifd_overhead(tags: list) -> int: + """Return the on-disk size of the classic-TIFF IFD for ``tags``. + + Sums the fixed IFD block (entry count + 12 bytes per entry + next-IFD + pointer) and the variable overflow heap (values whose serialised size + exceeds the 4-byte inline limit, including ASCII strings such as + ``gdal_metadata`` and user-supplied ``extra_tags``). + + The heap size is recovered by building the IFD with + ``_build_ifd(tags, overflow_base=0, bigtiff=False)`` and measuring the + returned overflow buffer; this matches the bytes the streaming writer + will actually emit, with no fudge constant. + """ + num_tags = len(tags) + # classic IFD: 2-byte count + 12-byte entries + 4-byte next-IFD pointer + ifd_block_size = 2 + 12 * num_tags + 4 + _, overflow_bytes = _build_ifd(tags, overflow_base=0, bigtiff=False) + return ifd_block_size + len(overflow_bytes) + + def _should_use_bigtiff_streaming(uncompressed_bytes: int, n_entries: int, - header_size_classic: int = 8, - ifd_overhead_estimate: int = 200) -> bool: + ifd_overhead_bytes: int, + header_size_classic: int = 8) -> bool: """Decide whether the streaming writer must emit BigTIFF. - Classic TIFF stores offsets as uint32, so any offset referenced from - the IFD must satisfy ``offset < UINT32_MAX``. The streaming writer - appends pixel data after the header and IFD, so the last byte of - pixel data sits at approximately - ``header + ifd + strip_table + uncompressed_bytes``. This helper - promotes to BigTIFF if that total reaches ``UINT32_MAX``. + Classic TIFF stores offsets as uint32, so the file size addressable + via classic offsets is at most ``UINT32_MAX`` bytes (offsets run + ``0..UINT32_MAX - 1``). The streaming writer appends pixel data after + the header and IFD, so the final file size is + ``header + ifd + overflow + strip_table + uncompressed_bytes``. + + The comparison is ``> UINT32_MAX`` to match the eager + ``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``): + a file that is exactly ``UINT32_MAX`` bytes still fits classic. - See issue #1785: the previous check only compared ``uncompressed_bytes`` - against ``UINT32_MAX`` and missed the IFD/strip-table overhead, which - pushed sub-4 GiB rasters over the classic-TIFF limit late in the write. + See issue #1785 and the Copilot review on PR #1787: the previous + helper applied a 200-byte fudge for IFD overhead, which silently + underestimated when ``gdal_metadata_xml`` or large ``extra_tags`` + pushed the actual overflow heap well past that constant. + + Parameters + ---------- + uncompressed_bytes : int + Total pixel-data bytes that will be written after the IFD. + n_entries : int + Number of strip or tile entries; each contributes a LONG offset + (4 bytes) plus a LONG byte-count (4 bytes) to the overflow heap. + Pass ``0`` if ``ifd_overhead_bytes`` already covers the strip + table (the streaming-writer caller does this by passing the + actual tag list through ``_compute_classic_ifd_overhead``). + ifd_overhead_bytes : int + Classic-TIFF IFD size: fixed entry block plus variable overflow + heap (ASCII metadata, geo tags, strip/tile offset arrays, etc.). + Computed via ``_compute_classic_ifd_overhead(tags)``. + header_size_classic : int, optional + Classic-TIFF header size (8 bytes). """ - # 8 bytes per strip/tile entry: 4 for offset (LONG) + 4 for byte count. + # strip/tile-table overhead is 8 bytes per entry (LONG offset + LONG + # byte count). If the caller already accounted for the offset arrays + # inside ``ifd_overhead_bytes`` they should pass n_entries=0. strip_table_overhead = n_entries * 8 reserved_overhead = ( - header_size_classic + ifd_overhead_estimate + strip_table_overhead + header_size_classic + ifd_overhead_bytes + strip_table_overhead ) UINT32_MAX = 0xFFFFFFFF - # Use >= because UINT32_MAX itself is not a referenceable offset. - return uncompressed_bytes + reserved_overhead >= UINT32_MAX + # ``> UINT32_MAX`` matches the eager path's + # ``estimated_file_size > UINT32_MAX`` check in ``_assemble_tiff``. + return uncompressed_bytes + reserved_overhead > UINT32_MAX def write_streaming(dask_data, path: str, *, @@ -1676,19 +1720,18 @@ def write_streaming(dask_data, path: str, *, rows_per_strip = min(256, height) n_entries = math.ceil(height / rows_per_strip) - # BigTIFF detection. Account for IFD + strip/tile-table overhead so - # rasters just under 4 GiB do not silently overflow classic-TIFF - # uint32 offsets late in the write. See issue #1785. + # BigTIFF detection has to wait until the full tag list is built so + # that variable-length payloads (gdal_metadata, geo tags, user + # extra_tags) feed into the IFD-overhead calculation. Build the tag + # list assuming classic offsets first, then decide BigTIFF, then + # promote the strip/tile offset arrays to LONG8 if needed. See + # issue #1785 and the Copilot review on PR #1787. uncompressed_bytes = height * width * bytes_per_sample * samples - if bigtiff is not None: - use_bigtiff = bigtiff - else: - use_bigtiff = _should_use_bigtiff_streaming( - uncompressed_bytes, n_entries, header_size_classic=8) - - header_size = 16 if use_bigtiff else 8 # ---- Build tag list (mirrors _assemble_tiff for level 0) ---- + # Start with classic offset types; the offset arrays are promoted to + # LONG8 below once BigTIFF is chosen. + use_bigtiff = bool(bigtiff) if bigtiff is not None else False tags = [] tags.append((TAG_IMAGE_WIDTH, LONG, 1, width)) tags.append((TAG_IMAGE_LENGTH, LONG, 1, height)) @@ -1743,10 +1786,11 @@ def write_streaming(dask_data, path: str, *, if resolution_unit is not None: tags.append((TAG_RESOLUTION_UNIT, SHORT, 1, resolution_unit)) - # Layout tags with placeholder offsets / byte-counts. BigTIFF - # needs 64-bit offsets (LONG8) since strip/tile positions can - # exceed 4 GB; classic TIFF uses LONG (uint32). - offset_type = LONG8 if use_bigtiff else LONG + # Layout tags with placeholder offsets / byte-counts. Use classic + # LONG (uint32) here; if the auto-BigTIFF decision below promotes + # the file, ``_promote_offsets_to_long8`` retypes these to LONG8. + # A caller-forced ``bigtiff=True`` is also resolved at that point. + offset_type = LONG placeholder = [0] * n_entries if tiled: tags.append((TAG_TILE_WIDTH, SHORT, 1, tile_size)) @@ -1798,6 +1842,31 @@ def write_streaming(dask_data, path: str, *, and etag_id not in _DANGEROUS_EXTRA_TAG_IDS): tags.append((etag_id, etype_id, ecount, evalue)) + # ---- BigTIFF decision (auto path) ---- + # Compute the real classic-TIFF IFD overhead from the actual tag + # list, including overflow heap (gdal_metadata, geo ascii params, + # strip/tile offset arrays, user extra_tags). This replaces the + # 200-byte fudge constant the original PR used; with metadata-heavy + # writes that constant silently underestimated overhead and let + # sub-4 GiB rasters overflow classic offsets late in the write. + # See issue #1785 and the Copilot review on PR #1787. + if bigtiff is None: + ifd_overhead_bytes = _compute_classic_ifd_overhead(tags) + # n_entries=0 because the strip/tile offset arrays are already + # inside ``tags`` and therefore in ``ifd_overhead_bytes``. + use_bigtiff = _should_use_bigtiff_streaming( + uncompressed_bytes, + n_entries=0, + ifd_overhead_bytes=ifd_overhead_bytes, + header_size_classic=8, + ) + + header_size = 16 if use_bigtiff else 8 + + # Promote the strip/tile offset arrays to LONG8 once BigTIFF is set. + if use_bigtiff: + tags = _promote_offsets_to_long8(tags) + # ---- Pre-compute IFD reservation size ---- sorted_tags = sorted(tags, key=lambda t: t[0]) entry_size = 20 if use_bigtiff else 12 diff --git a/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py index 6e856c67..24d1638f 100644 --- a/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py +++ b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py @@ -6,10 +6,17 @@ file past the classic-TIFF uint32 offset ceiling, and the write failed late with ``struct.error``. -These tests pin the corrected decision: the helper accounts for IFD and -strip-table overhead, and the comparison is ``>=`` because -``UINT32_MAX`` itself is not a referenceable offset in classic TIFF. -The explicit ``bigtiff=True``/``False`` user override must still win. +These tests pin the corrected decision: + +* The helper takes an actual ``ifd_overhead_bytes`` value (computed from + the real tag list via ``_compute_classic_ifd_overhead``) rather than a + 200-byte fudge constant; large ``gdal_metadata_xml`` or ``extra_tags`` + payloads must not silently undercount overhead. See the Copilot review + on PR #1787. +* The comparison is ``> UINT32_MAX``, matching the eager + ``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``). A + file that is exactly ``UINT32_MAX`` bytes still fits classic. +* The explicit ``bigtiff=True``/``False`` user override still wins. """ from __future__ import annotations @@ -21,12 +28,53 @@ import xarray as xr from xrspatial.geotiff import to_geotiff -from xrspatial.geotiff._writer import _should_use_bigtiff_streaming +from xrspatial.geotiff._dtypes import ASCII, LONG, SHORT +from xrspatial.geotiff._header import ( + TAG_BITS_PER_SAMPLE, + TAG_COMPRESSION, + TAG_GDAL_METADATA, + TAG_IMAGE_LENGTH, + TAG_IMAGE_WIDTH, + TAG_PHOTOMETRIC, + TAG_SAMPLE_FORMAT, + TAG_SAMPLES_PER_PIXEL, + TAG_STRIP_BYTE_COUNTS, + TAG_STRIP_OFFSETS, +) +from xrspatial.geotiff._writer import ( + _compute_classic_ifd_overhead, + _should_use_bigtiff_streaming, +) UINT32_MAX = 0xFFFFFFFF +def _minimal_tag_list(n_entries: int, gdal_metadata_size: int = 0) -> list: + """Build a representative classic-TIFF tag list for sizing. + + Mirrors the streaming writer's tag set for a stripped uint8 raster + so ``_compute_classic_ifd_overhead`` returns a realistic byte count. + ``gdal_metadata_size`` injects an ASCII overflow payload to model + metadata-heavy writes. + """ + tags = [ + (TAG_IMAGE_WIDTH, LONG, 1, 1), + (TAG_IMAGE_LENGTH, LONG, 1, 1), + (TAG_BITS_PER_SAMPLE, SHORT, 1, 8), + (TAG_COMPRESSION, SHORT, 1, 1), + (TAG_PHOTOMETRIC, SHORT, 1, 1), + (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), + (TAG_SAMPLE_FORMAT, SHORT, 1, 1), + (TAG_STRIP_OFFSETS, LONG, n_entries, [0] * n_entries), + (TAG_STRIP_BYTE_COUNTS, LONG, n_entries, [0] * n_entries), + ] + if gdal_metadata_size > 0: + blob = 'x' * gdal_metadata_size + tags.append((TAG_GDAL_METADATA, ASCII, len(blob) + 1, blob)) + return tags + + # -- Helper-level unit tests ------------------------------------------------ class TestShouldUseBigTIFFStreaming: @@ -36,35 +84,58 @@ def test_just_under_uint32_max_promotes(self): Even ~50 bytes of slack disappears once IFD + strip-table overhead is added, so this case must promote to BigTIFF. """ - # 1024 entries => 8 KiB strip-table overhead, well above 50 bytes. + # 1024 entries: strip table contributes 8 * 1024 = 8 KiB. + tags = _minimal_tag_list(n_entries=1024) + overhead = _compute_classic_ifd_overhead(tags) assert _should_use_bigtiff_streaming( uncompressed_bytes=UINT32_MAX - 50, - n_entries=1024, + n_entries=0, + ifd_overhead_bytes=overhead, ) is True def test_half_uint32_max_stays_classic(self): """uncompressed = UINT32_MAX // 2 stays classic.""" + tags = _minimal_tag_list(n_entries=1024) + overhead = _compute_classic_ifd_overhead(tags) assert _should_use_bigtiff_streaming( uncompressed_bytes=UINT32_MAX // 2, - n_entries=1024, + n_entries=0, + ifd_overhead_bytes=overhead, ) is False - def test_exactly_uint32_max_promotes(self): - """Boundary: uncompressed == UINT32_MAX promotes because of ``>=``. + def test_exactly_uint32_max_stays_classic(self): + """Boundary: total file size == UINT32_MAX bytes still fits classic. - Classic TIFF cannot reference an offset equal to UINT32_MAX (the - last valid offset is UINT32_MAX - 1), so equality must promote. + Eager ``_assemble_tiff`` uses ``estimated_file_size > UINT32_MAX``; + the streaming helper must match. A file of exactly ``UINT32_MAX`` + bytes has its last byte at offset ``UINT32_MAX - 1``, which is a + valid classic-TIFF offset. """ + # Construct uncompressed_bytes so total = exactly UINT32_MAX. + tags = _minimal_tag_list(n_entries=1) + overhead = _compute_classic_ifd_overhead(tags) + header = 8 + uncompressed = UINT32_MAX - header - overhead assert _should_use_bigtiff_streaming( - uncompressed_bytes=UINT32_MAX, - n_entries=1, + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is False + # One byte over and we must promote. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed + 1, + n_entries=0, + ifd_overhead_bytes=overhead, ) is True def test_small_raster_no_overhead_stays_classic(self): """Small rasters with one strip stay classic.""" + tags = _minimal_tag_list(n_entries=1) + overhead = _compute_classic_ifd_overhead(tags) assert _should_use_bigtiff_streaming( uncompressed_bytes=1024, - n_entries=1, + n_entries=0, + ifd_overhead_bytes=overhead, ) is False def test_large_strip_table_alone_can_promote(self): @@ -74,31 +145,73 @@ def test_large_strip_table_alone_can_promote(self): the table itself near 4 GiB. Not realistic in practice, but it proves the overhead is wired through. """ - # Choose n_entries so that 8 * n_entries alone is close to UINT32_MAX. n_entries = (UINT32_MAX // 8) + 1 + tags = _minimal_tag_list(n_entries=n_entries) + overhead = _compute_classic_ifd_overhead(tags) assert _should_use_bigtiff_streaming( uncompressed_bytes=0, - n_entries=n_entries, + n_entries=0, + ifd_overhead_bytes=overhead, ) is True def test_overhead_pushes_just_under_threshold_over(self): """Regression: a payload that fits classic by raw bytes but not once header + IFD + strip table is added must promote. """ - # Pick overhead components large enough that the sum crosses - # UINT32_MAX even though uncompressed_bytes < UINT32_MAX. - n_entries = 100_000 # 800 KB strip table - uncompressed = UINT32_MAX - (8 + 200 + n_entries * 8) - # Sum equals exactly UINT32_MAX -> promote (>=). + n_entries = 100_000 # ~800 KB strip table + tags = _minimal_tag_list(n_entries=n_entries) + overhead = _compute_classic_ifd_overhead(tags) + # Choose uncompressed so the total equals exactly UINT32_MAX + 1. + header = 8 + uncompressed = UINT32_MAX + 1 - header - overhead assert _should_use_bigtiff_streaming( uncompressed_bytes=uncompressed, - n_entries=n_entries, + n_entries=0, + ifd_overhead_bytes=overhead, ) is True - # One byte less and we stay classic. + # One byte less and we stay classic (boundary is now ``>``). assert _should_use_bigtiff_streaming( uncompressed_bytes=uncompressed - 1, - n_entries=n_entries, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is False + + def test_large_gdal_metadata_flips_decision(self): + """A 5000-byte gdal_metadata blob must flip a borderline case. + + Under the old 200-byte fudge, ``uncompressed + 200 < UINT32_MAX`` + could stay classic even when a multi-KB gdal_metadata overflow + pushed real overhead well past 200 bytes. With the actual + overhead computed from the tag list, the decision flips. + """ + n_entries = 1024 + big_blob = 5000 # ASCII overflow heap entry + plain_tags = _minimal_tag_list(n_entries=n_entries) + meta_tags = _minimal_tag_list( + n_entries=n_entries, gdal_metadata_size=big_blob) + + plain_overhead = _compute_classic_ifd_overhead(plain_tags) + meta_overhead = _compute_classic_ifd_overhead(meta_tags) + # Metadata blob really does increase computed overhead. + assert meta_overhead - plain_overhead >= big_blob + + # Pick uncompressed so plain_overhead path stays classic but + # the metadata path tips over. + header = 8 + uncompressed = UINT32_MAX - header - plain_overhead + # Plain: total == UINT32_MAX -> classic. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=plain_overhead, ) is False + # With the large metadata blob folded into the real overhead, + # the total now exceeds UINT32_MAX and we must promote. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=meta_overhead, + ) is True # -- Integration tests against the writer ------------------------------------