Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 110 additions & 12 deletions xrspatial/geotiff/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1562,6 +1562,77 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample,
# Streaming writer (dask -> monolithic TIFF without full materialisation)
# ---------------------------------------------------------------------------

def _compute_classic_ifd_overhead(tags: list) -> int:
    """Measure the exact on-disk footprint of a classic-TIFF IFD.

    The total is the fixed IFD block (2-byte entry count, 12 bytes per
    tag entry, 4-byte next-IFD pointer) plus the variable overflow heap,
    i.e. every value whose serialised form exceeds the 4-byte inline
    limit (ASCII strings such as ``gdal_metadata``, geo tags, and
    user-supplied ``extra_tags``).

    The heap size is taken from the real serialisation: the IFD is built
    with ``_build_ifd(tags, overflow_base=0, bigtiff=False)`` and the
    returned overflow buffer measured, so the result matches exactly what
    the streaming writer will emit — no fudge constant involved.
    """
    # Fixed part of a classic IFD: count + entries + next-IFD pointer.
    fixed_block = 2 + 12 * len(tags) + 4
    # Variable part: serialise once and take the heap's actual length.
    _ifd, heap = _build_ifd(tags, overflow_base=0, bigtiff=False)
    return fixed_block + len(heap)


def _should_use_bigtiff_streaming(uncompressed_bytes: int,
n_entries: int,
ifd_overhead_bytes: int,
header_size_classic: int = 8) -> bool:
"""Decide whether the streaming writer must emit BigTIFF.

Classic TIFF stores offsets as uint32, so the file size addressable
via classic offsets is at most ``UINT32_MAX`` bytes (offsets run
``0..UINT32_MAX - 1``). The streaming writer appends pixel data after
the header and IFD, so the final file size is
``header + ifd + overflow + strip_table + uncompressed_bytes``.

The comparison is ``> UINT32_MAX`` to match the eager
``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``):
a file that is exactly ``UINT32_MAX`` bytes still fits classic.

See issue #1785 and the Copilot review on PR #1787: the previous
helper applied a 200-byte fudge for IFD overhead, which silently
underestimated when ``gdal_metadata_xml`` or large ``extra_tags``
pushed the actual overflow heap well past that constant.

Parameters
----------
uncompressed_bytes : int
Total pixel-data bytes that will be written after the IFD.
n_entries : int
Number of strip or tile entries; each contributes a LONG offset
(4 bytes) plus a LONG byte-count (4 bytes) to the overflow heap.
Pass ``0`` if ``ifd_overhead_bytes`` already covers the strip
table (the streaming-writer caller does this by passing the
actual tag list through ``_compute_classic_ifd_overhead``).
ifd_overhead_bytes : int
Classic-TIFF IFD size: fixed entry block plus variable overflow
heap (ASCII metadata, geo tags, strip/tile offset arrays, etc.).
Computed via ``_compute_classic_ifd_overhead(tags)``.
header_size_classic : int, optional
Classic-TIFF header size (8 bytes).
"""
# strip/tile-table overhead is 8 bytes per entry (LONG offset + LONG
# byte count). If the caller already accounted for the offset arrays
# inside ``ifd_overhead_bytes`` they should pass n_entries=0.
strip_table_overhead = n_entries * 8
reserved_overhead = (
header_size_classic + ifd_overhead_bytes + strip_table_overhead
)
Comment on lines +1585 to +1629
UINT32_MAX = 0xFFFFFFFF
# ``> UINT32_MAX`` matches the eager path's
# ``estimated_file_size > UINT32_MAX`` check in ``_assemble_tiff``.
return uncompressed_bytes + reserved_overhead > UINT32_MAX


def write_streaming(dask_data, path: str, *,
geo_transform: 'GeoTransform | None' = None,
crs_epsg: int | None = None,
Expand Down Expand Up @@ -1649,17 +1720,18 @@ def write_streaming(dask_data, path: str, *,
rows_per_strip = min(256, height)
n_entries = math.ceil(height / rows_per_strip)

# BigTIFF detection (use uncompressed size as conservative estimate)
# BigTIFF detection has to wait until the full tag list is built so
# that variable-length payloads (gdal_metadata, geo tags, user
# extra_tags) feed into the IFD-overhead calculation. Build the tag
# list assuming classic offsets first, then decide BigTIFF, then
# promote the strip/tile offset arrays to LONG8 if needed. See
# issue #1785 and the Copilot review on PR #1787.
uncompressed_bytes = height * width * bytes_per_sample * samples
UINT32_MAX = 0xFFFFFFFF
if bigtiff is not None:
use_bigtiff = bigtiff
else:
use_bigtiff = uncompressed_bytes > UINT32_MAX

header_size = 16 if use_bigtiff else 8

# ---- Build tag list (mirrors _assemble_tiff for level 0) ----
# Start with classic offset types; the offset arrays are promoted to
# LONG8 below once BigTIFF is chosen.
use_bigtiff = bool(bigtiff) if bigtiff is not None else False
tags = []
tags.append((TAG_IMAGE_WIDTH, LONG, 1, width))
tags.append((TAG_IMAGE_LENGTH, LONG, 1, height))
Expand Down Expand Up @@ -1714,10 +1786,11 @@ def write_streaming(dask_data, path: str, *,
if resolution_unit is not None:
tags.append((TAG_RESOLUTION_UNIT, SHORT, 1, resolution_unit))

# Layout tags with placeholder offsets / byte-counts. BigTIFF
# needs 64-bit offsets (LONG8) since strip/tile positions can
# exceed 4 GB; classic TIFF uses LONG (uint32).
offset_type = LONG8 if use_bigtiff else LONG
# Layout tags with placeholder offsets / byte-counts. Use classic
# LONG (uint32) here; if the auto-BigTIFF decision below promotes
# the file, ``_promote_offsets_to_long8`` retypes these to LONG8.
# A caller-forced ``bigtiff=True`` is also resolved at that point.
offset_type = LONG
placeholder = [0] * n_entries
if tiled:
tags.append((TAG_TILE_WIDTH, SHORT, 1, tile_size))
Expand Down Expand Up @@ -1769,6 +1842,31 @@ def write_streaming(dask_data, path: str, *,
and etag_id not in _DANGEROUS_EXTRA_TAG_IDS):
tags.append((etag_id, etype_id, ecount, evalue))

# ---- BigTIFF decision (auto path) ----
# Compute the real classic-TIFF IFD overhead from the actual tag
# list, including overflow heap (gdal_metadata, geo ascii params,
# strip/tile offset arrays, user extra_tags). This replaces the
# 200-byte fudge constant the original PR used; with metadata-heavy
# writes that constant silently underestimated overhead and let
# sub-4 GiB rasters overflow classic offsets late in the write.
# See issue #1785 and the Copilot review on PR #1787.
if bigtiff is None:
ifd_overhead_bytes = _compute_classic_ifd_overhead(tags)
# n_entries=0 because the strip/tile offset arrays are already
# inside ``tags`` and therefore in ``ifd_overhead_bytes``.
use_bigtiff = _should_use_bigtiff_streaming(
uncompressed_bytes,
n_entries=0,
ifd_overhead_bytes=ifd_overhead_bytes,
header_size_classic=8,
)

header_size = 16 if use_bigtiff else 8

# Promote the strip/tile offset arrays to LONG8 once BigTIFF is set.
if use_bigtiff:
tags = _promote_offsets_to_long8(tags)

# ---- Pre-compute IFD reservation size ----
sorted_tags = sorted(tags, key=lambda t: t[0])
entry_size = 20 if use_bigtiff else 12
Expand Down
Loading
Loading