Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-metadata-state.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module,last_inspected,issue,severity_max,categories_found,notes
geotiff,2026-05-11,1640,HIGH,1;2;4,"open_geotiff(overview_level>=1) silently dropped CRS, transform attr, and georeferenced y/x coords because GDAL-style COGs (including to_geotiff output) write GeoKeys only on the level-0 IFD. Fixed by inheriting and rescaling georef from the level-0 IFD across all four read backends (#1640)."
geotiff,2026-05-12,1657,HIGH,1;5,NewSubfileType (254) and SubIFDs (330) leaked through attrs['extra_tags'] across all four backends. Round-trip read overview -> write produced primary IFD wrongly marked as overview (NewSubfileType=1) and SubIFDs with stale byte offsets that crashed readers. Fixed by adding both tags to _MANAGED_TAGS and to _DANGEROUS_EXTRA_TAG_IDS writer-side filter (#1657).
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.
13 changes: 11 additions & 2 deletions xrspatial/geotiff/_geotags.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@

from ._header import (
IFD,
TAG_NEW_SUBFILE_TYPE,
TAG_IMAGE_WIDTH, TAG_IMAGE_LENGTH, TAG_BITS_PER_SAMPLE,
TAG_COMPRESSION, TAG_PHOTOMETRIC,
TAG_STRIP_OFFSETS, TAG_ORIENTATION, TAG_SAMPLES_PER_PIXEL,
TAG_ROWS_PER_STRIP, TAG_STRIP_BYTE_COUNTS,
TAG_X_RESOLUTION, TAG_Y_RESOLUTION,
TAG_PLANAR_CONFIG, TAG_RESOLUTION_UNIT,
TAG_PREDICTOR, TAG_COLORMAP,
TAG_PREDICTOR, TAG_COLORMAP, TAG_SUB_IFDS,
TAG_TILE_WIDTH, TAG_TILE_LENGTH,
TAG_TILE_OFFSETS, TAG_TILE_BYTE_COUNTS,
TAG_EXTRA_SAMPLES,
Expand All @@ -32,14 +33,22 @@
# extra_tags pass-through. ColorMap (320), ExtraSamples (338, only emitted
# automatically when samples > 1), and ImageDescription (270) intentionally
# stay OUT of this set so they round-trip without dedicated writer plumbing.
#
# NewSubfileType (254) and SubIFDs (330) are also managed: NewSubfileType
# is a per-IFD status flag (overview / mask marker) that the writer emits
# on its own for level > 0 IFDs, so leaking the source value to extra_tags
# would mis-mark a primary IFD as an overview after a read overview ->
# write round-trip. SubIFDs holds absolute byte offsets into the source
# file, which become garbage in the rewritten output. See issue #1657.
_MANAGED_TAGS = frozenset({
TAG_NEW_SUBFILE_TYPE,
TAG_IMAGE_WIDTH, TAG_IMAGE_LENGTH, TAG_BITS_PER_SAMPLE,
TAG_COMPRESSION, TAG_PHOTOMETRIC,
TAG_STRIP_OFFSETS, TAG_ORIENTATION, TAG_SAMPLES_PER_PIXEL,
TAG_ROWS_PER_STRIP, TAG_STRIP_BYTE_COUNTS,
TAG_X_RESOLUTION, TAG_Y_RESOLUTION,
TAG_PLANAR_CONFIG, TAG_RESOLUTION_UNIT,
TAG_PREDICTOR,
TAG_PREDICTOR, TAG_SUB_IFDS,
TAG_TILE_WIDTH, TAG_TILE_LENGTH,
TAG_TILE_OFFSETS, TAG_TILE_BYTE_COUNTS,
TAG_SAMPLE_FORMAT, TAG_GDAL_METADATA, TAG_GDAL_NODATA,
Expand Down
1 change: 1 addition & 0 deletions xrspatial/geotiff/_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
TAG_TILE_OFFSETS = 324
TAG_TILE_BYTE_COUNTS = 325
TAG_COLORMAP = 320
TAG_SUB_IFDS = 330
TAG_EXTRA_SAMPLES = 338
TAG_SAMPLE_FORMAT = 339
TAG_JPEG_TABLES = 347
Expand Down
25 changes: 21 additions & 4 deletions xrspatial/geotiff/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
TAG_STRIP_OFFSETS,
TAG_ROWS_PER_STRIP,
TAG_STRIP_BYTE_COUNTS,
TAG_SUB_IFDS,
TAG_X_RESOLUTION,
TAG_Y_RESOLUTION,
TAG_RESOLUTION_UNIT,
Expand All @@ -65,6 +66,15 @@
TAG_GDAL_METADATA,
)

# Tag IDs the writer must never accept from ``extra_tags``. NewSubfileType
# (254) is a per-IFD status flag the writer emits on its own for overview
# IFDs; copying a level-1 source value onto a level-0 destination would
# mis-mark the primary IFD as a reduced-resolution overview. SubIFDs
# (330) carries absolute byte offsets, which become garbage after a
# rewrite. The read side now filters both via ``_MANAGED_TAGS``; this
# constant is the writer-side belt-and-braces guard. See issue #1657.
_DANGEROUS_EXTRA_TAG_IDS = frozenset({TAG_NEW_SUBFILE_TYPE, TAG_SUB_IFDS})

# Byte order: always write little-endian
BO = '<'

Expand Down Expand Up @@ -828,11 +838,15 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,

# Extra tags (pass-through from source file)
if extra_tags is not None:
# Compute existing tag IDs once; update as we append to keep
# this loop O(len(extra_tags) + len(tags)) instead of O(N*M).
# See issue #1657 for the filter rationale.
existing_ids = {t[0] for t in tags}
for etag_id, etype_id, ecount, evalue in extra_tags:
# Skip any tag we already wrote to avoid duplicates
existing_ids = {t[0] for t in tags}
if etag_id not in existing_ids:
if (etag_id not in existing_ids
and etag_id not in _DANGEROUS_EXTRA_TAG_IDS):
Comment on lines 839 to +847
tags.append((etag_id, etype_id, ecount, evalue))
existing_ids.add(etag_id)

ifd_specs.append(tags)

Expand Down Expand Up @@ -1467,7 +1481,10 @@ def write_streaming(dask_data, path: str, *,
if extra_tags is not None:
existing_ids = {t[0] for t in tags}
for etag_id, etype_id, ecount, evalue in extra_tags:
if etag_id not in existing_ids:
# Skip dangerous tags (NewSubfileType, SubIFDs) that would
# mis-mark the IFD or carry stale offsets. See issue #1657.
if (etag_id not in existing_ids
and etag_id not in _DANGEROUS_EXTRA_TAG_IDS):
tags.append((etag_id, etype_id, ecount, evalue))

# ---- Pre-compute IFD reservation size ----
Expand Down
276 changes: 276 additions & 0 deletions xrspatial/geotiff/tests/test_extra_tags_safe_filter_1657.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
"""Regression tests for issue #1657: dangerous tag leakage through extra_tags.

Before the fix, reading an overview level (NewSubfileType=1) or any TIFF
with a SubIFDs entry leaked those tags into ``attrs['extra_tags']`` because
they were not in ``_MANAGED_TAGS``. Writing the DataArray back via
``to_geotiff`` or ``write_geotiff_gpu`` then re-emitted them on the output
IFD, producing:

* A primary IFD wrongly marked as a reduced-resolution overview
(``NewSubfileType=1``), so GDAL / rasterio skip it when picking the
primary image.
* Stale absolute byte offsets in ``SubIFDs`` that point into the new file's
pixel data, crashing readers that follow the chain.

The fix adds both tags to ``_MANAGED_TAGS`` (read-side filter) and to
``_DANGEROUS_EXTRA_TAG_IDS`` in ``_writer.py`` (write-side belt-and-braces
guard so caller-supplied ``attrs['extra_tags']`` still produces a clean
file).

These tests pin the contract on every available backend.
"""
from __future__ import annotations

import importlib.util

import numpy as np
import pytest

from xrspatial.geotiff import open_geotiff, to_geotiff

tifffile = pytest.importorskip("tifffile")


def _gpu_available() -> bool:
if importlib.util.find_spec("cupy") is None:
return False
try:
import cupy
return bool(cupy.cuda.is_available())
except Exception:
return False


_HAS_GPU = _gpu_available()
_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")


def _make_cog(path) -> None:
"""Write a small COG with overviews so each backend can read an overview.

Uses tile_size=64 + explicit overview_levels so the small test fixture
actually produces a multi-IFD pyramid (auto-overview halves until the
smallest level fits in one tile, so 256x256 with the default 256-tile
size yields zero overviews).
"""
import xarray as xr
da = xr.DataArray(
np.arange(512 * 512, dtype=np.float32).reshape(512, 512),
dims=['y', 'x'],
coords={'y': np.arange(512) * -0.5 + 10.0,
'x': np.arange(512) * 0.5 - 10.0},
attrs={'crs': 4326},
)
to_geotiff(da, str(path), cog=True,
tile_size=64, overview_levels=[2, 4])


# ---------------------------------------------------------------------------
# Read side: overview reads must not leak NewSubfileType into attrs
# ---------------------------------------------------------------------------

def test_overview_read_does_not_leak_newsubfiletype_numpy(tmp_path):
"""Reading an overview level must not surface NewSubfileType in attrs."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(str(cog_path), overview_level=1)
extra = ov.attrs.get('extra_tags')
assert extra is None or not any(t[0] == 254 for t in extra), (
f"NewSubfileType (tag 254) leaked into attrs['extra_tags']: {extra}"
)


def test_overview_read_does_not_leak_newsubfiletype_dask(tmp_path):
"""Same contract for the dask backend."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(str(cog_path), overview_level=1, chunks=32)
extra = ov.attrs.get('extra_tags')
assert extra is None or not any(t[0] == 254 for t in extra), (
f"NewSubfileType (tag 254) leaked under dask: {extra}"
)


@_gpu_only
def test_overview_read_does_not_leak_newsubfiletype_cupy(tmp_path):
"""Same contract for the cupy backend."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(str(cog_path), overview_level=1, gpu=True)
extra = ov.attrs.get('extra_tags')
assert extra is None or not any(t[0] == 254 for t in extra), (
f"NewSubfileType (tag 254) leaked under cupy: {extra}"
)


@_gpu_only
def test_overview_read_does_not_leak_newsubfiletype_dask_cupy(tmp_path):
"""Same contract for the dask+cupy backend."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(
str(cog_path), overview_level=1, gpu=True, chunks=32)
extra = ov.attrs.get('extra_tags')
assert extra is None or not any(t[0] == 254 for t in extra), (
f"NewSubfileType (tag 254) leaked under dask+cupy: {extra}"
)


# ---------------------------------------------------------------------------
# Read side: SubIFDs must not leak from level-0 IFDs that carry one
# ---------------------------------------------------------------------------

def test_subifds_does_not_leak_into_attrs(tmp_path):
"""tifffile writes SubIFDs by default on multi-page TIFFs.

Anything carrying tag 330 must not surface in ``attrs['extra_tags']``
because the byte offsets are file-absolute and cannot be replayed
into a rewritten file.
"""
data = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
path = tmp_path / 'subifd.tif'
with tifffile.TiffWriter(str(path)) as tw:
tw.write(data, tile=(32, 32), subifds=1)
tw.write(data[::2, ::2], tile=(32, 32), subfiletype=1)

da = open_geotiff(str(path))
extra = da.attrs.get('extra_tags')
assert extra is None or not any(t[0] == 330 for t in extra), (
f"SubIFDs (tag 330) leaked into attrs['extra_tags']: {extra}"
)


# ---------------------------------------------------------------------------
# Round-trip: overview read -> write must produce a valid primary IFD
# ---------------------------------------------------------------------------

def _read_subfile_type(path) -> int | None:
"""Return the NewSubfileType value on page 0 of a TIFF, or None if absent."""
with tifffile.TiffFile(str(path)) as tf:
page = tf.pages[0]
tag = page.tags.get('NewSubfileType')
return None if tag is None else int(tag.value)


def test_overview_roundtrip_primary_ifd_clean_numpy(tmp_path):
"""Round-tripping an overview must not mark the output as an overview."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(str(cog_path), overview_level=1)
out = tmp_path / 'out_numpy.tif'
to_geotiff(ov, str(out))
sft = _read_subfile_type(out)
assert sft in (None, 0), (
f"Round-tripped overview produced NewSubfileType={sft} on the "
f"primary IFD (expected None or 0)."
)


def test_overview_roundtrip_primary_ifd_clean_dask(tmp_path):
"""Same contract for the dask read path."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(str(cog_path), overview_level=1, chunks=32)
out = tmp_path / 'out_dask.tif'
to_geotiff(ov, str(out))
sft = _read_subfile_type(out)
assert sft in (None, 0), (
f"Dask round-tripped overview produced NewSubfileType={sft}."
)


@_gpu_only
def test_overview_roundtrip_primary_ifd_clean_cupy(tmp_path):
"""Same contract for the cupy read + write path."""
cog_path = tmp_path / 'cog.tif'
_make_cog(cog_path)
ov = open_geotiff(str(cog_path), overview_level=1, gpu=True)
out = tmp_path / 'out_cupy.tif'
to_geotiff(ov, str(out)) # auto-dispatches to GPU writer
sft = _read_subfile_type(out)
assert sft in (None, 0), (
f"Cupy round-tripped overview produced NewSubfileType={sft}."
)


# ---------------------------------------------------------------------------
# Writer belt-and-braces: caller-supplied dangerous extra_tags get filtered
# ---------------------------------------------------------------------------

def test_writer_filters_caller_supplied_newsubfiletype(tmp_path):
"""A caller passing extra_tags with NewSubfileType=1 still gets a
clean primary IFD on output."""
import xarray as xr
da = xr.DataArray(
np.zeros((32, 32), dtype=np.float32),
dims=['y', 'x'],
coords={'y': np.arange(32) * -0.5 + 10.0,
'x': np.arange(32) * 0.5 - 10.0},
attrs={
'crs': 4326,
# Tag 254 with LONG (type id 4) count 1 value 1 -- format
# produced by the read path before the fix.
'extra_tags': [(254, 4, 1, 1)],
},
)
out = tmp_path / 'with_dangerous_extra_tag.tif'
to_geotiff(da, str(out))
sft = _read_subfile_type(out)
assert sft in (None, 0), (
f"Writer accepted dangerous extra_tags[254]={sft}, expected None/0."
)


def test_writer_filters_caller_supplied_subifds(tmp_path):
"""A caller passing extra_tags with SubIFDs offsets must not emit
those stale offsets onto the output file."""
import xarray as xr
da = xr.DataArray(
np.zeros((32, 32), dtype=np.float32),
dims=['y', 'x'],
coords={'y': np.arange(32) * -0.5 + 10.0,
'x': np.arange(32) * 0.5 - 10.0},
attrs={
'crs': 4326,
# Garbage offsets that would crash readers if emitted.
'extra_tags': [(330, 4, 2, (999999, 888888))],
},
)
out = tmp_path / 'with_subifds.tif'
to_geotiff(da, str(out))
with tifffile.TiffFile(str(out)) as tf:
sub = tf.pages[0].tags.get('SubIFDs')
assert sub is None, (
f"Writer emitted SubIFDs={sub.value}, should have filtered it."
)


def test_writer_keeps_benign_extra_tags(tmp_path):
"""The filter must only drop the dangerous IDs (254, 330) and let
other extra_tags entries through. Use Software (305) -- ASCII, no
embedded offsets -- as a benign canary."""
import xarray as xr
da = xr.DataArray(
np.zeros((32, 32), dtype=np.float32),
dims=['y', 'x'],
coords={'y': np.arange(32) * -0.5 + 10.0,
'x': np.arange(32) * 0.5 - 10.0},
attrs={
'crs': 4326,
'extra_tags': [
(254, 4, 1, 1), # dangerous, must be filtered
(305, 2, 12, 'tifffile.py'), # benign, must survive
],
},
)
out = tmp_path / 'mixed_extra_tags.tif'
to_geotiff(da, str(out))
with tifffile.TiffFile(str(out)) as tf:
page = tf.pages[0]
assert page.tags.get('NewSubfileType') is None
software = page.tags.get('Software')
assert software is not None, (
"Benign extra_tag (305 Software) was filtered too -- "
"filter is too aggressive."
)
assert 'tifffile' in str(software.value)
Loading