diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index c1e27405..6c58d797 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -3090,26 +3090,37 @@ def gpu_compress_tiles(d_image, tile_width, tile_height,
 def _block_reduce_2d_gpu(arr2d, method, nodata=None):
     """2x block-reduce a single 2D CuPy plane using *method*.
 
+    Output dimensions follow GDAL's ceil semantics (5x5 -> 3x3) so the
+    overview pyramid covers the full source extent for odd-sized rasters.
+    Odd inputs are NaN-padded along the trailing edge (float-promoted for
+    integer dtypes) so the 2x2 block reshape works and the residual block
+    is reduced via the same nan-aware aggregations (issue #2105). Mirrors
+    the CPU helper :func:`xrspatial.geotiff._writer._block_reduce_2d` so
+    the two backends produce identical overviews.
+
     When ``nodata`` is supplied and ``arr2d`` is a float dtype, cells that
     equal the sentinel are masked back to NaN before the reduction so the
-    ``cupy.nan*`` aggregation routines correctly skip them. Mirrors the
-    CPU helper :func:`xrspatial.geotiff._writer._block_reduce_2d` so the
-    two backends produce identical overviews when ``nodata`` is set.
+    ``cupy.nan*`` aggregation routines correctly skip them.
     """
     import cupy
     import numpy as np
 
     h, w = arr2d.shape
-    h2 = (h // 2) * 2
-    w2 = (w // 2) * 2
-    cropped = arr2d[:h2, :w2]
-    oh, ow = h2 // 2, w2 // 2
+    # Ceil semantics match GDAL and the CPU mirror in _writer.
+    oh, ow = (h + 1) // 2, (w + 1) // 2
+    h2, w2 = 2 * oh, 2 * ow
+    need_pad = (h2, w2) != (h, w)
 
     if method == 'nearest':
-        return cropped[::2, ::2].copy()
+        # Top-left pixel of each 2x2 block; direct stride keeps the
+        # trailing row/col for odd-sized inputs (issue #2105). The
+        # ``.copy()`` materialises a contiguous device buffer so
+        # downstream nodata-mask rewrites don't touch ``arr2d``.
+        return arr2d[::2, ::2].copy()
 
     if method == 'mode':
-        # Mode is expensive on GPU; fall back to CPU
+        # Mode is expensive on GPU; fall back to CPU. The CPU helper
+        # now handles odd-sized inputs natively.
         cpu_arr = arr2d.get()
         from ._writer import _block_reduce_2d
         cpu_result = _block_reduce_2d(cpu_arr, 'mode', nodata=nodata)
@@ -3120,15 +3131,26 @@ def _block_reduce_2d_gpu(arr2d, method, nodata=None):
         # factors with the same prefilter=False NaN-safety the CPU
         # helper uses for issue #1623. Fall back to CPU so cubic on
         # the GPU writer path produces the same overview bytes as the
-        # CPU writer and so the sentinel handling matches.
+        # CPU writer and so the sentinel handling matches. The CPU
+        # helper handles odd-sized inputs via edge-replicate padding.
         cpu_arr = arr2d.get()
         from ._writer import _block_reduce_2d
         cpu_result = _block_reduce_2d(cpu_arr, 'cubic', nodata=nodata)
         return cupy.asarray(cpu_result)
 
-    # Block reshape for mean/min/max/median
+    # Block reshape for mean/min/max/median. Odd-sized inputs get a
+    # NaN-padded trailing edge so the residual block reduces correctly;
+    # the nan-aware reductions exclude the padded cells. The padded
+    # buffer keeps the source float dtype (float32 stays float32) so an
+    # odd-shape GPU read does not pay a 2x device-memory cost for a
+    # silent promote to float64.
     if arr2d.dtype.kind == 'f':
-        blocks = cropped.reshape(oh, 2, ow, 2)
+        if need_pad:
+            padded = cupy.full((h2, w2), np.nan, dtype=arr2d.dtype)
+            padded[:h, :w] = arr2d
+            blocks = padded.reshape(oh, 2, ow, 2)
+        else:
+            blocks = arr2d.reshape(oh, 2, ow, 2)
         # Mask the sentinel back to NaN so cupy.nanmean and friends
         # honour it as missing-data (issue #1613). Match the upstream
         # NaN->sentinel rewrite gate so ``nodata=+/-inf`` is masked here.
@@ -3143,7 +3165,12 @@ def _block_reduce_2d_gpu(arr2d, method, nodata=None):
                     blocks = cupy.where(
                         mask, cupy.float64('nan'), blocks)
     else:
-        blocks = cropped.astype(cupy.float64).reshape(oh, 2, ow, 2)
+        if need_pad:
+            blocks = cupy.full((h2, w2), cupy.float64('nan'), dtype=cupy.float64)
+            blocks[:h, :w] = arr2d
+            blocks = blocks.reshape(oh, 2, ow, 2)
+        else:
+            blocks = arr2d.astype(cupy.float64).reshape(oh, 2, ow, 2)
         # Integer GPU mirror of the CPU sentinel-mask: without it,
         # cupy.nanmean / nanmin / nanmax / nanmedian average the sentinel
         # value into surrounding valid cells and produce overview pixels
@@ -3158,8 +3185,18 @@ def _block_reduce_2d_gpu(arr2d, method, nodata=None):
             info = np.iinfo(arr2d.dtype)
             if info.min <= nodata_int <= info.max:
                 sentinel = np.dtype(str(arr2d.dtype)).type(nodata_int)
-                int_blocks = cropped.reshape(oh, 2, ow, 2)
-                mask = int_blocks == sentinel
+                if need_pad:
+                    # Compute the integer-width sentinel mask before
+                    # padding, so a 64-bit sentinel near INT64_MAX (not
+                    # exactly representable in float64) still matches.
+                    # The padded edge already holds NaN in ``blocks`` and
+                    # stays False here so the where below leaves it alone.
+                    int_mask = cupy.zeros((h2, w2), dtype=bool)
+                    int_mask[:h, :w] = arr2d == sentinel
+                    mask = int_mask.reshape(oh, 2, ow, 2)
+                else:
+                    int_blocks = arr2d.reshape(oh, 2, ow, 2)
+                    mask = int_blocks == sentinel
                 if bool(mask.any().item()):
                     blocks = cupy.where(
                         mask, cupy.float64('nan'), blocks)
diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index 80ad9087..97df2704 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -432,9 +432,44 @@ def _resolve_int_nodata(dtype, nodata):
     return None
 
 
+def _replicate_pad_2d(src, h2, w2):
+    """Pad ``src`` to ``(h2, w2)`` by replicating its trailing row/col.
+
+    Used by the cubic overview branch to extend an odd-sized input to
+    even dimensions so a 0.5-zoom lands on the GDAL ceil shape and the
+    spline kernel sees a valid neighbourhood at the trailing edge. The
+    helper is dtype-preserving (no float promotion) so integer inputs
+    stay integer through the pad. Returns ``src`` unchanged when no
+    padding is needed.
+    """
+    h, w = src.shape
+    if (h, w) == (h2, w2):
+        return src
+    padded = np.empty((h2, w2), dtype=src.dtype)
+    padded[:h, :w] = src
+    if h2 != h:
+        padded[h:, :w] = src[h - 1:h, :]
+    if w2 != w:
+        padded[:h, w:] = src[:, w - 1:w]
+    if h2 != h and w2 != w:
+        padded[h:, w:] = src[h - 1, w - 1]
+    return padded
+
+
 def _block_reduce_2d(arr2d, method, nodata=None):
     """2x block-reduce a single 2D plane using *method*.
 
+    Output dimensions follow GDAL's ceil semantics (5x5 -> 3x3): every
+    base pixel contributes to the overview. Inputs with an odd height
+    or width are padded with NaN (or float-promoted and NaN-padded for
+    integer dtypes) along the trailing edge so the 2x2 block reshape
+    still works; the nan-aware aggregations then exclude the padded
+    cells from the residual blocks. ``nearest`` strides directly,
+    ``cubic`` replicates the last row/col so the spline kernel sees a
+    valid neighbourhood at the trailing edge, and ``mode`` ignores
+    NaN-padded cells when counting. Even-sized inputs take the
+    existing fast path with no padding overhead (issue #2105).
+
     When ``nodata`` is supplied, cells that equal the sentinel are
     treated as NaN during the reduction so the ``nan*`` aggregation
     routines correctly skip them. The float branch keeps any all-
@@ -458,14 +493,23 @@ def _block_reduce_2d(arr2d, method, nodata=None):
     well-defined (issue #1975).
     """
     h, w = arr2d.shape
-    h2 = (h // 2) * 2
-    w2 = (w // 2) * 2
-    cropped = arr2d[:h2, :w2]
-    oh, ow = h2 // 2, w2 // 2
+    # GDAL-style ceil semantics keep the trailing row/col when the input
+    # is odd-sized. Floor-and-crop drops those pixels and produces an
+    # overview that no longer covers the source extent (issue #2105).
+    oh, ow = (h + 1) // 2, (w + 1) // 2
+    h2, w2 = 2 * oh, 2 * ow
+    need_pad = (h2, w2) != (h, w)
 
     if method == 'nearest':
-        # Top-left pixel of each 2x2 block
-        return cropped[::2, ::2].copy()
+        # Top-left pixel of each 2x2 block. For odd-sized inputs the
+        # trailing 1x2 / 2x1 / 1x1 residual block's top-left is still
+        # the source pixel at (2*i, 2*j), so direct striding gives the
+        # correct ceil-shape output without padding. The trailing
+        # ``.copy()`` materialises the strided view so the caller owns
+        # a contiguous buffer (downstream nodata-mask rewrites mutate
+        # the result in place); without it those writes would also
+        # touch ``arr2d``.
+        return arr2d[::2, ::2].copy()
 
     if method == 'cubic':
         try:
@@ -495,6 +539,17 @@ def _block_reduce_2d(arr2d, method, nodata=None):
         # NaN can carry through the spline; the result is rewritten
         # back to the sentinel and rounded before casting to the source
         # integer dtype (issue #1975, integer mirror of #1623).
+        #
+        # Odd-sized inputs are first edge-replicated to even dimensions
+        # so the 0.5 zoom factor lands on the GDAL ceil shape and the
+        # spline sees a valid neighbourhood at the trailing edge.
+        # Replicating sentinel values is fine: they are masked to NaN
+        # in the steps below alongside any in-bounds sentinel, and the
+        # post-zoom NaN->sentinel restore stamps the trailing overview
+        # pixels back to the sentinel as expected (issue #2105). The
+        # ``_replicate_pad_2d`` helper is module-level so the cubic
+        # path does not redefine a closure on every call.
+
         if (nodata is not None
                 and arr2d.dtype.kind == 'f'
                 and not np.isnan(nodata)):
@@ -505,7 +560,9 @@ def _block_reduce_2d(arr2d, method, nodata=None):
             if sentinel is not None:
                 mask = arr2d == sentinel
                 if mask.any():
-                    masked = np.where(mask, np.float64('nan'), arr2d)
+                    src = _replicate_pad_2d(arr2d, h2, w2)
+                    src_mask = src == sentinel
+                    masked = np.where(src_mask, np.float64('nan'), src)
                     with warnings.catch_warnings():
                         warnings.simplefilter('ignore', RuntimeWarning)
                         result = zoom(masked, 0.5, order=3,
@@ -515,13 +572,18 @@ def _block_reduce_2d(arr2d, method, nodata=None):
                         result = result.copy()
                         result[nan_mask] = float(nodata)
                     return result.astype(arr2d.dtype)
+                # No sentinel pixels present; fall through to the
+                # default cubic path below so the spline runs without
+                # the prefilter=False NaN-safety branch.
         nodata_int = _resolve_int_nodata(arr2d.dtype, nodata)
         if nodata_int is not None:
             sentinel = arr2d.dtype.type(nodata_int)
             mask = arr2d == sentinel
             if mask.any():
-                masked = np.where(mask, np.float64('nan'),
-                                  arr2d.astype(np.float64))
+                src = _replicate_pad_2d(arr2d, h2, w2)
+                src_mask = src == sentinel
+                masked = np.where(src_mask, np.float64('nan'),
+                                  src.astype(np.float64))
                 with warnings.catch_warnings():
                     warnings.simplefilter('ignore', RuntimeWarning)
                     result = zoom(masked, 0.5, order=3,
@@ -531,7 +593,10 @@ def _block_reduce_2d(arr2d, method, nodata=None):
                     result = np.where(nan_mask, float(nodata_int),
                                       result)
                 return np.round(result).astype(arr2d.dtype)
-        return zoom(arr2d, 0.5, order=3).astype(arr2d.dtype)
+            # No sentinel pixels present in this integer array; fall
+            # through to the default cubic path below.
+        src = _replicate_pad_2d(arr2d, h2, w2)
+        return zoom(src, 0.5, order=3).astype(arr2d.dtype)
 
     if method == 'mode':
         # Most-common value per 2x2 block (useful for classified rasters).
@@ -539,7 +604,28 @@ def _block_reduce_2d(arr2d, method, nodata=None):
         # how many cells equal it. argmax picks the leftmost max-count
         # position, which (post-sort) is the smallest tied value, matching
         # the prior np.unique+argmax tie-break behavior ("lowest wins").
-        blocks = cropped.reshape(oh, 2, ow, 2).transpose(0, 2, 1, 3).reshape(oh, ow, 4)
+        #
+        # Odd-sized inputs are float-promoted and NaN-padded so the
+        # residual block has at least one valid cell plus NaN cells that
+        # never match themselves (NaN != NaN). Counts for the NaN cells
+        # stay at zero, so argmax picks a valid cell, and numpy's sort
+        # places NaN at the end so the "lowest valid wins" tie-break
+        # still holds (issue #2105). The result is cast back to the
+        # source dtype.
+        if need_pad:
+            padded = np.full((h2, w2), np.nan, dtype=np.float64)
+            padded[:h, :w] = arr2d
+            blocks = padded.reshape(oh, 2, ow, 2).transpose(0, 2, 1, 3).reshape(oh, ow, 4)
+            srt = np.sort(blocks, axis=-1)
+            counts = np.zeros(srt.shape, dtype=np.int8)
+            for i in range(4):
+                counts[..., i] = np.sum(srt == srt[..., i:i + 1], axis=-1)
+            pick = np.argmax(counts, axis=-1)
+            picked = np.take_along_axis(srt, pick[..., None], axis=-1).squeeze(-1)
+            if arr2d.dtype.kind in ('i', 'u'):
+                return np.round(picked).astype(arr2d.dtype)
+            return picked.astype(arr2d.dtype)
+        blocks = arr2d.reshape(oh, 2, ow, 2).transpose(0, 2, 1, 3).reshape(oh, ow, 4)
         srt = np.sort(blocks, axis=-1)
         counts = np.empty_like(srt, dtype=np.int8)
         for i in range(4):
@@ -547,9 +633,20 @@ def _block_reduce_2d(arr2d, method, nodata=None):
         pick = np.argmax(counts, axis=-1)
         return np.take_along_axis(srt, pick[..., None], axis=-1).squeeze(-1)
 
-    # Block reshape for mean/min/max/median
+    # Block reshape for mean/min/max/median. Odd-sized inputs are padded
+    # with NaN along the trailing edge so the reshape works on every
+    # input size; the padded cells are naturally excluded by the
+    # nan-aware aggregations below (issue #2105). The padded buffer
+    # keeps the source's float dtype (float32 inputs stay float32) so
+    # an odd-shape read does not pay a 2x intermediate-memory cost for
+    # a silent promote to float64.
     if arr2d.dtype.kind == 'f':
-        blocks = cropped.reshape(oh, 2, ow, 2)
+        if need_pad:
+            padded = np.full((h2, w2), np.nan, dtype=arr2d.dtype)
+            padded[:h, :w] = arr2d
+            blocks = padded.reshape(oh, 2, ow, 2)
+        else:
+            blocks = arr2d.reshape(oh, 2, ow, 2)
         # When a sentinel was used in place of NaN by an upstream
         # NaN-to-sentinel rewrite, mask it back to NaN here so nanmean /
         # nanmin / nanmax / nanmedian honour the missing-data semantic.
@@ -569,28 +666,48 @@ def _block_reduce_2d(arr2d, method, nodata=None):
                     # array so the caller's input is not mutated.
                     blocks = np.where(mask, np.float64('nan'), blocks)
     else:
-        blocks = cropped.astype(np.float64).reshape(oh, 2, ow, 2)
-        # Integer rasters with a sentinel need the same NaN-mask the float
-        # branch above applies: without it, nanmean / nanmin / nanmax /
-        # nanmedian average the sentinel value into surrounding valid
-        # cells and produce overview pixels that are neither the sentinel
-        # nor any real measurement. The read-side int-to-NaN mask in
-        # ``open_geotiff`` only catches exact sentinel hits, so the
-        # poisoned values survive as silent garbage at every zoom level
-        # above 0. Gate on the sentinel being representable in the
-        # source integer dtype (mirrors ``_int_nodata_in_range`` in
-        # ``_reader.py``) so an out-of-range sentinel pair like
-        # ``uint16`` + ``GDAL_NODATA="-9999"`` stays a no-op rather than
-        # tripping ``OverflowError`` on the dtype cast.
+        if need_pad:
+            blocks = np.full((h2, w2), np.nan, dtype=np.float64)
+            blocks[:h, :w] = arr2d
+            blocks = blocks.reshape(oh, 2, ow, 2)
+        else:
+            blocks = arr2d.astype(np.float64).reshape(oh, 2, ow, 2)
+        # Integer rasters with a sentinel still need the NaN-mask the
+        # float branch above applies. The sentinel-equal cells in the
+        # original integer array are computed at the dtype's native
+        # width below (so a 64-bit sentinel near INT64_MAX still
+        # matches, which a float-cast comparison would miss) and the
+        # mask is then broadcast into the float64 ``blocks`` view.
+        # Without this, nanmean / nanmin / nanmax / nanmedian average
+        # the sentinel value into surrounding valid cells and produce
+        # overview pixels that the read side cannot remap because they
+        # no longer equal the sentinel. Gate on the sentinel being
+        # representable in the source integer dtype (mirrors
+        # ``_int_nodata_in_range`` in ``_reader.py``) so an out-of-
+        # range sentinel pair like ``uint16`` + ``GDAL_NODATA=-9999``
+        # stays a no-op rather than tripping ``OverflowError`` on the
+        # dtype cast.
         nodata_int = _resolve_int_nodata(arr2d.dtype, nodata)
         if nodata_int is not None:
             sentinel = arr2d.dtype.type(nodata_int)
-            # Compare against the original integer block view so the
-            # equality runs at the integer's native width (avoids any
-            # float-cast rounding on adjacent values). The boolean
-            # mask broadcasts into the float64 block layout below.
-            int_blocks = cropped.reshape(oh, 2, ow, 2)
-            mask = int_blocks == sentinel
+            if need_pad:
+                # Compute the sentinel mask at the integer's native width
+                # before padding, so a 64-bit sentinel near INT64_MAX (which
+                # is not exactly representable in float64) still matches.
+                # The mask is then padded with False to the same (h2, w2)
+                # shape as the float-promoted blocks view, so the residual
+                # padded cells (already NaN in ``blocks``) are not also
+                # rewritten via the where below.
+                int_mask = np.zeros((h2, w2), dtype=bool)
+                int_mask[:h, :w] = arr2d == sentinel
+                mask = int_mask.reshape(oh, 2, ow, 2)
+            else:
+                # Compare against the original integer block view so the
+                # equality runs at the integer's native width (avoids any
+                # float-cast rounding on adjacent values). The boolean
+                # mask broadcasts into the float64 block layout below.
+                int_blocks = arr2d.reshape(oh, 2, ow, 2)
+                mask = int_blocks == sentinel
             if mask.any():
                 blocks = np.where(mask, np.float64('nan'), blocks)
 
diff --git a/xrspatial/geotiff/tests/test_cog_overview_ceil_2105.py b/xrspatial/geotiff/tests/test_cog_overview_ceil_2105.py
new file mode 100644
index 00000000..48d0fc4d
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_cog_overview_ceil_2105.py
@@ -0,0 +1,330 @@
+"""COG overviews use ceil semantics for odd-sized rasters (issue #2105).
+
+Before the fix, ``_block_reduce_2d`` floored both dimensions to an even
+multiple and cropped the trailing row/col before reducing. A 5x5 input
+became a 4x4 crop and then a 2x2 overview, silently dropping the bottom
+row and right column. GDAL's overview generator uses ceil semantics
+(5x5 -> 3x3) so the residual edge cells still contribute.
+
+These tests pin the contract that every base pixel reaches the overview,
+across all resampling methods, on both even and odd input shapes, and
+with the nodata sentinel honoured along the trailing edge.
+"""
+from __future__ import annotations
+
+import importlib.util
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff._writer import _block_reduce_2d
+
+
+def _gpu_available() -> bool:
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+_HAS_GPU = _gpu_available()
+_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")
+
+
+# ---------------------------------------------------------------------------
+# Output-shape contract: ceil((h+1)/2, (w+1)/2) for every method and dtype.
+# ---------------------------------------------------------------------------
+@pytest.mark.parametrize(
+    "shape,expected",
+    [
+        ((5, 5), (3, 3)),
+        ((5, 4), (3, 2)),
+        ((4, 5), (2, 3)),
+        ((7, 3), (4, 2)),
+        ((1, 1), (1, 1)),
+        ((1, 5), (1, 3)),
+        ((5, 1), (3, 1)),
+        ((4, 4), (2, 2)),
+        ((6, 6), (3, 3)),
+    ],
+)
+@pytest.mark.parametrize(
+    "method", ["nearest", "mean", "min", "max", "median", "mode"]
+)
+def test_ceil_output_shape_float(shape, expected, method):
+    arr = np.arange(shape[0] * shape[1], dtype=np.float32).reshape(shape)
+    out = _block_reduce_2d(arr, method)
+    assert out.shape == expected
+
+
+@pytest.mark.parametrize(
+    "shape,expected",
+    [
+        ((5, 5), (3, 3)),
+        ((3, 7), (2, 4)),
+        ((1, 1), (1, 1)),
+    ],
+)
+@pytest.mark.parametrize("method", ["nearest", "mean", "min", "max", "median", "mode"])
+def test_ceil_output_shape_int(shape, expected, method):
+    arr = np.arange(shape[0] * shape[1], dtype=np.int16).reshape(shape)
+    out = _block_reduce_2d(arr, method)
+    assert out.shape == expected
+    assert out.dtype == arr.dtype
+
+
+def test_ceil_output_shape_cubic_float():
+    pytest.importorskip("scipy")
+    arr = np.arange(25, dtype=np.float32).reshape(5, 5)
+    out = _block_reduce_2d(arr, "cubic")
+    assert out.shape == (3, 3)
+
+
+def test_ceil_output_shape_cubic_int():
+    pytest.importorskip("scipy")
+    arr = np.arange(25, dtype=np.int16).reshape(5, 5)
+    out = _block_reduce_2d(arr, "cubic")
+    assert out.shape == (3, 3)
+    assert out.dtype == arr.dtype
+
+
+# ---------------------------------------------------------------------------
+# Trailing-edge pixel values: the last row/col of the source must reach the
+# overview rather than being dropped.
+# ---------------------------------------------------------------------------
+def test_nearest_5x5_preserves_trailing_pixels():
+    arr = np.arange(25, dtype=np.float64).reshape(5, 5)
+    out = _block_reduce_2d(arr, "nearest")
+    # Nearest = top-left of every 2x2 block. With ceil, that's arr[::2, ::2].
+    assert out.shape == (3, 3)
+    np.testing.assert_array_equal(out, arr[::2, ::2])
+    # The trailing row/col of the source IS represented in the overview.
+    assert out[-1, -1] == arr[4, 4]
+
+
+def test_mean_5x5_trailing_residual_block_uses_valid_cell():
+    # Residual at row 4, col 4 is a 1x1 block containing arr[4,4] alone.
+    arr = np.zeros((5, 5), dtype=np.float32)
+    arr[4, 4] = 100.0
+    out = _block_reduce_2d(arr, "mean")
+    assert out.shape == (3, 3)
+    # The corner residual is a 1x1 block, so its mean is the single pixel.
+    assert out[2, 2] == pytest.approx(100.0)
+
+
+def test_max_5x5_residual_block_uses_valid_cell():
+    arr = np.zeros((5, 5), dtype=np.float32)
+    arr[4, :] = 9.0  # trailing row should reach overview[2, :]
+    arr[:, 4] = 7.0
+    out = _block_reduce_2d(arr, "max")
+    assert out.shape == (3, 3)
+    # Bottom overview row picks max of (arr[4, 2*j:2*j+2]) -> 9.0 everywhere.
+    np.testing.assert_array_equal(out[2, :2], [9.0, 9.0])
+    # Right column gets max from (arr[2*i:2*i+2, 4]) -> 7.0 except corner.
+    assert out[0, 2] == 7.0
+    assert out[1, 2] == 7.0
+    # arr[4, 4] = 7.0 (set by the trailing-column sweep, after row sweep).
+    assert out[2, 2] == 7.0
+
+
+def test_min_5x5_residual_block_uses_valid_cell():
+    arr = np.full((5, 5), 10.0, dtype=np.float32)
+    arr[4, 4] = -1.0
+    out = _block_reduce_2d(arr, "min")
+    assert out[2, 2] == -1.0
+
+
+def test_median_5x5_residual_block_uses_valid_cell():
+    arr = np.full((5, 5), 5.0, dtype=np.float32)
+    arr[4, 4] = 99.0
+    out = _block_reduce_2d(arr, "median")
+    # 1x1 residual: median is the single value.
+    assert out[2, 2] == pytest.approx(99.0)
+
+
+def test_mode_5x5_residual_block_picks_valid_cell():
+    arr = np.array(
+        [[1, 1, 2, 2, 3],
+         [1, 1, 2, 2, 3],
+         [4, 4, 5, 5, 6],
+         [4, 4, 5, 5, 6],
+         [7, 7, 8, 8, 9]],
+        dtype=np.int16,
+    )
+    out = _block_reduce_2d(arr, "mode")
+    assert out.shape == (3, 3)
+    # Trailing 1x1 block at (4,4) is just the value 9.
+    assert out[2, 2] == 9
+    # Trailing column (rows 0..1 / 2..3, col 4) is 1x1 blocks containing 3 / 6.
+    assert out[0, 2] == 3
+    assert out[1, 2] == 6
+    # Trailing row (col 0..1 / 2..3, row 4) is 1x2 blocks: [7,7] -> 7, [8,8] -> 8.
+    assert out[2, 0] == 7
+    assert out[2, 1] == 8
+
+
+def test_cubic_5x5_covers_source_extent():
+    pytest.importorskip("scipy")
+    # Smoothly varying ramp so cubic interpolation is well-behaved.
+    arr = np.arange(25, dtype=np.float32).reshape(5, 5)
+    out = _block_reduce_2d(arr, "cubic")
+    assert out.shape == (3, 3)
+    # Output should not be entirely zero/NaN, and trailing corner should
+    # roughly reflect the high source values around (4, 4).
+    assert np.isfinite(out).all()
+    assert out[2, 2] > out[0, 0]
+
+
+# ---------------------------------------------------------------------------
+# Sentinel masking still works on odd-sized inputs.
+# ---------------------------------------------------------------------------
+def test_mean_5x5_with_nodata_excludes_sentinel_in_residual():
+    sentinel = -9999.0
+    arr = np.full((5, 5), 1.0, dtype=np.float32)
+    arr[4, 4] = sentinel
+    out = _block_reduce_2d(arr, "mean", nodata=sentinel)
+    # The 1x1 trailing residual is all-sentinel -> all-NaN block, which
+    # the post-overview rewrite (in the caller) handles. Here we just
+    # confirm the sentinel did not bias the reduction: out[2, 2] is NaN,
+    # not (1.0 + sentinel)/2 or similar.
+    assert np.isnan(out[2, 2])
+    # Other overview cells with at least one valid neighbour stay valid.
+    assert out[0, 0] == pytest.approx(1.0)
+
+
+def test_min_int_5x5_with_nodata_does_not_select_sentinel_in_residual():
+    sentinel = -9999
+    arr = np.full((5, 5), 10, dtype=np.int16)
+    # Trailing column has a sentinel + valid cell in 2x1 residual blocks.
+    arr[0, 4] = sentinel
+    arr[2, 4] = sentinel
+    arr[4, 4] = sentinel
+    out = _block_reduce_2d(arr, "min", nodata=sentinel)
+    # The 2x1 residual at (0..1, 4) is [-9999, 10] -> min ignoring sentinel = 10.
+    assert out[0, 2] == 10
+    assert out[1, 2] == 10
+    # The 1x1 residual at (4, 4) is sentinel -> rewritten to sentinel.
+    assert out[2, 2] == sentinel
+
+
+def test_int64_sentinel_near_max_masks_in_padded_branch():
+    # INT64_MAX is not exactly representable in float64: float(INT64_MAX)
+    # rounds up to 2**63, which would miss the sentinel if the mask were
+    # computed against the float-padded view. The reader must compute the
+    # mask at native integer width before padding.
+    sentinel = np.iinfo(np.int64).max
+    arr = np.full((5, 5), 10, dtype=np.int64)
+    arr[0, 0] = sentinel
+    # Pad branch fires because shape (5, 5) is odd.
+    out = _block_reduce_2d(arr, "min", nodata=sentinel)
+    # Top-left 2x2 block has 1 sentinel + 3 valid 10s. nanmin -> 10
+    # (sentinel masked out). If the mask missed the sentinel, the int64
+    # value would be cast to float and the float min would pick up the
+    # sentinel's value or produce noise; either way out[0,0] would not
+    # be 10.
+    assert out[0, 0] == 10
+
+
+def test_uint64_sentinel_near_max_masks_in_padded_branch():
+    # UINT64_MAX = 2**64 - 1 is also not exactly representable in float64
+    # (float(UINT64_MAX) rounds up to 2**64). The native-width mask path
+    # must catch the sentinel for unsigned 64-bit dtypes too.
+    sentinel = np.iinfo(np.uint64).max
+    arr = np.full((5, 5), 10, dtype=np.uint64)
+    arr[0, 0] = sentinel
+    out = _block_reduce_2d(arr, "min", nodata=sentinel)
+    assert out[0, 0] == 10
+
+
+def test_float32_padded_branch_keeps_source_dtype():
+    # The padded mean/min/max/median branch used to allocate a float64
+    # NaN buffer regardless of the source dtype, doubling intermediate
+    # memory for an odd-shape float32 read. Verify the helper now keeps
+    # the source dtype across the pad so a float32 input round-trips as
+    # float32. The contract is checked end-to-end via the output dtype.
+    arr = np.arange(25, dtype=np.float32).reshape(5, 5)
+    out = _block_reduce_2d(arr, "mean")
+    assert out.dtype == np.float32
+    # And the values still match what a manual ceil-mean would produce
+    # for the top-left 2x2 block.
+    top_left_mean = float(arr[:2, :2].mean())
+    assert out[0, 0] == pytest.approx(top_left_mean)
+
+
+def test_max_int_5x5_with_nodata_does_not_select_sentinel_in_residual():
+    sentinel = -9999
+    arr = np.full((5, 5), 10, dtype=np.int16)
+    arr[4, 4] = sentinel
+    out = _block_reduce_2d(arr, "max", nodata=sentinel)
+    # The 1x1 corner block is all-sentinel -> sentinel.
+    assert out[2, 2] == sentinel
+    # Adjacent 2x1 residual (rows 4, cols 0..1) has valid values only.
+    assert out[2, 0] == 10
+    assert out[2, 1] == 10
+
+
+# ---------------------------------------------------------------------------
+# Even-sized inputs keep the existing fast-path semantics.
+# ---------------------------------------------------------------------------
+@pytest.mark.parametrize(
+    "method", ["nearest", "mean", "min", "max", "median", "mode"]
+)
+def test_even_input_matches_legacy_2x2_behaviour(method):
+    rng = np.random.default_rng(2105)
+    arr = rng.integers(0, 100, size=(6, 8)).astype(np.int16)
+    out = _block_reduce_2d(arr, method)
+    assert out.shape == (3, 4)
+    # Spot-check a single block matches a direct reduction.
+    block = arr[0:2, 0:2]
+    if method == "nearest":
+        assert out[0, 0] == block[0, 0]
+    elif method == "mean":
+        # Integer outputs are rounded after the float reduction.
+        assert out[0, 0] == int(round(block.astype(np.float64).mean()))
+    elif method == "min":
+        assert out[0, 0] == block.min()
+    elif method == "max":
+        assert out[0, 0] == block.max()
+    elif method == "median":
+        assert out[0, 0] == int(round(float(np.median(block))))
+    elif method == "mode":
+        # Lowest-value tie-break for unique cells.
+        vals, counts = np.unique(block, return_counts=True)
+        expected = vals[np.argmax(counts)]
+        assert out[0, 0] == expected
+
+
+# ---------------------------------------------------------------------------
+# GPU mirror: identical shape and identical values on odd-sized inputs.
+# ---------------------------------------------------------------------------
+@_gpu_only
+@pytest.mark.parametrize(
+    "method", ["nearest", "mean", "min", "max", "median"]
+)
+@pytest.mark.parametrize("shape", [(5, 5), (5, 4), (4, 5), (7, 3)])
+def test_gpu_block_reduce_matches_cpu_on_odd_shapes(method, shape):
+    import cupy
+    from xrspatial.geotiff._gpu_decode import _block_reduce_2d_gpu
+
+    rng = np.random.default_rng(2105)
+    arr = rng.random(shape, dtype=np.float32)
+    cpu_out = _block_reduce_2d(arr, method)
+    gpu_out = _block_reduce_2d_gpu(cupy.asarray(arr), method).get()
+    assert gpu_out.shape == cpu_out.shape
+    np.testing.assert_allclose(gpu_out, cpu_out, equal_nan=True, rtol=1e-6)
+
+
+@_gpu_only
+def test_gpu_block_reduce_int_5x5_with_nodata():
+    import cupy
+    from xrspatial.geotiff._gpu_decode import _block_reduce_2d_gpu
+
+    sentinel = -9999
+    arr = np.full((5, 5), 10, dtype=np.int16)
+    arr[4, 4] = sentinel
+    cpu_out = _block_reduce_2d(arr, "max", nodata=sentinel)
+    gpu_out = _block_reduce_2d_gpu(cupy.asarray(arr), "max", nodata=sentinel).get()
+    np.testing.assert_array_equal(gpu_out, cpu_out)
diff --git a/xrspatial/geotiff/tests/test_mode_overview_perf.py b/xrspatial/geotiff/tests/test_mode_overview_perf.py
index a24517cd..5b794727 100644
--- a/xrspatial/geotiff/tests/test_mode_overview_perf.py
+++ b/xrspatial/geotiff/tests/test_mode_overview_perf.py
@@ -14,24 +14,22 @@
 
 
 def _mode_resample_reference(arr2d):
-    """Original per-pixel reference implementation (pre-vectorization).
+    """Per-pixel mode reference using GDAL ceil semantics (issue #2105).
 
-    Copied verbatim from the loop body that used ``np.unique`` per 2x2
-    block, with the same crop-to-even-dims behavior as the production
-    function.
+    Walks each ceil-shaped output block over the source array, takes the
+    intersection of the 2x2 block window with the actual source extent,
+    and picks the most-frequent value with the "lowest wins" tie-break
+    that the vectorized production path also uses.
     """
     h, w = arr2d.shape
-    h2 = (h // 2) * 2
-    w2 = (w // 2) * 2
-    cropped = arr2d[:h2, :w2]
-    oh, ow = h2 // 2, w2 // 2
-    blocks = (
-        cropped.reshape(oh, 2, ow, 2).transpose(0, 2, 1, 3).reshape(oh, ow, 4)
-    )
+    oh, ow = (h + 1) // 2, (w + 1) // 2
     out = np.empty((oh, ow), dtype=arr2d.dtype)
     for r in range(oh):
         for c in range(ow):
-            vals, counts = np.unique(blocks[r, c], return_counts=True)
+            r0, r1 = 2 * r, min(2 * r + 2, h)
+            c0, c1 = 2 * c, min(2 * c + 2, w)
+            window = arr2d[r0:r1, c0:c1].ravel()
+            vals, counts = np.unique(window, return_counts=True)
             out[r, c] = vals[counts.argmax()]
     return out
 
@@ -48,12 +46,6 @@ def test_bit_exact_match_reference(dtype, shape):
     hi = min(info.max, 7)
     arr = rng.integers(lo, hi + 1, size=shape, dtype=dtype)
 
-    # Skip pathological 1x1 / 1xN / Nx1 cases that crop to empty.
-    h2 = (shape[0] // 2) * 2
-    w2 = (shape[1] // 2) * 2
-    if h2 == 0 or w2 == 0:
-        return
-
     expected = _mode_resample_reference(arr)
     actual = _block_reduce_2d(arr, 'mode')