From 3b34a12c158065835cd27d0003d6c1d409622688 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Wed, 11 Feb 2026 19:19:39 +0100
Subject: [PATCH 1/3] xenium cache zip store opening; vectorize xenium cell id
 encoding via bit shift

---
 src/spatialdata_io/readers/xenium.py | 83 ++++++++++++++++++++--------
 tests/test_xenium.py                 | 18 +++++-
 2 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 152e644e..e6938f17 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -188,8 +188,20 @@ def xenium(
     else:
         table = None
 
+    # open cells.zarr.zip once and reuse across all functions that need it
+    cells_zarr: zarr.Group | None = None
+    need_cells_zarr = (
+        nucleus_labels
+        or cells_labels
+        or (version is not None and version >= packaging.version.parse("2.0.0") and table is not None)
+    )
+    if need_cells_zarr:
+        cells_zarr_store = zarr.storage.ZipStore(path / XeniumKeys.CELLS_ZARR, read_only=True)
+        cells_zarr = zarr.open(cells_zarr_store, mode="r")
+
     if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
-        cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
+        assert cells_zarr is not None
+        cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs)
         if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
             warnings.warn(
                 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
@@ -214,20 +226,24 @@ def xenium(
     # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
     # labels.
     if nucleus_labels:
+        assert cells_zarr is not None
         labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
             path=path,
             specs=specs,
             mask_index=0,
             labels_name="nucleus_labels",
             labels_models_kwargs=labels_models_kwargs,
+            cells_zarr=cells_zarr,
         )
     if cells_labels:
+        assert cells_zarr is not None
         labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
             path=path,
             specs=specs,
             mask_index=1,
             labels_name="cell_labels",
             labels_models_kwargs=labels_models_kwargs,
+            cells_zarr=cells_zarr,
         )
         if cell_labels_indices_mapping is not None and table is not None:
             if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
@@ -459,16 +475,14 @@ def _get_labels_and_indices_mapping(
     specs: dict[str, Any],
     mask_index: int,
     labels_name: str,
+    cells_zarr: zarr.Group,
     labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ) -> tuple[GeoDataFrame, pd.DataFrame | None]:
     if mask_index not in [0, 1]:
         raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")
 
-    zip_file = path / XeniumKeys.CELLS_ZARR
-    store = zarr.storage.ZipStore(zip_file, read_only=True)
-    z = zarr.open(store, mode="r")
     # get the labels
-    masks = da.from_array(z["masks"][f"{mask_index}"])
+    masks = da.from_array(cells_zarr["masks"][f"{mask_index}"])
     labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs)
 
     # build the matching table
@@ -481,11 +495,11 @@ def _get_labels_and_indices_mapping(
         # supported in versions < 1.3.0
         return labels, None
 
-    cell_id, dataset_suffix = z["cell_id"][...].T
+    cell_id, dataset_suffix = cells_zarr["cell_id"][...].T
     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
 
     if version < packaging.version.parse("2.0.0"):
-        label_index = z["seg_mask_value"][...]
+        label_index = cells_zarr["seg_mask_value"][...]
     else:
         # For v >= 2.0.0, seg_mask_value is no longer available in the zarr;
         # read label_id from the corresponding parquet boundary file instead
@@ -522,8 +536,7 @@ def _get_labels_and_indices_mapping(
 
 @inject_docs(xx=XeniumKeys)
 def _get_cells_metadata_table_from_zarr(
-    path: Path,
-    file: str,
+    cells_zarr: zarr.Group,
     specs: dict[str, Any],
 ) -> AnnData:
     """Read cells metadata from ``{xx.CELLS_ZARR}``.
@@ -531,17 +544,11 @@ def _get_cells_metadata_table_from_zarr(
     Read the cells summary table, which contains the z_level information for versions < 2.0.0, and also the
     nucleus_count for versions >= 2.0.0.
     """
-    # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
-    zip_file = path / XeniumKeys.CELLS_ZARR
-    store = zarr.storage.ZipStore(zip_file, read_only=True)
-
-    z = zarr.open(store, mode="r")
-    x = z["cell_summary"][...]
-    column_names = z["cell_summary"].attrs["column_names"]
+    x = cells_zarr["cell_summary"][...]
+    column_names = cells_zarr["cell_summary"].attrs["column_names"]
     df = pd.DataFrame(x, columns=column_names)
-    cell_id_prefix = z["cell_id"][:, 0]
-    dataset_suffix = z["cell_id"][:, 1]
-    store.close()
+    cell_id_prefix = cells_zarr["cell_id"][:, 0]
+    dataset_suffix = cells_zarr["cell_id"][:, 1]
 
     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
     df[XeniumKeys.CELL_ID] = cell_id_str
@@ -850,13 +857,18 @@ def _parse_version_of_xenium_analyzer(
         return None
 
 
-def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
-    # explained here:
-    # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
+def _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
+    """Reference implementation of cell_id_str_from_prefix_suffix_uint32.
+
+    Readable but slow for large arrays due to Python-level string operations.
+    Kept as ground truth for testing the optimized version.
+
+    See https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
+    """
     # convert to hex, remove the 0x prefix
     cell_id_prefix_hex = [hex(x)[2:] for x in cell_id_prefix]
 
-    # shift the hex values
+    # shift the hex values: '0'->'a', ..., '9'->'j', 'a'->'k', ..., 'f'->'p'
     hex_shift = {str(i): chr(ord("a") + i) for i in range(10)} | {
         chr(ord("a") + i): chr(ord("a") + 10 + i) for i in range(6)
     }
@@ -870,6 +882,31 @@ def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suf
     return np.array(cell_id_str)
 
 
+def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
+    """Convert cell ID prefix/suffix uint32 pairs to the Xenium string representation.
+
+    Each uint32 prefix is converted to 8 hex nibbles, each mapped to a character
+    (0->'a', 1->'b', ..., 15->'p'), then joined with "-{suffix}".
+
+    See https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
+    """
+    cell_id_prefix = np.asarray(cell_id_prefix, dtype=np.uint32)
+    dataset_suffix = np.asarray(dataset_suffix)
+
+    # Extract 8 hex nibbles (4 bits each) from each uint32, most significant first.
+    # Each nibble maps to a character: 0->'a', 1->'b', ..., 9->'j', 10->'k', ..., 15->'p'.
+    # Leading zero nibbles become 'a', equivalent to rjust(8, 'a') padding.
+    shifts = np.array([28, 24, 20, 16, 12, 8, 4, 0], dtype=np.uint32)
+    nibbles = (cell_id_prefix[:, np.newaxis] >> shifts) & 0xF
+    char_codes = (nibbles + ord("a")).astype(np.uint8)
+
+    # View the (n, 8) uint8 array as n byte-strings of length 8
+    prefix_strs = char_codes.view("S8").ravel().astype("U8")
+
+    suffix_strs = np.char.add("-", dataset_suffix.astype("U"))
+    return np.char.add(prefix_strs, suffix_strs)
+
+
 def prefix_suffix_uint32_from_cell_id_str(
     cell_id_str: ArrayLike,
 ) -> tuple[ArrayLike, ArrayLike]:
diff --git a/tests/test_xenium.py b/tests/test_xenium.py
index b6bff790..20687ab0 100644
--- a/tests/test_xenium.py
+++ b/tests/test_xenium.py
@@ -10,6 +10,7 @@
 
 from spatialdata_io.__main__ import xenium_wrapper
 from spatialdata_io.readers.xenium import (
+    _cell_id_str_from_prefix_suffix_uint32_reference,
     cell_id_str_from_prefix_suffix_uint32,
     prefix_suffix_uint32_from_cell_id_str,
     xenium,
@@ -20,9 +21,22 @@
 def test_cell_id_str_from_prefix_suffix_uint32() -> None:
     cell_id_prefix = np.array([1, 1437536272, 1437536273], dtype=np.uint32)
     dataset_suffix = np.array([1, 1, 2])
+    expected = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])
 
-    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
-    assert np.array_equal(cell_id_str, np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"]))
+    result = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
+    reference = _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix, dataset_suffix)
+    assert np.array_equal(result, expected)
+    assert np.array_equal(reference, expected)
+
+
+def test_cell_id_str_optimized_matches_reference() -> None:
+    rng = np.random.default_rng(42)
+    cell_id_prefix = rng.integers(0, 2**32, size=10_000, dtype=np.uint32)
+    dataset_suffix = rng.integers(0, 10, size=10_000)
+
+    result = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
+    reference = _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix, dataset_suffix)
+    assert np.array_equal(result, reference)
 
 
 def test_prefix_suffix_uint32_from_cell_id_str() -> None:

From c86aab59e67dcceaef8018148367024855651a98 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Wed, 11 Feb 2026 20:14:12 +0100
Subject: [PATCH 2/3] remove unnecessary cast to str for df index

---
 src/spatialdata_io/readers/xenium.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index e6938f17..af521a9d 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -202,7 +202,7 @@ def xenium(
     if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
         assert cells_zarr is not None
         cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs)
-        if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
+        if not np.array_equal(cell_summary_table[XeniumKeys.CELL_ID].values, table.obs[XeniumKeys.CELL_ID].values):
             warnings.warn(
                 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
                 " table. This could be due to trying to read a new version that is not supported yet. Please "
@@ -210,8 +210,8 @@ def xenium(
                 UserWarning,
                 stacklevel=2,
             )
-        table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
-        table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
+        table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL].values
+        table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT].values
 
     polygons = {}
     labels = {}
@@ -246,7 +246,9 @@ def xenium(
             cells_zarr=cells_zarr,
         )
         if cell_labels_indices_mapping is not None and table is not None:
-            if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
+            if not np.array_equal(
+                cell_labels_indices_mapping["cell_id"].values, table.obs[str(XeniumKeys.CELL_ID)].values
+            ):
                 warnings.warn(
                     "The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
                     "cell labels data. This could be due to trying to read a new version that is not supported yet. "
@@ -255,7 +257,7 @@ def xenium(
                     stacklevel=2,
                 )
             else:
-                table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
+                table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"].values
                 if not cells_as_circles:
                     table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
 
@@ -529,8 +531,6 @@ def _get_labels_and_indices_mapping(
             "label_index": label_index.astype(np.int64),
         }
     )
-    # because AnnData converts the indices to str
-    indices_mapping.index = indices_mapping.index.astype(str)
     return labels, indices_mapping
 
 
@@ -552,8 +552,6 @@ def _get_cells_metadata_table_from_zarr(
 
     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
     df[XeniumKeys.CELL_ID] = cell_id_str
-    # because AnnData converts the indices to str
-    df.index = df.index.astype(str)
     return df
 
 

From 722fa338f9e515b53ba74b7eada66a5aff8b45e8 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Wed, 11 Feb 2026 20:41:52 +0100
Subject: [PATCH 3/3] avoid anndata index casting; compute xenium cell ids once

---
 src/spatialdata_io/readers/xenium.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index af521a9d..5ead02f0 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -199,9 +199,17 @@ def xenium(
         cells_zarr_store = zarr.storage.ZipStore(path / XeniumKeys.CELLS_ZARR, read_only=True)
         cells_zarr = zarr.open(cells_zarr_store, mode="r")
 
+    # pre-compute cell_id strings from the zarr once, to avoid redundant conversion
+    # in both _get_cells_metadata_table_from_zarr and _get_labels_and_indices_mapping.
+    cells_zarr_cell_id_str: np.ndarray | None = None
+    if cells_zarr is not None:
+        cell_id_raw = cells_zarr["cell_id"][...]
+        cell_id_prefix, dataset_suffix = cell_id_raw[:, 0], cell_id_raw[:, 1]
+        cells_zarr_cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
+
     if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
         assert cells_zarr is not None
-        cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs)
+        cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs, cells_zarr_cell_id_str)
         if not np.array_equal(cell_summary_table[XeniumKeys.CELL_ID].values, table.obs[XeniumKeys.CELL_ID].values):
             warnings.warn(
                 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
@@ -226,7 +234,6 @@ def xenium(
     # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
     # labels.
     if nucleus_labels:
-        assert cells_zarr is not None
         labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
             path=path,
             specs=specs,
@@ -234,9 +241,9 @@ def xenium(
             labels_name="nucleus_labels",
             labels_models_kwargs=labels_models_kwargs,
             cells_zarr=cells_zarr,
+            cell_id_str=cells_zarr_cell_id_str,
         )
     if cells_labels:
-        assert cells_zarr is not None
         labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
             path=path,
             specs=specs,
@@ -244,6 +251,7 @@ def xenium(
             labels_name="cell_labels",
             labels_models_kwargs=labels_models_kwargs,
             cells_zarr=cells_zarr,
+            cell_id_str=cells_zarr_cell_id_str,
         )
         if cell_labels_indices_mapping is not None and table is not None:
             if not np.array_equal(
@@ -478,6 +486,7 @@ def _get_labels_and_indices_mapping(
     mask_index: int,
     labels_name: str,
     cells_zarr: zarr.Group,
+    cell_id_str: ArrayLike,
     labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ) -> tuple[GeoDataFrame, pd.DataFrame | None]:
     if mask_index not in [0, 1]:
@@ -497,9 +506,6 @@ def _get_labels_and_indices_mapping(
         # supported in versions < 1.3.0
         return labels, None
 
-    cell_id, dataset_suffix = cells_zarr["cell_id"][...].T
-    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
-
     if version < packaging.version.parse("2.0.0"):
         label_index = cells_zarr["seg_mask_value"][...]
     else:
@@ -538,6 +544,7 @@ def _get_labels_and_indices_mapping(
 def _get_cells_metadata_table_from_zarr(
     cells_zarr: zarr.Group,
     specs: dict[str, Any],
+    cell_id_str: ArrayLike,
 ) -> AnnData:
     """Read cells metadata from ``{xx.CELLS_ZARR}``.
 
@@ -547,10 +554,7 @@ def _get_cells_metadata_table_from_zarr(
     x = cells_zarr["cell_summary"][...]
     column_names = cells_zarr["cell_summary"].attrs["column_names"]
     df = pd.DataFrame(x, columns=column_names)
-    cell_id_prefix = cells_zarr["cell_id"][:, 0]
-    dataset_suffix = cells_zarr["cell_id"][:, 1]
 
-    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
     df[XeniumKeys.CELL_ID] = cell_id_str
     return df
 
@@ -601,6 +605,8 @@ def _get_tables_and_circles(
     circ = metadata[[XeniumKeys.CELL_X, XeniumKeys.CELL_Y]].to_numpy()
     adata.obsm["spatial"] = circ
     metadata.drop([XeniumKeys.CELL_X, XeniumKeys.CELL_Y], axis=1, inplace=True)
+    # avoids anndata's ImplicitModificationWarning
+    metadata.index = adata.obs_names
     adata.obs = metadata
     adata.obs["region"] = specs["region"]
     adata.obs["region"] = adata.obs["region"].astype("category")