From 3b34a12c158065835cd27d0003d6c1d409622688 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Feb 2026 19:19:39 +0100 Subject: [PATCH 1/3] xenium cache zip store opening; vectorize xenium cell id encoding via bit shift --- src/spatialdata_io/readers/xenium.py | 83 ++++++++++++++++++++-------- tests/test_xenium.py | 18 +++++- 2 files changed, 76 insertions(+), 25 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 152e644e..e6938f17 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -188,8 +188,20 @@ def xenium( else: table = None + # open cells.zarr.zip once and reuse across all functions that need it + cells_zarr: zarr.Group | None = None + need_cells_zarr = ( + nucleus_labels + or cells_labels + or (version is not None and version >= packaging.version.parse("2.0.0") and table is not None) + ) + if need_cells_zarr: + cells_zarr_store = zarr.storage.ZipStore(path / XeniumKeys.CELLS_ZARR, read_only=True) + cells_zarr = zarr.open(cells_zarr_store, mode="r") + if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: - cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs) + assert cells_zarr is not None + cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs) if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): warnings.warn( 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' @@ -214,20 +226,24 @@ def xenium( # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus # labels. if nucleus_labels: + assert cells_zarr is not None labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( path=path, specs=specs, mask_index=0, labels_name="nucleus_labels", labels_models_kwargs=labels_models_kwargs, + cells_zarr=cells_zarr, ) if cells_labels: + assert cells_zarr is not None labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( path=path, specs=specs, mask_index=1, labels_name="cell_labels", labels_models_kwargs=labels_models_kwargs, + cells_zarr=cells_zarr, ) if cell_labels_indices_mapping is not None and table is not None: if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): @@ -459,16 +475,14 @@ def _get_labels_and_indices_mapping( specs: dict[str, Any], mask_index: int, labels_name: str, + cells_zarr: zarr.Group, labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> tuple[GeoDataFrame, pd.DataFrame | None]: if mask_index not in [0, 1]: raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.") - zip_file = path / XeniumKeys.CELLS_ZARR - store = zarr.storage.ZipStore(zip_file, read_only=True) - z = zarr.open(store, mode="r") # get the labels - masks = da.from_array(z["masks"][f"{mask_index}"]) + masks = da.from_array(cells_zarr["masks"][f"{mask_index}"]) labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs) # build the matching table @@ -481,11 +495,11 @@ def _get_labels_and_indices_mapping( # supported in versions < 1.3.0 return labels, None - cell_id, dataset_suffix = z["cell_id"][...].T + cell_id, dataset_suffix = cells_zarr["cell_id"][...].T cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) if version < packaging.version.parse("2.0.0"): - label_index = z["seg_mask_value"][...] + label_index = cells_zarr["seg_mask_value"][...] else: # For v >= 2.0.0, seg_mask_value is no longer available in the zarr; # read label_id from the corresponding parquet boundary file instead @@ -522,8 +536,7 @@ def _get_labels_and_indices_mapping( @inject_docs(xx=XeniumKeys) def _get_cells_metadata_table_from_zarr( - path: Path, - file: str, + cells_zarr: zarr.Group, specs: dict[str, Any], ) -> AnnData: """Read cells metadata from ``{xx.CELLS_ZARR}``. @@ -531,17 +544,11 @@ def _get_cells_metadata_table_from_zarr( Read the cells summary table, which contains the z_level information for versions < 2.0.0, and also the nucleus_count for versions >= 2.0.0. """ - # for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks - zip_file = path / XeniumKeys.CELLS_ZARR - store = zarr.storage.ZipStore(zip_file, read_only=True) - - z = zarr.open(store, mode="r") - x = z["cell_summary"][...] - column_names = z["cell_summary"].attrs["column_names"] + x = cells_zarr["cell_summary"][...] + column_names = cells_zarr["cell_summary"].attrs["column_names"] df = pd.DataFrame(x, columns=column_names) - cell_id_prefix = z["cell_id"][:, 0] - dataset_suffix = z["cell_id"][:, 1] - store.close() + cell_id_prefix = cells_zarr["cell_id"][:, 0] + dataset_suffix = cells_zarr["cell_id"][:, 1] cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) df[XeniumKeys.CELL_ID] = cell_id_str @@ -850,13 +857,18 @@ def _parse_version_of_xenium_analyzer( return None -def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike: - # explained here: - # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID +def _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike: + """Reference implementation of cell_id_str_from_prefix_suffix_uint32. + + Readable but slow for large arrays due to Python-level string operations. + Kept as ground truth for testing the optimized version. + + See https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID + """ # convert to hex, remove the 0x prefix cell_id_prefix_hex = [hex(x)[2:] for x in cell_id_prefix] - # shift the hex values + # shift the hex values: '0'->'a', ..., '9'->'j', 'a'->'k', ..., 'f'->'p' hex_shift = {str(i): chr(ord("a") + i) for i in range(10)} | { chr(ord("a") + i): chr(ord("a") + 10 + i) for i in range(6) } @@ -870,6 +882,31 @@ def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suf return np.array(cell_id_str) +def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike: + """Convert cell ID prefix/suffix uint32 pairs to the Xenium string representation. + + Each uint32 prefix is converted to 8 hex nibbles, each mapped to a character + (0->'a', 1->'b', ..., 15->'p'), then joined with "-{suffix}". + + See https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID + """ + cell_id_prefix = np.asarray(cell_id_prefix, dtype=np.uint32) + dataset_suffix = np.asarray(dataset_suffix) + + # Extract 8 hex nibbles (4 bits each) from each uint32, most significant first. + # Each nibble maps to a character: 0->'a', 1->'b', ..., 9->'j', 10->'k', ..., 15->'p'. + # Leading zero nibbles become 'a', equivalent to rjust(8, 'a') padding. + shifts = np.array([28, 24, 20, 16, 12, 8, 4, 0], dtype=np.uint32) + nibbles = (cell_id_prefix[:, np.newaxis] >> shifts) & 0xF + char_codes = (nibbles + ord("a")).astype(np.uint8) + + # View the (n, 8) uint8 array as n byte-strings of length 8 + prefix_strs = char_codes.view("S8").ravel().astype("U8") + + suffix_strs = np.char.add("-", dataset_suffix.astype("U")) + return np.char.add(prefix_strs, suffix_strs) + + def prefix_suffix_uint32_from_cell_id_str( cell_id_str: ArrayLike, ) -> tuple[ArrayLike, ArrayLike]: diff --git a/tests/test_xenium.py b/tests/test_xenium.py index b6bff790..20687ab0 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -10,6 +10,7 @@ from spatialdata_io.__main__ import xenium_wrapper from spatialdata_io.readers.xenium import ( + _cell_id_str_from_prefix_suffix_uint32_reference, cell_id_str_from_prefix_suffix_uint32, prefix_suffix_uint32_from_cell_id_str, xenium, @@ -20,9 +21,22 @@ def test_cell_id_str_from_prefix_suffix_uint32() -> None: cell_id_prefix = np.array([1, 1437536272, 1437536273], dtype=np.uint32) dataset_suffix = np.array([1, 1, 2]) + expected = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"]) - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) - assert np.array_equal(cell_id_str, np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])) + result = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) + reference = _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix, dataset_suffix) + assert np.array_equal(result, expected) + assert np.array_equal(reference, expected) + + +def test_cell_id_str_optimized_matches_reference() -> None: + rng = np.random.default_rng(42) + cell_id_prefix = rng.integers(0, 2**32, size=10_000, dtype=np.uint32) + dataset_suffix = rng.integers(0, 10, size=10_000) + + result = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) + reference = _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix, dataset_suffix) + assert np.array_equal(result, reference) def test_prefix_suffix_uint32_from_cell_id_str() -> None: From c86aab59e67dcceaef8018148367024855651a98 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Feb 2026 20:14:12 +0100 Subject: [PATCH 2/3] remove unnecessary cast to str for df index --- src/spatialdata_io/readers/xenium.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index e6938f17..af521a9d 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -202,7 +202,7 @@ def xenium( if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: assert cells_zarr is not None cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs) - if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]): + if not np.array_equal(cell_summary_table[XeniumKeys.CELL_ID].values, table.obs[XeniumKeys.CELL_ID].values): warnings.warn( 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' " table. This could be due to trying to read a new version that is not supported yet. Please " @@ -210,8 +210,8 @@ def xenium( UserWarning, stacklevel=2, ) - table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL] - table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT] + table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL].values + table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT].values polygons = {} labels = {} @@ -246,7 +246,9 @@ def xenium( cells_zarr=cells_zarr, ) if cell_labels_indices_mapping is not None and table is not None: - if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]): + if not np.array_equal( + cell_labels_indices_mapping["cell_id"].values, table.obs[str(XeniumKeys.CELL_ID)].values + ): warnings.warn( "The cell_id column in the cell_labels_table does not match the cell_id column derived from the " "cell labels data. This could be due to trying to read a new version that is not supported yet. " @@ -255,7 +257,7 @@ def xenium( stacklevel=2, ) else: - table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"] + table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"].values if not cells_as_circles: table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" @@ -529,8 +531,6 @@ def _get_labels_and_indices_mapping( "label_index": label_index.astype(np.int64), } ) - # because AnnData converts the indices to str - indices_mapping.index = indices_mapping.index.astype(str) return labels, indices_mapping @@ -552,8 +552,6 @@ def _get_cells_metadata_table_from_zarr( cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) df[XeniumKeys.CELL_ID] = cell_id_str - # because AnnData converts the indices to str - df.index = df.index.astype(str) return df From 722fa338f9e515b53ba74b7eada66a5aff8b45e8 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Wed, 11 Feb 2026 20:41:52 +0100 Subject: [PATCH 3/3] avoid anndata index casting; compute xenium cell ids once --- src/spatialdata_io/readers/xenium.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index af521a9d..5ead02f0 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -199,9 +199,17 @@ def xenium( cells_zarr_store = zarr.storage.ZipStore(path / XeniumKeys.CELLS_ZARR, read_only=True) cells_zarr = zarr.open(cells_zarr_store, mode="r") + # pre-compute cell_id strings from the zarr once, to avoid redundant conversion + # in both _get_cells_metadata_table_from_zarr and _get_labels_and_indices_mapping. + cells_zarr_cell_id_str: np.ndarray | None = None + if cells_zarr is not None: + cell_id_raw = cells_zarr["cell_id"][...] + cell_id_prefix, dataset_suffix = cell_id_raw[:, 0], cell_id_raw[:, 1] + cells_zarr_cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) + if version is not None and version >= packaging.version.parse("2.0.0") and table is not None: assert cells_zarr is not None - cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs) + cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs, cells_zarr_cell_id_str) if not np.array_equal(cell_summary_table[XeniumKeys.CELL_ID].values, table.obs[XeniumKeys.CELL_ID].values): warnings.warn( 'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation' @@ -226,7 +234,6 @@ def xenium( # nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus # labels. if nucleus_labels: - assert cells_zarr is not None labels["nucleus_labels"], _ = _get_labels_and_indices_mapping( path=path, specs=specs, @@ -234,9 +241,9 @@ def xenium( labels_name="nucleus_labels", labels_models_kwargs=labels_models_kwargs, cells_zarr=cells_zarr, + cell_id_str=cells_zarr_cell_id_str, ) if cells_labels: - assert cells_zarr is not None labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping( path=path, specs=specs, @@ -244,6 +251,7 @@ def xenium( labels_name="cell_labels", labels_models_kwargs=labels_models_kwargs, cells_zarr=cells_zarr, + cell_id_str=cells_zarr_cell_id_str, ) if cell_labels_indices_mapping is not None and table is not None: if not np.array_equal( @@ -478,6 +486,7 @@ def _get_labels_and_indices_mapping( mask_index: int, labels_name: str, cells_zarr: zarr.Group, + cell_id_str: ArrayLike, labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> tuple[GeoDataFrame, pd.DataFrame | None]: if mask_index not in [0, 1]: @@ -497,9 +506,6 @@ def _get_labels_and_indices_mapping( # supported in versions < 1.3.0 return labels, None - cell_id, dataset_suffix = cells_zarr["cell_id"][...].T - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix) - if version < packaging.version.parse("2.0.0"): label_index = cells_zarr["seg_mask_value"][...] else: @@ -538,6 +544,7 @@ def _get_labels_and_indices_mapping( def _get_cells_metadata_table_from_zarr( cells_zarr: zarr.Group, specs: dict[str, Any], + cell_id_str: ArrayLike, ) -> AnnData: """Read cells metadata from ``{xx.CELLS_ZARR}``. @@ -547,10 +554,7 @@ def _get_cells_metadata_table_from_zarr( x = cells_zarr["cell_summary"][...] column_names = cells_zarr["cell_summary"].attrs["column_names"] df = pd.DataFrame(x, columns=column_names) - cell_id_prefix = cells_zarr["cell_id"][:, 0] - dataset_suffix = cells_zarr["cell_id"][:, 1] - cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix) df[XeniumKeys.CELL_ID] = cell_id_str return df @@ -601,6 +605,8 @@ def _get_tables_and_circles( circ = metadata[[XeniumKeys.CELL_X, XeniumKeys.CELL_Y]].to_numpy() adata.obsm["spatial"] = circ metadata.drop([XeniumKeys.CELL_X, XeniumKeys.CELL_Y], axis=1, inplace=True) + # avoids anndata's ImplicitModificationWarning + metadata.index = adata.obs_names adata.obs = metadata adata.obs["region"] = specs["region"] adata.obs["region"] = adata.obs["region"].astype("category")