111 changes: 76 additions & 35 deletions src/spatialdata_io/readers/xenium.py
@@ -188,18 +188,38 @@ def xenium(
else:
table = None

# open cells.zarr.zip once and reuse across all functions that need it
cells_zarr: zarr.Group | None = None
need_cells_zarr = (
nucleus_labels
or cells_labels
or (version is not None and version >= packaging.version.parse("2.0.0") and table is not None)
)
if need_cells_zarr:
cells_zarr_store = zarr.storage.ZipStore(path / XeniumKeys.CELLS_ZARR, read_only=True)
cells_zarr = zarr.open(cells_zarr_store, mode="r")

# pre-compute cell_id strings from the zarr once, to avoid redundant conversion
# in both _get_cells_metadata_table_from_zarr and _get_labels_and_indices_mapping.
cells_zarr_cell_id_str: np.ndarray | None = None
if cells_zarr is not None:
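# the "cell_id" dataset is an (n_cells, 2) uint32 array: column 0 holds the cell id
# prefix and column 1 the dataset suffix, as consumed by cell_id_str_from_prefix_suffix_uint32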
cell_id_raw = cells_zarr["cell_id"][...]
cell_id_prefix, dataset_suffix = cell_id_raw[:, 0], cell_id_raw[:, 1]
cells_zarr_cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)

if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
assert cells_zarr is not None
cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr, specs, cells_zarr_cell_id_str)
if not np.array_equal(cell_summary_table[XeniumKeys.CELL_ID].values, table.obs[XeniumKeys.CELL_ID].values):
warnings.warn(
'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
" table. This could be due to trying to read a new version that is not supported yet. Please "
"report this issue.",
UserWarning,
stacklevel=2,
)
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL].values
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT].values

polygons = {}
labels = {}
@@ -220,6 +240,8 @@ def xenium(
mask_index=0,
labels_name="nucleus_labels",
labels_models_kwargs=labels_models_kwargs,
cells_zarr=cells_zarr,
cell_id_str=cells_zarr_cell_id_str,
)
if cells_labels:
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
@@ -228,9 +250,13 @@
mask_index=1,
labels_name="cell_labels",
labels_models_kwargs=labels_models_kwargs,
cells_zarr=cells_zarr,
cell_id_str=cells_zarr_cell_id_str,
)
if cell_labels_indices_mapping is not None and table is not None:
if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
if not np.array_equal(
cell_labels_indices_mapping["cell_id"].values, table.obs[str(XeniumKeys.CELL_ID)].values
):
warnings.warn(
"The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
"cell labels data. This could be due to trying to read a new version that is not supported yet. "
@@ -239,7 +265,7 @@
stacklevel=2,
)
else:
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"].values
if not cells_as_circles:
table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"

@@ -459,16 +485,15 @@ def _get_labels_and_indices_mapping(
specs: dict[str, Any],
mask_index: int,
labels_name: str,
cells_zarr: zarr.Group,
cell_id_str: ArrayLike,
labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
) -> tuple[GeoDataFrame, pd.DataFrame | None]:
if mask_index not in [0, 1]:
raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")

zip_file = path / XeniumKeys.CELLS_ZARR
store = zarr.storage.ZipStore(zip_file, read_only=True)
z = zarr.open(store, mode="r")
# get the labels
masks = da.from_array(z["masks"][f"{mask_index}"])
masks = da.from_array(cells_zarr["masks"][f"{mask_index}"])
labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs)

# build the matching table
@@ -481,11 +506,8 @@ def _get_labels_and_indices_mapping(
# supported in versions < 1.3.0
return labels, None

cell_id, dataset_suffix = z["cell_id"][...].T
cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)

if version < packaging.version.parse("2.0.0"):
label_index = z["seg_mask_value"][...]
label_index = cells_zarr["seg_mask_value"][...]
else:
# For v >= 2.0.0, seg_mask_value is no longer available in the zarr;
# read label_id from the corresponding parquet boundary file instead
@@ -515,38 +537,25 @@ def _get_labels_and_indices_mapping(
"label_index": label_index.astype(np.int64),
}
)
# because AnnData converts the indices to str
indices_mapping.index = indices_mapping.index.astype(str)
return labels, indices_mapping


@inject_docs(xx=XeniumKeys)
def _get_cells_metadata_table_from_zarr(
path: Path,
file: str,
cells_zarr: zarr.Group,
specs: dict[str, Any],
cell_id_str: ArrayLike,
) -> AnnData:
"""Read cells metadata from ``{xx.CELLS_ZARR}``.

Read the cells summary table, which contains the z_level information for versions < 2.0.0, and also the
nucleus_count for versions >= 2.0.0.
"""
# for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
zip_file = path / XeniumKeys.CELLS_ZARR
store = zarr.storage.ZipStore(zip_file, read_only=True)

z = zarr.open(store, mode="r")
x = z["cell_summary"][...]
column_names = z["cell_summary"].attrs["column_names"]
x = cells_zarr["cell_summary"][...]
column_names = cells_zarr["cell_summary"].attrs["column_names"]
df = pd.DataFrame(x, columns=column_names)
cell_id_prefix = z["cell_id"][:, 0]
dataset_suffix = z["cell_id"][:, 1]
store.close()

cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
df[XeniumKeys.CELL_ID] = cell_id_str
# because AnnData converts the indices to str
df.index = df.index.astype(str)
return df


@@ -596,6 +605,8 @@ def _get_tables_and_circles(
circ = metadata[[XeniumKeys.CELL_X, XeniumKeys.CELL_Y]].to_numpy()
adata.obsm["spatial"] = circ
metadata.drop([XeniumKeys.CELL_X, XeniumKeys.CELL_Y], axis=1, inplace=True)
# avoids anndata's ImplicitModificationWarning
metadata.index = adata.obs_names
adata.obs = metadata
adata.obs["region"] = specs["region"]
adata.obs["region"] = adata.obs["region"].astype("category")
@@ -850,13 +861,18 @@ def _parse_version_of_xenium_analyzer(
return None


def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
# explained here:
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
def _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
"""Reference implementation of cell_id_str_from_prefix_suffix_uint32.

Readable but slow for large arrays due to Python-level string operations.
Kept as ground truth for testing the optimized version.

See https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
"""
# convert to hex, remove the 0x prefix
cell_id_prefix_hex = [hex(x)[2:] for x in cell_id_prefix]

# shift the hex values
# shift the hex values: '0'->'a', ..., '9'->'j', 'a'->'k', ..., 'f'->'p'
hex_shift = {str(i): chr(ord("a") + i) for i in range(10)} | {
chr(ord("a") + i): chr(ord("a") + 10 + i) for i in range(6)
}
@@ -870,6 +886,31 @@ def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suf
return np.array(cell_id_str)


def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
"""Convert cell ID prefix/suffix uint32 pairs to the Xenium string representation.

Each uint32 prefix is converted to 8 hex nibbles, each mapped to a character
(0->'a', 1->'b', ..., 15->'p'), then joined with "-{suffix}".
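For example, prefix 1 with suffix 1 maps to "aaaaaaab-1", and prefix 1437536272
with suffix 1 maps to "ffkpbaba-1" (the values exercised in the accompanying tests).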

See https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
"""
cell_id_prefix = np.asarray(cell_id_prefix, dtype=np.uint32)
dataset_suffix = np.asarray(dataset_suffix)

# Extract 8 hex nibbles (4 bits each) from each uint32, most significant first.
# Each nibble maps to a character: 0->'a', 1->'b', ..., 9->'j', 10->'k', ..., 15->'p'.
# Leading zero nibbles become 'a', equivalent to rjust(8, 'a') padding.
shifts = np.array([28, 24, 20, 16, 12, 8, 4, 0], dtype=np.uint32)
nibbles = (cell_id_prefix[:, np.newaxis] >> shifts) & 0xF
char_codes = (nibbles + ord("a")).astype(np.uint8)

# View the (n, 8) uint8 array as n byte-strings of length 8
prefix_strs = char_codes.view("S8").ravel().astype("U8")

suffix_strs = np.char.add("-", dataset_suffix.astype("U"))
return np.char.add(prefix_strs, suffix_strs)


def prefix_suffix_uint32_from_cell_id_str(
cell_id_str: ArrayLike,
) -> tuple[ArrayLike, ArrayLike]:
18 changes: 16 additions & 2 deletions tests/test_xenium.py
@@ -10,6 +10,7 @@

from spatialdata_io.__main__ import xenium_wrapper
from spatialdata_io.readers.xenium import (
_cell_id_str_from_prefix_suffix_uint32_reference,
cell_id_str_from_prefix_suffix_uint32,
prefix_suffix_uint32_from_cell_id_str,
xenium,
@@ -20,9 +21,22 @@
def test_cell_id_str_from_prefix_suffix_uint32() -> None:
cell_id_prefix = np.array([1, 1437536272, 1437536273], dtype=np.uint32)
dataset_suffix = np.array([1, 1, 2])
expected = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])

cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
assert np.array_equal(cell_id_str, np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"]))
result = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
reference = _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix, dataset_suffix)
assert np.array_equal(result, expected)
assert np.array_equal(reference, expected)


def test_cell_id_str_optimized_matches_reference() -> None:
rng = np.random.default_rng(42)
cell_id_prefix = rng.integers(0, 2**32, size=10_000, dtype=np.uint32)
dataset_suffix = rng.integers(0, 10, size=10_000)

result = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
reference = _cell_id_str_from_prefix_suffix_uint32_reference(cell_id_prefix, dataset_suffix)
assert np.array_equal(result, reference)
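# Optional further check (a sketch, not part of this PR): if prefix_suffix_uint32_from_cell_id_str
# is the inverse conversion, the round trip should recover the original arrays:
#     prefix_rt, suffix_rt = prefix_suffix_uint32_from_cell_id_str(result)
#     assert np.array_equal(prefix_rt, cell_id_prefix)
#     assert np.array_equal(suffix_rt, dataset_suffix)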


def test_prefix_suffix_uint32_from_cell_id_str() -> None: