diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 6922ce06..beb6ad9b 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -29,6 +29,7 @@
 from shapely import Polygon
 from spatial_image import SpatialImage
 from spatialdata import SpatialData
+from spatialdata._core.query.relational_query import _get_unique_label_values_as_index
 from spatialdata._types import ArrayLike
 from spatialdata.models import (
     Image2DModel,
@@ -91,8 +92,8 @@ def xenium(
     cells_as_circles
         Whether to read cells also as circles. Useful for performant visualization. The radii of the nuclei, not the
         ones of cells, will be used; using the radii of cells would make the visualization too cluttered
-        as the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
-        corresponding circles would present considerable overlap.
+        (the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
+        corresponding circles would show considerable overlap).
     cell_boundaries
         Whether to read cell boundaries (polygons).
     nucleus_boundaries
@@ -148,31 +149,58 @@ def xenium(

     if version >= packaging.version.parse("2.0.0"):
         cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
-        assert cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID])
+        if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
+            warnings.warn(
+                'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
+                " table. This could be due to trying to read a new version that is not supported yet. Please "
+                "report this issue.",
+                UserWarning,
+                stacklevel=2,
+            )
         table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
         table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]

     polygons = {}
     labels = {}
+    tables = {}
+    points = {}
+    images = {}

+    # From the public release notes here:
+    # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
+    # we see that for distinguishing between the nuclei of polynucleated cells, the `label_id` column is used.
+    # This column is currently not found in the preview data, but I think it is needed in order to unambiguously match
+    # nuclei to cells. Therefore, for the moment, we only link the table to the cell labels, and not to the nucleus
+    # labels.
     if nucleus_labels:
-        labels["nucleus_labels"] = _get_labels(
+        labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
             path,
             XeniumKeys.CELLS_ZARR,
             specs,
             mask_index=0,
-            idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
+            labels_name="nucleus_labels",
             labels_models_kwargs=labels_models_kwargs,
         )
     if cell_labels:
-        labels["cell_labels"] = _get_labels(
+        labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
             path,
             XeniumKeys.CELLS_ZARR,
             specs,
             mask_index=1,
-            idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
+            labels_name="cell_labels",
             labels_models_kwargs=labels_models_kwargs,
         )
+        if cell_labels_indices_mapping is not None:
+            if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
+                warnings.warn(
+                    "The cell_id column in the cell_labels_table does not match the cell_id column derived from the cell "
+                    "labels data. This could be due to trying to read a new version that is not supported yet. Please "
+                    "report this issue.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+            else:
+                table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]

     if nucleus_boundaries:
         polygons["nucleus_boundaries"] = _get_polygons(
@@ -192,11 +220,9 @@ def xenium(
             idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
         )

-    points = {}
     if transcripts:
         points["transcripts"] = _get_points(path, specs)

-    images = {}
     if version < packaging.version.parse("2.0.0"):
         if morphology_mip:
             images["morphology_mip"] = _get_images(
@@ -268,7 +294,9 @@ def filter(self, record: logging.LogRecord) -> bool:
             del image_models_kwargs["c_coords"]
         logger.removeFilter(IgnoreSpecificMessage())

-    elements_dict = {"images": images, "labels": labels, "points": points, "table": table, "shapes": polygons}
+    tables["table"] = table
+
+    elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons}
     if cells_as_circles:
         elements_dict["shapes"][specs["region"]] = circles
     sdata = SpatialData(**elements_dict)
@@ -316,32 +344,100 @@ def _poly(arr: ArrayLike) -> Polygon:
     if not np.unique(geo_df.index).size == len(geo_df):
         warnings.warn(
             "Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
-            "time being please consider merging non-unique polygons into single multi-polygons.",
+            "time being please consider merging polygons with non-unique indices into single multi-polygons.",
+            UserWarning,
             stacklevel=2,
         )

     scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
     return ShapesModel.parse(geo_df, transformations={"global": scale})


-def _get_labels(
+def _get_labels_and_indices_mapping(
     path: Path,
     file: str,
     specs: dict[str, Any],
     mask_index: int,
-    idx: Optional[ArrayLike] = None,
+    labels_name: str,
     labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
-) -> GeoDataFrame:
+) -> tuple[GeoDataFrame, pd.DataFrame | None]:
+    if mask_index not in [0, 1]:
+        raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")
+
     with tempfile.TemporaryDirectory() as tmpdir:
         zip_file = path / XeniumKeys.CELLS_ZARR
         with zipfile.ZipFile(zip_file, "r") as zip_ref:
             zip_ref.extractall(tmpdir)

         with zarr.open(str(tmpdir), mode="r") as z:
+            # get the labels
             masks = z["masks"][f"{mask_index}"][...]
-            return Labels2DModel.parse(
+            labels = Labels2DModel.parse(
                 masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs
             )

+            # build the matching table
+            version = _parse_version_of_xenium_analyzer(specs)
+            if version is not None and version < packaging.version.parse("1.3.0"):
+                # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's assume it is not
+                # supported in versions < 1.3.0
+                indices_mapping = None
+            elif mask_index == 0:
+                # nuclei currently not supported
+                indices_mapping = None
+            else:
+                version = _parse_version_of_xenium_analyzer(specs)
+                if version is None or version < packaging.version.parse("2.0.0"):
+                    expected_label_index = z["seg_mask_value"][...]
+                    real_label_index = _get_unique_label_values_as_index(labels).values
+
+                    # background removal
+                    if real_label_index[0] == 0:
+                        real_label_index = real_label_index[1:]
+
+                    if not np.array_equal(expected_label_index, real_label_index):
+                        raise ValueError(
+                            "The label indices from the labels differ from the ones from the input data. Please report "
+                            f"this issue. Real label indices: {real_label_index}, expected label indices: "
+                            f"{expected_label_index}."
+                        )
+                    cell_id, dataset_suffix = z["cell_id"][...].T
+                    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
+
+                    # label_index is a uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
+                    indices_mapping = pd.DataFrame(
+                        {
+                            "region": labels_name,
+                            "cell_id": cell_id_str,
+                            "label_index": real_label_index.astype(np.int64),
+                        }
+                    )
+                else:
+                    labels_positional_indices = z["polygon_sets"][mask_index]["cell_index"][...]
+                    if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
+                        raise ValueError(
+                            "The positional indices of the labels do not match the expected range. Please report this "
+                            "issue."
+                        )
+                    # this information will probably be available in the `label_id` column (see public release notes
+                    # mentioned above)
+                    real_label_index = _get_unique_label_values_as_index(labels)
+                    cell_id, dataset_suffix = z["cell_id"][...].T
+                    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
+
+                    # background removal
+                    if real_label_index[0] == 0:
+                        real_label_index = real_label_index[1:]
+
+                    # label_index is a uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
+                    indices_mapping = pd.DataFrame(
+                        {
+                            "region": labels_name,
+                            "cell_id": cell_id_str,
+                            "label_index": real_label_index.astype(np.int64),
+                        }
+                    )
+            return labels, indices_mapping
+

 @inject_docs(xx=XeniumKeys)
 def _get_cells_metadata_table_from_zarr(
@@ -426,10 +522,10 @@ def _get_images(
 ) -> SpatialImage | MultiscaleSpatialImage:
     image = imread(path / file, **imread_kwargs)
     if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs["c_coords"]:
-        # Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will not
-        # be merged soon.
-        # Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA, let's
-        # add a dummy channel as a temporary workaround.
+        # Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will
+        # not be merged soon.
+        # Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA,
+        # let's add a dummy channel as a temporary workaround.
         image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
     return Image2DModel.parse(
         image, transformations={"global": Identity()}, dims=("c", "y", "x"), **image_models_kwargs
@@ -500,7 +596,6 @@ def xenium_aligned_image(
     # In fact, it could be that the len(image.shape) == 4 has actually dimes (1, x, y, c) and not (1, y, x, c). This is
     # not a problem because the transformation is constructed to be consistent, but if is the case, the data orientation
     # would be transposed compared to the original image, not ideal.
-    # print(image.shape)
     if len(image.shape) == 4:
         assert image.shape[0] == 1
         assert image.shape[-1] == 3
@@ -532,7 +627,11 @@ def xenium_aligned_image(


 def xenium_explorer_selection(path: str | Path, pixel_size: float = 0.2125) -> Polygon:
-    """Read the coordinates of a selection `.csv` file exported from the `Xenium Explorer `_.
+    """Read the coordinates of a selection `.csv` file exported from the `Xenium Explorer`.
+
+    .. seealso::
+
+        `Xenium Explorer`_.

     This file can be generated by the "Freehand Selection" or the "Rectangular Selection".
     The output `Polygon` can be used for a polygon query on the pixel coordinate
@@ -567,7 +666,11 @@ def _parse_version_of_xenium_analyzer(
     # Input: xenium-2.0.0.6-35-ga7e17149a
     # Output: 2.0.0.6-35

-    warning_message = f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\nThe reader will continue assuming the latest version of the Xenium Analyzer."
+    warning_message = (
+        f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for "
+        "experimental versions of the data. Please report this on GitHub: https://github.com/scverse/spatialdata-io/issues.\n"
+        "The reader will continue assuming the latest version of the Xenium Analyzer."
+    )

     if result is None:
         if not hide_warning:
@@ -584,12 +687,35 @@ def _parse_version_of_xenium_analyzer(


 def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
-    # explained here: https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
+    # explained here:
+    # https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
+
+    # convert to hex, remove the 0x prefix
     cell_id_prefix_hex = [hex(x)[2:] for x in cell_id_prefix]
+
+    # shift the hex values
     hex_shift = {str(i): chr(ord("a") + i) for i in range(10)} | {
         chr(ord("a") + i): chr(ord("a") + 10 + i) for i in range(6)
     }
     cell_id_prefix_hex_shifted = ["".join([hex_shift[c] for c in x]) for x in cell_id_prefix_hex]
+
+    # merge the prefix and the suffix
     cell_id_str = [str(x[0]).rjust(8, "a") + f"-{x[1]}" for x in zip(cell_id_prefix_hex_shifted, dataset_suffix)]
     return np.array(cell_id_str)
+
+
+def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
+    # parse the string into the prefix and suffix
+    cell_id_prefix_str, dataset_suffix = zip(*[x.split("-") for x in cell_id_str])
+    dataset_suffix_int = [int(x) for x in dataset_suffix]
+
+    # reverse the shifted hex conversion
+    hex_unshift = {chr(ord("a") + i): str(i) for i in range(10)} | {
+        chr(ord("a") + 10 + i): chr(ord("a") + i) for i in range(6)
+    }
+    cell_id_prefix_hex = ["".join([hex_unshift[c] for c in x]) for x in cell_id_prefix_str]
+
+    # convert hex back to uint32 (no need to add the 0x prefix)
+    cell_id_prefix = [int(x, 16) for x in cell_id_prefix_hex]
+
+    return np.array(cell_id_prefix, dtype=np.uint32), np.array(dataset_suffix_int)
diff --git a/tests/test_xenium.py b/tests/test_xenium.py
index 10276932..6665eec1 100644
--- a/tests/test_xenium.py
+++ b/tests/test_xenium.py
@@ -1,11 +1,34 @@
 import numpy as np

-from spatialdata_io.readers.xenium import cell_id_str_from_prefix_suffix_uint32
+from spatialdata_io.readers.xenium import (
+    cell_id_str_from_prefix_suffix_uint32,
+    prefix_suffix_uint32_from_cell_id_str,
+)


 def test_cell_id_str_from_prefix_suffix_uint32() -> None:
-    cell_id_prefix = np.array([1, 1437536272, 1437536273])
+    cell_id_prefix = np.array([1, 1437536272, 1437536273], dtype=np.uint32)
     dataset_suffix = np.array([1, 1, 2])

     cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
     assert np.array_equal(cell_id_str, np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"]))
+
+
+def test_prefix_suffix_uint32_from_cell_id_str() -> None:
+    cell_id_str = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])
+
+    cell_id_prefix, dataset_suffix = prefix_suffix_uint32_from_cell_id_str(cell_id_str)
+    assert np.array_equal(cell_id_prefix, np.array([1, 1437536272, 1437536273], dtype=np.uint32))
+    assert np.array_equal(dataset_suffix, np.array([1, 1, 2]))
+
+
+def test_roundtrip_with_data_limits() -> None:
+    # min and max values for uint32
+    cell_id_prefix = np.array([0, 4294967295], dtype=np.uint32)
+    dataset_suffix = np.array([1, 1])
+    cell_id_str = np.array(["aaaaaaaa-1", "pppppppp-1"])
+    f0 = cell_id_str_from_prefix_suffix_uint32
+    f1 = prefix_suffix_uint32_from_cell_id_str
+    assert np.array_equal(cell_id_prefix, f1(f0(cell_id_prefix, dataset_suffix))[0])
+    assert np.array_equal(dataset_suffix, f1(f0(cell_id_prefix, dataset_suffix))[1])
+    assert np.array_equal(cell_id_str, f0(*f1(cell_id_str)))
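For reference, the shifted-hex cell ID scheme used by the two new helpers (hex digits 0-9 map to a-j, hex digits a-f map to k-p, the result is left-padded with "a" to eight characters and joined to the dataset suffix with a dash) can be reproduced with a short standalone sketch. The scalar helpers below, encode_cell_id and decode_cell_id, are illustrative names and not part of the diff; they only mirror the mapping implemented in cell_id_str_from_prefix_suffix_uint32 and prefix_suffix_uint32_from_cell_id_str, and they reuse a value from the tests above.

def encode_cell_id(prefix: int, suffix: int) -> str:
    # hex digits 0-9 become a-j, hex digits a-f become k-p
    shift = {str(i): chr(ord("a") + i) for i in range(10)} | {
        chr(ord("a") + i): chr(ord("a") + 10 + i) for i in range(6)
    }
    shifted = "".join(shift[c] for c in hex(prefix)[2:])
    # left-pad to 8 characters with "a" (i.e. hex 0) and append the dataset suffix
    return shifted.rjust(8, "a") + f"-{suffix}"


def decode_cell_id(cell_id: str) -> tuple[int, int]:
    shifted, suffix = cell_id.split("-")
    # invert the shifted-hex mapping and parse the result as base-16
    unshift = {chr(ord("a") + i): str(i) for i in range(10)} | {
        chr(ord("a") + 10 + i): chr(ord("a") + i) for i in range(6)
    }
    return int("".join(unshift[c] for c in shifted), 16), int(suffix)


# round-trip check with a value from the tests above
assert encode_cell_id(1437536272, 1) == "ffkpbaba-1"
assert decode_cell_id("ffkpbaba-1") == (1437536272, 1)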