[go: nahoru, domu]

Skip to content

Commit

Permalink
xenium: linked the cell labels to the table; added an indices helper function
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaMarconato committed Mar 24, 2024
1 parent f7718c5 commit b043d12
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 25 deletions.
172 changes: 149 additions & 23 deletions src/spatialdata_io/readers/xenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from shapely import Polygon
from spatial_image import SpatialImage
from spatialdata import SpatialData
from spatialdata._core.query.relational_query import _get_unique_label_values_as_index
from spatialdata._types import ArrayLike
from spatialdata.models import (
Image2DModel,
Expand Down Expand Up @@ -91,8 +92,8 @@ def xenium(
cells_as_circles
Whether to read cells also as circles. Useful for performant visualization. The radii of the nuclei,
not the ones of cells, will be used; using the radii of cells would make the visualization too cluttered
as the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
corresponding circles would present considerable overlap.
(the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
corresponding circles would show considerable overlap).
cell_boundaries
Whether to read cell boundaries (polygons).
nucleus_boundaries
Expand Down Expand Up @@ -148,31 +149,58 @@ def xenium(

if version >= packaging.version.parse("2.0.0"):
cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
assert cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID])
if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
warnings.warn(
'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
" table. This could be due to trying to read a new version that is not supported yet. Please "
"report this issue.",
UserWarning,
stacklevel=2,
)
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]

polygons = {}
labels = {}
tables = {}
points = {}
images = {}

# From the public release notes here:
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
# we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
# This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
# nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
# labels.
if nucleus_labels:
labels["nucleus_labels"] = _get_labels(
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
path,
XeniumKeys.CELLS_ZARR,
specs,
mask_index=0,
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
labels_name="nucleus_labels",
labels_models_kwargs=labels_models_kwargs,
)
if cell_labels:
labels["cell_labels"] = _get_labels(
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
path,
XeniumKeys.CELLS_ZARR,
specs,
mask_index=1,
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
labels_name="cell_labels",
labels_models_kwargs=labels_models_kwargs,
)
if cell_labels_indices_mapping is not None:
if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
warnings.warn(
"The cell_id column in the cell_labels_table does not match the cell_id column derived from the cell "
"labels data. This could be due to trying to read a new version that is not supported yet. Please "
"report this issue.",
UserWarning,
stacklevel=2,
)
else:
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]

if nucleus_boundaries:
polygons["nucleus_boundaries"] = _get_polygons(
Expand All @@ -192,11 +220,9 @@ def xenium(
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
)

points = {}
if transcripts:
points["transcripts"] = _get_points(path, specs)

images = {}
if version < packaging.version.parse("2.0.0"):
if morphology_mip:
images["morphology_mip"] = _get_images(
Expand Down Expand Up @@ -268,7 +294,9 @@ def filter(self, record: logging.LogRecord) -> bool:
del image_models_kwargs["c_coords"]
logger.removeFilter(IgnoreSpecificMessage())

elements_dict = {"images": images, "labels": labels, "points": points, "table": table, "shapes": polygons}
tables["table"] = table

elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons}
if cells_as_circles:
elements_dict["shapes"][specs["region"]] = circles
sdata = SpatialData(**elements_dict)
Expand Down Expand Up @@ -316,32 +344,100 @@ def _poly(arr: ArrayLike) -> Polygon:
if not np.unique(geo_df.index).size == len(geo_df):
warnings.warn(
"Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
"time being please consider merging non-unique polygons into single multi-polygons.",
"time being please consider merging polygons with non-unique indices into single multi-polygons.",
UserWarning,
stacklevel=2,
)
scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
return ShapesModel.parse(geo_df, transformations={"global": scale})


def _get_labels_and_indices_mapping(
    path: Path,
    file: str,
    specs: dict[str, Any],
    mask_index: int,
    labels_name: str,
    labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
) -> tuple[GeoDataFrame, pd.DataFrame | None]:
    """Read a segmentation mask from the cells Zarr archive, together with the label-to-cell-id mapping.

    Parameters
    ----------
    path
        Path to the Xenium dataset directory.
    file
        Name of the zipped Zarr file containing the segmentation masks (e.g. ``cells.zarr.zip``).
    specs
        Parsed experiment specs; used to determine the Xenium Analyzer version.
    mask_index
        Which mask to read: 0 for nuclei, 1 for cells.
    labels_name
        Name of the labels element; stored in the ``region`` column of the returned mapping.
    labels_models_kwargs
        Extra keyword arguments forwarded to ``Labels2DModel.parse``.

    Returns
    -------
    The parsed labels element and a dataframe with columns ``region``, ``cell_id`` and ``label_index``
    mapping each cell id to its label value, or ``None`` when the mapping is not available
    (Xenium Analyzer < 1.3.0, or nucleus masks).

    Raises
    ------
    ValueError
        If ``mask_index`` is not 0 or 1, or if the label values found in the labels data are
        inconsistent with the ones declared in the Zarr metadata.
    """
    if mask_index not in [0, 1]:
        raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")

    with tempfile.TemporaryDirectory() as tmpdir:
        # bugfix: use the `file` argument instead of hard-coding XeniumKeys.CELLS_ZARR; existing
        # callers pass XeniumKeys.CELLS_ZARR for `file`, so behavior is unchanged for them
        zip_file = path / file
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        with zarr.open(str(tmpdir), mode="r") as z:
            # get the labels
            masks = z["masks"][f"{mask_index}"][...]
            labels = Labels2DModel.parse(
                masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs
            )

            # build the matching table (version is parsed once; the original code re-parsed it
            # redundantly inside the else branch)
            version = _parse_version_of_xenium_analyzer(specs)
            if version is not None and version < packaging.version.parse("1.3.0"):
                # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's
                # assume it is not supported in versions < 1.3.0
                indices_mapping = None
            elif mask_index == 0:
                # nuclei currently not supported
                indices_mapping = None
            elif version is None or version < packaging.version.parse("2.0.0"):
                # the expected label values are stored explicitly in the Zarr metadata; validate the
                # labels data against them before building the mapping
                expected_label_index = z["seg_mask_value"][...]
                real_label_index = _skip_background_label(_get_unique_label_values_as_index(labels).values)
                if not np.array_equal(expected_label_index, real_label_index):
                    raise ValueError(
                        "The label indices from the labels differ from the ones from the input data. Please report "
                        f"this issue. Real label indices: {real_label_index}, expected label indices: "
                        f"{expected_label_index}."
                    )
                indices_mapping = _indices_mapping_from_zarr(z, labels_name, real_label_index)
            else:
                # version >= 2.0.0: the labels are expected to be stored in positional order
                labels_positional_indices = z["polygon_sets"][mask_index]["cell_index"][...]
                if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
                    raise ValueError(
                        "The positional indices of the labels do not match the expected range. Please report this "
                        "issue."
                    )
                # this information will probably be available in the `label_id` column (see public release
                # notes mentioned above)
                real_label_index = _skip_background_label(_get_unique_label_values_as_index(labels))
                indices_mapping = _indices_mapping_from_zarr(z, labels_name, real_label_index)
    return labels, indices_mapping


def _skip_background_label(label_index: ArrayLike) -> ArrayLike:
    # the first label value, when 0, encodes the background and has no corresponding cell
    if label_index[0] == 0:
        return label_index[1:]
    return label_index


def _indices_mapping_from_zarr(z: Any, labels_name: str, label_index: ArrayLike) -> pd.DataFrame:
    # read the packed (prefix, dataset suffix) cell ids and convert them to their string representation
    cell_id_prefix, dataset_suffix = z["cell_id"][...].T
    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
    # label_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
    return pd.DataFrame(
        {
            "region": labels_name,
            "cell_id": cell_id_str,
            "label_index": label_index.astype(np.int64),
        }
    )


@inject_docs(xx=XeniumKeys)
def _get_cells_metadata_table_from_zarr(
Expand Down Expand Up @@ -426,10 +522,10 @@ def _get_images(
) -> SpatialImage | MultiscaleSpatialImage:
image = imread(path / file, **imread_kwargs)
if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs["c_coords"]:
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will not
# be merged soon.
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA, let's
# add a dummy channel as a temporary workaround.
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will
# not be merged soon.
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA,
# let's add a dummy channel as a temporary workaround.
image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
return Image2DModel.parse(
image, transformations={"global": Identity()}, dims=("c", "y", "x"), **image_models_kwargs
Expand Down Expand Up @@ -500,7 +596,6 @@ def xenium_aligned_image(
# In fact, it could be that the len(image.shape) == 4 has actually dimes (1, x, y, c) and not (1, y, x, c). This is
# not a problem because the transformation is constructed to be consistent, but if is the case, the data orientation
# would be transposed compared to the original image, not ideal.
# print(image.shape)
if len(image.shape) == 4:
assert image.shape[0] == 1
assert image.shape[-1] == 3
Expand Down Expand Up @@ -532,7 +627,11 @@ def xenium_aligned_image(


def xenium_explorer_selection(path: str | Path, pixel_size: float = 0.2125) -> Polygon:
"""Read the coordinates of a selection `.csv` file exported from the `Xenium Explorer <https://www.10xgenomics.com/support/software/xenium-explorer/latest>`_.
"""Read the coordinates of a selection `.csv` file exported from the `Xenium Explorer`.
.. seealso::
`Xenium Explorer`<https://www.10xgenomics.com/support/software/xenium-explorer/latest>`_.
This file can be generated by the "Freehand Selection" or the "Rectangular Selection".
The output `Polygon` can be used for a polygon query on the pixel coordinate
Expand Down Expand Up @@ -567,7 +666,11 @@ def _parse_version_of_xenium_analyzer(
# Input: xenium-2.0.0.6-35-ga7e17149a
# Output: 2.0.0.6-35

warning_message = f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\nThe reader will continue assuming the latest version of the Xenium Analyzer."
warning_message = (
f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for "
"experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\n"
"The reader will continue assuming the latest version of the Xenium Analyzer."
)

if result is None:
if not hide_warning:
Expand All @@ -584,12 +687,35 @@ def _parse_version_of_xenium_analyzer(


def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
    """Encode (uint32 prefix, dataset suffix) pairs into Xenium cell id strings.

    Explained here:
    https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
    """
    # the prefix is written in hex with a shifted alphabet: digits 0-9 become a-j, hex digits a-f
    # become k-p
    shifted_alphabet = str.maketrans("0123456789abcdef", "abcdefghijklmnop")

    encoded = []
    for prefix, suffix in zip(cell_id_prefix, dataset_suffix):
        # hex(...) produces a "0x"-prefixed string; drop the prefix before shifting
        shifted = hex(prefix)[2:].translate(shifted_alphabet)
        # left-pad to 8 characters with "a" (the shifted "0"), then append the dataset suffix
        encoded.append(shifted.rjust(8, "a") + f"-{suffix}")

    return np.array(encoded)


def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
    """Decode Xenium cell id strings back into (uint32 prefix, dataset suffix) pairs.

    Inverse of :func:`cell_id_str_from_prefix_suffix_uint32`.
    """
    # reverse the shifted-hex alphabet: a-j back to digits 0-9, k-p back to hex digits a-f
    unshifted_alphabet = str.maketrans("abcdefghijklmnop", "0123456789abcdef")

    prefixes = []
    suffixes = []
    for s in cell_id_str:
        shifted_hex, _, suffix = s.partition("-")
        # un-shift and parse as hex (no "0x" prefix needed)
        prefixes.append(int(shifted_hex.translate(unshifted_alphabet), 16))
        suffixes.append(int(suffix))

    return np.array(prefixes, dtype=np.uint32), np.array(suffixes)
27 changes: 25 additions & 2 deletions tests/test_xenium.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
import numpy as np

from spatialdata_io.readers.xenium import cell_id_str_from_prefix_suffix_uint32
from spatialdata_io.readers.xenium import (
cell_id_str_from_prefix_suffix_uint32,
prefix_suffix_uint32_from_cell_id_str,
)


def test_cell_id_str_from_prefix_suffix_uint32() -> None:
    """Known (prefix, suffix) pairs encode to the documented string form."""
    prefixes = np.array([1, 1437536272, 1437536273], dtype=np.uint32)
    suffixes = np.array([1, 1, 2])
    expected = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])

    assert np.array_equal(cell_id_str_from_prefix_suffix_uint32(prefixes, suffixes), expected)


def test_prefix_suffix_uint32_from_cell_id_str() -> None:
    """Known cell id strings decode to the documented (prefix, suffix) pairs."""
    strings = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])

    prefixes, suffixes = prefix_suffix_uint32_from_cell_id_str(strings)

    assert np.array_equal(prefixes, np.array([1, 1437536272, 1437536273], dtype=np.uint32))
    assert np.array_equal(suffixes, np.array([1, 1, 2]))


def test_roundtrip_with_data_limits() -> None:
    """Encoding/decoding round-trips at the uint32 extremes (0 and 2**32 - 1)."""
    prefixes = np.array([0, 4294967295], dtype=np.uint32)
    suffixes = np.array([1, 1])
    strings = np.array(["aaaaaaaa-1", "pppppppp-1"])

    # prefix/suffix -> string -> prefix/suffix
    decoded_prefixes, decoded_suffixes = prefix_suffix_uint32_from_cell_id_str(
        cell_id_str_from_prefix_suffix_uint32(prefixes, suffixes)
    )
    assert np.array_equal(prefixes, decoded_prefixes)
    assert np.array_equal(suffixes, decoded_suffixes)

    # string -> prefix/suffix -> string
    assert np.array_equal(
        strings, cell_id_str_from_prefix_suffix_uint32(*prefix_suffix_uint32_from_cell_id_str(strings))
    )

0 comments on commit b043d12

Please sign in to comment.