[go: nahoru, domu]

Skip to content

Commit

Permalink
xenium: linked the cell labels to the table; added an indices helper function
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaMarconato committed Mar 24, 2024
1 parent f7718c5 commit b043d12
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 25 deletions.
172 changes: 149 additions & 23 deletions src/spatialdata_io/readers/xenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from shapely import Polygon
from spatial_image import SpatialImage
from spatialdata import SpatialData
from spatialdata._core.query.relational_query import _get_unique_label_values_as_index
from spatialdata._types import ArrayLike
from spatialdata.models import (
Image2DModel,
Expand Down Expand Up @@ -91,8 +92,8 @@ def xenium(
cells_as_circles
Whether to read cells also as circles. Useful for performant visualization. The radii of the nuclei,
not the ones of cells, will be used; using the radii of cells would make the visualization too cluttered
as the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
corresponding circles would present considerable overlap.
(the cell boundaries are computed as a maximum expansion of the nuclei location and therefore the
corresponding circles would show considerable overlap).
cell_boundaries
Whether to read cell boundaries (polygons).
nucleus_boundaries
Expand Down Expand Up @@ -148,31 +149,58 @@ def xenium(

if version >= packaging.version.parse("2.0.0"):
cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
assert cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID])
if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
warnings.warn(
'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
" table. This could be due to trying to read a new version that is not supported yet. Please "
"report this issue.",
UserWarning,
stacklevel=2,
)
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]

polygons = {}
labels = {}
tables = {}
points = {}
images = {}

# From the public release notes here:
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
# we see that for distinguishing between the nuclei of polinucleated cells, the `label_id` column is used.
# This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
# nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
# labels.
if nucleus_labels:
labels["nucleus_labels"] = _get_labels(
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
path,
XeniumKeys.CELLS_ZARR,
specs,
mask_index=0,
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
labels_name="nucleus_labels",
labels_models_kwargs=labels_models_kwargs,
)
if cell_labels:
labels["cell_labels"] = _get_labels(
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
path,
XeniumKeys.CELLS_ZARR,
specs,
mask_index=1,
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
labels_name="cell_labels",
labels_models_kwargs=labels_models_kwargs,
)
if cell_labels_indices_mapping is not None:
if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
warnings.warn(
"The cell_id column in the cell_labels_table does not match the cell_id column derived from the cell "
"labels data. This could be due to trying to read a new version that is not supported yet. Please "
"report this issue.",
UserWarning,
stacklevel=2,
)
else:
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]

if nucleus_boundaries:
polygons["nucleus_boundaries"] = _get_polygons(
Expand All @@ -192,11 +220,9 @@ def xenium(
idx=table.obs[str(XeniumKeys.CELL_ID)].copy(),
)

points = {}
if transcripts:
points["transcripts"] = _get_points(path, specs)

images = {}
if version < packaging.version.parse("2.0.0"):
if morphology_mip:
images["morphology_mip"] = _get_images(
Expand Down Expand Up @@ -268,7 +294,9 @@ def filter(self, record: logging.LogRecord) -> bool:
del image_models_kwargs["c_coords"]
logger.removeFilter(IgnoreSpecificMessage())

elements_dict = {"images": images, "labels": labels, "points": points, "table": table, "shapes": polygons}
tables["table"] = table

elements_dict = {"images": images, "labels": labels, "points": points, "tables": tables, "shapes": polygons}
if cells_as_circles:
elements_dict["shapes"][specs["region"]] = circles
sdata = SpatialData(**elements_dict)
Expand Down Expand Up @@ -316,32 +344,100 @@ def _poly(arr: ArrayLike) -> Polygon:
if not np.unique(geo_df.index).size == len(geo_df):
warnings.warn(
"Found non-unique polygon indices, this will be addressed in a future version of the reader. For the "
"time being please consider merging non-unique polygons into single multi-polygons.",
"time being please consider merging polygons with non-unique indices into single multi-polygons.",
UserWarning,
stacklevel=2,
)
scale = Scale([1.0 / specs["pixel_size"], 1.0 / specs["pixel_size"]], axes=("x", "y"))
return ShapesModel.parse(geo_df, transformations={"global": scale})


def _get_labels_and_indices_mapping(
    path: Path,
    file: str,
    specs: dict[str, Any],
    mask_index: int,
    labels_name: str,
    labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
) -> tuple[GeoDataFrame, pd.DataFrame | None]:
    """Read a segmentation mask from the cells Zarr archive, together with the label-to-cell-id mapping.

    Parameters
    ----------
    path
        Path to the Xenium dataset directory.
    file
        Name of the zipped Zarr file containing the segmentation masks (e.g. ``cells.zarr.zip``).
    specs
        Parsed experiment specs; used to determine the Xenium Analyzer version.
    mask_index
        Which mask to read: 0 for nuclei, 1 for cells.
    labels_name
        Name of the labels element; stored in the ``region`` column of the returned mapping.
    labels_models_kwargs
        Extra keyword arguments forwarded to ``Labels2DModel.parse``.

    Returns
    -------
    The parsed labels element and a dataframe with columns ``region``, ``cell_id`` and ``label_index``
    mapping each cell id to its label value, or ``None`` when the mapping is not available
    (Xenium Analyzer < 1.3.0, or nucleus masks).

    Raises
    ------
    ValueError
        If ``mask_index`` is not 0 or 1, or if the label values found in the labels data are
        inconsistent with the ones declared in the Zarr metadata.
    """
    if mask_index not in [0, 1]:
        raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")

    with tempfile.TemporaryDirectory() as tmpdir:
        # bugfix: use the `file` argument instead of hard-coding XeniumKeys.CELLS_ZARR; existing
        # callers pass XeniumKeys.CELLS_ZARR for `file`, so behavior is unchanged for them
        zip_file = path / file
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        with zarr.open(str(tmpdir), mode="r") as z:
            # get the labels
            masks = z["masks"][f"{mask_index}"][...]
            labels = Labels2DModel.parse(
                masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs
            )

            # build the matching table (version is parsed once; the original code re-parsed it
            # redundantly inside the else branch)
            version = _parse_version_of_xenium_analyzer(specs)
            if version is not None and version < packaging.version.parse("1.3.0"):
                # supported in version 1.3.0 and not supported in version 1.0.2; conservatively, let's
                # assume it is not supported in versions < 1.3.0
                indices_mapping = None
            elif mask_index == 0:
                # nuclei currently not supported
                indices_mapping = None
            elif version is None or version < packaging.version.parse("2.0.0"):
                # the expected label values are stored explicitly in the Zarr metadata; validate the
                # labels data against them before building the mapping
                expected_label_index = z["seg_mask_value"][...]
                real_label_index = _skip_background_label(_get_unique_label_values_as_index(labels).values)
                if not np.array_equal(expected_label_index, real_label_index):
                    raise ValueError(
                        "The label indices from the labels differ from the ones from the input data. Please report "
                        f"this issue. Real label indices: {real_label_index}, expected label indices: "
                        f"{expected_label_index}."
                    )
                indices_mapping = _indices_mapping_from_zarr(z, labels_name, real_label_index)
            else:
                # version >= 2.0.0: the labels are expected to be stored in positional order
                labels_positional_indices = z["polygon_sets"][mask_index]["cell_index"][...]
                if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
                    raise ValueError(
                        "The positional indices of the labels do not match the expected range. Please report this "
                        "issue."
                    )
                # this information will probably be available in the `label_id` column (see public release
                # notes mentioned above)
                real_label_index = _skip_background_label(_get_unique_label_values_as_index(labels))
                indices_mapping = _indices_mapping_from_zarr(z, labels_name, real_label_index)
    return labels, indices_mapping


def _skip_background_label(label_index: ArrayLike) -> ArrayLike:
    # the first label value, when 0, encodes the background and has no corresponding cell
    if label_index[0] == 0:
        return label_index[1:]
    return label_index


def _indices_mapping_from_zarr(z: Any, labels_name: str, label_index: ArrayLike) -> pd.DataFrame:
    # read the packed (prefix, dataset suffix) cell ids and convert them to their string representation
    cell_id_prefix, dataset_suffix = z["cell_id"][...].T
    cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
    # label_index is an uint32, so let's cast to np.int64 to avoid the risk of overflow on some systems
    return pd.DataFrame(
        {
            "region": labels_name,
            "cell_id": cell_id_str,
            "label_index": label_index.astype(np.int64),
        }
    )


@inject_docs(xx=XeniumKeys)
def _get_cells_metadata_table_from_zarr(
Expand Down Expand Up @@ -426,10 +522,10 @@ def _get_images(
) -> SpatialImage | MultiscaleSpatialImage:
image = imread(path / file, **imread_kwargs)
if "c_coords" in image_models_kwargs and "dummy" in image_models_kwargs["c_coords"]:
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will not
# be merged soon.
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA, let's
# add a dummy channel as a temporary workaround.
# Napari currently interprets 4 channel images as RGB; a series of PRs to fix this is almost ready but they will
# not be merged soon.
# Here, since the new data from the xenium analyzer version 2.0.0 gives 4-channel images that are not RGBA,
# let's add a dummy channel as a temporary workaround.
image = da.concatenate([image, da.zeros_like(image[0:1])], axis=0)
return Image2DModel.parse(
image, transformations={"global": Identity()}, dims=("c", "y", "x"), **image_models_kwargs
Expand Down Expand Up @@ -500,7 +596,6 @@ def xenium_aligned_image(
# In fact, it could be that the len(image.shape) == 4 has actually dimes (1, x, y, c) and not (1, y, x, c). This is
# not a problem because the transformation is constructed to be consistent, but if is the case, the data orientation
# would be transposed compared to the original image, not ideal.
# print(image.shape)
if len(image.shape) == 4:
assert image.shape[0] == 1
assert image.shape[-1] == 3
Expand Down Expand Up @@ -532,7 +627,11 @@ def xenium_aligned_image(


def xenium_explorer_selection(path: str | Path, pixel_size: float = 0.2125) -> Polygon:
"""Read the coordinates of a selection `.csv` file exported from the `Xenium Explorer <https://www.10xgenomics.com/support/software/xenium-explorer/latest>`_.
"""Read the coordinates of a selection `.csv` file exported from the `Xenium Explorer`.
.. seealso::
`Xenium Explorer`<https://www.10xgenomics.com/support/software/xenium-explorer/latest>`_.
This file can be generated by the "Freehand Selection" or the "Rectangular Selection".
The output `Polygon` can be used for a polygon query on the pixel coordinate
Expand Down Expand Up @@ -567,7 +666,11 @@ def _parse_version_of_xenium_analyzer(
# Input: xenium-2.0.0.6-35-ga7e17149a
# Output: 2.0.0.6-35

warning_message = f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\nThe reader will continue assuming the latest version of the Xenium Analyzer."
warning_message = (
f"Could not parse the version of the Xenium Analyzer from the string: {string}. This may happen for "
"experimental version of the data. Please report in GitHub https://github.com/scverse/spatialdata-io/issues.\n"
"The reader will continue assuming the latest version of the Xenium Analyzer."
)

if result is None:
if not hide_warning:
Expand All @@ -584,12 +687,35 @@ def _parse_version_of_xenium_analyzer(


def cell_id_str_from_prefix_suffix_uint32(cell_id_prefix: ArrayLike, dataset_suffix: ArrayLike) -> ArrayLike:
    """Encode (uint32 prefix, dataset suffix) pairs into Xenium cell id strings.

    Explained here:
    https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-zarr#cellID
    """
    # the prefix is written in hex with a shifted alphabet: digits 0-9 become a-j, hex digits a-f
    # become k-p
    shifted_alphabet = str.maketrans("0123456789abcdef", "abcdefghijklmnop")

    encoded = []
    for prefix, suffix in zip(cell_id_prefix, dataset_suffix):
        # hex(...) produces a "0x"-prefixed string; drop the prefix before shifting
        shifted = hex(prefix)[2:].translate(shifted_alphabet)
        # left-pad to 8 characters with "a" (the shifted "0"), then append the dataset suffix
        encoded.append(shifted.rjust(8, "a") + f"-{suffix}")

    return np.array(encoded)


def prefix_suffix_uint32_from_cell_id_str(cell_id_str: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
    """Decode Xenium cell id strings back into (uint32 prefix, dataset suffix) pairs.

    Inverse of :func:`cell_id_str_from_prefix_suffix_uint32`.
    """
    # reverse the shifted-hex alphabet: a-j back to digits 0-9, k-p back to hex digits a-f
    unshifted_alphabet = str.maketrans("abcdefghijklmnop", "0123456789abcdef")

    prefixes = []
    suffixes = []
    for s in cell_id_str:
        shifted_hex, _, suffix = s.partition("-")
        # un-shift and parse as hex (no "0x" prefix needed)
        prefixes.append(int(shifted_hex.translate(unshifted_alphabet), 16))
        suffixes.append(int(suffix))

    return np.array(prefixes, dtype=np.uint32), np.array(suffixes)
27 changes: 25 additions & 2 deletions tests/test_xenium.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
import numpy as np

from spatialdata_io.readers.xenium import cell_id_str_from_prefix_suffix_uint32
from spatialdata_io.readers.xenium import (
cell_id_str_from_prefix_suffix_uint32,
prefix_suffix_uint32_from_cell_id_str,
)


def test_cell_id_str_from_prefix_suffix_uint32() -> None:
    """Known (prefix, suffix) pairs encode to the documented string form."""
    prefixes = np.array([1, 1437536272, 1437536273], dtype=np.uint32)
    suffixes = np.array([1, 1, 2])
    expected = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])

    assert np.array_equal(cell_id_str_from_prefix_suffix_uint32(prefixes, suffixes), expected)


def test_prefix_suffix_uint32_from_cell_id_str() -> None:
    """Known cell id strings decode to the documented (prefix, suffix) pairs."""
    strings = np.array(["aaaaaaab-1", "ffkpbaba-1", "ffkpbabb-2"])

    prefixes, suffixes = prefix_suffix_uint32_from_cell_id_str(strings)

    assert np.array_equal(prefixes, np.array([1, 1437536272, 1437536273], dtype=np.uint32))
    assert np.array_equal(suffixes, np.array([1, 1, 2]))


def test_roundtrip_with_data_limits() -> None:
    """Encoding/decoding round-trips at the uint32 extremes (0 and 2**32 - 1)."""
    prefixes = np.array([0, 4294967295], dtype=np.uint32)
    suffixes = np.array([1, 1])
    strings = np.array(["aaaaaaaa-1", "pppppppp-1"])

    # prefix/suffix -> string -> prefix/suffix
    decoded_prefixes, decoded_suffixes = prefix_suffix_uint32_from_cell_id_str(
        cell_id_str_from_prefix_suffix_uint32(prefixes, suffixes)
    )
    assert np.array_equal(prefixes, decoded_prefixes)
    assert np.array_equal(suffixes, decoded_suffixes)

    # string -> prefix/suffix -> string
    assert np.array_equal(
        strings, cell_id_str_from_prefix_suffix_uint32(*prefix_suffix_uint32_from_cell_id_str(strings))
    )

0 comments on commit b043d12

Please sign in to comment.