[go: nahoru, domu]

Skip to content

Commit

Permalink
Merge branch 'main' into feat/to_legacy_anndata
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaMarconato committed Mar 18, 2024
2 parents 8a1430f + 896af31 commit afff8e5
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 50 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
rev: v1.9.0
hooks:
- id: mypy
additional_dependencies: [numpy, types-PyYAML]
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ and this project adheres to [Semantic Versioning][].
- (Xenium) support for post-xenium aligned images (IF, HE)
- (Xenium) reader for the selection coordinates file from the Xenium Explorer
- (DBiT-seq) reader
- converter functions `to_legacy_anndata()` and `from_legacy_anndata()`
- converter functions `experimental.to_legacy_anndata()` and `experimental.from_legacy_anndata()`
- (Visium) support for raw reads (capture locations not under tissue)

### Fixed

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This package contains reader functions to load common spatial omics formats into
- Curio Seeker
- Akoya PhenoCycler (formerly CODEX)
- Vizgen MERSCOPE (MERFISH)
- DBiT-seq

## Getting started

Expand Down
7 changes: 4 additions & 3 deletions src/spatialdata_io/_constants/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ class VisiumKeys(ModeEnum):
"""Keys for *10X Genomics Visium* formatted dataset."""

# files and directories
COUNTS_FILE = "filtered_feature_bc_matrix.h5"
FILTERED_COUNTS_FILE = "filtered_feature_bc_matrix.h5"
RAW_COUNTS_FILE = "raw_feature_bc_matrix.h5"

# images
IMAGE_HIRES_FILE = "spatial/tissue_hires_image.png"
Expand All @@ -124,8 +125,8 @@ class VisiumKeys(ModeEnum):
# spots
SPOTS_FILE_1 = "tissue_positions_list.csv"
SPOTS_FILE_2 = "tissue_positions.csv"
SPOTS_X = "pxl_row_in_fullres"
SPOTS_Y = "pxl_col_in_fullres"
SPOTS_X = "pxl_col_in_fullres"
SPOTS_Y = "pxl_row_in_fullres"


@unique
Expand Down
23 changes: 14 additions & 9 deletions src/spatialdata_io/readers/dbit.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,11 @@ def _barcode_check(barcode_file: Path) -> pd.DataFrame:
Returns
-------
pd.DataFrame
A pandas.DataFrame with 2 columns, named 'A' and 'B', with a barcode as row index.
Columns 'A' and 'B' contains an int each, that are the spatial coordinate of the barcode.
A pandas.DataFrame with 2 columns, named 'A' and 'B', with a number
(from 1 to 50, extremes included) as row index.
Columns 'A' and 'B' each contain an 8-nucleotide barcode.
The header of the column and the row index are the spatial coordinate of the barcode.
For example, the element of row 1, column 'A' is the barcode with coordinate A1.
The columns are ordered in ascending order.
"""
df = pd.read_csv(barcode_file, header=None, sep="\t")
Expand All @@ -121,7 +124,7 @@ def _barcode_check(barcode_file: Path) -> pd.DataFrame:
# Pattern 2: match nucleotides string of 8 char. Case insensitive.
patt_barcode = re.compile(r"^[A|a|T|t|C|c|G|g]{8}$")
# dict, used to collect data after matching
bc_positions: dict[str, dict[str, str]] = {}
bc_positions: dict[int, dict[str, str]] = {}
# line[0]: row index, line[1] row values. line[1][0] : barcode coordinates, line[1][1] : barcode
for line in df.iterrows():
if not bool(patt_position.fullmatch(line[1][0])):
Expand All @@ -134,11 +137,12 @@ def _barcode_check(barcode_file: Path) -> pd.DataFrame:
)
barcode = line[1][1]
letter = line[1][0][0]
num = int(line[1][0][1:])
try:
bc_positions[barcode][letter] = line[1][0][1:]
bc_positions[num][letter] = barcode
except KeyError:
bc_positions[barcode] = {}
bc_positions[barcode][letter] = line[1][0][1:]
bc_positions[num] = {}
bc_positions[num][letter] = barcode
# return pandas.DataFrame, in (pseudo)long form
return pd.DataFrame(bc_positions).transpose()

Expand Down Expand Up @@ -265,10 +269,11 @@ def dbit(

# add barcode positions to annData.
# A and B naming follow original publication and protocol
adata.obs["array_A"] = [int(bc_df.loc[x[8:16], "A"]) for x in adata.obs_names]
adata.obs["array_B"] = [int(bc_df.loc[x[:8], "B"]) for x in adata.obs_names]
adata.obs["array_A"] = [bc_df[bc_df.loc[:, "A"] == x[8:16]].index[0] for x in adata.obs_names]
adata.obs["array_B"] = [bc_df[bc_df.loc[:, "B"] == x[:8]].index[0] for x in adata.obs_names]
# sort annData by barcode position. Barcode A first, then Barcode B
adata.obs.sort_values(by=["array_A", "array_B"], inplace=True)
idx = adata.obs.sort_values(by=["array_A", "array_B"]).index
adata = adata[idx]

# populate annData
if dataset_id is None: # if no dataset_id, use file name as id.
Expand Down
104 changes: 68 additions & 36 deletions src/spatialdata_io/readers/visium.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from functools import partial
from pathlib import Path
from types import MappingProxyType
from typing import Any, Optional
from typing import Any

import numpy as np
import pandas as pd
from dask_image.imread import imread
from imageio import imread as imread2
from spatialdata import SpatialData
from spatialdata._logging import logger
from spatialdata.models import Image2DModel, ShapesModel, TableModel
Expand All @@ -28,11 +29,11 @@
@inject_docs(vx=VisiumKeys)
def visium(
path: str | Path,
dataset_id: Optional[str] = None,
counts_file: str = VisiumKeys.COUNTS_FILE,
fullres_image_file: Optional[str | Path] = None,
tissue_positions_file: Optional[str | Path] = None,
scalefactors_file: Optional[str | Path] = None,
dataset_id: str | None = None,
counts_file: str = VisiumKeys.FILTERED_COUNTS_FILE,
fullres_image_file: str | Path | None = None,
tissue_positions_file: str | Path | None = None,
scalefactors_file: str | Path | None = None,
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
**kwargs: Any,
Expand All @@ -42,7 +43,7 @@ def visium(
This function reads the following files:
- ``(<dataset_id>_)`{vx.COUNTS_FILE!r}```: Counts and metadata file.
- ``(<dataset_id>_)`{vx.FILTERED_COUNTS_FILE!r}```: Counts and metadata file.
- ``{vx.IMAGE_HIRES_FILE!r}``: High resolution image.
- ``{vx.IMAGE_LOWRES_FILE!r}``: Low resolution image.
- ``{vx.SCALEFACTORS_FILE!r}``: Scalefactors file.
Expand All @@ -60,9 +61,12 @@ def visium(
Path to the directory containing the data.
dataset_id
Dataset identifier to name the constructed `SpatialData` elements. The reader will try to infer it from the
``{vx.COUNTS_FILE!r}`` file name. If the file name does not contain the dataset id, it will try to infer it from the metadata, it needs to be provided.
``counts_file`` file name (which defaults to ``{vx.FILTERED_COUNTS_FILE!r}``). If the file name does not
contain the dataset id, it needs to be provided.
counts_file
Name of the counts file. Use only if counts is not in `h5` format.
Name of the counts file, defaults to ``{vx.FILTERED_COUNTS_FILE!r}``; a common alternative is
``{vx.RAW_COUNTS_FILE!r}``. Also, use when the file names are not following the standard SpaceRanger
conventions.
fullres_image_file
Path to the full-resolution image.
tissue_positions_file
Expand All @@ -81,29 +85,31 @@ def visium(
path = Path(path)
imread_kwargs = dict(imread_kwargs)
image_models_kwargs = dict(image_models_kwargs)

# try to infer library_id from the counts file
library_id = None
try:
patt = re.compile(f".*{VisiumKeys.COUNTS_FILE}")
patt = re.compile(f".*{counts_file}")
first_file = [i for i in os.listdir(path) if patt.match(i)][0]

if f"_{VisiumKeys.COUNTS_FILE}" in first_file:
library_id = first_file.replace(f"_{VisiumKeys.COUNTS_FILE}", "")
counts_file = f"{library_id}_{VisiumKeys.COUNTS_FILE}"
elif VisiumKeys.COUNTS_FILE == first_file:
if f"_{counts_file}" in first_file:
library_id = first_file.replace(f"_{counts_file}", "")
counts_file = f"{library_id}_{counts_file}"
elif counts_file == first_file:
library_id = None
counts_file = VisiumKeys.COUNTS_FILE
counts_file = counts_file
else:
raise ValueError(
f"Cannot determine the library_id. Expecting a file with format (<library_id>_){VisiumKeys.COUNTS_FILE}"
f". If the files have been renamed you may need to specify their file names (not their paths), with "
f"some of the following arguments: `counts_file`, `fullres_image_file`, `tissue_positions_file`, "
f"`scalefactors_file` arguments."
f"Cannot determine the library_id. Expecting a file with format (<library_id>_)"
f"{counts_file}. If the files have been renamed you may need to specify their file "
f"names (not their paths), with some of the following arguments: `counts_file`, `fullres_image_file`, "
f"`tissue_positions_file`, `scalefactors_file` arguments."
)
except IndexError as e:
if counts_file is None:
logger.error(
f"{e}. \nError is due to the fact that the library id could not be found, if the counts file is `.mtx` (or else), Please provide a `counts_file`.",
f"{e}. \nError is due to the fact that the library id could not be found, if the counts file is `.mtx` "
f"(or else), Please provide a `counts_file`.",
)
raise e
assert counts_file is not None
Expand All @@ -122,11 +128,11 @@ def visium(
dataset_id = library_id
assert dataset_id is not None

# Yhe second element of the returned tuple is the full library as contained in the metadata of
# VisiumKeys.COUNTS_FILE. For instance for the spatialdata-sandbox/visium dataset it is:
# The second element of the returned tuple is the full library as contained in the metadata of
# VisiumKeys.FILTERED_COUNTS_FILE. For instance, for the spatialdata-sandbox/visium dataset it is:
# spaceranger100_count_30458_ST8059048_mm10-3_0_0_premrna
# We discard this value and use the one inferred from the filename of VisiumKeys.COUNTS_FILE, or the one provided by
# the user in dataset_id
# We discard this value and use the one inferred from the filename of VisiumKeys.FILTERED_COUNTS_FILE, or the one
# provided by the user in dataset_id
adata, _ = _read_counts(path, counts_file=counts_file, library_id=library_id, **kwargs)

if (path / "spatial" / VisiumKeys.SPOTS_FILE_1).exists() or (
Expand All @@ -152,12 +158,8 @@ def visium(
coords = read_coords(tissue_positions_file)

# to handle spaceranger_version < 2.0.0, where no column names are provided
# in fact, from spaceranger 2.0.0, the column names are provided in the file, and
# are "pxl_col_in_fullres", "pxl_row_in_fullres" are inverted.
# But, in the case of CytAssist, the column names provided but the image is flipped
# so we need to invert the columns.
if "in_tissue" not in coords.columns or "CytAssist" in str(fullres_image_file):
coords.columns = ["in_tissue", "array_row", "array_col", "pxl_col_in_fullres", "pxl_row_in_fullres"]
if "in_tissue" not in coords.columns:
coords.columns = ["in_tissue", "array_row", "array_col", "pxl_row_in_fullres", "pxl_col_in_fullres"]

adata.obs = pd.merge(adata.obs, coords, how="left", left_index=True, right_index=True)
coords = adata.obs[[VisiumKeys.SPOTS_X, VisiumKeys.SPOTS_Y]].values
Expand Down Expand Up @@ -203,12 +205,8 @@ def visium(
if fullres_image_file is not None:
fullres_image_file = path / Path(fullres_image_file)
if fullres_image_file.exists():
if "MAX_IMAGE_PIXELS" in imread_kwargs:
from PIL import Image as ImagePIL

ImagePIL.MAX_IMAGE_PIXELS = imread_kwargs.pop("MAX_IMAGE_PIXELS")
full_image = imread(fullres_image_file, **imread_kwargs).squeeze().transpose(2, 0, 1)
full_image = DataArray(full_image, dims=("c", "y", "x"))
image = _read_image(fullres_image_file, imread_kwargs)
full_image = DataArray(image, dims=("c", "y", "x"))
images[dataset_id + "_full_image"] = Image2DModel.parse(
full_image,
scale_factors=[2, 2, 2, 2],
Expand All @@ -232,3 +230,37 @@ def visium(
)

return SpatialData(images=images, shapes=shapes, table=table)


def _read_image(image_file: Path, imread_kwargs: dict[str, Any]) -> Any:
    """Read a microscopy image from disk and return it with axes ordered as (c, y, x).

    Parameters
    ----------
    image_file
        Path to the image. ``.btf`` files are routed to ``imageio``'s reader.
    imread_kwargs
        Extra keyword arguments forwarded to the reader. The special key
        ``MAX_IMAGE_PIXELS`` is popped and assigned to PIL's limit so that
        very large images can be decoded.

    Returns
    -------
    The image array with the channel dimension first.

    Raises
    ------
    ValueError
        If the array rank or axis layout cannot be mapped to (c, y, x).
    """
    if "MAX_IMAGE_PIXELS" in imread_kwargs:
        from PIL import Image as ImagePIL

        ImagePIL.MAX_IMAGE_PIXELS = imread_kwargs.pop("MAX_IMAGE_PIXELS")
    # dask_image doesn't recognize .btf automatically, so fall back to imageio for it
    reader = imread if image_file.suffix != ".btf" else imread2
    data = reader(image_file, **imread_kwargs)
    # Depending on the versions of the pipeline, the axes of the tiff image may be
    # ordered in different ways; normalize them with a simple shape-based heuristic.
    # A more robust check could be implemented in a future PR. Unfortunately, the
    # tif data does not (or does not always) have OME metadata, so even a more
    # general parser could hit edge cases that only a more interoperable file
    # format would resolve.
    rank = len(data.shape)
    if rank == 4:
        if data.shape[0] != 1:
            raise ValueError(f"Image shape {data.shape} is not supported.")
        data = data.squeeze(0)
    elif rank != 3:
        raise ValueError(f"Image shape {data.shape} is not supported.")
    # Immunofluorescence images may carry an arbitrary number of channels (usually
    # 2, 3 or 4); detect the channel axis as the dimension of minimum size.
    channel_axis = np.argmin(data.shape)
    if channel_axis == 0:
        return data
    if channel_axis == 2:
        return data.transpose(2, 0, 1)
    raise ValueError(f"Image shape {data.shape} is not supported.")

0 comments on commit afff8e5

Please sign in to comment.