[go: nahoru, domu]

Skip to content

Commit

Permalink
Merge branch 'main' into feat/to_legacy_anndata
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaMarconato committed Mar 18, 2024
2 parents 8a1430f + 896af31 commit afff8e5
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 50 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
rev: v1.9.0
hooks:
- id: mypy
additional_dependencies: [numpy, types-PyYAML]
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ and this project adheres to [Semantic Versioning][].
- (Xenium) support for post-xenium aligned images (IF, HE)
- (Xenium) reader for the selection coordinates file from the Xenium Explorer
- (DBiT-seq) reader
- converter functions `to_legacy_anndata()` and `from_legacy_anndata()`
- converter functions `experimental.to_legacy_anndata()` and `experimental.from_legacy_anndata()`
- (Visium) support for raw reads (capture locations not under tissue)

### Fixed

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This package contains reader functions to load common spatial omics formats into
- Curio Seeker
- Akoya PhenoCycler (formerly CODEX)
- Vizgen MERSCOPE (MERFISH)
- DBiT-seq

## Getting started

Expand Down
7 changes: 4 additions & 3 deletions src/spatialdata_io/_constants/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ class VisiumKeys(ModeEnum):
"""Keys for *10X Genomics Visium* formatted dataset."""

# files and directories
COUNTS_FILE = "filtered_feature_bc_matrix.h5"
FILTERED_COUNTS_FILE = "filtered_feature_bc_matrix.h5"
RAW_COUNTS_FILE = "raw_feature_bc_matrix.h5"

# images
IMAGE_HIRES_FILE = "spatial/tissue_hires_image.png"
Expand All @@ -124,8 +125,8 @@ class VisiumKeys(ModeEnum):
# spots
SPOTS_FILE_1 = "tissue_positions_list.csv"
SPOTS_FILE_2 = "tissue_positions.csv"
SPOTS_X = "pxl_row_in_fullres"
SPOTS_Y = "pxl_col_in_fullres"
SPOTS_X = "pxl_col_in_fullres"
SPOTS_Y = "pxl_row_in_fullres"


@unique
Expand Down
23 changes: 14 additions & 9 deletions src/spatialdata_io/readers/dbit.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,11 @@ def _barcode_check(barcode_file: Path) -> pd.DataFrame:
Returns
-------
pd.DataFrame
A pandas.DataFrame with 2 columns, named 'A' and 'B', with a barcode as row index.
Columns 'A' and 'B' contains an int each, that are the spatial coordinate of the barcode.
A pandas.DataFrame with 2 columns, named 'A' and 'B', with a number
(from 1 to 50, extremes included) as row index.
Columns 'A' and 'B' each contain an 8-nucleotide barcode.
The header of the column and the row index are the spatial coordinate of the barcode.
For example, the element of row 1, column 'A' is the barcode with coordinate A1.
The columns are ordered in ascending order.
"""
df = pd.read_csv(barcode_file, header=None, sep="\t")
Expand All @@ -121,7 +124,7 @@ def _barcode_check(barcode_file: Path) -> pd.DataFrame:
# Pattern 2: match nucleotides string of 8 char. Case insensitive.
patt_barcode = re.compile(r"^[A|a|T|t|C|c|G|g]{8}$")
# dict, used to collect data after matching
bc_positions: dict[str, dict[str, str]] = {}
bc_positions: dict[int, dict[str, str]] = {}
# line[0]: row index, line[1] row values. line[1][0] : barcode coordinates, line[1][1] : barcode
for line in df.iterrows():
if not bool(patt_position.fullmatch(line[1][0])):
Expand All @@ -134,11 +137,12 @@ def _barcode_check(barcode_file: Path) -> pd.DataFrame:
)
barcode = line[1][1]
letter = line[1][0][0]
num = int(line[1][0][1:])
try:
bc_positions[barcode][letter] = line[1][0][1:]
bc_positions[num][letter] = barcode
except KeyError:
bc_positions[barcode] = {}
bc_positions[barcode][letter] = line[1][0][1:]
bc_positions[num] = {}
bc_positions[num][letter] = barcode
# return pandas.DataFrame, in (pseudo)long form
return pd.DataFrame(bc_positions).transpose()

Expand Down Expand Up @@ -265,10 +269,11 @@ def dbit(

# add barcode positions to annData.
# A and B naming follow original publication and protocol
adata.obs["array_A"] = [int(bc_df.loc[x[8:16], "A"]) for x in adata.obs_names]
adata.obs["array_B"] = [int(bc_df.loc[x[:8], "B"]) for x in adata.obs_names]
adata.obs["array_A"] = [bc_df[bc_df.loc[:, "A"] == x[8:16]].index[0] for x in adata.obs_names]
adata.obs["array_B"] = [bc_df[bc_df.loc[:, "B"] == x[:8]].index[0] for x in adata.obs_names]
# sort annData by barcode position. Barcode A first, then Barcode B
adata.obs.sort_values(by=["array_A", "array_B"], inplace=True)
idx = adata.obs.sort_values(by=["array_A", "array_B"]).index
adata = adata[idx]

# populate annData
if dataset_id is None: # if no dataset_id, use file name as id.
Expand Down
104 changes: 68 additions & 36 deletions src/spatialdata_io/readers/visium.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
from functools import partial
from pathlib import Path
from types import MappingProxyType
from typing import Any, Optional
from typing import Any

import numpy as np
import pandas as pd
from dask_image.imread import imread
from imageio import imread as imread2
from spatialdata import SpatialData
from spatialdata._logging import logger
from spatialdata.models import Image2DModel, ShapesModel, TableModel
Expand All @@ -28,11 +29,11 @@
@inject_docs(vx=VisiumKeys)
def visium(
path: str | Path,
dataset_id: Optional[str] = None,
counts_file: str = VisiumKeys.COUNTS_FILE,
fullres_image_file: Optional[str | Path] = None,
tissue_positions_file: Optional[str | Path] = None,
scalefactors_file: Optional[str | Path] = None,
dataset_id: str | None = None,
counts_file: str = VisiumKeys.FILTERED_COUNTS_FILE,
fullres_image_file: str | Path | None = None,
tissue_positions_file: str | Path | None = None,
scalefactors_file: str | Path | None = None,
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
**kwargs: Any,
Expand All @@ -42,7 +43,7 @@ def visium(
This function reads the following files:
- ``(<dataset_id>_)`{vx.COUNTS_FILE!r}```: Counts and metadata file.
- ``(<dataset_id>_)`{vx.FILTERED_COUNTS_FILE!r}```: Counts and metadata file.
- ``{vx.IMAGE_HIRES_FILE!r}``: High resolution image.
- ``{vx.IMAGE_LOWRES_FILE!r}``: Low resolution image.
- ``{vx.SCALEFACTORS_FILE!r}``: Scalefactors file.
Expand All @@ -60,9 +61,12 @@ def visium(
Path to the directory containing the data.
dataset_id
Dataset identifier to name the constructed `SpatialData` elements. The reader will try to infer it from the
``{vx.COUNTS_FILE!r}`` file name. If the file name does not contain the dataset id, it will try to infer it from the metadata, it needs to be provided.
``counts_file`` file name (which defaults to ``{vx.FILTERED_COUNTS_FILE!r}``). If the file name does not
contain the dataset id, it needs to be provided.
counts_file
Name of the counts file. Use only if counts is not in `h5` format.
Name of the counts file, defaults to ``{vx.FILTERED_COUNTS_FILE!r}``; a common alternative is
``{vx.RAW_COUNTS_FILE!r}``. Also, use when the file names are not following the standard SpaceRanger
conventions.
fullres_image_file
Path to the full-resolution image.
tissue_positions_file
Expand All @@ -81,29 +85,31 @@ def visium(
path = Path(path)
imread_kwargs = dict(imread_kwargs)
image_models_kwargs = dict(image_models_kwargs)

# try to infer library_id from the counts file
library_id = None
try:
patt = re.compile(f".*{VisiumKeys.COUNTS_FILE}")
patt = re.compile(f".*{counts_file}")
first_file = [i for i in os.listdir(path) if patt.match(i)][0]

if f"_{VisiumKeys.COUNTS_FILE}" in first_file:
library_id = first_file.replace(f"_{VisiumKeys.COUNTS_FILE}", "")
counts_file = f"{library_id}_{VisiumKeys.COUNTS_FILE}"
elif VisiumKeys.COUNTS_FILE == first_file:
if f"_{counts_file}" in first_file:
library_id = first_file.replace(f"_{counts_file}", "")
counts_file = f"{library_id}_{counts_file}"
elif counts_file == first_file:
library_id = None
counts_file = VisiumKeys.COUNTS_FILE
counts_file = counts_file
else:
raise ValueError(
f"Cannot determine the library_id. Expecting a file with format (<library_id>_){VisiumKeys.COUNTS_FILE}"
f". If the files have been renamed you may need to specify their file names (not their paths), with "
f"some of the following arguments: `counts_file`, `fullres_image_file`, `tissue_positions_file`, "
f"`scalefactors_file` arguments."
f"Cannot determine the library_id. Expecting a file with format (<library_id>_)"
f"{counts_file}. If the files have been renamed you may need to specify their file "
f"names (not their paths), with some of the following arguments: `counts_file`, `fullres_image_file`, "
f"`tissue_positions_file`, `scalefactors_file` arguments."
)
except IndexError as e:
if counts_file is None:
logger.error(
f"{e}. \nError is due to the fact that the library id could not be found, if the counts file is `.mtx` (or else), Please provide a `counts_file`.",
f"{e}. \nError is due to the fact that the library id could not be found, if the counts file is `.mtx` "
f"(or else), Please provide a `counts_file`.",
)
raise e
assert counts_file is not None
Expand All @@ -122,11 +128,11 @@ def visium(
dataset_id = library_id
assert dataset_id is not None

# Yhe second element of the returned tuple is the full library as contained in the metadata of
# VisiumKeys.COUNTS_FILE. For instance for the spatialdata-sandbox/visium dataset it is:
# The second element of the returned tuple is the full library as contained in the metadata of
# VisiumKeys.FILTERED_COUNTS_FILE. For instance, for the spatialdata-sandbox/visium dataset it is:
# spaceranger100_count_30458_ST8059048_mm10-3_0_0_premrna
# We discard this value and use the one inferred from the filename of VisiumKeys.COUNTS_FILE, or the one provided by
# the user in dataset_id
# We discard this value and use the one inferred from the filename of VisiumKeys.FILTERED_COUNTS_FILE, or the one
# provided by the user in dataset_id
adata, _ = _read_counts(path, counts_file=counts_file, library_id=library_id, **kwargs)

if (path / "spatial" / VisiumKeys.SPOTS_FILE_1).exists() or (
Expand All @@ -152,12 +158,8 @@ def visium(
coords = read_coords(tissue_positions_file)

# to handle spaceranger_version < 2.0.0, where no column names are provided
# in fact, from spaceranger 2.0.0, the column names are provided in the file, and
# are "pxl_col_in_fullres", "pxl_row_in_fullres" are inverted.
# But, in the case of CytAssist, the column names provided but the image is flipped
# so we need to invert the columns.
if "in_tissue" not in coords.columns or "CytAssist" in str(fullres_image_file):
coords.columns = ["in_tissue", "array_row", "array_col", "pxl_col_in_fullres", "pxl_row_in_fullres"]
if "in_tissue" not in coords.columns:
coords.columns = ["in_tissue", "array_row", "array_col", "pxl_row_in_fullres", "pxl_col_in_fullres"]

adata.obs = pd.merge(adata.obs, coords, how="left", left_index=True, right_index=True)
coords = adata.obs[[VisiumKeys.SPOTS_X, VisiumKeys.SPOTS_Y]].values
Expand Down Expand Up @@ -203,12 +205,8 @@ def visium(
if fullres_image_file is not None:
fullres_image_file = path / Path(fullres_image_file)
if fullres_image_file.exists():
if "MAX_IMAGE_PIXELS" in imread_kwargs:
from PIL import Image as ImagePIL

ImagePIL.MAX_IMAGE_PIXELS = imread_kwargs.pop("MAX_IMAGE_PIXELS")
full_image = imread(fullres_image_file, **imread_kwargs).squeeze().transpose(2, 0, 1)
full_image = DataArray(full_image, dims=("c", "y", "x"))
image = _read_image(fullres_image_file, imread_kwargs)
full_image = DataArray(image, dims=("c", "y", "x"))
images[dataset_id + "_full_image"] = Image2DModel.parse(
full_image,
scale_factors=[2, 2, 2, 2],
Expand All @@ -232,3 +230,37 @@ def visium(
)

return SpatialData(images=images, shapes=shapes, table=table)


def _read_image(image_file: Path, imread_kwargs: dict[str, Any]) -> Any:
    """Read a microscopy image from disk and return it with axes ordered as (c, y, x).

    Parameters
    ----------
    image_file
        Path to the image. ``.btf`` files are routed to ``imageio``'s reader.
    imread_kwargs
        Extra keyword arguments forwarded to the reader. The special key
        ``MAX_IMAGE_PIXELS`` is popped and assigned to PIL's limit so that
        very large images can be decoded.

    Returns
    -------
    The image array with the channel dimension first.

    Raises
    ------
    ValueError
        If the array rank or axis layout cannot be mapped to (c, y, x).
    """
    if "MAX_IMAGE_PIXELS" in imread_kwargs:
        from PIL import Image as ImagePIL

        ImagePIL.MAX_IMAGE_PIXELS = imread_kwargs.pop("MAX_IMAGE_PIXELS")
    # dask_image doesn't recognize .btf automatically, so fall back to imageio for it
    reader = imread if image_file.suffix != ".btf" else imread2
    data = reader(image_file, **imread_kwargs)
    # Depending on the versions of the pipeline, the axes of the tiff image may be
    # ordered in different ways; normalize them with a simple shape-based heuristic.
    # A more robust check could be implemented in a future PR. Unfortunately, the
    # tif data does not (or does not always) have OME metadata, so even a more
    # general parser could hit edge cases that only a more interoperable file
    # format would resolve.
    rank = len(data.shape)
    if rank == 4:
        if data.shape[0] != 1:
            raise ValueError(f"Image shape {data.shape} is not supported.")
        data = data.squeeze(0)
    elif rank != 3:
        raise ValueError(f"Image shape {data.shape} is not supported.")
    # Immunofluorescence images may carry an arbitrary number of channels (usually
    # 2, 3 or 4); detect the channel axis as the dimension of minimum size.
    channel_axis = np.argmin(data.shape)
    if channel_axis == 0:
        return data
    if channel_axis == 2:
        return data.transpose(2, 0, 1)
    raise ValueError(f"Image shape {data.shape} is not supported.")

0 comments on commit afff8e5

Please sign in to comment.