"""Bundled example datasets shipped inside the scmidas wheel.
These are toy-sized subsets of real datasets, designed to make the README
quickstart runnable in under a minute on a single GPU. They are NOT meant
for benchmarking — see the basics tutorials for full-size data.
"""
from __future__ import annotations
import logging
from importlib import resources
from pathlib import Path
from typing import Optional, Union
import anndata as ad
import mudata as mu
logger = logging.getLogger(__name__)
[docs]
# Filled on the first call to quickstart_path(); maps 'path' -> resolved Path.
_QUICKSTART_CACHE: dict = {}


def quickstart_path() -> Path:
    """Return the on-disk path of the bundled quickstart .h5mu file.

    The resource is located through :mod:`importlib.resources`. For a
    package installed as a plain directory this is the file inside the
    package itself. For a package imported from an archive, ``as_file``
    extracts the resource to a temporary file that exists only while its
    context manager is open. The context is therefore kept open until
    interpreter exit, so the returned path stays usable by the caller
    instead of pointing at an already-deleted temporary file.

    The result is cached: repeated calls return the same path and the
    resource is never extracted more than once per process.

    Returns:
        Path:
            Path to ``quickstart_pbmc_mosaic.h5mu`` inside the installed
            scmidas package (or to its extracted temporary copy).
    """
    import atexit
    from contextlib import ExitStack

    if 'path' not in _QUICKSTART_CACHE:
        resource = resources.files('scmidas').joinpath(
            'data/quickstart_pbmc_mosaic.h5mu'
        )
        # Do NOT use a `with` block here: leaving it would clean up the
        # temporary copy before the caller ever gets to open the file.
        keep_alive = ExitStack()
        resolved = Path(keep_alive.enter_context(resources.as_file(resource)))
        atexit.register(keep_alive.close)
        _QUICKSTART_CACHE['path'] = resolved
    return _QUICKSTART_CACHE['path']
[docs]
def quickstart() -> mu.MuData:
    """Read the bundled quickstart dataset into memory as a MuData.

    The file is a PBMC RNA+ADT mosaic of 1600 cells, a hand-tuned subset
    of the WNN PBMC mosaic dataset. It holds 4 batches × 400 cells each
    (RNA-only, ADT-only, two paired) with 500 RNA HVGs + 224 ADT
    features. The size was chosen so that ``scmidas.integrate(...)``
    finishes in roughly one minute on a single mid-range GPU.

    **It is intended for the quickstart only**; its size and feature
    count are not appropriate for serious analysis.

    Returns:
        MuData:
            A MuData with two modalities (``'rna'``, ``'adt'``) and the
            top-level ``obs`` columns ``'batch'`` and ``'celltype'``.
    """
    # read_h5mu is handed a plain string rather than a Path object.
    h5mu_file = str(quickstart_path())
    return mu.read_h5mu(h5mu_file)
[docs]
def _claim_unique_names(names, taken):
    """Return ``names`` made unique against ``taken`` and against each other.

    The first use of a name is kept as-is; every later clash gets ``-1``,
    ``-2``, ... appended until it is free. ``taken`` is updated in place
    with every name that is handed out.
    """
    unique = []
    for name in names:
        candidate = name
        suffix = 0
        while candidate in taken:
            suffix += 1
            candidate = f'{name}-{suffix}'
        taken.add(candidate)
        unique.append(candidate)
    return unique


def _first_value_per_cell(obs_names, mdict, column):
    """Collapse a per-modality ``obs`` column into one value per cell.

    Each cell in ``obs_names`` takes its value from the first modality (in
    ``mdict`` order) that has a non-missing entry for it. Cells without an
    entry in any modality stay missing (NaN).
    """
    import pandas as pd

    merged = pd.Series(index=obs_names, dtype=object)
    for adata in mdict.values():
        if column not in adata.obs.columns:
            continue
        # Drop missing entries BEFORE casting to str: casting a missing
        # categorical value can yield the literal string 'nan', which would
        # then be stored (and later treated) as a genuine value.
        known = adata.obs[column].dropna().astype(str)
        still_unset = merged.index[merged.isna().to_numpy()]
        known = known[known.index.isin(still_unset)]
        if known.empty:
            continue
        merged.loc[known.index] = known.to_numpy()
    return merged


def _attach_labels(mdict, label_dir, label_col):
    """Fill ``obs[label_col]`` of every modality from ``label_dir/<batch>.csv``.

    Labels are matched positionally to the cells of each batch. A batch
    whose label file is missing, or whose row count does not match the
    number of cells, is left unlabelled (NaN) for that modality.
    """
    import pandas as pd

    # Each label file is read at most once, not once per modality.
    labels_by_batch = {}
    for m, adata in mdict.items():
        label_series = pd.Series(index=adata.obs_names, dtype=object)
        for b, group_cells in adata.obs.groupby('batch', observed=True):
            lf = label_dir / f'{b}.csv'
            if b not in labels_by_batch:
                labels_by_batch[b] = (
                    pd.read_csv(lf, index_col=0).iloc[:, 0].astype(str).values
                    if lf.exists()
                    else None
                )
            labels = labels_by_batch[b]
            if labels is None:
                continue
            if len(labels) != len(group_cells):
                logger.warning(
                    "Label file %s has %d rows but modality %r has %d cells in batch %s; skipping labels for that batch.",
                    lf, len(labels), m, len(group_cells), b,
                )
                continue
            label_series.loc[group_cells.index] = labels
        adata.obs[label_col] = pd.Categorical(label_series.values)


def from_dir(
    dir_path: Union[str, Path],
    label_dir: Optional[Union[str, Path]] = None,
    label_col: str = 'label',
) -> mu.MuData:
    """Load a MIDAS directory-format dataset as a :class:`MuData`.

    The directory format (used by the basics tutorials) lays each batch's
    counts out as MatrixMarket .mtx files plus per-feature mask CSVs::

        dir_path/
            feat/feat_dims.toml      # per-modality chunk sizes
            <batch>/
                cell_names.csv       # cell IDs (1 column, no header beyond default)
                mat/<modality>.mtx   # (n_cells, n_features), Matrix Market
                mask/<modality>.csv  # 1-row CSV, n_features columns (0/1 mask)
            ...

    Every sub-directory other than ``feat`` is treated as a batch; batches
    are processed in natural sort order. ``cell_names.csv`` is optional:
    without it, cells are named ``<batch>_<row index>``.

    The returned MuData has:

    - One modality per key of ``feat/feat_dims.toml`` that has at least one
      ``mat/<modality>.mtx`` file on disk.
    - ``mdata[m].obs['batch']`` set to the source batch name, and a
      top-level ``mdata.obs['batch']`` with the same information.
    - ``mdata[m].uns[f'mask_{batch}']`` for any per-batch feature masks
      that exist (matches the lookup in ``MIDAS.get_info_from_mdata``).
    - ``mdata.uns['feat_dims']`` mirroring ``feat_dims.toml`` so callers
      can pass ``dims_x=mdata.uns['feat_dims']`` to ``setup_mudata``
      (needed for ATAC chromosome chunking).
    - If ``label_dir`` is given, ``mdata[m].obs[label_col]`` is filled in
      from ``label_dir/<batch>.csv`` (matched positionally to cells in
      that batch), and mirrored to top-level ``mdata.obs[label_col]``.
      Cells without a label stay missing (NaN).

    Cell IDs read from ``cell_names.csv`` are made unique across batches
    once per batch (repeats get ``-1``, ``-2``, ... appended), so all
    modalities of a batch share the same IDs and cells of different
    batches are never merged into one observation.

    Parameters:
        dir_path : str or Path
            Path to the ``data/`` directory described above.
        label_dir : str or Path, optional
            Path to the sibling ``label/`` directory; one CSV per batch.
        label_col : str
            Name of the obs column to write labels under.

    Returns:
        MuData: One AnnData per modality, with cells of all batches
        stacked in batch order.

    Raises:
        FileNotFoundError: If ``dir_path/feat/feat_dims.toml`` is missing.
        ValueError: If there are no batch directories, if a matrix's row
            count disagrees with ``cell_names.csv``, or if no modality
            could be loaded.

    Examples:
        >>> import scmidas
        >>> mdata = scmidas.datasets.from_dir(
        ...     'dataset/teadog_mosaic_mtx/data',
        ...     label_dir='dataset/teadog_mosaic_mtx/label',
        ... )
        >>> scmidas.MIDAS.setup_mudata(mdata, dims_x=mdata.uns['feat_dims'])
        >>> model = scmidas.MIDAS(mdata)
    """
    import natsort
    import numpy as np
    import pandas as pd
    import scipy.io
    import scipy.sparse as sp
    import toml

    p = Path(dir_path)
    feat_dims_file = p / 'feat' / 'feat_dims.toml'
    if not feat_dims_file.exists():
        raise FileNotFoundError(
            f"{p} doesn't look like a MIDAS dataset dir "
            f"(missing feat/feat_dims.toml)."
        )
    feat_dims = toml.load(feat_dims_file)
    modalities = list(feat_dims)

    batch_dirs = [
        d for d in natsort.natsorted([d for d in p.iterdir() if d.is_dir()])
        if d.name != 'feat'
    ]
    if not batch_dirs:
        raise ValueError(f"No batch directories found under {p}.")

    # ---- 1. Read every batch's matrices / masks, grouped by modality. ----
    per_mod = {m: {'mats': [], 'cells': [], 'batches': [], 'masks': {}} for m in modalities}
    taken_cell_ids: set = set()  # explicit cell IDs claimed by batches so far
    for bd in batch_dirs:
        b = bd.name
        cn_path = bd / 'cell_names.csv'
        explicit_cell_names = None
        if cn_path.exists():
            raw_names = pd.read_csv(cn_path, index_col=0).iloc[:, 0].astype(str).tolist()
            # De-duplicate once per batch, not per modality: otherwise a
            # name shared by two batches could be renamed in one modality
            # but not in another, pairing cells from different batches.
            explicit_cell_names = _claim_unique_names(raw_names, taken_cell_ids)
            n_renamed = sum(
                new != old for new, old in zip(explicit_cell_names, raw_names)
            )
            if n_renamed:
                logger.warning(
                    "Batch %s: %d cell name(s) were already in use (in this or an "
                    "earlier batch) and got a numeric suffix to stay distinct.",
                    b, n_renamed,
                )
        mat_dir = bd / 'mat'
        mask_dir = bd / 'mask'
        for m in modalities:
            mat_file = mat_dir / f'{m}.mtx'
            if not mat_file.exists():
                # This batch simply does not measure modality m.
                continue
            mat = scipy.io.mmread(str(mat_file)).tocsr()
            n = mat.shape[0]
            if explicit_cell_names is not None:
                if n != len(explicit_cell_names):
                    raise ValueError(
                        f"{mat_file}: rows={n} but {cn_path} has {len(explicit_cell_names)} cells."
                    )
                cell_names = explicit_cell_names
            else:
                cell_names = [f'{b}_{i}' for i in range(n)]
            per_mod[m]['mats'].append(mat)
            per_mod[m]['cells'].append(cell_names)
            per_mod[m]['batches'].append(b)
            mask_file = mask_dir / f'{m}.csv'
            if mask_file.exists():
                mask_arr = pd.read_csv(mask_file, index_col=0).values.flatten().astype(np.float32)
                per_mod[m]['masks'][b] = mask_arr

    # ---- 2. Stack the batches of each modality into one AnnData. ----
    mdict = {}
    for m, info in per_mod.items():
        if not info['mats']:
            logger.info("Modality %r had no matrices in any batch; skipping.", m)
            continue
        X = sp.vstack(info['mats']).tocsr()
        all_cells: list = []
        all_batches: list = []
        for cells, batch in zip(info['cells'], info['batches']):
            all_cells.extend(cells)
            all_batches.extend([batch] * len(cells))
        adata = ad.AnnData(X=X)
        adata.obs_names = all_cells
        # Safety net for auto-generated names; explicit names are already unique.
        adata.obs_names_make_unique()
        adata.obs['batch'] = pd.Categorical(all_batches)
        for b, mask in info['masks'].items():
            adata.uns[f'mask_{b}'] = mask
        mdict[m] = adata
    if not mdict:
        raise ValueError(f"No modalities loaded from {p}; nothing in feat_dims.toml matched files on disk.")

    # ---- 3. Assemble the MuData and its top-level annotations. ----
    mdata = mu.MuData(mdict)
    mdata.uns['feat_dims'] = {
        m: [int(d) for d in v] for m, v in feat_dims.items() if m in mdict
    }
    # Push 'batch' to top-level mdata.obs so plotting tools that read
    # mdata.obs can find it without modality prefixes.
    batch_top = _first_value_per_cell(mdata.obs_names, mdict, 'batch')
    mdata.obs['batch'] = pd.Categorical(batch_top.values)

    if label_dir is not None:
        _attach_labels(mdict, Path(label_dir), label_col)
        # Push the label to top-level mdata.obs the same way as 'batch'.
        label_top = _first_value_per_cell(mdata.obs_names, mdict, label_col)
        mdata.obs[label_col] = pd.Categorical(label_top.values)

    logger.info(
        "from_dir: loaded %s modalities (%s) across %d batches.",
        len(mdict), list(mdict.keys()), len(batch_dirs),
    )
    return mdata