Source code for scmidas.datasets

"""Bundled example datasets shipped inside the scmidas wheel.

These are toy-sized subsets of real datasets, designed to make the README
quickstart runnable in under a minute on a single GPU. They are NOT meant
for benchmarking — see the basics tutorials for full-size data.
"""
from __future__ import annotations

import logging
from importlib import resources
from pathlib import Path
from typing import Optional, Union

import anndata as ad
import mudata as mu

logger = logging.getLogger(__name__)


def quickstart_path() -> Path:
    """Return the on-disk path of the bundled quickstart .h5mu file.

    The resource is resolved with :func:`importlib.resources.as_file`. When
    the package lives on a regular filesystem this is the real file inside
    the package. When it does not (for example a zip import), ``as_file``
    extracts the resource to a temporary file that is removed as soon as its
    context exits, so the context is kept open until interpreter shutdown to
    keep the returned path usable by the caller.

    Returns:
        Path: Path to ``quickstart_pbmc_mosaic.h5mu`` inside the installed
        scmidas package (or to a temporary extracted copy that lives until
        the process exits).
    """
    # Local imports: only this function needs them.
    import atexit
    from contextlib import ExitStack

    resource = resources.files('scmidas').joinpath(
        'data/quickstart_pbmc_mosaic.h5mu'
    )
    # Returning from inside ``with as_file(...)`` would close the context
    # before the caller can use the path; defer the cleanup to process exit.
    keep_alive = ExitStack()
    atexit.register(keep_alive.close)
    return Path(keep_alive.enter_context(resources.as_file(resource)))
def quickstart() -> mu.MuData:
    """Load the bundled quickstart MuData (PBMC RNA+ADT mosaic, 1600 cells).

    The dataset is a hand-tuned subset of the WNN PBMC mosaic dataset:
    4 batches × 400 cells each (RNA-only, ADT-only, two paired) with
    500 RNA HVGs + 224 ADT features, sized so that
    ``scmidas.integrate(...)`` finishes in roughly one minute on a single
    mid-range GPU.

    **It is intended for the quickstart only**; its size and feature count
    are not appropriate for serious analysis.

    Returns:
        MuData: A MuData with two modalities (``'rna'``, ``'adt'``) and the
        following ``obs`` columns at top level: ``'batch'`` and
        ``'celltype'``.
    """
    # The reader is handed a plain string rather than a Path object.
    bundled_file = quickstart_path()
    return mu.read_h5mu(str(bundled_file))
def _first_value_per_cell(cell_ids, obs_tables, column):
    """Return, per cell id, the first non-missing ``column`` value across tables.

    Args:
        cell_ids: Iterable of cell ids that fixes the order of the result.
        obs_tables: Iterable of ``obs`` DataFrames indexed by cell id, in
            priority order (earlier tables win).
        column: Name of the column to read; tables lacking it are skipped.

    Returns:
        list: One entry per id in ``cell_ids``; the value as ``str``, or
        ``None`` when no table has a non-missing value for that cell.
    """
    first_seen = {}
    for obs in obs_tables:
        if column not in obs.columns:
            continue
        values = obs[column]
        # Drop missing entries *before* the str cast: casting NaN to str
        # yields the literal 'nan', which would look like a real value.
        present = values.notna().to_numpy()
        for cid, val in zip(obs.index[present], values[present].astype(str)):
            first_seen.setdefault(cid, val)
    return [first_seen.get(cid) for cid in cell_ids]


def from_dir(
    dir_path: Union[str, Path],
    label_dir: Optional[Union[str, Path]] = None,
    label_col: str = 'label',
) -> mu.MuData:
    """Load a MIDAS directory-format dataset as a :class:`MuData`.

    The directory format (used by the basics tutorials) lays each batch's
    counts out as MatrixMarket .mtx files plus per-feature mask CSVs::

        dir_path/
            feat/feat_dims.toml      # per-modality chunk sizes
            <batch>/
                cell_names.csv       # cell IDs (1 column, no header beyond default)
                mat/<modality>.mtx   # (n_cells, n_features), Matrix Market
                mask/<modality>.csv  # 1-row CSV, n_features columns (0/1 mask)
            ...

    The returned MuData has:

    - One modality per modality file present in ``feat/feat_dims.toml``.
    - ``mdata[m].obs['batch']`` set to the source batch name.
    - ``mdata[m].uns[f'mask_{batch}']`` for any per-batch feature masks that
      exist (matches the lookup in ``MIDAS.get_info_from_mdata``).
    - ``mdata.uns['feat_dims']`` mirroring ``feat_dims.toml`` so callers can
      pass ``dims_x=mdata.uns['feat_dims']`` to ``setup_mudata`` (needed for
      ATAC chromosome chunking).
    - ``mdata.obs['batch']`` at top level, taken from the first modality
      that contains each cell.
    - If ``label_dir`` is given, ``mdata[m].obs[label_col]`` is filled in
      from ``label_dir/<batch>.csv`` (matched positionally to cells in that
      batch) and mirrored to ``mdata.obs[label_col]``. Cells without a label
      are left as missing values rather than the string ``'nan'``.

    Parameters:
        dir_path : str or Path
            Path to the ``data/`` directory described above.
        label_dir : str or Path, optional
            Path to the sibling ``label/`` directory; one CSV per batch.
        label_col : str
            Name of the obs column to write labels under.

    Returns:
        MuData: One AnnData per modality, indexed by batch.

    Raises:
        FileNotFoundError: If ``feat/feat_dims.toml`` is missing.
        ValueError: If no batch directories are found, if a matrix's row
            count disagrees with ``cell_names.csv``, or if no modality
            listed in ``feat_dims.toml`` has any matrix on disk.

    Examples:
        >>> import scmidas
        >>> mdata = scmidas.datasets.from_dir(
        ...     'dataset/teadog_mosaic_mtx/data',
        ...     label_dir='dataset/teadog_mosaic_mtx/label',
        ... )
        >>> scmidas.MIDAS.setup_mudata(mdata, dims_x=mdata.uns['feat_dims'])
        >>> model = scmidas.MIDAS(mdata)
    """
    # These dependencies are imported here rather than at module top level,
    # so they are only loaded when this loader is actually called.
    import natsort
    import numpy as np
    import pandas as pd
    import scipy.io
    import scipy.sparse as sp
    import toml

    p = Path(dir_path)
    feat_dims_file = p / 'feat' / 'feat_dims.toml'
    if not feat_dims_file.exists():
        raise FileNotFoundError(
            f"{p} doesn't look like a MIDAS dataset dir "
            f"(missing feat/feat_dims.toml)."
        )
    feat_dims = toml.load(feat_dims_file)
    modalities = list(feat_dims.keys())

    # Every sub-directory except 'feat' is treated as one batch.
    batch_dirs = [
        d for d in natsort.natsorted([d for d in p.iterdir() if d.is_dir()])
        if d.name != 'feat'
    ]
    if not batch_dirs:
        raise ValueError(f"No batch directories found under {p}.")

    per_mod = {
        m: {'mats': [], 'cells': [], 'batches': [], 'masks': {}}
        for m in modalities
    }
    for bd in batch_dirs:
        b = bd.name
        cn_path = bd / 'cell_names.csv'
        explicit_cell_names = None
        if cn_path.exists():
            # First CSV column is the index; the names are the next column.
            explicit_cell_names = (
                pd.read_csv(cn_path, index_col=0).iloc[:, 0].astype(str).tolist()
            )
        mat_dir = bd / 'mat'
        mask_dir = bd / 'mask'
        for m in modalities:
            mat_file = mat_dir / f'{m}.mtx'
            if not mat_file.exists():
                # Mosaic data: this batch simply lacks this modality.
                continue
            mat = scipy.io.mmread(str(mat_file)).tocsr()
            n = mat.shape[0]
            if explicit_cell_names is not None:
                if n != len(explicit_cell_names):
                    raise ValueError(
                        f"{mat_file}: rows={n} but {cn_path} has "
                        f"{len(explicit_cell_names)} cells."
                    )
                cell_names = explicit_cell_names
            else:
                # No names on disk: synthesize '<batch>_<row index>'.
                cell_names = [f'{b}_{i}' for i in range(n)]
            per_mod[m]['mats'].append(mat)
            per_mod[m]['cells'].append(cell_names)
            per_mod[m]['batches'].append(b)
            mask_file = mask_dir / f'{m}.csv'
            if mask_file.exists():
                mask_arr = (
                    pd.read_csv(mask_file, index_col=0)
                    .values.flatten()
                    .astype(np.float32)
                )
                per_mod[m]['masks'][b] = mask_arr

    mdict = {}
    for m, info in per_mod.items():
        if not info['mats']:
            logger.info("Modality %r had no matrices in any batch; skipping.", m)
            continue
        X = sp.vstack(info['mats']).tocsr()
        all_cells = [cid for cells in info['cells'] for cid in cells]
        all_batches = [
            batch
            for cells, batch in zip(info['cells'], info['batches'])
            for _ in cells
        ]
        adata = ad.AnnData(X=X)
        adata.obs_names = all_cells
        adata.obs_names_make_unique()
        adata.obs['batch'] = pd.Categorical(all_batches)
        for b, mask in info['masks'].items():
            adata.uns[f'mask_{b}'] = mask
        mdict[m] = adata

    if not mdict:
        raise ValueError(
            f"No modalities loaded from {p}; nothing in feat_dims.toml "
            f"matched files on disk."
        )

    mdata = mu.MuData(mdict)
    mdata.uns['feat_dims'] = {
        m: list(map(int, v)) for m, v in feat_dims.items() if m in mdict
    }

    # Push 'batch' to top-level mdata.obs so plotting tools that read
    # mdata.obs can find it without modality prefixes. For each cell in
    # mdata.obs_names, take 'batch' from whichever modality contains it
    # (they agree by construction).
    mdata.obs['batch'] = pd.Categorical(
        _first_value_per_cell(
            mdata.obs_names, (a.obs for a in mdict.values()), 'batch'
        )
    )

    if label_dir is not None:
        ld = Path(label_dir)
        for m in mdict:
            label_series = pd.Series(index=mdict[m].obs_names, dtype=object)
            for b, group_cells in mdict[m].obs.groupby('batch', observed=True):
                lf = ld / f'{b}.csv'
                if not lf.exists():
                    # No label file for this batch: its cells stay unlabeled.
                    continue
                labels = pd.read_csv(lf, index_col=0).iloc[:, 0].astype(str).values
                if len(labels) != len(group_cells):
                    logger.warning(
                        "Label file %s has %d rows but modality %r has %d cells in batch %s; skipping labels for that batch.",
                        lf, len(labels), m, len(group_cells), b,
                    )
                    continue
                # Positional match: i-th label row -> i-th cell of the batch.
                label_series.loc[group_cells.index] = labels
            mdict[m].obs[label_col] = pd.Categorical(label_series.values)

        # Push the label to top-level mdata.obs the same way as 'batch'.
        # Missing labels are skipped, so an unlabeled cell in one modality
        # neither becomes the string 'nan' nor blocks a real label that
        # another modality provides.
        mdata.obs[label_col] = pd.Categorical(
            _first_value_per_cell(
                mdata.obs_names, (a.obs for a in mdict.values()), label_col
            )
        )

    logger.info(
        "from_dir: loaded %s modalities (%s) across %d batches.",
        len(mdict), list(mdict.keys()), len(batch_dirs),
    )
    return mdata