Source code for cellbender.remove_background.downstream

"""Functions for downstream work with outputs of remove-background."""

from cellbender.remove_background.data.io import load_data

import tables
import numpy as np
import scipy.sparse as sp
import anndata
from typing import Dict, Optional


[docs]def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Read in everything from an h5 file and put into a dictionary.

    Args:
        file: The h5 file

    Returns:
        Dictionary containing all the information from the h5 file
    """
    d = {}
    with tables.open_file(file) as f:
        # read in everything
        for array in f.walk_nodes("/", "Array"):
            d[array.name] = array.read()
    return d


[docs]def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> anndata.AnnData:
    """Load an output h5 file into an AnnData object for downstream work.

    Args:
        file: The h5 file
        analyzed_barcodes_only: False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.

    Returns:
        anndata.AnnData: The anndata object, populated with inferred latent variables
            and metadata.

    """

    d = dict_from_h5(file)
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    # check and see if we have barcode index annotations, and if the file is filtered
    barcode_key = [k for k in d.keys() if (('barcode' in k) and ('ind' in k))]
    if len(barcode_key) > 0:
        max_barcode_ind = d[barcode_key[0]].max()
        filtered_file = (max_barcode_ind >= X.shape[0])
    else:
        filtered_file = True

    if analyzed_barcodes_only:
        if filtered_file:
            # filtered file being read, so we don't need to subset
            print('Assuming we are loading a "filtered" file that contains only cells.')
            pass
        elif 'barcode_indices_for_latents' in d.keys():
            X = X[d['barcode_indices_for_latents'], :]
            d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']]
        elif 'barcodes_analyzed_inds' in d.keys():
            X = X[d['barcodes_analyzed_inds'], :]
            d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the anndata object.
    adata = anndata.AnnData(X=X,
                            obs={'barcode': d.pop('barcodes').astype(str)},
                            var={'gene_name': (d.pop('gene_names') if 'gene_names' in d.keys()
                                               else d.pop('name')).astype(str)},
                            dtype=X.dtype)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this
    if 'genes' in d.keys():
        d['id'] = d.pop('genes')

    # For purely aesthetic purposes, rename "id" to "gene_id"
    if 'id' in d.keys():
        d['gene_id'] = d.pop('id')

    # If genomes are empty, try to guess them based on gene_id
    if 'genome' in d.keys():
        if np.array([s.decode() == '' for s in d['genome']]).all():
            if '_' in d['gene_id'][0].decode():
                print('Genome field blank, so attempting to guess genomes based on gene_id prefixes')
                d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    # Add a special additional field to .var if it exists.
    if 'features_analyzed_inds' in adata.uns.keys():
        adata.var['cellbender_analyzed'] = [True if (i in adata.uns['features_analyzed_inds'])
                                            else False for i in range(adata.shape[1])]
    elif 'features_analyzed_inds' in adata.var.keys():
        adata.var['cellbender_analyzed'] = [True if (i in adata.var['features_analyzed_inds'].values)
                                            else False for i in range(adata.shape[1])]

    if analyzed_barcodes_only:
        for col in adata.obs.columns[adata.obs.columns.str.startswith('barcodes_analyzed')
                                     | adata.obs.columns.str.startswith('barcode_indices')]:
            try:
                del adata.obs[col]
            except Exception:
                pass
    else:
        # Add a special additional field to .obs if all barcodes are included.
        if 'barcodes_analyzed_inds' in adata.uns.keys():
            adata.obs['cellbender_analyzed'] = [True if (i in adata.uns['barcodes_analyzed_inds'])
                                                else False for i in range(adata.shape[0])]
        elif 'barcodes_analyzed_inds' in adata.obs.keys():
            adata.obs['cellbender_analyzed'] = [True if (i in adata.obs['barcodes_analyzed_inds'].values)
                                                else False for i in range(adata.shape[0])]

    return adata


def _fill_adata_slots_automatically(adata, d):
    """Add other information to the adata object in the appropriate slot."""

    # TODO: what about "features_analyzed_inds"?  If not all features are analyzed, does this work?

    for key, value in d.items():
        try:
            if value is None:
                continue
            value = np.asarray(value)
            if len(value.shape) == 0:
                adata.uns[key] = value
            elif value.shape[0] == adata.shape[0]:
                if (len(value.shape) < 2) or (value.shape[1] < 2):
                    adata.obs[key] = value
                else:
                    adata.obsm[key] = value
            elif value.shape[0] == adata.shape[1]:
                if value.dtype.name.startswith('bytes'):
                    adata.var[key] = value.astype(str)
                else:
                    adata.var[key] = value
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))


[docs]def load_anndata_from_input(input_file: str) -> anndata.AnnData:
    """Load an input file into an AnnData object (used in report generation).
    Equivalent to something like scanpy.read(), but uses cellbender's io.

    Args:
        input_file: The raw data file

    Returns:
        adata.AnnData: The anndata object

    """

    # Load data as dict.
    d = load_data(input_file=input_file)

    # For purely aesthetic purposes, rename slots from the plural to singluar.
    for key in ['gene_id', 'barcode', 'genome', 'feature_type', 'gene_name']:
        if key + 's' in d.keys():
            d[key] = d.pop(key + 's')

    # Create anndata object from dict.
    adata = anndata.AnnData(X=d.pop('matrix'),
                            obs={'barcode': d.pop('barcode').astype(str)},
                            var={'gene_name': d.pop('gene_name').astype(str)},
                            dtype=int)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    return adata


[docs]def load_anndata_from_input_and_output(input_file: str,
                                       output_file: str,
                                       analyzed_barcodes_only: bool = True,
                                       input_layer_key: str = 'cellranger',
                                       retain_input_metadata: bool = False,
                                       gene_expression_encoding_key: str = 'cellbender_embedding',
                                       truth_file: Optional[str] = None) -> anndata.AnnData:
    """Load remove-background output count matrix into an anndata object,
    together with remove-background metadata and the raw input counts.

    Args:
        input_file: Raw h5 file (or other compatible remove-background input)
            used as input for remove-background.
        output_file: Output h5 file created by remove-background (can be
            filtered or not).
        analyzed_barcodes_only: Argument passed to anndata_from_h5().
            False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.
        input_layer_key: Key of the anndata.layer that is created for the raw
            input count matrix.
        retain_input_metadata: In addition to loading the CellBender metadata,
            which happens automatically, set this to True to retain all the
            metadata from the raw input file as well.
        gene_expression_encoding_key: The CellBender gene expression embedding
            will be loaded into adata.obsm[gene_expression_encoding_key]
        truth_file: File containing truth data if this is a simulation

    Return:
        anndata.AnnData: AnnData object with counts before and after remove-background,
            as well as inferred latent variables from remove-background.

    """

    # Load input data.
    adata_raw = load_anndata_from_input(input_file=input_file)

    # Load remove-background output data.
    adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=analyzed_barcodes_only)

    # Subset the raw dataset to the relevant barcodes.
    adata_raw = adata_raw[adata_out.obs.index]

    # TODO: keep the stuff from the raw file too: from obs and var and uns
    # TODO: maybe use _fill_adata_slots_automatically()?  or just copy stuff

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers[input_layer_key] = adata_raw.X.copy()
    adata_out.layers['cellbender'] = adata_out.X.copy()

    # Pre-compute a bit of metadata.
    adata_out.var['n_' + input_layer_key] = \
        np.array(adata_out.layers[input_layer_key].sum(axis=0), dtype=int).squeeze()
    adata_out.var['n_cellbender'] = \
        np.array(adata_out.layers['cellbender'].sum(axis=0), dtype=int).squeeze()
    adata_out.obs['n_' + input_layer_key] = \
        np.array(adata_out.layers[input_layer_key].sum(axis=1), dtype=int).squeeze()
    adata_out.obs['n_cellbender'] = \
        np.array(adata_out.layers['cellbender'].sum(axis=1), dtype=int).squeeze()

    # Load truth data, if present.
    if truth_file is not None:
        adata_truth = anndata_from_h5(truth_file, analyzed_barcodes_only=False)
        adata_truth = adata_truth[adata_out.obs.index]
        adata_out.layers['truth'] = adata_truth.X.copy()
        adata_out.var['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=0), dtype=int).squeeze()
        adata_out.obs['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=1), dtype=int).squeeze()
        for key in adata_truth.obs.keys():
            if key.startswith('truth_'):
                adata_out.obs[key] = adata_truth.obs[key].copy()
        for key in adata_truth.uns.keys():
            if key.startswith('truth_'):
                adata_out.uns[key] = adata_truth.uns[key].copy()
        for key in adata_truth.var.keys():
            if key.startswith('truth_'):
                adata_out.var[key] = adata_truth.var[key].copy()

    # Rename the CellBender encoding of gene expression.
    if analyzed_barcodes_only:
        slot = adata_out.obsm
    else:
        slot = adata_out.uns
    embedding_key = None
    for key in ['gene_expression_encoding', 'latent_gene_encoding']:
        if key in slot.keys():
            embedding_key = key
            break
    if gene_expression_encoding_key != embedding_key:
        slot[gene_expression_encoding_key] = slot[embedding_key].copy()
        del slot[embedding_key]

    return adata_out


def _load_anndata_from_input_and_decontx(input_file: str,
                                         output: str,
                                         input_layer_key: str = 'cellranger',
                                         truth_file: Optional[str] = None) -> anndata.AnnData:
    """Load decontX output count matrix into an anndata object,
    together with remove-background metadata and the raw input counts.

    NOTE: this is used only for dev purposes and only in the report

    Args:
        input_file: Raw h5 file (or other compatible remove-background input)
            used as input for remove-background.
        output: Output h5 file, or a directory where decontX MTX and TSV files
            are stored
        input_layer_key: Key of the anndata.layer that is created for the raw
            input count matrix.
        truth_file: File containing truth data if this is a simulation

    Return:
        anndata.AnnData: AnnData object with counts before and after remove-background,
            as well as inferred latent variables from remove-background.

    """

    # Load decontX output data.
    print('UNSTABLE FEATURE: Trying to load decontX format MTX output')
    adata_out = load_anndata_from_input(input_file=output)
    adata_out.var_names_make_unique()

    # Load input data.
    adata_raw = load_anndata_from_input(input_file=input_file)
    adata_raw.var_names_make_unique()

    adata_raw = adata_raw[:, [g in adata_out.var.index for g in adata_raw.var.index]].copy()
    adata_out.var['genome'] = adata_raw.var['genome'].copy()
    adata_out.var['feature_type'] = adata_raw.var['feature_type'].copy()
    adata_out.var['gene_id'] = adata_raw.var['gene_id'].copy()

    # Subset the raw dataset to the relevant barcodes.
    empty_logic = np.array([b not in adata_out.obs.index for b in adata_raw.obs.index])
    empty_counts = np.array(adata_raw.X[empty_logic].sum(axis=1)).squeeze()
    approx_ambient = np.array(adata_raw.X[empty_logic][empty_counts > 5].sum(axis=0)).squeeze()
    approx_ambient = approx_ambient / (approx_ambient.sum() + 1e-10)
    print(f'Estimated that there are about {np.median(empty_counts[empty_counts > 5])} counts in empties')
    adata_raw = adata_raw[adata_out.obs.index].copy()
    adata_out.uns['empty_droplet_size_lognormal_loc'] = np.log(np.median(empty_counts[empty_counts > 5]))

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers[input_layer_key] = adata_raw.X.copy()
    adata_out.layers['decontx'] = adata_out.X.copy()

    # Pre-compute a bit of metadata.
    adata_out.var['n_' + input_layer_key] = np.array(adata_out.layers[input_layer_key].sum(axis=0)).squeeze()
    adata_out.var['n_decontx'] = np.array(adata_out.layers['decontx'].sum(axis=0)).squeeze()
    adata_out.obs['n_' + input_layer_key] = np.array(adata_out.layers[input_layer_key].sum(axis=1)).squeeze()
    adata_out.obs['n_decontx'] = np.array(adata_out.layers['decontx'].sum(axis=1)).squeeze()
    adata_out.obs['cell_probability'] = 1.  # because decontx data contains only cells
    adata_out.uns['target_false_positive_rate'] = 0.01  # TODO: placeholder
    adata_out.uns['approximate_ambient_profile'] = approx_ambient
    adata_out.var['ambient_expression'] = np.nan

    # Load truth data, if present.
    if truth_file is not None:
        adata_truth = anndata_from_h5(truth_file, analyzed_barcodes_only=False)
        adata_truth = adata_truth[adata_out.obs.index]

        # TODO; a check
        adata_truth = adata_truth[:, [g in adata_out.var.index for g in adata_truth.var.index]].copy()

        adata_out.layers['truth'] = adata_truth.X.copy()
        adata_out.var['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=0)).squeeze()
        adata_out.obs['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=1)).squeeze()
        for key in adata_truth.obs.keys():
            if key.startswith('truth_'):
                adata_out.obs[key] = adata_truth.obs[key].copy()
        for key in adata_truth.uns.keys():
            if key.startswith('truth_'):
                adata_out.uns[key] = adata_truth.uns[key].copy()
        for key in adata_truth.var.keys():
            if key.startswith('truth_'):
                adata_out.var[key] = adata_truth.var[key].copy()

    return adata_out


[docs]def load_anndata_from_input_and_outputs(input_file: str,
                                        output_files: Dict[str, str],
                                        analyzed_barcodes_only: bool = True,
                                        input_layer_key: str = 'cellranger',
                                        gene_expression_encoding_key: str = 'cellbender_embedding',
                                        truth_file: Optional[str] = None) -> anndata.AnnData:
    """Load remove-background output count matrices into an anndata object,
    together with remove-background metadata and the raw input counts.

    The use case would typically be cellbender runs with multiple output files
    at different FPRs, which we want to compare.

    Args:
        input_file: Raw h5 file (or other compatible remove-background input)
            used as input for remove-background.
        output_files: Output h5 files created by remove-background (can be
            filtered or not) or some other method.  Dict whose keys are layer keys
            and whose values are file names.
        analyzed_barcodes_only: Argument passed to anndata_from_h5().
            False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.
        input_layer_key: Key of the anndata.layer that is created for the raw
            input count matrix.
        gene_expression_encoding_key: The CellBender gene expression embedding
            will be loaded into adata.obsm[gene_expression_encoding_key]
        truth_file: File containing truth data if this is a simulation

    Return:
        anndata.AnnData: AnnData object with counts before and after remove-background,
            as well as inferred latent variables from remove-background.

    """

    # Load input data.
    adata_raw = load_anndata_from_input(input_file=input_file)
    adata_raw.var_names_make_unique()

    # Load remove-background output data.
    assert type(output_files) == dict, 'output_files must be a dict whose keys are ' \
                                       'layer names and whose values are file paths.'
    outs = {}
    for key, output_file in output_files.items():
        outs[key] = anndata_from_h5(output_file, analyzed_barcodes_only=analyzed_barcodes_only)
        outs[key].var_names_make_unique()

    # Subset all datasets to the relevant barcodes and features.
    relevant_barcodes = set(adata_raw.obs_names)
    relevant_features = set(adata_raw.var_names)
    for key, ad in outs.items():
        relevant_barcodes = relevant_barcodes.intersection(set(ad.obs_names))
        relevant_features = relevant_features.intersection(set(ad.var_names))
    if len(relevant_barcodes) < len(adata_raw):
        print(f'Warning: subsetting to barcodes common to all datasets: there '
              f'are {len(relevant_barcodes)}')
    if len(relevant_features) < adata_raw.shape[1]:
        print(f'Warning: subsetting to features common to all datasets: there '
              f'are {len(relevant_features)}')
    adata_raw = adata_raw[list(relevant_barcodes)].copy()
    adata_raw = adata_raw[:, list(relevant_features)].copy()
    for i, (key, ad) in enumerate(outs.items()):
        outs[key] = ad[list(relevant_barcodes)].copy()
        outs[key] = outs[key][:, list(relevant_features)].copy()
        if i == 0:
            print(f'Loading latent variables from one output file: {key}')
            adata_out = outs[key].copy()

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers[input_layer_key] = adata_raw.X.copy()
    for key, ad in outs.items():
        adata_out.layers[key] = ad.X.copy()

    # Load truth data, if present.
    if truth_file is not None:
        adata_truth = anndata_from_h5(truth_file, analyzed_barcodes_only=False)
        adata_truth = adata_truth[adata_out.obs.index]
        adata_out.layers['truth'] = adata_truth.X.copy()
        adata_out.var['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=0)).squeeze()
        adata_out.obs['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=1)).squeeze()
        for key in adata_truth.obs.keys():
            if key.startswith('truth_'):
                adata_out.obs[key] = adata_truth.obs[key].copy()
        for key in adata_truth.uns.keys():
            if key.startswith('truth_'):
                adata_out.uns[key] = adata_truth.uns[key].copy()
        for key in adata_truth.var.keys():
            if key.startswith('truth_'):
                adata_out.var[key] = adata_truth.var[key].copy()

    # Rename the CellBender encoding of gene expression.
    if analyzed_barcodes_only:
        slot = adata_out.obsm
    else:
        slot = adata_out.uns
    embedding_key = None
    for key in ['gene_expression_encoding', 'latent_gene_encoding']:
        if key in slot.keys():
            embedding_key = key
            break
    if gene_expression_encoding_key != embedding_key:
        slot[gene_expression_encoding_key] = slot[embedding_key].copy()
        del slot[embedding_key]

    return adata_out