# Source code for cellbender.remove_background.downstream

"""Functions for downstream work with outputs of remove-background."""

from typing import Dict, Optional

import anndata
import numpy as np
import scipy.sparse as sp
import tables

from cellbender.remove_background.data.io import load_data

def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Read in everything from an h5 file and put into a dictionary.

    Args:
        file: The h5 file

    Returns:
        Dictionary containing all the information from the h5 file.  Each key
        is the name of an Array node and each value is that node's contents.
        Entries are keyed by the bare node name, so if two arrays in different
        groups share a name, the one visited last overwrites the earlier one.

    """
    d = {}
    with tables.open_file(file) as f:
        # Walk the whole hierarchy from the root and read in every Array node.
        for array in f.walk_nodes("/", "Array"):
            d[array.name] = array.read()
    return d
def _index_membership_mask(n: int, indices) -> np.ndarray:
    """Return a boolean array of length n that is True at positions in indices."""
    return np.isin(np.arange(n), np.asarray(indices))


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> anndata.AnnData:
    """Load an output h5 file into an AnnData object for downstream work.

    Args:
        file: The h5 file
        analyzed_barcodes_only: False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.

    Returns:
        anndata.AnnData: The anndata object, populated with inferred latent
            variables and metadata.

    """

    d = dict_from_h5(file)

    # The stored matrix is transposed so that rows of X are barcodes.
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    # check and see if we have barcode index annotations, and if the file is filtered
    barcode_key = [k for k in d if (('barcode' in k) and ('ind' in k))]
    if len(barcode_key) > 0:
        # Barcode indices that point past the end of X can only refer to a
        # larger (unfiltered) matrix, so X must already be filtered.
        max_barcode_ind = d[barcode_key[0]].max()
        filtered_file = (max_barcode_ind >= X.shape[0])
    else:
        filtered_file = True

    if analyzed_barcodes_only:
        if filtered_file:
            # filtered file being read, so we don't need to subset
            print('Assuming we are loading a "filtered" file that contains only cells.')
        elif 'barcode_indices_for_latents' in d:
            X = X[d['barcode_indices_for_latents'], :]
            d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']]
        elif 'barcodes_analyzed_inds' in d:
            X = X[d['barcodes_analyzed_inds'], :]
            d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the anndata object.
    gene_names = d.pop('gene_names') if 'gene_names' in d else d.pop('name')
    adata = anndata.AnnData(X=X,
                            obs={'barcode': d.pop('barcodes').astype(str)},
                            var={'gene_name': gene_names.astype(str)},
                            dtype=X.dtype)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this
    if 'genes' in d:
        d['id'] = d.pop('genes')

    # For purely aesthetic purposes, rename "id" to "gene_id"
    if 'id' in d:
        d['gene_id'] = d.pop('id')

    # If genomes are empty, try to guess them based on gene_id.
    # Only attempted when gene ids are actually available and non-empty, so a
    # file with a blank genome field but no gene ids does not raise.
    if 'genome' in d and 'gene_id' in d and len(d['gene_id']) > 0:
        if np.array([s.decode() == '' for s in d['genome']]).all():
            if '_' in d['gene_id'][0].decode():
                print('Genome field blank, so attempting to guess genomes based on gene_id prefixes')
                d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    # Add a special additional field to .var if it exists.
    if 'features_analyzed_inds' in adata.uns.keys():
        adata.var['cellbender_analyzed'] = _index_membership_mask(
            adata.shape[1], adata.uns['features_analyzed_inds'])
    elif 'features_analyzed_inds' in adata.var.keys():
        adata.var['cellbender_analyzed'] = _index_membership_mask(
            adata.shape[1], adata.var['features_analyzed_inds'].values)

    if analyzed_barcodes_only:
        # Barcode index columns are dropped from .obs when only the analyzed
        # barcodes are loaded.
        is_index_col = (adata.obs.columns.str.startswith('barcodes_analyzed')
                        | adata.obs.columns.str.startswith('barcode_indices'))
        for col in adata.obs.columns[is_index_col]:
            try:
                del adata.obs[col]
            except Exception:
                # Best-effort cleanup: failing to drop a column is not fatal.
                pass
    else:
        # Add a special additional field to .obs if all barcodes are included.
        if 'barcodes_analyzed_inds' in adata.uns.keys():
            adata.obs['cellbender_analyzed'] = _index_membership_mask(
                adata.shape[0], adata.uns['barcodes_analyzed_inds'])
        elif 'barcodes_analyzed_inds' in adata.obs.keys():
            adata.obs['cellbender_analyzed'] = _index_membership_mask(
                adata.shape[0], adata.obs['barcodes_analyzed_inds'].values)

    return adata
def _fill_adata_slots_automatically(adata, d): """Add other information to the adata object in the appropriate slot.""" # TODO: what about "features_analyzed_inds"? If not all features are analyzed, does this work? for key, value in d.items(): try: if value is None: continue value = np.asarray(value) if len(value.shape) == 0: adata.uns[key] = value elif value.shape[0] == adata.shape[0]: if (len(value.shape) < 2) or (value.shape[1] < 2): adata.obs[key] = value else: adata.obsm[key] = value elif value.shape[0] == adata.shape[1]: if'bytes'): adata.var[key] = value.astype(str) else: adata.var[key] = value else: adata.uns[key] = value except Exception: print('Unable to load data into AnnData: ', key, value, type(value))
def load_anndata_from_input(input_file: str) -> anndata.AnnData:
    """Load an input file into an AnnData object (used in report generation).

    The file is read with cellbender's own io (``load_data``) rather than a
    generic AnnData reader.

    Args:
        input_file: The raw data file

    Returns:
        adata.AnnData: The anndata object

    """

    # Load data as dict.
    d = load_data(input_file=input_file)

    # For purely aesthetic purposes, rename slots from the plural to singular.
    for singular in ('gene_id', 'barcode', 'genome', 'feature_type', 'gene_name'):
        plural = singular + 's'
        if plural in d.keys():
            d[singular] = d.pop(plural)

    # Create anndata object from dict.
    count_matrix = d.pop('matrix')
    obs_frame = {'barcode': d.pop('barcode').astype(str)}
    var_frame = {'gene_name': d.pop('gene_name').astype(str)}
    adata = anndata.AnnData(X=count_matrix, obs=obs_frame, var=var_frame, dtype=int)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # Whatever is left in the dict goes into the appropriate anndata slot.
    _fill_adata_slots_automatically(adata, d)

    return adata
def load_anndata_from_input_and_output(input_file: str,
                                       output_file: str,
                                       analyzed_barcodes_only: bool = True,
                                       input_layer_key: str = 'cellranger',
                                       retain_input_metadata: bool = False,
                                       gene_expression_encoding_key: str = 'cellbender_embedding',
                                       truth_file: Optional[str] = None) -> anndata.AnnData:
    """Load remove-background output count matrix into an anndata object,
    together with remove-background metadata and the raw input counts.

    Args:
        input_file: Raw h5 file (or other compatible remove-background input)
            used as input for remove-background.
        output_file: Output h5 file created by remove-background (can be
            filtered or not).
        analyzed_barcodes_only: Argument passed to anndata_from_h5().
            False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.
        input_layer_key: Key of the anndata.layer that is created for the raw
            input count matrix.
        retain_input_metadata: In addition to loading the CellBender metadata,
            which happens automatically, set this to True to retain all the
            metadata from the raw input file as well.
            NOTE: this argument is currently not used by the implementation
            (see the TODO in the body).
        gene_expression_encoding_key: The CellBender gene expression embedding
            will be loaded into adata.obsm[gene_expression_encoding_key]
            (or adata.uns[...] when analyzed_barcodes_only is False).  If the
            output file contains no embedding, nothing is renamed.
        truth_file: File containing truth data if this is a simulation

    Return:
        anndata.AnnData: AnnData object with counts before and after remove-background,
            as well as inferred latent variables from remove-background.

    """

    # Load input data.
    adata_raw = load_anndata_from_input(input_file=input_file)

    # Load remove-background output data.
    adata_out = anndata_from_h5(output_file, analyzed_barcodes_only=analyzed_barcodes_only)

    # Subset the raw dataset to the relevant barcodes.
    adata_raw = adata_raw[adata_out.obs.index]

    # TODO: keep the stuff from the raw file too: from obs and var and uns
    # TODO: maybe use _fill_adata_slots_automatically()?  or just copy stuff

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers[input_layer_key] = adata_raw.X.copy()
    adata_out.layers['cellbender'] = adata_out.X.copy()

    # Pre-compute a bit of metadata.
    adata_out.var['n_' + input_layer_key] = \
        np.array(adata_out.layers[input_layer_key].sum(axis=0), dtype=int).squeeze()
    adata_out.var['n_cellbender'] = \
        np.array(adata_out.layers['cellbender'].sum(axis=0), dtype=int).squeeze()
    adata_out.obs['n_' + input_layer_key] = \
        np.array(adata_out.layers[input_layer_key].sum(axis=1), dtype=int).squeeze()
    adata_out.obs['n_cellbender'] = \
        np.array(adata_out.layers['cellbender'].sum(axis=1), dtype=int).squeeze()

    # Load truth data, if present.
    if truth_file is not None:
        adata_truth = anndata_from_h5(truth_file, analyzed_barcodes_only=False)
        adata_truth = adata_truth[adata_out.obs.index]
        adata_out.layers['truth'] = adata_truth.X.copy()
        adata_out.var['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=0), dtype=int).squeeze()
        adata_out.obs['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=1), dtype=int).squeeze()
        # Carry over every "truth_*" annotation from the truth file.
        for key in adata_truth.obs.keys():
            if key.startswith('truth_'):
                adata_out.obs[key] = adata_truth.obs[key].copy()
        for key in adata_truth.uns.keys():
            if key.startswith('truth_'):
                adata_out.uns[key] = adata_truth.uns[key].copy()
        for key in adata_truth.var.keys():
            if key.startswith('truth_'):
                adata_out.var[key] = adata_truth.var[key].copy()

    # Rename the CellBender encoding of gene expression.
    if analyzed_barcodes_only:
        slot = adata_out.obsm
    else:
        slot = adata_out.uns
    embedding_key = None
    for key in ['gene_expression_encoding', 'latent_gene_encoding']:
        if key in slot.keys():
            embedding_key = key
            break
    # Only rename when an embedding was actually found; otherwise slot[None]
    # would raise a KeyError for files that carry no embedding.
    if (embedding_key is not None) and (gene_expression_encoding_key != embedding_key):
        slot[gene_expression_encoding_key] = slot[embedding_key].copy()
        del slot[embedding_key]

    return adata_out
def _load_anndata_from_input_and_decontx(input_file: str,
                                         output: str,
                                         input_layer_key: str = 'cellranger',
                                         truth_file: Optional[str] = None) -> anndata.AnnData:
    """Load a decontX output count matrix into an anndata object, together
    with the raw input counts and a few placeholder metadata fields.

    NOTE: this is used only for dev purposes and only in the report

    Args:
        input_file: Raw h5 file (or other compatible remove-background input)
            used as input for remove-background.
        output: Output h5 file, or a directory where decontX MTX and TSV files
            are stored
        input_layer_key: Key of the anndata.layer that is created for the raw
            input count matrix.
        truth_file: File containing truth data if this is a simulation

    Return:
        anndata.AnnData: AnnData object with counts before and after decontX,
            stored in layers[input_layer_key] and layers['decontx'].

    """

    # Load decontX output data.
    print('UNSTABLE FEATURE: Trying to load decontX format MTX output')
    adata_out = load_anndata_from_input(input_file=output)
    adata_out.var_names_make_unique()

    # Load input data, keeping only the features present in the output.
    adata_raw = load_anndata_from_input(input_file=input_file)
    adata_raw.var_names_make_unique()
    feature_in_output = [g in adata_out.var.index for g in adata_raw.var.index]
    adata_raw = adata_raw[:, feature_in_output].copy()
    for column in ('genome', 'feature_type', 'gene_id'):
        adata_out.var[column] = adata_raw.var[column].copy()

    # Barcodes absent from the output are treated as empty droplets and are
    # used to approximate the ambient profile (only those with > 5 counts).
    empty_logic = np.array([b not in adata_out.obs.index for b in adata_raw.obs.index])
    empty_matrix = adata_raw.X[empty_logic]
    empty_counts = np.array(empty_matrix.sum(axis=1)).squeeze()
    approx_ambient = np.array(empty_matrix[empty_counts > 5].sum(axis=0)).squeeze()
    approx_ambient = approx_ambient / (approx_ambient.sum() + 1e-10)
    print(f'Estimated that there are about {np.median(empty_counts[empty_counts > 5])} counts in empties')

    # Subset the raw dataset to the relevant barcodes.
    adata_raw = adata_raw[adata_out.obs.index].copy()
    adata_out.uns['empty_droplet_size_lognormal_loc'] = np.log(np.median(empty_counts[empty_counts > 5]))

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers[input_layer_key] = adata_raw.X.copy()
    adata_out.layers['decontx'] = adata_out.X.copy()

    # Pre-compute a bit of metadata: per-feature totals, then per-barcode totals.
    layer_keys = (input_layer_key, 'decontx')
    for layer_key in layer_keys:
        adata_out.var['n_' + layer_key] = np.array(adata_out.layers[layer_key].sum(axis=0)).squeeze()
    for layer_key in layer_keys:
        adata_out.obs['n_' + layer_key] = np.array(adata_out.layers[layer_key].sum(axis=1)).squeeze()
    adata_out.obs['cell_probability'] = 1.  # because decontx data contains only cells
    adata_out.uns['target_false_positive_rate'] = 0.01  # TODO: placeholder
    adata_out.uns['approximate_ambient_profile'] = approx_ambient
    adata_out.var['ambient_expression'] = np.nan

    # Nothing more to do without truth data.
    if truth_file is None:
        return adata_out

    # Load truth data.
    adata_truth = anndata_from_h5(truth_file, analyzed_barcodes_only=False)
    adata_truth = adata_truth[adata_out.obs.index]
    # TODO; a check
    truth_feature_in_output = [g in adata_out.var.index for g in adata_truth.var.index]
    adata_truth = adata_truth[:, truth_feature_in_output].copy()
    adata_out.layers['truth'] = adata_truth.X.copy()
    truth_layer = adata_out.layers['truth']
    adata_out.var['n_truth'] = np.array(truth_layer.sum(axis=0)).squeeze()
    adata_out.obs['n_truth'] = np.array(truth_layer.sum(axis=1)).squeeze()

    # Copy every "truth_*" annotation from obs, uns and var (in that order).
    for slot_name in ('obs', 'uns', 'var'):
        source = getattr(adata_truth, slot_name)
        target = getattr(adata_out, slot_name)
        for key in source.keys():
            if key.startswith('truth_'):
                target[key] = source[key].copy()

    return adata_out
def load_anndata_from_input_and_outputs(input_file: str,
                                        output_files: Dict[str, str],
                                        analyzed_barcodes_only: bool = True,
                                        input_layer_key: str = 'cellranger',
                                        gene_expression_encoding_key: str = 'cellbender_embedding',
                                        truth_file: Optional[str] = None) -> anndata.AnnData:
    """Load remove-background output count matrices into an anndata object,
    together with remove-background metadata and the raw input counts.

    The use case would typically be cellbender runs with multiple output files
    at different FPRs, which we want to compare.

    Args:
        input_file: Raw h5 file (or other compatible remove-background input)
            used as input for remove-background.
        output_files: Output h5 files created by remove-background (can be
            filtered or not) or some other method. Dict whose keys are layer keys
            and whose values are file names.  Must not be empty.
        analyzed_barcodes_only: Argument passed to anndata_from_h5().
            False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.
        input_layer_key: Key of the anndata.layer that is created for the raw
            input count matrix.
        gene_expression_encoding_key: The CellBender gene expression embedding
            will be loaded into adata.obsm[gene_expression_encoding_key]
            (or adata.uns[...] when analyzed_barcodes_only is False).  If no
            embedding is present, nothing is renamed.
        truth_file: File containing truth data if this is a simulation

    Return:
        anndata.AnnData: AnnData object with counts before and after remove-background,
            as well as inferred latent variables from remove-background.  Latent
            variables and metadata come from the first entry of output_files.
            Barcodes and features are restricted to those common to the input
            and all outputs, in the order in which they appear in the input.

    Raises:
        AssertionError: If output_files is not a dict.
        ValueError: If output_files is empty.

    """

    # Validate before doing any expensive file loading.
    assert isinstance(output_files, dict), \
        'output_files must be a dict whose keys are ' \
        'layer names and whose values are file paths.'
    if not output_files:
        raise ValueError('output_files must contain at least one output file.')

    # Load input data.
    adata_raw = load_anndata_from_input(input_file=input_file)
    adata_raw.var_names_make_unique()

    # Load remove-background output data.
    outs = {}
    for key, output_file in output_files.items():
        outs[key] = anndata_from_h5(output_file, analyzed_barcodes_only=analyzed_barcodes_only)
        outs[key].var_names_make_unique()

    # Subset all datasets to the relevant barcodes and features.
    common_barcodes = set(adata_raw.obs_names)
    common_features = set(adata_raw.var_names)
    for ad in outs.values():
        common_barcodes &= set(ad.obs_names)
        common_features &= set(ad.var_names)
    if len(common_barcodes) < len(adata_raw):
        print(f'Warning: subsetting to barcodes common to all datasets: there '
              f'are {len(common_barcodes)}')
    if len(common_features) < adata_raw.shape[1]:
        print(f'Warning: subsetting to features common to all datasets: there '
              f'are {len(common_features)}')

    # Keep the order of the raw input rather than the arbitrary iteration
    # order of a set, so the result is reproducible from run to run.
    relevant_barcodes = [b for b in dict.fromkeys(adata_raw.obs_names) if b in common_barcodes]
    relevant_features = [g for g in dict.fromkeys(adata_raw.var_names) if g in common_features]

    adata_raw = adata_raw[relevant_barcodes].copy()
    adata_raw = adata_raw[:, relevant_features].copy()
    for i, (key, ad) in enumerate(outs.items()):
        outs[key] = ad[relevant_barcodes].copy()
        outs[key] = outs[key][:, relevant_features].copy()
        if i == 0:
            # The first output provides the latent variables and metadata.
            print(f'Loading latent variables from one output file: {key}')
            adata_out = outs[key].copy()

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers[input_layer_key] = adata_raw.X.copy()
    for key, ad in outs.items():
        adata_out.layers[key] = ad.X.copy()

    # Load truth data, if present.
    if truth_file is not None:
        adata_truth = anndata_from_h5(truth_file, analyzed_barcodes_only=False)
        adata_truth = adata_truth[adata_out.obs.index]
        adata_out.layers['truth'] = adata_truth.X.copy()
        adata_out.var['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=0)).squeeze()
        adata_out.obs['n_truth'] = np.array(adata_out.layers['truth'].sum(axis=1)).squeeze()
        # Carry over every "truth_*" annotation from the truth file.
        for key in adata_truth.obs.keys():
            if key.startswith('truth_'):
                adata_out.obs[key] = adata_truth.obs[key].copy()
        for key in adata_truth.uns.keys():
            if key.startswith('truth_'):
                adata_out.uns[key] = adata_truth.uns[key].copy()
        for key in adata_truth.var.keys():
            if key.startswith('truth_'):
                adata_out.var[key] = adata_truth.var[key].copy()

    # Rename the CellBender encoding of gene expression.
    if analyzed_barcodes_only:
        slot = adata_out.obsm
    else:
        slot = adata_out.uns
    embedding_key = None
    for key in ['gene_expression_encoding', 'latent_gene_encoding']:
        if key in slot.keys():
            embedding_key = key
            break
    # Only rename when an embedding was actually found; otherwise slot[None]
    # would raise a KeyError for outputs that carry no embedding.
    if (embedding_key is not None) and (gene_expression_encoding_key != embedding_key):
        slot[gene_expression_encoding_key] = slot[embedding_key].copy()
        del slot[embedding_key]

    return adata_out