Source code for treasmo.ds

import numpy as np
import pandas as pd
import anndata as ad
import time

from minisom import MiniSom


from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import fdrcorrection
from sklearn.gaussian_process import GaussianProcessRegressor

import random
import scanpy as sc

import os
import sys
from sklearn import preprocessing

__author__ = "Chaozhong Liu"
__email__ = "czliubioinfo@gmail.com"


#=========================================================================================
# Cluster / Group marker identification
# Trajectory-based regulatory dynamics
# TF regulatory dynamics
# Statistical test for dynamics
#=========================================================================================




#=========================================================================================
# Group marker identification
#=========================================================================================

[docs]def FindAllMarkers(mudata, ident, mods=['rna','atac'], corrct_method='bonferroni', seed=1): """ Function to discover regulatory gene-peak markers in all groups Target group correlation strength is compared with all remaining groups by t-test. Parameters ------------ mudata: MuData single-cell multi-omics data saved as MuData object ident: str column name in ``mudata.obs`` containing group labels mods: List[str, str] scRNA-seq and scATAC-seq modality name in MuData object corrct_method: str multi-test correction method, one of ['bonferroni', 'fdr'] seed: int random seed to make the results reproducible Returns --------- DataFrame Differentially regulated pairs statistical test results """ if 'Local_L' in mudata.uns.keys(): local_L_df = pd.DataFrame(mudata.uns['Local_L']) local_L_df.columns = mudata.uns['Local_L_names'] else: raise Exception("No previously calculated local correlation matrix found. \nPlease calculate the matrix first.") #mudata = core.Local_L(mudata, genes, peaks, # mods=mods, # rm_dropout=False, # seed=1, max_RAM=16) #local_L_df = pd.DataFrame(mudata.uns['Local_L']) #local_L_df.columns = mudata.uns['Local_L_names'] #print("========= Finished ========") # cluster comparison print("Performing statistical test for correlation differences among identities...") start = time.time() # get clusters have at least 2 mini-bulk groups = mudata.obs[ident].value_counts() > 1 groups = groups.loc[groups].index.to_list() local_L_df['clus'] = mudata.obs[ident].to_numpy() stat_df_all = pd.DataFrame(np.empty((0,13))) stat_df_all.columns = ['group','name','Mean.1','Mean.2','Std.1','Std.2', 'obsn.1','obsn.2','score','p.value','p.adj','Frac.gene.1','Frac.peak.1'] for group in groups: stat_df = marker_test(local_L_df, group_1=group, group_2=None, corrct_method=corrct_method) try: stat_df['Frac.gene.1'], stat_df['Frac.peak.1'] = add_feature_sparsity(stat_df, mudata, mods=mods, group=group) except: print('\tCluster %s sparsity information not found. Skip.'%group) stat_df['Frac.gene.1'] = np.NaN stat_df['Frac.peak.1'] = np.NaN stat_df_all = pd.concat([stat_df_all,stat_df]) print("Completed! %.2fs past."%(time.time()-start)) return stat_df_all
[docs]def FindMarkers(mudata, ident, group_1, group_2, mods=['rna','atac'], corrct_method='bonferroni',seed=1, log=True): """ Function to compare regulatory gene-peak pairs between two group by t-test. Parameters ------------ mudata: MuData single-cell multi-omics data saved as MuData object ident: str column name in mudata.obs containing group labels group_1: str, int first group name in ident to compare with the second group_2: str, int second group name in ident to compare with the first mods: List[str, str] scRNA-seq and scATAC-seq modality name in MuData object corrct_method: str multi-test correction method, one of ['bonferroni', 'fdr'] seed: int random seed to make the results reproducible Returns --------- DataFrame Differentially regulated pairs statistical test results """ if 'Local_L' in mudata.uns.keys(): local_L_df = pd.DataFrame(mudata.uns['Local_L']) local_L_df.columns = mudata.uns['Local_L_names'] else: raise Exception("No previously calculated local correlation matrix found. \nPlease calculate the matrix first.") #mudata = core.Local_L(mudata, genes, peaks, # mods=mods, # rm_dropout=False, # seed=1, max_RAM=16) #local_L_df = pd.DataFrame(mudata.uns['Local_L']) #local_L_df.columns = mudata.uns['Local_L_names'] #print("========= Finished ========") # cluster comparison if log: print("Perform statistical test for correlation differences between selected two group...") start = time.time() local_L_df['clus'] = mudata.obs[ident].to_numpy() stat_df = marker_test(local_L_df, group_1=group_1, group_2=group_2, corrct_method=corrct_method) try: stat_df['Frac.gene.1'], stat_df['Frac.peak.1'] = add_feature_sparsity(stat_df, mudata, mods=mods, group=group_1) stat_df['Frac.gene.2'], stat_df['Frac.peak.2'] = add_feature_sparsity(stat_df, mudata, mods=mods, group=group_2) except: if log: print('\tSparsity information not found. Skip.') stat_df['Frac.gene.1'] = np.NaN stat_df['Frac.peak.1'] = np.NaN stat_df['Frac.gene.2'] = np.NaN stat_df['Frac.peak.2'] = np.NaN if log: print("Completed! %.2fs past."%(time.time()-start)) return stat_df
[docs]def marker_test(local_L_df, group_1, group_2=None, corrct_method='bonferroni'): """ Function unit to perform statistical test with given data. No need to be called from user end. """ #stat_df.columns = ['group','name','Mean.1','Mean.2','Std.1', # 'Std.2','obsn.1','obsn.2','score','p.value','p.adj'] stat_df = group_stat(local_L_df, group_1=group_1, group_2=group_2) stat_df['score'] = np.NaN stat_df['p.value'] = np.NaN for i in range(stat_df.shape[0]): ttest = stats.ttest_ind_from_stats( mean1=stat_df['Mean.1'][i], std1=stat_df['Std.1'][i], nobs1=stat_df['obsn.1'][i], mean2=stat_df['Mean.2'][i], std2=stat_df['Std.2'][i], nobs2=stat_df['obsn.2'][i], equal_var=True, # Welch's ) stat_df.loc[stat_df.index[i],'score'] = ttest[0] stat_df.loc[stat_df.index[i],'p.value'] = ttest[1] if corrct_method == 'fdr': _, stat_df['p.adj'] = fdrcorrection( stat_df['p.value'], alpha=0.05, method='indep' ) elif corrct_method == 'bonferroni': stat_df['p.adj'] = np.minimum(stat_df['p.value'] * stat_df.shape[0], 1.0) else: #print("Please select correction methods from ['fdr', 'bonferroni']!") stat_df['p.adj'] = np.NaN #stat_df['Frac.gene'], stat_df['Frac.peak'] = _add_clus_sparsity(stat_df, anndat_multiome, group) #stat_df_all = pd.concat([stat_df_all,stat_df]) return stat_df
[docs]def group_stat(local_L, group_1, group_2=None): """ Function unit to calculate group statitics. No need to be called from user end. """ groups_label = local_L['clus'].to_numpy() mask_1 = groups_label == group_1 obsn1 = np.sum(mask_1) if group_2 is None: group_2 = 'others' mask_2 = groups_label != group_1 obsn2 = len(mask_2) - obsn1 else: mask_2 = groups_label == group_2 obsn2 = np.sum(mask_2) local_L_df = local_L.iloc[:,:-1].copy() local_L_df['clus'] = 'others' local_L_df.loc[mask_1, 'clus'] = group_1 local_L_df.loc[mask_2, 'clus'] = group_2 mean_df = local_L_df.groupby('clus').mean() #.reset_index() mean_df = mean_df.loc[[group_1,group_2],:] std_df = local_L_df.groupby('clus').std() std_df = std_df.loc[[group_1,group_2],:] mean_df = mean_df.T mean_df.columns = ['Mean.1','Mean.2'] std_df = std_df.T std_df.columns = ['Std.1','Std.2'] stat_df = pd.concat([mean_df, std_df], axis=1) stat_df.insert(0,'name',stat_df.index) stat_df = stat_df.reset_index(drop=True) if group_2 == 'others': stat_df.insert(0,'group',group_1) else: stat_df.insert(0,'group.1',group_1) stat_df.insert(1,'group.2',group_2) stat_df['obsn.1'] = obsn1 stat_df['obsn.2'] = obsn2 return stat_df
[docs]def add_feature_sparsity(stat_df, mudata, group, mods=['rna','atac']): """ Helper function to add feature sparsity information in final group comparison results. """ GP_G = [gp.split('~')[0] for gp in stat_df['name']] GP_P = [gp.split('~')[1] for gp in stat_df['name']] return mudata.mod[mods[0]].var.loc[GP_G,:]['Frac.%s'%group].to_numpy(),\ mudata.mod[mods[1]].var.loc[GP_P,:]['Frac.%s'%group].to_numpy()
[docs]def MarkerFilter(statDf, min_pct_rna=0.1, min_pct_atac=0.05, mean_diff=1.0, p_cutoff=1e-12, plot=False): """ Function to filter markers from statistical test results by sparsity, correlation difference, and p-value Parameters ------------ statDf: DataFrame Differentially regulated pairs statistical test results min_pct_rna: float sparsity filter cutoff: percentage of cells that express the gene min_pct_atac: float sparsity filter cutoff: percentage of cells that have the peak mean_diff: float mean correlation strength difference between the group and background (all other groups) p_cutoff: float adjusted p-value cutoff plot: bool if True, plot volcano plot Returns --------- DataFrame Filtered marker list with the same columns as stat_df if plot==True, also return volcano plot """ stat_df = statDf.copy() pd.options.mode.chained_assignment = None stat_df.loc[:,'score.abs'] = np.abs(stat_df['score']) if 'group' in stat_df.columns: stat_df = stat_df.sort_values(by=['group','score.abs'], ascending=False) else: stat_df = stat_df.sort_values(by=['group.1','score.abs'], ascending=False) mask = (stat_df['Frac.gene.1']>min_pct_rna) & (stat_df['Frac.peak.1']>min_pct_atac) stat_df = stat_df.loc[mask,:] filt = ((stat_df['p.adj']<p_cutoff) & (np.abs(stat_df['Mean.1']-stat_df['Mean.2']) > mean_diff)) if plot: plt.scatter(stat_df['Mean.1'][~filt]-stat_df['Mean.2'][~filt], -np.log10(stat_df['p.adj'][~filt]), s=2, marker='o', c='grey') plt.scatter(stat_df['Mean.1'][filt]-stat_df['Mean.2'][filt], -np.log10(stat_df['p.adj'][filt]), s=2, marker='o', c='red') plt.grid(which='both', linestyle='-', linewidth='0') plt.xlabel('mean_1 - mean_2') plt.ylabel('-log10(p.adj)') return stat_df.loc[filt,:]
#========================================================================================= # Trajectory Path Analysis #=========================================================================================
[docs]def FindPathMarkers(mudata, ident, path, mods=['rna','atac'], corrct_method='bonferroni', seed=1): """ One-to-one comparison of gene-peak correlation among groups in the trajectory path by t-test. Parameters ------------ mudata: MuData single-cell multi-omics data saved as MuData object ident: str column name in ``mudata.obs`` containing group labels path: List list of clusters ordered by their sequence on the trajectory. A path here should have no branch. mods: List[str, str] scRNA-seq and scATAC-seq modality name in MuData object corrct_method: str multi-test correction method, one of ['bonferroni', 'fdr'] seed: int random seed to make the results reproducible Returns --------- DataFrame Differentially regulated pairs statistical test results """ # One-to-one comparison start = time.time() dfList = [] for i in range(len(path)): for j in range(i+1, len(path)): stat_df = FindMarkers(mudata, ident, group_1=path[j], group_2=path[i], mods=mods, corrct_method=corrct_method, seed=seed, log=False) dfList.append(stat_df) statDf = pd.concat(dfList) print("Completed! %.3fs past."%(time.time()-start)) return statDf
def _fit_data(timebinDf, xfit): """ Helper function to fit correlation strength along trajectory by GaussianProcessRegressor. No need to be called from user end. """ xdata = timebinDf['time'].to_numpy() ydata = timebinDf['value'].to_numpy() filter_na = ~np.isnan(ydata) xdata = xdata[filter_na] ydata = ydata[filter_na] # Compute the Gaussian process fit gp = GaussianProcessRegressor(random_state=1) gp.fit(xdata[:, np.newaxis], ydata) #xfit = np.linspace(data['time'].min(), data['time'].max(), bins) yfit, _ = gp.predict(xfit[:, np.newaxis], return_std=True) return yfit
[docs]def TimeBinData(mudata, ident, path, pseudotime, features, bins=100, rm_outlier=False, fitted=None): """ Helper function to generate bined data along trajectory. Parameters -------------- mudata: MuData single-cell multi-omics data saved as MuData object It must have correlation strength index calculated. ident: str column name in ``mudata.obs`` containing trajectory group labels path: List list of clusters ordered by their sequence on the trajectory. A path here should have no branch. pseudotime: str column name in ``mudata.obs`` containing trajectory pseudotime labels features: List, numpy.array List of gene-peak pair names. Can be selected from ``muData.uns['Local_L_names']`` bins: int number of bins to divide the trajectory into rm_outlier: bool whether or not adding cap and limit strength index within +- 2*std fitted: int, default is None if an int, return GaussianProcessRegressor fitted data with ``fitted`` bins. if None, return only bined raw data Returns --------- DataFrame bined raw data Optional: GaussianProcessRegressor fitted data """ cells_bool = mudata.obs[ident].isin(path).to_numpy() #features = mudata.uns['Local_L_names'] CorDf = pd.DataFrame(mudata.uns['Local_L'], columns=mudata.uns['Local_L_names']) CorDf = CorDf.loc[cells_bool, features] CorDf['time'] = mudata.obs.loc[cells_bool, pseudotime].to_numpy() CorDf = CorDf.sort_values(by='time') # Construct time bins time_range = (CorDf['time'].min(), CorDf['time'].max()) N_interval = bins interv_range = (time_range[1] - time_range[0]) / N_interval group_intev = pd.cut(CorDf['time'], np.arange(time_range[0]-1e-3, time_range[1]+interv_range, interv_range)) data = CorDf.groupby(group_intev).mean().dropna() print("Empty bins removed. %.i bins left"%(data.shape[0])) data.index = np.arange(data.shape[0]) data.index.name = None if rm_outlier: data = data[np.abs(stats.zscore(data)) <= 2] data = data.sort_values(by='time') if fitted is not None: fitDf = pd.DataFrame(np.zeros((fitted, data.shape[1])), columns=data.columns) timefit = np.linspace(data['time'].min(), data['time'].max(), fitted) fitDf['time'] = timefit for feature in features: tmpDf = data[['time', feature]].copy() tmpDf.columns = ['time', 'value'] fitDf[feature] = _fit_data(tmpDf, timefit) return data, fitDf else: return data
[docs]def TimeBinProportion(mudata, ident, path, pseudotime, bins=100): """ Function to calculate bined cell type proportion along trajectory Parameters -------------- mudata: MuData single-cell multi-omics data saved as MuData object ident: str column name in ``mudata.obs`` containing trajectory group labels path: List list of clusters ordered by their sequence on the trajectory. A path here should have no branch. pseudotime: str column name in ``mudata.obs`` containing trajectory pseudotime labels bins: int number of bins to divide the trajectory into Returns --------- DataFrame bined data with pseudotime and cell type proportion """ # Construct DataFrame cells_bool = mudata.obs[ident].isin(path).to_numpy() CorDf = pd.DataFrame(pd.get_dummies(mudata.obs.loc[cells_bool, ident]).astype('float32').loc[:,path].to_numpy(), columns=path) CorDf['time'] = mudata.obs.loc[cells_bool, pseudotime].to_numpy() CorDf = CorDf.sort_values(by='time') # Construct time bins time_range = (CorDf['time'].min(), CorDf['time'].max()) N_interval = bins interv_range = (time_range[1] - time_range[0]) / N_interval group_intev = pd.cut(CorDf['time'], np.arange(time_range[0]-1e-3, time_range[1]+interv_range, interv_range)) data = CorDf.groupby(group_intev).mean().dropna() print("Empty bins removed. %.i bins left"%(data.shape[0])) data.index = np.arange(data.shape[0]) data.index.name = None data.columns.name = ident return data
[docs]def FindPathDynamics(mudata, ident, path, pseudotime, rm_outlier=True, var_cutoff=0.1, range_cutoff=1.0, bins=100, plot=False): """ Detect highly variable gene-peak pairs along the trajectory by correlation strength range (max-min) and variance Parameters -------------- mudata: MuData single-cell multi-omics data saved as MuData object ident: str column name in ``mudata.obs`` containing trajectory group labels path: List list of clusters ordered by their sequence on the trajectory. A path here should have no branch. pseudotime: str column name in ``mudata.obs`` containing trajectory pseudotime labels bins: int (argument passed to TimeBinData) number of bins to divide the trajectory into rm_outlier: bool (argument passed to TimeBinData) whether or not adding cap and limit strength index within +- 2*std var_cutoff: float minimum variance cutoff range_cutoff: float minimum range (max - min) cutoff plot: bool if True, plot volcano plot Returns --------- DataFrame Dynamic gene-peak pairs along the trajectory """ # Construct DataFrame features = mudata.uns['Local_L_names'] data = TimeBinData(mudata, ident, path, pseudotime, features, bins=bins, rm_outlier=True) sumDf = pd.DataFrame(data[features].var(), columns=['variance']) sumDf['max'] = data[features].max() sumDf['min'] = data[features].min() sumDf['range'] = sumDf['max'] - sumDf['min'] sumDf = sumDf.sort_values(by='variance',ascending=False) filt = ((sumDf['variance']>=var_cutoff) &(sumDf['range']>=range_cutoff)) if plot: plt.scatter(sumDf['range'][~filt], sumDf['variance'][~filt], s=2, marker='o', c='grey') plt.scatter(sumDf['range'][filt], sumDf['variance'][filt], s=2, marker='o', c='red') plt.grid(which='both', linestyle='-', linewidth='0') plt.xlabel('min - max') plt.ylabel('variance') return sumDf[filt]
[docs]def PathDynamics(mudata, gene, peaks, ident, path, pseudotime, bins=100): """ Quantify regulatory dynamics along the trajectory for a single gene and its regulatory elements. Parameters ------------ mudata: MuData single-cell multi-omics data saved as MuData object gene: str a single gene name peak: List, numpy.array a list of peaks correlated with the gene (gene-peak pair should exist in mudata.uns['Local_L_names']) ident: str column name in ``mudata.obs`` containing trajectory group labels path: List list of clusters ordered by their sequence on the trajectory. A path here should have no branch. pseudotime: str pseudotime label for the trajectory saved in ``mudata.obs`` bins: int number of bins to divide the trajectory into Returns --------- MuData DataFrame added in mudata.uns['pathDym'][path][gene] describing correlation and cluster proportion changes along the trajectory """ features = [f"{gene}~{peak}" for peak in peaks] # Construct DataFrame cells_bool = mudata.obs[ident].isin(path).to_numpy() CorDf = pd.DataFrame(mudata.uns['Local_L'], columns=mudata.uns['Local_L_names']) CorDf = CorDf.loc[cells_bool, features] CorDf['time'] = mudata.obs.loc[cells_bool, pseudotime].to_numpy() CorDf[path] = pd.get_dummies(mudata.obs.loc[cells_bool, ident]).astype('float32').loc[:,path].to_numpy() CorDf = CorDf.sort_values(by='time') # Construct time bins time_range = (CorDf['time'].min(), CorDf['time'].max()) N_interval = bins interv_range = (time_range[1] - time_range[0]) / N_interval group_intev = pd.cut(CorDf['time'], np.arange(time_range[0]-1e-3, time_range[1]+interv_range, interv_range)) ''' agg_dict = {feature:['mean', 'std']} agg_dict['time'] = 'mean' for ct in path: agg_dict[ct] = 'mean' data = CorDf.groupby(group_intev).agg(agg_dict).dropna() #.mean().dropna() ''' data = CorDf.groupby(group_intev).mean().dropna() print("Empty bins removed. %.i bins left"%(data.shape[0])) data.index = np.arange(data.shape[0]) data.index.name = None path_name = '_'.join(path) if 'pathDym' not in mudata.uns.keys(): mudata.uns['pathDym'] = {} if path_name not in mudata.uns['pathDym'].keys(): mudata.uns['pathDym'][path_name] = {gene:data} else: mudata.uns['pathDym'][path_name][gene] = data return mudata
[docs]def DynamicModule(mudata, ident, path, pseudotime, features=None, bins=100, fitted=100, num_iteration=5000, som_shape=(2,2), sigma=0.5, learning_rate=.1, random_seed=1): """ Function to cluster gene-peak modules by Self-Organizing Map along the trajectory Parameters ------------ mudata: MuData single-cell multi-omics data saved as MuData object ident: str column name in ``mudata.obs`` containing trajectory group labels path: List list of clusters ordered by their sequence on the trajectory. A path here should have no branch. pseudotime: str pseudotime label for the trajectory saved in ``mudata.obs`` bins: int number of bins to divide the trajectory into fitted: int number of bins to divide the trajectory into for GaussianProcessRegressor fitted data ``bins`` sets bined data for later plotting; ``fitted`` sets bined data for clustering; It is recommended to keep the two the same num_iteration: int maximum number of iteration to optimize the SOM som_shape: Tuple[int, int] shape of the map, defines number and similarity structure of modules sigma: float the radius of the different neighbors in the SOM learning_rate: float optimization speed, how much weights are adjusted during each iteration random_seed: int random seed to make the results reproducible Returns --------- Dict key is module index, value is time bin data """ if features is None: features = mudata.uns['Local_L_names'] data, fitDf = TimeBinData(mudata, ident, path, pseudotime, features, bins=bins, rm_outlier=False, fitted=fitted) som_mtx = fitDf.to_numpy()[:,:-1].T som_mtx = stats.zscore(som_mtx, axis=1) som_mtx[som_mtx > 3] = 3 som_mtx[som_mtx < -3] = -3 print("Start training...") som = MiniSom(som_shape[0], som_shape[1], som_mtx.shape[1], sigma=sigma, learning_rate=learning_rate, neighborhood_function='gaussian', random_seed=random_seed) som.train_batch(som_mtx, num_iteration=num_iteration, verbose=True) # record module index winner_coordinates = np.array([som.winner(x) for x in som_mtx]).T cluster_index = np.ravel_multi_index(winner_coordinates, som_shape) somDict = {clsi:data.loc[:,np.concatenate([cluster_index==clsi, [True]])].copy() for clsi in np.unique(cluster_index)} return somDict