import numpy as np
import pandas as pd
import scanpy as sc
import random
import anndata as ad
import time
from treasmo.moran_vec import Moran
import treasmo.lee_vec as lee_vec
from libpysal.weights import W
import scipy as sp
from scipy import stats
#from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multitest import fdrcorrection
__author__ = "Chaozhong Liu"
__email__ = "czliubioinfo@gmail.com"
[docs]def Morans_I(mudata, mods=['rna','atac'], seed=1, max_RAM=16):
"""
Function to calculate Moran's I for all the features in multiome data
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
seed: int
random seed to make the results reproducible
max_RAM: int
maximum limitation of memory usage (Gb)
Returns
--------------
MuData
added with var['Morans.I']
"""
# Check if mods exist in MuData
if sum([i in mudata.mod.keys() for i in mods]) == len(mods):
pass
else:
print("Not all mods provided are in MuData! Running stopped.")
return
# Start
random.seed(seed)
np.random.seed(seed)
start = time.time()
# Construct connection matrix
cong_mtx = mudata.obsp['connectivities'].toarray()
#cong_mtx = cong_mtx/cong_mtx.sum(axis=1)[:,np.newaxis]
neighbors = {}
weights = {}
for i in range(cong_mtx.shape[0]):
neighbors[i] = np.nonzero(cong_mtx[i])[0].tolist()
weights[i] = cong_mtx[i][np.nonzero(cong_mtx[i])[0]].tolist()
w = W(neighbors, weights)
mi = Moran(mudata.n_obs, w, permutations=0)
mudata.var['Morans.I'] = np.NaN
for mod in mods:
mi.calc_i(mudata.mod[mod].X, seed=seed, max_RAM=max_RAM)
mudata.mod[mod].var['Morans.I'] = mi.I
mudata.var['Morans.I'] = np.concatenate([mudata.mod[mod].var['Morans.I'].to_numpy() for mod in mudata.mod.keys()])
#print(mi.I)
print("Finished calculating Moran's I. %.3fs past"%(time.time()-start))
return mudata
def _check_array(mtx):
if sp.sparse.issparse(mtx): #isinstance(mtx, sp.spmatrix):
return mtx.toarray()
elif isinstance(mtx, np.ndarray):
return mtx
else:
raise Exception("Omics Data should be either numpy array or scipy sparse matrix.")
def _prepare_mtx(anndat, features):
index_df = pd.DataFrame({'index':np.arange(anndat.n_vars)})
index_df.index = anndat.var.index
feature_index = index_df.loc[features,'index'].to_numpy()
return _check_array(anndat.X[:,feature_index])
[docs]def Global_L(mudata, pairsDf, mods=['rna','atac'], permutations=0, percent=0.1, seed=1, max_RAM=16):
"""
Function to calculate the global L index (mean of correlation strength index) for all the pairs in multiome data
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
pairsDf: pandas.DataFrame
gene-peak pair DataFrame containing pairs to be calculated
self-prepared or called from treasmo.tl.peaks_within_distance / TFBS_match
permutations: int
Number of permutations for significance test.
Default is 0, meaning no significance test
999 is a good choice for most cases, but it might take a long time (hours) to finish depending on the number of pairs
percent: float
percentage of cells to shuffle during permutation.
For most of the time, default 0.1 is already a good choice.
seed: int
random seed to make the results reproducible
max_RAM: int
maximum limitation of memory usage(Gb)
Returns
--------------
DataFrame
pairsDf added with extra columns:
Global L results
QC metrics (feature sparsity)
"""
random.seed(seed)
np.random.seed(seed)
start = time.time()
print("Calculating KNN graph-based global correlation...")
cong_mtx = mudata.obsp['connectivities'].toarray()
np.fill_diagonal(cong_mtx, 1.0)
eSP = lee_vec.Spatial_Pearson(cong_mtx, permutations=permutations)
genes = pairsDf['Gene']
peaks = pairsDf['Peak']
gene_X = _prepare_mtx(mudata.mod[mods[0]], genes)
peak_X = _prepare_mtx(mudata.mod[mods[1]], peaks)
eSP = eSP.fit(gene_X, peak_X, percent=percent, seed=seed, max_RAM=max_RAM)
pairsDf['L'] = eSP.association_
pairsDf['L.p_value'] = eSP.significance_ if permutations else np.NaN
if permutations:
_,pairsDf['L.FDR'] = fdrcorrection(peaks_nearby['L.p_value'],
alpha=0.05, method='indep')
else:
pairsDf['L.FDR'] = np.NaN
pairsDf['gene.pct'] = mudata.mod[mods[0]].var.loc[genes,'Frac.all'].to_numpy()
pairsDf['peak.pct'] = mudata.mod[mods[1]].var.loc[peaks,'Frac.all'].to_numpy()
print("Finished calculating correlation. %.3fs past"%(time.time()-start))
return pairsDf
def _calculate_LL(mudata, genes, peaks,
mods=['rna','atac'],
seed=1, max_RAM=16):
random.seed(seed)
np.random.seed(seed)
cong_mtx = mudata.obsp['connectivities'].toarray()
np.fill_diagonal(cong_mtx, 1.0)
gene_X = _prepare_mtx(mudata.mod[mods[0]], genes)
peak_X = _prepare_mtx(mudata.mod[mods[1]], peaks)
eSP2 = lee_vec.Spatial_Pearson_Local(cong_mtx, permutations=0)
eSP2 = eSP2.fit(gene_X,peak_X,seed=seed, max_RAM=max_RAM)
'''
if permutations:
local_p_df = pd.DataFrame(eSP2.significance_)
local_p_df.columns = pair_names
'''
L_mtx = eSP2.associations_
pair_names = pd.Series(genes) + '~' + pd.Series(peaks)
pair_names = pair_names.to_numpy()
return L_mtx, pair_names
[docs]def Local_L(mudata, genes, peaks,
mods=['rna','atac'],
rm_dropout=False,
seed=1, max_RAM=16):
"""
Function to calculate the single-cell gene-peak correlation strength index for all the pairs in multiome data
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
genes, peaks: List, numpy.array
one-to-one lists containing gene and peak names
rm_dropout: bool
make correlation strength index to be 0 if feature value is 0 (dropout)
seed: int
random seed to make the results reproducible
max_RAM: int
maximum limitation of memory usage (Gb)
Returns
--------------
MuData
added with Local L matrix and gene-peak pair names in ``mudata.uns``
"""
random.seed(seed)
np.random.seed(seed)
start = time.time()
print("Inferring KNN graph-based local correlation...")
L_mtx, L_mtx_names = _calculate_LL(mudata, genes, peaks,
mods=mods,
seed=1, max_RAM=16)
if rm_dropout:
print("Setting local correlation to 0 for cells with no expression on either feature of a certain pair...")
GP_names = L_mtx_names
Dropout_mtx = _dropout_filter(mudata, GP_names, mods)
L_mtx = L_mtx * Dropout_mtx
mudata.uns['Local_L'] = L_mtx
mudata.uns['Local_L_names'] = L_mtx_names
print("Following changes made to the AnnData object:")
print("\tKNN graph-based local correlation results saved in uns['Local_L']")
print("\tGene-peak pair names saved in uns['Local_L_names']")
print("Finished Inferring local correlation. %.3fs past"%(time.time()-start))
return mudata
def _dropout_filter(mudata, GP_names, mods):
# Gene dropout
index_df = pd.DataFrame({'index':np.arange(len(mudata.mod[mods[0]].var_names))})
index_df.index = mudata.mod[mods[0]].var_names
GP_G = [gp.split('~')[0] for gp in GP_names]
GP_genes_index = index_df.loc[GP_G,:]['index'].to_numpy()
E_mtx = _check_array(mudata.mod[mods[0]].X)
E_mtx_dropout_value = np.zeros(E_mtx.shape)
Dropout_mtx = (~np.isclose(E_mtx_dropout_value,E_mtx, 1e-3)).astype(int)
Dropout_mtx_G = Dropout_mtx[:,GP_genes_index]
# Peak dropout
index_df = pd.DataFrame({'index':np.arange(len(mudata.mod[mods[1]].var_names))})
index_df.index = mudata.mod[mods[1]].var_names
GP_P = [gp.split('~')[1] for gp in GP_names]
GP_peaks_index = index_df.loc[GP_P,:]['index'].to_numpy()
E_mtx = _check_array(mudata.mod[mods[1]].X)
E_mtx_dropout_value = np.zeros(E_mtx.shape)
Dropout_mtx = (~np.isclose(E_mtx_dropout_value,E_mtx, 1e-3)).astype(int)
Dropout_mtx_P = Dropout_mtx[:,GP_peaks_index]
Dropout_mtx = Dropout_mtx_G * Dropout_mtx_P
return Dropout_mtx
#=========================================================================================
# Pearson correlation function for benchmark purpose
#=========================================================================================
[docs]def Pearsonr(mudata, genes, peaks,
mods=['rna','atac'],
p_value=False, seed=1):
"""
Function to calculate the Pearson correlation between genes and peaks
Parameters
--------------
mudata: MuData
single-cell multi-omics data saved as MuData object
mods: List[str, str]
scRNA-seq and scATAC-seq modality name in MuData object
genes, peaks: List, numpy.array
one-to-one lists containing gene and peak names
p_value: bool
perform significance testing or not
seed: int
random seed to make the results reproducible
Returns
--------------
DataFrame
containing Pearson correlation r, p-value and FDR for all gene-peak pairs
"""
random.seed(seed)
np.random.seed(seed)
gene_X = _prepare_mtx(mudata.mod[mods[0]], genes)
peak_X = _prepare_mtx(mudata.mod[mods[1]], peaks)
gene_X = pd.DataFrame(gene_X, index=np.arange(gene_X.shape[0]), columns=np.arange(gene_X.shape[1]))
peak_X = pd.DataFrame(peak_X, index=np.arange(peak_X.shape[0]), columns=np.arange(gene_X.shape[1]))
global_P_df = pd.DataFrame(peak_X.corrwith(gene_X, method='pearson', axis=0))
global_P_df.index = pd.Series(genes) + '~' + pd.Series(peaks)
global_P_df.columns = ['r']
if p_value:
print("Calculating p-values...")
p_list = []
for i in range(global_P_df.shape[0]):
p_list.append(stats.pearsonr(gene_X.iloc[:,i].to_numpy(), peak_X.iloc[:,i].to_numpy())[1])
global_P_df['r.p_value'] = p_list
print("Calculating FDR...")
_, global_P_df['r.FDR'] = fdrcorrection(global_P_df['r.p_value'],
alpha=0.05, method='indep')
else:
global_P_df['r.p_value'] = np.NaN
global_P_df['r.FDR'] = np.NaN
return global_P_df