本文整理汇总了Python中statsmodels.sandbox.stats.multicomp.multipletests函数的典型用法代码示例。如果您正苦于以下问题:Python multipletests函数的具体用法?Python multipletests怎么用?Python multipletests使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了multipletests函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: calc_overlap_stats
def calc_overlap_stats(test_set, geneset_dict, total_genes):
    """Compute hypergeometric enrichment statistics for gene-set overlaps.

    Parameters
    ----------
    test_set : collection of gene identifiers to test for enrichment.
    geneset_dict : mapping of gene-set name -> member genes.
    total_genes : int, total number of genes in the background universe.

    Returns
    -------
    DataFrame of overlapping sets (match_count > 0) with the hypergeometric
    p-value plus Bonferroni and Benjamini-Hochberg adjusted p-values,
    sorted ascending by p-value.
    """
    overlaps = get_overlaps(test_set, geneset_dict)
    # P(X >= match_count) under the hypergeometric null: sf(k-1, M, n, N).
    p = overlaps.apply(
        lambda x: scipy.stats.hypergeom.sf(
            x.loc["match_count"] - 1,   # observed overlap (sf is exclusive, hence -1)
            total_genes,                # population size
            x.loc["size of set"],       # number of genes in current set
            len(test_set)),             # total number of genes in test set
        axis=1)
    p = pd.DataFrame(p, columns=["hypergeom p-val"])
    # Keep only sets that actually overlap the test set.
    # (.ix, .select and .sort were removed from pandas; use .loc with a
    # boolean mask and .sort_values instead.)
    overlaps = overlaps.loc[overlaps["match_count"] > 0]
    overlaps = overlaps.merge(p,
                              left_index=True,
                              right_index=True).sort_values("hypergeom p-val", ascending=True)
    if len(overlaps.index) > 0:
        overlaps["bonferroni"] = multicomp.multipletests(
            overlaps.loc[:, "hypergeom p-val"],
            method="bonferroni")[1]
        overlaps["b-h fdr adj pval"] = multicomp.multipletests(
            overlaps.loc[:, "hypergeom p-val"].fillna(1.0),
            method="fdr_bh")[1]
    return overlaps.sort_values("hypergeom p-val", ascending=True)
示例2: adjustPvalue
def adjustPvalue(s, preAdjusted, layoutAware):
    """Write rows to s.file (TSV) with multiple-testing-adjusted p-values appended.

    Parameters:
        s: context object; s.file is the output file path.
        preAdjusted: iterable of rows (lists) each containing a raw p-value.
        layoutAware: if True the p-value sits at column index 2, else 1.
    """
    # Column within each row that holds the raw p-value.
    if layoutAware:
        idx = 2
    else:
        idx = 1
    try:
        # Some hosts do not have this library. If not we don't adjust
        import statsmodels.sandbox.stats.multicomp as multicomp
        adjust = True
    except Exception:
        adjust = False
    with open(s.file, 'w') as f:
        f = csv.writer(f, delimiter='\t')
        preAdjVals = []
        for i, row in enumerate(preAdjusted):
            if not adjust:
                # No adjustment will happen so just write a NaN value to
                # the file so the UI won't try to display this value
                # NOTE(review): this path appends ONE column while the
                # adjusted path below appends TWO — confirm the downstream
                # reader tolerates the differing row widths.
                f.writerow(row + [float('NaN')])
                continue
            # Extract the p-values from the data.
            # Translate NaNs to one so the stats routine will take it.
            if math.isnan(row[idx]):
                preAdjVals.append(1)
            else:
                preAdjVals.append(row[idx])
        if not adjust:
            # Rows were already written with NaN placeholders above.
            return
        try:
            # Benjamini-Hochberg FDR correction for p-values returns:
            # [reject, p_vals_corrected, alphacSidak, alphacBonf]
            # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
            reject, adjPvals, alphacSidak, alphacBonf = multicomp.multipletests(preAdjVals, alpha=0.05, method='fdr_bh')
        except Exception:
            # Best-effort: fall back to non-significant placeholders.
            adjPvals = [1 for x in preAdjusted]
        try:
            # Bonferroni correction for p-values returns:
            # [reject, p_vals_corrected, alphacSidak, alphacBonf]
            # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
            reject, adjPvalsB, alphacSidak, alphacBonf = multicomp.multipletests(preAdjVals, alpha=0.05, method='bonferroni')
        except Exception:
            adjPvalsB = [1 for x in preAdjusted]
        # Write each original row with both adjusted p-values appended.
        for i, row in enumerate(preAdjusted):
            f.writerow(row + [sigDigs(adjPvals[i]), sigDigs(adjPvalsB[i])])
示例3: __enrich_counts
def __enrich_counts(db, to_r, query, pval_cutoff):
    """Attach p-values and multiple-testing corrections, then filter.

    Adds 'pval', 'qval_BH' and 'qval_bonferroni' columns to ``to_r``,
    sorts by (pval, counts) ascending and keeps only rows whose p-value
    is at or below ``pval_cutoff``. GRE ids in the index are coerced to
    plain ints before returning.
    """
    n_biclusters = db.bicluster_info.count()
    n_query = query.shape[0]
    to_r["pval"] = to_r.apply(compute_p, axis=1, M=n_biclusters, N=n_query)
    for col, correction in (("qval_BH", "fdr_bh"), ("qval_bonferroni", "bonferroni")):
        to_r[col] = multipletests(to_r.pval, method=correction)[1]
    to_r = to_r.sort_values(["pval", "counts"], ascending=True)
    # Discard entries above the significance cutoff.
    to_r = to_r.loc[to_r.pval <= pval_cutoff, :]
    # make sure GRE ids are integers
    to_r.index = map(int, to_r.index)
    return to_r
示例4: celltype_overrepresntation_list
def celltype_overrepresntation_list(self, enrichmentdf):
    '''
    Collect per-(celltype, gene) enrichment significance into one DataFrame.

    Builds one row for every cell type / gene pair in ``self.binom_pval_df``
    whose binomial p-value is below 1, attaches BH-FDR corrected p-values,
    stores the result on ``self.cellgenedf`` and then delegates to
    ``self.overall_significant_celltypes()``.
    '''
    significance = 1
    column = ['celltype', 'gene', 'enrichment', 'binom_pval', 'FDR']
    # Accumulate rows in a plain list: DataFrame.append and Series.iteritems
    # were removed in pandas 2.x, and per-row append was quadratic anyway.
    rows = []
    for gene, celltype in self.binom_pval_df.iterrows():
        for cell, pval in celltype.items():
            if pval < significance:
                rows.append([cell, gene, enrichmentdf.loc[gene, cell], pval, 0])
    cellgenedf = pd.DataFrame(rows, columns=column)
    cellgenedf = cellgenedf.sort_values(['celltype', 'binom_pval'], ascending=[True, True])
    cellgenedf.index = range(len(cellgenedf))
    pvals = cellgenedf['binom_pval'].values
    # Benjamini-Hochberg correction; element [1] holds corrected p-values.
    corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
    cellgenedf['FDR'] = corr_pvals[1]
    print('cellgenedf shape:', cellgenedf.shape)
    self.cellgenedf = cellgenedf
    print('cellgenedf shape after:', cellgenedf.shape)
    self.overall_significant_celltypes()
示例5: binom_significant_celltypes
def binom_significant_celltypes(self):
    '''
    Binomial test for significance of celltype enrichment.

    For each row of ``self.sigCelltypedf``, tests the observed gene-cluster
    count against the background probability stored in
    ``self.binom_prob_occu``, then BH-FDR-corrects all p-values and writes
    the result back to ``self.sigCelltypedf``.
    '''
    print('Testing celltype enrichment....')
    sigcelltype = self.sigCelltypedf
    # Group once so each celltype's enriched-gene rows can be counted cheaply.
    cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])
    binom_prob_occu = self.binom_prob_occu
    sigcelltype.loc[:, 'binom_pval'] = 1
    col = sigcelltype.columns.get_loc('binom_pval')
    for index, row in sigcelltype.iterrows():
        # Look up the background success probability for this celltype.
        bprob_ind = binom_prob_occu[binom_prob_occu['celltype'] == row['celltype']].index[0]
        background_prob = binom_prob_occu.loc[bprob_ind, 'background_prob']
        # Two-sided binomial test: (genecluster - 1) successes out of the
        # number of enriched genes for this celltype.
        binom_pval = stats.binom_test(row['genecluster']-1, len(cellgroup.get_group(row['celltype'])), background_prob, alternative='two-sided')
        # NOTE(review): iloc takes positions; this relies on the frame's
        # labels matching positions 0..n-1 — confirm an upstream index reset.
        sigcelltype.iloc[index, col] = binom_pval
    sigcelltype.loc[:, 'binom_FDR'] = 1
    sigcelltype = sigcelltype.sort_values('binom_pval', ascending=True)
    sigcelltype.index = range(len(sigcelltype))
    pvals = sigcelltype['binom_pval'].values
    # BH-FDR correction; element [1] holds the corrected p-values.
    corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
    sigcelltype['binom_FDR'] = corr_pvals[1]
    self.sigCelltypedf = sigcelltype
示例6: test_multi_pvalcorrection
def test_multi_pvalcorrection():
    # test against R package multtest mt.rawp2adjp
    # because of sort this doesn't check correct sequence - TODO: rewrite DONE
    rmethods = {
        "rawp": (0, "pval"),
        "Bonferroni": (1, "b"),
        "Holm": (2, "h"),
        "Hochberg": (3, "sh"),
        "SidakSS": (4, "s"),
        "SidakSD": (5, "hs"),
        "BH": (6, "fdr_i"),
        "BY": (7, "fdr_n"),
    }
    # Order in which R's sorted output maps back onto the raw p-values.
    r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]
    supported = {"b", "s", "sh", "hs", "h", "fdr_i", "fdr_n"}
    for res_col, method in rmethods.values():
        if method not in supported:
            continue
        corrected = multipletests(pval0, alpha=0.1, method=method)[1][r_sortindex]
        assert_almost_equal(corrected, res_multtest[:, res_col], 15)
    # fdrcorrection0: "n" (BY-style) vs column 7, "i" (BH-style) vs column 6.
    assert_almost_equal(np.sort(fdrcorrection0(pval0, method="n")[1]), res_multtest[:, 7], 15)
    assert_almost_equal(np.sort(fdrcorrection0(pval0, method="i")[1]), res_multtest[:, 6], 15)
示例7: run
def run(study, pop, gene_set, adjust='fdr_bh'):
    '''
    Run an over-representation analysis toward a gene set.

    :param study: the significant gene set
    :param pop: the background gene set
    :param gene_set: the function set, either a pre-parsed dict or a path
        to a GMT file
    :param adjust: the adjust method in the multiple tests,
        details in http://www.statsmodels.org/dev/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
    :return: the ORA analysis result
    '''
    # Accept either a dict or a GMT file path (isinstance, not type ==).
    gene_sets = gene_set if isinstance(gene_set, dict) else GMTUtils.parse_gmt_file(gene_set)
    # Build the string-id sets once instead of once per function set.
    pop_ids = {str(x) for x in pop}
    study_ids = {str(x) for x in study}
    mapped = {k: list(set(v) & pop_ids) for k, v in gene_sets.items()}
    s_mapped = {k: list(set(v) & study_ids) for k, v in gene_sets.items()}
    # Hypergeometric upper tail per set: P(X >= |study ∩ set|) with
    # population |pop|, |pop ∩ set| successes and |study| draws.
    result = {k: stats.hypergeom.sf(len(s_mapped[k]) - 1, len(pop), len(mapped[k]), len(study))
              for k in mapped}
    # Correct all p-values in one call; keep key order stable.
    keys = list(result.keys())
    _, corrected, _, _ = multicomp.multipletests([result[k] for k in keys], method=adjust)
    rfdr = dict(zip(keys, corrected))
    df_result = {'name': [], 'mapped': [], 'number in study': [], 'p-value': [], 'fdr': []}
    for k in mapped:
        df_result['name'].append(k)
        df_result['mapped'].append(len(mapped[k]))
        df_result['number in study'].append(len(s_mapped[k]))
        df_result['p-value'].append(result[k])
        df_result['fdr'].append(rfdr[k])
    df = pd.DataFrame(df_result)
    df = df[['name', 'mapped', 'number in study', 'p-value', 'fdr']]
    return ORA(df, study, pop, adjust)
示例8: combine
def combine(self, results):
    """
    Stouffer combination of zscores
    :param results: masked 2-D array of per-test z-scores (tests x columns)
    :return: array stacking combined z-scores, p-values and BH q-values
    """
    # Stouffer: sum of z divided by sqrt of the number of valid entries.
    combined = results.sum(axis=1) / np.sqrt(results.count(axis=1))
    n = combined.size
    missing = combined.mask
    ok_idx = np.where(~missing)
    bad_idx = np.where(missing)
    # Upper-tail p-values for the rows that had any valid data.
    raw_p = stats.norm.sf(combined[ok_idx])
    pvalues = np.empty(n)
    pvalues[ok_idx] = raw_p
    pvalues[bad_idx] = np.nan
    # BH FDR correction, skipped entirely when nothing is valid.
    raw_q = multipletests(raw_p, method='fdr_bh')[1] if raw_p.size != 0 else np.array([])
    qvalues = np.empty(n)
    qvalues[ok_idx] = raw_q
    qvalues[bad_idx] = np.nan
    return np.array([combined, pvalues, qvalues])
示例9: stat_test
def stat_test(f, test_name, test, fdr):
    """Run a per-region statistical test and save full + significant results.

    Parameters:
        f: path to an input TSV with 'chr', 'start', 'end' and sample columns.
        test_name: label used to build the output file names.
        test: callable taking (od_values, yd_values) and returning a p-value.
        fdr: significance level for the Benjamini-Hochberg correction.
    """
    print('Testing', test_name, f, 'fdr', fdr)
    df = pd.read_csv(f, sep='\t')
    # Drop contigs: keep only canonical chromosome names.
    df = df.loc[[bool(re.match('chr[0-9XYM]+$', c)) for c in df['chr']]]
    ods = [c for c in df.columns.values if is_od(c)]
    yds = [c for c in df.columns.values if is_yd(c)]
    pvals = np.array([test(row[ods], row[yds]) for _, row in df.iterrows()])
    # BH correction; res[0] = reject flags, res[1] = adjusted p-values.
    res = multipletests(pvals, fdr, "fdr_bh")
    h0_rejects = res[0]
    pvals_adj = res[1]
    df['pval'] = pvals
    df['pval_adj'] = pvals_adj
    df['od_mean'] = df[ods].mean(axis=1)
    df['yd_mean'] = df[yds].mean(axis=1)
    df['logfc'] = np.log(df['od_mean'] / df['yd_mean'])
    # Sort by pvalue. pvals_order holds POSITIONS (argsort of a plain
    # array), so use iloc: after the contig filter above the index labels
    # no longer match positions and .loc would select the wrong rows.
    pvals_order = pvals.argsort()
    df = df.iloc[pvals_order]
    h0_rejects = h0_rejects[pvals_order]
    # Save results
    results = re.sub(r'\.tsv', '_{}.tsv'.format(test_name), f)
    df[['chr', 'start', 'end', 'yd_mean', 'od_mean', 'logfc', 'pval', 'pval_adj']] \
        .to_csv(results, sep='\t', index=False, header=True)
    print('Saved test results to', results)
    # Save significant results
    if sum(h0_rejects) > 0:
        results_fdr = re.sub(r'\.tsv', '_{}_diff_fdr_{}.bed'.format(test_name, fdr), f)
        df.loc[h0_rejects][['chr', 'start', 'end']] \
            .to_csv(results_fdr, sep='\t', index=False, header=True)
        print('Saved {} significant results at FDR={} to {}'.format(
            sum(h0_rejects), fdr, results_fdr))
示例10: combine
def combine(self, results):
    """
    Fisher's combination of pvalues
    :param results: 2-D array of per-test p-values (tests x columns)
    :return: array stacking combined p-values and BH q-values
    """
    # Work on a copy; clamp tiny p-values so the logarithm stays finite.
    clamped = np.copy(results)
    clamped[clamped < PVALUE_EPSILON] = PVALUE_EPSILON
    logs = np.ma.log(clamped)
    log_sum = logs.sum(axis=1)
    n_tests = logs.count(axis=1)
    n = log_sum.size
    missing = log_sum.mask
    ok_idx = np.where(~missing)
    bad_idx = np.where(missing)
    # Fisher's statistic: -2*sum(log p) ~ chi-squared with 2k dof.
    raw_p = 1.0 - stats.chi2.cdf(-2.0 * log_sum[ok_idx], 2 * n_tests[ok_idx])
    pvalues = np.empty(n)
    pvalues[ok_idx] = raw_p
    pvalues[bad_idx] = np.nan
    # BH FDR correction, skipped entirely when nothing is valid.
    raw_q = multipletests(raw_p, method='fdr_bh')[1] if raw_p.size != 0 else np.array([])
    qvalues = np.empty(n)
    qvalues[ok_idx] = raw_q
    qvalues[bad_idx] = np.nan
    return np.array([pvalues, qvalues])
示例11: perform_multiple_comparison_stat
def perform_multiple_comparison_stat(data1, data2, alpha=0.05):
    """Column-wise t-tests with multiple-comparison correction.

    :param data1: 2-D sequence (observations x measurements).
    :param data2: 2-D sequence with the same number of columns as data1.
    :param alpha: significance level passed to the correction.
    :return: True if they are statistically different, i.e. the fraction
        of rejected null hypotheses reaches ``alpha`` (useful when there
        are A LOT of elements).
    """
    mat1 = np.array(data1)
    mat2 = np.array(data2)
    comparisons = len(data1[0])
    # Independent-samples t-test p-value for every column.
    pvals = [ttest_ind(mat1[:, i].tolist(), mat2[:, i])[1] for i in range(comparisons)]
    # multipletests uses statsmodels' default correction method here;
    # element [0] holds the per-column reject/accept flags.
    mult_comparison = multipletests(pvals, alpha=alpha)
    print(mult_comparison[0])
    rejects = mult_comparison[0]
    # Count rejections with truthiness (no `== True`); float() guards
    # against integer division if this ever runs under Python 2.
    true_counter = sum(1 for flag in rejects if flag)
    return float(true_counter) / len(rejects) >= alpha
示例12: hypergeometric_significant_celltypes
def hypergeometric_significant_celltypes(self):
    '''
    hypergeometric test for significance of celltype enrichment.

    For every celltype in ``self.sigCelltypedf``, computes the upper-tail
    hypergeometric probability of its gene-cluster count, then BH-FDR
    corrects all p-values and stores the result back on
    ``self.sigCelltypedf``.
    '''
    print('Testing celltype enrichment....')
    sigcelltype = self.sigCelltypedf
    cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])
    # Population size: every gene in the occurrence matrix.
    totalgenes = self.occurrencedf.shape[0]
    # Number of distinct genes significant at FDR <= 0.05.
    allsiggenes = self.cellgenedf
    allsiggenes = allsiggenes[allsiggenes['FDR'] <= 0.05]
    allsiggenes = len(set(allsiggenes['gene']))
    sigcelltype.loc[:, 'hyper_pval'] = 1
    col = sigcelltype.columns.get_loc('hyper_pval')
    for index, row in sigcelltype.iterrows():
        ## stats.hypergeom.sf(x, M, n, N)
        # P(X >= genecluster): x = genecluster-1 (sf is exclusive),
        # M = total genes, n = significant genes, N = genes in celltype.
        hyper_pval = stats.hypergeom.sf(row['genecluster']-1, totalgenes, allsiggenes, len(cellgroup.get_group(row['celltype'])))
        # NOTE(review): iloc takes positions; this relies on the frame's
        # labels matching positions 0..n-1 — confirm an upstream index reset.
        sigcelltype.iloc[index, col] = hyper_pval
    sigcelltype.loc[:, 'hyper_FDR'] = 1
    sigcelltype = sigcelltype.sort_values('hyper_pval', ascending=True)
    sigcelltype.index = range(len(sigcelltype))
    pvals = sigcelltype['hyper_pval'].values
    # BH-FDR correction; element [1] holds the corrected p-values.
    corr_pvals = statsmodels.multipletests(pvals=pvals, alpha=0.05, method='fdr_bh')
    sigcelltype['hyper_FDR'] = corr_pvals[1]
    self.sigCelltypedf = sigcelltype
示例13: calculate_gene_expression_similarity
def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    """Correlate a statistical map with Allen Brain Atlas gene expression.

    For each donor, regresses probe expression on the (z-scored) stat-map
    values, then combines the per-donor slopes with a one-sample t-test
    and a BH-FDR correction.

    Parameters:
        reduced_stat_map_data: array of stat-map values indexable by the
            expression data's column positions.
        mask: "full", "cortex" or "subcortex" — restricts which samples
            are used.

    Returns a DataFrame with per-probe t, p, FDR-corrected p and
    variance-explained columns, joined with probe annotations.
    """
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"
    results_dfs = []
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            print "Loading expression data (%s)" % donor_id
            expression_data = store.get(donor_id.replace(".", "_"))
            print "Getting statmap values (%s)" % donor_id
            nifti_values = reduced_stat_map_data[expression_data.columns]
            print "Removing missing values (%s)" % donor_id
            na_mask = np.isnan(nifti_values)
            # Optionally also exclude samples outside the requested region.
            if mask == "subcortex":
                na_mask = np.logical_or(na_mask,
                                        np.isnan(np.load(subcortex_mask)[expression_data.columns]))
            elif mask == "cortex":
                na_mask = np.logical_or(na_mask, np.logical_not(np.isnan(
                    np.load(subcortex_mask)[expression_data.columns])))
            else:
                assert mask == "full"
            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask], axis=1, inplace=True)
            print "z scoring (%s)" % donor_id
            expression_data = pd.DataFrame(zscore(expression_data, axis=1), columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)
            print "Calculating linear regressions (%s)" % donor_id
            # Least squares with an intercept column; [0][0] is the slope row.
            regression_results = np.linalg.lstsq(np.c_[nifti_values, np.ones_like(nifti_values)], expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]}, index=expression_data.index)
            # Donor id (minus the leading '/') becomes the outer column level.
            results_df.columns = pd.MultiIndex.from_tuples([(donor_id[1:], c,) for c in results_df.columns],
                                                           names=['donor_id', 'parameter'])
            results_dfs.append(results_df)
    print "Concatenating results"
    results_df = pd.concat(results_dfs, axis=1)
    del results_dfs
    # One-sample t-test across donors: is the mean slope nonzero per probe?
    t, p = ttest_1samp(results_df, 0.0, axis=1)
    group_results_df = pd.DataFrame({"t": t, "p": p}, columns=['t', 'p'], index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(group_results_df.p, method='fdr_bh')
    group_results_df["variance explained (mean)"] = (results_df.xs('slope', axis=1, level=1) ** 2 * 100).mean(axis=1)
    group_results_df["variance explained (std)"] = (results_df.xs('slope', axis=1, level=1) ** 2 * 100).std(axis=1)
    del results_df
    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv", index_col=0).drop(['chromosome', "gene_id"], axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[["gene_symbol", "entrez_id.1", "gene_name","t", "p", "p (FDR corrected)",
                                         "variance explained (mean)", "variance explained (std)"]]
    return group_results_df
示例14: calculate_quantile_pvalue
def calculate_quantile_pvalue(
    self, quantile, minvalues=10
):
    """Compare distance distributions between two conditions per quantile.

    Parameters:
        quantile: float or list of floats — quantiles at which to extract
            distance data via ``self.extract_dist_quantile``.
        minvalues: minimum number of values required in BOTH conditions
            before a Mann-Whitney U p-value is computed for a quantile.

    Returns a DataFrame indexed by quantile with per-sample and
    per-condition counts/means, a 'pvalue' column and a BH-FDR-corrected
    'fdr' column.
    """
    # Check arguments
    if isinstance(quantile, float):
        quantile = [quantile]
    elif isinstance(quantile, list):
        for q in quantile:
            if not isinstance(q, float):
                raise TypeError('quantile list must contain floats')
    else:
        raise TypeError('quantile must be float or list of floats')
    # Create colnames for output dataframe
    colNames = []
    for condition in self.matrices:
        for sample in self.matrices[condition]:
            colNames.append('{}_{}_no'.format(condition, sample))
            colNames.append('{}_{}_mean'.format(condition, sample))
    for condition in self.matrices:
        colNames.append('{}_no'.format(condition))
        colNames.append('{}_mean'.format(condition))
    colNames.extend(['pvalue', 'fdr'])
    # Create output dataframe
    outDF = pd.DataFrame(index=quantile, columns=colNames)
    outDF = outDF.sort_index()
    # Extract quantile distance data
    quantData = self.extract_dist_quantile(quantile)
    splitQuant = quantData.groupby('quan')
    for q, data in splitQuant:
        # Extract data for conditions and samples
        condValues = []
        for cond in self.matrices:
            # Extract data for condition
            condData = data[data['cond'] == cond]
            condDist = condData['dist']
            condValues.append(condDist)
            # Add condition data to output
            colPrefix = '{}_'.format(cond)
            outDF.loc[q, colPrefix + 'no'] = condDist.size
            outDF.loc[q, colPrefix + 'mean'] = condDist.mean()
            for smpl in self.matrices[cond]:
                # Extract data for sample
                smplData = condData[condData['smpl'] == smpl]
                smplDist = smplData['dist']
                # Add sample data to output
                colPrefix = '{}_{}_'.format(cond, smpl)
                outDF.loc[q, colPrefix + 'no'] = smplDist.size
                outDF.loc[q, colPrefix + 'mean'] = smplDist.mean()
        # Calculate pvalues
        # NOTE(review): this unpacking assumes exactly two conditions in
        # self.matrices — confirm upstream guarantees that.
        dist1, dist2 = condValues
        if dist1.size >= minvalues and dist2.size >= minvalues:
            outDF.loc[q, 'pvalue'] = mannwhitneyu(dist1, dist2)[1]
    # Add fdr and return: BH-correct only the rows that received a p-value.
    pvalueIndex = outDF.index[~outDF['pvalue'].isnull()]
    outDF.loc[pvalueIndex, 'fdr'] = multipletests(
        outDF.loc[pvalueIndex, 'pvalue'], method='fdr_bh')[1]
    return(outDF)
示例15: bhCorrection
def bhCorrection(s, n=None):
    """Benjamini-Hochberg correction for a Series of p-values.

    Parameters:
        s: pandas Series of p-values; NaNs are treated as 1.
        n: optional total number of tests. When given and larger than
            len(s), the p-value list is padded with 1s so the correction
            runs over the full family of tests.

    Returns a Series of adjusted p-values aligned to s.index, named 'p_adj'.
    """
    s = s.fillna(1.)
    # Guard n=None explicitly: `None > int` raises TypeError on Python 3.
    if n is not None and n > len(s):
        # Pad with non-significant placeholders up to n tests.
        p_vals = list(s) + [1] * (n - len(s))
    else:
        p_vals = list(s)
    # Keep only the entries corresponding to the original Series.
    q = multicomp.multipletests(p_vals, method='fdr_bh')[1][:len(s)]
    return pd.Series(q, s.index, name='p_adj')