This article collects typical usage examples of the fetch_DataFrame method from the Python module CGAT.Database. If you have been wondering what Database.fetch_DataFrame does, how to call it, or what real uses of it look like, the curated examples below may help. You can also explore the containing module, CGAT.Database, for related functionality.
Four code examples of Database.fetch_DataFrame are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
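Before the examples, a minimal sketch of a typical call may help. This sketch is not taken from the CGAT sources: the database file name "csvdb", the table "gene_counts", and the attached "annotations.db" are placeholders, and the positional signature fetch_DataFrame(statement, database, attach) is inferred from the examples below.

import CGAT.Database as DB

# fetch the result of an SQL query as a pandas DataFrame;
# "csvdb" and "gene_counts" are placeholder names for this sketch
statement = "SELECT gene_id, counts FROM gene_counts"
df = DB.fetch_DataFrame(statement, "csvdb")

# an attach statement lets the query join tables across databases,
# as Examples 1 and 2 below do ("annotations.db" is also a placeholder)
attach = 'attach "annotations.db" as anndb'
df = DB.fetch_DataFrame(statement, "csvdb", attach)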
Example 1: numberGenesDetectedCufflinks
# Required module import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0 in each sample'''
    table = P.toTable(infile)
    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()
    df = DB.fetch_DataFrame(statement, DATABASE, attach)
    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]
    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])
    grouped_df = melted_df.groupby(["gene_biotype", "variable"])
    agg_df = grouped_df.agg({"value": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)
    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index
    count_df.to_csv(outfile, index=False, sep="\t")
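The melt/groupby/pivot pattern above is the heart of this example, so here it is in isolation as a runnable sketch on made-up toy data (the gene, biotype, and sample values are invented for illustration):

import numpy as np
import pandas as pd

# toy wide table: one row per gene, one expression column per sample
df = pd.DataFrame({"tracking_id": ["g1", "g2", "g3"],
                   "gene_biotype": ["protein_coding", "protein_coding",
                                    "lincRNA"],
                   "sampleA": [5.0, 0.0, 2.0],
                   "sampleB": [0.0, 0.0, 1.0]})

# long format: one row per (gene, sample) pair
melted = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

# count entries > 0 within each (biotype, sample) group
agg = melted.groupby(["gene_biotype", "variable"]).agg(
    {"value": lambda x: np.sum([1 for y in x if y > 0])})
agg.reset_index(inplace=True)

# back to wide: one row per sample, one column per biotype
counts = pd.pivot_table(agg, index="variable",
                        values="value", columns="gene_biotype")
print(counts)
# gene_biotype  lincRNA  protein_coding
# variable
# sampleA             1               1
# sampleB             1               0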
Example 2: numberGenesDetectedFeatureCounts
# Required module import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts at counts > 0
    in each sample'''
    table = P.toTable(infile)
    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
                ''' % locals()
    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)
    grouped_df = melted_df.groupby(["gene_biotype", "track"])
    agg_df = grouped_df.agg({"counts": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)
    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index
    count_df.to_csv(outfile, index=False, sep="\t")
Example 3: qcSummary
# Required module import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def qcSummary(infiles, outfile):
    '''create a summary table of relevant QC metrics'''
    # Some QC metrics are specific to paired end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                            PERCENT_DUPLICATION as pct_duplication,
                            ESTIMATED_LIBRARY_SIZE as library_size,
                            PCT_READS_ALIGNED_IN_PAIRS
                                as pct_reads_aligned_in_pairs,
                            MEDIAN_INSERT_SIZE
                                as median_insert_size,
                         '''
        pcat = "PAIR"
    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"
    tables = [P.toTable(x) for x in infiles
              if P.toTable(x) not in exclude]
    t1 = tables[0]
    name_fields = PARAMS["name_field_titles"].strip()
    stat_start = '''select distinct %(name_fields)s,
                           sample_information.sample_id,
                           fraction_spliced,
                           fraction_spike,
                           qc_no_genes_cufflinks.protein_coding
                               as cufflinks_no_genes_pc,
                           qc_no_genes_cufflinks.total
                               as cufflinks_no_genes,
                           qc_no_genes_featurecounts.protein_coding
                               as featurecounts_no_genes_pc,
                           qc_no_genes_featurecounts.total
                               as featurecounts_no_genes,
                           three_prime_bias
                               as three_prime_bias,
                           nreads_uniq_map_genome,
                           nreads_uniq_map_spike,
                           %(paired_columns)s
                           PCT_MRNA_BASES
                               as pct_mrna,
                           PCT_CODING_BASES
                               as pct_coding,
                           PCT_PF_READS_ALIGNED
                               as pct_reads_aligned,
                           TOTAL_READS
                               as total_reads,
                           PCT_ADAPTER
                               as pct_adapter,
                           PF_HQ_ALIGNED_READS*1.0/PF_READS
                               as pct_pf_reads_aligned_hq
                    from %(t1)s
                 ''' % locals()
    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"
    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()
    statement = "\n".join([stat_start, join_stat, where_stat])
    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)
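The join_stat loop above assembles an n-way left join at runtime. As a standalone sketch with hypothetical QC table names standing in for the P.toTable(x) results, it produces SQL like this:

# hypothetical table names for illustration only
tables = ["qc_alignment_summary_metrics", "qc_rna_metrics", "qc_adapters"]
t1 = tables[0]

join_stat = ""
for table in tables[1:]:
    join_stat += "left join " + table + "\n"
    join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

print(join_stat)
# left join qc_rna_metrics
# on qc_alignment_summary_metrics.sample_id=qc_rna_metrics.sample_id
# left join qc_adapters
# on qc_alignment_summary_metrics.sample_id=qc_adapters.sample_id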
Example 4: generatePeakSets
# Required module import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement)
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement)
    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve the maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure %s;"
                         " don't know which column to sort on"
                         % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
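The if/elif ladder above selects a 1-based column of the narrowPeak format to sort on (column 7 is signalValue, 8 is pValue, 9 is qValue). As a design note, the same mapping can be kept in a single dict. The sketch below is an alternative, not the pipeline's own code, and the PARAMS dict shown is a placeholder for the pipeline configuration:

# placeholder config; in the pipeline, PARAMS comes from the ini file
PARAMS = {"idr_options_ranking_measure": "signal.value"}

# narrowPeak columns (1-based): 7 = signalValue, 8 = pValue, 9 = qValue
RANK_COLUMN = {"signal.value": 7, "p.value": 8, "q.value": 9}

measure = PARAMS["idr_options_ranking_measure"]
if measure not in RANK_COLUMN:
    raise ValueError("Unrecognised ranking_measure %s;"
                     " don't know which column to sort on" % measure)
sort_statement = "sort -k%(col)dnr,%(col)dnr" % {"col": RANK_COLUMN[measure]}
print(sort_statement)  # sort -k7nr,7nr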