

Python Database.fetch_DataFrame Method Code Examples

This article collects typical usage examples of the Python method CGAT.Database.fetch_DataFrame. If you are wondering what Database.fetch_DataFrame does or how to call it, the curated examples below may help. You can also explore further usage examples from its containing module, CGAT.Database.


Four code examples of the Database.fetch_DataFrame method are shown below, ordered by popularity by default. You can upvote the examples you find useful; your votes help the system recommend better Python code examples.
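
Before the full examples, a minimal usage sketch may help. It assumes the call signature suggested by the examples below, fetch_DataFrame(statement, database, attach); the database name and table are hypothetical:

from CGAT import Database as DB

# Hypothetical SQLite database and table.
statement = "SELECT gene_id, counts FROM feature_counts"

# fetch_DataFrame runs the SQL statement against the given database and
# returns the result as a pandas DataFrame; an optional third argument
# (an SQL "attach" statement, as in Examples 1 and 2) makes a second
# database available for cross-database joins.
df = DB.fetch_DataFrame(statement, "csvdb")
print(df.head())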

Example 1: numberGenesDetectedCufflinks

# Required import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def numberGenesDetectedCufflinks(infile, outfile):
    '''Count the number of genes detected at copy number > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()

    statement = '''select distinct c.*, gene_biotype from %(table)s c
                   inner join anndb.gene_info i
                   on c.tracking_id=i.gene_id
                ''' % locals()

    df = DB.fetch_DataFrame(statement, DATABASE, attach)

    # snip off the cufflinks replicate field
    df.columns = [x[:-len("_0")] if x.endswith("_0") else x
                  for x in df.columns]

    melted_df = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

    grouped_df = melted_df.groupby(["gene_biotype", "variable"])

    agg_df = grouped_df.agg({"value": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="variable",
                              values="value", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Developer: snsansom | Project: scseq | Lines: 34 | Source: pipeline_scrnaseq.py
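
The melt → groupby → pivot pattern above is the heart of the counting logic and is worth seeing in isolation. A self-contained toy sketch of the same steps (the sample data is made up):

import numpy as np
import pandas as pd

# Toy expression table in the same shape as the Cufflinks output:
# one row per gene, one column per sample, plus a biotype column.
df = pd.DataFrame({
    "tracking_id": ["g1", "g2", "g3"],
    "gene_biotype": ["protein_coding", "protein_coding", "lincRNA"],
    "sampleA": [5.0, 0.0, 1.0],
    "sampleB": [0.0, 0.0, 2.0],
})

# Long format: one row per (gene, sample) pair.
melted = pd.melt(df, id_vars=["tracking_id", "gene_biotype"])

# Count genes with a value > 0, per biotype and sample.
agg = (melted.groupby(["gene_biotype", "variable"])
             .agg({"value": lambda x: np.sum([1 for y in x if y > 0])})
             .reset_index())

# Back to wide format: samples as rows, biotypes as columns.
counts = pd.pivot_table(agg, index="variable",
                        values="value", columns="gene_biotype")
print(counts)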

Example 2: numberGenesDetectedFeatureCounts

# Required import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def numberGenesDetectedFeatureCounts(infile, outfile):
    '''Count the number of genes detected by featureCounts at counts > 0 in each sample'''

    table = P.toTable(infile)

    attach = '''attach "%(ANN_DATABASE)s" as anndb''' % globals()
    statement = '''select distinct h.*, gene_biotype from %(table)s h
                   inner join anndb.gene_info i
                   on h.gene_id=i.gene_id
               ''' % locals()

    melted_df = DB.fetch_DataFrame(statement, DATABASE, attach)

    grouped_df = melted_df.groupby(["gene_biotype", "track"])

    agg_df = grouped_df.agg({"counts": lambda x:
                             np.sum([1 for y in x if y > 0])})
    agg_df.reset_index(inplace=True)

    count_df = pd.pivot_table(agg_df, index="track",
                              values="counts", columns="gene_biotype")
    count_df["total"] = count_df.apply(np.sum, 1)
    count_df["sample_id"] = count_df.index

    count_df.to_csv(outfile, index=False, sep="\t")
Developer: snsansom | Project: scseq | Lines: 27 | Source: pipeline_scrnaseq.py

Example 3: qcSummary

# Required import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def qcSummary(infiles, outfile):
    '''create a summary table of relevant QC metrics'''

    # Some QC metrics are specific to paired end data
    if PAIRED:
        exclude = []
        paired_columns = '''READ_PAIRS_EXAMINED as no_pairs,
                              PERCENT_DUPLICATION as pct_duplication,
                              ESTIMATED_LIBRARY_SIZE as library_size,
                              PCT_READS_ALIGNED_IN_PAIRS
                                       as pct_reads_aligned_in_pairs,
                              MEDIAN_INSERT_SIZE
                                       as median_insert_size,
                           '''
        pcat = "PAIR"

    else:
        exclude = ["qc_library_complexity", "qc_insert_size_metrics"]
        paired_columns = ''
        pcat = "UNPAIRED"

    tables = [P.toTable(x) for x in infiles
              if P.toTable(x) not in exclude]

    t1 = tables[0]

    name_fields = PARAMS["name_field_titles"].strip()

    stat_start = '''select distinct %(name_fields)s,
                                    sample_information.sample_id,
                                    fraction_spliced,
                                    fraction_spike,
                                    qc_no_genes_cufflinks.protein_coding
                                       as cufflinks_no_genes_pc,
                                    qc_no_genes_cufflinks.total
                                       as cufflinks_no_genes,
                                    qc_no_genes_featurecounts.protein_coding
                                       as featurecounts_no_genes_pc,
                                    qc_no_genes_featurecounts.total
                                       as featurecounts_no_genes,
                                    three_prime_bias
                                       as three_prime_bias,
                                    nreads_uniq_map_genome,
                                    nreads_uniq_map_spike,
                                    %(paired_columns)s
                                    PCT_MRNA_BASES
                                       as pct_mrna,
                                    PCT_CODING_BASES
                                       as pct_coding,
                                    PCT_PF_READS_ALIGNED
                                       as pct_reads_aligned,
                                    TOTAL_READS
                                       as total_reads,
                                    PCT_ADAPTER
                                       as pct_adapter,
                                    PF_HQ_ALIGNED_READS*1.0/PF_READS
                                       as pct_pf_reads_aligned_hq
                   from %(t1)s
                ''' % locals()

    join_stat = ""
    for table in tables[1:]:
        join_stat += "left join " + table + "\n"
        join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

    where_stat = '''where qc_alignment_summary_metrics.CATEGORY="%(pcat)s"
                 ''' % locals()

    statement = "\n".join([stat_start, join_stat, where_stat])

    df = DB.fetch_DataFrame(statement, PARAMS["database_name"])
    df.to_csv(outfile, sep="\t", index=False)
Developer: snsansom | Project: scseq | Lines: 74 | Source: pipeline_scrnaseq.py
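
The distinctive step in qcSummary is assembling the SQL dynamically: every table after the first is chained on with a LEFT JOIN over the shared sample_id key. A standalone sketch of just that construction (the table names here are hypothetical):

# Hypothetical QC tables sharing a sample_id column.
tables = ["qc_alignment_summary_metrics", "qc_rna_metrics", "qc_duplication"]
t1 = tables[0]

# Chain each remaining table onto the first with a LEFT JOIN, so samples
# missing from one metrics table still appear in the summary.
join_stat = ""
for table in tables[1:]:
    join_stat += "left join " + table + "\n"
    join_stat += "on " + t1 + ".sample_id=" + table + ".sample_id\n"

statement = "select *\nfrom " + t1 + "\n" + join_stat
print(statement)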

Example 4: generatePeakSets

# Required import: from CGAT import Database [as alias]
# Or: from CGAT.Database import fetch_DataFrame [as alias]
def generatePeakSets(infile, outfiles):
    '''Write conservative and optimal IDR peak sets for an experiment'''
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY Experiment")
    df = Database.fetch_DataFrame(statement)
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement)

    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # take the maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure"
                         " %s; don't know which column"
                         " to sort on" % PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
Developer: gjaime | Project: CGATPipelines | Lines: 66 | Source: pipeline_idr.py
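
For reference, the sort-and-truncate step done above with a shell pipeline can be sketched equivalently in pandas (the file names, peak count, and config value here are all hypothetical):

import pandas as pd

ranking_measure = "signal.value"   # hypothetical config value

# Map the ranking measure to its 0-based narrowPeak column:
# columns 7, 8 and 9 hold signal.value, p.value and q.value.
column = {"signal.value": 6, "p.value": 7, "q.value": 8}[ranking_measure]

peaks = pd.read_csv("peaks.narrowPeak.gz", sep="\t", header=None)
top_n = peaks.sort_values(column, ascending=False).head(10000)
top_n.to_csv("conservative_peaks.narrowPeak.gz", sep="\t", header=False,
             index=False, compression="gzip")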


Note: the CGAT.Database.fetch_DataFrame examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors; consult each project's license before redistributing or using the code. Do not reproduce without permission.