当前位置: 首页>>代码示例>>Python>>正文


Python Stats.doFDR方法代码示例

本文整理汇总了Python中CGAT.Stats.doFDR方法的典型用法代码示例。如果您正苦于以下问题:Python Stats.doFDR方法的具体用法?Python Stats.doFDR怎么用?Python Stats.doFDR使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在CGAT.Stats的用法示例。


在下文中一共展示了Stats.doFDR方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: checkFDR

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
    def checkFDR(self, pi0_method):

        result = Stats.doFDR(
            self.mPvalues, fdr_level=0.05, pi0_method=pi0_method)
        R("""require ('qvalue')""")
        qvalues = R.qvalue(ro.FloatVector(self.mPvalues),
                           fdr_level=0.05,
                           pi0_method=pi0_method)

        assert qvalues.names[1] == "pi0"
        assert qvalues.names[2] == "qvalues"
        assert qvalues.names[5] == "significant"
        assert qvalues.names[6] == "lambda"

        r_qvalues = qvalues[2]
        r_pi0 = qvalues[1][0]

        self.assertEqual(len(result.mQValues), len(qvalues[2]))
        self.assertEqual(len(result.mLambda), len(qvalues[6]))
        self.assertEqual(result.mPi0, r_pi0)
        for a, b in zip(result.mQValues, r_qvalues):
            self.assertAlmostEqual(a, b, 2, "unequal: %f != %f" % (a, b))

        for a, b in zip(result.mPassed, qvalues[5]):
            self.assertEqual(
                a, b, "threshold-passed flag not equal: %s != %s" % (a, b))
开发者ID:lesheng,项目名称:cgat,代码行数:28,代码来源:Stats_test.py

示例2: computeFDR

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def computeFDR(all_results,
               qvalue_method="storey"):
    '''compute FDR.

    update GOResult structure with field .fdr
    '''

    # flatten all_results
    results = []
    for key, data in all_results.iteritems():
        results.extend(data.mResults.values())

    observed_min_pvalues = [min(
        x.mProbabilityOverRepresentation,
        x.mProbabilityUnderRepresentation) for x in results]

    if qvalue_method == "storey":

        # compute fdr via Storey's method
        fdr_data = Stats.doFDR(observed_min_pvalues, vlambda=0.1)

        E.info("estimated proportion of true null hypotheses = %6.4f" %
               fdr_data.mPi0)

        if fdr_data.mPi0 < 0.1:
            E.warn(
                "estimated proportion of true null hypotheses is "
                "less than 10%%  (%6.4f)" % fdr_data.mPi0)

        for result, qvalue in zip(results, fdr_data.mQValues):
            result.fdr = qvalue

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"
        raise NotImplementedError("empirical needs some work")
开发者ID:Charlie-George,项目名称:cgat,代码行数:37,代码来源:genelist_analysis.py

示例3: doOldFDR

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def doOldFDR(options, args):
    """apply fdr to output of annotator."""

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    # apply filters and create diagnostic plots
    for filename, data in zip(args, annotators):
        ninput = len(data)
        pvalues = [x.mPValue for x in data]
        vlambda = numpy.arange(0, max(pvalues), 0.05)
        try:
            qvalues = Stats.doFDR(
                pvalues, vlambda=vlambda, fdr_level=options.fdr)
        except ValueError, msg:
            E.warn("%s: fdr could not be computed - no filtering: %s" %
                   (filename, msg))
            continue

        qvalues.plot(filename + "_diagnostics.png")

        data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
开发者ID:lesheng,项目名称:cgat,代码行数:28,代码来源:annotator2tsv.py

示例4: analysePolyphen

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps,
               COUNT(DISTINCT case t.prediction when 'possiblydamaging' then map.locus_id when 'probablydamaging' then map.locus_id else NULL end) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t,
                    %(tablename_map)s as map,
                    annotations.protein_stats as s,
                    annotations.transcript_info as i
        WHERE map.snp_id = t.snp_id AND
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length)
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s
                   WHERE s.protein_id = i.protein_id
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join(("gene_id", "code",
                          "length", "nsnps", "ndel",
                          "del_p", "del_pvalue", "del_qvalue",
                          "len_p", "len_pvalue", "len_qvalue",
                          "com_p", "com_pvalue", "com_qvalue", )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) = P (x
        # > X ).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_q = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues,
                ):
        gene_id, nsnps, ndel, length = a
#.........这里部分代码省略.........
开发者ID:CGATOxford,项目名称:CGATPipelines,代码行数:103,代码来源:pipeline_variant_annotation.py

示例5: loadGOs

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            "^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[
                        11] == "punder", "format error, expected pover-punder, got %s-%s" % (data[10], data[11])
                    continue
                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))

    tempf2.close()

    P.load(tempf2.name, outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)
开发者ID:CGATOxford,项目名称:CGATPipelines,代码行数:75,代码来源:PipelineGO.py

示例6: main

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]

#.........这里部分代码省略.........
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives) ) )

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-": s = Genomics.complement( s )
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write( "\t%s" % s )
                
            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile( "fdr" )
            outfile.write("bin\thist\tnobserved\n" )
            for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()


    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences
        pvalues = [ x.pvalue for x in matches ]
        fdr = Stats.doFDR( pvalues )
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold: continue
            results.append( m._replace( qvalue = qvalue ) )

        if options.combine:            
            results =  Nubiscan.combineMotifs( results )

        # output
        for r in results:
            options.stdout.write( "\t".join( ( 
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue ) ) + "\n" )

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run( seq.sequence,
                                  options.arrangements,
                                  qvalue_threshold = options.qvalue_threshold )
            
            if options.combine:
                result =  Nubiscan.combineMotifs( result )

            t = re.sub(" .*","",  seq.title)
            for r in result:
                options.stdout.write( "\t".join( ( 
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue ) ) + "\n" )

            noutput += 1
    
    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) )

    ## write footer and output benchmark information.
    E.Stop()
开发者ID:BioinformaticsArchive,项目名称:cgat,代码行数:104,代码来源:run_nubiscan.py

示例7: doFDR

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def doFDR(options, args):

    # read input
    annotators = []
    for filename in args:
        infile = open(filename, "r")
        annotators.append(readAnnotator(infile))
        infile.close()

    do_filter = options.fdr_qvalue is not None

    extra_headers = set()
    for data, fdr, synonyms, input_files in annotators:
        for key, value in input_files.iteritems():
            extra_headers.add(key)
    extra_headers = sorted(list(extra_headers))

    # note: id used to be file
    options.stdout.write("id\tover\tcategory\tpvalue\tfold\tobserved\texpected\tci95low\tci95high\tstddev\tfdr\tqvalue\t%s\n" %
                         "\t".join(extra_headers))

    # apply filters and create diagnostic plots
    for filename, vv in zip(args, annotators):
        data, fdr, synonyms, input_files = vv

        ninput = len(data)

        E.info("processing %s with %i data points" % (filename, ninput))
        no_fdr = False

        if options.fdr_method in ("annotator", "annotator-estimate"):
            pvalues = fdr.keys()
            pvalues.sort()
            pvalues.reverse()
            for pvalue in pvalues:
                try:
                    d = fdr[pvalue]["Significant"]
                except KeyError:
                    continue

                if d.mObserved == 0:
                    E.info("no data after fdr")
                    break

                elif d.mAverage / d.mObserved < options.fdr_qvalue:
                    E.info("filtering with P-value of %f" % pvalue)
                    if do_filter:
                        data = [x for x in data if x.mPValue < pvalue]
                    for d in data:
                        if d.mPValue < pvalue:
                            d.mFDR = 1
                            d.mQValue = options.fdr_qvalue
                    break
                else:
                    E.warn("fdr could not be computed - compute more samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

        if options.fdr_method == "estimate" or (options.fdr_method == "annotator-estimate" and no_fdr):

            E.info("estimating FDR from observed P-Values")
            pvalues = [x.mPValue for x in data]
            vlambda = numpy.arange(0, max(pvalues), 0.05)
            try:
                qvalues = Stats.doFDR(
                    pvalues, vlambda=vlambda, fdr_level=options.fdr_qvalue)
            except ValueError, msg:
                E.warn("fdr could not be computed - no output: %s" % msg)
                no_fdr = True
            else:
                for d, p, q in zip(data, qvalues.mPassed, qvalues.mQValues):
                    if p:
                        d.mFDR = 1
                    d.mQValue = q

                if do_filter:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]

        if do_filter and no_fdr:
            data = []

        nremoved = ninput - len(data)

        E.info("%s: %i data points left, %i removed" %
               (filename, len(data), nremoved))

        extra_values = []
        for key in extra_headers:
            if key in input_files:
                extra_values.append(input_files[key])
            else:
                extra_values.append("")

        extra_values = "\t".join(map(str, extra_values))

        for d in data:
            if d.mFoldChange < 1:
                code = "-"
            else:
                code = "+"
#.........这里部分代码省略.........
开发者ID:lesheng,项目名称:cgat,代码行数:103,代码来源:annotator2tsv.py

示例8: Collect

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]

#.........这里部分代码省略.........
                    annotator_fdr[annotator_level][dd[0]] = d
                continue
            else:
                if line[0] == "Z":
                    continue  # skip header
                if len(line[:-1].split('\t')) != 9:
                    continue  # HACK: accounts for a bug in Annotator output

                try:
                    (z, percentchange, pvalue, observed, expected, low95,
                     up95, stddev, description) = line[:-1].split('\t')[:9]
                except ValueError:
                    raise ValueError("# parsing error in line: %s" % line[:-1])

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(pvalue)
            d.mFoldChange = 1.0 + float(percentchange) / 100.0
            data.append(d)
    else:

        for line in lines:
            try:
                (code, goid, scount, stotal, spercent, bcount, btotal, bpercent, ratio,
                 pover, punder, goid, category, description) = line[:-1].split("\t")[:14]
            except ValueError:
                raise ValueError("# parsing error in line: %s" % line[:-1])

            if code == "+":
                p = pover
            else:
                p = punder

            d = DataPoint()
            d.mAnnotation = description
            d.mPValue = float(p)
            d.mFoldChange = float(spercent) / float(bpercent)
            data.append(d)

    # apply filters
    for c in delims:
        for d in data:
            d.mAnnotation = d.mAnnotation.split(c)[0]
    for c in ignore:
        for d in data:
            d.mAnnotation = d.mAnnotation.replace(c, '')

    ninput = len(data)
    no_fdr = False
    # apply filters

    if ninput > 0:
        if max_qvalue is not None:
            if use_annotator_fdr:
                pvalues = list(annotator_fdr.keys())
                pvalues.sort()
                pvalues.reverse()
                for pvalue in pvalues:
                    try:
                        d = annotator_fdr[pvalue]["Significant"]
                    except KeyError:
                        continue
                    if d.mObserved == 0:
                        E.info("no data remaining after fdr filtering")
                        data = []
                        break
                    elif d.mAverage / d.mObserved < max_qvalue:
                        E.info("filtering with P-value of %f" % pvalue)
                        data = [x for x in data if x.mPValue < pvalue]
                        break
                else:
                    E.warn("fdr could not be computed - compute more "
                           "samples (at P = %f, actual fdr=%f)" %
                           (pvalue, d.mAverage / d.mObserved))
                    no_fdr = True

            if no_fdr:
                if use_annotator_fdr:
                    E.info("estimating FDR from observed P-Values")

                pvalues = [x.mPValue for x in data]
                vlambda = numpy.arange(0, max(pvalues), 0.05)
                try:
                    qvalues = Stats.doFDR(
                        pvalues, vlambda=vlambda, fdr_level=max_qvalue)
                except ValueError as msg:
                    E.warn(
                        "fdr could not be computed - no filtering: %s" % msg)
                    no_fdr = True
                else:
                    data = [x[0] for x in zip(data, qvalues.mPassed) if x[1]]
        elif max_pvalue is not None:
            data = [x for x in data if x.mPValue < max_pvalue]

    if no_fdr:
        data = []

    nremoved = ninput - len(data)

    return data, nremoved, no_fdr
开发者ID:CGATOxford,项目名称:cgat,代码行数:104,代码来源:go2svg.py

示例9: pairwiseGOEnrichment

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]

#.........这里部分代码省略.........
            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the observed
                # counts
                if xx.mSampleCountsCategory < min_observed_counts and yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                observed = (xx.mSampleCountsCategory, yy.mSampleCountsCategory)

                aa, bb, cc, dd = \
                    (xx.mSampleCountsCategory,
                     yy.mSampleCountsCategory,
                     xx.mSampleCountsTotal - xx.mSampleCountsCategory,
                     yy.mSampleCountsTotal - yy.mSampleCountsCategory)

                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                fisher, pvalue = scipy.stats.fisher_exact(numpy.array(
                    ((aa, bb),
                     (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                results.append(PairResult._make((category,
                                                 labels[x],
                                                 labels[y],
                                                 xx.mSampleCountsCategory,
                                                 xx.mSampleCountsTotal,
                                                 xx.mPValue,
                                                 xx.mQValue,
                                                 yy.mSampleCountsCategory,
                                                 yy.mSampleCountsTotal,
                                                 yy.mPValue,
                                                 yy.mQValue,
                                                 pvalue,
                                                 1.0,
                                                 go2info[category].mDescription)))

            outfile.write("\t".join(map(str,
                                        (labels[x], labels[y],
                                         len(x_go_categories),
                                         len(y_go_categories),
                                         c.shared,
                                         c.skipped,
                                         c.tested,
                                         c.significant_pvalue,
                                         c.insignicant_pvalue))) + "\n")
    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                method = "bonf"
                fdr_data = Stats.FDRResult()
                l = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * l) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
开发者ID:CGATOxford,项目名称:cgat,代码行数:104,代码来源:GO.py

示例10: computeFDRs

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def computeFDRs(go_results,
                foreground,
                background,
                options,
                test_ontology,
                gene2go,
                go2info):

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [min(x[1].mProbabilityOverRepresentation,
                                x[1].mProbabilityUnderRepresentation) for x in pairs]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            l = float(len(observed_min_pvalues))
            fdr_data.mQValues = [min(1.0, x * l) for x in observed_min_pvalues]

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        #######################################################################
        # sampling
        # for each GO-category:
        # get maximum and minimum counts in x samples -> calculate minimum/maximum significance
        # get average and stdev counts in x samples -> calculate z-scores for
        # test set
        samples, simulation_min_pvalues = getSamples(gene2go,
                                                     foreground,
                                                     background,
                                                     options,
                                                     test_ontology,
                                                     go2info)

        # compute P-values from sampling
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k in samples:
                s = samples[k]
            else:
                raise KeyError("category %s not in samples" % k)

            # calculate values for z-score
            if s.mStddev > 0:
                zscore = abs(
                    float(v.mSampleCountsCategory) - s.mMean) / s.mStddev
            else:
                zscore = 0.0

            #############################################################
            # FDR:
            # For each p-Value p at node n:
            #   a = average number of nodes in each simulation run with P-Value < p
            #           this can be obtained from the array of all p-values and all nodes
            #           simply divided by the number of samples.
            #      aka: expfpos=experimental false positive rate
            #   b = number of nodes in observed data, that have a P-Value of less than p.
            #      aka: pos=positives in observed data
            #   fdr = a/b
            pvalue = v.mPValue

            # calculate values for FDR:
            # nfdr = number of entries with P-Value better than node.
            a = 0
            while a < len(simulation_min_pvalues) and \
                    simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)
            b = 0
            while b < len(observed_min_pvalues) and \
                    observed_min_pvalues[b] < pvalue:
                b += 1

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
#.........这里部分代码省略.........
开发者ID:CGATOxford,项目名称:cgat,代码行数:103,代码来源:GO.py

示例11: loadGOs

# 需要导入模块: from CGAT import Stats [as 别名]
# 或者: from CGAT.Stats import doFDR [as 别名]
def loadGOs( infiles, outfile, tablename ):
    '''import GO results into a single table.

    This method also computes a global QValue over all
    tracks, genesets and annotation sets.
    '''

    header = False

    tempf1 = P.getTempFile()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists( indir ):
            continue

        track, geneset, annotationset = re.search("^(\S+)_vs_(\S+)\.(\S+)", infile ).groups()

        for filename in glob.glob( os.path.join(indir, "*.overall") ):
            for line in open(filename, "r" ):
                if line.startswith("#"): continue
                data = line[:-1].split("\t")
                if line.startswith("code"):
                    if header: continue
                    tempf1.write( "track\tgeneset\tannotationset\t%s" % line )
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", "format error, expected pover-punder, got %s-%s" % (data[10], data[11])
                    continue
                tempf1.write( "%s\t%s\t%s\t%s" % (track, geneset, annotationset, line) )
                pvalues.append( min( float(data[10]), float(data[11]) ) )

    tempf1.close()

    E.info( "analysing %i pvalues" % len(pvalues ))
    fdr = Stats.doFDR( pvalues )
    E.info( "got %i qvalues" % len(fdr.mQValues ))
    qvalues = ["global_qvalue" ] + fdr.mQValues

    tempf2 = P.getTempFile()

    for line, qvalue in zip( open(tempf1.name,"r"), qvalues ):
        tempf2.write( "%s\t%s\n" % (line[:-1], str(qvalue)) )

    tempf2.close()
    tempfilename = tempf2.name
    print tempf1.name
    print tempf2.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty 
              --index=category 
              --index=track,geneset,annotationset
              --index=geneset
              --index=annotationset
              --index=goid 
              --table=%(tablename)s 
    < %(tempfilename)s
    > %(outfile)s
    '''
    P.run()
开发者ID:BioinformaticsArchive,项目名称:cgat,代码行数:66,代码来源:PipelineGO.py


注:本文中的CGAT.Stats.doFDR方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。