

Python toolshed.reader Function Code Examples

This article collects typical usage examples of the toolshed.reader function in Python. If you have been wondering how the reader function is used, how it works, or what it looks like in real code, the hand-picked examples below may help.


A total of 15 code examples of the reader function are shown below, sorted by popularity by default.
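
Before diving into the examples, here is a minimal sketch of the calling patterns that recur below. It assumes Python 2 (as in the examples) and hypothetical input files (data.tsv.gz, sites.bed); reader comes from brentp's toolshed package and transparently handles plain, gzipped, and piped ("|command") inputs.

from toolshed import reader

# header=True (the default): the first line supplies column names and each row is a dict
for toks in reader("data.tsv.gz", sep="\t"):
    print toks['gene'], toks['count']

# header=<list of names>: supply the column names yourself for headerless files such as BED
for toks in reader("sites.bed", header=['chrom', 'start', 'end', 'name']):
    print toks['chrom'], int(toks['start'])

# header=False: each row is a plain list of strings
for toks in reader("sites.bed", header=False):
    print "\t".join(toks[:3])

# a leading "|" runs the string as a shell command and parses its stdout
for toks in reader("|bedtools merge -i sites.bed", header=['chrom', 'start', 'end']):
    print toks['chrom']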

Example 1: main

def main(count_files, metadata):
    pools = defaultdict(list)
    for toks in reader(metadata):
        for k, v in toks.iteritems():
            if k.startswith("Pool") and v == "TRUE":
                # get the samples
                pool_name = k.split("_")[-1]
                pools[pool_name].append(toks['alias'])
    for pool, samples in pools.iteritems():
        print >>sys.stderr, ">> processing", pool
        for strand in ["pos", "neg"]:
            files = [f for f in count_files if os.path.basename(f).split(".")[0] in samples and strand in os.path.basename(f)]
            # simplest way to join files into a dataframe
            raw_count_data = {}
            for file_path in files:
                sample = get_sample_name(file_path)
                raw_count_data[sample] = {}
                for toks in reader(file_path, header=['gene', 'site', 'count']):
                    raw_count_data[sample]["{gene}:{site}".format(gene=toks['gene'], site=toks['site'])] = int(toks['count'])
            # dataframe from dict of dicts
            count_data = pd.DataFrame(raw_count_data)
            # will need to split into multiindex here to match new count fmt
            count_data.index = pd.MultiIndex.from_tuples([x.split(":") for x in count_data.index], names=['gene','site'])
            # normalize the counts
            count_data = norm_deseq(count_data)
            # round the normalized counts up to int
            # don't want to throw out single counts at any site
            count_data = count_data.apply(np.ceil)
            # sum the rows
            count_data[pool] = count_data.sum(axis=1)
            # print results
            out_file = gzip.open("{pool}.{strand}.txt.gz".format(pool=pool, strand=strand), "wb")
            count_data[pool].astype('int').to_csv(out_file, sep="\t")
            out_file.close()
Author: brwnj, Project: cu_projects, Lines: 34, Source: get_pool_counts.py

Example 2: main

def main(args):
    # create a dataframe with all miRNA counts from all samples
    allcounts = {}
    for f in args.counts:
        fname = op.basename(f).split(args.fullext)[0]
        casecounts = {}
        for line in reader(f, header="chrom start stop name score strand count".split()):
            casecounts[line['name']] = int(line['count'])
        allcounts[fname] = casecounts
    countsdf = pd.DataFrame(allcounts)

    # create a set of unique miRNAs from all the miRNA lists
    uniquemirnas = []
    for f in args.mirnalist:
        for line in reader(f, header=['name']):
            uniquemirnas.append(line['name'])
    uniquemirnas = set(uniquemirnas)
    
    # log the counts
    # countsdf = np.log(countsdf + 1)
    
    manojset = "MP1 MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    manojset = "MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    # manojset = "MP2 MP9 MP20 MP21 MP34 MP35 MP36 MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA".split()
    peterset1 = "PK11 PK21 PK24 PK31 PK41 PK42 PK51 PK52 PK54".split()
    peterset2 = "PK11 PK12 PK21 PK22 PK31 PK32 PK41 PK51 PK52 PK53".split()
    
    # print matrix
    countsdf.ix[uniquemirnas,manojset].to_csv(args.out, sep=",", header=True)
    countsdf.ix[uniquemirnas,peterset1].to_csv("peter1_top50.csv", sep=",", header=True)
    countsdf.ix[uniquemirnas,peterset2].to_csv("peter2_top50.csv", sep=",", header=True)
Author: brwnj, Project: cu_projects, Lines: 31, Source: mirna_heatmap_data.py

Example 3: multi_intersect

def multi_intersect(files, cutoff):
    """files = {sample_name:file_path}"""
    sitestmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    snames = [op.basename(f).split(".")[0].split("_")[0] for f in files]
    cmd = ("|bedtools multiinter -cluster -header "
                "-names {names} -i {files}").format(names=" ".join(snames),
                                                    files=" ".join(files))
    # apply cutoff, name peaks
    for i, l in enumerate(reader(cmd, header=True)):
        if int(l['num']) < cutoff: continue
        print >>sitestmp, "\t".join([l['chrom'], l['start'], l['end'],
                                        "peak_{i}".format(i=i)])
    sitestmp.close()
    # annotate the merged sites by intersecting with all of the files
    classtmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    annotated_peaks = sitestmp.name
    # pull out peak classes from input files
    for f in files:
        annotated_peaks = map_peak_class(f, annotated_peaks)
    for peak in reader(annotated_peaks, header=AnnotatedPeak):
        if peak.name is None: continue
        print >>classtmp, "{chrom}\t{start}\t{stop}\t{name}\n".format(
                                chrom=peak.chrom, start=peak.start,
                                stop=peak.stop, name=peak.name)
    classtmp.close()
    return classtmp.name
Author: jayhesselberth, Project: polya, Lines: 26, Source: merge_sites.py

Example 4: local_shuffle

def local_shuffle(bed, loc='500000'):
    """
    Randomize the location of each interval in `bed` by moving its
    start location to within `loc` bp of its current location or to
    its containing interval in `loc`.

    Arguments:
        bed - input bed file
        loc - shuffle intervals to within this distance (+ or -).
               If not an integer, then this should be a BED file containing
               regions such that each interval in `bed` is shuffled within
               its containing interval in `loc`
    """
    from random import randint
    if str(loc).isdigit():
        dist = abs(int(loc))
        with nopen(bed) as fh:
            for toks in (l.rstrip('\r\n').split('\t') for l in fh):
                d = randint(-dist, dist)
                toks[1:3] = [str(max(0, int(bloc) + d)) for bloc in toks[1:3]]
                print "\t".join(toks)
    else:
        # we are using dist as the windows within which to shuffle
        assert os.path.exists(loc)
        bed4 = mktemp()
        with open(bed4, 'w') as fh:
            # this step is so we don't have to track the number of columns in A
            for toks in reader(bed, header=False):
                fh.write("%s\t%s\n" % ("\t".join(toks[:3]), SEP.join(toks)))

        missing = 0
        # we first find the b-interval that contains each a-interval by
        # using bedtools intersect
        for toks in reader("|bedtools intersect -wao -a {bed4} -b {loc}"
                           .format(**locals()), header=False):
            ajoin = toks[:4]
            a = ajoin[3].split(SEP)  # extract the full interval
            b = toks[4:]

            if int(b[-1]) == 0:
                missing += 1
                continue
            assert a[0] == b[0], ('chroms dont match', a, b)

            alen = int(a[2]) - int(a[1])
            # doesn't care if the new interval is completely contained in b
            astart = randint(int(b[1]), int(b[2]))

            # subtract half the time.
            aend = (astart - alen) if randint(0, 1) == 0 and astart > alen \
                else (astart + alen)

            a[1], a[2] = map(str, (astart, aend) if astart < aend
                             else (aend, astart))

            print "\t".join(a)
        if missing > 0:
            print >> sys.stderr, ("found {missing} intervals in {bed} that "
                                  " were not contained in {loc}"
                                  .format(**locals()))
Author: astatham, Project: poverlap, Lines: 60, Source: poverlap.py

Example 5: search

def search(args):
    """Given fasta, gff, and bam, parses for sequence, annotates feature, and
    reports coverage.
    Args: bedgraph, fasta, gff, seq, feature, verbose.
    """
    match_seq = args.seq.upper()

    # write a temp bed of sequence match sites
    site_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    with nopen(args.fasta) as fasta:
        for chrom, seq in read_fasta(fasta):            
            if args.verbose: sys.stderr.write(">> processing %s...\n" % chrom)
            # for each sequence match
            for i, m in enumerate([s.start() for s in re.finditer(match_seq, seq)]):
                start = m
                stop = start + 2
                name = "%s_%s_%d" % (chrom, match_seq, i)
                fields = [chrom, start, stop, name]
                site_temp.write("\t".join(map(str, fields)) + "\n")
    site_temp.close()

    # convert gff to bed with gene name as bed name field
    gff_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    result_header = "chrom source feature start stop score strand frame attributes comments".split()
    # for filtering unique and storing start and stop for each gene
    genes = {}
    if args.verbose: sys.stderr.write(">> selecting %s from gff records...\n" % args.feature)
    for g in reader(args.gff, header=result_header):
        try:
            if not g['feature'] == args.feature: continue
            # regex gene name out
            gene_name = re.findall(r'Name=([\w\.]+)', g['attributes'])[0]
            # skip already seen
            if genes.has_key(gene_name): continue
            genes[gene_name] = {'start':int(g['start']), 'stop':int(g['stop']), 'strand':g['strand']}
            fields = [g['chrom'], g['start'], g['stop'], gene_name]
            gff_temp.write("\t".join(map(str, fields)) + "\n")
        except KeyError:
            if not g['chrom'].startswith("#"):
                sys.stderr.write("ERROR parsing gff!\n")
                sys.exit(1)
    gff_temp.close()

    # sort the gene bed, map and collapse genes onto site_temp, then add counts
    if args.verbose: sys.stderr.write(">> finding relative gene location per sequence match...\n")
    result_header = "chrom start stop name gene_name counts".split()
    cmd = "|sortBed -i %s | mapBed -a %s -b - -c 4 -o collapse | mapBed -a - -b %s -c 4 -o sum"\
            % (gff_temp.name, site_temp.name, args.bedgraph)
    for b in reader(cmd, header=result_header):
        # sequence position(s) relative to gene(s) it overlaps
        locs = get_locs(int(b['start']), b['gene_name'], genes)
        fields = [b['chrom'], b['start'], b['stop'], b['name'], b['gene_name'], b['counts'], locs]
        print "\t".join(map(str, fields))
Author: brwnj, Project: seqanno, Lines: 53, Source: seqanno_functions.py

Example 6: uniprot

def uniprot(args):
    """Add Uniprot annotation to gene list. Args: genes, uniprotdb, column"""
    uniprot_db = {}
    uniprot_header = header(args.uniprotdb)
    for entry in reader(args.uniprotdb):
        for gene in entry['Gene names'].split():
            uniprot_db[gene] = entry
    for entry in reader(args.genes, header=False):
        uniprot_fields = []
        for gene in entry[int(args.column) - 1].split(","):
            uniprot = uniprot_db.get(gene)
            if uniprot:
                for h in uniprot_header:
                    uniprot_fields.append(uniprot[h])
        print "\t".join(entry) + "\t".join(map(str, uniprot_fields))
Author: brwnj, Project: seqanno, Lines: 15, Source: seqanno_functions.py

Example 7: read_pvalues

def read_pvalues(bedfilename, log_pvalues, verbose):
    ''' read in p-values from a bed file score field.

    returns: list sorted by significance (most significant first)'''
    pvals = []

    if verbose:
        print >>sys.stderr, ">> reading p-values from %s .." % bedfilename

    for d in reader(bedfilename, header=['chrom','start','end','name','score','strand']):
        if log_pvalues:
            pval = float(d['score'])
        else:
            pval = -1 * log10(float(d['score']))
        pvals.append(pval)

    if verbose:
        print >>sys.stderr, ">> read %d p-values" % len(pvals)

    # sort the pvalues from most to least signif (smallest to largest) and
    # reverse so largest are first
    pvals.sort()

    # if pvals are log transformed, biggest (i.e. most significant) are
    # first
    if log_pvalues: pvals.reverse()

    return pvals
Author: brwnj, Project: peaktools, Lines: 28, Source: qvalues.py

Example 8: main

def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name score strand abundance".split()):
            try:
                if int(l['abundance']) == 0: continue
                # gm[<parsed file name>][<miRNA name>] = abundance value
                gm[op.basename(f).split(".mirna_abundance", 1)[0]][l['name']] = l['abundance']
            except KeyError:
                # header failed to set l['abundance']
                pass
    # the sample names
    caselist = sorted(gm.keys())
    # only save lines where at least one sample has a positive value
    completeset = []
    for i, case in enumerate(caselist):
        keys = sorted(gm[caselist[i]].keys())
        for k in keys:
            completeset.append(k)
    mirnas = set(completeset)
    
    # print the matrix
    print "\t".join(k for k in caselist)
    for mirna in mirnas:
        fields = [mirna]
        for c in caselist:
            try:
                fields.append(gm[c][mirna])
            except KeyError:
                # miRNA not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
Author: brwnj, Project: cu_projects, Lines: 32, Source: data2matrix.py

Example 9: main

def main(table):
    d = {}
    for toks in reader(table, header=True, sep=" "):
        row_gene = toks['Genes']
        d[row_gene] = {}
        for col_gene in toks.keys():
            # row 1, col 1 is a generic header entry
            if col_gene == "Genes": continue
            d[row_gene][col_gene] = int(toks[col_gene])

    # print node size attributes
    node_out = open("node_attrs.txt", "wb")
    print >>node_out, "source\ttotal_mutations"
    for k in d.keys():
        print >>node_out, "{gene}\t{count}".format(gene=k, count=d[k][k])
    node_out.close()

    # print network and edge attributes
    interaction_type = "pp"
    network_out = open("network.txt", "wb")
    print >>network_out, "source\tinteraction_type\ttarget\tcomutation_count"
    seen = set()
    for row_gene in d.keys():
        for col_gene, count in d[row_gene].iteritems():
            if count == 0: continue
            # double checking these were filtered out
            if row_gene == col_gene: continue
            # check to see if the interaction was already added in the opposite direction
            if "{gene2}_{gene1}".format(gene2=col_gene, gene1=row_gene) in seen: continue
            print >>network_out, "{gene1}\t{interaction}\t{gene2}\t{count}".format(gene1=row_gene, interaction=interaction_type, gene2=col_gene, count=count)
            seen.add("{gene1}_{gene2}".format(gene1=row_gene, gene2=col_gene))
    network_out.close()
Author: brwnj, Project: cu_projects, Lines: 32, Source: data2cytoscape.py

Example 10: write_region_bed

def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Write a region bed file suitable for use in :func:`~evaluate`.
    given true regions (likely from an external program, otherwise use
    :func:`~write_modeled_regions`).

    Parameters
    ----------

    feature_iter : iterable of Features

    true_regions : file
        BED file containing true regions

    out_fh : filehandle
        where to write the data
    """
    fmt = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(fmt))

    regions = defaultdict(InterLap)

    for i, toks in enumerate(ts.reader(true_regions, header=False)):
        # see if it's a header.
        if i == 0 and not (toks[1] + toks[2]).isdigit(): continue
        chrom, start, end = toks[0], int(toks[1]), int(toks[2])
        regions[chrom].add((start, end))

    for f in feature_iter:
        truth = 'true' if (f.position, f.position) in regions[f.chrom] else 'false'
        out_fh.write(fmt.format(chrom=f.chrom, start=f.position - 1,
                    end=f.position, truth=truth, size=1))
    out_fh.flush()
Author: LeonardJ09, Project: crystal, Lines: 33, Source: evaluate.py

Example 11: main

def main():
    p = argparse.ArgumentParser(__doc__)

    p.add_argument("-g", dest="group", help="group by the first column (usually"
                 " chromosome or probe) if this [optional]", default=False,
                 action="store_true")

    p.add_argument("--skip", dest="skip", help="Maximum number of intervening "
             "basepairs to skip before seeing a value. If this number is "
                 "exceeded, the region is ended chromosome or probe "
                 "[default: %default]", type=int, default=50000)
    p.add_argument("--min-region-size", dest="min-region", help="minimum "
            "length of the region. regions shorter than this are not printed"
                 "[default: %default] (no minimum)", type=int, default=0)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                 " large in order to seed a region. [default: %default]",
                 type=float, default=5.0)
    p.add_argument("--keep-cols", dest="keep", help="comma separated list of"
            "columns to add to the output data", default="")

    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                 "of at least this number can extend a region [default: "
                 "%default]", type=float, default=3.0)
    p.add_argument("regions")

    args = p.parse_args()

    f = reader(args.regions, header=False, sep="\t")
    keep = [int(k) for k in args.keep.strip().split(",") if k]
    report_cutoff = args.seed
    for key, region in gen_regions(f, args.skip, args.seed, args.threshold,
            args.group, keep, report_cutoff):
        print key + "\t" + "\t".join(map(str, region))
Author: AlexBrandt, Project: bio-playground, Lines: 33, Source: find-peaks.py

Example 12: main

def main(bam, output):
    sample = path.basename(bam).rsplit(".bam", 1)[0]
    plot_file = output if output else bam.rsplit(".bam", 1)[0] + "_lorenz_curve.png"

    coverages = []
    print("Calculating coverages", file=sys.stderr)
    for toks in reader("|bedtools genomecov -5 -d -ibam %s" % bam, header=['name', 'start', 'coverage']):
        coverages.append(int(toks['coverage']))

    coverages_r = IntVector(coverages)

    print("Generating Lorenz curve", file=sys.stderr)

    # Gini coefficient
    G = ineq.Gini(coverages_r)
    l = "G = %.3f" % G[0]

    grdevices.png(plot_file, width=1200, height=800)
    # draw the plot
    plot(ineq.Lc(coverages_r), xlab="Genome Fraction",
        ylab="Coverage Fraction", bty="n", lwd=1, main="Lorenz Curve of %s" % sample,
        col="black", xaxs="r", yaxs="r")

    # add the Gini coefficient to the plot
    legend('topleft', legend=l, bty='n', cex=1.3)
    grdevices.dev_off()

    print("Gini Coefficient = %f" % G[0])
Author: hjanime, Project: scgc_scripts, Lines: 28, Source: lorenz_curve.py

Example 13: merge_beds

def merge_beds(excl_list, genome, prefix="ex"):
    if not os.path.exists(genome):
        fgen = mktemp()
        genome = Shuffler.genome(genome, fgen)

    if len(excl_list) == 1:
        excl = excl_list[0]
    else:
        excl = mktemp()
        _run("|cut -f 1-3 %s | sort -k1,1 -k2,2n | bedtools merge -i - > %s" \
                % (" ".join(excl_list), excl))

    bases = []
    for i, f in enumerate((genome, excl)):
        n_bases = 0
        for toks in reader(f, header=False):
            try:
                if i == 0:
                    n_bases += int(toks[1])
                else:
                    n_bases += (int(toks[2]) - int(toks[1]))
            except ValueError:
                pass
        bases.append(n_bases)

    #print >>sys.stderr, "# %scluding %5g out of %5g total bases (%.3g%%) in the genome" % \
    #        (prefix, bases[1] , bases[0], 100. * bases[1] / float(bases[0]))
    return excl
Author: brentp, Project: shuffler, Lines: 28, Source: shuffler.py

Example 14: filter

def filter(p_bed, region_bed, max_p=None, p_col_name="P.Value"):
    ph = ['p' + h for h in get_header(p_bed)]
    rh = get_header(region_bed)
    if isinstance(p_col_name, (int, long)):
        p_col_name = ph[p_col_name][1:]

    a = dict(p_bed=p_bed, region_bed=region_bed)
    a['p_bed'] = fix_header(a['p_bed'])

    yield rh + ["t-pos", "t-neg", "t-sum", "n_gt_p05", "n_gt_p1"]
    for group, plist in groupby(reader('|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % a,
            header=rh + ph), itemgetter('chrom','start','end')):
        plist = list(plist)
        plist = [x for x in plist if (int(x['start']) <= int(x['pstart']) <= int(x['pend'])) and ((int(x['start']) <= int(x['pend']) <= int(x['end'])))]
        tscores = [float(row['pt']) for row in plist if 'pt' in row]

        if max_p:
            if any(float(row['p' + p_col_name]) > max_p for row in plist):
                continue

        ngt05  = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.05)
        ngt1  = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.1)
        tpos = sum(1 for ts in tscores if ts > 0)
        tneg = sum(1 for ts in tscores if ts < 0)
        tsum = sum(ts for ts in tscores)
        frow = [plist[0][h] for h in rh] + [str(tpos), str(tneg), str(tsum), str(ngt05), str(ngt1)]
        yield frow
Author: dgaston, Project: combined-pvalues, Lines: 27, Source: filter.py

Example 15: main

def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name counts nonzero blength nonzerofracofb".split()):
            try:
                # gm[<parsed file name>][<peak name>] = count value
                fullname = "%s:%s:%s:%s" % (l["name"], l["chrom"], l["start"], l["stop"])
                gm[f.split(".", 1)[0]][fullname] = l["counts"]
            except KeyError:
                # header failed to set l['val']
                pass

    # print the matrix
    caselist = sorted(gm.keys())
    # this step is unnecessary as they all have counts for the same peaks
    completeset = []
    for i, case in enumerate(caselist):
        keys = sorted(gm[caselist[i]].keys())
        for k in keys:
            completeset.append(k)

    peaks = set(completeset)
    print "#peak_name\t" + "\t".join(k for k in caselist)
    for peak in peaks:
        fields = [peak]
        for c in caselist:
            try:
                fields.append(gm[c][peak])
            except KeyError:
                # miRNA not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
Author: brwnj, Project: cu_projects, Lines: 32, Source: data2matrix.py


Note: The toolshed.reader examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by their respective developers. Copyright of the source code remains with the original authors; refer to each project's license before redistributing or reusing it. Do not repost without permission.