This page collects representative usage examples of the Python function toolshed.reader. If you are unsure what reader does, how to call it, or want working examples, the curated snippets below may help. Fifteen code examples of the reader function are shown, sorted by popularity by default; upvoting the examples you find useful helps the system recommend better Python code samples.
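Before diving into the examples, a quick orientation to the reader API as these snippets use it. The sketch below is illustrative only: the file names and column names are made up, and the exact signature may vary between toolshed versions. reader accepts a path (plain, gzipped, or remote) or a command string prefixed with "|" whose stdout is parsed; header=True (the default) yields a dict per row keyed by the file's header line, header=[...] supplies column names for a headerless file, and header=False yields each row as a plain list of tokens.

from toolshed import reader

# header=True (the default): each row is a dict keyed by the file's header line.
# "variants.tsv.gz", "sites.bed", and their columns are hypothetical inputs.
for row in reader("variants.tsv.gz", header=True):
    print(row['chrom'] + ":" + row['start'])

# header=[...]: supply column names for a headerless file such as a BED
for row in reader("sites.bed", header=['chrom', 'start', 'end', 'name']):
    print(row['name'])

# header=False: each row is a plain list of tokens
for toks in reader("sites.bed", header=False):
    print("\t".join(toks[:3]))

# a leading "|" runs the command and parses its stdout
for row in reader("|bedtools merge -i sites.bed", header=['chrom', 'start', 'end']):
    print(row['chrom'])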
Example 1: main
def main(count_files, metadata):
    pools = defaultdict(list)
    for toks in reader(metadata):
        for k, v in toks.iteritems():
            if k.startswith("Pool") and v == "TRUE":
                # get the samples
                pool_name = k.split("_")[-1]
                pools[pool_name].append(toks['alias'])
    for pool, samples in pools.iteritems():
        print >>sys.stderr, ">> processing", pool
        for strand in ["pos", "neg"]:
            files = [f for f in count_files if os.path.basename(f).split(".")[0] in samples and strand in os.path.basename(f)]
            # simplest way to join files into a dataframe
            raw_count_data = {}
            for file_path in files:
                sample = get_sample_name(file_path)
                raw_count_data[sample] = {}
                for toks in reader(file_path, header=['gene', 'site', 'count']):
                    raw_count_data[sample]["{gene}:{site}".format(gene=toks['gene'], site=toks['site'])] = int(toks['count'])
            # dataframe from dict of dicts
            count_data = pd.DataFrame(raw_count_data)
            # will need to split into multiindex here to match new count fmt
            count_data.index = pd.MultiIndex.from_tuples([x.split(":") for x in count_data.index], names=['gene', 'site'])
            # normalize the counts
            count_data = norm_deseq(count_data)
            # round the normalized counts up to int
            # don't want to throw out single counts at any site
            count_data = count_data.apply(np.ceil)
            # sum the rows
            count_data[pool] = count_data.sum(axis=1)
            # print results
            out_file = gzip.open("{pool}.{strand}.txt.gz".format(pool=pool, strand=strand), "wb")
            count_data[pool].astype('int').to_csv(out_file, sep="\t")
            out_file.close()
Example 2: main
def main(args):
    # create a dataframe with all miRNA counts from all samples
    allcounts = {}
    for f in args.counts:
        fname = op.basename(f).split(args.fullext)[0]
        casecounts = {}
        for line in reader(f, header="chrom start stop name score strand count".split()):
            casecounts[line['name']] = int(line['count'])
        allcounts[fname] = casecounts
    countsdf = pd.DataFrame(allcounts)
    # create a set of unique miRNAs from all the miRNA lists
    uniquemirnas = []
    for f in args.mirnalist:
        for line in reader(f, header=['name']):
            uniquemirnas.append(line['name'])
    uniquemirnas = set(uniquemirnas)
    # log the counts
    # countsdf = np.log(countsdf + 1)
    manojset = "MP1 MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    manojset = "MP2 MP9 MP20 MP21 MP24 MP34 MP35 MP36 MP38 MP42.ACTG MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA MP45.ACTG MP45.TCGA".split()
    # manojset = "MP2 MP9 MP20 MP21 MP34 MP35 MP36 MP43.ACTG MP43.TCGA MP44.ACTG MP44.TCGA".split()
    peterset1 = "PK11 PK21 PK24 PK31 PK41 PK42 PK51 PK52 PK54".split()
    peterset2 = "PK11 PK12 PK21 PK22 PK31 PK32 PK41 PK51 PK52 PK53".split()
    # print matrix
    countsdf.ix[uniquemirnas, manojset].to_csv(args.out, sep=",", header=True)
    countsdf.ix[uniquemirnas, peterset1].to_csv("peter1_top50.csv", sep=",", header=True)
    countsdf.ix[uniquemirnas, peterset2].to_csv("peter2_top50.csv", sep=",", header=True)
Example 3: multi_intersect
def multi_intersect(files, cutoff):
    """files = {sample_name:file_path}"""
    sitestmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    snames = [op.basename(f).split(".")[0].split("_")[0] for f in files]
    cmd = ("|bedtools multiinter -cluster -header "
           "-names {names} -i {files}").format(names=" ".join(snames),
                                               files=" ".join(files))
    # apply cutoff, name peaks
    for i, l in enumerate(reader(cmd, header=True)):
        if int(l['num']) < cutoff: continue
        print >>sitestmp, "\t".join([l['chrom'], l['start'], l['end'],
                                     "peak_{i}".format(i=i)])
    sitestmp.close()
    # annotate the merged sites by intersecting with all of the files
    classtmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    annotated_peaks = sitestmp.name
    # pull out peak classes from input files
    for f in files:
        annotated_peaks = map_peak_class(f, annotated_peaks)
    for peak in reader(annotated_peaks, header=AnnotatedPeak):
        if peak.name is None: continue
        print >>classtmp, "{chrom}\t{start}\t{stop}\t{name}\n".format(
            chrom=peak.chrom, start=peak.start,
            stop=peak.stop, name=peak.name)
    classtmp.close()
    return classtmp.name
Example 4: local_shuffle
def local_shuffle(bed, loc='500000'):
    """
    Randomize the location of each interval in `bed` by moving its
    start location to within `loc` bp of its current location or to
    its containing interval in `loc`.

    Arguments:
        bed - input bed file
        loc - shuffle intervals to within this distance (+ or -).
              If not an integer, then this should be a BED file containing
              regions such that each interval in `bed` is shuffled within
              its containing interval in `loc`
    """
    from random import randint
    if str(loc).isdigit():
        dist = abs(int(loc))
        with nopen(bed) as fh:
            for toks in (l.rstrip('\r\n').split('\t') for l in fh):
                d = randint(-dist, dist)
                toks[1:3] = [str(max(0, int(bloc) + d)) for bloc in toks[1:3]]
                print "\t".join(toks)
    else:
        # we are using dist as the windows within which to shuffle
        assert os.path.exists(loc)
        bed4 = mktemp()
        with open(bed4, 'w') as fh:
            # this step is so we don't have to track the number of columns in A
            for toks in reader(bed, header=False):
                fh.write("%s\t%s\n" % ("\t".join(toks[:3]), SEP.join(toks)))

        missing = 0
        # we first find the b-interval that contains each a-interval by
        # using bedtools intersect
        for toks in reader("|bedtools intersect -wao -a {bed4} -b {loc}"
                           .format(**locals()), header=False):
            ajoin = toks[:4]
            a = ajoin[3].split(SEP)  # extract the full interval
            b = toks[4:]
            if int(b[-1]) == 0:
                missing += 1
                continue
            assert a[0] == b[0], ('chroms dont match', a, b)

            alen = int(a[2]) - int(a[1])
            # doesn't care if the new interval is completely contained in b
            astart = randint(int(b[1]), int(b[2]))
            # subtract half the time.
            aend = (astart - alen) if randint(0, 1) == 0 and astart > alen \
                else (astart + alen)
            a[1], a[2] = map(str, (astart, aend) if astart < aend
                             else (aend, astart))
            print "\t".join(a)

        if missing > 0:
            print >> sys.stderr, ("found {missing} intervals in {bed} that "
                                  " were not contained in {loc}"
                                  .format(**locals()))
Example 5: search
def search(args):
    """Given fasta, gff, and bam, parses for sequence, annotates feature, and
    reports coverage.

    Args: bedgraph, fasta, gff, seq, feature, verbose.
    """
    match_seq = args.seq.upper()
    # write a temp bed of sequence match sites
    site_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    with nopen(args.fasta) as fasta:
        for chrom, seq in read_fasta(fasta):
            if args.verbose: sys.stderr.write(">> processing %s...\n" % chrom)
            # for each sequence match
            for i, m in enumerate([s.start() for s in re.finditer(match_seq, seq)]):
                start = m
                stop = start + 2
                name = "%s_%s_%d" % (chrom, match_seq, i)
                fields = [chrom, start, stop, name]
                site_temp.write("\t".join(map(str, fields)) + "\n")
    site_temp.close()
    # convert gff to bed with gene name as bed name field
    gff_temp = open(tempfile.mktemp(suffix=".bed"), 'wb')
    result_header = "chrom source feature start stop score strand frame attributes comments".split()
    # for filtering unique and storing start and stop for each gene
    genes = {}
    if args.verbose: sys.stderr.write(">> selecting %s from gff records...\n" % args.feature)
    for g in reader(args.gff, header=result_header):
        try:
            if not g['feature'] == args.feature: continue
            # regex gene name out
            gene_name = re.findall(r'Name=([\w\.]+)', g['attributes'])[0]
            # skip already seen
            if genes.has_key(gene_name): continue
            genes[gene_name] = {'start': int(g['start']), 'stop': int(g['stop']), 'strand': g['strand']}
            fields = [g['chrom'], g['start'], g['stop'], gene_name]
            gff_temp.write("\t".join(map(str, fields)) + "\n")
        except KeyError:
            if not g['chrom'].startswith("#"):
                sys.stderr.write("ERROR parsing gff!\n")
                sys.exit(1)
    gff_temp.close()
    # sort the gene bed, map and collapse genes onto site_temp, then add counts
    if args.verbose: sys.stderr.write(">> finding relative gene location per sequence match...\n")
    result_header = "chrom start stop name gene_name counts".split()
    cmd = "|sortBed -i %s | mapBed -a %s -b - -c 4 -o collapse | mapBed -a - -b %s -c 4 -o sum" \
            % (gff_temp.name, site_temp.name, args.bedgraph)
    for b in reader(cmd, header=result_header):
        # sequence position(s) relative to gene(s) it overlaps
        locs = get_locs(int(b['start']), b['gene_name'], genes)
        fields = [b['chrom'], b['start'], b['stop'], b['name'], b['gene_name'], b['counts'], locs]
        print "\t".join(map(str, fields))
Example 6: uniprot
def uniprot(args):
    """Add Uniprot annotation to gene list. Args: genes, uniprotdb, column"""
    uniprot_db = {}
    uniprot_header = header(args.uniprotdb)
    for entry in reader(args.uniprotdb):
        for gene in entry['Gene names'].split():
            uniprot_db[gene] = entry
    for entry in reader(args.genes, header=False):
        uniprot_fields = []
        for gene in entry[int(args.column) - 1].split(","):
            uniprot = uniprot_db.get(gene)
            if uniprot:
                for h in uniprot_header:
                    uniprot_fields.append(uniprot[h])
        print "\t".join(entry) + "\t".join(map(str, uniprot_fields))
Example 7: read_pvalues
def read_pvalues(bedfilename, log_pvalues, verbose):
    ''' read in p-values from a bed file score field.

    returns: list sorted by significance (most significant first)'''
    pvals = []
    if verbose:
        print >>sys.stderr, ">> reading p-values from %s .." % bedfilename
    for d in reader(bedfilename, header=['chrom', 'start', 'end', 'name', 'score', 'strand']):
        if log_pvalues:
            pval = float(d['score'])
        else:
            # score holds a raw p-value; convert it to -log10 space
            pval = -1 * log10(float(d['score']))
        pvals.append(pval)
    if verbose:
        print >>sys.stderr, ">> read %d p-values" % len(pvals)
    # sort the pvalues from most to least signif (smallest to largest) and
    # reverse so largest are first
    pvals.sort()
    # if pvals are log transformed, biggest (i.e. most significant) are
    # first
    if log_pvalues: pvals.reverse()
    return pvals
Example 8: main
def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name score strand abundance".split()):
            try:
                if int(l['abundance']) == 0: continue
                # gm[<parsed file name>][<miRNA name>] = abundance value
                gm[op.basename(f).split(".mirna_abundance", 1)[0]][l['name']] = l['abundance']
            except KeyError:
                # header failed to set l['abundance']
                pass
    # the sample names
    caselist = sorted(gm.keys())
    # only save lines where at least one sample has a positive value
    completeset = []
    for i, case in enumerate(caselist):
        keys = sorted(gm[caselist[i]].keys())
        for k in keys:
            completeset.append(k)
    mirnas = set(completeset)
    # print the matrix
    print "\t".join(k for k in caselist)
    for mirna in mirnas:
        fields = [mirna]
        for c in caselist:
            try:
                fields.append(gm[c][mirna])
            except KeyError:
                # miRNA not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))
Example 9: main
def main(table):
    d = {}
    for toks in reader(table, header=True, sep=" "):
        row_gene = toks['Genes']
        d[row_gene] = {}
        for col_gene in toks.keys():
            # row 1, col 1 is a generic header entry
            if col_gene == "Genes": continue
            d[row_gene][col_gene] = int(toks[col_gene])
    # print node size attributes
    node_out = open("node_attrs.txt", "wb")
    print >>node_out, "source\ttotal_mutations"
    for k in d.keys():
        print >>node_out, "{gene}\t{count}".format(gene=k, count=d[k][k])
    node_out.close()
    # print network and edge attributes
    interaction_type = "pp"
    network_out = open("network.txt", "wb")
    print >>network_out, "source\tinteraction_type\ttarget\tcomutation_count"
    seen = set()
    for row_gene in d.keys():
        for col_gene, count in d[row_gene].iteritems():
            if count == 0: continue
            # double checking these were filtered out
            if row_gene == col_gene: continue
            # check to see if the interaction was already added in the opposite direction
            if "{gene2}_{gene1}".format(gene2=col_gene, gene1=row_gene) in seen: continue
            print >>network_out, "{gene1}\t{interaction}\t{gene2}\t{count}".format(gene1=row_gene, interaction=interaction_type, gene2=col_gene, count=count)
            seen.add("{gene1}_{gene2}".format(gene1=row_gene, gene2=col_gene))
    network_out.close()
Example 10: write_region_bed
def write_region_bed(feature_iter, true_regions, out_fh):
    """
    Write a region bed file suitable for use in :func:`~evaluate`,
    given true regions (likely from an external program, otherwise use
    :func:`~write_modeled_regions`).

    Parameters
    ----------
    feature_iter : iterable of Features

    true_regions : file
        BED file containing true regions

    out_fh : filehandle
        where to write the data
    """
    fmt = "{chrom}\t{start}\t{end}\t{truth}\t{size}\n"
    out_fh.write(ts.fmt2header(fmt))

    regions = defaultdict(InterLap)
    for i, toks in enumerate(ts.reader(true_regions, header=False)):
        # see if it's a header.
        if i == 0 and not (toks[1] + toks[2]).isdigit(): continue
        chrom, start, end = toks[0], int(toks[1]), int(toks[2])
        regions[chrom].add((start, end))

    for f in feature_iter:
        truth = 'true' if (f.position, f.position) in regions[f.chrom] else 'false'
        out_fh.write(fmt.format(chrom=f.chrom, start=f.position - 1,
                                end=f.position, truth=truth, size=1))
    out_fh.flush()
Example 11: main
def main():
    p = argparse.ArgumentParser(__doc__)
    p.add_argument("-g", dest="group", help="group by the first column (usually"
                   " chromosome or probe) if this [optional]", default=False,
                   action="store_true")
    p.add_argument("--skip", dest="skip", help="Maximum number of intervening "
                   "basepairs to skip before seeing a value. If this number is "
                   "exceeded, the region is ended chromosome or probe "
                   "[default: %default]", type=int, default=50000)
    p.add_argument("--min-region-size", dest="min-region", help="minimum "
                   "length of the region. regions shorter than this are not printed "
                   "[default: %default] (no minimum)", type=int, default=0)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large in order to seed a region. [default: %default]",
                   type=float, default=5.0)
    p.add_argument("--keep-cols", dest="keep", help="comma separated list of "
                   "columns to add to the output data", default="")
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value "
                   "of at least this number can extend a region [default: "
                   "%default]", type=float, default=3.0)
    p.add_argument("regions")
    args = p.parse_args()

    f = reader(args.regions, header=False, sep="\t")
    keep = [int(k) for k in args.keep.strip().split(",") if k]
    report_cutoff = args.seed
    for key, region in gen_regions(f, args.skip, args.seed, args.threshold,
                                   args.group, keep, report_cutoff):
        print key + "\t" + "\t".join(map(str, region))
Example 12: main
def main(bam, output):
    sample = path.basename(bam).rsplit(".bam", 1)[0]
    plot_file = output if output else bam.rsplit(".bam", 1)[0] + "_lorenz_curve.png"
    coverages = []
    print("Calculating coverages", file=sys.stderr)
    for toks in reader("|bedtools genomecov -5 -d -ibam %s" % bam, header=['name', 'start', 'coverage']):
        coverages.append(int(toks['coverage']))
    coverages_r = IntVector(coverages)
    print("Generating Lorenz curve", file=sys.stderr)
    # Gini coefficient
    G = ineq.Gini(coverages_r)
    l = "G = %.3f" % G[0]
    grdevices.png(plot_file, width=1200, height=800)
    # draw the plot
    plot(ineq.Lc(coverages_r), xlab="Genome Fraction",
         ylab="Coverage Fraction", bty="n", lwd=1, main="Lorenz Curve of %s" % sample,
         col="black", xaxs="r", yaxs="r")
    # add the Gini coefficient to the plot
    legend('topleft', legend=l, bty='n', cex=1.3)
    grdevices.dev_off()
    print("Gini Coefficient = %f" % G[0])
Example 13: merge_beds
def merge_beds(excl_list, genome, prefix="ex"):
    if not os.path.exists(genome):
        fgen = mktemp()
        genome = Shuffler.genome(genome, fgen)

    if len(excl_list) == 1:
        excl = excl_list[0]
    else:
        excl = mktemp()
        _run("|cut -f 1-3 %s | sort -k1,1 -k2,2n | bedtools merge -i - > %s" \
             % (" ".join(excl_list), excl))

    bases = []
    for i, f in enumerate((genome, excl)):
        n_bases = 0
        for toks in reader(f, header=False):
            try:
                if i == 0:
                    n_bases += int(toks[1])
                else:
                    n_bases += (int(toks[2]) - int(toks[1]))
            except ValueError:
                pass
        bases.append(n_bases)
    #print >>sys.stderr, "# %scluding %5g out of %5g total bases (%.3g%%) in the genome" % \
    #        (prefix, bases[1], bases[0], 100. * bases[1] / float(bases[0]))
    return excl
Example 14: filter
def filter(p_bed, region_bed, max_p=None, p_col_name="P.Value"):
    ph = ['p' + h for h in get_header(p_bed)]
    rh = get_header(region_bed)
    if isinstance(p_col_name, (int, long)):
        p_col_name = ph[p_col_name][1:]

    a = dict(p_bed=p_bed, region_bed=region_bed)
    a['p_bed'] = fix_header(a['p_bed'])

    yield rh + ["t-pos", "t-neg", "t-sum", "n_gt_p05", "n_gt_p1"]
    for group, plist in groupby(reader('|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % a,
                                       header=rh + ph),
                                itemgetter('chrom', 'start', 'end')):
        plist = list(plist)
        plist = [x for x in plist if (int(x['start']) <= int(x['pstart']) <= int(x['pend']))
                                 and (int(x['start']) <= int(x['pend']) <= int(x['end']))]
        tscores = [float(row['pt']) for row in plist if 'pt' in row]

        if max_p:
            if any(float(row['p' + p_col_name]) > max_p for row in plist):
                continue

        ngt05 = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.05)
        ngt1 = sum(1 for row in plist if float(row['p' + p_col_name]) > 0.1)
        tpos = sum(1 for ts in tscores if ts > 0)
        tneg = sum(1 for ts in tscores if ts < 0)
        tsum = sum(ts for ts in tscores)

        frow = [plist[0][h] for h in rh] + [str(tpos), str(tneg), str(tsum), str(ngt05), str(ngt1)]
        yield frow
Example 15: main
def main(args):
    gm = defaultdict(dict)
    for f in args.files:
        for l in reader(f, header="chrom start stop name counts nonzero blength nonzerofracofb".split()):
            try:
                # gm[<parsed file name>][<peak name>] = count value
                fullname = "%s:%s:%s:%s" % (l["name"], l["chrom"], l["start"], l["stop"])
                gm[f.split(".", 1)[0]][fullname] = l["counts"]
            except KeyError:
                # header failed to set l['val']
                pass
    # print the matrix
    caselist = sorted(gm.keys())
    # this step is unnecessary as they all have counts for the same peaks
    completeset = []
    for i, case in enumerate(caselist):
        keys = sorted(gm[caselist[i]].keys())
        for k in keys:
            completeset.append(k)
    peaks = set(completeset)
    print "#peak_name\t" + "\t".join(k for k in caselist)
    for peak in peaks:
        fields = [peak]
        for c in caselist:
            try:
                fields.append(gm[c][peak])
            except KeyError:
                # peak not present in this case
                fields.append("0.0")
        print "\t".join(map(str, fields))