This article collects typical usage examples of the Fasta.iterkeys method from the Python pyfasta package. If you have been wondering what Python's Fasta.iterkeys is for, or how Fasta.iterkeys is used in practice, the hand-picked code examples below may help. You can also explore further usage examples of the containing class, pyfasta.Fasta.
Seven code examples of Fasta.iterkeys are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
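Before the individual examples, here is a minimal usage sketch (not taken from any of the projects below) showing the typical iterkeys pattern; the file name "test.fasta" is a placeholder, and the snippet assumes Python 2, which all of the examples on this page target.
# Minimal sketch: iterate over record headers without loading whole sequences.
from pyfasta import Fasta

f = Fasta("test.fasta")            # builds or reuses the flattened index
for seqid in f.iterkeys():         # lazily yields each FASTA header
    print seqid, len(f[seqid])     # header and sequence length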
Example 1: split
# Required module import: from pyfasta import Fasta [as alias]
# Or alternatively: from pyfasta.Fasta import iterkeys [as alias]
def split(args):
parser = optparse.OptionParser("""\
split a fasta file into separated files.
pyfasta split -n 6 [-k 5000 ] some.fasta
the output will be some.0.fasta, some.1.fasta ... some.6.fasta
the sizes will be as even as reasonable.
""")
parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
help="""this overrides all other options. if specified, it will
split the file into a separate file for each header. it
will be a template specifying the file name for each new file.
e.g.: "%(fasta)s.%(seqid)s.fasta"
where 'fasta' is the basename of the input fasta file and seqid
is the header of each entry in the fasta file.""" ,default=None)
parser.add_option("-n", "--n", type="int", dest="nsplits",
help="number of new files to create")
parser.add_option("-o", "--overlap", type="int", dest="overlap",
help="overlap in basepairs", default=0)
parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
help="""\
split big files into pieces of this size in basepairs. the default
of -1 means do not split the sequence up into k-mers, just
split based on the headers. a reasonable value would be 10Kbp""")
options, fasta = parser.parse_args(args)
if not (fasta and (options.nsplits or options.header)):
sys.exit(parser.print_help())
if isinstance(fasta, (tuple, list)):
assert len(fasta) == 1, fasta
fasta = fasta[0]
kmer = options.kmers if options.kmers != -1 else None
overlap = options.overlap if options.overlap != 0 else None
f = Fasta(fasta)
if options.header:
names = dict([(seqid, options.header % \
dict(fasta=f.fasta_name, seqid=seqid)) \
for seqid in f.iterkeys()])
"""
if len(names) > 0:
assert names[0][1] != names[1][1], ("problem with header format", options.header)
fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
"""
return with_header_names(f, names)
else:
names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
header=options.header)
#fhs = [open(n, 'wb') for n in names]
if options.kmers == -1:
return without_kmers(f, names)
else:
return with_kmers(f, names, options.kmers, options.overlap)
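As a rough usage sketch (the input file name is a placeholder), the split() function above is normally driven by command-line style arguments, mirroring the pyfasta split sub-command:
# Hypothetical invocations of the split() helper shown above.
split(["-n", "6", "some.fasta"])                                 # evenly sized pieces
split(["--header", "%(fasta)s.%(seqid)s.fasta", "some.fasta"])   # one file per record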
Example 2: write_c2t
# Required module import: from pyfasta import Fasta [as alias]
# Or alternatively: from pyfasta.Fasta import iterkeys [as alias]
def write_c2t(fasta_name, unconverted, colorspace=False):
"""
given a fasta file, write a new file:
`some.fr.c2t.fasta` which contains:
+ the same headers prefixed with 'f' with all C's converted to T
+ headers prefixed with 'r' reverse complemented with
all C's converted to T.
if unconverted is true, then also save a file with the forward and reverse
sequences without conversion.
"""
d = op.join(op.dirname(fasta_name), "bowtie_index")
if colorspace: d += "_colorspace"
if not op.exists(d): os.mkdir(d)
p, ext = op.splitext(op.basename(fasta_name)) # some.fasta -> some, fasta
fname = "%s/%s.fr.c2t%s" % (d, p, ext)
# no conversion, just copy the file into the index dir.
unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
if op.exists(fname):
if not unconverted: return fname, unconverted_fname
elif op.exists(unconverted_fname): return fname, unconverted_fname
fasta = Fasta(fasta_name)
c2t_fh = open(fname, 'w')
unc_fh = open(unconverted_fname, 'w') if unconverted else None
print >>sys.stderr, "writing forward and reverse c2t to: %s" % (fname,)
try:
for header in fasta.iterkeys():
seq = str(fasta[header]).upper()
assert not ">" in seq
# c2t, prefix header with f and write
print >>c2t_fh, ">f%s" % header
print >>c2t_fh, seq.replace('C', 'T')
# then r-c, c2t, prefix header with r and write
print >>c2t_fh, ">r%s" % header
rseq = revcomp(seq)
print >>c2t_fh, rseq.replace('C', 'T')
if unc_fh is not None:
print >>unc_fh, ">f%s\n%s" % (header, seq)
print >>unc_fh, ">r%s\n%s" % (header, rseq)
c2t_fh.close()
except:
os.unlink(fname)
os.unlink(unconverted_fname)
raise
return fname, unconverted_fname
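A hedged sketch of calling write_c2t(); the path is a placeholder, and the bowtie_index directory is created next to the input file:
# Hypothetical call: writes bowtie_index/some.fr.c2t.fasta (and the
# unconverted copy, since unconverted=True) and returns both paths.
c2t_path, plain_path = write_c2t("some.fasta", unconverted=True)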
Example 3: mask
# Required module import: from pyfasta import Fasta [as alias]
# Or alternatively: from pyfasta.Fasta import iterkeys [as alias]
def mask(fasta_file, org, cutoff, mask_value='X'):
h5, node = get_node(org, 'r')
outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
+ fasta_file[fasta_file.rfind("."):]
print "> masking sequence to file:", outfile
out = open(outfile ,'w')
fasta = Fasta(fasta_file)
soft_mask = mask_value.lower() == 'soft'
for seqid in sorted(fasta.iterkeys()):
masked = 0
if soft_mask:
seq = str(fasta[seqid])
# mask is the lowercase sequence.
mask_value = np.array(seq.lower(), dtype='c')
seq = np.array(seq.upper(), dtype='c')
else:
fasta[seqid].tostring = False
seq = fasta[seqid][:] # a
if not 'c' + seqid in node:
print >>sys.stderr, seqid,\
'! not found in masked, writing unchanged\n' \
' this means that no section of this sequence appeared\n' \
' more than %i times' % cutoff
out.write('>' + seqid + '\n')
out.write(seq.tostring() + '\n')
continue
hit_counts = getattr(node, 'c' + seqid)[:]
masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
, mask_value, seq).tostring()
l = len(masked_seq)
print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l,
100.0 * masked_seq.count(mask_value) / l)
assert len(seq) == l
out.write('>' + seqid + '\n')
out.write(masked_seq + '\n')
out.close()
# write out a file .fasta.version containing
# the svnversion (if available of this script
# that was used to create the file.
path = os.path.dirname(__file__)
os.system('svnversion %s > %s.version' % (path, outfile))
h5.close()
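A hedged sketch of invoking mask(); the organism name and FASTA path are placeholders, and the function expects the per-organism HDF5 count node to exist already:
# Hypothetical call: replace every base seen more than 50 times with 'X',
# writing genome.masked.50.fasta; mask_value='soft' lower-cases instead.
mask("genome.fasta", "some_org", cutoff=50, mask_value="X")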
Example 4: write_file
# Required module import: from pyfasta import Fasta [as alias]
# Or alternatively: from pyfasta.Fasta import iterkeys [as alias]
chh_idxs, = np.where(~(first_g | second_g))
chh = c_idxs[chh_idxs]
methyl[chh] = CHH + adder
return methyl
def write_file(seq, file_path):
methyl = calc_methylation(str(seq))
methyl.tofile(file_path)
if __name__ == "__main__":
import optparse
import sys
import doctest
import os
doctest.testmod()
p = optparse.OptionParser(__doc__)
opts, args = p.parse_args()
if not (len(args) and args[0]): sys.exit(p.print_help())
from pyfasta import Fasta
path = args[0]
f = Fasta(path)
for k in f.iterkeys():
print "calculating methylation data for seqid: %s" % k
binpath = "%s.%s.methyl.type.bin" % (path, k)
write_file(f[k], binpath)
print "saved to %s" % binpath
示例5: Fasta
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import iterkeys [as 別名]
subject_fasta_file = sys.argv[2]
out_fasta = "%s.features%s" % op.splitext(subject_fasta_file)
by_subject = collections.defaultdict(list)
fa = Fasta(subject_fasta_file)
def pairwise(iterable):
"s -> (s0,s1), (s1,s2), (s2, s3), ..."
a, b = tee(iterable)
next(b, None)
return izip(a, b)
by_subject = {}
seqids = []
for seqid in fa.iterkeys():
by_subject[seqid] = np.zeros((len(fa[seqid]) + 1,), dtype=np.uint8)
seqids.append((len(fa[seqid]),seqid))
by_query_subject = collections.defaultdict(dict)
for line in open(blast_file):
args = line.split("\t")
query, subject = args[0], args[1]
sstart, sstop = sorted(map(int, args[8:10]))
by_subject[subject][sstart: sstop + 1] |= 1
if not subject in by_query_subject[query]:
by_query_subject[query][subject] = [(sstart, sstop)]
else:
by_query_subject[query][subject].append((sstart, sstop))
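Once the per-subject coverage arrays are filled in, a hedged follow-up sketch like the one below could report how much of each subject sequence is hit at least once (this is not part of the original script):
# Hypothetical summary: fraction of each subject covered by >= 1 BLAST hit.
for seqid, covered in by_subject.items():
    print seqid, covered.mean()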
Example 6: parse_gsnap_sam
# Required module import: from pyfasta import Fasta [as alias]
# Or alternatively: from pyfasta.Fasta import iterkeys [as alias]
def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end):
fa = Fasta(ref_path)
fc, ft, fmethyltype = \
bin_paths_from_fasta(fa.fasta_name, out_dir)
counts = get_counts(fc, ft, fa)
chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())
print >>sys.stderr, "tabulating methylation"
gsnap_subset = open(gsnap_f.replace(".gsnap.sam", ".sam"), "w")
for sline in open(gsnap_f):
if sline.startswith("@SQ"):
print >>gsnap_subset, sline.strip()
continue
# the ends didn't map to same spot.
line = sline.split("\t")
sam_flag = int(line[1])
if paired_end:
if line[6] != "=": continue
#print >>gsnap_subset, sline.strip()
else:
# no reported alignments.
if sam_flag == 4: continue
print >>gsnap_subset, sline.rstrip("\n")
seqid = line[2]
aln_seq = line[9]
read_length = len(aln_seq)
bp0 = int(line[3]) - 1
ga = ((sam_flag & 16) != 0) ^ (sam_flag & 128 != 0)
insert_length = int(line[8])
#line[9] = aln_seq
#line[10] = line[10][:len(aln_seq)]
# both ends start at exactly the same place.
if insert_length == 0: continue
# handle overlapping reads. one side has + insert, the other is -
if -read_length < insert_length < 0:
insert_length = abs(insert_length)
aln_seq = aln_seq[:-(read_length - insert_length)]
read_length = len(aln_seq)
if line[7] == '0': continue
bp1 = bp0 + read_length
ref_seq = (fa[seqid][bp0:bp1]).upper()
letters = 'GA' if ga else 'CT'
read_length = len(ref_seq)
assert read_length > 0, (bp0, bp1)
_update_conversions(ref_seq, aln_seq, bp0, letters,
counts[seqid]['c'], counts[seqid]['t'],
50, read_length, line[5])
write_files(fa.fasta_name, out_dir, counts)
cmd = open(out_dir +"/cmd.ran", "w")
import datetime
print >>cmd, "#date:", str(datetime.date.today())
print >>cmd, "#path:", op.abspath(".")
print >>cmd, " ".join(sys.argv)
write_sam_commands(out_dir, fa, "methylcoder.gsnap")
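A hedged sketch of calling parse_gsnap_sam(); all paths are placeholders and the output directory is assumed to exist already so the command log can be written:
# Hypothetical call for a paired-end GSNAP alignment.
parse_gsnap_sam("reads.gsnap.sam", "genome.fasta", "out", paired_end=True)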
Example 7: count_conversions
# Required module import: from pyfasta import Fasta [as alias]
# Or alternatively: from pyfasta.Fasta import iterkeys [as alias]
def count_conversions(original_fasta, sam_iter, raw_reads_list, c2t_reads_list, index_class, out_dir,
allowed_mismatches, mode='w', counts=None, is_colorspace=False):
print >>sys.stderr, "tabulating methylation for %s" % (original_fasta,)
fh_raw_reads_list = [open(raw_reads) for raw_reads in raw_reads_list]
fh_c2t_reads_list = [open(c2t_reads) for c2t_reads in c2t_reads_list]
fqidx_list = [index_class(c2t_reads) for c2t_reads in c2t_reads_list]
fa = Fasta(original_fasta)
#fh_raw_reads = open(raw_reads, 'r')
#fh_c2t_reads = open(c2t_reads, 'r')
def get_records(header, idx):
return get_raw_and_c2t(header, fqidx_list[idx], fh_raw_reads_list[idx], fh_c2t_reads_list[idx])
chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())
out_sam = open(out_dir + "/methylcoded.sam", mode)
# get the 3 possible binary files for each chr
fc, ft, fmethyltype = \
bin_paths_from_fasta(original_fasta, out_dir)
# if we're re-running without conversion, it was passed via kwargs.
# otherwise, we get a new set.
retry = counts is not None
unmapped_name = op.join(out_dir, "unmapped.reads")
if retry:
unmapped_name += ".try2"
else:
counts = get_counts(fc, ft, fa)
skipped = 0
align_count = 0
for p, sam_line, read_len, direction in parse_sam(sam_iter, chr_lengths,
get_records,
unmapped_name,
is_colorspace, out_sam):
pairs = "CT" if direction == "f" else "GA" #
read_id = p['read_id']
pos0 = p['pos0']
align_count += 1
raw = p['raw_read']
sam_flag = int(sam_line[1])
# paired and first of read and ends have overlap.
# this section makes sure that the overlap is only counted 1x.
if sam_line[7] != '0' and 0 <= int(sam_line[8]) < read_len:
offset = int(sam_line[8])
pos0 = pos0 + read_len - offset
read_len = offset
raw = raw[-offset:]
# chop the read info to the non-overlapping bases.
p['read_sequence'] = p['read_sequence'][-offset:]
# the position is the line num * the read_id + read_id (where the +
# is to account for the newlines.
genomic_ref = str(fa[p['seqid']][pos0:pos0 + read_len]).upper()
DEBUG = False
if DEBUG:
print >>sys.stderr, "=" * 80
print >>sys.stderr, sam_line
print >>sys.stderr, "=" * 80
print >>sys.stderr, ">>", sam_line[1], sam_flag
idx = 0 if (sam_flag & 128) == 0 else 1
araw, ac2t = get_records(read_id, idx)
#araw.seq = araw.seq[-offset:]
#c2t.seq = ac2t.seq[-offset:]
print >>sys.stderr, "f['%s'][%i:%i + %i]" % (p['seqid'], pos0,
pos0, read_len)
print >>sys.stderr, "mismatches:", p['nmiss']
print >>sys.stderr, "ref :", genomic_ref
if direction == 'r':
print >>sys.stderr, "raw_read(r):", raw
c2t = ac2t.seq
if not is_colorspace:
if (sam_flag & 128) == 0:
c2t = revcomp(c2t)[-offset:]
assert c2t == p['read_sequence'], (c2t, p['read_sequence'])
else:
print >>sys.stderr, "raw_read(f):", raw
c2t = ac2t.seq
if not is_colorspace:
if (sam_flag & 128) != 0:
c2t = revcomp(c2t)
p['read_sequence'] = revcomp(p['read_sequence'])
assert c2t == p['read_sequence'], (c2t, p['read_sequence'], sam_flag)
print >>sys.stderr, "c2t :", c2t, "\n"
# have to keep the ref in forward here to get the correct bp
# positions. look for CT when forward and GA when back.
current_mismatches = p['nmiss']
# we send in the current mismatches and allowed mismatches so that in
# the case where a C in the ref seq has becomes a T in the align seq
# (but wasnt calc'ed as a mismatch because we had converted C=>T. if
# these errors cause the number of mismatches to exceed the number of
# allowed mismatches, we dont include the stats for this read.
#......... part of the code omitted here .........