本文整理汇总了Python中jcvi.formats.fasta.Fasta.iteritems_ordered方法的典型用法代码示例。如果您正苦于以下问题:Python Fasta.iteritems_ordered方法的具体用法?Python Fasta.iteritems_ordered怎么用?Python Fasta.iteritems_ordered使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类jcvi.formats.fasta.Fasta
的用法示例。
在下文中一共展示了Fasta.iteritems_ordered方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def main(arg):
    """Translate what remains of the first record after removing later ones.

    Reads the FASTA file *arg*; the first sequence is expected to contain
    each of the remaining sequences as a substring. Every such substring is
    deleted from the first sequence, and the leftover is translated and
    printed without terminal stop symbols.
    """
    f = Fasta(arg)
    seqs = [str(rec.seq) for _, rec in f.iteritems_ordered()]
    remainder = seqs[0]
    for piece in seqs[1:]:
        remainder = remainder.replace(piece, "")
    # print() call (was the Python 2 print statement) for consistency with
    # the Python 3 style used elsewhere in this file.
    print(Seq(remainder).translate().strip("*"))
示例2: flip
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def flip(args):
    """
    %prog flip fastafile
    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    try:
        for name, rec in f.iteritems_ordered():
            # Write the single record to a scratch file so overlap() can
            # align it against the entry of the same name.
            tmpfasta = "a.fasta"
            fw = open(tmpfasta, "w")
            try:
                SeqIO.write([rec], fw, "fasta")
            finally:
                fw.close()

            o = overlap([tmpfasta, name])
            if o.orientation == '-':
                rec.seq = rec.seq.reverse_complement()

            SeqIO.write([rec], fo, "fasta")
            os.remove(tmpfasta)
    finally:
        # Bug fix: the output handle was never closed, risking buffered
        # output being lost on abnormal exit.
        fo.close()
示例3: digest
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI
    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    # Resolve the comma-separated enzyme names against Biopython's catalog.
    wanted = set(enzymes.split(","))
    chosen = [enz for enz in AllEnzymes if str(enz) in wanted]

    fasta = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    # One column per enzyme, after the contig name and its length.
    header = ["Contig", "Length"] + [str(enz) for enz in chosen]
    print("\t".join(header), file=fw)

    for seqid, record in fasta.iteritems_ordered():
        cells = [seqid, len(record)]
        for enz in chosen:
            hits = enz.search(record.seq)
            # "na" when the enzyme never cuts; otherwise pipe-joined positions.
            cells.append("|".join(str(h) for h in hits) if hits else "na")
        print("\t".join(str(c) for c in cells), file=fw)
示例4: get_GC3
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def get_GC3(cdsfile):
    """Return {record_name: GC3 ratio} for the CDS FASTA file *cdsfile*.

    GC3 is the fraction of G or C bases at third codon positions
    (every third base starting at index 2).
    """
    from jcvi.formats.fasta import Fasta
    f = Fasta(cdsfile, lazy=True)
    GC3 = {}
    for name, rec in f.iteritems_ordered():
        # Third codon positions only.
        positions = rec.seq[2::3].upper()
        if not positions:
            # Bug fix: sequences shorter than 3 bp previously raised
            # ZeroDivisionError; report 0.0 for them instead.
            GC3[name] = 0.0
            continue
        gc_counts = sum(1 for x in positions if x in "GC")
        gc_ratio = gc_counts * 1. / len(positions)
        GC3[name] = gc_ratio
    return GC3
示例5: fragment
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def fragment(args):
    """
    %prog fragment fastafile enzyme
    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:
    |- PstI
    ============|===========
    (-------)
    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.
    |-PstI |- PstI |- PstI
    ~~~====|=============|==========~~~~~~~===|============
    (---) (---)
    In this case, the second fragment is longer than 800bp, therefore the two
    ends are NOT extracted, as in the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option("--flank", default=150, type="int",
                 help="Extract flanking bases of the cut sites [default: %default]")
    p.add_option("--full", default=False, action="store_true",
                 help="The full extraction mode [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0

    # Choose the extraction strategy and the matching filename tag.
    if opts.full:
        extract, tag = extract_full, "full"
    else:
        extract, tag = extract_ends, "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + \
        ".{0}.flank{1}.{2}.fasta".format(enzyme, flank, tag)
    # Swap the enzyme name for the actual Biopython enzyme object.
    enzyme = next(x for x in AllEnzymes if str(x) == enzyme)

    fasta = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in fasta.iteritems_ordered():
        analysis = Analysis([enzyme], rec.seq)
        cut_sites = analysis.full()[enzyme]
        extract(rec, cut_sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
示例6: merge
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def merge(args):
    """
    %prog merge gffiles
    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)
    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())
    if nargs == 1:
        # Single argument: a file listing the gff files to merge.
        listfile, = args
        fp = open(listfile)
        gffiles = [x.strip() for x in fp]
    else:
        gffiles = args

    outfile = opts.outfile
    deflines = set()  # header/comment lines already emitted (dedup)
    fw = must_open(outfile, "w")
    fastarecs = {}  # FASTA records keyed by ID; first occurrence wins
    for gffile in gffiles:
        fp = open(gffile)
        for row in fp:
            row = row.rstrip()
            # Bug fix: `row[0] == '#'` raised IndexError on blank lines.
            if row.startswith('#'):
                if row == FastaTag:
                    # Stop at the embedded FASTA section; records are
                    # collected separately below.
                    break
                if row in deflines:
                    continue
                deflines.add(row)
            # print() with file= (was the Python 2 `print >> fw` statement).
            print(row, file=fw)
        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # Membership test directly on the dict (was `.keys()`).
            if key in fastarecs:
                continue
            fastarecs[key] = rec

    print(FastaTag, file=fw)
    SeqIO.write(fastarecs.values(), fw, "fasta")
示例7: count
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def count(args):
    """
    %prog count fastafile jf.db
    Run dump - jellyfish - bin - bincount in serial.
    """
    from bitarray import bitarray
    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    fastafile, jfdb = args
    K = get_K(jfdb)
    # Query the jellyfish database and keep only the count column.
    # NOTE(review): cmd is a shell pipeline; this assumes Popen here (a
    # project wrapper?) runs it with shell=True -- confirm.
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        # Stream every K-mer of this record into jellyfish's stdin
        # (Python 2 print statement).
        kmers = list(make_kmers(rec.seq, K))
        print >> proc.stdin, "\n".join(kmers)
    # Closing stdin signals EOF so the child can finish and flush to `t`.
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()
    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    fw = open(binfile, "w")
    # Rewind the temp file and serialize each per-kmer count as one bit.
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))
    logging.debug("Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".\
            format(K, fastafile, jfdb, binfile))
    # Feed the bit vector to bincount to produce the final counts file.
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
示例8: overlapbatch
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta
    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # Bug fix: the parser was built from overlap.__doc__, so --help showed
    # the wrong command's usage text.
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        # Write each contig to its own single-record FASTA, then delegate
        # to overlap() which expects one contig at a time.
        fastafile = k + ".fasta"
        fw = open(fastafile, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        overlap([fastafile, poolfasta])
示例9: dump
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def dump(args):
    """
    %prog dump fastafile
    Convert FASTA sequences to list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        # print() with file= (was the Python 2 `print >> fw` statement),
        # consistent with the Python 3 style used elsewhere in this file.
        print("\n".join(kmers), file=fw)
    fw.close()
示例10: shred
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def shred(args):
    """
    %prog shred fastafile
    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int", help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int", help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int", help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]",
    )
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        # print() with file= (was the Python 2 `print >> fw` statement).
        print(headerTemplate.format(libID=libID), file=fw)

    # Taken from runCA:
    #                 |*********|
    #                 |###################|
    # |--------------------------------------------------|
    #  ---------------1---------------
    #                 ---------------2---------------
    #                                 ---------------3---------------
    # *** - center_increments
    # ### - center_range_width
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        # Floor division (was Python 2 integer `/`): numreads must stay an
        # int for range() below.
        numreads = max(seqlen * depth // shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                # Tile with fixed-length windows overlapping by `shift`.
                for begin in range(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                # Spread read start points evenly; skip duplicate starts
                # caused by rounding.
                prev_begin = -1
                center_increments = center_range_width * 1.0 / (numreads - 1)
                for i in range(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
示例11: expand
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def expand(args):
    """
    %prog expand bes.fasta reads.fastq
    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast
    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Number of reads needed to cover the templates plus one read-length
    # of flank on each side at opts.depth coverage, rounded up to the
    # nearest 1000.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000
    # Attract reads: align the fastq reads against the BES templates with
    # bowtie, keeping only mapped reads.
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])
    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))
    # Derive a working prefix from the mapped-reads filename.
    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])
    # Perform mini-assembly of the attracted reads with newbler
    # (runAssembly); skip steps whose outputs are already up to date.
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])
    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)
    # Annotate contigs: BLAST the templates against the assembled contigs
    # and keep each contig's best hit.
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b
    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        if vid not in mapping:
            # Contig has no BLAST hit against the templates; drop it.
            continue
        b = mapping[vid]
        subject = b.subject
        # Orient the contig to match its template.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        recs.append((keys.index(subject), rid, rec))
    # Emit contigs in the same order as the templates appear in bes.fasta.
    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()
    # Clean up all intermediate files and the assembly directory.
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
            format(len(recs), annotatedfasta))
    return annotatedfasta
示例12: longest
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out
    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:
    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes
    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]
    idsfile = prefix + ".fl.ids"
    fw = open(idsfile, "w")
    sizes = Sizes(fastafile).mapping

    # Rewrite "asmbl" in record IDs to the user-chosen prefix.
    def name_convert(ident):
        return ident.replace("asmbl", opts.prefix)

    keep = set()  # IDs that survive the length cutoff
    fp = open(subclusters)
    nrecs = 0
    for line in fp:
        if not line.startswith("sub-cluster:"):
            continue
        members = line.split()[1:]
        champion = max(members, key=lambda m: sizes[m])
        champion_size = sizes[champion]
        # The longest member of each locus is labeled full-length.
        print(name_convert(champion), file=fw)
        nrecs += 1
        # Retain members at least half the champion's length and >= 200bp.
        cutoff = max(champion_size / 2, 200)
        keep.update(m for m in members if sizes[m] >= cutoff)
    fw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    fw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in f.iteritems_ordered():
        if name not in keep:
            continue
        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], fw, "fasta")
        nrecs += 1
    fw.close()
    logging.debug("{0} valid records written to `{1}`.".format(nrecs, newfastafile))
示例13: overlap
# 需要导入模块: from jcvi.formats.fasta import Fasta [as 别名]
# 或者: from jcvi.formats.fasta.Fasta import iteritems_ordered [as 别名]
def overlap(args):
"""
%prog overlap ctgfasta poolfasta
Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
Mix and combine using `minimus2`.
"""
p = OptionParser(overlap.__doc__)
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
ctgfasta, poolfasta = args
prefix = ctgfasta.split(".")[0]
rid = list(Fasta(ctgfasta).iterkeys())
assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"
rid = rid[0]
splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)
# Run BLAST
blastfile = ctgfasta + ".blast"
run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)
# Extract contigs and merge using minimus2
closuredir = prefix + ".closure"
closure = False
if need_update(blastfile, closuredir):
mkdir(closuredir, overwrite=True)
closure = True
if closure:
idsfile = op.join(closuredir, prefix + ".ids")
cmd = "cut -f2 {0} | sort -u".format(blastfile)
sh(cmd, outfile=idsfile)
idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
sh(cmd)
# This step is a hack to weight the bases from original sequences more
# than the pulled sequences, by literally adding another copy to be used
# in consensus calls.
redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
format([ctgfasta, redundantfastafile, "--prefix=RED."])
mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
sh(cmd, outfile=mergedfastafile)
afgfile = op.join(closuredir, prefix + ".afg")
cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
sh(cmd)
cwd = os.getcwd()
os.chdir(closuredir)
cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
cmd += " -D OVERLAP=100 -D MINID=98"
sh(cmd)
os.chdir(cwd)
# Analyze output, make sure that:
# + Get the singletons of the original set back
# + Drop any contig that is comprised entirely of pulled set
originalIDs = set(Fasta(ctgfasta).iterkeys())
minimuscontig = op.join(closuredir, prefix + ".contig")
c = ContigFile(minimuscontig)
excludecontigs = set()
for rec in c.iter_records():
reads = set(x.id for x in rec.reads)
if reads.isdisjoint(originalIDs):
excludecontigs.add(rec.id)
logging.debug("Exclude contigs: {0}".\
format(", ".join(sorted(excludecontigs))))
finalfasta = prefix + ".improved.fasta_"
fw = open(finalfasta, "w")
minimusfasta = op.join(closuredir, prefix + ".fasta")
f = Fasta(minimusfasta)
for id, rec in f.iteritems_ordered():
if id in excludecontigs:
continue
SeqIO.write([rec], fw, "fasta")
singletonfile = op.join(closuredir, prefix + ".singletons")
singletons = set(x.strip() for x in open(singletonfile))
leftovers = singletons & originalIDs
logging.debug("Pull leftover singletons: {0}".\
format(", ".join(sorted(leftovers))))
f = Fasta(ctgfasta)
for id, rec in f.iteritems_ordered():
if id not in leftovers:
continue
SeqIO.write([rec], fw, "fasta")
#.........这里部分代码省略.........