本文整理汇总了Python中jcvi.formats.fasta.SeqIO.write方法的典型用法代码示例。如果您正苦于以下问题：Python SeqIO.write方法的具体用法？Python SeqIO.write怎么用？Python SeqIO.write使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类jcvi.formats.fasta.SeqIO的用法示例。
在下文中一共展示了SeqIO.write方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: flip
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def flip(args):
    """
    %prog flip fastafile
    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    # NOTE: the docstring above is handed to OptionParser and therefore is the
    # command's usage text; it is kept verbatim and implementation notes are
    # written as comments instead.
    #
    # args: list of command-line tokens; exactly one positional argument (the
    # FASTA file) is expected.  Output goes to `<fastafile stem>.flipped.fasta`.
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    # Scratch file holding a single record for each overlap() call.
    tmpfasta = "a.fasta"
    f = Fasta(fastafile, lazy=True)

    # The context manager guarantees the output is flushed and closed, also
    # when overlap() fails part-way through.
    with open(outfastafile, "w") as fo:
        for name, rec in f.iteritems_ordered():
            with open(tmpfasta, "w") as fw:
                SeqIO.write([rec], fw, "fasta")

            try:
                # The record name is passed as the second argument --
                # presumably resolved to the GenBank entry inside overlap();
                # TODO confirm against overlap()'s implementation.
                o = overlap([tmpfasta, name])
                if o.orientation == '-':
                    rec.seq = rec.seq.reverse_complement()

                SeqIO.write([rec], fo, "fasta")
            finally:
                # Never leave the scratch file behind, even on failure.
                os.remove(tmpfasta)
示例2: filter
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def filter(args):
    """
    %prog filter consensus.fasta
    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # The docstring doubles as the usage message, so it is left untouched.
    parser = OptionParser(filter.__doc__)
    parser.add_option("--minsize", default=10, type="int",
                      help="Minimum cluster size")
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    threshold = opts.minsize
    consensus = Fasta(fastafile, lazy=True)
    out = must_open(opts.outfile, "w")
    for desc, rec in consensus.iterdescriptions_ordered():
        if not desc.startswith("singleton"):
            # Expected layout: consensus_for_cluster_0 with 63 sequences
            _name, keyword, count, _unit = desc.split()
            assert keyword == "with"
            # Only clusters with at least `--minsize` members are written.
            if int(count) >= threshold:
                SeqIO.write(rec, out, "fasta")
示例3: extract_ends
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def extract_ends(rec, sites, flank, fw, maxfragsize=800):
    """
    Extraction of ends of fragments above certain size.

    For each position in `sites`, up to `flank` bases on the left and on the
    right are sliced from `rec.seq`, stripped of terminal N/n characters and
    written to `fw` in FASTA format under the ids "<rec.name>:<site>L" and
    "<rec.name>:<site>R".

    A side is considered only when the site is the first/last one or the
    neighbouring site on that side is at most `maxfragsize` away; for the
    first/last site the side is additionally dropped when the contig end is
    more than `maxfragsize` away.
    """
    last_idx = len(sites) - 1
    seqlen = len(rec)
    for idx, site in enumerate(sites):
        base_id = "{0}:{1}".format(rec.name, site)
        batch = []

        is_first = idx == 0
        if is_first or site - sites[idx - 1] <= maxfragsize:
            lo = max(site - flank, 0)
            left_frag = rec.seq[lo:site].strip("Nn")
            left_rec = SeqRecord(left_frag, id=base_id + "L", description="")
            # Contig L-end: skip when the contig start is too far away
            if not (is_first and site > maxfragsize):
                batch.append(left_rec)

        is_last = idx == last_idx
        if is_last or sites[idx + 1] - site <= maxfragsize:
            hi = min(site + flank, seqlen)
            right_frag = rec.seq[site:hi].strip("Nn")
            right_rec = SeqRecord(right_frag, id=base_id + "R", description="")
            # Contig R-end: skip when the contig end is too far away
            if not (is_last and seqlen - site > maxfragsize):
                batch.append(right_rec)

        # Flush whatever was collected for this site (possibly nothing).
        SeqIO.write(batch, fw, "fasta")
示例4: extract
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def extract(args):
    """
    %prog extract gffile
    --contigs: Extract particular contig(s) from the gff file. If multiple contigs are
    involved, use "," to separate, e.g. "contig_12,contig_150"
    --names: Provide a file with IDs, one each line
    """
    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(extract.__doc__)
    p.add_option("--contigs",
                 help="Extract features from certain contigs [default: %default]")
    p.add_option("--names",
                 help="Extract features with certain names [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Write FASTA if available [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    # None means "no filter" for both contigs and names.
    contigID = set(opts.contigs.split(",")) if opts.contigs else None
    names = None
    if opts.names:
        with open(opts.names) as namesfp:
            names = set(x.strip() for x in namesfp)

    # must_open() may hand back a stream that is not ours to close, so the
    # output handle is intentionally left open, as before.
    fw = must_open(opts.outfile, "w")
    with open(gffile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:
                continue

            tag = atoms[0]
            if row[0] == "#":
                # Directives are copied through, except region directives
                # that describe a contig outside the requested set.
                if not (tag == RegionTag and contigID and atoms[1] not in contigID):
                    print(row.rstrip(), file=fw)
                if tag == FastaTag:
                    # Everything after the FASTA directive is sequence data.
                    break
                continue

            b = GffLine(row)
            is_right_contig = (not contigID) or tag in contigID
            is_right_names = (not names) or b.attributes["Name"][0] in names

            if is_right_contig and is_right_names:
                print(row.rstrip(), file=fw)

    if not opts.fasta:
        return

    f = Fasta(gffile)
    if contigID:
        recs = (f[s] for s in contigID if s in f)
    else:
        # Without --contigs every feature is kept, so every sequence is kept
        # too (previously this path failed while iterating over None).
        recs = (rec for _, rec in f.iteritems_ordered())
    SeqIO.write(recs, fw, "fasta")
示例5: extract_full
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def extract_full(rec, sites, flank, fw):
    """
    Full extraction of seq flanking the sites.

    For every position in `sites` a window of `flank` bases on both sides
    (clipped to the record boundaries) is cut from `rec.seq`, stripped of
    terminal N/n characters and written to `fw` as a FASTA record whose id is
    "<rec.name>:<site>".
    """
    seqlen = len(rec)
    for site in sites:
        lo = max(site - flank, 0)
        hi = min(site + flank, seqlen)
        window = rec.seq[lo:hi].strip("Nn")
        flank_rec = SeqRecord(
            window,
            id="{0}:{1}".format(rec.name, site),
            description="",
        )
        SeqIO.write([flank_rec], fw, "fasta")
示例6: filter
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def filter(args):
    """
    %prog filter *.consensus.fasta
    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # Per-file prefix (s000, s001, ...) prepended to the written ids.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Drop the first word of the description before writing.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")

        logging.debug("Scanned {0} clusters with {1} reads ..".
                      format(nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # A file made only of singletons (or an empty file) has no clusters;
        # report an average of 0 instead of dividing by zero.
        avg = creads / cclusters if cclusters else 0
        logging.debug("Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
                      format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons

    logging.debug("Total assembled: {0}".
                  format(percentage(totalassembled, totalreads)))
示例7: merge
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def merge(args):
    """
    %prog merge gffiles
    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(merge.__doc__)
    set_outfile(p)

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        with open(listfile) as fp:
            # Ignore blank lines so they are not treated as (empty) paths.
            gffiles = [x.strip() for x in fp if x.strip()]
    else:
        gffiles = args

    outfile = opts.outfile

    # Directive lines already written; each distinct one is emitted once.
    deflines = set()
    fw = must_open(outfile, "w")
    # First sequence seen for each key wins.
    fastarecs = {}
    for gffile in gffiles:
        with open(gffile) as fp:
            for row in fp:
                row = row.rstrip()
                if not row:
                    # Blank lines carry no information (and used to crash
                    # the `row[0]` test below).
                    continue
                if row[0] == '#':
                    if row == FastaTag:
                        # Sequence section starts here; handled by Fasta().
                        break
                    if row in deflines:
                        continue
                    deflines.add(row)

                print(row, file=fw)

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            fastarecs.setdefault(key, rec)

    print(FastaTag, file=fw)
    SeqIO.write(fastarecs.values(), fw, "fasta")
示例8: needle
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta
    Take protein pairs and needle them
    Automatically writes output file `nw.scores`
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Results are appended by the worker processes through a managed list.
    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        a, b = row.split()
        a, b = afasta[a], bfasta[b]

        # One pair of scratch FASTA files per input row, keyed by row index.
        fa = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w")
        fb = must_open("{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results)
        )

    needle_pool.close()
    needle_pool.join()
    fp.close()

    # Strip only the last extension: without maxsplit, "./nw.pairs" became
    # ".scores" and "a.b.pairs" became "a.scores".
    scoresfile = "{0}.scores".format(pairsfile.rsplit(".", 1)[0])
    fw = must_open(scoresfile, "w")
    for result in results:
        print(result, file=fw)
    fw.close()
示例9: circular
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def circular(args):
    """
    %prog circular fastafile startpos
    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    from jcvi.assembly.goldenpath import overlap

    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(circular.__doc__)
    p.add_option("--flip", default=False, action="store_true",
                 help="Reverse complement the sequence")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Only the first record of the file is used.  The built-in next() works
    # on Python 2 and 3 alike, unlike the generator's .next() method.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): the usage text says startpos is 1-based, yet it is used
    # directly as a 0-based slice boundary here -- confirm which is intended.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    # Write both halves to scratch files so overlap() can align them.
    for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
        fw = must_open(f, "w")
        print(">{0}\n{1}".format(f, s), file=fw)
        fw.close()

    o = overlap([aseqfile, bseqfile])
    # Join the halves, trimming at the overlap coordinates reported above.
    seq = aseq[:o.qstop] + bseq[o.sstop:]
    seq = Seq(seq)

    if opts.flip:
        seq = seq.reverse_complement()

    for f in (aseqfile, bseqfile):
        os.remove(f)

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
示例10: needle
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta
    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(needle.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)

    # Scratch file names are the same for every pair; compute them once.
    afile = pairsfile + "_a.fasta"
    bfile = pairsfile + "_b.fasta"
    needlefile = pairsfile + "_ab.needle"

    with open(pairsfile) as fp:
        for row in fp:
            a, b = row.split()
            a = afasta[a]
            b = bfasta[b]
            with open(afile, "w") as fa:
                SeqIO.write([a], fa, "fasta")
            with open(bfile, "w") as fb:
                SeqIO.write([b], fb, "fasta")

            needle_cline = NeedleCommandline(asequence=afile,
                                             bsequence=bfile,
                                             gapopen=10, gapextend=0.5,
                                             outfile=needlefile)
            stdout, stderr = needle_cline()
            # Tool chatter goes to stderr so stdout carries only the scores.
            print(stdout + stderr, file=sys.stderr)

            nh = NeedleHeader(needlefile)
            print("\t".join((a.id, b.id, nh.identity, nh.score)))

            # Remove this pair's scratch files before moving on.
            FileShredder([afile, bfile, needlefile])
示例11: overlapbatch
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta
    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # Build the usage text from this command's own docstring; it was taken
    # from overlap() before, so the help advertised the wrong command name.
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        # overlap() is run on one record at a time, so every record is
        # written to its own "<key>.fasta" first.
        fastafile = k + ".fasta"
        with open(fastafile, "w") as fw:
            SeqIO.write([rec], fw, "fasta")

        overlap([fastafile, poolfasta])
示例12: expand
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def expand(args):
    """
    %prog expand bes.fasta reads.fastq
    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    # The docstring doubles as the usage message, so it is left untouched.
    parser = OptionParser(expand.__doc__)
    parser.set_depth(depth=200)
    parser.set_firstN()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bes, reads = args

    # Number of reads to keep: enough for the requested depth over the
    # template plus one read length on each side, rounded up to a multiple
    # of 1000.
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    wanted = (size + 2 * rl) * opts.depth / rl
    nreads = int(math.ceil(wanted / 1000.)) * 1000

    # Attract reads
    align_args = [bes, reads, "--reorder", "--mapped",
                  "--firstN={0}".format(opts.firstN)]
    samfile, logfile = align(align_args)
    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0].split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        sh("runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile))
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {query: hit for query, hit in Blast(blastfile).iter_best_hit()}

    contig_fasta = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    with open(annotatedfasta, "w") as fw:
        bes_order = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
        ranked = []
        for _key, contig in contig_fasta.iteritems_ordered():
            cid = contig.id
            if cid in mapping:
                hit = mapping[cid]
                subject = hit.subject
                if hit.orientation == '-':
                    oriented = contig.reverse_complement()
                else:
                    oriented = contig
                new_id = "_".join((pf, cid, subject))
                oriented.id = new_id
                oriented.description = ""
                # Sort key: position of the matched template in `bes`, then id.
                ranked.append((bes_order.index(subject), new_id, oriented))

        ranked.sort()
        recs = [entry[-1] for entry in ranked]
        SeqIO.write(recs, fw, "fasta")

    # Clean up the intermediate files and the assembly folder.
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".
                  format(len(recs), annotatedfasta))

    return annotatedfasta
示例13: prepare
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>
    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta
    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.
    # lookuptblfile format: column number (index)
    # 1 (0) 2 (1) 3 (2) 4 (3) 5 (4) 6 (5)
    # source_clone source_plate source_well dest_clone dest_plate dest_well
    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    from operator import itemgetter

    from jcvi.formats.fasta import Fasta, SeqIO

    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
                 help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
                 help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # Missing inputs are failures: exit with a non-zero status so calling
    # scripts can detect them (a bare sys.exit() reports success).
    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    # Columns 1 and 4 of the lookup table: source clone and destination clone.
    clone_pair = itemgetter(0, 3)
    with open(logfile, 'w') as log, open(lookuptblfile, 'r') as fp:
        for row in fp:
            origprefix, rearrayprefix = clone_pair(row.split('\t'))
            libpair = origprefix + '_' + rearrayprefix
            outfile = opts.output_folder + '/' + libpair + '.fasta'
            # NOTE(review): the clone names are used as regular expressions
            # anchored at the start of the read id, as before; names holding
            # regex metacharacters would need re.escape() -- confirm intent.
            orig_match = re.compile(origprefix).match
            rearray_match = re.compile(rearrayprefix).match
            with open(outfile, 'w') as ofp:
                for o in origlibids:
                    if orig_match(o):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')
                for r in rearraylibids:
                    if rearray_match(r):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')
            # The log lists every per-clone FASTA file that was produced.
            print(outfile, file=log)

    logging.debug('Wrote log file `{0}`'.format(logfile))
示例14: longest
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out
    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:
    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    # NOTE: the docstring is the usage text given to OptionParser and is kept
    # verbatim; implementation notes are written as comments.
    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    sizes = Sizes(fastafile).mapping

    def name_convert(x):
        # Swap the "asmbl" part of an assembly name for the chosen prefix.
        return x.replace("asmbl", opts.prefix)

    keep = set()  # IDs to write to the cleaned FASTA
    nrecs = 0
    # Both handles are closed on exit, whether or not an error occurs.
    with open(subclusters) as fp, open(idsfile, "w") as fw:
        for row in fp:
            if not row.startswith("sub-cluster:"):
                continue
            asmbls = row.split()[1:]
            if not asmbls:
                # A sub-cluster line without members has no longest assembly.
                continue

            longest_asmbl = max(asmbls, key=lambda x: sizes[x])
            longest_size = sizes[longest_asmbl]
            print(name_convert(longest_asmbl), file=fw)
            nrecs += 1
            # Keep assemblies at least half as long as the longest one and
            # never shorter than 200bp.
            cutoff = max(longest_size / 2, 200)
            keep.update(x for x in asmbls if sizes[x] >= cutoff)

    logging.debug("{0} fl-cDNA records written to `{1}`.".format(nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    nrecs = 0
    with open(newfastafile, "w") as fw:
        for name, rec in f.iteritems_ordered():
            if name not in keep:
                continue

            rec.id = name_convert(name)
            rec.description = ""
            SeqIO.write([rec], fw, "fasta")
            nrecs += 1

    logging.debug("{0} valid records written to `{1}`.".format(nrecs, newfastafile))
示例15: overlap
# 需要导入模块: from jcvi.formats.fasta import SeqIO [as 别名]
# 或者: from jcvi.formats.fasta.SeqIO import write [as 别名]
def overlap(args):
"""
%prog overlap ctgfasta poolfasta
Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
Mix and combine using `minimus2`.
"""
p = OptionParser(overlap.__doc__)
opts, args = p.parse_args(args)
if len(args) != 2:
sys.exit(not p.print_help())
ctgfasta, poolfasta = args
prefix = ctgfasta.split(".")[0]
rid = list(Fasta(ctgfasta).iterkeys())
assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"
rid = rid[0]
splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)
# Run BLAST
blastfile = ctgfasta + ".blast"
run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)
# Extract contigs and merge using minimus2
closuredir = prefix + ".closure"
closure = False
if need_update(blastfile, closuredir):
mkdir(closuredir, overwrite=True)
closure = True
if closure:
idsfile = op.join(closuredir, prefix + ".ids")
cmd = "cut -f2 {0} | sort -u".format(blastfile)
sh(cmd, outfile=idsfile)
idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
sh(cmd)
# This step is a hack to weight the bases from original sequences more
# than the pulled sequences, by literally adding another copy to be used
# in consensus calls.
redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
format([ctgfasta, redundantfastafile, "--prefix=RED."])
mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
sh(cmd, outfile=mergedfastafile)
afgfile = op.join(closuredir, prefix + ".afg")
cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
sh(cmd)
cwd = os.getcwd()
os.chdir(closuredir)
cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
cmd += " -D OVERLAP=100 -D MINID=98"
sh(cmd)
os.chdir(cwd)
# Analyze output, make sure that:
# + Get the singletons of the original set back
# + Drop any contig that is comprised entirely of pulled set
originalIDs = set(Fasta(ctgfasta).iterkeys())
minimuscontig = op.join(closuredir, prefix + ".contig")
c = ContigFile(minimuscontig)
excludecontigs = set()
for rec in c.iter_records():
reads = set(x.id for x in rec.reads)
if reads.isdisjoint(originalIDs):
excludecontigs.add(rec.id)
logging.debug("Exclude contigs: {0}".\
format(", ".join(sorted(excludecontigs))))
finalfasta = prefix + ".improved.fasta_"
fw = open(finalfasta, "w")
minimusfasta = op.join(closuredir, prefix + ".fasta")
f = Fasta(minimusfasta)
for id, rec in f.iteritems_ordered():
if id in excludecontigs:
continue
SeqIO.write([rec], fw, "fasta")
singletonfile = op.join(closuredir, prefix + ".singletons")
singletons = set(x.strip() for x in open(singletonfile))
leftovers = singletons & originalIDs
logging.debug("Pull leftover singletons: {0}".\
format(", ".join(sorted(leftovers))))
f = Fasta(ctgfasta)
for id, rec in f.iteritems_ordered():
if id not in leftovers:
continue
SeqIO.write([rec], fw, "fasta")
#.........这里部分代码省略.........