本文整理汇总了Python中CGAT.FastaIterator.iterate方法的典型用法代码示例。如果您正苦于以下问题:Python FastaIterator.iterate方法的具体用法?Python FastaIterator.iterate怎么用?Python FastaIterator.iterate使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类CGAT.FastaIterator
的用法示例。
在下文中一共展示了FastaIterator.iterate方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
usage=globals()["__doc__"])
parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
help="supply help")
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
contigs_map = {}
for genome in glob.glob(os.path.join(options.genome_dir, "*")):
for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
identifier = fasta.title.split("|")
gi = identifier[1]
contigs_map[gi] = fasta.title
for line in options.stdin.readlines():
data = line[:-1].split("\t")
gi = data[1]
assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi
options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))
# write footer and output benchmark information.
E.Stop()
示例2: maskSequences
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def maskSequences(self, sequences):
'''mask a collection of sequences.'''
outfile, infile = tempfile.mkstemp()
for x, s in enumerate(sequences):
os.write(outfile, ">%i\n%s\n" % (x, s))
os.close(outfile)
statement = self.mCommand % locals()
E.debug("statement: %s" % statement)
s = subprocess.Popen(statement,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=True)
(out, err) = s.communicate()
if s.returncode != 0:
raise RuntimeError(
"Error in running %s \n%s\nTemporary directory" %
(statement, err))
result = [
x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]
os.remove(infile)
return result
示例3: calculateSequenceComposition
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def calculateSequenceComposition(interval_names,
sequence_file,
outfile,
header_line=True):
'''
given a set of interval names that are present in a
fasta file, return CpG content file
'''
interval_file = open(interval_names)
if header_line:
interval_file.readline()
sequence_file = open(sequence_file)
interval_set = set()
for line in interval_file.readlines():
interval_set.add(line[:-1])
temp = P.getTempFile("/ifs/scratch")
for record in FastaIterator.iterate(sequence_file):
seq_id = record.title.split(" ")[0]
if seq_id in interval_set:
temp.write(">%s\n%s\n" % (record.title, record.sequence))
temp.close()
inf = temp.name
statement = '''
cat %(inf)s | cgat fasta2table
-s na -s cpg -s length
--log=%(outfile)s.log > %(outfile)s'''
P.run()
示例4: buildInputFiles
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def buildInputFiles(infile, outfiles):
'''
build input file based on parameters and fasta sequences
that primers are to be designed for
'''
PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]
fasta, identifiers = infile[0], "identifiers.tsv"
inf = IOTools.openFile(fasta)
E.info("Reading ids for primer design")
ids = readIdentifiers(identifiers)
E.info("collecting sequences")
for f in FastaIterator.iterate(IOTools.openFile(fasta)):
if f.title in ids:
outf = IOTools.openFile(os.path.join(
"input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
seq = f.sequence
outf.write("SEQUENCE_ID=%s\n" % f.title)
for key, value in PARAMS.iteritems():
if "constraints" in key:
outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
outf.close()
示例5: filterByCoverage
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def filterByCoverage(infiles, outfile):
fcoverage = PARAMS["coverage_filter"]
contig_file = infiles[0]
dbh = sqlite3.connect(
os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
cc = dbh.cursor()
contigs = set()
for infile in infiles[1:]:
dirsplit = infile.split("/")
infile = os.path.join(
PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
tablename = P.toTable(os.path.basename(infile))
if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
statement = """SELECT contig_id ave FROM
(SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
for data in cc.execute(statement).fetchall():
contigs.add(data[0])
outf = open(outfile, "w")
print contigs
for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
identifier = fasta.title.split(" ")[0]
if identifier in contigs:
outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
outf.close()
示例6: main
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
usage=globals()["__doc__"])
parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
help="supply bam file")
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
# read in contigs
E.info("reading in contig file")
contigs = {}
for fasta in FastaIterator.iterate(options.stdin):
contigs[fasta.title] = (1, len(fasta.sequence) - 1)
E.info("read %i contigs" % len(contigs.keys()))
# read in bamfile
E.info("reading bam file")
samfile = pysam.Samfile(options.bamfile)
E.info("iterating over contigs")
c = 0
for contig, coords in contigs.iteritems():
coords = list(coords)
#################################
# NB this is specific for my data!
contig = contig.split(" ")[0]
#################################
species_counts = collections.defaultdict(int)
for alignment in samfile.fetch(contig, coords[0], coords[1]):
species_id = alignment.qname.split("|")[1]
species_counts[species_id] += 1
# at the moment ignore if there are no counts
if len(species_counts.values()) == 0:
E.warn("no reads map to %s" % contig)
continue
for species, count in species_counts.iteritems():
if species_counts[species] == max(species_counts.values()):
top_dog = species
c += 1
break
E.info("species %s assigned to contig number %i" % (top_dog, c))
options.stdout.write("%s\t%s\n" % (contig, top_dog))
# write footer and output benchmark information.
E.Stop()
示例7: main
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main( argv = None ):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv == None: argv = sys.argv
# setup command line parser
parser = E.OptionParser( version = "%prog version: $Id$",
usage = globals()["__doc__"] )
parser.add_option("-n", dest="N", type="int",
help="e.g N50 - the length at which 50% of contigs are equal or above")
parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
help="calculate stats on contigs longer than -f")
parser.set_defaults(N = 50,
filter_length = 0)
## add common options (-h/--help, ...) and parse command line
(options, args) = E.Start( parser, argv = argv )
f = options.filter_length
# iterate over the contigs/scaffolds and return stats
number_of_contigs = 0
N = options.N
contig_lengths = []
for record in FastaIterator.iterate(options.stdin):
contig_length = len(list(record.sequence))
if contig_length >= f:
number_of_contigs += 1
contig_lengths.append(contig_length)
# mean, median and max contig/scaffold lengths
mean_length = np.mean(contig_lengths)
median_length = np.median(contig_lengths)
max_length = max(contig_lengths)
# iterate over contigs/scaffolds sorted by longest
# and caculate the NX
index = 0
cum_length = 0
total_length = sum(contig_lengths)
for length in sorted(contig_lengths, reverse = True):
while cum_length <= total_length*(float(N)/100):
index += 1
cum_length += length
# output the results
options.stdout.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs, total_length, sorted(contig_lengths, reverse = True)[index], str(median_length), str(mean_length), str(max_length)))
## write footer and output benchmark information.
E.Stop()
示例8: filterContigs
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def filterContigs(infile, outfile, length):
'''
filter contigs by length
'''
outf = open(outfile, "w")
for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
seq_length = len(fasta.sequence)
if seq_length < length: continue
outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
outf.close()
示例9: contig_to_stats
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def contig_to_stats(contigs_file, stats_file, params):
"""
calculate descriptive stats for a set
of contigs / scaffolds
"""
PARAMS = params
if PARAMS["filter"]:
f = PARAMS["filter"]
else:
f = 0
# iterate over the contigs/scaffolds and return stats
number_of_scaffolds = 0
N = PARAMS["scaffold_n"]
scaffold_lengths = []
inf = open(contigs_file)
for record in FastaIterator.iterate(inf):
scaffold_length = len(list(record.sequence))
if scaffold_length >= f:
number_of_scaffolds += 1
scaffold_lengths.append(scaffold_length)
# mean, median and max contig/scaffold lengths
mean_length = np.mean(scaffold_lengths)
median_length = np.median(scaffold_lengths)
max_length = max(scaffold_lengths)
# iterate over contigs/scaffolds sorted by longest
# and caculate the NX
index = 0
cum_length = 0
total_length = sum(scaffold_lengths)
for length in sorted(scaffold_lengths, reverse=True):
while cum_length <= total_length * (float(N) / 100):
index += 1
cum_length += length
# output the results
outf = open(stats_file, "w")
outf.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
outf.write(
"%s\t%s\t%s\t%s\t%s\t%s\n"
% (
number_of_scaffolds,
total_length,
sorted(scaffold_lengths, reverse=True)[index],
str(median_length),
str(mean_length),
str(max_length),
)
)
示例10: build_scaffold_lengths
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def build_scaffold_lengths(contigs_file, outfile, params):
'''
output the distribution of scaffold lengths
'''
inf = open(contigs_file)
outf = open(outfile, "w")
outf.write("scaffold_name\tlength\n")
for record in FastaIterator.iterate(inf):
scaffold_length = len(list(record.sequence))
outf.write("%s\t%i\n" % (record.title, scaffold_length))
outf.close()
示例11: collectGenomeSizes
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def collectGenomeSizes(infile, outfile):
'''
output the genome sizes for each genome
'''
to_cluster = True
outf = open(outfile, "w")
outf.write("genome\tlength\n")
# assume single fasta entry
for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
name = P.snip(os.path.basename(infile), ".fna")
length = len(list(fasta.sequence))
outf.write("%s\t%s\n" % (name, str(length)))
outf.close()
示例12: buildMisprimingLib
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def buildMisprimingLib(infiles, outfile):
'''
build fasta file of sequences to check for mispriming
'''
fasta, identifiers = infiles
inf = IOTools.openFile(fasta)
E.info("reading ids for sequences to keep")
ids = readIdentifiers(identifiers)
outf = IOTools.openFile(outfile, "w")
E.info("collecting sequences")
for f in FastaIterator.iterate(IOTools.openFile(fasta)):
if f.title not in ids:
outf.write(">%s\n%s\n" % (f.title, f.sequence))
outf.close()
示例13: build_scaffold_lengths
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def build_scaffold_lengths(contigs_file, outfile, params):
'''
output the distribution of scaffold lengths
'''
PARAMS = params
if PARAMS["filter"]:
f = PARAMS["filter"]
else:
f = 0
inf = open(contigs_file)
outf = open(outfile, "w")
outf.write("scaffold_name\tlength\n")
for record in FastaIterator.iterate(inf):
scaffold_length = len(list(record.sequence))
if scaffold_length > f:
# rename sequences if they have a space in them
outf.write("%s\t%i\n" % (record.title.replace(" ", "_"), scaffold_length))
outf.close()
示例14: countCompleteGenes
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def countCompleteGenes(infile, outfile):
'''
count the number of genes that are classed
as complete based on having a start and stop codon
'''
start = "ATG"
stop = ["TAG", "TAA", "TGA"]
ntotal = 0
nstart = 0
nstop = 0
nstart_nstop = 0
for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
ntotal += 1
if fasta.sequence.startswith(start):
nstart += 1
if fasta.sequence[-3:len(fasta.sequence)] in stop:
nstop += 1
if fasta.sequence.startswith(start) and fasta.sequence[-3:len(fasta.sequence)] in stop:
nstart_nstop += 1
outf = open(outfile, "w")
outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
outf.write("\t".join(map(str,[ntotal, float(nstart)/ntotal, float(nstop)/ntotal, float(nstart_nstop)/ntotal])) + "\n")
outf.close()
示例15: main
# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main(argv=None):
if argv is None:
argv = sys.argv
parser = E.OptionParser(version="%prog version: $Id$",
usage=globals()["__doc__"])
parser.add_option(
"-s", "--correct-gap-shift", dest="correct_shift",
action="store_true",
help="correct gap length shifts in alignments. "
"Requires alignlib_lite.py [%default]")
parser.add_option(
"-1", "--pattern1", dest="pattern1", type="string",
help="pattern to extract identifier from in identifiers1. "
"[%default]")
parser.add_option(
"-2", "--pattern2", dest="pattern2", type="string",
help="pattern to extract identifier from in identifiers2. "
"[%default]")
parser.add_option(
"-o", "--output-section", dest="output", type="choice",
action="append",
choices=("diff", "missed", "seqdiff"),
help="what to output [%default]")
parser.set_defaults(correct_shift=False,
pattern1="(\S+)",
pattern2="(\S+)",
output=[])
(options, args) = E.Start(parser)
if len(args) != 2:
raise ValueError("two files needed to compare.")
if options.correct_shift:
try:
import alignlib_lite
except ImportError:
raise ImportError(
"option --correct-shift requires alignlib_lite.py_ "
"but alignlib not found")
seqs1 = dict([
(x.title, x.sequence) for x in FastaIterator.iterate(
IOTools.openFile(args[0], "r"))])
seqs2 = dict([
(x.title, x.sequence) for x in FastaIterator.iterate(
IOTools.openFile(args[1], "r"))])
if not seqs1:
raise ValueError("first file %s is empty." % (args[0]))
if not seqs2:
raise ValueError("second file %s is empty." % (args[1]))
MapIdentifiers(seqs1, options.pattern1)
MapIdentifiers(seqs2, options.pattern2)
nsame = 0
nmissed1 = 0
nmissed2 = 0
ndiff = 0
ndiff_first = 0
ndiff_last = 0
ndiff_prefix = 0
ndiff_selenocysteine = 0
ndiff_masked = 0
nfixed = 0
found2 = {}
write_missed1 = "missed" in options.output
write_missed2 = "missed" in options.output
write_seqdiff = "seqdiff" in options.output
write_diff = "diff" in options.output or write_seqdiff
for k in sorted(seqs1):
if k not in seqs2:
nmissed1 += 1
if write_missed1:
options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
continue
found2[k] = 1
s1 = seqs1[k].upper()
s2 = seqs2[k].upper()
m = min(len(s1), len(s2))
if s1 == s2:
nsame += 1
else:
status = "other"
ndiff += 1
#.........这里部分代码省略.........