本文整理汇总了Python中CGAT.FastaIterator类的典型用法代码示例。如果您正苦于以下问题:Python FastaIterator类的具体用法?Python FastaIterator怎么用?Python FastaIterator使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了FastaIterator类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: maskSequences
def maskSequences(self, sequences):
'''mask a collection of sequences.'''
outfile, infile = tempfile.mkstemp()
for x, s in enumerate(sequences):
os.write(outfile, ">%i\n%s\n" % (x, s))
os.close(outfile)
statement = self.mCommand % locals()
E.debug("statement: %s" % statement)
s = subprocess.Popen(statement,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=True)
(out, err) = s.communicate()
if s.returncode != 0:
raise RuntimeError(
"Error in running %s \n%s\nTemporary directory" %
(statement, err))
result = [
x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]
os.remove(infile)
return result
示例2: main
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
usage=globals()["__doc__"])
parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
help="supply bam file")
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
# read in contigs
E.info("reading in contig file")
contigs = {}
for fasta in FastaIterator.iterate(options.stdin):
contigs[fasta.title] = (1, len(fasta.sequence) - 1)
E.info("read %i contigs" % len(contigs.keys()))
# read in bamfile
E.info("reading bam file")
samfile = pysam.Samfile(options.bamfile)
E.info("iterating over contigs")
c = 0
for contig, coords in contigs.iteritems():
coords = list(coords)
#################################
# NB this is specific for my data!
contig = contig.split(" ")[0]
#################################
species_counts = collections.defaultdict(int)
for alignment in samfile.fetch(contig, coords[0], coords[1]):
species_id = alignment.qname.split("|")[1]
species_counts[species_id] += 1
# at the moment ignore if there are no counts
if len(species_counts.values()) == 0:
E.warn("no reads map to %s" % contig)
continue
for species, count in species_counts.iteritems():
if species_counts[species] == max(species_counts.values()):
top_dog = species
c += 1
break
E.info("species %s assigned to contig number %i" % (top_dog, c))
options.stdout.write("%s\t%s\n" % (contig, top_dog))
# write footer and output benchmark information.
E.Stop()
示例3: main
def main(argv=None):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if not argv:
argv = sys.argv
# setup command line parser
parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
usage=globals()["__doc__"])
parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
help="supply help")
# add common options (-h/--help, ...) and parse command line
(options, args) = E.Start(parser, argv=argv)
contigs_map = {}
for genome in glob.glob(os.path.join(options.genome_dir, "*")):
for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
identifier = fasta.title.split("|")
gi = identifier[1]
contigs_map[gi] = fasta.title
for line in options.stdin.readlines():
data = line[:-1].split("\t")
gi = data[1]
assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi
options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))
# write footer and output benchmark information.
E.Stop()
示例4: calculateSequenceComposition
def calculateSequenceComposition(interval_names,
sequence_file,
outfile,
header_line=True):
'''
given a set of interval names that are present in a
fasta file, return CpG content file
'''
interval_file = open(interval_names)
if header_line:
interval_file.readline()
sequence_file = open(sequence_file)
interval_set = set()
for line in interval_file.readlines():
interval_set.add(line[:-1])
temp = P.getTempFile("/ifs/scratch")
for record in FastaIterator.iterate(sequence_file):
seq_id = record.title.split(" ")[0]
if seq_id in interval_set:
temp.write(">%s\n%s\n" % (record.title, record.sequence))
temp.close()
inf = temp.name
statement = '''
cat %(inf)s | cgat fasta2table
-s na -s cpg -s length
--log=%(outfile)s.log > %(outfile)s'''
P.run()
示例5: filterByCoverage
def filterByCoverage(infiles, outfile):
fcoverage = PARAMS["coverage_filter"]
contig_file = infiles[0]
dbh = sqlite3.connect(
os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
cc = dbh.cursor()
contigs = set()
for infile in infiles[1:]:
dirsplit = infile.split("/")
infile = os.path.join(
PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
tablename = P.toTable(os.path.basename(infile))
if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
statement = """SELECT contig_id ave FROM
(SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
for data in cc.execute(statement).fetchall():
contigs.add(data[0])
outf = open(outfile, "w")
print contigs
for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
identifier = fasta.title.split(" ")[0]
if identifier in contigs:
outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
outf.close()
示例6: buildInputFiles
def buildInputFiles(infile, outfiles):
'''
build input file based on parameters and fasta sequences
that primers are to be designed for
'''
PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]
fasta, identifiers = infile[0], "identifiers.tsv"
inf = IOTools.openFile(fasta)
E.info("Reading ids for primer design")
ids = readIdentifiers(identifiers)
E.info("collecting sequences")
for f in FastaIterator.iterate(IOTools.openFile(fasta)):
if f.title in ids:
outf = IOTools.openFile(os.path.join(
"input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
seq = f.sequence
outf.write("SEQUENCE_ID=%s\n" % f.title)
for key, value in PARAMS.iteritems():
if "constraints" in key:
outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
outf.close()
示例7: iterate_double_fasta
def iterate_double_fasta ( fn1, fn2 ):
iterator = FastaIterator.iterate_together( fn1, fn2 )
for seq1, seq2 in iterator:
yield AlignedPairs.UnalignedPair(
token1 = seq1.title,
sequence1 = seq1.sequence,
token2 = seq2.title,
sequence2 = seq2.sequence )
示例8: main
def main( argv = None ):
"""script main.
parses command line options in sys.argv, unless *argv* is given.
"""
if argv == None: argv = sys.argv
# setup command line parser
parser = E.OptionParser( version = "%prog version: $Id$",
usage = globals()["__doc__"] )
parser.add_option("-n", dest="N", type="int",
help="e.g N50 - the length at which 50% of contigs are equal or above")
parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
help="calculate stats on contigs longer than -f")
parser.set_defaults(N = 50,
filter_length = 0)
## add common options (-h/--help, ...) and parse command line
(options, args) = E.Start( parser, argv = argv )
f = options.filter_length
# iterate over the contigs/scaffolds and return stats
number_of_contigs = 0
N = options.N
contig_lengths = []
for record in FastaIterator.iterate(options.stdin):
contig_length = len(list(record.sequence))
if contig_length >= f:
number_of_contigs += 1
contig_lengths.append(contig_length)
# mean, median and max contig/scaffold lengths
mean_length = np.mean(contig_lengths)
median_length = np.median(contig_lengths)
max_length = max(contig_lengths)
# iterate over contigs/scaffolds sorted by longest
# and caculate the NX
index = 0
cum_length = 0
total_length = sum(contig_lengths)
for length in sorted(contig_lengths, reverse = True):
while cum_length <= total_length*(float(N)/100):
index += 1
cum_length += length
# output the results
options.stdout.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs, total_length, sorted(contig_lengths, reverse = True)[index], str(median_length), str(mean_length), str(max_length)))
## write footer and output benchmark information.
E.Stop()
示例9: filterContigs
def filterContigs(infile, outfile, length):
'''
filter contigs by length
'''
outf = open(outfile, "w")
for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
seq_length = len(fasta.sequence)
if seq_length < length: continue
outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
outf.close()
示例10: contig_to_stats
def contig_to_stats(contigs_file, stats_file, params):
"""
calculate descriptive stats for a set
of contigs / scaffolds
"""
PARAMS = params
if PARAMS["filter"]:
f = PARAMS["filter"]
else:
f = 0
# iterate over the contigs/scaffolds and return stats
number_of_scaffolds = 0
N = PARAMS["scaffold_n"]
scaffold_lengths = []
inf = open(contigs_file)
for record in FastaIterator.iterate(inf):
scaffold_length = len(list(record.sequence))
if scaffold_length >= f:
number_of_scaffolds += 1
scaffold_lengths.append(scaffold_length)
# mean, median and max contig/scaffold lengths
mean_length = np.mean(scaffold_lengths)
median_length = np.median(scaffold_lengths)
max_length = max(scaffold_lengths)
# iterate over contigs/scaffolds sorted by longest
# and caculate the NX
index = 0
cum_length = 0
total_length = sum(scaffold_lengths)
for length in sorted(scaffold_lengths, reverse=True):
while cum_length <= total_length * (float(N) / 100):
index += 1
cum_length += length
# output the results
outf = open(stats_file, "w")
outf.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
outf.write(
"%s\t%s\t%s\t%s\t%s\t%s\n"
% (
number_of_scaffolds,
total_length,
sorted(scaffold_lengths, reverse=True)[index],
str(median_length),
str(mean_length),
str(max_length),
)
)
示例11: build_scaffold_lengths
def build_scaffold_lengths(contigs_file, outfile, params):
'''
output the distribution of scaffold lengths
'''
inf = open(contigs_file)
outf = open(outfile, "w")
outf.write("scaffold_name\tlength\n")
for record in FastaIterator.iterate(inf):
scaffold_length = len(list(record.sequence))
outf.write("%s\t%i\n" % (record.title, scaffold_length))
outf.close()
示例12: collectGenomeSizes
def collectGenomeSizes(infile, outfile):
'''
output the genome sizes for each genome
'''
to_cluster = True
outf = open(outfile, "w")
outf.write("genome\tlength\n")
# assume single fasta entry
for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
name = P.snip(os.path.basename(infile), ".fna")
length = len(list(fasta.sequence))
outf.write("%s\t%s\n" % (name, str(length)))
outf.close()
示例13: buildMisprimingLib
def buildMisprimingLib(infiles, outfile):
'''
build fasta file of sequences to check for mispriming
'''
fasta, identifiers = infiles
inf = IOTools.openFile(fasta)
E.info("reading ids for sequences to keep")
ids = readIdentifiers(identifiers)
outf = IOTools.openFile(outfile, "w")
E.info("collecting sequences")
for f in FastaIterator.iterate(IOTools.openFile(fasta)):
if f.title not in ids:
outf.write(">%s\n%s\n" % (f.title, f.sequence))
outf.close()
示例14: build_scaffold_lengths
def build_scaffold_lengths(contigs_file, outfile, params):
'''
output the distribution of scaffold lengths
'''
PARAMS = params
if PARAMS["filter"]:
f = PARAMS["filter"]
else:
f = 0
inf = open(contigs_file)
outf = open(outfile, "w")
outf.write("scaffold_name\tlength\n")
for record in FastaIterator.iterate(inf):
scaffold_length = len(list(record.sequence))
if scaffold_length > f:
# rename sequences if they have a space in them
outf.write("%s\t%i\n" % (record.title.replace(" ", "_"), scaffold_length))
outf.close()
示例15: runMEMEOnSequences
def runMEMEOnSequences(infile, outfile):
'''run MEME to find motifs.
In order to increase the signal/noise ratio,
MEME is not run on all intervals but only the
top 10% of intervals (peakval) are used.
Also, only the segment of 200 bp around the peak
is used and not the complete interval.
* Softmasked sequence is converted to hardmasked
sequence to avoid the detection of spurious motifs.
* Sequence is run through dustmasker
'''
to_cluster = True
# job_options = "-l mem_free=8000M"
nseqs = int(FastaIterator.count(infile))
if nseqs == 0:
E.warn("%s: no sequences - meme skipped" % outfile)
P.touch(outfile)
return
target_path = os.path.join(
os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
tmpdir = P.getTempDir(".")
statement = '''
meme %(infile)s -dna -revcomp
-mod %(meme_model)s
-nmotifs %(meme_nmotifs)s
-oc %(tmpdir)s
-maxsize %(motifs_max_size)s
%(meme_options)s
> %(outfile)s.log
'''
P.run()
collectMEMEResults(tmpdir, target_path, outfile)