Python FastaIterator.iterate方法代码示例

本文整理汇总了Python中CGAT.FastaIterator.iterate方法的典型用法代码示例。如果您正苦于以下问题：Python FastaIterator.iterate方法的具体用法？Python FastaIterator.iterate怎么用？Python FastaIterator.iterate使用的例子？那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类CGAT.FastaIterator的用法示例。

在下文中一共展示了FastaIterator.iterate方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()

开发者ID:CGATOxford，项目名称:Optic，代码行数:37，代码来源:species_map2species_map.py

示例2: maskSequences

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]

        os.remove(infile)

        return result

开发者ID:BioXiao，项目名称:cgat，代码行数:35，代码来源:Masker.py

示例3: calculateSequenceComposition

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()

开发者ID:CGATOxford，项目名称:CGATPipelines，代码行数:33，代码来源:PipelineTransfacMatch.py

示例4: buildInputFiles

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)
    
    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()

开发者ID:nickilott，项目名称:qpcr，代码行数:27，代码来源:pipeline_primerdesign.py

示例5: filterByCoverage

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print contigs
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()

开发者ID:BioXiao，项目名称:CGATPipelines，代码行数:28，代码来源:PipelineMetagenomeBenchmark.py

示例6: main

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs.keys()))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, coords in contigs.iteritems():
        coords = list(coords)

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, coords[0], coords[1]):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if len(species_counts.values()) == 0:
            E.warn("no reads map to %s" % contig)
            continue

        for species, count in species_counts.iteritems():
            if species_counts[species] == max(species_counts.values()):
                top_dog = species
                c += 1
                break
        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()

开发者ID:Charlie-George，项目名称:cgat，代码行数:62，代码来源:bam2species_map.py

示例7: main

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id$", 
                             usage = globals()["__doc__"] )

    parser.add_option("-n", dest="N", type="int",
                      help="e.g N50 - the length at which 50% of contigs are equal or above")
    parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
                      help="calculate stats on contigs longer than -f")

    parser.set_defaults(N = 50,
                        filter_length = 0)

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )
    
    f = options.filter_length

    # iterate over the contigs/scaffolds and return stats                                                                                                                                                                                              
    number_of_contigs = 0

    N = options.N
    contig_lengths = []

    for record in FastaIterator.iterate(options.stdin):
        contig_length = len(list(record.sequence))
        if contig_length >= f:
            number_of_contigs += 1
            contig_lengths.append(contig_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(contig_lengths)
    median_length = np.median(contig_lengths)
    max_length = max(contig_lengths)

    # iterate over contigs/scaffolds sorted by longest                                                                                                                                                                                               
    # and caculate the NX                                                                                                                                                                                                                    
    index = 0
    cum_length = 0
    total_length = sum(contig_lengths)
    for length in sorted(contig_lengths, reverse = True):
        while cum_length <= total_length*(float(N)/100):
            index += 1
            cum_length += length

    # output the results                                                                                                                                                                                                                     
    options.stdout.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs, total_length, sorted(contig_lengths, reverse = True)[index], str(median_length), str(mean_length), str(max_length)))

    ## write footer and output benchmark information.
    E.Stop()

开发者ID:BioinformaticsArchive，项目名称:cgat，代码行数:60，代码来源:contigs2stats.py

示例8: filterContigs

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length: continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()

开发者ID:siping，项目名称:cgat，代码行数:12，代码来源:PipelineMetagenomeAssembly.py

示例9: contig_to_stats

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def contig_to_stats(contigs_file, stats_file, params):
    """
    calculate descriptive stats for a set
    of contigs / scaffolds
    """

    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and caculate the NX
    index = 0
    cum_length = 0
    total_length = sum(scaffold_lengths)
    for length in sorted(scaffold_lengths, reverse=True):
        while cum_length <= total_length * (float(N) / 100):
            index += 1
            cum_length += length

    # output the results
    outf = open(stats_file, "w")
    outf.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    outf.write(
        "%s\t%s\t%s\t%s\t%s\t%s\n"
        % (
            number_of_scaffolds,
            total_length,
            sorted(scaffold_lengths, reverse=True)[index],
            str(median_length),
            str(mean_length),
            str(max_length),
        )
    )

开发者ID:nishantthakur，项目名称:cgat，代码行数:57，代码来源:PipelineMetagenomeAssembly.py

示例10: build_scaffold_lengths

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        outf.write("%s\t%i\n" % (record.title, scaffold_length))
    outf.close()

开发者ID:Charlie-George，项目名称:cgat，代码行数:13，代码来源:PipelineMetagenomeAssembly.py

示例11: collectGenomeSizes

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()

开发者ID:yangjl，项目名称:cgat，代码行数:15，代码来源:pipeline_metagenomebenchmark.py

示例12: buildMisprimingLib

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.openFile(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()

开发者ID:nickilott，项目名称:qpcr，代码行数:18，代码来源:pipeline_primerdesign.py

示例13: build_scaffold_lengths

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length > f:
            # rename sequences if they have a space in them
            outf.write("%s\t%i\n" % (record.title.replace(" ", "_"), scaffold_length))
    outf.close()

开发者ID:siping，项目名称:cgat，代码行数:21，代码来源:PipelineMetagenomeAssembly.py

示例14: countCompleteGenes

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def countCompleteGenes(infile, outfile):
    '''
    count the number of genes that are classed
    as complete based on having a start and stop codon
    '''
    start = "ATG"
    stop = ["TAG", "TAA", "TGA"]

    ntotal = 0
    nstart = 0
    nstop = 0
    nstart_nstop = 0
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        ntotal += 1
        if fasta.sequence.startswith(start):
            nstart += 1
        if fasta.sequence[-3:len(fasta.sequence)] in stop:
            nstop += 1
        if fasta.sequence.startswith(start) and fasta.sequence[-3:len(fasta.sequence)] in stop:
            nstart_nstop += 1
    outf = open(outfile, "w")
    outf.write("total_genes\tpstart\tpstop\tpstart_stop\n")
    outf.write("\t".join(map(str,[ntotal, float(nstart)/ntotal, float(nstop)/ntotal, float(nstart_nstop)/ntotal])) + "\n")
    outf.close()

开发者ID:Charlie-George，项目名称:cgat，代码行数:26，代码来源:pipeline_metagenomebenchmark.py

示例15: main

# 需要导入模块: from CGAT import FastaIterator [as 别名]
# 或者: from CGAT.FastaIterator import iterate [as 别名]
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1", "--pattern1", dest="pattern1", type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2", "--pattern2", dest="pattern2", type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option(
        "-o", "--output-section", dest="output", type="choice",
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[0], "r"))])
    seqs2 = dict([
        (x.title, x.sequence) for x in FastaIterator.iterate(
            IOTools.openFile(args[1], "r"))])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

#.........这里部分代码省略.........

开发者ID:CGATOxford，项目名称:cgat，代码行数:103，代码来源:diff_fasta.py

注：本文中的CGAT.FastaIterator.iterate方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。