

Python CGAT.FastaIterator Class Code Examples

This article collects and summarizes typical usage examples of the CGAT.FastaIterator class in Python. If you have been wondering what the FastaIterator class is for, how to use it, or what real-world usage looks like, the curated class examples below should help.


The following presents 15 code examples of the FastaIterator class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
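Most of the examples below share the same basic pattern: FastaIterator.iterate yields records that expose .title and .sequence attributes. Here is a minimal sketch of that pattern; the input file name "contigs.fasta" is a hypothetical placeholder and the snippet is illustrative only, not taken from any of the projects listed below.

import sys

from CGAT import FastaIterator
from CGAT import IOTools

# minimal usage sketch (the input file name "contigs.fasta" is a placeholder):
# iterate over a FASTA file and report the identifier and length of each record
for record in FastaIterator.iterate(IOTools.openFile("contigs.fasta")):
    identifier = record.title.split(" ")[0]  # first token of the FASTA header
    sys.stdout.write("%s\t%i\n" % (identifier, len(record.sequence)))

The examples also show two other entry points: FastaIterator.iterate_together for reading two FASTA files in parallel (Example 7) and FastaIterator.count for counting records (Example 15).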

Example 1: maskSequences

    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]

        os.remove(infile)

        return result
Developer: BioXiao, Project: cgat, Lines: 33, Source: Masker.py

Example 2: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="supply bam file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # read in contigs
    E.info("reading in contig file")
    contigs = {}
    for fasta in FastaIterator.iterate(options.stdin):
        contigs[fasta.title] = (1, len(fasta.sequence) - 1)
    E.info("read %i contigs" % len(contigs.keys()))

    # read in bamfile
    E.info("reading bam file")
    samfile = pysam.Samfile(options.bamfile)

    E.info("iterating over contigs")
    c = 0
    for contig, coords in contigs.iteritems():
        coords = list(coords)

        #################################
        # NB this is specific for my data!
        contig = contig.split(" ")[0]
        #################################

        species_counts = collections.defaultdict(int)
        for alignment in samfile.fetch(contig, coords[0], coords[1]):
            species_id = alignment.qname.split("|")[1]
            species_counts[species_id] += 1

        # at the moment ignore if there are no counts
        if len(species_counts.values()) == 0:
            E.warn("no reads map to %s" % contig)
            continue

        for species, count in species_counts.iteritems():
            if species_counts[species] == max(species_counts.values()):
                top_dog = species
                c += 1
                break
        E.info("species %s assigned to contig number %i" % (top_dog, c))
        options.stdout.write("%s\t%s\n" % (contig, top_dog))

    # write footer and output benchmark information.
    E.Stop()
Developer: Charlie-George, Project: cgat, Lines: 60, Source: bam2species_map.py

Example 3: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-dir", dest="genome_dir", type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
Developer: CGATOxford, Project: Optic, Lines: 35, Source: species_map2species_map.py

Example 4: calculateSequenceComposition

def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
Developer: CGATOxford, Project: CGATPipelines, Lines: 31, Source: PipelineTransfacMatch.py

Example 5: filterByCoverage

def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print contigs
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Developer: BioXiao, Project: CGATPipelines, Lines: 26, Source: PipelineMetagenomeBenchmark.py

Example 6: buildInputFiles

def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)
    
    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
Developer: nickilott, Project: qpcr, Lines: 25, Source: pipeline_primerdesign.py

Example 7: iterate_double_fasta

def iterate_double_fasta ( fn1, fn2 ):
    iterator = FastaIterator.iterate_together( fn1, fn2 )
    for seq1, seq2 in iterator:
        yield AlignedPairs.UnalignedPair( 
            token1 = seq1.title,
            sequence1 = seq1.sequence,
            token2 = seq2.title,
            sequence2 = seq2.sequence )
Developer: BioinformaticsArchive, Project: cgat, Lines: 8, Source: align_pairs.py

Example 8: main

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id$", 
                             usage = globals()["__doc__"] )

    parser.add_option("-n", dest="N", type="int",
                      help="e.g N50 - the length at which 50% of contigs are equal or above")
    parser.add_option("-f", "--filter-length", dest="filter_length", type="int",
                      help="calculate stats on contigs longer than -f")

    parser.set_defaults(N = 50,
                        filter_length = 0)

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )
    
    f = options.filter_length

    # iterate over the contigs/scaffolds and return stats                                                                                                                                                                                              
    number_of_contigs = 0

    N = options.N
    contig_lengths = []

    for record in FastaIterator.iterate(options.stdin):
        contig_length = len(list(record.sequence))
        if contig_length >= f:
            number_of_contigs += 1
            contig_lengths.append(contig_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(contig_lengths)
    median_length = np.median(contig_lengths)
    max_length = max(contig_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX: the length at which the cumulative
    # length first reaches N% of the total assembly length
    index = 0
    cum_length = 0
    total_length = sum(contig_lengths)
    for length in sorted(contig_lengths, reverse = True):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break
        index += 1

    # output the results                                                                                                                                                                                                                     
    options.stdout.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    options.stdout.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (number_of_contigs, total_length, sorted(contig_lengths, reverse = True)[index], str(median_length), str(mean_length), str(max_length)))

    ## write footer and output benchmark information.
    E.Stop()
Developer: BioinformaticsArchive, Project: cgat, Lines: 58, Source: contigs2stats.py

Example 9: filterContigs

def filterContigs(infile, outfile, length):
    '''
    filter contigs by length
    '''
    outf = open(outfile, "w")
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        seq_length = len(fasta.sequence)
        if seq_length < length: continue
        outf.write(">%s\n%s\n" % (fasta.title, fasta.sequence))
    outf.close()
Developer: siping, Project: cgat, Lines: 10, Source: PipelineMetagenomeAssembly.py

Example 10: contig_to_stats

def contig_to_stats(contigs_file, stats_file, params):
    """
    calculate descriptive stats for a set
    of contigs / scaffolds
    """

    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0

    # iterate over the contigs/scaffolds and return stats
    number_of_scaffolds = 0

    N = PARAMS["scaffold_n"]
    scaffold_lengths = []

    inf = open(contigs_file)
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length >= f:
            number_of_scaffolds += 1
            scaffold_lengths.append(scaffold_length)

    # mean, median and max contig/scaffold lengths
    mean_length = np.mean(scaffold_lengths)
    median_length = np.median(scaffold_lengths)
    max_length = max(scaffold_lengths)

    # iterate over contigs/scaffolds sorted by longest
    # and calculate the NX: the length at which the cumulative
    # length first reaches N% of the total assembly length
    index = 0
    cum_length = 0
    total_length = sum(scaffold_lengths)
    for length in sorted(scaffold_lengths, reverse=True):
        cum_length += length
        if cum_length >= total_length * (float(N) / 100):
            break
        index += 1

    # output the results
    outf = open(stats_file, "w")
    outf.write("nscaffolds\tscaffold_length\tN%i\tmedian_length\tmean_length\tmax_length\n" % N)
    outf.write(
        "%s\t%s\t%s\t%s\t%s\t%s\n"
        % (
            number_of_scaffolds,
            total_length,
            sorted(scaffold_lengths, reverse=True)[index],
            str(median_length),
            str(mean_length),
            str(max_length),
        )
    )
Developer: nishantthakur, Project: cgat, Lines: 55, Source: PipelineMetagenomeAssembly.py

Example 11: build_scaffold_lengths

def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        outf.write("%s\t%i\n" % (record.title, scaffold_length))
    outf.close()
Developer: Charlie-George, Project: cgat, Lines: 11, Source: PipelineMetagenomeAssembly.py

Example 12: collectGenomeSizes

def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Developer: yangjl, Project: cgat, Lines: 13, Source: pipeline_metagenomebenchmark.py

Example 13: buildMisprimingLib

def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.openFile(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.openFile(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Developer: nickilott, Project: qpcr, Lines: 16, Source: pipeline_primerdesign.py

Example 14: build_scaffold_lengths

def build_scaffold_lengths(contigs_file, outfile, params):
    '''
    output the distribution of scaffold lengths
    '''
    PARAMS = params

    if PARAMS["filter"]:
        f = PARAMS["filter"]
    else:
        f = 0
    inf = open(contigs_file)
    outf = open(outfile, "w")
    outf.write("scaffold_name\tlength\n")
    for record in FastaIterator.iterate(inf):
        scaffold_length = len(list(record.sequence))
        if scaffold_length > f:
            # rename sequences if they have a space in them
            outf.write("%s\t%i\n" % (record.title.replace(" ", "_"), scaffold_length))
    outf.close()
Developer: siping, Project: cgat, Lines: 19, Source: PipelineMetagenomeAssembly.py

Example 15: runMEMEOnSequences

def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
        meme %(infile)s -dna -revcomp 
                        -mod %(meme_model)s 
                        -nmotifs %(meme_nmotifs)s 
                        -oc %(tmpdir)s 
                        -maxsize %(motifs_max_size)s 
                        %(meme_options)s 
       > %(outfile)s.log
    '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
Developer: lesheng, Project: cgat, Lines: 40, Source: PipelineMotifs.py


Note: The CGAT.FastaIterator class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Before distributing or using the code, please consult the License of the corresponding project; do not reproduce this article without permission.