

Python CGAT.Fastq Class Code Examples

This article collects typical usage examples of the CGAT.Fastq class in Python. If you are wondering what the Fastq class does, how to use it, or what real-world code that uses it looks like, the curated examples below should help.


Fifteen code examples of the Fastq class are shown below, sorted by popularity by default.
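
Before the individual examples, here is a minimal sketch of the pattern most of them share: open a (possibly gzipped) FASTQ file with IOTools.openFile and walk over it with Fastq.iterate, which yields records exposing identifier, seq and quals attributes. The file name is a placeholder.

import CGAT.Fastq as Fastq
import CGAT.IOTools as IOTools

# "reads.fastq.gz" is a placeholder file name; Fastq.iterate yields one
# record per read, with .identifier, .seq and .quals attributes.
infile = IOTools.openFile("reads.fastq.gz")
for record in Fastq.iterate(infile):
    print(record.identifier, len(record.seq))
infile.close()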

Example 1: main

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                             usage = globals()["__doc__"] )

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file"  )
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file"  )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in itertools.izip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError("unpaired reads detected. Are files sorted? are files of equal length?")
            except PairedReadError, e:
                raise PairedReadError(e), None, sys.exc_info()[2]
        else:
            assert f1.identifier.endswith("/1") and f2.identifier.endswith("/2"), "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
Author: BioinformaticsArchive, Project: cgat, Lines: 35, Source: fastqs2fasta.py

Example 2: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="guess quality score format and set quality scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern", dest="pattern", type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.openFile(options.pattern % "qual", "w")

    if options.change_format:
        iter = Fastq.iterate_convert(options.stdin,
                                     format=options.change_format,
                                     guess=options.guess_format)
    else:
        iter = Fastq.iterate(options.stdin)

    for record in iter:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Author: Charlie-George, Project: cgat, Lines: 57, Source: fastq2solid.py
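
As a follow-up to this example, a hedged sketch of the same Fastq.iterate_convert call used to rewrite a FASTQ file from one quality encoding to another; the file names and the choice of sanger as the target encoding are illustrative only.

import CGAT.Fastq as Fastq
import CGAT.IOTools as IOTools

# Illustrative file names; iterate_convert re-encodes the quality string of
# each record into the requested format, using `guess` when the input
# encoding is ambiguous.
infile = IOTools.openFile("reads.fastq.gz")
outfile = IOTools.openFile("reads.sanger.fastq.gz", "w")
for record in Fastq.iterate_convert(infile, format="sanger", guess="phred64"):
    outfile.write("@%s\n%s\n+\n%s\n" % (record.identifier, record.seq, record.quals))
outfile.close()
infile.close()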

Example 3: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-fastq-file", dest="fastq1", type="string",
        help="supply read1 fastq file")
    parser.add_option(
        "-b", "--second-fastq-file", dest="fastq2", type="string",
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if args and len(args) == 2:
        options.fastq1, options.fastq2 = args

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")
            except PairedReadError as e:
                raise PairedReadError(e).with_traceback(sys.exc_info()[2])
        else:
            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.Stop()
Author: CGATOxford, Project: cgat, Lines: 54, Source: fastqs2fasta.py
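
The pairing check in Examples 1 and 3 relies on zip_longest padding the shorter iterator with None; a small self-contained toy (made-up read names) shows why an unequal number of reads is caught:

from itertools import zip_longest

# made-up read lists of unequal length; the shorter one is padded with None,
# so "not (f1 and f2)" becomes true on the fourth pair
reads1 = ["r1/1", "r2/1", "r3/1", "r4/1"]
reads2 = ["r1/2", "r2/2", "r3/2"]
for f1, f2 in zip_longest(reads1, reads2):
    if not (f1 and f2):
        print("unpaired read detected:", f1, f2)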

Example 4: build

    def build(self, infiles, outfiles, output_prefix):

        prefix = self.prefix
        offset = Fastq.getOffset("sanger", raises=False)
        outdir = os.path.join(output_prefix + ".dir")
        track = os.path.basename(output_prefix)

        processing_options = self.processing_options

        infile1, infile2 = infiles
        outfile = outfiles[0]

        cmd = '''flash %(infile1)s %(infile2)s
        -p %(offset)s
        %(processing_options)s
        -o %(track)s
        -d %(outdir)s
        >& %(output_prefix)s-flash.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s;
        ''' % locals()

        return cmd
Author: sudlab, Project: CGATPipelines, Lines: 25, Source: PipelinePreprocess.py
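
Example 4 feeds the value of Fastq.getOffset("sanger", raises=False) into flash's -p option, i.e. the ASCII offset of the quality encoding. A quick check, assuming getOffset returns the conventional Phred offsets:

import CGAT.Fastq as Fastq

# Assumed return values: the conventional ASCII offsets of each encoding
# (33 for sanger-style scores, 64 for phred64).
print(Fastq.getOffset("sanger", raises=False))   # expected: 33
print(Fastq.getOffset("phred64", raises=False))  # expected: 64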

Example 5: buildTrueTaxonomicRelativeAbundances

def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species-level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to the species-level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[
            0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.iteritems():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
Author: BioXiao, Project: CGATPipelines, Lines: 30, Source: PipelineMetagenomeBenchmark.py
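
Both this example and Example 8 recover the GI number from the read name with identifier.split("|")[1], which assumes the simulated reads carry NCBI-style "gi|<number>|..." identifiers. A toy illustration with a made-up identifier:

# made-up identifier in the NCBI "gi|<number>|..." style assumed by the
# simulated reads in Examples 5 and 8
identifier = "gi|12345|ref|NC_000001.1| simulated read 7"
gi = identifier.split("|")[1]
print(gi)  # -> 12345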

Example 6: peek

def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters:

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.
    """
    
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If the file contains paired-end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))
        else:
            # no "_3" file present: keep the conventional first/second reads
            f = glob.glob(os.path.join(workdir, "*_[12].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format
Author: mmaarriiee, Project: cgat, Lines: 55, Source: Sra.py
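
A hedged usage sketch for the function above (it assumes the peek definition and its imports are in scope, and that fastq-dump from the SRA Toolkit is on the PATH; the accession file name is a placeholder):

# "SRR000001.sra" is a placeholder accession; with outdir=None the
# extraction runs in a temporary directory that is deleted afterwards.
files, fastq_format = peek("SRR000001.sra", outdir=None)
print(files)         # the fastq.gz files that a full extraction would produce
print(fastq_format)  # quality encoding guessed by Fastq.guessFormat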

Example 7: replace

def replace(fastqfile, baseToReplace):
    '''replaces the specified base with N'''

    # use gzip as default to open the fastq file
    outf = gzip.open("replaced_" + fastqfile, "w")
    fastq = gzip.open(fastqfile)
    iterator = Fastq.iterate(fastq)
    for record in iterator:
        x = list(record.seq)
        x[int(baseToReplace)] = "N"
        record.seq = "".join(x)
        outf.write("@" + record.identifier + "\n" + record.seq + "\n" + "+" + record.identifier + "\n" + record.quals + "\n")
Author: BioinformaticsArchive, Project: cgat, Lines: 12, Source: fastq2N.py
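
A short usage sketch for the function above (placeholder file name; assumes the definition and its gzip/Fastq imports are in scope):

# "reads.fastq.gz" is a placeholder; the index is 0-based, so this turns the
# sixth base of every read into an N and writes replaced_reads.fastq.gz.
replace("reads.fastq.gz", 5)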

Example 8: buildExpectedCoverageOverGenomes

def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical
    coverage we would get over the genomes in the
    sample, i.e. at 1X coverage
    '''

    # if paired end then will have to multiply
    # by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # get the expected genome size
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.iteritems():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
Author: BioXiao, Project: CGATPipelines, Lines: 48, Source: PipelineMetagenomeBenchmark.py
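
The theoretical-coverage formula quoted in the comments of this example, coverage = (read length L * number of reads N) / genome size G, can be checked with a small worked example (numbers are illustrative):

# Illustrative numbers only: theoretical coverage = (L * N) / G
read_length = 100            # L, read length in bp
n_reads = 1000000            # N, number of reads
genome_size = 4600000        # G, genome size in bp
coverage = float(read_length * n_reads) / genome_size
print("%.1fX" % coverage)    # -> 21.7X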

Example 9: build

    def build(self, infile, outfile, processer_list):
        '''run mapper.'''
        f_format = Fastq.guessFormat(
            IOTools.openFile(infile[0], "r"), raises=False)

        cmd_process, cmd_post, processed_files = self.process(
            infile[0], processer_list, outfile, f_format, save=self.save)
        cmd_clean = self.cleanup(outfile)

        assert cmd_process.strip().endswith(";")
        assert cmd_post.strip().endswith(";")
        assert cmd_clean.strip().endswith(";")

        statement = " checkpoint; ".join((cmd_process,
                                          cmd_post,
                                          cmd_clean))
        return statement
Author: BioXiao, Project: CGATPipelines, Lines: 17, Source: PipelinePreprocess.py

Example 10: preprocessIdba

def preprocessIdba(infile, outfile):
    '''
    preprocess pooled reads for IDBA
    '''
    # check for second read in the pair
    if infile.endswith(".fastq.gz"):
        E.info("converting fastq file to fasta file")
        outf = open(outfile, "w")
        for fastq in Fastq.iterate(IOTools.openFile(infile)):
            outf.write("%s\n%s\n" % (">" + fastq.identifier, fastq.seq))
        outf.close()
    elif infile.endswith(".1.gz"):
        read2 = P.snip(infile, ".1.gz") + ".2.gz"
        assert os.path.exists(read2), "file does not exist %s" % read2

        statement = '''python %(scriptsdir)s/fastqs2fasta.py
                   -a %(infile)s
                   -b %(read2)s
                   --log=%(infile)s.log
                   > %(outfile)s'''
        P.run()
Author: Charlie-George, Project: cgat, Lines: 21, Source: pipeline_metagenomeassembly.py

Example 11: peek

def peek(sra, outdir):
    ''' returns the full file names for all files which will be extracted'''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If the file contains paired-end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(outdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))
        else:
            # no "_3" file present: keep the conventional first/second reads
            f = glob.glob(os.path.join(outdir, "*_[12].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    return f, fastq_format
Author: BioXiao, Project: cgat, Lines: 38, Source: Sra.py

Example 12: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-barcode", dest="split", action="store_true",
                      help="barcode is split across read pair")
    parser.add_option("-p", "--bc-pattern", dest="pattern", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--bc-pattern2", dest="pattern2", type="string",
                      help="Barcode pattern. Ns are random bases X's fixed")
    parser.add_option("--read2-in", dest="read2_in", type="string",
                      help="file name for read pairs")
    parser.add_option("--3prime", dest="prime3", action="store_true",
                      help="barcode is on 3' end of read")
    parser.add_option("--read2-out", dest="read2_out", type="string",
                      help="file to output processed paired read to")
    parser.add_option("--supress-stats", dest="stats", action="store_false",
                      help="Suppress the writing of stats to the log")

    parser.set_defaults(split=False,
                        pattern=None,
                        pattern2=None,
                        read2_in=None,
                        read2_out=None,
                        prime3=False,
                        stats=True)

    # add common options (-h/--help, ...) and parse command line

    (options, args) = E.Start(parser, argv=argv)

    # check options
    if not options.pattern:
        raise ValueError("must specify a pattern using ``--bc-pattern``")

    if options.split:
        if not options.read2_in:
            raise ValueError("must specify a paired fastq ``--read2-in``")

        if not options.pattern2:
            options.pattern2 = options.pattern

    if options.read2_in:
        if not options.read2_out:
            raise ValueError("must specify an output for the paired end "
                             "``--read2-out``")


    # Initialise the processor
    processor = Extractor(options.pattern, options.pattern2, options.prime3)
    read1s = Fastq.iterate(options.stdin)

    if options.read2_in is None:

        for read in read1s:
            options.stdout.write(str(processor(read)) + "\n")

    else:

        read2s = Fastq.iterate(IOTools.openFile(options.read2_in))
        read2_out = IOTools.openFile(options.read2_out, "w")

        for read1, read2 in zip(read1s, read2s):
            new_1, new_2 = processor(read1, read2)
            options.stdout.write(str(new_1) + "\n")
            read2_out.write(str(new_2) + "\n")

    # write footer and output benchmark information.

    if options.stats:
     
        options.stdlog.write("\t".join(["Barcode", "UMI", "Sample", "Count"]) + "\n")
        for id in processor.bc_count:
            options.stdlog.write("\t".join(id+(str(processor.bc_count[id]),)) + "\n")
                             
    E.Stop()
Author: jdblischak, Project: UMI-tools, Lines: 85, Source: extract_umi.py
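
The help text above says that in a barcode pattern "Ns are random bases X's fixed". The toy snippet below illustrates that convention only; it is not the Extractor implementation used by the script, and the read sequence is made up.

# Toy illustration only (not the UMI-tools Extractor): in a pattern such as
# NNNXXXX, N positions are the random UMI bases and X positions the fixed
# sample barcode, both taken from the start of the read.
pattern = "NNNXXXX"
read = "ACGTTGCAGGTTACA"            # made-up read sequence
prefix = read[:len(pattern)]
umi = "".join(b for b, p in zip(prefix, pattern) if p == "N")
barcode = "".join(b for b, p in zip(prefix, pattern) if p == "X")
print(umi, barcode)                 # -> ACG TTGC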

Example 13: processReads


#......... part of the code omitted here .........
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([track + x for x in  [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True
Author: yangjl, Project: cgat, Lines: 66, Source: pipeline_preprocess.py

Example 14: setfastqAttr

    def setfastqAttr(self, infiles):
        self.offset = Fastq.getOffset(self.f_format, raises=False)
Author: BioXiao, Project: CGATPipelines, Lines: 2, Source: PipelinePreprocess.py
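
This one-liner assumes self.f_format was set elsewhere. For context, a hedged sketch of how the two helpers used throughout these examples fit together, mirroring what Example 13 does (the file name is illustrative):

import CGAT.Fastq as Fastq
import CGAT.IOTools as IOTools

# "reads.fastq.gz" is illustrative: guess the quality encoding first, then
# look up the corresponding ASCII offset for downstream tools.
infile = IOTools.openFile("reads.fastq.gz", "r")
f_format = Fastq.guessFormat(infile, raises=False)
infile.close()
offset = Fastq.getOffset(f_format, raises=False)
print(f_format, offset)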

Example 15: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                # use a separate name so the E.Counter bound to ``c`` above
                # is not clobbered inside this loop
                key = s2[x:x + tuple_size]
                for y in d[key]:
                    offsets[x - y] += 1

            # find maximum diagonal (avoid shadowing the builtin ``sorted``)
            sorted_offsets = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = sorted_offsets[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(str(new_entry) + "\n")
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Author: SCV, Project: cgat, Lines: 81, Source: fastqs2fastq.py
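
The join method above finds the overlap between a read pair by voting on "diagonals": every k-mer shared between read 1 and the reverse-complemented read 2 adds one vote for the offset between the two positions, and the best-supported offset is taken as the overlap. The self-contained toy below illustrates just that voting step (k=2, made-up sequences, reverse complement skipped for simplicity):

import collections

def best_offset(s1, s2, k=2):
    # index every k-mer of s1 by its positions
    positions = collections.defaultdict(list)
    for x in range(len(s1) - k + 1):
        positions[s1[x:x + k]].append(x)
    # every shared k-mer votes for the offset (diagonal) between its
    # position in s2 and its position in s1
    votes = collections.defaultdict(int)
    for x in range(len(s2) - k + 1):
        for y in positions[s2[x:x + k]]:
            votes[x - y] += 1
    # the offset with the most votes marks the overlap
    return max(votes.items(), key=lambda kv: kv[1])[0] if votes else None

# made-up sequences sharing a 4 bp overlap ("ACGT" ends s1 and starts s2);
# the winning offset of -4 says shared k-mers sit 4 positions earlier in s2.
print(best_offset("TTGGACGT", "ACGTCCAA"))  # -> -4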


Note: the CGAT.Fastq class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective authors, and copyright remains with the original authors; please consult each project's License before distributing or using the code. Do not reproduce this article without permission.