当前位置: 首页>>代码示例>>Python>>正文


Python Fastq.guessFormat方法代码示例

本文整理汇总了Python中CGAT.Fastq.guessFormat方法的典型用法代码示例。如果您正苦于以下问题:Python Fastq.guessFormat方法的具体用法?Python Fastq.guessFormat怎么用?Python Fastq.guessFormat使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在CGAT.Fastq的用法示例。


在下文中一共展示了Fastq.guessFormat方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: peek

# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters:

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.
    """
    
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file cotains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))
        else:
            f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format
开发者ID:mmaarriiee,项目名称:cgat,代码行数:57,代码来源:Sra.py

示例2: build

# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
    def build(self, infile, outfile, processer_list):
        '''run mapper.'''
        f_format = Fastq.guessFormat(
            IOTools.openFile(infile[0], "r"), raises=False)

        cmd_process, cmd_post, processed_files = self.process(
            infile[0], processer_list, outfile, f_format, save=self.save)
        cmd_clean = self.cleanup(outfile)

        assert cmd_process.strip().endswith(";")
        assert cmd_post.strip().endswith(";")
        assert cmd_clean.strip().endswith(";")

        statement = " checkpoint; ".join((cmd_process,
                                          cmd_post,
                                          cmd_clean))
        return statement
开发者ID:BioXiao,项目名称:CGATPipelines,代码行数:19,代码来源:PipelinePreprocess.py

示例3: peek

# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def peek(sra, outdir):
    ''' returns the full file names for all files which will be extracted'''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file cotains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(outdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        if ff[2].endswith("_3.fastq.gz"):
            f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))
        else:
            f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    return f, fastq_format
开发者ID:BioXiao,项目名称:cgat,代码行数:40,代码来源:Sra.py

示例4: preprocess

# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
    def preprocess( self, infiles, outfile ):
        '''build preprocessing statement

        Build a command line statement that extracts/converts 
        various input formats to fastq formatted files.

        Mapping qualities are changed to solexa format.

        returns the statement and the fastq files to map.
        '''

        assert len(infiles) > 0, "no input files for mapping"

        tmpdir_fastq = P.getTempDir()

        # create temporary directory again for nodes
        statement = [ "mkdir -p %s" % tmpdir_fastq ]
        fastqfiles = []

        # get track by extension of outfile
        track = os.path.splitext( os.path.basename( outfile ) )[0]

        if self.compress:
            compress_cmd = "| gzip"
            extension = ".gz"
        else:
            compress_cmd = ""
            extension = ""

        for infile in infiles:

            if infile.endswith( ".export.txt.gz"):
                # single end illumina export
                statement.append( """gunzip < %(infile)s 
                     | awk '$11 != "QC" || $10 ~ /(\d+):(\d+):(\d+)/ \
                        { if ($1 != "") 
                             { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);}
                        else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); }
                       printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}'
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension ),) )
            elif infile.endswith( ".fa.gz" ):
                statement.append( '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals() )
                fastqfiles.append( ("%s/%s.fa" % (tmpdir_fastq, track ),) )
                self.datatype = "fasta"
                
            elif infile.endswith( ".sra"):
                # sneak preview to determine if paired end or single end
                outdir = P.getTempDir()
                # --split-files is present in fastq-dump 2.1.7
                P.execute( "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals() )
                # --split-files will create files called prefix_#.fastq.gz
                # where # is the read number. 
                # The following cases are:

                # * file cotains paired end data: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                #    * special case: unpaired reads in a paired end run end up in prefix.fastq.gz
                #    * special case: if paired reads are stored in a single read, fastq-dump will split.
                #       There might be a joining sequence. The output would thus be:
                #       prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
                #      You want files 1 and 3.
                f = sorted(glob.glob( os.path.join( outdir, "*.fastq.gz" ) ))
                ff = [ os.path.basename(x) for x in f ]
                if len(f) == 1: 
                    # sra file contains one read: output = prefix.fastq.gz
                    pass
                elif len(f) == 2:
                    # sra file contains read pairs: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                    assert ff[0].endswith( "_1.fastq.gz") and ff[1].endswith( "_2.fastq.gz" )
                elif len(f) == 3:
                    if ff[2].endswith( "_3.fastq.gz"):
                        f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                    else:
                        f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                E.info("sra file contains the following files: %s" % f )
                shutil.rmtree( outdir )
                fastqfiles.append( [ "%s/%s" % (tmpdir_fastq, os.path.basename( x )) for x in sorted(f) ] )
                statement.append( "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals() )
                
            elif infile.endswith( ".fastq.gz" ):
                format = Fastq.guessFormat( IOTools.openFile( infile, "r"), raises = False)
                if 'sanger' not in format and self.convert:
                    statement.append(  """gunzip < %(infile)s 
                                      | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, ) )

            elif infile.endswith( ".csfasta.gz" ):
                # single end SOLiD data
                if self.preserve_colourspace:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                    if not os.path.exists( quality ):
                        raise ValueError( "no quality file for %s" % infile )
                    statement.append(  """gunzip < %(infile)s 
                                          > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals() )
#.........这里部分代码省略.........
开发者ID:BioinformaticsArchive,项目名称:cgat,代码行数:103,代码来源:PipelineMapping.py

示例5: processReads

# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]

#.........这里部分代码省略.........
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([track + x for x in  [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True
开发者ID:yangjl,项目名称:cgat,代码行数:69,代码来源:pipeline_preprocess.py

示例6: processReads

# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def processReads( infiles, outfile ):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        E.info("starting reconciliation" )
#.........这里部分代码省略.........
开发者ID:siping,项目名称:cgat,代码行数:103,代码来源:pipeline_readqc.py


注:本文中的CGAT.Fastq.guessFormat方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。