本文整理汇总了Python中CGAT.Fastq.guessFormat方法的典型用法代码示例。如果您正苦于以下问题：Python Fastq.guessFormat方法的具体用法？Python Fastq.guessFormat怎么用？Python Fastq.guessFormat使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类CGAT.Fastq的用法示例。
在下文中一共展示了Fastq.guessFormat方法的6个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: peek
# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Only the first 1000 spots of `sra` are dumped (``fastq-dump -X 1000``),
    which is enough to see which fastq files a full extraction would
    create and to guess their quality score format.

    Parameters:

    sra : path
        SRA archive handed to ``fastq-dump``.
    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.

    Returns:

    A tuple ``(files, fastq_format)``. `files` is the sorted list of
    selected fastq file names; `fastq_format` is the value returned by
    ``Fastq.guessFormat(..., raises=False)`` for the first of them.
    If outdir is None the temporary directory has already been removed
    when this function returns, so the paths in `files` no longer exist
    on disk.

    Raises:

    ValueError if no fastq file could be selected from the output.
    """
    workdir = tempfile.mkdtemp() if outdir is None else outdir

    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If file contains paired end data:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
        #    *special case: if paired reads are stored in a single read,
        #                   fastq-dump will split. There might be a joining
        #                   sequence. The output would thus be:
        #                   prefix_1.fastq.gz, prefix_2.fastq.gz,
        #                   prefix_3.fastq.gz
        #                   You want files 1 and 3.
        E.run("""fastq-dump --split-files --gzip -X 1000
--outdir %(workdir)s %(sra)s""" % {"workdir": workdir, "sra": sra})

        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith("_1.fastq.gz") and \
                ff[1].endswith("_2.fastq.gz"), \
                "unexpected fastq-dump output: %s" % ff
        elif len(f) == 3:
            # Keep reads 1 and 3 only. Filtering the already sorted list
            # selects the same files as globbing "*_[13].fastq.gz" but
            # keeps the order deterministic, so f[0] is always read 1.
            # NOTE(review): the original code had two identical branches
            # here. When the third file is the unpaired prefix.fastq.gz
            # rather than prefix_3.fastq.gz, this keeps prefix_1 only and
            # drops prefix_2 - confirm that this is intended.
            f = [x for x, name in zip(f, ff)
                 if name.endswith(("_1.fastq.gz", "_3.fastq.gz"))]
        # a single file means the sra file contains one read:
        # output = prefix.fastq.gz - nothing to select.

        if not f:
            raise ValueError(
                "fastq-dump created no usable fastq files for %s in %s" %
                (sra, workdir))

        # check format of fastqs in .sra
        handle = IOTools.openFile(f[0], "r")
        try:
            fastq_format = Fastq.guessFormat(handle, raises=False)
        finally:
            handle.close()
    finally:
        # remove the temporary directory even if extraction or format
        # detection failed
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format
示例2: build
# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def build(self, infile, outfile, processer_list):
    '''run mapper.

    The quality score format of the first input file is guessed with
    ``Fastq.guessFormat`` and passed on to ``self.process``. The
    processing, post-processing and clean-up commands are then joined
    into one statement, separated by ``checkpoint;``.

    infile : sequence of paths; only ``infile[0]`` is read here.
    outfile : path handed to ``self.process`` and ``self.cleanup``.
    processer_list : passed through unchanged to ``self.process``.

    returns the combined statement as a string.
    '''
    # close the handle explicitly instead of leaving it to the
    # garbage collector
    handle = IOTools.openFile(infile[0], "r")
    try:
        f_format = Fastq.guessFormat(handle, raises=False)
    finally:
        handle.close()

    # the list of processed files returned by self.process is not
    # needed to build the statement
    cmd_process, cmd_post, _processed_files = self.process(
        infile[0], processer_list, outfile, f_format, save=self.save)
    cmd_clean = self.cleanup(outfile)

    # every part has to be terminated so that the parts can be chained
    assert cmd_process.strip().endswith(";")
    assert cmd_post.strip().endswith(";")
    assert cmd_clean.strip().endswith(";")

    return " checkpoint; ".join((cmd_process, cmd_post, cmd_clean))
示例3: peek
# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def peek(sra, outdir):
    ''' returns the full file names for all files which will be extracted

    Only the first 1000 spots of `sra` are dumped into `outdir`
    (``fastq-dump -X 1000``). The extracted files are left in `outdir`.

    sra : path of the SRA archive handed to ``fastq-dump``.
    outdir : directory in which the extraction takes place.

    returns a tuple ``(files, fastq_format)``: the sorted list of selected
    fastq file names and the value returned by
    ``Fastq.guessFormat(..., raises=False)`` for the first of them.

    raises ValueError if no fastq file could be selected from the output.
    '''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.
    E.run("""fastq-dump --split-files --gzip -X 1000
--outdir %(outdir)s %(sra)s""" % {"outdir": outdir, "sra": sra})

    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith("_1.fastq.gz") and \
            ff[1].endswith("_2.fastq.gz"), \
            "unexpected fastq-dump output: %s" % ff
    elif len(f) == 3:
        # Keep reads 1 and 3 only. Filtering the already sorted list
        # selects the same files as globbing "*_[13].fastq.gz" but keeps
        # the order deterministic, so f[0] is always read 1.
        # NOTE(review): the original code had two identical branches here.
        # When the third file is the unpaired prefix.fastq.gz rather than
        # prefix_3.fastq.gz, this keeps prefix_1 only and drops prefix_2 -
        # confirm that this is intended.
        f = [x for x, name in zip(f, ff)
             if name.endswith(("_1.fastq.gz", "_3.fastq.gz"))]
    # a single file means the sra file contains one read:
    # output = prefix.fastq.gz - nothing to select.

    if not f:
        raise ValueError(
            "fastq-dump created no usable fastq files for %s in %s" %
            (sra, outdir))

    # check format of fastqs in .sra
    handle = IOTools.openFile(f[0], "r")
    try:
        fastq_format = Fastq.guessFormat(handle, raises=False)
    finally:
        handle.close()

    return f, fastq_format
示例4: preprocess
# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def preprocess( self, infiles, outfile ):
'''build preprocessing statement
Build a command line statement that extracts/converts
various input formats to fastq formatted files.
Mapping qualities are changed to solexa format.
returns the statement and the fastq files to map.
'''
assert len(infiles) > 0, "no input files for mapping"
tmpdir_fastq = P.getTempDir()
# create temporary directory again for nodes
statement = [ "mkdir -p %s" % tmpdir_fastq ]
fastqfiles = []
# get track by extension of outfile
track = os.path.splitext( os.path.basename( outfile ) )[0]
if self.compress:
compress_cmd = "| gzip"
extension = ".gz"
else:
compress_cmd = ""
extension = ""
for infile in infiles:
if infile.endswith( ".export.txt.gz"):
# single end illumina export
statement.append( """gunzip < %(infile)s
| awk '$11 != "QC" || $10 ~ /(\d+):(\d+):(\d+)/ \
{ if ($1 != "")
{ readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);}
else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); }
printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}'
%(compress_cmd)s
> %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension ),) )
elif infile.endswith( ".fa.gz" ):
statement.append( '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals() )
fastqfiles.append( ("%s/%s.fa" % (tmpdir_fastq, track ),) )
self.datatype = "fasta"
elif infile.endswith( ".sra"):
# sneak preview to determine if paired end or single end
outdir = P.getTempDir()
# --split-files is present in fastq-dump 2.1.7
P.execute( "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals() )
# --split-files will create files called prefix_#.fastq.gz
# where # is the read number.
# The following cases are:
# * file contains paired end data: output = prefix_1.fastq.gz, prefix_2.fastq.gz
# * special case: unpaired reads in a paired end run end up in prefix.fastq.gz
# * special case: if paired reads are stored in a single read, fastq-dump will split.
# There might be a joining sequence. The output would thus be:
# prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
# You want files 1 and 3.
f = sorted(glob.glob( os.path.join( outdir, "*.fastq.gz" ) ))
ff = [ os.path.basename(x) for x in f ]
if len(f) == 1:
# sra file contains one read: output = prefix.fastq.gz
pass
elif len(f) == 2:
# sra file contains read pairs: output = prefix_1.fastq.gz, prefix_2.fastq.gz
assert ff[0].endswith( "_1.fastq.gz") and ff[1].endswith( "_2.fastq.gz" )
elif len(f) == 3:
if ff[2].endswith( "_3.fastq.gz"):
f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
else:
f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
E.info("sra file contains the following files: %s" % f )
shutil.rmtree( outdir )
fastqfiles.append( [ "%s/%s" % (tmpdir_fastq, os.path.basename( x )) for x in sorted(f) ] )
statement.append( "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals() )
elif infile.endswith( ".fastq.gz" ):
format = Fastq.guessFormat( IOTools.openFile( infile, "r"), raises = False)
if 'sanger' not in format and self.convert:
statement.append( """gunzip < %(infile)s
| python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
%(compress_cmd)s
> %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
else:
E.debug( "%s: assuming quality score format %s" % (infile, format ) )
fastqfiles.append( (infile, ) )
elif infile.endswith( ".csfasta.gz" ):
# single end SOLiD data
if self.preserve_colourspace:
quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
if not os.path.exists( quality ):
raise ValueError( "no quality file for %s" % infile )
statement.append( """gunzip < %(infile)s
> %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals() )
#.........这里部分代码省略.........
示例5: processReads
# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
#.........这里部分代码省略.........
%(fragment_options)s
--output-prefix=%(track)s
%(threads)s
--compress
%(infile)s %(infile2)s >> %(outfile)s.log
'''
P.run()
if PARAMS["combine_reads_concatenate"]:
infiles = " ".join([track + x for x in [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
else:
statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
P.run()
return
if PARAMS["process_sample"] and infile2:
E.warn( "sampling can not be combined with other processing for paired ended reads")
statement = '''zcat %(infile)s
| python %(scriptsdir)s/fastq2fastq.py
--sample=%(sample_proportion)f
--pair=%(infile2)s
--outfile-pair=%(outfile2)s
--log=%(outfile)s_sample.log
| gzip
> %(outfile)s
'''
P.run()
return
# fastx does not like quality scores below 64 (Illumina 1.3 format)
# need to detect the scores and convert
format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
E.info( "%s: format guess: %s" % (infile, format))
offset = Fastq.getOffset( format, raises = False )
if PARAMS["process_remove_contaminants"]:
adaptors = listAdaptors(contaminant_file)
# %(contamination_trim_type)s
s = [ '''
cutadapt
%(adaptors)s
--overlap=%(contamination_min_overlap_length)i
--format=fastq
%(contamination_options)s
<( zcat < %(infile)s )
2>> %(outfile)s_contaminants.log
''' ]
do_sth = True
else:
s = ['zcat %(infile)s' ]
if PARAMS["process_artifacts"]:
s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
do_sth = True
if PARAMS["process_trim"]:
s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
do_sth = True
# NICK - may replace fastx trimmer
if PARAMS["process_trim_quality"]:
s.append( 'fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
do_sth = True
示例6: processReads
# 需要导入模块: from CGAT import Fastq [as 别名]
# 或者: from CGAT.Fastq import guessFormat [as 别名]
def processReads( infiles, outfile ):
'''process reads.'''
infile, contaminant_file = infiles
do_sth = False
to_cluster = True
infile2 = checkPairs( infile )
if infile2:
track = P.snip( outfile, ".fastq.1.gz" )
outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
else:
track = P.snip( outfile, ".fastq.gz" )
if PARAMS["process_sample"] and infile2:
E.warn( "sampling can not be combined with other processing for paired ended reads")
statement = '''zcat %(infile)s
| python %(scriptsdir)s/fastq2fastq.py
--sample=%(sample_proportion)f
--pair=%(infile2)s
--outfile-pair=%(outfile2)s
--log=%(outfile)s_sample.log
| gzip
> %(outfile)s
'''
P.run()
return
# fastx does not like quality scores below 64 (Illumina 1.3 format)
# need to detect the scores and convert
format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
E.info( "%s: format guess: %s" % (infile, format))
offset = Fastq.getOffset( format, raises = False )
if PARAMS["process_remove_contaminants"]:
adaptors = listAdaptors(contaminant_file)
# %(contamination_trim_type)s
s = [ '''
cutadapt
%(adaptors)s
--overlap=%(contamination_min_overlap_length)i
--format=fastq
%(contamination_options)s
<( zcat < %(infile)s )
2>> %(outfile)s_contaminants.log
''' ]
do_sth = True
else:
s = ['zcat %(infile)s' ]
if PARAMS["process_artifacts"]:
s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
do_sth = True
if PARAMS["process_trim"]:
s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
do_sth = True
# NICK - may replace fastx trimmer
if PARAMS["process_trim_quality"]:
s.append( 'fastq_quality_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
do_sth = True
if PARAMS["process_filter"]:
s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
do_sth = True
if PARAMS["process_sample"]:
s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )
if not do_sth:
E.warn( "no filtering specified for %s - nothing done" % infile )
return
s.append( "gzip" )
if not infile2:
statement = " | ".join( s ) + " > %(outfile)s"
P.run()
else:
tmpfile = P.getTempFilename(".")
tmpfile1 = tmpfile + ".fastq.1.gz"
tmpfile2 = tmpfile + ".fastq.2.gz"
E.warn( "processing first of pair")
# first read pair
statement = " | ".join( s ) + " > %(tmpfile1)s"
P.run()
# second read pair
E.warn( "processing second of pair")
infile = infile2
statement = " | ".join( s ) + " > %(tmpfile2)s"
P.run()
# reconcile
E.info("starting reconciliation" )
#.........这里部分代码省略.........