This article collects typical usage examples of Python's CGAT.BamTools. If you have been wondering what BamTools does, how to call it, or where to find usage examples, the curated code samples below may help.
In total, 15 code examples of BamTools are shown, sorted by popularity by default.
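Before the examples, here is a minimal usage sketch of the two BamTools helpers that recur throughout this page, isPaired() and getNumReads(). The import path assumes BamTools lives in the CGAT package as in the examples; the BAM file name is hypothetical.

from CGAT import BamTools

bamfile = "example.bam"  # hypothetical input file

# getNumReads() and isPaired() are the two helpers used throughout the examples
if BamTools.getNumReads(bamfile) == 0:
    print("no reads in %s" % bamfile)
elif BamTools.isPaired(bamfile):
    print("%s contains paired-end reads" % bamfile)
else:
    print("%s contains single-end reads" % bamfile)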
Example 1: buildTranscriptLevelReadCounts
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''count reads falling into transcripts of protein coding gene models.

    .. note::
       In paired-end data sets each mate will be counted. Thus
       the actual read counts are approximately twice the fragment
       counts.
    '''
    bamfile, geneset = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
    --reporter=transcripts
    --bam-file=%(bamfile)s
    --counter=length
    --prefix="exons_"
    --counter=%(counter)s
    --prefix=""
    --counter=read-coverage
    --prefix=coverage_
    --min-mapping-quality=%(counting_min_mapping_quality)i
    --multi-mapping=ignore
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
Example 2: isPaired
def isPaired(filename):
    '''return "T" if bamfile contains paired end reads.'''
    if BamTools.isPaired(filename):
        return "T"
    else:
        return "F"
Example 3: buildPicardGCStats
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics

    Collect GC bias metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''
    P.run()
Example 4: buildPicardAlignmentStats
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectMultipleMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
Example 5: buildPicardDuplicateStats
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the dedupped .bam
    file.

    Pair duplication is properly handled, including inter-chromosomal
    cases. SE data is also handled. These stats also contain a
    histogram that estimates the return from additional sequencing.
    Note that Picard counts reads, but they are in fact alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(infile)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s.duplicate_metrics
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT;
    '''
    statement += '''samtools index %(outfile)s ;'''
    P.run()
Example 6: buildPicardInsertSizeStats
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run Picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
Example 7: buildPicardRnaSeqMetrics
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Arguments
    ---------
    infiles : list
        Input filename in :term:`BAM` format and
        genome annotation in refFlat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()
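A call to this function might look as follows (file names are hypothetical; the STRAND value follows Picard's vocabulary, e.g. NONE, FIRST_READ_TRANSCRIPTION_STRAND or SECOND_READ_TRANSCRIPTION_STRAND):

# hypothetical file names; the first element is the BAM file,
# the second the refFlat annotation
buildPicardRnaSeqMetrics(("sample.bam", "annotation.refflat"),
                         "NONE",
                         "sample.rnaseqmetrics")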
Example 8: buildGeneLevelReadCounts
def buildGeneLevelReadCounts(infiles, outfile):
    '''compute read counts and coverage of exons with reads.
    '''
    bamfile, exons = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
    --reporter=genes
    --bam-file=%(bamfile)s
    --counter=length
    --prefix="exons_"
    --counter=%(counter)s
    --prefix=""
    --counter=read-coverage
    --prefix=coverage_
    --min-mapping-quality=%(counting_min_mapping_quality)i
    --multi-mapping=ignore
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
Example 9: bamToBed
def bamToBed(infile, outfile, min_insert_size=0, max_insert_size=1000):
    """convert bam to bed with bedtools."""

    scriptsdir = "/ifs/devel/andreas/cgat/scripts"

    if BamTools.isPaired(infile):
        # output strand as well
        statement = (
            "cat %(infile)s "
            "| python %(scriptsdir)s/bam2bed.py "
            "--merge-pairs "
            "--min-insert-size=%(min_insert_size)i "
            "--max-insert-size=%(max_insert_size)i "
            "--log=%(outfile)s.log "
            "--bed-format=6 "
            "> %(outfile)s" % locals())
    else:
        statement = "bamToBed -i %(infile)s > %(outfile)s" % locals()

    E.debug("executing statement '%s'" % statement)

    retcode = subprocess.call(statement, cwd=os.getcwd(), shell=True)
    if retcode < 0:
        raise OSError("Child was terminated by signal %i: \n%s\n" %
                      (-retcode, statement))

    return outfile
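A typical call (with hypothetical file names), restricting merged paired-end fragments to at most 1000 bp:

bamToBed("sample.bam", "sample.bed",
         min_insert_size=0, max_insert_size=1000)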
Example 10: buildPicardCoverageStats
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run()
Example 11: buildPicardDuplicationStats
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard; the marked records
    are discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    os.environ["CGAT_JAVA_OPTS"] = "-Xmx%s -XX:+UseParNewGC\
    -XX:+UseConcMarkSweepGC" % (PICARD_MEMORY)

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()

    os.unsetenv("CGAT_JAVA_OPTS")

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
Example 12: countDEXSeq
def countDEXSeq(infiles, outfile):
    '''create counts for DEXSeq

    Counts bam reads against exon features in a flattened gtf.
    The required python script is provided by DEXSeq
    and builds on HTSeq.

    Parameters
    ----------
    infiles[0]: string
        :term:`bam` file input
    infiles[1]: string
        :term:`gff` output from the buildGff function
    outfile : string
        A :term:`txt` file containing results
    DEXSeq_strandedness : string
        :term:`PARAMS`. Specifies strandedness; options
        are 'yes', 'no' and 'reverse'
    '''
    infile, gfffile = infiles
    ps = PYTHONSCRIPTSDIR

    if BamTools.isPaired(infile):
        paired = "yes"
    else:
        paired = "no"

    strandedness = PARAMS["DEXSeq_strandedness"]

    statement = '''python %(ps)s/dexseq_count.py
    -p %(paired)s
    -s %(strandedness)s
    -r pos
    -f bam %(gfffile)s %(infile)s %(outfile)s'''
    P.run()
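Downstream of this step, the per-sample outputs of dexseq_count.py are typically merged into a single count table for DEXSeq. A hedged pandas sketch, assuming each output file has a two-column (feature id, count) tab-separated layout and lives in a hypothetical counts/ directory:

import glob
import os
import pandas as pd

frames = []
for path in glob.glob("counts/*.txt"):  # hypothetical output directory
    sample = os.path.basename(path).replace(".txt", "")
    frames.append(pd.read_csv(path, sep="\t", header=None,
                              names=["feature", sample],
                              index_col="feature"))

# one column per sample, one row per exon counting bin
counts = pd.concat(frames, axis=1)
counts.to_csv("dexseq_counts.tsv", sep="\t")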
Example 13: buildPicardAlignmentStats
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have problems if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | cgat bam2bam -v 0
    --method=set-sequence --output-sam
    | picard %(picard_opts)s CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
Example 14: convertReadsToIntervals
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard'):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for
    counting based methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include:

    * deduplication - remove duplicate reads
    * quality score filtering - remove reads below a certain quality score.
    * paired-end data - merge pairs
    * paired-end data - filter by insert size
    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()

    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            statement.append('''samtools rmdup - - ''')

        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(bedfile)s.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)

    P.run()

    os.unlink(tmpdir)
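A call to this function might look as follows (hypothetical file names; the MAPQ threshold and Picard deduplication are chosen purely for illustration):

convertReadsToIntervals("sample.bam",
                        "sample.bed.gz",
                        filtering_quality=10,
                        filtering_dedup=True,
                        filtering_dedup_method="picard")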
Example 15: convertReadsToIntervals
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for
    counting based methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include quality filtering, removal of non-unique
    matches and deduplication. For paired-end data, pairs are merged
    and optionally filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below given threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.
    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()

    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | python %%(scriptsdir)s/bam2bam.py
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            statement.append('''samtools rmdup - - ''')

        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(bedfile)s.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
# ......... (remainder of this example omitted) .........