This article collects typical usage examples of the Python method CGAT.Intervals.complement. If you are wondering what Intervals.complement is for, or how to use it, the curated examples below should help. You can also look further into the containing module, CGAT.Intervals.
Seven code examples of Intervals.complement are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
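Before the examples, a minimal sketch of the method's behavior may help. Judging from the calls in the examples below, Intervals.complement takes a list of (start, end) tuples and returns the gaps between them; an optional pair of bounds extends the complement to a fixed range (this two-argument form appears in Examples 3 and 5). The exact semantics here are an assumption inferred from those calls:

from CGAT import Intervals

# Exons of a hypothetical transcript as half-open (start, end) tuples.
exons = [(10, 20), (30, 40), (50, 60)]

# Gaps between consecutive intervals, i.e. the introns.
print(Intervals.complement(exons))
# expected: [(20, 30), (40, 50)]

# With explicit bounds, the flanking gaps are included as well.
print(Intervals.complement(exons, 0, 70))
# expected: [(0, 10), (20, 30), (40, 50), (60, 70)]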
Example 1: toIntronIntervals
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
def toIntronIntervals(chunk):
    """convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates.
    """
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig,
                                     chunk[0].strand,
                                     chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end)
                                   for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
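A hedged usage sketch for the function above: toIntronIntervals only needs objects exposing contig, strand, transcript_id, feature, start and end attributes, so a hypothetical stand-in (real code would get records from GTF.iterator) is enough to illustrate it:

from collections import namedtuple

# Hypothetical stand-in for a GTF record, for illustration only.
MockGTF = namedtuple(
    "MockGTF", "contig strand transcript_id feature start end")

chunk = [
    MockGTF("chr1", "+", "tx1", "exon", 100, 200),
    MockGTF("chr1", "+", "tx1", "CDS", 150, 200),  # non-exon features are ignored
    MockGTF("chr1", "+", "tx1", "exon", 300, 400),
]

# Expected: the single intron between the two exons, [(200, 300)].
print(toIntronIntervals(chunk))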
Example 2: toIntronIntervals
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates.
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig,
                                     chunk[0].strand,
                                     chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, \
            "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end)
                                   for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
Example 3: processChunk
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
#......... part of the code omitted here .........
        if options.threshold_max_sbjct_gapchars and \
                match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
            nremoved_gaps += 1
            x_nremoved_gaps += 1
            continue

        new_matches.append(match)

    matches = new_matches

    if len(matches) == 0:
        if outfile_empty:
            outfile_empty.write(
                "%s\tall matches removed after applying thresholds: "
                "before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                (query_id, nmatches, x_nremoved_pid, x_nquery_coverage,
                 x_nremoved_gaps, x_nremoved_nmatches))
        nskipped += 1
        return

    # Remove queries matching to a forbidden region. This section
    # will remove the full query if any of its matches overlaps a
    # forbidden region.
    keep = True
    for match in matches:
        if intersectors and match.mSbjctId in intersectors:
            found = intersectors[match.mSbjctId].find(
                match.mSbjctFrom, match.mSbjctTo)
            if found and not options.keep_forbidden:
                nremoved_regions += 1
                keep = False
                continue

    if not keep:
        nqueries_removed_region += 1
        if outfile_empty:
            outfile_empty.write("%s\toverlap with forbidden region\n" %
                                query_id)
        return

    # check for full length matches
    for match in matches:
        if match.mQueryCoverage >= 99.9:
            full_matches.append(match)
        if match.mQueryCoverage > options.threshold_good_query_coverage:
            good_matches.append(match)
        else:
            partial_matches.append(match)

    if full_matches:
        nfull_matches += 1
    elif good_matches:
        ngood_matches += 1
    elif partial_matches:
        npartial_matches += 1

    # compute coverage of sequence with matches
    intervals = []
    for match in full_matches + good_matches + partial_matches:
        intervals.append((match.mQueryFrom, match.mQueryTo))

    rest = Intervals.complement(intervals, 0, match.mQueryLength)

    query_coverage = 100.0 * (match.mQueryLength -
                              sum(map(lambda x: x[1] - x[0], rest))) / \
        match.mQueryLength

    if query_coverage >= 99.9:
        fully_matched.append(query_id)
    elif query_coverage > options.threshold_good_query_coverage:
        well_matched.append(query_id)
    else:
        partially_matched.append(query_id)

    aggregate_coverages.append(query_coverage)

    # select matches to output
    matches, msg = selectMatches(query_id, matches, options, queries_fasta)

    if len(matches) > 0:
        for match in matches:
            if options.query_forward_coordinates:
                match.convertCoordinates()

            if options.output_format == "map":
                options.stdout.write("%s\n" % "\t".join(map(str, (
                    match.mQueryId, match.mSbjctId,
                    match.strand,
                    "%5.2f" % match.mQueryCoverage,
                    "%5.2f" % match.mSbjctCoverage,
                    "%5.2f" % match.mPid,
                    match.mQueryLength,
                    match.mSbjctLength,
                    match.mQueryFrom, match.mQueryTo,
                    match.mSbjctFrom, match.mSbjctTo,
                    ",".join(map(str, match.mBlockSizes)),
                    ",".join(map(str, match.mQueryBlockStarts)),
                    ",".join(map(str, match.mSbjctBlockStarts)),
                ))))
            elif options.output_format == "psl":
                options.stdout.write(str(match) + "\n")

        noutput += 1
    else:
        if outfile_empty:
            outfile_empty.write("%s\tno matches selected: %s\n" %
                                (query_id, msg))
        nempty += 1
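The coverage arithmetic in this example is worth isolating: the complement of the match intervals over [0, query_length) gives the uncovered stretches, and subtracting their total length yields percent coverage. A minimal sketch of the same computation, factored into a hypothetical helper:

def query_coverage_percent(intervals, query_length):
    # Uncovered stretches of the query within [0, query_length).
    rest = Intervals.complement(intervals, 0, query_length)
    uncovered = sum(end - start for start, end in rest)
    return 100.0 * (query_length - uncovered) / query_length

# Two matches covering 80 of 100 bases:
# query_coverage_percent([(0, 50), (70, 100)], 100) -> expected 80.0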
Example 4: readWorkspace
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file.
    gtf-intergenic
       build workspace from intergenic segments in a gtf file.
    gtf-intronic
       build workspace from intronic segments in a gtf file.
    gtf-genic
       the workspace is built from genes (first to last exon).

    Available labels are:

    none
       no labels are given to the ends of workspaces.
    direction
       labels are given based on the 5'/3' end of the
       bounding exon.
    annotation
       labels are given based on a gene2annotation map.

    returns a list of segments for each contig in a dictionary
    """

    if label == "none":
        label_f = lambda x, y: (("X",), ("X",))
        info_f = lambda x: None
    elif label == "direction":
        label_f = lambda x, y: ((("5", "3")[x],), (("3", "5")[y],))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the labels
        # of the right-most entry is retained
        for contig in workspace.keys():
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)
                new_segs.append((last[1], this[0],
                                 label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append((start,
                                            end,
                                            label_f(info_f(r), info_f(r))
                                            ))

    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
#......... part of the code omitted here .........
Example 5: __init__
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
def __init__(self, *args, **kwargs):
    Sampler.__init__(self, *args, **kwargs)
    self.mGapLengths = [x[1] - x[0]
                        for x in Intervals.complement(self.mObserved,
                                                      self.mWorkStart,
                                                      self.mWorkEnd)]
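A small worked illustration of what mGapLengths ends up holding, under the same assumed semantics for the bounded form of complement:

# Observed segments inside a workspace spanning [0, 100).
observed = [(10, 20), (40, 70)]

gaps = Intervals.complement(observed, 0, 100)
# expected: [(0, 10), (20, 40), (70, 100)]

gap_lengths = [end - start for start, end in gaps]
# expected: [10, 20, 30]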
Example 6: main
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")

    parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string",
                      help="BigWig with tag counts on both strands")

    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")

    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")

    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")

    parser.add_option("-c", "--use-centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []

    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)
        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)
        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)

        outlines.append([gene_id,
                         transcript_id,
#......... part of the code omitted here .........
Example 7: main
# Required imports: from CGAT import Intervals [as alias]
# Or: from CGAT.Intervals import complement [as alias]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    bamfile = pysam.AlignmentFile(args[0])

    outlines = []

    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32")
        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32")
        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                exon_id = "missing"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id

        outlines.append([gene_id,
                         transcript_id,
                         exon_id,
                         str(exon_counts),
                         str(intron_counts)])

    options.stdout.write("\t".join(["gene_id",
                                    "transcript_id",
                                    "exon_id",
                                    "exon_count",
                                    "intron_count"]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()