This article collects typical usage examples of the Python method CGAT.Bed.iterator. If you have been wondering exactly how to use Bed.iterator, the curated examples below may help; you can also read further about its containing module, CGAT.Bed.

The following presents 15 code examples of the Bed.iterator method, sorted by popularity by default.
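Before diving into the examples, here is a minimal sketch of the basic pattern (a sketch only, assuming CGAT and its IOTools helper are available; the file name is hypothetical):

from CGAT import Bed, IOTools

# Iterate over the intervals in a (possibly gzip-compressed) bed file.
# Each record is a Bed object exposing contig, start, end and, for
# BED4+ files, name.
with IOTools.openFile("intervals.bed.gz") as inf:
    for bed in Bed.iterator(inf):
        print("%s\t%i\t%i" % (bed.contig, bed.start, bed.end))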
Example 1: getProbeFragments

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def getProbeFragments(probe_bed, digest_bed, outfile,
                      lookup_out):

    # First, find the length of the restriction enzyme cut site, which is
    # required to obtain the start and end coordinates from the
    # pre-generated digest file.
    # On the first iteration there is nothing to compare against.
    first_iteration = True

    length_RE_cut = 0
    last_bed = None

    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):

        if first_iteration:
            first_iteration = False
        else:
            # fragments can only be compared within the same contig
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break

        last_bed = bed_digest

    digest_fragments = pysam.TabixFile(digest_bed)

    bed = Bed.Bed()

    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")

        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]

            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because, according to the manual, coordinates are
            # specified in 1-origin for the bed start.)
            bed.start = frag.start - length_RE_cut + 1
            bed.end = frag.end + length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
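Note that this example reads digest_bed twice: once streamed through Bed.iterator to infer the cut-site length, and once through pysam.TabixFile for random access. The latter requires the file to be bgzip-compressed and tabix-indexed beforehand. A plausible preparation step (a sketch; the file name is hypothetical):

import pysam

# Compresses digest.bed with bgzip and writes the digest.bed.gz.tbi index;
# force=True overwrites any existing index.
pysam.tabix_index("digest.bed", preset="bed", force=True)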
Example 2: exportSequencesFromBedFile

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))
    outs = IOTools.openFile(outfile, "w")
    ids, seqs = [], []

    for bed in Bed.setName(Bed.iterator(IOTools.openFile(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
Example 3: fetchProbeFragments

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()

    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out, "w") as lookup:

        lookup.write("probe\tfragment\n")

        for probe in Bed.iterator(IOTools.openFile(probe_bed)):

            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]

            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example 4: annotateCpGIslands

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def annotateCpGIslands(infiles, outfile):
    '''annotate transcripts by the absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, \
                tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, \
                tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match
            l = genome_end - genome_start

            # get the relative location of the match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(map(str, (
                tss.name, tss.strand,
                genome_start, genome_end,
                relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
Example 5: main

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, or "
                      "only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")
    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contigs and zero-length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
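This example combines the two main access patterns for bed files in CGAT: Bed.readAndIndex builds an in-memory index of one file, while Bed.iterator streams the other past it. The lookup pattern in isolation (a sketch; the file name and coordinates are hypothetical, and with_values=True makes each hit a (start, end, entry) tuple, which is what the o[2] access above relies on):

from CGAT import Bed, IOTools

idx = Bed.readAndIndex(IOTools.openFile("regions.bed.gz"), with_values=True)
for start, end, entry in idx["chr1"].find(10000, 20000):
    print(entry)  # the full Bed record overlapping chr1:10000-20000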
Example 6: main

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-n", "--per-name", dest="per_name", action="store_true",
                      help="compute counts per name [default=%default].")

    parser.add_option("-c", "--per-contig", dest="per_contig", action="store_true",
                      help="compute counts per contig [default=%default].")

    parser.add_option("-t", "--per-track", dest="per_track", action="store_true",
                      help="compute counts per track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        per_name=False,
        per_contig=False,
        per_track=False,
    )

    (options, args) = E.Start(parser, argv)

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    counts = collections.defaultdict(Counter)

    if options.per_track:
        keyf = lambda x: x.track
    elif options.per_name:
        keyf = lambda x: x.name
    elif options.per_contig:
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add(bed)

    outf = options.stdout

    key = "track"
    outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    for key, count in counts.iteritems():
        outf.write("%s\t%s\n" % (key, str(count)))

    E.Stop()
Example 7: quantifyAnomolies

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def quantifyAnomolies(bam, probes, outfile):

    bamfile = pysam.AlignmentFile(bam)
    results = dict()
    mapped = bamfile.mapped
    total = 0
    seeks = 0

    for probe in Bed.iterator(IOTools.openFile(probes)):

        c = collections.Counter()

        for read in bamfile.fetch(probe.contig, probe.start, probe.end,
                                  multiple_iterators=True):

            if read.is_unmapped:
                continue

            c["total"] += 1

            if not (read.is_secondary or read.is_supplementary):
                c["primary"] += 1
            else:
                continue

            # reads extending beyond the probe fragment were not digested
            if read.pos < (probe.start - 4) or read.aend > (probe.end + 4):
                c["undigested"] += 1

            if read.is_read1:
                c["read1"] += 1

            if not read.mate_is_unmapped:
                c["paired"] += 1

                # mate mapping back into the probe suggests self-ligation
                if read.is_read1 and (
                        read.mpos >= probe.start and read.mpos <= probe.end):
                    c["self_lig"] += 1

            # periodic progress report
            if (total + c["total"]) % 10000 == 0:
                E.debug("%s/%s done" % (total + c["total"], mapped))

        E.debug("%s processed, %i found" % (probe.name, c["total"]))
        results[probe.name] = c
        total += c["total"]

    headers = ["Probe", "total", "primary", "undigested",
               "read1", "paired", "self_lig"]

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("\t".join(headers) + "\n")

        for probe in results:
            outf.write("\t".join(
                [probe] + [str(results[probe][col])
                           for col in headers[1:]]) + "\n")
Example 8: main

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--regex-filename", dest="re_name", type="string",
                      help="regex for filename component to be used "
                      "as the annotation label")

    parser.set_defaults(
        re_name="(.+)",
    )

    (options, args) = E.Start(parser, argv=argv)

    # the comma-separated list of bed files is the last argument
    infiles = argv[-1]
    bed_files = infiles.split(",")

    if len(bed_files) == 1:
        raise IOError("Only one file detected, cannot merge "
                      "a single bed file")

    # get the regex for annotation names
    rx = re.compile(options.re_name)
    annot_names = [[y for y in rx.search(x).groups()] for x in bed_files]
    annot_names = list(itertools.chain(*annot_names))

    # output as BED4 format: chr, start, end, name
    for fx in range(len(bed_files)):
        bfile = bed_files[fx]
        with IOTools.openFile(bfile, "r") as ofile:
            intervals = Bed.iterator(ofile)
            track_name = [bx for bx in annot_names if re.search(bx, bfile)][0]
            for entry in intervals:
                entry["name"] = track_name
                options.stdout.write("%s\t%s\t%s\t%s\n" % (entry.contig,
                                                           entry.start,
                                                           entry.end,
                                                           entry.name))

    # write footer and output benchmark information.
    E.Stop()
Example 9: countTagsInClusters

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end),
                         str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position", "count"])
Example 10: buildQuicksectMask

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def buildQuicksectMask(bed_file):
    '''return a Quicksect object containing the regions specified.

    takes a bed file listing the regions to mask
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(IOTools.openFile(bed_file)):
        # it is necessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return(mask)
Example 11: buildAlignmentSizes

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")

    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))

    outf.close()
Example 12: sites2fragments

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def sites2fragments(infile, genomefile, outfile):
    '''Convert a bed file of digestion sites into a bed file of fragments.'''

    # contig -> length lookup; split() on whitespace already drops the newline
    contig_lengths = {line.split()[0]: int(line.split()[1])
                      for line in IOTools.openFile(genomefile)}

    last_end = 0
    last_contig = None
    name = 0
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."

    with IOTools.openFile(outfile, "w") as outf:
        for bed in Bed.iterator(IOTools.openFile(infile)):

            if last_contig is not None and not bed.contig == last_contig:
                # close out the previous contig with its final fragment
                name += 1
                new_bed.start = last_end
                new_bed.contig = last_contig
                new_bed.end = contig_lengths[last_contig]
                new_bed["name"] = str(name)

                outf.write(str(new_bed) + "\n")
                last_end = 0

            last_contig = bed.contig
            new_bed.contig = last_contig
            new_bed.start = last_end
            new_bed.end = bed.start
            name += 1
            new_bed["name"] = str(name)

            outf.write(str(new_bed) + "\n")

            last_end = bed.end

        # final fragment on the last contig
        name += 1
        new_bed.start = last_end
        new_bed.contig = last_contig
        new_bed.end = contig_lengths[last_contig]
        new_bed["name"] = str(name)

        outf.write(str(new_bed) + "\n")

    pysam.tabix_index(outfile, force=True, preset="bed")
Example 13: makeIntervalCorrelation

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)

    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):

        row = []

        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
Example 14: main

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start
            # ......... remainder of this code omitted .........
Example 15: main

# Required import: from CGAT import Bed [as alias]
# Or: from CGAT.Bed import iterator [as alias]
# ......... preceding code omitted .........
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.Samfile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(
            IOTools.openFile(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.iteritems()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.openFile(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.Samfile(options.filename_transcriptome,
                                            "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.Samfile("-", "wh", template=genome_samfile)
    else:
        # ......... remainder of this code omitted .........