

Python CGAT.Bed Class Code Examples

This article collects and summarizes typical usage examples of CGAT.Bed in Python. If you have been struggling with questions such as: What exactly is the Bed class for? How is Bed used? Where can I find examples of Bed in action? Then you are in luck: the curated class code examples here may be of help.


A total of 15 code examples of the Bed class are presented below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
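
Before the examples, here is a minimal sketch of the core Bed API that most of the snippets below rely on: opening a (possibly compressed) BED file and iterating over its records. It assumes CGAT is installed; the path "intervals.bed.gz" is a placeholder.

import CGAT.Bed as Bed
import CGAT.IOTools as IOTools

inf = IOTools.openFile("intervals.bed.gz")  # placeholder path
for bed in Bed.iterator(inf):
    # the first three BED columns; coordinates are 0-based, half-open
    contig, start, end = bed.contig, bed.start, bed.end
    # further columns (name, score, strand) are available for BED4 and up
    name = bed.name
inf.close()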

Example 1: exportSequencesFromBedFile

def exportSequencesFromBedFile( infile, outfile, masker = None, mode = "intervals" ):
    '''export sequences for intervals in :term:`bed`-formatted *infile* 
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip( infile, ".bed.gz" )

    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )
    outs = IOTools.openFile( outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator( IOTools.openFile(infile) )):
        lcontig = fasta.getLength( bed.contig )

        if mode == "intervals":
            seqs.append( fasta.getSequence( bed.contig, "+", bed.start, bed.end) )
            ids.append( "%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end) )

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0,bed.start-l), bed.end-l
            ids.append( "%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
            start, end = bed.start+l, min(lcontig,bed.end+l)
            ids.append( "%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
    masked = maskSequences( seqs, masker )
    outs.write("\n".join( [ ">%s\n%s" % (x,y) for x,y in zip(ids, masked) ] ) )

    outs.close()
Developer: BioinformaticsArchive; Project: cgat; Lines: 33; Source: PipelineMotifs.py
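
The ``leftright`` branch computes two flanks of the same length as the interval, clamped to the contig boundaries; the arithmetic is easy to misread because ``bed.end - l`` equals ``bed.start``. Here is a standalone sketch of it (the function name is illustrative, not part of CGAT):

def flanking_intervals(start, end, lcontig):
    '''return (left, right) flanks of the same length as [start, end),
    clamped to [0, lcontig).'''
    l = end - start
    left = (max(0, start - l), end - l)          # end - l == start
    right = (start + l, min(lcontig, end + l))   # start + l == end
    return left, right

# an interval of length 50 near the contig start:
# flanking_intervals(10, 60, 1000) -> ((0, 10), (60, 110))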

Example 2: getProbeFragments

def getProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    
    # First find the width of the restriction-enzyme cut, which is needed to
    # derive start and end coordinates from the pregenerated digest file.
    # On the first iteration there is nothing to compare against.
    first_iteration = True
    
    length_RE_cut = 0
    
    last_bed = None
    
    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):
                
        if first_iteration:
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break
        
        last_bed = bed_digest
    
    
    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if len(frag) != 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            
            # The restriction-enzyme cut on the left side of the fragment
            # is the end site of the previous fragment + 1 (+1 because,
            # according to the manual, bed start coordinates are given
            # in 1-origin in the pregenerated file).
            
            bed.start = frag.start-length_RE_cut+1
            bed.end = frag.end+length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Developer: sudlab; Project: pipeline_capt_c_perl; Lines: 59; Source: pipelineCaptCPerl.py
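
The cut-length inference at the top of this example relies on consecutive fragments of an in-silico digest being separated by exactly the width of the restriction site. A toy illustration of that arithmetic, with invented coordinates:

# two adjacent digest fragments on the same contig (coordinates invented)
last_fragment_end = 10000     # end of fragment n
next_fragment_start = 10004   # start of fragment n+1
length_RE_cut = next_fragment_start - last_fragment_end  # -> 4 bases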

Example 3: annotateCpGIslands

def annotateCpGIslands( infiles, outfile ):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex( IOTools.openFile( cpgfile ) )
    
    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    outf.write("transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n" )

    for tss in Bed.iterator(IOTools.openFile( tssfile ) ):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find( start, end ))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start 
            else:
                relative_start = tss.end - genome_end
            
            relative_end = relative_start + l

            outf.write( "\t".join( map(str, (
                            tss.name, tss.strand,
                            genome_start, genome_end,
                            relative_start, relative_end ))) + "\n" )
            c.matches_output += 1

    outf.close()
            
    with IOTools.openFile( outfile + ".summary", "w" ) as outf:
        outf.write ("category\tcounts\n" )
        outf.write( c.asTable() + "\n" )
    
    E.info( c )
Developer: BioinformaticsArchive; Project: cgat; Lines: 59; Source: pipeline_promotors.py
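
The strand handling is the part of this example that is easy to get wrong: both the search window and the relative coordinates are mirrored for minus-strand TSSs. A standalone sketch of the two computations under the same conventions (function names are illustrative, not part of CGAT):

def promoter_window(tss_start, tss_end, strand, upstream, downstream):
    '''genomic search window around a TSS, extended in the direction
    of transcription.'''
    if strand == "+":
        return tss_start - upstream, tss_start + downstream
    else:
        return tss_end - downstream, tss_end + upstream

def relative_coords(match_start, match_end, tss_start, tss_end, strand):
    '''location of a match relative to the TSS, in the direction
    of transcription.'''
    l = match_end - match_start
    if strand == "+":
        relative_start = match_start - tss_start
    else:
        relative_start = tss_end - match_end
    return relative_start, relative_start + l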

Example 4: main

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
Developer: SCV; Project: cgat; Lines: 56; Source: bed2graph.py
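
Bed.readAndIndex builds a per-contig interval index, so each overlap query is a dictionary lookup by contig followed by find(). A minimal sketch of that pattern, assuming CGAT is installed (the file name is a placeholder):

import CGAT.Bed as Bed
import CGAT.IOTools as IOTools

idx = Bed.readAndIndex(IOTools.openFile("annotations.bed.gz"),
                       with_values=True)
try:
    # all indexed intervals overlapping chr1:1000-2000
    hits = list(idx["chr1"].find(1000, 2000))
except KeyError:
    hits = []  # contig not present in the index
for start, end, value in hits:
    # with with_values=True the third element is the full Bed entry
    pass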

Example 5: _count

    def _count(self, filename, idx):
        '''count filename against idx.'''

        overlapping_genes = set()
        genes = set()

        # iterate over exons
        infile = IOTools.openFile(filename, "r")
        it = Bed.bed_iterator(infile)

        nexons, nexons_overlapping = 0, 0
        nbases, nbases_overlapping = 0, 0
        for this in it:
            nexons += 1
            nbases += this.end - this.start

            try:
                intervals = list(
                    idx[this.contig].find(max(0, this.start), this.end))
            except KeyError:
                continue
            except Exception, msg:
                raise Exception(
                    "error while processing %s, msg=%s" % (filename, msg))
            if len(intervals) == 0:
                continue

            nexons_overlapping += 1
            start, end = this.start, this.end
            counts = numpy.zeros(end - start, numpy.int)
            for other_start, other_end, other_value in intervals:
                for x in range(max(start, other_start) - start, min(end, other_end) - start):
                    counts[x] += 1
            nbases_overlapping += sum([1 for x in counts if x > 0])
Developer: Q-KIM; Project: cgat; Lines: 34; Source: diff_bed.py

Example 6: aggregateWindowsReadCounts

def aggregateWindowsReadCounts(infiles,
                               outfile,
                               regex="(.*)\..*"):
    '''aggregate several results from coverageBed
    into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:
    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join(['''<( zcat %s |
              awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' %
                    (x, column) for x in infiles])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    # build track names
    tracks = [re.search(regex, os.path.basename(x)).groups()[0]
              for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
Developer: BioXiao; Project: CGATPipelines; Lines: 60; Source: PipelineWindows.py
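
The column arithmetic assumes *infiles* are the coverageBed outputs, i.e. the original BED columns plus the four appended count columns, so the read count is the first appended column. A quick check of the formula against the docstring:

# coverageBed output width = input BED width + 4 appended columns;
# the count is the first appended column, hence total - 4 + 1:
for bed_width in (4, 6, 12):
    total_columns = bed_width + 4
    count_column = total_columns - 4 + 1  # -> 5, 7, 13, as in the docstring
    print("bed%i -> count in column %i" % (bed_width, count_column))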

Example 7: fetchProbeFragments

def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if len(frag) != 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Developer: sudlab; Project: Capture_C; Lines: 31; Source: PipelineCaptureC.py

Example 8: main

def main( argv = None ):

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default]."  )

    parser.add_option("-n", "--per-name", dest="per_name", action="store_true",
                      help="compute counts per name [default=%default]."  )

    parser.add_option("-c", "--per-contig", dest="per_contig", action="store_true",
                      help="compute counts per contig [default=%default]."  )

    parser.add_option("-t", "--per-track", dest="per_track", action="store_true",
                      help="compute counts per track [default=%default]."  )

    parser.set_defaults(
        genome_file = None,
        per_name = False,
        per_contig = False,
        per_track = False,
        )

    (options, args) = E.Start( parser, argv )

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    counts = collections.defaultdict( Counter )

    if options.per_track:
        keyf = lambda x: x.track
    elif options.per_name:
        keyf = lambda x: x.name
    elif options.per_contig:
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add( bed )

    outf = options.stdout

    key = "track"
    outf.write( "%s\t%s\n" % (key, "\t".join( Counter.headers) ))

    for key, count in counts.iteritems():
        outf.write( "%s\t%s\n" % ( key, str(count)) )
        
    E.Stop()
Developer: siping; Project: cgat; Lines: 55; Source: bed2stats.py

Example 9: quantifyAnomolies

def quantifyAnomolies(bam, probes, outfile):

    bamfile = pysam.AlignmentFile(bam)
    results = dict()
    mapped = bamfile.mapped
    total = 0
    seeks = 0
    for probe in Bed.iterator(IOTools.openFile(probes)):
        
        c = collections.Counter()
 
        for read in bamfile.fetch(probe.contig, probe.start, probe.end,
                                  multiple_iterators=True):

            if read.is_unmapped:
                continue

            c["total"] += 1
    
            if not (read.is_secondary or read.is_supplementary):
                c["primary"] += 1
            else:
                continue

            if read.pos < (probe.start-4) or read.aend > (probe.end +4):
                c["undigested"] += 1

            if read.is_read1:
                c["read1"] += 1

            if not read.mate_is_unmapped:
                c["paired"] += 1

                if read.is_read1 and (
                        read.mpos >= probe.start and read.mpos <= probe.end):
                    c["self_lig"] += 1

            if (total + c["total"] % 10000) == 0:
                E.debug("%s/%s done" % (total + c["total"], mapped))

        E.debug("%s processed, %i found" % (probe.name, c["total"]))
        results[probe.name] = c
        total += c["total"]

    headers = ["Probe", "total", "primary", "undigested",
               "read1", "paired", "self_lig"]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(headers) + "\n")
        
        for probe in results:
            outf.write("\t".join(
                [probe]+[str(results[probe][col])
                         for col in headers[1:]]) + "\n")
Developer: sudlab; Project: Capture_C; Lines: 55; Source: PipelineCaptureC.py

Example 10: main

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--regex-filename", dest="re_name", type="string",
                      help="regex for filename component to be used "
                      "as the annotation label")

    parser.set_defaults(
        re_name="(.+)",
    )

    (options, args) = E.Start(parser, argv=argv)

    infiles = argv[-1]
    bed_files = infiles.split(",")

    if len(bed_files) == 1:
        raise IOError("Only one file detected, cannot merge "
                      "a single bed file")
    else:
        pass

    # get the regex for annotation names,
    rx = re.compile(options.re_name)
    annot_names = [[y for y in rx.search(x).groups()] for x in bed_files]
    annot_names = list(itertools.chain(*annot_names))

    # output as BED4 format: chr, start, end, name
    for fx in range(len(bed_files)):
        bfile = bed_files[fx]
        with IOTools.openFile(bfile, "r") as ofile:
            intervals = Bed.iterator(ofile)
            track_name = [bx for bx in annot_names if re.search(bx, bfile)][0]
            for entry in intervals:
                entry.__setitem__("name", track_name)
                options.stdout.write("%s\t%s\t%s\t%s\n" % (entry.contig,
                                                           entry.start,
                                                           entry.end,
                                                           entry.name))

    # write footer and output benchmark information.
    E.Stop()
Developer: MikeDMorgan; Project: gwasenrichment_pipeline; Lines: 54; Source: merge_beds.py

Example 11: countTagsInClusters

def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end), str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position","count"])
Developer: CGATOxford; Project: UMI-tools_pipelines; Lines: 12; Source: PipelineUMI.py

Example 12: buildAlignmentSizes

def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
Developer: yangjl; Project: cgat; Lines: 15; Source: pipeline_metagenomebenchmark.py

Example 13: buildQuicksectMask

def buildQuicksectMask(bed_file):
    '''return a Quicksect object containing the specified regions.

    Takes a bed file listing the regions to mask.
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(IOTools.openFile(bed_file)):
        # it is necessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return(mask)
Developer: CGATOxford; Project: CGATPipelines; Lines: 15; Source: PipelineChipseq.py

Example 14: aggregateWindowsReadCounts

def aggregateWindowsReadCounts( infiles, outfile ):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Tiles with no counts will not be output.
    '''
    
    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns( infiles[0] )
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join( [ '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x,column) for x in infiles] )
    tmpfile = P.getTempFilename( "." )
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()
    
    tracks = [ re.sub( "\..*", '', os.path.basename(x) ) for x in infiles ]

    outf = IOTools.openFile( outfile, "w")
    outf.write( "interval_id\t%s\n" % "\t".join( tracks ) )
    
    for line in open( tmpfile, "r" ):
        data = line[:-1].split("\t")
        genes = list(set([ data[x] for x in range(0,len(data), 2 ) ]))
        values = [ int(data[x]) for x in range(1,len(data), 2 ) ]
        if sum(values) == 0: continue
        assert len(genes) == 1, "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write( "%s\t%s\n" % (genes[0], "\t".join(map(str, values) ) ) )
    
    outf.close()

    os.unlink(tmpfile)
Developer: BioinformaticsArchive; Project: cgat; Lines: 48; Source: PipelineWindows.py

Example 15: sites2fragments

def sites2fragments(infile, genomefile, outfile):
    '''Convert a bed file of digestion sites into a bed file of fragments'''

    # NB: split() already strips the trailing newline, so no slicing is needed
    contig_lengths = {line.split()[0]: int(line.split()[1])
                      for line in IOTools.openFile(genomefile)}

    last_end = 0
    last_contig = None
    name = 0
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."
    with IOTools.openFile(outfile, "w") as outf:
        for bed in Bed.iterator(IOTools.openFile(infile)):
       
            if last_contig is not None and bed.contig != last_contig:
                # close the previous contig with a final fragment that runs
                # to the end of that contig (not the new one)
                name += 1
                new_bed.start = last_end
                new_bed.contig = last_contig
                new_bed.end = contig_lengths[last_contig]
                new_bed["name"] = str(name)

                outf.write(str(new_bed) + "\n")
        
                last_end = 0
                
            last_contig = bed.contig
            new_bed.contig = last_contig
            new_bed.start = last_end
            new_bed.end = bed.start
            name += 1
            new_bed["name"] = str(name)
            outf.write(str(new_bed) + "\n")
            last_end = bed.end

        name += 1
        new_bed.start = last_end
        new_bed.contig = last_contig
        new_bed.end = contig_lengths[last_contig]
        new_bed["name"] = str(name)

        outf.write(str(new_bed) + "\n")

    pysam.tabix_index(outfile, force=True, preset="bed")
Developer: sudlab; Project: Capture_C; Lines: 44; Source: PipelineCaptureC.py


Note: The CGAT.Bed class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not reproduce without permission.