当前位置: 首页>>代码示例>>Python>>正文


Python Bed.iterator方法代码示例

本文整理汇总了Python中CGAT.Bed.iterator方法的典型用法代码示例。如果您正苦于以下问题:Python Bed.iterator方法的具体用法?Python Bed.iterator怎么用?Python Bed.iterator使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在CGAT.Bed的用法示例。


在下文中一共展示了Bed.iterator方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: getProbeFragments

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def getProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    
    # First find the length of the restriction enzyme cut, required to obtain the start and end coordinates
    # from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True
    
    length_RE_cut = 0
    
    last_bed = None
    
    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):
                
        if(first_iteration):
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if(bed_digest.contig == last_bed.contig):
                length_RE_cut = bed_digest.start - last_bed.end
                break
        
        last_bed = bed_digest
    
    
    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            
            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because according to the manual coordinates are specified
            # in 1-origin for the bed start.)
            
            bed.start = frag.start-length_RE_cut+1
            bed.end = frag.end+length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
开发者ID:sudlab,项目名称:pipeline_capt_c_perl,代码行数:61,代码来源:pipelineCaptCPerl.py

示例2: exportSequencesFromBedFile

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def exportSequencesFromBedFile( infile, outfile, masker = None, mode = "intervals" ):
    '''export sequences for intervals in :term:`bed`-formatted *infile* 
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip( infile, ".bed.gz" )

    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )
    outs = IOTools.openFile( outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator( IOTools.openFile(infile) )):
        lcontig = fasta.getLength( bed.contig )

        if mode == "intervals":
            seqs.append( fasta.getSequence( bed.contig, "+", bed.start, bed.end) )
            ids.append( "%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end) )

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0,bed.start-l), bed.end-l
            ids.append( "%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
            start, end = bed.start+l, min(lcontig,bed.end+l)
            ids.append( "%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
    masked = maskSequences( seqs, masker )
    outs.write("\n".join( [ ">%s\n%s" % (x,y) for x,y in zip(ids, masked) ] ) )

    outs.close()
开发者ID:BioinformaticsArchive,项目名称:cgat,代码行数:35,代码来源:PipelineMotifs.py

示例3: fetchProbeFragments

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
开发者ID:sudlab,项目名称:Capture_C,代码行数:33,代码来源:PipelineCaptureC.py

示例4: annotateCpGIslands

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def annotateCpGIslands( infiles, outfile ):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex( IOTools.openFile( cpgfile ) )
    
    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    outf.write("transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n" )

    for tss in Bed.iterator(IOTools.openFile( tssfile ) ):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find( start, end ))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start 
            else:
                relative_start = tss.end - genome_end
            
            relative_end = relative_start + l

            outf.write( "\t".join( map(str, (
                            tss.name, tss.strand,
                            genome_start, genome_end,
                            relative_start, relative_end ))) + "\n" )
            c.matches_output += 1

    outf.close()
            
    with IOTools.openFile( outfile + ".summary", "w" ) as outf:
        outf.write ("category\tcounts\n" )
        outf.write( c.asTable() + "\n" )
    
    E.info( c )
开发者ID:BioinformaticsArchive,项目名称:cgat,代码行数:61,代码来源:pipeline_promotors.py

示例5: main

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
开发者ID:SCV,项目名称:cgat,代码行数:58,代码来源:bed2graph.py

示例6: main

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def main( argv = None ):

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2table.py 2888 2010-04-07 08:48:36Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default]."  )

    parser.add_option("-n", "--per-name", dest="per_name", action="store_true",
                      help="compute counts per name [default=%default]."  )

    parser.add_option("-c", "--per-contig", dest="per_contig", action="store_true",
                      help="compute counts per contig [default=%default]."  )

    parser.add_option("-t", "--per-track", dest="per_track", action="store_true",
                      help="compute counts per track [default=%default]."  )

    parser.set_defaults(
        genome_file = None,
        per_name = False,
        per_track = False,
        )

    (options, args) = E.Start( parser, argv )

    # get files
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    counts = collections.defaultdict( Counter )

    if options.per_track:
        keyf = lambda x: x.track
    elif options.per_name:
        keyf = lambda x: x.name
    elif options.per_contig:
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"

    for bed in Bed.iterator(options.stdin):
        counts[keyf(bed)].add( bed )

    outf = options.stdout

    key = "track"
    outf.write( "%s\t%s\n" % (key, "\t".join( Counter.headers) ))

    for key, count in counts.iteritems():
        outf.write( "%s\t%s\n" % ( key, str(count)) )
        
    E.Stop()
开发者ID:siping,项目名称:cgat,代码行数:57,代码来源:bed2stats.py

示例7: quantifyAnomolies

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def quantifyAnomolies(bam, probes, outfile):

    bamfile = pysam.AlignmentFile(bam)
    results = dict()
    mapped = bamfile.mapped
    total = 0
    seeks = 0
    for probe in Bed.iterator(IOTools.openFile(probes)):
        
        c = collections.Counter()
 
        for read in bamfile.fetch(probe.contig, probe.start, probe.end,
                                  multiple_iterators=True):

            if read.is_unmapped:
                continue

            c["total"] += 1
    
            if not (read.is_secondary or read.is_supplementary):
                c["primary"] += 1
            else:
                continue

            if read.pos < (probe.start-4) or read.aend > (probe.end +4):
                c["undigested"] += 1

            if read.is_read1:
                c["read1"] += 1

            if not read.mate_is_unmapped:
                c["paired"] += 1

                if read.is_read1 and (
                        read.mpos >= probe.start and read.mpos <= probe.end):
                        c["self_lig"] += 1

            if (total + c["total"] % 10000) == 0:
                E.debug("%s/%s done" % (total + c["total"], mapped))

        E.debug("%s processed, %i found" % (probe.name, c["total"]))
        results[probe.name] = c
        total += c["total"]

    headers = ["Probe", "total", "primary", "undigested",
               "read1", "paired", "self_lig"]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(headers) + "\n")
        
        for probe in results:
            outf.write("\t".join(
                [probe]+[str(results[probe][col])
                         for col in headers[1:]]) + "\n")
开发者ID:sudlab,项目名称:Capture_C,代码行数:57,代码来源:PipelineCaptureC.py

示例8: main

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--regex-filename", dest="re_name", type="string",
                      help="regex for filename component to be used "
                      "as the annotation label")

    parser.set_defaults(
        re_name="(.+)",
    )

    (options, args) = E.Start(parser, argv=argv)

    infiles = argv[-1]
    bed_files = infiles.split(",")

    if len(bed_files) == 1:
        raise IOError("Only one file detected, cannot merge "
        "a single bed file")
    else:
        pass

    # get the regex for annotation names,
    rx = re.compile(options.re_name)
    annot_names = [[y for y in rx.search(x).groups()] for x in bed_files]
    annot_names = list(itertools.chain(*annot_names))

    # output as BED4 format: chr, start, end, name
    for fx in range(len(bed_files)):
        bfile = bed_files[fx]
        with IOTools.openFile(bfile, "r") as ofile:
            intervals = Bed.iterator(ofile)
            track_name = [bx for bx in annot_names if re.search(bx, bfile)][0]
            for entry in intervals:
                entry.__setitem__("name", track_name)
                options.stdout.write("%s\t%s\t%s\t%s\n" % (entry.contig,
                                                           entry.start,
                                                           entry.end,
                                                           entry.name))

    # write footer and output benchmark information.
    E.Stop()
开发者ID:MikeDMorgan,项目名称:gwasenrichment_pipeline,代码行数:56,代码来源:merge_beds.py

示例9: countTagsInClusters

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def countTagsInClusters(bedfile, bamfile, outfile):

    bam = pysam.AlignmentFile(bamfile)

    outlines = []

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        interval = (bed.start, bed.end)
        counts = iCLIP.count_intervals(bam, [interval], bed.contig).sum()
        outlines.append(["%s:%i-%i" % (bed.contig, bed.start, bed.end), str(counts)])

    IOTools.writeLines(outfile, outlines, header=["position","count"])
开发者ID:CGATOxford,项目名称:UMI-tools_pipelines,代码行数:14,代码来源:PipelineUMI.py

示例10: buildQuicksectMask

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def buildQuicksectMask(bed_file):
    '''return Quicksect object containing the regions specified
       takes a bed file listing the regions to mask 
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(IOTools.openFile(bed_file)):
        # it is neccessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return(mask)
开发者ID:CGATOxford,项目名称:CGATPipelines,代码行数:17,代码来源:PipelineChipseq.py

示例11: buildAlignmentSizes

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
开发者ID:yangjl,项目名称:cgat,代码行数:17,代码来源:pipeline_metagenomebenchmark.py

示例12: sites2fragments

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def sites2fragments(infile, genomefile, outfile):
    '''Convert bedfile of deigestion sites into bedfile of fragments'''

    contig_lengths = {line.split()[0]: int(line.split()[1][:-1])
                      for line in IOTools.openFile(genomefile)}

    last_end = 0
    last_contig = None
    name = 0
    new_bed = Bed.Bed()
    new_bed["strand"] = "+"
    new_bed["score"] = "."
    with IOTools.openFile(outfile, "w") as outf:
        for bed in Bed.iterator(IOTools.openFile(infile)):
       
            if last_contig is not None and not bed.contig == last_contig:
                name += 1
                new_bed.start = last_end
                new_bed.contig = last_contig
                new_bed.end = contig_lengths[bed.contig]
                new_bed["name"] = str(name)

                outf.write(str(new_bed) + "\n")
        
                last_end = 0
                
            last_contig = bed.contig
            new_bed.contig = last_contig
            new_bed.start = last_end
            new_bed.end = bed.start
            name += 1
            new_bed["name"] = str(name)
            outf.write(str(new_bed) + "\n")
            last_end = bed.end

        name += 1
        new_bed.start = last_end
        new_bed.contig = last_contig
        new_bed.end = contig_lengths[bed.contig]
        new_bed["name"] = str(name)

        outf.write(str(new_bed) + "\n")

    pysam.tabix_index(outfile, force=True, preset="bed")
开发者ID:sudlab,项目名称:Capture_C,代码行数:46,代码来源:PipelineCaptureC.py

示例13: makeIntervalCorrelation

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)
    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
开发者ID:CGATOxford,项目名称:CGATPipelines,代码行数:43,代码来源:PipelineChipseq.py

示例14: main

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

#.........这里部分代码省略.........
开发者ID:CGATOxford,项目名称:cgat,代码行数:103,代码来源:bed2fasta.py

示例15: main

# 需要导入模块: from CGAT import Bed [as 别名]
# 或者: from CGAT.Bed import iterator [as 别名]

#.........这里部分代码省略.........
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.Samfile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(
            IOTools.openFile(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.iteritems()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.openFile(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.Samfile(options.filename_transcriptome,
                                            "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.Samfile("-", "wh", template=genome_samfile)
    else:
开发者ID:Q-KIM,项目名称:cgat,代码行数:70,代码来源:bams2bam.py


注:本文中的CGAT.Bed.iterator方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。