Python pybedtools.BedTool类代码示例

本文整理汇总了Python中pybedtools.BedTool类的典型用法代码示例。如果您正苦于以下问题：Python BedTool类的具体用法？Python BedTool怎么用？Python BedTool使用的例子？那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了BedTool类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: generate_bed_file_annotations

def generate_bed_file_annotations(bed_directory, output_directory, loci):
    """
        Generates the annotation file for every bed file in the bed_directory folder
    """
    
    # Loop over the bed files in the bed directory.
    bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed"))
    logging.info("Start to generate BED file annotations")
    logging.info("Writing annotation to: {0}/".format(output_directory))
    for locus in loci:
        zscore = os.path.join(output_directory, locus) 
        bed_lines, rsids = _bed_from_zscore(zscore)
        tmp_bed = open("tmp.bed","w").writelines(bed_lines)
        snps = BedTool("tmp.bed")
        no_snps = _get_line_number(zscore)
        a_matrix= AnnotateLociMatrix(len(bed_file_list), no_snps)
        logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore))
        for beds in bed_file_list:
            test_annotation = BedTool(beds)
            inter = snps.intersect(test_annotation)
            idxs = []
            for inte in inter:
                idxs.append(rsids.index(inte.name))
            zeroes = np.zeros(len(rsids))
            for idx in idxs:
                zeroes[idx] = 1
            a_matrix.add_annotation(zeroes, beds)
        annotations_file = os.path.join(output_directory, locus + ".annotations")
        logging.info("Writing annotation matrix to: {0}".format(annotations_file))
        a_matrix.write_annotations(annotations_file)
        os.remove("tmp.bed")

开发者ID:theboocock，项目名称:fine_mapping_pipeline，代码行数:31，代码来源:annotation.py

示例2: xstream

def xstream(a, b, distance, updown, out):
    """
    find all things in b that are within
    distance of a in the given direction
    (up or down-stream)
    """
    direction = dict(u="l", d="r")[updown[0]]
    kwargs = {'sw':True, direction: distance}

    if "l" in kwargs: kwargs["r"] = 0
    else: kwargs["l"] = 0
    a = BedTool(a).saveas()

    kwargs['stream'] = True
    c = a.window(b, **kwargs)
    afields = a.field_count()

    seen = collections.defaultdict(set)
    for feat in c:
        key = "\t".join(feat[:afields])
        # keep track of all the feature names that overlap this one
        seen[key].update((feat[afields + 3],))

    # the entries that did appear in the window
    for row in seen:
        out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")

    # write the entries that did not appear in the window'ed Bed
    for row in a:
        key = "\t".join(row[:afields])
        if key in seen: continue
        out.write(str(row) + "\t.\n")
    out.flush()
    assert len(BedTool(out.name)) == len(a)

开发者ID:GunioRobot，项目名称:bio-playground，代码行数:34，代码来源:superanno.py

示例3: get_coverage

def get_coverage(bed_prefix, directory, file_prefix, bam):
    """
    Coverage at all positions is calculated. This is then used for coverage analysis and to determine read depth at any
    false negative sites
    :param bed_prefix: all regions in the bed files submitted are in a file generated during intersections
    :param directory: location of patient results
    :param file_prefix: prefix used for all files in pipeline i.e. worklist-patient
    :return out: filename for coverage stats
    """
    #TODO change BAM path so filename is not required
    print 'Generating coverage stats.'
    whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed'
    out = directory + '/giab_results/whole_bed_coverage.txt'
    command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \
              ' ' + bam + ' > ' + out + '.tmp'
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'Sambamba complete.'
    #issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add
    # them to the file at coverage 0
    temp_bed = out.replace('.txt', '.bed.tmp')
    command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed
    print command
    try:
        subprocess.check_call(command, shell=True)
        print 'BED coordinates extracted.'
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)


    coverage_bed = BedTool(temp_bed)
    print 'BED tool created'
    whole_bedtool = BedTool(whole_bed)
    print 'Intersecting'
    missing_regions = whole_bedtool.intersect(coverage_bed, v=True)
    missing_file = directory + '/giab_results/regions_missing'
    missing_regions.moveto(missing_file)
    print 'Generating file'
    sample_split = file_prefix.split('-')
    sample = sample_split[1] + '-' + sample_split[2]
    command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add'
    print command
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)

    command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'fix complete.'
    return out

开发者ID:sch-sdgs，项目名称:SDGSValidation，代码行数:60，代码来源:giab_comparison_freebayes.py

示例4: segmentations

def segmentations(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        call(sort_cmd1, shell=True)
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
        for entry in annots: 
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    names = {
        'CTCF': 'CTCF_REG', 
        'E':    'ENH', 
        'PF':   'TSS_FLANK', 
        'R':    'REP', 
        'T':    'TRAN', 
        'TSS':  'TSS', 
        'WE':   'WEAK_ENH'
    }

    return DataFrame(results, index=names.keys()).T.rename(columns=names)

开发者ID:tianxiahuihui，项目名称:bioinformatics，代码行数:26，代码来源:anno_info2.py

示例5: filter_bed

def filter_bed(bedfile, snp_list, outfile=sys.stdout):
    """Filter a bedfile to only include snps in snp_list, print to outfile.

    :bedfile:  A bed file of all the SNPs, can be gzipped.
    :snp_list: List/tuple/set/frozenset of snp names.
    :outfile:  Something .bed or .bed.gz, deault STDOUT.
    :returns:  0 on success 1 on failure

    """
    try:
        from pybedtools import BedTool
    except ImportError:
        logme.log('pybedtools is not installed.\n' +
                  'Please install and try again. You can get it from here:\n' +
                  'https://github.com/daler/pybedtools',
                  level='error')
        return -1

    if not isinstance(snp_list, (tuple, list, set, frozenset)):
        raise Exception('snp_list must be tuple/list/set/frozenset ' +
                        'it is: {}'.format(type(snp_list)))

    bed      = BedTool(bedfile)
    filtered = bed.filter(lambda a: a.name in snp_list)

    with open_zipped(outfile, 'w') as fout:
        fout.write(str(filtered))

开发者ID:rmagoglia，项目名称:ASEr，代码行数:27，代码来源:snps.py

示例6: calculate_ovl

def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, wao=True, f=opts.f, r=opts.r, s=opts.s)
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=ab.fn, outfile=scoresfile)

开发者ID:rrane，项目名称:jcvi，代码行数:7，代码来源:reformat.py

示例7: annotate_peaks

def annotate_peaks(notsif, beds, names):
    """Takes notsif, transforms to bed, and outputs annotation of where the 
    miRNA seed is interrogating via Cytoscape edge attribute file.
    """
    strand = find_strand_from_filename(notsif)

    mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)

    # create the reference beds
    reference = {}
    for name, bed in izip(names, beds):
        reference[name] = BedTool(bed)

    for name in names:

        # intersect the mirna bed with the reference annotations
        for hit in mirna_bed.intersect(reference[name], s=True, stream=True):

            # name field returned from notsif_to_bed is delimited by "|"
            mirna_name = hit.name.split("|")[0]
            gene_name = hit.name.split("|")[1]
            # Cytoscape formatting
            seed_length = "(%s)" % hit.score
            fields = (mirna_name, seed_length, gene_name, "=", name)
            print " ".join(map(str, fields))

开发者ID:brwnj，项目名称:cu_projects，代码行数:25，代码来源:peak_annotation.py

示例8: _get

def _get(relative_path, genome=None):
    """
    :param relative_path: relative path of the file inside the repository
    :param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
                   in case of BED, the returning BedTool will be with added filter.
    :return: BedTools object if it's a BED file, or filepath
    """
    chrom = None
    if genome:
        if '-chr' in genome:
            genome, chrom = genome.split('-')
        check_genome(genome)
        relative_path = relative_path.format(genome=genome)

    path = abspath(join(dirname(__file__), relative_path))
    if not isfile(path) and isfile(path + '.gz'):
        path += '.gz'

    if path.endswith('.bed') or path.endswith('.bed.gz'):
        if path.endswith('.bed.gz'):
            bedtools = which('bedtools')
            if not bedtools:
                critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
            debug('BED is compressed, creating BedTool')
            bed = BedTool(path)
        else:
            debug('BED is uncompressed, creating BedTool')
            bed = BedTool(path)

        if chrom:
            debug('Filtering BEDTool for chrom ' + chrom)
            bed = bed.filter(lambda r: r.chrom == chrom)
        return bed
    else:
        return path

开发者ID:vladsaveliev，项目名称:TargQC，代码行数:35，代码来源:__init__.py

示例9: gene_regions

def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        annots = intersection.groupby(g=[1,2,3,4], c=9, ops='collapse')

        for entry in annots:
            regions = {}
            for region in entry[4].split(','):  
                if region in regions:
                    regions[region] += 1
                else:
                    regions[region] = 1

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)

开发者ID:tianxiahuihui，项目名称:bioinformatics，代码行数:27，代码来源:anno_num.py

示例10: calc_origin_bkgd_freqs

def calc_origin_bkgd_freqs(bedtool, strand, fasta_filename, verbose):

    # add strand to bedtool
    if strand == 'pos':
        strand_char = '+'
    elif strand == 'neg':
        strand_char = '-'

    intervals = []
    for row in bedtool:
        # input is BED6, output needs BED6
        row.strand = strand_char
        intervals.append(row)

    stranded_bedtool = BedTool(intervals)

    fastatool = stranded_bedtool.sequence(fi=fasta_filename, s=True)

    kwargs = {'region_size_min':1,
              'region_size_max':1,
              'ignore_chroms':[],
              'only_chroms':[],
              'verbose':verbose}

    if verbose:
        print >>sys.stderr, ">> calculating background freqs ..."

    result = calc_bkgd_counts(fastatool.seqfn, **kwargs)

    return result

开发者ID:hesselberthlab，项目名称:modmap，代码行数:30，代码来源:origin_analysis.py

示例11: getNegativeDatasetFASTA

def getNegativeDatasetFASTA(config):
	try:
		coordinates = BedTool(config['negativesBedFile'])
		genome = BedTool(config['maize_genome_filepath'])
		dataset = coordinates.sequence(fi=genome, fo=config['negative_dataset_output'])
	except ValueError:
		print 'getNegativeDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'

开发者ID:Tay2510，项目名称:PyCorn，代码行数:7，代码来源:driver.py

示例12: sequence_from_bedfile

def sequence_from_bedfile(fastafile, features=None, bedfile=None, pad5=0, pad3=0):
    """Fasta sequences from set of genomic features in a bed file
        Args:
            fastafile: fasta file with genomic sequence
            features: dataframe of features/coords with bed file col names
            bedfile: optionally provide a bed file instead
            pad5,pad3: flanking sequence at 5' or 3' ends
        Returns:
            a pandas dataframe with name, sequence and coord columns"""

    from pybedtools import BedTool
    if bedfile != None:
        features = utils.bed_to_dataframe(bedfile)
    new = []
    for n,r in features.iterrows():
        if r.strand == '+':
            coords = (r.chr,r.chromStart-pad5,r.chromEnd+pad3)
            seq = str(BedTool.seq(coords, fastafile))
        else: #reverse strand
            coords = (r.chr,r.chromStart-pad3,r.chromEnd+pad5)
            seq = str(BedTool.seq(coords, fastafile))
            seq = HTSeq.Sequence(seq).get_reverse_complement()
        #print n, coords, r['name']
        new.append([r['name'],str(seq),coords])
    new = pd.DataFrame(new, columns=['name','seq','coords'])
    return new

开发者ID:dmnfarrell，项目名称:mirnaseq，代码行数:26，代码来源:utils.py

示例13: main

def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('bed', help='bed with miRNA as name')
    p.add_argument('--reference-beds', dest='reference', nargs='+', 
        help='reference beds for each feature to annotate')
    p.add_argument('--names', nargs='+', 
        help='names corresponding to reference files')
    args = p.parse_args()
    if not args.names and not args.reference:
        sys.exit(p.print_help())
    
    bed = BedTool(args.bed)
    
    # create the reference beds
    reference = {}
    for refname, refbed in izip(args.names, args.reference):
        reference[refname] = BedTool(refbed)
    
    for refname in args.names:
    
        # intersect the mirna bed with the reference annotations
        for b in bed.intersect(reference[refname], s=True, stream=True):
            # Cytoscape formatting
            fields = (b.name, "=", refname)
            print " ".join(map(str, fields))

开发者ID:brwnj，项目名称:cu_projects，代码行数:26，代码来源:node_annotation.py

示例14: gene_regions

def gene_regions(vf, af):
    v = BedTool(vf)
    feats = BedTool(af)
    
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)

    results = {}

    intersection = v.intersect(feats, wb=True)

    if len(intersection) > 0:
        #sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
        #call(sort_cmd1, shell=True)
        tempfile1 = tempfile.mktemp()
        sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(sort_cmd2, shell=True)
        intersection = BedTool(tempfile1)   
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')

        for entry in annots:
            regions = {}
            regions[entry[4]] = entry[5]

            results[entry.name] = Series(regions)

    df = DataFrame(results, index = cols)

    return df.T.fillna(0)

开发者ID:tianxiahuihui，项目名称:bioinformatics，代码行数:29，代码来源:anno_info2.py

示例15: main

def main():
    p = argparse.ArgumentParser(description=__doc__,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()
    
    if args.verbose: sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)
    
    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)
    
    if args.verbose: sys.stderr.write(">> filtering for feature...\n")
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
    
    if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n")
    exonic_peaks = peaks.intersect(exons, wo=True)
    
    if args.verbose: sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0
            # parse gtf attrs
            gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "')

            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
                        
            # looking upstream wrt peak
            else:
                # exon with peak
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            
            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]
            
            # fraction
            print (corrected_distance / total_exon_length)
        
        except ValueError:
            continue

开发者ID:brwnj，项目名称:cu_projects，代码行数:59，代码来源:relative_cluster_positions.py

注：本文中的pybedtools.BedTool类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。