本文整理汇总了Python中pybedtools.BedTool类的典型用法代码示例。如果您正苦于以下问题:Python BedTool类的具体用法?Python BedTool怎么用?Python BedTool使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BedTool类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: generate_bed_file_annotations
def generate_bed_file_annotations(bed_directory, output_directory, loci):
"""
Generates the annotation file for every bed file in the bed_directory folder
"""
# Loop over the bed files in the bed directory.
bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed"))
logging.info("Start to generate BED file annotations")
logging.info("Writing annotation to: {0}/".format(output_directory))
for locus in loci:
zscore = os.path.join(output_directory, locus)
bed_lines, rsids = _bed_from_zscore(zscore)
tmp_bed = open("tmp.bed","w").writelines(bed_lines)
snps = BedTool("tmp.bed")
no_snps = _get_line_number(zscore)
a_matrix= AnnotateLociMatrix(len(bed_file_list), no_snps)
logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore))
for beds in bed_file_list:
test_annotation = BedTool(beds)
inter = snps.intersect(test_annotation)
idxs = []
for inte in inter:
idxs.append(rsids.index(inte.name))
zeroes = np.zeros(len(rsids))
for idx in idxs:
zeroes[idx] = 1
a_matrix.add_annotation(zeroes, beds)
annotations_file = os.path.join(output_directory, locus + ".annotations")
logging.info("Writing annotation matrix to: {0}".format(annotations_file))
a_matrix.write_annotations(annotations_file)
os.remove("tmp.bed")
示例2: xstream
def xstream(a, b, distance, updown, out):
"""
find all things in b that are within
distance of a in the given direction
(up or down-stream)
"""
direction = dict(u="l", d="r")[updown[0]]
kwargs = {'sw':True, direction: distance}
if "l" in kwargs: kwargs["r"] = 0
else: kwargs["l"] = 0
a = BedTool(a).saveas()
kwargs['stream'] = True
c = a.window(b, **kwargs)
afields = a.field_count()
seen = collections.defaultdict(set)
for feat in c:
key = "\t".join(feat[:afields])
# keep track of all the feature names that overlap this one
seen[key].update((feat[afields + 3],))
# the entries that did appear in the window
for row in seen:
out.write(row + "\t" + ",".join(sorted(seen[row])) + "\n")
# write the entries that did not appear in the window'ed Bed
for row in a:
key = "\t".join(row[:afields])
if key in seen: continue
out.write(str(row) + "\t.\n")
out.flush()
assert len(BedTool(out.name)) == len(a)
示例3: get_coverage
def get_coverage(bed_prefix, directory, file_prefix, bam):
"""
Coverage at all positions is calculated. This is then used for coverage analysis and to determine read depth at any
false negative sites
:param bed_prefix: all regions in the bed files submitted are in a file generated during intersections
:param directory: location of patient results
:param file_prefix: prefix used for all files in pipeline i.e. worklist-patient
:return out: filename for coverage stats
"""
#TODO change BAM path so filename is not required
print 'Generating coverage stats.'
whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed'
out = directory + '/giab_results/whole_bed_coverage.txt'
command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \
' ' + bam + ' > ' + out + '.tmp'
try:
subprocess.check_call(command, shell=True)
except subprocess.CalledProcessError as e:
print 'Error executing command:' + str(e.returncode)
exit(1)
print 'Sambamba complete.'
#issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add
# them to the file at coverage 0
temp_bed = out.replace('.txt', '.bed.tmp')
command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed
print command
try:
subprocess.check_call(command, shell=True)
print 'BED coordinates extracted.'
except subprocess.CalledProcessError as e:
print 'Error executing command:' + str(e.returncode)
exit(1)
coverage_bed = BedTool(temp_bed)
print 'BED tool created'
whole_bedtool = BedTool(whole_bed)
print 'Intersecting'
missing_regions = whole_bedtool.intersect(coverage_bed, v=True)
missing_file = directory + '/giab_results/regions_missing'
missing_regions.moveto(missing_file)
print 'Generating file'
sample_split = file_prefix.split('-')
sample = sample_split[1] + '-' + sample_split[2]
command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add'
print command
try:
subprocess.check_call(command, shell=True)
except subprocess.CalledProcessError as e:
print 'Error executing command:' + str(e.returncode)
exit(1)
command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out
try:
subprocess.check_call(command, shell=True)
except subprocess.CalledProcessError as e:
print 'Error executing command:' + str(e.returncode)
exit(1)
print 'fix complete.'
return out
示例4: segmentations
def segmentations(vf, af):
v = BedTool(vf)
feats = BedTool(af)
results = {}
intersection = v.intersect(feats, wb=True)
if len(intersection) > 0:
sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
call(sort_cmd1, shell=True)
annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
for entry in annots:
regions = {}
regions[entry[4]] = entry[5]
results[entry.name] = Series(regions)
names = {
'CTCF': 'CTCF_REG',
'E': 'ENH',
'PF': 'TSS_FLANK',
'R': 'REP',
'T': 'TRAN',
'TSS': 'TSS',
'WE': 'WEAK_ENH'
}
return DataFrame(results, index=names.keys()).T.rename(columns=names)
示例5: filter_bed
def filter_bed(bedfile, snp_list, outfile=sys.stdout):
"""Filter a bedfile to only include snps in snp_list, print to outfile.
:bedfile: A bed file of all the SNPs, can be gzipped.
:snp_list: List/tuple/set/frozenset of snp names.
:outfile: Something .bed or .bed.gz, deault STDOUT.
:returns: 0 on success 1 on failure
"""
try:
from pybedtools import BedTool
except ImportError:
logme.log('pybedtools is not installed.\n' +
'Please install and try again. You can get it from here:\n' +
'https://github.com/daler/pybedtools',
level='error')
return -1
if not isinstance(snp_list, (tuple, list, set, frozenset)):
raise Exception('snp_list must be tuple/list/set/frozenset ' +
'it is: {}'.format(type(snp_list)))
bed = BedTool(bedfile)
filtered = bed.filter(lambda a: a.name in snp_list)
with open_zipped(outfile, 'w') as fout:
fout.write(str(filtered))
示例6: calculate_ovl
def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
nbedtool = BedTool(nbedfile)
obedtool = BedTool(obedfile)
ab = nbedtool.intersect(obedtool, wao=True, f=opts.f, r=opts.r, s=opts.s)
cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
sh(cmd, infile=ab.fn, outfile=scoresfile)
示例7: annotate_peaks
def annotate_peaks(notsif, beds, names):
"""Takes notsif, transforms to bed, and outputs annotation of where the
miRNA seed is interrogating via Cytoscape edge attribute file.
"""
strand = find_strand_from_filename(notsif)
mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)
# create the reference beds
reference = {}
for name, bed in izip(names, beds):
reference[name] = BedTool(bed)
for name in names:
# intersect the mirna bed with the reference annotations
for hit in mirna_bed.intersect(reference[name], s=True, stream=True):
# name field returned from notsif_to_bed is delimited by "|"
mirna_name = hit.name.split("|")[0]
gene_name = hit.name.split("|")[1]
# Cytoscape formatting
seed_length = "(%s)" % hit.score
fields = (mirna_name, seed_length, gene_name, "=", name)
print " ".join(map(str, fields))
示例8: _get
def _get(relative_path, genome=None):
"""
:param relative_path: relative path of the file inside the repository
:param genome: genome name. Can contain chromosome name after comma, like hg19-chr20,
in case of BED, the returning BedTool will be with added filter.
:return: BedTools object if it's a BED file, or filepath
"""
chrom = None
if genome:
if '-chr' in genome:
genome, chrom = genome.split('-')
check_genome(genome)
relative_path = relative_path.format(genome=genome)
path = abspath(join(dirname(__file__), relative_path))
if not isfile(path) and isfile(path + '.gz'):
path += '.gz'
if path.endswith('.bed') or path.endswith('.bed.gz'):
if path.endswith('.bed.gz'):
bedtools = which('bedtools')
if not bedtools:
critical('bedtools not found in PATH: ' + str(os.environ['PATH']))
debug('BED is compressed, creating BedTool')
bed = BedTool(path)
else:
debug('BED is uncompressed, creating BedTool')
bed = BedTool(path)
if chrom:
debug('Filtering BEDTool for chrom ' + chrom)
bed = bed.filter(lambda r: r.chrom == chrom)
return bed
else:
return path
示例9: gene_regions
def gene_regions(vf, af):
v = BedTool(vf)
feats = BedTool(af)
# first establish all the columns in the annotation file
cols = set(f[4] for f in feats)
results = {}
intersection = v.intersect(feats, wb=True)
if len(intersection) > 0:
annots = intersection.groupby(g=[1,2,3,4], c=9, ops='collapse')
for entry in annots:
regions = {}
for region in entry[4].split(','):
if region in regions:
regions[region] += 1
else:
regions[region] = 1
results[entry.name] = Series(regions)
df = DataFrame(results, index = cols)
return df.T.fillna(0)
示例10: calc_origin_bkgd_freqs
def calc_origin_bkgd_freqs(bedtool, strand, fasta_filename, verbose):
# add strand to bedtool
if strand == 'pos':
strand_char = '+'
elif strand == 'neg':
strand_char = '-'
intervals = []
for row in bedtool:
# input is BED6, output needs BED6
row.strand = strand_char
intervals.append(row)
stranded_bedtool = BedTool(intervals)
fastatool = stranded_bedtool.sequence(fi=fasta_filename, s=True)
kwargs = {'region_size_min':1,
'region_size_max':1,
'ignore_chroms':[],
'only_chroms':[],
'verbose':verbose}
if verbose:
print >>sys.stderr, ">> calculating background freqs ..."
result = calc_bkgd_counts(fastatool.seqfn, **kwargs)
return result
示例11: getNegativeDatasetFASTA
def getNegativeDatasetFASTA(config):
try:
coordinates = BedTool(config['negativesBedFile'])
genome = BedTool(config['maize_genome_filepath'])
dataset = coordinates.sequence(fi=genome, fo=config['negative_dataset_output'])
except ValueError:
print 'getNegativeDatasetFASTA; File ', config['maize_genome_filepath'], ' not found'
示例12: sequence_from_bedfile
def sequence_from_bedfile(fastafile, features=None, bedfile=None, pad5=0, pad3=0):
"""Fasta sequences from set of genomic features in a bed file
Args:
fastafile: fasta file with genomic sequence
features: dataframe of features/coords with bed file col names
bedfile: optionally provide a bed file instead
pad5,pad3: flanking sequence at 5' or 3' ends
Returns:
a pandas dataframe with name, sequence and coord columns"""
from pybedtools import BedTool
if bedfile != None:
features = utils.bed_to_dataframe(bedfile)
new = []
for n,r in features.iterrows():
if r.strand == '+':
coords = (r.chr,r.chromStart-pad5,r.chromEnd+pad3)
seq = str(BedTool.seq(coords, fastafile))
else: #reverse strand
coords = (r.chr,r.chromStart-pad3,r.chromEnd+pad5)
seq = str(BedTool.seq(coords, fastafile))
seq = HTSeq.Sequence(seq).get_reverse_complement()
#print n, coords, r['name']
new.append([r['name'],str(seq),coords])
new = pd.DataFrame(new, columns=['name','seq','coords'])
return new
示例13: main
def main():
p = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
p.add_argument('bed', help='bed with miRNA as name')
p.add_argument('--reference-beds', dest='reference', nargs='+',
help='reference beds for each feature to annotate')
p.add_argument('--names', nargs='+',
help='names corresponding to reference files')
args = p.parse_args()
if not args.names and not args.reference:
sys.exit(p.print_help())
bed = BedTool(args.bed)
# create the reference beds
reference = {}
for refname, refbed in izip(args.names, args.reference):
reference[refname] = BedTool(refbed)
for refname in args.names:
# intersect the mirna bed with the reference annotations
for b in bed.intersect(reference[refname], s=True, stream=True):
# Cytoscape formatting
fields = (b.name, "=", refname)
print " ".join(map(str, fields))
示例14: gene_regions
def gene_regions(vf, af):
v = BedTool(vf)
feats = BedTool(af)
# first establish all the columns in the annotation file
cols = set(f[4] for f in feats)
results = {}
intersection = v.intersect(feats, wb=True)
if len(intersection) > 0:
#sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn)
#call(sort_cmd1, shell=True)
tempfile1 = tempfile.mktemp()
sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
call(sort_cmd2, shell=True)
intersection = BedTool(tempfile1)
annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
for entry in annots:
regions = {}
regions[entry[4]] = entry[5]
results[entry.name] = Series(regions)
df = DataFrame(results, index = cols)
return df.T.fillna(0)
示例15: main
def main():
p = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
p.add_argument('peaks', help='peaks bed')
p.add_argument('exons', help='refseq exons from UCSC')
p.add_argument('gtf', help='refseq gtf with feature of interest')
p.add_argument('feature', help='feature of interest in the gtf')
p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
args = p.parse_args()
if args.verbose: sys.stderr.write(">> building exon library...\n")
exon_lib = make_exon_lib(args.exons)
peaks = BedTool(args.peaks)
exons = BedTool(args.exons)
full_ref = BedTool(args.gtf)
if args.verbose: sys.stderr.write(">> filtering for feature...\n")
filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
if args.verbose: sys.stderr.write(">> selecting exonic peaks...\n")
exonic_peaks = peaks.intersect(exons, wo=True)
if args.verbose: sys.stderr.write(">> calculating distance fractions...\n")
# D for distance (returns negative if upstream)
for peak in exonic_peaks.closest(filtered_ref, D="a"):
try:
p = ComplexLine(peak)
corrected_distance = 0.0
total_exon_length = 0.0
# parse gtf attrs
gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "')
# looking downstream wrt peak
if p.gtfdistance > 0:
# exon with peak
corrected_distance = p.exonstop - p.peakstop
for exon in exon_lib[p.exoninfo.name]:
# add downstream exon lengths
if exon > p.exoninfo.number:
corrected_distance += exon_lib[p.exoninfo.name][exon]
# looking upstream wrt peak
else:
# exon with peak
corrected_distance = p.peakstart - p.exonstart
for exon in exon_lib[p.exoninfo.name]:
# add upstream exon lengths
if exon < p.exoninfo.number:
corrected_distance += exon_lib[p.exoninfo.name][exon]
for exon in exon_lib[p.exoninfo.name]:
total_exon_length += exon_lib[p.exoninfo.name][exon]
# fraction
print (corrected_distance / total_exon_length)
except ValueError:
continue