This article collects typical usage examples of the Python method pybedtools.BedTool.sort. If you are wondering what BedTool.sort does, how to call it, or what real-world uses look like, the curated examples below should help. You can also explore further usage examples of the class it belongs to, pybedtools.BedTool.
The following shows 13 code examples of the BedTool.sort method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
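Before the examples, here is a minimal sketch of the pattern most of them follow: build a BedTool from a BED file, sort it, then merge the sorted intervals. The file names below ('regions.bed', 'regions.sorted.merged.bed') are placeholders and do not come from any of the examples.

from pybedtools import BedTool

bed = BedTool('regions.bed')                 # placeholder input BED file
sorted_bed = bed.sort()                      # coordinate sort (chromosome, then start)
merged = sorted_bed.merge()                  # merge() expects sorted input
merged.saveas('regions.sorted.merged.bed')   # persist the result to disk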
Example 1: generate_remainder
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def generate_remainder(whole_bed, bed_prefix, bed_list):
    """
    Calculate the remaining regions that are not included in the truth set
    :param whole_bed: path to the truth regions for the whole panel
    :param bed_prefix: prefix used for all the bed files
    :param bed_list: list of all the bed files for that panel
    :return: BedTool containing any regions that are completely missing from the truth regions
    """
    whole_truth = BedTool(whole_bed)
    whole_truth.saveas()
    whole = BedTool()
    for bed in bed_list:
        print(bed)
        tool = BedTool(bed)
        tool.saveas()
        if bed == bed_list[0]:
            whole = tool
        else:
            whole = whole.cat(tool)
        whole.saveas()
    whole_sorted = whole.sort()
    whole_merged = whole_sorted.merge()
    whole_merged.saveas()
    remainder = whole_merged.subtract(whole_truth)
    remainder.moveto('/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.remainder.bed')
    missing_regions = whole_merged.subtract(whole_truth, A=True)
    return missing_regions
Example 2: check_bed
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def check_bed(self, bed_file, stream):
    bed = BedTool(bed_file, from_string=stream)
    try:
        sorted_bed = bed.sort()
        merged_bed = sorted_bed.merge(c="4", o="distinct")
        return merged_bed
    except Exception as exception:
        print("ERROR: " + str(exception))
Example 3: readBedIntervals
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def readBedIntervals(bedPath, ncol=3,
                     chrom=None, start=None, end=None,
                     sort=False, ignoreBed12=True):
    """ Read bed intervals from a bed file (or a specified range therein).
    NOTE: intervals are sorted by their coordinates"""
    if not os.path.isfile(bedPath):
        raise RuntimeError("Bed interval file %s not found" % bedPath)
    assert ncol == 3 or ncol == 4 or ncol == 5
    outIntervals = []
    logger.debug("readBedIntervals(%s)" % bedPath)
    bedTool = BedTool(bedPath)
    if sort is True:
        bedTool = bedTool.sort()
        logger.debug("sortBed(%s)" % bedPath)
    if ignoreBed12 is False:
        bedTool = bedTool.bed6()
        logger.debug("bed6(%s)" % bedPath)
    if chrom is None:
        bedIntervals = bedTool
    else:
        assert start is not None and end is not None
        interval = Interval(chrom, start, end)
        logger.debug("intersecting (%s,%d,%d) and %s" % (chrom, start, end,
                                                         bedPath))
        # Below, we try switching from all_hits to intersect()
        # all_hits seems to leak a ton of memory for big files, so
        # we hope intersect (which creates a temp file) will be better
        #bedIntervals = bedTool.all_hits(interval)
        tempTool = BedTool(str(interval), from_string=True)
        bedIntervals = bedTool.intersect(tempTool)
        tempTool.delete_temporary_history(ask=False)
    logger.debug("appending bed intervals")
    for feat in bedIntervals:
        outInterval = (feat.chrom, feat.start, feat.end)
        if ncol >= 4:
            outInterval += (feat.name,)
        if ncol >= 5:
            outInterval += (feat.score,)
        outIntervals.append(outInterval)
    logger.debug("finished readBedIntervals(%s)" % bedPath)
    return outIntervals
Example 4: generate_remainder
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def generate_remainder(whole_bed, out_dir, bed_list):
    """
    Calculate the remaining regions that are not included in the truth set
    :param whole_bed: Path to the truth regions for the whole panel
    :type whole_bed: String
    :param out_dir: Output directory where the remainder BED file is written
    :type out_dir: String
    :param bed_list: List of all the bed files for that panel
    :type bed_list: List of String
    :return: BedTool containing any regions that are completely missing from the truth regions
    :rtype: BedTool
    """
    try:
        whole_truth = BedTool(whole_bed)
        whole_truth.saveas()
        whole = BedTool()
        for bed in bed_list:
            print(bed)
            tool = BedTool(bed)
            tool.saveas()
            if bed == bed_list[0]:
                whole = tool
            else:
                whole = whole.cat(tool)
            whole.saveas()
        whole_sorted = whole.sort()
        whole_merged = whole_sorted.merge()
        whole_merged.saveas()
        remainder = whole_merged.subtract(whole_truth)
        remainder.moveto(out_dir + '/remainder.bed')
        missing_regions = whole_merged.subtract(whole_truth, A=True)
    except UnicodeDecodeError:
        missing_regions = None
    return missing_regions
Example 5: exonunion
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def exonunion(args):
    """
    %prog exonunion gencode.v26.annotation.exon.bed

    Collapse overlapping exons within the same gene. File
    `gencode.v26.annotation.exon.bed` can be generated by:
    $ zcat gencode.v26.annotation.gtf.gz | awk 'OFS="\t" {if ($3=="exon")
    {print $1,$4-1,$5,$10,$12,$14,$16,$7}}' | tr -d '";'
    """
    p = OptionParser(exonunion.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gencodebed, = args
    beds = BedTool(gencodebed)
    # fields[3] is gene_id; fields[6] is gene_name
    for g, gb in groupby(beds, key=lambda x: x.fields[3]):
        gb = BedTool(gb)
        sys.stdout.write(str(gb.sort().merge(c="4,5,6,7",
                                             o=','.join(['first'] * 4))))
Example 6: consolidate
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def consolidate(nbedfile, obedfile, cbedfile):
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        if ";" in b.accn:
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print(b, file=fp)
    fp.close()
    os.remove(tmpfile)

    sort([cbedfile, "-i"])
Example 7: getMergedBedIntervals
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def getMergedBedIntervals(bedPath, ncol=3, sort=False, ignoreBed12=True):
    """ Merge all contiguous and overlapping intervals"""
    if not os.path.isfile(bedPath):
        raise RuntimeError("Bed interval file %s not found" % bedPath)
    logger.debug("mergeBedIntervals(%s)" % bedPath)
    outIntervals = []
    bedTool = BedTool(bedPath)
    if sort is True:
        bedTool = bedTool.sort()
        logger.debug("sortBed(%s)" % bedPath)
    if ignoreBed12 is False:
        logger.debug("bed6(%s)" % bedPath)
        bedTool = bedTool.bed6()
    for feat in bedTool.merge():
        outInterval = (feat.chrom, feat.start, feat.end)
        if ncol >= 4:
            outInterval += (feat.name,)
        if ncol >= 5:
            outInterval += (feat.score,)
        outIntervals.append(outInterval)
    logger.debug("finished mergeBedIntervals(%s)" % bedPath)
    return outIntervals
Example 8: print_split_sort_bed
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
split_region_list = []
for region in self.bed:
    pass

def print_split_sort_bed(self):
    pass
    split_num = 100
    split_region_list = [[] * 5]
    print(split_region_list)

    bed = BedTool('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed')
    bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100))
    bed.all_hits()  # note: all_hits() normally takes an Interval argument; the call is incomplete as written
    # x = BedTool().window_maker(genome='hg38', w=1000000)
    bed.saveas('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed')

    split_num = bed.count() if bed.count() < split_num else split_num
    print(bed.count() / split_num)
    # print bed.split(10, 'out')
    # print x
    n = 0
Example 9: vcf_to_long_bed
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def vcf_to_long_bed(*in_vcfs):
    intervals = bed_generator(in_vcfs)
    region_file = BedTool(intervals)
    return region_file.sort()
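Note that Example 9 relies on a helper, bed_generator, that is not part of the snippet. Purely as an illustration, a hypothetical version might parse each VCF and yield BED-formatted lines; pybedtools can build a BedTool from an iterable of such lines (Example 6 above relies on the same behaviour).

def bed_generator(vcf_paths):
    # Hypothetical helper, not from the original example: yields
    # "chrom<TAB>start<TAB>end" lines for every record in the given VCFs,
    # converting 1-based VCF positions to 0-based, half-open BED intervals.
    for path in vcf_paths:
        with open(path) as handle:
            for line in handle:
                if line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                chrom, pos, ref = fields[0], int(fields[1]), fields[3]
                start = pos - 1
                end = start + len(ref)
                yield '{0}\t{1}\t{2}'.format(chrom, start, end)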
Example 10: __init__
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
class Target:
    def __init__(self, work_dir, output_dir, fai_fpath, bed_fpath=None,
                 padding=None, reannotate=False, genome=None, is_debug=False):
        self.bed = None
        self.original_bed_fpath = None
        self.bed_fpath = None  # with genomic features
        self.capture_bed_fpath = None  # w/o genomic features
        self.qualimap_bed_fpath = None
        self.padded_bed_fpath = None

        self.gene_keys_set = set()  # set of pairs (gene_name, chrom)
        self.gene_keys_list = list()  # list of pairs (gene_name, chrom)
        self.regions_num = None
        self.bases_num = None
        self.fraction = None

        if bed_fpath:
            debug('Using target BED file ' + bed_fpath)
            self.is_wgs = False
            verify_bed(bed_fpath, is_critical=True)
            self.original_bed_fpath = bed_fpath
            self._make_target_bed(bed_fpath, work_dir, output_dir, padding=padding,
                                  is_debug=is_debug, fai_fpath=fai_fpath, genome=genome, reannotate=reannotate)
        else:
            debug('No input BED. Assuming whole genome. For region-based reports, analysing RefSeq CDS.')
            self.is_wgs = True
            self._make_wgs_regions_file(work_dir, genome=genome)

    def get_capture_bed(self):
        if not self.is_wgs:
            if self.bed.field_count() > ebl.BedCols.FEATURE:
                return self.bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'capture')
            else:
                return self.bed
        else:
            return None

    def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                         padding=None, fai_fpath=None, genome=None, reannotate=False):
        clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
        if not can_reuse(clean_target_bed_fpath, bed_fpath):
            debug()
            debug('Cleaning target BED file...')
            bed = BedTool(bed_fpath)
            if bed.field_count() > 4:
                bed = bed.cut(range(4))
            bed = bed\
                .filter(lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))\
                .remove_invalid()
            with file_transaction(work_dir, clean_target_bed_fpath) as tx:
                bed.saveas(tx)
            debug('Saved to ' + clean_target_bed_fpath)
            verify_file(clean_target_bed_fpath, is_critical=True)

        sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
        if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
            debug()
            debug('Sorting target BED file...')
            sort_target_bed_fpath = sort_bed(clean_target_bed_fpath, output_bed_fpath=sort_target_bed_fpath, fai_fpath=fai_fpath)
            debug('Saved to ' + sort_target_bed_fpath)
            verify_file(sort_target_bed_fpath, is_critical=True)

        if genome in ebl.SUPPORTED_GENOMES:
            ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
            if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
                debug()
                if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                    debug('Annotating target BED file and collecting overlapping genome features')
                    overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                          genome=genome, extended=True, reannotate=reannotate, only_canonical=True)
                else:
                    debug('Overlapping with genomic features:')
                    overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, work_dir=work_dir,
                                          genome=genome, extended=True, only_canonical=True)
                debug('Saved to ' + ann_target_bed_fpath)
                verify_file(ann_target_bed_fpath, is_critical=True)
        else:
            ann_target_bed_fpath = sort_target_bed_fpath

        final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
        if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
            bed = BedTool(ann_target_bed_fpath).remove_invalid()
            with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
                bed.saveas(tx)
                pass
        verify_file(final_clean_target_bed_fpath, is_critical=True)

        self.bed_fpath = final_clean_target_bed_fpath
        self.bed = BedTool(self.bed_fpath)

        self.capture_bed_fpath = add_suffix(join(output_dir, basename(bed_fpath)), 'clean_sorted_ann')
        if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
            with file_transaction(work_dir, self.capture_bed_fpath) as tx:
                self.get_capture_bed().saveas(tx)

        gene_key_set, gene_key_list = get_genes_from_bed(bed_fpath)
        self.gene_keys_set = gene_key_set
        self.gene_keys_list = gene_key_list
        self.regions_num = self.get_capture_bed().count()
#......... the rest of the code is omitted .........
Example 11: BedTool
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
    #sys.stderr.write("_"*140 + "\n")
    return None;

bed = BedTool(args.path);
if(not len(bed)):
    sys.exit('Provided bed file is empty. Interactions cannot be generated\n')

name2intervals = doublebed2dict(bed);

name2interaction = defaultdict(list);
interaction2name = defaultdict(list);

for c, i in enumerate(bed.sort().merge(s=True, d=args.distance, c='4,6', o='distinct', delim=';')):
    print(i.name)
    for name in i.name.split(";"):
        n = "|".join(name.split("|")[:-1])
        name2interaction[n].append(c);

for k, v in name2interaction.items():
    key = tuple(sorted(v));
    interaction2name[key].append(k)

number_chimeras = len(name2intervals);
number_interactions = 0
number_self_intersection = 0;
Example 12: run
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def run(chipdir, refseq, filedir, DMSO, CA):
    TSS = (-200, 1000)
    a = BedTool(chipdir)
    b = a.closest(refseq, d=True)
    b.cut([9, 10, 11, 12, 13, 14, 21]).saveas(filedir + "/SRF_closest.bed")

    d = dict()
    with open(filedir + "/SRF_closest.bed") as F:
        for line in F:
            line = line.strip().split()
            chrom, start, stop = line[0:3]
            d[chrom + "\t" + start + "\t" + stop + "\t"] = "\t".join(line[3:])

    outfile = open(filedir + "/SRF_closest.rmdup.bed", "w")
    for key in d:
        if "." not in key.split():
            outfile.write(key + d[key] + "\n")
    outfile.close()

    # os.system("sort -k1,1 -k2,2n " + filedir + "/SRF_closest.rmdup.bed > " + filedir + "/SRF_closest.rmdup.sorted.bed")
    a = BedTool(filedir + "/SRF_closest.rmdup.bed")
    a.sort().saveas(filedir + "/SRF_closest.rmdup.sorted.bed")

    outfile = open(filedir + "/SRF.TSS.bed", "w")
    outfile2 = open(filedir + "/SRF.gene.bed", "w")
    with open(filedir + "/SRF_closest.rmdup.sorted.bed") as F:
        for line in F:
            chrom, start, stop, gene, number, strand, distance = line.strip().split()
            if int(stop) - int(start) > 2000 and int(distance) > 10000:
                if strand == "+":
                    outfile.write(chrom + "\t" + str(int(start) + TSS[0]) + "\t" + str(int(start) + TSS[1]) + "\n")
                    outfile2.write(chrom + "\t" + str(int(start) + TSS[1]) + "\t" + stop + "\n")
                else:
                    outfile.write(chrom + "\t" + str(int(stop) - TSS[1]) + "\t" + str(int(stop) - TSS[0]) + "\n")
                    outfile2.write(chrom + "\t" + start + "\t" + str(int(stop) - TSS[1]) + "\n")
    outfile.close()
    outfile2.close()

    a = BedTool(filedir + "/SRF.TSS.bed")
    a.sort().saveas(filedir + "/SRF.TSS.bed")
    a = BedTool(filedir + "/SRF.gene.bed")
    a.sort().saveas(filedir + "/SRF.gene.bed")

    TSS = filedir + "/SRF.TSS.bed"
    genes = filedir + "/SRF.gene.bed"
    os.system("bedtools map -a " + genes + " -b " + DMSO + " -c 4 -o sum > " + filedir + "/DMSO.genes.bed")
    os.system("bedtools map -a " + TSS + " -b " + DMSO + " -c 4 -o sum > " + filedir + "/DMSO.TSS.bed")
    os.system("bedtools map -a " + genes + " -b " + CA + " -c 4 -o sum > " + filedir + "/CA.genes.bed")
    os.system("bedtools map -a " + TSS + " -b " + CA + " -c 4 -o sum > " + filedir + "/CA.TSS.bed")

    TRx = list()
    TRy = list()
    expressionlist = list()
    with open(filedir + "/DMSO.genes.bed") as a, open(filedir + "/DMSO.TSS.bed") as b, open(
        filedir + "/CA.genes.bed"
    ) as c, open(filedir + "/CA.TSS.bed") as d:
        for line in a:
            bline = b.readline().strip().split()[-1]
            cline = c.readline().strip().split()[-1]
            dline = d.readline().strip().split()[-1]
            if line.strip().split()[-1] == ".":
                DMSOgene = 0.0
            else:
                DMSOgene = float(line.strip().split()[-1])
            if bline == ".":
                DMSOTSS = 0.0
            else:
                DMSOTSS = float(bline)
            if cline == ".":
                CAgene = 0.0
            else:
                CAgene = float(cline)
            if dline == ".":
                CATSS = 0.0
            else:
                CATSS = float(dline)
            if DMSOgene == 0.0:
                TRx.append(0.0)
            else:
                TRx.append((DMSOTSS / DMSOgene))
            if CAgene == 0.0:
                TRy.append(0.0)
            else:
                TRy.append((CATSS / CAgene))
            expressionlist.append((np.log2(DMSOgene) + np.log2(CAgene)) / 2.0)

    F6 = plt.figure()
    ax1 = F6.add_subplot(111)
    xy = np.vstack([TRx, TRy])
    z = gaussian_kde(xy)(xy)
    ax1.scatter(TRx, TRy, c=z, edgecolor="")
    # ax1.scatter(TRx2,TRy2,c='red',edgecolor="",s=expressionlist2)
    ax1.set_title("Pausing Index")
    ax1.set_ylabel("CA")
    ax1.set_xlabel("DMSO")
    ax1.get_xaxis().tick_bottom()
    ax1.get_yaxis().tick_left()
    # ax1.plot([0,1/slope1],[intercept1,1],color = 'r')
    ax1.set_xlim([0, 20])
    ax1.set_ylim([0, 20])
    ax1.plot([0, 50.0], [0, 50.0], color="k")
    # ax1.text(8,18, "Pearson = " + str(pearsons)[0:5])
#......... the rest of the code is omitted .........
Example 13: isec_vcf
# Required import: from pybedtools import BedTool [as alias]
# Alternatively: from pybedtools.BedTool import sort [as alias]
def isec_vcf(out=None, v1=None, s1=None, v1_prep=None, v2=None, s2=None, bed=None, bam=None, ref=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', help='Location of output')
    parser.add_argument('-v1', help='VCF 1')
    parser.add_argument('-s1', help='Sample name (as appears in vcf 1)')
    parser.add_argument('-v1_prep', help='Does the reference VCF need to be prepared', default=True)
    parser.add_argument('-v2', help='VCF 2')
    parser.add_argument('-s2', help='Sample name (as appears in vcf 2), if omitted it will be the same as s1', default=None)
    parser.add_argument('-b', help='BED file to filter to')
    parser.add_argument('-bam', help='BAM file for vcf 2 sample to generate coverage stats from')
    parser.add_argument('-ref', help="Path to the Reference Genome used", default='/results/Pipeline/program/GATK_resource_bundle/ucsc.hg19.nohap.masked.fasta')
    args = parser.parse_args()

    if not v1:
        v1 = args.v1
    if not s1:
        s1 = args.s1
    if not v2:
        v2 = args.v2
    if not s2 and not args.s2:
        s2 = s1
    elif not s2:
        s2 = args.s2
    if not bed:
        bed = args.b
    if not bam:
        bam = args.bam
    if not v1_prep:
        v1_prep = args.v1_prep
    if not out:
        out = args.o
    if not ref:
        ref = args.ref

    if out.endswith('/'):
        out_dir = os.path.dirname(out)
    else:
        out_dir = out
    if not os.path.exists(out_dir):
        print("Output directory does not exist")
        exit(1)

    no_header = os.path.basename(bed).replace('.bed', '_noheader.bed')
    no_header = out_dir + '/' + no_header
    command = "grep -i -v start " + bed + " > " + no_header
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print('Error executing command: ' + str(e.returncode))
        exit(1)

    if os.path.basename(no_header) == "IEM_all_panels_header_noheader.bed":
        # IEM name column causes bedtools error
        tool = BedTool(no_header)
        sorted_tool = tool.sort()
        merged_tool = sorted_tool.merge()
        merged_tool.saveas(no_header.replace('.bed', '.merged.bed'))
        no_header = no_header.replace('.bed', '.merged.bed')
    print(no_header)

    # prepare vcfs
    if v1_prep != "False":
        v1_decomposed = prepare_vcf(v1, ref)
    else:
        print('no prep')
        v1_decomposed = v1
    if os.path.exists(v2.replace('.vcf', '.decomposed.normalised.vcf.gz')):
        v2_decomposed = v2.replace('.vcf', '.decomposed.normalised.vcf.gz')
    else:
        v2_decomposed = prepare_vcf(v2, ref)

    # isec
    command = '/results/Pipeline/program/bcftools-1.3.1/bcftools isec -R ' + \
              bed + ' -p ' + \
              out_dir + ' ' + \
              v1_decomposed + ' ' + \
              v2_decomposed
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print('Error executing command: ' + str(e.returncode))
        exit(1)

    # annotate
    coverage = get_coverage(no_header, out_dir, s2, bam)
    false_negs_ann = annotate_false_negs(out_dir, s1, coverage)
    false_pos_ann = annotate_false_pos(out_dir, coverage, s2)
    matching, mismatching_genotype = check_genotype(out_dir, s2, s1, coverage)

    false_negs_indels = len(false_negs_ann['indels'])
    false_negs_cov = len(false_negs_ann['no_coverage'])
    false_negs_ev_alt = len(false_negs_ann['evidence_of_alt'])
    false_neg_oth = len(false_negs_ann['false_neg'])
    false_negs = false_neg_oth + false_negs_cov + false_negs_ev_alt + false_negs_indels
    false_pos = len(false_pos_ann)
    num_mismatch = len(mismatching_genotype)
#......... the rest of the code is omitted .........