This article collects typical usage examples of the pysam.tabix_index method in Python. If you are wondering exactly how to call pysam.tabix_index, or are looking for working examples of it, the curated code samples below may help. You can also explore further usage examples of pysam, the module this method belongs to.
The following shows 15 code examples of the pysam.tabix_index method, sorted by popularity by default.
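Before turning to the examples, here is a minimal sketch of the basic call pattern (all file and contig names below are hypothetical). pysam.tabix_index expects a coordinate-sorted file; if the file is not already bgzip-compressed, pysam compresses it first, so the indexed data ends up at <name>.gz with the index next to it at <name>.gz.tbi.

import pysam

# Hypothetical input: a coordinate-sorted, uncompressed VCF.
vcf_path = "variants.vcf"

# Index with the built-in VCF preset; pysam bgzips the file if necessary,
# leaving "variants.vcf.gz" plus the index "variants.vcf.gz.tbi".
compressed_path = pysam.tabix_index(vcf_path, preset="vcf", force=True)

# Records can now be fetched by region through the index.
tbx = pysam.TabixFile(compressed_path)
for row in tbx.fetch("chr1", 0, 100000):
    print(row)
tbx.close()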
Example 1: addVariantSet
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def addVariantSet(
        self, variantFileName, dataset, referenceSet,
        ontology, biosamples):
    inputVcf = os.path.join(
        self.inputDirectory, variantFileName)
    outputVcf = os.path.join(
        self.outputDirectory, variantFileName)
    shutil.copy(inputVcf, outputVcf)
    # bgzip-compresses the copied VCF and writes the .gz.tbi index next to it
    pysam.tabix_index(outputVcf, preset="vcf")
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [os.path.abspath(outputVcf + ".gz")],
        [os.path.abspath(outputVcf + ".gz.tbi")])
    variantSet.checkConsistency()
    for callSet in variantSet.getCallSets():
        for biosample in biosamples:
            if biosample.getLocalId() == callSet.getLocalId():
                callSet.setBiosampleId(biosample.getId())
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
Example 2: run
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def run(argv):
    if '-h' in argv or '--help' in argv:
        print('Make a single large tabixed file of all phenotypes data')
        exit(1)

    if should_run():
        # we don't need `ffi.new('char[]', ...)` because args are `const`
        ret = lib.cffi_make_matrix(sites_filepath.encode('utf8'),
                                   common_filepaths['pheno']('*').encode('utf8'),
                                   matrix_gz_tmp_filepath.encode('utf8'))
        ret_bytes = ffi.string(ret, maxlen=1000)
        if ret_bytes != b'ok':
            raise PheWebError('The portion of `pheweb matrix` written in c++/cffi failed with the message ' + repr(ret_bytes))
        os.rename(matrix_gz_tmp_filepath, matrix_gz_filepath)
        pysam.tabix_index(
            filename=matrix_gz_filepath, force=True,
            seq_col=0, start_col=1, end_col=1  # note: these are 0-based, but `/usr/bin/tabix` is 1-based
        )
    else:
        print('matrix is up-to-date!')
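As the inline note in this example points out, the seq_col/start_col/end_col arguments of pysam.tabix_index are 0-based, while the command-line tabix options -s/-b/-e are 1-based. A small sketch for a generic tab-delimited file (the file name and column layout are hypothetical: chromosome, position, value, with one header line):

import pysam

path = "pheno_matrix.tsv"            # hypothetical tab-delimited file
gz_path = path + ".gz"

pysam.tabix_compress(path, gz_path, force=True)   # must be bgzip, not plain gzip
pysam.tabix_index(
    gz_path, force=True,
    seq_col=0, start_col=1, end_col=1,  # 0-based here; `tabix -s 1 -b 2 -e 2` is the CLI equivalent
    line_skip=1,                        # skip the single header line
)

If the coordinates in the file are 0-based, half-open (as in BED or bedGraph), zerobased=True can be passed as well, as Example 7 below does.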
Example 3: _make_vcf_and_read_depths_files
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def _make_vcf_and_read_depths_files(self):
    if not os.path.exists(self.ref_fa + '.fai'):
        pysam.faidx(self.ref_fa)
    tmp_vcf = self.vcf_file + '.tmp'
    with open(tmp_vcf, 'w') as f:
        print(pysam.mpileup(
            '-t', 'INFO/AD,INFO/ADF,INFO/ADR',
            '-L', '99999999',
            '-A',
            '-f', self.ref_fa,
            '-u',
            '-v',
            self.bam,
        ), end='', file=f)
    got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix, self.min_var_read_depth, self.min_second_var_read_depth, self.max_allele_freq)
    if got != 0:
        raise Error('Error parsing vcf file. Cannot continue')
    pysam.tabix_compress(self.outprefix + '.read_depths', self.read_depths_file)
    pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.outprefix + '.read_depths')
    os.unlink(tmp_vcf)
Example 4: ensureIndexed
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def ensureIndexed(bedPath, preset="bed", trySorting=True):
    if not bedPath.endswith(".gz"):
        if not os.path.exists(bedPath + ".gz"):
            logging.info("bgzf compressing {}".format(bedPath))
            pysam.tabix_compress(bedPath, bedPath + ".gz")
            if not os.path.exists(bedPath + ".gz"):
                raise Exception("Failed to create compressed {preset} file for {file}; make sure the {preset} file is "
                                "sorted and the directory is writeable".format(preset=preset, file=bedPath))
        bedPath += ".gz"
    if not os.path.exists(bedPath + ".tbi"):
        logging.info("creating tabix index for {}".format(bedPath))
        pysam.tabix_index(bedPath, preset=preset)
        if not os.path.exists(bedPath + ".tbi"):
            raise Exception("Failed to create tabix index file for {file}; make sure the {preset} file is "
                            "sorted and the directory is writeable".format(preset=preset, file=bedPath))

    line = next(pysam.Tabixfile(bedPath).fetch())
    if len(line.strip().split("\t")) < 6 and preset == "bed":
        raise AnnotationError("BED files need to have at least 6 (tab-delimited) fields (including "
                              "chrom, start, end, name, score, strand; score is unused)")
    if len(line.strip().split("\t")) < 9 and preset == "gff":
        raise AnnotationError("GFF/GTF files need to have at least 9 tab-delimited fields")

    return bedPath

# def sortFile(uncompressedPath, preset):
#     if preset == "bed":
#         fields = {"chrom": 0, "start": 1, "end": 2}
#     elif preset == "gff":
#         fields = {"chrom": 0, "start": 3, "end": 4}
#     sortCommand = "sort -k{chrom}V -k{start}n -k{end}n".format(**fields)
#     tabixCommand = "{sort} {path} | bgzip > {path}.gz".format(sort=sortCommand, path=uncompressedPath)
#     logging.info("Trying to sort input annotation file with command:")
#     logging.info("    {}".format(tabixCommand))
#     subprocess.check_call(tabixCommand, shell=True)
Example 5: get_ins
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def get_ins(args, bases = 50000, splitsize = 1000):
    """function to get insertions
    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1, args.cores - 1))
    out_handle = open(args.out + '.ins.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeIns, args = (write_queue, args.out))
    write_process.start()
    for j in sets:
        if args.smooth:
            tmp = pool1.map(_insHelperSmooth, zip(j, itertools.repeat(args)))
        else:
            tmp = pool1.map(_insHelper, zip(j, itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    # bgzip the finished bedgraph, drop the uncompressed copy, and tabix-index it with the BED preset
    pysam.tabix_compress(args.out + '.ins.bedgraph', args.out + '.ins.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.ins.bedgraph')
    pysam.tabix_index(args.out + '.ins.bedgraph.gz', preset = "bed", force = True)
Example 6: get_cov
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def get_cov(args, bases = 50000, splitsize = 1000):
    """function to get coverages
    """
    if not args.out:
        if args.bed is None:
            args.out = '.'.join(os.path.basename(args.bam).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
    if args.bed is None:
        chrs = read_chrom_sizes_from_bam(args.bam)
        chunks = ChunkList.convertChromSizes(chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool1 = mp.Pool(processes = max(1, args.cores - 1))
    out_handle = open(args.out + '.cov.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeCov, args = (write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool1.map(_covHelper, zip(j, itertools.repeat(args)))
        for track in tmp:
            write_queue.put(track)
    pool1.close()
    pool1.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.cov.bedgraph', args.out + '.cov.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.cov.bedgraph')
    pysam.tabix_index(args.out + '.cov.bedgraph.gz', preset = "bed", force = True)
Example 7: tabix_bedgraph
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def tabix_bedgraph(bedgraph):
    pysam.tabix_compress(bedgraph, bedgraph + '.gz')
    pysam.tabix_index(bedgraph + '.gz', seq_col=0, start_col=1, end_col=2, zerobased=True)
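Once the bedGraph has been compressed and indexed like this, intervals can be read back by region. A small sketch, assuming a hypothetical file coverage.bedgraph that has been run through tabix_bedgraph above:

import pysam

tbx = pysam.TabixFile("coverage.bedgraph.gz")    # hypothetical output of tabix_bedgraph()
for line in tbx.fetch("chr1", 10000, 20000):     # raw tab-separated lines overlapping the region
    chrom, start, end, value = line.split("\t")
    print(chrom, start, end, value)
tbx.close()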
Example 8: make_bias_track
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def make_bias_track(args, bases = 500000, splitsize = 1000):
    """function to compute bias track
    """
    if args.out is None:
        if args.bed is not None:
            args.out = '.'.join(os.path.basename(args.bed).split('.')[0:-1])
        else:
            args.out = '.'.join(os.path.basename(args.fasta).split('.')[0:-1])
    params = _BiasParams(args.fasta, args.pwm)
    if args.bed is None:
        chunks = ChunkList.convertChromSizes(params.chrs, splitsize = splitsize)
        sets = chunks.split(items = bases/splitsize)
    else:
        chunks = ChunkList.read(args.bed)
        chunks.checkChroms(params.chrs.keys())
        chunks.merge()
        sets = chunks.split(bases = bases)
    maxQueueSize = max(2, int(2 * bases / np.mean([chunk.length() for chunk in chunks])))
    pool = mp.Pool(processes = max(1, args.cores - 1))
    out_handle = open(args.out + '.Scores.bedgraph', 'w')
    out_handle.close()
    write_queue = mp.JoinableQueue(maxsize = maxQueueSize)
    write_process = mp.Process(target = _writeBias, args = (write_queue, args.out))
    write_process.start()
    for j in sets:
        tmp = pool.map(_biasHelper, zip(j, itertools.repeat(params)))
        for track in tmp:
            write_queue.put(track)
    pool.close()
    pool.join()
    write_queue.put('STOP')
    write_process.join()
    pysam.tabix_compress(args.out + '.Scores.bedgraph', args.out + '.Scores.bedgraph.gz', force = True)
    shell_command('rm ' + args.out + '.Scores.bedgraph')
    pysam.tabix_index(args.out + '.Scores.bedgraph.gz', preset = "bed", force = True)
Example 9: run_diff
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def run_diff(args, bases = 500000):
    """run differential occupancy calling
    """
    chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = args.flank + args.upper/2 + max(pwm.up, pwm.down))
    chunks.merge()
    maxQueueSize = max(2, int(100 * bases / np.mean([chunk.length() for chunk in chunks])))
    # get fragment sizes
    fragment_dist1 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist1.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes1).get(0, args.upper))
    fragment_dist1.modelNFR()
    fragment_dist2 = FragmentMixDistribution(0, upper = args.upper)
    fragment_dist2.fragmentsizes = FragmentSizes(0, args.upper, vals = FragmentSizes.open(args.sizes2).get(0, args.upper))
    fragment_dist2.modelNFR()
    # note: `fragment_dist` is not defined in this excerpt; the parameters are presumably meant to be built from fragment_dist1/fragment_dist2
    params = OccupancyParameters(fragment_dist, args.upper, args.fasta, args.pwm, sep = args.nuc_sep, min_occ = args.min_occ,
                                 flank = args.flank, bam = args.bam, ci = args.confidence_interval)
    sets = chunks.split(bases = bases)
    pool1 = mp.Pool(processes = max(1, args.cores - 1))
    diff_handle = open(args.out + '.occdiff.bed', 'w')
    diff_handle.close()
    diff_queue = mp.JoinableQueue()
    diff_process = mp.Process(target = _writeDiff, args = (diff_queue, args.out))
    diff_process.start()
    nuc_dist = np.zeros(args.upper)
    for j in sets:
        tmp = pool1.map(_occHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            diff_queue.put(result[1])
    pool1.close()
    pool1.join()
    diff_queue.put('STOP')
    diff_process.join()
    pysam.tabix_compress(args.out + '.occdiff.bed', args.out + '.occdiff.bed.gz', force = True)
    shell_command('rm ' + args.out + '.occdiff.bed')
    pysam.tabix_index(args.out + '.occdiff.bed.gz', preset = "bed", force = True)
Example 10: merge_vcfs
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def merge_vcfs(in_vcfs_dir, contigs, out_vcf):
    logger.info("Merging per-chromosome VCFs from %s" % in_vcfs_dir)
    header_done = False
    out_vcf_file = open(out_vcf, "w")
    for contig in contigs:
        chr_vcf = os.path.join(in_vcfs_dir, "%s.vcf.gz" % contig.name)
        if os.path.isfile(chr_vcf):
            chr_tabix_file = pysam.Tabixfile(chr_vcf)
            if not header_done:
                print_header(chr_tabix_file.header, out_vcf_file)
                header_done = True  # write the header only once
            for entry in chr_tabix_file.fetch():
                out_vcf_file.write("%s\n" % entry)
            chr_tabix_file.close()
    out_vcf_file.close()
    pysam.tabix_index(out_vcf, force=True, preset="vcf")
Example 11: convert_svtool_to_vcf
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def convert_svtool_to_vcf(file_name, sample, out_vcf, toolname, reference, sort=False, index=False):
    vcf_template_reader = get_template()
    vcf_template_reader.samples = [sample]
    vcf_fd = open(out_vcf, "w") if out_vcf is not None else sys.stdout
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)
    reference_handle = pysam.Fastafile(reference) if reference else None
    reference_contigs = get_contigs(reference)
    if sort:
        if not reference_contigs:
            logger.warn("Chromosomes will be sorted in lexicographic order since reference is missing")
        else:
            logger.info("Chromosomes will be sorted by the reference order")
    vcf_records = []
    for tool_record in tool_to_reader[toolname](file_name, reference_handle=reference_handle):
        vcf_record = tool_record.to_vcf_record(sample)
        if vcf_record is None:
            continue
        if sort:
            vcf_records.append(vcf_record)
        else:
            vcf_writer.write_record(vcf_record)
    if sort:
        if reference_contigs:
            contigs_order_dict = {contig.name: index for (index, contig) in enumerate(reference_contigs)}
            # Python 2-style comparator sort; the `cmp` keyword was removed in Python 3
            vcf_records.sort(
                cmp=lambda x, y: cmp((contigs_order_dict[x.CHROM], x.POS), (contigs_order_dict[y.CHROM], y.POS)))
        else:
            vcf_records.sort(cmp=lambda x, y: cmp((x.CHROM, x.POS), (y.CHROM, y.POS)))
        for vcf_record in vcf_records:
            vcf_writer.write_record(vcf_record)
    vcf_writer.close()
    if out_vcf and index:
        pysam.tabix_index(out_vcf, force=True, preset='vcf')
Example 12: convert_VariantFile_to_IndexedVariantFile
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def convert_VariantFile_to_IndexedVariantFile(vf_path, ivf_path):
    make_basedir(ivf_path)
    tmp_path = get_tmp_path(ivf_path)
    pysam.tabix_compress(vf_path, tmp_path, force=True)
    os.rename(tmp_path, ivf_path)
    pysam.tabix_index(
        filename=ivf_path, force=True,
        seq_col=0, start_col=1, end_col=1,  # note: `pysam.tabix_index` calls the first column `0`, but cmdline `tabix` calls it `1`.
        line_skip=1,  # skip header
    )
Example 13: _compress_and_index_file
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def _compress_and_index_file(infile, log_fh=None):
    if log_fh is not None:
        print('Compressing file', infile, file=log_fh, flush=True)
    pysam.tabix_compress(infile, infile + '.gz')
    pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1)
Example 14: simulate_vcf
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def simulate_vcf(self, out_prefix, mutation_rate,
                 recombination_rate, length,
                 chrom_name=1, ploidy=1, random_seed=None,
                 force=False, print_aa=True):
    out_prefix = os.path.expanduser(out_prefix)
    vcf_name = out_prefix + ".vcf"
    bed_name = out_prefix + ".bed"
    for fname in (vcf_name, bed_name):
        if not force and os.path.isfile(fname):
            raise FileExistsError(
                "{} exists and force=False".format(fname))
    if np.any(self.sampled_n % ploidy != 0):
        raise ValueError("Sampled alleles per population must be"
                         " integer multiple of ploidy")
    with open(bed_name, "w") as bed_f:
        print(chrom_name, 0, length, sep="\t", file=bed_f)
    with open(vcf_name, "w") as vcf_f:
        treeseq = self.simulate_trees(
            mutation_rate=mutation_rate,
            recombination_rate=recombination_rate,
            length=length, num_replicates=1,
            random_seed=random_seed)
        print("##fileformat=VCFv4.2", file=vcf_f)
        print('##source="VCF simulated by momi2 using'
              ' msprime backend"', file=vcf_f)
        print("##contig=<ID={chrom_name},length={length}>".format(
            chrom_name=chrom_name, length=length), file=vcf_f)
        print('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
              file=vcf_f)
        print('##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
              file=vcf_f)
        n_samples = int(np.sum(self.sampled_n) / ploidy)
        fields = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                  "FILTER", "INFO", "FORMAT"]
        for pop, n in zip(self.sampled_pops, self.sampled_n):
            for i in range(int(n / ploidy)):
                fields.append("{}_{}".format(pop, i))
        print(*fields, sep="\t", file=vcf_f)
        loc = next(treeseq)
        if print_aa:
            info_str = "AA=A"
        else:
            info_str = "."
        for v in loc.variants():
            gt = np.reshape(v.genotypes, (n_samples, ploidy))
            print(chrom_name, int(np.floor(v.position)),
                  ".", "A", "T", ".", ".", info_str, "GT",
                  *["|".join(map(str, sample)) for sample in gt],
                  sep="\t", file=vcf_f)
    pysam.tabix_index(vcf_name, preset="vcf", force=force)
Example 15: run_nuc
# Required import: import pysam [as alias]
# Or: from pysam import tabix_index [as alias]
def run_nuc(args):
    """run occupancy calling
    """
    vmat = VMat.open(args.vmat)
    if args.fasta:
        chrs = read_chrom_sizes_from_fasta(args.fasta)
    else:
        chrs = read_chrom_sizes_from_bam(args.bam)
    pwm = PWM.open(args.pwm)
    chunks = ChunkList.read(args.bed, chromDict = chrs, min_offset = vmat.mat.shape[1] + vmat.upper/2 + max(pwm.up, pwm.down) + args.nuc_sep/2, min_length = args.nuc_sep * 2)
    chunks.slop(chrs, up = args.nuc_sep/2, down = args.nuc_sep/2)
    chunks.merge()
    maxQueueSize = args.cores * 10
    if args.sizes is not None:
        fragment_dist = FragmentSizes.open(args.sizes)
    else:
        fragment_dist = FragmentSizes(0, upper = vmat.upper)
        fragment_dist.calculateSizes(args.bam, chunks)
    params = NucParameters(vmat = vmat, fragmentsizes = fragment_dist, bam = args.bam, fasta = args.fasta, pwm = args.pwm,
                           occ_track = args.occ_track,
                           sd = args.sd, nonredundant_sep = args.nuc_sep, redundant_sep = args.redundant_sep,
                           min_z = args.min_z, min_lr = args.min_lr, atac = args.atac)
    sets = chunks.split(items = args.cores * 5)
    pool1 = mp.Pool(processes = max(1, args.cores - 1))
    if args.write_all:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal', 'nucleoatac_signal.smooth',
                   'nucleoatac_background', 'nucleoatac_raw']
    else:
        outputs = ['nucpos', 'nucpos.redundant', 'nucleoatac_signal', 'nucleoatac_signal.smooth']
    handles = {}
    write_queues = {}
    write_processes = {}
    for i in outputs:
        if i not in ['nucpos', 'nucpos.redundant', 'nfrpos']:
            handles[i] = open(args.out + '.' + i + '.bedgraph', 'w')
        else:
            handles[i] = open(args.out + '.' + i + '.bed', 'w')
        handles[i].close()
        write_queues[i] = mp.JoinableQueue(maxsize = maxQueueSize)
        write_processes[i] = mp.Process(target = _writeFuncs[i], args = (write_queues[i], args.out))
        write_processes[i].start()
    for j in sets:
        tmp = pool1.map(_nucHelper, zip(j, itertools.repeat(params)))
        for result in tmp:
            for i in outputs:
                write_queues[i].put(result[i])
    pool1.close()
    pool1.join()
    for i in outputs:
        write_queues[i].put('STOP')
    for i in outputs:
        write_processes[i].join()
        if i not in ['nucpos', 'nucpos.redundant']:
            pysam.tabix_compress(args.out + '.' + i + '.bedgraph', args.out + '.' + i + '.bedgraph.gz', force = True)
            shell_command('rm ' + args.out + '.' + i + '.bedgraph')
            pysam.tabix_index(args.out + '.' + i + '.bedgraph.gz', preset = "bed", force = True)
        else:
            pysam.tabix_compress(args.out + '.' + i + '.bed', args.out + '.' + i + '.bed.gz', force = True)
            shell_command('rm ' + args.out + '.' + i + '.bed')
            pysam.tabix_index(args.out + '.' + i + '.bed.gz', preset = "bed", force = True)