This article collects typical usage examples of the pysam.asVCF function in Python. If you are wondering how exactly to use asVCF, how to call it, or what it looks like in practice, the hand-picked code samples below may help.
The following presents 15 code examples of the asVCF function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
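Before the project code below, here is a minimal sketch of the pattern that most of these examples share: pysam.asVCF() is passed as the parser argument when fetching from a bgzip-compressed, tabix-indexed VCF, and each returned record exposes the fixed VCF columns as attributes (with pos converted to a zero-based coordinate) and the per-sample columns by index. The file name and region are placeholders.

import pysam

# Minimal sketch, assuming "example.vcf.gz" was compressed with bgzip and
# indexed with tabix (producing example.vcf.gz.tbi).
tbx = pysam.TabixFile("example.vcf.gz")
for rec in tbx.fetch("chr1", 10000, 20000, parser=pysam.asVCF()):
    # fixed columns are available as attributes; rec.pos is zero-based
    print(rec.contig, rec.pos + 1, rec.id, rec.ref, rec.alt, rec.qual, rec.filter)
    # genotype columns (everything after FORMAT) are indexed from 0
    if len(rec):
        print(rec[0])
tbx.close()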
Example 1: testFromTabix
def testFromTabix(self):
    # use ascii encoding - should raise error
    t = pysam.TabixFile(self.tmpfilename + ".gz", encoding="ascii")
    results = list(t.fetch(parser=pysam.asVCF()))
    self.assertRaises(UnicodeDecodeError, getattr, results[1], "id")

    t = pysam.TabixFile(self.tmpfilename + ".gz", encoding="utf-8")
    results = list(t.fetch(parser=pysam.asVCF()))
    self.assertEqual(getattr(results[1], "id"), u"Rene\xe9")
Example 2: fetch
def fetch(self, chrom, start, end):
    """ yield tuples (vcf object + info dict key->val) for a range
    vcf row attributes are:
        contig
        pos      chromosomal position, zero-based
        id
        ref      reference
        alt      alt
        qual     qual
        filter   filter
        info     info
        format   format specifier.
    """
    chrom = chrom.replace("chr", "")
    #fname = self.fnameDict[chrom]
    #vcf = pysam.VCF()
    #vcf.connect(fname)
    tbi = self.fhDict[chrom]
    it = tbi.fetch(chrom, start, end, parser=pysam.asVCF())
    for row in it:
        infoDict = {}
        infoStr = row.info
        for keyVal in infoStr.split(";"):
            if "=" not in keyVal:
                continue
            key, val = keyVal.split("=")
            infoDict[key] = val
        yield row, infoDict
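The same INFO-to-dict idea can be condensed into a standalone snippet. This is only a sketch; "calls.vcf.gz", the contig, and the coordinates are hypothetical, and the file is assumed to be bgzip-compressed and tabix-indexed.

import pysam

tbx = pysam.TabixFile("calls.vcf.gz")
for row in tbx.fetch("1", 100000, 200000, parser=pysam.asVCF()):
    # parse the INFO column into a dict, skipping flag-style entries
    info = dict(kv.split("=", 1) for kv in row.info.split(";") if "=" in kv)
    print(row.contig, row.pos + 1, row.ref, row.alt, info.get("DP"))
tbx.close()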
Example 3: testRead
def testRead(self):
    ncolumns = len(self.columns)

    for x, r in enumerate(self.tabix.fetch(parser=pysam.asVCF())):
        c = self.compare[x]
        for y, field in enumerate(self.columns):
            # it is ok to have a missing format column
            if y == 8 and y == len(c):
                continue
            if field == "pos":
                self.assertEqual(int(c[y]) - 1, getattr(r, field))
                self.assertEqual(int(c[y]) - 1, r.pos)
            else:
                self.assertEqual(c[y], getattr(r, field),
                                 "mismatch in field %s: %s != %s" %
                                 (field, c[y], getattr(r, field)))

        if len(c) == 8:
            self.assertEqual(0, len(r))
        else:
            self.assertEqual(len(c), len(r) + ncolumns)
            for y in range(len(c) - ncolumns):
                self.assertEqual(c[ncolumns + y], r[y])

        self.assertEqual("\t".join(map(str, c)), str(r))
Example 4: filter_vcfs
def filter_vcfs(genotype, contig, start, end):
    if contig in genotype.contigs:
        parser = pysam.asVCF()
        # This raises a ValueError if the VCF does not
        # contain any entries for the specified contig.
        for vcf in genotype.fetch(contig, start, end, parser=parser):
            if vcf.filter in ("PASS", "."):
                yield vcf
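A short sketch of how filter_vcfs() might be driven; the file name, contig, and coordinates are hypothetical, and the VCF is assumed to be bgzip-compressed and tabix-indexed.

import pysam

genotype = pysam.TabixFile("calls.vcf.gz")   # hypothetical input file
for record in filter_vcfs(genotype, "chr1", 0, 1000000):
    # only records whose FILTER column is PASS or "." reach this point
    print(record.contig, record.pos + 1, record.filter)
genotype.close()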
Example 5: fetch
def fetch(self,
          reference=None,
          start=None,
          end=None,
          region=None):
    """Parse a stream of VCF-formatted lines and return a generator of VCFRecord objects."""
    iterator = self.tabixfile.fetch(reference, start, end, region, parser=pysam.asVCF())
    for x in iterator:
        yield VCFRecord(x, self)
Example 6: check_nth_sample
def check_nth_sample(options, genotype):
    parser = pysam.asVCF()

    for contig in genotype.contigs:
        for record in text.parse_lines(genotype.fetch(contig), parser):
            if len(record) <= options.nth_sample:
                sys.stderr.write("ERROR: Sample %i selected with --nth-sample,"
                                 " but file only contains %i sample(s)!\n"
                                 % (options.nth_sample + 1, len(record)))
                return False
            return True

    return True
Example 7: records
def records(self, chromosome_num, start_pos_bp, end_pos_bp, parser=pysam.asVCF()):
    '''
        Returns an iterator for the file records.
    '''
    start_pos_bp = 1 if start_pos_bp is None else start_pos_bp
    end_pos_bp = self.clens[str(chromosome_num)] if end_pos_bp is None else end_pos_bp

    # subtract one since pysam uses incorrect 0-indexed positions
    records = self.tabix_file.fetch(str(chromosome_num), start_pos_bp + self.indexDelta,
                                    end_pos_bp + self.indexDelta, parser)
    return records
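The comment about zero-based positions reflects a general pysam/tabix convention: fetch() takes zero-based, half-open coordinates while VCF POS values are one-based, which is presumably what the indexDelta offset compensates for. A small sketch with hypothetical names:

import pysam

tbx = pysam.TabixFile("calls.vcf.gz")        # hypothetical indexed VCF
pos_1based = 12345
# fetch the single record at POS 12345: shift the start down by one
for rec in tbx.fetch("1", pos_1based - 1, pos_1based, parser=pysam.asVCF()):
    assert rec.pos == pos_1based - 1         # rec.pos is reported zero-based
tbx.close()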
Example 8: _read_files
def _read_files(filenames, args):
    in_header = True
    has_filters = False
    vcf_parser = pysam.asVCF()
    for line in fileinput.input(filenames):
        if not line.startswith("#"):
            in_header = False
            line = line.rstrip("\n\r")
            yield vcf_parser(line, len(line))
        elif in_header:
            if not (line.startswith("##") or has_filters):
                has_filters = True
                for item in sorted(vcffilter.describe_filters(args).items()):
                    print('##FILTER=<ID=%s,Description="%s">' % item)

            print(line, end="")
Example 9: __init__
def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()):
    TabixReader.__init__(self, inFile, parser=parser)
    assert ploidy in (1, 2)
    self.ploidy = ploidy
    self.clens = []
    self.sample_names = None
    for line in self.header:
        if line.startswith('##contig=<ID=') and line.endswith('>'):
            line = line[13:-1]
            c = line.split(',')[0]
            clen = int(line.split('=')[1])
            self.clens.append((c, clen))
        elif line.startswith('#CHROM'):
            row = line.split('\t')
            self.sample_names = row[9:]
    self.clens = dict(self.clens)
    assert self.sample_names
Example 10: testRead
def testRead( self ):
    ncolumns = len(self.columns)

    for x, r in enumerate(self.tabix.fetch( parser = pysam.asVCF() )):
        c = self.compare[x]
        for y, field in enumerate( self.columns ):
            if field == "pos":
                self.assertEqual( int(c[y]) - 1, getattr( r, field ) )
                self.assertEqual( int(c[y]) - 1, r.pos )
            else:
                self.assertEqual( c[y], getattr( r, field ),
                                  "mismatch in field %s: %s != %s" %
                                  ( field, c[y], getattr( r, field ) ) )
        self.assertEqual( len(c), len( r ) + ncolumns )
        for y in range(len(c) - ncolumns):
            self.assertEqual( c[ncolumns+y], r[y] )
Example 11: _get_hits
def _get_hits(coords, annotation, parser_type):
    """Retrieve BED information, recovering if BED annotation file does not have a chromosome.
    """
    if parser_type == "bed":
        parser = pysam.asBed()
    elif parser_type == "vcf":
        parser = pysam.asVCF()
    elif parser_type == "tuple":
        parser = pysam.asTuple()
    elif parser_type is None:
        parser = None
    else:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    return hit_iter
Example 12: GetSumOfDifferencesFromTheReference
def GetSumOfDifferencesFromTheReference(vcfpath):
    from subprocess import check_call
    from utilBMF.HTSUtils import TrimExt
    import pysam
    import numpy as np
    from sys import stderr
    from itertools import chain
    cfi = chain.from_iterable

    bgvcfpath = TrimExt(vcfpath) + ".gz"
    check_call("bgzip -c %s > %s" % (vcfpath, bgvcfpath), shell=True)
    stderr.write("bgvcf now at %s" % bgvcfpath)
    tabixstr = "tabix " + bgvcfpath
    stderr.write("Now calling tabixstr: '%s'" % tabixstr)
    check_call("tabix %s" % bgvcfpath, shell=True)

    infh = open(bgvcfpath, "rb")
    tabixhandle = pysam.tabix_iterator(infh, pysam.asVCF())
    return np.sum(np.array(list(cfi(
        [dict(tup.split("=") for tup in i.info.split(";"))['I16'].split(",")[2:4]
         for i in tabixhandle if "INDEL" not in i.info])), dtype=np.int64))
Example 13: testWrite
def testWrite(self):
    ncolumns = len(self.columns)

    for x, r in enumerate(self.tabix.fetch(parser=pysam.asVCF())):
        c = self.compare[x]
        # check unmodified string
        cmp_string = str(r)
        ref_string = "\t".join([x for x in c])
        self.assertEqual(ref_string, cmp_string)

        # set fields and compare field-wise
        for y, field in enumerate(self.columns):
            # it is ok to have a missing format column
            if y == 8 and y == len(c):
                continue
            if field == "pos":
                rpos = getattr(r, field)
                self.assertEqual(int(c[y]) - 1, rpos)
                self.assertEqual(int(c[y]) - 1, r.pos)
                # increment pos by 1
                setattr(r, field, rpos + 1)
                self.assertEqual(getattr(r, field), rpos + 1)
                c[y] = str(int(c[y]) + 1)
            else:
                setattr(r, field, "test_%i" % y)
                c[y] = "test_%i" % y
                self.assertEqual(c[y], getattr(r, field),
                                 "mismatch in field %s: %s != %s" %
                                 (field, c[y], getattr(r, field)))

        if len(c) == 8:
            self.assertEqual(0, len(r))
        else:
            self.assertEqual(len(c), len(r) + ncolumns)
            for y in range(len(c) - ncolumns):
                c[ncolumns + y] = "test_%i" % y
                r[y] = "test_%i" % y
                self.assertEqual(c[ncolumns + y], r[y])
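Outside the test harness, the same mutability could be used to tweak records before reprinting them. A minimal sketch, assuming a hypothetical indexed VCF; field values are handled as strings, and str() on the record should reflect the edits.

import pysam

tbx = pysam.TabixFile("calls.vcf.gz")        # hypothetical input
for rec in tbx.fetch("1", 0, 100000, parser=pysam.asVCF()):
    rec.id = "."                              # clear the ID column
    rec.qual = "50"                           # proxy fields are plain strings
    print(str(rec))                           # should print the edited VCF line
tbx.close()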
Example 14: __init__
def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()):
    # using old-style class superclass calling here
    # since TabixReader is derived from pysam.TabixFile
    # which is itself an old-style class (due to Cython version?)
    TabixReader.__init__(self, inFile, parser=parser)
    # when pysam uses new-style classes, we can replace with:
    # super(VcfReader, self).__init__(inFile, parser=parser)
    assert ploidy in (1, 2)
    self.ploidy = ploidy
    self.clens = []
    self.sample_names = None
    for line in self.header:
        line = bytes_to_string(line)
        if line.startswith('##contig=<ID=') and line.endswith('>'):
            line = line[13:-1]
            c = line.split(',')[0]
            clen = int(line.split('=')[1])
            self.clens.append((c, clen))
        elif line.startswith('#CHROM'):
            row = line.split('\t')
            self.sample_names = row[9:]
    self.clens = dict(self.clens)
    assert self.sample_names
Example 15: select_vcf_records
def select_vcf_records(bed_records, vcf_records):
    """Returns an iterable of VCF records, corresponding to the contents of each
    region specified by the BED records. Records are returned at most once, even
    if covered by multiple BED records."""
    contigs = frozenset(vcf_records.contigs)
    vcf_parser = pysam.asVCF()

    # Timer class used to report processing progress; meant primarily for BAM files
    progress = timer.BAMTimer(None)

    # Cache of positions observed for this contig, to prevent returning
    # positions in overlapping regions multiple times
    contig_cache = None
    contig_cache_name = None

    for bed in sorted(bed_records):
        if bed.contig not in contigs:
            # Skip contigs for which no calls have been made (e.g. due to
            # low coverage). Otherwise Pysam raises an exception.
            continue
        elif contig_cache_name != bed.contig:
            # Reset cache per contig, to save memory
            contig_cache = set()
            contig_cache_name = bed.contig

        for record in vcf_records.fetch(bed.contig, bed.start, bed.end, parser=vcf_parser):
            progress.increment()

            if record.pos in contig_cache:
                # We have already reported this VCF record
                continue
            contig_cache.add(record.pos)

            # Skip records filtered by VCF_filter
            if record.filter in ('.', "PASS"):
                yield record

    progress.finalize()