This article collects typical usage examples of the Python method svtools.vcf.file.Vcf.add_header. If you are unsure what Vcf.add_header does, how to call it, or how it is typically used, the curated code samples below may help. You can also explore the class this method belongs to, svtools.vcf.file.Vcf, for further usage examples.
The following shows 15 code examples of Vcf.add_header, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the site recommend better Python code samples.
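Before the collected examples, here is a minimal, hedged sketch of the pattern they all share: construct a Vcf object, pass it a list of raw header lines through add_header, then reuse the object to interpret data lines and emit a header. The header content below is illustrative only; the Vcf class and its add_header/get_header methods are the ones exercised in the examples that follow.

# Minimal usage sketch (illustrative, assumed header content).
from svtools.vcf.file import Vcf

header_lines = [
    '##fileformat=VCFv4.2',
    '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
    '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878',
]
vcf = Vcf()
vcf.add_header(header_lines)   # parse the header metadata into the Vcf object
print(vcf.get_header())        # emit the reconstructed header block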
Example 1: TestGenotype
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
class TestGenotype(TestCase):
    def setUp(self):
        header_lines = [
                '##fileformat=VCFv4.2',
                '##fileDate=20151202',
                '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
                '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
                '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
                '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
                '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
                '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
                '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878' ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:SU\t0/0:9'
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_format(self):
        g = Genotype(self.variant, '0/1')
        self.assertFalse('INACTIVE' in self.variant.active_formats)
        g.set_format('INACTIVE', 10)
        self.assertEqual(g.format['INACTIVE'], 10)
        self.assertTrue('INACTIVE' in self.variant.active_formats)

    def test_get_format(self):
        g = Genotype(self.variant, '0/1')
        g.set_format('INACTIVE', 10)
        self.assertEqual(g.get_format('INACTIVE'), 10)

    def test_get_gt_string(self):
        g = Genotype(self.variant, '0/1')
        g.set_format('INACTIVE', 10)
        self.assertEqual(g.get_gt_string(), '0/1:.:10')
Example 2: test_all
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def test_all(self):
    header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##source=myImputationProgramV3.1',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>',
            '##phasing=partial',
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
            '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
            '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
            '##ALT=<ID=DEL,Description="DELETION">',
            '##FILTER=<ID=q10,Description="Quality below 10">',
            '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA00001\tNA00002\tNA00003']
    v = Vcf()
    v.add_header(header_lines)
    expected_header_lines = header_lines
    expected_header_lines[1] = '##fileDate=' + time.strftime('%Y%m%d')
    self.assertEqual(v.get_header(), '\n'.join(expected_header_lines))
    v.add_sample('ScottPilgrim')
    self.assertEqual(v.sample_to_col('ScottPilgrim'), 12)
Example 3: test_add_genotype
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def test_add_genotype(self):
    header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878' ]
    vcf = Vcf()
    vcf.add_header(header_lines)
    variant_line = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tSU\t9'
    variant = Variant(variant_line.split('\t'), vcf)
    self.assertEqual(variant.get_gt_string(), './.:9')
Example 4: run_gt_refine
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    vcf = Vcf()
    header = []
    in_header = True
    sex={}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])
    outf=open(diag_outfile, 'w', 4096)
    ct=1
    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
                #vcf.add_format('SIL_GT', '1', 'Float', 'Per-sample genotype cluster silhouette')
                vcf_out.write(vcf.get_header() + '\n')
        var = Variant(line.rstrip().split('\t'), vcf)
        df=load_df(var, sex)
        df1=get_silhouette(df)
        sil_avg=df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
        #sil_ind=df1.loc[:, 'sil_gt']
        var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        if ct==1:
            df1.to_csv(outf, header=True)
            ct += 1
        else:
            df1.to_csv(outf, header=False)
    vcf_out.close()
    vcf_in.close()
    outf.close()
    gender_file.close()
    return
Example 5: VCFReader
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
class VCFReader(object):
    def __init__(self, stream):
        self.vcf_obj = Vcf()
        self.stream = stream
        header = list()
        for line in stream:
            if line[0] != '#':
                raise RuntimeError('Error parsing VCF header. Line is not a header line. {}'.format(line))
            header.append(line)
            if line.startswith('#CHROM\t'):
                # end of header
                break
        self.vcf_obj.add_header(header)

    def __iter__(self):
        for line in self.stream:
            yield Variant(line.rstrip().split('\t'), self.vcf_obj)
Example 6: write_copynumber
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    #go through the VCF and add the read depth annotations
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1
    for line in vcf_file:
        if in_header:
            if line[0] == '#' and line[1] == '#':
                header.append(line)
                continue
            if line[0] == '#' and line[1] != '#':
                try:
                    s_index = line.rstrip().split('\t').index(sample)
                except ValueError:
                    sys.stderr.write("Please input valid VCF, format field for " + sample + " not found in VCF")
                    sys.exit(1)
                line = '\t'.join(map(str, line.rstrip().split('\t')[:9] + [sample]))
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_format('CN', 1, 'Float', 'Copy number of structural variant segment.')
                vcf_out.write(vcf.get_header() + '\n')
        v = line.rstrip().split('\t')
        # XXX Is this second check necessary? Wouldn't this be handled above? Missing header would hit this?
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample + " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            if "CN" not in v[8]:
                v[8] = v[8] + ":CN"
                v[9] = v[9] + ":" + str(cn_list[i])
            else:
                cn_index = v[8].rstrip().split(":").index("CN")
                gts = v[9].rstrip().split(":")
                gts[cn_index] = str(cn_list[i])
                v[9] = ":".join(gts)
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')
    vcf_out.close()
    return
Example 7: TestVariant
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
class TestVariant(TestCase):
    def setUp(self):
        header_lines = [
                '##fileformat=VCFv4.2',
                '##fileDate=20151202',
                '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
                '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
                '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
                '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
                '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
                '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
                '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878' ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:SU\t0/0:9'
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_info(self):
        self.variant.set_info('SVTYPE', 'INV')
        self.assertEqual(self.variant.info['SVTYPE'], 'INV')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.info['IMAFLAG'], False)
        with self.assertRaises(SystemExit) as cm:
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError) as cm:
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), 'GT:SU')

    def test_genotype(self):
        self.assertEqual(self.variant.genotype('NA12878').get_gt_string(), '0/0:9')

    def test_var_string(self):
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
Example 8: bedpeToVcf
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def bedpeToVcf(bedpe_file, vcf_out):
    myvcf = Vcf()
    converter = BedpeToVcfConverter(myvcf)
    in_header = True
    # parse the bedpe data
    header = list()
    for line in bedpe_file:
        if in_header:
            if line[0:2] == '##':
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list_str = line.rstrip().split('\t', 20)[-1]
                header.append('\t'.join([
                    '#CHROM',
                    'POS',
                    'ID',
                    'REF',
                    'ALT',
                    'QUAL',
                    'FILTER',
                    'INFO',
                    sample_list_str
                    ] ))
                continue
            else:
                in_header = False
                myvcf.add_header(header)
                myvcf.file_format='VCFv4.2'
                vcf_out.write(myvcf.get_header() + '\n')
        #
        bedpe = Bedpe(line.rstrip().split('\t'))
        variants = converter.convert(bedpe)
        for v in variants:
            vcf_out.write(v.get_var_string() + '\n')
    # close the VCF output file and header if no variants found
    if in_header == True:
        myvcf.add_header(header)
        myvcf.file_format='VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')
    vcf_out.close()
    return
Example 9: test_var_string_format_caching
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def test_var_string_format_caching(self):
    header_lines = [
            "##fileformat=VCFv4.2",
            "##fileDate=20151202",
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878",
            ]
    vcf = Vcf()
    vcf.add_header(header_lines)
    variant_line = "1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:AS:SU\t0/0:1:9"
    uncached_line = "1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:SU:AS\t0/0:9:1"
    variant = Variant(variant_line.split("\t"), vcf)
    gt = variant.genotypes()  # force parsing
    self.assertEqual(variant.get_var_string(), uncached_line)
    self.assertEqual(variant.get_var_string(use_cached_gt_string=True), variant_line)
Example 10: test_add_info_after
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def test_add_info_after(self):
    header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA00001\tNA00002\tNA00003']
    extra_line = '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">'
    v = Vcf()
    v.add_header(header_lines)
    v.add_info_after('DP', 'DB', 0, 'Flag', 'dbSNP membership, build 129')
    expected_lines = header_lines[0:4] + [extra_line] + header_lines[4:]
    expected_lines[1] = '##fileDate=' + time.strftime('%Y%m%d')
    self.assertEqual(v.get_header(), '\n'.join(expected_lines))

    v2 = Vcf()
    v2.add_header(header_lines)
    v2.add_info_after('AF', 'DB', 0, 'Flag', 'dbSNP membership, build 129')
    expected_lines2 = header_lines[0:5] + [extra_line] + header_lines[5:]
    expected_lines2[1] = '##fileDate=' + time.strftime('%Y%m%d')
    self.assertEqual(v2.get_header(), '\n'.join(expected_lines2))
Example 11: sv_classify
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold):
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10
    gender = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        gender[v[0]] = int(v[1])
    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                # write the output header
                vcf_out.write(vcf.get_header() + '\n')
        # split variant line, quick pre-check if the SVTYPE is BND, and skip if so
        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue
        # parse the VCF line
        var = Variant(v, vcf, True)
        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue
        # # write to directory
        # writedir = 'data/r11.100kb.dup'
        # annotate based on read depth
        if var.info['SVTYPE'] in ['DEL', 'DUP']:
            # count the number of positively genotyped samples
            num_pos_samps = 0;
            for s in var.sample_list:
                if s in exclude:
                    continue
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1
            if num_pos_samps < min_pos_samps_for_regression:
                if has_low_freq_depth_support(var, gender, exclude):
                    # has_low_freq_depth_support(var, gender, exclude, writedir + '/low_freq_rd')
                    # has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/low_freq_rd')
                    # write variant
                    #vcf_out.write(var.get_var_string(True) + '\n')
                    vcf_out.write(line)
                else:
                    # has_low_freq_depth_support(var, gender, exclude, writedir + '/low_freq_no_rd')
                    # has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/low_freq_no_rd')
                    for m_var in to_bnd_strings(var):
                        vcf_out.write(m_var + '\n')
            else:
                if has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold):
                    # has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/high_freq_rd')
                    # has_low_freq_depth_support(var, gender, exclude, writedir + '/high_freq_rd')
                    # write variant
                    #vcf_out.write(var.get_var_string(True) + '\n')
                    vcf_out.write(line)
                else:
                    # has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/high_freq_no_rd')
                    # has_low_freq_depth_support(var, gender, exclude, writedir + '/high_freq_no_rd')
                    for m_var in to_bnd_strings(var):
                        vcf_out.write(m_var + '\n')
    vcf_out.close()
    return
Example 12: sv_classify
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, het_del_fit, hom_del_fit, params, diag_outfile):
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10
    sex = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])
    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())
    if diag_outfile is not None:
        outf=open(diag_outfile, 'w', 4096)
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf_out.write(vcf.get_header() + '\n')
        # split variant line, quick pre-check if the SVTYPE is BND, and skip if so
        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue
        # parse the VCF line
        var = Variant(v, vcf, True)
        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue
        # for now, don't worry about sex chromosomes
        if (var.chrom == 'X' or var.chrom == 'Y'):
            vcf_out.write(line)
            continue
        #count positively genotyped samples
        num_pos_samps = 0;
        for s in var.sample_list:
            if s in exclude:
                continue
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1
        high_freq_support = False
        low_freq_support = False
        nb_support = False
        if num_pos_samps == 0:
            vcf_out.write(line)
        else:
            df=load_df(var, exclude, sex)
            if has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params):
                nb_support = True
            if num_pos_samps < min_pos_samps_for_regression:
                if has_low_freq_depth_support(df):
                    low_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')
            else:
                if has_high_freq_depth_support(df, slope_threshold, rsquared_threshold):
                    high_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')
#......... (remainder of this code omitted) .........
Example 13: calc_params
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def calc_params(vcf_file):
    tSet = list()
    epsilon=0.1
    header=[]
    in_header = True
    vcf = Vcf()
    for line in vcf_file:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    vcf_samples = line.rstrip().split('\t')[9:]
                    in_header = False
                    vcf.add_header(header)
                continue
        else:
            # split variant line, quick pre-check if the SVTYPE is BND, and skip if so
            v = line.rstrip().split('\t')
            info = v[7].split(';')
            svtype = None
            for x in info:
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            if svtype not in ['DEL', 'DUP'] or v[0]=="X" or v[0]=="Y":
                continue
            var = Variant(v, vcf)
            for sample in vcf_samples:
                if var.gts[sample].get_format('GT') != './.':
                    log2r = math.log((float(var.gts[sample].get_format('CN'))+ epsilon)/2,2) #to avoid log(0)
                    tSet.append(CN_rec1(var.var_id, sample, var.info['SVTYPE'], abs(float(var.info['SVLEN'])), var.info['AF'],
                                        var.gts[sample].get_format('GT'), var.gts[sample].get_format('CN'), var.gts[sample].get_format('AB'), math.log(abs(float(var.info['SVLEN']))), log2r))
    df=pd.DataFrame(tSet, columns=CN_rec1._fields)
    df['q_low']=df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df['q_high']=df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df=df[(df.log2r>=df.q_low) & (df.log2r<=df.q_high)]
    df.to_csv('./train.csv')
    #adjust copy number for small deletions (<1kb), no strong relationship b/w cn and size for dups evident so far
    small_het_dels = df[(df.svtype=="DEL") & (df.GT=="0/1") & (df.svlen<1000) & (df.svlen>=100)]
    small_hom_dels = df[(df.svtype=="DEL") & (df.GT=="1/1") & (df.svlen<1000) & (df.svlen>=100)]
    het_del_mean=np.mean(df[(df.svlen>1000) & (df.GT=="0/1") & (df.svtype=="DEL")]['log2r'])
    hom_del_mean=np.mean(df[(df.svlen>1000) & (df.GT=="1/1") & (df.svtype=="DEL")]['log2r'])
    small_het_dels['offset']=small_het_dels['log2r']-het_del_mean
    small_hom_dels['offset']=small_hom_dels['log2r']-hom_del_mean
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit=smf.ols('offset~log_len',small_hom_dels).fit()
        het_del_fit=smf.ols('offset~log_len',small_het_dels).fit()
        print hom_del_fit.summary()
        print het_del_fit.summary()
        small_hom_dels['log2r_adj'] = small_hom_dels['log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels['log2r_adj'] = small_het_dels['log2r'] - het_del_fit.predict(small_het_dels)
    small_dels=small_hom_dels.append(small_het_dels)
    small_dels=small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]
    # dels of length<100 bp are excluded here
    df1=df[(df.svtype!="DEL") | (df.GT=="0/0") | (df.svlen>=1000)]
    df1['log2r_adj']=df1['log2r']
    df1=df1.append(small_dels)
    params=df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean,np.var, len]).reset_index()
    params=pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()
    params.columns=['sample', 'svtype', 'mean0', 'mean1', 'mean2', 'var0', 'var1', 'var2', 'len0', 'len1', 'len2']
    params['std_pooled']=np.sqrt((params['var0']*params['len0']+params['var1']*params['len1']+params['var2']*params['len2'])/(params['len0']+params['len1']+params['len2']))
    params.to_csv('./params.csv')
    return (params, het_del_fit, hom_del_fit)
Example 14: varLookup
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    bList = list()
    headerObj=Vcf() #co-opt the VCF header object
    if cohort_name is None:
        cohort_name=str(str(bFile).split('/')[-1])
    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')
    for bLine in bData:
        if bLine.startswith(pass_prefix):
            continue
        bentry = Bedpe(bLine.rstrip().split('\t'))
        if bentry.af is None:
            sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
            sys.exit(1)
        bList.append(bentry)

    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')

    in_header=True
    header_lines = []
    sample_list = None
    for aLine in aData:
        if pass_prefix is not None and aLine.startswith(pass_prefix):
            if aLine[0] == '#' and aLine[1] != '#':
                sample_list = aLine.rstrip().split('\t', 14)[-1]
            else:
                header_lines.append(aLine)
            continue
        else:
            if in_header == True:
                headerObj.add_header(header_lines)
                headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + str(str(bFile).split('/')[-1]) + ')' )
                headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + str(str(bFile).split('/')[-1]) + ')' )
                header = headerObj.get_header()
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                if len(sample_list) > 0:
                    bedpe_out.write('\t'.join(['#CHROM_A',
                                               'START_A',
                                               'END_A',
                                               'CHROM_B',
                                               'START_B',
                                               'END_B',
                                               'ID',
                                               'QUAL',
                                               'STRAND_A',
                                               'STRAND_B',
                                               'TYPE',
                                               'FILTER',
                                               'INFO_A', 'INFO_B',
                                               sample_list]
                                              ) + '\n')
                else:
                    bedpe_out.write('\t'.join(['#CHROM_A',
                                               'START_A',
                                               'END_A',
                                               'CHROM_B',
                                               'START_B',
                                               'END_B',
                                               'ID',
                                               'QUAL',
                                               'STRAND_A',
                                               'STRAND_B',
                                               'TYPE',
                                               'FILTER',
                                               'INFO_A', 'INFO_B']
                                              ) + '\n')
                in_header=False
            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            for b in bList:
                add(a,b,max_distance)
            bedpe_out.write(get_var_string(a, cohort_name))
Example 15: run_gt_refine
# Required module: from svtools.vcf.file import Vcf [as alias]
# Or: from svtools.vcf.file.Vcf import add_header [as alias]
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    vcf = Vcf()
    header = []
    in_header = True
    sex={}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])
    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())
    outf=open(diag_outfile, 'w', 4096)
    ct=1
    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
                vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
                vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
                vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
                vcf_out.write(vcf.get_header() + '\n')
        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL']:
            vcf_out.write(line)
            continue
        var = Variant(v, vcf)
        sys.stderr.write("%s\n" % var.var_id)
        sys.stderr.write("%f\n" % float(var.get_info('AF')))
        if float(var.get_info('AF'))<0.01:
            vcf_out.write(line)
        else:
            df=load_df(var, exclude, sex)
            recdf=recluster(df)
            if ct==1:
                recdf.to_csv(outf, header=True)
                ct += 1
            else:
                recdf.to_csv(outf, header=False)
            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0,:].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0,:].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                if s in recdf.index:
                    var.genotype(s).set_format("GTR", recdf.loc[s,'GTR'])
                    var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s,'gq_re']))
                else:
                    var.genotype(s).set_format("GTR", "./.")
                    var.genotype(s).set_format("GQR", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')
    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
    return