This article collects typical usage examples of the Python method svtools.vcf.file.Vcf.get_header. If you are unsure what Vcf.get_header does or how to call it, the hand-picked code samples below should help; you can also read further about the class it belongs to, svtools.vcf.file.Vcf.
The following 14 code examples of Vcf.get_header are sorted by popularity by default.
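All of the examples share the same basic pattern: construct a Vcf object, feed it raw header lines with add_header, optionally register extra INFO/FORMAT fields or sample columns, and finally serialize the header with get_header. The minimal sketch below illustrates that pattern; the header strings are placeholders, and the add_info/add_sample call signatures are inferred from the examples on this page rather than taken from the svtools documentation.

import sys
from svtools.vcf.file import Vcf

# Placeholder header: the fileformat line plus the mandatory #CHROM line.
header_lines = [
    '##fileformat=VCFv4.2',
    '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
]

v = Vcf()
v.add_header(header_lines)  # parse the raw header lines
# Register an INFO field and a sample column (signatures inferred from the examples below).
v.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
v.add_sample('NA00001')

# get_header() returns the full header as a single newline-joined string.
sys.stdout.write(v.get_header() + '\n')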
Example 1: test_eight_column_vcf
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def test_eight_column_vcf(self):
header_lines = [
'##fileformat=VCFv4.2',
'##fileDate=20090805',
'##source=myImputationProgramV3.1',
'##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
'##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>',
'##phasing=partial',
'##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
'##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
'##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
'##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
'##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
'##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
'##ALT=<ID=DEL,Description="DELETION">',
'##FILTER=<ID=q10,Description="Quality below 10">',
'##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
'##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
'#CHROM POS ID REF ALT QUAL FILTER INFO']
v = Vcf()
v.add_header(header_lines)
expected_header_lines = header_lines
expected_header_lines[1] = '##fileDate=' + time.strftime('%Y%m%d')
self.assertEqual(v.get_header(), '\n'.join(expected_header_lines))
v.add_sample('ScottPilgrim')
self.assertEqual(v.sample_to_col('ScottPilgrim'), 9)
post_sample_add_header_lines = [
'##fileformat=VCFv4.2',
'##fileDate=' + time.strftime('%Y%m%d'),
'##source=myImputationProgramV3.1',
'##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
'##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>',
'##phasing=partial',
'##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
'##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
'##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
'##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
'##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
'##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
'##ALT=<ID=DEL,Description="DELETION">',
'##FILTER=<ID=q10,Description="Quality below 10">',
'##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
'##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
'#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ScottPilgrim']
self.assertEqual(v.get_header(), '\n'.join(post_sample_add_header_lines))
Example 2: bedpeToVcf
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def bedpeToVcf(bedpe_file, vcf_out):
myvcf = Vcf()
converter = BedpeToVcfConverter(myvcf)
in_header = True
# parse the bedpe data
header = list()
for line in bedpe_file:
if in_header:
if line[0:2] == '##':
header.append(line)
continue
elif line[0] == '#' and line[1] != '#':
sample_list_str = line.rstrip().split('\t', 20)[-1]
header.append('\t'.join([
'#CHROM',
'POS',
'ID',
'REF',
'ALT',
'QUAL',
'FILTER',
'INFO',
sample_list_str
] ))
continue
else:
in_header = False
myvcf.add_header(header)
myvcf.file_format='VCFv4.2'
vcf_out.write(myvcf.get_header() + '\n')
#
bedpe = Bedpe(line.rstrip().split('\t'))
variants = converter.convert(bedpe)
for v in variants:
vcf_out.write(v.get_var_string() + '\n')
# if the input contained only header lines, the header still has to be written before closing
if in_header == True:
myvcf.add_header(header)
myvcf.file_format='VCFv4.2'
vcf_out.write(myvcf.get_header() + '\n')
vcf_out.close()
return
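As a usage note, bedpeToVcf only needs a readable BEDPE stream and a writable handle for the VCF output; since the function closes the output handle itself, a plain file object is the simplest choice. The sketch below is a hypothetical driver, with placeholder file paths, that assumes bedpeToVcf (and the Bedpe/BedpeToVcfConverter classes it uses) are importable from the surrounding module.

# Hypothetical driver for the bedpeToVcf function defined above.
# 'input.bedpe' and 'output.vcf' are placeholder paths.
with open('input.bedpe') as bedpe_file, open('output.vcf', 'w') as vcf_out:
    bedpeToVcf(bedpe_file, vcf_out)  # writes the header plus one record per converted variant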
Example 3: test_add_info_after
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def test_add_info_after(self):
header_lines = [
'##fileformat=VCFv4.2',
'##fileDate=20090805',
'##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
'##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
'##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003']
extra_line = '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">'
v = Vcf()
v.add_header(header_lines)
v.add_info_after('DP', 'DB', 0, 'Flag', 'dbSNP membership, build 129')
expected_lines = header_lines[0:4] + [extra_line] + header_lines[4:]
expected_lines[1] = '##fileDate=' + time.strftime('%Y%m%d')
self.assertEqual(v.get_header(), '\n'.join(expected_lines))
v2 = Vcf()
v2.add_header(header_lines)
v2.add_info_after('AF', 'DB', 0, 'Flag', 'dbSNP membership, build 129')
expected_lines2 = header_lines[0:5] + [extra_line] + header_lines[5:]
expected_lines2[1] = '##fileDate=' + time.strftime('%Y%m%d')
self.assertEqual(v2.get_header(), '\n'.join(expected_lines2))
Example 4: run_gt_refine
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
vcf = Vcf()
header = []
in_header = True
sex={}
for line in gender_file:
v = line.rstrip().split('\t')
sex[v[0]] = int(v[1])
outf=open(diag_outfile, 'w', 4096)
ct=1
for line in vcf_in:
if in_header:
if line[0] == "#":
header.append(line)
continue
else:
in_header = False
vcf.add_header(header)
vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
#vcf.add_format('SIL_GT', '1', 'Float', 'Per-sample genotype cluster silhouette')
vcf_out.write(vcf.get_header() + '\n')
var = Variant(line.rstrip().split('\t'), vcf)
df=load_df(var, sex)
df1=get_silhouette(df)
sil_avg=df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
#sil_ind=df1.loc[:, 'sil_gt']
var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
if ct==1:
df1.to_csv(outf, header=True)
ct += 1
else:
df1.to_csv(outf, header=False)
vcf_out.close()
vcf_in.close()
outf.close()
gender_file.close()
return
Example 5: write_copynumber
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
#go through the VCF and add the read depth annotations
in_header = True
header = []
vcf = Vcf()
i = 0
s_index = -1
for line in vcf_file:
if in_header:
if line[0] == '#' and line[1] == '#':
header.append(line)
continue
if line[0] == '#' and line[1] != '#':
try:
s_index = line.rstrip().split('\t').index(sample)
except ValueError:
sys.stderr.write("Please input valid VCF, format field for " + sample + " not found in VCF")
sys.exit(1)
line = '\t'.join(map(str, line.rstrip().split('\t')[:9] + [sample]))
header.append(line)
continue
else:
in_header = False
vcf.add_header(header)
vcf.add_format('CN', 1, 'Float', 'Copy number of structural variant segment.')
vcf_out.write(vcf.get_header() + '\n')
v = line.rstrip().split('\t')
# XXX Is this second check necessary? Wouldn't this be handled above? Missing header would hit this?
if s_index == -1:
sys.stderr.write("Input a valid sample name: " + sample + " not found in a provided VCF")
sys.exit(1)
v = v[:9] + [v[s_index]]
if not any("SVTYPE=BND" in s for s in v):
if "CN" not in v[8]:
v[8] = v[8] + ":CN"
v[9] = v[9] + ":" + str(cn_list[i])
else:
cn_index = v[8].rstrip().split(":").index("CN")
gts = v[9].rstrip().split(":")
gts[cn_index] = str(cn_list[i])
v[9] = ":".join(gts)
i += 1
# write the VCF
vcf_out.write('\t'.join(v) + '\n')
vcf_out.close()
return
Example 6: sv_classify
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, het_del_fit, hom_del_fit, params, diag_outfile):
vcf_out = sys.stdout
vcf = Vcf()
header = []
in_header = True
min_pos_samps_for_regression = 10
sex = {}
# read sample genders
for line in gender_file:
v = line.rstrip().split('\t')
sex[v[0]] = int(v[1])
exclude = []
if exclude_file is not None:
for line in exclude_file:
exclude.append(line.rstrip())
if diag_outfile is not None:
outf=open(diag_outfile, 'w', 4096)
for line in vcf_in:
if in_header:
if line[0] == '#':
header.append(line)
continue
else:
in_header = False
vcf.add_header(header)
vcf_out.write(vcf.get_header() + '\n')
# split variant line, quick pre-check if the SVTYPE is BND, and skip if so
v = line.rstrip().split('\t')
info = v[7].split(';')
svtype = None
for x in info:
if x.startswith('SVTYPE='):
svtype = x.split('=')[1]
break
# bail if not DEL or DUP prior to reclassification
if svtype not in ['DEL', 'DUP']:
vcf_out.write(line)
continue
# parse the VCF line
var = Variant(v, vcf, True)
# check intersection with mobile elements
if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
ae = annotation_intersect(var, ae_dict, f_overlap)
if ae is not None:
if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
ae = 'ME:' + ae
var.alt = '<DEL:%s>' % ae
var.info['SVTYPE'] = 'MEI'
vcf_out.write(var.get_var_string(True) + '\n')
continue
# for now, don't worry about sex chromosomes
if (var.chrom == 'X' or var.chrom == 'Y'):
vcf_out.write(line)
continue
#count positively genotyped samples
num_pos_samps = 0
for s in var.sample_list:
if s in exclude:
continue
if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
num_pos_samps += 1
high_freq_support = False
low_freq_support = False
nb_support = False
if num_pos_samps == 0:
vcf_out.write(line)
else:
df=load_df(var, exclude, sex)
if has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params):
nb_support = True
if num_pos_samps < min_pos_samps_for_regression:
if has_low_freq_depth_support(df):
low_freq_support = True
vcf_out.write(line)
else:
for m_var in to_bnd_strings(var, True ):
vcf_out.write(m_var + '\n')
else:
if has_high_freq_depth_support(df, slope_threshold, rsquared_threshold):
high_freq_support = True
vcf_out.write(line)
else:
for m_var in to_bnd_strings(var, True):
vcf_out.write(m_var + '\n')
#......... (rest of this example omitted) .........
Example 7: varLookup
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
# FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
bList = list()
headerObj=Vcf() #co-opt the VCF header object
if cohort_name is None:
cohort_name=str(str(bFile).split('/')[-1])
if bFile == "stdin":
bData = sys.stdin
elif bFile.endswith('.gz'):
bData = gzip.open(bFile, 'rb')
else:
bData = open(bFile, 'r')
for bLine in bData:
if bLine.startswith(pass_prefix):
continue
bentry = Bedpe(bLine.rstrip().split('\t'))
if bentry.af is None:
sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
sys.exit(1)
bList.append(bentry)
if aFile == "stdin":
aData = sys.stdin
elif aFile.endswith('.gz'):
aData = gzip.open(aFile, 'rb')
else:
aData = open(aFile, 'r')
in_header=True
header_lines = []
sample_list = None
for aLine in aData:
if pass_prefix is not None and aLine.startswith(pass_prefix):
if aLine[0] == '#' and aLine[1] != '#':
sample_list = aLine.rstrip().split('\t', 14)[-1]
else:
header_lines.append(aLine)
continue
else:
if in_header == True:
headerObj.add_header(header_lines)
headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + str(str(bFile).split('/')[-1]) + ')' )
headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + str(str(bFile).split('/')[-1]) + ')' )
header = headerObj.get_header()
bedpe_out.write(header[:header.rfind('\n')] + '\n')
if len(sample_list) > 0:
bedpe_out.write('\t'.join(['#CHROM_A',
'START_A',
'END_A',
'CHROM_B',
'START_B',
'END_B',
'ID',
'QUAL',
'STRAND_A',
'STRAND_B',
'TYPE',
'FILTER',
'INFO_A','INFO_B',
sample_list]
) + '\n')
else:
bedpe_out.write('\t'.join(['#CHROM_A',
'START_A',
'END_A',
'CHROM_B',
'START_B',
'END_B',
'ID',
'QUAL',
'STRAND_A',
'STRAND_B',
'TYPE',
'FILTER',
'INFO_A','INFO_B']
) + '\n')
in_header=False
a = Bedpe(aLine.rstrip().split('\t'))
if a.af is None:
sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
sys.exit(1)
for b in bList:
add(a,b,max_distance)
bedpe_out.write(get_var_string(a, cohort_name))
Example 8: execute
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def execute(self, output_handle=sys.stdout):
in_header = True
header = []
vcf = Vcf()
vcf_out = output_handle
# read input VCF
for line in self.vcf_stream:
if in_header:
if line.startswith('##'):
header.append(line)
continue
elif line.startswith('#CHROM'):
v = line.rstrip().split('\t')
header.append('\t'.join(v))
in_header = False
vcf.add_header(header)
vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
# write header
vcf_out.write(vcf.get_header() + '\n')
#vcf_out.write('\t' + '\t'.join(v[8:]) + '\n')
continue
v = line.rstrip().split('\t')
var = Variant(v, vcf)
# extract genotypes from VCF
num_alt = len(var.alt.split(','))
alleles = [0] * (num_alt + 1)
num_samp = 0
sum_sq = 0.0
for gt in var.genotypes():
gt_string = gt.get_format('GT')
if '.' not in gt_string:
indexes = self.numeric_alleles(gt_string)
for i in indexes:
alleles[i] += 1
# iterate the number of non-reference samples
if sum(indexes) > 0:
num_samp += 1
try:
sum_sq += float(gt.get_format('SQ'))
except KeyError:
pass
allele_sum = float(sum(alleles))
allele_freq = ['.'] * len(alleles)
# populate AF
if allele_sum > 0:
for i in xrange(len(alleles)):
allele_freq[i] = alleles[i] / allele_sum
var.info['AF'] = ','.join(map(str, ['%.4g' % a for a in allele_freq[1:]]))
else:
var.info['AF'] = ','.join(map(str, allele_freq[1:]))
# populate NSAMP
var.info['NSAMP'] = num_samp
if num_samp > 0:
msq = '%0.2f' % (sum_sq / num_samp)
else:
msq = '.'
var.info['MSQ'] = msq
# after all samples have been processed, write
vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
vcf_out.close()
Example 9: run_gt_refine
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
vcf = Vcf()
header = []
in_header = True
sex={}
for line in gender_file:
v = line.rstrip().split('\t')
sex[v[0]] = int(v[1])
exclude = []
if exclude_file is not None:
for line in exclude_file:
exclude.append(line.rstrip())
outf=open(diag_outfile, 'w', 4096)
ct=1
for line in vcf_in:
if in_header:
if line[0] == "#":
header.append(line)
continue
else:
in_header = False
vcf.add_header(header)
vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
vcf_out.write(vcf.get_header() + '\n')
v = line.rstrip().split('\t')
info = v[7].split(';')
svtype = None
for x in info:
if x.startswith('SVTYPE='):
svtype = x.split('=')[1]
break
# bail if not DEL or DUP prior to reclassification
if svtype not in ['DEL']:
vcf_out.write(line)
continue
var = Variant(v, vcf)
sys.stderr.write("%s\n" % var.var_id)
sys.stderr.write("%f\n" % float(var.get_info('AF')))
if float(var.get_info('AF'))<0.01:
vcf_out.write(line)
else:
df=load_df(var, exclude, sex)
recdf=recluster(df)
if ct==1:
recdf.to_csv(outf, header=True)
ct += 1
else:
recdf.to_csv(outf, header=False)
var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0,:].loc['med_gq_re']))
var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0,:].loc['q10_gq_re']))
recdf.set_index('sample', inplace=True)
for s in var.sample_list:
if s in recdf.index:
var.genotype(s).set_format("GTR", recdf.loc[s,'GTR'])
var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s,'gq_re']))
else:
var.genotype(s).set_format("GTR", "./.")
var.genotype(s).set_format("GQR", 0)
vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')
vcf_out.close()
vcf_in.close()
gender_file.close()
outf.close()
if exclude_file is not None:
exclude_file.close()
return
Example 10: sv_classify
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def sv_classify(vcf_in, vcf_out, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
vcf = Vcf()
header = []
in_header = True
sex = {}
# read sample genders
for line in gender_file:
v = line.rstrip().split('\t')
sex[v[0]] = int(v[1])
exclude = []
if exclude_file is not None:
for line in exclude_file:
exclude.append(line.rstrip())
if diag_outfile is not None:
outf=open(diag_outfile, 'w', 4096)
outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")
for line in vcf_in:
if in_header:
if line[0] == '#':
header.append(line)
continue
else:
in_header = False
vcf.add_header(header)
vcf_out.write(vcf.get_header() + '\n')
v = line.rstrip().split('\t')
info = v[7].split(';')
svtype = None
for x in info:
if x.startswith('SVTYPE='):
svtype = x.split('=')[1]
break
# bail if not DEL or DUP prior to reclassification
if svtype not in ['DEL', 'DUP']:
vcf_out.write(line)
continue
var = Variant(v, vcf)
# check intersection with mobile elements
if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
ae = annotation_intersect(var, ae_dict, f_overlap)
if ae is not None:
if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
ae = 'ME:' + ae
var.alt = '<DEL:%s>' % ae
var.info['SVTYPE'] = 'MEI'
vcf_out.write(var.get_var_string(True) + '\n')
continue
#count positively genotyped samples
num_pos_samps = 0
num_total_samps=len(var.sample_list)
for s in var.sample_list:
if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
num_pos_samps += 1
nb_support = False
ls_support = False
hybrid_support = False
has_rd_support = False
if num_pos_samps == 0:
vcf_out.write(line)
else:
df=load_df(var, exclude, sex)
if method=='large_sample':
ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
has_rd_support=ls_support
elif method=='naive_bayes':
nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
has_rd_support=nb_support
elif method=='hybrid':
ls_support, nb_support, hybrid_support = has_rd_support_hybrid(df, het_del_fit, hom_del_fit, params, p_cnv, slope_threshold, rsquared_threshold, num_pos_samps)
has_rd_support=hybrid_support
if has_rd_support:
vcf_out.write(line)
else:
for m_var in to_bnd_strings(var, True):
vcf_out.write(m_var + '\n')
if diag_outfile is not None:
svlen=df['svlen'][0]
outf.write(var.var_id+"\t"+svtype+"\t"+str(svlen)+"\t"+str(num_pos_samps)+"\t"+str(nb_support)+"\t"+str(ls_support)+"\t"+str(hybrid_support)+"\t"+str(has_rd_support)+"\n")
vcf_out.close()
if diag_outfile is not None:
outf.close()
vcf_in.close()
vcf_out.close()
gender_file.close()
if exclude_file is not None:
exclude_file.close()
#......... (rest of this example omitted) .........
Example 11: l_cluster_by_line
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
v_id = 0
in_header = True
header = []
vcf = Vcf()
vcf_out=sys.stdout
with InputStream(file_name) as vcf_stream:
BP_l = []
BP_sv_type = ''
BP_max_end_l = -1
BP_chr_l = ''
sample_order = []
for line in vcf_stream:
if in_header:
if line.startswith('##'):
header.append(line)
continue
elif line.startswith('#CHROM'):
v=line.rstrip().split('\t')
for headline in header:
if headline[:8] == '##SAMPLE':
sample_order.append(headline.rstrip()[13:-1])
hline=''
if include_genotypes :
v.extend(sample_order)
hline='\t'.join(v)
else :
v=v[:8]
hline='\t'.join(v)
header.append(hline)
in_header=False
vcf.add_header(header)
vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
if include_genotypes:
vcf_out.write(vcf.get_header()+'\n')
else:
vcf_out.write(vcf.get_header(False)+'\n')
continue
b = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)
if (len(BP_l) == 0) or ((b.left.start <= BP_max_end_l) and (b.left.chrom == BP_chr_l) and (b.sv_type == BP_sv_type)):
BP_l.append(b)
BP_max_end_l = max(BP_max_end_l, b.left.end)
BP_chr_l = b.left.chrom
BP_sv_type = b.sv_type
else:
v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
BP_l = [b]
BP_max_end_l = b.left.end
BP_sv_type = b.sv_type
BP_chr_l = b.left.chrom
if len(BP_l) > 0:
v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
Example 12: execute
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def execute(self, output_handle=sys.stdout):
in_header = True
header = []
vcf = Vcf()
vcf_out = output_handle
# read input VCF
for line in self.vcf_stream:
if in_header:
if line.startswith('##'):
header.append(line)
continue
elif line.startswith('#CHROM'):
v = line.rstrip().split('\t')
header.append('\t'.join(v))
in_header = False
vcf.add_header(header)
vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
# write header
vcf_out.write(vcf.get_header() + '\n')
#vcf_out.write('\t' + '\t'.join(v[8:]) + '\n')
continue
v = line.rstrip().split('\t')
var = Variant(v, vcf, fixed_genotypes=True)
# extract genotypes from VCF
num_alt = len(var.alt.split(','))
alleles = [0] * (num_alt + 1)
num_samp = 0
gt = [var.genotype(s).get_format('GT') for s in var.sample_list]
for gt_string in gt:
if '.' in gt_string:
continue
gt = gt_string.split('/')
if len(gt) == 1:
gt = gt_string.split('|')
gt = map(int, gt)
for i in xrange(len(gt)):
alleles[gt[i]] += 1
# iterate the number of non-reference samples
if sum(gt) > 0:
num_samp += 1
allele_sum = float(sum(alleles))
allele_freq = ['.'] * len(alleles)
# populate AF
if allele_sum > 0:
for i in xrange(len(alleles)):
allele_freq[i] = alleles[i] / allele_sum
var.info['AF'] = ','.join(map(str, ['%.4g' % a for a in allele_freq[1:]]))
else:
var.info['AF'] = ','.join(map(str, allele_freq[1:]))
# populate NSAMP
var.info['NSAMP'] = num_samp
var.info['MSQ'] = self.calc_msq(var)
# after all samples have been processed, write
vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
vcf_out.close()
Example 13: sv_classify
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold):
vcf_out = sys.stdout
vcf = Vcf()
header = []
in_header = True
min_pos_samps_for_regression = 10
gender = {}
# read sample genders
for line in gender_file:
v = line.rstrip().split('\t')
gender[v[0]] = int(v[1])
exclude = []
if exclude_file is not None:
for line in exclude_file:
exclude.append(line.rstrip())
for line in vcf_in:
if in_header:
if line[0] == '#':
header.append(line)
continue
else:
in_header = False
vcf.add_header(header)
# write the output header
vcf_out.write(vcf.get_header() + '\n')
# split variant line, quick pre-check if the SVTYPE is BND, and skip if so
v = line.rstrip().split('\t')
info = v[7].split(';')
svtype = None
for x in info:
if x.startswith('SVTYPE='):
svtype = x.split('=')[1]
break
# bail if not DEL or DUP prior to reclassification
if svtype not in ['DEL', 'DUP']:
vcf_out.write(line)
continue
# parse the VCF line
var = Variant(v, vcf, True)
# check intersection with mobile elements
if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
ae = annotation_intersect(var, ae_dict, f_overlap)
if ae is not None:
if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
ae = 'ME:' + ae
var.alt = '<DEL:%s>' % ae
var.info['SVTYPE'] = 'MEI'
vcf_out.write(var.get_var_string(True) + '\n')
continue
# # write to directory
# writedir = 'data/r11.100kb.dup'
# annotate based on read depth
if var.info['SVTYPE'] in ['DEL', 'DUP']:
# count the number of positively genotyped samples
num_pos_samps = 0
for s in var.sample_list:
if s in exclude:
continue
if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
num_pos_samps += 1
if num_pos_samps < min_pos_samps_for_regression:
if has_low_freq_depth_support(var, gender, exclude):
# has_low_freq_depth_support(var, gender, exclude, writedir + '/low_freq_rd')
# has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/low_freq_rd')
# write variant
#vcf_out.write(var.get_var_string(True) + '\n')
vcf_out.write(line)
else:
# has_low_freq_depth_support(var, gender, exclude, writedir + '/low_freq_no_rd')
# has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/low_freq_no_rd')
for m_var in to_bnd_strings(var):
vcf_out.write(m_var + '\n')
else:
if has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold):
# has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/high_freq_rd')
# has_low_freq_depth_support(var, gender, exclude, writedir + '/high_freq_rd')
# write variant
#vcf_out.write(var.get_var_string(True) + '\n')
vcf_out.write(line)
else:
# has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold, writedir + '/high_freq_no_rd')
# has_low_freq_depth_support(var, gender, exclude, writedir + '/high_freq_no_rd')
for m_var in to_bnd_strings(var):
vcf_out.write(m_var + '\n')
vcf_out.close()
return
Example 14: bedpeToVcf
# Required import: from svtools.vcf.file import Vcf [as alias]
# Alternatively: from svtools.vcf.file.Vcf import get_header [as alias]
def bedpeToVcf(bedpe_file, vcf_out):
myvcf = Vcf()
in_header = True
# parse the bedpe data
header = list()
for line in bedpe_file:
if in_header:
if line[0:2] == '##':
header.append(line)
continue
elif line[0] == '#' and line[1] != '#':
sample_list_str = line.rstrip().split('\t', 14)[-1]
header.append('\t'.join([
'#CHROM',
'POS',
'ID',
'REF',
'ALT',
'QUAL',
'FILTER',
'INFO',
sample_list_str
] ))
continue
else:
in_header = False
myvcf.add_header(header)
myvcf.file_format='VCFv4.2'
vcf_out.write(myvcf.get_header() + '\n')
#
bedpe = Bedpe(line.rstrip().split('\t'))
if bedpe.svtype == 'BND':
bedpe1_list = [
bedpe.c1,
bedpe.b1 + 1,
bedpe.name + '_1', #ID
'N',
'<' + str(bedpe.svtype) + '>', #ALT
bedpe.score,
bedpe.filter
]
bedpe1_list.extend(bedpe.misc)
var1 = Variant(bedpe1_list, myvcf)
if bedpe.o1 == '+':
if bedpe.o2 == '-':
var1.alt = '%s[%s:%s[' % (var1.ref, bedpe.c2, bedpe.b2 + 1)
elif bedpe.o2 == '+':
var1.alt = '%s]%s:%s]' % (var1.ref, bedpe.c2, bedpe.b2 + 1)
elif bedpe.o1 == '-':
if bedpe.o2 == '+':
var1.alt = ']%s:%s]%s' % (bedpe.c2, bedpe.b2 + 1, var1.ref)
elif bedpe.o2 == '-':
var1.alt = '[%s:%s[%s' % (bedpe.c2, bedpe.b2 + 1, var1.ref)
misc = copy.deepcopy(bedpe.misc)
strands = re.split('=|:',''.join(filter(lambda x: 'STRANDS=' in x, bedpe.misc[0].split(";"))))
strands_str = str(strands[0]) + '=' + str(strands[1][::-1]) + ':' + str(strands[2])
misc[0]=misc[0].replace(''.join(filter(lambda x: 'STRANDS=' in x, bedpe.misc[0].split(";"))), strands_str)
#add the cipos ciend,cipos95 and ciend95 variables
misc[0]=misc[0].replace(''.join(filter(lambda x: 'CIPOS=' in x, bedpe.misc[0].split(";"))),'CIPOS='+ re.split('=',''.join(filter(lambda x: 'CIEND=' in x, bedpe.misc[0].split(";"))))[1])
misc[0]=misc[0].replace(''.join(filter(lambda x: 'CIEND=' in x, bedpe.misc[0].split(";"))),'CIEND='+ re.split('=',''.join(filter(lambda x: 'CIPOS=' in x, bedpe.misc[0].split(";"))))[1])
misc[0]=misc[0].replace(''.join(filter(lambda x: 'CIPOS95=' in x, bedpe.misc[0].split(";"))),'CIPOS95='+ re.split('=',''.join(filter(lambda x: 'CIEND95=' in x, bedpe.misc[0].split(";"))))[1])
misc[0]=misc[0].replace(''.join(filter(lambda x: 'CIEND95=' in x, bedpe.misc[0].split(";"))),'CIEND95='+ re.split('=',''.join(filter(lambda x: 'CIPOS95=' in x, bedpe.misc[0].split(";"))))[1])
#Change MATEID
misc[0]= misc[0].replace(''.join(filter(lambda x: 'MATEID=' in x, bedpe.misc[0].split(";"))),'MATEID=' + bedpe.name + '_2')
#ADD IDENTIFIER FOR SECONDARY BREAKEND MATE
misc[0]=misc[0].replace(''.join(filter(lambda x: 'EVENT=' in x, bedpe.misc[0].split(";"))),''.join(filter(lambda x: 'EVENT=' in x, bedpe.misc[0].split(";"))) + ';SECONDARY;')
bedpe2_list = [
bedpe.c2, #chrom1
bedpe.b2 + 1,
bedpe.name + '_2', #ID
'N',
'<' + str(bedpe.svtype) + '>', #ALT
bedpe.score,
bedpe.filter
]
bedpe2_list.extend(misc)
var2 = Variant(bedpe2_list, myvcf)
# add the strands field. For variant 2 must switch the order
if bedpe.o2 == '+':
if bedpe.o1 == '-':
var2.alt = '%s[%s:%s[' % (var2.ref, bedpe.c1, bedpe.b1 + 1)
elif bedpe.o1 == '+':
var2.alt = '%s]%s:%s]' % (var2.ref, bedpe.c1, bedpe.b1 + 1)
elif bedpe.o2 == '-':
if bedpe.o1 == '+':
var2.alt = ']%s:%s]%s' % (bedpe.c1, bedpe.b1 + 1, var2.ref)
elif bedpe.o1 == '-':
var2.alt = '[%s:%s[%s' % (bedpe.c1, bedpe.b1 + 1, var2.ref)
if bedpe.malformedFlag == 0:
vcf_out.write(var1.get_var_string() + '\n')
vcf_out.write(var2.get_var_string() + '\n')
elif bedpe.malformedFlag == 1:
vcf_out.write(var2.get_var_string() + '\n')
elif bedpe.malformedFlag == 2:
vcf_out.write(var1.get_var_string() + '\n')
else:
# set VCF info elements for simple events
bedpe_list = [
#......... (rest of this example omitted) .........