本文整理匯總了Python中svtools.vcf.file.Vcf.add_info方法的典型用法代碼示例。如果您正苦於以下問題:Python Vcf.add_info方法的具體用法?Python Vcf.add_info怎麼用?Python Vcf.add_info使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類svtools.vcf.file.Vcf的用法示例。
在下文中一共展示了Vcf.add_info方法的6個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: run_gt_refine
# 需要導入模塊: from svtools.vcf.file import Vcf [as 別名]
# 或者: from svtools.vcf.file.Vcf import add_info [as 別名]
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    """Annotate every VCF record with a SIL_GT_AVG INFO value.

    Leading lines of ``vcf_in`` that start with '#' are collected as the
    header.  Each remaining line is parsed into a ``Variant``, run through
    ``load_df`` and ``get_silhouette``, and written to ``vcf_out`` with
    ``SIL_GT_AVG`` set to the first row's 'sil_gt_avg' value formatted with
    two decimals.  Every silhouette frame is also appended as CSV to
    ``diag_outfile``; only the first frame carries the CSV column header.

    Args:
        vcf_in: Iterable of VCF lines providing ``close()``.
        vcf_out: Writable handle providing ``write()`` and ``close()``.
        diag_outfile: Path of the diagnostics CSV to create.
        gender_file: Iterable of tab-separated lines whose first field is
            used as a key and whose second field is parsed with ``int``.

    All four handles are closed before the function returns, including
    when processing fails part-way through.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Register the new INFO field, then write the complete header once.
        vcf.add_header(header)
        vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
        vcf_out.write(vcf.get_header() + '\n')

    outf = None
    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        outf = open(diag_outfile, 'w', 4096)
        first_frame = True

        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                # First non-header line: flush the header before any record.
                in_header = False
                emit_header()

            var = Variant(line.rstrip().split('\t'), vcf)
            df = load_df(var, sex)
            df1 = get_silhouette(df)

            sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
            var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

            # Only the first frame contributes the CSV column header.
            df1.to_csv(outf, header=first_frame)
            first_frame = False

        # A VCF with header lines but no records previously produced no
        # output at all; still emit the (augmented) header in that case.
        if in_header and header:
            emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        if outf is not None:
            outf.close()
        gender_file.close()
示例2: varLookup
# 需要導入模塊: from svtools.vcf.file import Vcf [as 別名]
# 或者: from svtools.vcf.file.Vcf import add_info [as 別名]
def _open_bedpe_source(path):
    """Return a line iterator for 'stdin', a '.gz' path, or a plain path."""
    if path == "stdin":
        return sys.stdin
    if path.endswith('.gz'):
        return gzip.open(path, 'rb')
    return open(path, 'r')


def _close_bedpe_source(handle):
    """Close a handle from _open_bedpe_source, leaving sys.stdin open."""
    if handle is not sys.stdin:
        handle.close()


def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    """Annotate the BEDPE records of ``aFile`` against those of ``bFile``.

    Every non-skipped line of ``bFile`` is parsed into a ``Bedpe`` and kept
    in memory.  ``aFile`` is then streamed: lines starting with
    ``pass_prefix`` are treated as header, and for every other line the
    record is passed with each b record to ``add`` and the result of
    ``get_var_string`` is written to ``bedpe_out``.  Before the first
    record, the header (with ``<cohort>_AF`` and ``<cohort>_VarID`` INFO
    definitions added) and a BEDPE column line are written.

    Args:
        aFile: Path of the query file, or the literal string "stdin".
        bFile: Path of the cohort file, or the literal string "stdin".
        bedpe_out: Writable handle receiving the annotated output.
        max_distance: Forwarded unchanged to ``add``.
        pass_prefix: Prefix marking header lines; ``None`` disables
            header detection in both files.
        cohort_name: Label used in the INFO keys; defaults to the last
            '/'-separated component of ``bFile``.

    Exits with status 1 if any record in either file has ``af`` of None.
    """
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    fixed_columns = ['#CHROM_A',
                     'START_A',
                     'END_A',
                     'CHROM_B',
                     'START_B',
                     'END_B',
                     'ID',
                     'QUAL',
                     'STRAND_A',
                     'STRAND_B',
                     'TYPE',
                     'FILTER',
                     'INFO_A', 'INFO_B']

    b_basename = str(bFile).split('/')[-1]
    if cohort_name is None:
        cohort_name = b_basename

    headerObj = Vcf()  # co-opt the VCF header object
    bList = []

    bData = _open_bedpe_source(bFile)
    try:
        for bLine in bData:
            # Guard against pass_prefix=None, as the a-file loop does;
            # str.startswith(None) raises TypeError.
            if pass_prefix is not None and bLine.startswith(pass_prefix):
                continue
            bentry = Bedpe(bLine.rstrip().split('\t'))
            if bentry.af is None:
                sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            bList.append(bentry)
    finally:
        _close_bedpe_source(bData)

    aData = _open_bedpe_source(aFile)
    try:
        in_header = True
        header_lines = []
        # Empty string means "no sample columns"; it stays empty when the
        # input carries no single-'#' column line at all.
        sample_list = ''
        for aLine in aData:
            if pass_prefix is not None and aLine.startswith(pass_prefix):
                if aLine.startswith('#') and not aLine.startswith('##'):
                    # Fields 0-13 are the fixed BEDPE columns; anything
                    # beyond that is kept verbatim as the sample columns.
                    fields = aLine.rstrip().split('\t', 14)
                    sample_list = fields[14] if len(fields) > 14 else ''
                else:
                    header_lines.append(aLine)
                continue

            if in_header:
                headerObj.add_header(header_lines)
                headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                header = headerObj.get_header()
                # Drop whatever follows the last newline of the header text
                # (presumably the VCF column line -- confirm against Vcf)
                # and write the BEDPE column line instead.
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                columns = list(fixed_columns)
                if sample_list:
                    columns.append(sample_list)
                bedpe_out.write('\t'.join(columns) + '\n')
                in_header = False

            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name))
    finally:
        _close_bedpe_source(aData)
示例3: execute
# 需要導入模塊: from svtools.vcf.file import Vcf [as 別名]
# 或者: from svtools.vcf.file.Vcf import add_info [as 別名]
def execute(self, output_handle=sys.stdout):
    """Stream ``self.vcf_stream`` and add AF, NSAMP and MSQ to each record.

    '##' lines are buffered; on the '#CHROM' line the header is completed,
    the three INFO definitions are registered and the header is written.
    For every other line, allele indexes of all genotypes without a '.'
    are tallied, AF is derived per ALT allele, NSAMP counts genotypes
    whose allele indexes sum above zero, and MSQ is the SQ total of those
    genotypes divided by NSAMP ('.' when NSAMP is zero).  A missing SQ
    (KeyError) adds nothing to the total.  The output handle is closed
    at the end.
    """
    vcf = Vcf()
    header_lines = []
    awaiting_columns = True
    out = output_handle

    for record in self.vcf_stream:
        if awaiting_columns:
            if record.startswith('##'):
                header_lines.append(record)
                continue
            if record.startswith('#CHROM'):
                header_lines.append(record.rstrip())
                awaiting_columns = False
                vcf.add_header(header_lines)
                vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
                out.write(vcf.get_header() + '\n')
                continue

        var = Variant(record.rstrip().split('\t'), vcf)

        # Slot 0 counts the reference allele, slots 1..n the ALT alleles.
        n_alt = len(var.alt.split(','))
        allele_counts = [0] * (n_alt + 1)
        carriers = 0
        quality_total = 0.0

        for sample_gt in var.genotypes():
            call = sample_gt.get_format('GT')
            if '.' in call:
                continue
            allele_ids = self.numeric_alleles(call)
            for idx in allele_ids:
                allele_counts[idx] += 1
            # A positive index sum means at least one non-reference allele.
            if sum(allele_ids) > 0:
                carriers += 1
                try:
                    quality_total += float(sample_gt.get_format('SQ'))
                except KeyError:
                    pass

        called_total = float(sum(allele_counts))
        if called_total > 0:
            alt_freqs = ['%.4g' % (count / called_total) for count in allele_counts[1:]]
        else:
            alt_freqs = ['.'] * n_alt
        var.info['AF'] = ','.join(alt_freqs)

        var.info['NSAMP'] = carriers
        var.info['MSQ'] = '%0.2f' % (quality_total / carriers) if carriers > 0 else '.'

        out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    out.close()
示例4: run_gt_refine
# 需要導入模塊: from svtools.vcf.file import Vcf [as 別名]
# 或者: from svtools.vcf.file.Vcf import add_info [as 別名]
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Re-cluster genotypes of DEL records and annotate them with GTR/GQR.

    Leading '#' lines of ``vcf_in`` are collected as the header.  Records
    whose INFO ``SVTYPE`` is not 'DEL', and DEL records whose ``AF`` is
    below 0.01, are copied to ``vcf_out`` unchanged.  Remaining records
    are passed through ``load_df`` and ``recluster``; the result is
    appended as CSV to ``diag_outfile`` (column header on the first frame
    only), INFO ``MEDGQR``/``Q10GQR`` are set from the first row, and each
    sample receives ``GTR``/``GQR`` format values ("./." and 0 when the
    sample is absent from the reclustered frame).

    The variant id and AF of every DEL record are echoed to stderr.

    Args:
        vcf_in: Iterable of VCF lines providing ``close()``.
        vcf_out: Writable handle providing ``write()`` and ``close()``.
        diag_outfile: Path of the diagnostics CSV to create.
        gender_file: Iterable of tab-separated lines whose first field is
            used as a key and whose second field is parsed with ``int``.
        exclude_file: Optional iterable of lines (one stripped entry per
            line) forwarded to ``load_df``; may be ``None``.

    All supplied handles are closed before returning, including when
    processing fails part-way through.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Register the new annotations, then write the complete header once.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    outf = None
    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        outf = open(diag_outfile, 'w', 4096)
        first_frame = True

        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                # First non-header line: flush the header before any record.
                in_header = False
                emit_header()

            v = line.rstrip().split('\t')
            svtype = None
            for entry in v[7].split(';'):
                if entry.startswith('SVTYPE='):
                    svtype = entry.split('=')[1]
                    break

            # Only DEL records are refined; everything else passes through.
            # NOTE(review): the previous comment mentioned "DEL or DUP" but
            # the check has only ever admitted DEL -- confirm the intent.
            if svtype != 'DEL':
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)
            sys.stderr.write("%s\n" % var.var_id)
            af = float(var.get_info('AF'))  # parsed once, used twice below
            sys.stderr.write("%f\n" % af)

            if af < 0.01:
                vcf_out.write(line)
                continue

            df = load_df(var, exclude, sex)
            recdf = recluster(df)

            # Only the first frame contributes the CSV column header.
            recdf.to_csv(outf, header=first_frame)
            first_frame = False

            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)

            for s in var.sample_list:
                genotype = var.genotype(s)
                if s in recdf.index:
                    genotype.set_format("GTR", recdf.loc[s, 'GTR'])
                    genotype.set_format("GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    genotype.set_format("GTR", "./.")
                    genotype.set_format("GQR", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

        # A VCF with header lines but no records previously produced no
        # output at all; still emit the (augmented) header in that case.
        if in_header and header:
            emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if outf is not None:
            outf.close()
        if exclude_file is not None:
            exclude_file.close()
示例5: l_cluster_by_line
# 需要導入模塊: from svtools.vcf.file import Vcf [as 別名]
# 或者: from svtools.vcf.file.Vcf import add_info [as 別名]
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    """Group consecutive breakpoints by left interval and hand each group to r_cluster.

    The input is opened through ``InputStream``.  '##' lines are buffered;
    at the '#CHROM' line, sample names are taken from buffered lines that
    begin with '##SAMPLE', the column line is extended with those names
    (``include_genotypes``) or cut to its first eight fields, the 'ALG'
    INFO definition is added, and the header is written to stdout.

    Each following line becomes a ``Breakpoint``.  It joins the current
    group when the group is empty, or when its left start does not exceed
    the group's largest left end and both left chromosome and SV type
    match the group's; otherwise the current group is passed to
    ``r_cluster`` and a new group is started.  Any group left at the end
    of input is passed to ``r_cluster`` as well.
    """
    next_id = 0
    awaiting_columns = True
    header_lines = []
    vcf = Vcf()
    vcf_out = sys.stdout

    with InputStream(file_name) as vcf_stream:
        group = []
        group_sv_type = ''
        group_max_end = -1
        group_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if awaiting_columns:
                if line.startswith('##'):
                    header_lines.append(line)
                    continue
                if line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')
                    # Sample names sit between a 13-character prefix and a
                    # single trailing character of each '##SAMPLE' line.
                    sample_order.extend(
                        meta.rstrip()[13:-1]
                        for meta in header_lines
                        if meta[:8] == '##SAMPLE'
                    )
                    if include_genotypes:
                        columns.extend(sample_order)
                    else:
                        columns = columns[:8]
                    header_lines.append('\t'.join(columns))

                    awaiting_columns = False
                    vcf.add_header(header_lines)
                    vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')

                    if include_genotypes:
                        header_text = vcf.get_header()
                    else:
                        header_text = vcf.get_header(False)
                    vcf_out.write(header_text + '\n')
                    continue

            bp = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)

            joins_group = (not group) or (
                bp.left.start <= group_max_end
                and bp.left.chrom == group_chrom
                and bp.sv_type == group_sv_type
            )
            if joins_group:
                group.append(bp)
                group_max_end = max(group_max_end, bp.left.end)
            else:
                # Flush the finished group before starting a new one.
                next_id = r_cluster(group, sample_order, next_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
                group = [bp]
                group_max_end = bp.left.end
            group_chrom = bp.left.chrom
            group_sv_type = bp.sv_type

        if group:
            next_id = r_cluster(group, sample_order, next_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
示例6: execute
# 需要導入模塊: from svtools.vcf.file import Vcf [as 別名]
# 或者: from svtools.vcf.file.Vcf import add_info [as 別名]
def execute(self, output_handle=sys.stdout):
    """Stream ``self.vcf_stream`` and add AF, NSAMP and MSQ to each record.

    '##' lines are buffered; on the '#CHROM' line the header is completed,
    the three INFO definitions are registered and the header is written.
    For every other line the GT of each sample in ``var.sample_list`` is
    read; genotypes containing '.' are skipped, the rest are split on '/'
    (or on '|' when that yields a single piece) and their integer allele
    indexes tallied.  AF is derived per ALT allele, NSAMP counts genotypes
    whose indexes sum above zero, and MSQ comes from ``self.calc_msq``.
    The output handle is closed at the end.
    """
    vcf = Vcf()
    header_lines = []
    awaiting_columns = True
    out = output_handle

    for record in self.vcf_stream:
        if awaiting_columns:
            if record.startswith('##'):
                header_lines.append(record)
                continue
            if record.startswith('#CHROM'):
                header_lines.append(record.rstrip())
                awaiting_columns = False
                vcf.add_header(header_lines)
                vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
                out.write(vcf.get_header() + '\n')
                continue

        var = Variant(record.rstrip().split('\t'), vcf, fixed_genotypes=True)

        # Slot 0 counts the reference allele, slots 1..n the ALT alleles.
        n_alt = len(var.alt.split(','))
        allele_counts = [0] * (n_alt + 1)
        carriers = 0

        calls = [var.genotype(s).get_format('GT') for s in var.sample_list]
        for call in calls:
            if '.' in call:
                continue
            pieces = call.split('/')
            if len(pieces) == 1:
                # No '/' present: try the other separator.
                pieces = call.split('|')
            allele_ids = [int(piece) for piece in pieces]
            for idx in allele_ids:
                allele_counts[idx] += 1
            # A positive index sum means at least one non-reference allele.
            if sum(allele_ids) > 0:
                carriers += 1

        called_total = float(sum(allele_counts))
        if called_total > 0:
            alt_freqs = ['%.4g' % (count / called_total) for count in allele_counts[1:]]
        else:
            alt_freqs = ['.'] * n_alt
        var.info['AF'] = ','.join(alt_freqs)

        var.info['NSAMP'] = carriers
        var.info['MSQ'] = self.calc_msq(var)

        out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    out.close()