本文整理汇总了Python中utils.color_gene函数的典型用法代码示例。如果您正苦于以下问题:Python color_gene函数的具体用法?Python color_gene怎么用?Python color_gene使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了color_gene函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: print_match
def print_match(self, region, gene, query_seq, score, glbounds, qrbounds, codon_pos, warnings, skipping=False):
if self.debug < 2:
return
out_str_list = []
buff_str = (20 - len(gene)) * ' '
tmp_val = score
if self.args.apply_choice_probs_in_sw and self.get_choice_prob(region, gene) != 0.0:
tmp_val = score / self.get_choice_prob(region, gene)
if self.args.apply_choice_probs_in_sw:
out_str_list.append('%8s%s%s%9.1e * %3.0f = %-6.1f' % (' ', utils.color_gene(gene), buff_str, self.get_choice_prob(region, gene), tmp_val, score))
else:
out_str_list.append('%8s%s%s%9s%3s %6.0f ' % (' ', utils.color_gene(gene), '', '', buff_str, score))
out_str_list.append('%4d%4d %s\n' % (glbounds[0], glbounds[1], self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]))
out_str_list.append('%46s %4d%4d' % ('', qrbounds[0], qrbounds[1]))
out_str_list.append(' %s ' % (utils.color_mutants(self.germline_seqs[region][gene][glbounds[0]:glbounds[1]], query_seq[qrbounds[0]:qrbounds[1]])))
if region != 'd':
out_str_list.append('(%s %d)' % (utils.conserved_codon_names[region], codon_pos))
if warnings[gene] != '':
out_str_list.append('WARNING ' + warnings[gene])
if skipping:
out_str_list.append('skipping!')
if self.args.outfname is None:
print ''.join(out_str_list)
else:
out_str_list.append('\n')
self.outfile.write(''.join(out_str_list))
示例2: add_new_allele
def add_new_allele(glfo, newfo, remove_template_genes=False, debug=False):
"""
Add a new allele to <glfo>, specified by <newfo> which is of the
form: {'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT', 'template-gene' : 'IGHV3-71*01'}
If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
"""
template_gene = newfo["template-gene"]
region = utils.get_region(template_gene)
if template_gene not in glfo["seqs"][region]:
raise Exception("unknown template gene %s" % template_gene)
new_gene = newfo["gene"]
if region == "v":
glfo["cyst-positions"][new_gene] = glfo["cyst-positions"][template_gene]
elif region == "j":
glfo["tryp-positions"][new_gene] = glfo["tryp-positions"][template_gene]
glfo["seqs"][region][new_gene] = newfo["seq"]
if debug:
print " adding new allele to glfo:"
print " template %s %s" % (glfo["seqs"][region][template_gene], utils.color_gene(template_gene))
print " new %s %s" % (
utils.color_mutants(glfo["seqs"][region][template_gene], newfo["seq"]),
utils.color_gene(new_gene),
)
if remove_template_genes:
remove_gene(glfo, template_gene, debug=True)
示例3: add_new_allele
def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False):
# figure out what the new nukes are
old_seq = self.glfo['seqs'][utils.get_region(gene)][gene]
new_seq = old_seq
mutfo = {}
for pos in sorted(fitfo['candidates'][n_candidate_snps]):
obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes} # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations
sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True)
original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke']
new_nuke = None
for nuke, _ in sorted_obs_counts: # take the most common one that isn't the existing gl nuke
if nuke != original_nuke:
new_nuke = nuke
break
print ' %3d (%s --> %s)' % (pos, original_nuke, new_nuke),
assert old_seq[pos] == original_nuke
mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke}
new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:]
new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo)
print ''
print ' %s %s' % (old_seq, utils.color_gene(gene))
print ' %s %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name))
# and add it to the set of new alleles for this gene
self.new_allele_info.append({
'template-gene' : gene,
'gene' : new_name,
'seq' : new_seq,
'aligned-seq' : None
})
示例4: make_transition_plot
def make_transition_plot(self, gene_name, model):
""" NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py """
fig, ax = plotting.mpl_init()
fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])
ibin = 0
print utils.color_gene(utils.unsanitize_name(gene_name))
legend_colors = set() # add a color to this the first time you plot it
for state in model.states:
# bin label
ax.text(-0.5 + ibin, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)
sorted_to_states = {}
for name in state.transitions.keys():
if name.find('IG') == 0:
sorted_to_states[name] = int(paramutils.simplify_state_name(name))
else:
sorted_to_states[name] = name
sorted_to_states = sorted(sorted_to_states.items(), key=operator.itemgetter(1))
total = 0.0
for to_state, simple_to_state in sorted_to_states:
prob = state.transitions[to_state]
alpha = 0.6
width = 3
if 'insert' in str(simple_to_state):
label = 'insert'
color = '#3498db' # blue
elif str(simple_to_state) == 'end':
label = 'end'
color = 'red'
else: # regional/internal states
assert to_state.find('IG') == 0
label = 'internal'
color = 'green'
label_to_use = None
if color not in legend_colors:
label_to_use = label
legend_colors.add(color)
# horizontal line at height total+prob
ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, linewidth=width, alpha=alpha, label=label_to_use)
# vertical line from total to total + prob
ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=width)
midpoint = 0.5*(prob + 2*total)
# ax.text(ibin, midpoint, paramutils.simplify_state_name(to_state)) # nicely labels the midpoint of the chunk between lines, but there isn't really room for it
total += prob
ibin += 1
ax.get_xaxis().set_visible(False)
plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
示例5: remove_gene
def remove_gene(glfo, gene, debug=False):
""" remove <gene> from <glfo> """
region = utils.get_region(gene)
if gene in glfo["seqs"][region]:
if debug:
print " removing %s from glfo" % utils.color_gene(gene)
del glfo["seqs"][region][gene]
if region in utils.conserved_codons[glfo["chain"]]:
del glfo[utils.conserved_codons[glfo["chain"]][region] + "-positions"][gene]
else:
if debug:
print " can't remove %s from glfo, it's not there" % utils.color_gene(gene)
示例6: find_partial_failures
def find_partial_failures(self, fostream_name):
unique_ids = []
for line in open(fostream_name.replace(".fostream", "")).readlines():
if len(self.sim_need) == 0:
return
if len(line.strip()) == 0: # skip blank lines
continue
line = line.replace('"', "")
line = line.split(";")
unique_id = line[0]
if "NA" not in line: # skip lines that were ok
unique_ids.append(unique_id)
continue
if unique_id not in self.sim_need:
continue
if unique_id not in self.siminfo:
continue # not looking for this <unique_id> a.t.m.
info = {}
info["unique_id"] = unique_id
for stuff in line:
for region in utils.regions: # add the first instance of IGH[VDJ] (if it's there at all)
if "IGH" + region.upper() in stuff and region + "_gene" not in info:
genes = re.findall("IGH" + region.upper() + "[^ ][^ ]*", stuff)
if len(genes) == 0:
print "ERROR no %s genes in %s" % (region, stuff)
gene = genes[0]
if gene not in self.germline_seqs[region]:
print "ERROR bad gene %s for %s" % (gene, unique_id)
sys.exit()
info[region + "_gene"] = gene
self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
if self.args.debug:
print "%-20s partial fail %s %s %s" % (
unique_id,
utils.color_gene(info["v_gene"]) if "v_gene" in info else "",
utils.color_gene(info["d_gene"]) if "d_gene" in info else "",
utils.color_gene(info["j_gene"]) if "j_gene" in info else "",
),
print " (true %s %s %s)" % tuple(
[self.siminfo[unique_id][region + "_gene"] for region in utils.regions]
)
self.failtails[unique_id] = info
self.n_partially_failed += 1
self.sim_need.remove(unique_id)
return unique_ids
示例7: skip_gene
def skip_gene(gene):
if self.args.debug:
print ' %s in list of genes to skip' % utils.color_gene(gene)
if gene not in genes_actually_skipped:
genes_actually_skipped[gene] = 0
genes_actually_skipped[gene] += 1
qr_info['skip_gene'] = True
示例8: write_hmm_input
def write_hmm_input(self, csv_fname, sw_info, parameter_dir, preclusters=None, hmm_type='', pair_hmm=False, stripped=False):
print ' writing input'
csvfile = opener('w')(csv_fname)
start = time.time()
# write header
header = ['names', 'k_v_min', 'k_v_max', 'k_d_min', 'k_d_max', 'only_genes', 'seqs'] # I wish I had a good c++ csv reader
csvfile.write(' '.join(header) + '\n')
skipped_gene_matches = set()
assert hmm_type != ''
if hmm_type == 'k=1': # single vanilla hmm
nsets = [[qn] for qn in self.input_info.keys()]
elif hmm_type == 'k=2': # pair hmm
nsets = self.get_pairs(preclusters)
elif hmm_type == 'k=preclusters': # run the k-hmm on each cluster in <preclusters>
assert preclusters != None
nsets = [ val for key, val in preclusters.id_clusters.items() if len(val) > 1 ] # <nsets> is a list of sets (well, lists) of query names
# nsets = []
# for cluster in preclusters.id_clusters.values():
# nsets += itertools.combinations(cluster, 5)
elif hmm_type == 'k=nsets': # run on *every* combination of queries which has length <self.args.n_sets>
if self.args.all_combinations:
nsets = itertools.combinations(self.input_info.keys(), self.args.n_sets)
else: # put the first n together, and the second group of n (not the self.input_info is and OrderedDict)
nsets = []
keylist = self.input_info.keys()
this_set = []
for iquery in range(len(keylist)):
if iquery % self.args.n_sets == 0: # every nth query, start a new group
if len(this_set) > 0:
nsets.append(this_set)
this_set = []
this_set.append(keylist[iquery])
if len(this_set) > 0:
nsets.append(this_set)
else:
assert False
for query_names in nsets:
non_failed_names = self.remove_sw_failures(query_names, sw_info)
if len(non_failed_names) == 0:
continue
combined_query = self.combine_queries(sw_info, non_failed_names, parameter_dir, stripped=stripped, skipped_gene_matches=skipped_gene_matches)
if len(combined_query) == 0: # didn't find all regions
continue
csvfile.write('%s %d %d %d %d %s %s\n' % # NOTE csv.DictWriter can handle tsvs, so this should really be switched to use that
(':'.join([str(qn) for qn in non_failed_names]),
combined_query['k_v']['min'], combined_query['k_v']['max'],
combined_query['k_d']['min'], combined_query['k_d']['max'],
':'.join(combined_query['only_genes']),
':'.join(combined_query['seqs'])))
if len(skipped_gene_matches) > 0:
print ' not found in %s, i.e. were never the best sw match for any query, so removing from consideration for hmm:' % (parameter_dir)
for region in utils.regions:
print ' %s: %s' % (region, ' '.join([utils.color_gene(gene) for gene in skipped_gene_matches if utils.get_region(gene) == region]))
csvfile.close()
print ' input write time: %.3f' % (time.time()-start)
示例9: add_new_allele
def add_new_allele(glfo, newfo, remove_template_genes, debug=False):
"""
Add a new allele to <glfo>, specified by <newfo> which is of the
form: {'template-gene' : 'IGHV3-71*01', 'gene' : 'IGHV3-71*01+C35T.T47G', 'seq' : 'ACTG yadda yadda CGGGT'}
If <remove_template_genes>, we also remove 'template-gene' from <glfo>.
"""
template_gene = newfo['template-gene']
region = utils.get_region(template_gene)
if template_gene not in glfo['seqs'][region]:
raise Exception('unknown template gene %s' % template_gene)
new_gene = newfo['gene']
if region == 'v':
glfo['cyst-positions'][new_gene] = glfo['cyst-positions'][template_gene]
elif region == 'j':
glfo['tryp-positions'][new_gene] = glfo['tryp-positions'][template_gene]
glfo['seqs'][region][new_gene] = newfo['seq']
if debug:
print ' adding new allele to glfo:'
print ' template %s %s' % (glfo['seqs'][region][template_gene], utils.color_gene(template_gene))
print ' new %s %s' % (utils.color_mutants(glfo['seqs'][region][template_gene], newfo['seq']), utils.color_gene(new_gene))
if remove_template_genes:
remove_gene(glfo, template_gene, debug=True)
示例10: finalize
def finalize(self, debug=False):
assert not self.finalized
self.mfreqer.finalize()
start = time.time()
gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()}
if debug:
print '\nlooking for new alleles:'
for gene in sorted(self.mfreqer.counts):
if utils.get_region(gene) != 'v':
continue
if debug:
print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene]))
positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug)
if positions_to_try_to_fit is None:
continue
fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')}
for istart in range(1, self.n_max_snps):
if debug:
if istart == 1:
print ' resid. / ndof'
print ' position ratio (m=0 / m>%5.2f) muted / obs ' % self.big_y_icpt_bounds[0]
print ' %d %s' % (istart, utils.plural_str('snp', istart))
subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit}
self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug)
if istart not in fitfo['candidates']: # if it didn't get filled, we didn't have enough observations to do the fit
break
istart_candidates = []
if debug:
print ' evaluating each snp hypothesis'
print ' snps min ratio'
for istart in fitfo['candidates']:
if debug:
print ' %2d %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])),
if self.is_a_candidate(gene, fitfo, istart, debug=debug):
istart_candidates.append(istart)
if len(istart_candidates) > 0:
n_candidate_snps = min(istart_candidates) # add the candidate with the smallest number of snps to the germline set, and run again (if the firs
gene_results['new_allele'].add(gene)
print '\n found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps,
utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)),
self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug)
else:
gene_results['didnt_find_anything_with_fit'].add(gene)
if debug:
print ' no new alleles'
if debug:
print 'found new alleles for %d %s (there were also %d without new alleles, and %d without enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])),
len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit']))
print ' allele finding time: %.1f' % (time.time()-start)
self.finalized = True
示例11: set_per_gene_support
def set_per_gene_support(self, true_line, inf_line, region):
if inf_line[region + '_per_gene_support'].keys()[0] != inf_line[region + '_gene']:
print ' WARNING best-supported gene %s not same as viterbi gene %s' % (utils.color_gene(inf_line[region + '_per_gene_support'].keys()[0]), utils.color_gene(inf_line[region + '_gene']))
support = inf_line[region + '_per_gene_support'].values()[0] # sorted, ordered dict with gene : logprob key-val pairs
if true_line[region + '_gene'] == inf_line[region + '_gene']: # NOTE this requires allele to be correct, but set_bool_column() does not
self.hists[region + '_allele_right_vs_per_gene_support'].fill(support)
else:
self.hists[region + '_allele_wrong_vs_per_gene_support'].fill(support)
示例12: find_partial_failures
def find_partial_failures(self, fostream_name):
unique_ids = []
for line in open(fostream_name.replace('.fostream', '')).readlines():
if len(self.sim_need) == 0:
return
if len(line.strip()) == 0: # skip blank lines
continue
line = line.replace('"', '')
line = line.split(';')
unique_id = line[0]
if 'NA' not in line: # skip lines that were ok
unique_ids.append(unique_id)
continue
if unique_id not in self.sim_need:
continue
if unique_id not in self.siminfo:
continue # not looking for this <unique_id> a.t.m.
info = {}
info['unique_id'] = unique_id
for stuff in line:
for region in utils.regions: # add the first instance of IGH[VDJ] (if it's there at all)
if 'IGH'+region.upper() in stuff and region+'_gene' not in info:
genes = re.findall('IGH' + region.upper() + '[^ ][^ ]*', stuff)
if len(genes) == 0:
print 'ERROR no %s genes in %s' % (region, stuff)
gene = genes[0]
if gene not in self.germline_seqs[region]:
print 'ERROR bad gene %s for %s' % (gene, unique_id)
sys.exit()
info[region + '_gene'] = gene
self.perfplotter.add_partial_fail(self.siminfo[unique_id], info)
if self.args.debug:
print '%-20s partial fail %s %s %s' % (unique_id,
utils.color_gene(info['v_gene']) if 'v_gene' in info else '',
utils.color_gene(info['d_gene']) if 'd_gene' in info else '',
utils.color_gene(info['j_gene']) if 'j_gene' in info else ''),
print ' (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions])
self.failtails[unique_id] = info
self.n_partially_failed += 1
self.sim_need.remove(unique_id)
return unique_ids
示例13: remove_gene
def remove_gene(glfo, gene, debug=False):
""" remove <gene> from <glfo> """
if debug:
print ' removing %s from glfo' % utils.color_gene(gene)
region = utils.get_region(gene)
if region in utils.conserved_codons:
del glfo[utils.conserved_codons[region] + '-positions'][gene]
del glfo['seqs'][region][gene]
示例14: build_v_gene_set
def build_v_gene_set(glfo, introns):
total_d_counts = {}
refseqs = {}
for d_gene, counts in introns.items():
total_d_counts[d_gene] = sum(counts.values())
for d_gene, _ in sorted(total_d_counts.items(), key=operator.itemgetter(1), reverse=True):
counts = introns[d_gene]
# first decide on the reference sequences
refseq, column_counts = None, None
for seq in sorted(counts, key=len, reverse=True):
if refseq is None: # first one, i.e. the longest
refseq = seq
column_counts = [{n : 0 for n in utils.nukes} for i in range(len(refseq))]
ioffset = len(refseq) - len(seq)
partial_refseq = refseq[ioffset:]
assert len(partial_refseq) == len(seq)
for ibase in range(ioffset, len(refseq)):
column_counts[ibase][seq[ibase - ioffset]] += counts[seq]
refseqs[d_gene] = []
for basecounts in column_counts:
most_common_base = sorted(basecounts.items(), key=operator.itemgetter(1), reverse=True)[0][0]
refseqs[d_gene].append(most_common_base)
refseqs[d_gene] = ''.join(refseqs[d_gene])
n_ok = 0
mutecounts = {}
for seq in sorted(counts, key=len, reverse=True):
# print ' %3d %150s' % (count, seq)
partial_refseq = refseqs[d_gene][len(refseqs[d_gene]) - len(seq):]
if seq == partial_refseq:
n_ok += counts[seq]
else:
# utils.color_mutants(partial_refseq, seq, print_result=True, extra_str=' ')
n_mutes = utils.hamming_distance(partial_refseq, seq)
if n_mutes not in mutecounts:
mutecounts[n_mutes] = 0
mutecounts[n_mutes] += counts[seq]
print ' %s %4d / %-4d ok' % (utils.color_gene(d_gene, width=10), n_ok, n_ok + sum(mutecounts.values())),
if len(mutecounts) > 0:
print '(mean of %.1f mutations among the other %d' % (numpy.average(mutecounts.keys(), weights=mutecounts.values()), sum(mutecounts.values())),
print ''
# add the intronic v genes to glfo
for d_gene, refseq in refseqs.items():
glfo['seqs']['v'][utils.generate_dummy_v(d_gene)] = refseq
glfo['cyst-positions'][utils.generate_dummy_v(d_gene)] = len(refseq) - 3
# write a glfo dir with everything
glutils.write_glfo(outdir + '/germlines/imgt-and-intronic', glfo, debug=True)
# remove the original v genes, and write a glfo dir with just the intronic ones
glutils.remove_genes(glfo, [g for g in glfo['seqs']['v'] if 'xDx' not in g], debug=True)
glutils.write_glfo(outdir + '/germlines/intronic', glfo, debug=True)
示例15: make_mutefreq_plot
def make_mutefreq_plot(plotdir, gene_name, positions):
""" NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py. """
nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
fig, ax = plotting.mpl_init()
fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])
ibin = 0
print utils.color_gene(utils.unsanitize_name(gene_name))
legend_colors = set()
for info in positions:
posname = info['name']
# make label below bin
ax.text(-0.5 + ibin, -0.075, simplify_state_name(posname), rotation='vertical', size=8)
total = 0.0
alpha = 0.6
for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True):
color = nuke_colors[nuke]
label_to_use = None
if color not in legend_colors:
label_to_use = nuke
legend_colors.add(color)
# horizontal line at height total+prob
ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use)
# vertical line from total to total + prob
ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3)
# # write [ACGT] at midpoint between total and total+prob
# midpoint = 0.5*(prob + 2*total)
# ... *redacted*
total += prob
ibin += 1
ax.get_xaxis().set_visible(False)
plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})