本文整理汇总了Python中Bio.SearchIO.parse方法的典型用法代码示例。如果您正苦于以下问题:Python SearchIO.parse方法的具体用法?Python SearchIO.parse怎么用?Python SearchIO.parse使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Bio.SearchIO
的用法示例。
在下文中一共展示了SearchIO.parse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_write_and_compare
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
"""Compares parsed QueryResults after they have been written to a file."""
source_qresults = list(SearchIO.parse(source_file, source_format, **kwargs))
SearchIO.write(source_qresults, out_file, out_format, **kwargs)
out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
for source, out in zip(source_qresults, out_qresults):
self.assertTrue(compare_search_obj(source, out))
示例2: parseBlastOutFile
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def parseBlastOutFile(filename):
if filename[-3:] == "xml":
qResultGen = SearchIO.parse(filename, 'blast-xml')
elif filename[-3:] == "txt":
qResultGen = SearchIO.parse(filename, 'blast-tab')
else:
print("Unrecognized filetype.")
assert False
parsed = {qRes.id : qRes for qRes in qResultGen}
print("Parsed "+filename)
return parsed
示例3: blastparse
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def blastparse(stdout, output, tname, ntname):
global recorddict, minLength
handle = open(output, 'w') # open the target fasta file for writing
blast_handle = cStringIO.StringIO(stdout) # Convert string to IO object for use in SearchIO using StringIO
try: # Necessary to avoid bad genomes
for qresult in SearchIO.parse(blast_handle, 'blast-tab'): # Parse the blast output sting as if it were a file
for hit in qresult: # Hit object
for hsp in hit: # Hsp object
begin = hsp.query_range[0] # Start of hsp
finish = hsp.query_range[1] # End of hsp
if hsp.query_id in recorddict:
# For the Contig name in the target fasta dictionary mask using coordinates
if finish > begin:
recorddict[hsp.query_id].seq = \
recorddict[hsp.query_id].seq[:begin] + 'N' * (finish - begin + 1) \
+ recorddict[hsp.query_id].seq[finish:]
else:
recorddict[hsp.query_id].seq \
= recorddict[hsp.query_id].seq[:finish] + 'N' * (begin - finish + 1) \
+ recorddict[hsp.query_id].seq[begin:]
recorddict_bak = deepcopy(recorddict) # Copy the dictionary so we may iterate and modify the result
for idline in recorddict_bak:
# pattern = r'[^N]{'+ re.escape(str(minLength))+r'}' # Find a sequence of at least the target length
pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength))+r'}'
if re.match(pattern, str(recorddict[idline].seq)) is not None:
SeqIO.write(recorddict[idline], handle, "fasta")
else:
# print 'Contig \'%s\' not written to file' % id
recorddict.pop(idline)
except ValueError:
print 'Value Error: There was an error removing %s genome from %s' % (ntname, tname)
示例4: process_blast_output
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def process_blast_output(file, simple, argparser):
qresults = SearchIO.parse(file, 'blast-xml')
if simple:
for qresult in qresults:
for hit in qresult:
for hsp in hit:
if ((hsp.aln_span == argparser.cont and (hsp.gap_num == 0) and
(hsp.aln_span == hsp.ident_num)) or (hsp.aln_span > argparser.cont)):
yield ([str(hsp), "\n\n"], None, hsp.aln_span)
for hsp in hit:
if (hsp.aln_span >= argparser.cont and (hsp.gap_num == 0) and
(hsp.aln_span == hsp.ident_num)):
yield (None, [str(hsp), "\n\n"], hsp.aln_span)
else:
for qresult in qresults:
for hit in qresult:
for hsp in hit:
for v, c, p in encode(simstr(hsp.aln)):
if v == "1" and c >= argparser.cont:
yield (format_alignment(hsp, p, c), None, c)
for hsp in hit:
for t0, t1, t2 in thrids(encode(simstr(hsp.aln))):
if t0[0] == "1":
assert (t0[2] < t1[2] < t2[2])
assert t2[0] == "1"
assert t1[0] == "0"
if t0[1] >= argparser.leftmin and t2[1] >= argparser.rightmin and \
(t0[1] + t2[1]) >= argparser.summin and \
t1[1] <= argparser.gapmax:
if not (argparser.S and
(t0[1] >= argparser.cont or t2[1] >= argparser.cont)):
yield (None, format_alignment(hsp, t0[2], t0[1], t2[2], t2[1]),
t0[1]+t2[1]-t1[1])
示例5: parse_n_fill_run_data_searchio
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def parse_n_fill_run_data_searchio(run_path, run_data, querydb):
run_id = get_run_id(run_path)
run_format = get_run_format(run_path)
for query in SearchIO.parse(run_path, run_format):
for hit in query.hits:
for hsp in hit.hsps:
exons = [x.hit_range for x in hsp.fragments]
coverage = 'N/A'
if querydb is not None:
total_matched = sum(x.query_span for x in hsp.fragments)
coverage = '{:.2f}%'.format(100 * total_matched / len(querydb[query.id]))
if hasattr(hsp, 'score'):
score = hsp.score
elif hasattr(hsp, 'bitscore'):
score = hsp.bitscore
else:
score = 'N/A'
if hasattr(hsp, 'ident_num') and hasattr(query, 'seq_len'):
matched = '{:.2f}%'.format(100 * hsp.ident_num / query.seq_len)
else:
matched = 'N/A'
alignment = AlignmentData(run_id, score, matched, coverage, hsp.hit_range, exons)
run_data[query.id][hit.id].append(alignment)
示例6: check_index
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def check_index(self, filename, format, **kwargs):
# check if Python3 installation has sqlite3
try:
import sqlite3
except ImportError:
sqlite3 = None
parsed = list(SearchIO.parse(filename, format, **kwargs))
# compare values by index
indexed = SearchIO.index(filename, format, **kwargs)
self.assertEqual(len(parsed), len(indexed.keys()))
# compare values by index_db, only if sqlite3 is present
if sqlite3 is not None:
db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
self.assertEqual(len(parsed), len(db_indexed.keys()))
for qres in parsed:
idx_qres = indexed[qres.id]
# parsed and indexed qresult are different objects!
self.assertNotEqual(id(qres), id(idx_qres))
# but they should have the same attribute values
self.assertTrue(compare_search_obj(qres, idx_qres))
# sqlite3 comparison, only if it's present
if sqlite3 is not None:
dbidx_qres = db_indexed[qres.id]
self.assertNotEqual(id(qres), id(dbidx_qres))
self.assertTrue(compare_search_obj(qres, dbidx_qres))
indexed._proxy._handle.close() # TODO - Better solution
if sqlite3 is not None:
db_indexed.close()
db_indexed._con.close()
示例7: parse_hmmscan_tab
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def parse_hmmscan_tab(infile, print_header=True):
'''Parse hmmscan output in --tblout format'''
if print_header:
yield "query","top hit","evalue","certainty","num sig hits"
records = SearchIO.parse(infile,'hmmer3-tab')
for rec in records:
query = rec.id
if len(rec) > 1:
hit1,hit2 = rec.hits[0],rec.hits[1]
eval1,eval2 = hit1.evalue,hit2.evalue
if eval1 != 0: # convert to -ln evalue
eval1 = -np.log(eval1)
if eval2 != 0:
eval2 = -np.log(eval2)
if eval1 == 0 and eval2 != 0: # this may be a hack, I don't care
certainty = 1
elif eval1 == 0 and eval2 == 0:
certainty = 0
else: # calculate certainty with info theoretic calc.
total = eval1 + eval2
p1,p2 = eval1/total, eval2/total
certainty = 1 + (p1 * np.log2(p1)) + (p2 * np.log2(p2))
else:
certainty = 1
yield query, rec.hits[0].id, rec.hits[0].evalue, certainty, len(rec)
示例8: first_exonerate_parse
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def first_exonerate_parse(dir, newdir, prefix):
cwd = os.getcwd()
if not os.path.exists(cwd + newdir):
os.makedirs(cwd + newdir)
if not os.path.exists(cwd + '/merged_exons/'):
os.makedirs(cwd + '/merged_exons/')
for file in slistdir(cwd + dir):
if 'DS_Store' not in file:
result = SearchIO.parse(cwd + dir + file, 'exonerate-text')
for h in result:
for hh in h:
for hhh in hh:
hitcounter = 1
for hhhh in hhh:
hitseq = hhhh.query
rootname = file.split('.fasta')
orthoname = file.split("_")
orthosubdir = cwd + newdir + '/' + orthoname[0]
if not os.path.exists(orthosubdir):
os.makedirs(orthosubdir)
newseqstr = str(hitseq.seq.ungap("-"))
newid = prefix + str(hitcounter) + '_' + rootname[0]
record = SeqRecord(Seq(newseqstr, generic_dna), id = newid, description = '')
fastaname = prefix + str(hitcounter) + '_' + rootname[0] + '.fasta'
SeqIO.write(record, orthosubdir + '/' + fastaname, "fasta")
hitcounter += 1
示例9: generate_blast_graph
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def generate_blast_graph(self):
evalue_filter = lambda hsp: hsp.evalue < self.evalue
file_name = "{}/blast_graph.txt".format(self.blast_output_path)
for blast_file in glob.glob(self.blast_data_path):
print("working on " + blast_file)
# Parse the Blast file
qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
for qresult in qresults:
write_line = ""
write_line += qresult.id + ":"
# Go to the Hit section of query
for hit in qresult[:]:
if not self.blast_graph.has_node(qresult.id):
self.blast_graph.add_node(qresult.id)
# Check if Hit has min value
filtered_hit = hit.filter(evalue_filter)
if filtered_hit is not None:
if not self.blast_graph.has_node(filtered_hit.id):
self.blast_graph.add_node(filtered_hit.id)
# Add Edge between graph nodes
self.blast_graph.add_edge(qresult.id, filtered_hit.id)
write_line += filtered_hit.id + ","
if write_line != "":
with open(file_name, "a") as f_handle:
f_handle.write(write_line + '\n')
# Write GML files
if self.generate_gml_files:
file_name = "{}/blast_graph.gml".format(self.blast_output_path)
with open(file_name, "a") as f_handle:
nx.write_gml(self.blast_graph, f_handle)
示例10: _call_hmmer
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def _call_hmmer(hmm, inputproteins):
inputproteins = list(inputproteins)
scores = {}
for ip in inputproteins:
scores[ip.id] = 0
with ntf(prefix="/dev/shm/") as inputfasta:
with ntf(prefix="/dev/shm/") as hmmoutput:
SeqIO.write(inputproteins, inputfasta.name, 'fasta')
hmmfile = os.path.join(hmm_location, hmm + '.hmm')
sp.call(['hmmsearch', '-o', hmmoutput.name, hmmfile, inputfasta.name])
hmmoutput.flush()
hmmoutput.seek(0)
QRS = SearchIO.parse(hmmoutput, format="hmmer3-text")
for qr in QRS:
# there's *always* a QR, even though it's usually empty.
# qr.sort()
# I'm kind of hoping this sorts by hit strength.
# worth checking. I guess it doesn't matter anyway.
for hit in qr:
scores[hit.id] = max(scores[hit.id], hit.bitscore)
for hsp in hit.hsps:
def appropriate_hyphens(m):
return '-' * len(m.group(0))
if len(hsp.hit.seq) > 100:
hitseq = re.sub('PPPPP+', appropriate_hyphens, str(hsp.hit.seq))
hitseq = hitseq.translate(None,'-*').upper()
yield hit.id, hsp.bitscore, hitseq
示例11: get_adenylation_domains
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
adenylation_domains = []
fasta_seqs = []
for fs in SeqIO.parse(fasta, 'fasta'):
revcom=False
seq = str(fs.seq)
pepseq, rf = get_pepseq(seq)
if rf < 0 == lagging_strand:
revcom=True
seq = utils.reverse_complement(seq)
fasta_seqs.append({'id': fs.id, 'seq': seq, 'pepseq': pepseq, 'rf': rf})
for fs in fasta_seqs:
utils.run_cmd([hmmsearch, '--domtblout', 'dump', os.path.abspath('lib/AMP-binding.hmm'), '-'],
'>header\n' + pepseq)
with open('dump') as f:
out = f.read()
res_stream = StringIO(out)
os.remove('dump')
results = list(SearchIO.parse(res_stream, 'hmmsearch3-domtab'))
for result in results:
for i, hsp in enumerate(result.hsps, 1):
s = hsp.hit_start
e = hsp.hit_end
adenylation_domains.append((AdenylationDomain(fs['seq'][s*3:e*3], known, '{}_{}'.format(fs['id'], i), revcom), s, e))
return adenylation_domains
示例12: get_hit_seq
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def get_hit_seq(fastafile, filename):
yamlfile = yaml_load_file(fastafile)
blout = SearchIO.parse(filename, 'blast-text')
for query in blout:
seqid = query.id.split("\n")[0]
#print(seqid)
fh = open("multi_" + seqid + ".fasta", 'a')
yamlfile[seqid]['hits'] = {}
for hit in query.hits:
gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
yamlfile[seqid]['hits'][gi] = {}
#print(yamlfile[seqid]['hits'])
for hsp in hit.hsps:
#print(hsp.hit)
#print(hsp.hit_strand)
#print(hsp.hit_start)
#print(hsp.hit_end)
hitstart = hsp.hit_start + 1 - HIT_SEQUENCE_BPS
hitstart = 1 if hitstart < 0 else hitstart
hitend = hsp.hit_end + 1 + HIT_SEQUENCE_BPS
hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
#print(hsp.hit_end)
out = os.popen(BLAST_BINARY + "/blastdbcmd -db " + BLAST_DATABASE + " -dbtype nucl -entry " + str(gi) + " -range " + str(hitstart) + "-" + str(hitend) + " -strand " + str(hitstrand)).read()
fh.write(out)
#print(hsp.hit.seq)
#print(hsp.query.seq)
#print("-----")
fh.close()
#break
yaml_dump_file(fastafile, yamlfile)
示例13: get_scores_for_curated_via_hmm
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def get_scores_for_curated_via_hmm(self):
"""
For every curated variant we want to generate a set of scores against HMMs.
This is needed to supply the same type of information for curated as well as for automatic seqs.
"""
#Construct the one big file from all cureated seqs.
with open(self.curated_all_fasta, "w") as f:
for hist_type, seed in self.get_seeds():
seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
for s in SeqIO.parse(seed_aln_file, "fasta"):
s.seq = s.seq.ungap("-")
SeqIO.write(s, f, "fasta")
#Search it by our HMMs
self.search(hmms_db=self.combined_hmm_file, out=self.curated_search_results_file,sequences=self.curated_all_fasta)
##We need to parse this results file;
##we take here a snippet from load_hmmsearch.py, and tune it to work for our curated seq header format
for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
print "Loading hmmsearch for variant:", variant_query.id
variant_model=Variant.objects.get(id=variant_query.id)
for hit in variant_query:
gi = hit.id.split("|")[1]
seq = Sequence.objects.get(id=gi)
# print hit
try: #sometimes we get this : [No individual domains that satisfy reporting thresholds (although complete target did)]
best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
add_score(seq, variant_model, best_hsp, seq.variant==variant_model)
except:
pass
示例14: retrieve_blast_data
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def retrieve_blast_data(self):
for blast_file in glob.glob(self.blast_data_path):
print(blast_file)
print self.network_data
qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
for qresult in qresults:
if(qresult.id in self.network_data):
print qresult.id
示例15: check_index
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import parse [as 别名]
def check_index(self, filename, format, **kwargs):
# check if Python3 installation has sqlite3
try:
import sqlite3
except ImportError:
sqlite3 = None
if filename.endswith(".bgz"):
handle = gzip.open(filename)
parsed = list(SearchIO.parse(handle, format, **kwargs))
handle.close()
else:
parsed = list(SearchIO.parse(filename, format, **kwargs))
# compare values by index
indexed = SearchIO.index(filename, format, **kwargs)
self.assertEqual(len(parsed), len(indexed),
"Should be %i records in %s, index says %i"
% (len(parsed), filename, len(indexed)))
# compare values by index_db, only if sqlite3 is present
if sqlite3 is not None:
db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
self.assertEqual(len(parsed), len(db_indexed),
"Should be %i records in %s, index_db says %i"
% (len(parsed), filename, len(db_indexed)))
for qres in parsed:
idx_qres = indexed[qres.id]
# parsed and indexed qresult are different objects!
self.assertNotEqual(id(qres), id(idx_qres))
# but they should have the same attribute values
self.assertTrue(compare_search_obj(qres, idx_qres))
# sqlite3 comparison, only if it's present
if sqlite3 is not None:
dbidx_qres = db_indexed[qres.id]
self.assertNotEqual(id(qres), id(dbidx_qres))
self.assertTrue(compare_search_obj(qres, dbidx_qres))
indexed.close()
if sqlite3 is not None:
db_indexed.close()
db_indexed._con.close()
if os.path.isfile(filename + ".bgz"):
# Do the tests again with the BGZF compressed file
print("[BONUS %s.bgz]" % filename)
self.check_index(filename + ".bgz", format, **kwargs)