This article collects typical usage examples of the Python class Bio.SearchIO. If you have been wondering what exactly the SearchIO class does, how to use it, or where to find usage examples, the hand-picked class code examples below may help.
The following shows 15 code examples of the SearchIO class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
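Before diving into the examples, here is a minimal sketch of the core Bio.SearchIO entry points that most of them build on (the file names below are placeholders, not files from any particular example):

from Bio import SearchIO

# Iterate over every query in a BLAST XML report (lazy generator).
for qresult in SearchIO.parse("report.xml", "blast-xml"):
    print(qresult.id, len(qresult.hits))

# read() is for files that contain exactly one query.
qresult = SearchIO.read("single_query.xml", "blast-xml")

# Convert between supported formats in one call.
SearchIO.convert("report.xml", "blast-xml", "report.tab", "blast-tab")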
Example 1: check_raw
def check_raw(self, filename, id, raw, **kwargs):
    """Index filename using keyword arguments, check get_raw(id)==raw."""
    idx = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    # Anticipate cases where the raw string and/or file uses different
    # newline characters ~ we set everything to \n.
    new = idx.get_raw(id)
    self.assertTrue(isinstance(new, bytes),
                    "Didn't get bytes from %s get_raw" % self.fmt)
    self.assertEqual(raw.replace(b'\r\n', b'\n'),
                     new.replace(b'\r\n', b'\n'))
    idx.close()

    # Now again, but using the SQLite backend
    if sqlite3:
        idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        new = idx.get_raw(id)
        self.assertTrue(isinstance(new, bytes),
                        "Didn't get bytes from %s get_raw" % self.fmt)
        self.assertEqual(raw.replace(b'\r\n', b'\n'),
                         new.replace(b'\r\n', b'\n'))
        idx.close()

    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(filename + ".bgz", id, raw, **kwargs)
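Example 1 exercises SearchIO's indexing API from inside a test harness; in ordinary code the same calls look roughly like this (the file name and query id are placeholders):

from Bio import SearchIO

idx = SearchIO.index("report.xml", "blast-xml")   # dict-like, parsed lazily
qresult = idx["query_1"]                          # parse a single record on demand
raw = idx.get_raw("query_1")                      # raw bytes for that record
idx.close()

# index_db() keeps the file offsets in an SQLite database instead of memory;
# ":memory:" is transient, a real path would make the index persistent.
db_idx = SearchIO.index_db(":memory:", "report.xml", "blast-xml")
db_idx.close()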
Example 2: parse_write_and_compare
def parse_write_and_compare(self, source_file, source_format, out_file,
                            out_format, **kwargs):
    """Compares parsed QueryResults after they have been written to a file."""
    source_qresults = list(SearchIO.parse(source_file, source_format, **kwargs))
    SearchIO.write(source_qresults, out_file, out_format, **kwargs)
    out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
    for source, out in zip(source_qresults, out_qresults):
        self.assertTrue(compare_search_obj(source, out))
Example 3: main
import sys
import textwrap

from Bio import SearchIO

def main():
    # 'args' (parsed command-line options) and io_check() are defined
    # elsewhere in the original script.
    extensions = {'blast-tab': ['tsv', 'csv', 'blast', 'm8', 'blastm8'],
                  'blast-text': ['txt', 'bls', 'blast'], 'blast-xml': ['xml'],
                  'blat-psl': ['psl'], 'hmmer3-tab': ['tsv', 'csv'],
                  'hmmer3-text': ['txt'], 'hmmer2-text': ['txt'],
                  'exonerate-text': ['txt']}
    kwargs = args.keywords
    infile = args.infile
    in_type = args.in_type
    in_ext = infile.split('.')[-1]
    proper_ext = extensions[in_type][0]
    if in_ext not in extensions[in_type]:
        print(textwrap.fill("error: invalid input file extension \"{}\". An "
                            "appropriate extension for this input type is {}"
                            .format(in_ext, proper_ext), 79))
        sys.exit(1)
    out_type = args.out_type
    if args.output:
        outfile = io_check(args.output, 'w')
    else:
        out_ext = extensions[out_type][0]
        outfile = io_check("{}.{}".format('.'.join(infile.split('.')[:-1]), out_ext), 'w')
    print("output will be in {} and formatted as {}".format(outfile, out_type))
    SearchIO.convert(infile, in_type, outfile, out_type, out_kwargs=kwargs)
Example 4: check_index
def check_index(self, filename, format, **kwargs):
    # check whether this Python installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None
    parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed.keys()))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
        self.assertEqual(len(parsed), len(db_indexed.keys()))
    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresults are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))
    indexed._proxy._handle.close()  # TODO - Better solution
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
Example 5: read_write_and_compare
def read_write_and_compare(self, source_file, source_format, out_file,
                           out_format, **kwargs):
    """Compares a read QueryResult after it has been written to a file."""
    source_qresult = SearchIO.read(source_file, source_format, **kwargs)
    SearchIO.write(source_qresult, out_file, out_format, **kwargs)
    out_qresult = SearchIO.read(out_file, out_format, **kwargs)
    self.assertTrue(compare_search_obj(source_qresult, out_qresult))
Example 6: parseBlastOutFile
from Bio import SearchIO

def parseBlastOutFile(filename):
    if filename.endswith("xml"):
        qResultGen = SearchIO.parse(filename, 'blast-xml')
    elif filename.endswith("txt"):
        qResultGen = SearchIO.parse(filename, 'blast-tab')
    else:
        raise ValueError("Unrecognized file type: " + filename)
    parsed = {qRes.id: qRes for qRes in qResultGen}
    print("Parsed " + filename)
    return parsed
Example 7: main
from Bio import SearchIO

def main(args):
    if len(args) == 2:
        filenameRoot = args[1].split(".")[0]
        filenameXML = filenameRoot + ".xml"
        SearchIO.convert(args[1], 'blast-tab', filenameXML, 'blast-xml')
    elif len(args) == 3:
        filenameRoot = args[1].split(".")[0]
        filenameXML = args[2]
        SearchIO.convert(args[1], 'blast-tab', filenameXML, 'blast-xml')
    else:
        print("Usage: path/to/blast/tabular/file [optional path/for/new/blast/xml/file]")
Example 8: start_queryResult_generator
from Bio import SearchIO

def start_queryResult_generator(inFile, fDic, work_sheet):
    """Invoke SearchIO.parse to get a generator that steps through the
    records one QueryResult object at a time via next(generator), which
    saves memory. Casting the generator to a list also works for the
    current task, but it is not needed for parsing and sorting the records.

    See http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO-module.html
    """
    # write_qr_to_ws(), outputFileName() and build_ws_header() are helpers
    # defined elsewhere in the original script.
    qGenerator = SearchIO.parse(inFile, 'blast-xml')
    max_hits = 0
    query_count = 1
    # Step through all the records in the lump XML data file and write out
    # each separate hit to file. Also write the summary information to the
    # work sheet.
    for query_result in qGenerator:
        print('Processing Query BLAST return ' + str(query_count))
        number_hits = len(query_result.hits)
        # Extend the header out to the right if there is a new max_hits
        if number_hits > max_hits:
            max_hits = number_hits
        if number_hits == 0:
            # Construct the path plus file name for a no-hit query
            filename = str(fDic['topDir'] + fDic['noHit'] + 'Query_'
                           + str(query_count) + '_H_none.xml')
            # Write out any queries that had no hits to a no-hit subfolder
            SearchIO.write(query_result, filename, 'blast-xml')
            write_qr_to_ws(query_count, query_result, work_sheet)
        else:
            # Set up a counter of hits in the QueryResult so each hit
            # can be sliced away into its own record cleanly.
            hit_count = 0
            for hit in query_result.hits:
                total_hsps = len(hit.hsps)
                lowest_eval = hit.hsps[0].evalue
                best_hsp = hit.hsps[0]
                for hsp in hit.hsps:
                    if hsp.evalue < lowest_eval:
                        lowest_eval = hsp.evalue
                        best_hsp = hsp
                filename = str(fDic['topDir'] + outputFileName(query_count, hit, best_hsp))
                SearchIO.write(query_result[hit_count:(hit_count + 1)], filename, 'blast-xml')
                hit_count += 1
            # Write out query_result to the worksheet
            write_qr_to_ws(query_count, query_result, work_sheet)
        query_count += 1
        # break is debugging code
        # if query_count == 20:
        #     break
    build_ws_header(work_sheet, max_hits)
    return qGenerator
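A minimal sketch of the next()-stepping pattern described in the docstring above (the file name is a placeholder):

from Bio import SearchIO

qgen = SearchIO.parse("big_report.xml", "blast-xml")  # nothing is parsed yet
first = next(qgen)                 # pull one QueryResult, keeping memory flat
print(first.id, len(first.hits))
# list(qgen) would materialize every remaining record at once.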
Example 9: get_hit_seq
import os
import re

from Bio import SearchIO

def get_hit_seq(fastafile, filename):
    # HIT_SEQUENCE_BPS, BLAST_BINARY, BLAST_DATABASE and the yaml_* helpers
    # are defined elsewhere in the original script.
    yamlfile = yaml_load_file(fastafile)
    blout = SearchIO.parse(filename, 'blast-text')
    for query in blout:
        seqid = query.id.split("\n")[0]
        fh = open("multi_" + seqid + ".fasta", 'a')
        yamlfile[seqid]['hits'] = {}
        for hit in query.hits:
            gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
            yamlfile[seqid]['hits'][gi] = {}
            for hsp in hit.hsps:
                # Pad the hit coordinates by HIT_SEQUENCE_BPS on both sides;
                # blastdbcmd ranges are 1-based, so clamp the start at 1.
                hitstart = hsp.hit_start + 1 - HIT_SEQUENCE_BPS
                hitstart = 1 if hitstart < 1 else hitstart
                hitend = hsp.hit_end + 1 + HIT_SEQUENCE_BPS
                hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
                out = os.popen(BLAST_BINARY + "/blastdbcmd -db " + BLAST_DATABASE
                               + " -dbtype nucl -entry " + str(gi)
                               + " -range " + str(hitstart) + "-" + str(hitend)
                               + " -strand " + str(hitstrand)).read()
                fh.write(out)
        fh.close()
    yaml_dump_file(fastafile, yamlfile)
Example 10: reciprocal_hmm_search
import os
import re

from Bio import SearchIO

def reciprocal_hmm_search(modelname, modelname_regex, filename, organism,
                          rev_inc_bitscore_percentage, out_filename=None):
    """Performs a reciprocal hmmer search against the proteome of the given organism."""
    # proteomes_dir and manual_mode are module-level settings in the
    # original script.
    print("# Reciprocal search...")
    is_found = False
    if out_filename is None:
        out_filename = modelname + ".rechits_" + organism
    reciprocal_search_command = ("phmmer --noali --tblout " + out_filename + " "
                                 + filename + " " + proteomes_dir + organism
                                 + ".fasta > hmmer_res")
    os.system(reciprocal_search_command)
    try:
        hits = SearchIO.read(out_filename, "hmmer3-tab")
        max_bitscore = hits[0].bitscore
    except (ValueError, IndexError):  # no hits in the table
        hits = []
        max_bitscore = 0
    if len(hits) > 0:
        if re.search(modelname_regex, hits[0].description):
            is_found = True
    # for h in hits:
    #     if h.bitscore > rev_inc_bitscore_percentage * max_bitscore:
    #         if manual_mode:
    #             print(modelname_regex, h.description, re.search(modelname_regex, h.description))
    #             input("...")
    #         if re.search(modelname_regex, h.description):
    #             is_found = True
    if manual_mode:
        print(filename, is_found)
        input("Check reciprocal search results...")
    return is_found
Example 11: get_scores_for_curated_via_hmm
def get_scores_for_curated_via_hmm(self):
    """
    For every curated variant we want to generate a set of scores against HMMs.
    This is needed to supply the same type of information for curated as well
    as for automatic seqs.
    """
    # Construct one big file from all curated seqs.
    with open(self.curated_all_fasta, "w") as f:
        for hist_type, seed in self.get_seeds():
            seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
            for s in SeqIO.parse(seed_aln_file, "fasta"):
                s.seq = s.seq.ungap("-")
                SeqIO.write(s, f, "fasta")
    # Search it with our HMMs.
    self.search(hmms_db=self.combined_hmm_file,
                out=self.curated_search_results_file,
                sequences=self.curated_all_fasta)
    # We need to parse this results file; we take a snippet from
    # load_hmmsearch.py and tune it to work with our curated seq header format.
    for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
        print("Loading hmmsearch for variant:", variant_query.id)
        variant_model = Variant.objects.get(id=variant_query.id)
        for hit in variant_query:
            gi = hit.id.split("|")[1]
            seq = Sequence.objects.get(id=gi)
            try:
                # Sometimes no individual domain satisfies the reporting
                # thresholds (although the complete target did), in which
                # case the hit has no HSPs and max() raises ValueError.
                best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                add_score(seq, variant_model, best_hsp, seq.variant == variant_model)
            except ValueError:
                pass
Example 12: __init__
def __init__(self, Maxicircle, out_file):
    file_out = open(out_file, 'w')
    writer = csv.writer(file_out, delimiter="\t")
    writer.writerow(["##gff-version", "3"])
    rows = []
    for protein in glob.glob("/Users/Said/Github/Maxicircle/DB/AA/*.faa"):
        output_file = protein.split("/")[-1] + ".xml"
        blastx_cline = NcbiblastxCommandline(query=Maxicircle, db=protein,
                                             outfmt=5, out=output_file)
        blastx_cline()
        blast_qresult = SearchIO.read(output_file, 'blast-xml')
        if len(blast_qresult) > 0:
            best = blast_qresult[0][0]
            query_range = [x for x in best.query_range]
            if best.query_strand > 0:
                query_strand = "+"
            else:
                query_strand = "-"
            chromosome = best.query_id
            rows.append([chromosome, ".", "exon", query_range[0], query_range[1],
                         ".", query_strand, ".",
                         "ID=" + protein.split("/")[-1].split(".faa")[0]])
    print(str(len(rows)) + " exons found")
    rows = iter(rows)
    writer.writerows(rows)
Example 13: generate_blast_graph
def generate_blast_graph(self):
    evalue_filter = lambda hsp: hsp.evalue < self.evalue
    file_name = "{}/blast_graph.txt".format(self.blast_output_path)
    for blast_file in glob.glob(self.blast_data_path):
        print("working on " + blast_file)
        # Parse the BLAST file
        qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
        for qresult in qresults:
            write_line = ""
            write_line += qresult.id + ":"
            # Go through the hit section of the query
            for hit in qresult[:]:
                if not self.blast_graph.has_node(qresult.id):
                    self.blast_graph.add_node(qresult.id)
                # Keep only the HSPs that pass the e-value filter
                filtered_hit = hit.filter(evalue_filter)
                if filtered_hit is not None:
                    if not self.blast_graph.has_node(filtered_hit.id):
                        self.blast_graph.add_node(filtered_hit.id)
                    # Add an edge between the graph nodes
                    self.blast_graph.add_edge(qresult.id, filtered_hit.id)
                    write_line += filtered_hit.id + ","
            if write_line != "":
                with open(file_name, "a") as f_handle:
                    f_handle.write(write_line + '\n')
    # Write GML files
    if self.generate_gml_files:
        file_name = "{}/blast_graph.gml".format(self.blast_output_path)
        with open(file_name, "a") as f_handle:
            nx.write_gml(self.blast_graph, f_handle)
Example 14: parse_hmmscan_tab
import numpy as np

from Bio import SearchIO

def parse_hmmscan_tab(infile, print_header=True):
    '''Parse hmmscan output in --tblout format.'''
    if print_header:
        yield "query", "top hit", "evalue", "certainty", "num sig hits"
    records = SearchIO.parse(infile, 'hmmer3-tab')
    for rec in records:
        query = rec.id
        if len(rec) > 1:
            hit1, hit2 = rec.hits[0], rec.hits[1]
            eval1, eval2 = hit1.evalue, hit2.evalue
            if eval1 != 0:  # convert to -ln evalue
                eval1 = -np.log(eval1)
            if eval2 != 0:
                eval2 = -np.log(eval2)
            if eval1 == 0 and eval2 != 0:  # this may be a hack, I don't care
                certainty = 1
            elif eval1 == 0 and eval2 == 0:
                certainty = 0
            else:
                # Information-theoretic certainty: normalize the two scores
                # and take 1 minus their binary entropy, so it is 1 when one
                # hit dominates and 0 when the top two hits are equally good.
                total = eval1 + eval2
                p1, p2 = eval1 / total, eval2 / total
                certainty = 1 + (p1 * np.log2(p1)) + (p2 * np.log2(p2))
        else:
            certainty = 1
        yield query, rec.hits[0].id, rec.hits[0].evalue, certainty, len(rec)
Example 15: process_blast_output
from Bio import SearchIO

def process_blast_output(file, simple, argparser):
    # encode(), simstr(), thrids() and format_alignment() are helper
    # functions defined elsewhere in the original module.
    qresults = SearchIO.parse(file, 'blast-xml')
    if simple:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    if ((hsp.aln_span == argparser.cont and (hsp.gap_num == 0) and
                         (hsp.aln_span == hsp.ident_num)) or (hsp.aln_span > argparser.cont)):
                        yield ([str(hsp), "\n\n"], None, hsp.aln_span)
                for hsp in hit:
                    if (hsp.aln_span >= argparser.cont and (hsp.gap_num == 0) and
                            (hsp.aln_span == hsp.ident_num)):
                        yield (None, [str(hsp), "\n\n"], hsp.aln_span)
    else:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    for v, c, p in encode(simstr(hsp.aln)):
                        if v == "1" and c >= argparser.cont:
                            yield (format_alignment(hsp, p, c), None, c)
                for hsp in hit:
                    for t0, t1, t2 in thrids(encode(simstr(hsp.aln))):
                        if t0[0] == "1":
                            assert (t0[2] < t1[2] < t2[2])
                            assert t2[0] == "1"
                            assert t1[0] == "0"
                            if (t0[1] >= argparser.leftmin and t2[1] >= argparser.rightmin and
                                    (t0[1] + t2[1]) >= argparser.summin and
                                    t1[1] <= argparser.gapmax):
                                if not (argparser.S and
                                        (t0[1] >= argparser.cont or t2[1] >= argparser.cont)):
                                    yield (None,
                                           format_alignment(hsp, t0[2], t0[1], t2[2], t2[1]),
                                           t0[1] + t2[1] - t1[1])