本文整理汇总了Python中Bio.SearchIO.read方法的典型用法代码示例。如果您正苦于以下问题:Python SearchIO.read方法的具体用法?Python SearchIO.read怎么用?Python SearchIO.read使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Bio.SearchIO
的用法示例。
在下文中一共展示了SearchIO.read方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: read_write_and_compare
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def read_write_and_compare(self, source_file, source_format, out_file,
out_format, **kwargs):
"""Compares read QueryResults after it has been written to a file."""
source_qresult = SearchIO.read(source_file, source_format, **kwargs)
SearchIO.write(source_qresult, out_file, out_format, **kwargs)
out_qresult = SearchIO.read(out_file, out_format, **kwargs)
self.assertTrue(compare_search_obj(source_qresult, out_qresult))
示例2: reciprocal_hmm_search
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def reciprocal_hmm_search(modelname, modelname_regex, filename, organism, rev_inc_bitscore_percentage, out_filename = None):
''' Performs reciprocal hmmer search against the proteome of given organism
'''
print "# Reciprocal search..."
is_found = False
if out_filename == None:
out_filename = modelname + ".rechits_" + organism
reciprocal_search_command = "phmmer --noali --tblout " + out_filename + " " + filename + " " + proteomes_dir + organism + ".fasta > hmmer_res"
os.system(reciprocal_search_command)
try:
hits = SearchIO.read(out_filename, "hmmer3-tab")
max_bitscore = hits[0].bitscore
except:
hits = []
max_bitscore = 0
if len(hits) > 0:
if re.search(modelname_regex, hits[0].description):
is_found = True
# for h in hits:
# if h.bitscore > rev_inc_bitscore_percentage * max_bitscore:
# if manual_mode:
# print modelname_regex, h.description, re.search(modelname_regex, h.description)
# raw_input("...")
# if re.search(modelname_regex, h.description):
# is_found = True
if manual_mode:
print filename, is_found
raw_input("Check reciprocal search results...")
return is_found
示例3: __init__
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def __init__(self,Maxicircle,out_file):
file_out=open(out_file,'w')
writer = csv.writer(file_out,delimiter="\t")
writer.writerow(["##gff-version","3"])
rows=[]
for protein in glob.glob("/Users/Said/Github/Maxicircle/DB/AA/*.faa"):
output_file=protein.split("/")[-1]+".xml"
blastx_cline = NcbiblastxCommandline(query=Maxicircle , db=protein,
outfmt=5, out=output_file)
blastx_cline()
blast_qresult = SearchIO.read(output_file, 'blast-xml')
if len(blast_qresult)>0:
best=blast_qresult[0][0]
query_range=[x for x in best.query_range]
if best.query_strand>0:
query_strand="+"
else:
query_strand="-"
chromosome=best.query_id
rows.append([chromosome,".","exon",query_range[0],query_range[1],".",query_strand,".","ID="+protein.split("/")[-1].split(".faa")[0]])
print(str(len(rows))+" exons found")
rows=iter(rows)
writer.writerows(rows)
示例4: read_hmmer_file
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def read_hmmer_file(f_path):
"""
Uses Biopython's SearchIO to parse a HMMER output file.
Returns an iterator with search hits.
"""
f_path = _check_file(f_path)
return SearchIO.read(f_path, "hmmer3-text")
示例5: find_frameshift
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def find_frameshift(sbjct_dict, query_dict, pseudo_hits,
temp_dir, out_frameshift):
assert(os.path.isdir(temp_dir))
sys.stderr.write("finding frameshift mutations...\n")
frameshifts = []
non_frameshift = []
exn_query = temp_dir + "/" + "exn_query.fa"
exn_target = temp_dir + "/" + "exn_target.fa"
align_file = temp_dir + "/" + "fshift_exonerate.exn"
for hit in pseudo_hits:
chrom = hit[0]
record = sbjct_dict[chrom]
qseqid = hit[8].split(";")[0].split("=")[1]
SeqIO.write(query_dict[qseqid], exn_query, "fasta")
flank = 1000
flank_record = _get_hit_record(record, hit, flank)
SeqIO.write(flank_record, exn_target, "fasta")
# alignment using exonerate
p = subprocess.Popen("exonerate -m protein2dna -n 1 -q " +
exn_query + " -t " + exn_target + ">" +
align_file, shell=True)
os.waitpid(p.pid, 0)
fshift = False
try:
qresult = SearchIO.read(align_file, "exonerate-text")
hsp = qresult[0][0] # first hit, first hsp
# query overlapping with the best-hit
new_hit_start = flank + 1
new_hit_end = len(flank_record.seq) - flank
if hsp.hit_start + 1 <= new_hit_end and \
hsp.hit_end >= new_hit_start:
# there are frameshifts
if len(hsp.hit_frame_all) > 1:
fshift = True
except:
pass
if fshift:
fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
fshift_id = hit[0] + ":" + \
str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
frameshift_record = SeqRecord(
fshift_seq, id=fshift_id, name=fshift_id,
description="qseqid=" + qseqid)
frameshifts.append(frameshift_record)
else:
non_frameshift.append(hit)
# end for
SeqIO.write(frameshifts, out_frameshift, "fasta")
sys.stderr.write("done.\n")
return non_frameshift
示例6: getIndices
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def getIndices(resultHandle):
'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.'''
blast_result = SearchIO.read(resultHandle, 'blast-tab')
print(blast_result[0][0])
start = blast_result[0][0].hit_start
end = blast_result[0][0].hit_end
return start, end
示例7: runLocalBLAST
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def runLocalBLAST(genome, genomeDict,
genomeFastaF, genomeDatabaseF, workspaceRootDir, sourceIndex,
neighbor_num, bitscore_cutoff):
# auxiliary files
queryF = workspaceRootDir + 'query_active' + str(sourceIndex) + '.fasta'
giF = workspaceRootDir + 'girestrict_active' + str(sourceIndex) + '.txt'
blastOutF = workspaceRootDir + 'active_blast' + str(sourceIndex) + '.xml'
# load sequences
reciter = SeqIO.parse(genomeFastaF, 'fasta')
for index, aaRecord in enumerate(reciter):
# load information, find in genome
thisGene = parseNCBIDescription(aaRecord.description)
chromIndex = genomeDict[thisGene.gi][0]
geneIndex = genomeDict[thisGene.gi][1]
# write sequence
with open(queryF, 'w') as queryFHandle:
SeqIO.write(aaRecord, queryFHandle, 'fasta')
# write neighbors
neighbors = getNeighbors(chromIndex, geneIndex, neighbor_num, genome)
with open(giF, 'w') as giFHandle:
giFHandle.write('\n'.join(neighbors) + '\n')
# run BLAST
blastp_cline = NcbiblastpCommandline(query=queryF,
db=genomeDatabaseF, evalue=0.1, outfmt=5, out=blastOutF,
gilist=giF, dbsize=neighbor_num, searchsp=neighbor_num)
stdout, stderr = blastp_cline()
# parse BLAST output
blastOutput = SearchIO.read(blastOutF, 'blast-xml')
genome[chromIndex][geneIndex].alignments = parseBLASTOut(blastOutput,
thisGene.gi,
bitscore_cutoff)
reciter.close()
os.system('rm ' + queryF)
os.system('rm ' + giF)
os.system('rm ' + blastOutF)
示例8: parseBLAST
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def parseBLAST(blastXML):
blast_qresult = SearchIO.read(StringIO(blastXML), 'blast-xml')
# initialize output variables
num_hits = 0
num_hsps =evalue = qstart = qend = hit_seq = "NA"
if (len(blast_qresult)):
num_hits = len(blast_qresult)
# HIT related data
num_hsps = len(blast_qresult[0]) # blast_qresult[0] -> num hsps of first hit
# HSP related data ( blast_qresult[0][0] -> first HSP from first HIT)
evalue = blast_qresult[0][0].evalue
qstart = blast_qresult[0][0].query_start+1 # it's the XML coordinate -1
qend = blast_qresult[0][0].query_end
hit_seq = str(blast_qresult[0][0][0].hit.seq) # this is a SeqRecord object converted to string (blast_qresult[0][0][0] -> third level, fragments of the HSP)
return(num_hits, num_hsps, evalue, qstart, qend, hit_seq)
示例9: filter_forw_hmm_hits
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def filter_forw_hmm_hits(modelname, forw_inc_bitscore_percentage, unique=True):
''' Reads hmmer hits from standart hmmer output.
Returns hits with bitscore more than provided max bitscore percentage.
By default returns only hits that from unique loci.
'''
gene_regex = re.compile(r'GN=([^=]+)')
try:
hits = SearchIO.read(modelname + ".hits", "hmmer3-tab")
except:
hits = []
# select the longest isoform
filtered_hits = []
try:
max_bitscore = hits[0].bitscore
except:
max_bitscore = 0
unique_genes = []
na_gene_count = 0
for h in hits:
# try to guess gene from hit description
gene = None
for m in gene_regex.finditer(h.description):
gene = m.group(1)[:-3]
if not gene:
gene = "gene" + str(na_gene_count)
na_gene_count += 1
#print h.id, gene, na_gene_count
if gene not in unique_genes:
unique_genes.append(gene)
if h.bitscore > max_bitscore * forw_inc_bitscore_percentage:
filtered_hits.append(h)
if manual_mode:
print unique_genes
# raw_input("...")
return filtered_hits
示例10: read_blat
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def read_blat(path):
data = {}
for name in glob.glob(path):
try:
blat_result = SearchIO.read(name, 'blat-psl')
except:
continue
filter_func = lambda hit: hit.query_span >= 200 and (hit.hit_span <= hit.query_span * 1.2 and hit.query_span <= hit.hit_span * 1.2)
for hit in blat_result:
hit = hit.filter(filter_func)
if hit:
#print(hit)
hsp_hit = []
for hsp in hit:
if hsp.query_id != hsp.hit_id:
#if hsp.is_fragmented:
hsp_hit.append(hsp.query_start)
hsp_hit.append(hsp.query_end)
print(hsp.query_id, hsp.hit_id, hsp.query_start, hsp.query_end, hsp.hit_start, hsp.hit_end)
if len(hsp_hit) > 0:
ranges = ["{},{} ".format(hsp_hit[i], hsp_hit[i + 1] - hsp_hit[i]) for i in range(0, len(hsp_hit), 2)]
if hsp.query_id in data:
data[hsp.query_id] += "".join(ranges)
#data[hsp.query_id] += "{},{} ".format(min(hsp_hit), max(hsp_hit) - min(hsp_hit))
else:
data[hsp.query_id] = "".join(ranges)
#data[hsp.query_id] = "{},{} ".format(min(hsp_hit), max(hsp_hit) - min(hsp_hit))
print(ranges)
return data
示例11: blat_alignment
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def blat_alignment(mapping, reference, scliplen_cutoff, lowqual_cutoff, min_percent_hq, mapq_cutoff, blat_ident_pct_cutoff, gfServer_port, hetero_factor, input, output):
bwa_bam = pysam.Samfile(input, 'rb')
blat_bam = pysam.Samfile(output + '.temp.bam', 'wb', template=bwa_bam)
if hetero_factor != 'a':
denovo = open(output+'.temp.fasta', 'w')
putative_indel_cluster = set()
for read in bwa_bam.fetch(until_eof=True):
if read.is_secondary:
# secondary alignment
continue
if read.is_unmapped:
if hetero_factor != 'a':
print >> denovo, '>' + read.qname
print >> denovo, read.seq
continue
soft_len, soft_qual, soft_pos = get_softclip_length(read)
sclip_ratio = soft_len / float(read.rlen)
if soft_pos != -1:
sclip_hq_ratio = len(soft_qual[soft_qual >= lowqual_cutoff]) / float(len(soft_qual))
else:
sclip_hq_ratio = 0
if sclip_ratio >= scliplen_cutoff and sclip_hq_ratio >= min_percent_hq and read.mapq >= mapq_cutoff:
blat_aln = False
soft_chr = bwa_bam.getrname(read.rname)
if hetero_factor == 'a':
blat_aln = True
elif (soft_chr, soft_pos) in putative_indel_cluster:
blat_aln = True
print >> denovo, '>' + read.qname
print >> denovo, read.seq
if not mapping:
blat_bam.write(read)
continue
# estimate the probability of indels given the coverage and number of soft-clipping readss
elif prob_of_indel_with_error(input, soft_chr, soft_pos, hetero_factor) < 0.05:
putative_indel_cluster.add((soft_chr, soft_pos))
blat_aln = True
print >> denovo, '>' + read.qname
print >> denovo, read.seq
if not mapping:
blat_bam.write(read)
continue
if blat_aln:
fa = open(output+'.temp.fa', 'w')
print >> fa, '>' + read.qname
print >> fa, read.seq
fa.close()
try:
subprocess.check_call('gfClient localhost ' + gfServer_port +' '+ reference['blat'] + ' ' + output + '.temp.fa ' + output + '.temp.psl >/dev/null 2>&1 ', shell=True)
except subprocess.CalledProcessError as e:
print >> sys.stderr, 'Execution failed for gfClient:', e
sys.exit(1)
try:
blat = SearchIO.read(output+'.temp.psl', 'blat-psl')
print >> sys.stderr, 'Blat aligned read:',read.qname
except:
print >> sys.stderr, 'No blat hit for read:',read.qname
blat_bam.write(read)
continue
hsps = blat.hsps
hsps.sort(key=lambda k: k.score, reverse=True)
if hsps[0].ident_pct / 100 >= blat_ident_pct_cutoff and hsps[0].hit_id == bwa_bam.getrname(read.tid):
# matching genomic coordinate
if hsps[0].hit_start == read.pos or hsps[0].hit_end == read.aend:
cigarstring, soft_len = psl2sam(hsps[0], blat.seq_len)
if soft_len == 0:
read.cigarstring, read.pos = cigarstring, hsps[0].hit_start
blat_bam.write(read)
bwa_bam.close()
blat_bam.close()
if hetero_factor != 'a':
denovo.close()
os.system('samtools sort ' + output + '.temp.bam ' + output + '.temp.sorted')
os.system('mv ' + output + '.temp.sorted.bam ' + output)
os.system('samtools index ' + output)
bwa_bam = pysam.Samfile(input, 'rb')
readlen = bwa_bam.next().rlen
bwa_bam.close()
return readlen
示例12: hits
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def hits(self):
if not self.out_path:
raise Exception("You can't access results from HMMER before running the algorithm.")
return SearchIO.read(self.out_path, 'hmmer3-tab')
示例13: open
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
continue
for each in read.cigar:
if each[0] == 4 and each[1]/float(read.rlen) >= cutoff:
# generating fasta file
fa = open('temp.fa','w')
print >> fa,'>'+read.qname
print >> fa, read.seq
fa.close()
# run blat
code = os.system('gfClient localhost 50000 '+sys.argv[1]+' temp.fa temp.psl >/dev/null 2>&1 ')
if code != 0:
print 'Execute gfClient failed!'
sys.exit(1)
try:
blat = SearchIO.read('temp.psl','blat-psl')
except:
break
hsps = blat.hsps
hsps.sort(key=lambda k:k.score, reverse=True)
if hsps[0].hit_id == bwa_bam.getrname(read.tid):
# matching genomic coordinate
if hsps[0].hit_start == read.pos or hsps[0].hit_end==read.aend:
cigarstring, soft_len = psl2sam(hsps[0],blat.seq_len)
if soft_len == 0:
read.cigarstring, read.pos= cigarstring, hsps[0].hit_start
break
blat_bam.write(read)
bwa_bam.close()
blat_bam.close()
示例14: open
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
This script takes quite a long time and often interrupts, due to NCBI not
responding in the alloted time. Has to be restarted several times when
350 sequences were analyzed, removing the sequences for which the results were
obtained.
Run as:
python blast_annotation.py proteins.faa > output.tab
"""
from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast import NCBIWWW
from sys import argv
sequences = open(argv[1], 'r')
for sequence in SeqIO.parse(sequences, "fasta"):
result_handle = NCBIWWW.qblast("blastp", "nr", str(sequence.seq),
hitlist_size=10, expect=1e-03)
save_file = open("my_blast.xml", "w")
save_file.write(result_handle.read())
save_file.close()
result_handle.close()
blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml')
for i in range(0, len(blast_qresult)):
blast_hsp = blast_qresult[i][0]
evalue = blast_hsp.evalue
desc = blast_hsp.hit_description
hit_id = blast_hsp.hit_id
print sequence.id, '\t', evalue, '\t', hit_id, '\t', desc
示例15: blast_it
# 需要导入模块: from Bio import SearchIO [as 别名]
# 或者: from Bio.SearchIO import read [as 别名]
def blast_it (input):
try:
t0 = datetime.datetime.now()
# **** SETTINGS ****
bitscore_cutoff = 100 # see calibration description in notes
# find clusters using basic ordinal filter:
# take max(min(max(left side neighbor genes), max(right side neighbor genes)), self)
# then set hard cutoff (maybe >= 3 matches on edges, 6 on peak, 2 on length)
# -> cuts off immediately once you're past the edge, but includes everything inside
edge_thresh = 2.5 # must be greater than (e.g. 3 or more)
peak_thresh = 5.5
length_thresh = 1.5
clust_extend = 0 # take N extra genes on either side just to show window
# *********************
# INPUT:
sufi = input[0]
sourcedir = input[1]
timestr = input[2]
# Obtain list of files
suffix_f = open(sourcedir, 'rU')
suffix_r = csv.reader(suffix_f)
suffixpaths = []
for row in suffix_r:
if len(row) == 0:
break
suffixpaths.append(row[0])
#print(row[0])
suffix_f.close()
suf = suffixpaths[sufi]
if suf[-len('.fasta'):] == '.fasta':
suf = suf[0:-len('.fasta')]
# assemble list of genes on chromosome
chromnames = []
chromgenes = []
reciter = SeqIO.parse(suf + '.fasta', 'fasta')
for index, aarecord in enumerate(reciter):
dparse = bracketparse(aarecord.description, '[]')
newgeneobj = gene_obj()
newgeneobj.gi = aarecord.description.split(' ')[0].split('|')[1]
for dp in dparse:
if dp.split('=')[0] == 'chromosome':
newgeneobj.chromosome = dp.split('=')[1]
continue
if dp.split('=')[0] == 'location':
newgeneobj.location = [int(bracketparse(dp.split('=')[1], '()')[0].split(',')[0]),
int(bracketparse(dp.split('=')[1], '()')[0].split(',')[1])]
continue
if dp.split('=')[0] == 'direction':
newgeneobj.direction = dp.split('=')[1]
continue
if dp.split('=')[0] == 'description':
newgeneobj.description = '='.join(dp.split('=')[1:])
continue
if dp.split('=')[0] == 'protein_id':
newgeneobj.ncbi_id = dp.split('=')[1]
continue
if newgeneobj.chromosome not in chromnames:
chromnames.append(newgeneobj.chromosome)
chromgenes.append([])
chromgenes[-1].append(newgeneobj)
reciter.close()
# sort genes by location
for chromi in range(len(chromnames)):
chromgenes[chromi].sort(key=lambda thsgi: thsgi.location[0])
# assemble nearby genes
neighbor_num = 10 #** move
neighbor_wind = int(round(neighbor_num/2))
# window = 10000 could do this with nt #
for chromi in range(len(chromgenes)):
for ind, aaobj in enumerate(chromgenes[chromi]):
#if suf.find('Streptomyces') == -1: # linear chromosome
chromgenes[chromi][ind].nearls = [chromgenes[chromi][thsi].gi
for thsi in range(max([ind-neighbor_wind, 0]),
min([ind+neighbor_wind+1, len(chromgenes[chromi])]))]
#else: #circular chromosome
# chromgenes[chromi][ind].nearls = [chromgenes[chromi][thsi].gi
# for thsi in np.mod(range(ind-neighbor_wind,
# ind+neighbor_wind+1), len(chromgenes[chromi]))]
# run local BLAST
db_f = suf + 'BLASTdb'
db_root = '/'.join(suf.split('/')[0:-1]) + '/'
gis = []
inds = []
for chromi in range(len(chromgenes)):
inds += zip([chromi]*len(chromgenes[chromi]), range(len(chromgenes[chromi])))
gis += [thsgene.gi for thsgene in chromgenes[chromi]]
aadict = dict(zip(gis, inds))
query_fname = db_root + 'query_active' + str(sufi) + '.fasta'
gi_fname = db_root + 'girestrict_active' + str(sufi) + '.txt'
#.........这里部分代码省略.........