本文整理汇总了Python中Bio.SeqIO.parse方法的典型用法代码示例。如果您正苦于以下问题:Python SeqIO.parse方法的具体用法?Python SeqIO.parse怎么用?Python SeqIO.parse使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块Bio.SeqIO的用法示例。
在下文中一共展示了SeqIO.parse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: read_fasta
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def read_fasta(mirna_fasta_file, mrna_fasta_file):
    """Read a miRNA and an mRNA FASTA file into parallel id/sequence lists.

    Parameters
    ----------
    mirna_fasta_file: str
        Path of the FASTA file holding the miRNA records.
    mrna_fasta_file: str
        Path of the FASTA file holding the mRNA records.

    Returns
    -------
    tuple
        ``(mirna_ids, mirna_sequences, mrna_ids, mrna_sequences)``. Each
        element is a list of ``str`` in file order; ids and sequences of the
        same file line up index by index.
    """
    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11 and
    # text mode already applies universal newlines. The ``with`` blocks make
    # sure the handles are closed even if parsing fails.
    with open(mirna_fasta_file, "r") as handle:
        mirna_list = list(SeqIO.parse(handle, "fasta"))
    with open(mrna_fasta_file, "r") as handle:
        mrna_list = list(SeqIO.parse(handle, "fasta"))

    mirna_ids = [str(record.id) for record in mirna_list]
    mirna_sequences = [str(record.seq) for record in mirna_list]
    mrna_ids = [str(record.id) for record in mrna_list]
    mrna_sequences = [str(record.seq) for record in mrna_list]
    return (mirna_ids, mirna_sequences, mrna_ids, mrna_sequences)
示例2: encode_bio_sequence
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
    """
    Parse a sequence file and one-hot encode the sequences it contains.

    Parameters
    ----------
    fname: str
        Path of the sequence file to read.
    file_type: str
        Format name handed to ``Bio.SeqIO.parse`` (for example ``fasta`` or
        ``fastq``).
    letters: str
        Alphabet forwarded to ``seq_one_hot_encode``, e.g. ``ATCG``.

    Returns
    -------
    np.ndarray: Whatever ``seq_one_hot_encode`` returns; documented as having
        shape (N_sequences, N_letters, sequence_length, 1).
    """
    # Imported lazily so Biopython is only required when this is called.
    from Bio import SeqIO

    return seq_one_hot_encode(SeqIO.parse(fname, file_type), letters)
示例3: count_records
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def count_records(fasta_file):
    """Collect the ids of all records in a fasta file.

    If the file yields no record at all, an error is logged and the process
    exits with status 1.

    Args:
        fasta_file (string): the path to a fasta file

    Returns:
        list: a list of record ids, in file order
    """
    logger = logging.getLogger(__name__)
    record_list = [record.id for record in SeqIO.parse(fasta_file, "fasta")]
    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would let an empty file pass silently.
    if not record_list:
        logger.error(
            'Failed to find records in genome(s) file:%s', fasta_file)
        sys.exit(1)
    return record_list
示例4: READ_FASTA_ENTRY
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def READ_FASTA_ENTRY(file_name):
    """Read a FASTA file into a dict mapping short record names to sequences.

    The short name is the record id up to its first space. When two records
    share a short name, the later one overwrites the earlier one.

    Parameters
    ----------
    file_name: str
        Path of the FASTA file.

    Returns
    -------
    dict
        ``{short_name: sequence_string}``; empty when the file has size 0.
    """
    sequences_dict = {}
    # Skip parsing entirely for zero-byte files.
    if os.stat(file_name).st_size != 0:
        # ``with`` closes the handle; the original left it open.
        with open(file_name, "r") as fh:
            for record in SeqIO.parse(fh, "fasta"):
                short_name = str(record.id).split(' ')[0]
                sequences_dict[short_name] = str(record.seq)
    return sequences_dict
#------------------------------------------------------------------------------------------
示例5: READ_FASTA_ENTRY
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def READ_FASTA_ENTRY(file_name):
    """Read a FASTA file and return its records in several parallel views.

    The short name of a record is its id up to the first space.

    Parameters
    ----------
    file_name: str
        Path of the FASTA file.

    Returns
    -------
    tuple
        ``(sequences_dict, fasta_sequences, sequence_name, full_names_dict)``:

        * ``sequences_dict`` maps short name to sequence string,
        * ``fasta_sequences`` lists the sequence strings in file order,
        * ``sequence_name`` lists the short names in file order,
        * ``full_names_dict`` maps short name to ``str(record.id)``.

        All four are empty when the file has size 0. In the dicts a later
        record with the same short name overwrites an earlier one.
    """
    fasta_sequences = []
    sequence_name = []
    full_names_dict = {}
    sequences_dict = {}
    # Skip parsing entirely for zero-byte files.
    if os.stat(file_name).st_size != 0:
        # ``with`` closes the handle; the original left it open.
        with open(file_name, "r") as fh:
            for record in SeqIO.parse(fh, "fasta"):
                full_name = str(record.id)
                sequence = str(record.seq)
                short_name = full_name.split(' ')[0]
                sequence_name.append(short_name)
                full_names_dict[short_name] = full_name
                fasta_sequences.append(sequence)
                sequences_dict[short_name] = sequence
    return sequences_dict, fasta_sequences, sequence_name, full_names_dict
示例6: _read_reference
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def _read_reference(self, reference):
    """Load the reference FASTA into ``self._reference``.

    Nothing is returned; the result is stored on the instance.

    Parameters
    ----------
    reference: str
        Path to the reference FASTA file, or ``None`` for no reference.

    Notes
    -----
    With ``reference=None`` the attribute is set to ``None``. Otherwise it
    becomes a dict mapping each record id to the record's sequence as a list
    of single letters.
    """
    if reference is None:
        self._reference = None
        return

    self._reference = {}
    with open(reference) as fasta_handle:
        for entry in SeqIO.parse(fasta_handle, "fasta"):
            self._reference[entry.id] = list(entry.seq)
示例7: fetch_names
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def fetch_names(id_list):
    """Fetch the organism name of each GenBank entry listed in ``id_list``.

    Ids are requested through ``Entrez.efetch`` in slices of at most 100 and
    a progress line is written to stderr for every slice. The database is
    taken from the name ``db`` in the enclosing scope, not from a parameter.

    Parameters
    ----------
    id_list: list
        Identifiers handed to ``Entrez.efetch``.

    Returns
    -------
    dict
        Maps ``record.name`` to ``record.annotations['organism']`` for every
        GenBank record parsed from the responses. Empty for an empty
        ``id_list``.
    """
    organism_names = {}
    # Doing 100 by 100 to make sure requests to NCBI are not too big
    for i in range(0, len(id_list), 100):
        j = min(i + 100, len(id_list))
        sys.stderr.write(
            "Fetching entries from %s to %s from GenBank\n" % (i, j))
        sys.stderr.flush()
        result_handle = Entrez.efetch(db=db, rettype="gb", id=id_list[i:j])
        # Close the handle after each batch so connections are not leaked,
        # even if parsing fails midway.
        try:
            # Populate result per organism name
            for record in parse(result_handle, 'genbank'):
                # Using NCBI name, which should match accession number passed
                organism_names[record.name] = record.annotations['organism']
        finally:
            result_handle.close()
    return organism_names
示例8: addspacerstodict
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def addspacerstodict(gendict, sfile):
    """Attach spacer sequences from a FASTA file to the entries of ``gendict``.

    Each record name is split on ``|`` into accessions. An accession is split
    at its last underscore: the part before it is the key looked up in
    ``gendict``, the part after it is the spacer order. The record sequence is
    stored under ``gendict[key]['Spacers'][order]``. Keys missing from
    ``gendict`` are reported with ``print`` and skipped.

    Parameters
    ----------
    gendict: dict
        Mapping updated in place; its values must support item assignment.
    sfile: str or handle
        FASTA source passed to ``SeqIO.parse``.

    Returns
    -------
    dict
        The same ``gendict`` object.
    """
    for entry in SeqIO.parse(sfile, 'fasta'):
        spacer_seq = str(entry.seq)
        for accession in entry.name.split('|'):
            # Everything before the last "_" is the id, the rest the order.
            acc_id, _, order = accession.rpartition('_')
            try:
                gendict[acc_id].setdefault('Spacers', {})[order] = spacer_seq
            except KeyError:
                print('Error on accession id: %s' % acc_id)
    return gendict
示例9: compare_outputs
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def compare_outputs(reference_db, test_db, out_name):
    """Compare the record ids of a test FASTA against a reference FASTA.

    Every id of both files is printed. The check stops at the first problem.

    Parameters
    ----------
    reference_db: str or handle
        FASTA source with the expected records.
    test_db: str or handle
        FASTA source with the records actually produced.
    out_name: str
        Label used in the mismatch messages.

    Returns
    -------
    bool
        ``True`` if the test file holds an id absent from the reference, or
        the reference holds an id absent from the test file; ``False`` when
        both id sets agree.
    """
    # Value 0 = not (yet) seen in the test output, 1 = seen.
    reference_ids = {}
    print('Expect the following sequences to be predicted to be eukaryotic:')
    for record in SeqIO.parse(reference_db, "fasta"):
        reference_ids[record.id] = 0
        print(record.id)
    # Check for unexpected output
    print('The following sequences were predicted to be eukaryotic:')
    for record in SeqIO.parse(test_db, "fasta"):
        print(record.id)
        if record.id not in reference_ids:
            print('\nUnexpected scaffold %s in %s' % (record.id, out_name))
            return True
        reference_ids[record.id] = 1
    # Check for missing output
    for scaffold, seen in reference_ids.items():
        # Compare by value; ``is 0`` relied on int identity, which is an
        # implementation detail and triggers a SyntaxWarning on Python 3.8+.
        if seen == 0:
            print('\nMissing scaffold %s in %s' % (scaffold, out_name))
            return True
    return False
示例10: find_gaps
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def find_gaps(input_filename):
    """Print every run of ``N``/``n`` bases found in a FASTA file.

    One tab-separated line ``id<TAB>start<TAB>end`` is printed per run, where
    ``start`` is the 0-based index of the first N and ``end`` is the index of
    the first base after the run (or the record length if the run reaches the
    end of the record).

    Parameters
    ----------
    input_filename: str or handle
        FASTA source passed to ``SeqIO.parse``.
    """
    for entry in SeqIO.parse(input_filename, "fasta"):
        run_start = None  # None means we are currently outside a gap
        for pos in _range(len(entry)):
            is_gap_base = entry.seq[pos].upper() == "N"
            if is_gap_base and run_start is None:
                # First N of a new run.
                run_start = pos
            elif not is_gap_base and run_start is not None:
                # First non-N after a run: report it and reset.
                print("\t".join(str(field) for field in (entry.id, run_start, pos)))
                run_start = None
        if run_start is not None:
            # The record ended while still inside a run.
            print("\t".join(str(field) for field in (entry.id, run_start, len(entry))))
示例11: _align_proteins_to_hmm
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def _align_proteins_to_hmm(self, protein_sequences, hmm_file):
    '''Run hmmalign on the given proteins and return the aligned sequences.

    The proteins are written as FASTA text and fed to ``hmmalign`` on stdin
    through ``extern.run``; its output is parsed as Stockholm format.

    Parameters
    ----------
    protein_sequences: generator / list of tuple(name,sequence) objects
        Only index 0 (name) and index 1 (sequence) of each item are used.
    hmm_file: str
        Path of the HMM, inserted single-quoted into the command line.

    Returns
    -------
    list of AlignedProteinSequence
        One item per record in the hmmalign output; may be empty.
    '''
    hmmalign_cmd = "hmmalign '{}' /dev/stdin".format(hmm_file)
    fasta_input = ''.join(
        ">{}\n{}\n".format(entry[0], entry[1]) for entry in protein_sequences)
    stockholm_text = extern.run(hmmalign_cmd, stdin=fasta_input)

    protein_alignment = [
        AlignedProteinSequence(rec.name, str(rec.seq))
        for rec in SeqIO.parse(StringIO(stockholm_text), 'stockholm')
    ]

    if not protein_alignment:
        logging.debug("No aligned sequences found for this HMM")
    else:
        first = protein_alignment[0]
        logging.debug("Read in %i aligned sequences e.g. %s %s" % (
            len(protein_alignment), first.name, first.seq))
    return protein_alignment
示例12: main
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def main(args):
    """Split every FASTA record into its unambiguous ACGT stretches.

    Each sequence is upper-cased and cut at every run of characters other
    than A, C, G or T. Pieces strictly longer than ``args.minlength`` are
    written as records named ``<record id>_chunk_<n>``, with ``n`` counting
    from 1 within each input record.

    Parameters
    ----------
    args: object
        Needs the attributes ``infile``, ``outfile``, ``discard`` (iterable of
        regexes or a falsy value; records whose id matches any are skipped),
        ``minlength`` and ``debug`` (forwarded to ``printlog``).
    """
    for record in SeqIO.parse(args.infile, 'fasta'):
        # All patterns are evaluated before deciding, as in a full count.
        if args.discard and any([re.match(rx, record.id) for rx in args.discard]):
            continue
        printlog(args.debug, "DEBUG: convert to upper case", record.id)
        sequence = str(record.seq).upper()
        printlog(args.debug, "DEBUG: split seq by Ns", record.id)
        subseqs = [
            piece
            for piece in re.split('[^ACGT]+', sequence)
            if len(piece) > args.minlength
        ]
        printlog(args.debug, "DEBUG: print subseqs", record.id)
        for chunk_number, subseq in enumerate(subseqs, start=1):
            subid = '{:s}_chunk_{:d}'.format(record.id, chunk_number)
            # NOTE(review): presumably args.outfile is an open handle; with a
            # path each write would replace the file -- confirm with caller.
            SeqIO.write(SeqRecord(Seq(subseq), subid, '', ''), args.outfile, 'fasta')
示例13: filter_se_fastq_by_qual
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def filter_se_fastq_by_qual(fastq_filename, output_filename=None, min_bp_quality=20, min_single_bp_quality=0):
    """Write the reads of a FASTQ file that pass quality thresholds to a gzip file.

    A read is kept when the mean of its phred qualities is at least
    ``min_bp_quality`` and its lowest phred quality is at least
    ``min_single_bp_quality``.

    Parameters
    ----------
    fastq_filename: str
        Input FASTQ path; opened with gzip when it ends in ``.gz``.
    output_filename: str, optional
        Gzip-compressed output path. When falsy it is derived from the input
        name by removing ``.fastq`` and ``.gz`` and appending
        ``_filtered.fastq.gz``.
    min_bp_quality: number
        Minimum mean phred quality of a read.
    min_single_bp_quality: number
        Minimum phred quality of every single base of a read.

    Returns
    -------
    str
        The path of the output file.

    Raises
    ------
    Exception
        If parsing the input or writing the output fails; the underlying
        error is chained as the cause.
    """
    if not output_filename:
        output_filename = fastq_filename.replace('.fastq', '').replace('.gz', '') + '_filtered.fastq.gz'

    opener = gzip.open if fastq_filename.endswith('.gz') else open
    # Text mode on both sides: SeqIO parses text and record.format() returns
    # str, while gzip.open defaults to binary.
    with opener(fastq_filename, 'rt') as fastq_handle:
        try:
            # ``with`` closes the gzip stream so the archive is finalised.
            with gzip.open(output_filename, 'wt') as fastq_filtered_outfile:
                for record in SeqIO.parse(fastq_handle, "fastq"):
                    qualities = np.array(record.letter_annotations["phred_quality"])
                    if qualities.mean() >= min_bp_quality \
                            and qualities.min() >= min_single_bp_quality:
                        fastq_filtered_outfile.write(record.format('fastq'))
        except Exception as err:
            # Same exception type and message as before, cause preserved.
            raise Exception('Error handling the fastq_filtered_outfile') from err
    return output_filename
示例14: filter_se_fastq_by_qual
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def filter_se_fastq_by_qual(fastq_filename, output_filename=None, min_bp_quality=20, min_single_bp_quality=0):
    """Write the reads of a FASTQ file that pass quality thresholds to a gzip file.

    A read is kept when the mean of its phred qualities is at least
    ``min_bp_quality`` and its lowest phred quality is at least
    ``min_single_bp_quality``.

    Parameters
    ----------
    fastq_filename: str
        Input FASTQ path; opened with gzip when it ends in ``.gz``.
    output_filename: str, optional
        Gzip-compressed output path. When falsy it is derived from the input
        name by removing ``.fastq`` and ``.gz`` and appending
        ``_filtered.fastq.gz``.
    min_bp_quality: number
        Minimum mean phred quality of a read.
    min_single_bp_quality: number
        Minimum phred quality of every single base of a read.

    Returns
    -------
    str
        The path of the output file.

    Raises
    ------
    Exception
        If parsing the input or writing the output fails; the underlying
        error is chained as the cause.
    """
    if not output_filename:
        output_filename = fastq_filename.replace('.fastq', '').replace('.gz', '') + '_filtered.fastq.gz'

    opener = gzip.open if fastq_filename.endswith('.gz') else open
    # Text mode on both sides: SeqIO parses text and record.format() returns
    # str, while gzip.open defaults to binary.
    with opener(fastq_filename, 'rt') as fastq_handle:
        try:
            # ``with`` closes the gzip stream so the archive is finalised.
            with gzip.open(output_filename, 'wt') as fastq_filtered_outfile:
                for record in SeqIO.parse(fastq_handle, "fastq"):
                    qualities = np.array(record.letter_annotations["phred_quality"])
                    if qualities.mean() >= min_bp_quality \
                            and qualities.min() >= min_single_bp_quality:
                        fastq_filtered_outfile.write(record.format('fastq'))
        except Exception as err:
            # Same exception type and message as before, cause preserved.
            raise Exception('Error handling the fastq_filtered_outfile') from err
    return output_filename
示例15: pasa_transcript2gene
# 需要导入模块: from Bio import SeqIO [as 别名]
# 或者: from Bio.SeqIO import parse [as 别名]
def pasa_transcript2gene(input):
    """Map each mRNA ID to its gene ID and location using FASTA header lines.

    Only lines starting with ``>`` are read. After removing ``>`` characters
    and trailing whitespace, the header is split on single spaces: the first
    field is the mRNA ID, the second the gene ID and the last the location.
    When an mRNA ID occurs more than once, its first occurrence is kept.

    Parameters
    ----------
    input: str
        Path of the transcript FASTA file.

    Returns
    -------
    dict
        ``{mRNA_ID: (gene_ID, location)}``.
    """
    transcript_to_gene = {}
    with open(input, 'r') as fasta:
        for raw_line in fasta:
            if not raw_line.startswith('>'):
                continue
            fields = raw_line.rstrip().replace('>', '').split(' ')
            # setdefault leaves an existing entry alone: first header wins.
            transcript_to_gene.setdefault(fields[0], (fields[1], fields[-1]))
    return transcript_to_gene