本文整理匯總了 Python 中 pyfasta.Fasta.keys 方法的典型用法代碼示例。如果您正苦於以下問題:Python Fasta.keys 方法的具體用法?Python Fasta.keys 怎麼用?Python Fasta.keys 使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 pyfasta.Fasta 的用法示例。
在下文中一共展示了 Fasta.keys 方法的 15 個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的 Python 代碼示例。
示例1: check_keyfn
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def check_keyfn(path, klass, inplace):
    """Check that ``key_fn`` decides which keys a ``Fasta`` object exposes.

    The file at ``path`` is opened twice: once with a ``key_fn`` that keeps
    only the first whitespace-separated token of each header, and once with
    no ``key_fn`` at all.  ``klass`` and ``inplace`` are forwarded as
    ``record_class`` and ``flatten_inplace``.
    """
    def first_token(header):
        # Keep only the text before the first run of whitespace.
        return header.split()[0]

    trimmed = Fasta(path, record_class=klass, flatten_inplace=inplace,
                    key_fn=first_token)
    assert sorted(trimmed.keys()) == ['a', 'b', 'c'], trimmed.keys()
    # `fix` is defined elsewhere; presumably it resets on-disk state between
    # the two opens -- TODO confirm.
    fix(path)

    untrimmed = Fasta(path, record_class=klass, flatten_inplace=inplace)
    full_headers = ['a extra', 'b extra', 'c extra']
    assert sorted(untrimmed.keys()) == full_headers, (untrimmed.keys(), klass)
    fix(path)
示例2: check_keyfn2
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def check_keyfn2(path, klass, inplace):
    """Check that a ``key_fn`` joining header tokens with ``-`` is honoured.

    Opens ``path`` with ``klass`` as ``record_class`` and ``inplace`` as
    ``flatten_inplace``, then verifies both the key listing and that a
    rewritten key can be used for lookup.
    """
    def dash_joined(header):
        # "a extra" -> "a-extra"
        return "-".join(header.split())

    fasta = Fasta(path, record_class=klass, flatten_inplace=inplace,
                  key_fn=dash_joined)
    wanted = ['a-extra', 'b-extra', 'c-extra']
    assert sorted(fasta.keys()) == wanted, fasta.keys()
    # The rewritten key must also resolve to a truthy record.
    assert fasta['a-extra']
    # `fix` is defined elsewhere in the test module.
    fix(path)
示例3: main
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def main():
    """Convert the FASTA file named by the parsed arguments to a ``.phylip`` file.

    Arguments come from ``make_parser()``:

    * ``fasta_file``  -- input path.
    * ``inplace``     -- when true, open the FASTA with ``flatten_inplace=True``.
    * ``output_file`` -- destination path; when ``None`` the text of
      ``fasta_file`` before its first ``.`` plus ``.phylip`` is used.

    Output layout: a header line with the number of records and the length of
    the first record, followed by blocks covering ``LINE_LENGTH`` positions
    each, separated by blank lines.  Within a row the sequence is split into
    groups of ``CHUNK_LENGTH`` joined by single spaces.  Rows of the first
    block are labelled with the record key padded or truncated to
    ``CHUNK_LENGTH`` characters; rows of later blocks are labelled with
    ``PAD_STRING``.
    """
    args = make_parser()
    if args.inplace:
        f = Fasta(args.fasta_file, flatten_inplace=True)
    else:
        f = Fasta(args.fasta_file)

    if args.output_file is not None:
        output_path = args.output_file
    else:
        # NOTE(review): everything after the FIRST '.' is dropped, so a path
        # such as './seqs.fasta' yields '.phylip' -- confirm this is intended.
        output_path = '{0}.phylip'.format(args.fasta_file.split('.')[0])

    # The context manager closes the output file even if a write or a
    # sequence lookup raises part-way through.
    with open(output_path, 'w') as output:
        sequence_count = len(f.keys())
        # Only the first record is measured; presumably all records have the
        # same length (an alignment) -- TODO confirm.
        sequence_length = len(f[next(iter(f.keys()))])
        output.write(' {0} {1}\n'.format(sequence_count, sequence_length))

        # First block: rows carry the (padded or truncated) record name.
        for key in f.keys():
            chunks = grouper(f[key][:LINE_LENGTH], CHUNK_LENGTH)
            subseq = ' '.join(
                ''.join(item[0] for item in chunk) for chunk in chunks)
            if len(key) < CHUNK_LENGTH:
                label = key.ljust(CHUNK_LENGTH)
            else:
                label = key[:CHUNK_LENGTH]
            output.write('{0} {1}\n'.format(label, subseq))
        sequence_length -= LINE_LENGTH
        start = LINE_LENGTH
        stop = LINE_LENGTH * 2
        output.write('\n')

        # Remaining blocks: rows carry PAD_STRING instead of the name, and
        # short trailing groups are filled with spaces.
        while sequence_length > 0:
            for key in f.keys():
                chunks = grouper(f[key][start:stop], CHUNK_LENGTH, ' ')
                subseq = ' '.join(
                    ''.join(item[0] for item in chunk) for chunk in chunks)
                output.write('{0} {1}\n'.format(PAD_STRING, subseq))
            sequence_length -= LINE_LENGTH
            start += LINE_LENGTH
            stop += LINE_LENGTH
            output.write('\n')
示例4: run
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def run(self, filename):
    """Annotate every record of ``filename`` and write one result row each.

    For each record name the V-gene name is taken as the text before the
    first ``_``.  Records whose V gene has no regions
    (``getVGeneRegions`` returns ``None``) or for which ``findFR4`` returns
    ``None`` are skipped.  For the rest, a row is written to
    ``self.result_kabat_file`` containing the record name, the ten values of
    the V-gene regions, and four more positions offset by the tenth region
    value.

    Progress is printed every 1000 records and a summary at the end.
    ``self.not_found_in_kabat`` and ``self.fr4_not_found`` are reset to 0
    here; nothing in this method increments them, so presumably the helper
    methods do -- TODO confirm.
    """
    self.openOutFiles(filename)
    current = 0
    try:
        f = Fasta(filename)
        count = len(f)
        self.not_found_in_kabat, self.fr4_not_found = 0, 0
        for name in f.keys():
            current += 1
            if current % 1000 == 0:
                # Single-argument parenthesised form prints identically on
                # Python 2 and Python 3.
                print("All %d. Current: %d" % (count, current))
            # format: vName_jName{frameNumber} or
            #         vName_dName{frameNumber}_jName{frameNumber}
            vGeneName = name.split("_")[0]
            vGeneRegions = self.getVGeneRegions(vGeneName)
            if vGeneRegions is None:
                continue
            # Drop everything before the last region boundary of the V gene.
            cut = vGeneRegions[self.kabat.regions_count * 2 - 1]
            withoutMarkup = f[name][cut:]
            group = self.findFR4(name, withoutMarkup)
            if group is None:
                continue
            base = vGeneRegions[9]
            tail = [base + i for i in
                    (1, group.start(), group.start() + 1, len(withoutMarkup))]
            self.result_kabat_file.write(name)
            # The format string requires exactly ten region values.
            self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(tail))
    finally:
        # Close the output files even when a record fails part-way through.
        self.closeOutFiles()
    print("all: {}; not in kabat: {}; without fr4: {}".format(
        current, self.not_found_in_kabat, self.fr4_not_found))
示例5: aa_seq
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def aa_seq(options):
    """Return the ancestral sequence record selected by ``options``.

    Args:
        options: object providing ``ancestralfasta`` (path of the FASTA
            file), ``single_chromosome`` (flag), and, when that flag is
            false, ``header`` (a regex template in which ``?`` stands for
            the chromosome) and ``chromosome``.

    Returns:
        The record stored in the FASTA file under the selected key.  With
        ``single_chromosome`` set, this is the first key of the file;
        otherwise it is the last key whose header matches the regex.

    Raises:
        Exception: if no header matches the regex.
    """
    f = Fasta(options.ancestralfasta)
    keyz = f.keys()
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence;
        # that sequence should be the sequence of interest.
        key = list(keyz)[0]
    else:
        header_regex = options.header.replace('?', options.chromosome)
        match = None
        for candidate in keyz:
            if re.match(header_regex, candidate) is not None:
                match = candidate
        if match is None:
            raise Exception("No match possible is something wrong with the"
                            " regex specified to the program as"
                            " --header-regex")
        # Look up the MATCHED header, not whichever key the loop ended on.
        key = match
    return f[key]
示例6: _no_empty
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def _no_empty(self, lista, listb):
    """Drop entries whose FASTA file is empty or cannot be opened.

    Args:
        lista: labels, parallel to ``listb``.
        listb: paths of FASTA files.

    Returns:
        A tuple ``(kept_a, kept_b)`` of two new lists holding the surviving
        label/path pairs in their original order.
    """
    kept_a = []
    kept_b = []
    for label, path in zip(lista, listb):
        try:
            records = Fasta(path, record_class=MemoryRecord)
            is_empty = len(records.keys()) == 0
        except Exception as err:
            # Best effort: an unreadable file is skipped, but say which one.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logging.warning("bad fasta file: %s (%s)", path, err)
            continue
        if is_empty:
            continue
        kept_a.append(label)
        kept_b.append(path)
    return kept_a, kept_b
示例7: create_fasta_flat_file
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def create_fasta_flat_file(file):
    """Open a FASTA file for keyed sequence retrieval.

    Headers are keyed by their first whitespace-separated token.

    Returns:
        A tuple ``(fasta, headers)`` where ``fasta`` is the ``Fasta`` object
        and ``headers`` is a ``set`` of its keys.
    """
    def first_token(header):
        return header.split()[0]

    indexed = Fasta(file, key_fn=first_token)
    return indexed, set(indexed.keys())
示例8: genome_contenct_stats
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def genome_contenct_stats(fasta_path):
    """Count ``CACGTG`` motifs in a FASTA file and report the total on stderr.

    Every record of ``fasta_path`` is searched case-insensitively; the sum
    over all records is written to ``sys.stderr`` as
    ``total gboxes:<n>``.  Nothing is returned.
    """
    f = Fasta(fasta_path)
    # Compile once instead of once per record.
    g_box = re.compile("CACGTG", flags=re.IGNORECASE)
    total = sum(len(g_box.findall(f[seqid][:])) for seqid in f.keys())
    # `print >> sys.stderr, ...` is Python-2-only syntax (a TypeError at run
    # time on Python 3); an explicit write behaves the same on both.
    sys.stderr.write("total gboxes:{0}\n".format(total))
示例9: split
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def split(args):
    """Command-line entry point: split one FASTA file into several files.

    ``args`` is the argument list handed to ``optparse``.  Exactly one
    positional FASTA path is expected together with ``-n`` or ``--header``;
    otherwise the help text is printed and the process exits.

    With ``--header`` one output file per record is named from the template;
    otherwise ``-n`` files are produced, optionally cut into k-mers of
    ``-k`` base pairs overlapping by ``-o``.
    """
    usage = """\
split a fasta file into separated files.
pyfasta split -n 6 [-k 5000 ] some.fasta
the output will be some.1.fasta, some.2.fasta ... some.6.fasta
the sizes will be as even as reasonable.
"""
    header_help = """this overrides all other options. if specified, it will
split the file into a separate file for each header. it
will be a template specifying the file name for each new file.
e.g.: "%(fasta)s.%(seqid)s.fasta"
where 'fasta' is the basename of the input fasta file and seqid
is the header of each entry in the fasta file."""
    kmers_help = """\
split big files into pieces of this size in basepairs. default
default of -1 means do not split the sequence up into k-mers, just
split based on the headers. a reasonable value would be 10Kbp"""

    parser = optparse.OptionParser(usage)
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
                      help=header_help, default=None)
    parser.add_option("-n", "--n", type="int", dest="nsplits",
                      help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap",
                      help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                      help=kmers_help)

    options, fasta = parser.parse_args(args)
    if not fasta or not (options.nsplits or options.header):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    # The sentinel defaults (-1 and 0) mean "option not used".
    kmer = None
    if options.kmers != -1:
        kmer = options.kmers
    overlap = None
    if options.overlap != 0:
        overlap = options.overlap

    f = Fasta(fasta)
    if options.header:
        # One output file name per record, built from the user's template.
        names = {
            seqid: options.header % dict(fasta=f.fasta_name, seqid=seqid)
            for seqid in f.keys()
        }
        return with_header_names(f, names)

    names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
                     header=options.header)
    if options.kmers == -1:
        return without_kmers(f, names)
    return with_kmers(f, names, options.kmers, options.overlap)
示例10: mask_to_bed
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def mask_to_bed(fasta_file, mask_bed_name):
    """Write a tab-separated file describing every run of ``X`` in a FASTA.

    For each record of ``fasta_file`` and each maximal run of ``X``
    characters, one line is written to ``mask_bed_name`` holding the record
    id, the run's start and end offsets, a ``mask_id <n>`` label, the run
    length, and (after several constant columns) the run length plus one.

    The counter starts at 1 and is incremented before each line is written,
    so the first label written is ``mask_id 2``.
    """
    masked_run = re.compile("X+")
    # NOTE(review): the file is opened in binary mode but receives ``str``
    # values; that works on Python 2 and fails on Python 3 -- confirm the
    # target interpreter before changing the mode.
    with open(mask_bed_name, "wb") as mask_bed:
        f = Fasta(fasta_file)
        mask_id = 1
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in masked_run.finditer(seq):
                mask_id = mask_id + 1
                span = m.end() - m.start()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, m.start(), m.end(),
                    "mask_id {0}".format(mask_id), span, span + 1)
                mask_bed.write(w)
示例11: process_query
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def process_query():
    """Align the ``Rattus`` query against the library and print the top hits.

    Reads the library from ``library_path`` and the query from
    ``query_path`` (module-level names), submits one ``align`` call per
    library record (at most ``library_process_limit`` records) to a thread
    pool sized to the CPU count, then prints the 30 best scores.  Each score
    is also shown as a percentage of the query's self-score computed from
    ``smatrix``.  Finally the ``dotplot``, ``regions`` and ``align`` values
    of the performance timer are printed, each divided by the CPU count.
    """
    print('Reading sequence library and query sequence')
    library = Fasta(library_path)
    queries = Fasta(query_path)
    query_sequence = str(queries["Rattus"])

    print('Processing')
    progress = progressbar.ProgressBar(max_value=len(library.keys()))
    cpu_count = multiprocessing.cpu_count()

    results = []
    # The context manager shuts the pool down (joining its worker threads)
    # on success and on error alike.
    with ThreadPoolExecutor(max_workers=cpu_count) as executor:
        tasks = []
        for record in list(library.keys())[:library_process_limit]:
            library_sequence = str(library[record])
            future = executor.submit(align, library_sequence, query_sequence)
            tasks.append(AlignmentTask(record, future))

        # Collect in submission order so the progress index stays meaningful.
        for i, task in enumerate(tasks):
            _, _, score = task.future.result()
            results.append(AlignmentResult(title=task.record, score=score))
            progress.update(i)

    # Score of the query aligned against itself: the reference for 100%.
    etalone_score = sum(smatrix[(x, x)] for x in query_sequence)

    print("Done")
    print("Etalone score is %d" % etalone_score)
    print("Got %d results, here are top-30 among them:" % len(results))
    print("Score | Match | Record")
    for sequence in sorted(results, key=lambda x: x.score, reverse=True)[:30]:
        match = (sequence.score / etalone_score) * 100.0
        print("%6d | %5.3f%% | %s" % (sequence.score, match, sequence.title))

    timer = get_performance_timer()
    for elapsed in [timer.dotplot, timer.regions, timer.align]:
        print(elapsed / cpu_count)
示例12: Sequence
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
class Sequence():
    """Uniform read source over several sequence back ends.

    Depending on ``engine`` the constructor sets up:

    * ``'mysql'`` (with ``function='iterator'``): rows from a DB cursor
      passed as ``cursor``.
    * ``'biopython'`` / ``'pyfasta'`` (with ``data_type='fasta'``): an index
      over the file passed as ``input``.
    * ``'twobit'`` (with ``data_type='twobit'``): a 2bit file passed as
      ``input``.

    After set-up ``readcount`` holds the number of reads and ``read`` is an
    iterator.  For the file-based engines ``fasta`` is the opened index and
    ``db_values`` is a list of ``(position, key)`` pairs with keys sorted.
    Any other engine/argument combination leaves only ``engine`` set.
    """

    def __init__(self, engine='mysql', function = 'iterator', **kwargs):
        self.engine = engine
        # The non-mysql branches read kwargs['data_type'] and therefore raise
        # KeyError if it is missing for that engine.
        if self.engine == 'mysql' and function == 'iterator':
            self.create_mysql_iterator(**kwargs)
        elif self.engine == 'biopython' and kwargs['data_type'] == 'fasta':
            self.create_biopython_iterator(**kwargs)
        elif self.engine == 'pyfasta' and kwargs['data_type'] == 'fasta':
            self.create_pyfasta_iterator(**kwargs)
        elif self.engine == 'twobit' and kwargs['data_type'] == 'twobit':
            self.create_twobit_iterator(**kwargs)

    def create_mysql_iterator(self, **kwargs):
        """Run the read-selection query on ``kwargs['cursor']`` and iterate its rows."""
        cur = kwargs['cursor']
        query = '''SELECT id, record FROM sequence WHERE n_count <= 2 AND
trimmed_len > 40'''
        cur.execute(query)
        self.readcount = cur.rowcount
        self.read = iter(cur.fetchall())

    def create_biopython_iterator(self, **kwargs):
        """Index ``kwargs['input']`` with Biopython and iterate ``(i, key)`` pairs."""
        from Bio import SeqIO
        print("Generating BioPython sequence index. This may take a moment....")
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self.readcount = len(self.fasta)
        # list(...) keeps db_values reusable on Python 3, where zip is lazy
        # and would otherwise be consumed through self.read.
        self.db_values = list(zip(range(len(self.fasta)), sorted(self.fasta.keys())))
        self.read = iter(self.db_values)

    def create_twobit_iterator(self, **kwargs):
        """Open ``kwargs['input']`` as a 2bit file and iterate ``(i, key)`` pairs."""
        import bx.seq.twobit
        # `file()` exists only on Python 2; `open` works on both.  Binary
        # mode because .2bit is a binary format.
        self.fasta = bx.seq.twobit.TwoBitFile(open(kwargs['input'], 'rb'))
        self.readcount = self.fasta.seq_count
        self.db_values = list(zip(range(self.fasta.seq_count), sorted(self.fasta.keys())))
        self.read = iter(self.db_values)

    def create_pyfasta_iterator(self, **kwargs):
        """Index ``kwargs['input']`` with pyfasta and iterate ``(i, key)`` pairs."""
        from pyfasta import Fasta
        print("Generating PyFasta sequence index. This may take a moment....")
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
        self.db_values = list(zip(range(len(self.fasta)), sorted(self.fasta.keys())))
        self.read = iter(self.db_values)

    def get_pyfasta_reads(self, **kwargs):
        """Open ``kwargs['input']`` with pyfasta and record its read count only."""
        from pyfasta import Fasta
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
示例13: generate_corpusfile
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Write an n-gram corpus file for every record of a FASTA file.

    Args:
        fasta_fname: path of the input FASTA file
        n: the number of chunks to split. In other words, "n" for "n-gram"
        corpus_fname: path of the corpus file to write (overwritten)

    Description:
        Protvec uses word2vec inside, and it requires to load corpus file
        to generate corpus.  Each pattern returned by ``split_ngrams`` for
        a record becomes one line of space-separated n-grams.
    '''
    # The context manager closes the corpus file even if reading the FASTA
    # or splitting a sequence raises.
    with open(corpus_fname, "w") as f:
        fasta = Fasta(fasta_fname)
        for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
            seq = str(fasta[record_id])
            for ngram_pattern in split_ngrams(seq, n):
                f.write(" ".join(ngram_pattern) + "\n")
示例14: read_fasta
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def read_fasta(ref_files, fasta_header):
    """Return the first FASTA in ``ref_files`` that contains ``fasta_header``.

    Note carried over from the original docstring (not verified here): a
    newline may only appear between header and sequence, not inside a
    sequence.

    Args:
        ref_files (iterable of str): Paths of FASTA files, tried in order.
        fasta_header (str): Header (key) to look for.

    Returns:
        The opened ``Fasta`` object of the first file whose keys include
        ``fasta_header``, or ``None`` when no file contains it (including
        when ``ref_files`` is empty).
    """
    for fasta_path in ref_files:
        fasta = Fasta(fasta_path)
        if fasta_header in fasta.keys():
            return fasta
    return None
示例15: split_seqs
# 需要導入模塊: from pyfasta import Fasta [as 別名]
# 或者: from pyfasta.Fasta import keys [as 別名]
def split_seqs(self, num_jobs, max_ref=5, max_qry=20):
    """Split the reference and query FASTA files into per-job pieces.

    The number of reference pieces is capped by the number of reference
    records (and falls back to 1 if it still exceeds ``num_jobs``); the
    number of query pieces is capped by the number of query records and by
    ``num_jobs``, then lowered further so that the product of both counts
    does not exceed ``num_jobs``.

    Sets ``self.ref_names`` / ``self.ref_files`` and ``self.qry_names`` /
    ``self.qry_files``; pieces are written under ``self.out_dir`` and
    entries rejected by ``self._no_empty`` are dropped.
    """
    # Load both inputs into memory.
    ref = Fasta(self.ref_fasta, record_class=MemoryRecord)
    qry = Fasta(self.qry_fasta, record_class=MemoryRecord)

    # Decide how many pieces each side gets.
    max_ref = min(max_ref, len(ref))
    if max_ref > num_jobs:
        max_ref = 1
    max_qry = min(max_qry, len(qry), num_jobs)
    if (max_ref * max_qry) > num_jobs:
        max_qry = int(float(num_jobs) / float(max_ref))

    # Computed by the original code as well; the value is not used below.
    seq_total = len(ref.keys())

    # Reference side.
    self.ref_names = ["ref_%i" % idx for idx in range(max_ref)]
    self.ref_files = ["%s/%s.fasta" % (self.out_dir, label)
                      for label in self.ref_names]
    pyfasta.split_fasta.without_kmers(ref, self.ref_files)
    self.ref_names, self.ref_files = self._no_empty(self.ref_names,
                                                    self.ref_files)

    # Query side.
    self.qry_names = ["qry_%i" % idx for idx in range(max_qry)]
    self.qry_files = ["%s/%s.fasta" % (self.out_dir, label)
                      for label in self.qry_names]
    pyfasta.split_fasta.without_kmers(qry, self.qry_files)
    self.qry_names, self.qry_files = self._no_empty(self.qry_names,
                                                    self.qry_files)