This page collects typical usage examples of the timesofar function from the Python module utils.common, showing what timesofar does and how it is called in real code.
The 15 code examples below are drawn from open-source projects and ordered by popularity.
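Every example follows the same timing pattern: record a start time with time.time(), do some work, then pass that start time to timesofar() to get a human-readable elapsed-time string. A minimal sketch of the pattern (the exact output format, e.g. '2.0s', depends on the project's implementation of timesofar):

import time
from utils.common import timesofar  # the function featured in all examples below

t0 = time.time()
time.sleep(2)  # stand-in for any long-running step
# timesofar(t0) formats the seconds elapsed since t0 as a short string
print("Done. [{}]".format(timesofar(t0)))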
Example 1: get_genome_in_bit
def get_genome_in_bit(chr_fa_folder):
    '''encode each chromosome's fasta sequence into a bitarray,
       and store them in a dictionary with chr numbers as keys.
       chr_fa_folder is the folder containing all gzipped fasta files,
       named chr<i>.fa.gz (e.g. chr1.fa.gz). They can be downloaded from
       the NCBI FTP site:
       ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        #file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()    # skip the header line
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
        chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(t0)))
    return chr_bit_d
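The encoder nuc_to_bit used above is not shown on this page. A minimal sketch of what such a helper could look like, assuming a fixed-width 2-bit code per nucleotide (the real project may use a wider code so that ambiguous bases like N get their own value):

from bitarray import bitarray

# hypothetical 2-bit alphabet; ambiguous bases are all folded into the 'A' code here
_NUC_CODES = {'A': bitarray('00'), 'C': bitarray('01'),
              'G': bitarray('10'), 'T': bitarray('11')}

def nuc_to_bit(seq):
    '''encode a nucleotide string into a bitarray (illustrative sketch)'''
    bits = bitarray()
    for nuc in seq.upper():
        bits += _NUC_CODES.get(nuc, _NUC_CODES['A'])
    return bits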
Example 2: redo_parse_gbff
def redo_parse_gbff(path):
    '''call this function manually to re-start the parsing step and set src_dump.
       This is used when main() broke at the parsing step; parsing then needs to be
       re-started after the fix.
    '''
    src_dump = get_src_dump()
    t0 = time.time()
    # the download step already ran, so record a near-zero placeholder for its time
    t_download = timesofar(t0)
    t1 = time.time()
    # mark the start of the parsing step
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)
    # mark the step finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example 3: doc_feeder
def doc_feeder(self, index_type=None, index_name=None, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
    conn = self.conn
    index_name = index_name or self.ES_INDEX_NAME
    doc_type = index_type or self.ES_INDEX_TYPE
    n = self.count(query=query)['count']
    cnt = 0
    t0 = time.time()
    if verbose:
        print('\ttotal docs: {}'.format(n))
    _kwargs = kwargs.copy()
    _kwargs.update(dict(size=step, index=index_name, doc_type=doc_type))
    res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs)
    t1 = time.time()
    for doc in res:
        if verbose and cnt % step == 0:
            if cnt != 0:
                print('done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
            print('\t{}-{}...'.format(cnt + 1, min(cnt + step, n)), end='')
            t1 = time.time()
        yield doc
        cnt += 1
    if verbose:
        print('done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("Finished! [{}]".format(timesofar(t0)))
Example 4: load_contig
def load_contig(contig):
    '''save one CADD contig (chromosome) into a mongodb collection.'''
    # if CADD_INPUT == "exome":
    #     CADD_INPUT = exome
    tabix = pysam.Tabixfile(whole_genome)
    src_db = get_src_db()
    target_coll = src_db["cadd"]
    t0 = time.time()
    cnt = 0
    docs = (doc for doc in fetch_generator(tabix, contig))
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        cnt += 1
        if len(doc_list) == 100:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
            doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        # insert whatever is left in the final, partial batch
        target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
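fetch_generator is another project helper that does not appear on this page. A plausible sketch, assuming it walks the tabix-indexed CADD file for one contig and turns each tab-separated row into a document dict (the column layout and id scheme below are illustrative; real CADD files carry many more columns):

def fetch_generator(tabix, contig):
    '''sketch: yield one dict per tabix row of the given contig'''
    for row in tabix.fetch(contig):    # pysam yields each line as a string
        fields = row.split('\t')
        yield {
            '_id': '{}:g.{}'.format(fields[0], fields[1]),   # assumed id scheme
            'chrom': fields[0],
            'pos': int(fields[1]),
            'ref': fields[2],
            'alt': fields[3],
        }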
Example 5: doc_feeder
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False, query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}}).
       batch_callback is a callback function as fn(cnt, t), called after every batch.
       fields is an optional parameter passed to find to restrict the fields to return.
    '''
    src = get_src_db()
    if type(collection) == str:
        cur = src[collection].find(query, fields)
    else:
        cur = collection.find(query, fields)
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving {} documents from database "{}".'.format(n, collection))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping {} documents.".format(s))
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time() - t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            # Important: need to yield the last batch here
            yield doc_li
        #print 'Done.[%s]' % timesofar(t1)
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: {}]'.format(timesofar(t0)))
    finally:
        cur.close()
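A hypothetical call of this feeder in batch mode, restricting the scan to human documents (the collection name and query are illustrative):

for batch in doc_feeder('entrez_gene', step=1000, inbatch=True,
                        query={'taxid': 9606}):
    # each batch is a list of up to 1000 documents
    for doc in batch:
        pass    # stand-in for real per-document work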
Example 6: two_docs_iterator
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)), end='')
        _ids = id_list[i:i + step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print('Done.[%.1f%%,%s]' % (i * 100. / n, timesofar(t1)))
    print("=" * 20)
    print('Finished.[total time: %s]' % timesofar(t0))
Example 7: main
def main(self, index, collection, diff_filepath, validate=False, wait=60):
    self._index = index
    self._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_list = self.add(source_collection, diff['add'])
    delete_list = self.delete(collection, diff['delete'])
    update_list = self.update(diff['update'])
    t00 = time()
    print('Adding {} new docs...'.format(len(diff['add'])))
    t0 = time()
    bulk(self._es, add_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Deleting {} docs...'.format(len(diff['delete'])))
    t0 = time()
    bulk(self._es, delete_list)
    print("Done. [{}]".format(timesofar(t0)))
    print('Updating {} docs...'.format(len(diff['update'])))
    t0 = time()
    bulk(self._es, update_list)
    print("Done. [{}]".format(timesofar(t0)))
    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))
    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": collection
                        }
                    }
                }
            }
        }
        data = self._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        self._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        self._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result
Example 8: apply_changes
def apply_changes(self, changes, verify=True, noconfirm=False):
    if verify:
        self.pre_verify_changes(changes)
    if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
        print("Aborted.")
        return -1
    #src = self.get_source_collection(changes)
    step = self.step
    _db = get_target_db()
    source_col = _db[changes['source']]
    src = GeneDocMongoDBBackend(source_col)
    target = GeneDocESBackend(self)
    _timestamp = changes['timestamp']

    def _add_docs(ids):
        i = 0
        for _ids in iter_n(ids, step):
            t1 = time.time()
            _doc_li = src.mget_from_ids(_ids)
            for _doc in _doc_li:
                _doc['_timestamp'] = _timestamp
                i += 1
            target.insert(_doc_li)
            print('\t{}\t{}'.format(i, timesofar(t1)))

    t0 = time.time()
    if changes['add']:
        print("Adding {} new docs...".format(len(changes['add'])))
        t00 = time.time()
        _add_docs(changes['add'])
        print("done. [{}]".format(timesofar(t00)))
    if changes['delete']:
        print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
        t00 = time.time()
        target.remove_from_ids(changes['delete'], step=step)
        print("done. [{}]".format(timesofar(t00)))
    if changes['update']:
        print("Updating {} existing docs...".format(len(changes['update'])))
        t00 = time.time()
        ids = [x['_id'] for x in changes['update']]
        _add_docs(ids)
        print("done. [{}]".format(timesofar(t00)))
    target.finalize()
    print("\n")
    print("Finished.", timesofar(t0))
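iter_n, used by _add_docs above, is a chunking helper not shown on this page. A sketch consistent with its call site, assuming it yields successive lists of at most n items from any iterable:

from itertools import islice

def iter_n(iterable, n):
    '''sketch: yield lists of up to n items until the iterable is exhausted'''
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk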
Example 9: run_jobs_on_ipythoncluster
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):
    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    print("\t# nodes in use: {}".format(len(lview.targets or rc.ids)))
    lview.block = False
    print("\t# of tasks: {}".format(len(task_list)))
    print("\tsubmitting...", end='')
    job = lview.map_async(worker, task_list)
    print("done.")
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        # handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print("Aborted, all submitted jobs are cancelled.")
        else:
            print("Aborted, but your jobs are still running on the cluster.")
        return
    if len(job.result) != len(task_list):
        print("WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result), len(task_list)))
    print("\ttotal time: {}".format(timesofar(t0)))
    if shutdown_ipengines_after_done:
        print("\tshutting down all ipengine nodes...", end='')
        lview.shutdown()
        print('Done.')
    return job.result
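A hypothetical invocation, assuming CLUSTER_CLIENT_JSON points at a running IPython cluster's connection file and the worker is a picklable, module-level function:

def square(x):
    # the worker must be importable on every engine
    return x * x

results = run_jobs_on_ipythoncluster(square, list(range(100)))
if results is not None:      # None is returned when the run is aborted
    print(len(results))      # expect 100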
Example 10: merge
def merge(self, step=100000, restart_at=0):
    t0 = time.time()
    self.validate_src_collections()
    self.log_building_start()
    try:
        if self.using_ipython_cluster:
            self._merge_ipython_cluster(step=step)
        else:
            self._merge_local(step=step, restart_at=restart_at)
        if self.target.name == 'es':
            print("Updating metadata...", end='')
            self.update_mapping_meta()
        t1 = round(time.time() - t0, 0)
        t = timesofar(t0)
        self.log_src_build({'status': 'success',
                            'time': t,
                            'time_in_s': t1,
                            'timestamp': datetime.now()})
    finally:
        # do a simple validation here
        if getattr(self, '_stats', None):
            print("Validating...")
            target_cnt = self.target.count()
            if target_cnt == self._stats['total_genes']:
                print("OK [total count={}]".format(target_cnt))
            else:
                print("Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, self._stats['total_genes']))
        if self.merge_logging:
            sys.stdout.close()
Example 11: load_x
def load_x(idx, fieldname, cvt_fn=None):
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2, 19, idx))    # GeneID, Ensembl(Gene), target_value
        for value in dupline_seperator(dupline=ld, dup_sep='; '):
            xli.append(value)
    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']), 0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))
    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {fieldname: sorted(value) if type(value) is list else value}
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))
    return gene2x
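list2dict and its companion helpers are project utilities not shown here. A sketch of list2dict consistent with the two calls above, assuming 2-tuples whose element at index keyitem becomes the key and whose other element becomes the value, with duplicate keys accumulating values into a list:

def list2dict(tuple_li, keyitem, alwayslist=False):
    '''sketch: fold (key, value) tuples into a dict, grouping duplicate keys'''
    out = {}
    for tup in tuple_li:
        key, value = tup[keyitem], tup[1 - keyitem]
        if key not in out:
            out[key] = [value] if alwayslist else value
        elif isinstance(out[key], list):
            out[key].append(value)
        else:
            out[key] = [out[key], value]
    return out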
Example 12: run2
def run2():
    from databuild.esbuilder import ESIndexerBase
    esb = ESIndexerBase()
    doc_d = build(sources)
    t0 = time.time()
    esb.build_index(doc_d)
    print('Done[%s]' % timesofar(t0))
Example 13: update_index
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']
    print("\t{}\trecords will be added.".format(len(changes['add'])))
    print("\t{}\trecords will be deleted.".format(len(changes['delete'])))
    print("\t{}\trecords will be updated.".format(len(changes['update'])))
    print()
    print('\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                         sync_src.name))
    print('\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                           sync_target.name))
    if noconfirm or ask("Continue?") == 'Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer
        if len(changes['add']) > 0:
            print("Adding {} new records...".format(len(changes['add'])))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        if len(changes['delete']) > 0:
            print("Deleting {} old records...".format(len(changes['delete'])))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print("Done. [{}]".format(timesofar(t0)))
        if len(changes['update']) > 0:
            print("Updating {} existing records...".format(len(changes['update'])))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print("Done. [{}]".format(timesofar(t0)))
        print('=' * 20)
        print('Finished. [{}]'.format(timesofar(t00)))
Example 14: main
def main():
    no_confirm = True    # set it to True for running this script automatically without intervention
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()
    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'entrez_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)
    sys.stderr = sys.stdout
    # mark the start of the download
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, no_confirm=no_confirm)
        t_download = timesofar(t0)
        t1 = time.time()
        # mark the start of the parsing step
        src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
        parse_gbff(DATA_FOLDER)
        t_parsing = timesofar(t1)
        t_total = timesofar(t0)
    finally:
        sys.stdout.close()
    # mark the whole run finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example 15: load
def load(self, genedoc_d=None, update_data=True, update_master=True, test=False, step=10000):
    if not self.temp_collection:
        self.make_temp_collection()
    self.temp_collection.drop()    # drop all existing records just in case
    if update_data:
        genedoc_d = genedoc_d or self.load_genedoc()
        print("Uploading to the DB...", end='')
        t0 = time.time()
        # for doc in self.doc_iterator(genedoc_d, batch=False):
        #     if not test:
        #         doc.save()
        for doc_li in self.doc_iterator(genedoc_d, batch=True, step=step):
            if not test:
                self.temp_collection.insert(doc_li, manipulate=False, check_keys=False)
        print('Done[%s]' % timesofar(t0))
        self.switch_collection()
        if getattr(self, 'ENTREZ_GENEDOC_ROOT', False):
            print('Uploading "geneid_d" to GridFS...', end='')
            t0 = time.time()
            geneid_d = self.get_geneid_d()
            dump2gridfs(geneid_d, self.__collection__ + '__geneid_d.pyobj', self.db)
            print('Done[%s]' % timesofar(t0))
        if getattr(self, 'ENSEMBL_GENEDOC_ROOT', False):
            print('Uploading "mapping2entrezgene" to GridFS...', end='')
            t0 = time.time()
            x2entrezgene_list = self.get_mapping_to_entrez()
            dump2gridfs(x2entrezgene_list, self.__collection__ + '__2entrezgene_list.pyobj', self.db)
            print('Done[%s]' % timesofar(t0))
    if update_master:
        # update src_master collection
        if not test:
            _doc = {"_id": str(self.__collection__),
                    "name": str(self.__collection__),
                    "timestamp": datetime.datetime.now()}
            for attr in ['ENTREZ_GENEDOC_ROOT', 'ENSEMBL_GENEDOC_ROOT', 'id_type']:
                if hasattr(self, attr):
                    _doc[attr] = getattr(self, attr)
            if hasattr(self, 'get_mapping'):
                _doc['mapping'] = getattr(self, 'get_mapping')()
            conn.GeneDocSourceMaster(_doc).save()