This article collects typical usage examples of the toolz.partition_all method in Python. If you have been wondering what toolz.partition_all does, how to use it, or how it is used in real code, the curated samples below may help. You can also explore further usage examples from the toolz module.
The following presents 11 code examples of toolz.partition_all, ordered by popularity by default.
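Before diving into the examples, here is a minimal sketch of what partition_all itself does: it splits an iterable into tuples of at most n elements, with the final tuple possibly shorter (the input values below are arbitrary illustration values):

from toolz import partition_all

# break an iterable into tuples of at most 3 elements
chunks = list(partition_all(3, range(8)))
print(chunks)  # [(0, 1, 2), (3, 4, 5), (6, 7)]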
Example 1: sb_filter
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
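Several of the examples in this collection (sb_filter, cb_filter, mb_filter, add_uid) share the same two-level chunking idiom: partition the read stream into work units of 10,000 records, then group cores work units per Pool.map call so every worker stays busy. Below is a stripped-down sketch of that idiom; process_chunk and the records() generator are hypothetical placeholders for the project-specific filter function and for read_fastq(fastq):

import multiprocessing
import sys
import toolz as tz

def process_chunk(chunk):
    # placeholder worker: the real examples filter or rewrite fastq reads here
    return [str(item) + "\n" for item in chunk]

def run(cores=4):
    def records():
        # placeholder generator standing in for read_fastq(fastq)
        yield from range(50000)

    p = multiprocessing.Pool(cores)
    chunks = tz.partition_all(10000, records())  # work units of 10,000 records
    bigchunks = tz.partition_all(cores, chunks)  # one group of work units per map() call
    for bigchunk in bigchunks:
        for chunk in p.map(process_chunk, list(bigchunk)):
            for line in chunk:
                sys.stdout.write(line)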
Example 2: _cache_accounts
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def _cache_accounts(cls, accounts, steem, trx=True):
    """Fetch all `accounts` and write to db."""
    timer = Timer(len(accounts), 'account', ['rps', 'wps'])
    for name_batch in partition_all(1000, accounts):
        cached_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        timer.batch_start()
        batch = steem.get_accounts(name_batch)

        timer.batch_lap()
        sqls = [cls._sql(acct, cached_at) for acct in batch]
        DB.batch_queries(sqls, trx)

        timer.batch_finish(len(batch))
        if trx or len(accounts) > 1000:
            log.info(timer.batch_status())
Example 3: scrape_blockchain
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def scrape_blockchain(mongo):
    s = Steem()
    # see how far behind we are
    missing = list(range(last_block_num(mongo), s.last_irreversible_block_num))

    # if we are far behind blockchain head
    # split work in chunks of 100
    if len(missing) > 100:
        for batch in partition_all(100, missing):
            results = s.get_blocks(batch)
            insert_blocks(mongo, results)

    # otherwise continue as normal
    blockchain = Blockchain(mode="irreversible")
    hist = blockchain.stream_from(start_block=last_block_num(mongo), full_blocks=True)
    insert_blocks(mongo, hist)
Example 4: cb_filter
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''
    with open_gzipsafe(bc1) as bc1_fh:
        bc1 = set(cb.strip() for cb in bc1_fh)
    if bc2:
        with open_gzipsafe(bc2) as bc2_fh:
            bc2 = set(cb.strip() for cb in bc2_fh)
    if bc3:
        with open_gzipsafe(bc3) as bc3_fh:
            bc3 = set(cb.strip() for cb in bc3_fh)

    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3,
                            re_string=re_string)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        bc3hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        if bc3:
            bc3hash = MutationHash(bc3, nedit)
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash, bc3hash=bc3hash, re_string=re_string)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example 5: mb_filter
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example 6: add_uid
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''
    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example 7: send_topic_nodes
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def send_topic_nodes(
        self, node: kademlia.Node, echo: Hash32, nodes: Tuple[kademlia.Node, ...]
) -> None:
    encoded_nodes = tuple(
        n.address.to_endpoint() + [n.pubkey.to_bytes()] for n in nodes
    )
    max_neighbours = self._get_max_neighbours_per_packet()
    for batch in toolz.partition_all(max_neighbours, encoded_nodes):
        message = _pack_v5(CMD_TOPIC_NODES.id, (echo, batch), self.privkey)
        self.logger.trace(">>> topic_nodes to %s: %s", node, batch)
        self.send_v5(node, message)
Example 8: split_inline
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def split_inline(data_dir, val_ratio, test_ratio, overwrite=False, exclude_files=None):
    """Splits the inline data into train, val and test.

    Args:
        data_dir (str): path to directory that holds the data
        val_ratio (float): the ratio of the partition that will be used for validation
        test_ratio (float): the ratio of the partition that will be used for testing
        exclude_files (list[str]): filenames to exclude from the dataset, such as ones
            that contain artifacts. Example: ['image1.tiff']
    """
    num_partitions = 5
    image_dir = os.path.join(data_dir, "inlines")
    dir_paths = (os.path.join(image_dir, ddir) for ddir in ("train", "val", "test"))
    locations_list = [_create_directory(d, overwrite=overwrite) for d in dir_paths]  # train, val, test
    images_iter = glob.iglob(os.path.join(image_dir, "*.tiff"))
    if exclude_files is not None:
        images_list = list(itertools.filterfalse(lambda x: x in exclude_files, images_iter))
    else:
        images_list = list(images_iter)
    num_elements = math.ceil(len(images_list) / num_partitions)
    for partition in partition_all(num_elements, images_list):  # Partition files into N partitions
        for files_list, dest_dir in zip(_split_train_val_test(partition, val_ratio, test_ratio), locations_list):
            _copy_files(files_list, dest_dir)
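partition_all fixes the chunk size rather than the chunk count, so Example 8 derives the size with math.ceil(len(images_list) / num_partitions) to get at most num_partitions groups. A toy illustration of that trick, with made-up file names:

import math
from toolz import partition_all

files = ["image%d.tiff" % i for i in range(23)]  # hypothetical file list
num_partitions = 5

# ceil(23 / 5) == 5 elements per group, giving at most 5 groups
num_elements = math.ceil(len(files) / num_partitions)
for i, group in enumerate(partition_all(num_elements, files)):
    print(i, len(group))  # group sizes: 5, 5, 5, 5, 3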
Example 9: from_checkpoints
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def from_checkpoints(self, chunk_size=1000):
    """Initial sync strategy: read from blocks on disk.

    This method scans for files matching ./checkpoints/*.json.lst
    and uses them for hive's initial sync. Each line must contain
    exactly one block in JSON format.
    """
    # pylint: disable=no-self-use
    last_block = Blocks.head_num()

    tuplize = lambda path: [int(path.split('/')[-1].split('.')[0]), path]
    basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
    files = glob.glob(basedir + "/checkpoints/*.json.lst")
    tuples = sorted(map(tuplize, files), key=lambda f: f[0])

    last_read = 0
    for (num, path) in tuples:
        if last_block < num:
            log.info("[SYNC] Load %s. Last block: %d", path, last_block)
            with open(path) as f:
                # each line in file represents one block
                # we can skip the blocks we already have
                skip_lines = last_block - last_read
                remaining = drop(skip_lines, f)
                for lines in partition_all(chunk_size, remaining):
                    Blocks.process_multi(map(json.loads, lines), True)
            last_block = num
        last_read = num
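The drop + partition_all combination in Example 9 is a handy way to resume a line-oriented file from a known offset and process it in fixed-size batches without loading the whole file into memory. A minimal, self-contained sketch; the path argument and the process_batch handler are illustrative stand-ins, not part of the original code:

import json
from toolz import drop, partition_all

def process_batch(records):
    # stand-in for Blocks.process_multi in the example above
    print("processing %d records" % len(records))

def resume_file(path, skip_lines, chunk_size=1000):
    with open(path) as f:
        remaining = drop(skip_lines, f)  # skip lines that were already processed
        for lines in partition_all(chunk_size, remaining):
            process_batch([json.loads(line) for line in lines])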
Example 10: compute_date_range_chunks
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def compute_date_range_chunks(sessions, start_date, end_date, chunksize):
    """Compute the start and end dates to run a pipeline for.

    Parameters
    ----------
    sessions : DatetimeIndex
        The available dates.
    start_date : pd.Timestamp
        The first date in the pipeline.
    end_date : pd.Timestamp
        The last date in the pipeline.
    chunksize : int or None
        The size of the chunks to run. Setting this to None returns one chunk.

    Returns
    -------
    ranges : iterable[(np.datetime64, np.datetime64)]
        A sequence of start and end dates to run the pipeline for.
    """
    if start_date not in sessions:
        raise KeyError("Start date %s is not found in calendar." %
                       (start_date.strftime("%Y-%m-%d"),))
    if end_date not in sessions:
        raise KeyError("End date %s is not found in calendar." %
                       (end_date.strftime("%Y-%m-%d"),))
    if end_date < start_date:
        raise ValueError("End date %s cannot precede start date %s." %
                         (end_date.strftime("%Y-%m-%d"),
                          start_date.strftime("%Y-%m-%d")))

    if chunksize is None:
        return [(start_date, end_date)]

    start_ix, end_ix = sessions.slice_locs(start_date, end_date)
    return (
        (r[0], r[-1]) for r in partition_all(
            chunksize, sessions[start_ix:end_ix]
        )
    )
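To see what the generator at the end of Example 10 yields, here is a hypothetical call that mimics its chunking step with a plain pandas date range standing in for a real trading calendar:

import pandas as pd
from toolz import partition_all

sessions = pd.date_range("2021-01-04", periods=10, freq="D")

# chunk the 10 sessions into ranges of at most 4 sessions each
start_ix, end_ix = sessions.slice_locs(sessions[0], sessions[-1])
ranges = [(r[0], r[-1]) for r in partition_all(4, sessions[start_ix:end_ix])]
# ranges covers (Jan 04 to Jan 07), (Jan 08 to Jan 11), (Jan 12 to Jan 13)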
Example 11: optimize
# Required import: import toolz [as alias]
# Or: from toolz import partition_all [as alias]
def optimize(model, sampler, train, valid):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))

    # sample some users to calculate recall validation
    valid_users = numpy.random.choice(list(set(valid.nonzero()[0])), size=1000, replace=False)

    while True:
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        # compute recall on validate set
        valid_recalls = []
        # compute recall in chunks to utilize speedup provided by Tensorflow
        for user_chunk in toolz.partition_all(100, valid_users):
            valid_recalls.extend([validation_recall.eval(sess, user_chunk)])
        print("\nRecall on (sampled) validation set: {}".format(numpy.mean(valid_recalls)))
        # TODO: early stopping based on validation recall

        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            _, loss = sess.run((model.optimize, model.loss),
                               {model.user_positive_items_pairs: user_pos,
                                model.negative_samples: neg})
            losses.append(loss)
        print("\nTraining loss {}".format(numpy.mean(losses)))