本文整理汇总了Python中toolz.groupby函数的典型用法代码示例。如果您正苦于以下问题:Python groupby函数的具体用法?Python groupby怎么用?Python groupby使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了groupby函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: setup
def setup(self):
    """Wait until every tracked key is known to the scheduler, then register
    this plugin and build the per-function key groupings.

    Tornado-style coroutine: yields while polling the scheduler's task set.
    """
    keys = self.keys
    # Poll until all requested keys have appeared in the scheduler
    while not keys.issubset(self.scheduler.tasks):
        yield gen.sleep(0.05)
    tasks = [self.scheduler.tasks[k] for k in keys]
    self.keys = None
    self.scheduler.add_plugin(self)  # subtle race condition here
    # all_keys: full dependency closure of the tracked tasks; errors: keys
    # that have already failed
    self.all_keys, errors = dependent_keys(tasks, complete=self.complete)
    if not self.complete:
        self.keys = self.all_keys.copy()
    else:
        self.keys, _ = dependent_keys(tasks, complete=False)
    self.all_keys.update(keys)
    # Keep tracking erred keys that belong to our dependency closure
    self.keys |= errors & self.all_keys
    if not self.keys:
        # Nothing left to wait for: signal completion immediately
        self.stop(exception=None, key=None)
    # Group keys by func name
    self.keys = valmap(set, groupby(self.func, self.keys))
    self.all_keys = valmap(set, groupby(self.func, self.all_keys))
    # Ensure every function name present in all_keys has an entry in keys,
    # even when all of that function's keys have already completed
    for k in self.all_keys:
        if k not in self.keys:
            self.keys[k] = set()
    # Replay 'erred' transitions for keys that failed before we attached
    for k in errors:
        self.transition(k, None, 'erred', exception=True)
    logger.debug("Set up Progress keys")
示例2: scatter_to_workers
def scatter_to_workers(center, ncores, data, key=None, report=True):
    """ Scatter data directly to workers
    This distributes data in a round-robin fashion to a set of workers based on
    how many cores they have. ncores should be a dictionary mapping worker
    identities to numbers of cores.
    See scatter for parameter docstring
    """
    # ``center`` may be an "ip:port" string, an rpc object, or an (ip, port) tuple
    if isinstance(center, str):
        ip, port = center.split(':')
    elif isinstance(center, rpc):
        ip, port = center.ip, center.port
    elif isinstance(center, tuple):
        ip, port = center
    else:
        raise TypeError("Bad type for center")
    if key is None:
        # Random prefix used for auto-generated data names
        key = str(uuid.uuid1())
    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        # Bare worker list: split the data roughly evenly between workers
        k = len(data) // len(ncores)
        ncores = {worker: k for worker in ncores}
    # One slot per core so better-provisioned workers receive more entries
    workers = list(concat([w] * nc for w, nc in ncores.items()))
    in_type = type(data)
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = ('%s-%d' % (key, i) for i in count(0))
    # Resume the round-robin from where the previous scatter call left off
    # (_round_robin_counter is a shared one-element list used as mutable state)
    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)
    # (worker, name, value) triples in round-robin assignment order
    L = list(zip(worker_iter, names, data))
    # Group the triples by worker (index 0), then build each worker's
    # {name: value} payload
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v}
         for k, v in d.items()}
    # Push every worker's payload concurrently
    out = yield All([rpc(ip=w_ip, port=w_port).update_data(data=v,
                                                           close=True, report=report)
                     for (w_ip, w_port), v in d.items()])
    nbytes = merge([o[1]['nbytes'] for o in out])
    # who_has: data name -> list of workers holding it (grouped by name, index 1)
    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}
    result = [RemoteData(b, ip, port, result=c)
              for a, b, c in L]
    if in_type is dict:
        result = dict(zip(names, result))
    # Tornado-style coroutine return
    raise Return((result, who_has, nbytes))
示例3: setup
def setup(self, keys, complete):
    """Initialize base Progress state, then re-bucket keys by function name.

    Delegates waiting/registration to ``Progress.setup`` and converts the
    flat key sets into ``{func_name: set_of_keys}`` mappings.
    """
    errors = Progress.setup(self, keys, complete)
    # Re-bucket the flat key collections into {func_name: set(keys)}
    self.keys = {name: set(members)
                 for name, members in groupby(self.func, self.keys).items()}
    self.all_keys = {name: set(members)
                     for name, members in groupby(self.func, self.all_keys).items()}
    # Every function seen in all_keys gets an entry, even when all of its
    # keys have already completed
    for name in self.all_keys:
        self.keys.setdefault(name, set())
    logger.debug("Set up Progress keys")
    return errors
示例4: _run_cnvkit_shared_orig
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.

    Returns ``ckouts``: one dict per input sample with ``cnr`` (copy number
    ratio) and ``cns`` (segmented) output file paths.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    # Name the background CNN after its first sample, or "flat" when no
    # background samples are supplied
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        # Prefer pre-existing outputs under the old naming scheme
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    # Only run the pipeline when the first sample's output is missing
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        # Tag each sample with its role so coverage runs know how to treat it
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                               background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            # Build the combined background from all background-role CNN files
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                               background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        # Normalize each evaluation sample against the background, grouped per BAM
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        # Segment each fixed copy-number ratio file (run for side effects)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
示例5: _run_cnvkit_shared
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.

    Returns ``ckouts``: one dict per input sample with ``cnr``/``cns`` output
    paths and the shared ``back_cnn`` background file, or an empty dict when
    no usable target regions remain.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    # Name the background CNN after its first sample, or "flat" when none given
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    # Only run the pipeline when the first sample's output is missing
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        # Fraction of accessible genome covered by targets, as a percentage
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        # Wrap zip in list() so concatenation with ``+`` works on Python 3
        # (zip returns an iterator there); matches _run_cnvkit_shared_orig
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # Coverage for every (role, sample) pair over every split BED region
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        # Per-BAM metrics over the merged coverage files
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        # Normalize each evaluation sample against the background, grouped per BAM
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        # Segment the fixed copy-number ratio files (run for side effects)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
示例6: split_next_and_previous_event_columns
def split_next_and_previous_event_columns(self, requested_columns):
    """
    Split requested columns into columns that should load the next known
    value and columns that should load the previous known value.
    Parameters
    ----------
    requested_columns : iterable[BoundColumn]
    Returns
    -------
    next_cols, previous_cols : iterable[BoundColumn], iterable[BoundColumn]
        ``requested_columns``, partitioned into sub-sequences based on
        whether the column should produce values from the next event or the
        previous event
    """
    # Bucket columns by direction, preserving the order they were requested in.
    buckets = {}
    for column in requested_columns:
        if column in self.next_value_columns:
            direction = "next"
        elif column in self.previous_value_columns:
            direction = "previous"
        else:
            raise ValueError(
                "{c} not found in next_value_columns or previous_value_columns".format(c=column)
            )
        buckets.setdefault(direction, []).append(column)
    # Missing buckets default to an empty tuple, mirroring groupby semantics.
    return buckets.get("next", ()), buckets.get("previous", ())
示例7: diagnostic_yield
def diagnostic_yield(self, metric='completeness', cutoff=1,
                     superblock_ids=None, group_id=None, sample_ids=None):
    """Calculate diagnostic yield.

    metric: name of the BlockData column to filter on
    cutoff: minimum metric value for a block to count as passed
    superblock_ids: optional filter to a subset of superblocks
    group_id / sample_ids: optional limits applied via ``limit_query``

    Yields (sample_id, group_id, yield_fraction) tuples, where the fraction
    is passed blocks divided by total blocks for that sample.
    """
    # extract column to filter on
    metric_column = getattr(BlockData, metric)
    # set up the base query for all blocks
    total_query = self.total_count(BlockData)
    if superblock_ids:
        # apply the superblock filter on the Block class level
        total_query = total_query.join(BlockData.parent)\
                                 .filter(Block.superblock_id.in_(superblock_ids))
    # extend base query to include only passed blocks
    pass_query = total_query.filter(metric_column >= cutoff)
    # optionally limit query
    queries = [limit_query(query, group=group_id, samples=sample_ids)
               for query in (total_query, pass_query)]
    # group multiple queries by sample ID (first column)
    metrics = groupby(get(0), concat(queries))
    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    combined = (unique(concat(values)) for values in itervalues(metrics))
    # calculate diagnostic yield by simple division
    for sample_id, group_id, total, covered in combined:
        yield sample_id, group_id, (covered / total)
示例8: pip_dict
def pip_dict():
    """Return installed distributions keyed by name, plus an index of those
    names grouped by uppercased first letter."""
    from pkg_resources import working_set
    dists = working_set.by_key
    # Group the distribution names by their uppercased first character.
    by_letter = {}
    for name in dists:
        by_letter.setdefault(name[0].upper(), []).append(name)
    return dists, by_letter
示例9: format_website
def format_website(self):
    """Render the release notes as markdown with feature and bug-fix sections.

    Groups ``self.issues`` by mapped website category and emits a sorted,
    linked bullet list per section. Returns the markdown as a string.
    """
    # jira category => website category mapping
    categories = {
        'New Feature': 'feature',
        'Improvement': 'feature',
        'Wish': 'feature',
        'Task': 'feature',
        'Test': 'bug',
        'Bug': 'bug',
        'Sub-task': 'feature'
    }
    # Section titles keyed by the website category names produced above.
    # BUG FIX: the 'bug' entry was previously keyed 'bugfix', which made
    # titles[category] raise KeyError in the loop below.
    titles = {
        'feature': 'New Features and Improvements',
        'bug': 'Bug Fixes'
    }
    issues_by_category = toolz.groupby(
        lambda issue: categories[issue.fields.issuetype.name],
        self.issues
    )
    out = StringIO()
    for category in ('feature', 'bug'):
        title = titles[category]
        # A release may contain no issues in a category; default to empty.
        issues = issues_by_category.get(category, [])
        issues.sort(key=lambda x: x.key)
        out.write(md('## {}\n\n', title))
        for issue in issues:
            link = md('[{0}]({1}/browse/{0})', issue.key, self.server)
            out.write(md('* {} - {}\n', link, issue.fields.summary))
        out.write('\n')
    return out.getvalue()
示例10: compute
def compute(*args, **kwargs):
    """Compute several dask collections at once.
    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    # Collections sharing the same optimization function are optimized together
    groups = groupby(attrgetter('_optimize'), args)
    # Scheduler resolution: explicit kwarg, then global setting, then defaults
    get = kwargs.pop('get', None) or _globals['get']
    if not get:
        get = args[0]._default_get
        # All collections must agree on a default scheduler if none was chosen
        if not all(a._default_get == get for a in args):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")
    # Optimize each group's merged graph, then merge all groups into one graph
    dsk = merge([opt(merge([v.dask for v in val]), [v._keys() for v in val])
                 for opt, val in groups.items()])
    keys = [arg._keys() for arg in args]
    results = get(dsk, keys, **kwargs)
    # _finalize converts each collection's raw results into its concrete value
    return tuple(a._finalize(a, r) for a, r in zip(args, results))
示例11: _get_subnet_config_w_cidr
def _get_subnet_config_w_cidr(self, network_config):
    """Yield subnet configs from ``network_config`` with a ``cidr`` assigned.

    Subnets are grouped by requested prefix size and carved sequentially out
    of the base network CIDR. When a size group exhausts its current block,
    allocation falls back to previously unused address space.
    """
    network_cidr_base = str(network_config.get('network_cidr_base', '172.16.0.0'))
    network_cidr_size = str(network_config.get('network_cidr_size', '20'))
    base_cidr = network_cidr_base + '/' + network_cidr_size
    net = netaddr.IPNetwork(base_cidr)
    # Group subnets by prefix size so same-sized blocks are carved together
    grouped_subnet = groupby('size', self._get_subnet_config_w_az(network_config))
    subnet_groups = sorted(grouped_subnet.items())
    available_cidrs = []
    for subnet_size, subnet_configs in subnet_groups:
        newcidrs = net.subnet(int(subnet_size))
        for subnet_config in subnet_configs:
            try:
                # Builtin next() instead of iterator.next() so this works on
                # both Python 2 and Python 3
                cidr = next(newcidrs)
            except StopIteration:
                # Current block exhausted: pull the most recent leftover
                # space and re-carve it at this prefix size
                net = next(chain(*reversed(available_cidrs)))
                newcidrs = net.subnet(int(subnet_size))
                cidr = next(newcidrs)
            new_config = assoc(subnet_config, 'cidr', str(cidr))
            yield new_config
        else:
            # After finishing a size group, advance past the used block and
            # remember the remainder for later fallback allocation
            net = next(newcidrs)
            available_cidrs.append(newcidrs)
示例12: load_adjusted_array
def load_adjusted_array(self, columns, dates, assets, mask):
    """Load adjusted arrays for ``columns``, dispatching one dataset's worth
    of columns per pool worker and merging the resulting dicts."""
    load_one = partial(self._load_dataset, dates, assets, mask)
    by_dataset = groupby(getdataset, columns)
    # Fan the per-dataset column groups out over the pool; each call returns
    # a {column: array} dict, merged here into a single mapping.
    return merge(self.pool.imap_unordered(load_one, itervalues(by_dataset)))
示例13: load_adjusted_array
def load_adjusted_array(self, columns, dates, assets, mask):
    """Load adjusted arrays for ``columns``, one dataset group at a time."""
    load_one = partial(self._load_dataset, dates, assets, mask)
    dataset_groups = itervalues(groupby(getdataset, columns))
    # Each load yields (column, array) pairs; flatten them into one dict.
    return dict(concat(map(load_one, dataset_groups)))
示例14: scatter_to_workers
def scatter_to_workers(center, ncores, data, key=None):
    """ Scatter data directly to workers
    This distributes data in a round-robin fashion to a set of workers based on
    how many cores they have. ncores should be a dictionary mapping worker
    identities to numbers of cores.
    See scatter for parameter docstring
    """
    center = coerce_to_rpc(center)
    if key is None:
        # Random prefix used for auto-generated data names
        key = str(uuid.uuid1())
    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        # Bare worker list: treat every worker as having a single core
        ncores = {worker: 1 for worker in ncores}
    # One slot per core so better-provisioned workers receive more entries
    workers = list(concat([w] * nc for w, nc in ncores.items()))
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = ("%s-%d" % (key, i) for i in count(0))
    # (worker, name, value) triples assigned round-robin over the worker slots
    L = list(zip(cycle(workers), names, data))
    # Group by worker (index 0) and build each worker's {name: value} payload
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v} for k, v in d.items()}
    # Push every worker's payload concurrently
    yield [rpc(ip=w_ip, port=w_port).update_data(data=v, close=True) for (w_ip, w_port), v in d.items()]
    result = [RemoteData(b, center.ip, center.port, result=c) for a, b, c in L]
    # Tornado-style coroutine return
    raise Return(result)
示例15: set_params
def set_params(self, **params):
    """Return a new Pipeline with ``step__param``-style parameters applied
    to the matching steps; untouched steps are kept as-is."""
    # Build {step_name: {param_name: value}} from the double-underscore keys.
    nested = {}
    for full_key, value in params.items():
        parts = full_key.split('__', 1)
        nested.setdefault(parts[0], {})[parts[1]] = value
    # Rebuild the step list, updating only steps that received parameters.
    updated = []
    for name, est in self.steps:
        if name in nested:
            est = set_params(est, **nested[name])
        updated.append((name, est))
    return Pipeline(updated)