This article collects typical usage examples of the Python function tensorflow.python.distribute.multi_worker_util.normalize_cluster_spec. If you are wondering what exactly normalize_cluster_spec does, how to call it, or what real usage looks like, the hand-picked code examples below should help.
Fifteen code examples of normalize_cluster_spec are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
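Before the examples, here is a minimal sketch of what normalize_cluster_spec does: it accepts a plain dict, a tf.train.ClusterSpec, or a tf.train.ClusterDef and always returns a tf.train.ClusterSpec. The host addresses used here are placeholders.

from tensorflow.python.distribute import multi_worker_util

# A cluster described as a plain dict mapping job names to address lists.
cluster_dict = {
    "chief": ["10.0.0.1:2222"],
    "worker": ["10.0.0.2:2222", "10.0.0.3:2222"],
}

# normalize_cluster_spec converts the dict into a tf.train.ClusterSpec;
# passing an existing ClusterSpec or ClusterDef returns an equivalent
# ClusterSpec as well.
cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_dict)
print(cluster_spec.jobs)       # job names, e.g. ["chief", "worker"]
print(cluster_spec.as_dict())  # back to the plain dict form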
Example 1: testUnexpectedInput
def testUnexpectedInput(self):
  cluster_spec = ["127.0.0.1:8964", "127.0.0.1:2333"]

  with self.assertRaisesRegexp(
      ValueError,
      "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
      "`tf.train.ClusterDef` object"):
    multi_worker_util.normalize_cluster_spec(cluster_spec)
Example 2: _initialize_multi_worker
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
  """Initializes the object for multi-worker training."""
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if num_gpus_per_worker:
    local_devices = tuple(
        "%s/device:GPU:%d" % (self._worker_device, i)
        for i in range(num_gpus_per_worker)
    )
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
Example 3: _initialize_multi_worker
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
  """Initializes the object for multi-worker training."""
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ["chief", "worker"]:
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._num_workers = len(cluster_spec.as_dict().get("worker", [])) + len(
      cluster_spec.as_dict().get("chief", []))
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  worker_device = "/job:%s/task:%d" % (task_type, task_id)
  if num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices,
      cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=self._num_workers,
          num_gpus_per_worker=num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
Example 4: configure
def configure(self,
              session_config=None,
              cluster_spec=None,
              task_type=None,
              task_id=None):
  """Configures the strategy class.

  The strategy object will be re-initialized if `cluster_spec` is given but
  was not passed in the constructor.

  Args:
    session_config: not used currently.
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations.
    task_type: the current task type.
    task_id: the current task id.

  Raises:
    ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
      not.
  """
  del session_config

  # Set the devices if cluster_spec is defined in TF_CONFIG but not passed in
  # the constructor.
  if not self._cluster_spec and cluster_spec:
    self._cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_spec)
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, must also specify "
                       "`task_type` and `task_id`.")
    self._initialize_devices(self._num_gpus_per_worker, self._cluster_spec,
                             task_type, task_id)
Example 5: _configure
def _configure(self,
               session_config=None,
               cluster_spec=None,
               task_type=None,
               task_id=None):
  """Configures the object.

  Args:
    session_config: a `tf.compat.v1.ConfigProto`
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations.
    task_type: the current task type, such as "worker".
    task_id: the current task id.

  Raises:
    ValueError: if `task_type` is not in the `cluster_spec`.
  """
  if cluster_spec:
    # Use the num_gpus_per_worker recorded in constructor since _configure
    # doesn't take num_gpus.
    cluster_resolver = SimpleClusterResolver(
        cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
        task_type=task_type,
        task_id=task_id,
        num_accelerators={"GPU": self._num_gpus_per_worker},
        rpc_layer=self._rpc_layer)
    self._initialize_multi_worker(cluster_resolver)
    assert isinstance(self._get_cross_device_ops(),
                      cross_device_ops_lib.CollectiveAllReduce)

  if session_config:
    session_config.CopyFrom(self._update_config_proto(session_config))
Example 6: __init__
def __init__(self,
             num_gpus_per_worker=0,
             cluster_spec=None,
             task_type=None,
             task_id=None):
  """Initializes this strategy.

  Args:
    num_gpus_per_worker: number of local GPUs or GPUs per worker.
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations.
    task_type: the current task type.
    task_id: the current task id.

  Raises:
    ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
      not.
  """
  super(ParameterServerStrategy, self).__init__()
  self._num_gpus_per_worker = num_gpus_per_worker
  if cluster_spec:
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, must also specify "
                       "`task_type` and `task_id`.")
  self._cluster_spec = cluster_spec

  # We typically don't need to do all-reduce in this strategy.
  self._cross_tower_ops = (
      cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
          reduce_to_device=_LOCAL_CPU))

  self._initialize_devices(num_gpus_per_worker, cluster_spec, task_type,
                           task_id)
Example 7: _configure
def _configure(self,
               session_config=None,
               cluster_spec=None,
               task_type=None,
               task_id=None):
  """Configures the strategy class.

  The strategy object will be re-initialized if `cluster_spec` is given but
  was not passed in the constructor.

  Args:
    session_config: not used currently.
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations.
    task_type: the current task type.
    task_id: the current task id.

  Raises:
    ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
      not.
  """
  if cluster_spec:
    # Use the num_gpus_per_worker recorded in constructor since _configure
    # doesn't take num_gpus.
    cluster_resolver = SimpleClusterResolver(
        cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
        task_type=task_type,
        task_id=task_id,
        num_accelerators={"GPU": self._num_gpus_per_worker})
    self._initialize_multi_worker(cluster_resolver)

  if session_config:
    session_config.CopyFrom(self._update_config_proto(session_config))
Example 8: testClusterSpecAsInput
def testClusterSpecAsInput(self):
  cluster_spec = server_lib.ClusterSpec({
      "chief": ["127.0.0.1:1234"],
      "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
      "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
  })
  self.assert_same_cluster(
      cluster_spec, multi_worker_util.normalize_cluster_spec(cluster_spec))
Example 9: _split_cluster_for_evaluator
def _split_cluster_for_evaluator(cluster_spec, task_type):
  """Split the cluster for evaluator since it needn't talk to other tasks."""
  # Splitting the cluster is important to prevent the evaluator from talking
  # to other tasks in the cluster. Since we allow the evaluator not to use
  # distribution strategies, ops in the evaluator task may have unspecified
  # devices. Those ops may end up on other tasks if we don't split the
  # cluster.
  new_cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_spec).as_dict()
  if task_type == _TaskType.EVALUATOR:
    assert _TaskType.EVALUATOR in new_cluster_spec
    new_cluster_spec = {
        _TaskType.EVALUATOR: new_cluster_spec[_TaskType.EVALUATOR]
    }
  else:
    new_cluster_spec.pop(_TaskType.EVALUATOR, None)
  return multi_worker_util.normalize_cluster_spec(new_cluster_spec)
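To make the split concrete, a hypothetical call is sketched below; the addresses are placeholders, and _TaskType.EVALUATOR is assumed to be the string "evaluator" as in distribute_coordinator.

full_cluster = {
    "chief": ["10.0.0.1:2222"],
    "worker": ["10.0.0.2:2222"],
    "evaluator": ["10.0.0.3:2222"],
}

# For the evaluator task only the evaluator job is kept, so the returned
# ClusterSpec contains just {"evaluator": ["10.0.0.3:2222"]}.
eval_spec = _split_cluster_for_evaluator(full_cluster, "evaluator")

# For any other task type the evaluator job is dropped and the chief/worker
# entries are left untouched.
worker_spec = _split_cluster_for_evaluator(full_cluster, "worker")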
Example 10: _initialize_multi_worker
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training."""
  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  num_gpus = cluster_resolver.num_accelerators()
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id` in the `cluster_resolver`.")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device,)

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
Example 11: _configure
def _configure(self,
               session_config=None,
               cluster_spec=None,
               task_type=None,
               task_id=None):
  """Configures the strategy class.

  The strategy object will be re-initialized if `cluster_spec` is given but
  was not passed in the constructor.

  Args:
    session_config: not used currently.
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations.
    task_type: the current task type.
    task_id: the current task id.

  Raises:
    ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
      not.
  """
  if not self._cluster_spec and cluster_spec:
    # If a `cluster_spec` is already passed in, do nothing here.
    # TODO(yuefengz): check `cluster_spec` is the same if this object has
    # already been initialized with a `cluster_spec`.
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, must also specify "
                       "`task_type` and `task_id`.")
    self._cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_spec)
    self._task_type = task_type
    self._task_id = task_id
    self._initialize_multi_worker(self._num_gpus_per_worker,
                                  self._cluster_spec, task_type, task_id)

  if not session_config or not self._cluster_spec:
    return

  session_config.isolate_session_state = False

  assert self._cluster_spec
  assert self._task_type
  assert self._task_id is not None

  # The device filters prevent communication between workers.
  if self._task_type not in ["chief", "worker"]:
    return
  del session_config.device_filters[:]
  session_config.device_filters.extend(
      ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
Example 12: _cluster_spec_to_device_list
def _cluster_spec_to_device_list(cluster_spec, num_gpus_per_worker):
  """Returns a device list given a cluster spec."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  devices = []
  for task_type in ("chief", "worker"):
    for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
      if num_gpus_per_worker == 0:
        devices.append("/job:%s/task:%d" % (task_type, task_id))
      else:
        devices.extend([
            "/job:%s/task:%d/device:GPU:%i" % (task_type, task_id, gpu_id)
            for gpu_id in range(num_gpus_per_worker)
        ])
  return devices
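A quick sketch of the expected output, assuming the helper above is in scope; the addresses are placeholders.

cluster = {
    "chief": ["10.0.0.1:2222"],
    "worker": ["10.0.0.2:2222", "10.0.0.3:2222"],
}

# Without GPUs there is one device string per chief/worker task:
#   ["/job:chief/task:0", "/job:worker/task:0", "/job:worker/task:1"]
print(_cluster_spec_to_device_list(cluster, num_gpus_per_worker=0))

# With two GPUs per worker each task expands into its GPU devices, e.g.
#   "/job:chief/task:0/device:GPU:0", "/job:chief/task:0/device:GPU:1",
#   "/job:worker/task:0/device:GPU:0", ...
print(_cluster_spec_to_device_list(cluster, num_gpus_per_worker=2))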
Example 13: estimator_train
def estimator_train(estimator, train_distributed_fn, hooks):
  """Run distribute coordinator for Estimator's `train` method."""
  assert estimator._config._distribute_coordinator_mode
  run_config = estimator._config
  assert estimator._config.cluster_spec
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      estimator._config.cluster_spec)
  assert estimator._config._train_distribute

  if 'evaluator' in cluster_spec.jobs:
    raise ValueError("'evaluator' job is not supported if you don't use "
                     '`train_and_evaluate`')

  if (estimator._config._distribute_coordinator_mode !=  # pylint: disable=protected-access
      dc.CoordinatorMode.STANDALONE_CLIENT):
    raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
                     '`estimator.train`')

  if estimator._config._train_distribute.extended.experimental_between_graph:
    # TODO(yuefengz): remove this limitation once we figure out how to merge
    # return values from `_worker_fn`s.
    raise ValueError('`Estimator.train` API is not supported for %s with '
                     '`STANDALONE_CLIENT` mode.' %
                     estimator._config._train_distribute.__class__.__name__)

  def _worker_fn(strategy):
    """Function for worker task."""
    local_estimator = copy.deepcopy(estimator)
    local_estimator._config._train_distribute = strategy
    context = dc_context.get_current_worker_context()
    _init_run_config_from_worker_context(local_estimator._config, context)
    logging.info('Updated config: %s', str(vars(local_estimator._config)))
    local_estimator._train_distribution = strategy

    if context.is_chief:
      chief_hooks = hooks
    else:
      chief_hooks = []
    train_distributed_fn(local_estimator, strategy, chief_hooks)
    return local_estimator

  return dc.run_distribute_coordinator(
      _worker_fn,
      estimator._config.train_distribute,
      mode=run_config._distribute_coordinator_mode,
      cluster_spec=cluster_spec,
      session_config=run_config.session_config)
Example 14: testClusterDefAsInput
def testClusterDefAsInput(self):
  cluster_def = cluster_pb2.ClusterDef()
  job = cluster_def.job.add()
  job.name = "chief"
  job.tasks[0] = "127.0.0.1:1234"

  job = cluster_def.job.add()
  job.name = "worker"
  job.tasks[0] = "127.0.0.1:8964"
  job.tasks[1] = "127.0.0.1:2333"

  job = cluster_def.job.add()
  job.name = "ps"
  job.tasks[0] = "127.0.0.1:1926"
  job.tasks[1] = "127.0.0.1:3141"

  self.assert_same_cluster(
      cluster_def, multi_worker_util.normalize_cluster_spec(cluster_def))
Example 15: _initialize_multi_worker
def _initialize_multi_worker(self, num_gpus, cluster_spec):
  """Initializes the object for multi-worker training."""
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._cluster_spec = cluster_spec

  self._workers = []
  for job in ["chief", "worker"]:
    for task in range(len(cluster_spec.as_dict().get(job, []))):
      self._workers.append("/job:%s/task:%d" % (job, task))

  if num_gpus is None:
    raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
  if num_gpus > 0:
    self._worker_device_map = {
        worker: [
            device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
            for gpu in range(num_gpus)
        ] for worker in self._workers
    }
  else:
    self._worker_device_map = {
        worker: [device_util.canonicalize(worker, "/device:CPU:0")]
        for worker in self._workers
    }

  devices = nest.flatten(self._worker_device_map)

  # Setting `_default_device` will add a device scope in the
  # distribution.scope. We set the default device to the first worker. When
  # users specify device under distribution.scope by
  #   with tf.device("/cpu:0"):
  #     ...
  # their ops will end up on the cpu device of its first worker, e.g.
  # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
  self._default_device = self._workers[0]

  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerDevice(
      {d: i for i, d in enumerate(devices)})