本文整理汇总了Python中horovod.tensorflow.init方法的典型用法代码示例。如果您正苦于以下问题：Python tensorflow.init方法的具体用法？Python tensorflow.init怎么用？Python tensorflow.init使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块horovod.tensorflow的用法示例。
在下文中一共展示了tensorflow.init方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: evaluate
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def evaluate(self, dataset, epoch):
    """Run one evaluation pass over ``dataset`` and report the mean loss.

    Args:
        dataset: iterable of sample batches; every batch goes through
            ``self.model.prepare_samples`` before the evaluation step.
        epoch: forwarded to ``self.metric_checker`` as ``evaluate_epoch``
            when the final summary line is logged.

    Returns:
        A ``(mean_loss, metrics)`` tuple: the value of the running
        ``AverageLoss`` metric and the metrics produced by the last
        evaluation step (``None`` when ``dataset`` yields no batch).
    """
    running_loss = tf.keras.metrics.Mean(name="AverageLoss")
    metrics = None

    step_fn = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        step_fn = tf.function(step_fn, input_signature=self.sample_signature)

    # Start from clean metric state so metric.result() begins at 0.
    self.model.reset_metrics()
    for step_index, raw_samples in enumerate(dataset):
        prepared = self.model.prepare_samples(raw_samples)
        loss, metrics = step_fn(prepared)
        if step_index % self.hparams.log_interval == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        # A dict-valued loss is collapsed into a single scalar by summing its values.
        if isinstance(loss, dict):
            batch_loss = sum(loss.values())
        else:
            batch_loss = loss
        running_loss.update_state(batch_loss)

    logging.info(self.metric_checker(running_loss.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return running_loss.result(), metrics
示例2: test_horovod_allreduce_type_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def test_horovod_allreduce_type_error(self):
    """Check that allreduce fails when ranks submit tensors of different dtypes.

    Even ranks build an int32 tensor and odd ranks a float32 tensor of the
    same shape; running the allreduce is expected to raise
    ``tf.errors.FailedPreconditionError``.
    """
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()

    # A dtype mismatch needs at least two workers; nothing to check otherwise.
    if world_size == 1:
        return

    with self.test_session(config=self.config) as session:
        # Identical shape on every rank; only the dtype depends on rank parity.
        shape = [17] * 3
        if my_rank % 2 == 0:
            element_type = tf.int32
        else:
            element_type = tf.float32
        mismatched = tf.ones(shape, dtype=element_type)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(mismatched))
示例3: test_horovod_allreduce_cpu_gpu_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def test_horovod_allreduce_cpu_gpu_error(self):
    """Check that allreduce fails when ranks mix CPU and GPU placement.

    Workers with an even local rank place the op on their GPU, the others on
    the CPU; running the allreduce is expected to raise
    ``tf.errors.FailedPreconditionError``.
    """
    # The scenario needs a CUDA GPU; bail out quietly when none is available.
    if not tf.test.is_gpu_available(cuda_only=True):
        return

    hvd.init()
    local_rank = hvd.local_rank()
    world_size = hvd.size()

    # A placement mismatch needs at least two workers.
    if world_size == 1:
        return

    if local_rank % 2 == 0:
        device = "/gpu:%d" % local_rank
    else:
        device = "/cpu:0"

    with self.test_session(config=self.config) as session, tf.device(device):
        # Shape and dtype are identical everywhere; only the device differs.
        shape = [17] * 3
        tensor = tf.ones(shape, dtype=tf.int32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(tensor))
示例4: test_horovod_allgather_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def test_horovod_allgather_error(self):
    """Check that allgather fails when a non-leading dimension differs.

    Every rank builds a float32 tensor whose second dimension is
    ``10 * (rank + 1)``; running the allgather is expected to raise
    ``tf.errors.FailedPreconditionError``.
    """
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()

    # A shape mismatch needs at least two workers.
    if world_size == 1:
        return

    with self.test_session(config=self.config) as session:
        # Dimension 1 is rank-dependent, so no two ranks agree on the shape.
        shape = [17, 10 * (my_rank + 1), 17]
        payload = tf.ones(shape, dtype=tf.float32) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(payload))
示例5: test_horovod_allgather_type_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def test_horovod_allgather_type_error(self):
    """Check that allgather fails when ranks submit tensors of different dtypes.

    Even ranks use int32 and odd ranks float32 with an identical shape;
    running the allgather is expected to raise
    ``tf.errors.FailedPreconditionError``.
    """
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()

    # A dtype mismatch needs at least two workers.
    if world_size == 1:
        return

    with self.test_session(config=self.config) as session:
        shape = [17] * 3
        if my_rank % 2 == 0:
            element_type = tf.int32
        else:
            element_type = tf.float32
        payload = tf.ones(shape, dtype=element_type) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(payload))
示例6: test_horovod_broadcast_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def test_horovod_broadcast_error(self):
    """Check that broadcast fails when tensor shapes differ between ranks.

    Every rank builds a float32 tensor whose second dimension is
    ``10 * (rank + 1)``; broadcasting from root rank 0 is expected to raise
    ``tf.errors.FailedPreconditionError``.
    """
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()

    # A shape mismatch needs at least two workers.
    if world_size == 1:
        return

    with self.test_session(config=self.config) as session:
        # Dimension 1 is rank-dependent, so no two ranks agree on the shape.
        shape = [17, 10 * (my_rank + 1), 17]
        payload = tf.ones(shape, dtype=tf.float32) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(payload, 0))
示例7: test_horovod_broadcast_type_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def test_horovod_broadcast_type_error(self):
    """Check that broadcast fails when ranks submit tensors of different dtypes.

    Even ranks use int32 and odd ranks float32 with an identical shape;
    broadcasting from root rank 0 is expected to raise
    ``tf.errors.FailedPreconditionError``.
    """
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()

    # A dtype mismatch needs at least two workers.
    if world_size == 1:
        return

    with self.test_session(config=self.config) as session:
        shape = [17] * 3
        if my_rank % 2 == 0:
            element_type = tf.int32
        else:
            element_type = tf.float32
        payload = tf.ones(shape, dtype=element_type) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(payload, 0))
示例8: setup
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def setup():
    """Initialise Horovod once and cross-check it against mpi4py.

    Returns:
        ``False`` when the module-level ``horovod_installed`` flag is falsy;
        otherwise the ``hvd`` module. Repeated calls after the first
        initialisation return ``hvd`` immediately.

    Raises:
        AssertionError: if Horovod reports no MPI multi-threading support, or
            if its size/rank disagree with ``MPI.COMM_WORLD``.
    """
    global horovod_initialized

    if not horovod_installed:
        return False
    if horovod_initialized:
        return hvd

    hvd.init()
    # The flag is raised right after init, before the consistency checks below.
    horovod_initialized = True
    expected_size = hvd.size()
    expected_rank = hvd.rank()

    # MPI multi-threading must be available.
    assert hvd.mpi_threads_supported()

    # Tell mpi4py not to initialise MPI again when it is imported.
    from mpi4py import rc as mpi4py_rc
    mpi4py_rc.initialize = False

    from mpi4py import MPI
    world = MPI.COMM_WORLD

    # Horovod and mpi4py must agree on the world size and on this rank.
    assert expected_size == world.Get_size()
    assert expected_rank == world.Get_rank()
    return hvd
示例9: __init__
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def __init__(self, average=True):
    """
    Set up a trainer backed by BytePS instead of Horovod.

    Args:
        average (bool): whether to average or sum the gradients across processes.

    Raises:
        AssertionError: if the environment does not describe a BytePS worker
            (``DMLC_ROLE`` must be ``"worker"`` and both ``DMLC_WORKER_ID``
            and ``DMLC_NUM_WORKER`` must be set).
    """
    import byteps.tensorflow as bps

    # BytePS has the same interface as Horovod, so it is stored under the
    # ``hvd`` attribute name.
    self.hvd = bps
    # Alias push_pull as allreduce, see https://github.com/bytedance/byteps/issues/8
    self.hvd.allreduce = bps.push_pull

    env = os.environ
    assert env.get("DMLC_ROLE", None) == "worker"
    assert "DMLC_WORKER_ID" in env and "DMLC_NUM_WORKER" in env

    bps.init()
    self.is_chief = bps.rank() == 0
    self._local_rank = bps.local_rank()
    self._rank = bps.rank()

    self._average = average
    # Compression is not used with this backend.
    self._compression = None
    self._has_compression = False

    logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
    SingleCostTrainer.__init__(self)
示例10: __init__
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def __init__(self, optimizer: TFOptimizer, comm=None):
    """Wrap ``optimizer`` so that its op is a Horovod distributed optimizer.

    Args:
        optimizer: the optimizer to wrap; its ``executor`` and ``loss`` are
            passed to the parent constructor and its ``op`` is handed to
            ``hvd.DistributedOptimizer``.
        comm: communication object to use; a new ``CommunicationNetwork`` is
            created when it is ``None``.

    Raises:
        ImportError: if ``horovod.tensorflow`` cannot be imported.
    """
    super().__init__(optimizer.executor, optimizer.loss)

    try:
        import horovod.tensorflow as hvd
    except ImportError:
        raise ImportError('Cannot import Horovod')
    else:
        hvd.init()

    self.op = hvd.DistributedOptimizer(optimizer.op)
    self.communication = CommunicationNetwork() if comm is None else comm
    self.original_optimizer = optimizer
示例11: setup_horovod
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def setup_horovod():
    """Initialise Horovod and build an MPI-based averaging helper.

    Returns:
        A ``(hvd, MPI, is_root, mpi_average)`` tuple: the Horovod module, the
        mpi4py ``MPI`` module, whether this process is Horovod rank 0, and a
        function averaging a list of numbers across all processes.

    Raises:
        AssertionError: if MPI multi-threading is unsupported or Horovod and
            mpi4py disagree on the world size.
    """
    import horovod.tensorflow as hvd

    hvd.init()

    # MPI multi-threading must be available.
    assert hvd.mpi_threads_supported()

    from mpi4py import MPI

    assert hvd.size() == MPI.COMM_WORLD.Get_size()

    is_root = hvd.rank() == 0

    def mpi_average(local_list):
        """Gather per-process sums and counts on rank 0 and average them.

        Returns ``(average, total_count)`` on the root process and
        ``(None, None)`` on every other process.
        """
        values = [float(item) for item in local_list]
        world = MPI.COMM_WORLD
        gathered_sums = world.gather(sum(values), root=0)
        gathered_counts = world.gather(len(values), root=0)
        # Only the root combines the gathered values.
        if not is_root:
            return None, None
        total_count = sum(gathered_counts)
        return sum(gathered_sums) / total_count, total_count

    return hvd, MPI, is_root, mpi_average
示例12: evaluate
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def evaluate(self, dataset, epoch):
    """Run one evaluation pass over ``dataset`` and return the mean loss.

    Args:
        dataset: iterable of sample batches; every batch goes through
            ``self.model.prepare_samples`` before the evaluation step.
        epoch: forwarded to ``self.metric_checker`` as ``evaluate_epoch``
            when the final summary line is logged.

    Returns:
        The value of the running ``AverageLoss`` metric after all batches.
    """
    running_loss = tf.keras.metrics.Mean(name="AverageLoss")
    metrics = None

    step_fn = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        step_fn = tf.function(step_fn, input_signature=self.sample_signature)

    # Start from clean metric state so metric.result() begins at 0.
    self.model.reset_metrics()
    for step_index, raw_samples in enumerate(dataset):
        prepared = self.model.prepare_samples(raw_samples)
        loss, metrics = step_fn(prepared)
        if step_index % self.hparams.log_interval == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        running_loss.update_state(loss)

    logging.info(self.metric_checker(running_loss.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return running_loss.result()
示例13: __init__
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def __init__(self, average=True, compression=None):
    """
    Set up a trainer backed by Horovod.

    Args:
        average (bool): whether to average or sum the gradients across processes.
        compression: `hvd.Compression.fp16` or `hvd.Compression.none`
    """
    if 'pyarrow' in sys.modules:
        logger.warn("Horovod and pyarrow may conflict due to pyarrow bugs.")
    # lazy import
    import re
    import horovod.tensorflow as hvd
    import horovod

    # Parse up to three leading version components. Only the leading digits
    # of each component are used, so suffixed versions such as "0.19.0rc1"
    # yield (0, 19, 0) instead of raising ValueError from int().
    version_parts = []
    for component in horovod.__version__.split('.')[:3]:
        leading_digits = re.match(r'[0-9]+', component)
        if leading_digits is None:
            # A component without a numeric prefix ends the numeric version.
            break
        version_parts.append(int(leading_digits.group()))
    hvd_version = tuple(version_parts)

    self.hvd = hvd
    hvd.init()
    self.is_chief = hvd.rank() == 0
    self._local_rank = hvd.local_rank()
    self._rank = hvd.rank()
    self._average = average
    self._compression = compression
    # The compression flag is enabled only for Horovod 0.15.0 and newer.
    self._has_compression = hvd_version >= (0, 15, 0)
    logger.info("[HorovodTrainer] local rank={}".format(self._local_rank))
    super(HorovodTrainer, self).__init__()
    self.BROADCAST_EVERY_EPOCH = True
示例14: _worker_fn
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def _worker_fn(client, task, net_if):
    """Join the Horovod Gloo rendezvous and run training for this task.

    The function publishes this task's address, waits for the rank
    information and rendezvous socket address published under ``chief:0``,
    exports them as ``HOROVOD_*`` environment variables, initialises Horovod
    and then trains the experiment's estimator.

    Args:
        client: handle passed through to ``event`` and ``_task_commons``.
        task: this task's name; any task other than ``'chief:0'`` has
            summaries, checkpoints and step logging disabled.
        net_if: indexable pair; item 0 is exported as ``HOROVOD_GLOO_IFACE``
            and item 1 is the address announced to the driver.
    """
    address = net_if[1]
    event.broadcast(client, f"{task}/addr", address)
    worker_info = event.wait(client, f"chief:0/{address}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ.update({
        'HOROVOD_CONTROLLER': 'gloo',
        'HOROVOD_CPU_OPERATIONS': 'gloo',
        'HOROVOD_GLOO_IFACE': net_if[0],
    })

    # The comma-separated payload lists these six values in exactly this order.
    placement_vars = (
        'HOROVOD_RANK',
        'HOROVOD_SIZE',
        'HOROVOD_LOCAL_RANK',
        'HOROVOD_LOCAL_SIZE',
        'HOROVOD_CROSS_RANK',
        'HOROVOD_CROSS_SIZE',
    )
    for position, var_name in enumerate(placement_vars):
        os.environ[var_name] = worker_info[position]

    hvd.init()

    experiment = _task_commons._get_experiment(client)
    estimator = experiment.estimator

    if task != 'chief:0':
        # Non-chief tasks do nothing but training: drop summaries,
        # checkpoints and step logging to improve training speed.
        estimator._model_dir = "."
        estimator._config = estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None
        )

    logger.info("start training..")

    train_spec = experiment.train_spec
    estimator.train(
        train_spec.input_fn,
        hooks=train_spec.hooks,
        max_steps=train_spec.max_steps)
示例15: _driver_fn
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import init [as 别名]
def _driver_fn(client, net_if):
    """Discover workers, hand out Horovod ranks and start the rendezvous server.

    Args:
        client: handle passed through to ``event`` and ``_task_commons``.
        net_if: indexable; item 1 is this driver's address, used both as the
            first host entry and in the announced rendezvous socket address.

    Returns:
        The ``listen_thread`` attribute of the started ``RendezvousServer``.
    """
    driver_addr = net_if[1]

    # Worker discovery: the driver itself is always the first host entry.
    host_specs = [f"{driver_addr}:{N_PROCESS_PER_WORKER}"]
    for cluster_task in _task_commons._get_cluster_tasks(client):
        if 'worker' not in cluster_task:
            continue
        worker_addr = event.wait(client, f"{cluster_task}/addr")
        logger.info(f"{cluster_task}: {worker_addr}")
        host_specs.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}")
    n_workers = len(host_specs)

    # Worker task allocation to workers.
    hosts = gloo_run.parse_hosts(','.join(host_specs))
    host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers)

    for host in host_alloc_plan:
        # NOTE: the continuation lines must stay flush-left; any leading
        # whitespace would become part of the broadcast payload.
        host_info = f"""\
{host.rank},{host.size},{host.local_rank},\
{host.local_size},{host.cross_rank},{host.cross_size}\
"""
        event.broadcast(client, f"{cluster.get_task()}/{host.hostname}", host_info)

    rendezvous = RendezvousServer(verbose=1)
    rendezvous_port = rendezvous.start_server()
    rendezvous.httpd.init(host_alloc_plan)
    event.broadcast(client, f"{cluster.get_task()}/sock_addr", f"{driver_addr}:{rendezvous_port}")
    return rendezvous.listen_thread