本文整理汇总了Python中horovod.tensorflow.rank方法的典型用法代码示例。如果您正苦于以下问题:Python horovod.tensorflow.rank方法的具体用法?Python horovod.tensorflow.rank怎么用?Python horovod.tensorflow.rank使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块horovod.tensorflow的用法示例。
在下文中一共展示了tensorflow.rank方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: FastTrain
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def FastTrain():
    """Train the FastText estimator; on rank 0 also evaluate and optionally export."""
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_fn = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    # Under Horovod, rank 0 broadcasts its initial weights to all other workers.
    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)] if FLAGS.horovod else None
    estimator.train(input_fn=train_fn, steps=FLAGS.train_steps, hooks=train_hooks)
    print("TRAIN COMPLETE")
    # Evaluation and export run on a single process only (rank 0 when distributed).
    if not FLAGS.horovod or hvd.rank() == 0:
        print("EVALUATE")
        eval_fn = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        # eval_metrics = { "accuracy": tf.metrics.accuracy(labels, predictions) }
        eval_result = estimator.evaluate(input_fn=eval_fn, steps=FLAGS.eval_steps, hooks=None)
        print(eval_result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            estimator.export_savedmodel(FLAGS.export_dir,
                                        inputs.ServingInputFn(FLAGS.use_ngrams))
示例2: print_act_stats
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def print_act_stats(x, _str=""):
    """Attach a debug print of activation statistics to `x` (rank-0 worker only).

    Prints min/mean/max of the per-feature mean and standard deviation of `x`.
    Returns `x` unchanged (possibly wrapped in tf.Print so the stats are
    emitted when the tensor is evaluated).

    Raises:
        ValueError: if `x` is not rank 1, 2, or 4.  The original code left
            the moments unbound in that case and crashed later with an
            opaque UnboundLocalError.
    """
    if not do_print_act_stats:
        return x
    # Only one worker should spam the log in a Horovod run.
    if hvd.rank() != 0:
        return x
    ndims = len(x.get_shape())
    if ndims in (1, 2):
        # Reduce over the batch axis; rank-1 and rank-2 use the same axes.
        x_mean, x_var = tf.nn.moments(x, [0], keep_dims=True)
    elif ndims == 4:
        # NHWC activations: reduce over batch and spatial axes, keep channels.
        x_mean, x_var = tf.nn.moments(x, [0, 1, 2], keep_dims=True)
    else:
        raise ValueError(
            "print_act_stats expects a tensor of rank 1, 2 or 4, got rank %d" % ndims)
    x_stddev = tf.sqrt(x_var)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(x_stddev), tf.reduce_mean(x_stddev), tf.reduce_max(x_stddev)]
    return tf.Print(x, stats, "["+_str+"] "+x.name)
# Allreduce methods
示例3: train_input_fn
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def train_input_fn(input_file, _parse_fn, name_to_features,
                   params, **kargs):
    """Build a one-shot TFRecord training pipeline and return its next-element op.

    When `if_shard` is "1" (the default) each Horovod worker reads a disjoint
    shard of the records; otherwise every worker reads the full dataset.
    """
    if_shard = kargs.get("if_shard", "1")
    dataset = tf.data.TFRecordDataset(input_file, buffer_size=params.get("buffer_size", 100))
    print("==hvd size {}, rank {}==".format(hvd.size(), hvd.rank()))
    if if_shard == "1":
        dataset = dataset.shard(hvd.size(), hvd.rank())
    dataset = (dataset
               .map(lambda record: _parse_fn(record, name_to_features))
               .shuffle(buffer_size=params.get("buffer_size", 1024) + 3 * params.get("batch_size", 32),
                        seed=np.random.randint(0, 1e10, 1)[0],
                        reshuffle_each_iteration=True)
               .batch(params.get("batch_size", 32))
               .repeat(params.get("epoch", 100)))
    return dataset.make_one_shot_iterator().get_next()
示例4: train
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def train(self, dataset, total_batches=-1):
    """Update the model for one epoch over `dataset` (at most `total_batches` batches)."""
    train_step = self.train_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        train_step = tf.function(train_step, input_signature=self.sample_signature)
    for batch_idx, raw_samples in enumerate(dataset.take(total_batches)):
        # Run one optimization step on the prepared batch.
        samples = self.model.prepare_samples(raw_samples)
        loss, metrics = train_step(samples)
        # Horovod: broadcast variables from rank 0 after the FIRST step so that
        # optimizer slot variables already exist when they are synchronized.
        # This keeps all workers consistent whether training starts from random
        # weights or from a restored checkpoint.
        if batch_idx == 0:
            hvd.broadcast_variables(self.model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)
        # Only rank 0 logs, to avoid duplicated output across workers.
        if batch_idx % self.hparams.log_interval == 0 and hvd.rank() == 0:
            logging.info(self.metric_checker(loss, metrics))
            self.model.reset_metrics()
示例5: evaluate
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def evaluate(self, dataset, epoch=0):
    """Evaluate the model over `dataset`; return the mean loss (a tf metric result)."""
    loss_metric = tf.keras.metrics.Mean(name="AverageLoss")
    loss, metrics = None, None
    evaluate_step = self.evaluate_step
    if self.hparams.enable_tf_function:
        logging.info("please be patient, enable tf.function, it takes time ...")
        evaluate_step = tf.function(evaluate_step, input_signature=self.sample_signature)
    self.model.reset_metrics()
    for batch_idx, raw_samples in enumerate(dataset):
        samples = self.model.prepare_samples(raw_samples)
        loss, metrics = evaluate_step(samples)
        # Only rank 0 logs per-batch progress, to avoid duplicated output.
        if batch_idx % self.hparams.log_interval == 0 and hvd.rank() == 0:
            logging.info(self.metric_checker(loss, metrics, -2))
        loss_metric.update_state(loss)
    if hvd.rank() == 0:
        logging.info(self.metric_checker(loss_metric.result(), metrics, evaluate_epoch=epoch))
    self.model.reset_metrics()
    return loss_metric.result()
示例6: test_horovod_allreduce_type_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def test_horovod_allreduce_type_error(self):
    """Allreduce must raise when different ranks contribute tensors of different dtypes."""
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()
    # With a single worker there is no cross-rank mismatch to exercise.
    if world_size == 1:
        return
    with self.test_session(config=self.config) as session:
        # Identical shape on every rank; dtype alternates with rank parity.
        shape = [17] * 3
        mismatched = tf.ones(shape,
                             dtype=tf.int32 if my_rank % 2 == 0 else tf.float32)
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allreduce(mismatched))
示例7: test_horovod_allreduce_cpu_gpu_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def test_horovod_allreduce_cpu_gpu_error(self):
    """Allreduce must raise when some ranks reduce on CPU and others on GPU."""
    # Skip entirely on machines without a CUDA GPU.
    if not tf.test.is_gpu_available(cuda_only=True):
        return
    hvd.init()
    my_local_rank = hvd.local_rank()
    world_size = hvd.size()
    # With a single worker there is no cross-rank mismatch to exercise.
    if world_size == 1:
        return
    # Even local ranks place the tensor on their GPU, odd ranks on CPU.
    device = "/gpu:%d" % my_local_rank if my_local_rank % 2 == 0 else "/cpu:0"
    with self.test_session(config=self.config) as session:
        with tf.device(device):
            shape = [17] * 3
            operand = tf.ones(shape, dtype=tf.int32)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(operand))
示例8: test_horovod_allgather_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def test_horovod_allgather_error(self):
    """Allgather must raise when any dimension other than the first differs across ranks."""
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()
    # With a single worker there is no cross-rank mismatch to exercise.
    if world_size == 1:
        return
    with self.test_session(config=self.config) as session:
        # Make the SECOND dimension rank-dependent; only the first may differ.
        shape = [17] * 3
        shape[1] = 10 * (my_rank + 1)
        gathered_input = tf.ones(shape, dtype=tf.float32) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(gathered_input))
示例9: test_horovod_allgather_type_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def test_horovod_allgather_type_error(self):
    """Allgather must raise when the gathered tensors' dtypes differ across ranks."""
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()
    # With a single worker there is no cross-rank mismatch to exercise.
    if world_size == 1:
        return
    with self.test_session(config=self.config) as session:
        shape = [17] * 3
        # Dtype alternates with rank parity to force the mismatch.
        mismatched_dtype = tf.int32 if my_rank % 2 == 0 else tf.float32
        gathered_input = tf.ones(shape, dtype=mismatched_dtype) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.allgather(gathered_input))
示例10: test_horovod_broadcast_type_error
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def test_horovod_broadcast_type_error(self):
    """Broadcast must raise when the broadcast tensors' dtypes differ across ranks."""
    hvd.init()
    my_rank = hvd.rank()
    world_size = hvd.size()
    # With a single worker there is no cross-rank mismatch to exercise.
    if world_size == 1:
        return
    with self.test_session(config=self.config) as session:
        shape = [17] * 3
        # Dtype alternates with rank parity to force the mismatch.
        mismatched_dtype = tf.int32 if my_rank % 2 == 0 else tf.float32
        broadcast_input = tf.ones(shape, dtype=mismatched_dtype) * my_rank
        with self.assertRaises(tf.errors.FailedPreconditionError):
            session.run(hvd.broadcast(broadcast_input, 0))
示例11: setup
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def setup():
    """Initialize Horovod exactly once and cross-check it against MPI.

    Returns the `hvd` module on success, or False when Horovod is not
    installed.  Repeated calls are idempotent.
    """
    if not horovod_installed:
        return False
    global horovod_initialized
    if horovod_initialized:
        return hvd
    hvd.init()
    horovod_initialized = True
    world_size, world_rank = hvd.size(), hvd.rank()
    # Horovod's background thread requires MPI multi-threading support.
    assert hvd.mpi_threads_supported()
    # Tell mpi4py not to call MPI_Init again — Horovod already initialized MPI.
    import mpi4py.rc
    mpi4py.rc.initialize = False
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # Horovod and MPI must agree on the job topology.
    assert world_size == comm.Get_size()
    assert world_rank == comm.Get_rank()
    return hvd
示例12: is_enabled
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def is_enabled():
    """Decide whether this process should run under Horovod.

    True when USE_HOROVOD is set in the environment, or when the parent
    process is `horovodrun`/`mpirun` and Horovod imported successfully.
    Exits the process when a Horovod launcher is detected but the import
    failed.
    """
    if os.getenv("USE_HOROVOD"):
        return True
    parent_pid = os.getppid()
    # No usable parent (init/orphaned): definitely not launched by horovodrun.
    if parent_pid <= 1:
        return False
    launcher = _get_pname(parent_pid)
    if not (launcher.startswith("horovodrun") or launcher.startswith("mpirun")):
        return False
    if horovod_installed:
        return True
    print("you're trying to run on horovod, but importing Horovod failed. exit.")
    sys.exit(1)
# return True if horovod is not enabled, or enabled and the process is rank 0.
示例13: training_step
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def training_step(images, labels, first_batch):
    """Run one distributed optimization step; return the batch loss tensor."""
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)
    # Horovod: wrap the tape so gradients are averaged across all workers.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
    # Horovod: broadcast variables from rank 0 AFTER the first apply_gradients
    # call, so the optimizer slot variables exist before they are synchronized.
    # This keeps all workers consistent whether training starts from random
    # weights or from a restored checkpoint.
    if first_batch:
        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return loss_value
# Horovod: adjust number of steps based on number of GPUs.
示例14: _get_runconfig
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
    """Build an Estimator RunConfig; in distributed mode pin this process to one GPU."""
    session_config = None
    if is_distributed:
        # Horovod: each process sees only the GPU matching its local rank.
        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # session_config=None matches RunConfig's default, so the non-distributed
    # branch behaves exactly as if the argument were omitted.
    return tf.estimator.RunConfig(
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=None,
        session_config=session_config,
        log_step_count_steps=100,
    )
示例15: _get_runconfig
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import rank [as 别名]
def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
    """Build an Estimator RunConfig; in distributed mode pin this process to one GPU."""
    session_config = None
    if is_distributed:
        # Horovod: each process sees only the GPU matching its local rank.
        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # session_config=None matches RunConfig's default, so the non-distributed
    # branch behaves exactly as if the argument were omitted.
    return tf.estimator.RunConfig(
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=None,
        session_config=session_config,
        log_step_count_steps=100,
    )