本文整理汇总了Python中horovod.tensorflow.BroadcastGlobalVariablesHook方法的典型用法代码示例。如果您正苦于以下问题：Python tensorflow.BroadcastGlobalVariablesHook方法的具体用法？Python tensorflow.BroadcastGlobalVariablesHook怎么用？Python tensorflow.BroadcastGlobalVariablesHook使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块horovod.tensorflow的用法示例。
在下文中一共展示了tensorflow.BroadcastGlobalVariablesHook方法的8个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: FastTrain
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def FastTrain():
    """Train the fastText estimator, then evaluate and optionally export it.

    All settings are read from the module-level ``FLAGS``. When
    ``FLAGS.horovod`` is set, a Horovod broadcast hook with root rank 0 is
    passed to ``estimator.train``. Evaluation (and the export that follows it)
    only runs when Horovod is disabled or when ``hvd.rank()`` is 0.

    NOTE(review): the listing this was taken from had lost its indentation;
    placing the export step inside the rank-0 branch is a reconstruction --
    confirm against the upstream project.
    """
    print("FastTrain", FLAGS.train_steps)
    estimator = FastTextEstimator(FLAGS.model_dir)
    print("TEST" + FLAGS.train_records)
    train_input = InputFn(tf.estimator.ModeKeys.TRAIN, FLAGS.train_records)
    print("STARTING TRAIN")
    # Without Horovod no extra hooks are passed to the estimator.
    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)] if FLAGS.horovod else None
    estimator.train(
        input_fn=train_input,
        steps=FLAGS.train_steps,
        hooks=train_hooks,
    )
    print("TRAIN COMPLETE")

    # Single-process runs always evaluate; Horovod runs only do so on rank 0.
    runs_evaluation = not FLAGS.horovod or hvd.rank() == 0
    if runs_evaluation:
        print("EVALUATE")
        eval_input = InputFn(tf.estimator.ModeKeys.EVAL, FLAGS.eval_records)
        result = estimator.evaluate(
            input_fn=eval_input,
            steps=FLAGS.eval_steps,
            hooks=None,
        )
        print(result)
        print("DONE")
        if FLAGS.export_dir:
            print("EXPORTING")
            serving_input_fn = inputs.ServingInputFn(FLAGS.use_ngrams)
            estimator.export_savedmodel(FLAGS.export_dir, serving_input_fn)
示例2: experiment_fn
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def experiment_fn() -> Experiment:
    """Build the wine-quality ``Experiment`` trained with Horovod.

    Returns:
        An ``Experiment`` bundling a ``LinearClassifier`` (whose Adam optimizer
        is wrapped in ``hvd.DistributedOptimizer``), a ``TrainSpec`` that runs
        for at most 10 steps with a Horovod broadcast hook (root rank 0), and
        an ``EvalSpec`` that evaluates for 10 steps.
    """
    # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
    import tensorflow as tf

    def train_input_fn():
        # Training data is shuffled, batched and repeated indefinitely.
        train_set = winequality.get_dataset(WINE_EQUALITY_FILE, split="train")
        return train_set.shuffle(1000).batch(128).repeat()

    def eval_input_fn():
        # Evaluation data is shuffled and batched, but not repeated.
        test_set = winequality.get_dataset(WINE_EQUALITY_FILE, split="test")
        return test_set.shuffle(1000).batch(128)

    estimator = tf.compat.v1.estimator.LinearClassifier(
        feature_columns=winequality.get_feature_columns(),
        model_dir=f"{HDFS_DIR}",
        n_classes=winequality.get_n_classes(),
        # The optimizer is passed as a callable so it is created lazily.
        optimizer=lambda: hvd.DistributedOptimizer(tf.compat.v1.train.AdamOptimizer()),
    )

    train_spec = tf.estimator.TrainSpec(
        train_input_fn,
        max_steps=10,
        hooks=[hvd.BroadcastGlobalVariablesHook(0)],
    )
    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=10,
        start_delay_secs=0,
        throttle_secs=30,
    )
    return Experiment(estimator, train_spec, eval_spec)
示例3: _get_hooks
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def _get_hooks(is_distributed=DISTRIBUTED):
    """Return the training hooks required for the current run mode.

    Args:
        is_distributed: When truthy, a Horovod broadcast hook (root rank 0) is
            created and the rank / cluster size are logged.

    Returns:
        ``[broadcast_hook]`` for distributed runs, otherwise an empty list.
    """
    logger = logging.getLogger(__name__)
    if not is_distributed:
        return []

    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # NOTE(review): the value logged under "Rank" is hvd.local_rank(), i.e. the
    # rank within the local node rather than the global hvd.rank().
    logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
    return [broadcast_hook]
示例4: _get_hooks
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
    """Return the training hooks required for the current run mode.

    Args:
        batch_size: Batch size handed to ``ExamplesPerSecondHook``. In
            distributed mode it is multiplied by ``hvd.size()`` first
            (presumably to report throughput for the whole cluster -- confirm
            against ``ExamplesPerSecondHook``).
        is_distributed: When truthy, a Horovod broadcast hook (root rank 0) is
            added and the rank / cluster size are logged.

    Returns:
        ``[broadcast_hook, examples_per_second_hook]`` for distributed runs,
        otherwise ``[examples_per_second_hook]``.
    """
    logger = logging.getLogger(__name__)
    if not is_distributed:
        return [ExamplesPerSecondHook(batch_size)]

    throughput_hook = ExamplesPerSecondHook(batch_size * hvd.size())
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    logger.info("Rank: {} Cluster Size {}".format(hvd.rank(), hvd.size()))
    return [broadcast_hook, throughput_hook]
示例5: as_operator
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def as_operator(self):
    """Return the minimize op for this optimizer, set up for Horovod.

    Horovod is imported lazily so it is only required when this method is
    actually used. The method pins the network's session to the GPU whose
    index equals ``hvd.local_rank()``, registers a Horovod broadcast hook
    (root rank 0) on the network and then builds the training op.

    Returns:
        The result of ``self.op.minimize`` applied to the network's internal
        loss tensor.

    Raises:
        ImportError: If ``horovod.tensorflow`` cannot be imported. The
            original import failure is attached as ``__cause__`` so the real
            reason (missing package, broken native library, ...) is not lost.
    """
    try:
        import horovod.tensorflow as hvd
    except ImportError as err:
        raise ImportError('Cannot import Horovod') from err

    # One GPU per process: expose only the device matching the local rank.
    self.network.session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    self.network.add_hooks([hvd.BroadcastGlobalVariablesHook(0)])
    loss_tensor = self.network.fetch_internal_tensor(self.loss)
    return self.op.minimize(loss_tensor)
示例6: get_opt
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def get_opt(self, init_lr,
            num_train_steps, **kargs):
    """Create the optimizer selected by ``self.config["opt_type"]``.

    The learning rate starts at ``init_lr`` and is optionally passed through
    ``self.lr_decay_fn`` (config ``"decay" == "decay"``) and ``self.warm_up``
    (config ``"warmup" == "warmup"``). Without warmup it is wrapped in a
    float32 constant tensor. Every distributed mode multiplies the learning
    rate by the worker count before building the base optimizer; the
    single-node fallback uses it unscaled.

    Args:
        init_lr: Initial learning rate.
        num_train_steps: Total number of training steps, forwarded to the
            decay function.
        **kargs: Extra keyword arguments forwarded to the decay, warmup and
            optimizer factories.

    Side effects:
        Sets ``self.learning_rate``, ``self.single_node_learning``,
        ``self.opt`` and ``self.distributed_hooks``.

    Raises:
        KeyError: If ``self.config`` has no ``"opt_type"`` entry.

    NOTE(review): the listing this was taken from had lost its indentation;
    treating the warmup ``if``/``else`` as independent of the decay ``if`` is
    a reconstruction -- confirm against the upstream project.
    """
    learning_rate = init_lr
    if self.config.get("decay", "no") == "decay":
        print("==apply lr decay==")
        learning_rate = self.lr_decay_fn(learning_rate, num_train_steps, **kargs)
    if self.config.get("warmup", "no") == "warmup":
        print("==apply warmup==")
        learning_rate = self.warm_up(learning_rate, init_lr, **kargs)
    else:
        learning_rate = tf.cast(tf.constant(learning_rate), tf.float32)

    self.learning_rate = learning_rate
    self.single_node_learning = learning_rate

    opt_type = self.config["opt_type"]

    if hvd and opt_type == "hvd":
        # Horovod distributed optimizer.
        worker_count = self.config.get("worker_count", hvd.size())
        print("==optimizer hvd size=={}".format(worker_count))
        opt = self.optimizer_op(self.learning_rate*worker_count, **kargs)
        self.opt = hvd.DistributedOptimizer(opt)
        self.distributed_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    elif pai and opt_type == "pai_soar":
        # PAI Soar replicated-variables optimizer.
        worker_count = self.config.get("worker_count", 4)
        print("==optimizer pai_soar size=={}".format(worker_count))
        opt = self.optimizer_op(self.learning_rate*worker_count, **kargs)
        self.opt = pai.ReplicatedVarsOptimizer(opt, clip_norm=self.config.get("clip_norm", 1.0))
        self.distributed_hooks = []
    elif opt_type == "ps_sync":
        # TensorFlow parameter-server optimizer with synchronous replicas.
        worker_count = self.config.get("worker_count", 4)
        print("==optimizer ps_sync size=={}".format(worker_count))
        opt = self.optimizer_op(self.learning_rate*worker_count, **kargs)
        self.opt = tf.train.SyncReplicasOptimizer(opt,
                                                  replicas_to_aggregate=worker_count,
                                                  total_num_replicas=worker_count)
        self.distributed_hooks = [self.opt.make_session_run_hook(self.config["is_chief"], num_tokens=0)]
    elif opt_type == "ps":
        # TensorFlow parameter-server optimizer, asynchronous updates.
        worker_count = self.config.get("worker_count", 4)
        print("==optimizer ps_async size=={}".format(worker_count))
        self.opt = self.optimizer_op(self.learning_rate*worker_count, **kargs)
        # Previously this branch left distributed_hooks unset, unlike every
        # other branch; no extra hooks are needed here, so expose an empty list.
        self.distributed_hooks = []
    else:
        print("==initialization of single node optimizer==")
        self.opt = self.optimizer_op(self.learning_rate, **kargs)
        self.distributed_hooks = []
示例7: main
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def main(_):
    """Train the MNIST conv model with Horovod in a monitored session.

    Each process loads MNIST into a directory suffixed with its Horovod rank,
    builds the graph, wraps an RMSProp optimizer (learning rate scaled by
    ``hvd.size()``) in ``hvd.DistributedOptimizer`` and trains until the
    stop hook's last step, ``20000 // hvd.size()``, is reached.

    Args:
        _: Unused positional argument.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs, then wrap the
    # optimizer in the Horovod distributed optimizer.
    base_optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size())
    distributed_optimizer = hvd.DistributedOptimizer(base_optimizer)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = distributed_optimizer.minimize(loss, global_step=global_step)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is started with
    # random weights or restored from a checkpoint.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    # Horovod: adjust number of steps based on number of GPUs.
    stop_hook = tf.train.StopAtStepHook(last_step=20000 // hvd.size())
    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': global_step, 'loss': loss},
        every_n_iter=10,
    )
    hooks = [broadcast_hook, stop_hook, logging_hook]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    if hvd.rank() == 0:
        checkpoint_dir = './checkpoints'
    else:
        checkpoint_dir = None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as session:
        while not session.should_stop():
            # Run a training step synchronously.
            batch_images, batch_labels = mnist.train.next_batch(100)
            feed = {image: batch_images, label: batch_labels}
            session.run(train_op, feed_dict=feed)
示例8: main
# 需要导入模块: from horovod import tensorflow [as 别名]
# 或者: from horovod.tensorflow import BroadcastGlobalVariablesHook [as 别名]
def main(unused_argv):
    """Train and evaluate the MNIST CNN estimator with Horovod.

    Each process loads MNIST into a directory suffixed with its Horovod rank,
    trains ``cnn_model_fn`` for ``20000 // hvd.size()`` steps with a logging
    hook and a Horovod broadcast hook (root rank 0), then evaluates on the
    test split and prints the resulting metrics.

    Args:
        unused_argv: Unused positional argument.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    if hvd.rank() == 0:
        model_dir = './mnist_convnet_model'
    else:
        model_dir = None

    # Create the Estimator
    run_config = tf.estimator.RunConfig(session_config=session_config)
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=run_config,
    )

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"},
        every_n_iter=500,
    )

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is started with
    # random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True,
    )
    # Horovod: adjust number of steps based on number of GPUs.
    train_steps = 20000 // hvd.size()
    mnist_classifier.train(
        input_fn=train_input_fn,
        steps=train_steps,
        hooks=[logging_hook, bcast_hook],
    )

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        num_epochs=1,
        shuffle=False,
    )
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)