This article collects typical usage examples of the Python method tensor2tensor.layers.common_layers.dense_relu_dense. If you are wondering what common_layers.dense_relu_dense does, how to call it, or what real uses of it look like, the selected code examples below should help. You can also explore further usage examples from the module that contains it, tensor2tensor.layers.common_layers.
The following shows 6 code examples of common_layers.dense_relu_dense, sorted by popularity by default.
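Before the examples, here is a minimal, hedged sketch of a direct call to dense_relu_dense on a dummy tensor. The shapes, filter size, and dropout value are illustrative placeholders, and a TF1-style graph environment (as used throughout tensor2tensor) is assumed.

import tensorflow as tf
from tensor2tensor.layers import common_layers

# Dummy activations: [batch, length, hidden_size].
x = tf.zeros([8, 50, 512])

# Project up to a wider "filter" dimension with ReLU, then back down.
# Positional arguments: inputs, filter_size, output_size.
y = common_layers.dense_relu_dense(
    x,
    2048,          # filter_size: inner (expanded) dimension
    512,           # output_size: projected back to hidden_size
    dropout=0.1)   # dropout applied after the ReLU, between the two dense layers

# y keeps the leading dimensions of x: [8, 50, 512].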
Example 1: body
# Required import: from tensor2tensor.layers import common_layers [as alias]
# Or: from tensor2tensor.layers.common_layers import dense_relu_dense [as alias]
def body(self, features):
  assert self._hparams.block_size > 0
  assert not common_layers.is_xla_compiled()
  assert "targets_segmentation" not in features

  decoder_output = super(TransformerBlockParallel, self).body(features)
  assert not isinstance(decoder_output, tuple)
  assert len(decoder_output.shape) == 4

  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

  with tf.variable_scope("block_size_%d" % self._hparams.block_size):
    block_output = common_layers.dense_relu_dense(
        decoder_output,
        self._hparams.block_size * self._hparams.filter_size,
        self._hparams.block_size * self._hparams.hidden_size,
        dropout=self._hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)

  batch_size, length = common_layers.shape_list(decoder_output)[:2]
  block_output = tf.reshape(block_output, [
      batch_size,
      length,
      self._hparams.block_size,
      self._hparams.hidden_size
  ])

  block_output = common_layers.layer_postprocess(
      decoder_output, block_output, self._hparams)

  return block_output
Example 2: transformer_base_v1
# Required import: from tensor2tensor.layers import common_layers [as alias]
# Or: from tensor2tensor.layers.common_layers import dense_relu_dense [as alias]
def transformer_base_v1():
  """Set of hyperparameters."""
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"
  hparams.hidden_size = 512
  hparams.batch_size = 4096
  hparams.max_length = 256
  hparams.clip_grad_norm = 0.  # i.e. no gradient clipping
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.learning_rate_schedule = "legacy"
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.learning_rate_warmup_steps = 4000
  hparams.initializer_gain = 1.0
  hparams.num_hidden_layers = 6
  hparams.initializer = "uniform_unit_scaling"
  hparams.weight_decay = 0.0
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.98
  hparams.num_sampled_classes = 0
  hparams.label_smoothing = 0.1
  hparams.shared_embedding_and_softmax_weights = True
  hparams.symbol_modality_num_shards = 16
  # Add new ones like this.
  hparams.add_hparam("filter_size", 2048)
  # Layer-related flags. If zero, these fall back on hparams.num_hidden_layers.
  hparams.add_hparam("num_encoder_layers", 0)
  hparams.add_hparam("num_decoder_layers", 0)
  # Attention-related flags.
  hparams.add_hparam("num_heads", 8)
  hparams.add_hparam("attention_key_channels", 0)
  hparams.add_hparam("attention_value_channels", 0)
  hparams.add_hparam("ffn_layer", "dense_relu_dense")
  hparams.add_hparam("parameter_attention_key_channels", 0)
  hparams.add_hparam("parameter_attention_value_channels", 0)
  # All hyperparameters ending in "dropout" are automatically set to 0.0
  # when not in training mode.
  hparams.add_hparam("attention_dropout", 0.0)
  hparams.add_hparam("attention_dropout_broadcast_dims", "")
  hparams.add_hparam("relu_dropout", 0.0)
  hparams.add_hparam("relu_dropout_broadcast_dims", "")
  hparams.add_hparam("pos", "timing")  # timing, none
  hparams.add_hparam("nbr_decoder_problems", 1)
  hparams.add_hparam("proximity_bias", False)
  hparams.add_hparam("causal_decoder_self_attention", True)
  hparams.add_hparam("use_pad_remover", True)
  hparams.add_hparam("self_attention_type", "dot_product")
  hparams.add_hparam("max_relative_position", 0)
  hparams.add_hparam("conv_first_kernel", 3)
  hparams.add_hparam("attention_variables_3d", False)
  hparams.add_hparam("use_target_space_embedding", True)
  # These parameters are only used when ffn_layer=="local_moe_tpu"
  hparams.add_hparam("moe_overhead_train", 1.0)
  hparams.add_hparam("moe_overhead_eval", 2.0)
  hparams.moe_num_experts = 16
  hparams.moe_loss_coef = 1e-3
  return hparams
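Hyperparameter sets like the one above are usually extended by a small function that starts from transformer_base_v1() and overrides individual fields. The sketch below is a hypothetical variant (not part of this listing) that widens the feed-forward layer while keeping ffn_layer at "dense_relu_dense".

def transformer_wide_ffn():
  """Hypothetical variant of transformer_base_v1 with a wider FFN."""
  hparams = transformer_base_v1()
  hparams.filter_size = 4096   # inner width passed to dense_relu_dense
  hparams.relu_dropout = 0.1   # dropout applied inside dense_relu_dense
  # ffn_layer stays "dense_relu_dense", so the transformer FFN keeps using it.
  return hparams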
Example 3: ffn_layer
# Required import: from tensor2tensor.layers import common_layers [as alias]
# Or: from tensor2tensor.layers.common_layers import dense_relu_dense [as alias]
def ffn_layer(x, hparams, losses=None):
  """ffn layer transformer."""
  with tf.variable_scope("ffn"):
    if hparams.ffn_layer == "none":
      return x
    if hparams.ffn_layer == "conv_hidden_relu":
      y = common_layers.dense_relu_dense(
          x,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout)
    elif hparams.ffn_layer == "normed_conv_hidden_relu":
      y = common_layers.normed_conv_hidden_relu(
          x,
          hparams.norm_type,
          hparams.layer_norm_epsilon,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout,
          norm_name="convnorm")
    elif hparams.ffn_layer == "self_attention_ffn":
      x_shape = tf.shape(x)
      x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
      y = common_attention.ffn_self_attention_layer(
          x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
          hparams.attention_dropout, hparams.share_kv)
      y = tf.reshape(y, x_shape)
    elif hparams.ffn_layer == "local_moe_tpu":
      overhead = (hparams.moe_overhead_train
                  if hparams.mode == tf.estimator.ModeKeys.TRAIN
                  else hparams.moe_overhead_eval)
      x, x_shape, is_4d = maybe_reshape_4d_to_3d(x)
      y, loss = expert_utils.local_moe_tpu(
          x, hparams.filter_size // 2,
          hparams.hidden_size,
          hparams.moe_num_experts, overhead=overhead,
          loss_coef=hparams.moe_loss_coef)
      if is_4d:
        y = tf.reshape(y, x_shape)
      if losses is None:
        raise ValueError(
            "transformer_ffn_layer with type local_moe_tpu must pass in "
            "a losses list")
      losses.append(loss)
    else:
      assert hparams.ffn_layer == "glu_ffn"
      y = common_layers.gated_linear_unit_layer(x)
    return y
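As a hedged usage sketch (assuming the transformer_base_v1 hparams from Example 2 and the ffn_layer function above are both in scope): setting ffn_layer to "conv_hidden_relu" makes the first branch fire, which routes through common_layers.dense_relu_dense.

# Hypothetical call sketch; shapes are illustrative.
hparams = transformer_base_v1()
hparams.ffn_layer = "conv_hidden_relu"   # selects the dense_relu_dense branch
hparams.relu_dropout = 0.1

x = tf.zeros([8, 50, hparams.hidden_size])  # [batch, length, hidden_size]
y = ffn_layer(x, hparams)                   # same shape as x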
Example 4: image_encoder
# Required import: from tensor2tensor.layers import common_layers [as alias]
# Or: from tensor2tensor.layers.common_layers import dense_relu_dense [as alias]
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
  """A stack of self attention layers."""
  x = image_feat
  with tf.variable_scope(name):
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = vqa_layers.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              None,
              hparams.attention_key_channels or hparams.image_hidden_size,
              hparams.attention_value_channels or hparams.image_hidden_size,
              hparams.image_hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=None,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=None,
              max_length=None,
              vars_3d=False,
              scale_dotproduct=hparams.scale_dotproduct)
          utils.collect_named_outputs("norms", "image_feat_self_attention",
                                      tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_zero_add",
              tf.norm(x, axis=-1))
        with tf.variable_scope("ffn"):
          y = common_layers.dense_relu_dense(
              common_layers.layer_preprocess(x, hparams),
              hparams.image_filter_size,
              hparams.image_hidden_size,
              dropout=hparams.relu_dropout,
              dropout_broadcast_dims=None)
          utils.collect_named_outputs("norms", "image_feat_ffn",
                                      tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs("norms", "image_feat_ffn_zero_add",
                                      tf.norm(x, axis=-1))
  # if normalization is done in layer_preprocess, then it should also be done
  # on the output, since the output can grow very large, being the sum of
  # a whole stack of unnormalized layer outputs.
  return common_layers.layer_preprocess(x, hparams)
Example 5: image_encoder
# Required import: from tensor2tensor.layers import common_layers [as alias]
# Or: from tensor2tensor.layers.common_layers import dense_relu_dense [as alias]
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
  """A stack of self attention layers."""
  x = image_feat
  image_hidden_size = hparams.image_hidden_size or hparams.hidden_size
  image_filter_size = hparams.image_filter_size or hparams.filter_size
  with tf.variable_scope(name):
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = vqa_layers.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              None,
              hparams.attention_key_channels or image_hidden_size,
              hparams.attention_value_channels or image_hidden_size,
              image_hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.image_self_attention_type,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              scale_dotproduct=hparams.scale_dotproduct,
          )
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_%d" % (layer),
              tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_postprocess_%d" % (layer),
              tf.norm(x, axis=-1))
        with tf.variable_scope("ffn"):
          y = common_layers.dense_relu_dense(
              common_layers.layer_preprocess(x, hparams),
              image_filter_size,
              image_hidden_size,
              dropout=hparams.relu_dropout,
          )
          utils.collect_named_outputs(
              "norms", "image_feat_ffn_%d" % (layer), tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_ffn_postprocess_%d" % (layer),
              tf.norm(x, axis=-1))
  # if normalization is done in layer_preprocess, then it should also be done
  # on the output, since the output can grow very large, being the sum of
  # a whole stack of unnormalized layer outputs.
  return common_layers.layer_preprocess(x, hparams)
Example 6: question_encoder
# Required import: from tensor2tensor.layers import common_layers [as alias]
# Or: from tensor2tensor.layers.common_layers import dense_relu_dense [as alias]
def question_encoder(question,
                     question_self_attention_bias,
                     hparams,
                     name="question_encoder",
                     save_weights_to=None,
                     make_image_summary=True):
  """A stack of self attention layers."""
  x = question
  with tf.variable_scope(name):
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = vqa_layers.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              question_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.question_self_attention_type,
              block_length=hparams.block_length,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              scale_dotproduct=hparams.scale_dotproduct,
          )
          utils.collect_named_outputs(
              "norms", "query_self_attention_%d" % (layer),
              tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "query_self_attention_postprocess_%d" % (layer),
              tf.norm(x, axis=-1))
        with tf.variable_scope("ffn"):
          y = common_layers.dense_relu_dense(
              common_layers.layer_preprocess(x, hparams),
              hparams.filter_size,
              hparams.hidden_size,
              dropout=hparams.relu_dropout,
          )
          utils.collect_named_outputs(
              "norms", "query_ffn_%d" % (layer), tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "query_ffn_postprocess_%d" % (layer),
              tf.norm(x, axis=-1))
  # if normalization is done in layer_preprocess, then it should also be done
  # on the output, since the output can grow very large, being the sum of
  # a whole stack of unnormalized layer outputs.
  return common_layers.layer_preprocess(x, hparams)