This page collects typical usage examples of the Python method modeling.layer_norm. If you are wondering what modeling.layer_norm does and how to call it, the curated code samples below should help. You can also browse further usage examples from the modeling module itself.
Eight code examples of modeling.layer_norm are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
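All of the examples on this page call modeling.layer_norm from a BERT-style modeling.py. For orientation, in the reference google-research/bert implementation this function is essentially a thin wrapper over TF 1.x layer normalization, roughly as sketched below (TF 1.x with tf.contrib assumed):

import tensorflow as tf  # TF 1.x

def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)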
Example 1: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    # Reshape back to [batch_size, max_predictions_per_seq] so callers get a
    # per-position loss.
    loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
    # TODO: dynamic gather from per_example_loss

  return loss
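Every example on this page first calls gather_indexes(input_tensor, positions), a helper defined elsewhere in run_pretraining.py. For reference, in the original BERT repository it looks roughly like the sketch below (import tensorflow as tf and import modeling assumed, TF 1.x): it flattens the batch and gathers the hidden vectors at the masked positions.

def gather_indexes(sequence_tensor, positions):
  """Gathers the vectors at the specific positions over a minibatch."""
  sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
  batch_size = sequence_shape[0]
  seq_length = sequence_shape[1]
  width = sequence_shape[2]

  # Turn [batch, num_positions] indices into indices into the flattened
  # [batch * seq_length, width] tensor, then gather.
  flat_offsets = tf.reshape(
      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
  flat_positions = tf.reshape(positions + flat_offsets, [-1])
  flat_sequence_tensor = tf.reshape(sequence_tensor,
                                    [batch_size * seq_length, width])
  output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
  return output_tensor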
Example 2: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Author: Nagakiran1 | Project: Extending-Google-BERT-as-Question-and-Answering-model-and-Chatbot | Lines: 45 | Source file: run_pretraining.py
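The numerator/denominator lines at the end implement a weighted mean over the masked positions: padded predictions (label_weights == 0.0) contribute nothing, and the 1e-5 term guards against division by zero when a batch happens to contain no real predictions. A tiny NumPy illustration with made-up numbers:

import numpy as np

per_example_loss = np.array([2.3, 1.7, 4.0])  # -log p(label) per masked position
label_weights = np.array([1.0, 1.0, 0.0])     # the last position is padding

numerator = np.sum(label_weights * per_example_loss)  # 4.0
denominator = np.sum(label_weights) + 1e-5            # 2.00001
loss = numerator / denominator                        # ~2.0; the padded slot is ignored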
Example 3: attention_fusion_layer
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def attention_fusion_layer(bert_config,
                           input_tensor, input_ids, input_mask,
                           source_input_tensor, source_input_ids, source_input_mask,
                           is_training=True, scope=None):
  '''
  Attention fusion layer for merging the source representation and the target
  representation.
  '''
  # universal shapes
  input_tensor_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_tensor_shape[0]
  seq_length = input_tensor_shape[1]
  hidden_size = input_tensor_shape[2]

  source_input_tensor_shape = modeling.get_shape_list(source_input_tensor, expected_rank=3)
  source_seq_length = source_input_tensor_shape[1]
  source_hidden_size = source_input_tensor_shape[2]

  # universal parameters
  UNIVERSAL_DROPOUT_RATE = 0.1
  if not is_training:
    UNIVERSAL_DROPOUT_RATE = 0  # we disable dropout when predicting
  UNIVERSAL_INIT_RANGE = bert_config.initializer_range
  NUM_ATTENTION_HEAD = bert_config.num_attention_heads

  # attention fusion module
  with tf.variable_scope(scope, default_name="attention_fusion"):
    ATTENTION_HEAD_SIZE = int(source_hidden_size / NUM_ATTENTION_HEAD)

    with tf.variable_scope("attention"):
      # Cross-attention: the target sequence attends to the source sequence.
      source_attended_repr = self_attention_layer(
          from_tensor=input_tensor,
          to_tensor=source_input_tensor,
          attention_mask=modeling.create_attention_mask_from_input_mask(
              input_ids, source_input_mask),
          num_attention_heads=NUM_ATTENTION_HEAD,
          size_per_head=ATTENTION_HEAD_SIZE,
          attention_probs_dropout_prob=UNIVERSAL_DROPOUT_RATE,
          initializer_range=UNIVERSAL_INIT_RANGE,
          do_return_2d_tensor=False,
          batch_size=batch_size,
          from_seq_length=seq_length,
          to_seq_length=source_seq_length,
          self_adaptive=True)

    with tf.variable_scope("transform"):
      # Project, apply dropout, then add & layer-norm with the source tensor.
      source_attended_repr = tf.layers.dense(
          source_attended_repr,
          source_hidden_size,
          kernel_initializer=modeling.create_initializer(UNIVERSAL_INIT_RANGE))
      source_attended_repr = modeling.dropout(source_attended_repr, UNIVERSAL_DROPOUT_RATE)
      source_attended_repr = modeling.layer_norm(source_attended_repr + source_input_tensor)

    final_output = tf.concat([input_tensor, source_attended_repr], axis=-1)

  return final_output
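attention_fusion_layer depends on a project-local self_attention_layer helper that is not shown on this page, so the snippet is not runnable on its own. Purely as an illustration (every tensor name and shape below is an assumption, not taken from the source), a call site could look like the sketch that follows; note that the residual add inside the layer implies the two sequences share the same length.

import tensorflow as tf  # TF 1.x

batch, seq_len, hidden = 8, 128, 768   # illustrative sizes only
target_seq_output = tf.placeholder(tf.float32, [batch, seq_len, hidden])
target_input_ids = tf.placeholder(tf.int32, [batch, seq_len])
target_input_mask = tf.placeholder(tf.int32, [batch, seq_len])
source_seq_output = tf.placeholder(tf.float32, [batch, seq_len, hidden])
source_input_ids = tf.placeholder(tf.int32, [batch, seq_len])
source_input_mask = tf.placeholder(tf.int32, [batch, seq_len])

fused_repr = attention_fusion_layer(
    bert_config,                        # a modeling.BertConfig loaded elsewhere
    input_tensor=target_seq_output,
    input_ids=target_input_ids,
    input_mask=target_input_mask,
    source_input_tensor=source_seq_output,
    source_input_ids=source_input_ids,
    source_input_mask=source_input_mask,
    is_training=True)
# Because of the final tf.concat, fused_repr has shape
# [batch, seq_len, hidden_size + source_hidden_size].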
Example 4: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.compat.v1.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.compat.v1.variable_scope("transform"):
      input_tensor = tf.compat.v1.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.compat.v1.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.compat.v1.zeros_initializer())
    # `bf` is a project-local helper module imported elsewhere in this file,
    # presumably for bfloat16 / mixed-precision matmuls and casts.
    logits = bf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = bf.i_cast(logits)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(input_tensor=log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(input_tensor=label_weights * per_example_loss)
    denominator = tf.reduce_sum(input_tensor=label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example 5: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.compat.v1.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.compat.v1.variable_scope("transform"):
      input_tensor = tf.compat.v1.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.compat.v1.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.compat.v1.zeros_initializer())
    # `bf` is a project-local helper module imported elsewhere in this file,
    # presumably casting the operands for bfloat16 / mixed-precision matmuls.
    input_tensor = bf.i_cast(input_tensor)
    output_weights = bf.i_cast(output_weights)
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(input_tensor=log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(input_tensor=label_weights * per_example_loss)
    denominator = tf.reduce_sum(input_tensor=label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example 6: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(
        log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example 7: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  # [batch_size*label_size, dim]
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token. The output dimension is taken from
    # output_weights itself, so this layer can score a reduced candidate set
    # rather than the full vocabulary.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[output_weights.shape[0]],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    # logits: (batch_size*label_size, vocab_size)
    log_probs = tf.nn.log_softmax(logits, -1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=output_weights.shape[0], dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(
        log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Example 8: get_masked_lm_output
# Required import: import modeling [as alias]
# Or: from modeling import layer_norm [as alias]
def get_masked_lm_output(bert_config, input_tensor, output_weights, project_weights,
                         positions, label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())

    # logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    # Factorized output projection:
    #   input_tensor: [-1, hidden_size], project_weights: [embedding_size, hidden_size]
    #   => input_project: [-1, embedding_size]
    input_project = tf.matmul(input_tensor, project_weights, transpose_b=True)
    #   input_project: [-1, embedding_size], output_weights: [vocab_size, embedding_size]
    #   => logits: [-1, vocab_size]
    logits = tf.matmul(input_project, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])
    one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
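Example 8 factors the output projection in two steps (reminiscent of ALBERT's embedding factorization): the hidden vector is first projected down to the embedding size via project_weights, and only then multiplied with the [vocab_size, embedding_size] output table. A quick NumPy shape check, with sizes that are purely illustrative:

import numpy as np

# Illustrative sizes only: hidden=768, embedding=128, vocab=30522.
hidden_states = np.zeros((8, 768), dtype=np.float32)       # [batch*masked, hidden]
project_weights = np.zeros((128, 768), dtype=np.float32)   # [embedding, hidden]
output_weights = np.zeros((30522, 128), dtype=np.float32)  # [vocab, embedding]

input_project = hidden_states @ project_weights.T   # -> (8, 128)
logits = input_project @ output_weights.T           # -> (8, 30522)
print(logits.shape)                                 # (8, 30522)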