This article collects typical usage examples of the torch.nn.MultiheadAttention method in Python. If you have been wondering what nn.MultiheadAttention does, how to call it, or what it looks like in real code, the hand-picked examples below may help. You can also explore further usage examples from torch.nn, the module this method belongs to.
The following 12 code examples of nn.MultiheadAttention are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
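Before the examples, here is a minimal self-contained sketch of how nn.MultiheadAttention itself is typically called (the dimensions below are arbitrary, illustrative values, not taken from any of the examples):

import torch
from torch import nn

# Illustrative dimensions only.
embed_dim, num_heads, seq_len, batch_size = 64, 8, 10, 2

attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=0.1)

# By default nn.MultiheadAttention expects (seq_len, batch, embed_dim) inputs.
query = torch.randn(seq_len, batch_size, embed_dim)
key = torch.randn(seq_len, batch_size, embed_dim)
value = torch.randn(seq_len, batch_size, embed_dim)

# Returns the attended output and the attention weights averaged over heads.
output, weights = attn(query, key, value)
print(output.shape)   # torch.Size([10, 2, 64])
print(weights.shape)  # torch.Size([2, 10, 10])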
Example 1: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self, embed_dim, num_heads, keep_prob_attention, keep_prob_residual, keep_prob_mlp, n_ctx=512,
             scale=False, use_builtin_mha=False):
    super().__init__()  # needed so nn.Module can register the submodules assigned below
    if use_builtin_mha:
        self.attention = nn.MultiheadAttention(embed_dim=embed_dim,
                                               num_heads=num_heads,
                                               dropout=keep_prob_attention)
    else:
        self.attention = ModifiedMultiHeadedAttention(num_state=embed_dim,
                                                      n_ctx=n_ctx,
                                                      num_heads=num_heads,
                                                      keep_prob_attention=keep_prob_attention,
                                                      keep_prob_residual=keep_prob_residual,
                                                      scale=scale)
    self.layer_norm1 = LayerNorm(embed_dim)
    self.mlp = MultiLayerPerceptron(4 * embed_dim, embed_dim, keep_prob_mlp)
    self.layer_norm2 = LayerNorm(embed_dim)
Example 2: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self, embed_size, hidden_size, output_size, n_layer=2, dropout=0.5, pretrained=None):
    super(Decoder, self).__init__()
    self.embed_size, self.hidden_size = embed_size, hidden_size
    self.output_size = output_size
    self.n_layer = n_layer
    self.embed = nn.Embedding(output_size, embed_size)
    self.rnn = nn.GRU(hidden_size + embed_size, hidden_size,
                      num_layers=n_layer, dropout=(0 if n_layer == 1 else dropout))
    self.out = nn.Linear(hidden_size, output_size)
    self.pos_emb = PositionEmbedding(embed_size, dropout=dropout)
    self.self_attention_context1 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm1 = nn.LayerNorm(embed_size)
    self.droput1 = nn.Dropout(p=dropout)
    self.self_attention_context2 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm2 = nn.LayerNorm(embed_size)
    self.droput2 = nn.Dropout(p=dropout)
    # self.self_attention_context3 = nn.MultiheadAttention(embed_size, 8)
    # self.layer_norm3 = nn.LayerNorm(embed_size)
    # self.droput3 = nn.Dropout(p=dropout)
    self.self_attention = nn.MultiheadAttention(hidden_size, 8)
    self.word_level_attn = Attention(embed_size)
    self.init_weight()
Example 3: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self, input_size, embed_size, output_size, utter_hidden,
             decoder_hidden, teach_force=0.5, pad=1, sos=1, dropout=0.5,
             utter_n_layer=1, pretrained=None):
    super(MReCoSa, self).__init__()
    self.encoder = Encoder(input_size, embed_size, utter_hidden, n_layers=utter_n_layer,
                           dropout=dropout, pretrained=pretrained)
    self.decoder = Decoder(embed_size, decoder_hidden, output_size, n_layer=utter_n_layer,
                           dropout=dropout, pretrained=pretrained)
    self.teach_force = teach_force
    self.pad, self.sos = pad, sos
    self.output_size = output_size
    self.pos_emb = PositionEmbedding(embed_size, dropout=dropout)
    self.self_attention_context1 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm1 = nn.LayerNorm(embed_size)
    self.self_attention_context2 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm2 = nn.LayerNorm(embed_size)
    self.self_attention_context3 = nn.MultiheadAttention(embed_size, 8)
    self.layer_norm3 = nn.LayerNorm(embed_size)
Example 4: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self, embed_dim, hidden_dim, num_embeddings, num_max_positions, num_heads, num_layers, dropout, causal):
    super().__init__()
    self.causal = causal
    self.tokens_embeddings = nn.Embedding(num_embeddings, embed_dim)
    self.position_embeddings = nn.Embedding(num_max_positions, embed_dim)
    self.dropout = nn.Dropout(dropout)
    self.attentions, self.feed_forwards = nn.ModuleList(), nn.ModuleList()
    self.layer_norms_1, self.layer_norms_2 = nn.ModuleList(), nn.ModuleList()
    for _ in range(num_layers):
        self.attentions.append(nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout))
        self.feed_forwards.append(nn.Sequential(nn.Linear(embed_dim, hidden_dim),
                                                nn.ReLU(),
                                                nn.Linear(hidden_dim, embed_dim)))
        self.layer_norms_1.append(nn.LayerNorm(embed_dim, eps=1e-12))
        self.layer_norms_2.append(nn.LayerNorm(embed_dim, eps=1e-12))
Example 5: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self, d_model, nhead, dropout=0.1):
    super(SAN, self).__init__()
    self.d_model = d_model
    self.nhead = nhead
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.dropout = nn.Dropout(p=dropout)
    self.norm = nn.LayerNorm(d_model)
Example 6: _init_weights
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def _init_weights(module):
    r"""Initialize weights as in BERT - N(0.0, 0.02), bias = 0."""
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=0.02)
    elif isinstance(module, nn.MultiheadAttention):
        module.in_proj_weight.data.normal_(mean=0.0, std=0.02)
        module.out_proj.weight.data.normal_(mean=0.0, std=0.02)
    elif isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=0.02)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
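A typical way to use an initializer like this is through nn.Module.apply, which calls it on every submodule. The sketch below uses a built-in encoder layer purely as a hypothetical stand-in for whatever model is being initialized:

from torch import nn

# Hypothetical usage sketch: apply(_init_weights) walks every submodule, so the
# nn.Linear and nn.MultiheadAttention children above are all visited.
model = nn.TransformerEncoderLayer(d_model=64, nhead=8)
model.apply(_init_weights)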
Example 7: dummy_attention
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def dummy_attention(key: torch.Tensor,
                    query: torch.Tensor,
                    value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""Dummy function used with the JIT-compile features of torch; it has the same
    inputs and outputs as nn.MultiheadAttention().__call__().

    Args:
        key (T): inputs to be passed through as the output
        query (T): dummy inputs
        value (T): dummy inputs

    Returns:
        Tuple[T, T]: (key, dummy outputs = torch.Tensor([]))
    """
    return key, torch.Tensor([])
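A sketch of how such a dummy might be swapped in for the real module behind a flag, so callers keep a single call signature (the flag name and dimensions here are assumptions for illustration, not part of the original code):

import torch
from torch import nn

# Hypothetical switch: use the real attention module or the JIT-friendly dummy.
use_attention = False
embed_dim, num_heads = 32, 4

attention = nn.MultiheadAttention(embed_dim, num_heads) if use_attention else dummy_attention

seq = torch.randn(5, 2, embed_dim)
outputs, weights = attention(seq, seq, seq)  # the dummy returns (seq, empty tensor)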
Example 8: show_attention
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def show_attention(attentions: np.ndarray,
                   xaxis: Union[list, str] = None,
                   yaxis: Union[list, str] = None,
                   savedir: str = None):
    r"""Show the attention weights of a MultiheadAttention layer as a matplotlib heatmap.

    Args:
        attentions (np.ndarray), shape = (sequence length, sequence length), dtype = np.float32:
            attention weights returned by nn.MultiheadAttention
        xaxis (str or list, optional): string or list of x-axis labels. Defaults to None.
        yaxis (str or list, optional): string or list of y-axis labels. Defaults to None.
        savedir (str, optional): path to save the attention heatmap as a PNG. Defaults to None.
    """
    # set up the figure with a colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions)
    fig.colorbar(cax)

    # set up the axes
    if xaxis is not None:
        if isinstance(xaxis, str):
            xaxis = [""] + xaxis.split(",")
        elif isinstance(xaxis, list):
            xaxis = [""] + xaxis
        ax.set_xticklabels(xaxis, rotation=90)
    if yaxis is not None:
        if isinstance(yaxis, str):
            yaxis = [""] + yaxis.split(",")
        elif isinstance(yaxis, list):
            yaxis = [""] + yaxis
        ax.set_yticklabels(yaxis)

    # show a label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    if savedir is None:
        plt.show()
    else:
        plt.savefig(savedir)
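A usage sketch (with made-up dimensions and labels) showing how the averaged attention weights returned by nn.MultiheadAttention can be fed to show_attention:

import torch
from torch import nn

# Hypothetical example: run a forward pass and plot the averaged attention weights.
embed_dim, num_heads, seq_len = 16, 4, 6
attn = nn.MultiheadAttention(embed_dim, num_heads)

x = torch.randn(seq_len, 1, embed_dim)         # (seq_len, batch, embed_dim)
_, weights = attn(x, x, x, need_weights=True)  # weights: (batch, seq_len, seq_len)

labels = [f"tok{i}" for i in range(seq_len)]
show_attention(weights[0].detach().numpy(), xaxis=labels, yaxis=labels)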
Example 9: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self, hidden_size, nhead=8, dropout=0.3):
    super(Multi_head_attention_trs, self).__init__()
    self.nhead = nhead
    self.hidden_size = hidden_size
    if hidden_size % nhead != 0:
        raise Exception(f'hidden_size must be divisible by nhead, but got {hidden_size}/{nhead}.')
    self.multi_head_attention = nn.MultiheadAttention(hidden_size, nhead)
    self.layer_norm = nn.LayerNorm(hidden_size)
    self.final_attn = Attention(hidden_size)
Example 10: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self,
             d_model: int,
             nhead: int,
             dim_feedforward: int = 2048,
             dropout: float = 0.1) -> None:
    """Initialize a TransformerEncoderLayer.

    Parameters
    ----------
    d_model : int
        The number of expected features in the input.
    nhead : int
        The number of heads in the multi-head attention models.
    dim_feedforward : int, optional
        The dimension of the feedforward network (default=2048).
    dropout : float, optional
        The dropout value (default=0.1).
    """
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.dropout = nn.Dropout(dropout)
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
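The layer's forward pass is not shown on this page; a sketch of the standard post-norm computation that these submodules imply might look as follows (this is an assumption for illustration, not the library's actual implementation):

import torch

def encoder_layer_forward(layer, src: torch.Tensor) -> torch.Tensor:
    # Assumed post-norm ordering: self-attention sublayer, then feed-forward sublayer.
    attn_out, _ = layer.self_attn(src, src, src)
    src = layer.norm1(src + layer.dropout1(attn_out))
    ff_out = layer.linear2(layer.dropout(torch.relu(layer.linear1(src))))
    return layer.norm2(src + layer.dropout2(ff_out))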
Example 11: __init__
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def __init__(self,
             d_model: int,
             nhead: int,
             dim_feedforward: int = 2048,
             dropout: float = 0.1,
             sru_dropout: Optional[float] = None,
             bidirectional: bool = False,
             **kwargs: Dict[str, Any]) -> None:
    """Initialize a TransformerSRUEncoderLayer.

    Parameters
    ----------
    d_model : int
        The number of expected features in the input.
    nhead : int
        The number of heads in the multi-head attention models.
    dim_feedforward : int, optional
        The dimension of the feedforward network (default=2048).
    dropout : float, optional
        The dropout value (default=0.1).
    sru_dropout : float, optional
        Dropout for the SRU cell. If not given, uses the same
        dropout value as the rest of the transformer.
    bidirectional : bool
        Whether the SRU module should be bidirectional.
        Default ``False``.

    Extra keyword arguments are passed to the SRUCell.
    """
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.sru = SRUCell(d_model,
                       dim_feedforward,
                       dropout,
                       sru_dropout or dropout,
                       bidirectional=bidirectional,
                       has_skip_term=False, **kwargs)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
Example 12: load_state_dict
# Required import: from torch import nn [as alias]
# Or: from torch.nn import MultiheadAttention [as alias]
def load_state_dict(self, state_dict):
    """Loads the module from a previously saved state.

    Supports loading from both DPMultiheadAttention
    and nn.MultiheadAttention modules.
    """
    if "in_proj_weight" in state_dict:
        qweight, kweight, vweight = state_dict["in_proj_weight"].chunk(3, dim=0)
        state_dict["qlinear.weight"] = qweight
        state_dict["klinear.weight"] = kweight
        state_dict["vlinear.weight"] = vweight
        del state_dict["in_proj_weight"]

    if "in_proj_bias" in state_dict:
        qbias, kbias, vbias = state_dict["in_proj_bias"].chunk(3, dim=0)
        state_dict["qlinear.bias"] = qbias
        state_dict["klinear.bias"] = kbias
        state_dict["vlinear.bias"] = vbias
        del state_dict["in_proj_bias"]

    if "bias_k" in state_dict:
        state_dict["seq_bias_k.bias"] = state_dict["bias_k"].squeeze()
        del state_dict["bias_k"]

    if "bias_v" in state_dict:
        state_dict["seq_bias_v.bias"] = state_dict["bias_v"].squeeze()
        del state_dict["bias_v"]

    if "q_proj_weight" in state_dict:
        state_dict["qlinear.weight"] = state_dict["q_proj_weight"]
        del state_dict["q_proj_weight"]

    if "k_proj_weight" in state_dict:
        state_dict["klinear.weight"] = state_dict["k_proj_weight"]
        del state_dict["k_proj_weight"]

    if "v_proj_weight" in state_dict:
        state_dict["vlinear.weight"] = state_dict["v_proj_weight"]
        del state_dict["v_proj_weight"]

    super(DPMultiheadAttention, self).load_state_dict(state_dict)
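To illustrate the remapping above: nn.MultiheadAttention packs the query/key/value projections into a single in_proj_weight of shape (3 * embed_dim, embed_dim), which chunk(3, dim=0) splits back into the three per-projection matrices. The dimensions below are arbitrary:

import torch
from torch import nn

# Illustrative check of the state_dict layout that load_state_dict relies on.
embed_dim, num_heads = 12, 3
mha = nn.MultiheadAttention(embed_dim, num_heads)
state = mha.state_dict()

qweight, kweight, vweight = state["in_proj_weight"].chunk(3, dim=0)
print(state["in_proj_weight"].shape)  # torch.Size([36, 12])
print(qweight.shape)                  # torch.Size([12, 12])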