This page collects typical usage examples of the torch.bmm function in Python. If you are wondering what exactly torch.bmm does, how to call it, or want to see it used in real code, the curated examples below should help.
Fifteen code examples of the bmm function are shown below, sorted by popularity by default.
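Before the examples, here is a minimal, self-contained sketch (not taken from the examples below) of what torch.bmm itself computes: a batched matrix product over two 3-D tensors of shapes (b, n, m) and (b, m, p), returning a tensor of shape (b, n, p).

import torch

a = torch.randn(4, 3, 5)   # batch of 4 matrices, each 3 x 5
b = torch.randn(4, 5, 2)   # batch of 4 matrices, each 5 x 2
out = torch.bmm(a, b)      # batched matrix product, no broadcasting
print(out.shape)           # torch.Size([4, 3, 2])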
Example 1: forward
def forward(self, inputs):
    x, u = inputs
    x = self.bn0(x)
    x = F.tanh(self.linear1(x))
    x = F.tanh(self.linear2(x))

    V = self.V(x)
    mu = F.tanh(self.mu(x))

    Q = None
    if u is not None:
        num_outputs = mu.size(1)
        L = self.L(x).view(-1, num_outputs, num_outputs)
        # lower-triangular L with an exponentiated diagonal
        L = L * self.tril_mask.expand_as(L) + torch.exp(L) * self.diag_mask.expand_as(L)
        # P = L L^T is positive-definite by construction
        P = torch.bmm(L, L.transpose(2, 1))

        u_mu = (u - mu).unsqueeze(2)
        # quadratic advantage term A(x, u) = -1/2 (u - mu)^T P (u - mu)
        A = -0.5 * torch.bmm(torch.bmm(u_mu.transpose(2, 1), P), u_mu)[:, :, 0]

        Q = A + V

    return mu, Q, V
Example 2: predict
def predict(self, x_de, x_en):
    bs = x_de.size(0)
    emb_de = self.embedding_de(x_de)  # bs,n_de,word_dim
    emb_en = self.embedding_en(x_en)  # bs,n_en,word_dim
    h_enc = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    c_enc = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    h_dec = Variable(torch.zeros(self.n_layers, bs, self.hidden_dim).cuda())
    c_dec = Variable(torch.zeros(self.n_layers, bs, self.hidden_dim).cuda())
    enc_h, _ = self.encoder(emb_de, (h_enc, c_enc))  # (bs,n_de,hiddensz*2)
    dec_h, _ = self.decoder(emb_en, (h_dec, c_dec))  # (bs,n_en,hiddensz)
    # enc_h is bs,n_de,hiddensz*n_directions. h and c are both n_layers*n_directions,bs,hiddensz
    if self.directions == 2:
        scores = torch.bmm(self.dim_reduce(enc_h), dec_h.transpose(1, 2))
    else:
        scores = torch.bmm(enc_h, dec_h.transpose(1, 2))
    # (bs,n_de,hiddensz) * (bs,hiddensz,n_en) = (bs,n_de,n_en)
    scores[(x_de == pad_token).unsqueeze(2).expand(scores.size())] = -math.inf  # binary mask
    attn_dist = F.softmax(scores, dim=1)  # bs,n_de,n_en
    context = torch.bmm(attn_dist.transpose(2, 1), enc_h)
    # (bs,n_en,n_de) * (bs,n_de,hiddensz*ndirections) = (bs,n_en,hiddensz*ndirections)
    pred = self.vocab_layer(torch.cat([dec_h, context], 2))  # bs,n_en,len(EN.vocab)
    pred = pred[:, :-1, :]  # alignment
    _, tokens = pred.max(2)  # bs,n_en-1
    sauce = Variable(torch.cuda.LongTensor([[sos_token]]*bs))  # bs
    return torch.cat([sauce, tokens], 1), attn_dist
Example 3: forward
def forward(self, feat, right, wrong, batch_wrong, fake=None, fake_diff_mask=None):
    num_wrong = wrong.size(1)
    batch_size = feat.size(0)

    feat = feat.view(-1, self.ninp, 1)
    right_dis = torch.bmm(right.view(-1, 1, self.ninp), feat)
    wrong_dis = torch.bmm(wrong, feat)
    batch_wrong_dis = torch.bmm(batch_wrong, feat)

    wrong_score = torch.sum(torch.exp(wrong_dis - right_dis.expand_as(wrong_dis)), 1) \
        + torch.sum(torch.exp(batch_wrong_dis - right_dis.expand_as(batch_wrong_dis)), 1)

    loss_dis = torch.sum(torch.log(wrong_score + 1))
    loss_norm = right.norm() + feat.norm() + wrong.norm() + batch_wrong.norm()

    if fake is not None:
        fake_dis = torch.bmm(fake.view(-1, 1, self.ninp), feat)
        fake_score = torch.masked_select(torch.exp(fake_dis - right_dis), fake_diff_mask)

        margin_score = F.relu(torch.log(fake_score + 1) - self.margin)
        loss_fake = torch.sum(margin_score)
        loss_dis += loss_fake
        loss_norm += fake.norm()

    loss = (loss_dis + 0.1 * loss_norm) / batch_size

    if fake is not None:
        return loss, loss_fake.data[0] / batch_size
    else:
        return loss
Example 4: forward
def forward(self, vocab):
    with torch.no_grad():
        batch_shape = vocab['sentence'].shape
        s_embedding = self.embedding(vocab['sentence'].cuda())
        a_embedding = self.embedding(vocab['aspect'].cuda())
        packed_s = pack_padded_sequence(s_embedding, vocab['sent_len'], batch_first=True)

    out_s, (h_s, c1) = self.lstm_s(packed_s)  # packed output
    out_a, (h_a, c2) = self.lstm_a(a_embedding)

    with torch.no_grad():
        unpacked_out_s, _ = pad_packed_sequence(out_s, batch_first=True)

    # Pair-wise interaction matrix
    I_matrix = torch.bmm(unpacked_out_s, out_a.permute(0, 2, 1))

    # Column-wise softmax
    a2s_attn = F.softmax(I_matrix, dim=1)

    # Row-wise softmax => Column-wise average => aspect attention
    s2a_attn = F.softmax(I_matrix, dim=2)
    a_attn = torch.mean(s2a_attn, dim=1)

    # Final sentence attn => weighted sum of each individual a2s_attn
    s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))

    final_rep = torch.bmm(unpacked_out_s.permute(0, 2, 1), s_attn).squeeze(-1)
    pred = self.fc(final_rep)
    return pred
Example 5: forward
def forward(self, ht, hs, mask, weighted_ctx=True):
    '''
    ht: batch x ht_dim
    hs: (seq_len x batch x hs_dim, seq_len x batch x ht_dim)
    mask: seq_len x batch
    '''
    hs, hs_ = hs
    # seq_len, batch, _ = hs.size()
    hs = hs.transpose(0, 1)
    hs_ = hs_.transpose(0, 1)
    # hs: batch x seq_len x hs_dim
    # hs_: batch x seq_len x ht_dim
    # hs_ = self.hs2ht(hs)

    # Alignment/Attention Function
    # batch x ht_dim x 1
    ht = ht.unsqueeze(2)
    # batch x seq_len
    score = torch.bmm(hs_, ht).squeeze(2)
    # attn = F.softmax(score, dim=-1)
    attn = F.softmax(score, dim=-1) * mask.transpose(0, 1) + EPSILON
    attn = attn / attn.sum(-1, keepdim=True)

    # Compute weighted sum of hs by attention.
    # batch x 1 x seq_len
    attn = attn.unsqueeze(1)
    if weighted_ctx:
        # batch x hs_dim
        weight_hs = torch.bmm(attn, hs).squeeze(1)
    else:
        weight_hs = None

    return weight_hs, attn
Example 6: forward_dot
def forward_dot(self, hid, ctx, ctx_mask):
    r"""Computes Luong-style dot attention probabilities between
    decoder's hidden state and source annotations.

    Arguments:
        hid(Variable): A set of decoder hidden states of shape `T*B*H`
            where `T` == 1, `B` is batch dim and `H` is hidden state dim.
        ctx(Variable): A set of annotations of shape `S*B*C` where `S`
            is the source timestep dim, `B` is batch dim and `C`
            is annotation dim.
        ctx_mask(FloatTensor): A binary mask of shape `S*B` with zeroes
            in the padded timesteps.

    Returns:
        scores(Variable): A variable of shape `S*B` containing normalized
            attention scores for each position and sample.
        z_t(Variable): A variable of shape `B*H` containing the final
            attended context vector for this target decoding timestep.
    """
    # Apply transformations first to make last dims both C and then
    # shuffle dims to prepare for batch mat-mult
    ctx_ = self.ctx2ctx(ctx).permute(1, 2, 0)  # S*B*C -> S*B*C -> B*C*S
    hid_ = self.hid2ctx(hid).permute(1, 0, 2)  # T*B*H -> T*B*C -> B*T*C

    # 'dot' scores of B*T*S
    scores = F.softmax(torch.bmm(hid_, ctx_), dim=-1)

    # Transform back to hidden_dim for further decoders
    # B*T*S x B*S*C -> B*T*C -> B*T*H
    z_t = self.ctx2hid(torch.bmm(scores, ctx.transpose(0, 1)))

    return scores.transpose(0, 1), z_t.transpose(0, 1)
Example 7: forward
def forward(self, output, context):
    batch_size = output.size(0)
    hidden_size = output.size(2)
    input_size = context.size(1)
    # (batch, out_len, dim) * (batch, in_len, dim) -> (batch, out_len, in_len)
    attn = torch.bmm(output, context.transpose(1, 2))
    mask = torch.eq(attn, 0).data.byte()
    attn.data.masked_fill_(mask, -float('inf'))
    attn = F.softmax(attn.view(-1, input_size), dim=1).view(batch_size, -1, input_size)

    # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
    mix = torch.bmm(attn, context)

    # concat -> (batch, out_len, 2*dim)
    combined = torch.cat((mix, output), dim=2)
    # output -> (batch, out_len, dim)
    output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size)

    if not output.is_contiguous():
        output = output.contiguous()

    return output, attn
Example 8: backward
def backward(ctx, grad_output):
    batch1, batch2 = ctx.saved_variables
    grad_add_matrix = grad_batch1 = grad_batch2 = None

    if ctx.needs_input_grad[0]:
        grad_add_matrix = maybe_unexpand(grad_output, ctx.add_matrix_size)
        if ctx.alpha != 1:
            grad_add_matrix = grad_add_matrix.mul(ctx.alpha)

    if any(ctx.needs_input_grad[1:]):
        batch_grad_output = (grad_output
                             .unsqueeze(0)
                             .expand(batch1.size(0), batch1.size(1), batch2.size(2)))

    if ctx.needs_input_grad[1]:
        grad_batch1 = torch.bmm(batch_grad_output, batch2.transpose(1, 2))
        if ctx.beta != 1:
            grad_batch1 *= ctx.beta

    if ctx.needs_input_grad[2]:
        grad_batch2 = torch.bmm(batch1.transpose(1, 2), batch_grad_output)
        if ctx.beta != 1:
            grad_batch2 *= ctx.beta

    return grad_add_matrix, grad_batch1, grad_batch2, None, None, None
Example 9: forward
def forward(self, q, k, v):
    b_q, t_q, dim_q = list(q.size())
    b_k, t_k, dim_k = list(k.size())
    b_v, t_v, dim_v = list(v.size())
    assert(b_q == b_k and b_k == b_v)  # batch size should be equal
    assert(dim_q == dim_k)  # dims should be equal
    assert(t_k == t_v)  # times should be equal
    b = b_q
    qk = torch.bmm(q, k.transpose(1, 2))  # b x t_q x t_k
    qk.div_(dim_k ** 0.5)
    mask = None
    if self.causal and t_q > 1:
        causal_mask = q.data.new(t_q, t_k).byte().fill_(1).triu_(1)
        mask = Variable(causal_mask.unsqueeze(0).expand(b, t_q, t_k),
                        requires_grad=False)
    if self.mask_k is not None:
        mask_k = self.mask_k.unsqueeze(1).expand(b, t_q, t_k)
        mask = mask_k if mask is None else mask | mask_k
    if self.mask_q is not None:
        mask_q = self.mask_q.unsqueeze(2).expand(b, t_q, t_k)
        mask = mask_q if mask is None else mask | mask_q
    if mask is not None:
        qk.masked_fill_(mask, -1e9)

    sm_qk = F.softmax(qk, dim=2)
    sm_qk = self.dropout(sm_qk)
    return torch.bmm(sm_qk, v), sm_qk  # b x t_q x dim_v
Example 10: lstsq
def lstsq(b, y, alpha=0.01):
    """
    Batched linear least-squares for pytorch with optional L2 (ridge) regularization.

    Parameters
    ----------
    b : shape(L, M, N)
    y : shape(L, M)

    Returns
    -------
    tuple of (coefficients, model, residuals)
    """
    bT = b.transpose(-1, -2)
    AA = torch.bmm(bT, b)
    if alpha != 0:
        # Tikhonov/ridge regularization: add alpha to the diagonal of b^T b
        diag = torch.diagonal(AA, dim1=1, dim2=2)
        diag += alpha
    RHS = torch.bmm(bT, y[:, :, None])
    # torch.gesv solves AA @ X = RHS (older PyTorch API; newer releases use torch.linalg.solve)
    X, LU = torch.gesv(RHS, AA)
    fit = torch.bmm(b, X)[..., 0]
    res = y - fit
    return X[..., 0], fit, res
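A hedged usage sketch for the lstsq helper above; the tensor shapes follow its docstring, and it assumes an older PyTorch release in which torch.gesv is still available.

import torch

L_batches, M, N = 8, 100, 3              # 8 independent problems, 100 samples, 3 features
b = torch.randn(L_batches, M, N)         # batched design matrices
true_coef = torch.randn(L_batches, N, 1)
y = torch.bmm(b, true_coef)[..., 0] + 0.01 * torch.randn(L_batches, M)

coef, fit, res = lstsq(b, y, alpha=0.01)
print(coef.shape, fit.shape, res.shape)  # (8, 3), (8, 100), (8, 100)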
Example 11: bnorm
def bnorm(x, U):
    # U acts as a batched averaging operator: mx is the U-weighted mean of x
    mx = torch.bmm(U, x)
    subs = x - mx
    subs2 = subs * subs
    # U-weighted second moment of the residuals (variance estimate)
    vx = torch.bmm(U, subs2)
    out = subs / (vx.clamp(min=1e-10).sqrt() + 1e-5)
    return out
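A small usage sketch for bnorm, assuming (as the code suggests) that U is a batched averaging operator; the uniform row-stochastic U below is purely illustrative.

import torch

B, N, F_dim = 2, 10, 4
x = torch.randn(B, N, F_dim)
U = torch.full((B, N, N), 1.0 / N)   # each row averages over all N entries
out = bnorm(x, U)                    # x centred and scaled by its U-weighted mean/variance
print(out.shape)                     # torch.Size([2, 10, 4])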
Example 12: forward
def forward(self, q, k, v, attn_mask=None):
    d_k, d_v = self.d_k, self.d_v
    n_head = self.n_head
    residual = q

    mb_size, len_q, q_hidden_size = q.size()
    mb_size, len_k, k_hidden_size = k.size()
    mb_size, len_v, v_hidden_size = v.size()

    # treat as a (n_head) size batch
    q_s = q.repeat(n_head, 1, 1).view(n_head, -1, q_hidden_size)  # n_head x (mb_size*len_q) x d_model
    k_s = k.repeat(n_head, 1, 1).view(n_head, -1, k_hidden_size)  # n_head x (mb_size*len_k) x d_model
    v_s = v.repeat(n_head, 1, 1).view(n_head, -1, v_hidden_size)  # n_head x (mb_size*len_v) x d_model

    # treat the result as a (n_head * mb_size) size batch
    q_s = torch.bmm(q_s, self.w_qs).view(-1, len_q, d_k)  # (n_head*mb_size) x len_q x d_k
    k_s = torch.bmm(k_s, self.w_ks).view(-1, len_k, d_k)  # (n_head*mb_size) x len_k x d_k
    v_s = torch.bmm(v_s, self.w_vs).view(-1, len_v, d_v)  # (n_head*mb_size) x len_v x d_v

    # perform attention, result size = (n_head * mb_size) x len_q x d_v
    outputs, attns = self.attention.forward(q_s, k_s, v_s, attn_mask=attn_mask.repeat(n_head, 1, 1))

    # back to original mb_size batch, result size = mb_size x len_q x (n_head*d_v)
    outputs = torch.cat(torch.split(outputs, mb_size, dim=0), dim=-1)

    # project back to residual size
    outputs = self.proj.forward(outputs)
    outputs = self.dropout(outputs)

    return self.layer_norm(outputs + residual), attns
Example 13: predict2
def predict2(self, x_de, beamsz, gen_len):
    emb_de = self.embedding_de(x_de)  # "batch size",n_de,word_dim, but "batch size" is 1 in this case!
    h0 = Variable(torch.zeros(self.n_layers*self.directions, 1, self.hidden_dim).cuda())
    c0 = Variable(torch.zeros(self.n_layers*self.directions, 1, self.hidden_dim).cuda())
    enc_h, _ = self.encoder(emb_de, (h0, c0))
    # since enc batch size=1, enc_h is 1,n_de,hiddensz*n_directions
    if self.directions == 2:
        enc_h = self.dim_reduce(enc_h)  # 1,n_de,hiddensz
    masterheap = CandList(self.n_layers, self.hidden_dim, enc_h.size(1), beamsz)
    # in the following loop, the beam has length 1 on the first iteration and the true beamsz (100) afterward
    for i in range(gen_len):
        prev = masterheap.get_prev()  # beamsz
        emb_t = self.embedding_en(prev)  # embed the last thing we generated. beamsz,word_dim
        enc_h_expand = enc_h.expand(prev.size(0), -1, -1)  # beamsz,n_de,hiddensz
        h, c = masterheap.get_hiddens()  # (n_layers,beamsz,hiddensz),(n_layers,beamsz,hiddensz)
        dec_h, (h, c) = self.decoder(emb_t.unsqueeze(1), (h, c))  # dec_h is beamsz,1,hiddensz (batch_first=True)
        scores = torch.bmm(enc_h_expand, dec_h.transpose(1, 2)).squeeze(2)
        # (beamsz,n_de,hiddensz) * (beamsz,hiddensz,1) = (beamsz,n_de,1). squeeze to beamsz,n_de
        attn_dist = F.softmax(scores, dim=1)
        if self.attn_type == "hard":
            _, argmax = attn_dist.max(1)  # beamsz. for each beam, select the most likely German word to attend to
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1).cuda())
            context = torch.bmm(one_hot.unsqueeze(1), enc_h_expand).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h_expand).squeeze(1)
        # the difference between hard and soft is just whether we use a one-hot vector or a distribution
        # context is beamsz,hiddensz*n_directions
        pred = self.vocab_layer(torch.cat([dec_h.squeeze(1), context], 1))  # beamsz,len(EN.vocab)
        # TODO: set the columns corresponding to <pad>,<unk>,</s>,etc to 0
        masterheap.update_beam(pred)
        masterheap.update_hiddens(h, c)
        masterheap.update_attentions(attn_dist)
        masterheap.firstloop = False
    return masterheap.probs, masterheap.wordlist, masterheap.attentions
Example 14: predict
def predict(self, x_de, x_en):
    bs = x_de.size(0)
    emb_de = self.embedding_de(x_de)  # bs,n_de,word_dim
    emb_en = self.embedding_en(x_en)  # bs,n_en,word_dim
    h = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    c = Variable(torch.zeros(self.n_layers*self.directions, bs, self.hidden_dim).cuda())
    enc_h, _ = self.encoder(emb_de, (h, c))
    dec_h, _ = self.decoder(emb_en, (h, c))
    # enc_h is bs,n_de,hiddensz*n_directions. h and c are both n_layers*n_directions,bs,hiddensz
    if self.directions == 2:
        enc_h = self.dim_reduce(enc_h)  # bs,n_de,hiddensz
    scores = torch.bmm(enc_h, dec_h.transpose(1, 2))
    # (bs,n_de,hiddensz) * (bs,hiddensz,n_en) = (bs,n_de,n_en)
    y = [Variable(torch.cuda.LongTensor([sos_token]*bs))]  # bs
    self.attn = []
    for t in range(x_en.size(1)-1):  # iterate over English words, with teacher forcing
        attn_dist = F.softmax(scores[:, :, t], dim=1)  # bs,n_de
        self.attn.append(attn_dist.data)
        if self.attn_type == "hard":
            _, argmax = attn_dist.max(1)  # bs. for each batch, select the most likely German word to attend to
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1).cuda())
            context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
        # the difference between hard and soft is just whether we use a one-hot vector or a distribution
        # context is bs,hiddensz
        pred = self.vocab_layer(torch.cat([dec_h[:, t, :], context], 1))  # bs,len(EN.vocab)
        _, next_token = pred.max(1)  # bs
        y.append(next_token)
    self.attn = torch.stack(self.attn, 0).transpose(0, 1)  # bs,n_en,n_de (for visualization!)
    y = torch.stack(y, 0).transpose(0, 1)  # bs,n_en
    return y, self.attn
Example 15: predict
def predict(self, x, attn_type="hard"):
    # predict with greedy decoding
    emb = self.embedding(x)
    h = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
    c = Variable(torch.zeros(1, x.size(0), self.hidden_dim))
    enc_h, _ = self.encoder(emb, (h, c))
    y = [Variable(torch.zeros(x.size(0)).long())]
    self.attn = []
    for t in range(x.size(1)):
        emb_t = self.embedding(y[-1])
        dec_h, (h, c) = self.decoder(emb_t.unsqueeze(1), (h, c))
        scores = torch.bmm(enc_h, dec_h.transpose(1, 2)).squeeze(2)
        attn_dist = F.softmax(scores, dim=1)
        self.attn.append(attn_dist.data)
        if attn_type == "hard":
            _, argmax = attn_dist.max(1)
            one_hot = Variable(torch.zeros_like(attn_dist.data).scatter_(-1, argmax.data.unsqueeze(1), 1))
            context = torch.bmm(one_hot.unsqueeze(1), enc_h).squeeze(1)
        else:
            context = torch.bmm(attn_dist.unsqueeze(1), enc_h).squeeze(1)
        pred = self.vocab_layer(torch.cat([dec_h.squeeze(1), context], 1))
        _, next_token = pred.max(1)
        y.append(next_token)
    self.attn = torch.stack(self.attn, 0).transpose(0, 1)
    return torch.stack(y, 0).transpose(0, 1)