- Supports attention mask
- Supports KV cache
import numpy as np
import torch
from torch import nn


def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
    # calculate attention scores: (batch, num_heads, q_len, k_len)
    matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))

    # scale by sqrt(d_k) to keep the logits in a numerically stable range
    dk = k.shape[-1]
    scaled_attention_logits = matmul_qk / np.sqrt(dk)

    if mask is not None:
        # causal mask: slice the rows that line up with the current query
        # positions (nd can be shorter than ns when a kv cache is in use)
        nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
        scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4

    if attention_mask is not None:
        # Apply the (additive) attention mask, e.g. to hide padding tokens
        scaled_attention_logits = scaled_attention_logits + attention_mask

    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)

    # Mask heads if we want to
    if head_mask is not None:
        attention_weights = attention_weights * head_mask

    output = torch.matmul(attention_weights, v)

    return output, attention_weights
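
# Mask conventions assumed above (following the referenced CTRL implementation):
# - `mask` is a causal matrix such as torch.triu(torch.ones(ns, ns), diagonal=1):
#   1 above the diagonal (future positions), 0 elsewhere; multiplied by -1e4 it
#   pushes future logits toward -inf before the softmax.
# - `attention_mask` is additive and already scaled: 0 for visible tokens and a
#   large negative value (e.g. -1e4) for padded tokens.
# - `head_mask` is multiplicative on the attention weights: 1.0 keeps a head,
#   0.0 zeroes it out.
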
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model_size, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model_size = d_model_size

        self.depth = int(d_model_size / self.num_heads)

        self.Wq = nn.Linear(d_model_size, d_model_size)
        self.Wk = nn.Linear(d_model_size, d_model_size)
        self.Wv = nn.Linear(d_model_size, d_model_size)

        self.dense = nn.Linear(d_model_size, d_model_size)

    def split_into_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return x.permute([0, 2, 1, 3])

    def forward(
        self,
        v,
        k,
        q,
        mask,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        batch_size = q.shape[0]

        q = self.Wq(q)
        k = self.Wk(k)
        v = self.Wv(v)

        q = self.split_into_heads(q, batch_size)
        k = self.split_into_heads(k, batch_size)
        v = self.split_into_heads(v, batch_size)

        if layer_past is not None:
            # kv cache: prepend the cached keys/values along the sequence dim
            past_key, past_value = layer_past[0], layer_past[1]
            k = torch.cat((past_key, k), dim=-2)
            v = torch.cat((past_value, v), dim=-2)

        if use_cache is True:
            present = torch.stack((k, v))
        else:
            present = (None,)

        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
        # merge heads back: (batch, num_heads, seq_len, depth) -> (batch, seq_len, d_model)
        scaled_attention = output[0].permute([0, 2, 1, 3])
        attn = output[1]
        original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
        output = self.dense(original_size_attention)

        outputs = (output, present)
        if output_attentions:
            outputs = outputs + (attn,)
        return outputs
Reference: https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/ctrl/modeling_ctrl.py#L85
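
A minimal usage sketch, not part of the referenced code: the hidden size, head count, sequence lengths, and the `torch.triu`-based mask construction below are illustrative assumptions (the mask follows the same convention as the referenced CTRL code). It first runs a 5-token "prompt" with `use_cache=True`, then decodes one more token by feeding the returned `(k, v)` cache back in as `layer_past`:

```python
import torch

torch.manual_seed(0)

d_model_size, num_heads = 64, 4                     # illustrative sizes
mha = MultiHeadAttention(d_model_size, num_heads)

# step 1: process a 5-token prompt and ask for the kv cache
x = torch.randn(2, 5, d_model_size)                 # (batch, seq_len, d_model)
mask = torch.triu(torch.ones(5, 5), diagonal=1)     # causal mask, 1 = blocked
out, present = mha(x, x, x, mask, use_cache=True)
print(out.shape)       # torch.Size([2, 5, 64])
print(present.shape)   # torch.Size([2, 2, 4, 5, 16]) -> stacked (k, v)

# step 2: decode one new token, reusing the cached keys/values
x_new = torch.randn(2, 1, d_model_size)
mask = torch.triu(torch.ones(6, 6), diagonal=1)     # covers past + current positions
out, present = mha(x_new, x_new, x_new, mask, layer_past=present, use_cache=True)
print(out.shape)       # torch.Size([2, 1, 64])
print(present.shape)   # torch.Size([2, 2, 4, 6, 16])
```

With the cache in place, only the new token's Q/K/V projections are computed at each step, and the slice `mask[ns - nd : ns, :ns]` keeps the single query row aligned with the full (past + current) key length.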