from typing import Optional
import torch
import torch.nn as nn
from torch import Tensor
class MultiHeadAttention(nn.Module):
"""
This layer applies a multi-head self- or cross-attention as described in
`Attention is all you need <https://arxiv.org/abs/1706.03762>`_ paper
Args:
embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(N, P, C_{in})`
num_heads (int): Number of heads in multi-head attention
attn_dropout (float): Attention dropout. Default: 0.0
bias (bool): Use bias or not. Default: ``True``
Shape:
- Input: :math:`(N, P, C_{in})` where :math:`N` is batch size, :math:`P` is number of patches,
and :math:`C_{in}` is input embedding dim
- Output: same shape as the input
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
attn_dropout: float = 0.0,
bias: bool = True,
*args,
**kwargs
) -> None:
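        # NOTE: the constructor body is missing in the original snippet. The lines
        # below are a minimal, hedged completion assuming the standard fused-QKV
        # design used by CVNets/MobileViT-style attention; names such as `qkv_proj`
        # and `out_proj` are illustrative and may differ from the original code.
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                "embed_dim ({}) must be divisible by num_heads ({})".format(
                    embed_dim, num_heads
                )
            )

        # A single linear layer produces queries, keys, and values in one pass.
        self.qkv_proj = nn.Linear(in_features=embed_dim, out_features=3 * embed_dim, bias=bias)
        self.attn_dropout = nn.Dropout(p=attn_dropout)
        self.out_proj = nn.Linear(in_features=embed_dim, out_features=embed_dim, bias=bias)

        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim ** -0.5
        self.softmax = nn.Softmax(dim=-1)
        self.num_heads = num_heads
        self.embed_dim = embed_dim

    def forward(self, x: Tensor) -> Tensor:
        # Sketch of the self-attention forward pass (the cross-attention path is
        # omitted here as an assumption), spelled out to show how the q/k/v tensors
        # change shape across the multi-head computation.
        b_sz, n_patches, _ = x.shape

        # [N, P, C] --> [N, P, 3C] --> [N, P, 3, h, c] where C = h * c
        qkv = self.qkv_proj(x).reshape(b_sz, n_patches, 3, self.num_heads, self.head_dim)
        # [N, P, 3, h, c] --> [3, N, h, P, c]
        qkv = qkv.permute(2, 0, 3, 1, 4)
        query, key, value = qkv[0], qkv[1], qkv[2]

        # Scaled dot-product scores per head: [N, h, P, c] x [N, h, c, P] --> [N, h, P, P]
        attn = torch.matmul(query * self.scaling, key.transpose(-2, -1))
        attn = self.softmax(attn)
        attn = self.attn_dropout(attn)

        # Weighted sum of values: [N, h, P, P] x [N, h, P, c] --> [N, h, P, c]
        out = torch.matmul(attn, value)
        # [N, h, P, c] --> [N, P, h, c] --> [N, P, C]
        out = out.transpose(1, 2).reshape(b_sz, n_patches, -1)
        return self.out_proj(out)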
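

if __name__ == "__main__":
    # Hedged usage example (not from the original post): push a random batch through
    # the layer and confirm that the output shape matches the input, as stated in
    # the docstring's Shape section.
    layer = MultiHeadAttention(embed_dim=144, num_heads=4, attn_dropout=0.1)
    x = torch.randn(2, 64, 144)  # (N, P, C_in) = (batch, patches, embedding dim)
    y = layer(x)
    print(x.shape, y.shape)  # torch.Size([2, 64, 144]) torch.Size([2, 64, 144])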