YOLOv11/v10/v8 使用教程:YOLOv11 入门到入土使用教程
YOLOv11改进汇总贴:YOLOv11及自研模型更新汇总
《EfficientViM: Efficient Vision Mamba with Hidden State Mixer based State Space Duality》
一、 模块介绍
论文链接:https://arxiv.org/pdf/2411.15241
论文速览:
为了在资源受限的环境中部署神经网络,先前的研究构建了基于卷积和注意力机制的轻量级架构,分别用于捕获局部和全局依赖关系。最近,状态空间模型(SSM)作为一种有效的全局交互操作崭露头角,其计算成本随标记(token)数量线性增长。为了充分利用 SSM 的优势,我们提出了高效视觉 Mamba(EfficientViM),这是一种基于隐藏状态混合器的状态空间对偶(HSM-SSD)构建的新型架构,能够以更低的计算成本高效地捕获全局依赖关系。我们观察到 SSD 层的运行时间主要由作用在输入序列上的线性投影决定,因此重新设计了原始的 SSD 层,使通道混合操作在 HSM-SSD 层压缩后的隐藏状态内完成。此外,我们提出了多阶段隐藏状态融合以增强隐藏状态的表示能力,并给出了用于缓解内存受限(memory-bound)操作所致瓶颈的设计。
总结:本文更新其中的EfficientViMBlock模块。
⭐⭐本文二创模块仅更新于付费群中,往期免费教程可看下方链接⭐⭐
二、二创融合模块
2.1 相关代码
class LayerNorm1D(nn.Module):
    """Normalize a (B, C, L) tensor across its channel axis.

    Mean and (biased) variance are taken over dim 1, separately for every
    batch element and every position along L.  When ``affine`` is true a
    learnable scale and shift, each shaped (1, C, 1), is applied afterwards.

    Args:
        num_channels: Size of the channel axis (dim 1) of the input.
        eps: Constant added to the variance before taking the square root.
        affine: Whether to create the learnable ``weight`` and ``bias``.
    """

    def __init__(self, num_channels, eps=1e-5, affine=True):
        super().__init__()
        self.num_channels = num_channels
        self.eps = eps
        self.affine = affine
        if not self.affine:
            # Register the names as empty parameters so the attributes
            # exist (as None) even without an affine transform.
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
            return
        self.weight = nn.Parameter(torch.ones(1, num_channels, 1))
        self.bias = nn.Parameter(torch.zeros(1, num_channels, 1))

    def forward(self, x):
        """Return ``x`` normalized over dim 1; the shape is unchanged."""
        mu = x.mean(dim=1, keepdim=True)  # (B, 1, L)
        sigma2 = x.var(dim=1, keepdim=True, unbiased=False)  # (B, 1, L)
        out = (x - mu) / torch.sqrt(sigma2 + self.eps)  # (B, C, L)
        if not self.affine:
            return out
        return out * self.weight + self.bias
class ConvLayer2D(nn.Module):
    """Bias-free 2D convolution followed by an optional norm and activation.

    Args:
        in_dim: Number of input channels.
        out_dim: Number of output channels.
        kernel_size: Square kernel size.
        stride: Stride applied to both spatial axes.
        padding: Zero padding applied to both spatial axes.
        dilation: Dilation applied to both spatial axes.
        groups: Number of convolution groups.
        norm: Callable invoked as ``norm(num_features=out_dim)`` to build the
            normalization layer, or a falsy value for no normalization.
        act_layer: Zero-argument callable building the activation, or a
            falsy value for no activation.
        bn_weight_init: Value written into the norm layer's ``weight``; its
            ``bias`` is set to 0.  Has no effect when there is no norm layer
            or the norm layer has no affine parameters.
    """

    def __init__(self, in_dim, out_dim, kernel_size=3, stride=1, padding=0, dilation=1, groups=1,
                 norm=nn.BatchNorm2d, act_layer=nn.ReLU, bn_weight_init=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_dim,
            out_dim,
            kernel_size=(kernel_size, kernel_size),
            stride=(stride, stride),
            padding=(padding, padding),
            dilation=(dilation, dilation),
            groups=groups,
            bias=False,
        )
        self.norm = norm(num_features=out_dim) if norm else None
        self.act = act_layer() if act_layer else None

        if self.norm is not None:
            # A norm built without affine parameters exposes weight/bias as
            # None (or not at all); initialise only what actually exists
            # instead of crashing inside constant_.
            norm_weight = getattr(self.norm, 'weight', None)
            norm_bias = getattr(self.norm, 'bias', None)
            if norm_weight is not None:
                torch.nn.init.constant_(norm_weight, bn_weight_init)
            if norm_bias is not None:
                torch.nn.init.constant_(norm_bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv, then norm and activation when they are configured."""
        x = self.conv(x)
        # Explicit None checks: a module's truthiness depends on __len__ for
        # container types, which is not what "is this stage enabled" means.
        if self.norm is not None:
            x = self.norm(x)
        if self.act is not None:
            x = self.act(x)
        return x
class ConvLayer1D(nn.Module):
    """Bias-free 1D convolution followed by an optional norm and activation.

    Args:
        in_dim: Number of input channels.
        out_dim: Number of output channels.
        kernel_size: Kernel size along the sequence axis.
        stride: Convolution stride.
        padding: Zero padding on both ends of the sequence.
        dilation: Convolution dilation.
        groups: Number of convolution groups.
        norm: Callable invoked as ``norm(num_features=out_dim)`` to build the
            normalization layer, or a falsy value for no normalization.
        act_layer: Zero-argument callable building the activation, or a
            falsy value for no activation.
        bn_weight_init: Value written into the norm layer's ``weight``; its
            ``bias`` is set to 0.  Has no effect when there is no norm layer
            or the norm layer has no affine parameters.
    """

    def __init__(self, in_dim, out_dim, kernel_size=3, stride=1, padding=0, dilation=1, groups=1,
                 norm=nn.BatchNorm1d, act_layer=nn.ReLU, bn_weight_init=1):
        super().__init__()
        self.conv = nn.Conv1d(
            in_dim,
            out_dim,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=False,
        )
        self.norm = norm(num_features=out_dim) if norm else None
        self.act = act_layer() if act_layer else None

        if self.norm is not None:
            # A norm built without affine parameters exposes weight/bias as
            # None (or not at all); initialise only what actually exists
            # instead of crashing inside constant_.
            norm_weight = getattr(self.norm, 'weight', None)
            norm_bias = getattr(self.norm, 'bias', None)
            if norm_weight is not None:
                torch.nn.init.constant_(norm_weight, bn_weight_init)
            if norm_bias is not None:
                torch.nn.init.constant_(norm_bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv, then norm and activation when they are configured."""
        x = self.conv(x)
        # Explicit None checks: a module's truthiness depends on __len__ for
        # container types, which is not what "is this stage enabled" means.
        if self.norm is not None:
            x = self.norm(x)
        if self.act is not None:
            x = self.act(x)
        return x
class FFN(nn.Module):
    """Two-layer pointwise (1x1 conv) feed-forward block.

    Channels go ``in_dim -> dim -> in_dim``; the output has the same channel
    count as the input.

    Args:
        in_dim: Channel count of the input and of the output.
        dim: Channel count of the intermediate representation.
    """

    def __init__(self, in_dim, dim):
        super().__init__()
        # Expansion keeps ConvLayer2D's default norm and activation.
        self.fc1 = ConvLayer2D(in_dim, dim, kernel_size=1)
        # Projection has no activation, and its norm weight starts at 0.
        self.fc2 = ConvLayer2D(dim, in_dim, kernel_size=1, act_layer=None, bn_weight_init=0)

    def forward(self, x):
        """Expand, then project back to ``in_dim`` channels."""
        hidden = self.fc1(x)
        return self.fc2(hidden)
class Stem(nn.Module):
    """Stack of four stride-2 3x3 ConvLayer2D stages.

    Channel widths progress ``in_dim -> dim//8 -> dim//4 -> dim//2 -> dim``.
    Every stage uses stride 2 with padding 1; the last stage has no
    activation.

    Args:
        in_dim: Number of input channels.
        dim: Number of output channels.
    """

    def __init__(self, in_dim=3, dim=96):
        super().__init__()
        widths = (in_dim, dim // 8, dim // 4, dim // 2, dim)
        final_idx = len(widths) - 2
        stages = []
        for idx, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:])):
            # Only the final stage overrides the default activation.
            extra = {'act_layer': None} if idx == final_idx else {}
            stages.append(ConvLayer2D(c_in, c_out, kernel_size=3, stride=2, padding=1, **extra))
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run the input through the four stages in order."""
        return self.conv(x)
class PatchMerging(nn.Module):
    """Stride-2 downsampling block with depthwise residuals around it.

    The main path is: 1x1 expand to ``int(out_dim * ratio)`` channels ->
    stride-2 depthwise 3x3 -> SqueezeExcite -> 1x1 projection to
    ``out_dim`` (no activation).  A depthwise 3x3 residual is added before
    the main path (on ``in_dim`` channels) and another one after it (on
    ``out_dim`` channels).

    Args:
        in_dim: Number of input channels.
        out_dim: Number of output channels.
        ratio: Expansion factor for the hidden width, relative to ``out_dim``.
    """

    def __init__(self, in_dim, out_dim, ratio=4.0):
        super().__init__()
        hidden_dim = int(out_dim * ratio)

        expand = ConvLayer2D(in_dim, hidden_dim, kernel_size=1)
        downsample = ConvLayer2D(hidden_dim, hidden_dim, kernel_size=3, stride=2, padding=1, groups=hidden_dim)
        # NOTE(review): SqueezeExcite is not defined in this excerpt;
        # presumably imported from timm - confirm.
        gate = SqueezeExcite(hidden_dim, .25)
        project = ConvLayer2D(hidden_dim, out_dim, kernel_size=1, act_layer=None)
        self.conv = nn.Sequential(expand, downsample, gate, project)

        self.dwconv1 = ConvLayer2D(in_dim, in_dim, 3, padding=1, groups=in_dim, act_layer=None)
        self.dwconv2 = ConvLayer2D(out_dim, out_dim, 3, padding=1, groups=out_dim, act_layer=None)

    def forward(self, x):
        """Apply pre-residual, the downsampling path, then post-residual."""
        pre = self.dwconv1(x)
        x = x + pre
        x = self.conv(x)
        post = self.dwconv2(x)
        x = x + post
        return x
class HSMSSD(nn.Module):
    """Hidden-state-mixer SSD layer operating on a flattened feature map.

    The input sequence is compressed into ``state_dim`` hidden states, the
    channel mixing (gated projection) is done on those hidden states, and
    the result is expanded back to the sequence length.

    Args:
        d_model: Channel count of the input and of the output.
        ssd_expand: Multiplier giving the inner width ``d_inner``.
        A_init_range: ``(low, high)`` bounds for the uniform init of ``A``.
        state_dim: Number of hidden states.
    """

    def __init__(self, d_model, ssd_expand=1, A_init_range=(1, 16), state_dim=64):
        super().__init__()
        self.ssd_expand = ssd_expand
        self.d_inner = int(self.ssd_expand * d_model)
        self.state_dim = state_dim

        # One projection yields B, C and dt stacked along the channel axis.
        bcdt_channels = self.state_dim * 3
        self.BCdt_proj = ConvLayer1D(d_model, 3 * state_dim, 1, norm=None, act_layer=None)
        # bn_weight_init is inert on the layers below that pass norm=None:
        # ConvLayer* only initialises a norm layer when one exists.
        self.dw = ConvLayer2D(bcdt_channels, bcdt_channels, 3, 1, 1, groups=bcdt_channels,
                              norm=None, act_layer=None, bn_weight_init=0)
        self.hz_proj = ConvLayer1D(d_model, 2 * self.d_inner, 1, norm=None, act_layer=None)
        self.out_proj = ConvLayer1D(self.d_inner, d_model, 1, norm=None, act_layer=None, bn_weight_init=0)

        a_init = torch.empty(self.state_dim, dtype=torch.float32)
        a_init.uniform_(*A_init_range)
        self.A = torch.nn.Parameter(a_init)
        self.act = nn.SiLU()
        self.D = nn.Parameter(torch.ones(1))
        # Marker attribute; presumably read by the optimizer setup to skip
        # weight decay for D - confirm against the training code.
        self.D._no_weight_decay = True

    def forward(self, x, H, W):
        """Mix a flattened feature map through the hidden states.

        Args:
            x: Tensor of shape (B, d_model, L); L is expected to equal H * W
                so the projection can be viewed as a (H, W) map.
            H: Height of the feature map ``x`` was flattened from.
            W: Width of the feature map ``x`` was flattened from.

        Returns:
            Tuple ``(y, h)``: ``y`` of shape (B, d_model, H, W) and the mixed
            hidden state ``h`` of shape (B, d_model, state_dim).
        """
        batch, _, _ = x.shape

        # Project to B/C/dt, refine spatially with the depthwise conv, then
        # flatten back to a sequence: (B, 3 * state_dim, L).
        bcdt = self.BCdt_proj(x)
        bcdt = self.dw(bcdt.view(batch, -1, H, W)).flatten(2)
        split_sizes = [self.state_dim, self.state_dim, self.state_dim]
        b_in, c_out, delta = torch.split(bcdt, split_sizes, dim=1)

        # Softmax runs over the sequence axis, one distribution per state.
        a_weights = torch.softmax(delta + self.A.view(1, -1, 1), dim=-1)
        ab = a_weights * b_in

        # Compress the sequence into hidden states: (B, d_model, state_dim).
        hidden = torch.matmul(x, ab.transpose(-2, -1))

        # Gated channel mixing performed on the hidden states.
        hz = self.hz_proj(hidden)
        hidden, gate = torch.split(hz, [self.d_inner, self.d_inner], dim=1)
        # NOTE(review): the clone() is kept from the original; presumably it
        # guards against aliasing of the split view - confirm before removing.
        mixed = hidden * self.act(gate.clone()) + hidden * self.D
        hidden = self.out_proj(mixed)

        # Expand hidden states back to the sequence: (B, d_model, L).
        y = torch.matmul(hidden, c_out)
        y = y.view(batch, -1, H, W).contiguous()
        return y, hidden
class EfficientViMBlock(nn.Module):
    """EfficientViM block: DWConv -> HSM-SSD mixer -> DWConv -> FFN.

    Each of the four sub-layers is blended with its input through a learned
    per-channel gate: ``(1 - g) * x + g * sublayer(x)`` with
    ``g = sigmoid(alpha[i])``.  Only the feature map is returned; the hidden
    state produced by the mixer is discarded.

    Args:
        dim: Channel count of the input and of the output.
        mlp_ratio: Expansion factor of the FFN hidden width.
        ssd_expand: Forwarded to ``HSMSSD``.
        state_dim: Forwarded to ``HSMSSD``.
    """

    def __init__(self, dim, mlp_ratio=4., ssd_expand=1, state_dim=64):
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio

        self.mixer = HSMSSD(d_model=dim, ssd_expand=ssd_expand, state_dim=state_dim)
        self.norm = LayerNorm1D(dim)

        dw_kwargs = dict(padding=1, groups=dim, bn_weight_init=0, act_layer=None)
        self.dwconv1 = ConvLayer2D(dim, dim, 3, **dw_kwargs)
        self.dwconv2 = ConvLayer2D(dim, dim, 3, **dw_kwargs)

        self.ffn = FFN(in_dim=dim, dim=int(dim * mlp_ratio))

        # One gate row per sub-layer.  The raw value goes through a sigmoid
        # in forward(), so the 1e-4 init gives gates of about 0.5.
        self.alpha = nn.Parameter(1e-4 * torch.ones(4, dim), requires_grad=True)

    def forward(self, x):
        """Apply the four gated sub-layers to a (B, dim, H, W) feature map."""
        gates = torch.sigmoid(self.alpha).view(4, -1, 1, 1)
        g_dw1, g_ssd, g_dw2, g_ffn = gates[0], gates[1], gates[2], gates[3]

        # Depthwise conv 1
        x = (1 - g_dw1) * x + g_dw1 * self.dwconv1(x)

        # HSM-SSD on the flattened, channel-normalized map
        height, width = x.shape[2:]
        tokens = self.norm(x.flatten(2))
        mixed, _hidden = self.mixer(tokens, height, width)
        x = (1 - g_ssd) * x + g_ssd * mixed

        # Depthwise conv 2
        x = (1 - g_dw2) * x + g_dw2 * self.dwconv2(x)

        # Feed-forward network
        x = (1 - g_ffn) * x + g_ffn * self.ffn(x)
        return x
# class EfficientViMStage(nn.Module):
# def __init__(self, in_dim, out_dim, depth=2, mlp_ratio=4., downsample=None, ssd_expand=1, state_dim=64):
# super().__init__()
# self.depth = depth
# self.blocks = nn.ModuleList([
# EfficientViMBlock(dim=in_dim, mlp_ratio=mlp_ratio, ssd_expand=ssd_expand, state_dim=state_dim) for _ in
# range(depth)])
#
# self.downsample = downsample(in_dim=in_dim, out_dim=out_dim) if downsample is not None else None
#
# def forward(self, x):
# for blk in self.blocks:
# x, h = blk(x)
#
# x_out = x
# if self.downsample is not None:
# x = self.downsample(x)
# # return x, x_out, h
# return x
2.2更改yaml文件 (以自研模型为例)
打开并修改 ultralytics/cfg/models/11 路径下的 YOLOv11.yaml 文件,替换原有模块。
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLO11 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
# ⭐⭐Powered by https://blog.csdn.net/StopAndGoyyy, technical support QQ:2668825911⭐⭐

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
  # [depth, width, max_channels]
  # The entries below must stay nested under `scales:`; at column 0 they
  # would become top-level keys and `scales` would be null.
  n: [0.50, 0.25, 1024] # summary: 377 layers, 2,249,525 parameters, 2,249,509 gradients, 8.7 GFLOPs/258 layers, 2,219,405 parameters, 0 gradients, 8.5 GFLOPs
  s: [0.50, 0.50, 1024] # summary: 377 layers, 8,082,389 parameters, 8,082,373 gradients, 29.8 GFLOPs/258 layers, 7,972,885 parameters, 0 gradients, 29.2 GFLOPs
  m: [0.50, 1.00, 512] # summary: 377 layers, 20,370,221 parameters, 20,370,205 gradients, 103.0 GFLOPs/258 layers, 20,153,773 parameters, 0 gradients, 101.2 GFLOPs
  l: [1.00, 1.00, 512] # summary: 521 layers, 23,648,717 parameters, 23,648,701 gradients, 124.5 GFLOPs/330 layers, 23,226,989 parameters, 0 gradients, 121.2 GFLOPs
  x: [1.00, 1.50, 512] # summary: 521 layers, 53,125,237 parameters, 53,125,221 gradients, 278.9 GFLOPs/330 layers, 52,191,589 parameters, 0 gradients, 272.1 GFLOPs
  # n: [0.33, 0.25, 1024]
  # s: [0.50, 0.50, 1024]
  # m: [0.67, 0.75, 768]
  # l: [1.00, 1.00, 512]
  # x: [1.00, 1.25, 512]

# YOLO11n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
  - [-1, 2, RCRep2A, [128, False, 0.25]] # 2
  - [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
  - [-1, 1, EfficientViMBlock, []] # 4
  - [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
  - [-1, 4, RCRep2A, [512, True]] # 6
  - [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
  - [-1, 2, RCRep2A, [1024, True]] # 8
  - [-1, 1, SPPF_WD, [1024, 7]] # 9

# YOLO11n head
head:
  - [[3, 5, 7], 1, align_3In, [256, 1]] # 10
  - [[4, 6, 9], 1, align_3In, [256, 1]] # 11
  - [[-1, -2], 1, Concat, [1]] #12 cat
  - [-1, 1, RepVGGBlocks, []] #13
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] #14
  - [[-1, 4], 1, Concat, [1]] #15 cat
  - [-1, 1, Conv, [256, 3]] # 16
  - [13, 1, Conv, [512, 3]] #17
  - [13, 1, Conv, [1024, 3, 2]] #18
  - [[16, 17, 18], 1, Detect, [nc]] # Detect(P3, P4, P5)
# ⭐⭐Powered by https://blog.csdn.net/StopAndGoyyy, technical support QQ:2668825911⭐⭐
2.3 修改train.py文件
创建Train脚本用于训练。
from ultralytics.models import YOLO
import os
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort
# that some PyTorch/MKL installs raise - confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

if __name__ == '__main__':
    # Model definition to train; swap in the stock config below to compare.
    model_cfg = 'ultralytics/cfg/models/xy_YOLO/xy_yolov1.yaml'
    # model_cfg = 'ultralytics/cfg/models/11/yolo11l.yaml'

    # Training options passed straight to model.train().
    train_options = dict(
        data='./datasets/data.yaml',
        epochs=1,
        batch=1,
        device='0',
        imgsz=320,
        workers=1,
        cache=False,
        amp=True,
        mosaic=False,
        project='run/train',
        name='exp',
    )

    model = YOLO(model=model_cfg)
    model.train(**train_options)
在train.py脚本中填入修改好的yaml路径,运行即可训练,数据集创建教程见下方链接。