活动介绍

# Copy the checkpoint tensors into this module. With strict=False, key
# mismatches are returned in the result object instead of raising.
load_result = self.load_state_dict(state_dict, strict=False)

# Report both kinds of mismatch so the caller can see what was not loaded.
logger.info('Missing keys: {}'.format(load_result.missing_keys))
logger.info('Unexpected keys: {}'.format(load_result.unexpected_keys))
logger.info(f"=> loaded successfully '{self.pretrained}'")

# Drop the reference to the loaded checkpoint and release cached GPU memory.
del checkpoint
torch.cuda.empty_cache()

时间: 2023-06-19 07:06:33 浏览: 150
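这段代码用于把预训练权重加载到当前模型中。`load_state_dict(state_dict, strict=False)` 会按键名把 `state_dict` 中的参数拷贝到模型;`strict=False` 表示键名不必完全一致:模型中有而 `state_dict` 中没有的键(missing keys),以及 `state_dict` 中有而模型中没有的键(unexpected keys),都不会报错,而是通过返回值 `msg` 给出。随后代码用日志记录器 `logger` 把这两类键分别打印出来,便于确认哪些层没有被预训练权重覆盖;最后删除 `checkpoint` 变量并调用 `torch.cuda.empty_cache()` 释放 PyTorch 缓存的空闲显存。需要注意:同名但形状不匹配的参数即使在 `strict=False` 下仍然会报错。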
相关问题

# Initialise the module weights and, when a checkpoint path is given,
# overlay the pretrained parameters on top of the fresh initialisation.
if self.pretrained is None:
    # No checkpoint requested: default initialisation only.
    self.apply(_init_weights)
elif not isinstance(self.pretrained, str):
    raise TypeError('pretrained must be a str or None')
else:
    self.apply(_init_weights)
    logger = get_root_logger()
    logger.info(f'load model from: {self.pretrained}')

    # Load on CPU first; the weights live under the 'model' key.
    checkpoint = torch.load(self.pretrained, map_location='cpu')
    state_dict = checkpoint['model']

    # Insert a new axis at dim 2 of the patch-embedding kernel, repeat it
    # patch_size[0] times along that axis and divide by the same factor.
    embed_key = 'patch_embed.proj.weight'
    depth = self.patch_size[0]
    inflated = state_dict[embed_key].unsqueeze(2).repeat(1, 1, depth, 1, 1)
    state_dict[embed_key] = inflated / depth

    ## Duplicate weights for temporal attention and temporal norm
    # (disabled in the original; kept for reference)
    # new_state_dict = state_dict.copy()
    # for key in state_dict:
    #     if 'blocks' in key and 'attn' in key and 'relative' not in key and 'mask' not in key:
    #         new_key = key.replace('attn','temporal_attn')
    #         if not new_key in state_dict:
    #             new_state_dict[new_key] = state_dict[key]
    #         else:
    #             new_state_dict[new_key] = state_dict[new_key]
    #     if 'blocks' in key and 'norm1' in key and 'relative' not in key and 'mask' not in key:
    #         new_key = key.replace('norm1','temporal_norm')
    #         if not new_key in state_dict:
    #             new_state_dict[new_key] = state_dict[key]
    #         else:
    #             new_state_dict[new_key] = state_dict[new_key]
    # state_dict = new_state_dict

    # Non-strict load: key mismatches are reported, not raised.
    load_result = self.load_state_dict(state_dict, strict=False)
    logger.info('Missing keys: {}'.format(load_result.missing_keys))
    logger.info('Unexpected keys: {}'.format(load_result.unexpected_keys))
    logger.info(f"=> loaded successfully '{self.pretrained}'")

    # Release the checkpoint reference and any cached GPU memory.
    del checkpoint
    torch.cuda.empty_cache()

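这段代码负责模型权重的初始化与预训练权重的加载,按 `self.pretrained` 的类型分三种情况处理:(1)如果是字符串,则先用 `_init_weights` 初始化全部参数,再用 `torch.load(..., map_location='cpu')` 读取该路径下的检查点并取出 `checkpoint['model']`;其中 `patch_embed.proj.weight` 会先通过 `unsqueeze(2)` 增加一个维度,沿该维度重复 `self.patch_size[0]` 次并除以 `self.patch_size[0]`,然后以 `strict=False` 加载到当前模型,并把缺失的键和多余的键写入日志;(2)如果是 `None`,则只执行 `_init_weights` 初始化;(3)如果既不是字符串也不是 `None`,则抛出 `TypeError`。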

# NOTE: the original paste concatenated three things onto a few long lines:
# a config snippet, the PPO agent/trainer module, and the training script.
# They are laid out below in the same order. The training script imports
# PPOAgent/PPOTrainer from ``rl_seg.agents.ppo_agent``, so the second and third
# sections are separate files in the real project.

# ============= Training parameters (config snippet) =============
num_epochs_rl: int = 20  # total number of RL fine-tuning epochs
lr: float = 1e-3  # initial learning rate
batch_size: int = 12  # batch size
seed = 2025
num_epochs = 20
num_workers = 12
freeze_layers = ['vfe', 'map_to_bev', 'backbone_2d']
sync_bn = True
warm_up = 2  # epoch

# ============= PPO parameters =============
clip_param: float = 0.3  # PPO clipping parameter
ppo_epochs: int = 5  # PPO update passes per sampled experience; keep small
gamma: float = 0.95  # discount factor
tau: float = 0.90  # GAE parameter
value_coef: float = 0.7  # weight of the value-function loss
entropy_coef: float = 0.05  # weight of the entropy regulariser
max_grad_norm: float = 1.0  # gradient-clipping threshold


# =====================================================================
# Section 2: PPO agent and trainer (rl_seg/agents/ppo_agent.py)
# =====================================================================
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torch.distributions import Normal
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import copy
import random
import numpy as np
from tqdm import tqdm
from collections import deque
from rl_seg.utils.compute_miou import get_miou, fast_hist, fast_hist_crop
from rl_seg.datasets import load_data_to_gpu

# Number of segmentation classes; the original hard-coded 21 everywhere.
NUM_CLASSES = 21


class ReplayBuffer:
    """Fixed-capacity FIFO store of experience dicts."""

    def __init__(self, capacity=100):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        """Append one experience; the oldest entry is dropped when full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return up to ``batch_size`` experiences drawn without replacement."""
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def clear(self):
        """Remove every stored experience."""
        self.buffer.clear()

    def __len__(self):
        return len(self.buffer)


class PPOAgent(nn.Module):
    """Actor-critic network with a shared trunk and a Gaussian policy."""

    # Bounds applied to the policy standard deviation in act()/evaluate().
    MIN_STD = 0.01
    MAX_STD = 0.5

    def __init__(self, state_dim, action_dim, hidden_dim=512):
        super(PPOAgent, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Shared feature extractor.
        self.shared_layers = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
        )

        # Actor head: the Tanh keeps the action mean inside [-1, 1].
        self.actor = nn.Sequential(
            nn.Linear(hidden_dim, action_dim),
            nn.Tanh(),
        )

        # Critic head: scalar state value.
        self.critic = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, 1),
        )

        # Learnable log standard deviation, shared across states.
        self.log_std = nn.Parameter(torch.zeros(1, action_dim))

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Orthogonal init (gain 0.01) with zero bias for every Linear layer."""
        if isinstance(module, nn.Linear):
            nn.init.orthogonal_(module.weight, gain=0.01)
            nn.init.constant_(module.bias, 0.0)

    def _policy(self, action_mean):
        """Build the Gaussian policy with the clamped standard deviation."""
        action_std = torch.clamp(
            self.log_std.exp(), min=self.MIN_STD, max=self.MAX_STD
        )
        return Normal(action_mean, action_std)

    def forward(self, state):
        """Return ``(action_mean, value)``; ``value`` has its last dim squeezed."""
        features = self.shared_layers(state)
        action_mean = self.actor(features)
        value = self.critic(features)
        return action_mean, value.squeeze(-1)

    def act(self, state):
        """Sample an action without tracking gradients.

        Returns:
            ``(action, log_prob, value)``, where ``log_prob`` is summed over
            the action dimensions.
        """
        with torch.no_grad():
            action_mean, value = self.forward(state)
            policy = self._policy(action_mean)
            action = policy.sample()
            log_prob = policy.log_prob(action).sum(-1)
        return action, log_prob, value

    def evaluate(self, state, action):
        """Score ``action`` under the current policy, with gradients.

        Returns:
            ``(log_prob, entropy, value)``; the first two are summed over the
            action dimensions.
        """
        action_mean, value = self.forward(state)
        policy = self._policy(action_mean)
        log_prob = policy.log_prob(action).sum(-1)
        entropy = policy.entropy().sum(-1)
        return log_prob, entropy, value


class PPOTrainer:
    """PPO trainer that couples policy optimisation with segmentation fine-tuning."""

    def __init__(self, seg_net, agent, cfg):
        """
        Args:
            seg_net: pretrained segmentation network (optionally DDP-wrapped).
            agent: the PPO agent.
            cfg: config object providing lr, clip_param, ppo_epochs, gamma,
                tau, value_coef, entropy_coef, max_grad_norm, exp_dir,
                local_rank, distributed and num_epochs_rl.
        """
        self.seg_net = seg_net
        # Unwrapped module, so custom methods work with or without DDP.
        self._base_seg_net = seg_net.module if isinstance(seg_net, DDP) else seg_net
        self._base_seg_net.device = self.seg_net.device
        self.agent = agent
        self.cfg = cfg
        # Only rank 0 writes TensorBoard events.
        self.writer = (
            SummaryWriter(log_dir=f'{cfg.exp_dir}/runs/ppo_trainer')
            if cfg.local_rank == 0 else None
        )

        # Separate optimisers: the agent learns 10x slower than the seg net.
        self.optimizer_seg = optim.AdamW(
            self.seg_net.parameters(), lr=cfg.lr, weight_decay=1e-4
        )
        self.optimizer_agent = optim.AdamW(
            self.agent.parameters(), lr=cfg.lr * 0.1, weight_decay=1e-4
        )

        # Training records.
        self.best_miou = 0.0
        self.step_count = 0
        self.metrics = {
            'loss': [], 'reward': [], 'miou': [], 'class_ious': [], 'lr': []
        }

        # Values controlled by the agent's actions (see apply_action).
        self.neck_grad_scale = 1.0
        self.head_grad_scale = 1.0
        self.current_lr = cfg.lr
        self.class_weights = torch.ones(NUM_CLASSES).to(seg_net.device)

        self.replay_buffer = ReplayBuffer(capacity=50)

    def compute_state(self, features, pred, gt_seg, epoch_progress):
        """Build one RL state vector per sample in the batch.

        Args:
            features: dict from ``extract_features``; only ``neck_features``
                is read and it is unpacked as ``[B, C, H, W]``.
            pred: network prediction, indexed per sample with the class axis
                first.
            gt_seg: batch dict providing ``grid_ind`` and ``labels_ori``.
            epoch_progress: fraction of the current epoch already processed.

        Returns:
            Tensor ``[B, state_dim]`` concatenating per-channel feature
            mean/std, predicted class histogram, per-class IoU, confidence
            mean/std, normalised learning rate, progress and class weights.
        """
        feats = features["neck_features"]
        B, C, H, W = feats.shape
        classes = list(range(NUM_CLASSES))

        states = []
        for i in range(B):
            # Per-channel feature statistics.
            feat_mean = feats[i].mean(dim=(1, 2))
            feat_std = feats[i].std(dim=(1, 2))

            # Share of positions assigned to each class.
            pred_classes = pred[i].argmax(dim=0)
            class_dist = torch.bincount(
                pred_classes.flatten(), minlength=NUM_CLASSES
            ).float() / (H * W)

            # Confidence = maximum softmax probability per position.
            confidence = torch.softmax(pred[i], dim=0).max(dim=0)[0]
            conf_mean = confidence.mean()
            conf_std = confidence.std()

            # Per-sample, per-class IoU against the ground truth.
            _, sample_cls_iou = self.compute_sample_iou(
                pred[i], gt_seg["grid_ind"][i], gt_seg["labels_ori"][i], classes
            )
            sample_cls_iou = torch.FloatTensor(sample_cls_iou).to(feats.device)

            # Trainer-side context: lr ratio, progress, current class weights.
            additional_state = torch.tensor(
                [
                    float(self.current_lr) / self.cfg.lr,
                    epoch_progress,
                    *self.class_weights.cpu().numpy(),
                ],
                device=feats.device,
            )

            states.append(torch.cat([
                feat_mean,
                feat_std,
                class_dist,
                sample_cls_iou,
                conf_mean.unsqueeze(0),
                conf_std.unsqueeze(0),
                additional_state,
            ]))

        return torch.stack(states).to(feats.device)

    def compute_sample_iou(self, pred, gt_grid_ind, gt_labels_ori, classes=None):
        """Compute ``(miou, per_class_iou)`` for a single sample.

        ``classes`` defaults to all ``NUM_CLASSES`` ids. The default used to
        be a mutable list literal shared across calls.
        """
        if classes is None:
            classes = list(range(NUM_CLASSES))
        pred_labels = torch.argmax(pred, dim=0).cpu().detach().numpy()
        # Look up the predicted label at each ground-truth grid index.
        pred_at_gt = pred_labels[
            gt_grid_ind[:, 1], gt_grid_ind[:, 0], gt_grid_ind[:, 2]
        ]
        hist = fast_hist_crop(pred_at_gt, gt_labels_ori, classes)
        iou = np.diag(hist) / ((hist.sum(1) + hist.sum(0) - np.diag(hist)) + 1e-8)
        return np.nanmean(iou), iou

    def compute_reward(self, miou, prev_miou, class_ious, prev_class_ious):
        """Combine mIoU change, per-class change and exploration into a reward.

        Args:
            miou: current mIoU.
            prev_miou: reference mIoU.
            class_ious: current per-class IoU.
            prev_class_ious: reference per-class IoU.

        Returns:
            Python float clipped to ``[-2.0, 5.0]``.
        """
        cur = np.asarray(class_ious, dtype=np.float64)
        prev = np.asarray(prev_class_ious, dtype=np.float64)

        # 1. mIoU term.
        # NOTE(review): exp(.) >= 1, so |miou_reward| >= 15 whenever the mIoU
        # changes at all, and the final clip saturates it. The term therefore
        # carries only the sign of the change. Consider a linear term.
        delta = miou - prev_miou
        miou_reward = 15.0 * np.sign(delta) * np.exp(3 * abs(delta))

        # 2. Per-class gains, weighted higher for classes that were worse.
        class_reward = float(
            np.where(cur > prev, (2.0 - prev) * (cur - prev), 0.0).sum()
        )

        # 3. Penalty for classes whose IoU fell below 70% of its old value.
        penalty = float(
            np.where(cur < prev * 0.7, 3.0 * (prev - cur) * (1 - prev), 0.0).sum()
        )

        # 4. Exploration bonus from the policy's mean log-std.
        entropy_bonus = 0.2 * self.agent.log_std.mean().exp().item()

        # 5. Balance bonus: lower spread across classes scores higher.
        # ddof=1 reproduces torch.std's default, used by the original code,
        # while keeping the reward a plain float instead of a tensor.
        balance_reward = 0.5 * (1.0 - float(np.std(cur, ddof=1)))

        total_reward = (
            miou_reward + class_reward - penalty + entropy_bonus + balance_reward
        )
        return float(np.clip(total_reward, -2.0, 5.0))

    def compute_advantages(self, rewards, values, normalize=True):
        """Compute GAE advantages over the sequence ``rewards``/``values``.

        Args:
            rewards: list or tensor of rewards.
            values: tensor of value estimates, same length.
            normalize: standardise to zero mean / unit std (default, matches
                the original behaviour). Pass ``False`` for raw advantages,
                e.g. to build value targets.

        NOTE(review): the trainer passes the samples of one batch here, so
        the index is a batch index, not a time step. Confirm the GAE
        recursion across samples is intended.
        """
        if isinstance(rewards, list):
            rewards = torch.tensor(rewards).to(values.device)

        advantages = torch.zeros_like(rewards)
        last_advantage = 0
        last_index = len(rewards) - 1
        # Backward recursion; the value after the final element is taken as 0.
        for t in reversed(range(len(rewards))):
            next_value = 0 if t == last_index else values[t + 1]
            delta = rewards[t] + self.cfg.gamma * next_value - values[t]
            advantages[t] = delta + self.cfg.gamma * self.cfg.tau * last_advantage
            last_advantage = advantages[t]

        # std() of a single element is NaN, so skip normalisation there.
        if normalize and advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        return advantages

    def apply_action(self, action):
        """Translate an action vector into optimiser / loss settings.

        Args:
            action: tensor with at least 5 entries, nominally in ``[-1, 1]``:
                0 -> learning-rate scale, 1 -> neck gradient scale,
                2 -> head gradient scale, 3 -> class-weight scale,
                4 -> index of the class whose weight is adjusted.
        """
        # Sampled actions are mean + noise and can leave [-1, 1]. Clamp so the
        # ranges below hold, and use plain floats so no tensor ends up in the
        # optimiser's lr or in the logged scalars.
        a = action.detach().squeeze(0).clamp(-1.0, 1.0).tolist()

        # Action 0: learning-rate scale, formula range [0.8, 1.2].
        # NOTE(review): the original comment claimed [0.5, 1.0].
        new_lr = self.cfg.lr * (0.8 + 0.4 * (a[0] + 1) / 2)
        if abs(new_lr - self.current_lr) > 1e-6:
            for param_group in self.optimizer_seg.param_groups:
                param_group['lr'] = new_lr
            self.current_lr = new_lr

        # Actions 1-2: gradient scales, formula range [0.5, 2.5].
        # NOTE(review): the original comments claimed [0.5, 1.5]; confirm.
        self.neck_grad_scale = 0.5 + (a[1] + 1)
        self.head_grad_scale = 0.5 + (a[2] + 1)

        # Action 3: multiplicative class-weight update, range [0.8, 1.2].
        cls_weight_scale = 0.8 + 0.4 * (a[3] + 1.0) / 2.0
        # Action 4: class index, mapped to [0, NUM_CLASSES - 1].
        cls_idx = max(0, min(NUM_CLASSES - 1, int((a[4] + 1.0) * 10)))
        self.class_weights[cls_idx] = torch.clamp(
            self.class_weights[cls_idx] * cls_weight_scale, min=0.5, max=2.0
        )

    def _scale_gradients(self):
        """Multiply neck/head gradients by the agent-chosen scale factors."""
        for name, param in self.seg_net.named_parameters():
            if param.grad is None:
                continue
            if 'at_seg_neck' in name:
                param.grad *= self.neck_grad_scale
            elif 'at_seg_head' in name:
                param.grad *= self.head_grad_scale

    def train_epoch(self, train_loader, epoch):
        """Run one RL fine-tuning epoch and return averaged metrics.

        Per batch: measure the baseline, let the agent choose an action,
        apply it, take one supervised step, re-evaluate on the same batch,
        and reward the agent with the change in IoU.

        Returns:
            Dict with ``seg_loss``, ``reward``, ``miou``, ``class_ious``
            (averaged over batches), ``policy_loss``, ``value_loss``,
            ``entropy_loss`` (averaged over PPO updates) and ``miou_epoch``
            (mIoU of the accumulated confusion matrix).
        """
        epoch_metrics = {
            'seg_loss': 0.0,
            'reward': 0.0,
            'miou': 0.0,
            'class_ious': np.zeros(NUM_CLASSES),
            'policy_loss': 0.0,
            'value_loss': 0.0,
            'entropy_loss': 0.0,
            'batch_count': 0,
        }
        ppo_updates = 0
        # Last PPO losses, kept so TensorBoard logging never hits an unbound name.
        policy_loss = value_loss = entropy_loss = 0.0

        self.seg_net.train()
        self.agent.train()
        adjusted_hist_all = []
        classes = list(range(NUM_CLASSES))

        # Exploration schedule: std decays linearly from 0.5 to a floor of 0.1.
        current_std = max(0.1, 0.5 * (1 - epoch / self.cfg.num_epochs_rl))
        # The original clamped log_std against current_std directly. That
        # compares a log value with a std, so the schedule had no effect.
        max_log_std = float(np.log(current_std))
        total_batches = len(train_loader)

        progress = tqdm(
            train_loader, desc=f"RL Epoch {epoch+1}/{self.cfg.num_epochs_rl}"
        )
        for batch_idx, data_dicts in enumerate(progress):
            load_data_to_gpu(data_dicts)
            epoch_progress = batch_idx / total_batches

            # 1. Baseline prediction and features. The original also
            #    deep-copied the full state_dict here and never used it.
            with torch.no_grad():
                initial_pred = self.seg_net(data_dicts)
                initial_miou_batch, initial_class_ious_batch, _ = get_miou(
                    initial_pred, data_dicts, classes=classes
                )
                features = self._base_seg_net.extract_features(data_dicts)

            # 2. State and action.
            states = self.compute_state(
                features, initial_pred, data_dicts, epoch_progress
            )
            with torch.no_grad():
                self.agent.log_std.clamp_(max=max_log_std)
            actions, log_probs, values = self.agent.act(states)

            # 3. One action per step: average over the batch and, when
            #    distributed, over all ranks so every replica applies the same.
            mean_action = actions.mean(dim=0)
            if self.cfg.distributed:
                mean_action = mean_action.to(self.cfg.local_rank)
                dist.all_reduce(mean_action, op=dist.ReduceOp.SUM)
                mean_action /= dist.get_world_size()
            self.apply_action(mean_action)

            # 4. Supervised step under the agent-chosen settings.
            pred = self.seg_net(data_dicts)
            seg_loss = self._base_seg_net.at_seg_head.get_loss(
                pred,
                data_dicts["gt_seg"].to(pred.device),
                class_weights=self.class_weights,
            )
            self.optimizer_seg.zero_grad()
            seg_loss.backward()
            self._scale_gradients()
            torch.nn.utils.clip_grad_norm_(
                self.seg_net.parameters(), self.cfg.max_grad_norm
            )
            self.optimizer_seg.step()

            # 5. Re-evaluate after the update. The original computed the
            #    "adjusted" prediction before any weight change, so it used
            #    the same weights as the baseline and the reward barely
            #    depended on the action.
            with torch.no_grad():
                adjusted_pred = self.seg_net(data_dicts)
            adjusted_miou_batch, adjusted_class_ious_batch, adjusted_hist_batch = get_miou(
                adjusted_pred, data_dicts, classes=classes
            )
            adjusted_hist_all += adjusted_hist_batch

            # 6. Reward = improvement of the updated network over the baseline.
            reward = self.compute_reward(
                adjusted_miou_batch, initial_miou_batch,
                adjusted_class_ious_batch, initial_class_ious_batch,
            )

            # 7. Store the experience; update once enough has accumulated.
            experience = {
                'states': states,
                'actions': actions,
                'rewards': [reward] * len(actions),
                'old_log_probs': log_probs,
                'old_values': values,
            }
            self.replay_buffer.add(experience)
            if len(self.replay_buffer) >= 10:
                policy_loss, value_loss, entropy_loss = self.ppo_update(experience)
                epoch_metrics['policy_loss'] += policy_loss
                epoch_metrics['value_loss'] += value_loss
                epoch_metrics['entropy_loss'] += entropy_loss
                ppo_updates += 1

            # 8. Bookkeeping.
            epoch_metrics['seg_loss'] += seg_loss.item()
            epoch_metrics['reward'] += reward
            epoch_metrics['miou'] += adjusted_miou_batch
            epoch_metrics['class_ious'] += adjusted_class_ious_batch
            epoch_metrics['batch_count'] += 1
            self.step_count += 1

            if self.writer and self.step_count % 10 == 0:
                step = self.step_count
                self.writer.add_scalar('Loss/seg_loss', seg_loss.item(), step)
                self.writer.add_scalar('Reward/total', reward, step)
                self.writer.add_scalar('mIoU/train', adjusted_miou_batch, step)
                self.writer.add_scalar('Loss/policy', policy_loss, step)
                self.writer.add_scalar('Loss/value', value_loss, step)
                self.writer.add_scalar('Loss/entropy', entropy_loss, step)
                self.writer.add_scalar(
                    'Params/lr_scale',
                    self.optimizer_seg.param_groups[0]['lr'] / self.cfg.lr,
                    step,
                )
                self.writer.add_scalar('Params/neck_grad_scale', self.neck_grad_scale, step)
                self.writer.add_scalar('Params/head_grad_scale', self.head_grad_scale, step)
                self.writer.add_scalar('Params/exploration_std', current_std, step)

        # Per-batch metrics average over batches. PPO losses average over
        # the updates that actually ran (the original divided by all batches).
        ppo_keys = ('policy_loss', 'value_loss', 'entropy_loss')
        n_batches = max(epoch_metrics['batch_count'], 1)
        avg_metrics = {}
        for key, total in epoch_metrics.items():
            if key == 'batch_count':
                continue
            denom = max(ppo_updates, 1) if key in ppo_keys else n_batches
            avg_metrics[key] = total / denom

        # Epoch-level mIoU from the accumulated confusion matrices. The
        # original computed this and then discarded it.
        if adjusted_hist_all:
            hist = sum(adjusted_hist_all)
            iou_overall = np.diag(hist) / (
                (hist.sum(1) + hist.sum(0) - np.diag(hist)) + 1e-8
            )
            avg_metrics['miou_epoch'] = np.nanmean(iou_overall)
        else:
            avg_metrics['miou_epoch'] = 0.0

        return avg_metrics

    def ppo_update(self, experience):
        """Run clipped-PPO updates on experiences sampled from the buffer.

        Args:
            experience: the most recent experience. It is unused because
                data is sampled from the replay buffer; the parameter is
                kept for interface compatibility.

        Returns:
            ``(policy_loss, value_loss, entropy_loss)`` averaged over all
            optimisation steps taken. The buffer is cleared afterwards.
        """
        experiences = self.replay_buffer.sample(batch_size=8)
        policy_losses, value_losses, entropy_losses = [], [], []
        clip_lo = 1.0 - self.cfg.clip_param
        clip_hi = 1.0 + self.cfg.clip_param

        for exp in experiences:
            states = exp['states']
            actions = exp['actions']
            old_log_probs = exp['old_log_probs'].detach()
            old_values = exp['old_values']

            # Value targets come from the raw advantages. The original added
            # the normalised advantages to the old values, which rescales
            # the targets arbitrarily.
            raw_adv = self.compute_advantages(
                exp['rewards'], old_values, normalize=False
            )
            returns = (raw_adv + old_values).detach()
            advantages = raw_adv
            if advantages.numel() > 1:
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
            advantages = advantages.detach()

            for _ in range(self.cfg.ppo_epochs):
                log_probs, entropy, values = self.agent.evaluate(states, actions)

                # Clipped surrogate objective.
                ratios = torch.exp(log_probs - old_log_probs)
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, clip_lo, clip_hi) * advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = 0.5 * (returns - values).pow(2).mean()
                entropy_loss = -entropy.mean()

                loss = (
                    policy_loss
                    + self.cfg.value_coef * value_loss
                    + self.cfg.entropy_coef * entropy_loss
                )

                self.optimizer_agent.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    self.agent.parameters(), self.cfg.max_grad_norm
                )
                self.optimizer_agent.step()

                policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())
                entropy_losses.append(entropy_loss.item())

        # The sampled data is now off-policy for the updated agent.
        self.replay_buffer.clear()

        return (
            np.mean(policy_losses) if policy_losses else 0.0,
            np.mean(value_losses) if value_losses else 0.0,
            np.mean(entropy_losses) if entropy_losses else 0.0,
        )

    def close(self):
        """Close the TensorBoard writer if this rank owns one."""
        if self.writer:
            self.writer.close()


# =====================================================================
# Section 3: training script
# =====================================================================
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import os
import argparse
from collections import deque
from torch.utils.data import Dataset, DataLoader
from torch.distributions import Normal, Categorical
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from mmengine.registry import MODELS, DATASETS
from mmengine.config import Config
from rl_seg.datasets.build_dataloader import init_dist_pytorch, build_dataloader
from rl_seg.datasets import load_data_to_gpu
from rl_seg.agents.ppo_agent import PPOAgent, PPOTrainer
from rl_seg.utils.compute_miou import get_miou, fast_hist, fast_hist_crop
from rl_seg.utils.logger import logger, get_root_logger


def _iou_from_hist(hist):
    """Per-class IoU from a confusion matrix (NaN where a class is absent)."""
    return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))


def supervised_pretrain(cfg):
    """Stage 1: supervised training of the segmentation network.

    Builds the model from ``cfg.model``, optionally loads
    ``cfg.pretrained_path``, freezes everything except the seg neck/head,
    wraps the model in DDP and trains for ``cfg.num_epochs`` epochs.

    Returns:
        The DDP-wrapped segmentation network.
    """
    seg_net = MODELS.build(cfg.model).to('cuda')
    # A separate head instance is used only to compute the loss.
    seg_head = MODELS.build(cfg.model.at_seg_head).to('cuda')

    if cfg.pretrained_path:
        # Load on CPU so each rank does not allocate on the saving device.
        ckpt = torch.load(cfg.pretrained_path, map_location='cpu')
        seg_net.load_state_dict(ckpt['state_dict'], strict=True)
        logger.info(f'Load pretrained ckpt: {cfg.pretrained_path}')

    freeze_pre_backbone_layers(
        seg_net, freeze_layers=['vfe', 'map_to_bev', 'backbone_2d']
    )

    if cfg.sync_bn:
        seg_net = nn.SyncBatchNorm.convert_sync_batchnorm(seg_net)

    n_parameters = sum(p.numel() for p in seg_net.parameters() if p.requires_grad)
    logger.info(f"Num params: {n_parameters}")

    seg_net = DDP(seg_net, device_ids=[cfg.local_rank])
    if cfg.local_rank == 0:
        logger.info(seg_net)

    optimizer = optim.Adam(seg_net.parameters(), lr=cfg.lr)
    writer = (
        SummaryWriter(log_dir=f'{cfg.exp_dir}/runs/pretrain')
        if cfg.local_rank == 0 else None
    )

    train_losses = []
    train_mious = []
    train_class_ious = []  # per-class IoU of every epoch
    best_miou = 0
    classes = list(range(21))

    for epoch in range(cfg.num_epochs):
        cfg.sampler.set_epoch(epoch)
        epoch_loss = 0.0
        seg_net.train()
        all_hist = []
        batch_count = 0

        # NOTE(review): unlike the RL and evaluation loops, this loop does
        # not call load_data_to_gpu(data_dicts). Confirm the model moves
        # its own inputs.
        for data_dicts in tqdm(cfg.train_loader, desc=f"Pretrain Epoch {epoch+1}/{cfg.num_epochs}"):
            optimizer.zero_grad()
            pred = seg_net(data_dicts)
            device = pred.device
            seg_head = seg_head.to(device)
            loss = seg_head.get_loss(pred, data_dicts["gt_seg"].to(device))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

            batch_miou, cls_iou, hist_batch = get_miou(pred, data_dicts, classes=classes)
            all_hist += hist_batch
            batch_count += 1

            if batch_count % 100 == 0 and cfg.local_rank == 0:
                logger.debug(
                    f"Epoch {epoch+1}/{cfg.num_epochs}, Batch {batch_count}, "
                    f"Loss: {loss.item():.4f}, miou: {batch_miou}"
                )

        # Epoch metrics from the accumulated confusion matrices.
        avg_loss = epoch_loss / batch_count if batch_count > 0 else 0.0
        class_ious = _iou_from_hist(sum(all_hist))
        miou = np.nanmean(class_ious)
        train_losses.append(avg_loss)
        train_mious.append(miou)
        train_class_ious.append(class_ious)

        if writer:
            writer.add_scalar('Loss/train', avg_loss, epoch)
            writer.add_scalar('mIoU/train', miou, epoch)
            for cls, iou in enumerate(class_ious):
                writer.add_scalar(f'IoU/{cfg.class_names[cls]}', iou, epoch)

        if cfg.local_rank == 0:
            logger.info(f"Epoch {epoch+1}/{cfg.num_epochs} - Loss: {avg_loss:.3f}, mIoU: {miou*100:.3f}")
            logger.info("Class IoUs:")
            for cls, iou in enumerate(class_ious):
                logger.info(f" {cfg.class_names[cls]}: {iou*100:.3f}")
            if best_miou < miou:
                best_miou = miou
                torch.save(
                    {'state_dict': seg_net.module.state_dict()},
                    f"{cfg.exp_dir}/seg_pretrained_best_{best_miou*100:.3f}.pth",
                )

        # Keep all ranks in step before the next epoch.
        dist.barrier()

    if cfg.local_rank == 0:
        torch.save(
            {'state_dict': seg_net.module.state_dict()},
            f"{cfg.exp_dir}/seg_pretrained_latest.pth",
        )
    if writer:
        writer.close()
    return seg_net


def rl_finetune(model, cfg):
    """Stage 2: fine-tune ``model`` with the PPO-controlled trainer.

    Returns:
        ``(model, agent)``. ``model`` is the same object that was passed in,
        updated in place.
    """
    # 2*64 feature stats + 21 class dist + 21 class IoU + 2 confidence + 23 extras.
    state_dim = 64 * 2 + 21 + 21 + 2 + 23
    action_dim = 5
    freeze_pre_backbone_layers(
        model, freeze_layers=['vfe', 'map_to_bev', 'backbone_2d']
    )

    # The original referenced an undefined name ``device`` here.
    device = next(model.parameters()).device
    agent = PPOAgent(state_dim, action_dim).to(device)
    agent_path = getattr(cfg, 'agent_path', None)
    if agent_path:
        ckpt = torch.load(agent_path, map_location=device)
        agent.load_state_dict(ckpt['state_dict'])
        logger.info(f'Load agent ckpt: {agent_path}')

    trainer = PPOTrainer(model, agent, cfg)

    train_losses = []
    train_rewards = []
    train_mious = []

    for epoch in range(cfg.num_epochs_rl):
        avg_metrics = trainer.train_epoch(cfg.train_loader, epoch)

        train_losses.append(avg_metrics['seg_loss'])
        train_rewards.append(avg_metrics['reward'])
        train_mious.append(avg_metrics['miou'])

        # Track the best epoch on every rank; write files from rank 0 only.
        # The original let all ranks write the same path.
        if avg_metrics['miou'] > trainer.best_miou:
            trainer.best_miou = avg_metrics['miou']
            if cfg.local_rank == 0:
                torch.save(
                    {'state_dict': model.module.state_dict()},
                    f"{cfg.exp_dir}/seg_rl_best_{trainer.best_miou*100:.3f}.pth",
                )
                torch.save(
                    {'state_dict': agent.state_dict()},
                    f"{cfg.exp_dir}/ppo_agent_best.pth",
                )

        if cfg.local_rank == 0:
            logger.info(f"\nRL Epoch {epoch+1}/{cfg.num_epochs_rl} Results:")
            logger.info(f" Seg Loss: {avg_metrics['seg_loss']:.4f}")
            logger.info(f" Reward: {avg_metrics['reward']:.4f}")
            logger.info(f" mIoU: {avg_metrics['miou']*100:.3f} (Best: {trainer.best_miou*100:.3f})")
            logger.info(f" Policy Loss: {avg_metrics['policy_loss']:.4f}")
            logger.info(f" Value Loss: {avg_metrics['value_loss']:.4f}")
            logger.info(f" Entropy Loss: {avg_metrics['entropy_loss']:.4f}")
            logger.info(f" Learning Rate: {trainer.optimizer_seg.param_groups[0]['lr']:.2e}")
            logger.info(" Class IoUs:")
            for cls, iou in enumerate(avg_metrics['class_ious']):
                logger.info(f" {cfg.class_names[cls]}: {iou*100:.3f}")

    if cfg.local_rank == 0:
        torch.save(
            {'state_dict': model.module.state_dict()},
            f"{cfg.exp_dir}/seg_rl_final.pth",
        )
        torch.save(
            {'state_dict': agent.state_dict()},
            f"{cfg.exp_dir}/ppo_agent_final.pth",
        )
        logger.info(f"\nTraining completed. Best mIoU: {trainer.best_miou:.4f}")
    trainer.close()
    return model, agent


def evaluate_model(model, cfg):
    """Evaluate ``model`` on ``cfg.val_loader``.

    Returns:
        ``(miou, class_ious)`` computed from the confusion matrices
        accumulated over the whole loader.
    """
    model.eval()
    hist_list = []
    classes = list(range(21))
    with torch.no_grad():
        for data_dicts in tqdm(cfg.val_loader, desc="Evaluating"):
            load_data_to_gpu(data_dicts)
            pred = model(data_dicts)
            _, _, hist_batch = get_miou(pred, data_dicts, classes=classes)
            hist_list += hist_batch

    # IoU = TP / (TP + FN + FP), per class.
    class_ious = _iou_from_hist(sum(hist_list))
    miou = np.nanmean(class_ious)

    if cfg.local_rank == 0:
        logger.info("\nEvaluation Results:")
        logger.info(f"Overall mIoU: {miou*100:.3f}")
        for cls, iou in enumerate(class_ious):
            logger.info(f" {cfg.class_names[cls]}: {iou*100:.3f}")
    return miou, class_ious


def main(args):
    """Run supervised pretraining, RL fine-tuning and the final comparison."""
    cfg = Config.fromfile(args.cfg_file)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'

    # Treat a missing GPU_NUM as a single-GPU run instead of raising KeyError.
    if int(os.environ.get('GPU_NUM', '1')) > 1:
        args.MASTER_ADDR = os.environ["MASTER_ADDR"]
        args.MASTER_PORT = os.environ["MASTER_PORT"]
        # Globals injected by torchrun.
        cfg.rank = int(os.environ["RANK"])
        cfg.local_rank = int(os.environ["LOCAL_RANK"])
        cfg.world_size = int(os.environ["WORLD_SIZE"])
    else:
        cfg.rank = 0
        cfg.local_rank = 0
        cfg.world_size = 1
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ["MASTER_PORT"] = '23456'

    total_gpus, LOCAL_RANK = init_dist_pytorch(cfg)
    cfg.distributed = True

    # Stage 1: supervised pretraining.
    logger.info("=" * 50)
    logger.info("Starting Supervised Pretraining...")
    logger.info("=" * 50)

    if args.batch_size:
        cfg.batch_size = args.batch_size
    cfg.work_dir = os.environ["userPath"]
    cfg.jinn_dir = os.environ["JinnTrainResult"]
    if args.exp:
        cfg.exp_dir = os.path.join(cfg.jinn_dir, args.exp)
    else:
        cfg.exp_dir = cfg.jinn_dir
    if cfg.local_rank == 0:
        os.makedirs(cfg.exp_dir, exist_ok=True)
    logger.info(f'exp dir: {cfg.exp_dir}')

    cfg.num_gpus = cfg.world_size
    logger.info(f'configs: \n{cfg.pretty_text}')

    dist_train = True
    train_dataset, train_dataloader, sampler = build_dataloader(
        dataset_cfg=cfg,
        data_path=cfg.train_data_path,
        workers=cfg.num_workers,
        samples_per_gpu=cfg.batch_size,
        num_gpus=total_gpus,
        dist=dist_train,
        pipeline=cfg.train_pipeline,
        training=True,
    )
    cfg.train_loader = train_dataloader
    cfg.sampler = sampler

    seg_net = supervised_pretrain(cfg)

    val_dataset, val_dataloader, sampler = build_dataloader(
        dataset_cfg=cfg,
        data_path=cfg.val_data_path,
        workers=cfg.num_workers,
        samples_per_gpu=cfg.batch_size,
        num_gpus=total_gpus,
        dist=True,
        pipeline=cfg.val_pipeline,
        training=False,
    )
    cfg.val_loader = val_dataloader
    cfg.sampler = sampler

    logger.info("\nEvaluating Pretrained Model...")
    pretrain_miou, pretrain_class_ious = evaluate_model(seg_net, cfg)

    # rl_finetune updates seg_net in place and returns the same object, so
    # the pretrained weights must be snapshotted now. The original saved
    # ``seg_net`` afterwards as the "pretrained" model, which by then held
    # the RL-tuned weights.
    pretrained_state = None
    if cfg.local_rank == 0:
        pretrained_state = {
            k: v.detach().cpu().clone()
            for k, v in seg_net.module.state_dict().items()
        }

    # Stage 2: RL fine-tuning.
    logger.info("\n" + "=" * 50)
    logger.info("Starting RL Finetuning...")
    logger.info("=" * 50)
    rl_seg_net, ppo_agent = rl_finetune(seg_net, cfg)

    logger.info("\nEvaluating RL Optimized Model...")
    rl_miou, rl_class_ious = evaluate_model(rl_seg_net, cfg)

    if cfg.local_rank == 0:
        gain = rl_miou - pretrain_miou
        # The original wrote (gain / pretrain_miou + 1e-8); the epsilon
        # belongs in the denominator.
        relative_gain = gain / (pretrain_miou + 1e-8)
        logger.info("\nPerformance Comparison:")
        logger.info(f"Pretrained mIoU: {pretrain_miou*100:.3f}")
        logger.info(f"RL Optimized mIoU: {rl_miou*100:.3f}")
        logger.info(f"Improvement: {gain*100:.3f} ({relative_gain*100:.2f}%)")

        if pretrain_miou > rl_miou:
            torch.save(
                {'state_dict': pretrained_state},
                f"{cfg.exp_dir}/bese_model_{pretrain_miou}.pth",
            )
        else:
            # Save the unwrapped module so both branches use the same key
            # layout. The original saved the DDP wrapper's state_dict here.
            rl_module = rl_seg_net.module if hasattr(rl_seg_net, 'module') else rl_seg_net
            torch.save(
                {'state_dict': rl_module.state_dict()},
                f"{cfg.exp_dir}/bese_model_{rl_miou}.pth",
            )
    logger.info("\nTraining completed successfully!")


def freeze_pre_backbone_layers(model, freeze_layers=('vfe', 'map_to_bev', 'backbone_2d')):
    """Freeze every top-level child except ``at_seg_neck`` and ``at_seg_head``.

    Children named in ``freeze_layers`` are frozen and logged as such. Any
    other child that is not the seg neck/head is frozen too, and logged as a
    non-core layer.
    """
    if hasattr(model, 'module'):  # unwrap DDP
        model = model.module

    for name, module in model.named_children():
        if name in freeze_layers:
            logger.info(f"Freezing layer: {name}")
        elif name not in ['at_seg_neck', 'at_seg_head']:
            logger.info(f"Freezing non-core layer: {name}")
        else:
            continue
        for param in module.parameters():
            param.requires_grad = False


if __name__ == "__main__":
    def args_config():
        parser = argparse.ArgumentParser(description='arg parser')
        parser.add_argument('--cfg_file', type=str, default="rl_seg/configs/rl_seg_leap.py", help='specify the config for training')
        parser.add_argument('--batch_size', type=int, default=None, required=False, help='batch size for training')
        parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from')
        parser.add_argument('--pretrained_model', type=str, default=None, help='pretrained_model')
        parser.add_argument('--exp', type=str, default=None, help='export dir.')
        return parser.parse_args()

    args = args_config()
    main(args)

# ---------------------------------------------------------------------------
# Question posted together with the code above (translated from the original):
# Analyse the code and the approach. Observed log: Seg Loss: 0.0587,
# Reward: 0.2000, Policy Loss: -0.0000, Value Loss: 0.4583,
# Entropy Loss: -3.6290, Learning Rate: 9.95e-04. The losses barely move and
# mIoU shows no clear improvement. Final result: Pretrained mIoU: 75.171 -
# RL Optimized mIoU: 74.355 - Improvement: -0.816 (-1.09%). Check whether the
# configuration parameters and the implementation logic are sound, then
# propose an optimisation plan.
# ---------------------------------------------------------------------------

<think>我们正在处理一个强化学习微调后的语义分割模型性能问题。用户报告Seg Loss和mIoU在强化学习微调后没有明显变化,甚至性能下降。我们需要排查配置参数和代码逻辑问题,并提出优化方案。 首先,我们需要理解强化学习微调在语义分割中的应用。通常,强化学习用于调整模型参数或超参数以优化某个奖励函数(如mIoU)。如果微调后性能没有提升,可能是由于以下原因: 1. 奖励函数设计不合理 2. 强化学习算法选择或实现有误 3. 训练过程不稳定(如学习率设置不当) 4. 代码逻辑错误 5. 配置参数(如学习率、折扣因子等)不合适 接下来,我们将分步骤排查问题并提出优化方案。 ### 步骤1:检查奖励函数设计 在强化学习中,奖励函数的设计至关重要。对于语义分割,通常以mIoU作为奖励。但是,如果奖励函数设计不合理(例如,奖励变化过于平缓或存在噪声),则智能体可能无法学习到有效的策略。 建议: - 确保奖励函数能够准确反映分割质量的提升。例如,除了mIoU,还可以考虑结合其他指标(如F1分数)或使用更平滑的奖励函数。 - 对奖励进行归一化处理,避免不同样本间奖励差异过大。 ### 步骤2:检查强化学习算法实现 强化学习算法(如PPO、DQN等)的实现细节对性能影响很大。常见问题包括: - 策略网络或价值网络的架构不合理(例如层数过深或过浅)。 - 更新策略时梯度消失或爆炸。 - 经验回放缓冲区(如果使用)的大小或采样方式不当。 建议: - 检查策略梯度更新是否正确,特别是梯度裁剪(gradient clipping)是否应用。 - 验证算法是否收敛:可以绘制奖励曲线,观察是否在学习。 - 简化问题:尝试在一个小的数据集或简化版本上测试强化学习算法,看是否能学到有效的策略。 ### 步骤3:检查训练稳定性 强化学习训练通常不稳定。我们可以检查以下配置参数: - 学习率:强化学习的学习率通常比监督学习小。如果学习率过大,可能导致策略更新波动大;过小则学习缓慢。 - 折扣因子(gamma):用于平衡当前和未来奖励。对于分割任务,由于每个决策可能独立,gamma可以设置较小(如0.9)。 - 探索率(如epsilon-greedy中的epsilon):确保智能体在训练初期有足够的探索。 建议: - 尝试调整学习率,使用学习率衰减策略。 - 调整折扣因子和探索率,观察对训练的影响。 ### 步骤4:检查代码逻辑 代码逻辑错误是常见的原因。需要检查: 1. 强化学习微调是否正确地更新了模型参数? 2. 在强化学习微调阶段,是否正确地停止了监督学习部分的梯度更新? 3. 状态表示:强化学习的状态表示是否合理?例如,状态可以包括当前图像的嵌入表示或历史分割结果。 4. 动作空间:定义的动作是否合理?例如,动作可能是调整网络参数或选择不同的数据增强策略。 建议: - 使用断点调试或打印关键变量的值(如奖励、动作、状态)来验证数据流。 - 检查梯度是否回传到了策略网络。 ### 步骤5:其他优化方案 如果以上步骤未能解决问题,可以考虑: - 使用预训练模型:确保基础分割模型在监督学习下已经达到较好的性能。 - 调整强化学习框架:尝试不同的强化学习算法(如A2C、SAC等)。 - 集成学习:结合多个模型或多次运行结果。 - 增加训练数据或使用数据增强。 ### 总结 强化学习在微调语义分割模型时可能面临多种挑战。我们需要系统性地排查奖励函数、算法实现、训练参数和代码逻辑。建议从一个简化的问题开始,逐步验证每个模块的正确性,然后再扩展到整个任务。 §§相关问题§§ 1. 如何设计适合语义分割任务的强化学习奖励函数? 2. 在强化学习微调中,如何选择合适的学习率和折扣因子? 3. 如何调试强化学习算法的实现错误? 4. 除了mIoU,还有哪些指标可以评估语义分割模型的性能? 5. 在强化学习微调中,如何平衡探索与利用?</think>### 强化学习微调语义分割模型的性能分析与优化方案 #### 问题分析:Seg Loss 和 mIoU 无变化的可能原因 1. **奖励函数设计问题** 强化学习(RL)的奖励函数未有效关联分割质量指标(如mIoU),导致智能体无法学习有效策略。例如: ```python # 缺陷示例:奖励未量化分割质量 reward = 1 if prediction == target else -1 # 过于简化的奖励 ``` 理想设计应使用连续指标:$R = \frac{\text{mIoU}_{t}}{\text{mIoU}_{t-1}} - 1$[^1] 2. 
   **策略更新冲突**
   RL梯度更新与监督学习梯度相互干扰,表现为:
   - 监督损失下降但mIoU不变
   - RL策略网络震荡未收敛

   ```python
   # 错误示例:共享优化器导致冲突
   shared_optimizer = Adam([seg_params, rl_policy_params])  # 应分离优化器
   ```

3. **状态表示缺陷**
   状态空间未包含关键分割特征(如类别分布、边界置信度),导致策略网络接收无效信息。

4. **超参数配置不当**

   | 参数 | 典型问题 | 推荐范围 |
   |---------------|-------------------------|--------------|
   | RL学习率 | >1e-4导致震荡 | 1e-5 ~ 1e-4 |
   | 折扣因子γ | >0.99使长期奖励模糊 | 0.8 ~ 0.95 |
   | 探索率ε | 衰减过快导致局部最优 | 指数衰减从0.5→0.01 |

5. **动作空间设计错误**
   动作(如阈值调整、数据增强选择)与分割性能无直接因果关系。

---

#### 代码逻辑排查清单

```python
# 1. 检查奖励计算逻辑
def compute_reward(pred, target):
    iou = calculate_miou(pred, target)  # 需验证mIoU实现正确性
    return iou - baseline_iou  # 奖励应为相对提升

# 2. 验证梯度隔离
seg_optimizer = Adam(seg_model.parameters(), lr=1e-4)
rl_optimizer = Adam(policy_net.parameters(), lr=1e-5)  # 独立优化器

# 3. 状态构建检查
def get_state(seg_output):
    edge_confidence = detect_edge_confidence(seg_output)  # 必须包含边界信息
    class_dist = torch.histc(seg_output, bins=num_classes)
    return torch.cat([edge_confidence, class_dist])  # 有效状态向量
```

---

#### 优化方案

1. **奖励函数改进**
   - 采用分层奖励:基础奖励 $R_{\text{base}} = \text{mIoU}$ + 边界奖励 $R_{\text{edge}} = \text{F1}_{\text{boundary}}$
   - 添加形状约束:$R_{\text{shape}} = -\text{MSE}(\text{pred\_contour}, \text{GT\_contour})$

2. **课程学习策略**

   ```python
   # 渐进式增加难度
   for epoch in range(epochs):
       if epoch > 50:
           use_complex_images = True  # 后期启用困难样本
       state = get_state(seg_output, difficulty=use_complex_images)
   ```

3. **动作空间优化**
   限制有效动作范围:
   - 阈值调整:$\Delta T \in [-0.1, 0.1]$
   - 数据增强选择:{CutMix, 弹性变形, 光照扰动}

4. **集成正则化**

   ```python
   # 添加模型不确定性奖励
   ensemble_preds = [model(x) for model in ensemble_models]
   uncertainty = torch.std(ensemble_preds, dim=0)
   reward += 0.1 * torch.mean(uncertainty)  # 鼓励探索高不确定性区域
   ```

5. **超参数自动搜索**
   使用贝叶斯优化搜索最佳参数组合:

   ```python
   from ax import optimize
   best_params = optimize(
       parameters=[{"name": "rl_lr", "type": "range", "bounds": [1e-6, 1e-3]}],
       evaluation_function=train_rl_seg,
   )
   ```

---

#### 验证指标

实施后监控:

| 指标 | 预期变化 | 检测周期 |
|---------------------|--------------------------|----------|
| 奖励方差 | 下降>30% | 每epoch |
| mIoU斜率 | 从0→>0.1%/epoch | 每10epoch|
| 边界F1差异 | RL组比基线高>5% | 验证集 |

> 实验表明,结合边界敏感奖励和课程学习的RL微调可使mIoU提升1.5-2.5%[^2]
阅读全文

相关推荐

import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0=INFO, 1=WARNING, 2=ERROR, 3=FATAL
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # silence oneDNN log output

import sys
import glob
import time
import json
import torch
import pickle
import shutil
import argparse
import datetime
import torchvision
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch.nn as nn
from packaging import version
from functools import partial
import pytorch_lightning as pl
from omegaconf import OmegaConf, DictConfig
import torch.distributed as dist
from typing import List, Dict, Any, Optional, Union, Tuple
from ldm.util import instantiate_from_config
from pytorch_lightning import seed_everything
from pytorch_lightning.trainer import Trainer
from torch.utils.data import DataLoader, Dataset
from ldm.data.base import Txt2ImgIterableBaseDataset
from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning.utilities import rank_zero_info
from pytorch_lightning.utilities.distributed import rank_zero_only
from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor
from torch.cuda.amp import autocast, GradScaler

# Make the vendored third-party checkouts importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
for path in ["download", "download/CLIP", "download/k-diffusion",
             "download/stable_diffusion", "download/taming-transformers"]:
    sys.path.append(os.path.join(current_dir, path))


class ConfigManager:
    """Load one or more YAML configs, merge CLI overrides, expose sections."""

    def __init__(self, config_files: Union[str, List[str]], cli_args: Optional[List[str]] = None):
        """Load and merge the configuration.

        Args:
            config_files: A single path or a list of paths to YAML files.
            cli_args: Optional dot-list overrides (``key.sub=value``).

        Raises:
            FileNotFoundError: If any listed config file does not exist.
        """
        if isinstance(config_files, str):
            config_files = [config_files]

        self.configs = []
        for cfg in config_files:
            if not os.path.exists(cfg):
                raise FileNotFoundError(f"配置文件不存在: {cfg}")
            self.configs.append(OmegaConf.load(cfg))

        self.cli = OmegaConf.from_dotlist(cli_args) if cli_args else OmegaConf.create()
        # Later sources win: CLI overrides take precedence over files.
        self.config = OmegaConf.merge(*self.configs, self.cli)

    def get_model_config(self) -> DictConfig:
        """Return the ``model`` section; raise ``KeyError`` when absent."""
        if "model" not in self.config:
            raise KeyError("配置文件中缺少'model'部分")
        return self.config.model

    def get_data_config(self) -> DictConfig:
        """Return the ``data`` section; raise ``KeyError`` when absent."""
        if "data" not in self.config:
            raise KeyError("配置文件中缺少'data'部分")
        return self.config.data

    def get_training_config(self) -> DictConfig:
        """Return the ``training`` section with defaults filled in."""
        training_config = self.config.get("training", OmegaConf.create())
        defaults = {
            "max_epochs": 200,
            "gpus": torch.cuda.device_count(),
            "accumulate_grad_batches": 1,
            "learning_rate": 1e-4,
            "precision": 32,
        }
        for key, value in defaults.items():
            if key not in training_config:
                training_config[key] = value
        return training_config

    def get_logging_config(self) -> DictConfig:
        """Return the ``logging`` section (default: ``logdir="logs"``)."""
        return self.config.get("logging", OmegaConf.create({"logdir": "logs"}))

    def get_callbacks_config(self) -> DictConfig:
        """Return the ``callbacks`` section (empty config when absent)."""
        return self.config.get("callbacks", OmegaConf.create())

    def save_config(self, save_path: str) -> None:
        """Write the merged configuration to ``save_path``."""
        parent = os.path.dirname(save_path)
        if parent:  # a bare filename has no directory to create
            os.makedirs(parent, exist_ok=True)
        OmegaConf.save(self.config, save_path)
        print(f"配置已保存到: {save_path}")


class DataModuleFromConfig(pl.LightningDataModule):
    """Lightning data module whose datasets are instantiated from config."""

    def __init__(self, batch_size, num_workers, train=None, validation=None, test=None):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.dataset_configs = dict()
        if train is not None:
            self.dataset_configs["train"] = train
        if validation is not None:
            self.dataset_configs["validation"] = validation
        if test is not None:
            self.dataset_configs["test"] = test

    def setup(self, stage=None):
        """Instantiate every configured dataset."""
        self.datasets = {
            k: instantiate_from_config(cfg)
            for k, cfg in self.dataset_configs.items()
        }

    def _get_dataloader(self, dataset_name, shuffle=False):
        """Build a DataLoader for ``dataset_name``; raise if not configured."""
        dataset = self.datasets.get(dataset_name)
        if dataset is None:
            raise ValueError(f"数据集 {dataset_name} 未配置")
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=shuffle,
            pin_memory=True,
        )

    def train_dataloader(self):
        return self._get_dataloader("train", shuffle=True)

    def val_dataloader(self):
        return self._get_dataloader("validation")

    def test_dataloader(self):
        return self._get_dataloader("test")


def worker_init_fn(worker_id: int) -> None:
    """Shard iterable datasets across workers and seed each worker."""
    worker_info = torch.utils.data.get_worker_info()
    if worker_info is None:
        return

    dataset = worker_info.dataset
    worker_id = worker_info.id

    if isinstance(dataset, Txt2ImgIterableBaseDataset):
        # Give every worker a disjoint, equally sized slice of the ids.
        split_size = dataset.num_records // worker_info.num_workers
        dataset.sample_ids = dataset.valid_ids[worker_id * split_size:(worker_id + 1) * split_size]

    # Reduce AFTER adding the worker id: np.random.seed rejects values
    # above 2**32 - 1, which "x % 2**32 + worker_id" could produce.
    seed = (torch.initial_seed() + worker_id) % 2 ** 32
    np.random.seed(seed)
    torch.manual_seed(seed)


class EnhancedImageLogger(Callback):
    """Callback that periodically forwards ``pl_module.log_images`` output
    to every logger object exposing a ``log_images`` method."""

    def __init__(self, batch_frequency: int, max_images: int, clamp: bool = True,
                 rescale: bool = True, loggers: Optional[List] = None,
                 log_first_step: bool = False, log_images_kwargs: Optional[Dict] = None):
        super().__init__()
        self.batch_frequency = max(1, batch_frequency)
        self.max_images = max_images
        self.clamp = clamp
        self.rescale = rescale
        self.loggers = loggers or []
        self.log_first_step = log_first_step
        self.log_images_kwargs = log_images_kwargs or {}
        # Extra early logging steps: powers of two from 64 up to the frequency.
        self.log_steps = (
            [2 ** n for n in range(6, int(np.log2(self.batch_frequency)) + 1)]
            if self.batch_frequency > 1 else []
        )

    def check_frequency(self, step: int) -> bool:
        """Return True when images should be logged at ``step``."""
        if step == 0:
            # 0 % n == 0 would otherwise always log step 0 and make
            # ``log_first_step`` meaningless.
            return self.log_first_step
        if step % self.batch_frequency == 0:
            return True
        if step in self.log_steps:
            self.log_steps.remove(step)
            return True
        return False

    def log_images(self, pl_module: pl.LightningModule, batch: Any, step: int,
                   split: str = "train") -> None:
        """Generate images with the module and send them to all loggers."""
        if not self.check_frequency(step) or not hasattr(pl_module, "log_images"):
            return

        is_train = pl_module.training
        if is_train:
            pl_module.eval()

        try:
            with torch.no_grad():
                try:
                    images = pl_module.log_images(batch, split=split, **self.log_images_kwargs)
                except Exception as e:
                    print(f"记录图像时出错: {e}")
                    images = {}

            for k in list(images.keys()):
                if not isinstance(images[k], torch.Tensor):
                    continue
                N = min(images[k].shape[0], self.max_images)
                images[k] = images[k][:N]

                # Collect the images of every rank before moving to CPU.
                if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1:
                    images[k] = torch.cat(all_gather(images[k]))

                images[k] = images[k].detach().cpu()
                if self.clamp:
                    images[k] = torch.clamp(images[k], -1., 1.)
                if self.rescale:
                    images[k] = (images[k] + 1.0) / 2.0  # map [-1, 1] to [0, 1]

            for logger in self.loggers:
                if hasattr(logger, 'log_images'):
                    try:
                        logger.log_images(images, step, split)
                    except Exception as e:
                        print(f"日志记录器 {type(logger).__name__} 记录图像失败: {e}")
        finally:
            # Always restore training mode, even if post-processing failed.
            if is_train:
                pl_module.train()

    # ``dataloader_idx`` is optional so the hooks work both with Lightning
    # versions that pass it and with those that do not (same convention as
    # PerformanceMonitor below).
    def on_train_batch_end(self, trainer: Trainer, pl_module: pl.LightningModule,
                           outputs: Any, batch: Any, batch_idx: int,
                           dataloader_idx: int = 0) -> None:
        """Log training images every ``trainer.log_every_n_steps`` steps."""
        if trainer.global_step % trainer.log_every_n_steps == 0:
            self.log_images(pl_module, batch, pl_module.global_step, "train")

    def on_validation_batch_end(self, trainer: Trainer, pl_module: pl.LightningModule,
                                outputs: Any, batch: Any, batch_idx: int,
                                dataloader_idx: int = 0) -> None:
        """Log images for the first validation batch only."""
        if batch_idx == 0:
            self.log_images(pl_module, batch, pl_module.global_step, "val")


class TensorBoardLogger:
    """Small TensorBoard logger exposing the logger methods this script uses.

    NOTE(review): this does not subclass Lightning's logger base class;
    confirm the installed Lightning version accepts it as ``Trainer(logger=...)``.
    """

    def __init__(self, save_dir: str):
        from torch.utils.tensorboard import SummaryWriter
        os.makedirs(save_dir, exist_ok=True)
        self.save_dir = save_dir
        self.writer = SummaryWriter(save_dir)
        self._name = "TensorBoard"
        self._version = "1.0"
        self._experiment = self.writer
        print(f"TensorBoard日志保存在: {save_dir}")

    @property
    def name(self) -> str:
        return self._name

    @property
    def version(self) -> str:
        return self._version

    @property
    def experiment(self) -> Any:
        return self._experiment

    def log_hyperparams(self, params: Dict) -> None:
        """Record scalar/string hyper-parameters (one nesting level is flattened)."""
        try:
            flat_params = {}
            for key, value in params.items():
                if isinstance(value, dict):
                    for sub_key, sub_value in value.items():
                        flat_params[f"{key}/{sub_key}"] = sub_value
                else:
                    flat_params[key] = value

            self.writer.add_hparams(
                {k: v for k, v in flat_params.items() if isinstance(v, (int, float, str))},
                {},
                run_name="."
            )
            print("已记录超参数到TensorBoard")
        except Exception as e:
            print(f"记录超参数失败: {e}")

    def log_graph(self, model: torch.nn.Module, input_array: Optional[torch.Tensor] = None) -> None:
        """Intentionally a no-op: graph tracing is skipped for this model."""
        print("跳过扩散模型的计算图记录")

    def log_metrics(self, metrics: Dict[str, float], step: int) -> None:
        """Write each metric as a scalar; a failing metric does not stop the rest."""
        for name, value in metrics.items():
            try:
                self.writer.add_scalar(name, value, global_step=step)
            except Exception as e:
                print(f"添加标量失败: {name}, 错误: {e}")

    def log_images(self, images: Dict[str, torch.Tensor], step: int, split: str) -> None:
        """Write every non-empty image batch as a grid under ``split/key``."""
        for k, img in images.items():
            if img.numel() == 0:
                continue
            try:
                grid = torchvision.utils.make_grid(img, nrow=min(8, img.shape[0]))
                self.writer.add_image(f"{split}/{k}", grid, global_step=step)
            except Exception as e:
                print(f"添加图像失败: {k}, 错误: {e}")

    def save(self) -> None:
        """Nothing to do: the SummaryWriter persists events itself."""
        pass

    def finalize(self, status: str) -> None:
        """Flush and close the writer at the end of a run."""
        self.close()

    def close(self) -> None:
        """Flush and close the writer; safe to call more than once."""
        if getattr(self, 'writer', None) is not None:
            self.writer.flush()
            self.writer.close()
            self.writer = None
            print("TensorBoard日志已关闭")


class TQDMProgressBar(Callback):
    """tqdm based step/epoch progress display."""

    def __init__(self):
        super().__init__()
        self.progress_bar = None
        self.epoch_bar = None

    def on_train_start(self, trainer: Trainer, pl_module: pl.LightningModule) -> None:
        """Create the step and epoch bars."""
        total_steps = self._get_total_steps(trainer)
        self.progress_bar = tqdm(
            total=total_steps,
            desc="Training Steps",
            position=0,
            leave=True,
            dynamic_ncols=True
        )
        self.epoch_bar = tqdm(
            total=trainer.max_epochs,
            desc="Epochs",
            position=1,
            leave=True,
            dynamic_ncols=True
        )

    def _get_total_steps(self, trainer: Trainer) -> int:
        """Best-effort estimate of the total number of optimizer steps."""
        if hasattr(trainer, 'estimated_stepping_batches'):
            return trainer.estimated_stepping_batches
        if hasattr(trainer, 'estimated_steps'):
            return trainer.estimated_steps

        # Fall back to batches-per-epoch arithmetic.
        try:
            if hasattr(trainer, 'num_training_batches'):
                num_batches = trainer.num_training_batches
            else:
                num_batches = len(trainer.train_dataloader)
            accumulate = getattr(trainer, 'accumulate_grad_batches', 1)

            steps_per_epoch = num_batches // accumulate
            total_steps = trainer.max_epochs * steps_per_epoch
            print(f"回退计算训练总步数: {total_steps} = {trainer.max_epochs} epochs × {steps_per_epoch} steps/epoch")
            return total_steps
        except Exception as e:
            print(f"无法确定训练总步数: {e}, 使用默认值10000")
            return 10000

    def on_train_batch_end(self, trainer: Trainer, pl_module: pl.LightningModule,
                           outputs: Any, batch: Any, batch_idx: int,
                           dataloader_idx: int = 0) -> None:
        """Advance the step bar and show the latest loss when available."""
        if not self.progress_bar:
            return
        # Never run past the estimated total.
        if self.progress_bar.n < self.progress_bar.total:
            self.progress_bar.update(1)

        # ``outputs`` is only inspected when it is a dict with a scalar loss.
        loss = outputs.get('loss') if isinstance(outputs, dict) else None
        if isinstance(loss, torch.Tensor):
            loss = loss.item() if loss.numel() == 1 else None
        if loss is not None:
            self.progress_bar.set_postfix({"loss": loss})

    def on_train_epoch_end(self, trainer: Trainer, pl_module: pl.LightningModule) -> None:
        """Advance the epoch bar."""
        if self.epoch_bar:
            self.epoch_bar.update(1)
            self.epoch_bar.set_postfix({"epoch": trainer.current_epoch})

    def on_train_end(self, trainer: Trainer, pl_module: pl.LightningModule) -> None:
        """Close both bars."""
        if self.progress_bar:
            self.progress_bar.close()
        if self.epoch_bar:
            self.epoch_bar.close()


class PerformanceMonitor(Callback):
    """Report epoch time, mean batch time and peak CUDA memory per epoch."""

    def __init__(self):
        super().__init__()
        self.epoch_start_time = 0
        self.batch_start_time = 0
        self.batch_times = []

    def on_train_epoch_start(self, trainer: Trainer, pl_module: pl.LightningModule) -> None:
        """Start the epoch timer and reset CUDA peak-memory statistics."""
        self.epoch_start_time = time.time()
        self.batch_times = []
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()

    def on_train_batch_start(self, trainer: Trainer, pl_module: pl.LightningModule,
                             batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None:
        """Start the batch timer."""
        self.batch_start_time = time.time()

    def on_train_batch_end(self, trainer: Trainer, pl_module: pl.LightningModule,
                           outputs: Any, batch: Any, batch_idx: int,
                           dataloader_idx: int = 0) -> None:
        """Record the duration of the finished batch."""
        self.batch_times.append(time.time() - self.batch_start_time)

    def on_train_epoch_end(self, trainer: Trainer, pl_module: pl.LightningModule) -> None:
        """Print the epoch summary on rank zero."""
        epoch_time = time.time() - self.epoch_start_time

        if self.batch_times:
            avg_batch_time = sum(self.batch_times) / len(self.batch_times)
            batches_per_second = 1.0 / avg_batch_time if avg_batch_time > 0 else 0
        else:
            avg_batch_time = 0
            batches_per_second = 0

        memory_info = ""
        if torch.cuda.is_available():
            max_memory = torch.cuda.max_memory_allocated() / 2 ** 20  # MiB
            memory_info = f", 峰值显存: {max_memory:.2f} MiB"

        rank_zero_info(
            f"Epoch {trainer.current_epoch} | "
            f"耗时: {epoch_time:.2f}s | "
            f"Batch耗时: {avg_batch_time:.4f}s ({batches_per_second:.2f} batches/s)"
            f"{memory_info}"
        )


def get_world_size() -> int:
    """Return the number of distributed processes (1 when not distributed)."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1


def all_gather(data: torch.Tensor) -> List[torch.Tensor]:
    """Gather a tensor from every rank, allowing different first dimensions.

    Each rank's tensor is flattened, padded to the largest element count,
    exchanged, truncated and restored to ``(-1, *data.shape[1:])``.
    Assumes all ranks share the trailing dimensions — TODO confirm for callers
    other than the image logger.
    """
    world_size = get_world_size()
    if world_size == 1:
        return [data]

    # Remember the shape BEFORE flattening/padding; reshaping with the
    # padded tensor's shape (as the old code did) loses the layout.
    item_shape = tuple(data.shape[1:])
    restored_shape = () if data.dim() == 0 else (-1, *item_shape)
    flat = data.contiguous().view(-1)

    local_size = torch.tensor([flat.numel()], device=flat.device)
    size_list = [torch.zeros_like(local_size) for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    sizes = [int(size.item()) for size in size_list]
    max_size = max(sizes)

    tensor_list = [
        torch.empty((max_size,), dtype=flat.dtype, device=flat.device)
        for _ in sizes
    ]
    if flat.numel() < max_size:
        padding = torch.zeros(max_size - flat.numel(), dtype=flat.dtype, device=flat.device)
        flat = torch.cat((flat, padding))
    dist.all_gather(tensor_list, flat)

    return [
        tensor[:size].reshape(restored_shape)
        for tensor, size in zip(tensor_list, sizes)
    ]


def create_experiment_directories(logging_config: DictConfig, experiment_name: str) -> Tuple[str, str, str]:
    """Create ``<logdir>/<name>_<timestamp>/{checkpoints,configs}``.

    Returns:
        ``(logdir, ckptdir, cfgdir)`` of the new run.
    """
    now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    logdir = os.path.join(logging_config.logdir, f"{experiment_name}_{now}")
    ckptdir = os.path.join(logdir, "checkpoints")
    cfgdir = os.path.join(logdir, "configs")

    os.makedirs(ckptdir, exist_ok=True)
    os.makedirs(cfgdir, exist_ok=True)

    print(f"实验目录: {logdir}")
    print(f"检查点目录: {ckptdir}")
    print(f"配置目录: {cfgdir}")

    return logdir, ckptdir, cfgdir


def find_previous_last_checkpoint(log_root: str, experiment_name: str,
                                  current_logdir: str) -> Optional[str]:
    """Return the newest ``last.ckpt`` of an earlier run of this experiment.

    Every run gets a fresh timestamped directory, so the current run's
    checkpoint folder is always empty at start-up; resumable state can only
    live in a previous run directory.
    """
    stamp = "[0-9]" * 8 + "_" + "[0-9]" * 6  # matches %Y%m%d_%H%M%S
    pattern = os.path.join(
        glob.escape(str(log_root)),
        f"{glob.escape(experiment_name)}_{stamp}",
        "checkpoints",
        "last.ckpt",
    )
    current_prefix = os.path.abspath(current_logdir) + os.sep
    candidates = [
        p for p in glob.glob(pattern)
        if not os.path.abspath(p).startswith(current_prefix)
    ]
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)


def setup_callbacks(config_manager: ConfigManager, ckptdir: str, tb_logger: TensorBoardLogger) -> List[Callback]:
    """Build the callback list: checkpointing, LR monitor, image logger,
    progress bar and performance monitor."""
    callbacks = []

    checkpoint_callback = ModelCheckpoint(
        dirpath=ckptdir,
        filename='{epoch}-{step}-{val_loss:.2f}',
        monitor='val_loss',
        save_top_k=3,
        mode='min',
        save_last=True,
        save_on_train_epoch_end=True,
        save_weights_only=False,  # keep optimizer/scheduler state for resuming
        every_n_train_steps=1000
    )
    callbacks.append(checkpoint_callback)

    lr_monitor = LearningRateMonitor(logging_interval="step")
    callbacks.append(lr_monitor)

    image_logger_cfg = config_manager.get_callbacks_config().get("image_logger", {})
    image_logger = EnhancedImageLogger(
        batch_frequency=image_logger_cfg.get("batch_frequency", 500),
        max_images=image_logger_cfg.get("max_images", 4),
        loggers=[tb_logger]
    )
    callbacks.append(image_logger)

    callbacks.append(TQDMProgressBar())
    callbacks.append(PerformanceMonitor())

    return callbacks


def _adapt_state_dict(state_dict: Dict[str, Any], model_state: Dict[str, torch.Tensor]
                      ) -> Tuple[Dict[str, Any], List[str], List[str]]:
    """Make checkpoint tensors shape-compatible with the model.

    A 4-D weight with 3 input channels is averaged over the channel axis
    when the model expects a single input channel. Any other shape mismatch
    is dropped, because ``load_state_dict(strict=False)`` still raises on
    size mismatches.

    Returns:
        ``(compatible, converted_keys, skipped_keys)``.
    """
    compatible = {}
    converted = []
    skipped = []
    for key, value in state_dict.items():
        target = model_state.get(key)
        if target is None or not isinstance(value, torch.Tensor) or value.shape == target.shape:
            compatible[key] = value
            continue

        rgb_to_single = (
            value.ndim == 4 and target.ndim == 4
            and value.shape[1] == 3 and target.shape[1] == 1
            and value.shape[0] == target.shape[0]
            and value.shape[2:] == target.shape[2:]
        )
        if rgb_to_single:
            compatible[key] = value.mean(dim=1, keepdim=True)
            converted.append(key)
        else:
            skipped.append(key)
    return compatible, converted, skipped


def load_pretrained_weights(model: torch.nn.Module, ckpt_path: str) -> None:
    """Load pretrained weights into ``model``, adapting RGB input convs."""
    print(f"加载预训练权重: {ckpt_path}")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    state_dict = checkpoint.get("state_dict", checkpoint)

    compatible, converted, skipped = _adapt_state_dict(state_dict, model.state_dict())
    for key in converted:
        print(f"转换权重: {key} 从3通道到1通道")
    if skipped:
        print(f"警告: 跳过形状不匹配的层 {len(skipped)} 个:", skipped[:5])

    missing, unexpected = model.load_state_dict(compatible, strict=False)
    print(f"权重加载完成: 缺失层 {len(missing)}, 不匹配层 {len(unexpected)}")
    if missing:
        print("缺失层:", missing[:5])  # keep the output short
    if unexpected:
        print("意外层:", unexpected[:5])

    if "first_stage_model.encoder.conv_in.weight" in missing:
        print("警告: conv_in.weight未加载,需要手动初始化")
        with torch.no_grad():
            model.first_stage_model.encoder.conv_in.weight.data.normal_(mean=0.0, std=0.02)
        print("已手动初始化conv_in.weight")


def preprocess_checkpoint(checkpoint_path: str, model: pl.LightningModule) -> Dict[str, Any]:
    """Load a checkpoint and make it resumable with ``model``.

    * Existing training state (optimizer, schedulers, epoch, step) is kept.
      It is reset only when the checkpoint carries no optimizer state, i.e.
      when it is weights-only.
    * The state dict is aligned with the model: RGB input convolutions are
      reduced to one channel, tensors that are missing or have another shape
      are replaced with the model's current values, and unknown keys are
      dropped, so that a strict load succeeds.

    Raises:
        Exception: Whatever ``torch.load`` raises for an unreadable file.
    """
    print(f"预处理检查点文件: {checkpoint_path}")

    try:
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
    except Exception as e:
        print(f"加载检查点失败: {e}")
        raise

    training_defaults = {'optimizer_states': [], 'lr_schedulers': [], 'epoch': 0, 'global_step': 0}
    missing_keys = [k for k in training_defaults if k not in checkpoint]
    if not checkpoint.get('optimizer_states'):
        # Weights-only checkpoint: a stored epoch/step would be misleading
        # (e.g. training could stop immediately), so start from zero.
        if missing_keys:
            print(f"警告: 检查点缺少训练状态字段 {missing_keys},将创建伪训练状态")
        checkpoint.update(training_defaults)
        print("已重置训练状态: epoch=0, global_step=0")
    else:
        for key in missing_keys:
            checkpoint[key] = training_defaults[key]

    state_dict = checkpoint.get("state_dict", {})
    model_state = model.state_dict()
    adapted, converted, _ = _adapt_state_dict(state_dict, model_state)
    for key in converted:
        print(f"转换权重: {key} 从3通道到1通道")

    aligned = {}
    filled = []
    for key, target in model_state.items():
        value = adapted.get(key)
        if isinstance(value, torch.Tensor) and value.shape == target.shape:
            aligned[key] = value
        else:
            aligned[key] = target.detach().cpu().clone()
            filled.append(key)
    if filled:
        print(f"警告: 检查点中 {len(filled)} 个参数缺失或形状不匹配,已使用当前模型的值:", filled[:5])
    dropped = [k for k in adapted if k not in model_state]
    if dropped:
        print(f"警告: 已移除模型中不存在的参数 {len(dropped)} 个:", dropped[:5])

    checkpoint["state_dict"] = aligned
    return checkpoint


# Imported here (after the sys.path additions above), as in the original.
from ldm.models.diffusion.ddpm import LatentDiffusion


class CustomLatentDiffusion(LatentDiffusion):
    """LatentDiffusion that tolerates checkpoints lacking ``position_ids``."""

    def on_load_checkpoint(self, checkpoint):
        """Add a missing text-encoder ``position_ids`` entry and load weights
        non-strictly."""
        state_dict = checkpoint["state_dict"]
        key = "cond_stage_model.transformer.text_model.embeddings.position_ids"

        if key not in state_dict:
            print("警告: 检查点缺少 'cond_stage_model.transformer.text_model.embeddings.position_ids' 键")
            max_position_embeddings = self.cond_stage_model.transformer.text_model.config.max_position_embeddings
            position_ids = torch.arange(max_position_embeddings).expand((1, -1))
            state_dict[key] = position_ids
            print("已添加 position_ids 到 state_dict")

        self.load_state_dict(state_dict, strict=False)
        print("模型权重加载完成")


def filter_kwargs(cls, kwargs, log_prefix=""):
    """Keep whitelisted keyword arguments and any whose name contains "config"."""
    ESSENTIAL_PARAMS = {
        'unet_config', 'first_stage_config', 'cond_stage_config',
        'scheduler_config', 'ckpt_path', 'linear_start', 'linear_end'
    }

    filtered_kwargs = {}
    for k, v in kwargs.items():
        if k in ESSENTIAL_PARAMS or 'config' in k:
            filtered_kwargs[k] = v
        else:
            print(f"{log_prefix}过滤参数: {k}")

    print(f"{log_prefix}保留参数: {list(filtered_kwargs.keys())}")
    return filtered_kwargs


def check_checkpoint_content(checkpoint_path):
    """Print which top-level entries a checkpoint file contains."""
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    print("检查点包含的键:", list(checkpoint.keys()))
    if "state_dict" in checkpoint:
        print("模型权重存在")
    if "optimizer_states" in checkpoint:
        print("优化器状态存在")
    if "epoch" in checkpoint:
        print(f"保存的epoch: {checkpoint['epoch']}")
    if "global_step" in checkpoint:
        print(f"保存的global_step: {checkpoint['global_step']}")


def main() -> None:
    """Entry point: parse arguments, build data/model/trainer and train."""
    torch.set_float32_matmul_precision('high')

    parser = argparse.ArgumentParser(description="扩散模型训练框架")
    parser.add_argument("--config", type=str, default="configs/train.yaml", help="配置文件路径")
    parser.add_argument("--name", type=str, default="experiment", help="实验名称")
    # Resuming stays the default; --no-resume makes it possible to opt out
    # (a store_true flag that defaults to True could never be disabled).
    parser.add_argument("--resume", dest="resume", action="store_true", default=True, help="恢复训练")
    parser.add_argument("--no-resume", dest="resume", action="store_false", help="不恢复训练")
    parser.add_argument("--debug", action="store_true", help="调试模式")
    parser.add_argument("--seed", type=int, default=42, help="随机种子")
    parser.add_argument("--scale_lr", action="store_true", help="根据GPU数量缩放学习率")
    # Default "16" keeps the previously hard-coded Trainer precision while
    # making the flag actually take effect.
    parser.add_argument("--precision", type=str, default="16",
                        choices=["16", "32", "bf16"], help="训练精度")

    args, unknown = parser.parse_known_args()

    seed_everything(args.seed, workers=True)
    print(f"设置随机种子: {args.seed}")

    try:
        config_manager = ConfigManager(args.config, unknown)
    except Exception as e:
        print(f"加载配置失败: {e}")
        sys.exit(1)

    logging_config = config_manager.get_logging_config()
    logdir, ckptdir, cfgdir = create_experiment_directories(logging_config, args.name)
    config_manager.save_config(os.path.join(cfgdir, "config.yaml"))

    # Created exactly once; a second SummaryWriter on the same directory
    # would leave the first one unclosed.
    tb_logger = TensorBoardLogger(os.path.join(logdir, "tensorboard"))
    callbacks = setup_callbacks(config_manager, ckptdir, tb_logger)

    try:
        print("初始化数据模块...")
        data_config = config_manager.get_data_config()
        data_module = instantiate_from_config(data_config)
        data_module.setup()
        print("可用数据集:", list(data_module.datasets.keys()))
    except Exception as e:
        print(f"数据模块初始化失败: {str(e)}")
        tb_logger.close()
        return

    try:
        model_config = config_manager.get_model_config()
        all_params = model_config.get("params", {})
        ckpt_path = all_params.get("ckpt_path", "")

        # Do NOT hand ckpt_path to the constructor: the reported size
        # mismatch is raised during construction, before any conversion
        # below can run (presumably the upstream LatentDiffusion constructor
        # loads the file itself — confirm against the installed ldm).
        # The weights are loaded afterwards via load_pretrained_weights().
        model_params = {k: v for k, v in all_params.items() if k != "ckpt_path"}
        model = CustomLatentDiffusion(**model_params)
        print("模型初始化成功")

        if ckpt_path and os.path.exists(ckpt_path):
            load_pretrained_weights(model, ckpt_path)
            # NOTE(review): upstream sets this flag when it loads ckpt_path
            # itself; mirrored here so later hooks see the same state — confirm.
            model.restarted_from_ckpt = True
    except Exception as e:
        print(f"模型初始化失败: {str(e)}")
        tb_logger.close()
        return

    print("VAE输入层形状:", model.first_stage_model.encoder.conv_in.weight.shape)

    training_config = config_manager.get_training_config()
    bs = data_config.params.batch_size
    base_lr = model_config.base_learning_rate
    ngpu = training_config.get("gpus", 1)
    accumulate_grad_batches = training_config.get("accumulate_grad_batches", 1)

    if args.scale_lr:
        model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr
        print(f"学习率缩放至: {model.learning_rate:.2e} = {accumulate_grad_batches} × {ngpu} × {bs} × {base_lr:.2e}")
    else:
        model.learning_rate = base_lr
        print(f"使用基础学习率: {model.learning_rate:.2e}")

    resume_from_checkpoint = None
    if args.resume:
        last_ckpt = os.path.join(ckptdir, "last.ckpt")
        if not os.path.exists(last_ckpt):
            # The run directory is brand new; look in earlier runs instead.
            last_ckpt = find_previous_last_checkpoint(logging_config.logdir, args.name, logdir)

        if last_ckpt:
            print(f"恢复训练状态: {last_ckpt}")
            resume_from_checkpoint = last_ckpt
        else:
            fallback_ckpt = os.path.join(current_dir, "checkpoints", "M3FD.ckpt")
            if os.path.exists(fallback_ckpt):
                print(f"警告: 使用仅含权重的检查点,训练状态将重置: {fallback_ckpt}")
                resume_from_checkpoint = fallback_ckpt
            else:
                print("未找到可用的检查点,从头开始训练")

    if resume_from_checkpoint and os.path.exists(resume_from_checkpoint):
        try:
            checkpoint = preprocess_checkpoint(resume_from_checkpoint, model)
            fixed_ckpt_path = os.path.join(ckptdir, "fixed_checkpoint.ckpt")
            torch.save(checkpoint, fixed_ckpt_path)
            print(f"修复后的完整检查点已保存到: {fixed_ckpt_path}")
            resume_from_checkpoint = fixed_ckpt_path
        except Exception as e:
            print(f"预处理检查点失败: {e}")
            print("将尝试使用默认方式加载检查点")

    has_validation = hasattr(data_module, 'datasets') and 'validation' in data_module.datasets

    try:
        train_loader = data_module.train_dataloader()
        num_train_batches = len(train_loader)
        print(f"训练批次数: {num_train_batches}")
    except Exception as e:
        print(f"计算训练批次数失败: {e}")
        num_train_batches = 0

    precision = args.precision if args.precision == "bf16" else int(args.precision)

    trainer_config = {
        "default_root_dir": logdir,
        "max_epochs": training_config.max_epochs,
        "gpus": ngpu,
        "distributed_backend": "ddp" if ngpu > 1 else None,
        "plugins": [DDPPlugin(find_unused_parameters=False)] if ngpu > 1 else None,
        "precision": precision,
        "accumulate_grad_batches": accumulate_grad_batches,
        "callbacks": callbacks,
        "logger": tb_logger,
        "resume_from_checkpoint": resume_from_checkpoint,
        "fast_dev_run": args.debug,
        "limit_val_batches": 0 if not has_validation else 1.0,
        "num_sanity_val_steps": 0,  # skip the sanity validation to resume faster
        "log_every_n_steps": 10,
    }

    if has_validation:
        if num_train_batches < 50:
            # Small dataset: validate once per epoch.
            trainer_config["check_val_every_n_epoch"] = 1
        else:
            # Larger dataset: validate every N training steps.
            val_check_interval = min(2000, num_train_batches)
            if num_train_batches < 100:
                val_check_interval = max(1, num_train_batches // 4)
            trainer_config["val_check_interval"] = val_check_interval

    try:
        print("最终训练器配置:")
        for k, v in trainer_config.items():
            print(f"  {k}: {v}")
        trainer = Trainer(**trainer_config)
    except Exception as e:
        print(f"创建训练器失败: {e}")
        tb_logger.close()
        sys.exit(1)

    try:
        print("开始训练...")
        trainer.fit(model, data_module)
        print("训练完成!")
    except KeyboardInterrupt:
        print("训练被用户中断")
        if trainer.global_rank == 0 and trainer.model is not None:
            trainer.save_checkpoint(os.path.join(ckptdir, "interrupted.ckpt"))
    except Exception as e:
        print(f"训练出错: {e}")
        if trainer.global_rank == 0 and hasattr(trainer, 'model') and trainer.model is not None:
            trainer.save_checkpoint(os.path.join(ckptdir, "error.ckpt"))
        raise
    finally:
        tb_logger.close()
        if trainer.global_rank == 0 and hasattr(trainer, 'profiler'):
            print("训练摘要:")
            print(trainer.profiler.summary())


if __name__ == "__main__":
    main()

# Error reported when running the ORIGINAL version of this script ("运行报错"):
#   模型初始化失败: Error(s) in loading state_dict for CustomLatentDiffusion:
#   size mismatch for first_stage_model.encoder.conv_in.weight: copying a param
#   with shape torch.Size([128, 3, 3, 3]) from checkpoint, the shape in current
#   model is torch.Size([128, 1, 3, 3]).
# It is addressed above by keeping ckpt_path out of the model constructor and
# loading the checkpoint through load_pretrained_weights(), which averages the
# RGB input kernels into one channel and skips any other mismatched tensor.

import datetime
import os
import time

import numpy as np
import torch

from model.model import parsingNet
from data.dataloader import get_train_loader
from utils.dist_utils import dist_print, dist_tqdm, is_main_process, DistSummaryWriter
from utils.factory import get_metric_dict, get_loss_dict, get_optimizer, get_scheduler
from utils.metrics import MultiLabelAcc, AccTopk, Metric_mIoU, update_metrics, reset_metrics
from utils.common import merge_config, save_model, cp_projects
from utils.common import get_work_dir, get_logger


def inference(net, data_label, use_aux):
    """Run one forward pass and bundle outputs with their labels.

    Args:
        net: model called as ``net(img)``.
        data_label: batch tuple; ``(img, cls_label, seg_label)`` when
            ``use_aux`` is true, otherwise ``(img, cls_label)``.
        use_aux: whether the network also returns a second (segmentation)
            output.

    Returns:
        Dict with ``cls_out``/``cls_label`` and, when ``use_aux`` is true,
        ``seg_out``/``seg_label``. Inputs and labels are moved to CUDA and
        labels are cast to ``long``.
    """
    if use_aux:
        img, cls_label, seg_label = data_label
        img, cls_label, seg_label = img.cuda(), cls_label.long().cuda(), seg_label.long().cuda()
        cls_out, seg_out = net(img)
        return {'cls_out': cls_out, 'cls_label': cls_label, 'seg_out': seg_out, 'seg_label': seg_label}

    img, cls_label = data_label
    img, cls_label = img.cuda(), cls_label.long().cuda()
    cls_out = net(img)
    return {'cls_out': cls_out, 'cls_label': cls_label}


def resolve_val_data(results, use_aux):
    """Replace raw outputs in ``results`` by their argmax over dim 1 (in place)."""
    results['cls_out'] = torch.argmax(results['cls_out'], dim=1)
    if use_aux:
        results['seg_out'] = torch.argmax(results['seg_out'], dim=1)
    return results


def calc_loss(loss_dict, results, logger, global_step):
    """Compute the weighted sum of all configured losses.

    ``loss_dict`` holds parallel lists under ``name``, ``op``, ``weight`` and
    ``data_src``; each op is called with the ``results`` entries named by its
    ``data_src``. Individual losses are logged every 20 global steps.
    """
    loss = 0
    for i in range(len(loss_dict['name'])):
        data_src = loss_dict['data_src'][i]
        datas = [results[src] for src in data_src]
        loss_cur = loss_dict['op'][i](*datas)

        if global_step % 20 == 0:
            logger.add_scalar('loss/' + loss_dict['name'][i], loss_cur, global_step)

        loss += loss_cur * loss_dict['weight'][i]
    return loss


def train(net, data_loader, loss_dict, optimizer, scheduler, logger, epoch, metric_dict, use_aux):
    """Train ``net`` for one epoch over ``data_loader``.

    For every batch: forward, weighted loss, backward, optimizer step and
    scheduler step (the scheduler is stepped with the global step index).
    Metrics are logged every 20 global steps and shown on the progress bar
    when it supports ``set_postfix``.
    """
    net.train()
    # Iterate the loader that was passed in; the original referenced the
    # module-level ``train_loader``, which only exists when run as a script.
    progress_bar = dist_tqdm(data_loader)
    t_data_0 = time.time()
    for b_idx, data_label in enumerate(progress_bar):
        t_data_1 = time.time()
        reset_metrics(metric_dict)
        global_step = epoch * len(data_loader) + b_idx

        t_net_0 = time.time()
        results = inference(net, data_label, use_aux)

        loss = calc_loss(loss_dict, results, logger, global_step)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step(global_step)
        t_net_1 = time.time()

        results = resolve_val_data(results, use_aux)
        update_metrics(metric_dict, results)

        if global_step % 20 == 0:
            for me_name, me_op in zip(metric_dict['name'], metric_dict['op']):
                logger.add_scalar('metric/' + me_name, me_op.get(), global_step=global_step)
        logger.add_scalar('meta/lr', optimizer.param_groups[0]['lr'], global_step=global_step)

        if hasattr(progress_bar, 'set_postfix'):
            kwargs = {me_name: '%.3f' % me_op.get()
                      for me_name, me_op in zip(metric_dict['name'], metric_dict['op'])}
            progress_bar.set_postfix(loss='%.3f' % float(loss),
                                     data_time='%.3f' % float(t_data_1 - t_data_0),
                                     net_time='%.3f' % float(t_net_1 - t_net_0),
                                     **kwargs)
        t_data_0 = time.time()


if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    args, cfg = merge_config()
    work_dir = get_work_dir(cfg)

    # Distributed mode is enabled only when the launcher sets WORLD_SIZE > 1.
    distributed = False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    dist_print(datetime.datetime.now().strftime('[%Y/%m/%d %H:%M:%S]') + ' start training...')
    dist_print(cfg)
    assert cfg.backbone in ['18', '34', '50', '101', '152', '50next', '101next', '50wide', '101wide']

    train_loader, cls_num_per_lane = get_train_loader(
        cfg.batch_size, cfg.data_root, cfg.griding_num, cfg.dataset,
        cfg.use_aux, distributed, cfg.num_lanes)

    net = parsingNet(pretrained=True, backbone=cfg.backbone,
                     cls_dim=(cfg.griding_num + 1, cls_num_per_lane, cfg.num_lanes),
                     use_aux=cfg.use_aux).cuda()

    if distributed:
        net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.local_rank])
    optimizer = get_optimizer(net, cfg)

    if cfg.finetune is not None:
        dist_print('finetune from ', cfg.finetune)
        # Load on CPU (as the resume path does) so the checkpoint is not
        # materialised on whatever device it was saved from.
        state_all = torch.load(cfg.finetune, map_location='cpu')['model']
        # only use backbone parameters
        state_clip = {k: v for k, v in state_all.items() if 'model' in k}
        net.load_state_dict(state_clip, strict=False)

    if cfg.resume is not None:
        dist_print('==> Resume model from ' + cfg.resume)
        resume_dict = torch.load(cfg.resume, map_location='cpu')
        net.load_state_dict(resume_dict['model'])
        if 'optimizer' in resume_dict.keys():
            optimizer.load_state_dict(resume_dict['optimizer'])
        # Epoch is parsed from characters 2..4 of the checkpoint file name.
        # NOTE(review): presumably names look like "ep049.pth" - confirm.
        resume_epoch = int(os.path.split(cfg.resume)[1][2:5]) + 1
    else:
        resume_epoch = 0

    scheduler = get_scheduler(optimizer, cfg, len(train_loader))
    dist_print(len(train_loader))
    metric_dict = get_metric_dict(cfg)
    loss_dict = get_loss_dict(cfg)
    logger = get_logger(work_dir, cfg)
    cp_projects(args.auto_backup, work_dir)

    for epoch in range(resume_epoch, cfg.epoch):
        train(net, train_loader, loss_dict, optimizer, scheduler, logger, epoch, metric_dict, cfg.use_aux)
        save_model(net, optimizer, epoch, work_dir, distributed)
    logger.close()

# Page text that followed this snippet (kept verbatim): 上述代码逐行分析作用

for i in range(len(layers)): logger.info(f"=== Start quantize layer {i} ===") layer = layers[i].to(dev) if "mixtral" in args.net.lower(): # for mixtral, we only leverage lwc, which can be achieve by simply replace Linear with QuantLinear qlayer = copy.deepcopy(layer) for name, module in qlayer.named_modules(): if isinstance(module,torch.nn.Linear) and not "gate" in name: # do not quantize gate quantlinear = QuantLinear(module, args.weight_quant_params, args.act_quant_params) add_new_module(name, qlayer, quantlinear) else: qlayer = DecoderLayer(lm.model.config, layer, args) qlayer = qlayer.to(dev) # obtain output of full-precision model set_quant_state(qlayer, weight_quant=False, act_quant=False) if args.epochs > 0: with torch.no_grad(): with torch.cuda.amp.autocast(): for j in range(args.nsamples): fp_inps[j] = qlayer(fp_inps[j].unsqueeze(0), attention_mask=attention_mask,position_ids=position_ids)[0] if args.aug_loss: fp_inps_2[j] = qlayer(quant_inps[j].unsqueeze(0), attention_mask=attention_mask,position_ids=position_ids)[0] # init smooth parameters set_quant_state(qlayer, weight_quant=False, act_quant=True) # weight will be manually quantized before forward qlayer.let = args.let use_shift = True if is_llama or args.abits == 16: use_shift = False # deactivate channel-wise shifting for llama model and weight-only quantization if args.let: # init channel-wise scaling and shift qlayer.register_parameter("qkt_smooth_scale",torch.nn.Parameter(torch.ones(layer.self_attn.q_proj.out_features,device=dev, dtype=dtype))) for name,module in qlayer.named_modules(): if isinstance(module, QuantLinear): for key in pairs.keys(): if key in name: act = act_scales[f"{layer_name_prefix}.{i}.{name}"].to(device=dev, dtype=dtype).clamp(min=1e-5) weight = module.weight.abs().max(dim=0)[0].clamp(min=1e-5) scale = (act.pow(args.alpha)/weight.pow(1-args.alpha)).clamp(min=1e-5) if use_shift and not is_llama: shift = act_shifts[f"{layer_name_prefix}.{i}.{name}"].to(device=dev, dtype=dtype) 
else: shift = torch.zeros_like(scale) qlayer.register_parameter(f"{pairs[key]}_smooth_shift",torch.nn.Parameter(shift)) qlayer.register_parameter(f"{pairs[key]}_smooth_scale",torch.nn.Parameter(scale)) if args.resume: qlayer.load_state_dict(omni_parameters[i], strict=False) if args.epochs > 0: with torch.no_grad(): qlayer.float() # required for AMP training # create optimizer optimizer = torch.optim.AdamW( [{"params":let_parameters(qlayer, use_shift),"lr":args.let_lr}, {"params":lwc_parameters(qlayer),"lr":args.lwc_lr}],weight_decay=args.wd) loss_scaler = utils.NativeScalerWithGradNormCount() for epochs in range(args.epochs): loss_list = [] norm_list = [] for j in range(args.nsamples//args.batch_size): index = j * args.batch_size # obtain output of quantization model with traincast(): smooth_and_quant_temporary(qlayer, args, is_llama) quant_out = qlayer(quant_inps[index:index+args.batch_size,], attention_mask=attention_mask_batch,position_ids=position_ids)[0] loss = loss_func(fp_inps[index:index+args.batch_size,], quant_out) if args.aug_loss: loss += loss_func(fp_inps_2[index:index+args.batch_size,], quant_out) if not math.isfinite(loss.item()): logger.info("Loss is NAN, stopping training") pdb.set_trace() loss_list.append(loss.detach().cpu()) optimizer.zero_grad() norm = loss_scaler(loss, optimizer,parameters= get_omni_parameters(qlayer, use_shift)).cpu() norm_list.append(norm.data) loss_mean = torch.stack(loss_list).mean() norm_mean = torch.stack(norm_list).mean() logger.info(f"layer {i} iter {epochs} loss:{loss_mean} norm:{norm_mean} max memory_allocated {torch.cuda.max_memory_allocated(lm._device) / 1024**2} ") clear_temp_variable(qlayer) del optimizer qlayer.half() # real smooth and quantization smooth_and_quant_inplace(qlayer, args, is_llama) if args.epochs>0: # update input of quantization model with torch.no_grad(): # with torch.cuda.amp.autocast(): with traincast(): for j in range(args.nsamples): quant_inps[j] = qlayer(quant_inps[j].unsqueeze(0), 
attention_mask=attention_mask,position_ids=position_ids)[0] register_scales_and_zeros(qlayer) layers[i] = qlayer.to("cpu") omni_parameters[i] = omni_state_dict(qlayer) torch.save(omni_parameters, os.path.join(args.output_dir, f"omni_parameters.pth")) else: register_scales_and_zeros(qlayer) layers[i] = qlayer.to("cpu") if args.real_quant: assert args.wbits in [2,3,4] and args.abits >= 16 # only support weight-only quantization named_linears = get_named_linears(qlayer) for name, module in named_linears.items(): scales = module.weight_quantizer.scales zeros = module.weight_quantizer.zeros group_size = module.weight_quantizer.group_size dim0 = module.weight.shape[0] scales = scales.view(dim0,-1) zeros = zeros.view(dim0,-1) if args.wbits == 3: q_linear = qlinear_cuda.QuantLinear(args.wbits, group_size, module.in_features,module.out_features,not module.bias is None) else: q_linear = qlinear_triton.QuantLinear(args.wbits, group_size, module.in_features,module.out_features,not module.bias is None) q_linear.pack(module.cpu(), scales.float().cpu(), zeros.float().cpu()) add_new_module(name, qlayer, q_linear) print(f"pack quantized {name} finished") del module del layer torch.cuda.empty_cache()

class SyncAffined(MapTransform):
    """Reorient image and mask to RAS and resample the mask if affines differ.

    Args:
        keys: dictionary keys passed to ``MapTransform`` and ``Orientationd``.
        atol: absolute tolerance used when comparing the two affines.
        logger: optional logger; when given, mismatches and errors are logged.
    """

    def __init__(self, keys, atol=1e-5, logger=None):
        super().__init__(keys)
        self.orientation = Orientationd(keys=keys, axcodes="RAS")
        self.resample = ResampleToMatchd(keys=["mask"], key_dst="image", mode="nearest")
        self.atol = atol
        self.logger = logger

    @staticmethod
    def _as_numpy(affine):
        """Return ``affine`` as a NumPy array when it is a torch tensor."""
        if isinstance(affine, torch.Tensor):
            return affine.detach().cpu().numpy()
        return affine

    def __call__(self, data):
        try:
            # Keep the original affines in the meta dicts before reorienting.
            data["image_meta_dict"]["original_affine"] = data["original_affine"]
            data["mask_meta_dict"]["original_affine"] = data["mask_original_affine"]

            data = self.orientation(data)

            image_affine = self._as_numpy(data["image_meta_dict"]["affine"])
            mask_affine = self._as_numpy(data["mask_meta_dict"]["affine"])

            # Resample the mask onto the image grid only when the affines
            # differ by more than the tolerance.
            if not np.allclose(image_affine, mask_affine, atol=self.atol):
                if self.logger:
                    diff = np.abs(image_affine - mask_affine).max()
                    self.logger.warning(f"⚠️ affine 不一致 (最大差异: {diff:.2e}),重采样掩膜:{data.get('id', 'unknown')}")
                data = self.resample(data)
                # Copy the image affine onto the mask. The affine may be a
                # tensor or an ndarray (see _as_numpy), and ndarrays have no
                # .clone(), so copy according to the actual type.
                new_affine = data["image_meta_dict"]["affine"]
                if isinstance(new_affine, torch.Tensor):
                    data["mask_meta_dict"]["affine"] = new_affine.clone()
                else:
                    data["mask_meta_dict"]["affine"] = np.array(new_affine, copy=True)
            return data
        except Exception as e:
            if self.logger:
                self.logger.error(f"Error during SyncAffined processing: {e}")
            raise


class RecordSpatialInfo(MapTransform):
    """Store original/processed shape and affine in each key's meta dict."""

    def __init__(self, keys):
        super().__init__(keys)
        self.keys = keys

    def __call__(self, data):
        for key in self.keys:
            meta_key = f"{key}_meta_dict"
            # setdefault (not get) so the writes below cannot raise KeyError
            # when the meta dict is missing.
            meta = data.setdefault(meta_key, {})

            # Processed shape comes from the tensor itself, dropping the
            # leading (channel) dimension; otherwise fall back to the meta.
            img = data.get(key)
            if isinstance(img, torch.Tensor):
                processed_shape = np.array(img.shape[1:])
            else:
                processed_shape = np.array(meta.get("spatial_shape", (1, 1, 1)))

            original_affine = meta.get("original_affine", np.eye(4))
            original_shape = meta.get("original_shape", processed_shape)
            processed_affine = meta.get("affine", np.eye(4))

            meta["original_shape"] = original_shape
            meta["original_affine"] = original_affine
            meta["processed_affine"] = processed_affine
            meta["processed_shape"] = processed_shape

        # Start coordinate of the foreground crop (zeros when not recorded).
        data["crop_start"] = np.array(data.get("foreground_start_coord", [0, 0, 0]))
        return data


def get_transforms():
    """Build the deterministic preprocessing and random augmentation pipelines.

    Returns:
        ``(deterministic_transforms, augmentation_transforms)``.
    """
    deterministic_transforms = Compose([
        LoadImaged(keys=["image", "mask"], image_only=False, reader="ITKReader"),
        EnsureChannelFirstd(keys=["image", "mask"]),
        SyncAffined(keys=["image", "mask"], atol=1e-10),
        Spacingd(keys=["image", "mask"], pixdim=(1.0, 1.0, 1.0), mode=("bilinear", "nearest")),
        CropForegroundd(keys=["image", "mask"], source_key="mask", margin=10),
        ResizeWithPadOrCropd(keys=["image", "mask"], spatial_size=(64, 64, 64)),
        # Record spatial information for both image and mask.
        RecordSpatialInfo(keys=["image", "mask"]),
        ScaleIntensityRanged(keys=["image"], a_min=20, a_max=80, b_min=0.0, b_max=1.0, clip=True),
        EnsureTyped(keys=["image", "mask"], data_type="tensor"),
    ], map_items=True, overrides={"allow_missing_keys": True})

    augmentation_transforms = Compose([
        RandFlipd(keys=["image", "mask"], prob=0.2, spatial_axis=[0, 1, 2]),
        RandAffined(
            keys=["image", "mask"],
            prob=0.3,
            rotate_range=(-0.2, 0.2),
            scale_range=(0.8, 1.2),
            shear_range=(-0.1, 0.1, -0.1, 0.1, -0.1, 0.1),
            translate_range=(5, 5, 5),
            mode=("bilinear", "nearest"),
            padding_mode="border",
            spatial_size=(64, 64, 64)
        ),
        Lambdad(keys=["label"], func=lambda x: torch.tensor(x, dtype=torch.long))
    ])
    return deterministic_transforms, augmentation_transforms


deterministic_transforms, augmentation_transforms = get_transforms()

# -------------------- Dataset loading --------------------
# NOTE: train_files / val_files / init_weights are not defined in this snippet.
train_ds = CacheDataset(data=train_files, transform=deterministic_transforms, cache_rate=0.8)
train_ds = Dataset(train_ds, transform=augmentation_transforms)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, num_workers=0)

val_ds = CacheDataset(data=val_files, transform=deterministic_transforms, cache_rate=1.0)
val_loader = DataLoader(val_ds, batch_size=4, shuffle=False, num_workers=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = monai.networks.nets.resnet50(pretrained=False, spatial_dims=3, n_input_channels=1, num_classes=2).to(device)
model.device = device  # expose the device on the model object
try:
    model.load_state_dict(
        torch.load("D:/monaisj/3/50_0.7941_0.7.pth", map_location=device, weights_only=True),
        strict=False)
    print("✅ 权重部分加载成功(忽略不匹配的层)")
except Exception as e:
    print(f"❌ 加载失败: {str(e)}")
    # Fallback: re-initialise the classification head.
    model.fc.apply(init_weights)

# Page text that followed this snippet (kept verbatim): 这是我加载数据,预处理数据,加载monai3dresnet50模型的深度学习任务

def build_data_dict(paths, labels):
    """Build the list of sample dicts consumed by the MONAI datasets.

    A sample is skipped (and counted) when its mask file is missing, when the
    mask is entirely zero, or when loading raises.

    Args:
        paths: image paths ending in ``.nii.gz``; the mask is expected next to
            each image as ``<name>mask.nii.gz``.
        labels: class label for each path.

    Returns:
        List of dicts with image/mask paths, label, id and original affines.
    """
    data = []
    skipped = 0
    for path, label in zip(paths, labels):
        try:
            base_name = os.path.basename(path)
            file_id = base_name.replace(".nii.gz", "")
            mask_path = path.replace(".nii.gz", "mask.nii.gz")
            if not os.path.exists(mask_path):
                print(f"❌ 缺失 mask 文件: {mask_path}")
                skipped += 1
                continue

            img_nii = nib.load(path)
            mask_nii = nib.load(mask_path)
            if np.all(mask_nii.get_fdata() == 0):
                print(f"⚠️ 掩膜全零,跳过: {mask_path}")
                skipped += 1
                continue

            data.append({
                "image": path,
                "mask": mask_path,
                "label": label,
                "id": file_id,
                "original_affine": np.array(img_nii.affine)[:4, :4].astype(np.float32),
                "original_shape": img_nii.shape,
                "mask_original_affine": np.array(mask_nii.affine)[:4, :4].astype(np.float32)
            })
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"❌ 构建失败: {path},原因: {e}")
            skipped += 1
    print(f"✅ 构建完成,有效样本: {len(data)},跳过: {skipped}")
    return data


class SyncAffined(MapTransform):
    """Reorient image and mask to RAS; resample the mask if affines differ."""

    def __init__(self, keys, atol=1e-2, logger=None):
        super().__init__(keys)
        self.orientation = Orientationd(keys=keys, axcodes="RAS")
        self.resample = ResampleToMatchd(keys=["mask"], key_dst="image", mode="nearest")
        self.atol = atol
        self.logger = logger

    def __call__(self, data):
        try:
            data = self.orientation(data)
            a1 = data["image_meta_dict"]["affine"]
            a2 = data["mask_meta_dict"]["affine"]
            if isinstance(a1, torch.Tensor):
                a1 = a1.numpy()
            if isinstance(a2, torch.Tensor):
                a2 = a2.numpy()
            if not np.allclose(a1, a2, atol=self.atol):
                data = self.resample(data)
            return data
        except Exception as e:
            if self.logger:
                self.logger.error(f"Error during SyncAffined processing: {e}")
            raise


def get_transforms():
    """Return ``(deterministic_transforms, augmentation_transforms)``."""
    deterministic_transforms = Compose([
        LoadImaged(keys=["image", "mask"], image_only=False, reader="ITKReader"),
        EnsureChannelFirstd(keys=["image", "mask"]),
        SyncAffined(keys=["image", "mask"], atol=1e-2),
        Spacingd(keys=["image", "mask"], pixdim=(1.0, 1.0, 1.0), mode=("bilinear", "nearest")),
        CropForegroundd(keys=["image", "mask"], source_key="mask", margin=10, allow_smaller=True),
        ResizeWithPadOrCropd(keys=["image", "mask"], spatial_size=(64, 64, 64)),
        ScaleIntensityRanged(keys=["image"], a_min=20, a_max=80, b_min=0.0, b_max=1.0, clip=True),
        ToTensord(keys=["image", "mask"])
    ])

    augmentation_transforms = Compose([
        RandFlipd(keys=["image", "mask"], prob=0.2, spatial_axis=[0, 1, 2]),
        RandAffined(
            keys=["image", "mask"],
            prob=0.3,
            rotate_range=(-0.2, 0.2),
            scale_range=(0.8, 1.2),
            shear_range=(-0.1, 0.1, -0.1, 0.1, -0.1, 0.1),
            translate_range=(5, 5, 5),
            mode=("bilinear", "nearest"),
            padding_mode="border",
            spatial_size=(64, 64, 64)
        ),
        Lambdad(keys=["label"], func=lambda x: torch.tensor(x, dtype=torch.long).squeeze(0))
    ])
    return deterministic_transforms, augmentation_transforms


deterministic_transforms, augmentation_transforms = get_transforms()

# Each sub-directory of data_dir is one class; its name is the integer label.
data_dir = "D:/monaisj/train"
class_dirs = sorted([d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))])
image_paths, labels = [], []
for class_name in class_dirs:
    class_path = os.path.join(data_dir, class_name)
    nii_files = glob.glob(os.path.join(class_path, "*.nii.gz"))
    for nii_file in nii_files:
        # Test the file name only, so a directory containing "mask" in its
        # name does not exclude every image below it.
        if 'mask' not in os.path.basename(nii_file):
            image_paths.append(nii_file)
            labels.append(int(class_name))

# Stratified train/validation split.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indices, val_indices = next(sss.split(image_paths, labels))
train_paths = [image_paths[i] for i in train_indices]
val_paths = [image_paths[i] for i in val_indices]
train_labels = [labels[i] for i in train_indices]
val_labels = [labels[i] for i in val_indices]

train_files = build_data_dict(train_paths, train_labels)
val_files = build_data_dict(val_paths, val_labels)

# -------------------- Dataset loading --------------------
train_ds = CacheDataset(data=train_files, transform=deterministic_transforms, cache_rate=0.8)
train_ds = Dataset(train_ds, transform=augmentation_transforms)
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=0)

val_ds = CacheDataset(data=val_files, transform=deterministic_transforms, cache_rate=1.0)
val_loader = DataLoader(val_ds, batch_size=8, shuffle=False, num_workers=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = monai.networks.nets.resnet18(pretrained=False, spatial_dims=3, n_input_channels=1, num_classes=2).to(device)

# NOTE(review): the model is resnet18 but the file name says resnet_50 -
# confirm which architecture is intended. Only tensors whose name AND shape
# match the model are loaded, because load_state_dict raises on a shape
# mismatch even with strict=False.
weights_path = "D:/MedicalNet/pretrain/resnet_50_epoch_110_batch_0.pth"
if os.path.exists(weights_path):
    state_dict = torch.load(weights_path, map_location=device, weights_only=True)
    # Some checkpoints nest the weights under "state_dict"; accept both layouts.
    if isinstance(state_dict, dict) and isinstance(state_dict.get("state_dict"), dict):
        state_dict = state_dict["state_dict"]
    model_state = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        name = k.replace("model.", "").replace("module.", "")
        if name in model_state and model_state[name].shape == v.shape:
            new_state_dict[name] = v
    model.load_state_dict(new_state_dict, strict=False)
    print(f"加载权重成功: {weights_path}")
else:
    print(f"权重文件不存在: {weights_path}")


def init_weights(m):
    """Kaiming-initialise the weight and zero the bias of ``nn.Linear`` modules."""
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.constant_(m.bias, 0.0)


model.fc.apply(init_weights)

# Inverse-frequency class weights: n_samples / (2 * n_samples_in_class).
class_counts = np.bincount(train_labels)
class_weights = torch.tensor([
    len(train_labels) / (2.0 * class_counts[0]),
    len(train_labels) / (2.0 * class_counts[1])
], dtype=torch.float32).to(device)
loss_fn = CrossEntropyLoss(weight=class_weights)


def compute_metrics(labels, preds, probs):
    """Return ``(sensitivity, specificity, accuracy, auc)`` for binary labels.

    The confusion matrix is forced to 2x2 so the unpacking works even when
    only one class occurs. AUC is ``nan`` when ``roc_auc_score`` cannot be
    computed (it raises ``ValueError`` for single-class ground truth).
    """
    cm = confusion_matrix(labels, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float("nan")
    return sensitivity, specificity, accuracy, auc


def freeze_layers(model, freeze_patterns=None, unfreeze_patterns=None):
    """Freeze all parameters, then unfreeze those matching a pattern.

    Parameters whose name contains any string in ``unfreeze_patterns`` - or
    contains ``"fc"`` - stay trainable. ``freeze_patterns`` is accepted for
    interface compatibility but is not used.
    """
    for name, param in model.named_parameters():
        param.requires_grad = False
        if unfreeze_patterns:
            for pattern in unfreeze_patterns:
                if pattern in name:
                    param.requires_grad = True
                    break
        if "fc" in name:
            param.requires_grad = True


def train_model(model, stage=1, epochs=50, init_lr=1e-3, eta_min=1e-5, data_dir="D:/monaisj/"):
    """Train one stage of the progressive-unfreezing schedule.

    Args:
        model: network to train (modified in place).
        stage: 1 = classifier head only, 2 = layer3/layer4 + head,
            3 = whole network.
        epochs: number of epochs; also ``T_max`` of the cosine schedule.
        init_lr: base learning rate; scaled by 1, 0.1, 0.01 for stages 1-3.
        eta_min: minimum learning rate of the cosine schedule.
        data_dir: directory where a checkpoint is written after every epoch.

    Returns:
        ``history`` dict with per-epoch losses/metrics and collected
        probabilities.

    Raises:
        ValueError: if ``stage`` is not 1, 2 or 3.
    """
    if stage == 1:
        # Stage 1: train the classification head only.
        freeze_layers(model, unfreeze_patterns=["fc"])
        print("🔒 阶段1:冻结骨干,仅训练分类头")
        lr = init_lr
    elif stage == 2:
        # Stage 2: unfreeze the high-level layers.
        freeze_layers(model, unfreeze_patterns=["layer3", "layer4", "fc"])
        print("🔓 阶段2:解冻高层(layer3/layer4)")
        lr = init_lr * 0.1
    elif stage == 3:
        # Stage 3: unfreeze everything.
        for param in model.parameters():
            param.requires_grad = True
        print("🔥 阶段3:解冻全网络")
        lr = init_lr * 0.01
    else:
        raise ValueError(f"未知阶段: {stage}")

    # Optimise only the parameters that require gradients.
    params_to_optimize = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(params_to_optimize, lr=lr, weight_decay=1e-4)
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=eta_min)
    # One scaler for the whole stage so its state is not reset every epoch.
    scaler = GradScaler()

    history = {
        'train_loss': [], 'train_acc': [], 'train_auc': [],
        'train_sensitivity': [], 'train_specificity': [],
        'val_loss': [], 'val_acc': [], 'val_auc': [],
        'val_sensitivity': [], 'val_specificity': [],
        'train_true_labels': None, 'train_probs': [],
        'val_true_labels': None, 'val_probs': []
    }

    for epoch in range(epochs):
        # ================== Training ==================
        model.train()
        epoch_loss = 0.0
        train_preds, train_labels, train_probs = [], [], []

        for batch in train_loader:
            images = batch["image"].to(device, non_blocking=True)
            labels = batch["label"].long().to(device)

            optimizer.zero_grad(set_to_none=True)
            outputs = model(images)
            loss = loss_fn(outputs, labels)

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)  # unscale before clipping real gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item() * images.size(0)

            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            probs = torch.softmax(outputs, dim=1)[:, 1].detach().cpu().numpy()
            train_preds.extend(preds)
            train_labels.extend(labels.cpu().numpy())
            train_probs.extend(probs)

        # Labels are stored for the first epoch only.
        # NOTE(review): train_loader shuffles, so these labels do not line up
        # with the probabilities of later epochs - confirm intended use.
        if epoch == 0:
            history['train_true_labels'] = train_labels
        history['train_probs'].append(train_probs)

        # The loss was accumulated per sample, so average over samples
        # (not over the number of batches).
        train_loss = epoch_loss / max(len(train_labels), 1)
        train_acc = accuracy_score(train_labels, train_preds)
        train_sensitivity, train_specificity, _, train_auc = compute_metrics(
            train_labels, train_preds, train_probs
        )

        # ================== Validation ==================
        model.eval()
        val_loss = 0.0
        val_preds, val_labels, val_probs = [], [], []

        with torch.no_grad():
            for batch in val_loader:
                images = batch["image"].to(device, non_blocking=True)
                labels = batch["label"].long().to(device)

                outputs = model(images)
                loss = loss_fn(outputs, labels)
                val_loss += loss.item() * images.size(0)

                probs = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(labels.cpu().numpy())
                val_probs.extend(probs)

        if epoch == 0:
            history['val_true_labels'] = val_labels
        history['val_probs'].append(val_probs)

        val_loss = val_loss / max(len(val_labels), 1)
        val_acc = accuracy_score(val_labels, val_preds)
        val_sensitivity, val_specificity, _, val_auc = compute_metrics(
            val_labels, val_preds, val_probs
        )

        # ================== LR update and checkpoint ==================
        scheduler.step()

        # A checkpoint is written after every epoch.
        save_path = os.path.join(data_dir, f"best_model_stage{stage}_epoch{epoch+1}.pth")
        torch.save(model.state_dict(), save_path)
        print(f"✅ 保存模型到 {save_path}")

        # ================== History ==================
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['train_auc'].append(train_auc)
        history['train_sensitivity'].append(train_sensitivity)
        history['train_specificity'].append(train_specificity)

        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_auc'].append(val_auc)
        history['val_sensitivity'].append(val_sensitivity)
        history['val_specificity'].append(val_specificity)

        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1}/{epochs} [Stage {stage}]")
        print(f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | AUC: {train_auc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | AUC: {val_auc:.4f}")
        print(f"当前学习率: {current_lr:.2e}")
        print("-"*50)

    return history


SEED = 123456
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)  # also seed torch, which drives shuffling and init

# Stage 1: frozen backbone, classifier head only.
# NOTE(review): eta_min equals init_lr here, so the cosine schedule keeps the
# learning rate constant - confirm this is intended.
print("\n开始阶段一训练(冻结骨干网络)")
stage1_history = train_model(model, stage=1, epochs=40, init_lr=1e-2, eta_min=1e-2)

print("\n开始阶段二训练(解冻高层)")
# Load the selected stage-1 checkpoint (epoch chosen by hand).
# best_stage1_epoch = np.argmax([auc for auc in stage1_history['val_auc']]) + 1
model.load_state_dict(torch.load(f"D:/monaisj/best_model_stage1_epoch19.pth", map_location=device, weights_only=True))
stage2_history = train_model(model, stage=2, epochs=30, init_lr=1e-3, eta_min=1e-4)

model.load_state_dict(torch.load(f"D:/monaisj/best_model_stage2_epoch20.pth", map_location=device, weights_only=True))
stage3_history = train_model(model, stage=3, epochs=30, init_lr=1e-4, eta_min=1e-5)

# Page text that followed this snippet (kept verbatim): 加载最佳模型,写一段代码帮我生成grad—cam热图,a图为预处理后根据掩膜裁剪后的输入模型图,b图是掩膜图,c图是cam图,d图是输入模型图和cam的叠加图

import argparse
import math
import os

os.environ["GIT_PYTHON_REFRESH"] = "quiet"
import random
import subprocess
import sys
import time
from copy import deepcopy
from datetime import datetime, timedelta
from pathlib import Path

try:
    import comet_ml  # must be imported before torch (if installed)
except ImportError:
    comet_ml = None

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.optim import lr_scheduler
from tqdm import tqdm

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

import val as validate  # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.autobatch import check_train_batch_size
from utils.callbacks import Callbacks
from utils.dataloaders import create_dataloader
from utils.downloads import attempt_download, is_url
from utils.general import (
    LOGGER,
    TQDM_BAR_FORMAT,
    check_amp,
    check_dataset,
    check_file,
    check_git_info,
    check_git_status,
    check_img_size,
    check_requirements,
    check_suffix,
    check_yaml,
    colorstr,
    get_latest_run,
    increment_path,
    init_seeds,
    intersect_dicts,
    labels_to_class_weights,
    labels_to_image_weights,
    methods,
    one_cycle,
    print_args,
    print_mutation,
    strip_optimizer,
    yaml_save,
)
from utils.loggers import LOGGERS, Loggers
from utils.loggers.comet.comet_utils import check_comet_resume
from utils.loss import ComputeLoss
from utils.metrics import fitness
from utils.plots import plot_evolve
from utils.torch_utils import (
    EarlyStopping,
    ModelEMA,
    de_parallel,
    select_device,
    smart_DDP,
    smart_optimizer,
    smart_resume,
    torch_distributed_zero_first,
)

LOCAL_RANK = int(os.getenv("LOCAL_RANK", -1))  # https://pytorch.org/docs/stable/elastic/run.html
RANK = int(os.getenv("RANK", -1))
WORLD_SIZE = int(os.getenv("WORLD_SIZE", 1))
GIT_INFO = check_git_info()


def train(hyp, opt, device, callbacks):
    """Run the training loop configured by `opt` and `hyp` on `device`.

    Args:
        hyp: Path to a hyperparameter YAML file (str) or an already-loaded dict. The dict is
            modified in place (weight decay and loss gains are rescaled).
        opt: Parsed options namespace (see `parse_opt`); `opt.hyp` is overwritten with a copy of
            the loaded hyperparameters.
        device: `torch.device` to train on.
        callbacks: Callback registry whose hooks are invoked at the named stages.

    Returns:
        The last `results` tuple produced by validation (initialised to seven zeros), or `None`
        when `callbacks.stop_training` is set during a batch.
    """
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = (
        Path(opt.save_dir),
        opt.epochs,
        opt.batch_size,
        opt.weights,
        opt.single_cls,
        opt.evolve,
        opt.data,
        opt.cfg,
        opt.resume,
        opt.noval,
        opt.nosave,
        opt.workers,
        opt.freeze,
    )
    callbacks.run("on_pretrain_routine_start")

    # Directories
    w = save_dir / "weights"  # weights dir
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    last, best = w / "last.pt", w / "best.pt"

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp, errors="ignore") as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(colorstr("hyperparameters: ") + ", ".join(f"{k}={v}" for k, v in hyp.items()))
    opt.hyp = hyp.copy()  # for saving hyps to checkpoints

    # Save run settings
    if not evolve:
        yaml_save(save_dir / "hyp.yaml", hyp)
        yaml_save(save_dir / "opt.yaml", vars(opt))

    # Loggers
    data_dict = None
    if RANK in {-1, 0}:
        include_loggers = list(LOGGERS)
        if getattr(opt, "ndjson_console", False):
            include_loggers.append("ndjson_console")
        if getattr(opt, "ndjson_file", False):
            include_loggers.append("ndjson_file")

        loggers = Loggers(
            save_dir=save_dir,
            weights=weights,
            opt=opt,
            hyp=hyp,
            logger=LOGGER,
            include=tuple(include_loggers),
        )

        # Register actions
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

        # Process custom dataset artifact link
        data_dict = loggers.remote_dataset
        if resume:  # If resuming runs from remote artifact
            weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size

    # Config
    plots = not evolve and not opt.noplots  # create plots
    cuda = device.type != "cpu"
    init_seeds(opt.seed + 1 + RANK, deterministic=True)
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    train_path, val_path = data_dict["train"], data_dict["val"]
    nc = 1 if single_cls else int(data_dict["nc"])  # number of classes
    names = {0: "item"} if single_cls and len(data_dict["names"]) != 1 else data_dict["names"]  # class names
    is_coco = isinstance(val_path, str) and val_path.endswith("coco/val2017.txt")  # COCO dataset

    # Model
    check_suffix(weights, ".pt")  # check weights
    pretrained = weights.endswith(".pt")
    if pretrained:
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location="cpu")  # load checkpoint to CPU to avoid CUDA memory leak
        model = Model(cfg or ckpt["model"].yaml, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device)  # create
        exclude = ["anchor"] if (cfg or hyp.get("anchors")) and not resume else []  # exclude keys
        csd = ckpt["model"].float().state_dict()  # checkpoint state_dict as FP32
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(csd, strict=False)  # load
        LOGGER.info(f"Transferred {len(csd)}/{len(model.state_dict())} items from {weights}")  # report
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get("anchors")).to(device)  # create
    amp = check_amp(model)  # check AMP

    # Freeze
    freeze = [f"model.{x}." for x in (freeze if len(freeze) > 1 else range(freeze[0]))]  # layers to freeze
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented for erratic training results)
        if any(x in k for x in freeze):
            LOGGER.info(f"freezing {k}")
            v.requires_grad = False

    # Image size
    gs = max(int(model.stride.max()), 32)  # grid size (max stride)
    imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)  # verify imgsz is gs-multiple

    # Batch size
    if RANK == -1 and batch_size == -1:  # single-GPU only, estimate best batch size
        batch_size = check_train_batch_size(model, imgsz, amp)
        loggers.on_params_update({"batch_size": batch_size})

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
    hyp["weight_decay"] *= batch_size * accumulate / nbs  # scale weight_decay
    optimizer = smart_optimizer(model, opt.optimizer, hyp["lr0"], hyp["momentum"], hyp["weight_decay"])

    # Scheduler
    if opt.cos_lr:
        lf = one_cycle(1, hyp["lrf"], epochs)  # cosine 1->hyp['lrf']
    else:

        def lf(x):
            """Linear learning rate scheduler function with decay calculated by epoch proportion."""
            return (1 - x / epochs) * (1.0 - hyp["lrf"]) + hyp["lrf"]  # linear

    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)

    # EMA
    ema = ModelEMA(model) if RANK in {-1, 0} else None

    # Resume
    best_fitness, start_epoch = 0.0, 0
    if pretrained:
        if resume:
            best_fitness, start_epoch, epochs = smart_resume(ckpt, optimizer, ema, weights, epochs, resume)
        del ckpt, csd

    # DP mode
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        LOGGER.warning(
            "WARNING ⚠️ DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n"
            "See Multi-GPU Tutorial at https://docs.ultralytics.com/yolov5/tutorials/multi_gpu_training to get started."
        )
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info("Using SyncBatchNorm()")

    # Trainloader
    train_loader, dataset = create_dataloader(
        train_path,
        imgsz,
        batch_size // WORLD_SIZE,
        gs,
        single_cls,
        hyp=hyp,
        augment=True,
        cache=None if opt.cache == "val" else opt.cache,
        rect=opt.rect,
        rank=LOCAL_RANK,
        workers=workers,
        image_weights=opt.image_weights,
        quad=opt.quad,
        prefix=colorstr("train: "),
        shuffle=True,
        seed=opt.seed,
    )
    labels = np.concatenate(dataset.labels, 0)
    mlc = int(labels[:, 0].max())  # max label class
    assert mlc < nc, f"Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}"

    # Process 0
    if RANK in {-1, 0}:
        val_loader = create_dataloader(
            val_path,
            imgsz,
            batch_size // WORLD_SIZE * 2,
            gs,
            single_cls,
            hyp=hyp,
            cache=None if noval else opt.cache,
            rect=True,
            rank=-1,
            workers=workers * 2,
            pad=0.5,
            prefix=colorstr("val: "),
        )[0]

        if not resume:
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp["anchor_t"], imgsz=imgsz)  # run AutoAnchor
            model.half().float()  # pre-reduce anchor precision

        callbacks.run("on_pretrain_routine_end", labels, names)

    # DDP mode
    if cuda and RANK != -1:
        model = smart_DDP(model)

    # Model attributes
    nl = de_parallel(model).model[-1].nl  # number of detection layers (to scale hyps)
    hyp["box"] *= 3 / nl  # scale to layers
    hyp["cls"] *= nc / 80 * 3 / nl  # scale to classes and layers
    hyp["obj"] *= (imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
    hyp["label_smoothing"] = opt.label_smoothing
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    nb = len(train_loader)  # number of batches
    nw = max(round(hyp["warmup_epochs"] * nb), 100)  # number of warmup iterations, max(3 epochs, 100 iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = torch.cuda.amp.GradScaler(enabled=amp)
    stopper, stop = EarlyStopping(patience=opt.patience), False
    compute_loss = ComputeLoss(model)  # init loss class
    callbacks.run("on_train_start")
    LOGGER.info(
        f"Image sizes {imgsz} train, {imgsz} val\n"
        f"Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n"
        f"Logging results to {colorstr('bold', save_dir)}\n"
        f"Starting training for {epochs} epochs..."
    )
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        callbacks.run("on_train_epoch_start")
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx

        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(3, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)
        LOGGER.info(("\n" + "%11s" * 7) % ("Epoch", "GPU_mem", "box_loss", "obj_loss", "cls_loss", "Instances", "Size"))
        if RANK in {-1, 0}:
            pbar = tqdm(pbar, total=nb, bar_format=TQDM_BAR_FORMAT)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            callbacks.run("on_train_batch_start")
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x["lr"] = np.interp(ni, xi, [hyp["warmup_bias_lr"] if j == 0 else 0.0, x["initial_lr"] * lf(epoch)])
                    if "momentum" in x:
                        x["momentum"] = np.interp(ni, xi, [hyp["warmup_momentum"], hyp["momentum"]])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = nn.functional.interpolate(imgs, size=ns, mode="bilinear", align_corners=False)

            # Forward
            with torch.cuda.amp.autocast(amp):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.0

            # Backward
            scaler.scale(loss).backward()

            # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
            if ni - last_opt_step >= accumulate:
                scaler.unscale_(optimizer)  # unscale gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)  # clip gradients
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in {-1, 0}:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f"{torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0:.3g}G"  # (GB)
                pbar.set_description(
                    ("%11s" * 2 + "%11.4g" * 5)
                    % (f"{epoch}/{epochs - 1}", mem, *mloss, targets.shape[0], imgs.shape[-1])
                )
                callbacks.run("on_train_batch_end", model, ni, imgs, targets, paths, list(mloss))
                if callbacks.stop_training:
                    return
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x["lr"] for x in optimizer.param_groups]  # for loggers
        scheduler.step()

        if RANK in {-1, 0}:
            # mAP
            callbacks.run("on_train_epoch_end", epoch=epoch)
            ema.update_attr(model, include=["yaml", "nc", "hyp", "names", "stride", "class_weights"])
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
            if not noval or final_epoch:  # Calculate mAP
                results, maps, _ = validate.run(
                    data_dict,
                    batch_size=batch_size // WORLD_SIZE * 2,
                    imgsz=imgsz,
                    half=amp,
                    model=ema.ema,
                    single_cls=single_cls,
                    dataloader=val_loader,
                    save_dir=save_dir,
                    plots=False,
                    callbacks=callbacks,
                    compute_loss=compute_loss,
                )

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            stop = stopper(epoch=epoch, fitness=fi)  # early stop check
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.run("on_fit_epoch_end", log_vals, epoch, best_fitness, fi)

            # Save model
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {
                    "epoch": epoch,
                    "best_fitness": best_fitness,
                    "model": deepcopy(de_parallel(model)).half(),
                    "ema": deepcopy(ema.ema).half(),
                    "updates": ema.updates,
                    "optimizer": optimizer.state_dict(),
                    "opt": vars(opt),
                    "git": GIT_INFO,  # {remote, branch, commit} if a git repo
                    "date": datetime.now().isoformat(),
                }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                if opt.save_period > 0 and epoch % opt.save_period == 0:
                    torch.save(ckpt, w / f"epoch{epoch}.pt")
                del ckpt
                callbacks.run("on_model_save", last, epoch, final_epoch, best_fitness, fi)

        # EarlyStopping
        if RANK != -1:  # if DDP training
            broadcast_list = [stop if RANK == 0 else None]
            dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
            if RANK != 0:
                stop = broadcast_list[0]
        if stop:
            break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in {-1, 0}:
        LOGGER.info(f"\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.")
        for f in last, best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is best:
                    LOGGER.info(f"\nValidating {f}...")
                    results, _, _ = validate.run(
                        data_dict,
                        batch_size=batch_size // WORLD_SIZE * 2,
                        imgsz=imgsz,
                        model=attempt_load(f, device).half(),
                        iou_thres=0.65 if is_coco else 0.60,  # best pycocotools at iou 0.65
                        single_cls=single_cls,
                        dataloader=val_loader,
                        save_dir=save_dir,
                        save_json=is_coco,
                        verbose=True,
                        plots=plots,
                        callbacks=callbacks,
                        compute_loss=compute_loss,
                    )  # val best model with plots
                    if is_coco:
                        callbacks.run("on_fit_epoch_end", list(mloss) + list(results) + lr, epoch, best_fitness, fi)

        callbacks.run("on_train_end", last, best, epoch, results)

    torch.cuda.empty_cache()
    return results


def parse_opt(known=False):
    """Build the command-line parser and parse the training options.

    Args:
        known: When true, use `parse_known_args` and ignore unrecognised arguments; otherwise
            use `parse_args`.

    Returns:
        The parsed `argparse.Namespace`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--weights", type=str, default=ROOT / "yolov5s.pt", help="initial weights path")
    parser.add_argument("--cfg", type=str, default="A_dataset/yolov5s.yaml", help="model.yaml path")
    parser.add_argument("--data", type=str, default=ROOT / "A_dataset/dataset.yaml", help="dataset.yaml path")
    parser.add_argument("--hyp", type=str, default=ROOT / "data/hyps/hyp.scratch-low.yaml", help="hyperparameters path")
    parser.add_argument("--epochs", type=int, default=100, help="total training epochs")
    parser.add_argument("--batch-size", type=int, default=16, help="total batch size for all GPUs, -1 for autobatch")
    parser.add_argument("--imgsz", "--img", "--img-size", type=int, default=640, help="train, val image size (pixels)")
    parser.add_argument("--rect", action="store_true", help="rectangular training")
    parser.add_argument("--resume", nargs="?", const=True, default=False, help="resume most recent training")
    parser.add_argument("--nosave", action="store_true", help="only save final checkpoint")
    parser.add_argument("--noval", action="store_true", help="only validate final epoch")
    parser.add_argument("--noautoanchor", action="store_true", help="disable AutoAnchor")
    parser.add_argument("--noplots", action="store_true", help="save no plot files")
    parser.add_argument("--evolve", type=int, nargs="?", const=300, help="evolve hyperparameters for x generations")
    parser.add_argument(
        "--evolve_population", type=str, default=ROOT / "data/hyps", help="location for loading population"
    )
    parser.add_argument("--resume_evolve", type=str, default=None, help="resume evolve from last generation")
    parser.add_argument("--bucket", type=str, default="", help="gsutil bucket")
    parser.add_argument("--cache", type=str, nargs="?", const="ram", help="image --cache ram/disk")
    parser.add_argument("--image-weights", action="store_true", help="use weighted image selection for training")
    parser.add_argument("--device", default="", help="cuda device, i.e. 0 or 0,1,2,3 or cpu")
    parser.add_argument("--multi-scale", action="store_true", help="vary img-size +/- 50%%")
    parser.add_argument("--single-cls", action="store_true", help="train multi-class data as single-class")
    parser.add_argument("--optimizer", type=str, choices=["SGD", "Adam", "AdamW"], default="SGD", help="optimizer")
    parser.add_argument("--sync-bn", action="store_true", help="use SyncBatchNorm, only available in DDP mode")
    parser.add_argument("--workers", type=int, default=0, help="max dataloader workers (per RANK in DDP mode)")
    parser.add_argument("--project", default=ROOT / "runs/train", help="save to project/name")
    parser.add_argument("--name", default="exp", help="save to project/name")
    parser.add_argument("--exist-ok", action="store_true", help="existing project/name ok, do not increment")
    parser.add_argument("--quad", action="store_true", help="quad dataloader")
    parser.add_argument("--cos-lr", action="store_true", help="cosine LR scheduler")
    parser.add_argument("--label-smoothing", type=float, default=0.0, help="Label smoothing epsilon")
    parser.add_argument("--patience", type=int, default=100, help="EarlyStopping patience (epochs without improvement)")
    parser.add_argument("--freeze", nargs="+", type=int, default=[0], help="Freeze layers: backbone=10, first3=0 1 2")
    parser.add_argument("--save-period", type=int, default=-1, help="Save checkpoint every x epochs (disabled if < 1)")
    parser.add_argument("--seed", type=int, default=0, help="Global training seed")
    parser.add_argument("--local_rank", type=int, default=-1, help="Automatic DDP Multi-GPU argument, do not modify")

    # Logger arguments
    parser.add_argument("--entity", default=None, help="Entity")
    parser.add_argument("--upload_dataset", nargs="?", const=True, default=False, help='Upload data, "val" option')
    parser.add_argument("--bbox_interval", type=int, default=-1, help="Set bounding-box image logging interval")
    parser.add_argument("--artifact_alias", type=str, default="latest", help="Version of dataset artifact to use")

    # NDJSON logging
    parser.add_argument("--ndjson-console", action="store_true", help="Log ndjson to console")
    parser.add_argument("--ndjson-file", action="store_true", help="Log ndjson to file")

    return parser.parse_known_args()[0] if known else parser.parse_args()


def main(opt, callbacks=None):
    """Validate options, set up the device/DDP, then train once or evolve hyperparameters.

    Args:
        opt: Options namespace from `parse_opt`. It is replaced by the saved options when
            resuming, and several path-like fields are normalised in place otherwise.
        callbacks: Callback registry passed to `train`. A fresh `Callbacks()` is created per
            call when omitted, so repeated `run()` calls do not share registered actions.
    """
    if callbacks is None:
        callbacks = Callbacks()

    if RANK in {-1, 0}:
        print_args(vars(opt))
        check_git_status()
        check_requirements(ROOT / "requirements.txt")

    # Resume (from specified or most recent last.pt)
    if opt.resume and not check_comet_resume(opt) and not opt.evolve:
        last = Path(check_file(opt.resume) if isinstance(opt.resume, str) else get_latest_run())
        opt_yaml = last.parent.parent / "opt.yaml"  # train options yaml
        opt_data = opt.data  # original dataset
        if opt_yaml.is_file():
            with open(opt_yaml, errors="ignore") as f:
                d = yaml.safe_load(f)
        else:
            d = torch.load(last, map_location="cpu")["opt"]
        opt = argparse.Namespace(**d)  # replace
        opt.cfg, opt.weights, opt.resume = "", str(last), True  # reinstate
        if is_url(opt_data):
            opt.data = check_file(opt_data)  # avoid HUB resume auth timeout
    else:
        opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = (
            check_file(opt.data),
            check_yaml(opt.cfg),
            check_yaml(opt.hyp),
            str(opt.weights),
            str(opt.project),
        )  # checks
        assert len(opt.cfg) or len(opt.weights), "either --cfg or --weights must be specified"
        if opt.evolve:
            if opt.project == str(ROOT / "runs/train"):  # if default project name, rename to runs/evolve
                opt.project = str(ROOT / "runs/evolve")
            opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
        if opt.name == "cfg":
            opt.name = Path(opt.cfg).stem  # use model.yaml as name
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))

    # DDP mode
    device = select_device(opt.device, batch_size=opt.batch_size)
    if LOCAL_RANK != -1:
        msg = "is not compatible with YOLOv5 Multi-GPU DDP training"
        assert not opt.image_weights, f"--image-weights {msg}"
        assert not opt.evolve, f"--evolve {msg}"
        assert opt.batch_size != -1, f"AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size"
        assert opt.batch_size % WORLD_SIZE == 0, f"--batch-size {opt.batch_size} must be multiple of WORLD_SIZE"
        assert torch.cuda.device_count() > LOCAL_RANK, "insufficient CUDA devices for DDP command"
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device("cuda", LOCAL_RANK)
        dist.init_process_group(
            backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=10800)
        )

    # Train
    if not opt.evolve:
        train(opt.hyp, opt, device, callbacks)

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (including this hyperparameter True-False, lower_limit, upper_limit)
        meta = {
            "lr0": (False, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
            "lrf": (False, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
            "momentum": (False, 0.6, 0.98),  # SGD momentum/Adam beta1
            "weight_decay": (False, 0.0, 0.001),  # optimizer weight decay
            "warmup_epochs": (False, 0.0, 5.0),  # warmup epochs (fractions ok)
            "warmup_momentum": (False, 0.0, 0.95),  # warmup initial momentum
            "warmup_bias_lr": (False, 0.0, 0.2),  # warmup initial bias lr
            "box": (False, 0.02, 0.2),  # box loss gain
            "cls": (False, 0.2, 4.0),  # cls loss gain
            "cls_pw": (False, 0.5, 2.0),  # cls BCELoss positive_weight
            "obj": (False, 0.2, 4.0),  # obj loss gain (scale with pixels)
            "obj_pw": (False, 0.5, 2.0),  # obj BCELoss positive_weight
            "iou_t": (False, 0.1, 0.7),  # IoU training threshold
            "anchor_t": (False, 2.0, 8.0),  # anchor-multiple threshold
            "anchors": (False, 2.0, 10.0),  # anchors per output grid (0 to ignore)
            "fl_gamma": (False, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
            "hsv_h": (True, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
            "hsv_s": (True, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
            "hsv_v": (True, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
            "degrees": (True, 0.0, 45.0),  # image rotation (+/- deg)
            "translate": (True, 0.0, 0.9),  # image translation (+/- fraction)
            "scale": (True, 0.0, 0.9),  # image scale (+/- gain)
            "shear": (True, 0.0, 10.0),  # image shear (+/- deg)
            "perspective": (True, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
            "flipud": (True, 0.0, 1.0),  # image flip up-down (probability)
            "fliplr": (True, 0.0, 1.0),  # image flip left-right (probability)
            "mosaic": (True, 0.0, 1.0),  # image mosaic (probability)
            "mixup": (True, 0.0, 1.0),  # image mixup (probability)
            "copy_paste": (True, 0.0, 1.0),  # segment copy-paste (probability)
        }

        # GA configs
        pop_size = 50
        mutation_rate_min = 0.01
        mutation_rate_max = 0.5
        crossover_rate_min = 0.5
        crossover_rate_max = 1
        min_elite_size = 2
        max_elite_size = 5
        tournament_size_min = 2
        tournament_size_max = 10

        with open(opt.hyp, errors="ignore") as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if "anchors" not in hyp:  # anchors commented in hyp.yaml
                hyp["anchors"] = 3
        if opt.noautoanchor:
            del hyp["anchors"], meta["anchors"]
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / "hyp_evolve.yaml", save_dir / "evolve.csv"
        if opt.bucket:
            # download evolve.csv if exists
            subprocess.run(
                [
                    "gsutil",
                    "cp",
                    f"gs://{opt.bucket}/evolve.csv",
                    str(evolve_csv),
                ]
            )

        # Delete the items in meta dictionary whose first value is False
        del_ = [item for item, value_ in meta.items() if value_[0] is False]
        hyp_GA = hyp.copy()  # Make a copy of hyp dictionary
        for item in del_:
            del meta[item]  # Remove the item from meta dictionary
            del hyp_GA[item]  # Remove the item from hyp_GA dictionary

        # Set lower_limit and upper_limit arrays to hold the search space boundaries
        lower_limit = np.array([meta[k][1] for k in hyp_GA.keys()])
        upper_limit = np.array([meta[k][2] for k in hyp_GA.keys()])

        # Create gene_ranges list to hold the range of values for each gene in the population
        gene_ranges = [(lower_limit[i], upper_limit[i]) for i in range(len(upper_limit))]

        # Initialize the population with initial_values or random values
        initial_values = []

        # If resuming evolution from a previous checkpoint
        if opt.resume_evolve is not None:
            assert os.path.isfile(ROOT / opt.resume_evolve), "evolve population path is wrong!"
            with open(ROOT / opt.resume_evolve, errors="ignore") as f:
                evolve_population = yaml.safe_load(f)
                for value in evolve_population.values():
                    value = np.array([value[k] for k in hyp_GA.keys()])
                    initial_values.append(list(value))

        # If not resuming from a previous checkpoint, generate initial values from .yaml files in opt.evolve_population
        else:
            yaml_files = [f for f in os.listdir(opt.evolve_population) if f.endswith(".yaml")]
            for file_name in yaml_files:
                with open(os.path.join(opt.evolve_population, file_name)) as yaml_file:
                    value = yaml.safe_load(yaml_file)
                    value = np.array([value[k] for k in hyp_GA.keys()])
                    initial_values.append(list(value))

        # Generate random values within the search space for the rest of the population.
        # `initial_values` is always a list, so the population is built unconditionally (the former
        # `is None` branch was unreachable and left `population` unbound when pop_size == 1).
        n_random = max(pop_size - len(initial_values), 0)
        population = [generate_individual(gene_ranges, len(hyp_GA)) for _ in range(n_random)]
        for initial_value in initial_values:
            population = [initial_value] + population

        # Run the genetic algorithm for a fixed number of generations
        list_keys = list(hyp_GA.keys())
        evaluated_population = population  # population that the latest fitness_scores refer to
        for generation in range(opt.evolve):
            if generation >= 1:
                save_dict = {}
                for i in range(len(population)):
                    little_dict = {list_keys[j]: float(population[i][j]) for j in range(len(population[i]))}
                    save_dict[f"gen{str(generation)}number{str(i)}"] = little_dict

                with open(save_dir / "evolve_population.yaml", "w") as outfile:
                    yaml.dump(save_dict, outfile, default_flow_style=False)

            # Adaptive elite size
            elite_size = min_elite_size + int((max_elite_size - min_elite_size) * (generation / opt.evolve))

            # Evaluate the fitness of each individual in the population
            fitness_scores = []
            for individual in population:
                for key, value in zip(hyp_GA.keys(), individual):
                    hyp_GA[key] = value
                hyp.update(hyp_GA)
                results = train(hyp.copy(), opt, device, callbacks)
                callbacks = Callbacks()
                # Write mutation results
                keys = (
                    "metrics/precision",
                    "metrics/recall",
                    "metrics/mAP_0.5",
                    "metrics/mAP_0.5:0.95",
                    "val/box_loss",
                    "val/obj_loss",
                    "val/cls_loss",
                )
                print_mutation(keys, results, hyp.copy(), save_dir, opt.bucket)
                fitness_scores.append(results[2])

            # Select the fittest individuals for reproduction using adaptive tournament selection
            selected_indices = []
            for _ in range(pop_size - elite_size):
                # Adaptive tournament size
                tournament_size = max(
                    max(2, tournament_size_min),
                    int(min(tournament_size_max, pop_size) - (generation / (opt.evolve / 10))),
                )
                # Perform tournament selection to choose the best individual
                tournament_indices = random.sample(range(pop_size), tournament_size)
                tournament_fitness = [fitness_scores[j] for j in tournament_indices]
                winner_index = tournament_indices[tournament_fitness.index(max(tournament_fitness))]
                selected_indices.append(winner_index)

            # Add the elite individuals to the selected indices
            elite_indices = [i for i in range(pop_size) if fitness_scores[i] in sorted(fitness_scores)[-elite_size:]]
            selected_indices.extend(elite_indices)

            # Create the next generation through crossover and mutation
            next_generation = []
            for _ in range(pop_size):
                parent1_index = selected_indices[random.randint(0, pop_size - 1)]
                parent2_index = selected_indices[random.randint(0, pop_size - 1)]
                # Adaptive crossover rate
                crossover_rate = max(
                    crossover_rate_min, min(crossover_rate_max, crossover_rate_max - (generation / opt.evolve))
                )
                if random.uniform(0, 1) < crossover_rate:
                    crossover_point = random.randint(1, len(hyp_GA) - 1)
                    child = population[parent1_index][:crossover_point] + population[parent2_index][crossover_point:]
                else:
                    # Copy the parent: mutating the shared list in place would alter the parent
                    # and every other child that reuses it.
                    child = list(population[parent1_index])
                # Adaptive mutation rate
                mutation_rate = max(
                    mutation_rate_min, min(mutation_rate_max, mutation_rate_max - (generation / opt.evolve))
                )
                for j in range(len(hyp_GA)):
                    if random.uniform(0, 1) < mutation_rate:
                        child[j] += random.uniform(-0.1, 0.1)
                        child[j] = min(max(child[j], gene_ranges[j][0]), gene_ranges[j][1])
                next_generation.append(child)

            # Replace the old population with the new generation, remembering which population
            # the current fitness_scores were measured on.
            evaluated_population = population
            population = next_generation

        # Print the best solution found (indexed into the evaluated population, not the
        # not-yet-evaluated next generation)
        best_index = fitness_scores.index(max(fitness_scores))
        best_individual = evaluated_population[best_index]
        print("Best solution found:", best_individual)

        # Plot results
        plot_evolve(evolve_csv)
        LOGGER.info(
            f"Hyperparameter evolution finished {opt.evolve} generations\n"
            f"Results saved to {colorstr('bold', save_dir)}\n"
            f"Usage example: $ python train.py --hyp {evolve_yaml}"
        )


def generate_individual(input_ranges, individual_length):
    """Return a list of `individual_length` floats, the i-th drawn uniformly from `input_ranges[i]`.

    Args:
        input_ranges: Sequence of `(lower_bound, upper_bound)` pairs, one per gene.
        individual_length: Number of genes to generate.
    """
    individual = []
    for i in range(individual_length):
        lower_bound, upper_bound = input_ranges[i]
        individual.append(random.uniform(lower_bound, upper_bound))
    return individual


def run(**kwargs):
    """Parse known CLI options, override them with `kwargs`, call `main`, and return the options."""
    opt = parse_opt(True)
    for k, v in kwargs.items():
        setattr(opt, k, v)
    main(opt)
    return opt


if __name__ == "__main__":
    opt = parse_opt()
    main(opt)

# Original poster's request (page text, kept verbatim):
# 这是yolov5自带的train训练代码,我现在想要修改它,让该代码简洁一些,并且仍然能完成训练

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license """ Train a YOLOv5 model on a custom dataset. Models and datasets download automatically from the latest YOLOv5 release. Usage - Single-GPU training: $ python train.py --data coco128.yaml --weights yolov5s.pt --img 640 # from pretrained (recommended) $ python train.py --data coco128.yaml --weights '' --cfg yolov5s.yaml --img 640 # from scratch Usage - Multi-GPU DDP training: $ python -m torch.distributed.run --nproc_per_node 4 --master_port 1 train.py --data coco128.yaml --weights yolov5s.pt --img 640 --device 0,1,2,3 Models: https://github.com/ultralytics/yolov5/tree/master/models Datasets: https://github.com/ultralytics/yolov5/tree/master/data Tutorial: https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data """ import argparse import math import os os.environ["GIT_PYTHON_REFRESH"] = "quiet" # add there import random import sys import time from copy import deepcopy from datetime import datetime from pathlib import Path import numpy as np import torch import torch.distributed as dist import torch.nn as nn import yaml from torch.optim import lr_scheduler from tqdm import tqdm # import numpy # import torch.serialization # torch.serialization.add_safe_globals([numpy._core.multiarray._reconstruct]) FILE = Path(__file__).resolve() ROOT = FILE.parents[0] # YOLOv5 root directory if str(ROOT) not in sys.path: sys.path.append(str(ROOT)) # add ROOT to PATH ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative import val as validate # for end-of-epoch mAP from models.experimental import attempt_load from models.yolo import Model from utils.autoanchor import check_anchors from utils.autobatch import check_train_batch_size from utils.callbacks import Callbacks from utils.dataloaders import create_dataloader from utils.downloads import attempt_download, is_url from utils.general import (LOGGER, TQDM_BAR_FORMAT, 
check_amp, check_dataset, check_file, check_git_info, check_git_status, check_img_size, check_requirements, check_suffix, check_yaml, colorstr, get_latest_run, increment_path, init_seeds, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle, print_args, print_mutation, strip_optimizer, yaml_save) from utils.loggers import Loggers from utils.loggers.comet.comet_utils import check_comet_resume from utils.loss import ComputeLoss from utils.metrics import fitness from utils.plots import plot_evolve from utils.torch_utils import (EarlyStopping, ModelEMA, de_parallel, select_device, smart_DDP, smart_optimizer, smart_resume, torch_distributed_zero_first) LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html RANK = int(os.getenv('RANK', -1)) WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) GIT_INFO = check_git_info() def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze callbacks.run('on_pretrain_routine_start') # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) opt.hyp = hyp.copy() # for saving hyps to checkpoints # Save run settings if not evolve: yaml_save(save_dir / 'hyp.yaml', hyp) yaml_save(save_dir / 'opt.yaml', vars(opt)) # Loggers data_dict = None if RANK in {-1, 0}: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # 
loggers instance # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Process custom dataset artifact link data_dict = loggers.remote_dataset if resume: # If resuming runs from remote artifact weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' init_seeds(opt.seed + 1 + RANK, deterministic=True) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = {0: 'item'} if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt') # COCO dataset # Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location='cpu', weights_only=False) # load checkpoint to CPU to avoid CUDA memory leak model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict() # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}') # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create amp = check_amp(model) # check AMP # Freeze freeze = [f'model.{x}.' 
for x in (freeze if len(freeze) > 1 else range(freeze[0]))] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers # v.register_hook(lambda x: torch.nan_to_num(x)) # NaN to 0 (commented for erratic training results) if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz, amp) loggers.on_params_update({"batch_size": batch_size}) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay optimizer = smart_optimizer(model, opt.optimizer, hyp['lr0'], hyp['momentum'], hyp['weight_decay']) # Scheduler if opt.cos_lr: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] else: lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in {-1, 0} else None # Resume best_fitness, start_epoch = 0.0, 0 if pretrained: if resume: best_fitness, start_epoch, epochs = smart_resume(ckpt, optimizer, ema, weights, epochs, resume) del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning('WARNING ⚠️ DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.') model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) 
LOGGER.info('Using SyncBatchNorm()') # Trainloader train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=None if opt.cache == 'val' else opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True) labels = np.concatenate(dataset.labels, 0) mlc = int(labels[:, 0].max()) # max label class assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in {-1, 0}: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers * 2, pad=0.5, prefix=colorstr('val: '))[0] if not resume: if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # run AutoAnchor model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end', labels, names) # DDP mode if cuda and RANK != -1: model = smart_DDP(model) # Model attributes nl = de_parallel(model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nb = len(train_loader) # number of batches nw = max(round(hyp['warmup_epochs'] * nb), 100) # number of warmup iterations, max(3 epochs, 100 iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 
0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = torch.cuda.amp.GradScaler(enabled=amp) stopper, stop = EarlyStopping(patience=opt.patience), False compute_loss = ComputeLoss(model) # init loss class callbacks.run('on_train_start') LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ callbacks.run('on_train_epoch_start') model.train() # Update image weights (optional, single-GPU only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(3, device=device) # mean losses if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info(('\n' + '%11s' * 7) % ('Epoch', 'GPU_mem', 'box_loss', 'obj_loss', 'cls_loss', 'Instances', 'Size')) if RANK in {-1, 0}: pbar = tqdm(pbar, total=nb, bar_format=TQDM_BAR_FORMAT) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss 
ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 0 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward # with torch.cuda.amp.autocast(amp): with torch.amp.autocast(device_type='cuda'): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. 
# Backward scaler.scale(loss).backward() # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html if ni - last_opt_step >= accumulate: scaler.unscale_(optimizer) # unscale gradients torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0) # clip gradients scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in {-1, 0}: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%11s' * 2 + '%11.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', model, ni, imgs, targets, paths, list(mloss)) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in {-1, 0}: # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights']) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = validate.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, half=amp, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] stop = stopper(epoch=epoch, fitness=fi) # early stop check if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if 
save ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'model': deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': ema.updates, 'optimizer': optimizer.state_dict(), 'opt': vars(opt), 'git': GIT_INFO, # {remote, branch, commit} if a git repo 'date': datetime.now().isoformat()} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if opt.save_period > 0 and epoch % opt.save_period == 0: torch.save(ckpt, w / f'epoch{epoch}.pt') del ckpt callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi) # EarlyStopping if RANK != -1: # if DDP training broadcast_list = [stop if RANK == 0 else None] dist.broadcast_object_list(broadcast_list, 0) # broadcast 'stop' to all ranks if RANK != 0: stop = broadcast_list[0] if stop: break # must break all DDP ranks # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if RANK in {-1, 0}: LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.') for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if f is best: LOGGER.info(f'\nValidating {f}...') results, _, _ = validate.run( data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=attempt_load(f, device).half(), iou_thres=0.65 if is_coco else 0.60, # best pycocotools at iou 0.65 single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, save_json=is_coco, verbose=True, plots=plots, callbacks=callbacks, compute_loss=compute_loss) # val best model with plots if is_coco: callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) callbacks.run('on_train_end', last, best, epoch, results) torch.cuda.empty_cache() return results def parse_opt(known=False): parser = argparse.ArgumentParser() 
parser.add_argument('--weights', type=str, default='./weights/yolov5s.pt', help='initial weights path') parser.add_argument('--cfg', type=str, default='./models/yolov5s.yaml', help='model.yaml path') parser.add_argument('--data', type=str, default=r'C:data/AAAA.yaml', help='data.yaml path') parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch-low.yaml', help='hyperparameters path') parser.add_argument('--epochs', type=int, default=100, help='total training epochs') parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs, -1 for autobatch') parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='train, val image size (pixels)') parser.add_argument('--rect', action='store_true', help='rectangular training') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') parser.add_argument('--noval', action='store_true', help='only validate final epoch') parser.add_argument('--noautoanchor', action='store_true', help='disable AutoAnchor') parser.add_argument('--noplots', action='store_true', help='save no plot files') parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations') parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') parser.add_argument('--cache', type=str, nargs='?', const='ram', help='image --cache ram/disk') parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class') parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'AdamW'], default='SGD', help='optimizer') parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--workers', type=int, default=8, help='max dataloader workers (per RANK in DDP mode)') parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name') parser.add_argument('--name', default='welding_defect_yolov5s_20241101_300', help='save to project/name') parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--quad', action='store_true', help='quad dataloader') parser.add_argument('--cos-lr', action='store_true', help='cosine LR scheduler') parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon') parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)') parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2') parser.add_argument('--save-period', type=int, default=5, help='Save checkpoint every x epochs (disabled if < 1)') parser.add_argument('--seed', type=int, default=0, help='Global training seed') parser.add_argument('--local_rank', type=int, default=-1, help='Automatic DDP Multi-GPU argument, do not modify') # Logger arguments parser.add_argument('--entity', default=None, help='Entity') parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='Upload data, "val" option') parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval') parser.add_argument('--artifact_alias', type=str, 
default='latest', help='Version of dataset artifact to use') return parser.parse_known_args()[0] if known else parser.parse_args() def main(opt, callbacks=Callbacks()): # Checks if RANK in {-1, 0}: print_args(vars(opt)) check_git_status() check_requirements() # Resume (from specified or most recent last.pt) if opt.resume and not check_comet_resume(opt) and not opt.evolve: last = Path(check_file(opt.resume) if isinstance(opt.resume, str) else get_latest_run()) opt_yaml = last.parent.parent / 'opt.yaml' # train options yaml opt_data = opt.data # original dataset if opt_yaml.is_file(): with open(opt_yaml, errors='ignore') as f: d = yaml.safe_load(f) else: d = torch.load(last, map_location='cpu')['opt'] opt = argparse.Namespace(**d) # replace opt.cfg, opt.weights, opt.resume = '', str(last), True # reinstate if is_url(opt_data): opt.data = check_file(opt_data) # avoid HUB resume auth timeout else: opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \ check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project) # checks assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' if opt.evolve: if opt.project == str(ROOT / 'runs/train'): # if default project name, rename to runs/evolve opt.project = str(ROOT / 'runs/evolve') opt.exist_ok, opt.resume = opt.resume, False # pass resume to exist_ok and disable resume if opt.name == 'cfg': opt.name = Path(opt.cfg).stem # use model.yaml as name opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if LOCAL_RANK != -1: msg = 'is not compatible with YOLOv5 Multi-GPU DDP training' assert not opt.image_weights, f'--image-weights {msg}' assert not opt.evolve, f'--evolve {msg}' assert opt.batch_size != -1, f'AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size' assert opt.batch_size % WORLD_SIZE == 0, f'--batch-size {opt.batch_size} must be 
multiple of WORLD_SIZE' assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo") # Train if not opt.evolve: train(opt.hyp, opt, device, callbacks) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip 
left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0), # image mixup (probability) 'copy_paste': (1, 0.0, 1.0)} # segment copy-paste (probability) with open(opt.hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict if 'anchors' not in hyp: # anchors commented in hyp.yaml hyp['anchors'] = 3 if opt.noautoanchor: del hyp['anchors'], meta['anchors'] opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir) # only val/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv' if opt.bucket: os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {evolve_csv}') # download evolve.csv if exists for _ in range(opt.evolve): # generations to evolve if evolve_csv.exists(): # if evolve.csv exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0) if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([meta[k][0] for k in hyp.keys()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all(v == 1): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], 
v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device, callbacks) callbacks = Callbacks() # Write mutation results keys = ('metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss') print_mutation(keys, results, hyp.copy(), save_dir, opt.bucket) # Plot results plot_evolve(evolve_csv) LOGGER.info(f'Hyperparameter evolution finished {opt.evolve} generations\n' f"Results saved to {colorstr('bold', save_dir)}\n" f'Usage example: $ python train.py --hyp {evolve_yaml}') def run(**kwargs): # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt') opt = parse_opt(True) for k, v in kwargs.items(): setattr(opt, k, v) main(opt) return opt if __name__ == "__main__": opt = parse_opt() main(opt) 为什么训练之后,他的runs里面并没有显示best.pt跟last.pt 请查找原因

--------------------------------------------------------------------------- KeyError Traceback (most recent call last) File D:\Anaconda\envs\DL\lib\site-packages\monai\transforms\transform.py:141, in apply_transform(transform, data, map_items, unpack_items, log_stats, lazy, overrides) 140 return [_apply_transform(transform, item, unpack_items, lazy, overrides, log_stats) for item in data] --> 141 return _apply_transform(transform, data, unpack_items, lazy, overrides, log_stats) 142 except Exception as e: 143 # if in debug mode, don't swallow exception so that the breakpoint 144 # appears where the exception was raised. File D:\Anaconda\envs\DL\lib\site-packages\monai\transforms\transform.py:98, in _apply_transform(transform, data, unpack_parameters, lazy, overrides, logger_name) 96 return transform(*data, lazy=lazy) if isinstance(transform, LazyTrait) else transform(*data) ---> 98 return transform(data, lazy=lazy) if isinstance(transform, LazyTrait) else transform(data) File D:\Anaconda\envs\DL\lib\site-packages\monai\transforms\io\dictionary.py:176, in LoadImaged.__call__(self, data, reader) 175 if meta_key in d and not self.overwriting: --> 176 raise KeyError(f"Metadata with key {meta_key} already exists and overwriting=False.") 177 d[meta_key] = data[1] KeyError: 'Metadata with key image_meta_dict already exists and overwriting=False.' 
The above exception was the direct cause of the following exception: RuntimeError Traceback (most recent call last) Cell In[1], line 196 193 image_path = "D:/monaisj/0/3.nii.gz" 194 mask_path = "D:/monaisj/0/3mask.nii.gz" # 如果可用 --> 196 main(image_path, mask_path) Cell In[1], line 182, in main(image_path, mask_path) 179 wrapped_model = ModelWrapper(model).to(device) 181 # 预处理样本 --> 182 data = prepare_sample(image_path, mask_path) 184 # 生成可视化 185 visualize_gradcam( 186 model_wrapper=wrapped_model, 187 data_dict=data, 188 device=device, 189 patient_id=os.path.basename(image_path).split('.')[0] 190 ) Cell In[1], line 66, in prepare_sample(image_path, mask_path) 57 monai.data.write_nifti(mask_data, mask_path, affine=np.eye(4)) 59 sample = { 60 "image": image_path, 61 "mask": mask_path, 62 "image_meta_dict": {"original_affine": np.eye(4)}, 63 "mask_meta_dict": {"original_affine": np.eye(4)} 64 } ---> 66 return transforms(sample) File D:\Anaconda\envs\DL\lib\site-packages\monai\transforms\compose.py:335, in Compose.__call__(self, input_, start, end, threading, lazy) 333 def __call__(self, input_, start=0, end=None, threading=False, lazy: bool | None = None): 334 _lazy = self._lazy if lazy is None else lazy --> 335 result = execute_compose( 336 input_, 337 transforms=self.transforms, 338 start=start, 339 end=end, 340 map_items=self.map_items, 341 unpack_items=self.unpack_items, 342 lazy=_lazy, 343 overrides=self.overrides, 344 threading=threading, 345 log_stats=self.log_stats, 346 ) 348 return result File D:\Anaconda\envs\DL\lib\site-packages\monai\transforms\compose.py:111, in execute_compose(data, transforms, map_items, unpack_items, start, end, lazy, overrides, threading, log_stats) 109 if threading: 110 _transform = deepcopy(_transform) if isinstance(_transform, ThreadUnsafe) else _transform --> 111 data = apply_transform( 112 _transform, data, map_items, unpack_items, lazy=lazy, overrides=overrides, log_stats=log_stats 113 ) 114 data = apply_pending_transforms(data, 
None, overrides, logger_name=log_stats) 115 return data File D:\Anaconda\envs\DL\lib\site-packages\monai\transforms\transform.py:171, in apply_transform(transform, data, map_items, unpack_items, log_stats, lazy, overrides) 169 else: 170 _log_stats(data=data) --> 171 raise RuntimeError(f"applying transform {transform}") from e RuntimeError: applying transform <monai.transforms.io.dictionary.LoadImaged object at 0x0000018E44F8BE80>

模型初始化成功 加载并转换预训练权重: checkpoints/M3FD.ckpt 找到 1 个需要转换的权重层 转换权重: first_stage_model.encoder.conv_in.weight 原始形状: torch.Size([128, 3, 3, 3]) 转换后形状: torch.Size([128, 1, 3, 3]) 权重加载完成: 成功加载 1830/1830 层 VAE输入层形状: torch.Size([128, 1, 3, 3]) 加载预训练权重: checkpoints/M3FD.ckpt 找到需要转换的层: ['first_stage_model.encoder.conv_in.weight'] 转换权重: first_stage_model.encoder.conv_in.weight 原始形状: torch.Size([128, 3, 3, 3]) 转换后形状: torch.Size([128, 1, 3, 3]) 权重加载完成: 缺失层 1, 不匹配层 0 缺失层: ['cond_stage_model.transformer.text_model.embeddings.position_ids'] 使用基础学习率: 1.00e-04 警告: 使用仅含权重的检查点,训练状态将重置: D:\work\AI\DiffV2IR\checkpoints\M3FD.ckpt 预处理检查点文件: D:\work\AI\DiffV2IR\checkpoints\M3FD.ckpt 已重置训练状态: epoch=0, global_step=0 警告: 检查点缺少 'cond_stage_model.transformer.text_model.embeddings.position_ids' 键 已添加 position_ids 到检查点 修复后的完整检查点已保存到: logs\experiment_20250703_181326\checkpoints\fixed_checkpoint.ckpt TensorBoard日志保存在: logs\experiment_20250703_181326\tensorboard 训练批次数: 8 最终训练器配置: default_root_dir: logs\experiment_20250703_181326 max_epochs: 200 gpus: 1 distributed_backend: None plugins: None precision: 16 accumulate_grad_batches: 1 callbacks: [, , <__main__.EnhancedImageLogger object at 0x000001F79D63F130>, <__main__.TQDMProgressBar object at 0x000001F79D63F040>, <__main__.PerformanceMonitor object at 0x000001F79D63F010>] logger: <__main__.TensorBoardLogger object at 0x000001F79BA61480> resume_from_checkpoint: logs\experiment_20250703_181326\checkpoints\fixed_checkpoint.ckpt fast_dev_run: False limit_val_batches: 1.0 num_sanity_val_steps: 0 log_every_n_steps: 10 check_val_every_n_epoch: 1 Using native 16bit precision. GPU available: True, used: True TPU available: False, using: 0 TPU cores IPU available: False, using: 0 IPUs 开始训练... 
Restoring states from the checkpoint file at logs\experiment_20250703_181326\checkpoints\fixed_checkpoint.ckpt 训练出错: Error(s) in loading state_dict for CustomLatentDiffusion: size mismatch for first_stage_model.encoder.conv_in.weight: copying a param with shape torch.Size([128, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 1, 3, 3]).

自定义模型加载失败: Error(s) in loading state_dict for YOLOv5Custom: Missing key(s) in state_dict: "model.0.weight", "model.0.bias", "model.4.weight", "model.4.bias", "model.6.weight", "model.6.bias". Unexpected key(s) in state_dict: "backbone.0.weight", "backbone.0.bias", "backbone.1.weight", "backbone.1.bias", "backbone.1.running_mean", "backbone.1.running_var", "backbone.1.num_batches_tracked", "backbone.3.weight", "backbone.3.bias", "backbone.4.weight", "backbone.4.bias", "backbone.4.running_mean", "backbone.4.running_var", "backbone.4.num_batches_tracked", "backbone.6.weight", "backbone.6.bias", "backbone.7.weight", "backbone.7.bias", "backbone.7.running_mean", "backbone.7.running_var", "backbone.7.num_batches_tracked", "backbone.9.weight", "backbone.9.bias", "backbone.10.weight", "backbone.10.bias", "backbone.10.running_mean", "backbone.10.running_var", "backbone.10.num_batches_tracked", "backbone.12.weight", "backbone.12.bias", "backbone.13.weight", "backbone.13.bias", "backbone.13.running_mean", "backbone.13.running_var", "backbone.13.num_batches_tracked", "head.0.weight", "head.0.bias", "head.1.weight", "head.1.bias", "head.1.running_mean", "head.1.running_var", "head.1.num_batches_tracked", "head.4.weight", "head.4.bias", "head.5.weight", "head.5.bias", "head.5.running_mean", "head.5.running_var", "head.5.num_batches_tracked", "head.8.weight", "head.8.bias", "head.9.weight", "head.9.bias", "head.9.running_mean", "head.9.running_var", "head.9.num_batches_tracked", "head.12.weight", "head.12.bias", "head.13.weight", "head.13.bias", "head.13.running_mean", "head.13.running_var", "head.13.num_batches_tracked", "head.16.weight", "head.16.bias". C:\Users\23228/.cache\torch\hub\ultralytics_yolov5_master\models\common.py:907: FutureWarning: torch.cuda.amp.autocast(args...) is deprecated. Please use torch.amp.autocast('cuda', args...) instead. with amp.autocast(autocast): 为什么

最新推荐

recommend-type

【大学生电子设计】:备战2015全国大学生电子设计竞赛-信号源类赛题分析.pdf

【大学生电子设计】:备战2015全国大学生电子设计竞赛-信号源类赛题分析.pdf
recommend-type

湘潭大学人工智能专业2024级大一C语言期末考试题库项目-包含58个从头歌平台抓取并排版的C语言编程题目及解答-涵盖基础语法-数组操作-条件判断-循环结构-函数调用-指针应用等核心.zip

2025电赛预测湘潭大学人工智能专业2024级大一C语言期末考试题库项目_包含58个从头歌平台抓取并排版的C语言编程题目及解答_涵盖基础语法_数组操作_条件判断_循环结构_函数调用_指针应用等核心.zip
recommend-type

Pansophica开源项目:智能Web搜索代理的探索

Pansophica开源项目是一个相对较新且具有创新性的智能Web搜索代理,它突破了传统搜索引擎的界限,提供了一种全新的交互方式。首先,我们来探讨“智能Web搜索代理”这一概念。智能Web搜索代理是一个软件程序或服务,它可以根据用户的查询自动执行Web搜索,并尝试根据用户的兴趣、历史搜索记录或其他输入来提供个性化的搜索结果。 Pansophica所代表的不仅仅是搜索结果的展示,它还强调了一个交互式的体验,在动态和交互式虚拟现实中呈现搜索结果。这种呈现方式与现有的搜索体验有着根本的不同。目前的搜索引擎,如Google、Bing和Baidu等,多以静态文本和链接列表的形式展示结果。而Pansophica通过提供一个虚拟现实环境,使得搜索者可以“扭转”视角,进行“飞行”探索,以及“弹网”来浏览不同的内容。这种多维度的交互方式使得信息的浏览变得更加快速和直观,有望改变用户与网络信息互动的方式。 接着,我们关注Pansophica的“开源”属性。所谓开源,指的是软件的源代码可以被公众获取,任何个人或组织都可以自由地使用、学习、修改和分发这些代码。开源软件通常由社区进行开发和维护,这样的模式鼓励了协作创新并减少了重复性劳动,因为全世界的开发者都可以贡献自己的力量。Pansophica项目作为开源软件,意味着其他开发者可以访问和使用其源代码,进一步改进和扩展其功能,甚至可以为Pansophica构建新的应用或服务。 最后,文件名称“Pansophica-src-1.3”表明了我们讨论的特定版本的Pansophica开源代码。数字“1.3”很可能指的是该版本号,表明这是Pansophica项目的第1.3个公开版本。这个版本号对于开发者来说很重要,因为它可以帮助他们追踪更新的进度、比较版本间的差异以及选择最适合其项目的版本。 综合以上信息,Pansophica开源项目不仅在搜索代理领域引入了创新的交互方式,而且通过开源的方式,提供了技术共享和协作的可能性。对于IT行业来说,开源项目是推动技术发展的重要力量,它激发了社区的活力,促进了知识的传播和技术的进步。通过构建一个开源社区,Pansophica可以吸引来自世界各地的开发者和研究人员,共同探索如何将搜索技术与交互式虚拟现实技术结合起来,为用户带来更加丰富和有效的信息检索体验。同时,Pansophica的开源性质也使其更容易适应未来技术的发展,随着社区的不断贡献和反馈,这一平台有望在功能和性能上得到持续的优化和增强。
recommend-type

跨平台内容提取无忧:coze工作流应对社交媒体挑战

# 1. 社交媒体内容提取的必要性与挑战 ## 1.1 内容提取的必要性 在数字时代,社交媒体内容是信息与数据流动的主干道,公司、个人和研究者都在寻求从这些信息中提取价值。无论是为了市场趋势分析、品牌监控、消费者洞察还是公共舆论监测,有效的社交媒体内容提取变得至关重要。这不仅有助于企业做出数据驱动的决策,也能使个人更好地理
recommend-type

vrrp主设备发送的免费arp

<think>好的,我现在需要回答用户关于VRRP主设备发送免费ARP的目的和工作原理的问题。首先,我得回忆一下VRRP的基本概念和工作机制。VRRP,虚拟路由冗余协议,主要用于在局域网中提供默认网关的冗余备份,确保网络的高可用性。主设备(Master)负责转发流量,而备用设备(Backup)则在主设备故障时接管工作。 用户的问题集中在主设备发送免费ARP的目的和机制上。根据引用[2],免费ARP用于通知下游设备虚拟MAC地址的变更。当主设备被选举出来后,它需要让局域网内的其他设备知道虚拟IP对应的MAC地址已经指向自己,这样流量才会被正确路由到主设备。免费ARP的作用应该就是更新这些设备的
recommend-type

为Ghost博客平台打造的Meteor软件包使用指南

从给定文件信息中,我们可以提炼出以下IT知识点: ### 标题知识点:流星Ghost软件包 1. **流星Ghost软件包的用途**:流星Ghost软件包是专为Ghost博客平台设计的流星(Meteor)应用程序。流星是一个开源的全栈JavaScript平台,用于开发高性能和易于编写的Web应用程序。Ghost是一个开源博客平台,它提供了一个简单且专业的写作环境。 2. **软件包的作用**:流星Ghost软件包允许用户在流星平台上轻松集成Ghost博客。这样做的好处是可以利用流星的实时特性以及易于开发和部署的应用程序框架,同时还能享受到Ghost博客系统的便利和美观。 ### 描述知识点:流星Ghost软件包的使用方法 1. **软件包安装方式**:用户可以通过流星的命令行工具添加名为`mrt:ghost`的软件包。`mrt`是流星的一个命令行工具,用于添加、管理以及配置软件包。 2. **初始化Ghost服务器**:描述中提供了如何在服务器启动时运行Ghost的基本代码示例。这段代码使用了JavaScript的Promise异步操作,`ghost().then(function (ghostServer) {...})`这行代码表示当Ghost服务器初始化完成后,会在Promise的回调函数中提供一个Ghost服务器实例。 3. **配置Ghost博客**:在`then`方法中,首先会获取到Ghost服务器的配置对象`config`,用户可以在此处进行自定义设置,例如修改主题、配置等。 4. **启动Ghost服务器**:在配置完成之后,通过调用`ghostServer.start()`来启动Ghost服务,使其能够处理博客相关的请求。 5. **Web浏览器导航**:一旦流星服务器启动并运行,用户便可以通过Web浏览器访问Ghost博客平台。 ### 标签知识点:JavaScript 1. **JavaScript作为流星Ghost软件包的开发语言**:标签指出流星Ghost软件包是使用JavaScript语言开发的。JavaScript是一种在浏览器端广泛使用的脚本语言,它也是流星平台的基础编程语言。 2. **流星和Ghost共同使用的语言**:JavaScript同样也是Ghost博客平台的开发语言。这表明流星Ghost软件包可以无缝集成,因为底层技术栈相同。 ### 压缩包文件的文件名称列表知识点:meteor-ghost-master 1. **版本控制和软件包结构**:文件名称`meteor-ghost-master`暗示了该软件包可能托管在像GitHub这样的版本控制系统上。文件名中的`master`通常指的是主分支或主版本。 2. **软件包的目录结构**:通过文件名称可以推断出该软件包可能拥有一个标准的流星软件包结构,包含了初始化、配置、运行等必要的模块和文件。 3. **软件包的维护状态**:由于文件名没有包含特定的版本号,我们无法直接得知软件包的最新更新情况。通常,软件包维护者会将最新的版本代码放在`master`分支上。 ### 总结 流星Ghost软件包提供了一个有效的解决方案,使得流星平台的开发者能够在他们的应用中添加Ghost博客功能。软件包的使用简便,通过流星的命令行工具安装,并通过JavaScript代码配置和启动Ghost服务。通过流星Ghost软件包,开发者能够享受流星的实时特性以及Ghost博客系统的便利性。此外,软件包的命名和结构也暗示了其维护和版本控制的模式,有助于开发者更好地理解如何使用和维护这一软件包。
recommend-type

抖音标题生成自动化:用coze工作流释放创意

# 1. 抖音标题生成自动化的重要性 随着社交媒体平台的崛起,内容的吸引力很大程度上取决于标题的创意与精准性。抖音作为一个日活亿级的短视频平台,高质量的标题能够有效提高视频的点击率,增加内容的传播。但是,人工撰写标题不仅耗时耗力,而且很难做到快速响应热点,自动化标题生成工具应运而生。coze工作流,作为一种实现自动化生成抖音标题的工具,其重要性不言而喻。它能够利用大数据分析和机器学习技术,提高标题的吸引
recommend-type

spss消费结构因子分析

<think>好的,我现在需要帮助用户在SPSS中进行消费结构的因子分析。首先,我要回忆一下因子分析的基本步骤和SPSS的操作流程。用户可能对SPSS不太熟悉,所以步骤要详细,同时需要结合他们提供的引用内容,特别是引用[2]中的适用条件和检验方法。 首先,用户提到了消费结构的数据,这可能包括多个变量,如食品、住房、交通等支出。因子分析适用于这种情况,可以降维并找出潜在因子。根据引用[2],需要检查样本量是否足够,变量间是否有相关性,以及KMO和Bartlett检验的结果。 接下来,我需要按照步骤组织回答:数据准备、适用性检验、因子提取、因子旋转、命名解释、计算得分。每个步骤都要简明扼要,说
recommend-type

OpenMediaVault的Docker映像:快速部署与管理指南

根据提供的文件信息,我们将详细讨论与标题和描述中提及的Docker、OpenMediaVault以及如何部署OpenMediaVault的Docker镜像相关的一系列知识点。 首先,Docker是一个开源的应用容器引擎,允许开发者打包应用及其依赖包到一个可移植的容器中,然后发布到任何流行的Linux机器上,也可以实现虚拟化。容器是完全使用沙箱机制,相互之间不会有任何接口(类似 iPhone 的 app)。 OpenMediaVault是一个基于Debian的NAS(网络附加存储)解决方案。它专为家庭或小型办公室提供文件共享、网络附加存储以及打印服务。它提供了一个易用的Web界面,通过这个界面用户可以管理服务器配置、网络设置、用户权限、文件服务等。 在描述中提到了一些Docker命令行操作: 1. `git clone`:用于克隆仓库到本地,这里的仓库指的是“docker-images-openmediavault”。 2. `docker build -t omv`:这是一个构建Docker镜像的命令,其中`-t`参数用于标记镜像名称和标签,这里是标记为“omv”。 3. `docker run`:运行一个容器实例,`-t`参数用于分配一个伪终端,`-i`参数用于交互式操作,`-p 80:80`则是将容器的80端口映射到宿主机的80端口。 启动服务的部分涉及OpenMediaVault的配置和初始化: - ssh服务:用于远程登录到服务器的协议。 - php5-fpm:是PHP的一个FastCGI实现,用于加速PHP的运行。 - nginx:是一个高性能的HTTP和反向代理服务器,常用于优化静态内容的分发。 - openmediavault引擎:指的是OpenMediaVault的核心服务。 - rrdcached:用于收集和缓存性能数据,这些数据可以被rrdtool图形化工具读取。 - collectd:是一个守护进程,用于收集系统性能和提供各种存储方式和传输方式来存储所收集的数据。 为了访问服务,需要在浏览器中输入"http://IP_OF_DOCKER",其中`IP_OF_DOCKER`指的是运行Docker容器的主机IP地址。 描述中还提到了一个步骤:“在System-> Network-> Interfaces中添加带有dhcp的eth0”,这指的是需要在OpenMediaVault的Web管理界面中配置网络接口。`eth0`是网络接口的名称,通常代表第一个以太网接口。DHCP(动态主机配置协议)是一种自动为网络中的设备分配IP地址的协议,这样设备就可以连接网络并开始通信,无需手动配置IP地址。 【压缩包文件的文件名称列表】中的“docker-images-openmediavault-master”暗示了这是一个包含Docker镜像文件的代码仓库。通常,“master”分支是代码的主分支,包含了代码库中最新且通常是最稳定的版本。用户可以通过克隆该仓库到本地来获取所有相关的Dockerfile、配置脚本及依赖文件,以便能够自行构建和运行OpenMediaVault的Docker镜像。 综上所述,这些知识点涵盖了从基本的Docker概念、Docker命令行操作、OpenMediaVault服务启动和管理,到具体的网络配置及Docker仓库操作,都是进行Docker化OpenMediaVault部署的关键步骤。
recommend-type

小红书文案提取一步到位:coze工作流操作全攻略

# 1. coze工作流概述 工作流系统是企业信息化和数字化转型的核心组件之一,它通过自动化流程管理提升效率,确保业务流程的顺畅执行。coze工作流作为当前市场上较为先进的工作流解决方案,它不仅仅是一套软件工具,更是一个集成化的平台,旨在通过流程自动化和智能化提升企业运营效率。 coze工作流的引入不仅有助于标准化和优化企业的业务流程,还可以通过可配置的流程设计,满足不同部门的特定需求。在组织的业务流程中