```python
'''Training script. '''

import os
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, lr_scheduler
from torchsummary import summary
from torchvision import transforms
import torch.distributed as dist
import torch.multiprocessing as mp

from models.resnet50 import ResNet50
from runtime_args import args
from load_dataset import LoadDataset
from plot import plot_loss_acc
from helpers import calculate_accuracy

device = torch.device("cuda:0" if torch.cuda.is_available() and args.device == 'gpu' else 'cpu')

if not os.path.exists(args.graphs_folder):
    os.mkdir(args.graphs_folder)

model_save_folder = 'resnet_cbam/' if args.use_cbam else 'resnet/'
if not os.path.exists(model_save_folder):
    os.mkdir(model_save_folder)


def train(gpu, args):
    '''Init models and dataloaders and train/validate model. '''

    rank = args.rank * args.gpus + gpu
    world_size = args.gpus * args.nodes
    dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)

    model = ResNet50(image_depth=args.img_depth, num_classes=args.num_classes, use_cbam=args.use_cbam)
    torch.cuda.set_device(gpu)
    model.cuda(gpu)

    optimizer = Adam(model.parameters(), lr=args.learning_rate)
    lr_decay = lr_scheduler.ExponentialLR(optimizer, gamma=args.decay_rate)
    criterion = torch.nn.CrossEntropyLoss().cuda(gpu)

    summary(model, (3, 224, 224))

    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

    train_dataset = LoadDataset(dataset_folder_path=args.data_folder, image_size=args.img_size,
                                image_depth=args.img_depth, train=True, transform=transforms.ToTensor())
    test_dataset = LoadDataset(dataset_folder_path=args.data_folder, image_size=args.img_size,
                               image_depth=args.img_depth, train=False, transform=transforms.ToTensor())

    train_sampler = torch.utils.data.distributed.DistributedSample
```
### Reviewing and optimizing a PyTorch distributed training script
For a PyTorch distributed training script that uses ResNet50, CUDA, and the Adam optimizer, efficient operation hinges on configuring the data-parallel machinery correctly. DistributedDataParallel (DDP) is the mechanism PyTorch provides for this kind of data-parallel training [^2].
#### Initializing the process group
Make sure every process taking part in training initializes the process group correctly and uses a suitable backend (NCCL for GPU training). This step is critical: a faulty initialization leads to communication failures or performance bottlenecks.
```python
import torch.distributed as dist

# One process per GPU: this process's global rank, computed as in the script above.
rank = args.rank * args.gpus + gpu
world_size = args.gpus * args.nodes
dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
```
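The script in the question launches `train(gpu, args)` once per GPU via `torch.multiprocessing`; with the `env://` init method the rendezvous address also has to be provided. A minimal launch sketch under those assumptions (the address and port values are placeholders):

```python
import os
import torch.multiprocessing as mp

if __name__ == '__main__':
    # env:// rendezvous reads these variables; the values here are placeholders.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Start one process per GPU on this node; each process receives its local
    # GPU index as the first argument of train(gpu, args).
    mp.spawn(train, nprocs=args.gpus, args=(args,))
```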
#### Wrapping the model
When building the model, wrap the base model object with `torch.nn.parallel.DistributedDataParallel`. Each GPU then holds its own replica of the model, while gradient synchronization keeps parameter updates consistent across processes.
```python
import torch
import torchvision

torch.cuda.set_device(gpu)                       # bind this process to its own GPU
model = torchvision.models.resnet50().cuda(gpu)  # or the ResNet50(..., use_cbam=...) from the script
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
```
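Since ResNet50 contains BatchNorm layers, an optional extra step is to convert them to `SyncBatchNorm` before the DDP wrap so that normalization statistics are synchronized across processes; a minimal sketch:

```python
# Optional: share BatchNorm statistics across all DDP processes.
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
```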
#### Configuring the data loaders
To keep different processes from reading the same slice of data and doing redundant work, use `DistributedSampler` to partition the dataset so that each process handles a disjoint subset.
```python
# The sampler both partitions and shuffles the data, so the DataLoader
# itself should not also be asked to shuffle.
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=train_sampler,
    num_workers=workers,
    pin_memory=True
)
```
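Wired into the script from the question (whose dataset objects are `train_dataset` and `test_dataset`), this might look roughly as follows; `args.batch_size` and `args.num_workers` are assumed argument names from `runtime_args` and may differ in the real script:

```python
# Sketch: partition the custom training set across processes; the test set is
# loaded without a sampler here (the evaluation strategy is up to the script).
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = DataLoader(train_dataset,
                          batch_size=args.batch_size,     # assumed argument name
                          shuffle=False,                  # the sampler already shuffles
                          num_workers=args.num_workers,   # assumed argument name
                          pin_memory=True,
                          sampler=train_sampler)
test_loader = DataLoader(test_dataset,
                         batch_size=args.batch_size,
                         shuffle=False,
                         num_workers=args.num_workers,
                         pin_memory=True)
```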
#### Adjusting the training loop
After defining the loss function, use Adam as the optimizer. If gradient accumulation is added, remember that it changes the effective batch size; a somewhat smaller learning rate can then improve convergence without sacrificing final accuracy. Also call `train_sampler.set_epoch(epoch)` at the start of each epoch so that the shuffling differs between epochs, as shown below.
```python
criterion = torch.nn.CrossEntropyLoss().cuda(gpu)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(start_epoch, epochs):
    train_sampler.set_epoch(epoch)   # give each epoch a different shuffle across processes
    model.train()
    for i, (images, targets) in enumerate(data_loader):
        images = images.cuda(gpu, non_blocking=True)
        targets = targets.cuda(gpu, non_blocking=True)

        output = model(images)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()              # DDP averages gradients across processes here
        optimizer.step()
```
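The original script also creates an `ExponentialLR` scheduler (`lr_decay`); if it is kept, one common placement is a single step at the end of each epoch, roughly like this:

```python
lr_decay = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.decay_rate)

for epoch in range(start_epoch, epochs):
    train_sampler.set_epoch(epoch)
    model.train()
    # ... forward / backward / optimizer.step() as above ...
    lr_decay.step()   # decay the learning rate once per epoch
```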
The snippets above sketch a basic framework for efficient cooperation across multiple GPUs and nodes. Depending on the workload there is still room for further tuning, for example finer-grained task scheduling or mixed-precision training to speed up each iteration; a sketch of the latter follows.
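As a sketch of the mixed-precision idea (using `torch.cuda.amp`, available since PyTorch 1.6; `data_loader`, `criterion`, and `optimizer` are the objects from the snippets above):

```python
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

for images, targets in data_loader:
    images = images.cuda(gpu, non_blocking=True)
    targets = targets.cuda(gpu, non_blocking=True)

    optimizer.zero_grad()
    with autocast():                      # run the forward pass in mixed precision
        output = model(images)
        loss = criterion(output, targets)

    scaler.scale(loss).backward()         # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)
    scaler.update()
```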