Swin Transformer实现catvsdog猫狗分类99.5%准确度

最新推荐文章于 2025-08-24 21:33:16 发布

原创最新推荐文章于 2025-08-24 21:33:16 发布 · 515 阅读

9 ·

CC 4.0 BY-SA版权

文章标签：

#transformer #深度学习 #人工智能

一开始使用普通 CNN，最终准确率只有 0.8 左右；后来改用 Swin-T（swin_t）微调，准确率达到约 99.5%。

完整代码实现：

先引入相关的包

import os,shutil
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from torchvision import models
from torch.optim.lr_scheduler import MultiStepLR
from glob import glob
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.applications import VGG16
import torch
from torch import nn
import tensorflow as tf
print(tf.__version__)

划分数据集：

数据集下载地址：Dogs vs. Cats | Kaggle

# Build a small subset of the Kaggle "Dogs vs. Cats" training set under
# ./small_dataset. Per class the subset holds 1000 training, 500 validation
# and 500 test images (4000 images in total), selected by file index from the
# original ./train folder (files named cat.<i>.jpg / dog.<i>.jpg).
base_dir = './small_dataset'
# (split name, first index, one-past-last index) of the source images.
split_ranges = (
    ('train', 0, 1000),
    ('validation', 1000, 1500),
    ('test', 1500, 2000),
)
for split_name, start, stop in split_ranges:
    for animal in ('cat', 'dog'):
        target_dir = os.path.join(base_dir, split_name, animal + 's')
        # exist_ok=True (instead of a single "does base_dir exist" check) lets
        # an interrupted run be resumed rather than leaving a partial dataset.
        os.makedirs(target_dir, exist_ok=True)
        for i in range(start, stop):
            file_name = '{}.{}.jpg'.format(animal, i)
            # Only copy what is missing, so re-running the script is cheap and
            # does not need the source folder once the subset is complete.
            if not os.path.exists(os.path.join(target_dir, file_name)):
                shutil.copy(os.path.join('train', file_name), target_dir)
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

划分后的结构：

设置种子

# Seed every random number generator available to the script so that runs are
# repeatable: Python's `random`, NumPy, and PyTorch (CPU and, if present, GPU).
myseed = 6666
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)
    print("Using GPU")

进行Train

训练一轮就够了


# Cross-entropy on raw logits (the loss applies log-softmax itself).
criterion = nn.CrossEntropyLoss()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters.
num_classes = 2  # cat / dog
n_epochs = 1     # number of training epochs
patience = 5     # early-stopping patience, in epochs

# Swin-T backbone with its default pretrained weights; the classification
# head is replaced by a fresh linear layer with `num_classes` outputs.
model = models.swin_t(weights=models.Swin_T_Weights.DEFAULT)
model.head = nn.Linear(model.head.in_features, num_classes)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Halve the learning rate at 50% and 80% of the schedule.
# A milestone of 0 is applied as soon as the scheduler is constructed, so for
# short runs (n_epochs = 1 makes both int() products 0) training would start
# at lr * gamma**2 instead of the configured rate. Clamp milestones to >= 1
# and drop duplicates so that each decay happens once, after a full epoch.
lr_milestones = sorted({max(1, int(n_epochs * fraction)) for fraction in (0.5, 0.8)})
scheduler = MultiStepLR(optimizer, milestones=lr_milestones, gamma=0.5)

# 编译模型
# model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
# 数据缩放
# 替换ImageDataGenerator为PyTorch数据加载方式
from torchvision import transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

# ---- Data pipeline -------------------------------------------------------
# Per-channel mean / std used for normalisation (these match the usual
# ImageNet statistics), the network input size, and the mini-batch size.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]
INPUT_SIZE = 224
BATCH_SIZE = 20

# Augmentations that operate on the PIL image, applied in this order.
pil_augmentations = [
    # Random crop covering 90-100% of the image, resized to INPUT_SIZE.
    transforms.RandomResizedCrop(INPUT_SIZE, scale=(0.9, 1)),
    transforms.RandomVerticalFlip(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomAffine(degrees=45, translate=(0.05, 0.05), shear=10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    # Convert to grayscale with probability 0.1.
    transforms.RandomGrayscale(p=0.1),
]
# Steps that need a tensor: conversion, random erasing, then normalisation.
tensor_steps = [
    transforms.ToTensor(),
    transforms.RandomErasing(scale=(0.02, 0.33)),
    transforms.Normalize(NORM_MEAN, NORM_STD),
]
train_transforms = transforms.Compose(pil_augmentations + tensor_steps)

# Validation / test images are only resized and normalised (no augmentation).
val_transforms = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Datasets: ImageFolder derives the label from the sub-folder name.
train_dataset = ImageFolder(train_dir, transform=train_transforms)
val_dataset = ImageFolder(validation_dir, transform=val_transforms)

# Loaders: shuffle the training data only.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Bookkeeping for checkpointing and early stopping (not hyper-parameters).
stale = 0         # consecutive epochs without a validation improvement
best_acc = 0      # best validation accuracy seen so far
_exp_name = 2025  # prefix of the checkpoint file name

for epoch in range(n_epochs):

    # ---------- Training ----------
    # Train mode enables dropout and other training-only behaviour.
    model.train()

    # Sample-weighted running totals, so a smaller final batch does not skew
    # the epoch averages.
    loss_total = 0.0
    correct_total = 0
    sample_total = 0

    for imgs, labels in tqdm(train_loader):
        # Move the batch to the same device as the model.
        imgs = imgs.to(device)
        labels = labels.to(device)

        # Forward pass; the criterion takes raw logits.
        logits = model(imgs)
        loss = criterion(logits, labels)

        # Clear stale gradients, back-propagate, clip the gradient norm for
        # stability, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()

        # Accumulate as plain Python numbers so no tensors are kept alive.
        batch_size = labels.size(0)
        loss_total += loss.item() * batch_size
        correct_total += (logits.argmax(dim=-1) == labels).sum().item()
        sample_total += batch_size

    train_loss = loss_total / sample_total
    train_acc = correct_total / sample_total

    # Print the information.
    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
    current_lr=scheduler.get_last_lr()[0]
    print(f"\nCurrent learning rate: {current_lr}")
    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # ---------- Validation ----------
    # Eval mode disables dropout and similar layers.
    model.eval()

    loss_total = 0.0
    correct_total = 0
    sample_total = 0

    # No gradients are needed for validation (forward pass and loss).
    with torch.no_grad():
        for imgs, labels in tqdm(val_loader):
            imgs = imgs.to(device)
            labels = labels.to(device)

            logits = model(imgs)
            loss = criterion(logits, labels)

            batch_size = labels.size(0)
            loss_total += loss.item() * batch_size
            correct_total += (logits.argmax(dim=-1) == labels).sum().item()
            sample_total += batch_size

    valid_loss = loss_total / sample_total
    valid_acc = correct_total / sample_total

    # Print the information.
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

    # Keep only the best checkpoint to limit disk usage.
    if valid_acc > best_acc:
        print(f"Best model found at epoch {epoch}, saving model")
        torch.save(model.state_dict(), f"{_exp_name}_best.ckpt")
        best_acc = valid_acc
        stale = 0
    else:
        stale += 1
        # Stop after exactly `patience` epochs without improvement, matching
        # the message below.
        if stale >= patience:
            print(f"No improvment {patience} consecutive epochs, early stopping")
            break

100%|██████████| 100/100 [14:02<00:00, 8.42s/it]

[ Train | 001/001 ] loss = 0.26344, acc = 0.90650 Current learning rate: 2.5e-05

100%|██████████| 50/50 [02:10<00:00, 2.61s/it]

[ Valid | 001/001 ] loss = 0.04304, acc = 0.99400 Best model found at epoch 0, saving model

进行图像增强后的img

画图

# Plot the training loss against the epoch number.
# `train_loss` is whatever the training section left behind: a single number
# (the last epoch's loss) in this script, but a sequence of per-epoch values
# is accepted as well. Deriving the x-axis from the data keeps both in step.
epoch_losses = np.atleast_1d(train_loss)
train_counter = [x + 1 for x in range(len(epoch_losses))]
fig = plt.figure()
# The marker keeps a single data point visible; a one-point line alone draws
# nothing.
plt.plot(train_counter, epoch_losses, color='blue', marker='o')
plt.legend(['Train Loss'], loc='upper right')
plt.yticks(np.arange(0, 2.1, 0.1))
plt.xlabel('epoch')
plt.ylabel('cross-entropy loss')
plt.show()

测试模型

# Test split, preprocessed exactly like the validation split (no augmentation).
test_dataset = ImageFolder(test_dir, transform=val_transforms)
test_loader = DataLoader(test_dataset, batch_size=20, shuffle=False, num_workers=4)

# Rebuild the architecture and restore the best checkpoint.
# weights=None skips the pretrained-weight download: every parameter is
# overwritten by the checkpoint anyway.
model = models.swin_t(weights=None)
model.head = nn.Linear(model.head.in_features, num_classes)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
# machine (and vice versa).
model.load_state_dict(torch.load('./2025_best.ckpt', map_location=device))
model.to(device)

# Average test loss of each call to test().
test_losses = []
# 测试模型
def test():
    """Evaluate the global ``model`` on ``test_loader`` and print a summary.

    Appends the average per-sample loss to the global ``test_losses`` list and
    prints that loss together with the number and percentage of correctly
    classified test images.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    num_samples = len(test_loader.dataset)
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            # The criterion returns the mean over the batch; weight it by the
            # batch size so that dividing by the dataset size below yields the
            # true per-sample average.
            total_loss += criterion(output, target).item() * target.size(0)
            correct += (output.argmax(dim=1) == target).sum().item()
    test_loss = total_loss / num_samples
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, num_samples,
        100. * correct / num_samples))


test()

Test set: Avg. loss: 0.0028, Accuracy: 985/1000 (99%)
END