- 🍨 This post is a learning-record entry for the 🔗 365-day deep learning training camp
- 🍖 Original author: K同学啊
Goal
Predict whether it will rain tomorrow (the RainTomorrow column of the weatherAUS dataset) with a feed-forward network in PyTorch.
Implementation
(1) Environment
Language: Python 3.10
IDE: PyCharm
Framework: PyTorch
(2) Implementation steps
1. Data preprocessing
#### Step 1: Load and inspect the data
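All the snippets below assume a single shared import block; the original post doesn't show one, so this is a reconstruction based on what the code uses:

import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split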
data = pd.read_csv("data/weatherAUS.csv")
df = data.copy()
print(data.head())
print(data.describe())
print(data.dtypes)
# Convert the Date column to datetime format
data['Date'] = pd.to_datetime(data['Date'])
data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.day
print(data.dtypes)
You can see the Date column's dtype has changed from object to datetime64.
"""
删除无关特征列并输出剩余特征列信息
1. 删除'Date'时间列:模型训练不需要时间维度
2. 打印剩余特征列:验证特征删除后的数据结构
"""
data.drop('Date', axis=1, inplace=True)
print(data.columns)
#### Step 2: Data analysis
# Heat map of the correlation matrix
plt.figure(figsize=(25, 23), dpi=120)  # 25x23-inch canvas at 120 dpi
ax = sns.heatmap(data.corr(numeric_only=True),  # numeric_only=True skips object columns such as Location
                 square=True,
                 annot=True,
                 fmt='.2f')
# Rotate the x-axis tick labels 90 degrees
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()
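As a numeric complement to the heat map, you can rank features by absolute correlation against one column; a small sketch (Rainfall is just an example target):

# Rank numeric features by absolute correlation with one column
corr = data.corr(numeric_only=True)
print(corr['Rainfall'].abs().sort_values(ascending=False).head(10))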
# Will it rain?
sns.set_theme(style="whitegrid", palette="Set2")  # set style and color palette

# Create a 1x2 subplot layout
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Shared title style
title_font = {'fontsize': 14, 'fontweight': 'bold', 'color': 'darkblue'}

# First plot: RainTomorrow
sns.countplot(x='RainTomorrow', data=data, ax=axes[0], edgecolor='black')  # black bar edges
axes[0].set_title('Rain Tomorrow', fontdict=title_font)
axes[0].set_xlabel('Will it Rain Tomorrow?', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].tick_params(axis='both', which='major', labelsize=11)  # tick label size

# Second plot: RainToday
sns.countplot(x='RainToday', data=data, ax=axes[1], edgecolor='black')  # black bar edges
axes[1].set_title('Rain Today', fontdict=title_font)
axes[1].set_xlabel('Will it Rain Today?', fontsize=12)
axes[1].set_ylabel('Count', fontsize=12)
axes[1].tick_params(axis='both', which='major', labelsize=11)  # tick label size

sns.despine()       # remove the top/right spines
plt.tight_layout()  # adjust spacing between subplots
plt.show()
plt.show()
x = pd.crosstab(data['RainTomorrow'], data['RainToday'])
print(x)
# Normalize each row (each RainTomorrow value) so it sums to 100%
y = x / x.transpose().sum().values.reshape(2, 1) * 100
print(y)
Reading the row-normalized table: among days when it rains tomorrow, about 53% had no rain today and about 46% did. Note each row is conditioned on the RainTomorrow value, since each row is scaled to 100%.
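As a cross-check, pandas can produce this row-normalized table directly through crosstab's normalize argument:

# Same table, normalized within each RainTomorrow row
print(pd.crosstab(data['RainTomorrow'], data['RainToday'], normalize='index') * 100)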
#### Step 3: Handling missing values
# Handle missing values
print(data.isnull().sum() / data.shape[0] * 100)  # percentage of missing values per column

Columns such as Sunshine are missing a large share of their values. Dropping them would cost too much data, so we fill them by random sampling instead, which keeps each column's distribution roughly intact.
# Randomly fill the columns with a high share of missing values;
# dropping them outright would hurt data completeness
lst = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
for col in lst:
    # Non-null values of this column, used as the sampling pool
    fill_list = data[col].dropna()
    # Fill NaNs by sampling from observed values, preserving the column's distribution
    data[col] = data[col].fillna(pd.Series(np.random.choice(fill_list, size=len(data.index))))
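np.random.choice makes this fill non-deterministic between runs. If reproducibility matters, a seeded generator works; this variant (not in the original post) also passes an explicit index so alignment doesn't depend on the DataFrame having a default RangeIndex:

# Reproducible variant of the random fill
rng = np.random.default_rng(42)
for col in lst:
    fill_list = data[col].dropna()
    data[col] = data[col].fillna(pd.Series(rng.choice(fill_list.values, size=len(data)), index=data.index))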
# Identify columns whose dtype is 'object' (string/categorical data)
s = (data.dtypes == 'object')
# Collect the names of all object-dtype columns into a list
object_cols = list(s[s].index)
print(object_cols)
# Output: ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
# Fill missing values in the categorical columns
for i in object_cols:
    # Use each column's mode (most frequent value); mode()[0] takes the first one
    data[i] = data[i].fillna(data[i].mode()[0])
# Select the float64 columns as a boolean Series
t = (data.dtypes == 'float64')
# Collect the names of all float64 columns into a list
num_cols = list(t[t].index)
print(num_cols)

# Fill missing values in the numeric columns
for i in num_cols:
    # Use each column's mean, keeping the overall distribution roughly intact
    data[i] = data[i].fillna(data[i].mean())
# Confirm that every column's missing-value count is now zero
print(data.isnull().sum())
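If you'd rather have this check fail loudly than eyeball the printout:

# Assert that no missing values remain anywhere in the frame
assert data.isnull().sum().sum() == 0, 'missing values remain'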
2. Build the dataset
label_encoder = LabelEncoder()  # label-encode the categorical features
for i in object_cols:
    # Map each column's text labels to integers
    data[i] = label_encoder.fit_transform(data[i])
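Note that the single LabelEncoder is refit on every column, so after the loop only the last column's mapping survives. If you need to invert the encoding later, keep one encoder per column (a variant sketch):

# Variant: one fitted encoder per column, so mappings stay invertible
encoders = {col: LabelEncoder() for col in object_cols}
for col in object_cols:
    data[col] = encoders[col].fit_transform(data[col])
# e.g. encoders['RainTomorrow'].inverse_transform([0, 1]) recovers the text labels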
# Features X: drop 'RainTomorrow' (the prediction target) and 'day' (little predictive value)
X = data.drop(['RainTomorrow', 'day'], axis=1).values
# Target y: the 'RainTomorrow' column, whether it rains tomorrow
y = data['RainTomorrow'].values
# Split into training and test sets:
# 25% held out for testing; random_state=101 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

# Scale features to [0, 1] with MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)  # fit on the training data only: learn each feature's min and max

X_train = scaler.transform(X_train)  # scale the training set
X_test = scaler.transform(X_test)    # scale the test set with the same statistics
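A quick sanity check on the scaling: training features should now span exactly [0, 1], while test values can fall slightly outside it, since the scaler never saw them:

print(X_train.min(), X_train.max())  # expected: 0.0 1.0
print(X_test.min(), X_test.max())    # may slightly exceed [0, 1]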
3. Build the neural network
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features=23, out_features=24),  # 23 inputs: all features minus 'RainTomorrow' and 'day'
            nn.Tanh(),
            nn.Linear(in_features=24, out_features=18),
            nn.Tanh(),
            nn.Linear(in_features=18, out_features=23),
            nn.Tanh(),
            nn.Dropout(0.5),
            nn.Linear(23, 12),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(12, 1),
            nn.Sigmoid()  # output is the probability of rain tomorrow
        )

    def forward(self, x):
        return self.layers(x)
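Before training, a dummy forward pass is a cheap way to confirm the layer dimensions line up (23 input features in, one probability out):

# Sanity check: push a dummy batch through the untrained network
net = NeuralNetwork()
dummy = torch.randn(4, 23)   # batch of 4 samples, 23 features each
print(net(dummy).shape)      # expected: torch.Size([4, 1])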
class EarlyStopping:
    def __init__(self, patience=25, min_delta=0.001, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.counter = 0
        self.best_loss = float('inf')
        self.best_model_weights = None

    def __call__(self, current_loss, model):
        if current_loss < self.best_loss - self.min_delta:
            self.best_loss = current_loss
            self.counter = 0
            if self.restore_best_weights:
                # deepcopy is required here: state_dict().copy() is shallow, and the
                # tensors it references keep changing as training continues
                self.best_model_weights = copy.deepcopy(model.state_dict())
        else:
            self.counter += 1
            if self.counter >= self.patience:
                print(f"EarlyStopping: stopping training (best loss: {self.best_loss:.4f})")
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model_weights)
                return True  # stop training
        return False  # continue training
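The call protocol in isolation, with toy losses (patience=2 just so it fires quickly; a sketch, not part of the real run):

# Toy demonstration of the EarlyStopping call protocol
m = NeuralNetwork()
stopper = EarlyStopping(patience=2, min_delta=0.0)
for fake_loss in [1.0, 0.9, 0.95, 0.96, 0.97]:
    if stopper(fake_loss, m):
        break  # fires after 2 consecutive non-improving losses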
4. Training
model = NeuralNetwork()
optimizer = Adam(model.parameters(), lr=1e-4)
criterion = nn.BCELoss()
early_stop = EarlyStopping(patience=25, min_delta=0.001, restore_best_weights=True)
epochs = 10  # note: patience (25) exceeds epochs (10), so early stopping cannot trigger in this run
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
train_loss_history = []
train_acc_history = []
val_loss_history = []
val_acc_history = []
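A common variant worth knowing (not what this post does): remove the final nn.Sigmoid() from the model and train on raw logits with nn.BCEWithLogitsLoss, which fuses the sigmoid into the loss for better numerical stability; predictions then threshold the logit at 0 rather than the probability at 0.5:

# Variant: train on raw logits (requires dropping nn.Sigmoid() from the model)
criterion_logits = nn.BCEWithLogitsLoss()
# preds = (outputs > 0.0).float()  # threshold logits at 0 instead of probabilities at 0.5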
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    # Training phase
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_preds = (outputs > 0.5).float()
        train_correct += (train_preds == y_batch).sum().item()
        train_total += y_batch.size(0)

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        # Distinct loop names matter: the original `for X_test, y_test in val_loader`
        # silently overwrote the X_test/y_test arrays defined earlier
        for X_val, y_val in val_loader:
            val_outputs = model(X_val)
            val_loss += criterion(val_outputs, y_val).item()
            val_preds = (val_outputs > 0.5).float()
            val_correct += (val_preds == y_val).sum().item()
            val_total += y_val.size(0)

    # Compute epoch metrics and record history
    train_loss /= len(train_loader)
    train_acc = train_correct / train_total
    val_loss /= len(val_loader)
    val_acc = val_correct / val_total
    train_loss_history.append(train_loss)
    train_acc_history.append(train_acc)
    val_loss_history.append(val_loss)
    val_acc_history.append(val_acc)
    print(f"epoch: {epoch + 1}/{epochs}, "
          f"train_loss: {train_loss:.4f}, train_acc: {train_acc:.4f}, "
          f"val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}")

    if early_stop(val_loss, model):
        break
# Visualize the training results
plt.figure(figsize=(12, 5))
# Loss curves
plt.subplot(1, 2, 1)
plt.plot(train_loss_history, label='Train Loss')
plt.plot(val_loss_history, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
# Accuracy curves
plt.subplot(1, 2, 2)
plt.plot(train_acc_history, label='Train Accuracy')
plt.plot(val_acc_history, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.show()
epoch: 1/10, train_loss: 0.4653, train_acc: 0.8026, val_loss: 0.4008, val_acc: 0.8245
epoch: 2/10, train_loss: 0.3936, train_acc: 0.8334, val_loss: 0.3804, val_acc: 0.8365
epoch: 3/10, train_loss: 0.3845, train_acc: 0.8376, val_loss: 0.3748, val_acc: 0.8381
epoch: 4/10, train_loss: 0.3820, train_acc: 0.8380, val_loss: 0.3734, val_acc: 0.8389
epoch: 5/10, train_loss: 0.3804, train_acc: 0.8394, val_loss: 0.3723, val_acc: 0.8393
epoch: 6/10, train_loss: 0.3792, train_acc: 0.8392, val_loss: 0.3721, val_acc: 0.8400
epoch: 7/10, train_loss: 0.3779, train_acc: 0.8398, val_loss: 0.3708, val_acc: 0.8399
epoch: 8/10, train_loss: 0.3773, train_acc: 0.8400, val_loss: 0.3708, val_acc: 0.8399
epoch: 9/10, train_loss: 0.3778, train_acc: 0.8398, val_loss: 0.3702, val_acc: 0.8408
epoch: 10/10, train_loss: 0.3764, train_acc: 0.8399, val_loss: 0.3718, val_acc: 0.8379
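Accuracy alone is a coarse summary here because the classes are imbalanced (many more dry days than rainy ones). A per-class report on the held-out set gives a fuller picture; a sketch using sklearn.metrics:

# Per-class precision/recall on the held-out set
from sklearn.metrics import classification_report
model.eval()
with torch.no_grad():
    probs = model(X_test_tensor)
preds = (probs > 0.5).float().numpy().ravel()
print(classification_report(y_test_tensor.numpy().ravel(), preds, digits=4))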
(3) Summary

Missing-value handling:
- High-missing-rate columns (such as Evaporation and Sunshine) are filled by random sampling from their observed values, preserving each column's distribution
- Categorical columns are filled with the mode (fillna(data[i].mode()[0])); numeric columns are filled with the mean

Feature engineering:
- The Date field is decomposed into year/month/day and the original column is dropped, avoiding redundant information
- The irrelevant features Date and day are removed to reduce noise