# 疏锦行Python打卡 DAY 55 序列预测任务介绍-CSDN博客

# 准备工作

import numpy as np
import random
import os
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# Matplotlib defaults: use the SimHei font so the Chinese plot text can be
# displayed, and keep minus signs on the axes rendering correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Install a global "ignore" filter so warnings are not printed.
import warnings
warnings.filterwarnings("ignore")

# 设置随机种子确保结果可复现，全局随机函数
def set_seed(seed=42, deterministic=True):
    """Seed every random number generator used by this script.

    Args:
        seed: Seed value passed to Python's ``random`` module, NumPy and
            PyTorch (CPU and CUDA). Defaults to 42.
        deterministic: When true, also switch cuDNN to deterministic mode and
            turn its benchmark mode off. Defaults to True.
    """
    # Publish the seed through the PYTHONHASHSEED environment variable.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy, PyTorch CPU, current CUDA device, all CUDA devices.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if not deterministic:
        return

    # cuDNN flags for repeatable results.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed all random number generators before any data is drawn.
set_seed(42)

# ===== Step 1: generate a synthetic time series =====
x = np.linspace(0, 100, 1000)  # 1000 evenly spaced points on [0, 100]
# Sine wave + linear trend + Gaussian noise (std 0.5); one noise value per x.
y = np.sin(x) + 0.1 * x + np.random.normal(0, 0.5, x.shape)

# Plot the raw series.
plt.figure(figsize=(12, 6))
plt.plot(y)
plt.title('合成时间序列数据（正弦波+趋势+噪声）')
plt.xlabel('时间步')
plt.ylabel('值')
plt.grid(True)
plt.show()

# ===== Step 2: preprocessing =====
# 1. Chronological split point: the first 80% of the series is training data.
#    Time-series data must be split in time order, never shuffled.
train_size = int(len(y) * 0.8)

# 2. Min-max scaling. The scaler is fitted on the training span only, so the
#    min/max of the held-out span never leak into preprocessing; the fitted
#    transform is then applied to the whole series. Values in the test span
#    may therefore fall outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(y[:train_size].reshape(-1, 1))  # scikit-learn expects 2-D input
scaled_y = scaler.transform(y.reshape(-1, 1)).flatten()  # back to 1-D

# 3. Split the scaled series at the same point.
train_data = scaled_y[:train_size]
test_data = scaled_y[train_size:]

# 3. 创建时序数据集函数
def create_sequences(data, seq_length):
    """Turn a time series into sliding-window inputs and next-step targets.

    Args:
        data: Time-ordered values (list or NumPy array), indexed along the
            first axis.
        seq_length: Number of consecutive steps in each input window; must
            not be negative.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays where ``X[i]`` equals
        ``data[i:i + seq_length]`` and ``y[i]`` equals
        ``data[i + seq_length]``. ``max(len(data) - seq_length, 0)`` pairs
        are produced, and ``X`` keeps its ``(n, seq_length)`` shape even
        when ``n`` is 0.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    values = np.asarray(data)
    n_pairs = max(len(values) - seq_length, 0)

    # Row i of the index matrix is [i, i + 1, ..., i + seq_length - 1], so a
    # single indexing operation gathers every window. With zero pairs the
    # matrix has shape (0, seq_length) and so does X.
    window_idx = np.arange(n_pairs)[:, np.newaxis] + np.arange(seq_length)
    X = values[window_idx]

    # Copy so the targets never alias the caller's array.
    y = values[seq_length:seq_length + n_pairs].copy()
    return X, y

# Window length: the previous 30 time steps are used to predict the next one.
seq_length = 30

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# # ===== 步骤1：生成合成时间序列  =====
# x = np.linspace(0, 100, 1000)
# y = np.sin(x) + 0.1 * x + np.random.normal(0, 0.5, 1000)

# # =============================================================
# # =================== 正确流程的代码实现 ======================
# # =============================================================

# ===== Step 2: split the raw data, then scale it the correct way =====

# 1. Sliding-window length and the split point (first 80% is training data).
seq_length = 30
train_size = int(0.8 * len(y))

# 2. Raw training span; it is used only to fit the scaler.
#    Note: the test span does not need its own slice yet.
train_data_raw = y[:train_size]

# 3. Scaling (the key step):
#    - fit on the training span only, so only its range is learned
#    - transform the whole series with that fitted scaler
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_data_raw[:, np.newaxis])
scaled_y = scaler.transform(y[:, np.newaxis]).flatten()

# ===== 步骤3：对完整的、缩放后的数据应用滑动窗口 =====

def create_sequences(data, seq_length):
    """Build sliding-window inputs and next-step targets for a sequence model.

    Args:
        data: Time-ordered values (list or NumPy array), indexed along the
            first axis.
        seq_length: Length of each input window; must not be negative.

    Returns:
        ``(X, y)`` as NumPy arrays: ``X[i]`` is ``data[i:i + seq_length]``
        and ``y[i]`` is ``data[i + seq_length]``. The number of pairs is
        ``max(len(data) - seq_length, 0)``; ``X`` is ``(n, seq_length)``
        shaped even when there are no pairs.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    series = np.asarray(data)
    n_windows = max(len(series) - seq_length, 0)

    # starts[:, None] + offsets is an (n_windows, seq_length) matrix whose
    # row i lists the positions of window i; indexing with it gathers all
    # windows at once and keeps the 2-D shape for n_windows == 0.
    starts = np.arange(n_windows)
    offsets = np.arange(seq_length)
    X = series[starts[:, np.newaxis] + offsets]

    # Targets are the values right after each window; copy to avoid
    # returning a view of the input array.
    y = series[seq_length:seq_length + n_windows].copy()
    return X, y

# Build (window, next-value) pairs over the entire scaled series.
all_X, all_y = create_sequences(scaled_y, seq_length)

# ===== Step 4: split the windowed samples (X and y) =====

# The label of the last training sample is the raw point at index
# train_size - 1, so that sample's window starts at
# (train_size - 1) - seq_length. Counting start indices from 0, this gives
# train_size - seq_length training samples in total.
split_idx = train_size - seq_length

X_train, X_test = all_X[:split_idx], all_X[split_idx:]
y_train, y_test = all_y[:split_idx], all_y[split_idx:]

# ===== Step 5: verify the result =====
for _label, _value in (
    ("原始数据总长度:", len(y)),
    ("训练数据原始长度:", train_size),
    ("测试数据原始长度:", len(y) - train_size),
):
    print(_label, _value)
print("-" * 30)
for _label, _value in (
    ("序列长度 (seq_length):", seq_length),
    ("滑动窗口后样本总数:", len(all_X)),
):
    print(_label, _value)
print("-" * 30)
for _label, _value in (
    ("训练集划分点 (split_idx):", split_idx),
    ("训练集特征(X_train)形状:", X_train.shape),  # expected (770, 30), i.e. (800 - 30, 30)
    ("训练集标签(y_train)形状:", y_train.shape),  # expected (770,)
    ("测试集特征(X_test)形状:", X_test.shape),    # expected (200, 30), i.e. (1000 - 30 - 770, 30)
    ("测试集标签(y_test)形状:", y_test.shape),    # expected (200,)
):
    print(_label, _value)
print("-" * 30)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# =============================================================
# ===== Step 1: data preparation (same procedure as before) =====
# =============================================================

# Synthetic series: sine wave + linear trend + Gaussian noise.
# NOTE(review): no re-seeding is visible before this draw, so the noise
# values presumably differ from the earlier series - confirm if identical
# data is required.
x = np.linspace(0, 100, num=1000)
y = np.sin(x) + 0.1 * x + np.random.normal(loc=0, scale=0.5, size=1000)

# Parameters: 80% chronological training split, 30-step input windows.
seq_length = 30
train_size = int(len(y) * 0.8)

# Leak-free scaling: fit on the training span only, transform everything.
train_data_raw = y[:train_size]
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data_raw[:, None])
scaled_y = scaler.transform(y[:, None]).flatten()

# 创建时序数据集函数
def create_sequences(data, seq_length):
    """Slice a series into overlapping windows and their next-step targets.

    Args:
        data: Time-ordered values (list or NumPy array), indexed along the
            first axis.
        seq_length: Number of steps per input window; must not be negative.

    Returns:
        ``(X, y)`` as NumPy arrays with ``X[i] == data[i:i + seq_length]``
        and ``y[i] == data[i + seq_length]``. ``max(len(data) - seq_length,
        0)`` pairs are returned, and ``X`` always has ``seq_length`` columns,
        including when no pair fits.

    Raises:
        ValueError: If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    arr = np.asarray(data)
    count = max(len(arr) - seq_length, 0)

    # Broadcasting a column of window starts against a row of in-window
    # offsets yields every window's positions; an empty column still gives a
    # (0, seq_length) result instead of a shapeless empty array.
    positions = np.arange(count).reshape(-1, 1) + np.arange(seq_length)
    X = arr[positions]

    # Copy the targets so they are independent of the input array.
    y = arr[seq_length:seq_length + count].copy()
    return X, y

# Slide the window over the full scaled series.
all_X, all_y = create_sequences(scaled_y, seq_length)

# The first train_size - seq_length windows have their target inside the
# training span; the remaining windows form the test set.
split_idx = train_size - seq_length
X_train_np, X_test_np = all_X[:split_idx], all_X[split_idx:]
y_train_np, y_test_np = all_y[:split_idx], all_y[split_idx:]

# =========================================================================
# ===== Step 2: prepare the data for a classical ML model (key difference) =====
# =========================================================================

# 1. Shape of X
# scikit-learn estimators take 2-D input: [n_samples, n_features].
# An RNN would take 3-D input: [n_samples, time_steps, n_features].
# Each sample's seq_length time steps are therefore flattened into
# seq_length plain features.
# Input shape (770, 30, 1) or (770, 30) -> target shape (770, 30).

# Sample counts of the two splits.
n_samples_train, n_samples_test = len(X_train_np), len(X_test_np)

# Collapse everything after the sample axis into one feature axis.
X_train_rf = np.reshape(X_train_np, (n_samples_train, -1))
X_test_rf = np.reshape(X_test_np, (n_samples_test, -1))

# y_train_np and y_test_np come out of create_sequences as 1-D arrays and are
# used as they are.

for _label, _shape in (
    ("为随机森林准备的 X_train 形状:", X_train_rf.shape),  # expected (770, 30)
    ("为随机森林准备的 y_train 形状:", y_train_np.shape),  # expected (770,)
    ("为随机森林准备的 X_test 形状:", X_test_rf.shape),    # expected (200, 30)
):
    print(_label, _shape)

# Note: PyTorch's Tensor, TensorDataset and DataLoader are not needed here.

# 注意：我们不再需要 PyTorch 的 Tensor, TensorDataset 和 DataLoader

# =============================================================
# ===== Step 3: build, train and evaluate the random forest =====
# =============================================================

# 1. Random forest regressor.
#    n_estimators: number of trees in the forest
#    random_state: fixed so repeated runs give the same model
#    n_jobs=-1:    use every available CPU core
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# 2. Fit on the training windows.
print("\n开始训练随机森林模型...")
rf_model.fit(X_train_rf, y_train_np)
print("模型训练完成！")

# 3 + 4. Predict, then map the predictions back to the original scale.
#        inverse_transform expects 2-D input, hence the added column axis.
train_predict = scaler.inverse_transform(
    rf_model.predict(X_train_rf)[:, np.newaxis]
)
test_predict = scaler.inverse_transform(
    rf_model.predict(X_test_rf)[:, np.newaxis]
)

# The true labels go through the same inverse scaling.
y_train_orig = scaler.inverse_transform(y_train_np[:, np.newaxis])
y_test_orig = scaler.inverse_transform(y_test_np[:, np.newaxis])

# 5. Root mean squared error (RMSE) on the original scale.
train_mse = mean_squared_error(y_train_orig, train_predict)
test_mse = mean_squared_error(y_test_orig, test_predict)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
print(f"\n训练集 RMSE: {train_rmse:.4f}")
print(f"测试集 RMSE: {test_rmse:.4f}")


# =============================================================
# ===== Step 4: visualise the results =====
# =============================================================

plt.figure(figsize=(15, 7))
plt.plot(y, label='原始数据', color='gray', alpha=0.5)

# Index of the first test prediction: training predictions start at
# seq_length (the first window has no earlier target) and run back to back.
_train_end = seq_length + len(train_predict)

# Training predictions on a NaN canvas so only their span is drawn.
train_predict_plot = np.full_like(y, np.nan)
train_predict_plot[seq_length:_train_end] = train_predict.ravel()
plt.plot(train_predict_plot, label='训练集预测值 (RF)', color='blue')

# Test predictions fill the remainder of the series.
test_predict_plot = np.full_like(y, np.nan)
test_predict_plot[_train_end:len(y)] = test_predict.ravel()
plt.plot(test_predict_plot, label='测试集预测值 (RF)', color='red')

plt.title('时间序列预测结果对比 (随机森林)')
plt.xlabel('时间步')
plt.ylabel('值')
plt.legend()
plt.grid(True)
plt.show()
# 打卡：@浙大疏锦行