Advanced Machine Learning - Day 3: Ensemble Learning

"""
作业一:基础应用 - 鸢尾花分类
‌任务目标‌:
使用随机森林对鸢尾花数据集进行分类,并分析特征重要性
‌数据集‌:
sklearn.datasets.load_iris()
‌要求步骤‌:
1.加载鸢尾花数据集并划分训练集/测试集(70%/30%)
2.创建随机森林分类器(设置n_estimators=100, max_depth=3)
3.训练模型并在测试集上评估准确率
4.输出分类报告和混淆矩阵
5.可视化特征重要性
6.(选做)尝试调整n_estimators和max_depth观察准确率变化
"""

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Matplotlib font settings (SimHei; originally needed for Chinese plot labels)
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# 1. Load the iris dataset and split into training/test sets (70%/30%)
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# 2. Create the random forest classifier (n_estimators=100, max_depth=3)
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

# 3. Train the model and evaluate accuracy on the test set
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\n模型准确率: {accuracy:.4f}")

# 4. Print the classification report and confusion matrix
print("\nClassification report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

print("\nConfusion matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualize the confusion matrix
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')

# 5. Visualize feature importances
feature_importance = rf.feature_importances_
feature_names = iris.feature_names

plt.subplot(1, 2, 2)
indices = np.argsort(feature_importance)[::-1]
plt.bar(range(len(feature_importance)), feature_importance[indices])
plt.xticks(range(len(feature_importance)), [feature_names[i] for i in indices], rotation=45)
plt.title('Feature Importances')
plt.xlabel('Feature')
plt.ylabel('Importance')

plt.tight_layout()
plt.show()
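
# Impurity-based importances (feature_importances_) can be biased toward features
# with many potential split points; permutation importance on the test set is a
# common cross-check. A minimal sketch (an addition beyond the assignment steps):
from sklearn.inspection import permutation_importance

perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)
for i in np.argsort(perm.importances_mean)[::-1]:
    print(f"{feature_names[i]}: {perm.importances_mean[i]:.4f} (+/- {perm.importances_std[i]:.4f})")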

# 6. (Optional) Vary n_estimators and max_depth and observe how accuracy changes
print("\nOptional part - hyperparameter tuning:")
# Vary n_estimators
n_estimators_range = [10, 50, 100, 150, 200]
accuracies_n_est = []

for n_est in n_estimators_range:
    rf_temp = RandomForestClassifier(n_estimators=n_est, max_depth=3, random_state=42)
    rf_temp.fit(X_train, y_train)
    y_pred_temp = rf_temp.predict(X_test)
    acc = accuracy_score(y_test, y_pred_temp)
    accuracies_n_est.append(acc)
    print(f"n_estimators={n_est}, 准确率={acc:.4f}")

# Vary max_depth
max_depth_range = [1, 2, 3, 4, 5, 6, None]
accuracies_max_depth = []

for max_d in max_depth_range:
    rf_temp = RandomForestClassifier(n_estimators=100, max_depth=max_d, random_state=42)
    rf_temp.fit(X_train, y_train)
    y_pred_temp = rf_temp.predict(X_test)
    acc = accuracy_score(y_test, y_pred_temp)
    accuracies_max_depth.append(acc)
    print(f"max_depth={max_d}, accuracy={acc:.4f}")

# Visualize the tuning results
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(n_estimators_range, accuracies_n_est, marker='o')
plt.title('Effect of n_estimators on accuracy')
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
x_labels = [str(d) if d is not None else 'None' for d in max_depth_range]
plt.plot(range(len(max_depth_range)), accuracies_max_depth, marker='o')
plt.xticks(range(len(max_depth_range)), x_labels)
plt.title('Effect of max_depth on accuracy')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')

plt.tight_layout()
plt.show()
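
# A more systematic alternative to the manual loops above is a grid search over
# both parameters at once. A minimal sketch (the grid values are illustrative,
# not prescribed by the assignment):
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [50, 100, 200], "max_depth": [2, 3, 5, None]}
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)
print(f"Best cross-validated accuracy: {grid.best_score_:.4f}")
print(f"Test accuracy with best parameters: {grid.score(X_test, y_test):.4f}")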

"""
作业二:信用卡欺诈检测
‌任务目标‌:
使用随机森林处理类别不平衡的信用卡欺诈检测问题
‌数据集‌:
Kaggle信用卡欺诈数据集https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
‌要求步骤‌:
1.加载信用卡交易数据(注意数据高度不平衡)
2.标准化Amount特征,Time特征可删除
3.使用分层抽样划分训练集/测试集
4.创建随机森林分类器(class_weight='balanced')
5.评估模型(使用精确率、召回率、F1、AUC-ROC)
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score, f1_score

# Step 1: Load the credit card transaction data
# Note: download the dataset first and place creditcard.csv in the current directory
df = pd.read_csv("creditcard.csv")

# Check the class imbalance
print("Dataset shape:", df.shape)
print("Class distribution:")
print(df['Class'].value_counts())
print("Fraction of fraudulent transactions: {:.4f}%".format(df['Class'].sum() / len(df) * 100))

# Step 2: Standardize the Amount feature and drop the Time feature
# Drop the Time column
df = df.drop(['Time'], axis=1)

# Standardize the Amount column
# (Note: fitting the scaler on the full dataset, as the assignment's step order
# implies, leaks a little test-set information into preprocessing; strictly, fit
# the scaler on the training split only and apply transform to the test split.)
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))

# Separate features and label
X = df.drop('Class', axis=1)
y = df['Class']

# Step 3: Split into training/test sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print("训练集大小:", X_train.shape)
print("测试集大小:", X_test.shape)
print("训练集欺诈交易数:", sum(y_train))
print("测试集欺诈交易数:", sum(y_test))

# Step 4: Create the random forest classifier
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # reweight classes inversely to their frequency to handle the imbalance
    random_state=42
)

# Train the model
rf_classifier.fit(X_train, y_train)

# Step 5: Evaluate the model
# Predict labels and class-1 probabilities
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

# Compute the evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_proba)

print("\n模型评估结果:")
print("精确率(Precision): {:.4f}".format(precision))
print("召回率(Recall): {:.4f}".format(recall))
print("F1分数: {:.4f}".format(f1))
print("AUC-ROC: {:.4f}".format(auc_roc))

# Detailed classification report
print("\nDetailed classification report:")
print(classification_report(y_test, y_pred))
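
# For heavily imbalanced data, the precision-recall curve is often more
# informative than ROC. A minimal sketch using the predicted probabilities
# above (an addition beyond the assignment steps):
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

prec_curve, rec_curve, _ = precision_recall_curve(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)
plt.figure(figsize=(6, 5))
plt.plot(rec_curve, prec_curve)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision-Recall curve (AP = {ap:.4f})")
plt.show()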
