Python机器学习从入门到精通学习路线图
一、基础准备阶段
1. Python编程基础
- 重要性:机器学习的基础工具
- 关键内容:
- 基本语法和数据结构
- 函数和面向对象编程
- 文件操作和异常处理
- 常用库:NumPy, Pandas, Matplotlib
# Example: a quick data-analysis pass with Pandas
import pandas as pd
import numpy as np

# Build the DataFrame from per-column lists
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    Salary=[50000, 60000, 70000],
)
df = pd.DataFrame(data)

# Show the first rows, then the summary statistics
print(df.head())
print(df.describe())

# Label each row: strictly older than 30 is 'Senior', everyone else 'Junior'
is_senior = df['Age'] > 30
df['Seniority'] = np.where(is_senior, 'Senior', 'Junior')
print(df)
2. 数学基础
- 线性代数:矩阵运算、特征值分解
- 概率统计:概率分布、假设检验
- 微积分:梯度、导数、极值
二、机器学习基础
1. 机器学习概述
- 监督学习 vs 无监督学习
- 机器学习流程:数据准备→模型训练→评估→部署
2. Scikit-learn入门
- 介绍:Python最常用的机器学习库
- 关键功能:
- 数据预处理
- 模型训练和评估
- 模型持久化
# Example: fitting a linear regression with Scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Synthetic data: y = 2 + 3x plus standard-normal noise, 100 samples
X = np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 2 + 3 * X + noise

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the model on the training portion
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out portion, then report error and learned parameters
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("系数:", model.coef_)
print("截距:", model.intercept_)
三、监督学习
1. 线性模型
- 线性回归
- 逻辑回归
- 正则化方法(Lasso, Ridge)
# Logistic regression example
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load the iris data set
iris = load_iris()
X, y = iris.data, iris.target

# Keep only classes 0 and 1 so the task becomes binary classification
binary_mask = y != 2
X = X[binary_mask]
y = y[binary_mask]

# Hold out 30% of the rows for evaluation. Scoring on the training rows
# themselves would overstate how well the model generalises.
# stratify keeps the class balance equal in both parts; random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

# Train the model on the training part only
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out part
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
2. 决策树与集成方法
- 决策树
- 随机森林
- Gradient Boosting (XGBoost, LightGBM)
# Random forest example
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a synthetic classification problem (fixed seed for the data)
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)

# Split into 70% training and 30% test rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Fit a forest of 100 trees, each limited to depth 2
clf = RandomForestClassifier(n_estimators=100, max_depth=2)
clf.fit(X_train, y_train)

# Compare accuracy on the training rows against the held-out rows
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("训练集准确率:", train_accuracy)
print("测试集准确率:", test_accuracy)
3. 支持向量机
- 线性SVM
- 核方法
# SVM example
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

# Generate a small two-feature toy problem with one cluster per class
X, y = make_classification(
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=1,
)

# Standardise the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train a support vector classifier with an RBF kernel
model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(X, y)

# Visualise the decision boundary (requires the third-party mlxtend package)
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions

plot_decision_regions(X, y, clf=model, legend=2)
plt.show()
四、无监督学习
1. 聚类算法
- K-Means
- 层次聚类
- DBSCAN
# K-Means example
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Generate 300 points around 4 blob centres (fixed seed for the data)
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Fit the model and label every point in one step.
# n_init is set explicitly because its default differs between scikit-learn
# releases; random_state makes the centroid initialisation, and therefore the
# resulting labels, reproducible between runs.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X)

# Plot the points coloured by cluster, with the centres overlaid in red
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.5)
plt.show()
2. 降维技术
- PCA
- t-SNE
- LDA(线性判别分析;注意:它需要类别标签,属于有监督的降维方法,此处仅作对比列出)
# PCA example
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Load the iris data set
iris = load_iris()
X = iris.data
y = iris.target

# Project the features onto the first two principal components
pca = PCA(n_components=2)
X_r = pca.fit_transform(X)

# Scatter-plot the projection, one colour per class
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
    # The loop body must be indented; select the rows belonging to class i
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, label=target_name)
plt.legend()
plt.title('PCA of IRIS dataset')
plt.show()
五、模型评估与优化
1. 评估指标
- 分类:准确率、精确率、召回率、F1、ROC-AUC
- 回归:MSE、MAE、R²
- 聚类:轮廓系数
2. 交叉验证与超参数调优
- K折交叉验证
- 网格搜索与随机搜索
- 贝叶斯优化
# Grid search example
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load the iris data set
iris = load_iris()
X = iris.data
y = iris.target

# Candidate values for each hyper-parameter; every combination is tried
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Base estimator whose hyper-parameters are being tuned
rf = RandomForestClassifier()

# Exhaustive search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X, y)

# Report the best parameter combination and its cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("最佳参数:", best_params)
print("最佳分数:", best_score)
六、深度学习入门
1. 神经网络基础
- 感知机
- 激活函数
- 反向传播
2. TensorFlow/Keras入门
- 全连接网络
- CNN
- RNN
# Keras neural-network example
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a synthetic binary classification problem with 20 features
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Hold out 20% of the rows for the final test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the model. The input shape is declared with an explicit Input layer;
# passing input_shape= to the first Dense layer is deprecated in Keras 3 and
# triggers a warning there.
model = Sequential([
    Input(shape=(20,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train, using 20% of the training rows as a validation set
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate on the held-out test rows
loss, accuracy = model.evaluate(X_test, y_test)
print(f"测试集准确率: {accuracy:.4f}")
七、进阶主题
1. 特征工程
- 特征选择
- 特征提取
- 特征构造
2. 自然语言处理
- 文本预处理
- Word2Vec
- Transformer
# Text classification example
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_20newsgroups

# Load the train and test subsets for two newsgroup categories
categories = ['alt.atheism', 'soc.religion.christian']
train, test = (
    fetch_20newsgroups(subset=subset_name, categories=categories)
    for subset_name in ('train', 'test')
)

# Chain TF-IDF vectorisation and a multinomial naive Bayes classifier
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)

# Train on the raw training documents
model.fit(train.data, train.target)

# Accuracy = fraction of test documents whose predicted label is correct
predicted = model.predict(test.data)
accuracy = (predicted == test.target).mean()
print("准确率:", accuracy)
3. 计算机视觉
- OpenCV基础
- 图像分类
- 目标检测
八、项目实战
1. 端到端项目流程
- 问题定义
- 数据收集与探索
- 特征工程
- 模型选择与训练
- 模型评估与优化
- 部署
2. 项目示例:回归预测实战(示例代码使用 Delaney 分子溶解度数据集,同样的流程也适用于房价预测)
# End-to-end regression example (the surrounding section calls it "house
# price prediction").
# NOTE(review): judging by the file name and the 'logS' target column, the CSV
# loaded below is the Delaney solubility data set, not housing data - confirm
# which data set is intended.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data straight from GitHub. The original URL had been wrapped in a
# web-proxy prefix ("..."); use the raw URL instead.
data = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
X = data.drop(['logS'], axis=1)
y = data['logS']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: numeric columns pass through unchanged and text columns are
# one-hot encoded; categories unseen during fit are ignored at predict time
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Chain the preprocessing and the random forest regressor into one estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate with mean absolute error on the held-out rows
y_pred = pipeline.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
希望这份学习路线图能帮助你系统性地掌握Python机器学习!记住,实践是最好的学习方式,多动手实现项目才能真正掌握这些技术。