Basic regression: linear, decision tree, SVM, KNN
Ensemble methods: random forest, AdaBoost, GradientBoosting, Bagging, ExtraTrees
What I learned: stratified sampling of data, and how to write the code for each kind of regression. Parameter tuning may still need attention; see the GridSearchCV sketch near the end.
Further reading: 使用sklearn做各种回归 (using sklearn for all kinds of regression)
Data preparation
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight') # set the matplotlib plotting style
import seaborn as sns
import pandas as pd
sns.set() # set seaborn's default plotting style
import warnings
warnings.filterwarnings('ignore')
import numpy as np
data = pd.read_csv("../论文/mianbanshu.csv")
data.head()
 | city | year | rkmd | rjys | ys | rq | gd | dl | ps | ws | jz | gymj | lh | ldl | shlj | whh | qntz | wstz | Unnamed: 18
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | AH | 2008 | 2043.0 | 174.61 | 95.11 | 87.60 | 7.50 | 14.15 | 1.75 | 78.86 | 58.64 | 9.29 | 35.97 | 32.03 | 94.75 | 53.95 | 2722949.0 | 25465 | NaN |
1 | AH | 2009 | 2114.0 | 160.96 | 95.25 | 88.62 | 9.66 | 14.91 | 8.23 | 83.24 | 62.40 | 10.23 | 37.16 | 33.63 | 95.51 | 60.91 | 3762160.0 | 27900 | NaN |
2 | AH | 2010 | 2469.0 | 160.83 | 96.06 | 90.52 | 9.88 | 16.01 | 8.81 | 88.46 | 71.58 | 10.95 | 37.50 | 33.67 | 95.59 | 64.56 | 4756917.0 | 30324 | NaN |
3 | AH | 2011 | 2265.0 | 168.99 | 96.55 | 93.35 | 10.66 | 18.00 | 10.25 | 91.09 | 79.04 | 11.88 | 39.47 | 34.55 | 97.06 | 86.99 | 6203403.0 | 32884 | NaN |
4 | AH | 2012 | 2401.0 | 165.45 | 98.02 | 94.61 | 11.13 | 18.47 | 11.72 | 94.53 | 86.39 | 11.92 | 38.80 | 34.72 | 95.28 | 91.14 | 6339795.0 | 39962 | NaN |
Data exploration
# factorplot is a figure-level function (renamed catplot in newer seaborn) and ignores ax;
# pointplot is its axes-level equivalent and draws into the subplot created here
f, ax = plt.subplots(figsize=(11, 7))
sns.pointplot(data=data, x='year', y="wstz",
              palette='plasma',
              hue='city', ax=ax)
f, ax = plt.subplots(figsize=(11, 7))
sns.pointplot(data=data, x='year', y="rkmd",
              palette='plasma',
              hue='city', ax=ax)
f, ax = plt.subplots(figsize=(11, 7))
sns.pointplot(data=data, x='year', y="rjys",
              palette='plasma',
              hue='city', ax=ax)
data = data.drop(["Unnamed: 18"], axis=1)  # drop the empty trailing column
# Correlation matrix (an alternative, annotated style)
corr = data.drop(["city"], axis=1).corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True, cmap='cubehelix', square=True)
plt.title('Correlation between different features')
corr
corr_all = data.drop(["city"],axis = 1).corr()
mask = np.zeros_like(corr_all, dtype=bool)  # np.bool was removed from NumPy; use the builtin bool
mask[np.triu_indices_from(mask)] = True     # mask the redundant upper triangle
f, ax = plt.subplots(figsize = (11, 9))
sns.heatmap(corr_all, mask = mask,
square = True, linewidths = .5, ax = ax, cmap = "BuPu")
#plt.savefig('heatmap.png')
a = set(data["city"])  # the 31 distinct city codes
b = [9] * 31           # draw 9 of the 10 yearly records per city
Stratified sampling: the sampled rows form the training set
# stratified-sampling dict: group name -> number of rows to draw
typicalNDict = dict(zip(a, b))

# draw n rows from each city group
def typicalsampling(group, typicalNDict):
    name = group.name
    n = typicalNDict[name]
    return group.sample(n=n)

# returns the sampled DataFrame
train = data.groupby('city').apply(typicalsampling, typicalNDict)
train.to_csv('../论文/TrainData.csv', index=False)  # write next to the source data so it can be re-read below
train.head()
city |  | city | year | rkmd | rjys | ys | rq | gd | dl | ps | ws | jz | gymj | lh | ldl | shlj | whh | qntz | wstz
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
AH | 4 | AH | 2012 | 2401.0 | 165.45 | 98.02 | 94.61 | 11.13 | 18.47 | 11.72 | 94.53 | 86.39 | 11.92 | 38.80 | 34.72 | 95.28 | 91.14 | 6339795.0 | 39962
 | 2 | AH | 2010 | 2469.0 | 160.83 | 96.06 | 90.52 | 9.88 | 16.01 | 8.81 | 88.46 | 71.58 | 10.95 | 37.50 | 33.67 | 95.59 | 64.56 | 4756917.0 | 30324
 | 8 | AH | 2016 | 2487.0 | 180.19 | 99.20 | 98.05 | 12.76 | 21.82 | 13.18 | 97.36 | 92.03 | 14.02 | 41.71 | 37.67 | 99.94 | 99.94 | 7417525.0 | 67256
 | 0 | AH | 2008 | 2043.0 | 174.61 | 95.11 | 87.60 | 7.50 | 14.15 | 1.75 | 78.86 | 58.64 | 9.29 | 35.97 | 32.03 | 94.75 | 53.95 | 2722949.0 | 25465
 | 3 | AH | 2011 | 2265.0 | 168.99 | 96.55 | 93.35 | 10.66 | 18.00 | 10.25 | 91.09 | 79.04 | 11.88 | 39.47 | 34.55 | 97.06 | 86.99 | 6203403.0 | 32884
train = pd.read_csv("../论文/TrainData.csv")
train.head()
 | city | year | rkmd | rjys | ys | rq | gd | dl | ps | ws | jz | gymj | lh | ldl | shlj | whh | qntz | wstz
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | AH | 2012 | 2401.0 | 165.45 | 98.02 | 94.61 | 11.13 | 18.47 | 11.72 | 94.53 | 86.39 | 11.92 | 38.80 | 34.72 | 95.28 | 91.14 | 6339795.0 | 39962 |
1 | AH | 2010 | 2469.0 | 160.83 | 96.06 | 90.52 | 9.88 | 16.01 | 8.81 | 88.46 | 71.58 | 10.95 | 37.50 | 33.67 | 95.59 | 64.56 | 4756917.0 | 30324 |
2 | AH | 2016 | 2487.0 | 180.19 | 99.20 | 98.05 | 12.76 | 21.82 | 13.18 | 97.36 | 92.03 | 14.02 | 41.71 | 37.67 | 99.94 | 99.94 | 7417525.0 | 67256 |
3 | AH | 2008 | 2043.0 | 174.61 | 95.11 | 87.60 | 7.50 | 14.15 | 1.75 | 78.86 | 58.64 | 9.29 | 35.97 | 32.03 | 94.75 | 53.95 | 2722949.0 | 25465 |
4 | AH | 2011 | 2265.0 | 168.99 | 96.55 | 93.35 | 10.66 | 18.00 | 10.25 | 91.09 | 79.04 | 11.88 | 39.47 | 34.55 | 97.06 | 86.99 | 6203403.0 | 32884 |
train.shape
(279, 18)
Test set
test = data[~data["wstz"].isin(list(train["wstz"]))]  # the rows not sampled; assumes wstz values are unique
test.shape
(31, 18)
test.head()
 | city | year | rkmd | rjys | ys | rq | gd | dl | ps | ws | jz | gymj | lh | ldl | shlj | whh | qntz | wstz
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
7 | AH | 2015 | 2458.0 | 168.90 | 98.79 | 97.55 | 12.38 | 20.82 | 12.67 | 96.68 | 91.80 | 13.37 | 41.16 | 37.16 | 99.55 | 99.55 | 6324122.0 | 106486 |
10 | BJ | 2008 | 1181.0 | 187.22 | 100.00 | 100.00 | 6.77 | 6.21 | 0.73 | 78.92 | 74.52 | 8.56 | 37.15 | 35.85 | 97.71 | 97.71 | 5189156.0 | 98295 |
29 | FJ | 2017 | 2854.0 | 203.69 | 99.56 | 97.45 | 13.85 | 17.41 | 9.42 | 92.21 | 89.07 | 14.13 | 43.69 | 40.28 | 99.38 | 99.38 | 7128613.0 | 260721 |
32 | GS | 2010 | 3793.0 | 155.12 | 91.57 | 74.29 | 6.89 | 12.20 | 4.89 | 62.59 | 58.58 | 8.12 | 27.12 | 23.14 | 97.84 | 37.95 | 944242.0 | 6289 |
47 | GD | 2015 | 3060.0 | 248.95 | 98.46 | 97.60 | 17.74 | 13.60 | 9.51 | 93.65 | 93.25 | 17.40 | 41.43 | 37.32 | 97.63 | 91.56 | 8496440.0 | 644310 |
The plots above show that each city's indicators are either roughly stable or trending upward over the years, and the city curves rarely cross, which suggests these indicators are already enough to tell the samples apart.
The heatmap also shows that year is seldom strongly correlated with the other variables, and most indicators change little from year to year, so the year variable is dropped as well.
Drop the city and year columns
train = train.drop(["city","year"],axis=1)
X_train = train[train.columns[:-1]]  # features: all columns but the last
y_train = train[train.columns[-1]]   # target: wstz
test = test.drop(["city","year"],axis=1)
X_test = test[test.columns[:-1]]
y_test = test[test.columns[-1]]
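As an aside (not part of the original workflow), sklearn can produce a per-city stratified split in one call; a minimal sketch, assuming 10 rows per city so that test_size=0.1 leaves one test row per city (random_state is illustrative):

# Hedged alternative to the groupby-based sampling above
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(
    data, test_size=0.1,
    stratify=data["city"],  # keep every city represented in both splits
    random_state=0)
print(train_df.shape, test_df.shape)  # should reproduce the (279, 18) / (31, 18) split

This also avoids matching test rows back through wstz values, which only works while those values stay unique.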
1. Linear regression
from sklearn import linear_model
model1_linear = linear_model.LinearRegression()
model1_linear = model1_linear.fit(X_train,y_train)
y_pred1 = model1_linear.predict(X_test)
from sklearn.metrics import mean_absolute_error
# note: the documented argument order is metric(y_true, y_pred); MAE is symmetric,
# but r2_score is not, so the swapped order used throughout changes its value
mean_absolute_error(y_pred1,y_test)
84180.196193290452
from sklearn.metrics import r2_score
r2_score(y_pred1,y_test)
0.48478789713121917
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred1),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
2. Decision tree regression
from sklearn import tree
model2_tree = tree.DecisionTreeRegressor()
model2_tree = model2_tree.fit(X_train,y_train)
y_pred2 = model2_tree.predict(X_test)
mean_absolute_error(y_pred2,y_test)
52285.806451612902
r2_score(y_pred2,y_test)
0.55980167732053721
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred2),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
3. SVM regression
from sklearn import svm
model3_SVR = svm.SVR()
model3_SVR = model3_SVR.fit(X_train,y_train)
y_pred3 = model3_SVR.predict(X_test)
mean_absolute_error(y_pred3,y_test)
90906.8702612044
r2_score(y_pred3,y_test)
-16123841.245722869
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred3),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
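The hugely negative R² above is expected: SVR with its default RBF kernel is sensitive to feature scale, and wide-range columns such as qntz (in the millions) dominate the kernel distances. Below is a minimal sketch, not part of the original experiment, of standardizing the features first; the C value is illustrative.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
model3_scaled = make_pipeline(StandardScaler(), svm.SVR(C=100.0))  # scale, then fit SVR
model3_scaled = model3_scaled.fit(X_train, y_train)
y_pred3_scaled = model3_scaled.predict(X_test)
mean_absolute_error(y_pred3_scaled, y_test)  # expect a large improvement over the unscaled run

Scaling or log-transforming the target wstz, whose values span several orders of magnitude, may help further, since SVR's default epsilon of 0.1 is negligible at that scale.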
4. KNN regression
from sklearn.neighbors import KNeighborsRegressor
model4_KN = KNeighborsRegressor(n_neighbors=3)
model4_KN.fit(X_train,y_train)
y_pred4 = model4_KN.predict(X_test)
mean_absolute_error(y_pred4,y_test)
117266.68817204301
r2_score(y_pred4,y_test)
-1.5288016385577752
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred4),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
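KNN is distance-based like SVR, so the standardization pipeline sketched in the SVM section applies here as well; without it, wide-range columns such as qntz swamp the neighbour distances, which is consistent with the negative R² above.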
5. Random forest regression
from sklearn.ensemble import RandomForestRegressor
model5_RF = RandomForestRegressor()
model5_RF = model5_RF.fit(X_train,y_train)
y_pred5 = model5_RF.predict(X_test)
mean_absolute_error(y_pred5,y_test)
34605.676129032247
r2_score(y_pred5,y_test)
0.74665386911940801
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred5),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
6. AdaBoost regression
from sklearn.ensemble import AdaBoostRegressor
model6_AdaBoost = AdaBoostRegressor(n_estimators=100)
model6_AdaBoost = model6_AdaBoost.fit(X_train,y_train)
y_pred6 = model6_AdaBoost.predict(X_test)
mean_absolute_error(y_pred6,y_test)
71458.328974579621
r2_score(y_pred6,y_test)
0.30236366136673865
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred6),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
7. Gradient boosting regression
from sklearn.ensemble import GradientBoostingRegressor
model7_GBDT = GradientBoostingRegressor()
model7_GBDT = model7_GBDT.fit(X_train,y_train)
y_pred7 = model7_GBDT.predict(X_test)
mean_absolute_error(y_pred7,y_test)
30247.23832854132
r2_score(y_pred7,y_test)
0.79286645995236993
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred7),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
8. Bagging regression
from sklearn.ensemble import BaggingRegressor
model8_Bagging = BaggingRegressor()
model8_Bagging = model8_Bagging.fit(X_train,y_train)
y_pred8 = model8_Bagging.predict(X_test)
mean_absolute_error(y_pred8,y_test)
36788.058064516132
r2_score(y_pred8,y_test)
0.68475165003444116
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred8),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
9. ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
model9_Etra = ExtraTreeRegressor()
model9_Etra = model9_Etra.fit(X_train,y_train)
y_pred9 = model9_Etra.predict(X_test)
mean_absolute_error(y_pred9,y_test)
45674.580645161288
r2_score(y_pred9,y_test)
0.31221911911576294
plt.figure(figsize=(12, 6))
plt.plot(list(y_pred9),label="forecast")
plt.plot(list(y_test),label="test")
plt.ylabel('wstz',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
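One caveat: sklearn.tree.ExtraTreeRegressor is a single extremely randomized tree, while the ensemble named in the introduction is sklearn.ensemble.ExtraTreesRegressor (note the plural), which averages many such trees. A minimal sketch of the ensemble variant, with illustrative parameters:

from sklearn.ensemble import ExtraTreesRegressor
model9_ensemble = ExtraTreesRegressor(n_estimators=100, random_state=0)  # ensemble of randomized trees
model9_ensemble = model9_ensemble.fit(X_train, y_train)
mean_absolute_error(model9_ensemble.predict(X_test), y_test)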
Model comparison
# SVM and KNN are omitted from the comparison: their R² on this split was strongly negative
modelnames = ['linear_model',
'DecisionTreeRegressor',
'RandomForestRegressor',
'AdaBoostRegressor',
'GradientBoostingRegressor',
'BaggingRegressor',
'ExtraTreeRegressor',
]
R_square = [r2_score(y_pred1,y_test),r2_score(y_pred2,y_test),
r2_score(y_pred5,y_test),r2_score(y_pred6,y_test),
r2_score(y_pred7,y_test),r2_score(y_pred8,y_test),
r2_score(y_pred9,y_test)]
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(1, 1, 1)
ticks = ax.set_xticks(range(0,7)) # seven x-axis tick positions, one per model
ax.plot(R_square,'ko--')
labels = ax.set_xticklabels(modelnames,fontsize='14',rotation=90)
plt.title("model comparsion",fontsize='14')
plt.grid(True)
plt.show()
The two error measures agree with each other here: the larger the coefficient of determination, the smaller the mean absolute error, and the experimental results above bear this out.
The two best-performing models:
Random forest regression: mean absolute error 34605, R² 0.747
Gradient boosting regression: mean absolute error 30247, R² 0.793
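The tuning note at the top applies especially to these two leaders. Below is a minimal GridSearchCV sketch for the gradient boosting model; the grid values are illustrative, not tuned recommendations.

from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [100, 300],
              "max_depth": [2, 3, 4],
              "learning_rate": [0.05, 0.1]}
# scoring uses negated MAE so that larger is better, matching the comparison metric above
search = GridSearchCV(GradientBoostingRegressor(random_state=0),
                      param_grid, cv=5,
                      scoring="neg_mean_absolute_error")
search.fit(X_train, y_train)
print(search.best_params_, -search.best_score_)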
10. MLP regression
A feed-forward neural network, also called a multi-layer perceptron (MLP) or fully connected network, works layer by layer: each layer's neurons take the previous layer's outputs, transform them, and pass the result to the next layer, and the output of the final layer is the network's prediction.
import numpy as np
from sklearn.neural_network import MLPRegressor
MLP = MLPRegressor()
MLP.fit(X_train,y_train)
y_pred10 = MLP.predict(X_test)
score = r2_score(y_test, y_pred10)
score
plt.figure(figsize=(12, 6))
plt.plot(np.arange(len(y_pred10)),y_test,'go-',label = 'true value')
plt.plot(np.arange(len(y_pred10)),y_pred10,'ro-',label = 'predicted value')
plt.legend()
plt.show()
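Like SVR and KNN, the MLP is scale-sensitive, and MLPRegressor's default max_iter=200 often stops before converging (the warning is hidden by the filterwarnings call in the setup). A minimal sketch of a standardized, longer-running variant; the layer sizes are illustrative.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
mlp_scaled = make_pipeline(StandardScaler(),                 # scale inputs first
                           MLPRegressor(hidden_layer_sizes=(64, 32),
                                        max_iter=2000, random_state=0))
mlp_scaled.fit(X_train, y_train)
r2_score(y_test, mlp_scaled.predict(X_test))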