Machine Learning Labs
Lab 1: Linear Regression
Environment setup:
The first lab is documented in a bit more detail; all the later ones use exactly the same setup.
First, create a new environment in Anaconda. Installing Anaconda itself is not covered here; a quick web search will turn up plenty of guides.
Create a Python 2.7 environment:
conda create -n jqlearn python=2.7
Whenever conda asks for confirmation (Proceed ([y]/n)?), answer y to continue. Once installation finishes, switch into the new environment; since mine is named jqlearn, the activation command is:
conda activate jqlearn

In the screenshot above, the prefix at the far left of the prompt changes from (DrugX) to (jqlearn), which means the switch succeeded (yours will probably start from (base)). Next, install the libraries we need:
numpy:
conda install numpy
pandas:
conda install pandas
matplotlib:
conda install -c conda-forge matplotlib
scikit-learn:
conda install scikit-learn
After everything is installed, you can run the command below to confirm the packages are all there:
conda list
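You can also double check from inside Python; this little snippet (my own addition, not part of the lab) just imports the four libraries and prints their versions:
# confirm all four libraries import cleanly in the new environment
import numpy, pandas, matplotlib, sklearn
print(numpy.__version__)
print(pandas.__version__)
print(matplotlib.__version__)
print(sklearn.__version__)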

With the environment configured, open PyCharm and create a new project.

Select the interpreter we just configured; after opening the dialog you should see something like the screenshot below.

The interpreter is usually detected automatically, as the arrow shows; just click OK.
Once the project is created, add a directory named data and put the experiment's data files inside it.
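Before running anything big, here is a tiny check (my own addition; it assumes the regress_data1.csv file used below) to make sure the file is readable from the data folder:
# coding=utf-8
import pandas as pd
# print the first few rows to confirm the CSV loads from data/
print(pd.read_csv('data/regress_data1.csv').head())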

Then paste the code below into main.py and run it.
Univariate linear regression:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# data.head()
# data.describe()
# # quick look at what the data looks like
# data.plot(kind='scatter', x='人口', y='收益', figsize=(12,8))
# plt.xlabel('人口', fontsize=18)
# plt.ylabel('收益', rotation=0, fontsize=18)
# plt.show()
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, :cols - 1]  # X: all rows, every column except the last
    y = data.iloc[:, cols - 1:]  # y: all rows, last column only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test
# cost function: J(w) = 1/(2m) * sum((X*w^T - y)^2)
def computeCost(X, y, w):
    inner = np.power(((X * w.T) - y), 2)  # (m,n) x (n,1) -> (m,1)
    # return np.sum(inner) / (2 * len(X))
    return np.sum(inner) / (2 * X.shape[0])
# batch gradient descent: w_j := w_j - (alpha/m) * sum(error * x_j) for every parameter j
def batch_gradientDescent(X, y, w, alpha, iters):
    temp = np.matrix(np.zeros(w.shape))
    parameters = int(w.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * w.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = w[0, j] - ((alpha / len(X)) * np.sum(term))
        w = temp
        cost[i] = computeCost(X, y, w)
    return w, cost
# driver code
path = 'data/regress_data1.csv'
X_train, X_test, y_train, y_test = getTrainSetAndTestSet(path)
X = np.matrix(X_train.values)  # the cost function expects numpy matrices, so convert X and y
y = np.matrix(y_train.values)
w = np.matrix(np.array([0, 0]))
# print(X.shape, w.shape, y.shape)
print(computeCost(X, y, w))  # cost with the initial (all-zero) weights
alpha = 0.01  # extra variables: the learning rate alpha and the number of iterations to run
iters = 1000
g, cost = batch_gradientDescent(X, y, w, alpha, iters)
print(g)
print(computeCost(X, y, g))  # cost (error) of the trained model
The output looks like this:

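Note that the script splits off a test set but never actually uses it. As an extra check (my own addition, appended to the end of the script above), the same cost function can score the held-out split:
# evaluate the trained weights g on the held-out test data as well
X_te = np.matrix(X_test.values)
y_te = np.matrix(y_test.values)
print(computeCost(X_te, y_te, g))  # should be in the same ballpark as the training cost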
Next, plot the linear model on top of the data to get a visual sense of the fit:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# data.head()
# data.describe()
# # quick look at what the data looks like
# data.plot(kind='scatter', x='人口', y='收益', figsize=(12,8))
# plt.xlabel('人口', fontsize=18)
# plt.ylabel('收益', rotation=0, fontsize=18)
# plt.show()
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, :cols - 1]  # X: all rows, every column except the last
    y = data.iloc[:, cols - 1:]  # y: all rows, last column only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test, data
# cost function: J(w) = 1/(2m) * sum((X*w^T - y)^2)
def computeCost(X, y, w):
    inner = np.power(((X * w.T) - y), 2)  # (m,n) x (n,1) -> (m,1)
    # return np.sum(inner) / (2 * len(X))
    return np.sum(inner) / (2 * X.shape[0])
# batch gradient descent: w_j := w_j - (alpha/m) * sum(error * x_j) for every parameter j
def batch_gradientDescent(X, y, w, alpha, iters):
    temp = np.matrix(np.zeros(w.shape))
    parameters = int(w.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * w.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = w[0, j] - ((alpha / len(X)) * np.sum(term))
        w = temp
        cost[i] = computeCost(X, y, w)
    return w, cost
# driver code
path = 'data/regress_data1.csv'
X_train, X_test, y_train, y_test, data = getTrainSetAndTestSet(path)
X = np.matrix(X_train.values)
y = np.matrix(y_train.values)
w = np.matrix(np.array([0, 0]))
print(computeCost(X, y, w))
alpha = 0.01
iters = 1000
g, cost = batch_gradientDescent(X, y, w, alpha, iters)
print(g)
print(computeCost(X, y, g))
# plot the fit (labels use the Chinese column names from the CSV, hence the SimHei font setting)
x = np.linspace(data['人口'].min(), data['人口'].max(), 100)
f = g[0, 0] + (g[0, 1] * x)  # fitted line: f(x) = w0 + w1*x
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='预测值')
ax.scatter(data['人口'], data['收益'], label='训练数据')
ax.legend(loc=2)
ax.set_xlabel('人口', fontsize=18)
ax.set_ylabel('收益', rotation=0, fontsize=18)
ax.set_title('预测收益和人口规模', fontsize=18)
plt.show()
The only change from the previous script is that the preprocessing function now also returns data. The resulting plot:


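Since scikit-learn is installed anyway, here is a quick cross-check I like to append to the end of the script (my own addition, not part of the lab handout): fit the same training data with its built-in linear regression and compare the coefficients. They should be roughly comparable to g, though gradient descent may not have fully converged after 1000 iterations.
# cross-check against scikit-learn's closed-form linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=False)  # the 'Ones' column already supplies the intercept
model.fit(X_train, y_train)
print(model.coef_)  # compare with the weights g found by gradient descent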
Multivariate linear regression:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# data.head()
# data.describe()
# # quick look at what the data looks like
# data.plot(kind='scatter', x='人口', y='收益', figsize=(12,8))
# plt.xlabel('人口', fontsize=18)
# plt.ylabel('收益', rotation=0, fontsize=18)
# plt.show()
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    data = (data - data.mean()) / data.std()  # feature scaling: standardize every column (including the target)
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]  # X: all rows, every column except the last
    y = data.iloc[:, cols - 1:cols]  # y: all rows, last column only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test, data
# cost function: J(w) = 1/(2m) * sum((X*w^T - y)^2)
def computeCost(X, y, w):
    inner = np.power(((X * w.T) - y), 2)  # (m,n) x (n,1) -> (m,1)
    # return np.sum(inner) / (2 * len(X))
    return np.sum(inner) / (2 * X.shape[0])
# batch gradient descent: w_j := w_j - (alpha/m) * sum(error * x_j) for every parameter j
def batch_gradientDescent(X, y, w, alpha, iters):
    temp = np.matrix(np.zeros(w.shape))
    parameters = int(w.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * w.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = w[0, j] - ((alpha / len(X)) * np.sum(term))
        w = temp
        cost[i] = computeCost(X, y, w)
    return w, cost
# driver code
path = 'data/regress_data2.csv'
X_train, X_test, y_train, y_test, data = getTrainSetAndTestSet(path)
X = np.matrix(X_train.values)
y = np.matrix(y_train.values)
w = np.matrix(np.array([0, 0, 0]))  # three weights now: the bias plus two features
print(X.shape, y.shape, w.shape)
# print(computeCost(X, y, w))
alpha = 0.01
iters = 1000
g, cost = batch_gradientDescent(X, y, w, alpha, iters)
print(g)
print(computeCost(X, y, g))
# plot how the cost falls as training proceeds (Chinese axis labels, hence SimHei)
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('迭代次数', fontsize=18)
ax.set_ylabel('代价', rotation=0, fontsize=18)
ax.set_title('误差和训练Epoch数', fontsize=18)
plt.show()
The result:

You will notice that the Chinese labels in the plot come out garbled:

That is because of these two lines at the top of the script:
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
They only work if the SimHei font is actually installed on your machine; a walkthrough for downloading and installing it is at https://blog.csdn.net/DS__SS/article/details/123824668
With the font installed, the plot becomes:

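If you are not sure whether matplotlib can actually see SimHei, this little check (my own addition) lists every font family it knows about; SimHei must appear in the list for the Chinese labels to work:
from matplotlib import font_manager
# print every font family name matplotlib has registered
print(sorted({f.name for f in font_manager.fontManager.ttflist}))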
Lab 2: Logistic Regression
You can keep using the same project.
Start by taking a look at the dataset: the code below uses matplotlib to draw it as a scatter plot.
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import scipy.optimize as opt
from sklearn.model_selection import train_test_split
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
def getTrainSetAndTestSet(DataPath):
    # the txt file has no header row, so name the columns explicitly (otherwise the first sample is eaten as a header)
    data = pd.read_csv(DataPath, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]
    y = data.iloc[:, cols - 1:cols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test
def drawdata(path):
    data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    positive = data[data['Admitted'].isin([1])]
    negative = data[data['Admitted'].isin([0])]
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
    ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
    ax.legend()
    ax.set_xlabel('Exam 1 Score')
    ax.set_ylabel('Exam 2 Score')
    plt.show()
path = 'data2/ex2data1.txt'
drawdata(path)
getTrainSetAndTestSet(path)
The scatter plot:

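It can also be worth a quick look at how balanced the two classes are before training; this check (my own addition, using the same file path as below) counts the samples per label:
# coding=utf-8
import pandas as pd
# count how many admitted (1) and not-admitted (0) samples the dataset holds
data = pd.read_csv('data2/ex2data1.txt', header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
print(data['Admitted'].value_counts())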
The training code is as follows:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import scipy.optimize as opt
from sklearn.model_selection import train_test_split
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
def getTrainSetAndTestSet(DataPath):
    # the txt file has no header row, so name the columns explicitly (otherwise the first sample is eaten as a header)
    data = pd.read_csv(DataPath, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]
    y = data.iloc[:, cols - 1:cols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test
def drawdata(path):
    data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    positive = data[data['Admitted'].isin([1])]
    negative = data[data['Admitted'].isin([0])]
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
    ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
    ax.legend()
    ax.set_xlabel('Exam 1 Score')
    ax.set_ylabel('Exam 2 Score')
    # plt.show()  # left commented out so the decision boundary can be drawn on this figure later
# sigmoid activation: maps any real number into (0, 1)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# logistic (cross-entropy) loss: J(w) = -1/m * sum(y*log(h) + (1-y)*log(1-h)), with h = sigmoid(X*w^T)
def cost(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    return np.sum(first - second) / (len(X))
# gradient of the logistic loss (returns the gradient vector only; the scipy optimizer performs the updates)
def gradient(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        grad[i] = np.sum(term) / len(X)
    return grad
# regularized loss: adds (lambda/2m) * sum(w_j^2) over the non-bias weights (defined for completeness; unused below)
def costReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(w[:, 1:w.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
# regularized gradient: the bias weight (i == 0) is not regularized (also unused below)
def gradientReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * w[:, i])
    return grad
# prediction: label 1 when the predicted probability is at least 0.5
def predict(w, X):
    probability = sigmoid(X * w.T)
    return [1 if x >= 0.5 else 0 for x in probability]
path = 'data2/ex2data1.txt'
# drawdata(path)
X_train, X_test, y_train, y_test = getTrainSetAndTestSet(path)
X = np.array(X_train.values)
y = np.array(y_train.values)
w = np.zeros(3)
# print(X.shape, w.shape, y.shape)
# print(cost(w, X, y))  # cost with the initial (all-zero) weights
# gradient(w, X, y)
# result = opt.fmin_tnc(func=cost, x0=w, fprime=gradient, args=(X, y))  # alternative: truncated-Newton optimizer
result = opt.minimize(fun=cost, x0=w, args=(X, y), method='Newton-CG', jac=gradient)  # Newton-CG optimization
# report the accuracy on the training split
w_min = np.matrix(result.x)
predictions = predict(w_min, X)
correct = [
    1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
    for (a, b) in zip(predictions, y)
]
accuracy = sum(map(int, correct)) * 100 / len(correct)  # percentage of training samples classified correctly
print('accuracy = {0}%'.format(accuracy))
# plot the decision boundary: the line where w0 + w1*x1 + w2*x2 = 0, i.e. x2 = -(w0 + w1*x1) / w2
# print(result.x)
coef = -(result.x / result.x[2])
# print(coef)
x = np.arange(20, 110, 10)
y = coef[0] + coef[1] * x
drawdata(path)
plt.plot(x, y)
plt.title('Decision Boundary')
plt.show()
The result:


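The accuracy above is measured on the training split. As a final check (my own addition, appended to the end of the training script), the same predict function can score the held-out test set:
# score the held-out test split with the trained weights
X_te = np.array(X_test.values)
y_te = np.array(y_test.values)
test_pred = predict(w_min, X_te)
test_correct = [1 if a == b[0] else 0 for (a, b) in zip(test_pred, y_te)]
print('test accuracy = {0}%'.format(sum(test_correct) * 100 / len(test_correct)))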