Machine Learning Labs
Lab 1: Linear Regression
Environment setup:
The first lab is documented in a bit more detail; all the later ones use exactly the same setup.
First, create a new environment in Anaconda. Installing Anaconda itself is not covered here; a quick web search will turn up plenty of guides.
Create a Python 2.7 environment:
conda create -n jqlearn python=2.7
Whenever conda asks for confirmation (Proceed ([y]/n)?), answer y to continue. Once installation finishes, switch into the new environment; since mine is named jqlearn, the activation command is:
conda activate jqlearn

In the screenshot above, the prefix at the far left of the prompt changes from (DrugX) to (jqlearn), which means the switch succeeded (yours will probably start from (base)). Next, install the libraries we need:
numpy:
conda install numpy
pandas:
conda install pandas
matplotlib:
conda install -c conda-forge matplotlib
scikit-learn:
conda install scikit-learn
After everything is installed, you can run the command below to confirm the packages are all there:
conda list
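You can also double check from inside Python; this little snippet (my own addition, not part of the lab) just imports the four libraries and prints their versions:
# confirm all four libraries import cleanly in the new environment
import numpy, pandas, matplotlib, sklearn
print(numpy.__version__)
print(pandas.__version__)
print(matplotlib.__version__)
print(sklearn.__version__)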

With the environment configured, open PyCharm and create a new project.

Select the interpreter we just configured; after opening the dialog you should see something like the screenshot below.

The interpreter is usually detected automatically, as the arrow shows; just click OK.
Once the project is created, add a directory named data and put the experiment's data files inside it.
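Before running anything big, here is a tiny check (my own addition; it assumes the regress_data1.csv file used below) to make sure the file is readable from the data folder:
# coding=utf-8
import pandas as pd
# print the first few rows to confirm the CSV loads from data/
print(pd.read_csv('data/regress_data1.csv').head())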

Then paste the code below into main.py and run it.
Univariate linear regression:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# data.head()
# data.describe()
# # quick look at what the data looks like
# data.plot(kind='scatter', x='人口', y='收益', figsize=(12,8))
# plt.xlabel('人口', fontsize=18)
# plt.ylabel('收益', rotation=0, fontsize=18)
# plt.show()
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, :cols - 1]  # X: all rows, every column except the last
    y = data.iloc[:, cols - 1:]  # y: all rows, last column only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test
# cost function: J(w) = 1/(2m) * sum((X*w^T - y)^2)
def computeCost(X, y, w):
    inner = np.power(((X * w.T) - y), 2)  # (m,n) x (n,1) -> (m,1)
    # return np.sum(inner) / (2 * len(X))
    return np.sum(inner) / (2 * X.shape[0])
# batch gradient descent: w_j := w_j - (alpha/m) * sum(error * x_j) for every parameter j
def batch_gradientDescent(X, y, w, alpha, iters):
    temp = np.matrix(np.zeros(w.shape))
    parameters = int(w.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * w.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = w[0, j] - ((alpha / len(X)) * np.sum(term))
        w = temp
        cost[i] = computeCost(X, y, w)
    return w, cost
# driver code
path = 'data/regress_data1.csv'
X_train, X_test, y_train, y_test = getTrainSetAndTestSet(path)
X = np.matrix(X_train.values)  # the cost function expects numpy matrices, so convert X and y
y = np.matrix(y_train.values)
w = np.matrix(np.array([0, 0]))
# print(X.shape, w.shape, y.shape)
print(computeCost(X, y, w))  # cost with the initial (all-zero) weights
alpha = 0.01  # extra variables: the learning rate alpha and the number of iterations to run
iters = 1000
g, cost = batch_gradientDescent(X, y, w, alpha, iters)
print(g)
print(computeCost(X, y, g))  # cost (error) of the trained model
The output looks like this:

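Note that the script splits off a test set but never actually uses it. As an extra check (my own addition, appended to the end of the script above), the same cost function can score the held-out split:
# evaluate the trained weights g on the held-out test data as well
X_te = np.matrix(X_test.values)
y_te = np.matrix(y_test.values)
print(computeCost(X_te, y_te, g))  # should be in the same ballpark as the training cost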
Next, plot the linear model on top of the data to get a visual sense of the fit:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# data.head()
# data.describe()
# # quick look at what the data looks like
# data.plot(kind='scatter', x='人口', y='收益', figsize=(12,8))
# plt.xlabel('人口', fontsize=18)
# plt.ylabel('收益', rotation=0, fontsize=18)
# plt.show()
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, :cols - 1]  # X: all rows, every column except the last
    y = data.iloc[:, cols - 1:]  # y: all rows, last column only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test, data
# cost function: J(w) = 1/(2m) * sum((X*w^T - y)^2)
def computeCost(X, y, w):
    inner = np.power(((X * w.T) - y), 2)  # (m,n) x (n,1) -> (m,1)
    # return np.sum(inner) / (2 * len(X))
    return np.sum(inner) / (2 * X.shape[0])
# batch gradient descent: w_j := w_j - (alpha/m) * sum(error * x_j) for every parameter j
def batch_gradientDescent(X, y, w, alpha, iters):
    temp = np.matrix(np.zeros(w.shape))
    parameters = int(w.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * w.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = w[0, j] - ((alpha / len(X)) * np.sum(term))
        w = temp
        cost[i] = computeCost(X, y, w)
    return w, cost
# driver code
path = 'data/regress_data1.csv'
X_train, X_test, y_train, y_test, data = getTrainSetAndTestSet(path)
X = np.matrix(X_train.values)
y = np.matrix(y_train.values)
w = np.matrix(np.array([0, 0]))
print(computeCost(X, y, w))
alpha = 0.01
iters = 1000
g, cost = batch_gradientDescent(X, y, w, alpha, iters)
print(g)
print(computeCost(X, y, g))
# plot the fit (labels use the Chinese column names from the CSV, hence the SimHei font setting)
x = np.linspace(data['人口'].min(), data['人口'].max(), 100)
f = g[0, 0] + (g[0, 1] * x)  # fitted line: f(x) = w0 + w1*x
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x, f, 'r', label='预测值')
ax.scatter(data['人口'], data['收益'], label='训练数据')
ax.legend(loc=2)
ax.set_xlabel('人口', fontsize=18)
ax.set_ylabel('收益', rotation=0, fontsize=18)
ax.set_title('预测收益和人口规模', fontsize=18)
plt.show()
The only change from the previous script is that the preprocessing function now also returns data. The resulting plot:


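Since scikit-learn is installed anyway, here is a quick cross-check I like to append to the end of the script (my own addition, not part of the lab handout): fit the same training data with its built-in linear regression and compare the coefficients. They should be roughly comparable to g, though gradient descent may not have fully converged after 1000 iterations.
# cross-check against scikit-learn's closed-form linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=False)  # the 'Ones' column already supplies the intercept
model.fit(X_train, y_train)
print(model.coef_)  # compare with the weights g found by gradient descent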
Multivariate linear regression:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
# data.head()
# data.describe()
# # quick look at what the data looks like
# data.plot(kind='scatter', x='人口', y='收益', figsize=(12,8))
# plt.xlabel('人口', fontsize=18)
# plt.ylabel('收益', rotation=0, fontsize=18)
# plt.show()
def getTrainSetAndTestSet(DataPath):
    data = pd.read_csv(DataPath)
    data = (data - data.mean()) / data.std()  # feature scaling: standardize every column (including the target)
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]  # X: all rows, every column except the last
    y = data.iloc[:, cols - 1:cols]  # y: all rows, last column only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test, data
# cost function: J(w) = 1/(2m) * sum((X*w^T - y)^2)
def computeCost(X, y, w):
    inner = np.power(((X * w.T) - y), 2)  # (m,n) x (n,1) -> (m,1)
    # return np.sum(inner) / (2 * len(X))
    return np.sum(inner) / (2 * X.shape[0])
# batch gradient descent: w_j := w_j - (alpha/m) * sum(error * x_j) for every parameter j
def batch_gradientDescent(X, y, w, alpha, iters):
    temp = np.matrix(np.zeros(w.shape))
    parameters = int(w.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * w.T) - y
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = w[0, j] - ((alpha / len(X)) * np.sum(term))
        w = temp
        cost[i] = computeCost(X, y, w)
    return w, cost
# driver code
path = 'data/regress_data2.csv'
X_train, X_test, y_train, y_test, data = getTrainSetAndTestSet(path)
X = np.matrix(X_train.values)
y = np.matrix(y_train.values)
w = np.matrix(np.array([0, 0, 0]))  # three weights now: the bias plus two features
print(X.shape, y.shape, w.shape)
# print(computeCost(X, y, w))
alpha = 0.01
iters = 1000
g, cost = batch_gradientDescent(X, y, w, alpha, iters)
print(g)
print(computeCost(X, y, g))
# plot how the cost falls as training proceeds (Chinese axis labels, hence SimHei)
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('迭代次数', fontsize=18)
ax.set_ylabel('代价', rotation=0, fontsize=18)
ax.set_title('误差和训练Epoch数', fontsize=18)
plt.show()
The result:

You will notice that the Chinese labels in the plot come out garbled:

That is because of these two lines at the top of the script:
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
They only work if the SimHei font is actually installed on your machine; a walkthrough for downloading and installing it is at https://blog.csdn.net/DS__SS/article/details/123824668
With the font installed, the plot becomes:

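If you are not sure whether matplotlib can actually see SimHei, this little check (my own addition) lists every font family it knows about; SimHei must appear in the list for the Chinese labels to work:
from matplotlib import font_manager
# print every font family name matplotlib has registered
print(sorted({f.name for f in font_manager.fontManager.ttflist}))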
Lab 2: Logistic Regression
You can keep using the same project.
Start by taking a look at the dataset: the code below uses matplotlib to draw it as a scatter plot.
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import scipy.optimize as opt
from sklearn.model_selection import train_test_split
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
def getTrainSetAndTestSet(DataPath):
    # the txt file has no header row, so name the columns explicitly (otherwise the first sample is eaten as a header)
    data = pd.read_csv(DataPath, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]
    y = data.iloc[:, cols - 1:cols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test
def drawdata(path):
    data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    positive = data[data['Admitted'].isin([1])]
    negative = data[data['Admitted'].isin([0])]
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
    ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
    ax.legend()
    ax.set_xlabel('Exam 1 Score')
    ax.set_ylabel('Exam 2 Score')
    plt.show()
path = 'data2/ex2data1.txt'
drawdata(path)
getTrainSetAndTestSet(path)
The scatter plot:

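It can also be worth a quick look at how balanced the two classes are before training; this check (my own addition, using the same file path as below) counts the samples per label:
# coding=utf-8
import pandas as pd
# count how many admitted (1) and not-admitted (0) samples the dataset holds
data = pd.read_csv('data2/ex2data1.txt', header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
print(data['Admitted'].value_counts())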
The training code is as follows:
# coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import scipy.optimize as opt
from sklearn.model_selection import train_test_split
reload(sys)
sys.setdefaultencoding('utf-8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
def getTrainSetAndTestSet(DataPath):
    # the txt file has no header row, so name the columns explicitly (otherwise the first sample is eaten as a header)
    data = pd.read_csv(DataPath, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    data.insert(0, 'Ones', 1)  # bias column of ones so the intercept can be learned as a weight
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]
    y = data.iloc[:, cols - 1:cols]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)  # random train/test split; by default 25% of the data becomes the test set
    # inspect the shapes of the training and test sets
    print "Shapes of the training and test sets:"
    print X_train.shape
    print y_train.shape
    print X_test.shape
    print y_test.shape
    return X_train, X_test, y_train, y_test
def drawdata(path):
    data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
    positive = data[data['Admitted'].isin([1])]
    negative = data[data['Admitted'].isin([0])]
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
    ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
    ax.legend()
    ax.set_xlabel('Exam 1 Score')
    ax.set_ylabel('Exam 2 Score')
    # plt.show()  # left commented out so the decision boundary can be drawn on this figure later
# sigmoid activation: maps any real number into (0, 1)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# logistic (cross-entropy) loss: J(w) = -1/m * sum(y*log(h) + (1-y)*log(1-h)), with h = sigmoid(X*w^T)
def cost(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    return np.sum(first - second) / (len(X))
# gradient of the logistic loss (returns the gradient vector only; the scipy optimizer performs the updates)
def gradient(w, X, y):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        grad[i] = np.sum(term) / len(X)
    return grad
# regularized loss: adds (lambda/2m) * sum(w_j^2) over the non-bias weights (defined for completeness; unused below)
def costReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * w.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * w.T)))
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(w[:, 1:w.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
# regularized gradient: the bias weight (i == 0) is not regularized (also unused below)
def gradientReg(w, X, y, learningRate):
    w = np.matrix(w)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(w.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * w.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:, i])
        if i == 0:
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * w[:, i])
    return grad
# prediction: label 1 when the predicted probability is at least 0.5
def predict(w, X):
    probability = sigmoid(X * w.T)
    return [1 if x >= 0.5 else 0 for x in probability]
path = 'data2/ex2data1.txt'
# drawdata(path)
X_train, X_test, y_train, y_test = getTrainSetAndTestSet(path)
X = np.array(X_train.values)
y = np.array(y_train.values)
w = np.zeros(3)
# print(X.shape, w.shape, y.shape)
# print(cost(w, X, y))  # cost with the initial (all-zero) weights
# gradient(w, X, y)
# result = opt.fmin_tnc(func=cost, x0=w, fprime=gradient, args=(X, y))  # alternative: truncated-Newton optimizer
result = opt.minimize(fun=cost, x0=w, args=(X, y), method='Newton-CG', jac=gradient)  # Newton-CG optimization
# report the accuracy on the training split
w_min = np.matrix(result.x)
predictions = predict(w_min, X)
correct = [
    1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0
    for (a, b) in zip(predictions, y)
]
accuracy = sum(map(int, correct)) * 100 / len(correct)  # percentage of training samples classified correctly
print('accuracy = {0}%'.format(accuracy))
# plot the decision boundary: the line where w0 + w1*x1 + w2*x2 = 0, i.e. x2 = -(w0 + w1*x1) / w2
# print(result.x)
coef = -(result.x / result.x[2])
# print(coef)
x = np.arange(20, 110, 10)
y = coef[0] + coef[1] * x
drawdata(path)
plt.plot(x, y)
plt.title('Decision Boundary')
plt.show()
The result:


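The accuracy above is measured on the training split. As a final check (my own addition, appended to the end of the training script), the same predict function can score the held-out test set:
# score the held-out test split with the trained weights
X_te = np.array(X_test.values)
y_te = np.array(y_test.values)
test_pred = predict(w_min, X_te)
test_correct = [1 if a == b[0] else 0 for (a, b) in zip(test_pred, y_te)]
print('test accuracy = {0}%'.format(sum(test_correct) * 100 / len(test_correct)))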