Common Classification Models Implemented in Python

This article walks through several common classification models, including Fisher discriminant analysis, SVM, GBDT, MLP, Bayes, and Logistic Regression, with Python/sklearn implementation examples. It also covers model evaluation metrics and data preprocessing.


1. Common classification models

Fisher discriminant analysis (LDA)
SVM
GBDT
MLP
Bayes
Logistic Regression

2. Python implementation

Always normalize or standardize the features before a classification task, although some algorithms, such as tree-based models and LDA, are largely insensitive to feature scaling.

# Min-max normalization: compute the statistics on the training set only
# and reuse them for the test set to avoid data leakage.
def normalize(x, x_min, x_max):
    return (x - x_min) / (x_max - x_min)

x_min, x_max = np.min(X_train, axis=0), np.max(X_train, axis=0)
X_train = normalize(X_train, x_min, x_max)
X_test = normalize(X_test, x_min, x_max)
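
If you prefer standardization (zero mean, unit variance), here is a minimal sketch with sklearn's StandardScaler; note the scaler is fit on the training set only and then applied to both sets:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit on the training data, then transform it
X_test = scaler.transform(X_test)        # reuse the training statistics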

The examples below use the ready-made estimators from sklearn.

2.1 LDA (Linear Discriminant Analysis):
import numpy as np
# In older sklearn versions: from sklearn.lda import LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = LDA()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))

If you need to split the data set and evaluate on a held-out test set:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
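
The predictions can then be scored with sklearn's accuracy metric; a minimal sketch following on from the split above:

from sklearn.metrics import accuracy_score

print('LDA accuracy: {:.4f}'.format(accuracy_score(y_test, y_predict)))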
2.2 SVC:
import numpy as np
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(X, y) 

print(clf.predict([[-0.8, -1]]))
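
Since SVC is sensitive to feature scales, scaling and the classifier can be chained into one estimator; a minimal sketch using sklearn's make_pipeline and StandardScaler:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# scale the features, then fit the SVM, as a single estimator
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))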
2.3 GBDT:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)  
2.4 MLP

Multi-class classification

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau
import keras.backend as K
from keras.utils.np_utils import to_categorical

def DNN(input_dim, output_dim):
    '''
    Define a simple DNN model.
    '''
    model=Sequential()
    model.add(Dense(128,input_dim= input_dim,activation='tanh'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(50,activation='tanh'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss= 'categorical_crossentropy', optimizer= 'adam')
    return model

clf = DNN(8, 10)  # 8 input features; 10 output units, i.e. 10 classes.
y_train_cate = to_categorical(y_train, 10)
clf.fit(X_train, y_train_cate, epochs= 40, batch_size = 10, verbose=2)
y_predict = np.argmax( clf.predict(X_test), axis = 1)
print(y_predict.shape)
print ('dnn', metrics.accuracy_score(y_test, y_predict ))
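
2.5 Bayes and Logistic Regression:

The Bayes and Logistic Regression models listed in Section 1 can be used the same way. The sketch below is a minimal illustration rather than a tuned setup; it assumes a Gaussian naive Bayes (GaussianNB) and sklearn's default LogisticRegression, reusing the toy data from 2.1.

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

# Gaussian naive Bayes
nb = GaussianNB()
nb.fit(X, y)
print('NB:', nb.predict([[-0.8, -1]]))

# Logistic regression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X, y)
print('LR:', lr.predict([[-0.8, -1]]))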

Evaluation metrics for classification
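
A minimal sketch of the common classification metrics in sklearn (assuming the y_test and y_predict arrays from the examples above, and macro averaging for the multi-class case):

from sklearn import metrics

print('accuracy :', metrics.accuracy_score(y_test, y_predict))
print('precision:', metrics.precision_score(y_test, y_predict, average='macro'))
print('recall   :', metrics.recall_score(y_test, y_predict, average='macro'))
print('f1 score :', metrics.f1_score(y_test, y_predict, average='macro'))
print('confusion matrix:')
print(metrics.confusion_matrix(y_test, y_predict))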


Appendix:

Below is the wrapper style I usually use.

class baseModel():
    def get_data(self, shuffle = False):

        (X_train, y_train), (X_test, y_test) = self.load_data()
        
        if shuffle:
            idx_perm = np.random.RandomState(101).permutation(X_train.shape[0])
            X_train, y_train = X_train[idx_perm], y_train[idx_perm]

        X_train = X_train.astype('float32')
        X_test = X_test.astype('float32')

        print('X_train shape:', X_train.shape)
        print(X_train.shape[0], 'train samples')
        print(X_test.shape[0], 'test samples')

        return X_train, X_test, y_train, y_test
    
    def labelencoder(self, arr):
        '''
        label encoder
        input: np.array, like: np.array([2, 3, 4]);
        output: one-hot form like: np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])'''
        nclass = len(np.unique(arr))
        # remap the original labels to 0..nclass-1, then build the one-hot matrix
        for i, item in enumerate(np.unique(arr)):
            arr[np.where(arr == item)] = i
        arr = np.eye(nclass)[arr.astype(int)]
        return arr
    
    def selectSample(self, X_train, y_train, classList = [1, 3]):
        ''' Keep only the samples whose label is in classList. '''
        if y_train.ndim == 2:
            y_train = np.argmax(y_train, axis = -1)
        newY = None
        newX = None
        for n in classList:
            idx = np.where(y_train == n)[0]
            if newY is None:
                newY = y_train[idx]
                newX = X_train[idx]
            else:
                newY = np.concatenate((newY, y_train[idx]), axis = -1)
                newX = np.concatenate((newX, X_train[idx]), axis = 0)
        newY = self.labelencoder(newY)
        return newX, newY
    
    def load_data(self):
        # instantiate the data-loading class
        inputModule = inputTeData(dataRoot = self.train_root)
        X_train, y_train , X_valid, y_valid, X_test, y_test = inputModule.input()
        if self.classList is not None:
            X_train, y_train = self.selectSample(X_train, y_train, self.classList)
            X_test, y_test = self.selectSample(X_test, y_test, self.classList)

        return (X_train, y_train), (X_test, y_test)
    


class SVCModel(baseModel):
    def __init__(self, config):
        self.classes = config['classes']
        
        self.max_features = config['max_features']
        self.num_batch = config['num_batch']
        self.maxlen = config['maxlen']
        self.embedding_dims = config['embedding_dims']
        self.lstm_dims = config['lstm_dims']
        self.hidden_dims = config['hidden_dims']
        self.epochs = config['epochs']
        self.classes = config['classes']
        self.optimizer = config['optimizer']
        # data
        self.train_root = config['train_root']
        self.test_root = config['test_root']
        self.timeStep = config['timeStep']
        self.load = config['load']
        self.classList = config["classList"]
        self.dataformat = config["dataformat"]
        
    
    def build_model(self):
        self.model = SVC(gamma = "auto")
        return self
    
    def fit(self, X_train, y_train):
#        self.build_model()
        self.model.fit(X_train, y_train)
        
    def predict(self, X):
        pred_class = self.model.predict(X)
        return pred_class
    
    def evaluate_model(self, X, Y):
        pred_class = self.predict(X)
        pos = 0
        for i in range(len(pred_class)):
            if pred_class[i] == Y[i]:
                pos += 1
        print("acc:{:.4f}".format(pos*1.0/len(pred_class)))

class GBDTModel(baseModel):
    def __init__(self, config):
        self.classes = config['classes']
        
        self.max_features = config['max_features']
        self.num_batch = config['num_batch']
        self.maxlen = config['maxlen']
        self.embedding_dims = config['embedding_dims']
        self.lstm_dims = config['lstm_dims']
        self.hidden_dims = config['hidden_dims']
        self.epochs = config['epochs']
        self.classes = config['classes']
        self.optimizer = config['optimizer']
        # data
        self.train_root = config['train_root']
        self.test_root = config['test_root']
        self.timeStep = config['timeStep']
        self.load = config['load']
        self.classList = config["classList"]
        self.dataformat = config["dataformat"]
        
    def build_model(self, n_estimators = 100, learning_rate = 1.0, max_depth = 1,
                    random_state = 0, n_iter_no_change = None):
        self.model = GradientBoostingClassifier(
                n_estimators= n_estimators, learning_rate= learning_rate, 
                max_depth= max_depth, random_state= random_state,
                n_iter_no_change= n_iter_no_change)
        return self
    
    def fit(self, X_train, y_train):
#        self.build_model()
        self.model.fit(X_train, y_train)
        
    def predict(self, X):
        pred_class = self.model.predict(X)
        return pred_class
    
    def evaluate_model(self, X, Y):
        pred_class = self.predict(X)
        pos = 0
        for i in range(len(pred_class)):
            if pred_class[i] == Y[i]:
                pos += 1
        print("acc:{:.4f}".format(pos*1.0/len(pred_class)))

Usage:

SVC:

        model = SVCModel(config)
        X_train, X_test, y_train, y_test = model.get_data()
        model.build_model()
        model.fit(X_train, np.argmax(y_train, axis = 1))
        model.evaluate_model(X_test, np.argmax(y_test, axis = 1))
        y_pred = model.predict(X_test)
        evaluate = Evaluate(np.argmax(y_test, axis = 1), y_pred, "test", x_min = -10, 
                            x_max = len(y_pred), save = False, nClass = config["classes"], 
                            y_list = config["classList"], figsize = (16, 12), showOrigin = True)
        evaluate.plot()
    #    print(model.predict([[-0.8, -1]]))
        print("SVC done")

GBDT:

        model = GBDTModel(config)
        model.build_model(max_depth = 1)
        X_train, X_test, y_train, y_test = model.get_data(shuffle = False)
        model.fit(X_train, np.argmax(y_train, axis = 1))
        y_pred = model.predict(X_test)
        evaluate = Evaluate(np.argmax(y_test, axis = 1), y_pred, "test", x_min = -10, 
                    x_max = len(y_pred), save = False, nClass = config["classes"], 
                  y_list = config["classList"], figsize = (8, 6), showOrigin = True)
        evaluate.plot()
        print("GBDT model Done!")

I recently started a WeChat official account that mainly shares content on recommender systems, risk control, and related algorithms; feel free to follow if you are interested.
