1.常用分类模型
Fisher判别分析
SVM
GBDT
MLP
Bayes
Logistic Regression
2.python实现
分类任务前务必归一化或者标准化,虽然一些算法不受这个影响,如树模型、LDA。
# Min-max normalization
def normalize(x):
    """Linearly rescale *x* into the range [0, 1] (min-max normalization).

    Parameters
    ----------
    x : np.ndarray
        Numeric array to rescale.

    Returns
    -------
    np.ndarray
        Same shape as *x*, values in [0, 1]. If every value in *x* is
        equal, an all-zero array is returned instead of dividing by zero.
    """
    lo, hi = np.min(x), np.max(x)
    span = hi - lo
    if span == 0:  # constant input: avoid 0/0
        return np.zeros_like(x, dtype=float)
    return (x - lo) / span
# NOTE(review): each split is scaled with its OWN min/max here; to avoid
# train/test leakage the test set should reuse the training-set statistics
# (e.g. sklearn's MinMaxScaler fit on X_train only) — confirm intent.
X_train = normalize(X_train)
X_test = normalize(X_test)
下面采用sklearn上现成的包实现。
2.1 LDA线性判别分析:
import numpy as np
# sklearn.lda was removed from scikit-learn (0.17 deprecated, 0.19 gone);
# LDA now lives in the discriminant_analysis module.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Toy 2-D data: class 1 in the lower-left quadrant, class 2 upper-right.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

clf = LDA()
clf.fit(X, y)
# Predict the class of a new point near the class-1 cluster.
print(clf.predict([[-0.8, -1]]))
如果需要划分数据集,并对测试集进行评估:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation (fixed seed for
# reproducibility). The labels defined above are lowercase `y`;
# the original snippet split an undefined capital `Y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
2.2 SVC:
import numpy as np
from sklearn.svm import SVC

# Two linearly separable clusters: class 1 (negative quadrant),
# class 2 (positive quadrant).
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])

clf = SVC(gamma='auto')
clf.fit(X, y)
# Classify a new point close to the class-1 cluster.
print(clf.predict([[-0.8, -1]]))
2.3 GBDT:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Synthetic binary-classification benchmark (Hastie et al.).
X, y = make_hastie_10_2(random_state=0)

# First 2000 samples for training, the remainder for testing.
split = 2000
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out samples.
clf.score(X_test, y_test)
2.4 MLP
多分类
import numpy as np
import pandas as pd

import keras.backend as K
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau
from keras.layers import Activation, BatchNormalization, Dense, Dropout
from keras.models import Sequential
from keras.utils.np_utils import to_categorical

# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
def DNN(input_dim, output_dim):
    """Build a small fully-connected classifier.

    Architecture: two tanh hidden layers (128 and 50 units), each
    followed by batch normalization and 50% dropout, ending in a
    softmax over ``output_dim`` classes. Compiled with categorical
    cross-entropy loss and the Adam optimizer.
    """
    layers = [
        Dense(128, input_dim=input_dim, activation='tanh'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(50, activation='tanh'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(output_dim, activation='softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model
from sklearn import metrics  # was used below but never imported

# 8 input features; 10 output nodes, i.e. 10 classes.
clf = DNN(8, 10)
# One-hot encode integer labels for categorical cross-entropy.
y_train_cate = to_categorical(y_train, 10)
clf.fit(X_train, y_train_cate, epochs=40, batch_size=10, verbose=2)
# argmax over the softmax outputs recovers the predicted class index.
y_predict = np.argmax(clf.predict(X_test), axis=1)
print(y_predict.shape)
print('dnn', metrics.accuracy_score(y_test, y_predict))
附录:
以下是我常用的调用方式。
class baseModel():
    """Shared data-loading / label-preprocessing helpers for the model wrappers."""

    def get_data(self, shuffle=False):
        """Load the train/test split, optionally shuffling the training set.

        A fixed RandomState(101) permutation is used so shuffling is
        reproducible across runs. Features are cast to float32.
        """
        (X_train, y_train), (X_test, y_test) = self.load_data()
        if shuffle:
            idx_perm = np.random.RandomState(101).permutation(X_train.shape[0])
            X_train, y_train = X_train[idx_perm], y_train[idx_perm]
        X_train = X_train.astype('float32')
        X_test = X_test.astype('float32')
        print('X_train shape:', X_train.shape)
        print(X_train.shape[0], 'train samples')
        print(X_test.shape[0], 'test samples')
        return X_train, X_test, y_train, y_test

    def labelencoder(self, arr):
        """Encode arbitrary labels as one-hot rows.

        input: np.array, like: np.array([2, 3, 4]);
        output: one-hot form like: np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])

        Classes are ordered by sorted unique value. Unlike the previous
        in-place relabeling, this does not mutate the caller's array and
        works for non-integer label dtypes as well.
        """
        uniques = np.unique(arr)  # sorted unique labels
        # Map every label to its index among the sorted uniques.
        idx = np.searchsorted(uniques, arr)
        return np.eye(len(uniques))[idx]

    def selectSample(self, X_train, y_train, classList=(1, 3)):
        """Keep only samples whose label is in *classList*; one-hot the labels.

        Samples are returned grouped by class, in classList order.
        (Default is a tuple to avoid a mutable default argument.)
        """
        if y_train.ndim == 2:  # labels already one-hot: recover class indices
            y_train = np.argmax(y_train, axis=-1)
        newY = None
        newX = None
        for n in classList:
            idx = np.where(y_train == n)[0]
            if newY is None:
                newY = y_train[idx]
                newX = X_train[idx]
            else:
                newY = np.concatenate((newY, y_train[idx]), axis=-1)
                newX = np.concatenate((newX, X_train[idx]), axis=0)
        newY = self.labelencoder(newY)
        return newX, newY

    def load_data(self):
        """Read train/valid/test arrays and apply class filtering.

        inputTeData is a project-local loader defined elsewhere in the
        repository; self.train_root / self.classList are set by the
        concrete subclass's __init__.
        """
        inputModule = inputTeData(dataRoot=self.train_root)
        X_train, y_train, X_valid, y_valid, X_test, y_test = inputModule.input()
        if self.classList is not None:
            X_train, y_train = self.selectSample(X_train, y_train, self.classList)
            X_test, y_test = self.selectSample(X_test, y_test, self.classList)
        return (X_train, y_train), (X_test, y_test)
class SVCModel(baseModel):
    """SVC classifier wrapper driven by a config dict.

    build_model() must be called before fit()/predict().
    """

    def __init__(self, config):
        # model hyper-parameters (boilerplate shared across the model
        # classes; SVC itself only uses a few of these).
        # NOTE: the original assigned config['classes'] twice — once is enough.
        self.classes = config['classes']
        self.max_features = config['max_features']
        self.num_batch = config['num_batch']
        self.maxlen = config['maxlen']
        self.embedding_dims = config['embedding_dims']
        self.lstm_dims = config['lstm_dims']
        self.hidden_dims = config['hidden_dims']
        self.epochs = config['epochs']
        self.optimizer = config['optimizer']
        # data locations / preprocessing options
        self.train_root = config['train_root']
        self.test_root = config['test_root']
        self.timeStep = config['timeStep']
        self.load = config['load']
        self.classList = config["classList"]
        self.dataformat = config["dataformat"]

    def build_model(self):
        """Create the underlying sklearn SVC; returns self for chaining."""
        self.model = SVC(gamma="auto")
        return self

    def fit(self, X_train, y_train):
        """Train on integer class labels (not one-hot)."""
        self.model.fit(X_train, y_train)

    def predict(self, X):
        """Return predicted class labels for X."""
        pred_class = self.model.predict(X)
        return pred_class

    def evaluate_model(self, X, Y):
        """Print classification accuracy of the model on (X, Y)."""
        pred_class = self.predict(X)
        # vectorized accuracy instead of the manual counting loop
        acc = np.mean(np.asarray(pred_class) == np.asarray(Y))
        print("acc:{:.4f}".format(acc))
class GBDTModel(baseModel):
    """Gradient-boosted trees wrapper driven by a config dict.

    build_model() must be called before fit()/predict().
    """

    def __init__(self, config):
        # model hyper-parameters (boilerplate shared across the model
        # classes; GBDT itself only uses a few of these).
        # NOTE: the original assigned config['classes'] twice — once is enough.
        self.classes = config['classes']
        self.max_features = config['max_features']
        self.num_batch = config['num_batch']
        self.maxlen = config['maxlen']
        self.embedding_dims = config['embedding_dims']
        self.lstm_dims = config['lstm_dims']
        self.hidden_dims = config['hidden_dims']
        self.epochs = config['epochs']
        self.optimizer = config['optimizer']
        # data locations / preprocessing options
        self.train_root = config['train_root']
        self.test_root = config['test_root']
        self.timeStep = config['timeStep']
        self.load = config['load']
        self.classList = config["classList"]
        self.dataformat = config["dataformat"]

    def build_model(self, n_estimators=100, learning_rate=1.0, max_depth=1,
                    random_state=0, n_iter_no_change=None):
        """Create the underlying GradientBoostingClassifier; returns self.

        BUG FIX: n_iter_no_change was previously accepted but silently
        dropped; it is now forwarded (default None matches sklearn's
        default, so existing callers are unaffected).
        """
        self.model = GradientBoostingClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate,
            max_depth=max_depth, random_state=random_state,
            n_iter_no_change=n_iter_no_change)
        return self

    def fit(self, X_train, y_train):
        """Train on integer class labels (not one-hot)."""
        self.model.fit(X_train, y_train)

    def predict(self, X):
        """Return predicted class labels for X."""
        pred_class = self.model.predict(X)
        return pred_class

    def evaluate_model(self, X, Y):
        """Print classification accuracy of the model on (X, Y)."""
        pred_class = self.predict(X)
        # vectorized accuracy instead of the manual counting loop
        acc = np.mean(np.asarray(pred_class) == np.asarray(Y))
        print("acc:{:.4f}".format(acc))
调用函数:
SVC:
# --- SVC usage example ---
# NOTE(review): X and y below appear to be illustrative only — get_data()
# loads the real dataset from config, so these two arrays go unused; verify.
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 10])
model = SVCModel(config)
X_train, X_test, y_train, y_test = model.get_data()
model.build_model()
# Labels come back one-hot from get_data; argmax recovers class indices.
model.fit(X_train, np.argmax(y_train, axis = 1))
model.evaluate_model(X_test, np.argmax(y_test, axis = 1))
y_pred = model.predict(X_test)
# Evaluate is a project-local plotting/metrics helper defined elsewhere.
evaluate = Evaluate(np.argmax(y_test, axis = 1), y_pred, "test", x_min = -10,
x_max = len(y_pred), save = False, nClass = config["classes"],
y_list = config["classList"], figsize = (16, 12), showOrigin = True)
evaluate.plot()
# print(model.predict([[-0.8, -1]]))
print("SVC done")
GBDT:
# --- GBDT usage example ---
model = GBDTModel(config)
model.build_model(max_depth = 1)
X_train, X_test, y_train, y_test = model.get_data(shuffle = False)
# Labels come back one-hot from get_data; argmax recovers class indices.
model.fit(X_train, np.argmax(y_train, axis = 1))
y_pred = model.predict(X_test)
# Evaluate is a project-local plotting/metrics helper defined elsewhere.
evaluate = Evaluate(np.argmax(y_test, axis = 1), y_pred, "test", x_min = -10,
x_max = len(y_pred), save = False, nClass = config["classes"],
y_list = config["classList"], figsize = (8, 6), showOrigin = True)
evaluate.plot()
print("GBDT model Done!")
最近开通了个公众号,感兴趣的伙伴可以关注下,主要分享推荐系统,风控等算法相关的内容。
参考: