#-*- coding: utf-8 -*-
# 对数据进行基本的探索
import pandas as pd
datafile = 'data.csv'
# Read the raw data. The file is expected to be UTF-8 encoded (convert it with
# a text editor first if necessary), so the encoding is stated explicitly
# rather than relying on the default.
data = pd.read_csv(datafile, encoding='utf-8')
# Optional diagnostics, left disabled: basic table information (row/column
# counts, per-column dtype, completeness) and summary statistics exported to
# a spreadsheet.
#print(data.info())
#data1=data.describe()
#data1.to_excel('data_dexcribe.xls')
# Summary statistics: count defaulting vs. non-defaulting customers and draw
# a bar chart of the two counts.
# Extract the number of defaulting and non-defaulting customers.
import matplotlib.pyplot as plt
# NOTE: the triple-quoted string below is disabled code kept as a bare string
# literal; it is evaluated as a string and discarded, so none of it runs.
# It counts the 1/0 values of the '违约' column and plots them as two bars.
'''weiyue = pd.value_counts(data['违约'])[1]
not_weiyue = pd.value_counts(data['违约'])[0]
# 绘制条形图
fig = plt.figure(figsize = (8 ,5)) # 设置画布大小
#设置字体为楷体
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.bar(x=range(2),height=[weiyue,not_weiyue,], width=0.4, alpha=0.8, color='skyblue')
plt.xticks([index for index in range(2)], ['1','0'])
#plt.xlabel('是否违约')
#plt.ylabel('客户人数')
plt.title('客户是否违约柱状图')
#plt.show()
#plt.close()'''
import seaborn as sns
# Extract customer ages and plot their distribution.
# NOTE: the triple-quoted string below is disabled code kept as a bare string
# literal; it is evaluated as a string and discarded, so none of it runs.
# It draws three side-by-side distribution plots of the '年龄' column: all
# rows, rows where '违约' == 0, and rows where '违约' == 1.
'''age = data['年龄'].dropna()
age = age.astype('int64')
# 绘制客户年龄分布箱型图
#fig = plt.figure()
fig,axes=plt.subplots(1,3,figsize=(10,5))
sns.distplot(data["年龄"],ax = axes[0],axlabel='所有客户年龄分布')
sns.distplot(data.loc[data["违约"] == 0]["年龄"],ax = axes[1],axlabel='非违约客户年龄分布')
sns.distplot(data.loc[data["违约"] == 1]["年龄"],ax = axes[2],axlabel='违约客户年龄分布')
#plt.show()
#plt.close()'''
# Fill missing ages with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `data['年龄']`: that form mutates an
# intermediate selection, which pandas warns about and which leaves `data`
# unchanged once Copy-on-Write is enabled.
data['年龄'] = data['年龄'].fillna(data['年龄'].mean())
# Remove rows whose age is greater than 80.
data.drop(data[data['年龄'] > 80].index, inplace=True)
#print(data["年龄"].describe())
# Encode gender numerically: "male" -> 0, "female" -> 1. Any other value is
# left untouched.
data.loc[data["性别"] == "male", "性别"] = 0
data.loc[data["性别"] == "female", "性别"] = 1
# NOTE: the triple-quoted string below is disabled code kept as a bare string
# literal; it is evaluated as a string and discarded, so none of it runs.
# It computes the Pearson correlation matrix of `data`, prints it and draws it
# as an annotated heat map. As written, its last statement `plt.close` is a
# bare attribute reference rather than a call, so it would not close the
# figure if the code were re-enabled.
'''# 计算相关性矩阵
data_corr=data
dt_corr = data_corr.corr(method = 'pearson')
print('相关性矩阵为:\n',dt_corr)
# 绘制热力图
import seaborn as sns
#设置字体为楷体
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.subplots(figsize=(10, 10)) # 设置画面大小
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.show()
plt.close
'''
from sklearn.feature_extraction import DictVectorizer
# Features chosen (from the exploratory analysis) as predictors.
features = ['年龄', '工龄', '负债率', '信用卡负债', '其他负债', '收入']
features_train = data[features]
# Target column.
train_label = data['违约']
dev = DictVectorizer(sparse=False)
# Convert each row to a dict and vectorize into a dense array.
# The orient must be spelled 'records' in full: the abbreviation 'record' is
# rejected by pandas 2.0 and later.
# NOTE: DictVectorizer sorts feature names by default, so the columns of the
# resulting array follow `dev.feature_names_`, not the order of `features`.
features_train = dev.fit_transform(features_train.to_dict(orient='records'))
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import cross_val_score
# Weak learner: a depth-1 decision tree (a "stump"), fitted on the full
# training set and then scored with 10-fold cross-validation.
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(features_train, train_label)
stump_cv_scores = cross_val_score(dt_stump, features_train, train_label, cv=10)
print(u'决策树弱分类器准确率为 %.4lf' % np.mean(stump_cv_scores))
# Decision tree classifier with default settings, evaluated the same way.
dt = DecisionTreeClassifier()
dt.fit(features_train, train_label)
tree_cv_scores = cross_val_score(dt, features_train, train_label, cv=10)
print(u'决策树分类器准确率为 %.4lf' % np.mean(tree_cv_scores))
#adaboost
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble (120 estimators, learning rate 1.2, fixed seed), fitted on
# the full training set and then scored with 10-fold cross-validation.
ada = AdaBoostClassifier(
    n_estimators=120,
    learning_rate=1.2,
    random_state=7,
)
ada.fit(features_train, train_label)
ada_cv_scores = cross_val_score(ada, features_train, train_label, cv=10)
print(u'adaBoost分类器准确率为 %.4lf' % np.mean(ada_cv_scores))
# Write the cleaned DataFrame to disk.
data.to_csv('clean_data.csv')
# Standardize the feature matrix before clustering.
from sklearn.preprocessing import StandardScaler
features_train = StandardScaler().fit_transform(features_train)
# K-means clustering.
from sklearn.cluster import KMeans
# `n_jobs` is no longer passed: the parameter was removed from KMeans in
# scikit-learn 1.0 and raises TypeError there. `n_init=10` pins the number of
# restarts to the historical default so results do not depend on the
# installed version's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=1234)
# Fit the model.
kmeans_fit = kmeans.fit(features_train)
# Cluster centres, expressed in standardized feature units.
kmeans_cluster = kmeans_fit.cluster_centers_
print('聚类中心为\n',kmeans_cluster)
# Cluster label assigned to each sample.
kmeans_label = kmeans_fit.labels_
print('聚类后样本标签为\n',kmeans_label)
# Number of samples in each cluster (indexed by cluster label).
r1 = pd.Series(kmeans_label).value_counts()
print('聚类后各个类别数目\n',r1)
# Build the cluster summary table.
# The columns of the feature matrix follow the vectorizer's (sorted)
# `feature_names_` order, so the centres are labelled with those names first
# and only then re-ordered for presentation; labelling them directly with the
# presentation order would attach the wrong names to the values.
center_columns = ['年龄','工龄','负债率','信用卡负债','其他负债','收入']
cluster_center = pd.DataFrame(kmeans_cluster, columns=dev.feature_names_)[center_columns]
# Row i of `cluster_centers_` is the centre of cluster label i, so the default
# 0..n_clusters-1 index already matches the labels that index `r1`; the concat
# below therefore aligns each count with its own centre.
cluster = pd.concat([r1,cluster_center],axis=1)
# Rename the first column (the per-cluster sample count).
list_column = list(cluster.columns)
list_column[0] = '类别数目'
cluster.columns = list_column