LDA CreditCardDefault Code N
LDA CreditCardDefault Code N
#Import all necessary modules
import pandas as pd ###Software library written for the Python programming language for data manipulation and analysis.
import numpy as np ### fundamental package for scientific computing with Python
import os ### using operating system dependent functionality
import scipy.stats as stats
import matplotlib.pyplot as plt
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
#os.chdir('C:\\GL Class\Solution Preparation\Logistic Regression - Python')
Load the data file into a pandas DataFrame using the read_csv method.
Ensure that the loaded data set does not contain any Unicode characters.
# Read the credit-default data set (path is relative to the working directory) into a DataFrame.
data_df = pd.read_csv("default(HandsOnVideo Content).csv")
The head function is used to view the top records. The number of records to view is given in the parentheses.
# Preview the first 10 rows.
data_df.head(10)
The tail function is used to view the last records. The number of records to view is given in the parentheses.
# Preview the last 20 rows.
data_df.tail(20)
# Summary statistics of the columns.
data_df.describe()
# Data type of every column.
data_df.dtypes
Gender object
Loan.Offered int64
Job object
Work.Exp int64
Credit.Score int64
EMI.Ratio float64
Status object
Credit.History object
Own house int64
Purpose object
Dependents int64
dtype: object
# Confirm that the loaded object is a pandas DataFrame.
type(data_df)
pandas.core.frame.DataFrame
# Count the missing values in each column.
data_df.isnull().sum()
Gender 0
Loan.Offered 0
Job 0
Work.Exp 0
Credit.Score 0
EMI.Ratio 0
Status 0
Credit.History 0
Own house 0
Purpose 0
Dependents 0
dtype: int64
No Missing values
data_df.shape ### (rows, columns): 781 rows and 11 columns (10 predictors plus the Status target)
(781, 11)
# 'Own house' is loaded as int64; cast it to object so it is treated as a categorical column.
data_df['Own house']=data_df['Own house'].astype('object')
# Inspect the distinct labels of each categorical column (useful for spotting
# inconsistent spellings before encoding).
data_df['Gender'].unique()
data_df['Job'].unique()
data_df['Status'].unique() ### No means No Default
data_df['Credit.History'].unique()
data_df['Own house'].unique()
data_df['Purpose'].unique()
# Re-check the dtypes after the cast: 'Own house' should now be object.
data_df.dtypes
Gender object
Loan.Offered int64
Job object
Work.Exp int64
Credit.Score int64
EMI.Ratio float64
Status object
Credit.History object
Own house object
Purpose object
Dependents int64
dtype: object
# Normalise the inconsistent Credit.History labels in one pass:
# 'very good' loses its space and 'Poor' is lower-cased.
label_fixes = {'very good': 'verygood', 'Poor': 'poor'}
data_df['Credit.History'] = data_df['Credit.History'].replace(label_fixes)
data_df['Credit.History'].unique()
# Number of rows in each Status class.
data_df['Status'].value_counts()
No 656
Default 125
Name: Status, dtype: int64
Univariate Plots
# distplot is deprecated since seaborn 0.11 and removed in later releases;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(data_df['Work.Exp'], kde=True, stat="density")
plt.show() ### Not needed when figures render inline; call it if the graph does not appear
Bivariate Analysis
The bivariate plots shown below are only a sample. The reader is advised to perform a complete data exploration.
# Pass the data by keyword: positional x/y arguments raise a FutureWarning in
# seaborn 0.11 and are rejected by seaborn >= 0.12.
sns.jointplot(x=data_df['Work.Exp'], y=data_df['Loan.Offered']) ### Joint distribution of work experience and loan offered
sns.stripplot(x=data_df['Status'], y=data_df['Work.Exp']) ### Concentration of observations
C:\Users\Shikhar Shrivastava\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWa
warnings.warn(
<AxesSubplot:xlabel='Status', ylabel='Work.Exp'>
Lower work experience shows a higher concentration of defaults. The reader is advised to perform more analysis and generate further insights.
# Share of each Status class among the rows labelled 'No' or 'Default'.
status_col = data_df['Status']
count_no_sub = int((status_col == 'No').sum())
count_sub = int((status_col == 'Default').sum())
labelled_total = count_no_sub + count_sub
pct_of_no_sub = count_no_sub / labelled_total
print("percentage of no Default is", pct_of_no_sub*100)
pct_of_sub = count_sub / labelled_total
print("percentage of Default", pct_of_sub*100)
The split is roughly 84% No to 16% Default. It is treated here as not severely biased, and hence SMOTE or any other package to balance the binary classes is not used.
# Grouped bar chart: number of rows in each Status class for every Dependents value.
pd.crosstab(data_df.Dependents,data_df.Status).plot(kind='bar')
plt.title('Dependents Vs. Status')
plt.xlabel('Dependents')
# The bar heights are crosstab counts, not Status values, so label the axis as a count.
plt.ylabel('Count')
# Remove spaces and dots from the column names with a single rename call.
column_renames = {
    'Own house': 'Ownhouse',
    'Loan.Offered': 'LoanOffered',
    'Work.Exp': 'WorkExp',
    'Credit.Score': 'CreditScore',
    'EMI.Ratio': 'EMIRatio',
    'Credit.History': 'CreditHistory',
}
data_df.rename(columns=column_renames, inplace=True)
# Confirm the new names, non-null counts and dtypes.
data_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Gender 781 non-null object
1 LoanOffered 781 non-null int64
2 Job 781 non-null object
3 WorkExp 781 non-null int64
4 CreditScore 781 non-null int64
5 EMIRatio 781 non-null float64
6 Status 781 non-null object
7 CreditHistory 781 non-null object
8 Ownhouse 781 non-null object
9 Purpose 781 non-null object
10 Dependents 781 non-null int64
dtypes: float64(1), int64(4), object(6)
memory usage: 67.2+ KB
# Encode every categorical predictor as an integer code.
# The codes are the ones used throughout this analysis; keeping them in one
# table makes the mapping visible at a glance.
CATEGORY_CODES = {
    'Gender': {'Male': 1, 'Female': 0},
    'Job': {'Management': 1, 'unskilled': 0, 'skilled': 2},
    'CreditHistory': {'critical': 1, 'poor': 0, 'good': 2, 'verygood': 3},
    'Purpose': {'personal': 1, 'car': 0, 'education': 2, 'consumer.durable': 3},
}
for column, codes in CATEGORY_CODES.items():
    # map() turns an unknown label into NaN, so the int cast fails loudly
    # instead of silently leaving a string in the column. The cast also gives
    # the column a numeric dtype (chained np.where left it as object).
    data_df[column] = data_df[column].map(codes).astype(int)
# Ownhouse already holds integer codes but was cast to object earlier; make it
# numeric again so every predictor has a numeric dtype.
data_df['Ownhouse'] = data_df['Ownhouse'].astype(int)
data_df.head()
0 1 0 0 14 86 3.0 No 0 1
1 0 1 2 15 94 3.0 No 0 1
2 1 0 0 16 86 3.0 No 0 1
3 0 1 2 13 94 3.0 No 0 1
4 1 1 2 12 85 3.3 No 0 1
#Scaling the data which is a pre-requisite for LDA
# Every column except the Status target is a predictor.
predictors = data_df.drop(columns=['Status'])
scaler = StandardScaler()
X = scaler.fit_transform(predictors)
# Target labels, still in their original 'No' / 'Default' form.
Y = data_df['Status']
Y.value_counts()
No 656
Default 125
Name: Status, dtype: int64
# The encoded Series must be assigned back: replace()/map() return a new
# Series, so the original statement left Y holding the string labels.
# Encoding: No (no default) -> 1, Default -> 0.
Y = Y.map({"No":1,"Default":0})
Y
0 1
1 1
2 1
3 1
4 1
..
776 0
777 0
778 0
779 0
780 0
Name: Status, Length: 781, dtype: int64
#Build LDA Model
# Refer details for LDA at http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
# No constructor arguments are passed, so the estimator uses its default settings.
clf = LinearDiscriminantAnalysis()
# Fit on the standardised predictors X and the Status labels Y; the value
# returned by fit() is kept as `model` and used for all later predictions.
model=clf.fit(X,Y)
model
LinearDiscriminantAnalysis()
# Predict it
# Predictions are made on the same rows the model was fitted on (in-sample).
pred_class = model.predict(X)
# Store the predicted class alongside the original columns.
data_df['Prediction'] = pred_class
# Check Correlation values
#Refer on correlation at https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html
data= data_df[['Gender','LoanOffered','Job','WorkExp','CreditScore','EMIRatio','CreditHistory','Ownhouse','Purpose','Dependents']]
# Some of these columns can hold numeric codes while still having object
# dtype. Depending on the pandas version, corr() then drops them silently or
# has to coerce them; casting to float first keeps all ten predictors in the
# correlation matrix on every version.
Cor1 = data.astype(float).corr()
Cor1
#generate Confusion Matrix
# Please refer for confusion matrix http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# Actual labels are passed first and predictions second. pred_class was
# produced on the fitting data, so this measures in-sample performance.
confusion_matrix(Y, pred_class)
array([[124, 1],
[ 22, 634]], dtype=int64)
# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(6, 4))
cm_counts = confusion_matrix(Y, pred_class)
sns.heatmap(cm_counts, annot=True, fmt='.4g')
plt.ylabel('Actual Value')
plt.xlabel('Predicted Value')
plt.show()
146 rows classified as 0 (Default) and 635 rows classified as 1 (Not Default)
from sklearn.metrics import classification_report
# Text summary of the per-class metrics for the in-sample predictions.
print(classification_report(Y, pred_class))
# Shape of the standardised feature matrix (rows, predictors).
X.shape
(781, 10)
# Fitted coefficients of the linear discriminant function, one per column of X.
model.coef_
array([[ 1.137129 , -0.46395456, 0.83372221, -1.08383805, 3.80345376,
-0.53102867, 0.36191579, 5.95912536, 0.14435512, 2.30379498]])
# Fitted intercept (constant term) of the linear discriminant function.
model.intercept_
array([9.15964746])
# The fitted linear discriminant function written out by hand: the intercept
# plus each model coefficient (shortened to about three decimals) multiplied by
# X1..X10, i.e. the standardised columns of X in order.
'''
LDF=9.159+ X1*1.137 + X2*(-0.463) + X3*(0.833) + X4*(-1.083) + X5*3.803 + X6*(-0.531) + X7*0.361 + X8*5.959 + X9*0.144 + X10*2.30
'''
'\nLDF=9.159+ X1*1.137 + X2*(-0.463) + X3*(0.833) + X4*(-1.083) + X5*3.803 + X6*(-0.531) + X7*0.361 + X8*5.959 + X9*0.144 +
X10*2.30\n'
#Computation of Discriminant Scores/LDF for each row of data
# Read the coefficients and intercept from the fitted model instead of
# re-typing them: hand-copied numbers go stale as soon as the data or the model
# changes, and the hand-typed intercept (9.159) was a shortened form of the
# fitted value.
coef = model.coef_.ravel() # Coefficients, one per column of X
intercept = model.intercept_[0]
# Building the LDF equation for all rows at once:
# DS[p] = intercept + sum over q of X[p, q] * coef[q]
DS = (X @ coef + intercept).tolist()
'''
Classification Rule :
if LDF>=0 then Classify as 1
else if LDF <0 then Classify as 0
'''
# Walk the rows once, print each discriminant score with its verdict and
# tally how many rows fall on each side of zero.
s1=0
s2=0
for i, row in enumerate(X):
    score = DS[i]
    if score >= 0:
        verdict = ">=0 , Classify as 1}"
        s1 += 1
    elif score < 0:
        verdict = "<0 , Classify as 0}"
        s2 += 1
    else:
        # Neither comparison holds (e.g. NaN): print and count nothing.
        continue
    print("FOR Row:", i, " ", row)
    print()
    print("-->", "{ DS: ", score, verdict)
    print("------------------------------------------------------------------------------------------")
print(s1," rows classified as 1 (Not Default) ")
print(s2," rows classified as 0 (Default) ")
Classification by Probability
pred_prob=model.predict_proba(X)#Posterior Probability for each row
# Column 1 is the posterior of the second class in model.classes_, which this
# analysis reports as class 1 (Not Default).
pred_prob[:,1]
# The rule below uses the 0.5 cut-off that the loop actually applies (the
# earlier text wrongly stated 0, which a probability can never fall below).
'''
Classification Rule :
if prob(Y=1|X) >=0.5 then Classify as 1
else if prob(Y=1|X) <0.5 then Classify as 0
'''
s3,s4=0,0
for i, prob_class1 in enumerate(pred_prob[:, 1]):
    if prob_class1 >= 0.5:
        verdict = ">=0.5 , Classify as 1 }"
        s3 += 1
    elif prob_class1 < 0.5:
        verdict = "< 0.5 , Classify as 0 }"
        s4 += 1
    else:
        # Neither comparison holds (e.g. NaN): print and count nothing.
        continue
    print("FOR Row:", i, " ", X[i, :])
    print()
    print("-->", "{ prob(Y=1|X) =", prob_class1, verdict)
    print("------------------------------------------------------------------------------------------")
print(s3," rows classified as 1 (Not Default) ")
print(s4," rows classified as 0 (Default) ")