Machine Learning using Python
Chapter 6: Advanced Machine Learning
6.2.1 Developing a Gradient Descent Algorithm for Linear Regression Model
6.2.1.1 Loading the dataset
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
sales_df = pd.read_csv( 'Advertising.csv' )
# Print first few records
sales_df.head()
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9
6.2.1.2 Set X and Y Variable
X = sales_df[['TV', 'Radio', 'Newspaper']]
Y = sales_df['Sales']
6.2.1.3 Standardize X & Y
Y = np.array( (Y - Y.mean() ) / Y.std() )
X = X.apply( lambda rec: ( rec - rec.mean() ) / rec.std(), axis = 0 )
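Standardization puts all the features on a comparable scale, which lets gradient descent work well with a single learning rate. As a quick sanity check (an addition to the book's code, reusing the X and Y computed above), each standardized column should now have mean close to 0 and standard deviation close to 1:
# Sanity check (sketch): standardized columns should have mean ~0 and std ~1
print( X.mean().round(6) )    # per-feature means, expected ~0
print( X.std().round(6) )     # per-feature standard deviations, expected ~1
print( round( Y.mean(), 6 ), round( Y.std(), 6 ) )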
6.2.1.4 Implementing the Gradient Descent Algorithm
Method 1: Randomly initialize the bias and weights
import random
# dim - the number of weights to be initialized besides the bias
def initialize( dim ):
    # For reproducible results, the seed is set to 42.
    # Readers can comment out the following two lines
    # and try other initialization values.
    np.random.seed(seed=42)
    random.seed(42)
    # Initialize the bias
    b = random.random()
    # Initialize the weights
    w = np.random.rand( dim )
    return b, w
b, w = initialize( 3 )
print( "Bias: ", b, " Weights: ", w )
Bias: 0.6394267984578837 Weights: [0.37454012 0.95071431 0.73199394]
Method 2: Predict Y values from the bias and weights
# Inputs:
#   b - bias
#   w - weights
#   X - the input matrix
def predict_Y( b, w, X ):
    return b + np.matmul( X, w )
b, w = initialize( 3 )
Y_hat = predict_Y( b, w, X)
Y_hat[0:10]
array([ 3.23149557,  1.70784873,  2.82476076,  2.75309026,  0.92448558,
        3.17136498,  0.62234399, -0.34935444, -2.313095  , -0.76802983])
Method 3: Calculate the cost function: MSE
import math
# Inputs
#   Y - actual values of y
#   Y_hat - predicted values of y
def get_cost( Y, Y_hat ):
    # Calculate the residuals by taking the difference between actual and predicted values
    Y_resid = Y - Y_hat
    # Matrix multiplication of the residuals with themselves gives the squared values;
    # taking the sum and dividing by the number of examples gives the mean
    return np.sum( np.matmul( Y_resid.T, Y_resid ) ) / len( Y_resid )
b, w = initialize( 3 )
Y_hat = predict_Y( b, w, X)
get_cost( Y, Y_hat )
1.5303100198505895
Method 4: Update the bias and weights
def update_beta( x, y, y_hat, b_0, w_0, learning_rate ):
    # gradient of the bias
    db = (np.sum( y_hat - y ) * 2) / len(y)
    # gradient of the weights
    dw = (np.dot( ( y_hat - y ), x ) * 2) / len(y)
    # update the bias
    b_1 = b_0 - learning_rate * db
    # update the weights
    w_1 = w_0 - learning_rate * dw
    # return the new bias and weight values
    return b_1, w_1
b, w = initialize( 3 )
print( "After Initialization - Bias: ", b, " Weights: ", w )
Y_hat = predict_Y( b, w, X)
b, w = update_beta( X, Y, Y_hat, b, w, 0.01 )
print( "After first update - Bias: ", b, " Weights: ", w )
After Initialization - Bias: 0.6394267984578837 Weights: [0.37454012 0.95071431 0.73199394]
After first update - Bias: 0.6266382624887261 Weights: [0.38079093 0.9376953  0.71484883]
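Since the MSE cost for linear regression is convex, gradient descent should move toward the unique ordinary least squares solution. The following cross-check is a sketch added here (it is not part of the book's code and assumes the standardized X and Y defined above); it computes the closed-form solution with the normal equation:
# Closed-form OLS via the normal equation: theta = (A'A)^(-1) A'y,
# where A is X with a prepended column of ones for the bias term
A = np.column_stack( [ np.ones( len( X ) ), X ] )
theta = np.linalg.solve( A.T @ A, A.T @ Y )
print( "Bias:", theta[0], " Weights:", theta[1:] )
# With standardized data the bias is ~0; gradient descent approaches
# these values as the number of iterations grows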
6.2.1.5 Finding the optimal bias and weights
def run_gradient_descent( X, Y, alpha = 0.01, num_iterations = 100 ):
    # Initialize the bias and weights
    b, w = initialize( X.shape[1] )
    iter_num = 0
    # gd_iterations_df keeps track of the cost every 10 iterations
    gd_iterations_df = pd.DataFrame(columns = ['iteration', 'cost'])
    result_idx = 0
    # Run the iterations in a loop
    for each_iter in range(num_iterations):
        # Calculate the predicted value of y
        Y_hat = predict_Y( b, w, X )
        # Calculate the cost
        this_cost = get_cost( Y, Y_hat )
        # Save the previous bias and weights
        prev_b = b
        prev_w = w
        # Update and calculate the new values of bias and weights
        b, w = update_beta( X, Y, Y_hat, prev_b, prev_w, alpha)
        # For every 10 iterations, store the cost i.e. MSE
        if( iter_num % 10 == 0 ):
            gd_iterations_df.loc[result_idx] = [iter_num, this_cost]
            result_idx = result_idx + 1
        iter_num += 1
    print( "Final estimate of b and w: ", b, w )
    # Return the cost history along with the final bias and weights
    return gd_iterations_df, b, w
gd_iterations_df, b, w = run_gradient_descent( X, Y, alpha = 0.001, num_iterations = 200 )
Final estimate of b and w: 0.42844895817391493 [0.48270238 0.75265969 0.46109174]
gd_iterations_df[0:10]
iteration cost
0 0.0 1.530310
1 10.0 1.465201
2 20.0 1.403145
3 30.0 1.343996
4 40.0 1.287615
5 50.0 1.233868
6 60.0 1.182630
7 70.0 1.133780
8 80.0 1.087203
9 90.0 1.042793
6.2.1.6 Plotting the cost function against the iterations
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
plt.plot( gd_iterations_df['iteration'], gd_iterations_df['cost'] );
plt.xlabel("Number of iterations")
plt.ylabel("Cost or MSE");
print( "Final estimates of b and w: ", b, w )
Final estimates of b and w: 0.42844895817391493 [0.48270238 0.75265
969 0.46109174]
alpha_df_1, b, w = run_gradient_descent( X, Y, alpha = 0.01, num_iterations = 2000 )
Final estimate of b and w: 2.7728016698178713e-16 [ 0.75306591  0.53648155 -0.00433069]
What happens if we change the learning rate and use a smaller value, e.g., 0.001?
alpha_df_2, b, w = run_gradient_descent( X, Y, alpha = 0.001, num_iterations = 2000 )
Final estimate of b and w: 0.011664695556930518 [0.74315125 0.52779959 0.01171703]
plt.plot( alpha_df_1['iteration'], alpha_df_1['cost'], label = "alpha = 0.01" );
plt.plot( alpha_df_2['iteration'], alpha_df_2['cost'], label = "alpha = 0.001" );
plt.legend()
plt.ylabel('Cost');
plt.xlabel('Number of Iterations');
plt.title('Cost Vs. Iterations for different alpha values');
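Rather than fixing the number of iterations up front, a common variant stops when the improvement in cost between iterations falls below a tolerance. The sketch below is an addition (not from the book's code) built on the initialize(), predict_Y(), get_cost() and update_beta() methods defined earlier:
# Gradient descent with a convergence tolerance (sketch)
def run_gradient_descent_tol( X, Y, alpha = 0.01, tol = 1e-6, max_iterations = 5000 ):
    b, w = initialize( X.shape[1] )
    prev_cost = float( 'inf' )
    for i in range( max_iterations ):
        Y_hat = predict_Y( b, w, X )
        cost = get_cost( Y, Y_hat )
        # stop once the improvement in cost becomes negligible
        if prev_cost - cost < tol:
            break
        prev_cost = cost
        b, w = update_beta( X, Y, Y_hat, b, w, alpha )
    print( "Stopped after", i, "iterations with cost", round( cost, 6 ) )
    return b, w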
6.3 scikit-learn Library for Machine Learning
6.3.1 Steps for Building Machine Learning Models
6.3.1.1 Splitting dataset into train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    sales_df[["TV", "Radio", "Newspaper"]],
    sales_df.Sales,
    test_size=0.3,
    random_state = 42 )  # Seed value of 42 for reproducibility
len( X_train )
140
len( X_test )
60
6.3.1.2 Building Linear Regression model with train dataset
from sklearn.linear_model import LinearRegression
## Initializing the model
linreg = LinearRegression()
# Fitting training data to the model
linreg.fit( X_train, y_train )
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
linreg.intercept_
2.708949092515912
linreg.coef_
array([0.04405928, 0.1992875 , 0.00688245])
list( zip( ["TV", "Radio", "Newspaper"], list( linreg.coef_ ) ) )
[('TV', 0.0440592809574652),
('Radio', 0.1992874968989395),
('Newspaper', 0.0068824522222754)]
6.3.1.3 Making prediction on test set
# Predicting the y value from the test set
y_pred = linreg.predict( X_test )
# Creating a DataFrame with three columns named actual, predicted and residuals
# to store the respective values
test_pred_df = pd.DataFrame( { 'actual': y_test,
                               'predicted': np.round( y_pred, 2 ),
                               'residuals': y_test - y_pred } )
# Randomly sample 10 observations from the DataFrame
test_pred_df.sample(10)
actual predicted residuals
126 6.6 11.15 -4.553147
170 8.4 7.35 1.049715
95 16.9 16.57 0.334604
195 7.6 5.22 2.375645
115 12.6 13.36 -0.755569
38 10.1 10.17 -0.070454
56 5.5 8.92 -3.415494
165 11.9 14.30 -2.402060
173 11.7 11.63 0.068431
9 10.6 12.18 -1.576049
6.3.1.4 Measuring Accuracy
## Importing metrics from sklearn
from sklearn import metrics
R-Squared Value
## y_train contains the actual values; the predicted values are returned by the
## predict() method after passing the X values of the training data.
r2 = metrics.r2_score( y_train, linreg.predict(X_train) )
print("R Squared: ", r2)
R Squared: 0.9055159502227753
RMSE
# y_pred contains predicted value of test data
mse = metrics.mean_squared_error( y_test, y_pred )
# Take the square root of MSE and round off to two decimal places
rmse = round( np.sqrt(mse), 2 )
print("RMSE: ", rmse)
RMSE: 1.95
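The R-squared above was computed on the training set; computing it on the test set as well (a small addition to the book's code, reusing y_test and y_pred) gives a fairer picture of how the model generalizes:
# Sketch: R-squared on the held-out test set
r2_test = metrics.r2_score( y_test, y_pred )
print( "Test R Squared: ", round( r2_test, 3 ) )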
6.3.2 Bias-Variance Trade-off
# Reading the file curve.csv and printing the first few records
curve = pd.read_csv( "curve.csv" )
curve.head()
x y
0 2 -1.999618
1 2 -1.999618
2 8 -3.978312
3 9 -1.969175
4 10 -0.957770
plt.scatter( curve.x, curve.y );
plt.xlabel("x values")
plt.ylabel("y values");
# Input
#   degree - degree of the polynomial to be fitted
def fit_poly( degree ):
    # calling numpy's polyfit method to fit a polynomial of the given degree
    p = np.polyfit( curve.x, curve.y, deg = degree )
    curve['fit'] = np.polyval( p, curve.x )
    # plot the actual x and y values as a scatter plot
    sn.regplot( curve.x, curve.y, fit_reg = False )
    # draw the fitted regression curve on top of the scatter plot
    return plt.plot( curve.x, curve.fit, label='fit' )
fit_poly( 1 );
## Plotting the model form and the data
plt.xlabel("x values")
plt.ylabel("y values");
fit_poly( 2 );
plt.xlabel("x values")
plt.ylabel("y values");
fit_poly( 10 );
plt.xlabel("x values")
plt.ylabel("y values");
# Split the dataset 60:40 into training and test sets
train_X, test_X, train_y, test_y = train_test_split( curve.x,
                                                     curve.y,
                                                     test_size = 0.40,
                                                     random_state = 100 )
# Define a DataFrame to store the degree and the RMSE on the training and test sets
rmse_df = pd.DataFrame( columns = ["degree", "rmse_train", "rmse_test"] )
# Define a method to return the RMSE given the actual and predicted values
def get_rmse( y, y_fit ):
    return np.sqrt( metrics.mean_squared_error( y, y_fit ) )
# Iterate over degrees 1 to 14
for i in range( 1, 15 ):
    # fitting the model
    p = np.polyfit( train_X, train_y, deg = i )
    # storing the model degree and the RMSE on the train and test sets
    rmse_df.loc[i-1] = [ i,
                         get_rmse( train_y, np.polyval( p, train_X ) ),
                         get_rmse( test_y, np.polyval( p, test_X ) ) ]
rmse_df
degree rmse_train rmse_test
0 1.0 5.226638 5.779652
1 2.0 2.394509 2.755286
2 3.0 2.233547 2.560184
3 4.0 2.231998 2.549205
4 5.0 2.197528 2.428728
5 6.0 2.062201 2.703880
6 7.0 2.039408 2.909237
7 8.0 1.995852 3.270892
8 9.0 1.979322 3.120420
9 10.0 1.976326 3.115875
10 11.0 1.964484 3.218203
11 12.0 1.657948 4.457668
12 13.0 1.656719 4.358014
13 14.0 1.642308 4.659503
# plotting the rmse for training set in red color
plt.plot( rmse_df.degree,
rmse_df.rmse_train,
label='RMSE on Training Set',
color = 'r' )
# plotting the rmse for test set in green color
plt.plot( rmse_df.degree,
rmse_df.rmse_test,
label='RMSE on Test Set',
color = 'g' )
# Add the legend outside the plot area
plt.legend(bbox_to_anchor=(1.05, 1),
loc=2,
borderaxespad=0.);
plt.xlabel("Model Degrees")
plt.ylabel("RMSE");
6.4 Advanced Regression Models
6.4.1.1 Loading IPL Dataset
ipl_auction_df = pd.read_csv( 'IPL IMB381IPL2013.csv' )
ipl_auction_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 26 columns):
Sl.NO. 130 non-null int64
PLAYER NAME 130 non-null object
AGE 130 non-null int64
COUNTRY 130 non-null object
TEAM 130 non-null object
PLAYING ROLE 130 non-null object
T-RUNS 130 non-null int64
T-WKTS 130 non-null int64
ODI-RUNS-S 130 non-null int64
ODI-SR-B 130 non-null float64
ODI-WKTS 130 non-null int64
ODI-SR-BL 130 non-null float64
CAPTAINCY EXP 130 non-null int64
RUNS-S 130 non-null int64
HS 130 non-null int64
AVE 130 non-null float64
SR-B 130 non-null float64
SIXERS 130 non-null int64
RUNS-C 130 non-null int64
WKTS 130 non-null int64
AVE-BL 130 non-null float64
ECON 130 non-null float64
SR-BL 130 non-null float64
AUCTION YEAR 130 non-null int64
BASE PRICE 130 non-null int64
SOLD PRICE 130 non-null int64
dtypes: float64(7), int64(15), object(4)
memory usage: 26.5+ KB
X_features = ['AGE', 'COUNTRY', 'PLAYING ROLE',
'T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B',
'ODI-WKTS', 'ODI-SR-BL', 'CAPTAINCY EXP', 'RUNS-S',
'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C', 'WKTS',
'AVE-BL', 'ECON', 'SR-BL']
# categorical_features is initialized with the categorical variable names.
categorical_features = ['AGE', 'COUNTRY', 'PLAYING ROLE', 'CAPTAINCY EXP']
#get_dummies() is invoked to return the dummy features.
ipl_auction_encoded_df = pd.get_dummies( ipl_auction_df[X_features],
columns = categorical_features,
drop_first = True )
ipl_auction_encoded_df.columns
Index(['T-RUNS', 'T-WKTS', 'ODI-RUNS-S', 'ODI-SR-B', 'ODI-WKTS', 'ODI-SR-BL',
       'RUNS-S', 'HS', 'AVE', 'SR-B', 'SIXERS', 'RUNS-C', 'WKTS', 'AVE-BL',
       'ECON', 'SR-BL', 'AGE_2', 'AGE_3', 'COUNTRY_BAN', 'COUNTRY_ENG',
       'COUNTRY_IND', 'COUNTRY_NZ', 'COUNTRY_PAK', 'COUNTRY_SA', 'COUNTRY_SL',
       'COUNTRY_WI', 'COUNTRY_ZIM', 'PLAYING ROLE_Batsman',
       'PLAYING ROLE_Bowler', 'PLAYING ROLE_W. Keeper', 'CAPTAINCY EXP_1'],
      dtype='object')
X = ipl_auction_encoded_df
Y = ipl_auction_df['SOLD PRICE']
6.4.1.2 Standardize X & Y
from sklearn.preprocessing import StandardScaler
## Initializing the StandardScaler
X_scaler = StandardScaler()
## Standardize all the feature columns
X_scaled = X_scaler.fit_transform(X)
## Standardizing Y explicitly by subtracting the mean and
## dividing by the standard deviation
Y = (Y - Y.mean()) / Y.std()
/Users/manaranjan/anaconda/lib/python3.5/site-packages/sklearn/preprocessing/data.py:617: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  return self.partial_fit(X, y)
/Users/manaranjan/anaconda/lib/python3.5/site-packages/sklearn/base.py:462: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.
  return self.fit(X, **fit_params).transform(X)
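Because Y has been standardized, every model below predicts in standard deviation units of SOLD PRICE. To report a prediction in the original units, the transformation must be inverted; the helper below is a sketch added here (Y_orig is just a local name for the unstandardized column):
# Sketch: convert standardized predictions back to the original scale
Y_orig = ipl_auction_df['SOLD PRICE']   # unstandardized values
def to_original_scale( y_scaled ):
    # invert (y - mean) / std
    return y_scaled * Y_orig.std() + Y_orig.mean()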
6.4.1.3 Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state = 42)
6.4.1.4 Build the model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
linreg.coef_
array([-0.43539611, -0.04632556,  0.50840867, -0.03323988,  0.2220377 ,
       -0.05065703,  0.17282657, -0.49173336,  0.58571405, -0.11654753,
        0.24880095,  0.09546057,  0.16428731,  0.26400753, -0.08253341,
       -0.28643889, -0.26842214, -0.21910913, -0.02622351,  0.24817898,
        0.18760332,  0.10776084,  0.04737488,  0.05191335,  0.01235245,
        0.00547115, -0.03124706,  0.08530192,  0.01790803, -0.05077454,
        0.18745577])
## The dataframe has two columns to store feature name
## and the corresponding coefficient values
columns_coef_df = pd.DataFrame( { 'columns': ipl_auction_encoded_df.columns,
'coef': linreg.coef_ } )
## Sorting the features by coefficient values in descending order
sorted_coef_vals = columns_coef_df.sort_values( 'coef', ascending=False)
6.4.1.5 Plotting the coefficient values
plt.figure( figsize = ( 8, 6 ))
## Creating a bar plot
sn.barplot( x = "coef", y = "columns", data = sorted_coef_vals );
plt.xlabel("Coefficients from Linear Regression")
plt.ylabel("Features");
6.4.1.6 Calculate R-Squared value
from sklearn import metrics
# Takes a model as a parameter
# Prints the RMSE on train and test set
def get_train_test_rmse( model ):
    # Predicting on training dataset
    y_train_pred = model.predict( X_train )
    # Compare the actual y with predicted y in the training dataset
    rmse_train = round(np.sqrt(metrics.mean_squared_error( y_train, y_train_pred )), 3)
    # Predicting on test dataset
    y_test_pred = model.predict( X_test )
    # Compare the actual y with predicted y in the test dataset
    rmse_test = round(np.sqrt(metrics.mean_squared_error( y_test, y_test_pred )), 3)
    print( "train: ", rmse_train, " test:", rmse_test )
get_train_test_rmse( linreg )
train: 0.679 test: 0.749
6.4.2 Applying Regularization
6.4.2.1 Ridge Regression
# Importing Ridge Regression
from sklearn.linear_model import Ridge
# Applying alpha = 1 and running the algorithm for a maximum of 500 iterations
ridge = Ridge(alpha = 1, max_iter = 500)
ridge.fit( X_train, y_train )
Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=500, normalize=False,
      random_state=None, solver='auto', tol=0.001)
get_train_test_rmse( ridge )
train: 0.68 test: 0.724
ridge = Ridge(alpha = 2.0, max_iter = 1000)
ridge.fit( X_train, y_train )
get_train_test_rmse( ridge )
train: 0.682 test: 0.706
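Increasing alpha strengthens the shrinkage of the coefficients, typically raising training error slightly while improving test error up to a point. A quick sweep (an addition to the book's code, reusing get_train_test_rmse()) makes the trend visible:
# Sketch: sweep over a few regularization strengths
for alpha in [0.1, 1.0, 2.0, 5.0, 10.0]:
    ridge = Ridge( alpha = alpha, max_iter = 1000 )
    ridge.fit( X_train, y_train )
    print( "alpha =", alpha, end = "  " )
    get_train_test_rmse( ridge )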
6.4.2.2 Lasso Regression
# Importing Lasso Regression
from sklearn.linear_model import Lasso
# Applying alpha = 0.01 and running the algorithm for a maximum of 500 iterations
lasso = Lasso(alpha = 0.01, max_iter = 500)
lasso.fit( X_train, y_train )
Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=500,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
get_train_test_rmse( lasso )
train: 0.688 test: 0.698
## Storing the feature names and coefficient values in a DataFrame
lasso_coef_df = pd.DataFrame( { 'columns': ipl_auction_encoded_df.columns,
                                'coef': lasso.coef_ } )
## Selecting the features whose coefficients have been shrunk to exactly zero
lasso_coef_df[lasso_coef_df.coef == 0]
coef columns
1 -0.0 T-WKTS
3 -0.0 ODI-SR-B
13 -0.0 AVE-BL
28 0.0 PLAYING ROLE_Bowler
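Because Lasso drives some coefficients exactly to zero, it doubles as a feature selection method: the zeroed features can simply be dropped. The short sketch below is an addition, reusing lasso_coef_df:
# Sketch: keep only the features Lasso retained (non-zero coefficients)
selected_features = lasso_coef_df[ lasso_coef_df.coef != 0 ]['columns'].tolist()
print( len( selected_features ), "of", len( lasso_coef_df ), "features retained" )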
6.4.2.3 Elastic Net Regression
In sklearn's ElasticNet, the combined penalty is controlled by alpha and l1_ratio: if the desired L1 and L2 penalty weights are a and b, then alpha = a + b and l1_ratio = a / (a + b). For an L1 penalty of 0.01 and an L2 penalty of 1.0, this gives alpha = 1.01 and l1_ratio = 0.01/1.01:
0.01/1.01
0.009900990099009901
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha = 1.01, l1_ratio = 0.0099, max_iter = 500)
enet.fit( X_train, y_train )
get_train_test_rmse( enet )
train: 0.794 test: 0.674
6.5 More Advanced Algorithms
bank_df = pd.read_csv( 'bank.csv')
bank_df.head(5)
   age          job  marital  education default  balance housing-loan personal-loan current-campaign  ...
0   30   unemployed  married    primary      no     1787           no            no                 1  ...
1   33     services  married  secondary      no     4789          yes           yes                 1  ...
2   35   management   single   tertiary      no     1350          yes            no                 1  ...
3   30   management  married   tertiary      no     1476          yes           yes                 4  ...
4   59  blue-collar  married  secondary      no        0          yes            no                 1  ...
bank_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
age 4521 non-null int64
job 4521 non-null object
marital 4521 non-null object
education 4521 non-null object
default 4521 non-null object
balance 4521 non-null int64
housing-loan 4521 non-null object
personal-loan 4521 non-null object
current-campaign 4521 non-null int64
previous-campaign 4521 non-null int64
subscribed 4521 non-null object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB
6.5.1 Dealing with imbalanced datasets
bank_df.subscribed.value_counts()
no 4000
yes 521
Name: subscribed, dtype: int64
## Importing resample from the sklearn.utils package
from sklearn.utils import resample
# Separate the yes-subscribed and no-subscribed cases
bank_subscribed_no = bank_df[bank_df.subscribed == 'no']
bank_subscribed_yes = bank_df[bank_df.subscribed == 'yes']
## Upsample the yes-subscribed cases
df_minority_upsampled = resample(bank_subscribed_yes,
                                 replace=True,    # sample with replacement
                                 n_samples=2000)
# Combine majority class with upsampled minority class
new_bank_df = pd.concat([bank_subscribed_no, df_minority_upsampled])
from sklearn.utils import shuffle
new_bank_df = shuffle(new_bank_df)
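It is worth verifying the new class distribution after upsampling; the check below is an addition to the book's code:
# Sketch: verify the class balance after upsampling
print( new_bank_df.subscribed.value_counts() )
# Expected counts: no 4000, yes 2000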
# Assigning list of all column names in the DataFrame
X_features = list( new_bank_df.columns )
# Remove the response variable from the list
X_features.remove( 'subscribed' )
X_features
['age',
'job',
'marital',
'education',
'default',
'balance',
'housing-loan',
'personal-loan',
'current-campaign',
'previous-campaign']
## get_dummies() will convert all the columns with data type object
encoded_bank_df = pd.get_dummies( new_bank_df[X_features], drop_first = True )
X = encoded_bank_df
# Encoding the subscribed column and assigning to Y
Y = new_bank_df.subscribed.map( lambda x: int( x == 'yes') )
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split( X,
                                                     Y,
                                                     test_size = 0.3,
                                                     random_state = 42 )
6.5.2 Logistic Regression model
6.5.2.1 Building the model
from sklearn.linear_model import LogisticRegression
## Initializing the model
logit = LogisticRegression()
## Fitting the model with X and Y values of the dataset
logit.fit( train_X, train_y)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
pred_y = logit.predict(test_X)
6.5.2.2 Confusion Matrix
## Importing the metrics
from sklearn import metrics
## Defining a method to draw the confusion matrix from actual and predicted class labels
def draw_cm( actual, predicted ):
    # Invoking confusion_matrix from the metrics package. The matrix will be oriented
    # as [1,0], i.e., the class with label 1 is represented in the first row and 0 in the second
    cm = metrics.confusion_matrix( actual, predicted, labels = [1,0] )
    ## The confusion matrix is plotted as a heatmap for better visualization
    ## The labels are configured for easier interpretation of the plot
    sn.heatmap(cm, annot=True, fmt='.2f',
               xticklabels = ["Subscribed", "Not Subscribed"],
               yticklabels = ["Subscribed", "Not Subscribed"] )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
draw_cm( test_y, pred_y )
6.5.2.3 Classification Report
print( metrics.classification_report( test_y, pred_y ) )
precision recall f1-score support
0 0.73 0.92 0.81 1225
1 0.60 0.27 0.37 575
micro avg 0.71 0.71 0.71 1800
macro avg 0.66 0.59 0.59 1800
weighted avg 0.69 0.71 0.67 1800
6.5.2.4 ROC AUC Score
## Predicting the probability values for test cases
predict_proba_df = pd.DataFrame( logit.predict_proba( test_X ) )
predict_proba_df.head()
0 1
0 0.704479 0.295521
1 0.853664 0.146336
2 0.666963 0.333037
3 0.588329 0.411671
4 0.707982 0.292018
## Initializing the DataFrame with actual class labels
test_results_df = pd.DataFrame( { 'actual': test_y } )
test_results_df = test_results_df.reset_index()
## Assigning the probability values for class label 1
test_results_df['chd_1'] = predict_proba_df.iloc[:,1:2]
test_results_df.head(5)
index actual chd_1
0 1321 0 0.295521
1 3677 0 0.146336
2 1680 1 0.333037
3 821 0 0.411671
4 921 0 0.292018
# Passing actual class labels and the predicted probability values to compute the ROC AUC score
auc_score = metrics.roc_auc_score( test_results_df.actual, test_results_df.chd_1 )
round( float( auc_score ), 2 )
0.69
## The method takes the following three parameters
##   model: the classification model
##   test_X: X features of the test set
##   test_y: actual labels of the test set
## Returns
##   - ROC AUC score
##   - FPR and TPR for different threshold values
def draw_roc_curve( model, test_X, test_y ):
    ## Creating and initializing a results DataFrame with actual labels
    test_results_df = pd.DataFrame( { 'actual': test_y } )
    test_results_df = test_results_df.reset_index()
    # predict the probabilities on the test set
    predict_proba_df = pd.DataFrame( model.predict_proba( test_X ) )
    ## selecting the probabilities that the test example belongs to class 1
    test_results_df['chd_1'] = predict_proba_df.iloc[:,1:2]
    ## Invoke roc_curve() to return the fpr, tpr and threshold values.
    ## The threshold values range from 0.0 to 1.0
    fpr, tpr, thresholds = metrics.roc_curve( test_results_df.actual,
                                              test_results_df.chd_1,
                                              drop_intermediate = False )
    ## Getting the ROC AUC score by invoking the metrics.roc_auc_score method
    auc_score = metrics.roc_auc_score( test_results_df.actual, test_results_df.chd_1 )
    ## Setting the size of the plot
    plt.figure(figsize=(8, 6))
    ## plotting the actual fpr and tpr values
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    ## plotting the diagonal line from (0,0) to (1,1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    ## Setting labels and title
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    return auc_score, fpr, tpr, thresholds
## Invoking draw_roc_curve with the logistic regression model
_, _, _, _ = draw_roc_curve( logit, test_X, test_y )
6.5.3 KNN Algorithm
## Importing the KNN classifier algorithm
from sklearn.neighbors import KNeighborsClassifier
## Initializing the classifier
knn_clf = KNeighborsClassifier()
## Fitting the model with the training set
knn_clf.fit( train_X, train_y )
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
6.5.3.1 KNN Accuracy
## Invoking draw_roc_curve with the KNN model
_, _, _, _ = draw_roc_curve( knn_clf, test_X, test_y )
## Predicting on test set
pred_y = knn_clf.predict(test_X)
## Drawing the confusion matrix for KNN model
draw_cm( test_y, pred_y )
print( metrics.classification_report( test_y, pred_y ) )
precision recall f1-score support
0 0.85 0.77 0.81 1225
1 0.59 0.72 0.65 575
micro avg 0.75 0.75 0.75 1800
macro avg 0.72 0.74 0.73 1800
weighted avg 0.77 0.75 0.76 1800
6.5.3.2 Grid Search for Optimal Parameters
## Importing GridSearchCV
from sklearn.model_selection import GridSearchCV
## Creating a dictionary with hyperparameters and possible values for searching
tuned_parameters = [{'n_neighbors': range(5,10),
                     'metric': ['canberra', 'euclidean', 'minkowski']}]
## Configuring grid search
clf = GridSearchCV(KNeighborsClassifier(),
                   tuned_parameters,
                   cv=10,
                   scoring='roc_auc')
## Fit the grid search on the training set
clf.fit(train_X, train_y )
GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_neighbors': range(5, 10), 'metric': ['canberra', 'euclidean', 'minkowski']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)
clf.best_score_
0.8368537419503068
clf.best_params_
{'metric': 'canberra', 'n_neighbors': 5}
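Since GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned search object can be evaluated on the test set directly. This evaluation step is an addition to the book's code:
# Sketch: evaluate the tuned KNN model on the test set
# (clf.best_estimator_ holds the underlying fitted KNeighborsClassifier)
_, _, _, _ = draw_roc_curve( clf, test_X, test_y )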
6.5.4 Ensemble Methods
6.5.5 Random Forest
6.5.5.1 Building Random Forest Model
## Importing Random Forest Classifier from the sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
## Initializing the Random Forest classifier with max_depth and n_estimators
radm_clf = RandomForestClassifier( max_depth=10, n_estimators=10)
radm_clf.fit( train_X, train_y )
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
_, _, _, _ = draw_roc_curve( radm_clf, test_X, test_y );
6.5.5.2 Grid Search for Optimal Parameters
## Configuring parameters and values to be searched
tuned_parameters = [{'max_depth': [10, 15],
                     'n_estimators': [10,20],
                     'max_features': ['sqrt', 'auto']}]
## Initializing the RF classifier
radm_clf = RandomForestClassifier()
## Configuring search with the tunable parameters
clf = GridSearchCV(radm_clf,
                   tuned_parameters,
                   cv=5,
                   scoring='roc_auc')
## Fitting the training set
clf.fit(train_X, train_y )
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 20], 'max_depth': [10, 15], 'max_features': ['sqrt', 'auto']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)
clf.best_score_
0.9399595384858543
clf.best_params_
{'max_depth': 15, 'max_features': 'auto', 'n_estimators': 20}
6.5.5.3 Building the final model with optimal parameter values
## Initializing the Random Forest model with the optimal values
radm_clf = RandomForestClassifier( max_depth=15, n_estimators=20, max_features = 'auto')
## Fitting the model with the training set
radm_clf.fit( train_X, train_y )
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
6.5.5.4 ROC AUC Score
_, _, _, _ = draw_roc_curve( radm_clf, test_X, test_y )
6.5.5.5 Drawing the confusion matrix
pred_y = radm_clf.predict( test_X )
draw_cm( test_y, pred_y )
print( metrics.classification_report( test_y, pred_y ) )
precision recall f1-score support
0 0.90 0.94 0.92 1225
1 0.86 0.78 0.82 575
micro avg 0.89 0.89 0.89 1800
macro avg 0.88 0.86 0.87 1800
weighted avg 0.89 0.89 0.89 1800
6.5.5.6 Finding important features
import numpy as np
# Create a DataFrame to store the features and their corresponding importances
feature_rank = pd.DataFrame( { 'feature': train_X.columns,
                               'importance': radm_clf.feature_importances_ } )
## Sorting the features by importance, with the most important feature at the top
feature_rank = feature_rank.sort_values('importance', ascending = False)
plt.figure(figsize=(8, 6))
# plot the values
sn.barplot( y = 'feature', x = 'importance', data = feature_rank );
feature_rank['cumsum'] = feature_rank.importance.cumsum() * 100
feature_rank.head(10)
feature importance cumsum
1 balance 0.269603 26.960282
0 age 0.203664 47.326707
3 previous-campaign 0.117525 59.079219
2 current-campaign 0.090085 68.087703
21 housing-loan_yes 0.039898 72.077486
15 marital_married 0.034329 75.510337
22 personal-loan_yes 0.027029 78.213244
17 education_secondary 0.023934 80.606690
4 job_blue-collar 0.023081 82.914811
16 marital_single 0.022495 85.164357
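The cumulative importance column can drive a simple feature selection rule, for example keeping the smallest set of features that together account for about 90% of the total importance. The sketch below is an addition, reusing feature_rank:
# Sketch: select features covering ~90% of the cumulative importance
top_features = feature_rank[ feature_rank['cumsum'] <= 90.0 ].feature.tolist()
print( len( top_features ), "features explain ~90% of the total importance" )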
6.5.6 Boosting
6.5.6.1 Adaboost
## Importing the AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
## Initializing logistic regression to use as the base classifier
logreg_clf = LogisticRegression()
## Initializing the AdaBoost classifier with 50 estimators
ada_clf = AdaBoostClassifier(logreg_clf, n_estimators=50)
## Fitting the AdaBoost model to the training set
ada_clf.fit(train_X, train_y )
AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, multi_class='warn',
              n_jobs=None, penalty='l2', random_state=None, solver='warn',
              tol=0.0001, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=None)
_, _, _, _ = draw_roc_curve( ada_clf, test_X, test_y )
6.5.6.2 Gradient Boosting
## Importing Gradient Boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
## Initializing Gradient Boosting with 500 estimators and max depth as 10.
gboost_clf = GradientBoostingClassifier( n_estimators=500, max_depth=10)
## Fitting gradient boosting model to training set
gboost_clf.fit(train_X, train_y )
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
_, _, _, _ = draw_roc_curve( gboost_clf, test_X, test_y )
from sklearn.model_selection import cross_val_score
gboost_clf = GradientBoostingClassifier( n_estimators=500, max_depth=10)
cv_scores = cross_val_score( gboost_clf, train_X, train_y, cv = 10, scoring = 'roc_auc' )
print( cv_scores )
print( "Mean Accuracy: ", np.mean(cv_scores), " with standard deviation of: ", np.std(cv_scores))
[0.98241686 0.98105851 0.98084469 0.9585199  0.95482216 0.96667006
 0.95342452 0.97368689 0.95937357 0.98174607]
Mean Accuracy: 0.969256322542174 with standard deviation of: 0.011406249012935668
gboost_clf.fit(train_X, train_y )
pred_y = gboost_clf.predict( test_X )
draw_cm( test_y, pred_y )
print( metrics.classification_report( test_y, pred_y ) )
precision recall f1-score support
0 0.96 0.95 0.96 1225
1 0.90 0.92 0.91 575
micro avg 0.94 0.94 0.94 1800
macro avg 0.93 0.94 0.94 1800
weighted avg 0.94 0.94 0.94 1800
import numpy as np
# Create a DataFrame to store the features and their corresponding importances
feature_rank = pd.DataFrame( { 'feature': train_X.columns,
                               'importance': gboost_clf.feature_importances_ } )
## Sorting the features by importance, with the most important feature at the top
feature_rank = feature_rank.sort_values('importance', ascending = False)
plt.figure(figsize=(8, 6))
# plot the values
sn.barplot( y = 'feature', x = 'importance', data = feature_rank );