Quantam - Learning - Colaboratory
Quantam - Learning - Colaboratory
# Mount Google Drive so files under /content/drive (e.g. the dataset CSV
# referenced by `path`) are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
path='/content/drive/MyDrive/Copy of Bengaluru_House_Data.csv'
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
0 Super built-up Area 19-Dec Electronic City Phase II 2 BHK Coomee 1056 2.0
1 Plot Area Ready To Move Chikka Tirupathi 4 Bedroom Theanmp 2600 5.0
2 Built-up Area Ready To Move Uttarahalli 3 BHK NaN 1440 2.0
df1.shape
(13320, 9)
df1.columns
df1['area_type'].unique()
df1['area_type'].value_counts()
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')
df2.shape
(13320, 5)
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')
df2.shape
(13320, 5)
df2.isnull().sum()
location 1
size 16
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 1/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
total_sqft 0
bath 73
price 0
dtype: int64
df2.shape
(13320, 5)
df3 = df2.dropna()
df3.isnull().sum()
location 0
size 0
total_sqft 0
bath 0
price 0
dtype: int64
df3.shape
(13246, 5)
<ipython-input-14-681cf3aca53d>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Used to spot ``total_sqft`` entries that are not plain numbers
    (for example ranges such as "2100 - 2850").
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return False
    else:
        return True
2+3
df3[~df3['total_sqft'].apply(is_float)].head(10)
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    * A range written as "low - high" becomes the average of its two ends.
    * A plain number (string or numeric) becomes ``float(x)``.
    * Anything that cannot be parsed returns ``None`` so the caller can
      filter those rows out with ``notnull()``.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. units attached, or scientific
                # notation like "1e-3"); fall through to the plain parse.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.head(2)
df4.loc[30]
location Yelahanka
size 4 BHK
total_sqft 2475.0
bath 4.0
price 186.0
bhk 4
Name: 30, dtype: object
(2100+2850)/2
2475.0
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()
df5_stats = df5['price_per_sqft'].describe()
df5_stats
count 1.320000e+04
mean 7.920759e+03
std 1.067272e+05
min 2.678298e+02
25% 4.267701e+03
50% 5.438331e+03
75% 7.317073e+03
max 1.200000e+07
Name: price_per_sqft, dtype: float64
df5.to_csv("bhp.csv",index=False)
Whitefield 533
Sarjapur Road 392
Electronic City 304
Kanakpura Road 264
Thanisandra 235
...
Rajanna Layout 1
Subramanyanagar 1
Lakshmipura Vidyaanyapura 1
Malur Hosur Road 1
Abshot Layout 1
Name: location, Length: 1287, dtype: int64
location_stats.values.sum()
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 3/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
13200
len(location_stats[location_stats>10])
240
len(location_stats)
1287
len(location_stats[location_stats<=10])
1047
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10
len(df5.location.unique())
1287
241
df5.head(10)
df5[df5.total_sqft/df5.bhk<300].head()
df5.shape
(13200, 7)
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 4/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
df6 = df5[~(df5.total_sqft/df5.bhk<300)]
df6.shape
(12456, 7)
df6.price_per_sqft.describe()
count 12456.000000
mean 6308.502826
std 4168.127339
min 267.829813
25% 4210.526316
50% 5294.117647
75% 6916.666667
max 176470.588235
Name: price_per_sqft, dtype: float64
def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately for every location.

    For each ``location`` group, only rows whose ``price_per_sqft`` lies in
    the interval (mean - std, mean + std] of that group are kept, with mean
    and std computed by ``np.mean`` / ``np.std`` on the group's column.

    Returns a new DataFrame with a fresh 0..n-1 index; an input with no
    groups yields an empty DataFrame.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = np.mean(pps)
        st = np.std(pps)
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])
    if not kept:
        # pd.concat raises on an empty list; mirror the empty result instead.
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7.shape
(10242, 7)
def plot_scatter_chart(df,location):
    """Scatter-plot price against total area for one location.

    2 BHK rows are drawn as blue dots and 3 BHK rows as green '+' markers
    on the current matplotlib figure, titled with the location name.
    """
    in_location = df.location == location
    two_bhk = df[in_location & (df.bhk == 2)]
    three_bhk = df[in_location & (df.bhk == 3)]

    # This writes to matplotlib's global rcParams, so it also affects
    # figures created after this call.
    matplotlib.rcParams['figure.figsize'] = (15,10)

    plt.scatter(two_bhk.total_sqft, two_bhk.price,
                color='blue', label='2 BHK', s=50)
    plt.scatter(three_bhk.total_sqft, three_bhk.price,
                marker='+', color='green', label='3 BHK', s=50)

    plt.title(location)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.legend()
plot_scatter_chart(df7,"Rajaji Nagar")
plot_scatter_chart(df7,"Hebbal")
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 5/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
{
'1' : {
'mean': 4000,
'std': 2000,
'count': 34
},
'2' : {
'mean': 4300,
'std': 2300,
'count': 22
},
}
def remove_bhk_outliers(df):
    """Drop rows priced below the mean of the next-smaller BHK in a location.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows of the same location, provided that smaller
    group has more than 5 rows.

    Returns a new DataFrame with the offending index labels dropped.
    """
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Materialise the per-BHK groups once and reuse them for both the
        # statistics and the filtering pass.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                # Keep labels in a plain list so their original type is
                # preserved (np.append on a float array would coerce them).
                exclude_labels.extend(
                    bhk_df.index[bhk_df.price_per_sqft < stats['mean']]
                )
    return df.drop(exclude_labels, axis='index')
df8 = remove_bhk_outliers(df7)
# df8 = df7.copy()
df8.shape
(7317, 7)
plot_scatter_chart(df8,"Rajaji Nagar")
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 6/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
plot_scatter_chart(df8,"Hebbal")
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 7/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
df8.bath.unique()
array([ 4., 3., 2., 5., 8., 1., 6., 7., 9., 12., 16., 13.])
plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")
df8[df8.bath>10]
df8[df8.bath>df8.bhk+2]
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 8/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
df9 = df8[df8.bath<df8.bhk+2]
df9.shape
(7239, 7)
df9.head(2)
df10 = df9.drop(['size','price_per_sqft'],axis='columns')
df10.head(3)
dummies = pd.get_dummies(df10.location)
dummies.head(3)
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0
1 1 0 0 0 0 0 0 0 0 0 ... 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0
df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis='columns')
df11.head()
0 1st Block Jayanagar 2850.0 4.0 428.0 4 1 0 0 0 0 ... 0 0
1 1st Block Jayanagar 1630.0 3.0 194.0 3 1 0 0 0 0 ... 0 0
2 1st Block Jayanagar 1875.0 2.0 235.0 3 1 0 0 0 0 ... 0 0
3 1st Block Jayanagar 1200.0 2.0 130.0 3 1 0 0 0 0 ... 0 0
4 1st Block Jayanagar 1235.0 2.0 148.0 2 1 0 0 0 0 ... 0 0
df12 = df11.drop('location',axis='columns')
df12.head(2)
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 9/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
X = df12.drop(['price'],axis='columns')
X.head(3)
X.shape
(7239, 243)
y = df12.price
y.head(3)
0 428.0
1 194.0
2 235.0
Name: price, dtype: float64
len(y)
7239
0.8629132245229447
cross_val_score(LinearRegression(), X, y, cv=cv)
def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and report the best score of each.

    Runs ``GridSearchCV`` for linear regression, lasso and a decision tree,
    using a 5-split ``ShuffleSplit`` (20% test, ``random_state=0``).

    Returns a DataFrame with one row per algorithm and the columns
    ``model``, ``best_score`` and ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                # 'normalize' is deprecated since scikit-learn 1.0 (see the
                # FutureWarning it emits); search over a supported option.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                # 'mse' is deprecated since scikit-learn 1.0; 'squared_error'
                # is its replacement name.
                'criterion': ['squared_error','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])
find_best_model_using_gridsearchcv(X,y)
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 11/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:
If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:
return lr_clf.predict([x])[0]
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:148: FutureWarning: 'normalize' was deprecated in version 1.0
predict_price('1st Phase JP Nagar',1000, 2, 2)
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:148: FutureWarning: 'normalize' was deprecated in version 1.0
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:148: FutureWarning: 'normalize' was deprecated in version 1.0
warnings.warn(
warnings.warn(
83.86570258311595
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:148: FutureWarning: 'normalize' was deprecated in version 1.0
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:148: FutureWarning: 'normalize' was deprecated in version 1.0
predict_price('1st Phase JP Nagar',1000, 3, 3)
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_base.py:148: FutureWarning: 'normalize' was deprecated in version 1.0
/usr/local/lib/python3.8/dist-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
warnings.warn(
86.08062284986363
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
predict_price('Indira Nagar',1000, 2, 2)
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
warnings.warn(
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 12/13
1/11/23, 12:50 PM Copy of Quantam_Learning_ - Colaboratory
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
/usr/local/lib/python3.8/dist-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression
warnings.warn(
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
193.3119773317968
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
predict_price('Indira Nagar',1000, 3, 3)
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
/usr/local/lib/python3.8/dist-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression
warnings.warn(
warnings.warn(
/usr/local/lib/python3.8/dist-packages/sklearn/tree/_classes.py:359: FutureWarning: Criterion 'mse' was deprecated in v1.0 and will
195.5268975985445
warnings.warn(
model best_score best_params
0 linear_regression 0.847796 {'normalize': False}
1 lasso 0.726745 {'alpha': 2, 'selection': 'random'}
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)
https://colab.research.google.com/drive/1GxpC9HQufBWj-pbJ3qRIEW3LWgPzgjTa#scrollTo=mmEvB0k2ZzqO&printMode=true 13/13