Tutorial 2 - Clustering
Tutorial 2 - Clustering
In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
import matplotlib.pyplot as plt
In [9]:
In [10]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
Driver_ID 4000 non-null int64
Distance_Feature 4000 non-null float64
Speeding_Feature 4000 non-null float64
dtypes: float64(2), int64(1)
memory usage: 93.8 KB
In [11]:
# Display summary statistics for the columns of `data`.
data.describe()
Out[11]:
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 1/7
14/09/2018 Tutorial 2 - Clustering
In [26]:
# Scatter the column at position 1 against the column at position 2,
# labelling each axis with the corresponding column name.
x_label, y_label = data.columns.values[1], data.columns.values[2]
x_values = data.iloc[:, 1:2]
y_values = data.iloc[:, 2:3]

plt.scatter(x_values, y_values)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.show()
In [28]:
# Elbow method: fit k-means for k = 1..10 and plot the within-cluster sum of
# squares (WCSS, exposed by scikit-learn as `inertia_`) against k.
#
# Cluster on the two feature columns only (positions 1 and 2, the same columns
# the scatter plots use). Column 0 is Driver_ID, an identifier; fitting on the
# whole frame would feed it into the distance computation.
features = data.iloc[:, 1:3]
k_values = range(1, 11)

wcss = []
for i in k_values:
    # n_init is given explicitly so the number of restarts does not depend on
    # the default of the installed scikit-learn version.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(features)
    wcss.append(kmeans.inertia_)

plt.plot(k_values, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of cluster')
plt.ylabel('WCSS')
plt.show()
In [52]:
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 2/7
14/09/2018 Tutorial 2 - Clustering
In [53]:
%matplotlib inline
plt.figsize=(40, 40)
plt.scatter(data.iloc[:,1],data.iloc[:,2], c=y_kmeans)
Out[53]:
<matplotlib.collections.PathCollection at 0x7f381ee64ba8>
In [47]:
Out[47]:
0 1
0 0.243 0.280
1 0.161 0.250
2 0.214 0.270
3 0.175 0.220
4 0.170 0.250
In [50]:
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 3/7
14/09/2018 Tutorial 2 - Clustering
In [59]:
%matplotlib inline
plt.scatter(data.iloc[:,1],data.iloc[:,2], c=y2_kmeans)
Out[59]:
<matplotlib.collections.PathCollection at 0x7f381c32eda0>
In [ ]:
#DBSCAN STARTS
In [78]:
In [79]:
# Fit the `dbscan` estimator (defined in a cell not shown here) on `data`
# and display the resulting per-row labels.
# NOTE(review): data.info() lists Driver_ID as a column of `data` — confirm
# the identifier is meant to be part of the clustering input.
dbsc = dbscan.fit(data)
dbsc.labels_
Out[79]:
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 4/7
14/09/2018 Tutorial 2 - Clustering
In [80]:
# Scatter of columns 1 and 2 of `data`, coloured by the labels of the most
# recent `dbsc` fit.
feature_x = data.iloc[:, 1]
feature_y = data.iloc[:, 2]
plt.scatter(feature_x, feature_y, c=dbsc.labels_)
Out[80]:
<matplotlib.collections.PathCollection at 0x7f38142e7550>
In [81]:
# Re-fit the same `dbscan` estimator, this time on `dataN`, and display the labels.
# NOTE(review): `dataN` is defined in a cell not shown here — presumably a
# normalised version of the features; confirm.
dbsc = dbscan.fit(dataN)
dbsc.labels_
Out[81]:
In [82]:
# Plot the columns of `data` at positions 1 and 2, coloured by the current
# `dbsc` labels.
# NOTE(review): assumes the rows behind `dbsc.labels_` line up one-to-one with
# the rows of `data` — confirm.
feature_x = data.iloc[:, 1]
feature_y = data.iloc[:, 2]
plt.scatter(feature_x, feature_y, c=dbsc.labels_)
Out[82]:
<matplotlib.collections.PathCollection at 0x7f381437b198>
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 5/7
14/09/2018 Tutorial 2 - Clustering
In [66]:
# Display the labels of `model`.
# NOTE(review): `model` is not defined in any cell visible here — verify this
# cell still runs after Restart Kernel -> Run All.
model.labels_
Out[66]:
In [ ]:
#AGGLOMERATIVE STARTS
In [67]:
In [68]:
# Display y_aggclus (assigned in a cell not shown here).
y_aggclus
Out[68]:
In [69]:
In [83]:
# Create the figure before drawing: dendrogram() plots onto the current
# figure, so calling plt.figure() afterwards would only open a new, empty
# figure and leave the dendrogram at the default size.
plt.figure(figsize=(5, 7))
# color_threshold=1.5 sets the threshold passed to dendrogram() for colouring
# the links.
ddata = dendrogram(linkage_matrix, color_threshold=1.5)
Out[83]:
http://localhost:8888/notebooks/Documents/BITS%20Course/DM/Tut/TUT2/Piyush_TUT/Tutorial%202%20-%20Clustering.ipynb# 6/7