ML - Practical - 1 - Jupyter Notebook
ML - Practical - 1 - Jupyter Notebook
In [56]: df = pd.read_csv(r"C:\Users\Dell\Downloads\Uber.csv")
In [57]: df.head()
Out[57]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude
0
2015-05-07 2015-05-07
0 24238194 7.5 -73.999817 40.738354
19:52:06.0000003 19:52:06 UTC
2009-07-17 2009-07-17
1 27835199 7.7 -73.994355 40.728225
20:04:56.0000002 20:04:56 UTC
2009-08-24 2009-08-24
2 44984355 12.9 -74.005043 40.740770
21:45:00.00000061 21:45:00 UTC
2009-06-26 2009-06-26
3 25894730 5.3 -73.976124 40.790844
08:22:21.0000001 08:22:21 UTC
2014-08-28 2014-08-28
4 17610152 16.0 -73.925023 40.744085
17:47:00.000000188 17:47:00 UTC
In [58]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 200000 non-null int64
1 key 200000 non-null object
2 fare_amount 200000 non-null float64
3 pickup_datetime 200000 non-null object
4 pickup_longitude 200000 non-null float64
5 pickup_latitude 200000 non-null float64
6 dropoff_longitude 199999 non-null float64
7 dropoff_latitude 199999 non-null float64
8 passenger_count 200000 non-null int64
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
In [59]: df.shape
Out[59]: (200000, 9)
localhost:8888/notebooks/ML_Practical_1.ipynb 1/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [60]:
df.isnull().sum()
Out[60]: Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
In [62]: df.isnull().sum()
Out[62]: Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
In [64]: df.head()
Out[64]:
fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitu
2015-05-07
0 7.5 -73.999817 40.738354 -73.999512 40.7232
19:52:06 UTC
2009-07-17
1 7.7 -73.994355 40.728225 -73.994710 40.7503
20:04:56 UTC
2009-08-24
2 12.9 -74.005043 40.740770 -73.962565 40.7726
21:45:00 UTC
2009-06-26
3 5.3 -73.976124 40.790844 -73.965316 40.8033
08:22:21 UTC
2014-08-28
4 16.0 -73.925023 40.744085 -73.973082 40.7612
17:47:00 UTC
localhost:8888/notebooks/ML_Practical_1.ipynb 2/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [66]: df.dtypes
In [67]: df.describe()
Out[67]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passen
localhost:8888/notebooks/ML_Practical_1.ipynb 3/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [69]: sns.distplot(df['pickup_latitude'])
localhost:8888/notebooks/ML_Practical_1.ipynb 4/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [70]: sns.distplot(df['pickup_longitude'])
In [71]: sns.distplot(df['dropoff_longitude'])
localhost:8888/notebooks/ML_Practical_1.ipynb 5/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [72]: sns.distplot(df['dropoff_latitude'])
localhost:8888/notebooks/ML_Practical_1.ipynb 6/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [74]: #getting outlier details for column "fare_amount" using the above function
# Flag the fare values that find_outliers_IQR (defined in an earlier cell)
# treats as outliers, then report how many there are and their extremes.
outliers = find_outliers_IQR(df["fare_amount"])
# sep="" joins label and value exactly like the former "label" + str(value).
print("number of outliers: ", len(outliers), sep="")
print("max outlier value: ", outliers.max(), sep="")
print("min outlier value: ", outliers.min(), sep="")
# Bare last expression so the notebook renders the flagged values.
outliers
Out[74]: 6 24.50
30 25.70
34 39.50
39 29.00
48 56.80
...
199976 49.70
199977 43.50
199982 57.33
199985 24.00
199997 30.90
Name: fare_amount, Length: 17166, dtype: float64
In [75]: #you can also pass two columns as argument to the function (here "passenger_count" and "fare_amount")
# Same helper, this time handed a two-column frame instead of one Series.
# In the rendered result, cells that were not flagged show up as NaN.
outliers = find_outliers_IQR(
    df[["passenger_count", "fare_amount"]]
)
outliers
Out[75]:
passenger_count fare_amount
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
4 5.0 NaN
localhost:8888/notebooks/ML_Practical_1.ipynb 7/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
41.06517154774204
-18.3453884488253
localhost:8888/notebooks/ML_Practical_1.ipynb 8/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [80]: df.head()
Out[80]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_co
In [81]: df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fare_amount 199999 non-null float64
1 pickup_longitude 199999 non-null float64
2 pickup_latitude 199999 non-null float64
3 dropoff_longitude 199999 non-null float64
4 dropoff_latitude 199999 non-null float64
5 passenger_count 199999 non-null int64
6 day 199999 non-null int64
7 hour 199999 non-null int64
8 month 199999 non-null int64
9 year 199999 non-null int64
10 weekday 199999 non-null int64
dtypes: float64(5), int64(6)
memory usage: 18.3 MB
localhost:8888/notebooks/ML_Practical_1.ipynb 9/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
Out[83]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count day
In [84]: y=df["fare_amount"]
In [86]: x_train.head()
Out[86]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count day
localhost:8888/notebooks/ML_Practical_1.ipynb 10/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [87]: x_test.head()
Out[87]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count day
In [88]: y_train.head()
In [89]: y_test.head()
In [90]: print(x_train.shape)
print(x_test.shape)
print(y_test.shape)
print(y_train.shape)
(159999, 10)
(40000, 10)
(40000,)
(159999,)
Out[91]: LinearRegression()
localhost:8888/notebooks/ML_Practical_1.ipynb 11/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [95]: rfrmodel.fit(x_train,y_train)
rfrmodel_pred= rfrmodel.predict(x_test)
In [116]: test.head()
Out[116]:
Unnamed: Unnamed: Unnamed:
key pickup_datetime pickup_longitude pickup
0 0.1 0.1.1
2011-02-10 2011-02-10
0 0 37338 31401407 -73.951662 40
19:06:00.000000169 19:06:00 UTC
2011-06-23 2011-06-23
1 1 160901 33158465 -73.951007 40
09:24:00.000000157 09:24:00 UTC
2012-07-14 2012-07-14
2 2 40428 10638355 -73.996473 40
10:37:00.000000149 10:37:00 UTC
2014-10-19 2014-10-19
3 3 63353 3836845 -73.997934 40
22:27:05.0000002 22:27:05 UTC
2015-05-25 2015-05-25
4 4 165491 27114503 -73.952583 40
22:54:43.0000001 22:54:43 UTC
localhost:8888/notebooks/ML_Practical_1.ipynb 12/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [119]: #splitting column "pickup_datetime" into 5 columns: "day", "hour", "month", "year", "weekday"
#for a simplified view
#label encoding weekdays
# Pull the calendar parts out with the vectorized .dt accessor instead of a
# per-row Python lambda. The values are the same as x.day / x.hour / x.month /
# x.year, computed column-wise.
# NOTE(review): relies on test['pickup_datetime'] already being a datetime
# column (converted in an earlier cell not shown here) -- confirm.
test['day']=test['pickup_datetime'].dt.day
test['hour']=test['pickup_datetime'].dt.hour
test['month']=test['pickup_datetime'].dt.month
test['year']=test['pickup_datetime'].dt.year
test['weekday']=test['pickup_datetime'].apply(lambda x: calendar.day_name[x.weekd
test.weekday = test.weekday.map({'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,
test.drop(['pickup_datetime'], axis = 1, inplace = True)
test.head(5)
Out[119]:
Unnamed:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_coun
0.1.1
In [130]: regression.fit(X_train,y_train)
Out[130]: LinearRegression()
Out[131]: -1290.1588587396827
localhost:8888/notebooks/ML_Practical_1.ipynb 13/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [134]: print(prediction)
In [135]: y_test
Metrics Evaluation using R2, Mean Squared Error, Root Mean Squared
Error
In [138]: r2_score(y_test,prediction)
Out[138]: 0.01603948315081527
In [141]: MSE
Out[141]: 98.78999628170801
In [143]: RMSE
Out[143]: 9.939315684779713
localhost:8888/notebooks/ML_Practical_1.ipynb 14/15
11/13/22, 12:12 PM ML_Practical_1 - Jupyter Notebook
In [146]: rf.fit(X_train,y_train)
Out[146]: RandomForestRegressor()
In [148]: y_pred
In [150]: R2_Random
Out[150]: 0.7617634409141242
In [152]: MSE_Random
Out[152]: 23.919037789875
In [154]: RMSE_Random
Out[154]: 4.8907093340204755
In [ ]:
localhost:8888/notebooks/ML_Practical_1.ipynb 15/15