# Name: Siddhesh Asati | Group: B (ML) | Assignment: 6
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
1
4 2014-08-28 17:47:00 UTC -73.925023 40.744085
... ... ... ...
199995 2012-10-28 10:49:00 UTC -73.987042 40.739367
199996 2014-03-14 01:09:00 UTC -73.984722 40.736837
199997 2009-06-29 00:42:00 UTC -73.986017 40.756487
199998 2015-05-20 14:56:25 UTC -73.997124 40.725452
199999 2010-05-15 04:08:00 UTC -73.984395 40.720077
[4]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
[5]: data.head()
2
3 25894730 2009-06-26 08:22:21.0000001 5.3
4 17610152 2014-08-28 17:47:00.000000188 16.0
[6]: data.tail()
[7]: data.shape
[7]: (200000, 9)
[8]: data.describe()
3
[8]: Unnamed: 0 fare_amount pickup_longitude pickup_latitude \
count 2.000000e+05 200000.000000 200000.000000 200000.000000
mean 2.771250e+07 11.359955 -72.527638 39.935885
std 1.601382e+07 9.901776 11.437787 7.720539
min 1.000000e+00 -52.000000 -1340.648410 -74.015515
25% 1.382535e+07 6.000000 -73.992065 40.734796
50% 2.774550e+07 8.500000 -73.981823 40.752592
75% 4.155530e+07 12.500000 -73.967154 40.767158
max 5.542357e+07 499.000000 57.418457 1644.421482
[9]: data.dtypes
[10] : Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
4
[11]: data.drop(columns='Unnamed: 0', inplace=True)
[13]: data.isnull().sum()
[13]: key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
[14]: data.drop_duplicates(inplace=True)
[16]: data.drop(columns=['key','pickup_datetime'],inplace=True)
[17]: data.head()
dropoff_latitude passenger_count
0 40.723217 1
1 40.750325 1
2 40.772647 1
3 40.803349 3
4 40.761247 5
# In [18]: (original cell comment: "Correlation")
# Separate the target column from the predictors, then hold out 25% of the
# rows for testing. random_state=0 makes the shuffle reproducible.
# NOTE(review): `data` and `train_test_split` are defined/imported in cells
# not visible in this excerpt (train_test_split is presumably
# sklearn.model_selection.train_test_split) -- confirm.
x = data.drop(['fare_amount'], axis=1)
y = data['fare_amount']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)
5
[19]: data.corr()
[20]: #Outlier
sns.boxplot(data['fare_amount'])
6
[21]: data.head()
dropoff_latitude passenger_count
0 40.723217 1
1 40.750325 1
2 40.772647 1
3 40.803349 3
4 40.761247 5
6.0 6.0
7
[24]: data.head()
dropoff_latitude passenger_count
3 40.803349 3
5 40.755910 1
7 0.000000 1
12 40.807133 5
16 40.760050 1
8
[11.31328693 11.31355757 11.37704914 ... 11.37721085 11.4387883
11.31208495]
# In [26]:
def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and R-squared of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label prefixed to each printed metric line.

    Returns:
        None. The two metrics are only printed, each rounded to two
        decimal places.

    NOTE(review): relies on ``mean_squared_error`` and ``r2_score`` being
    in scope; their import is not visible in this excerpt (presumably
    ``sklearn.metrics``) -- confirm.
    """
    # RMSE is the square root of the mean squared error, which puts the
    # error back in the same units as the target.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} RMSE: {rmse:.2f}")
    print(f"{model_name} R-squared (R2): {r2:.2f}")
evaluate_model(y_test, y_pred_lr, "Linear Regression")