0% found this document useful (0 votes)
16 views6 pages

ML - 2 - Jupyter Notebook

Uploaded by

Anushka Jadhav
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
16 views6 pages

ML - 2 - Jupyter Notebook

Uploaded by

Anushka Jadhav
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

10/1/24, 10:25 PM ML_2 - Jupyter Notebook

In [1]: import pandas as pd


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]: df = pd.read_csv('uber.csv')  # relative path: uber.csv must sit in the notebook's working directory

In [3]: df.head()  # preview the first rows to confirm the columns loaded as expected

Out[3]:
Unnamed:
key fare_amount pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dro
0

2015-05-07
0 24238194 52:06.0 7.5 -73.999817 40.738354 -73.999512
19:52:06 UTC

2009-07-17
1 27835199 04:56.0 7.7 -73.994355 40.728225 -73.994710
20:04:56 UTC

2009-08-24
2 44984355 45:00.0 12.9 -74.005043 40.740770 -73.962565
21:45:00 UTC

2009-06-26
3 25894730 22:21.0 5.3 -73.976124 40.790844 -73.965316
08:22:21 UTC

2014-08-28
4 17610152 47:00.0 16.0 -73.925023 40.744085 -73.973082
17:47:00 UTC

In [4]: df.describe()  # per-column summary statistics; check min/max for out-of-range values

Out[4]:
Unnamed: 0 fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude pa

count 2.000000e+05 200000.000000 200000.000000 200000.000000 199999.000000 199999.000000

mean 2.771250e+07 11.359955 -72.527638 39.935885 -72.525292 39.923890

std 1.601382e+07 9.901776 11.437787 7.720539 13.117408 6.794829

min 1.000000e+00 -52.000000 -1340.648410 -74.015515 -3356.666300 -881.985513

25% 1.382535e+07 6.000000 -73.992065 40.734796 -73.991407 40.733823

50% 2.774550e+07 8.500000 -73.981823 40.752592 -73.980093 40.753042

75% 4.155530e+07 12.500000 -73.967153 40.767158 -73.963659 40.768001

max 5.542357e+07 499.000000 57.418457 1644.421482 1153.572603 872.697628

In [5]: df.shape  # (row count, column count)

Out[5]: (200000, 9)

localhost:8888/notebooks/BE_PRACTICALS/ML_2.ipynb 1/6
10/1/24, 10:25 PM ML_2 - Jupyter Notebook

In [6]: df.isna().sum()  # number of missing values in each column

Out[6]: Unnamed: 0 0
key 0
fare_amount 0
pickup_datetime 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64

In [9]: df.columns  # list the column labels before deciding which ones to drop

Out[9]: Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',


'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
'dropoff_latitude', 'passenger_count'],
dtype='object')

In [10]: df = df.drop(['Unnamed: 0', 'key'], axis=1)  # remove the two identifier-style columns; they are not used as features

In [11]: df = df.dropna()

In [12]: df.info()  # confirm dtypes and non-null counts after the column drop and dropna

<class 'pandas.core.frame.DataFrame'>
Index: 199999 entries, 0 to 199999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fare_amount 199999 non-null float64
1 pickup_datetime 199999 non-null datetime64[ns, UTC]
2 pickup_longitude 199999 non-null float64
3 pickup_latitude 199999 non-null float64
4 dropoff_longitude 199999 non-null float64
5 dropoff_latitude 199999 non-null float64
6 passenger_count 199999 non-null int64
dtypes: datetime64[ns, UTC](1), float64(5), int64(1)
memory usage: 12.2 MB

localhost:8888/notebooks/BE_PRACTICALS/ML_2.ipynb 2/6
10/1/24, 10:25 PM ML_2 - Jupyter Notebook

In [13]: plt.figure(figsize=(10, 6))  # one shared canvas for all box plots


# Box plots of the selected columns to expose outliers before filtering.
# NOTE(review): the next line is cut off at the page margin in this export, so
# the rest of the column list and the closing brackets are not visible.
sns.boxplot(data=df[['fare_amount', 'pickup_longitude', 'pickup_latitude', 'dropoff_long
plt.title('Outliers in the dataset')
plt.show()

In [15]: df = df[df['fare_amount'] > 0]


df = df[df['fare_amount'] <= 100]

localhost:8888/notebooks/BE_PRACTICALS/ML_2.ipynb 3/6
10/1/24, 10:25 PM ML_2 - Jupyter Notebook

In [16]: plt.figure(figsize=(10, 6))


corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [17]: X = df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
# NOTE(review): the feature list above is cut off at the page margin in this
# export; any remaining columns and the closing brackets are not visible.

# Regression target: the fare to predict.
y = df['fare_amount']

In [18]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42
# Holds out 20% of the rows for testing with a fixed seed so the split repeats.
# NOTE(review): the call above is cut off in this export; the closing
# parenthesis (and any further arguments) are not visible.

In [19]: scaler = StandardScaler()


X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]: lr_model = LinearRegression()


lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

In [21]: r2_lr = r2_score(y_test, y_pred_lr)


rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

In [22]: print(f"Linear Regression - R²: {r2_lr:.4f}, RMSE: {rmse_lr:.4f}")

Linear Regression - R²: -0.0000, RMSE: 9.3393

In [23]: ridge_model = Ridge(alpha=1.0)


ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

localhost:8888/notebooks/BE_PRACTICALS/ML_2.ipynb 4/6
10/1/24, 10:25 PM ML_2 - Jupyter Notebook

In [24]: r2_ridge = r2_score(y_test, y_pred_ridge)


rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))

In [25]: print(f"Ridge Regression - R²: {r2_ridge:.4f}, RMSE: {rmse_ridge:.4f}")

Ridge Regression - R²: -0.0000, RMSE: 9.3393

In [26]: lasso_model = Lasso(alpha=0.1)


lasso_model.fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)

In [27]: r2_lasso = r2_score(y_test, y_pred_lasso)


rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

In [28]: print(f"Lasso Regression - R²: {r2_lasso:.4f}, RMSE: {rmse_lasso:.4f}")

Lasso Regression - R²: -0.0000, RMSE: 9.3394

In [29]: model_scores = pd.DataFrame({


"Model": ["Linear Regression", "Ridge Regression", "Lasso Regression"],
"R2 Score": [r2_lr, r2_ridge, r2_lasso],
"RMSE": [rmse_lr, rmse_ridge, rmse_lasso]
})

In [30]: print(model_scores)

Model R2 Score RMSE


0 Linear Regression -0.000004 9.339345
1 Ridge Regression -0.000004 9.339345
2 Lasso Regression -0.000017 9.339403

In [32]: fig, ax = plt.subplots(1, 2, figsize=(12, 6))



sns.barplot(x="Model", y="R2 Score", data=model_scores, ax=ax[0])
ax[0].set_title("R² Score Comparison")
ax[0].set_ylim(0, 1)

sns.barplot(x="Model", y="RMSE", data=model_scores, ax=ax[1])
ax[1].set_title("RMSE Comparison")

plt.tight_layout()
plt.show()

localhost:8888/notebooks/BE_PRACTICALS/ML_2.ipynb 5/6
10/1/24, 10:25 PM ML_2 - Jupyter Notebook

localhost:8888/notebooks/BE_PRACTICALS/ML_2.ipynb 6/6

You might also like