Linear Regression Test
Linear Regression Test
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
f=pd.read_csv("Food_Delivery_Times.csv")
f
Out[11]: Order_ID Distance_km Weather Traffic_Level Time_of_Day Vehicle_Type Preparation_Time_min Courier_Experience_yrs Delivery_Time_min
... ... ... ... ... ... ... ... ... ...
In [12]:
f.head()
Out[12]: Order_ID Distance_km Weather Traffic_Level Time_of_Day Vehicle_Type Preparation_Time_min Courier_Experience_yrs Delivery_Time_min
In [13]:
f.describe()
In [25]:
f.isnull().sum()
Out[25]: Order_ID 0
Distance_km 0
Weather 0
Traffic_Level 0
Time_of_Day 0
Vehicle_Type 0
Preparation_Time_min 0
Courier_Experience_yrs 0
Delivery_Time_min 0
dtype: int64
In [15]:
f.duplicated().sum()
Out[15]: 0
In [16]:
f.shape
Out[16]: (1000, 9)
In [17]:
f.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Order_ID 1000 non-null int64
1 Distance_km 1000 non-null float64
2 Weather 970 non-null object
3 Traffic_Level 970 non-null object
4 Time_of_Day 970 non-null object
5 Vehicle_Type 1000 non-null object
6 Preparation_Time_min 1000 non-null int64
7 Courier_Experience_yrs 970 non-null float64
8 Delivery_Time_min 1000 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 70.4+ KB
In [22]:
from sklearn.preprocessing import LabelEncoder
f["Weather"]=LabelEncoder().fit_transform(f["Weather"])
f["Traffic_Level"]=LabelEncoder().fit_transform(f["Traffic_Level"])
f["Time_of_Day"]=LabelEncoder().fit_transform(f["Time_of_Day"])
f["Vehicle_Type"]=LabelEncoder().fit_transform(f["Vehicle_Type"])
In [24]:
f["Courier_Experience_yrs"].fillna(f["Courier_Experience_yrs"].mean(),inplace=True)
visualization
In [40]:
sns.scatterplot(x="Distance_km",y="Delivery_Time_min",data=f)
plt.title("scatter plot of hours studied vs exam score")
plt.xlabel("DISTANCE(KM)")
plt.ylabel("DELIVERY TIME")
plt.show()
In [26]:
correlation=f.corr()
print("correlation Matrix:")
print(correlation)
correlation Matrix:
Order_ID Distance_km Weather Traffic_Level \
Order_ID 1.000000 -0.024483 -0.035785 -0.050845
Distance_km -0.024483 1.000000 0.029756 -0.036602
Weather -0.035785 0.029756 1.000000 -0.031301
Traffic_Level -0.050845 -0.036602 -0.031301 1.000000
Time_of_Day -0.027034 0.009034 0.006595 0.022550
Vehicle_Type -0.045030 0.003319 -0.019231 0.032593
Preparation_Time_min -0.035100 -0.009037 -0.039429 0.004945
Courier_Experience_yrs 0.012933 -0.007713 0.037972 -0.037431
Delivery_Time_min -0.036650 0.780998 0.110254 -0.087523
Courier_Experience_yrs Delivery_Time_min
Order_ID 0.012933 -0.036650
Distance_km -0.007713 0.780998
Weather 0.037972 0.110254
Traffic_Level -0.037431 -0.087523
Time_of_Day -0.057384 0.025133
Vehicle_Type -0.002504 -0.006629
Preparation_Time_min -0.030353 0.307350
Courier_Experience_yrs 1.000000 -0.089066
Delivery_Time_min -0.089066 1.000000
In [28]:
x=f[["Distance_km"]]
y=f["Delivery_Time_min"]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
In [29]:
model=LinearRegression()
model.fit(x_train,y_train)
Out[29]: LinearRegression()
In [30]:
print("\nModel coeeficient:")
print(f"Intercept:{model.intercept_}")
print(f"slope:{model.coef_[0]}")
Model coeeficient:
Intercept:26.585748176869693
slope:3.0164695806793005
In [31]:
y_pred=model.predict(x_test)
Model Evaluation:
Mean Squared Error (MSE): 158.16196727280166
R-squared (R2): 0.6471386683659509
Mean Absolute Error (MAE): 9.580954148917813
INTERPRETING R2
In [42]:
if r2==1:
print("THE MODEL PERFECTLY EXPLAINS THE VARIANCE IN SALES REVENUE")
elif r2>0.5:
print("THE MODEL EXPLAINS A MODERATE PROPORTION OF THE VARIANCE IN SALES REVENUE")
elif r2>0.8:
print("THE MODEL EXPLAINS A STRONG PROPORTION OF THE VARIANCE IN SALES REVENUE")
elif r2>0.5:
print("THE MODEL EXPLAINS A MODERATE PROPORTION OF THE VARIANCE IN SALES REVENUE")
else:
print("THE MODEL EXPLAINS A WEAK PROPORTION OF THE VARIANCE IN SALES REVENUE")
In [33]:
plt.scatter(x, y, color="blue", label="Data Points")
plt.plot(x, model.predict(x), color="red", label="Regression Line")
plt.title("Simple Linear Regression: Hours Studied vs Exam Score")
plt.xlabel("Hours Studied")
plt.ylabel("Exam Score")
plt.legend()
plt.show()
In [ ]: