IDM Assignment
Task 1
Part A
Code:
1. Combining Dataset
import os
import pandas as pd
import glob

# Folder path containing your CSV files
folder_path = '/content/drive/MyDrive/dataset'

# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# List to hold all dataframes
df_list = []

# Loop through each CSV file and load it into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df_list.append(df)

# Combine all DataFrames into one
combined_data = pd.concat(df_list, ignore_index=True)

# Check the first few rows of the combined data
print(combined_data.head())
2. Dropping Irrelevant Columns
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming 'data' is your original DataFrame that contains the cloud cover column (named 'cloud3pm')
# Load your dataset (replace 'your_file.csv' with your actual file path)
df = pd.read_csv('/content/scaled_weather_data.csv')
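# (Note: the excerpt above only loads the data; the column-drop step itself is not
#  shown. A minimal sketch of the intended step, assuming the irrelevant columns
#  include the cloud-cover field 'cloud3pm' mentioned above; the exact list of
#  dropped columns is an assumption, not taken from the assignment.)
irrelevant_columns = ['cloud3pm']  # hypothetical list of columns to drop
df = df.drop(columns=irrelevant_columns, errors='ignore')
print(df.columns)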
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file
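# (Steps 2-11 are omitted from this excerpt. A hedged sketch of the omitted
#  preprocessing and clustering steps, assuming k=3 as stated in the Results
#  section and the same feature columns used for the boxplots below:)
feature_cols = ['min_temp', 'max_temp', 'rainfall', 'humidity3pm', 'wind_speed3pm', 'pressure3pm']
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')
df[feature_cols] = df[feature_cols].fillna(df[feature_cols].median())
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[feature_cols])
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(scaled_data)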
# Step 12: Visualize the clusters using boxplots for selected attributes
selected_columns = ['min_temp', 'max_temp', 'rainfall', 'humidity3pm',
'wind_speed3pm', 'pressure3pm']
plt.figure(figsize=(15, 10))
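# (The boxplot loop itself is not included in this excerpt. A plausible sketch,
#  assuming the K-Means labels are stored in a 'cluster' column as in the sketch above:)
for i, col in enumerate(selected_columns, start=1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='cluster', y=col, data=df)
    plt.title(f'{col} by cluster')
plt.tight_layout()
plt.show()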
# Step 13: Visualize the clusters using scatter plots (for 2D projection)
# First, let's reduce the data to 2D for visualization using PCA (Principal Component Analysis)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
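# (The scatter plot call is omitted from this excerpt; a hedged sketch of the
#  intended visualization, colouring points by the assumed 'cluster' labels:)
plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=df['cluster'], palette='viridis')
plt.title('K-Means Clusters (K=3) in 2D PCA Projection')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()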
Results:
K-Means Clustering with K=3
Box plots:
Part B
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import pair_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
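# (Steps 1-7 of Part B are omitted from this excerpt. A hedged sketch of the
#  omitted loading, scaling and clustering steps; the eps and min_samples values
#  are assumptions, not necessarily those used in the assignment:)
df = pd.read_csv('/content/modified_weather_data.csv')
feature_cols = ['min_temp', 'max_temp', 'rainfall', 'humidity3pm', 'wind_speed3pm', 'pressure3pm']
data = df[feature_cols].apply(pd.to_numeric, errors='coerce')
data = data.fillna(data.median())
scaled_data = StandardScaler().fit_transform(data)
dbscan = DBSCAN(eps=0.5, min_samples=5)  # assumed hyperparameters
labels = dbscan.fit_predict(scaled_data)
print(Counter(labels))  # cluster sizes, with -1 marking noise points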
# Step 8: Visualize the DBSCAN clusters using a scatter plot (2D PCA projection)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
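# (The scatter plot itself is not shown in the excerpt; a plausible sketch,
#  colouring points by the DBSCAN labels from the sketch above:)
plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_data[:, 0], y=pca_data[:, 1], hue=labels, palette='tab10')
plt.title('DBSCAN Clusters in 2D PCA Projection')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()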
Results:
Visualization of DBSCAN Clustering Algorithm
Task 2
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns  # needed for the scatter plots in the visualization step below
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with actual file path
# Step 2: Preprocess the data
df.replace('Blank', np.nan, inplace=True) # Handle missing values
df = df.apply(pd.to_numeric, errors='coerce')  # Convert all columns to numeric
df.fillna(df.median(), inplace=True) # Fill missing values with median
# Select relevant columns
features = ['min_temp', 'max_temp', 'rainfall', 'wind_speed3pm', 'humidity3pm', 'pressure3pm', 'cloud_cover']
data = df[features]
# Step 3: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
# --- Distance-based Outlier Detection ---
def calculate_distance_outlier_scores(data, threshold=2):
    # Calculate pairwise distances using Euclidean distance
    distances = cdist(data, data, metric='euclidean')
    # Calculate the mean distance for each point
    mean_distances = distances.mean(axis=1)
    # Outlier scores based on distance threshold
    outlier_scores = mean_distances / mean_distances.max()  # Normalize to range [0, 1]
    return outlier_scores
# --- Density-based Outlier Detection ---
def calculate_density_outlier_scores(data, bandwidth=0.5):
    # Use KernelDensity to estimate density
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(data)
    # Get the log of the density for each point
    log_density = kde.score_samples(data)
    # Convert to outlier scores (higher log_density = less likely to be an outlier)
    outlier_scores = -log_density / (-log_density).max()  # Normalize to range [0, 1]
    return outlier_scores
# Step 4: Calculate OLS for both methods
distance_outlier_scores = calculate_distance_outlier_scores(scaled_data)
density_outlier_scores = calculate_density_outlier_scores(scaled_data)
# Add OLS to dataframe
df['distance_OLS'] = distance_outlier_scores
df['density_OLS'] = density_outlier_scores
# Step 5: Sort the dataset by OLS scores and analyze the top/bottom examples
df_sorted_distance = df.sort_values(by='distance_OLS', ascending=False)
df_sorted_density = df.sort_values(by='density_OLS', ascending=False)
# Top 3 likely outliers
print("Top 3 outliers based on distance-based OLS:")
print(df_sorted_distance.head(3))
print("Top 3 outliers based on density-based OLS:")
print(df_sorted_density.head(3))
# Bottom example (most normal)
print("Most normal (bottom) based on distance-based OLS:")
print(df_sorted_distance.tail(1))
print("Most normal (bottom) based on density-based OLS:")
print(df_sorted_density.tail(1))
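# (The lines that select the rows plotted in the visualization below are not shown
#  in this excerpt. A sketch consistent with the head(3)/tail(1) prints above:)
top_3_distance_outliers = df_sorted_distance.head(3)
top_3_density_outliers = df_sorted_density.head(3)
most_normal_distance = df_sorted_distance.tail(1)
most_normal_density = df_sorted_density.tail(1)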
Visualization Results
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_distance_outliers,
color='red', s=100, label='Top 3 Distance-based Outliers')
plt.title('Top 3 Distance-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
plt.tight_layout()
plt.show()
# Step 10: Visualize the most normal day (bottom example) for both OLS
methods
# Plot most normal day for both distance-based and density-based OLS
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_distance,
color='blue', s=100, label='Most Normal (Distance-based)')
plt.title('Most Normal Day (Distance-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_density,
color='green', s=100, label='Most Normal (Density-based)')
plt.title('Most Normal Day (Density-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')
plt.tight_layout()
plt.show()
Results