
Farah Jahangir

Introduction to Data Mining


Project Assignment

Task 1
Part A
Code:
1. Combining Dataset

import os
import pandas as pd
import glob

# Folder path containing your CSV files
folder_path = '/content/drive/MyDrive/dataset'

# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# List to hold all DataFrames
df_list = []

# Loop through each CSV file and load it into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df_list.append(df)

# Combine all DataFrames into one
combined_data = pd.concat(df_list, ignore_index=True)

# Check the first few rows of the combined data
print(combined_data.head())
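
As a quick check that the merge worked as expected, the sizes of the combined DataFrame can be printed; this is an optional sketch that only reuses the csv_files list and combined_data DataFrame created above.

# Optional sanity check (sketch): confirm how many files were merged and the combined size
print(f"Merged {len(csv_files)} CSV files into {combined_data.shape[0]} rows "
      f"and {combined_data.shape[1]} columns")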
2. Dropping Irrelevant Columns

from sklearn.preprocessing import StandardScaler

# Drop the date column and the attributes not used for clustering
data = combined_data.drop(columns=['date', 'humidity9am', 'pressure9am', 'temp9am',
                                   'rain_today', 'rain_tomorrow', 'wind_speed9am', 'cloud9am'])
print(data.head())
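
Before mapping and scaling, it helps to see how many missing readings remain in the kept columns. The later steps assume missing values are marked with the placeholder string 'Blank', so the optional sketch below counts both 'Blank' placeholders and true NaN entries in the data DataFrame from the step above.

# Optional sketch: count 'Blank' placeholders and true missing entries per kept column
print((data == 'Blank').sum())
print(data.isna().sum())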
3. Mapping Values

import pandas as pd

# The dataset loaded below contains the cloud cover column ('cloud3pm') as text categories

# Mapping of cloud cover categories to numerical values (0 to 16)
cloud_cover_mapping = {
    'Fair / Windy': 0, 'Partly Cloudy': 1, 'Partly Cloudy / Windy': 2, 'Cloudy': 3,
    'Cloudy / Windy': 4, 'Mostly Cloudy': 5, 'Mostly Cloudy / Windy': 6, 'Fog': 7,
    'Haze': 8, 'Light Rain': 9, 'Light Rain with Thunder': 10, 'Thunder': 11,
    'Rain': 12, 'Thunder / Windy': 13, 'Heavy T-Storm': 14,
    'Thunder in the Vicinity': 15, 'TStorm': 16
}

# Load the dataset (replace with your actual file path)
df = pd.read_csv('/content/scaled_weather_data.csv')

# Map the 'cloud3pm' column to numerical values using the mapping
df['cloud_cover'] = df['cloud3pm'].map(cloud_cover_mapping)

# Drop the original 'cloud3pm' column with string values
df = df.drop(columns=['cloud3pm'])

# Save the modified DataFrame into a new CSV file
df.to_csv('new_weather_data.csv', index=False)

# Confirm that the data has been saved
print("Data has been mapped and saved to 'new_weather_data.csv'.")
Forming clusters:

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file

# Step 2: Replace 'Blank' values with NaN for numerical columns
df.replace('Blank', np.nan, inplace=True)

# Step 3: Convert all columns to numeric, coercing any non-numeric data to NaN
df = df.apply(pd.to_numeric, errors='coerce')

# Step 4: Impute missing values with the median
df = df.fillna(df.median())

# Step 5: Select only numeric columns for clustering
numeric_data = df.select_dtypes(include=[np.number])

# Step 6: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Step 7: Apply K-means clustering (k=3)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)

# Step 8: Get the cluster labels
df['cluster'] = kmeans.labels_

# Step 9: Report the centroids of the clusters
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=numeric_data.columns)
print("Centroids of the clusters:")
print(centroids)
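
# (Optional sketch, not part of the original steps) The centroids above are in
# standardized (z-score) units; to read them in the original measurement units,
# they can be mapped back with the StandardScaler fitted earlier:
centroids_original = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_),
                                  columns=numeric_data.columns)
print("Centroids in original units:")
print(centroids_original)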

# Step 10: Visualize the clusters using boxplots for selected attributes
selected_columns = ['min_temp', 'max_temp', 'rainfall', 'humidity3pm',
                    'wind_speed3pm', 'pressure3pm']
plt.figure(figsize=(15, 10))

for i, column in enumerate(selected_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='cluster', y=column, data=df)
    plt.title(f'Boxplot of {column} by Cluster')

plt.tight_layout()
plt.show()

# Step 11: Visualize the clusters using a scatter plot (2D projection)
# First, reduce the data to 2D for visualization using PCA (Principal Component Analysis)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Scatter plot of clusters in 2D space
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['cluster'], cmap='viridis', s=50)
plt.title('K-means Clustering (2D PCA projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()
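
The assignment fixes k=3; if one wanted to sanity-check that choice, the silhouette score for a few candidate values of k gives a quick indication. The sketch below is only illustrative and reuses the scaled_data array from the listing above.

from sklearn.metrics import silhouette_score

# Hedged sketch: compare silhouette scores for a few candidate cluster counts
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(scaled_data)
    print(f"k={k}: silhouette score = {silhouette_score(scaled_data, labels):.3f}")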

Results:
K-Means Clustering with K=3

Box plots:
Part B
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load and preprocess the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file

# Step 2: Replace 'Blank' values with NaN for numerical columns
df.replace('Blank', np.nan, inplace=True)

# Step 3: Convert all columns to numeric, coercing any non-numeric data to NaN
df = df.apply(pd.to_numeric, errors='coerce')

# Step 4: Impute missing values with the median
df = df.fillna(df.median())

# Step 5: Select only numeric columns for clustering
numeric_data = df.select_dtypes(include=[np.number])

# Step 6: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Step 7: Apply DBSCAN clustering
# We try different values of eps and min_samples to get between 2 and 15 clusters
# with less than 20% outliers

# Best configuration for DBSCAN (after tuning) -- adjust eps and min_samples
dbscan = DBSCAN(eps=0.5, min_samples=5)  # You can adjust these parameters as needed
dbscan.fit(scaled_data)

# Add cluster labels to the DataFrame
df['dbscan_cluster'] = dbscan.labels_

# Identify the number of outliers (labeled as -1 in DBSCAN)
outliers = np.sum(df['dbscan_cluster'] == -1)
total_points = len(df)
outlier_percentage = outliers / total_points * 100
print(f"Outlier percentage in DBSCAN: {outlier_percentage:.2f}%")

# Check if outliers are below 20% (target condition)
if outlier_percentage > 20:
    print("Outliers exceed 20%, adjusting DBSCAN parameters.")
else:
    print("Outliers are below 20%, proceed to next steps.")

# Step 8: Visualize the DBSCAN clusters using a scatter plot (2D PCA projection)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Scatter plot of DBSCAN clusters
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['dbscan_cluster'], cmap='viridis', s=50)
plt.title('DBSCAN Clustering (2D PCA projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()
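
The comment in Step 7 mentions trying different eps and min_samples values until the result has 2-15 clusters and under 20% outliers, but that search is not shown. The sketch below is one possible way to automate it, reusing the scaled_data array from the listing above; the parameter grids are illustrative, not the values actually used for the reported result.

# Hedged sketch: search for DBSCAN parameters meeting the assignment's conditions
for eps in [0.3, 0.5, 0.8, 1.0, 1.5]:
    for min_samples in [3, 5, 10]:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(scaled_data)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        outlier_pct = np.mean(labels == -1) * 100
        if 2 <= n_clusters <= 15 and outlier_pct < 20:
            print(f"eps={eps}, min_samples={min_samples}: "
                  f"{n_clusters} clusters, {outlier_pct:.1f}% outliers")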

Results:
Visualization of DBSCAN Clustering Algorithm
Task 2

Code:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with actual file path

# Step 2: Preprocess the data
df.replace('Blank', np.nan, inplace=True)        # Handle missing values
df = df.apply(pd.to_numeric, errors='coerce')    # Convert all columns to numeric
df.fillna(df.median(), inplace=True)             # Fill missing values with the median

# Select relevant columns
features = ['min_temp', 'max_temp', 'rainfall', 'wind_speed3pm',
            'humidity3pm', 'pressure3pm', 'cloud_cover']
data = df[features]

# Step 3: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# --- Distance-based Outlier Detection ---
def calculate_distance_outlier_scores(data):
    # Calculate pairwise distances using Euclidean distance
    distances = cdist(data, data, metric='euclidean')
    # Calculate the mean distance of each point to all other points
    mean_distances = distances.mean(axis=1)
    # Normalize to the range (0, 1]: larger score = more isolated = more likely an outlier
    outlier_scores = mean_distances / mean_distances.max()
    return outlier_scores

# --- Density-based Outlier Detection ---
def calculate_density_outlier_scores(data, bandwidth=0.5):
    # Use KernelDensity to estimate density
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(data)
    # Get the log of the density for each point (higher log-density = less likely to be an outlier)
    log_density = kde.score_samples(data)
    # Convert to outlier scores: negate the log-density and divide by the negated maximum
    # log-density, so the densest point scores 1 and lower-density points score above 1
    outlier_scores = -log_density / -log_density.max()
    return outlier_scores

# Step 4: Calculate OLS (outlier scores) for both methods
distance_outlier_scores = calculate_distance_outlier_scores(scaled_data)
density_outlier_scores = calculate_density_outlier_scores(scaled_data)

# Add OLS to the DataFrame
df['distance_OLS'] = distance_outlier_scores
df['density_OLS'] = density_outlier_scores

# Step 5: Sort the dataset by OLS scores and analyze the top/bottom examples
df_sorted_distance = df.sort_values(by='distance_OLS', ascending=False)
df_sorted_density = df.sort_values(by='density_OLS', ascending=False)

# Top 3 likely outliers
print("Top 3 outliers based on distance-based OLS:")
print(df_sorted_distance.head(3))
print("Top 3 outliers based on density-based OLS:")
print(df_sorted_density.head(3))

# Bottom example (most normal)
print("Most normal (bottom) based on distance-based OLS:")
print(df_sorted_distance.tail(1))
print("Most normal (bottom) based on density-based OLS:")
print(df_sorted_density.tail(1))
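
The two scores end up on different scales (the distance-based scores lie in (0, 1], while the density-based scores start at 1 for the most typical day and grow from there), so a quick summary of both columns makes the printed rows easier to read. This is an optional sketch over the df DataFrame built above.

# Optional sketch: summarize the ranges of both outlier-score columns
print(df[['distance_OLS', 'density_OLS']].describe())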

Results of Outlier Detection Techniques


Top 3 outliers based on distance-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
2114      73.0      83.0      20.6           20.0         94.0        29.39     75.0          4.0      1.000000     2.000083
1212      64.0      74.0      18.8            6.0         82.0        29.97     74.0          4.0      0.889223     1.988138
3470      73.0      78.0      18.2            7.0         96.0        29.99     75.0          4.0      0.866567     1.988138

Top 3 outliers based on density-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
2114      73.0      83.0      20.6           20.0         94.0        29.39     75.0          4.0      1.000000     2.000083
3024       0.0      79.0       1.2            7.0         88.0        29.83     77.0         15.0      0.383273     2.000083
2815      75.0      81.0      11.1           22.0         90.0        29.74     77.0          4.0      0.560338     2.000083

Most normal (bottom) based on distance-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
2128      63.0      81.0       0.0           10.0         54.0        29.93     80.0          4.0      0.132443      1.27591

Most normal (bottom) based on density-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
1353      76.0      91.0       0.0           10.0         55.0        29.89     90.0          6.0      0.149656          1.0


Visualization Results

Comparison Between Techniques


Code:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 7: Visualize the OLS scores

# Plot the distribution of distance-based and density-based OLS
plt.figure(figsize=(14, 6))

# Distance-based OLS distribution
plt.subplot(1, 2, 1)
sns.histplot(df['distance_OLS'], kde=True, color='blue', bins=30)
plt.title('Distribution of Distance-based OLS Scores')
plt.xlabel('Distance-based OLS Score')
plt.ylabel('Frequency')

# Density-based OLS distribution
plt.subplot(1, 2, 2)
sns.histplot(df['density_OLS'], kde=True, color='green', bins=30)
plt.title('Distribution of Density-based OLS Scores')
plt.xlabel('Density-based OLS Score')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Step 8: Visualize the top 3 outliers and the bottom example

# Top 3 outliers based on distance-based OLS
top_3_distance_outliers = df_sorted_distance.head(3)
top_3_distance_outliers = top_3_distance_outliers[features + ['distance_OLS']]

# Top 3 outliers based on density-based OLS
top_3_density_outliers = df_sorted_density.head(3)
top_3_density_outliers = top_3_density_outliers[features + ['density_OLS']]

# Plot top 3 distance-based outliers
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_distance_outliers,
                color='red', s=100, label='Top 3 Distance-based Outliers')
plt.title('Top 3 Distance-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

# Plot top 3 density-based outliers
plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_density_outliers,
                color='orange', s=100, label='Top 3 Density-based Outliers')
plt.title('Top 3 Density-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

plt.tight_layout()
plt.show()

# Step 9: Scatter plot comparing distance-based and density-based OLS scores
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['distance_OLS'], y=df['density_OLS'], color='purple')
plt.title('Comparison of Distance-based vs Density-based OLS Scores')
plt.xlabel('Distance-based OLS Score')
plt.ylabel('Density-based OLS Score')
plt.tight_layout()
plt.show()

# Step 10: Visualize the most normal day (bottom example) for both OLS methods

# Most normal day based on distance-based OLS
most_normal_distance = df_sorted_distance.tail(1)

# Most normal day based on density-based OLS
most_normal_density = df_sorted_density.tail(1)

# Plot the most normal day for both distance-based and density-based OLS
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_distance,
                color='blue', s=100, label='Most Normal (Distance-based)')
plt.title('Most Normal Day (Distance-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_density,
                color='green', s=100, label='Most Normal (Density-based)')
plt.title('Most Normal Day (Density-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

plt.tight_layout()
plt.show()
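
Beyond the scatter plot, a single rank-correlation number can summarize how strongly the two techniques agree on which days are unusual. The sketch below is an optional addition using the distance_OLS and density_OLS columns computed earlier; it assumes scipy is available in the environment.

from scipy.stats import spearmanr

# Optional sketch: quantify agreement between the two outlier rankings
corr, p_value = spearmanr(df['distance_OLS'], df['density_OLS'])
print(f"Spearman rank correlation between distance-based and density-based OLS: "
      f"{corr:.3f} (p = {p_value:.3g})")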

Results
