
Farah Jahangir

Introduction to Data Mining


Project Assignment

Task 1
Part A
Code:
1. Combining Dataset

import os
import pandas as pd
import glob

# Folder path containing your CSV files
folder_path = '/content/drive/MyDrive/dataset'

# Use glob to find all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

# List to hold all DataFrames
df_list = []

# Loop through each CSV file and load it into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    df_list.append(df)

# Combine all DataFrames into one
combined_data = pd.concat(df_list, ignore_index=True)

# Check the first few rows of the combined data
print(combined_data.head())
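
As a quick check that the merge worked as expected, the sizes of the combined DataFrame can be printed; this is an optional sketch that only reuses the csv_files list and combined_data DataFrame created above.

# Optional sanity check (sketch): confirm how many files were merged and the combined size
print(f"Merged {len(csv_files)} CSV files into {combined_data.shape[0]} rows "
      f"and {combined_data.shape[1]} columns")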
2. Dropping Irrelevant Columns

from sklearn.preprocessing import StandardScaler

# Drop the date column and the attributes not used for clustering
data = combined_data.drop(columns=['date', 'humidity9am', 'pressure9am', 'temp9am',
                                   'rain_today', 'rain_tomorrow', 'wind_speed9am', 'cloud9am'])
print(data.head())
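
Before mapping and scaling, it helps to see how many missing readings remain in the kept columns. The later steps assume missing values are marked with the placeholder string 'Blank', so the optional sketch below counts both 'Blank' placeholders and true NaN entries in the data DataFrame from the step above.

# Optional sketch: count 'Blank' placeholders and true missing entries per kept column
print((data == 'Blank').sum())
print(data.isna().sum())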
3. Mapping Values

import pandas as pd

# The dataset loaded below contains the cloud cover column ('cloud3pm') as text categories

# Mapping of cloud cover categories to numerical values (0 to 16)
cloud_cover_mapping = {
    'Fair / Windy': 0, 'Partly Cloudy': 1, 'Partly Cloudy / Windy': 2, 'Cloudy': 3,
    'Cloudy / Windy': 4, 'Mostly Cloudy': 5, 'Mostly Cloudy / Windy': 6, 'Fog': 7,
    'Haze': 8, 'Light Rain': 9, 'Light Rain with Thunder': 10, 'Thunder': 11,
    'Rain': 12, 'Thunder / Windy': 13, 'Heavy T-Storm': 14,
    'Thunder in the Vicinity': 15, 'TStorm': 16
}

# Load the dataset (replace with your actual file path)
df = pd.read_csv('/content/scaled_weather_data.csv')

# Map the 'cloud3pm' column to numerical values using the mapping
df['cloud_cover'] = df['cloud3pm'].map(cloud_cover_mapping)

# Drop the original 'cloud3pm' column with string values
df = df.drop(columns=['cloud3pm'])

# Save the modified DataFrame into a new CSV file
df.to_csv('new_weather_data.csv', index=False)

# Confirm that the data has been saved
print("Data has been mapped and saved to 'new_weather_data.csv'.")
Forming clusters:

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file

# Step 2: Replace 'Blank' values with NaN for numerical columns
df.replace('Blank', np.nan, inplace=True)

# Step 3: Convert all columns to numeric, coercing any non-numeric data to NaN
df = df.apply(pd.to_numeric, errors='coerce')

# Step 4: Impute missing values with the median
df = df.fillna(df.median())

# Step 5: Select only numeric columns for clustering
numeric_data = df.select_dtypes(include=[np.number])

# Step 6: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Step 7: Apply K-means clustering (k=3)
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)

# Step 8: Get the cluster labels
df['cluster'] = kmeans.labels_

# Step 9: Report the centroids of the clusters
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=numeric_data.columns)
print("Centroids of the clusters:")
print(centroids)
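
# (Optional sketch, not part of the original steps) The centroids above are in
# standardized (z-score) units; to read them in the original measurement units,
# they can be mapped back with the StandardScaler fitted earlier:
centroids_original = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_),
                                  columns=numeric_data.columns)
print("Centroids in original units:")
print(centroids_original)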

# Step 10: Visualize the clusters using boxplots for selected attributes
selected_columns = ['min_temp', 'max_temp', 'rainfall', 'humidity3pm',
                    'wind_speed3pm', 'pressure3pm']
plt.figure(figsize=(15, 10))

for i, column in enumerate(selected_columns, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='cluster', y=column, data=df)
    plt.title(f'Boxplot of {column} by Cluster')

plt.tight_layout()
plt.show()

# Step 11: Visualize the clusters using a scatter plot (2D projection)
# First, reduce the data to 2D for visualization using PCA (Principal Component Analysis)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Scatter plot of clusters in 2D space
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['cluster'], cmap='viridis', s=50)
plt.title('K-means Clustering (2D PCA projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()
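
The assignment fixes k=3; if one wanted to sanity-check that choice, the silhouette score for a few candidate values of k gives a quick indication. The sketch below is only illustrative and reuses the scaled_data array from the listing above.

from sklearn.metrics import silhouette_score

# Hedged sketch: compare silhouette scores for a few candidate cluster counts
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(scaled_data)
    print(f"k={k}: silhouette score = {silhouette_score(scaled_data, labels):.3f}")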

Results:
K-Means Clustering with K=3

Box plots:
Part B
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load and preprocess the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with the actual path to your CSV file

# Step 2: Replace 'Blank' values with NaN for numerical columns
df.replace('Blank', np.nan, inplace=True)

# Step 3: Convert all columns to numeric, coercing any non-numeric data to NaN
df = df.apply(pd.to_numeric, errors='coerce')

# Step 4: Impute missing values with the median
df = df.fillna(df.median())

# Step 5: Select only numeric columns for clustering
numeric_data = df.select_dtypes(include=[np.number])

# Step 6: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data)

# Step 7: Apply DBSCAN clustering
# We try different values of eps and min_samples to get between 2 and 15 clusters
# with less than 20% outliers

# Best configuration for DBSCAN (after tuning) -- adjust eps and min_samples
dbscan = DBSCAN(eps=0.5, min_samples=5)  # You can adjust these parameters as needed
dbscan.fit(scaled_data)

# Add cluster labels to the DataFrame
df['dbscan_cluster'] = dbscan.labels_

# Identify the number of outliers (labeled as -1 in DBSCAN)
outliers = np.sum(df['dbscan_cluster'] == -1)
total_points = len(df)
outlier_percentage = outliers / total_points * 100
print(f"Outlier percentage in DBSCAN: {outlier_percentage:.2f}%")

# Check if outliers are below 20% (target condition)
if outlier_percentage > 20:
    print("Outliers exceed 20%, adjusting DBSCAN parameters.")
else:
    print("Outliers are below 20%, proceed to next steps.")

# Step 8: Visualize the DBSCAN clusters using a scatter plot (2D PCA projection)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Scatter plot of DBSCAN clusters
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=df['dbscan_cluster'], cmap='viridis', s=50)
plt.title('DBSCAN Clustering (2D PCA projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()
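
The comment in Step 7 mentions trying different eps and min_samples values until the result has 2-15 clusters and under 20% outliers, but that search is not shown. The sketch below is one possible way to automate it, reusing the scaled_data array from the listing above; the parameter grids are illustrative, not the values actually used for the reported result.

# Hedged sketch: search for DBSCAN parameters meeting the assignment's conditions
for eps in [0.3, 0.5, 0.8, 1.0, 1.5]:
    for min_samples in [3, 5, 10]:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(scaled_data)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        outlier_pct = np.mean(labels == -1) * 100
        if 2 <= n_clusters <= 15 and outlier_pct < 20:
            print(f"eps={eps}, min_samples={min_samples}: "
                  f"{n_clusters} clusters, {outlier_pct:.1f}% outliers")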

Results:
Visualization of DBSCAN Clustering Algorithm
Task 2

Code:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
# Step 1: Load the dataset
df = pd.read_csv('/content/modified_weather_data.csv')  # Replace with actual file path

# Step 2: Preprocess the data
df.replace('Blank', np.nan, inplace=True)        # Handle missing values
df = df.apply(pd.to_numeric, errors='coerce')    # Convert all columns to numeric
df.fillna(df.median(), inplace=True)             # Fill missing values with the median

# Select relevant columns
features = ['min_temp', 'max_temp', 'rainfall', 'wind_speed3pm',
            'humidity3pm', 'pressure3pm', 'cloud_cover']
data = df[features]

# Step 3: Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# --- Distance-based Outlier Detection ---
def calculate_distance_outlier_scores(data):
    # Calculate pairwise distances using Euclidean distance
    distances = cdist(data, data, metric='euclidean')
    # Calculate the mean distance of each point to all other points
    mean_distances = distances.mean(axis=1)
    # Normalize to the range (0, 1]: larger score = more isolated = more likely an outlier
    outlier_scores = mean_distances / mean_distances.max()
    return outlier_scores

# --- Density-based Outlier Detection ---
def calculate_density_outlier_scores(data, bandwidth=0.5):
    # Use KernelDensity to estimate density
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(data)
    # Get the log of the density for each point (higher log-density = less likely to be an outlier)
    log_density = kde.score_samples(data)
    # Convert to outlier scores: negate the log-density and divide by the negated maximum
    # log-density, so the densest point scores 1 and lower-density points score above 1
    outlier_scores = -log_density / -log_density.max()
    return outlier_scores

# Step 4: Calculate OLS (outlier scores) for both methods
distance_outlier_scores = calculate_distance_outlier_scores(scaled_data)
density_outlier_scores = calculate_density_outlier_scores(scaled_data)

# Add OLS to the DataFrame
df['distance_OLS'] = distance_outlier_scores
df['density_OLS'] = density_outlier_scores

# Step 5: Sort the dataset by OLS scores and analyze the top/bottom examples
df_sorted_distance = df.sort_values(by='distance_OLS', ascending=False)
df_sorted_density = df.sort_values(by='density_OLS', ascending=False)

# Top 3 likely outliers
print("Top 3 outliers based on distance-based OLS:")
print(df_sorted_distance.head(3))
print("Top 3 outliers based on density-based OLS:")
print(df_sorted_density.head(3))

# Bottom example (most normal)
print("Most normal (bottom) based on distance-based OLS:")
print(df_sorted_distance.tail(1))
print("Most normal (bottom) based on density-based OLS:")
print(df_sorted_density.tail(1))
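
The two scores end up on different scales (the distance-based scores lie in (0, 1], while the density-based scores start at 1 for the most typical day and grow from there), so a quick summary of both columns makes the printed rows easier to read. This is an optional sketch over the df DataFrame built above.

# Optional sketch: summarize the ranges of both outlier-score columns
print(df[['distance_OLS', 'density_OLS']].describe())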

Results of Outlier Detection Techniques


Top 3 outliers based on distance-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
2114      73.0      83.0      20.6           20.0         94.0        29.39     75.0          4.0      1.000000     2.000083
1212      64.0      74.0      18.8            6.0         82.0        29.97     74.0          4.0      0.889223     1.988138
3470      73.0      78.0      18.2            7.0         96.0        29.99     75.0          4.0      0.866567     1.988138

Top 3 outliers based on density-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
2114      73.0      83.0      20.6           20.0         94.0        29.39     75.0          4.0      1.000000     2.000083
3024       0.0      79.0       1.2            7.0         88.0        29.83     77.0         15.0      0.383273     2.000083
2815      75.0      81.0      11.1           22.0         90.0        29.74     77.0          4.0      0.560338     2.000083

Most normal (bottom) based on distance-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
2128      63.0      81.0       0.0           10.0         54.0        29.93     80.0          4.0      0.132443      1.27591

Most normal (bottom) based on density-based OLS:

      min_temp  max_temp  rainfall  wind_speed3pm  humidity3pm  pressure3pm  temp3pm  cloud_cover  distance_OLS  density_OLS
1353      76.0      91.0       0.0           10.0         55.0        29.89     90.0          6.0      0.149656          1.0


Visualization Results

Comparison Between Techniques


Code:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 7: Visualize the OLS scores

# Plot the distribution of distance-based and density-based OLS
plt.figure(figsize=(14, 6))

# Distance-based OLS distribution
plt.subplot(1, 2, 1)
sns.histplot(df['distance_OLS'], kde=True, color='blue', bins=30)
plt.title('Distribution of Distance-based OLS Scores')
plt.xlabel('Distance-based OLS Score')
plt.ylabel('Frequency')

# Density-based OLS distribution
plt.subplot(1, 2, 2)
sns.histplot(df['density_OLS'], kde=True, color='green', bins=30)
plt.title('Distribution of Density-based OLS Scores')
plt.xlabel('Density-based OLS Score')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Step 8: Visualize the top 3 outliers and the bottom example

# Top 3 outliers based on distance-based OLS
top_3_distance_outliers = df_sorted_distance.head(3)
top_3_distance_outliers = top_3_distance_outliers[features + ['distance_OLS']]

# Top 3 outliers based on density-based OLS
top_3_density_outliers = df_sorted_density.head(3)
top_3_density_outliers = top_3_density_outliers[features + ['density_OLS']]

# Plot top 3 distance-based outliers
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_distance_outliers,
                color='red', s=100, label='Top 3 Distance-based Outliers')
plt.title('Top 3 Distance-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

# Plot top 3 density-based outliers
plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=top_3_density_outliers,
                color='orange', s=100, label='Top 3 Density-based Outliers')
plt.title('Top 3 Density-based Outliers')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

plt.tight_layout()
plt.show()

# Step 9: Scatter plot comparing distance-based and density-based OLS scores
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['distance_OLS'], y=df['density_OLS'], color='purple')
plt.title('Comparison of Distance-based vs Density-based OLS Scores')
plt.xlabel('Distance-based OLS Score')
plt.ylabel('Density-based OLS Score')
plt.tight_layout()
plt.show()

# Step 10: Visualize the most normal day (bottom example) for both OLS methods

# Most normal day based on distance-based OLS
most_normal_distance = df_sorted_distance.tail(1)

# Most normal day based on density-based OLS
most_normal_density = df_sorted_density.tail(1)

# Plot the most normal day for both distance-based and density-based OLS
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_distance,
                color='blue', s=100, label='Most Normal (Distance-based)')
plt.title('Most Normal Day (Distance-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

plt.subplot(1, 2, 2)
sns.scatterplot(x='min_temp', y='max_temp', data=most_normal_density,
                color='green', s=100, label='Most Normal (Density-based)')
plt.title('Most Normal Day (Density-based OLS)')
plt.xlabel('Min Temp (°F)')
plt.ylabel('Max Temp (°F)')

plt.tight_layout()
plt.show()
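
Beyond the scatter plot, a single rank-correlation number can summarize how strongly the two techniques agree on which days are unusual. The sketch below is an optional addition using the distance_OLS and density_OLS columns computed earlier; it assumes scipy is available in the environment.

from scipy.stats import spearmanr

# Optional sketch: quantify agreement between the two outlier rankings
corr, p_value = spearmanr(df['distance_OLS'], df['density_OLS'])
print(f"Spearman rank correlation between distance-based and density-based OLS: "
      f"{corr:.3f} (p = {p_value:.3g})")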

Results
