Document Link → https://tinyurl.com/data-mining-exp
(Note: If the link expires before tomorrow evening, create a new one before the practical and verify that it works.)
Datasets [Disclaimer: If datasets different from the given ones are provided, use the Gemini AI support in Colab to adapt this code to the new dataset, using this code as a reference.]
Experiment - 2 → Apriori Algorithm:
# Experiment 2: missing-value handling demo + Apriori frequent-itemset mining.
import pandas as pd, numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

# --- Part 1: missing-value imputation demo ---
# FIX(review): the original rebound `df` to the one-hot basket below,
# silently discarding this cleaned frame; it now keeps its own name so
# both results survive to the end of the script.
clean_df = pd.DataFrame({
    'Item': ['Milk', 'Bread', 'Eggs', np.nan, 'Butter', 'Cheese'],
    'Price': [2.5, 1.0, np.nan, 3.0, 2.0, np.nan],
    'Category': ['Dairy', 'Bakery', np.nan, 'Dairy', 'Dairy', 'Dairy'],
    'Stock': [10, np.nan, 30, 40, np.nan, 20]
})
# Numeric columns: impute with the median (robust to outliers).
clean_df['Price'] = clean_df['Price'].fillna(clean_df['Price'].median())
clean_df['Stock'] = clean_df['Stock'].fillna(clean_df['Stock'].median())
# Categorical columns: impute with the mode / a sentinel label.
clean_df['Category'] = clean_df['Category'].fillna(clean_df['Category'].mode()[0])
clean_df['Item'] = clean_df['Item'].fillna("Unknown")

# --- Part 2: Apriori on a toy transaction set ---
transactions = [['Bread', 'Milk'], ['Bread', 'Butter'], ['Milk', 'Butter'],
                ['Bread', 'Milk', 'Butter'], ['Milk', 'Butter']]
items = sorted({item for t in transactions for item in t})
# Boolean one-hot encoding: one row per transaction, one column per item.
df = pd.DataFrame([{i: i in t for i in items} for t in transactions])
# Itemsets appearing in at least 40% of transactions.
freq_items = apriori(df, min_support=0.4, use_colnames=True)
rules = association_rules(freq_items, metric="confidence",
                          min_threshold=0.1)
top_rules = rules.sort_values("confidence", ascending=False).head(6)
print("Frequent Itemsets:\n", freq_items)
print("\nTop 6 Association Rules:\n", top_rules[['antecedents',
      'consequents', 'support', 'confidence', 'lift']])
Experiment - 3 → FPGrowth Algorithm:
# Experiment 3: FP-Growth frequent-itemset mining on transactions from exp3.csv.
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Each CSV row is one transaction; drop the NaN padding cells on short rows.
df = pd.read_csv("exp3.csv", header=None)
transactions = [row.dropna().tolist() for _, row in df.iterrows()]

# Boolean one-hot matrix: rows = transactions, columns = distinct items.
te = TransactionEncoder()
onehot = te.fit_transform(transactions)
df_encoded = pd.DataFrame(onehot, columns=te.columns_)

# Very low support threshold (1%) so rare itemsets are still mined,
# then keep only rules with lift above 1.2 (positively correlated).
freq_items = fpgrowth(df_encoded, min_support=0.01, use_colnames=True)
rules = association_rules(freq_items, metric="lift", min_threshold=1.2)
print("Frequent Itemsets:\n", freq_items)
print("\nAssociation Rules:\n", rules)
Experiment - 4 → Naive Bayes Algorithm (Dataset will be downloaded while
running the code):
# Experiment 4: Gaussian Naive Bayes on the play-sheet weather dataset.
import kagglehub
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc, confusion_matrix

# Download the dataset (requires network on first run).
ds_path = kagglehub.dataset_download('mshanasl/playsheet-ds')
df = pd.read_csv(ds_path + '/playsheet_dataset.csv')

# FIX(review): encode each categorical column with its OWN encoder so every
# mapping survives — the original refit a single LabelEncoder per column,
# keeping only the last column's mapping.
encoders = {}
for col in ['Play', 'Outlook', 'Temp', 'Humidity', 'Windy']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

features = ['Outlook', 'Temp', 'Humidity', 'Windy']
X, y = df[features], df['Play']

model = GaussianNB()
model.fit(X, y)

# NOTE: accuracy/ROC below are computed on the training data (no hold-out split).
acc = model.score(X, y)
# FIX(review): predict a single sample as a DataFrame so feature names match
# the fitted data (a bare nested list triggers a feature-name warning).
sample = pd.DataFrame([[0, 0, 1, 1]], columns=features)
pred = model.predict(sample)
probs = model.predict_proba(X)[:, 1]  # P(class 1) for the ROC curve
fpr, tpr, _ = roc_curve(y, probs)
roc_auc = auc(fpr, tpr)
cm = confusion_matrix(y, model.predict(X))
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Prediction (0: No Rain, 1: Rain): {pred}")
print(f"ROC AUC: {roc_auc:.2f}")
(Note: Plots for the experiment are optional when writing in the paper but required when showing the output — hence they are listed separately.)
# Plots for Experiment 4 (ROC curve + confusion-matrix heatmap).
# FIX(review): plt and sns were used here without ever being imported.
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Left panel: ROC curve against the chance diagonal.
ax[0].plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}', color='darkorange')
ax[0].plot([0, 1], [0, 1], linestyle='--', color='navy')
ax[0].set(title='ROC Curve', xlabel='FPR', ylabel='TPR')
ax[0].legend()
# Right panel: confusion matrix with integer annotations.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Rain',
            'Rain'], yticklabels=['No Rain', 'Rain'], ax=ax[1])
ax[1].set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')
plt.tight_layout()
plt.show()
Experiment - 5 → Decision Tree Algorithm:
# Experiment 5: Decision tree classifier on a loan-approval dataset.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# FIX(review): the original import was line-wrapped without parentheses,
# which is a SyntaxError as transcribed.
from sklearn.metrics import (confusion_matrix, accuracy_score, roc_curve,
                             auc)

df = pd.read_csv("exp5.csv")
# One-hot encode employment_status; drop one level to avoid collinearity.
df = pd.get_dummies(df, columns=['employment_status'], drop_first=True)
X = df.drop('loan_status', axis=1)
y = df['loan_status'].map({'approved': 1, 'rejected': 0})

# Shallow entropy-based tree; fixed seed for reproducible splits.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3,
                             random_state=42)
clf.fit(X, y)

# NOTE: all metrics below are computed on the training data (no hold-out split).
y_pred = clf.predict(X)
y_prob = clf.predict_proba(X)[:, 1]  # P(approved) for the ROC curve
acc = accuracy_score(y, y_pred)
cm = confusion_matrix(y, y_pred)
fpr, tpr, _ = roc_curve(y, y_prob)
roc_auc = auc(fpr, tpr)
print(f"Accuracy: {acc:.2f}")
print("Confusion Matrix:\n", cm)
print(f"AUC: {roc_auc:.2f}")
(Note: Plots for the experiment are optional when writing in the paper but required when showing the output — hence they are listed separately.)
# Plots for Experiment 5 (tree diagram, confusion matrix, ROC curve).
# FIX(review): plt, plot_tree and ConfusionMatrixDisplay were used here
# without ever being imported.
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.metrics import ConfusionMatrixDisplay

plt.figure(figsize=(16, 4))
# Panel 1: the fitted tree structure.
plt.subplot(1, 3, 1)
plot_tree(clf, feature_names=X.columns, class_names=['rejected',
          'approved'], filled=True)
plt.title("Decision Tree")
# Panel 2: confusion matrix.
plt.subplot(1, 3, 2)
ConfusionMatrixDisplay(cm, display_labels=['rejected',
                       'approved']).plot(cmap='Blues', ax=plt.gca())
plt.title("Confusion Matrix")
# Panel 3: ROC curve against the chance diagonal.
plt.subplot(1, 3, 3)
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}', color='darkorange')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve")
plt.legend()
plt.tight_layout()
plt.show()
Experiment - 6 → K-Means Clustering Algorithm:
# Experiment 6: K-Means customer segmentation on income vs. spending score.
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score

data = pd.read_csv("exp6.csv")
X = data[['Annual Income (k$)', 'Spending Score (1-100)']]

# Elbow method: within-cluster sum of squares for k = 1..10.
wcss = []
for k in range(1, 11):
    model = KMeans(n_clusters=k, random_state=42).fit(X)
    wcss.append(model.inertia_)

# Final model with k = 5 segments.
kmeans = KMeans(n_clusters=5, random_state=42)
data['Cluster'] = kmeans.fit_predict(X)

sil_score = silhouette_score(X, data['Cluster'])
# Labels compared with themselves — trivially 1.0; kept for the report format.
accuracy = accuracy_score(data['Cluster'], data['Cluster'])
print(f"Silhouette Score: {sil_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")
(Note: Plots for the experiment are optional when writing in the paper but required when showing the output — hence they are listed separately.)
# Plots for Experiment 6 (elbow curve, cluster scatter, score bars).
# FIX(review): plt and sns were used here without ever being imported.
import matplotlib.pyplot as plt
import seaborn as sns

fig, axs = plt.subplots(2, 2, figsize=(14, 10))
plt.subplots_adjust(hspace=0.3, wspace=0.3)
# Top-left: elbow plot (WCSS vs. number of clusters).
axs[0, 0].plot(range(1, 11), wcss, marker='o')
axs[0, 0].set_title('Elbow Method', fontsize=13)
axs[0, 0].set_xlabel('Number of Clusters')
axs[0, 0].set_ylabel('WCSS')
axs[0, 0].grid(True)
# Top-right: scatter of the customer segments colored by cluster.
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Cluster', data=data, palette='Set2', ax=axs[0, 1])
axs[0, 1].set_title('Customer Segments', fontsize=13)
axs[0, 1].legend(title='Cluster')
axs[0, 1].grid(True)
# Bottom row: single-bar summaries of the two scores.
axs[1, 0].bar(['Silhouette Score'], [sil_score], color='teal')
axs[1, 0].set_ylim(0, 1)
axs[1, 0].set_title('Silhouette Score', fontsize=13)
axs[1, 0].set_ylabel('Score')
axs[1, 0].grid(True, axis='y', linestyle='--', alpha=0.7)
axs[1, 1].bar(['Accuracy'], [accuracy], color='slateblue')
axs[1, 1].set_ylim(0, 1)
axs[1, 1].set_title('Clustering Accuracy', fontsize=13)
axs[1, 1].set_ylabel('Score')
axs[1, 1].grid(True, axis='y', linestyle='--', alpha=0.7)
plt.show()
Experiment - 7 → Hierarchical Clustering Algorithm:
# Experiment 7: agglomerative (Ward) hierarchical clustering of student data.
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, accuracy_score

data = pd.read_csv("exp7.csv")
feature_cols = ['number_courses', 'time_study', 'Marks']
X = data[feature_cols]
# Standardize so every feature contributes equally to the distances.
X_scaled = StandardScaler().fit_transform(X)

# Ward linkage on standardized features, then cut into 3 flat clusters.
linked = linkage(X_scaled, method='ward')
clusters = fcluster(linked, 3, criterion='maxclust')
data['Cluster'] = clusters

sil_score = silhouette_score(X_scaled, clusters)
# Labels compared with themselves — trivially 1.0; kept for the report format.
accuracy = accuracy_score(clusters, clusters)
print(f"Silhouette Score: {sil_score:.2f}")
print(f"Accuracy: {accuracy:.2f}")
(Note: Plots for the experiment are optional when writing in the paper but required when showing the output — hence they are listed separately.)
# Plots for Experiment 7 (dendrogram + score bars).
# FIX(review): plt and dendrogram were used here without being imported
# (only linkage/fcluster were imported in the clustering section).
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

fig, axs = plt.subplots(2, 2, figsize=(14, 10))
plt.subplots_adjust(hspace=0.3, wspace=0.3)
# Top-left: full dendrogram of the Ward linkage.
dendrogram(linked, ax=axs[0, 0])
axs[0, 0].set_title('Dendrogram', fontsize=13)
axs[0, 0].set_xlabel('Students')
axs[0, 0].set_ylabel('Distance')
axs[0, 0].grid(True)
# Single-bar summaries of the two scores.
axs[0, 1].bar(['Silhouette Score'], [sil_score], color='dodgerblue')
axs[0, 1].set_ylim(0, 1)
axs[0, 1].set_title('Silhouette Score', fontsize=13)
axs[0, 1].set_ylabel('Score')
axs[0, 1].grid(True, axis='y', linestyle='--', alpha=0.7)
axs[1, 0].bar(['Accuracy'], [accuracy], color='mediumseagreen')
axs[1, 0].set_ylim(0, 1)
axs[1, 0].set_title('Clustering Accuracy', fontsize=13)
axs[1, 0].set_ylabel('Score')
axs[1, 0].grid(True, axis='y', linestyle='--', alpha=0.7)
# Bottom-right panel intentionally left blank.
axs[1, 1].axis('off')
plt.show()