FIND-S ALGORITHM
import csv

# Returns True when hypothesis h1 is equally or more general than h2.
def more_general(h1, h2):
    return all(h1[i] == '?' or h2[i] == '?' or h1[i] == h2[i] for i in range(len(h1)))

# FIND-S: start from the first training example and generalise attribute by attribute.
# Note: this implementation skips the first CSV column and treats every row as a
# positive example.
def find_s(data):
    most_specific = data[0][1:]
    for instance in data[1:]:
        attrs = instance[1:]
        current = ['?' if attrs[i] != most_specific[i] else most_specific[i]
                   for i in range(len(most_specific))]
        if more_general(current, most_specific):
            most_specific = current
    return most_specific

with open('training_data.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header
    data = list(reader)

print("Most Specific Hypothesis:", find_s(data))
CANDIDATE ELIMINATION ALGORITHM
import csv

# Returns True when hypothesis h1 is equally or more general than h2.
def more_general(h1, h2):
    return all(h1[i] == '?' or h2[i] == '?' or h1[i] == h2[i] for i in range(len(h1)))

# Specific boundary S: start from the first example and generalise (as in FIND-S).
def find_s(data):
    most_specific = data[0][1:]
    for instance in data[1:]:
        attrs = instance[1:]
        current = ['?' if attrs[i] != most_specific[i] else most_specific[i]
                   for i in range(len(most_specific))]
        if more_general(current, most_specific):
            most_specific = current
    return most_specific

# General boundary G (simplified approximation): start maximally general and replace
# the current boundary only if it is not already at least as general as the candidate
# built from the instance.
def find_g(data):
    most_general = ['?'] * len(data[0][1:])
    for instance in data:
        attrs = instance[1:]
        current = [attrs[i] if attrs[i] != most_general[i] else '?'
                   for i in range(len(most_general))]
        if not more_general(most_general, current):
            most_general = current
    return most_general

with open('training_data.csv', 'r') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header
    data = list(reader)

s, g = find_s(data), find_g(data)
print("Set of all hypotheses consistent with the training examples:")
for i, (s_val, g_val) in enumerate(zip(s, g), start=1):
    print(f"Attribute {i}: {s_val if g_val == '?' else g_val}")
DECISION TREE (ID3) ALGORITHM
import math

# Entropy of the target attribute over a set of records.
def entropy(data, target_attr):
    val_freq = {}
    for record in data:
        val_freq[record[target_attr]] = val_freq.get(record[target_attr], 0) + 1
    return sum(-freq / len(data) * math.log2(freq / len(data)) for freq in val_freq.values())

# Information gain obtained by splitting on attr with respect to the target attribute.
def info_gain(data, attr, target_attr):
    base_entropy = entropy(data, target_attr)
    attr_vals = set(record[attr] for record in data)
    exp_entropy = sum((len([rec for rec in data if rec[attr] == val]) / len(data)) *
                      entropy([rec for rec in data if rec[attr] == val], target_attr)
                      for val in attr_vals)
    return base_entropy - exp_entropy

def id3(data, attrs, target_attr):
    base_entropy = entropy(data, target_attr)
    if base_entropy == 0:
        # All records share the same target value: return it as a leaf.
        return next(iter(set(record[target_attr] for record in data)))
    elif len(attrs) == 0:
        # No attributes left: return the majority target value.
        return max(set(record[target_attr] for record in data),
                   key=[record[target_attr] for record in data].count)
    else:
        # Split on the attribute with the highest information gain.
        attr_gains = [info_gain(data, attr, target_attr) for attr in attrs]
        selected_attr = attrs[attr_gains.index(max(attr_gains))]
        node = {selected_attr: {}}
        attr_values = set(record[selected_attr] for record in data)
        for value in attr_values:
            new_data = [record for record in data if record[selected_attr] == value]
            new_attrs = attrs.copy()
            new_attrs.remove(selected_attr)
            node[selected_attr][value] = id3(new_data, new_attrs, target_attr)
        return node

def classify(tree, sample):
    # Each internal node has the form {attribute_index: {attribute_value: subtree_or_leaf}}.
    for attr, values in tree.items():
        value = sample[attr]
        if value in values:
            child = values[value]
            if isinstance(child, dict):
                return classify(child, sample)
            else:
                return child

data = [['Sunny', 'Hot', 'High', 'False'], ['Sunny', 'Hot', 'High', 'True'],
        ['Overcast', 'Hot', 'High', 'False'], ['Rain', 'Mild', 'High', 'False'],
        ['Rain', 'Cool', 'Normal', 'False'], ['Rain', 'Cool', 'Normal', 'True'],
        ['Overcast', 'Cool', 'Normal', 'True'], ['Sunny', 'Mild', 'High', 'False'],
        ['Sunny', 'Cool', 'Normal', 'False'], ['Rain', 'Mild', 'Normal', 'False'],
        ['Sunny', 'Mild', 'Normal', 'True'], ['Overcast', 'Mild', 'High', 'True'],
        ['Overcast', 'Hot', 'Normal', 'False'], ['Rain', 'Mild', 'High', 'True']]
attributes = ['Outlook', 'Temperature', 'Humidity', 'Play Tennis']
tree = id3(data, [i for i in range(len(data[0]) - 1)], len(data[0]) - 1)
print("Decision Tree:", tree)
sample = ['Rain', 'Mild', 'High']
prediction = classify(tree, sample)
print("Prediction for sample", sample, ":", prediction)
BACKPROPAGATION ALGORITHM
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the iris dataset and one-hot encode the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target
y = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training set only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Network architecture: 4 inputs, one hidden layer of 8 units, 3 outputs.
input_size, hidden_size, output_size = 4, 8, 3
W1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

sigmoid = lambda x: 1 / (1 + np.exp(-x))
sigmoid_derivative = lambda x: x * (1 - x)  # derivative expressed in terms of the activation

def backpropagation(X, y, W1, b1, W2, b2, learning_rate=0.1, epochs=10000):
    for _ in range(epochs):
        # Forward pass
        layer1 = sigmoid(np.dot(X, W1) + b1)
        layer2 = sigmoid(np.dot(layer1, W2) + b2)
        # Backward pass: propagate the output error through the network.
        error = y - layer2
        delta2 = error * sigmoid_derivative(layer2)
        W2_grad, b2_grad = np.dot(layer1.T, delta2), np.sum(delta2, axis=0, keepdims=True)
        delta1 = np.dot(delta2, W2.T) * sigmoid_derivative(layer1)
        W1_grad, b1_grad = np.dot(X.T, delta1), np.sum(delta1, axis=0, keepdims=True)
        # Gradient step
        W2 += learning_rate * W2_grad
        b2 += learning_rate * b2_grad
        W1 += learning_rate * W1_grad
        b1 += learning_rate * b1_grad
    return W1, b1, W2, b2

W1, b1, W2, b2 = backpropagation(X_train, y_train, W1, b1, W2, b2)

def predict(X, W1, b1, W2, b2):
    layer1 = sigmoid(np.dot(X, W1) + b1)
    layer2 = sigmoid(np.dot(layer1, W2) + b2)
    return np.argmax(layer2, axis=1)

y_pred = predict(X_test, W1, b1, W2, b2)
accuracy = np.mean(y_pred == np.argmax(y_test, axis=1))
print(f"Test accuracy: {accuracy * 100:.2f}%")
NAÏVE BAYESIAN CLASSIFIER
import csv
import math

def load_data(filename):
    with open(filename) as f:
        return [row for row in csv.reader(f) if row]

# The last column of each row is taken as the class label.
def split_data(data):
    features = [row[:-1] for row in data]
    labels = [row[-1] for row in data]
    return features, labels

def is_number(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

# Mean and standard deviation of a list of numeric values.
def get_stats(values):
    mean = sum(values) / len(values)
    stdev = (sum((x - mean) ** 2 for x in values) / len(values)) ** 0.5
    return mean, stdev

# Gaussian probability density function.
def pdf(x, mean, stdev):
    return math.exp(-((x - mean) ** 2) / (2 * stdev ** 2)) / (stdev * math.sqrt(2 * math.pi))

# Relative frequency of a categorical value.
def prob_cat(value, values):
    return values.count(value) / len(values)

def train(features, labels):
    # Class counts for the priors.
    label_counts = {label: labels.count(label) for label in set(labels)}
    # feature_stats[i][label] is either (mean, stdev) for a numeric attribute
    # or the list of observed values for a categorical attribute.
    feature_stats = []
    for i in range(len(features[0])):
        stats = {}
        for label in set(labels):
            values = [row[i] for row, lab in zip(features, labels) if lab == label]
            if all(is_number(v) for v in values):
                stats[label] = get_stats([float(v) for v in values])
            else:
                stats[label] = values
        feature_stats.append(stats)
    return label_counts, feature_stats

def classify(feature_vector, label_counts, feature_stats):
    # Start from the log priors and add the log likelihood of each attribute.
    label_probs = {label: math.log(count / sum(label_counts.values()))
                   for label, count in label_counts.items()}
    for i, feature_value in enumerate(feature_vector):
        for label, stats in feature_stats[i].items():
            if isinstance(stats, tuple):
                mean, stdev = stats
                label_probs[label] += math.log(pdf(float(feature_value), mean, stdev))
            else:
                label_probs[label] += math.log(prob_cat(feature_value, stats))
    return max(label_probs.items(), key=lambda x: x[1])[0]

def compute_accuracy(test_features, test_labels, label_counts, feature_stats):
    correct = sum(classify(feature_vector, label_counts, feature_stats) == label
                  for feature_vector, label in zip(test_features, test_labels))
    return correct / len(test_labels)

training_data = load_data('training_data.csv')
features, labels = split_data(training_data)
label_counts, feature_stats = train(features, labels)
test_data = load_data('test_data.csv')
test_features, test_labels = split_data(test_data)
accuracy = compute_accuracy(test_features, test_labels, label_counts, feature_stats)
print(f"Accuracy: {accuracy * 100:.2f}%")
BAYESIAN NETWORK
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Load the heart-disease data (the file name is assumed; adjust the path as needed).
heart_data = pd.read_csv('heart.csv')

# Construct the Bayesian network: every attribute is a parent of the target 'num'.
model = BayesianModel([('age', 'num'), ('sex', 'num'), ('cp', 'num'), ('trestbps', 'num'),
                       ('chol', 'num'), ('fbs', 'num'), ('restecg', 'num'), ('thalach', 'num'),
                       ('exang', 'num'), ('oldpeak', 'num'), ('slope', 'num'), ('ca', 'num'),
                       ('thal', 'num')])

# Estimate the parameters (CPDs) of the Bayesian network by maximum likelihood
model.fit(heart_data, estimator=MaximumLikelihoodEstimator)

# Perform inference with variable elimination
inference = VariableElimination(model)

# Example: predict the probability of heart disease for a new patient.
# The evidence values must use the same encoding (and appear among the same states)
# as the training data, since pgmpy treats each column as a discrete variable.
patient_data = {'age': 50, 'sex': 1, 'cp': 3, 'trestbps': 130, 'chol': 250, 'fbs': 0,
                'restecg': 0, 'thalach': 180, 'exang': 0, 'oldpeak': 0.8, 'slope': 2,
                'ca': 0, 'thal': 3}

# Query the posterior distribution of 'num' given the patient's attributes
query = inference.query(variables=['num'], evidence=patient_data)

# Print the probability of heart disease
print(f"Probability of heart disease: {query.values[-1]:.2f}")
EM ALGORITHM
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Load the dataset from a CSV file
data = pd.read_csv('dataset.csv')
X = data.iloc[:, :-1].values # Features
y = data.iloc[:, -1].values # Labels (for evaluation purposes)
# EM Clustering
print("EM Clustering:")
em = GaussianMixture(n_components=3, covariance_type='full', max_iter=100, random_state=42)
em_labels = em.fit_predict(X)
em_score = silhouette_score(X, em_labels)
print(f"Silhouette Score: {em_score:.2f}")
# k-Means Clustering
print("\nk-Means Clustering:")
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X)
kmeans_score = silhouette_score(X, kmeans_labels)
print(f"Silhouette Score: {kmeans_score:.2f}")
# Compare the results
print("\nComparison:")
if em_score > kmeans_score:
    print("EM algorithm performs better than k-Means for this dataset.")
else:
    print("k-Means algorithm performs better than EM for this dataset.")
k-NEAREST NEIGHBOUR ALGORITHM
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the iris dataset
iris = load_iris()
X, y = iris.data, iris.target
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a kNN classifier object
knn = KNeighborsClassifier(n_neighbors=3)
# Train the classifier
knn.fit(X_train, y_train)
# Make predictions on the test set
y_pred = knn.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Print correct and wrong predictions
print("\nCorrect Predictions:")
correct_indices = [i for i, (pred, true) in enumerate(zip(y_pred, y_test)) if pred == true]
for i in correct_indices:
    print(f"Instance {i+1}: Predicted: {iris.target_names[y_pred[i]]} (Correct: {iris.target_names[y_test[i]]})")
print("\nWrong Predictions:")
wrong_indices = [i for i, (pred, true) in enumerate(zip(y_pred, y_test)) if pred != true]
for i in wrong_indices:
    print(f"Instance {i+1}: Predicted: {iris.target_names[y_pred[i]]} (Correct: {iris.target_names[y_test[i]]})")
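The choice of k = 3 is arbitrary; a quick sketch for comparing a few neighbourhood sizes on the same split, reusing the variables above:

# Try several values of k and report test accuracy for each.
for k in (1, 3, 5, 7):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    print(f"k = {k}: accuracy = {accuracy_score(y_test, clf.predict(X_test)):.2f}")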
NON-PARAMETRIC LOCALLY WEIGHTED REGRESSION ALGORITHM
import numpy as np
import matplotlib.pyplot as plt
# Sample dataset
X = np.linspace(-3, 3, 50)
y = np.sin(X) + np.random.normal(0, 0.2, len(X))
# Function to calculate the weight for a given x and x_i
def weight(x, x_i, tau):
    return np.exp(-(x - x_i)**2 / (2 * tau**2))
# LWR function
def lwr(x, X, y, tau):
    # Weight each training point by its distance to the query point x.
    weights = np.array([weight(x, x_i, tau) for x_i in X])
    W = np.diag(weights)
    X_mat = np.vstack([np.ones(len(X)), X]).T
    # Weighted least squares for the local line [intercept, slope].
    theta = np.linalg.pinv(X_mat.T @ W @ X_mat) @ (X_mat.T @ W @ y)
    # Prediction at the query point: intercept + slope * x.
    return theta[0] + theta[1] * x
# LWR predictions
tau = 1.0
y_pred = [lwr(x_val, X, y, tau) for x_val in X]
# Plot the data and LWR fit
plt.figure(figsize=(10, 6))
plt.scatter(X, y, label='Data Points', color='b', marker='o')
plt.plot(X, y_pred, label='LWR Fit', color='r')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Locally Weighted Regression')
plt.legend()
plt.show()
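The bandwidth tau controls how local each fit is: small values chase the noise, large values approach an ordinary straight-line fit. A short sketch for overlaying a few bandwidths, reusing lwr and the data above:

# Overlay LWR fits for several bandwidths to see the bias/variance trade-off.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, label='Data Points', color='b', marker='o')
for t in (0.1, 0.5, 1.0):
    plt.plot(X, [lwr(x_val, X, y, t) for x_val in X], label=f'tau = {t}')
plt.legend()
plt.show()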