MACHINE LEARNING (3170724)
7th Semester
LABORATORY MANUAL
SARDAR PATEL COLLEGE OF ENGINEERING
BAKROL, ANAND
CERTIFICATE

This is to certify that Mr./Ms. ____________________ of Enrollment No. ____________
has satisfactorily completed his/her term work in the subject ____________________
for the term ending in 20___/20___.

Date:
Practical-1
Code-
import pandas as pd
file=pd.read_csv("F:\programmes\python_programmes\ML_Practicals\
Tennis.csv")
print(file)
Output-
Practical-2
Aim-Implement and demonstrate the FIND-S algorithm for finding the most
specific hypothesis based on a given set of training data samples. Read the
training data from a .CSV file. Create the file Weather.csv (e.g., in Excel) and save it in the same path.
Code-
import pandas as pd
def find_s(positive_examples):
    """
    Find-S algorithm implementation.

    Parameters:
    - positive_examples: List of positive examples where each example is a list of attribute values.

    Returns:
    - Most specific hypothesis that covers all positive examples.
    """
    # Initialize the hypothesis to the first positive example
    hypothesis = positive_examples[0].copy()
    # Generalize the hypothesis attribute by attribute over the remaining positive examples
    for example in positive_examples[1:]:
        for i in range(len(hypothesis)):
            if hypothesis[i] != example[i]:
                hypothesis[i] = '?'
    return hypothesis
def load_data_from_csv(filename):
    """
    Load positive examples from a CSV file.

    Parameters:
    - filename: Path to the CSV file.

    Returns:
    - List of positive examples.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(filename)
    # Keep only the positive rows (assumes the last column holds 'Yes'/'No' labels)
    # and drop that label column before returning the attribute values
    positive = df[df.iloc[:, -1].astype(str).str.strip().str.lower() == 'yes']
    positive_examples = positive.iloc[:, :-1].values.tolist()
    return positive_examples
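A minimal usage sketch (assuming Weather.csv from the aim sits next to the script and its last column holds Yes/No labels):

positive_examples = load_data_from_csv("Weather.csv")
print("Most specific hypothesis:", find_s(positive_examples))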
Output:
Practical-3
Aim- For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of
the set of all hypotheses consistent with the training examples. Create the file Training_examples.csv (e.g., in Excel) and save it in the same path.
Code-
import pandas as pd
# Candidate-Elimination Algorithm
def candidate_elimination(examples):
    X = examples.iloc[:, :-1].values  # Extract features (all columns except the last one)
    y = examples.iloc[:, -1].values   # Extract target class (last column)
    # Initialize the most specific hypothesis (S) and the most general hypothesis (G)
    specific_hypothesis = X[0].copy()
    general_hypothesis = [['?' for _ in range(len(specific_hypothesis))]]
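    # --- Hedged continuation: the printed listing truncates here. This sketch assumes the
    # --- last CSV column holds 'Yes'/'No' labels, as in the classic Enjoy-Sport data.
    specific_hypothesis = list(specific_hypothesis)
    n_attrs = len(specific_hypothesis)
    for i, example in enumerate(X):
        if str(y[i]).strip().lower() == 'yes':
            # Positive example: generalize S where it disagrees, drop inconsistent hypotheses from G
            for j in range(n_attrs):
                if specific_hypothesis[j] != example[j]:
                    specific_hypothesis[j] = '?'
            general_hypothesis = [g for g in general_hypothesis
                                  if all(g[j] in ('?', example[j]) for j in range(n_attrs))]
        else:
            # Negative example: minimally specialize G so it excludes this example
            new_general = []
            for g in general_hypothesis:
                for j in range(n_attrs):
                    if g[j] == '?' and specific_hypothesis[j] not in ('?', example[j]):
                        candidate = list(g)
                        candidate[j] = specific_hypothesis[j]
                        new_general.append(candidate)
            general_hypothesis = new_general
    return specific_hypothesis, general_hypothesis

# Example usage (Training_examples.csv from the aim, saved alongside the script)
examples = pd.read_csv("Training_examples.csv")
S, G = candidate_elimination(examples)
print("Final specific hypothesis:", S)
print("Final general hypotheses:", G)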
Output:
Practical-4
Code-
import matplotlib.pyplot as plt
x= [5,7,8,7,2,17,2,9,4,11,12,9,6]
y= [99,86,87,111,86,103,87,94,78,85,86,77,76]
plt.scatter(x,y)
plt.show()
Output:
Code-
import matplotlib.pyplot as plt
from scipy import stats
x= [5,7,8,7,2,17,2,9,4,11,12,9,6]
y= [99,86,87,111,86,103,87,94,78,85,86,77,76]
slope, intercept, r, p, std_err = stats.linregress(x, y)
def myfunc(x):
    return slope * x + intercept
mymodel = list(map(myfunc, x))
plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()
Output:
Practical-5
(a) Binary
Code-
import numpy
from sklearn import linear_model
# Illustrative data (the printed listing omits x and y; these values are only an example:
# x is a single feature, y the 0/1 class label)
x = numpy.array([3.78, 2.44, 2.09, 0.14, 1.72, 1.65, 4.92, 4.37, 4.96, 4.52, 3.69, 5.88]).reshape(-1, 1)
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
logr = linear_model.LogisticRegression()
logr.fit(x, y)
Output:
(B) Multinomial
Code-
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5,
n_classes=3, random_state=1)
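The printed listing stops after the dataset; a minimal sketch of the remaining steps (multinomial model, repeated stratified cross-validation, mean accuracy) might look like this:

# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and report the mean and standard deviation of accuracy
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))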
Output:
(C) Ordinal
Code-
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
# Example data
x = np.array([5, 10, 15, 20, 25, 30, 35, 40, 45, 50]).reshape(-1, 1)
y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 3]) # 1: Low, 2: Medium, 3: High
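The listing ends at the data; one hedged way to finish it (treating the ordinal labels with a plain LogisticRegression as a stand-in, since scikit-learn has no built-in ordinal regression) is:

# Fit a logistic regression on the ordinal labels (a simple stand-in for true ordinal regression)
model = LogisticRegression()
model.fit(x, y)
# Predict the ordinal category for a few new values
x_new = np.array([12, 28, 48]).reshape(-1, 1)
print("Predicted categories:", model.predict(x_new))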
Output:
Practical-6
Code-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
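# --- Hedged reconstruction: the printed listing omits the data and the fitting step
# --- that the prediction code below relies on. The data here is illustrative only.
np.random.seed(0)
X = np.sort(5 * np.random.rand(40, 1), axis=0)
y = 0.5 * X.ravel() ** 2 - X.ravel() + 2 + np.random.randn(40) * 0.5

# Polynomial feature expansion followed by ordinary linear regression
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
model = LinearRegression()
model.fit(X_poly, y)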
# Make predictions
X_fit = np.linspace(0, 5, 100).reshape(-1, 1)
X_fit_poly = poly_features.transform(X_fit)
y_pred = model.predict(X_fit_poly)
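# --- Hedged sketch of a plot of the data and the fitted polynomial curve (not in the printout)
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X_fit, y_pred, color='red', label='Polynomial fit (degree 2)')
plt.legend()
plt.show()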
Output:
Practical-7
Aim-Write a program to demonstrate the working of the decision tree based ID3
algorithm. Use an appropriate data set for building the decision tree and apply
this knowledge to classify a new sample.
Code-
import numpy as np
import math
from Data_loader import read_data
class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""
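# --- Hedged reconstruction of the helper whose header is missing from the printout:
# --- subtables(data, col, delete) groups the rows of `data` by the values in column `col`.
def subtables(data, col, delete):
    dict = {}
    items = np.unique(data[:, col])
    count = np.zeros((items.shape[0], 1), dtype=np.int32)
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1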
    for x in range(items.shape[0]):
        dict[items[x]] = np.empty((int(count[x]), data.shape[1]), dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict
def entropy(S):
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size * 1.0)
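    # --- Hedged completion of entropy() and the start of gain_ratio(); the printout
    # --- is truncated between these two loops.
    for x in range(items.shape[0]):
        sums += -1 * counts[x] * math.log(counts[x], 2)
    return sums

def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    intrinsic = np.zeros((items.shape[0], 1))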
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0] / (total_size * 1.0)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
        intrinsic[x] = ratio * math.log(ratio, 2)
    total_entropy = entropy(data[:, -1])
    iv = -1 * sum(intrinsic)
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy / iv
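# --- Hedged reconstruction of the start of create_node(), missing from the printout:
# --- stop at a leaf when all rows share one answer, otherwise pick the best attribute to split on.
def create_node(data, metadata):
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    gains = np.zeros((data.shape[1] - 1, 1))
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)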
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node
def empty(size):
    s = ""
    for x in range(size):
        s += " "
    return s
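# --- Hedged reconstruction of the start of print_tree(), missing from the printout:
def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return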
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)
metadata, traindata = read_data(r"F:\programmes\python_programmes\ML_Practicals\Tennis.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
Data_loader.py
import csv
def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        metadata = []
        traindata = []
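        # --- Hedged completion (the printout truncates here): collect the header
        # --- names as metadata and the remaining rows as training data.
        for name in headers:
            metadata.append(name)
        for row in datareader:
            traindata.append(row)
    return metadata, traindata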
Tennis.csv
Outlook,Temperature,Humidity,Wind,answer
sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no
Output-
Practical-8
Aim- Implement SVR (Support Vector Regression) and SVM (Support Vector Machine).
Code-
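The printed listing begins at the sample data but later relies on imports, scaled features, and a train/test split; a minimal sketch of the missing import lines (an assumption based on the names used below) is:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR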
# Sample dataset
X = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]])
y = np.array([3, 5, 8, 9, 10, 13, 15, 18, 20, 24])
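# --- Hedged reconstruction of the scaling step the listing omits (X_scaled and y_scaled
# --- are used below); SVR generally benefits from standardized features.
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()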
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2,
random_state=0)
# SVR Model
svr_regressor = SVR(kernel='rbf') # You can use 'linear', 'poly', or 'rbf' for the kernel
svr_regressor.fit(X_train, y_train)
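# --- Hedged sketch of an evaluation step (not in the printout): predict on the test split
# --- and convert the scaled predictions back to the original units.
y_pred_scaled = svr_regressor.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
print("SVR predictions on the test set:", y_pred)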
Output:
Code-
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from mlxtend.plotting import plot_decision_regions
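# --- Hedged reconstruction of the steps the printout omits between the imports and the
# --- report below: load iris (first two features, so the 2-D plot works), split, scale,
# --- tune an SVC with GridSearchCV, and predict on the test set.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf', 'linear']}
grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
best_svm = grid.best_estimator_
y_pred = best_svm.predict(X_test_scaled)
print("Best parameters:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))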
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Plot decision regions for the training data (using only the first two features for 2D plot)
plt.figure(figsize=(8, 6))
plot_decision_regions(X_train_scaled, y_train, clf=best_svm, legend=2)
plt.title('SVM Decision Boundaries (Training Set)')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.show()
Output:
Practical-9
Code-
# Recursive Binary Search function
def binary_search(arr, low, high, x):
    # Base case: If the range is valid
    if high >= low:
        mid = (high + low) // 2
        if arr[mid] == x:           # element found at mid
            return mid
        # If the element is smaller than mid, it can only be present in the left subarray
        elif arr[mid] > x:
            return binary_search(arr, low, mid - 1, x)
        else:                       # otherwise search the right subarray
            return binary_search(arr, mid + 1, high, x)
    else:
        return -1                   # element is not present in the array
# Example usage
arr = [2, 3, 4, 10, 40]
x = 10
# Function call
result = binary_search(arr, 0, len(arr) - 1, x)
if result != -1:
print(f"Element is present at index {result}")
else:
print("Element is not present in array")
Output:
Practical-10
Code-
import numpy as np
import matplotlib.pyplot as plt
Output:
Practical-11
Code-
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
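# --- Hedged reconstruction of the data-preparation and training steps missing from
# --- the printout: load the wine data, split, scale, and fit a k-NN classifier.
data = load_wine()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)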
# Make predictions
y_pred = knn.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of KNN with k={k}: {accuracy:.2f}')
# Confusion matrix heatmap (the start of this call is cut off in the printout)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names,
            yticklabels=data.target_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
Output:
Practical-12
K-Nearest-Neighbour Algorithm:
1. Load the data
2. Initialize the value of k
3. For getting the predicted class, iterate from 1 to the total number of training data points:
1. Calculate the distance between test data and each row of training data. Here we will
use Euclidean distance as our distance metric since it’s the most popular method.
The other metrics that can be used are Chebyshev, cosine, etc.
2. Sort the calculated distances in ascending order based on distance values
3. Get top k rows from the sorted array
4. Get the most frequent class of these rows, i.e., get the labels of the selected K entries
5. Return the predicted class
If regression, return the mean of the K labels
If classification, return the mode of the K labels
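A compact from-scratch sketch of the steps above (illustrative only; Euclidean distance and a majority vote):

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_query, k=3):
    # Step 1: Euclidean distance from the query point to every training row
    distances = np.sqrt(((X_train - x_query) ** 2).sum(axis=1))
    # Steps 2-3: sort by distance and take the top k rows
    nearest = np.argsort(distances)[:k]
    # Steps 4-5: return the most frequent label among the k neighbours
    return Counter(y_train[nearest]).most_common(1)[0][0]

# Tiny illustrative example
X_train = np.array([[1, 1], [1, 2], [5, 5], [6, 5]])
y_train = np.array(['A', 'A', 'B', 'B'])
print(knn_predict(X_train, y_train, np.array([5, 6]), k=3))   # -> 'B'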
Confusion matrix:
Note: Class 1 = Positive, Class 2 = Negative.
False Positive Rate: When it's actually no, how often does it predict yes?
FP/actual no = 10/60 = 0.17
True Negative Rate: When it's actually no, how often does it predict no?
TN/actual no = 50/60 = 0.83
equivalent to 1 minus False Positive Rate
also known as "Specificity"
Precision: When it predicts yes, how often is it correct?
TP/predicted yes = 100/110 = 0.91
Prevalence: How often does the yes condition actually occur in our sample?
actual yes/total = 105/165 = 0.64
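These rates refer to a 2x2 confusion matrix that is not reproduced in the printout; the counts they imply (n = 165) are:

                Predicted: No    Predicted: Yes    Total
Actual: No      TN = 50          FP = 10           60
Actual: Yes     FN = 5           TP = 100          105
Total           55               110               165

From the same table, Accuracy = (TP + TN) / total = 150/165 ≈ 0.91.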
Source Code:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
dataset = pd.read_csv("iris.csv")
# Split features and labels (these two lines are missing from the printout)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
classifier = KNeighborsClassifier(n_neighbors=8, p=3, metric='euclidean')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix is as follows\n',cm)
print('Accuracy Metrics')
print(classification_report(y_test,y_pred))
print(" correct predicition",accuracy_score(y_test,y_pred))
print(" worng predicition",(1-accuracy_score(y_test,y_pred)))
Output :
Confusion matrix is as follows
[[13 0 0]
[ 0 15 1]
[ 0 0 9]]
Accuracy Metrics
precision recall f1-score support
Iris-setosa 1.00 1.00 1.00 13
Iris-versicolor 1.00 0.94 0.97 16
Iris-virginica 0.90 1.00 0.95 9
Practical-13
Aim- Write a program to implement the naïve Bayesian classifier for a sample
training dataset stored as a .CSV file. Compute the accuracy of the
classifier, considering a few test data sets.
Code-
import csv
import random
import math
def loadcsv(filename):
lines = csv.reader(open(filename, "r"));
dataset = list(lines)
for i in range(len(dataset)):
#converting strings into numbers for processing
dataset[i] = [float(x) for x in dataset[i]]
return dataset
def separatebyclass(dataset):
separated = {} #dictionary of classes 1 and 0
#creates a dictionary of classes 1 and 0 where the values are
#the instances belonging to each class
for i in range(len(dataset)):
vector = dataset[i]
if (vector[-1] not in separated):
separated[vector[-1]] = []
separated[vector[-1]].append(vector)
return separated
def mean(numbers):
return sum(numbers)/float(len(numbers))
def stdev(numbers):
avg = mean(numbers)
variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
return math.sqrt(variance)
def summarizebyclass(dataset):
separated = separatebyclass(dataset);
#print(separated)
summaries = {}
for classvalue, instances in separated.items():
#for key,value in dic.items()
#summaries is a dic of tuples(mean,std) for each class value
summaries[classvalue] = summarize(instances)  # summarize() computes the mean and std for each attribute
return summaries
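The printout jumps from summarizebyclass() to the tail of predict(); a hedged reconstruction of the helpers in between (per-attribute summaries, the Gaussian likelihood, per-class probabilities, and the start of predict()) is:

def summarize(dataset):
    # (mean, stdev) for every attribute column, dropping the class column
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

def calculateprobability(x, mean_value, stdev_value):
    # Gaussian probability density of x under the given mean and standard deviation
    exponent = math.exp(-(math.pow(x - mean_value, 2) / (2 * math.pow(stdev_value, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev_value)) * exponent

def calculateclassprobabilities(summaries, inputvector):
    # Multiply the per-attribute likelihoods for each class
    probabilities = {}
    for classvalue, classsummaries in summaries.items():
        probabilities[classvalue] = 1
        for i in range(len(classsummaries)):
            mean_value, stdev_value = classsummaries[i]
            probabilities[classvalue] *= calculateprobability(inputvector[i], mean_value, stdev_value)
    return probabilities

def predict(summaries, inputvector):
    # Pick the class with the highest probability
    probabilities = calculateclassprobabilities(summaries, inputvector)
    bestLabel, bestProb = None, -1
    for classvalue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability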
bestLabel = classvalue
return bestLabel
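The helpers used later in main() are also missing from the printout; a hedged sketch:

def splitdataset(dataset, splitratio):
    # Randomly move splitratio of the rows into the training set
    trainsize = int(len(dataset) * splitratio)
    trainset = []
    copy = list(dataset)
    while len(trainset) < trainsize:
        index = random.randrange(len(copy))
        trainset.append(copy.pop(index))
    return [trainset, copy]

def getpredictions(summaries, testset):
    return [predict(summaries, testset[i]) for i in range(len(testset))]

def getaccuracy(testset, predictions):
    correct = sum(1 for i in range(len(testset)) if testset[i][-1] == predictions[i])
    return (correct / float(len(testset))) * 100.0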
def main():
    filename = r'F:\programmes\python_programmes\ML_Practicals\dataset.csv'
    splitratio = 0.67
    dataset = loadcsv(filename)
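    # --- Hedged completion of main() (the printout truncates here): split the data,
    # --- train the class summaries, predict on the test split, and report accuracy.
    trainingset, testset = splitdataset(dataset, splitratio)
    print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset), len(trainingset), len(testset)))
    summaries = summarizebyclass(trainingset)
    predictions = getpredictions(summaries, testset)
    accuracy = getaccuracy(testset, predictions)
    print('Accuracy of the classifier is: {0}%'.format(accuracy))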
main()
Output:
Practical-14
Code-
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
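# --- Hedged reconstruction of the missing steps: load iris (two features so the plot is 2-D),
# --- split, fit k-NN, and build the mesh grid whose points are classified below.
iris = load_iris()
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print("Test accuracy:", accuracy_score(y_test, knn.predict(X_test)))
h = 0.02  # mesh step size
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))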
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
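# --- Hedged sketch of a decision-boundary plot using the grid predictions above
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title('k-NN decision boundaries (k = 5)')
plt.show()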
Output:
Practical-15
Aim- Assuming a set of documents that need to be classified, use the naïve
Bayesian Classifier model to perform this task. Built-in Java classes/ API can
be used to write the program. Calculate the accuracy, precision, and recall for
your dataset.
Code-
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np
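The listing stops after the imports; a minimal sketch of the remaining steps, using a small illustrative document set (the actual dataset is not reproduced in the printout):

# Illustrative documents and labels (1 = positive sentiment, 0 = negative sentiment)
documents = [
    "I loved this movie, it was fantastic",
    "What a great and enjoyable film",
    "Absolutely terrible, a waste of time",
    "I hated the plot and the acting was poor",
    "Brilliant direction and a wonderful cast",
    "Boring, dull and far too long",
]
labels = np.array([1, 1, 0, 0, 1, 0])

# Vectorize the text, train a multinomial naive Bayes model, and evaluate it
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
X_train, X_test, y_train, y_test = X[:4], X[4:], labels[:4], labels[4:]
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))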
Output:
Practical-16
Code-
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
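The printout contains only the imports; based on them, one plausible sketch (illustrative data generated in place of the unnamed CSV) is:

# Illustrative data standing in for the CSV the original presumably loads
df = pd.DataFrame({
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'label':    np.random.choice(['cat', 'dog'], size=100),
})
X = df[['feature1', 'feature2']]
y = LabelEncoder().fit_transform(df['label'])

# Hold-out evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Hold-out accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=skf)
print("Cross-validation accuracy: %.3f (+/- %.3f)" % (scores.mean(), scores.std()))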
Output:
Practical-17
Code-
from itertools import combinations
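# --- Hedged reconstruction of the truncated function: enumerate candidate itemsets
# --- of every size and keep those whose support count meets min_support.
def get_frequent_itemsets(transactions, min_support):
    frequent_itemsets = {}
    items = sorted({item for transaction in transactions for item in transaction})
    for size in range(1, len(items) + 1):
        for candidate in combinations(items, size):
            count = sum(1 for transaction in transactions
                        if set(candidate).issubset(transaction))
            if count >= min_support:
                frequent_itemsets[candidate] = count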
    return frequent_itemsets
# Example usage
if __name__ == "__main__":
    transactions = [
        ['milk', 'bread', 'cookies'],
        ['milk', 'diaper', 'bread', 'cookies'],
        ['milk', 'diaper', 'bread'],
        ['bread', 'cookies'],
        ['milk', 'diaper', 'cookies'],
        ['milk', 'bread', 'diaper']
    ]
    min_support = 2
    frequent_itemsets = get_frequent_itemsets(transactions, min_support)
    print("Frequent Itemsets:")
    for itemset, count in frequent_itemsets.items():
        print(f"{set(itemset)}: {count}")
Output:
Practical-18
Code-
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
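# --- Hedged reconstruction of the parts the printout omits: the locally weighted
# --- regression helpers and the data loading. The data file name is an assumption
# --- (any tips dataset with 'total_bill' and 'tip' columns works).
def kernel(point, xmat, k):
    m, n = np.shape(xmat)
    weights = np.mat(np.eye(m))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np.exp(diff * diff.T / (-2.0 * k ** 2))
    return weights

def localWeight(point, xmat, ymat, k):
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    m, n = np.shape(xmat)
    ypred = np.zeros(m)
    for i in range(m):
        ypred[i] = xmat[i] * localWeight(xmat[i], xmat, ymat, k)
    return ypred

# Load the data (file name assumed)
data = pd.read_csv('tips.csv')
bill = np.array(data.total_bill)
tip = np.array(data.tip)
mbill = np.mat(bill)
mtip = np.mat(tip)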
m= np.shape(mbill)[1]
one = np.mat(np.ones(m))
X = np.hstack((one.T,mbill.T))
#set k here
ypred = localWeightRegression(X,mtip,0.5)
SortIndex = X[:,1].argsort(0)
xsort = X[SortIndex][:,0]
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(bill,tip, color='green')
ax.plot(xsort[:,1],ypred[SortIndex], color = 'red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show()
Output-
Practical-19
Code-
import numpy as np
from nltk.tokenize import word_tokenize
from collections import defaultdict
data = ['She loves pizza, pizza is delicious.','She is a good person.','good people are the best.']
sentences = []
vocab = []
for sent in data:
    x = word_tokenize(sent)
    sentence = [w.lower() for w in x if w.isalpha()]
    sentences.append(sentence)
    for word in sentence:
        if word not in vocab:
            vocab.append(word)

len_vector = len(vocab)

index_word = {}
i = 0
for word in vocab:
    index_word[word] = i
    i += 1

def bag_of_words(sent):
    count_dict = defaultdict(int)
    vec = np.zeros(len_vector)
    for item in sent:
        count_dict[item] += 1
    for key, item in count_dict.items():
        vec[index_word[key]] = item
    return vec
vector = bag_of_words(sentences[0])
print(vector)
Output-
Practical-20
Code-
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as rnd
mu = np.array([10,13])
sigma = np.array([[3.5, -1.8], [-1.8,3.5]])
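# --- Hedged reconstruction of the missing data step: draw samples from the bivariate
# --- normal defined by mu and sigma, then mean-centre them.
rnd.seed(0)
data = rnd.multivariate_normal(mu, sigma, 1000)
mean_data = data - np.mean(data, axis=0)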
cov = np.cov(mean_data.T)
cov = np.round(cov, 2)
print("Covariance matrix ", cov.shape, "\n")
indices = np.arange(0,len(eig_val), 1)
indices = ([x for _,x in sorted(zip(eig_val, indices))])[::-1]
eig_val = eig_val[indices]
eig_vec = eig_vec[:,indices]
print("Sorted Eigen vectors ", eig_vec)
print("Sorted Eigen values ", eig_val, "\n")
sum_eig_val = np.sum(eig_val)
explained_variance = eig_val/ sum_eig_val
print(explained_variance)
cumulative_variance = np.cumsum(explained_variance)
print(cumulative_variance)
plt.plot(explained_variance,cumulative_variance)
plt.show()
Output:-
Practical-21
Code-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import FastICA
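# --- Hedged reconstruction of the missing steps: generate three source signals,
# --- mix them with a random matrix, and unmix them with FastICA.
np.random.seed(0)
n_samples = 2000
time = np.linspace(0, 8, n_samples)
s1 = np.sin(2 * time)                           # sinusoidal signal
s2 = np.sign(np.sin(3 * time))                  # square signal
s3 = np.random.laplace(size=n_samples)          # noisy signal
S = np.c_[s1, s2, s3]
A = np.array([[1.0, 1.0, 1.0],
              [0.5, 2.0, 1.0],
              [1.5, 1.0, 2.0]])                 # mixing matrix
X = S @ A.T                                     # observed mixed signals

ica = FastICA(n_components=3, random_state=0)
S_ = ica.fit_transform(X)                       # recovered independent components
A_ = ica.mixing_                                # estimated mixing matrix

plt.figure(figsize=(8, 8))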
# Original signals
plt.subplot(4, 1, 1)
plt.title('Original Signals')
plt.plot(S)
# Mixed signals
plt.subplot(4, 1, 2)
plt.title('Mixed Signals')
plt.plot(X)
# Recovered signals
plt.subplot(4, 1, 3)
plt.title('Independent Components (Recovered Signals)')
plt.plot(S_)
# Mixing matrix
plt.subplot(4, 1, 4)
plt.title('Estimated Mixing Matrix')
plt.imshow(A_, aspect='auto', cmap='viridis')
plt.colorbar()
plt.tight_layout()
plt.show()
Output:
Practical-22
Code-
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
names = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Targets']  # column names assumed
dataset = pd.read_csv(r"F:\programmes\python_programmes\ML_Practicals\iris_dataset.csv",
                      names=names)
X = dataset.iloc[:, :-1]
y = dataset.Targets  # assumed to hold integer class labels 0/1/2, used to colour the "Real" plot
plt.figure(figsize=(14,7))
colormap=np.array(['red','lime','black'])
# REAL PLOT
plt.subplot(1,3,1)
plt.title('Real')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y])
# K-PLOT
model=KMeans(n_clusters=3, random_state=0).fit(X)
plt.subplot(1,3,2)
plt.title('KMeans')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[model.labels_])
# GMM PLOT
gmm=GaussianMixture(n_components=3, random_state=0).fit(X)
y_cluster_gmm=gmm.predict(X)
plt.subplot(1,3,3)
plt.title('GMM Classification')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y_cluster_gmm])
Output-
Practical-23
Aim- Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the
same data set for clustering using k-Means algorithm. Compare the results of
these two algorithms and comment on the quality of clustering. You can add
Java/ Python ML library classes/ API in the program.
Code-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
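# --- Hedged reconstruction of the omitted middle of this program. The CSV name is an
# --- assumption; the aim only says the data is stored in a .CSV file.
data = pd.read_csv('data.csv')
X = StandardScaler().fit_transform(data.values)

# k-Means clustering
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
kmeans_labels = kmeans.fit_predict(X)

# EM clustering via a Gaussian mixture model
gmm = GaussianMixture(n_components=3, random_state=0)
gmm_labels = gmm.fit_predict(X)

# Compare clustering quality with silhouette scores
print("Silhouette score (k-Means):", silhouette_score(X, kmeans_labels))
print("Silhouette score (EM / GMM):", silhouette_score(X, gmm_labels))

# Visualize only when the data is two-dimensional
if X.shape[1] == 2:
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.scatter(X[:, 0], X[:, 1], c=gmm_labels)
    plt.title('EM (GMM) Clustering')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.subplot(1, 2, 2)
    plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels)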
    plt.title('k-Means Clustering')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.tight_layout()
    plt.show()
else:
    print("Data has more than 2 dimensions. Visualization is not applicable.")
Output:
Practical-24
Code-
import numpy as np
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X/np.amax(X, axis=0) # maximum of X array longitudinally
y = y/100
# Sigmoid Function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
# Variable initialization
epoch = 7000 # Setting training iterations
lr = 0.1 # Setting learning rate
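# --- Hedged reconstruction of the network setup and forward pass the printout omits
# --- (2 input, 3 hidden, 1 output neurons, as in the classic lab program).
inputlayer_neurons = 2
hiddenlayer_neurons = 3
output_neurons = 1
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

def derivatives_sigmoid(x):
    return x * (1 - x)

for i in range(epoch):
    # Forward propagation
    hinp = np.dot(X, wh) + bh
    hlayer_act = sigmoid(hinp)
    outinp = np.dot(hlayer_act, wout) + bout
    output = sigmoid(outinp)
    # Backpropagation: error at the output layer
    EO = y - output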
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    hiddengrad = derivatives_sigmoid(hlayer_act)   # how much hidden layer weights contributed to the error
    d_hiddenlayer = EH * hiddengrad
    wout += hlayer_act.T.dot(d_output) * lr        # dot product of next-layer error and current-layer output
    # bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    # bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)
Output-
Practical-25
Code-
import numpy as np

# The def line is missing from the printout; the function name below is reconstructed.
def euclidean_distance(A1, B2):
    """
    Parameters:
    A1 (tuple): A tuple representing the coordinates of point A1 (x1, y1).
    B2 (tuple): A tuple representing the coordinates of point B2 (x2, y2).

    Returns:
    float: The Euclidean distance between A1 and B2.
    """
    return np.sqrt((B2[0] - A1[0])**2 + (B2[1] - A1[1])**2)
# Example usage
A1 = (1, 2) # Coordinates of point A1
B2 = (4, 6) # Coordinates of point B2
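# Compute and print the distance (the call itself is missing from the printout)
distance = euclidean_distance(A1, B2)
print(f"Euclidean distance between A1 and B2: {distance}")   # -> 5.0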
Output: