Machine Learning Lab Assignment 2

The document contains multiple assignments involving machine learning techniques, including SVM and KNN classification, as well as data manipulation using pandas. It provides code snippets for each task, detailing steps such as data loading, preprocessing, model training, and evaluation. Additionally, it includes practice questions on data structures and handling missing values in datasets.

ASSIGNMENT

1. SVM Classification on News Dataset


Code:
#SVM classification on News Dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
#Load the dataset
file_path = "Google News.csv"
df = pd.read_csv(file_path)#encoding='ISO-8859-1'
#Drop missing values
df=df.dropna()
#Extract features and labels
X_text = df[['title','publisher','date','keyword','country']].astype(str).agg(' '.join, axis=1) #Join fields with a space so words don't fuse into single tokens
y = df['category']
#Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english',max_features=5000)
X_tfidf=vectorizer.fit_transform(X_text)
#Standardize the TF-IDF features (with_mean=False keeps the sparse matrix sparse)
scaler = StandardScaler(with_mean=False)
X_tfidf_scaled = scaler.fit_transform(X_tfidf)
#Split into training and testing sets(80%train,20%test)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_scaled, y, test_size=0.2, random_state=42)
#Train SVM model
svm_model=SVC(kernel='linear',random_state=42)
svm_model.fit(X_train,y_train)
#Predict on test data
y_pred=svm_model.predict(X_test)
#Evaluate model performance
accuracy=accuracy_score(y_test,y_pred)
report=classification_report(y_test,y_pred,zero_division=1)
#Compute confusion matrix
conf_matrix=confusion_matrix(y_test,y_pred)
#Plot confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
#Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Dataset:

Input:
Output:
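With the fitted vectorizer, scaler, and model from above, an unseen headline can be scored the same way. A minimal hedged sketch (the sample string is invented for illustration; training concatenated several fields, so a real query would join the same fields):

#Classify a new, unseen headline (sample text is invented, not from the dataset)
new_text = ["Tech giant unveils new AI chip at annual conference"]
new_tfidf = vectorizer.transform(new_text) #Reuse the fitted TF-IDF vocabulary
new_scaled = scaler.transform(new_tfidf) #Apply the same scaling as in training
print("Predicted category:", svm_model.predict(new_scaled)[0])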
2. KNN Classification with Decision Boundary
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report

#Load Dataset
df=pd.read_csv("student_pass.csv")

#Split into features (X) and target (y)
X = df[['Hours_Studied','Sleep_Hours']] #Features
y = df['Exam_Score'].map({'Fail':0,'Pass':1}) #Target: encode Pass/Fail as 1/0

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#Train KNN model(K=3)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)

#Predict on test data
y_pred = knn.predict(X_test)
cm=confusion_matrix(y_test,y_pred)
print("Confusion Matrix:\n",cm)
print("\nClassification Report:\n",classification_report(y_test,y_pred))

#Plotting the decision boundary
plt.figure(figsize=(10,6))

#Create a mesh grid for the decision boundary
x_min, x_max = X["Hours_Studied"].min()-1, X["Hours_Studied"].max()+1
y_min, y_max = X["Sleep_Hours"].min()-1, X["Sleep_Hours"].max()+1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))

#Predict for each point in the grid (wrap in a DataFrame with the training column names to avoid a feature-name warning)
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=["Hours_Studied","Sleep_Hours"])
Z = knn.predict(grid)
Z = Z.reshape(xx.shape)

#Plot the decision boundary using contourf
plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')

#Scatter plot of training data
sns.scatterplot(x=X_train["Hours_Studied"], y=X_train["Sleep_Hours"], hue=y_train, palette={0:'red',1:'green'}, s=100, edgecolor='black')

#Scatter plot of test data
sns.scatterplot(x=X_test["Hours_Studied"], y=X_test["Sleep_Hours"], hue=y_pred, marker='s', palette={0:'orange',1:'blue'}, s=150, edgecolor='black')

#Labels and title
plt.xlabel("Hours Studied")
plt.ylabel("Sleep Hours")
plt.title("KNN Classification with Decision Boundary")
plt.legend(title="Legend", labels=["Fail (Train)","Pass (Train)","Fail (Test)","Pass (Test)"])
plt.grid(True)
plt.show()
Dataset:

Input:
Output:
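The choice of k shapes the boundary above. A small hedged sketch for comparing candidate k values with 5-fold cross-validation (the fold count and candidate list are arbitrary choices; X and y are the features and target defined earlier):

from sklearn.model_selection import cross_val_score

#Compare a few candidate k values by mean cross-validated accuracy
for k in [1, 3, 5, 7, 9]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
    print(f"k={k}: mean accuracy = {scores.mean():.3f}")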
3. Practice Questions
3(a)
Code:
import pandas as pd
#Creating a series from a list
data = [10,20,30,40,50]
series1 = pd.Series(data)
print(series1)

Input:

Output:
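By default the Series gets an integer index (0 through 4). A quick hedged sketch of supplying a custom label index (labels chosen arbitrarily):

series2 = pd.Series(data, index=['a','b','c','d','e']) #Label-based index
print(series2['c']) #Access by label, prints 30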

3(b)
Code:
#Creating a pandas dataframe
import pandas as pd
#Creating a dataframe from a dictionary
data={
'Name':['Alice','Bob','Charlie'],
'Age':[25,30,35],
'Salary':[50000,60000,70000]
}
df=pd.DataFrame(data)
print(df)
Input:
Output:

3(c)
Code:
#From a list of lists
data = [
['Alice',25,50000],
['Bob',30,60000],
['Charlie',35,70000]
]
df = pd.DataFrame(data,columns=['Name','Age','Salary'])
print(df)
Input:

Output:
3(d)
Code:
#missing values
import pandas as pd
import numpy as np
#creating a dataset with some missing values
data = {
'Name': ['Alice','Bob','Charlie','David','Emma'],
'Age': [25,np.nan,30,35,np.nan],
'Salary': [50000,60000,np.nan,80000,75000],
'Department': ['HR','IT',np.nan,'Finance','IT']
}
df = pd.DataFrame(data)
print("Original Dataset with Missing Values:")
print(df)
Input:

Output:
3(e)
Code:
print("Missing Values in Each Column:")
print(df.isnull().sum()) #count missing values in each column
Input:

Output:
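Beyond the per-column counts, it often helps to see which rows carry the gaps. A brief sketch using the df built in 3(d):

#Show only the rows that contain at least one missing value
print(df[df.isnull().any(axis=1)])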

3(f)
Code:
import pandas as pd
import numpy as np

#Fill missing Age with the mean age
df['Age'] = df['Age'].fillna(df['Age'].mean())

#Fill missing Salary with the median salary
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

#Fill missing Department with the most frequent value (mode)
df['Department'] = df['Department'].fillna(df['Department'].mode()[0])

print("Dataset after filling missing values:")
print(df)
Input:
Output:

3(g)
Code:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
#Min-max normalization
#sample data
data = np.array([[1,2],[3,4],[5,6],[7,8]])
#initialize the scaler
scaler = MinMaxScaler()
#fit and transform the data
print(data)
normalized_data = scaler.fit_transform(data)
print("Normalized Data (Min-Max Scaling)")
print(normalized_data)
Input:
Output:
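MinMaxScaler rescales each column to [0, 1] with (x - min) / (max - min). A quick hedged check of that formula against the scaler's output:

#Manual column-wise min-max normalization; should match normalized_data
manual = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
print(manual)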

3(h)
Code:
import pandas as pd
import numpy as np

#dictionary
data={
'Name':['Geek1','Geek2','Geek3','Geek4'],
'Salary':[18000,20000,15000,35000]
}
#Create a DataFrame
data = pd.DataFrame(data, columns=['Name','Salary'])

#Add a log2-transformed Salary column
data['logarithm_base2'] = np.log2(data['Salary'])

#Show the dataframe
print(data)
Input:

Output:
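np.log2 is undefined for zero or negative values, so a log transform suits strictly positive columns like Salary. Where zeros can occur, np.log1p (log of 1 + x) is a common alternative; a brief sketch with invented values:

#log1p maps 0 to 0 instead of -inf
values_with_zero = np.array([0, 15000, 35000])
print(np.log1p(values_with_zero))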

3(i)
Code:
import pandas as pd
import numpy as np

#sample dataset
data = [50,60,70,80,90,100]

#Convert to a Pandas DataFrame
df = pd.DataFrame(data, columns=['Values'])
#compute mean and standard deviation
mean = df['Values'].mean()
std_dev = df['Values'].std()

#Apply Z-score normalization
df['Z-Score'] = (df['Values']-mean)/std_dev

#Display the results
print(df)
Input:

Output:
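Note that pandas' .std() is the sample standard deviation (ddof=1), while scikit-learn's StandardScaler divides by the population version (ddof=0), so the two give slightly different z-scores on small datasets. A hedged comparison sketch:

from sklearn.preprocessing import StandardScaler

#StandardScaler standardizes with the population std (ddof=0), unlike df['Values'].std()
z_sklearn = StandardScaler().fit_transform(df[['Values']])
print(z_sklearn.ravel())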
4. Naïve Bayes Classification
Code:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#Load the weather dataset (read_csv already returns a DataFrame)
df = pd.read_csv("weather.csv")
#Encoding categorical features
label_enc = LabelEncoder()
df['Outlook'] = label_enc.fit_transform(df['Outlook']) #Convert 'Sunny','Rain' etc. to numbers
df['Wind'] = label_enc.fit_transform(df['Wind']) #Convert wind categories to numbers
df['Humidity'] = label_enc.fit_transform(df['Humidity']) #Convert humidity categories to numbers
df['Temperature'] = label_enc.fit_transform(df['Temperature']) #Encode Temperature from its own column
#Splitting features and target
X=df[['Outlook','Temperature','Humidity','Wind']]
y=df['PlayTennis']
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#Train Naive Bayes Classifier
model=GaussianNB()
model.fit(X_train,y_train)
#Predictions
y_pred=model.predict(X_test)
#Evaluate Model
print("Accuracy:",accuracy_score(y_test,y_pred))
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))
Dataset:

Input:
Output:
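After fitting, GaussianNB exposes its learned parameters, which can make the predictions easier to sanity-check. A brief sketch (var_ is the attribute name in recent scikit-learn releases; older ones call it sigma_):

#Per-class priors, feature means, and feature variances learned by the model
print("Class priors:", model.class_prior_)
print("Feature means per class:\n", model.theta_)
print("Feature variances per class:\n", model.var_)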
5. EM Model
Code:
#EM-Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score

#Load dataset
df = pd.read_csv("student_data.csv")
#Extract features (Math Score, Science Score)
X = df[["Math_Score","Science_Score"]].values
y_true = df["Category"].values #True labels(0 or 1)

#Standardize data for better clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#Apply Gaussian Mixture Model (EM algorithm)
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X_scaled)
y_pred = gmm.predict(X_scaled) #Predicted clusters
#Adjust cluster labels to match true labels (GMM cluster ids are arbitrary)
if np.mean(y_pred[y_true==1]) < np.mean(y_pred[y_true==0]):
    y_pred = 1 - y_pred #Swap labels if necessary
#Compute Accuracy & Confusion Matrix
accuracy = accuracy_score(y_true,y_pred)
conf_matrix = confusion_matrix(y_true,y_pred)
print("Accuracy:",accuracy)
print("Confusion Matrix:\n",conf_matrix)

#Plot the clusters
plt.figure(figsize=(8,6))
plt.scatter(X[:,0],X[:,1],c=y_pred,cmap='coolwarm',edgecolors='k',s=100)
plt.xlabel("Math Score")
plt.ylabel("Science Score")
plt.title("Student Clusters using EM(GMM)")
plt.colorbar(label="Cluster Label")
plt.show()
Dataset:
Input:
Output:
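Here n_components is fixed at 2 because the data has two known categories; when the count is unknown, information criteria can guide the choice. A hedged sketch using GaussianMixture's built-in BIC:

#Lower BIC suggests a better trade-off between fit and model complexity
for k in range(1, 6):
    gm = GaussianMixture(n_components=k, covariance_type='full', random_state=42).fit(X_scaled)
    print(f"components={k}: BIC = {gm.bic(X_scaled):.1f}")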
