
1) Develop a program to create histograms for all numerical features and analyze the distribution of each feature. Generate box plots for all numerical features and identify any outliers. Use the California Housing dataset.
a)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset into a DataFrame
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Histogram of a single feature
plt.figure(figsize=(8, 6))
plt.hist(df['HouseAge'], bins=20, edgecolor='black')
plt.title("Histogram of House Age")
plt.xlabel('House Age')
plt.ylabel('Frequency')
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
print("Column names")
print(df.columns)
print("\nFirst few rows of the dataset")
print(df.head())

# Histograms for all columns (8 features + target fit a 3x4 grid)
plt.figure(figsize=(14, 10))
for i, column in enumerate(df.columns, 1):
    plt.subplot(3, 4, i)
    plt.hist(df[column], bins=20, edgecolor='black')
    plt.title(f"Histogram of {column}")
    plt.xlabel(column)
    plt.ylabel('Frequency')
plt.suptitle("Histograms for all features", fontsize=16)
plt.tight_layout()
plt.show()

b)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Box plot of a single feature
plt.figure(figsize=(8, 6))
plt.boxplot(df['HouseAge'], vert=False)
plt.title("Boxplot of House Age")
plt.xlabel('House Age')
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Box plots for all features (excluding the target column)
plt.figure(figsize=(14, 10))
for i, column in enumerate(df.columns[:-1], 1):
    plt.subplot(3, 4, i)
    plt.boxplot(df[column], vert=False)
    plt.title(f"Boxplot of {column}")
    plt.xlabel(column)
plt.suptitle("Boxplots for all features", fontsize=16)
plt.tight_layout()
plt.show()

# Identify outliers with the 1.5 * IQR rule
for column in df.columns[:-1]:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    print(f"Outliers for {column}:")
    print(f"Lower bound: {lower_bound}")
    print(f"Upper bound: {upper_bound}")
    print(f"Number of outliers below lower bound: {(df[column] < lower_bound).sum()}")
    print(f"Number of outliers above upper bound: {(df[column] > upper_bound).sum()}")
    print()
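If the actual outlier rows are needed rather than just their counts, the same IQR bounds can be turned into a boolean mask and used to filter the DataFrame. A minimal sketch, shown for the AveRooms column (the column name is only an illustrative choice; any feature above works the same way):

# Sketch: extract the outlier rows themselves for one column
column = 'AveRooms'
Q1, Q3 = df[column].quantile(0.25), df[column].quantile(0.75)
IQR = Q3 - Q1
mask = (df[column] < Q1 - 1.5 * IQR) | (df[column] > Q3 + 1.5 * IQR)
outliers = df[mask]
print(f"{len(outliers)} outlier rows for {column}")
print(outliers[[column]].head())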
--------------------------------------------------------------------------------

2) Develop a program to compute the correlation matrix to understand the relationships between pairs of features. Visualize the correlation matrix using a heatmap to know which variables have strong positive/negative correlations. Create a pair plot to visualize pairwise relationships between features. Use the California Housing dataset.
a)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
print(f"Rows and Columns of the dataset: {df.shape}")
print(df.head())

# Correlation matrix of a single feature (trivially 1.0 with itself)
feature = 'AveRooms'
correlation_matrix = df[[feature]].corr()
plt.figure(figsize=(6, 5))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, center=0)
plt.title(f"Correlation Matrix Heatmap for {feature}", fontsize=16)
plt.show()

# Pair plot of the feature against the target
pairplot = sns.pairplot(df[[feature, 'target']], plot_kws={'alpha': 0.5})
plt.suptitle(f"Pairwise Relationship of {feature} with Target", fontsize=16)
plt.show()

b)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
print(df.head())

# Full correlation matrix as a heatmap
correlation_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, center=0)
plt.title("Correlation Matrix Heatmap", fontsize=16)
plt.show()

# Pair plot for a subset of features to keep the grid readable
selected_features = ['AveRooms', 'AveOccup', 'HouseAge', 'MedInc', 'target']
pairplot = sns.pairplot(df[selected_features], corner=True, plot_kws={'alpha': 0.5})
plt.suptitle("Pairwise Relationships Between Features", fontsize=16)
plt.show()
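To read off the strongest relationships numerically instead of scanning the heatmap, the target column of the correlation matrix can be sorted; a short sketch:

# Sketch: rank features by their correlation with the target
print(correlation_matrix['target'].drop('target').sort_values(ascending=False))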

--------------------------------------------------------------------------------

3) Develop a program to implement Principal Component Analysis (PCA) for reducing the dimensionality of the Iris dataset from 4 features to 2.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

iris = load_iris()
x = iris.data
y = iris.target
iris_df = pd.DataFrame(x, columns=iris.feature_names)
iris_df['Species'] = iris.target_names[y]
print("Original Iris Dataset:")
print(iris_df.head())

# Scatter plot of the first two original features
plt.figure(figsize=(10, 6))
plt.scatter(x[:, 0], x[:, 1], c=y, cmap='viridis', marker='o', edgecolor='k', s=100)
plt.title("Scatter plot of first two features of the Iris dataset")
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.colorbar(label="Class Labels")
plt.show()

# Standardize, then project the 4 features down to 2 principal components
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_scaled)
pca_df = pd.DataFrame(x_pca, columns=['PC1', 'PC2'])
pca_df['Species'] = iris.target_names[y]
print("\nPCA Transformed Dataset:")
print(pca_df.head())

plt.figure(figsize=(10, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c=y, cmap='viridis', marker='o', edgecolor='k', s=100)
plt.title("2D Scatter plot after PCA - Iris Dataset")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.colorbar(label="Class Labels")
plt.show()
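How much of the original variance the 2-D projection retains can be read directly off the fitted PCA object; a short check:

# Fraction of total variance captured by each principal component
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())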

--------------------------------------------------------------------------------

4) For a given set of training data examples stored in a .CSV file, implement and demonstrate the Find-S algorithm to output a description of the set of all hypotheses consistent with the training examples.
import csv

num_attributes = 6
a = []
print("\nThe given Training Data Set\n")
file_path = r'C:\Users\user\Desktop\training_data.csv'  # path to the training CSV; adjust as needed
with open(file_path, 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row:
            a.append(row)
            print(row)

if not a:
    print("The dataset is empty.")
else:
    print(f"\nTotal instances in the dataset: {len(a)}")
    print("\nThe initial value of hypothesis:")
    hypothesis = ['0'] * num_attributes
    print(hypothesis)
    # Initialize the hypothesis from the first training instance
    if len(a) > 0 and len(a[0]) >= num_attributes:
        for j in range(num_attributes):
            hypothesis[j] = a[0][j]
    else:
        print(f"Error: Data in the file doesn't have {num_attributes} attributes in the first row.")
    print("\nFind-S: Finding a maximally specific hypothesis\n")
    # Generalize the hypothesis over every positive example
    for i in range(len(a)):
        if a[i][num_attributes] == 'yes':
            for j in range(num_attributes):
                if a[i][j] != hypothesis[j]:
                    hypothesis[j] = '?'
                else:
                    hypothesis[j] = a[i][j]
        print("For Training instance No: {0} the hypothesis is: {1}".format(i + 1, hypothesis))
    print("\nThe Maximally specific hypothesis for the given training examples:\n")
    print(hypothesis)
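The program expects a CSV file with six attribute columns followed by a yes/no label. A hypothetical example of the file contents, modeled on the classic EnjoySport data (any six-attribute dataset with a yes/no target column would work):

sunny,warm,normal,strong,warm,same,yes
sunny,warm,high,strong,warm,same,yes
rainy,cold,high,strong,warm,change,no
sunny,warm,high,strong,cool,change,yes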

--------------------------------------------------------------------------------
5) Develop a program to implement the k-Nearest Neighbour algorithm to classify 100 randomly generated values of x in the range [0,1]. Perform the following based on the dataset generated.

a) Label the first 50 points {x1, …, x50} as follows: if (xi ≤ 0.5), then xi ∊ Class1, else xi ∊ Class2
b) Classify the remaining points, x51, …, x100, using k-NN. Perform this for k = 1, 2, 3, 4, 5, 20, 30

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

np.random.seed(0)
X = np.random.rand(100, 1)
# Labels follow the rule x <= 0.5 -> Class 1, else Class 2;
# the labels of the last 50 points are kept only to evaluate the classifier
y = np.array([1 if x <= 0.5 else 2 for x in X[:50]])
y = np.append(y, [1 if x <= 0.5 else 2 for x in X[50:]])
x_train, x_test = X[:50], X[50:]
y_train, y_test = y[:50], y[50:]

def perform_knn(x_train, y_train, x_test, y_test, k_values):
    results = {}
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[k] = accuracy
        plt.figure(figsize=(10, 6))
        plt.scatter(x_train, y_train, color='red', label='Training data')
        plt.scatter(x_test, y_test, color='blue', label='Test data')
        # Predict over a dense grid to show the decision boundary
        x_range = np.linspace(0, 1, 1000).reshape(-1, 1)
        y_range = knn.predict(x_range)
        plt.plot(x_range, y_range, color='green', linestyle='--', label=f'k={k} decision boundary')
        plt.title(f'k-NN Classification (k={k})')
        plt.xlabel('X')
        plt.ylabel('Class')
        plt.legend()
        plt.show()
    return results

k_values = [1, 2, 3, 4, 5, 20, 30]
results = perform_knn(x_train, y_train, x_test, y_test, k_values)
print("Accuracy results for different k values:")
for k, accuracy in results.items():
    print(f"k={k}: Accuracy={accuracy:.2f}")

--------------------------------------------------------------------------------

6) Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an appropriate dataset for your experiment and draw graphs.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
X = np.sort(5 * np.random.rand(80, 1), axis=0)
y = np.sin(X).ravel() + 0.1 * np.random.randn(80)

def locally_weighted_regression(X, y, tau=0.1):
    m = len(X)
    predicted_y = np.zeros(m)
    X_ = np.hstack([np.ones((m, 1)), X])  # add a bias column
    for i in range(m):
        # Gaussian kernel weights centred on the query point X[i]
        weights = np.exp(-((X - X[i]) ** 2) / (2 * tau ** 2)).ravel()
        W = np.diag(weights)
        # Weighted least squares: theta = (X^T W X)^-1 X^T W y
        theta = np.linalg.pinv(X_.T @ W @ X_) @ X_.T @ W @ y
        predicted_y[i] = np.array([1, X[i, 0]]) @ theta
    return predicted_y

tau = 0.2
predicted_y = locally_weighted_regression(X, y, tau)
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, predicted_y, color='red', label='LWR Fit')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.title('Locally Weighted Regression')
plt.show()
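The bandwidth tau controls how local the fit is: a small tau chases the noise, while a large tau approaches an ordinary straight-line fit. A quick sketch comparing a few bandwidths on the same data (the tau values are illustrative):

# Sketch: effect of the bandwidth tau on the fit
plt.scatter(X, y, color='lightgray', label='Data')
for t in [0.05, 0.2, 1.0]:
    plt.plot(X, locally_weighted_regression(X, y, tau=t), label=f'tau={t}')
plt.legend()
plt.title('LWR fits for different bandwidths')
plt.show()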

--------------------------------------------------------------------------------

7) Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use the Boston Housing Dataset for Linear Regression and the Auto MPG Dataset (for vehicle fuel efficiency prediction) for Polynomial Regression.
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# California Housing stands in for Boston Housing, which was removed from scikit-learn
data = fetch_california_housing(as_frame=True)
X = data.data[['AveRooms']]
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression on a single feature
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

# Polynomial regression (degree 3) on the same feature
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_train)
poly_reg = LinearRegression()
poly_reg.fit(X_poly, y_train)
y_pred_poly = poly_reg.predict(poly.transform(X_test))

# Predictions drawn as points, since X_test is unsorted
plt.subplot(1, 2, 1)
plt.scatter(X_test, y_test, color='blue')
plt.scatter(X_test, y_pred, color='red')
plt.title('Linear Regression')
plt.subplot(1, 2, 2)
plt.scatter(X_test, y_test, color='blue')
plt.scatter(X_test, y_pred_poly, color='green')
plt.title('Polynomial Regression')
plt.tight_layout()
plt.show()
print(f"Linear Regression - MSE: {mean_squared_error(y_test, y_pred):.4f}")
print(f"Polynomial Regression - MSE: {mean_squared_error(y_test, y_pred_poly):.4f}")
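Because X_test is unsorted, the cubic predictions above are drawn as points; to draw the fitted polynomial as a smooth curve, the model can be evaluated on a sorted grid instead. A sketch under that assumption:

# Sketch: draw the fitted cubic as a smooth curve over a sorted grid
grid = pd.DataFrame(np.linspace(X_test['AveRooms'].min(), X_test['AveRooms'].max(), 200), columns=['AveRooms'])
plt.scatter(X_test, y_test, color='blue', alpha=0.3)
plt.plot(grid, poly_reg.predict(poly.transform(grid)), color='green')
plt.title('Polynomial fit on a sorted grid')
plt.show()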

--------------------------------------------------------------------------------
8) Develop a program to demonstrate the working of the decision tree algorithm. Use the Breast Cancer dataset for building the decision tree and apply this knowledge to classify a new sample.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn import tree

data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names, class_names=data.target_names, rounded=True)
plt.title("Decision Tree for Breast Cancer Classification")
plt.show()

# Classify a new sample (30 feature values, in the same order as data.feature_names)
new_sample = np.array([[14.2, 22.4, 104.0, 723.0, 0.09, 0.07, 0.076, 0.105, 0.188, 0.084,
                        0.238, 0.013, 0.013, 0.035, 0.08, 0.070, 0.061, 0.020, 0.040, 0.033,
                        0.012, 0.016, 0.004, 0.012, 0.027, 0.018, 0.050, 0.022, 0.032, 0.035]])
assert new_sample.shape[1] == X_train.shape[1], f"Expected 30 features, but got {new_sample.shape[1]} features."
prediction = clf.predict(new_sample)
if prediction[0] == 0:
    print("The new sample is classified as: Malignant")
else:
    print("The new sample is classified as: Benign")

--------------------------------------------------------------------------------

9) Develop a program to implement the Naive Bayesian classifier considering the Olivetti Face Data set for training. Compute the accuracy of the classifier, considering a few test data sets.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Naive Bayes Classifier: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

# Show a few test faces with predicted vs. true identities
fig, axes = plt.subplots(1, 5, figsize=(12, 3))
for i, ax in enumerate(axes):
    ax.imshow(X_test[i].reshape(64, 64), cmap='gray')
    ax.set_title(f"Pred: {y_pred[i]}\nTrue: {y_test[i]}")
    ax.axis('off')
plt.show()
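Since the exercise asks for accuracy over a few test sets, k-fold cross-validation gives a more robust estimate than the single split above; a minimal sketch:

# Sketch: 5-fold cross-validated accuracy for the same classifier
from sklearn.model_selection import cross_val_score
scores = cross_val_score(GaussianNB(), X, y, cv=5)
print(f"Cross-validated accuracy: {scores.mean() * 100:.2f}% (+/- {scores.std() * 100:.2f}%)")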

--------------------------------------------------------------------------------
10) Develop a program to implement k-means clustering using Wisconsin Breast
Cancer data set and visualize the clustering result
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

data = load_breast_cancer()
X = data.data
y = data.target

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scaled)

# Scatter of the first two scaled features, colored by the true labels
plt.figure(figsize=(8, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap='coolwarm', alpha=0.5)
plt.title("True Labels")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.colorbar(label="True Labels")
plt.show()

# Same scatter, colored by the k-means cluster assignments
plt.figure(figsize=(8, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans.labels_, cmap='coolwarm', alpha=0.5)
plt.title("KMeans Clustering Results")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.colorbar(label="Cluster Labels")
plt.show()

centroids = kmeans.cluster_centers_
print(f"Centroids of the clusters in the scaled feature space:\n{centroids}")
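The first two raw features are an arbitrary 2-D view of 30-dimensional data; projecting onto the first two principal components usually separates the clusters more clearly. A sketch of that alternative visualization:

# Sketch: visualize the same clusters in a PCA projection
from sklearn.decomposition import PCA
X_2d = PCA(n_components=2).fit_transform(X_scaled)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=kmeans.labels_, cmap='coolwarm', alpha=0.5)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("K-means clusters in the PCA projection")
plt.show()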
--------------------------------------------------------------------------------
