Program 1
Develop a program to load a dataset and select one numerical column. Compute the mean,
median, mode, standard deviation, variance, and range of that column. Generate a histogram
and a boxplot to understand the distribution of the data, and identify any outliers using the
IQR method. Then select a categorical variable, compute the frequency of each category, and
display it as a bar chart or pie chart.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load dataset (Modify the file path or URL as needed)
file_path = "your_dataset.csv" # Update with the actual dataset path
df = pd.read_csv(file_path)
# Display first few rows
print("First 5 rows of the dataset:")
print(df.head())
# Select a numerical column
num_col = "your_numerical_column" # Replace with actual numerical column name
if num_col not in df.columns:
    raise ValueError(f"Column '{num_col}' not found in dataset")
# Compute statistics
mean_value = df[num_col].mean()
median_value = df[num_col].median()
mode_value = df[num_col].mode()[0] # Mode might return multiple values
std_dev = df[num_col].std()
variance = df[num_col].var()
data_range = df[num_col].max() - df[num_col].min()
# Print statistics
print("\nStatistical Measures for", num_col)
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Mode: {mode_value}")
print(f"Standard Deviation: {std_dev}")
print(f"Variance: {variance}")
print(f"Range: {data_range}")
# Plot Histogram
plt.figure(figsize=(6, 4))
sns.histplot(df[num_col], bins=20, kde=True)
plt.title(f"Histogram of {num_col}")
plt.xlabel(num_col)
plt.ylabel("Frequency")
plt.show()
# Plot Boxplot
plt.figure(figsize=(6, 4))
sns.boxplot(x=df[num_col])
plt.title(f"Boxplot of {num_col}")
plt.show()
# Detect Outliers using IQR
Q1 = df[num_col].quantile(0.25)
Q3 = df[num_col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df[num_col] < lower_bound) | (df[num_col] > upper_bound)]
print(f"\nNumber of Outliers in {num_col}: {len(outliers)}")
print(outliers)
# Select a categorical column
cat_col = "your_categorical_column" # Replace with actual categorical column name
if cat_col not in df.columns:
    raise ValueError(f"Column '{cat_col}' not found in dataset")
# Compute category frequency
category_counts = df[cat_col].value_counts()
# Plot Bar Chart
plt.figure(figsize=(6, 4))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title(f"Bar Chart of {cat_col}")
plt.xlabel(cat_col)
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()
# Plot Pie Chart
plt.figure(figsize=(6, 4))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.title(f"Pie Chart of {cat_col}")
plt.show()
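The column names above are placeholders. As one way to run the script end-to-end, the values below point it at the Iris CSV published in the seaborn-data GitHub repository (an assumed, publicly available file; any dataset with one numeric and one categorical column works):
# Hypothetical configuration (assumption: seaborn-data Iris CSV)
file_path = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
num_col = "sepal_length"  # numeric column in that file
cat_col = "species"       # categorical column in that file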
Program 2
Develop a program to load a dataset with at least two numerical columns (e.g., Iris, Titanic).
Plot a scatter plot of two variables and calculate their Pearson correlation coefficient.
Compute the covariance and correlation matrices for the dataset, and visualize the
correlation matrix as a heatmap to see which variables have strong positive or negative
correlations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load dataset (Update file path or dataset)
file_path = "your_dataset.csv" # Replace with actual dataset path
df = pd.read_csv(file_path)
# Display first few rows
print("First 5 rows of the dataset:")
print(df.head())
# Select two numerical columns for scatter plot and correlation
num_col1 = "your_numerical_column1" # Replace with actual column name
num_col2 = "your_numerical_column2" # Replace with actual column name
if num_col1 not in df.columns or num_col2 not in df.columns:
    raise ValueError(f"Columns '{num_col1}' or '{num_col2}' not found in dataset")
# Scatter plot
plt.figure(figsize=(6, 4))
sns.scatterplot(x=df[num_col1], y=df[num_col2])
plt.title(f"Scatter Plot: {num_col1} vs {num_col2}")
plt.xlabel(num_col1)
plt.ylabel(num_col2)
plt.show()
# Compute Pearson correlation coefficient
pearson_corr = df[num_col1].corr(df[num_col2])
print(f"\nPearson Correlation Coefficient between {num_col1} and {num_col2}:
{pearson_corr:.4f}")
# Compute Covariance Matrix
cov_matrix = df[[num_col1, num_col2]].cov()
print("\nCovariance Matrix:")
print(cov_matrix)
# Compute Correlation Matrix
corr_matrix = df.corr(numeric_only=True)  # restrict to numeric columns; non-numeric columns would raise an error
print("\nCorrelation Matrix:")
print(corr_matrix)
# Heatmap of Correlation Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix Heatmap")
plt.show()
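For reference, the Pearson coefficient computed by .corr() is the covariance of the two columns normalized by both standard deviations:

r_{xy} = \frac{\operatorname{cov}(x, y)}{\sigma_x \sigma_y} = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2}\,\sqrt{\sum_i (y_i - \bar{y})^2}}

Values near +1 or -1 indicate a strong linear relationship; values near 0 indicate little linear association.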
Program 3
Develop a program to implement Principal Component Analysis (PCA) for reducing the
dimensionality of the Iris dataset from 4 features to 2.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
# Load the Iris dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target
# Standardize the data (PCA is affected by scale)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.iloc[:, :-1]) # Exclude the species column
# Apply PCA to reduce to 2 dimensions
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
# Create a new DataFrame with PCA components
pca_df = pd.DataFrame(pca_data, columns=['PC1', 'PC2'])
pca_df['species'] = iris.target_names[iris.target]  # use species names (not numeric codes) so the legend is labeled correctly
# Scatter plot of PCA results
plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=pca_df['species'], palette='coolwarm', alpha=0.7)
plt.title('PCA of Iris Dataset (4D → 2D)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Species')
plt.show()
# Explained variance ratio
explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance by PC1: {explained_variance[0]:.4f}")
print(f"Explained Variance by PC2: {explained_variance[1]:.4f}")
print(f"Total Variance Explained: {sum(explained_variance):.4f}")
Program 4
Develop a program to load the Iris dataset. Implement the k-Nearest Neighbors (k-NN)
algorithm for classifying flowers based on their features. Split the dataset into training and
testing sets and evaluate the model using metrics like accuracy and F1-score. Test it for
different values of k (e.g., k=1, 3, 5) and evaluate the accuracy. Extend the k-NN algorithm to
assign weights based on the distance of neighbors (e.g., weight = 1/d²). Compare the
performance of weighted k-NN and regular k-NN on a synthetic or real-world dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
# Load the Iris dataset
iris = load_iris()
X = iris.data # Features
y = iris.target # Labels
# Split dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Standardize the dataset (important for distance-based models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Function to evaluate k-NN with different k values
def evaluate_knn(k_values, weighted=False):
    results = []
    for k in k_values:
        if weighted:
            # Weighted k-NN: weight = 1/d^2 (epsilon avoids division by zero)
            knn = KNeighborsClassifier(n_neighbors=k, weights=lambda d: 1 / (d**2 + 1e-5))
        else:
            knn = KNeighborsClassifier(n_neighbors=k, weights="uniform")  # Regular k-NN
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        results.append((k, accuracy, f1))
    return results
# Test different k values
k_values = [1, 3, 5]
knn_results = evaluate_knn(k_values, weighted=False)
weighted_knn_results = evaluate_knn(k_values, weighted=True)
# Convert results to DataFrame
knn_df = pd.DataFrame(knn_results, columns=['k', 'Accuracy', 'F1-Score'])
weighted_knn_df = pd.DataFrame(weighted_knn_results, columns=['k', 'Accuracy', 'F1-Score'])
# Print results
print("\nRegular k-NN Performance:")
print(knn_df)
print("\nWeighted k-NN Performance:")
print(weighted_knn_df)
# Plot comparison
plt.figure(figsize=(8, 5))
plt.plot(knn_df['k'], knn_df['Accuracy'], marker='o', label='Regular k-NN')
plt.plot(weighted_knn_df['k'], weighted_knn_df['Accuracy'], marker='s', linestyle='dashed', label='Weighted k-NN')
plt.xlabel("k (Number of Neighbors)")
plt.ylabel("Accuracy")
plt.title("k-NN vs. Weighted k-NN Performance")
plt.legend()
plt.show()
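For reference, the lambda above replaces k-NN's uniform majority vote with a distance-weighted vote (the 1e-5 term guards against division by zero when a test point coincides with a neighbor):

\hat{y}(x) = \arg\max_{c} \sum_{i \in N_k(x)} \frac{\mathbf{1}[y_i = c]}{d(x, x_i)^2}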
Program 5
Implement the non-parametric Locally Weighted Regression (LWR) algorithm to fit data
points. Select an appropriate dataset for your experiment and draw graphs.
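For reference, LWR solves a separate weighted least-squares problem at every query point x, with Gaussian weights controlled by the bandwidth \tau:

\hat{\theta}(x) = (X^{\top} W(x)\, X)^{-1} X^{\top} W(x)\, y, \qquad w_i(x) = \exp\!\left(-\frac{\lVert x_i - x \rVert^2}{2\tau^2}\right)

and predicts \hat{y}(x) = [1, x]\,\hat{\theta}(x). The program below implements this directly, using the pseudo-inverse for numerical stability.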
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
# Generate synthetic dataset (Non-linear function)
np.random.seed(42)
X = np.linspace(-3, 3, 100)
y = np.sin(X) + np.random.normal(scale=0.1, size=len(X)) # True function + noise
# Reshape for matrix operations
X = X.reshape(-1, 1)
# Gaussian Kernel for Weights
def get_weights(X_train, x_query, tau):
    distances = cdist(X_train, x_query.reshape(1, -1), metric='euclidean')
    weights = np.exp(-(distances**2) / (2 * tau**2))  # Gaussian kernel
    return np.diag(weights.flatten())
# Locally Weighted Regression function
def locally_weighted_regression(X_train, y_train, x_query, tau):
    W = get_weights(X_train, x_query, tau)
    X_bias = np.c_[np.ones(X_train.shape[0]), X_train]  # Add bias term
    theta = np.linalg.pinv(X_bias.T @ W @ X_bias) @ X_bias.T @ W @ y_train
    return np.r_[1.0, x_query] @ theta  # Prediction for x_query (bias term prepended)
# Fit LWR on the dataset for multiple query points
tau_values = [0.1, 0.5, 1.0] # Different bandwidth values
plt.figure(figsize=(10, 6))
for tau in tau_values:
    y_pred = np.array([locally_weighted_regression(X, y, x, tau) for x in X])
    plt.plot(X, y_pred, label=f"LWR (τ={tau})")
# Plot original data
plt.scatter(X, y, color='black', label='Data Points', alpha=0.6)
plt.title("Locally Weighted Regression (LWR) for Different τ")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.show()
Program 6
Develop a program to demonstrate the working of Linear Regression and Polynomial
Regression. Use the Boston Housing dataset for Linear Regression and the Auto MPG dataset
(vehicle fuel-efficiency prediction) for Polynomial Regression.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Load the Boston Housing dataset
boston = fetch_openml(name='boston', version=1, as_frame=True)
df = boston.frame
# Display dataset information
print(df.info())
print(df.describe())
# Define features and target variable
X = df.drop(columns='MEDV').astype(float)  # cast category-typed columns (e.g., CHAS, RAD) to numeric
y = df['MEDV']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = lr_model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")
# Plotting Actual vs Predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, color='b')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Actual vs Predicted MEDV')
plt.show()
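The listing above covers only the linear-regression half of the task. Below is a minimal sketch of the polynomial-regression half, assuming seaborn's bundled Auto MPG dataset (the equivalent OpenML 'autoMpg' dataset would work the same way) and a degree-2 fit of mpg against horsepower:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
# Load Auto MPG (assumption: seaborn's bundled copy) and drop rows with missing horsepower
mpg = sns.load_dataset('mpg').dropna(subset=['horsepower'])
X_mpg = mpg[['horsepower']]
y_mpg = mpg['mpg']
X_tr, X_te, y_tr, y_te = train_test_split(X_mpg, y_mpg, test_size=0.2, random_state=42)
# Degree-2 polynomial regression: expand features, then fit ordinary least squares
poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_tr, y_tr)
y_pred_poly = poly_model.predict(X_te)
print(f"Polynomial MSE: {mean_squared_error(y_te, y_pred_poly):.2f}")
print(f"Polynomial R^2: {r2_score(y_te, y_pred_poly):.2f}")
# Plot the fitted curve over the raw data
hp_grid = pd.DataFrame({'horsepower': np.linspace(X_mpg['horsepower'].min(), X_mpg['horsepower'].max(), 200)})
plt.figure(figsize=(8, 5))
plt.scatter(mpg['horsepower'], y_mpg, alpha=0.4, label='Data')
plt.plot(hp_grid['horsepower'], poly_model.predict(hp_grid), color='r', lw=2, label='Degree-2 fit')
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Polynomial Regression on Auto MPG')
plt.legend()
plt.show()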
Program 7
Develop a program to load the Titanic dataset. Split the data into training and test sets. Train
a decision tree classifier. Visualize the tree structure. Evaluate accuracy, precision, recall, and
F1-score.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
# Display dataset info
print(df.info())
# Select relevant features & preprocess data
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()  # Relevant columns
df.dropna(inplace=True)  # Drop rows with missing values
# Encode categorical variables
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])  # Convert 'Sex' to 0 (female) & 1 (male)
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])  # Encode 'Embarked' categories
# Define features & target variable
X = df.drop(columns='Survived')
y = df['Survived']
# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Train Decision Tree model
dt_model = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_model.fit(X_train, y_train)
# Predictions
y_pred = dt_model.predict(X_test)
# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
# Visualize the Decision Tree
plt.figure(figsize=(15, 8))
plot_tree(dt_model, feature_names=X.columns, class_names=['Died', 'Survived'], filled=True)
plt.title("Decision Tree for Titanic Survival Prediction")
plt.show()
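Note that LabelEncoder imposes an arbitrary numeric ordering on 'Embarked', which a decision tree tolerates but linear models generally do not. A sketch of an alternative preprocessing using one-hot encoding (pd.get_dummies); the resulting X and y can be fed to the same split/train/evaluate steps as above:
import pandas as pd
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
raw = pd.read_csv(url)[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].dropna()
# One indicator column per category; drop_first removes redundant (collinear) columns
raw = pd.get_dummies(raw, columns=['Sex', 'Embarked'], drop_first=True)
X = raw.drop(columns='Survived')
y = raw['Survived']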
Program 8
Develop a program to implement the Naive Bayes classifier using the Iris dataset for
training. Compute the accuracy of the classifier on the test data.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data # Features
y = iris.target # Labels
# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Train Naïve Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
# Predict test data
y_pred = nb_model.predict(X_test)
# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()
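For reference, GaussianNB models each of the four features within each class as an independent univariate normal, with per-class means \mu_{c,j} and variances \sigma_{c,j}^2 estimated from the training split, and predicts the class with the largest posterior:

\hat{y} = \arg\max_{c}\; P(c) \prod_{j=1}^{4} \frac{1}{\sqrt{2\pi\sigma_{c,j}^{2}}} \exp\!\left(-\frac{(x_j - \mu_{c,j})^{2}}{2\sigma_{c,j}^{2}}\right)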
Program 9
Develop a program to implement k-means clustering using the Wisconsin Breast Cancer
dataset and visualize the clustering result.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load the Wisconsin Breast Cancer dataset
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# Standardize the dataset (important for K-Means)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
# Apply K-Means clustering (2 clusters since we have benign & malignant)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)  # n_init set explicitly for consistent behavior across sklearn versions
df['Cluster'] = kmeans.fit_predict(df_scaled)
# Visualize clusters using PCA (reduce to 2D)
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)
df['PCA1'] = df_pca[:, 0]
df['PCA2'] = df_pca[:, 1]
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['PCA1'], y=df['PCA2'], hue=df['Cluster'], palette='coolwarm', alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('K-Means Clustering on Breast Cancer Dataset')
plt.legend(title="Cluster")
plt.show()
# Compare with actual labels
print(pd.crosstab(cancer.target, df['Cluster'], rownames=['Actual'], colnames=['Cluster']))
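Because k-means cluster IDs are arbitrary (cluster 0 need not correspond to malignant), a sensible agreement score first maps each cluster to its majority true class; a minimal sketch, assuming it runs in the same session as the program above:
# Map each cluster to its majority true class, then measure agreement
labels = df['Cluster'].to_numpy()
mapping = {c: np.bincount(cancer.target[labels == c]).argmax() for c in np.unique(labels)}
aligned = np.array([mapping[c] for c in labels])
print(f"Agreement with true labels: {(aligned == cancer.target).mean():.4f}")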