0% found this document useful (0 votes)
3 views8 pages

Solution

The document provides a comprehensive guide on data analysis and visualization using Python libraries such as Matplotlib, Pandas, and Seaborn. It includes various coding examples covering topics like plotting rainfall data, data manipulation with DataFrames, statistical analysis, and generating visualizations like heatmaps and boxplots. Additionally, it discusses employee salary data analysis and categorization of ages using bins.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
3 views8 pages

Solution

The document provides a comprehensive guide on data analysis and visualization using Python libraries such as Matplotlib, Pandas, and Seaborn. It includes various coding examples covering topics like plotting rainfall data, data manipulation with DataFrames, statistical analysis, and generating visualizations like heatmaps and boxplots. Additionally, it discusses employee salary data analysis and categorization of ages using bins.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 8

# Solutions for Data Analysis and Visualization (UPC: 2343012002)

S.No. 1673

# SECTION A

# Q1 (a)

import matplotlib.pyplot as plt

# Rainfall readings and the day on which each reading was taken.
# NOTE(review): day 1 appears twice in `days`; confirm against the question paper.
rainfall = [5, 2, 7, 8, 2]
days = [1, 3, 5, 1, 9]

# Draw on the current axes: red circle markers ('ro'), no connecting line.
ax = plt.gca()
ax.plot(days, rainfall, 'ro', markersize=10)

ax.set_title("Rainfall over Days")
ax.set_xlabel("Days")
ax.set_ylabel("Rainfall")

plt.show()

# Q1 (b)

import numpy as np
import pandas as pd

# Employee names (with repeats) and their ages.
company = pd.DataFrame({
    'Name': ['Sangeeta', 'Sarika', 'Sangeeta', 'Babita', 'Sarika'],
    'Age': [18, 30, 45, 32, 25],
})

# (i) Distinct names, in order of first appearance.
# Bound and printed explicitly: a bare expression shows nothing when this
# file is run as a script rather than in a notebook cell.
unique_names = company['Name'].unique()
print(unique_names)

# (ii) Mean age for each name.
mean_age_by_name = company.groupby('Name')['Age'].mean()
print(mean_age_by_name)

# Q1 (c)

# Two class sections, each mapping a roll number to a student name.
section1 = pd.DataFrame({'RollNo': [1, 2, 3, 4], 'Name': ['Abhav', 'Vihaan', 'Chitra', 'Devansh']})
section2 = pd.DataFrame({'RollNo': [1, 5, 3, 2], 'Name': ['Roni', 'Kabeer', 'Ishani', 'Vihaan']})

# (i)
print(section1)

# (ii) Inner join on Name only. Both frames carry a RollNo column, so the
# result has RollNo_x (from section2, the left frame) and RollNo_y (from section1).
merged = pd.merge(section2, section1, on='Name', how='inner')
print(merged)

# (iii) Rows that match on both Name and RollNo (default inner join).
common = pd.merge(section1, section2, on=['Name', 'RollNo'])
print(common)

# Q1 (d)

# 2x3 array of float zeros. The name is `a1` (digit one) so that the later
# np.append(a1, ...) refers to this array.
a1 = np.zeros((2, 3))
# [[0. 0. 0.]
#  [0. 0. 0.]]

a2 = [[3, 4, 5], [7, 8, 9]]

# Element-wise sum; the nested list a2 is converted to an array by np.add.
print(np.add(a1, a2))
# [[3. 4. 5.]
#  [7. 8. 9.]]

# axis=0 stacks the rows of a2 beneath the rows of a1.
a1 = np.append(a1, a2, axis=0)
print(a1)
# [[0. 0. 0.]
#  [0. 0. 0.]
#  [3. 4. 5.]
#  [7. 8. 9.]]

print('shape of array', a1.shape)
# shape of array (4, 3)

# Q1 (e)

empSalary = np.array([4000, 5200, 6100, 7000, 4900, 8000, 3000, 9200, 6300, 4800])

# (i) Number of employees whose salary is above 5000.
# The boolean mask selects the matching salaries; the count is bound and
# printed so the answer is visible when run as a script.
high_earner_count = len(empSalary[empSalary > 5000])
print("Employees earning more than 5000:", high_earner_count)

# (ii) Incentive of 10% of each salary.
incentive = empSalary * 0.1
print("Incentives:", incentive)

# Q1 (f)

# 4x3 frame with progressively more missing values per row.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = pd.DataFrame([
    [2, 4, 6],
    [np.nan, 8, 10],
    [np.nan, 12, np.nan],
    [np.nan, np.nan, np.nan],
])

print(data)
#      0     1     2
# 0  2.0   4.0   6.0
# 1  NaN   8.0  10.0
# 2  NaN  12.0   NaN
# 3  NaN   NaN   NaN

# thresh=2 keeps only rows that have at least two non-NaN values.
print(data.dropna(thresh=2))
#      0    1     2
# 0  2.0  4.0   6.0
# 1  NaN  8.0  10.0

# Forward-fill down each column, filling at most 2 consecutive NaNs.
# DataFrame.ffill replaces the deprecated fillna(method="ffill").
print(data.ffill(limit=2))
#      0     1     2
# 0  2.0   4.0   6.0
# 1  2.0   8.0  10.0
# 2  2.0  12.0  10.0
# 3  NaN  12.0  10.0

# SECTION B

# Q2 (a)

# Two-level row labels (region, sequence number) and two-level column
# labels (city, colour) for a 4x3 grid holding the integers 0..11.
row_levels = [['North', 'North', 'South', 'South'], [1, 2, 1, 2]]
column_levels = [['Delhi', 'Delhi', 'Chandigarh'], ['Green', 'Red', 'Green']]
grid = np.arange(12).reshape(4, 3)

df = pd.DataFrame(grid, index=row_levels, columns=column_levels)
df.index.names = ['key1', 'key2']
print(df)

# Exchange the two row-index levels (positions 0 and 1 are key1 and key2).
df1 = df.swaplevel(0, 1)
print(df1)

# After the swap the outermost level is key2; sort the rows by it.
df2 = df1.sort_index(level='key2')
print(df2)

# Q2 (b)

# 2x3 array of random integer marks drawn from 60..100 (upper bound 101 is exclusive).
markSheet = np.random.randint(low=60, high=101, size=(2, 3))
print(markSheet)

# Report the array's element type, shape and number of dimensions.
for caption, attribute in (
    ("Datatype:", markSheet.dtype),
    ("Shape:", markSheet.shape),
    ("Dimension:", markSheet.ndim),
):
    print(caption, attribute)

# Q2 (c)
itemRate = pd.DataFrame({'Item': ['Apples', 'Oranges'], 'Rate': [220, 90]})

# Double every rate.
itemRate['Rate'] = itemRate['Rate'] * 2
print(itemRate)

# Row label of the smallest rate, then the full row at that label.
cheapest_label = itemRate['Rate'].idxmin()
print("Item with Min Rate:", itemRate.loc[cheapest_label])

# Q3

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

# DataFrame

# Per-student study hours and marks.
data = {
    'Name': ['Mohan', 'Sohan', 'Jeevan', 'Gita', 'Meenu', 'Gopal', 'Rajeev'],
    'Hours_studied': [2.5, 4.0, 6.0, 8.0, 10.0, 1.0, 5.0],
    'Marks_obtained': [40, 52, 64, 70, 90, 10, 60],
}  # closing brace restored: the literal was left open, which is a syntax error

df_Student = pd.DataFrame(data)

# 1. Students with maximum marks (a list, so ties would all be reported)
max_marks = df_Student['Marks_obtained'].max()
top_students = df_Student[df_Student['Marks_obtained'] == max_marks]['Name'].tolist()
print("Students with maximum marks:", top_students)

# 2. Average hours studied
avg_hours = df_Student['Hours_studied'].mean()
print("Average hours studied:", avg_hours)

# 3. Correlation and Covariance between the two numeric columns (2x2 matrices)
correlation = df_Student[['Hours_studied', 'Marks_obtained']].corr()
covariance = df_Student[['Hours_studied', 'Marks_obtained']].cov()
print("Correlation:\n", correlation)
print("Covariance:\n", covariance)
# 4. Heatmap
# Correlation matrix of the two numeric columns, drawn with the values
# annotated in each cell.
study_corr = df_Student[['Hours_studied', 'Marks_obtained']].corr()
heatmap_ax = sns.heatmap(study_corr, annot=True, cmap='coolwarm')

heatmap_ax.set_title('Heatmap: Hours Studied vs Marks Obtained')

plt.show()

i.

[0 1 2 3 4 5]

ii.

[[1 2 3]

[4 6 8]]

iii.

[[2. 1. 0.66666667]

[0.5 0.33333333 0.25 ]]

iv.

1 [4 6 8] [[1 2 3]]

v.

[0]

# Q4 (a)

# Q4 (b)

# Sales records: one row per (person, quarter) with the country of the sale.
# 'Japna' was corrected to 'Japan'; the misspelling split Japan's sales
# across two groups in the per-country total below.
df = pd.DataFrame({
    'person': ['A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D'],
    'sales': [1000, 300, 400, 500, 800, 1000, 500, 700, 50],
    'quarter': [1, 1, 1, 1, 1, 2, 2, 2, 2],
    'country': ['US', 'Japan', 'Brazil', 'UK', 'US', 'Brazil', 'Japan', 'Brazil', 'US'],
})

# Box plot of the sales column. The frame passed must be `df`; the name
# `data` refers to a different object that has no 'sales' column.
sns.boxplot(x='sales', data=df)

# Highest and lowest single sale recorded in Brazil.
brazil_sales = df.loc[df['country'] == 'Brazil', 'sales']
max_sales = brazil_sales.max()
min_sales = brazil_sales.min()
print("Brazil max sales:", max_sales)
print("Brazil min sales:", min_sales)

# Total sales per country.
print(df.groupby('country')['sales'].sum())

# Person(s) with the highest average sale. The comparison is made against
# the per-person averages, not against individual sale rows.
avg_sales_by_person = df.groupby('person')['sales'].mean()
max_avg_sales = avg_sales_by_person.max()
print(avg_sales_by_person[avg_sales_by_person == max_avg_sales].index.tolist())

# Summary statistics of the sales column.
print(df['sales'].describe())

boxplot = df.boxplot(column='sales')

plt.show()

# Q5 (a)

# Integers 0..23 as a flat array.
c1 = np.arange(24)

# 2x12 reshape of c1. The assignment below writes through to c1 as well,
# because reshaping this array yields a view on the same data.
c2 = c1.reshape(2, 12)

# Zero every column from index 3 onward in both rows.
c2[:, 3:] = 0

print(c1)
print(c2)

# Element-wise doubling (prints a new array; c1 itself is unchanged).
print(np.multiply(c1, 2))

# The same 24 values laid out as 3 rows of 8.
print(np.reshape(c2, (3, 8)))

# Q5 (b)

# Employee records written to an Excel workbook and read back, so the rest
# of the answer works from the file contents.
excel_data = pd.DataFrame({
    'Employee id': [101, 102, 103, 104, 105, 106],
    'Department': ['CS', 'CS', 'CS', 'English', 'English', 'English'],
    'Salary': [2000, 2002, 2040, 2045, 2030, 2006],
    'Age': [24, 23, 34, 39, 43, 34],
})
excel_data.to_excel("data.xlsx", index=False)

# Use the employee id column as the row index.
df1 = pd.read_excel("data.xlsx", index_col='Employee id')

# One figure with two side-by-side axes: scatter on the left, bars on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
scatter_ax, bar_ax = axes

df1.plot(kind='scatter', x='Salary', y='Age', ax=scatter_ax, title='Salary vs Age')

# Split the salary range into 3 equal-width intervals and count the rows in each.
df1['Salary_bins'] = pd.cut(df1['Salary'], bins=3)
bin_counts = df1['Salary_bins'].value_counts()
bin_counts.plot.bar(ax=bar_ax)

plt.savefig("Employees.png")

# Q6 (a)

# Rank of each value within the series (1 = smallest).
s1 = pd.Series([5, 0, -4, 8])
print(s1)
print(s1.rank())

# 'One' is ['a', 'b', 'a', 'b', 'b'].
data1 = pd.DataFrame({'One': ['a', 'b'] * 2 + ['b'], 'Two': [21, 22, 21, 23, 24]})
print(data1)

# Drop rows that repeat the same (One, Two) pair, keeping the last occurrence.
data2 = data1.drop_duplicates(['One', 'Two'], keep='last')
print(data2)

df1 = pd.DataFrame({'A': [21, 32], 'B': [27, 30]})
df2 = pd.DataFrame({'A': [23, 41]})

# Add 10 to the value at row label 1 of column 'A'. A single .loc assignment
# is used instead of chained indexing (df2['A'][1] = ...), which may write
# to a temporary copy and has no effect under pandas Copy-on-Write.
df2.loc[1, 'A'] = df2.loc[1, 'A'] + 10

print(df1)
print(df2)

# Element-wise comparison of df2 with the smallest value in df1's column 'B'.
print(df2 > df1['B'].min())

# Q6 (b)

ages = np.array([20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32])

# Fixed bin edges with one label per interval; pd.cut intervals are
# right-inclusive by default, so 25 falls in the first bin (18, 25].
age_edges = [18, 25, 35, 60, 100]
age_labels = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
categories = pd.cut(ages, bins=age_edges, labels=age_labels)
print(categories.value_counts())

# Four quantile-based bins.
quantile_bins = pd.qcut(ages, 4)
print(quantile_bins.value_counts())

# Q7

# Twelve employees: gender, role, years of experience and salary.
empData = pd.DataFrame({
    'Gender': ['Male', 'Male', 'Male', 'Male', 'Female', 'Female',
               'Female', 'Female', 'Female', 'Male', 'Male', 'Male'],
    'Role': ['Data Analyst']*3 + ['Data Scientist']*3 + ['Manager']*3
            + ['Data Analyst', 'Data Scientist', 'Manager'],
    'Experience': [1, 1, 3, 5, 6, 1, 2, 3, 5, 6, 10, 11],
    'Salary': [48000, 42000, 51000, 62000, 71000, 73000,
               82000, 87000, 91000, 45000, 56000, 66000],
})

# (a) Full table.
print(empData)

# (b) Total salary for each role.
salary_by_role = empData.groupby('Role')['Salary']
print(salary_by_role.sum())

# (c) Number of female employees in each role.
is_female = empData['Gender'] == 'Female'
print(empData[is_female].groupby('Role').size())

# (d) Highest and lowest salary for each gender.
salary_by_gender = empData.groupby('Gender')['Salary']
print(salary_by_gender.agg(['max', 'min']))

# (e) Keep only employees earning at least the overall mean salary.
# empData is rebound to the filtered frame, so the full table is gone after this.
avg_salary = empData['Salary'].mean()
at_or_above_average = empData['Salary'] >= avg_salary
empData = empData[at_or_above_average]
print(empData)

You might also like