Ex No: 1 Download, install and explore the features of NumPy,
SciPy, Jupyter, Statsmodels and Pandas packages.
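#Numpy installation
py -m pip install numpy
#Pandas installation
py -m pip install pandas
#Statsmodels installation
py -m pip install statsmodels
#Scipy Installation
py -m pip install scipy
#Jupyter installation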
py -m pip install jupyter
#Launching Jupyter notebook
jupyter notebook
Ex No:2 WORKING WITH NUMPY ARRAYS
PROGRAM:
# Tour of basic NumPy array operations: creation, slicing, metadata,
# reductions and element-wise arithmetic. Results are printed explicitly so
# the listing also produces output when run as a plain script.
import numpy as np

# --- Creating arrays ---
a = np.array([1, 2, 3])  # 1-D array
print(a)
b = np.array([[1, 2, 3], [4, 5, 6]])  # 2-D array
print(b)
c = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])  # 3-D array, shape (1, 3, 3)
print(c)

# --- Slicing ---
print(a[1:3])  # elements at index 1 and 2 of the 1-D array
print(b[1:3])  # rows from index 1 onward; b has two rows, so only row 1

# --- Item size ---
print(a.itemsize)
print(b.itemsize)
print(c.itemsize)

# --- Size (total number of elements) ---
print(a.size)
print(b.size)
print(c.size)

# --- Arrays filled with zeros / ones, given as (rows, columns) ---
print(np.zeros((2, 3)))
print(np.ones((1, 1)))

# --- Shape and reshape ---
print(a.shape)
print(b.shape)
print(c.shape)
print(b.reshape(3, 2))

# --- Minimum value ---
print(a.min())
print(b.min())

# --- Sum along axis 1 of the 3-D array ---
print(c.sum(axis=1))

# --- Square root (element-wise) ---
print(np.sqrt(a))
print(np.sqrt(b))

# --- Standard deviation ---
print(np.std(a))
print(np.std(b))
print(np.std(c))

# --- Add: the 1-D array a is broadcast across each row of the 2-D array b ---
c = a + b
print(c)

# --- Subtract ---
c = a - b
print(c)

# --- Multiply (element-wise) ---
a = np.array([1, 2, 3])
b = np.array([2, 3, 5])
c = a * b
print(c)

# --- Dot product (for 1-D inputs this is the inner product, a scalar) ---
c = a.dot(b)
print(c)
print(c.min())

# --- Maximum value ---
print(a.max())
print(b.max())
print(c.max())

# --- Sum of values ---
print(a.sum())
print(b.sum(axis=0))
Ex No: 3 WORKING WITH PANDAS DATAFRAMES
Program:
# Five ways of building a pandas DataFrame. Each snippet is self-contained
# (it repeats its own import) so it can be run on its own.

# Creating a dataframe from a list of rows
import pandas as pd
data = [['Jack', 1], ['John', 2], ['Tom', 3]]
df = pd.DataFrame(data, columns=['Name', 'Rollno'])
print(df)

# Creating a dataframe from a dictionary (keys become column names)
import pandas as pd
data = {'Name': ['Jack', 'John', 'Tom'], 'Rollno': [1, 2, 3], 'Mark': [100, 99, 98]}
df = pd.DataFrame(data)
print(df)

# Creating a dataframe with an explicit index (row labels)
import pandas as pd
data = {'Name': ['Jack', 'John', 'Tom'], 'Mark': [100, 99, 98]}
df = pd.DataFrame(data, index=['Rank 1', 'Rank 2', 'Rank 3'])
print(df)

# Creating a dataframe from a list of dictionaries (one dictionary per row)
import pandas as pd
data = [{'A': 1, 'B': 2, 'C': 3}, {'A': 11, 'B': 22, 'C': 33}]
df = pd.DataFrame(data)
print(df)

# Adding records to an empty dataframe with .loc (assigning to a new row label)
import pandas as pd
df = pd.DataFrame(columns=['Name', 'Rollno', 'Mark'])
df.loc[0] = ['Jack', 1, 100]
df.loc[1] = ['John', 2, 99]
df.loc[2] = ['Tom', 3, 98]
print(df)
Ex No: 4 (a) BASIC PLOTS USING MATPLOTLIB
Program:
# Line plot: draws y against x as a connected line with a title and axis labels.
import matplotlib.pyplot as plt

x = [10, 20, 30, 40]
y = [11, 21, 31, 41]
plt.plot(x, y)
plt.title("Line plot")
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.show()
OUTPUT:
# Histogram: counts how many values of x fall between consecutive bin edges.
import matplotlib.pyplot as plt

x = [1, 2, 3, 2, 4, 5, 7, 6, 8, 6]
plt.hist(x, bins=[1, 2, 3, 4, 5, 6, 7, 8])
plt.title("Histogram")
plt.legend(["Bar"])
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.show()
OUTPUT:
# Scatter plot: draws one marker per (x, y) pair.
import matplotlib.pyplot as plt

x = [1, 3, 5, 7, 9, 2, 4]
y = [6, 8, 10, 12, 14, 3, 5]
# The original listing never drew the points, so the figure came out empty.
plt.scatter(x, y)
plt.xlabel("x-axis")
plt.ylabel("y-axis")
plt.title("Scatter plot")
plt.show()
OUTPUT:
# Pie chart: one wedge per value in y, labelled from mylabels.
import matplotlib.pyplot as plt
import numpy as np

y = np.array([5, 6, 10])
mylabels = ["I-IT", "II-IT", "III-IT"]
plt.title("Pie chart")
plt.pie(y, labels=mylabels)
plt.show()
OUTPUT:
# Bar chart: one bar per category in x, with heights taken from y.
import matplotlib.pyplot as plt
import numpy as np

x = np.array(["A", "B", "C", "D", "E"])
y = np.array([10, 20, 30, 40, 50])
plt.title("Bar chart")
plt.bar(x, y)
plt.show()
OUTPUT:
Ex No: 4 (b) FREQUENCY DISTRIBUTION
Program:
# Frequency distribution of the Zoology marks, drawn as a histogram.
import pandas as pd
import matplotlib.pyplot as plt

d = {
    'Maths': [80, 90, 95, 80, 100],
    'English': [100, 50, 90, 80, 95],
    'Zoology': [100, 90, 92, 90, 80],
    'Name': ["Jack", "John", "Tom", "Sam", "Ben"],
}
df = pd.DataFrame(d)
plt.hist(df['Zoology'])
plt.xlabel("Mark")
plt.show()
OUTPUT:
#Frequency distribution with bins
import pandas as pd
import matplotlib.pyplot as plt
x=[1,2,3,1,2,6,7,8,9,10,11,12,1,14,16,18,19,2,6,20,21,22,23,24,40,26,40,39,
29,10]
plt.hist(x,bins=[0,10,20,30,40])
plt.title(“Bins”)
plt.show()
OUTPUT:
# Frequency distribution with marks using bins, color, edge color, linewidth
import pandas as pd
import matplotlib.pyplot as plt

d = {
    'Maths': [80, 90, 95, 80, 100],
    'English': [100, 50, 90, 80, 95],
    'Zoology': [100, 90, 92, 90, 80],
    'Name': ["Jack", "John", "Tom", "Sam", "Ben"],
}
df = pd.DataFrame(d)
# bins=4 asks for four equal-width bins spanning the range of the data.
plt.hist(df['Zoology'], bins=4, color='yellow', edgecolor='black', linewidth=2)
plt.title("Marks")
plt.show()
OUTPUT:
Ex No: 5 READING DATA FROM TEXT FILE AND EXCEL FILE AND
EXPLORING VARIOUS COMMANDS FOR DOING
DESCRIPTIVE ANALYSIS ON IRIS DATASET
(a)READING DATA FROM EXCEL FILE
Program:
# Descriptive analysis of the iris dataset. Results are printed explicitly so
# the listing also produces output when run as a plain script.
import pandas as pd

# Reading the csv file
df = pd.read_csv("iris_csv.csv")

# Printing top 5 rows
print(df.head())

# Printing last 5 rows
print(df.tail())

# Columns and their data types (info() prints its report itself)
df.info()

# Descriptive analysis of the data
print(df.describe())

# Check for null values (count of nulls per column)
print(df.isnull().sum())

# Drop duplicates: first keep only the first row seen for each "class" value...
data = df.drop_duplicates(subset="class")
print(data)
# ...then, separately, drop only rows that are duplicated in every column.
data = df.drop_duplicates()
print(data)

# Returns the unique count for each column
print(df.nunique())

# Names of all the variables in a dataframe
print(df.columns)
Ex No: 5 (b) READING DATA FROM TEXT FILE
Date:
Program:
# Reads a text file, counts how often each word occurs and draws the counts
# as a bar chart ordered from most to least frequent.
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

# The context manager closes the file even if reading raises.
with open('sample.txt', 'rt', encoding='windows-1252') as f:
    line = f.read()
print('File contents: \n', line.strip())

# Split the string into words (on any run of whitespace)
line2 = line.split()
word_list = line2
counts = Counter(word_list)
labels, values = zip(*counts.items())

# Sort your values in descending order
indsort = np.argsort(values)[::-1]

# Rearrange your data
labels = np.array(labels)[indsort]
values = np.array(values)[indsort]
indexes = np.arange(len(labels))

plt.figure(figsize=(15, 5))
plt.bar(indexes, values)

# Add labels. plt.bar centres each bar on its x position by default, so the
# ticks go at the same positions rather than being shifted sideways.
plt.xticks(indexes, labels)
plt.show()
OUTPUT:
Ex No: 6 APPLY AND EXPLORE VARIOUS PLOTTING
FUNCTIONS ON UCI DATA SETS.
(a) NORMAL CURVES
Program:
# Normal curve: evaluates the normal probability density over a grid of
# x values, using the grid's own mean and sample standard deviation as the
# distribution parameters.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
import statistics

# Grid from 10 up to (but not including) 20 in steps of 0.01.
xaxis = np.arange(10, 20, 0.01)
m, sd = statistics.mean(xaxis), statistics.stdev(xaxis)
density = norm.pdf(xaxis, m, sd)
plt.plot(xaxis, density)
plt.show()
OUTPUT:
(b) SCATTER PLOT
Program:
# Scatter plot of the first two columns of the iris dataset.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv('iris_csv.csv')
x = df.to_numpy()
# Column 0 on the x-axis against column 1 on the y-axis.
plt.scatter(x[:, 0], x[:, 1])
plt.show()
OUTPUT:
(c) HISTOGRAM
Program:
# Histogram of the first column of the iris dataset.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv('iris_csv.csv')
x = df.to_numpy()
# NOTE(review): the bin edges only span 1-5, so values above 5 in this column
# are not counted - confirm this range suits the data being plotted.
plt.hist(x[:, 0], bins=[1, 2, 3, 4, 5])
plt.show()
OUTPUT:
(d) DENSITY
Program:
# Density plot of petal length, one curve per iris class.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv('iris_csv.csv')
# Spread the petallength values into one column per class value so that each
# class is drawn as its own density curve.
data = df.pivot(columns='class', values='petallength')
data.plot.density(figsize=(5, 5), linewidth=4)
plt.show()
OUTPUT:
(e) CORRELATION
Program:
# Correlation matrix of the iris measurements, displayed as a colour grid.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

df = pd.read_csv('iris_csv.csv')
# numeric_only=True leaves out the text-valued 'class' column, which cannot
# be correlated and makes recent pandas versions raise an error otherwise.
plt.matshow(df.corr(numeric_only=True))
plt.show()
OUTPUT:
(f) THREE-DIMENSIONAL PLOTTING
Program:
# Three-dimensional plotting: a helix drawn as a line, plus noisy points
# scattered around the same helix and coloured by their height.
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure()
ax = plt.axes(projection='3d')

# Data for the three-dimensional line. The original listing assigned this
# array to `line` but then read the undefined name `zline`.
zline = np.linspace(0, 15, 1000)
xline = np.sin(zline)
yline = np.cos(zline)
ax.plot3D(xline, yline, zline, 'gray')

# Data for the scattered points. `zdata` was never defined in the original
# listing; 100 random heights in [0, 15) match the 100 noise samples below and
# the height range of the line.
zdata = 15 * np.random.random(100)
xdata = np.sin(zdata) + 0.1 * np.random.randn(100)
ydata = np.cos(zdata) + 0.1 * np.random.randn(100)
# zdata is passed both as the z coordinate and as the colour value.
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='Greens')
plt.show()
OUTPUT:
Ex No: 7 USE THE DIABETES DATA SET FROM UCI AND PIMA
INDIAN DIABETES DATA ON PERFORMING THE
FOLLOWING
(a) UNIVARIATE ANALYSIS
Program:
# Univariate statistics on the diabetes dataset. Results are printed
# explicitly so the listing also produces output when run as a plain script.
import pandas as pd

data = pd.read_csv('diabetes.csv')

# Frequency: number of non-null entries in each column, per Outcome value
print(data.groupby('Outcome').count())

# Mean blood pressure for each age
print(data.groupby('Age')['BloodPressure'].mean())

# Median blood pressure for each age
print(data.groupby('Age')['BloodPressure'].median())

# Mode of every column
print(data.mode())

# Variance of every column
print(data.var())

# Standard deviation of every column
print(data.std())

# Skewness of every column
print(data.skew())

# Kurtosis of every column
print(data.kurt())
b. BIVARIATE ANALYSIS: LINEAR AND LOGISTIC REGRESSION
MODELING
Program:
# Bivariate analysis: pairwise correlation coefficients between the predictor
# columns of the diabetes dataset, drawn as an annotated heatmap.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

df = pd.read_csv("diabetes.csv")
print(df.head())

sns.set(style='whitegrid', context='notebook')
cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# np.corrcoef treats each row as one variable, hence the transpose so that
# every selected column becomes a row.
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.5)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 15},
    yticklabels=cols,
    xticklabels=cols,
)
plt.show()
OUTPUT:
C. MULTIPLE REGRESSION ANALYSIS:
Multiple Regression is an extension of simple linear regression. It is used
when we want to predict the value of a variable based on the value of two or
more other variables. The variable we want to predict is called the
dependent variable (or sometimes, the outcome, target or criterion variable).
The variables we are using to predict the value of the dependent variable are
called the independent variables (or sometimes, the predictor, explanatory
or regressor variables).
Code:
# Multiple regression: fits a linear model on several predictor columns at
# once and compares its predictions with the held-out actual values.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# NOTE(review): the original listing used X and Y without defining them.
# This assumes the target is the 'Outcome' column of diabetes.csv and the
# predictors are all remaining columns - confirm against the intended analysis.
df = pd.read_csv('diabetes.csv')
X = df.drop(columns=['Outcome'])
Y = df['Outcome']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.3, random_state=99
)
print(train_x.shape, train_y.shape)

# scikit-learn has no `MultipleRegression` class; LinearRegression fits a
# multiple regression when it is given more than one predictor column.
le = LinearRegression()
le.fit(train_x, train_y)
y_pred = le.predict(test_x)
print(y_pred)

# Actual and predicted values side by side
result = pd.DataFrame({'Actual': test_y, 'Predict': y_pred})
print(result)
OUTPUT:
Code:
# Show the coefficients and the intercept stored on the fitted estimator `le`.
for label, value in (('coefficient', le.coef_), ('intercept', le.intercept_)):
    print(label, value)
d. ALSO COMPARE THE RESULTS OF THE ABOVE ANALYSIS FOR THE
TWO DATA SETS
For anyone working in an analytical role, comparing two data sets is a
day-to-day activity, whether to prove that the changes made do not
impact the rest of the data in the file (typically called
“regression testing”) or to understand the differences between two files/data
sets.
Installing datacompy:
pip install datacompy
Details:
datacompy takes two dataframes as input and gives us a human-readable
report containing statistics that lets us know the similarities and
dissimilarities between the two dataframes. It will try to join two dataframes
either on a list of join columns, or on indexes.
Code:
# Compares two dataframes with datacompy and prints its human-readable report.
import datacompy

# NOTE(review): df1 and df2 are not defined in this listing; they must be the
# two pandas dataframes to compare, loaded beforehand. 'acct_id' must be a
# column present in both - confirm the join column for the diabetes data.
compare = datacompy.Compare(
    df1,
    df2,
    join_columns='acct_id',
    abs_tol=0.0001,
    rel_tol=0,
    df1_name='olddiabetes',
    df2_name='newdiabetes',
)
print(compare.report())
OUTPUT:
Ex No: 8 VISUALIZING GEOGRAPHIC DATA WITH BASEMAP
PRE-REQUIREMENTS:
Install basemap.
python -m pip install basemap
PROGRAM:
1. Vector layers to a map.
# World map with default Basemap settings, showing coastlines only.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

figure = plt.figure(figsize=(12, 12))
world_map = Basemap()
world_map.drawcoastlines()
plt.title("Coastlines", fontsize=20)
plt.show()
OUTPUT:
2. Draw Countries on the map.
# World map showing coastlines and country boundaries.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

figure = plt.figure(figsize=(12, 12))
world_map = Basemap()
coast_style = dict(linewidth=1.0, linestyle='solid', color='black')
world_map.drawcoastlines(**coast_style)
world_map.drawcountries()
plt.title("Country boundaries", fontsize=20)
plt.show()
OUTPUT:
3. Fill continents.
# World map with coastlines, country boundaries and filled continents.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

figure = plt.figure(figsize=(12, 12))
world_map = Basemap()
# Coastlines and borders share the same width and line style.
outline = dict(linewidth=1.0, linestyle='solid')
world_map.drawcoastlines(color='black', **outline)
world_map.drawcountries(color='k', **outline)
world_map.fillcontinents()
plt.title("Color filled continents", fontsize=20)
plt.show()
OUTPUT:
4. Latitude & Longitude lines.
# World map with filled continents plus labelled meridians (every 20 degrees)
# and parallels (every 10 degrees).
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

figure = plt.figure(figsize=(12, 12))
world_map = Basemap()

# Coastlines and borders share the same width and line style.
outline = dict(linewidth=1.0, linestyle='solid')
world_map.drawcoastlines(color='black', **outline)
world_map.drawcountries(color='k', **outline)
world_map.fillcontinents(color='coral', lake_color='aqua')

# Both sets of grid lines are drawn black and dashed; only the positions and
# the label flags differ between them.
meridians = range(0, 360, 20)
parallels = range(-90, 100, 10)
world_map.drawmeridians(meridians, color='k', linewidth=1.0, dashes=[4, 4],
                        labels=[0, 0, 0, 1])
world_map.drawparallels(parallels, color='k', linewidth=1.0, dashes=[4, 4],
                        labels=[1, 0, 0, 0])

plt.ylabel("Latitude", fontsize=15, labelpad=35)
plt.xlabel("Longitude", fontsize=15, labelpad=20)
plt.show()
OUTPUT: