1 - DataPreparation - Ipynb - Colaboratory
1 - DataPreparation - Ipynb - Colaboratory
We have to preprocess the raw data before it can be used for analysis.
Major steps:
Loading data
Cleaning data (removing unnecessary data or erroneous data)
Transforming data
Possibly rearranging data
#import pandas
import pandas as pd
# Show up to 1000 rows when a DataFrame is displayed.
# The full option name is used: an abbreviated name only works while it
# matches exactly one option, so it can break when pandas adds new options.
pd.set_option('display.max_rows', 1000)
# Show up to 50 columns when a DataFrame is displayed
# (this is the number of columns, not the width of a column).
pd.set_option('display.max_columns', 50)
# Load an .xlsx data set into a data frame with a meaningful name:
#df = pd.read_excel('Data.xlsx',sheet_name="Sheet1")
# ...or load a .csv data set. The file is assumed to be in the working
# folder; otherwise pass its full path:
#df_csvData = pd.read_csv('horror-train.csv')
# The toy data set is a zipped csv, so tell pandas how to decompress it.
df = pd.read_csv('toy_dataset.zip', compression='zip')
df.head()  # first 5 rows by default
Exploration phase
# Summarize the frame: column names, non-null counts and dtypes of the attributes
df.info()
<class 'pandas.core.frame.DataFrame'>
# We can also find the total number of non-empty values per column using count
df.count()
Number 150000
City 149999
Gender 149993
Age 149994
Income 149992
Illness 149998
dtype: int64
# Get a statistical summary (count, mean, std, min, quartiles, max) of the numeric attributes
df.describe()
# Gender distribution: number of records per gender.
# dropna=False keeps the missing values as a group of their own.
gender_groups = df.groupby("Gender", dropna=False)
gender_groups.size()
Gender
Female 66198
Male 83795
dtype: int64
# Plot the gender distribution.
# Count the records per Gender value, missing values included.
bygender = df["Gender"].value_counts(dropna=False)
#!pip install matplotlib --upgrade  # bar_label (numbers above the bars) needs matplotlib 3.4 or newer
# The pandas plotting accessor (built on matplotlib) draws the bars and
# returns the Axes object; bygender.plot(kind='bar') is the equivalent spelling.
ax = bygender.plot.bar()
# Title and both axis labels in a single call
ax.set(title="Bar Graph of Gender", xlabel='Gender', ylabel='Number of People')
# Write each bar's value above it
bar_container = ax.containers[0]
ax.bar_label(bar_container)
# Alternative: draw the same chart with matplotlib's pyplot interface.
import matplotlib.pyplot as plt
import numpy as np
# The count of missing values is indexed by np.nan; rename that label to the
# string "NaN", otherwise plt.bar raises an error.
bygender = bygender.rename({np.nan:"NaN"})
# Start a new figure so the bars are not drawn on top of an earlier chart
# that is still the current figure.
plt.figure()
# index holds the category labels, values holds the count of each category
plt.bar(bygender.index, height=bygender.values)
plt.show()
Deletion
# Deletion
# drop() returns a new frame; pass inplace=True only when the original
# data frame itself should be modified.
newdf = df.drop(['Age'], axis=1)
# Remove the Gender column. The result is stored in a new frame instead of
# using inplace=True, because Gender is still read from df afterwards
# (the dropna below, and the fill / one-hot encoding steps further down).
df_no_gender = df.drop(columns=['Gender'])
df_no_gender.head(20)
# Remove the rows that contain a null value in City, Gender or Age
df_1 = df.dropna(subset=['City', 'Gender', 'Age'])
<class 'pandas.core.frame.DataFrame'>
# Fill missing Income with the mean value - add a new column
df['IncomeFillNa'] = df['Income'].fillna(df['Income'].mean())
# Fill missing Age with the median value - add a new column
df['AgeFillNa'] = df['Age'].fillna(df['Age'].median())
# Fill missing Gender with the mode value - add a new column.
# mode() returns a Series (several values can tie), so take its first entry.
# The filled Series is assigned to the new column: calling
# fillna(inplace=True) on df['GenderFillNa'] fails with a KeyError because
# that column does not exist yet.
df['GenderFillNa'] = df['Gender'].fillna(df['Gender'].mode()[0])  # mode: Male
Male 83802
Female 66198
Male
# Check if there are duplicates: number of rows that repeat an earlier row
df.duplicated().sum()
False 150000
dtype: int64
# Select the duplicated rows, if any. This is only a selection: the original
# df is not affected unless the result is saved.
# keep='first' flags all duplicates except the first occurrence,
# keep='last' all duplicates except the last occurrence, keep=False flags all of them.
duplicate_mask = df.duplicated(keep=False)
df[duplicate_mask]
# One-hot encode the categorical Gender column: get_dummies builds one
# indicator column per distinct value.
one_hot_gender = pd.get_dummies(df["Gender"])
# Place the indicator columns next to the original ones in a new frame
df_1 = pd.concat(objs=[df, one_hot_gender], axis="columns")
df_1.head(n=10)
Number City Gender Age Income Illness Female Male Female Male
# Min-max normalization of Age - add a new column.
# The bounds get their own names so the builtins min() and max() are not
# overwritten for the rest of the script.
age_min = df_1['Age'].min()
age_max = df_1['Age'].max()
df_1['MinMaxAge'] = (df_1['Age'] - age_min) / (age_max - age_min)
# z-score normalization of Income and Age - add new columns
df_1['ZscoreIncome'] = (df_1['Income'] - df_1['Income'].mean()) / df_1['Income'].std()
df_1['ZscoreAge'] = (df_1['Age'] - df_1['Age'].mean()) / df_1['Age'].std()
df_1.head()
Number City Gender Age Income Illness Female Male Female Male Simpl
#normalization using apply and functions
#define a function to perform the simple feature normalization
def simpleNorm(old,max):
    """Scale a value by a maximum (simple feature normalization).

    Args:
        old: The value to normalize.
        max: The value to divide by. The name hides the builtin ``max``
            inside this function only; it is kept so existing keyword
            calls keep working.

    Returns:
        ``old / max``.
    """
    return old/max
# Apply the function to Age. The maximum is computed once up front:
# calling df_1["Age"].max() inside the lambda recomputes it for every row,
# which is slow.
age_max = df_1["Age"].max()
df_1["Age"].apply(lambda x: simpleNorm(x, age_max))
# Apply the same scaling to Income with an anonymous function.
# The maximum is not stored under the name max, so the builtin stays usable.
income_max = df_1["Income"].max()
df_1["Income"].apply(lambda x: x/income_max)
0 0.227860
1 0.254486
2 0.296251
3 0.231100
4 0.283867
...
149995 0.528734
149996 0.546114
149997 0.631558
149998 0.631519
149999 0.492507
# Export the updated data frame to a csv file.
# With the default arguments the row index is written as the first column.
df_1.to_csv("updated_data.csv")