Python Cheat Sheet for Data Analysis
Data Loading

Read CSV dataset
import pandas as pd
# load without header
df = pd.read_csv(<CSV path>, header=None)
# load using the first row as header
df = pd.read_csv(<CSV path>, header=0)

Print first few entries
# n = number of entries; default 5
df.head(n)

Print last few entries
# n = number of entries; default 5
df.tail(n)

Assign header names
df.columns = headers

Replace "?" with NaN
import numpy as np
df = df.replace("?", np.nan)

Retrieve data types
df.dtypes

Retrieve statistical description
# default: numeric attributes only
df.describe()
# include all attributes
df.describe(include="all")

Retrieve data set summary
df.info()

Save data frame to CSV
df.to_csv(<output CSV path>)
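Worked example, a minimal sketch of the loading steps above; the file name auto.csv and the header names are hypothetical:
import pandas as pd

df = pd.read_csv('auto.csv', header=None)      # hypothetical file
df.columns = ['make', 'horsepower', 'price']   # hypothetical headers
print(df.head())                               # inspect the first 5 rows
df.to_csv('auto_clean.csv', index=False)       # save a copy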
Data Wrangling

Replace missing data with frequency
MostFrequentEntry = df['attribute_name'].value_counts().idxmax()
df['attribute_name'].replace(np.nan, MostFrequentEntry, inplace=True)

Replace missing data with mean
AverageValue = df['attribute_name'].astype(<data_type>).mean(axis=0)
df['attribute_name'].replace(np.nan, AverageValue, inplace=True)
Fix the data types
df[['attribute1', 'attribute2', ...]] = df[['attribute1', 'attribute2', ...]].astype('data_type')
# data_type can be int, float, etc.

Data normalization
# scale the attribute so its values fall between 0 and 1
df['attribute_name'] = df['attribute_name'] / df['attribute_name'].max()

Binning
bins = np.linspace(min(df['attribute_name']), max(df['attribute_name']), n)
# n is the number of bin edges, which produces n-1 bins
GroupNames = ['Group1', 'Group2', 'Group3', ...]
df['binned_attribute_name'] = pd.cut(df['attribute_name'], bins, labels=GroupNames, include_lowest=True)
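Worked example of binning, a minimal sketch with toy values; the horsepower column is hypothetical:
import numpy as np
import pandas as pd

df = pd.DataFrame({'horsepower': [48, 70, 95, 115, 154, 200, 262]})
bins = np.linspace(min(df['horsepower']), max(df['horsepower']), 4)  # 4 edges -> 3 bins
GroupNames = ['Low', 'Medium', 'High']
df['horsepower-binned'] = pd.cut(df['horsepower'], bins,
                                 labels=GroupNames, include_lowest=True)
print(df)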
Change column name
df.rename(columns={'old_name': 'new_name'}, inplace=True)

Indicator variables
dummy_variable = pd.get_dummies(df['attribute_name'])
df = pd.concat([df, dummy_variable], axis=1)
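Worked example of indicator variables, a minimal sketch; 'fuel-type' is a hypothetical column:
import pandas as pd

df = pd.DataFrame({'fuel-type': ['gas', 'diesel', 'gas']})
dummy_variable = pd.get_dummies(df['fuel-type'])   # one 0/1 column per category
df = pd.concat([df, dummy_variable], axis=1)
df.drop('fuel-type', axis=1, inplace=True)         # optional: drop the original column
print(df)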
Exploratory Data Analysis

Complete data frame correlation
df.corr()

Specific attribute correlation
df[['attribute1', 'attribute2', ...]].corr()

Scatter plot
from matplotlib import pyplot as plt
plt.scatter(df[['attribute_1']], df[['attribute_2']])

Regression plot
import seaborn as sns
sns.regplot(x='attribute_1', y='attribute_2', data=df)

Box plot
import seaborn as sns
sns.boxplot(x='attribute_1', y='attribute_2', data=df)

Grouping by attributes
df_group = df[['attribute_1', 'attribute_2', ...]]

GroupBy statements
# Group by a single attribute
df_group = df_group.groupby(['attribute_1'], as_index=False).mean()
# Group by multiple attributes
df_group = df_group.groupby(['attribute_1', 'attribute_2'], as_index=False).mean()

Pivot tables
grouped_pivot = df_group.pivot(index='attribute_1', columns='attribute_2')
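Worked example of grouping and pivoting, a minimal sketch with hypothetical columns:
import pandas as pd

df = pd.DataFrame({'drive-wheels': ['rwd', 'fwd', 'rwd', 'fwd'],
                   'body-style': ['sedan', 'sedan', 'hatchback', 'hatchback'],
                   'price': [23000.0, 9500.0, 21000.0, 8500.0]})
df_group = df[['drive-wheels', 'body-style', 'price']]
df_group = df_group.groupby(['drive-wheels', 'body-style'], as_index=False).mean()
grouped_pivot = df_group.pivot(index='drive-wheels', columns='body-style')
print(grouped_pivot)   # mean price for each drive-wheels / body-style pair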
Pseudocolor plot
from matplotlib import pyplot as plt
plt.pcolor(grouped_pivot, cmap='RdBu')

Pearson coefficient and p-value
from scipy import stats
pearson_coef, p_value = stats.pearsonr(df['attribute_1'], df['attribute_2'])
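Worked example, a minimal sketch on toy lists (pearsonr also accepts DataFrame columns):
from scipy import stats

x = [1.0, 2.0, 3.0, 4.0, 5.0]
y = [2.1, 3.9, 6.2, 8.1, 9.8]
pearson_coef, p_value = stats.pearsonr(x, y)
print(pearson_coef, p_value)   # coefficient near 1 with a small p-value -> strong positive correlation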
Model Development

Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

Train linear regression model
X = df[['attribute_1', 'attribute_2', ...]]
Y = df['target_attribute']
lr.fit(X, Y)

Generate output predictions
Y_hat = lr.predict(X)

Identify the coefficient and intercept
coeff = lr.coef_
intercept = lr.intercept_
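Worked example, a minimal sketch on toy data where the target is exactly 2*x + 1:
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1.0], [2.0], [3.0], [4.0]])   # single attribute
Y = np.array([3.0, 5.0, 7.0, 9.0])           # target = 2*x + 1
lr = LinearRegression()
lr.fit(X, Y)
print(lr.coef_, lr.intercept_)   # approximately [2.] and 1.0
print(lr.predict([[5.0]]))       # approximately [11.]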
Residual plot
import seaborn as sns
sns.residplot(x=df[['attribute_1']], y=df[['attribute_2']])

Distribution plot
import seaborn as sns
sns.distplot(df['attribute_name'], hist=False)
# can include other parameters like color, label, etc.

Polynomial regression
f = np.polyfit(x, y, n)
# computes the coefficients of a polynomial of order n
p = np.poly1d(f)
# p becomes the polynomial model used to generate the predicted output
Y_hat = p(x)
# Y_hat is the predicted output
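Worked example, a minimal sketch fitting a second-order polynomial to toy data generated from 2*x**2 + 1:
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0])
y = np.array([1.0, 3.0, 9.0, 19.0])   # exactly 2*x**2 + 1
f = np.polyfit(x, y, 2)               # coefficients of an order-2 polynomial
p = np.poly1d(f)
print(p(4.0))                         # predicted output at x = 4 (about 33.0)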
Multi-variate polynomial regression
from sklearn.preprocessing import PolynomialFeatures
Z = df[['attribute_1', 'attribute_2', ...]]
pr = PolynomialFeatures(degree=n)
Z_pr = pr.fit_transform(Z)

Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
Input = [('scale', StandardScaler()),
         ('polynomial', PolynomialFeatures(include_bias=False)),
         ('model', LinearRegression())]
pipe = Pipeline(Input)
Z = Z.astype(float)
pipe.fit(Z, y)
ypipe = pipe.predict(Z)
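Worked example, a minimal sketch of the pipeline on toy data:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression

Z = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 5.0], [4.0, 4.0]])
y = np.array([10.0, 14.0, 22.0, 20.0])
Input = [('scale', StandardScaler()),
         ('polynomial', PolynomialFeatures(include_bias=False)),
         ('model', LinearRegression())]
pipe = Pipeline(Input)
pipe.fit(Z, y)          # scales, expands, and fits in one call
print(pipe.predict(Z))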
R2 value
# For a linear regression model
X = df[['attribute_1', 'attribute_2', ...]]
Y = df['target_attribute']
lr.fit(X, Y)
R2_score = lr.score(X, Y)
# For a polynomial regression model
from sklearn.metrics import r2_score
f = np.polyfit(x, y, n)
p = np.poly1d(f)
R2_score = r2_score(y, p(x))

MSE value
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(Y, Y_hat)
Model Evaluation and Refinement

Split data for training and testing
from sklearn.model_selection import train_test_split
y_data = df['target_attribute']
x_data = df.drop('target_attribute', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.10, random_state=1)

Cross-validation score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
lre = LinearRegression()
Rcross = cross_val_score(lre, x_data[['attribute_1']], y_data, cv=n)
# n indicates the number of folds for cross-validation
Mean = Rcross.mean()
Std_dev = Rcross.std()

Cross-validation prediction
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
lre = LinearRegression()
yhat = cross_val_predict(lre, x_data[['attribute_1']], y_data, cv=4)
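Worked example, a minimal sketch on synthetic data (y roughly 3*x plus noise):
import numpy as np
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
x_data = rng.uniform(0, 10, size=(40, 1))
y_data = 3 * x_data[:, 0] + rng.normal(0, 1, size=40)
lre = LinearRegression()
Rcross = cross_val_score(lre, x_data, y_data, cv=4)   # one R2 score per fold
print(Rcross.mean(), Rcross.std())
yhat = cross_val_predict(lre, x_data, y_data, cv=4)   # out-of-fold predictions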
Ridge regression and prediction
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train[['attribute_1', 'attribute_2', ...]])
# transform the test data with the feature generator fitted on the training data
x_test_pr = pr.transform(x_test[['attribute_1', 'attribute_2', ...]])
RidgeModel = Ridge(alpha=1)
RidgeModel.fit(x_train_pr, y_train)
yhat = RidgeModel.predict(x_test_pr)
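Worked example, a minimal sketch on synthetic data with two attributes:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

rng = np.random.default_rng(1)
x_train = rng.uniform(0, 5, size=(30, 2))
y_train = x_train[:, 0] ** 2 + x_train[:, 1] + rng.normal(0, 0.1, size=30)
x_test = rng.uniform(0, 5, size=(10, 2))
pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)       # reuse the fitted feature generator
RidgeModel = Ridge(alpha=1)
RidgeModel.fit(x_train_pr, y_train)
yhat = RidgeModel.predict(x_test_pr)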
Grid search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
parameters = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, ...]}]
RR = Ridge()
Grid1 = GridSearchCV(RR, parameters, cv=4)
Grid1.fit(x_data[['attribute_1', 'attribute_2', ...]], y_data)
BestRR = Grid1.best_estimator_
BestRR.score(x_test[['attribute_1', 'attribute_2', ...]], y_test)
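Worked example, a minimal sketch of a grid search over alpha on synthetic data:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge

rng = np.random.default_rng(2)
x_data = rng.uniform(0, 10, size=(50, 2))
y_data = 2 * x_data[:, 0] - x_data[:, 1] + rng.normal(0, 1, size=50)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data,
                                                    test_size=0.2, random_state=1)
parameters = [{'alpha': [0.001, 0.1, 1, 10, 100]}]
Grid1 = GridSearchCV(Ridge(), parameters, cv=4)
Grid1.fit(x_train, y_train)
BestRR = Grid1.best_estimator_
print(Grid1.best_params_, BestRR.score(x_test, y_test))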
© Copyright IBM Corporation 2023. All rights reserved.