EDA (2)
EDA (2)
a) Merging DataFrames
import pandas as pd
print(merged_df)
import pandas as pd
data = {
df = pd.DataFrame(data)
print(reshaped_df)
Data Duplication
# Detecting duplicates
print(df.duplicated())
# Removing duplicates
import pandas as pd
data = {'ID': [1, 2, 3, 1, 4], 'Value': ['A', 'B', 'C', 'A', 'D']}
df = pd.DataFrame(data)
print(duplicates)
d) Replacing Values
import pandas as pd
data = {
df = pd.DataFrame(data)
print(reshaped_df)
import pandas as pd
import numpy as np
# Basic descriptive statistics on a pandas Series.
# NOTE(review): `data` is not defined in this snippet; it is assumed to be a
# mapping with a 'Values' entry created earlier — confirm.
s = pd.Series(data['Values'])
print("Original Series:\n", s)
# Sum of the values (pandas reductions skip NaN by default, skipna=True).
total = s.sum()
print("\nSum:", total)
# Arithmetic mean of the values.
average = s.mean()
print("Mean:", average)
# Count (non-NaN values)
count = s.count()
print("Count:", count)
# Largest value in the Series.
maximum = s.max()
print("Max:", maximum)
import pandas as pd
import numpy as np
# Fill missing values column by column with a fixed value or a statistic.
# NOTE(review): `data` is not defined in this snippet; it is assumed to
# provide columns 'A' and 'B' — confirm.
df = pd.DataFrame(data)
print("Original:\n", df)
# Replace NaN in column 'A' with the constant 0.
df['A'] = df['A'].fillna(0)
# Replace NaN in column 'B' with the mean of column 'B'.
df['B'] = df['B'].fillna(df['B'].mean())
print("\nFilled:\n", df)
import pandas as pd
import numpy as np
# Fill missing values by propagating neighbouring valid observations.
# NOTE(review): `data` is not defined in this snippet; it is assumed to be
# created earlier — confirm.
df = pd.DataFrame(data)
print("Original:\n", df)
# Forward fill: carry the last valid value down into following NaNs.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1.
ffilled = df.ffill()
# Backward fill: pull the next valid value up into preceding NaNs.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
bfilled = df.bfill()
import pandas as pd
import numpy as np
# Custom fill: replace each NaN in column 'A' with that row's index label.
# NOTE(review): `data` is not defined in this snippet; it is assumed to
# provide a column 'A' — confirm.
df = pd.DataFrame(data)
print("Original:\n", df)
# Iterate over the index labels so the label-based .loc lookups stay valid;
# for the default RangeIndex this visits 0..len(df)-1, same as range(len(df)).
for i in df.index:
    if pd.isna(df.loc[i, 'A']):
        df.loc[i, 'A'] = i
import pandas as pd
import numpy as np
# Estimate missing values from their neighbours by interpolation.
# NOTE(review): `data` is not defined in this snippet; it is assumed to be
# created earlier — confirm.
df = pd.DataFrame(data)
print("Original:\n", df)
# Linear interpolation
# interpolate() uses method='linear' by default and returns a new DataFrame;
# `df` itself is left unchanged.
interpolated = df.interpolate()
print("\nInterpolated:\n", interpolated)
import pandas as pd
# Rename columns
# Rename index
# Rename both
import pandas as pd
# Equal-width bins
# Equal-frequency bins
age_qbins = pd.qcut(ages, 2)
NOTE:
Key Differences:
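1) pd.cut() (Equal-Width): splits the range of the values into intervals of equal width, so the number of items falling into each bin can differ.
2) pd.qcut() (Equal-Frequency): places the bin edges at quantiles, so each bin holds roughly the same number of items while the interval widths can differ.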
# Row permutation and random sampling on a small demo DataFrame.
data = dict(A=[1, 2, 3, 4, 5], B=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(data)
print("Original:\n", df)

# Permute rows: sampling 100% of the rows returns them in random order;
# the index is then rebuilt as 0..n-1 and the old labels are discarded.
permuted_df = (
    df.sample(frac=1)
    .reset_index(drop=True)
)
print("\nPermuted:\n", permuted_df)

# Random sample: draw 3 rows (without replacement by default); the
# original index labels are kept.
sampled_df = df.sample(n=3)
print("\nSampled:\n", sampled_df)

# Show the raw dictionary the DataFrame was built from.
print(data)