import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 1. Load Data
# Each source is read from the working directory. The date columns named
# below are parsed while loading so later steps can use them as datetimes.
transactions = pd.read_csv(
    'transactions.csv',
    parse_dates=['order_date'],
)
products = pd.read_excel('product_catalog.xlsx')
customers = pd.read_json(
    'customers.json',
    convert_dates=['signup_date'],
)
# 2. Clean Transactions Data
def _fill_missing_with_median(prices):
    """Return *prices* with missing entries replaced by the median of *prices*."""
    return prices.fillna(prices.median())


# Handle missing values
# A missing quantity is treated as a single unit.
transactions['quantity'] = transactions['quantity'].fillna(1)
# A missing unit price takes the median unit price of the same product_id.
transactions['unit_price'] = (
    transactions
    .groupby('product_id')['unit_price']
    .transform(_fill_missing_with_median)
)

# Convert data types
# order_id becomes categorical; customer_id becomes a 32-bit integer.
transactions['order_id'] = transactions['order_id'].astype('category')
transactions['customer_id'] = transactions['customer_id'].astype('int32')
# 3. Merge Datasets
# Both joins are left joins, so every transaction row is kept even when no
# matching product or customer record exists (those columns become NaN/NaT).
merged_data = pd.merge(
    transactions,
    products[['product_id', 'category', 'cost_price']],
    how='left',
    on='product_id',
)
merged_data = pd.merge(
    merged_data,
    customers[['customer_id', 'signup_date', 'tier']],
    how='left',
    on='customer_id',
)
# 4. Feature Engineering
# total_sales: quantity multiplied by unit price for each transaction row.
merged_data['total_sales'] = merged_data['quantity'] * merged_data['unit_price']
# profit: (unit price - cost price) multiplied by quantity. Rows with a
# missing cost_price yield a NaN profit.
merged_data['profit'] = (
    (merged_data['unit_price'] - merged_data['cost_price'])
    * merged_data['quantity']
)
# 5./6. Monthly Sales Analysis and Visualization
# Bucket rows by calendar month of order_date and total sales and profit.
by_month = merged_data.resample('M', on='order_date')
monthly_sales = by_month['total_sales'].sum()
monthly_profit = by_month['profit'].sum()

# Draw both monthly series on one shared set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
monthly_sales.plot(ax=ax, label='Sales')
monthly_profit.plot(ax=ax, label='Profit')
ax.set_title('Monthly Sales and Profit Trends')
ax.set_ylabel('USD')
ax.legend()
plt.show()
# 7. Product Performance Analysis
# Per product: summed sales, summed quantity and mean profit per row; then
# keep the ten products with the highest summed sales.
product_performance = (
    merged_data
    .groupby('product_id')
    .agg(
        total_sales=('total_sales', 'sum'),
        quantity=('quantity', 'sum'),
        profit=('profit', 'mean'),
    )
    .sort_values('total_sales', ascending=False)
    .head(10)
)
# 8. Customer Segmentation
# One row per customer: number of distinct orders (purchase_count), summed
# sales, and the first non-null signup_date seen for that customer.
customer_loyalty = (
    merged_data
    .groupby('customer_id')
    .agg({'order_id': 'nunique', 'total_sales': 'sum', 'signup_date': 'first'})
    .rename(columns={'order_id': 'purchase_count'})
)
# cohort: the calendar month in which the customer signed up.
customer_loyalty['cohort'] = customer_loyalty['signup_date'].dt.to_period('M')
# lifetime_months: whole months between the cohort and the fixed reference
# month 2023-12. Subtracting periods yields month offsets whose `.n` is the
# month count. A customer without a signup_date produces NaT here, which has
# no `.n`, so it is mapped to NaN instead of raising AttributeError.
months_since_signup = pd.Period('2023-12', freq='M') - customer_loyalty['cohort']
customer_loyalty['lifetime_months'] = months_since_signup.apply(
    lambda offset: offset.n if pd.notna(offset) else np.nan
)
# 9. Data Validation
# Check for negative profits: rows whose computed profit is below zero are
# reported as a warning but do not stop the script.
negative_profit = merged_data[merged_data['profit'] < 0]
if not negative_profit.empty:
    print(f"Warning: {len(negative_profit)} transactions with negative profit")

# 10. Verify data completeness
# Every row must have a product category. The check raises explicitly rather
# than using `assert`, because assert statements are removed when Python runs
# with -O; the exception type and message are the same as before.
if merged_data['category'].isna().any():
    raise AssertionError("Missing product categories exist")