#导入需要使用的库
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers
# print(tf.__version__)
# print(tf.keras.__version__)
from tensorflow_core.python.keras.callbacks import LearningRateScheduler #需要到此包下面找
import tensorflow_core
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import tensorflow_core.python.keras as keras
import tensorflow_core.python.keras.backend as K
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import mean_absolute_error
import time
"""
—————————————————————————————————————————————以下为树模型的数据处理—————————————————————————————————————————————
"""
# Directory expected to hold the CSV files: the parent of the current working
# directory (abspath also normalises away the trailing "/.").
path = os.path.abspath(os.path.dirname(os.getcwd()) + os.path.sep + ".")
# abspath() drops any trailing separator (except for a root directory), so
# gluing the file name on with "+" produced "<parent>used_car_...csv".
# os.path.join inserts the separator only when it is needed.
Train_data = pd.read_csv(os.path.join(path, 'used_car_train_20200313.csv'), sep=' ')
Test_data = pd.read_csv(os.path.join(path, 'used_car_testB_20200421.csv'), sep=' ')
"""
一、预测值处理,处理目标值长尾分布的问题
"""
# Replace the raw price with log(1 + price); the section note above gives the
# long-tailed distribution of the target as the reason.
Train_data['price'] = np.log1p(Train_data['price'])
# Stack the train and test rows into one frame to make the following steps
# easier; ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([Train_data, Test_data], ignore_index=True)
# print(df.info())
"""
二、数据简单预处理,分三步进行
"""
## 1. Step one: useless columns and columns that barely vary
# SaleID is not useful as a feature, but it is the column counted when
# computing group sizes for other features.
# name is not mined any further, but quite a few rows share the same name, so
# keep how many rows carry each name.
df['name_count'] = df.groupby(['name'])['SaleID'].transform('count')
del df['name']
# Original note: seller has one special value that occurs only in the training
# set and not in the test set, so those rows are removed.
# NOTE(review): the row index is not reset after this drop.
df.drop(df[df['seller'] == 1].index, inplace=True)
del df['offerType']
del df['seller']
## 2. Step two: missing values
# Missing entries in the columns below are filled with 0. The original note
# calls this "filling with the mode" -- TODO confirm 0 is the mode of each.
df['fuelType'] = df['fuelType'].fillna(0)
df['gearbox'] = df['gearbox'].fillna(0)
df['bodyType'] = df['bodyType'].fillna(0)
df['model'] = df['model'].fillna(0)
## 3. Step three: outliers
# Original note: so far only notRepairedDamage looks wrong, plus power, whose
# valid range is fixed by the task statement.
# Cap power at 600.
df['power'] = df['power'].map(lambda x: 600 if x>600 else x)
# Turn the '-' placeholder into a missing value, then store the column as
# float32.
df['notRepairedDamage'] = df['notRepairedDamage'].astype('str').apply(lambda x: x if x != '-' else None).astype('float32')
"""
三、以上为数据简单预处理,以下为特征工程(特征工程)
"""
## 1、时间,地区啥的
#挖掘时间中的年月日
from datetime import datetime
def date_process(x):
    """Convert a ``YYYYMMDD`` value into a ``datetime``.

    Args:
        x: Date encoded as eight digits, given as a number or a string,
            e.g. ``20040402``.

    Returns:
        ``datetime`` built from the encoded year, month and day.

    Raises:
        ValueError: If the digits cannot be parsed as integers or do not
            form a valid date after the month has been clamped.
    """
    text = str(x)
    year = int(text[:4])
    month = int(text[4:6])
    day = int(text[6:8])
    # A month below 1 (e.g. "00") is clamped to January so that datetime()
    # does not reject the value.
    if month < 1:
        month = 1
    date = datetime(year, month, day)
    return date
# Parse both date columns into datetime objects.
df['regDate'] = df['regDate'].apply(date_process)
df['creatDate'] = df['creatDate'].apply(date_process)
# Split each date into separate year / month / day columns.
df['regDate_year'] = df['regDate'].dt.year
df['regDate_month'] = df['regDate'].dt.month
df['regDate_day'] = df['regDate'].dt.day
df['creatDate_year'] = df['creatDate'].dt.year
df['creatDate_month'] = df['creatDate'].dt.month
df['creatDate_day'] = df['creatDate'].dt.day
# Days elapsed between regDate and creatDate.
df['car_age_day'] = (df['creatDate'] - df['regDate']).dt.days
df['car_age_year'] = round(df['car_age_day'] / 365, 1) # keep one decimal place
# Extract information from the region code.
# Number of rows sharing each regionCode.
df['regionCode_count'] = df.groupby(['regionCode'])['SaleID'].transform('count')
print(df['regionCode'])
# city is the first two characters of the region code's string form, so the
# column holds strings.
df['city'] = df['regionCode'].apply(lambda x : str(x)[:2])
print(df['city'])
## 2. Categorical features
# Bucket continuous features that lend themselves to binning; kilometer is
# already bucketed.
# NOTE(review): `bin` shadows the Python builtin of the same name.
# Bin edges 0, 10, ..., 300; labels=False stores the integer bin index.
bin = [i*10 for i in range(31)]
# NOTE(review): with pd.cut's default right-closed intervals, a value of 0 or
# one above the last edge presumably becomes NaN -- confirm this is intended.
df['power_bin'] = pd.cut(df['power'], bin, labels=False)
# Five-row preview of the result; not used again in the visible code.
tong = df[['power_bin', 'power']].head()
# Bin edges 0, 10, ..., 230 for model.
bin = [i*10 for i in range(24)]
df['model_bin'] = pd.cut(df['model'], bin, labels=False)
tong = df[['model_bin', 'model']].head()
# 将稍微取值多一点的分类特征与price进行特征组合,做了非常多组,但是在最终使用的时候,
# 每组分开测试,挑选真正work的特征
def _price_stats_by(train, key, mad_name=None):
    """Build per-category price statistics from the training rows.

    Args:
        train: Training frame containing the ``key`` column and ``price``.
        key: Column to group by; also the prefix of every output column.
        mad_name: Name of the mean-absolute-deviation output column.
            Defaults to ``key + '_price_mad'``.

    Returns:
        DataFrame with one row per category: the ``key`` column followed by
        ``<key>_amount`` and the ``<key>_price_*`` statistics.
    """
    if mad_name is None:
        mad_name = key + '_price_mad'
    all_info = {}
    for kind, kind_data in train.groupby(key):
        # Only rows with a positive price contribute to the statistics.
        price = kind_data.loc[kind_data['price'] > 0, 'price']
        info = {}
        info[key + '_amount'] = len(price)
        info[key + '_price_max'] = price.max()
        info[key + '_price_median'] = price.median()
        info[key + '_price_min'] = price.min()
        info[key + '_price_sum'] = price.sum()
        info[key + '_price_std'] = price.std()
        info[key + '_price_mean'] = price.mean()
        info[key + '_price_skew'] = price.skew()
        info[key + '_price_kurt'] = price.kurt()
        # Series.mad() was removed in pandas 2.0; this is the same quantity,
        # the mean absolute deviation around the mean.
        info[mad_name] = (price - price.mean()).abs().mean()
        all_info[kind] = info
    # Categories become rows; the former index becomes the join-key column.
    return pd.DataFrame(all_info).T.reset_index().rename(columns={"index": key})


# The statistics come from Train_data only and are left-joined onto the
# combined frame, so categories absent from Train_data get NaN.
# regionCode keeps its existing "regionCode_mad" column name (no "_price"
# infix) so that the output columns are unchanged.
for _key, _mad_name in (
    ('regionCode', 'regionCode_mad'),
    ('brand', None),
    ('model', None),
    ('kilometer', None),
):
    brand_fe = _price_stats_by(Train_data, _key, _mad_name)
    df = df.merge(brand_fe, how='left', on=_key)
Train_gb = Train_data.groupby("bodyType")
all_info = {}
for kind, kind_data in Train_gb:
info = {}
kind_data = kind_data[kind_data['price'] > 0]
info['bodyType_amount'] = len(kind_data)
info['bodyType_price_max'] = kind_data.price.max()

土豆片片
- 粉丝: 1890
最新资源
- 基于距离控制的变频器加减速参数子程序设计与应用
- 宠物服务管理系统-c2e8kave-论文.zip
- 基于Simulink的永磁同步电机PMSM的矢量控制FOC Simulink 深度版
- 基于Django的二手电子设备交易平台设计与开发-419nx25c-论文.zip
- 水文模型代码实现与建模:VIC模型、集中式模型及LSTM径流预测的优化与应用
- Python协程中的tpresume
- 基于CNN-GRU注意力机制的高精度时间序列预测程序,多领域应用如风电、电力、交通、经济及排放预测,双输入单输出设计,代码清晰可替换数据运行。
- 电力电子领域MMC模块化多电平变流器的载波移相与LADRC双闭环仿真及谐波分析
- 基于Django的个性化餐饮管理系统-1ml1r29h.zip
- 直流有刷电机与三闭环控制的SIMULINK仿真研究及应用
- 基于S7-1200 PLC的流水灯控制系统设计与仿真实验——可编程控制器原理与应用
- 基于Django花卉商城系统的设计与实现-2885fb37-论文.rar
- Vivado AD9680 Verilog源代码工程:包含JESD204B接口、1G采样率、10G线速率与SPI配置接口 - 注释详细
- java的jpa的JpaRepository 的findAllById函数
- 基于Django框架学习资源推送系统-1zp1132q.zip
- DSP28335芯片基于Boot loader与CAN通信的在线固件升级解决方案
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈


