说明
基于样例数据,构建一个建模的粗略过程。以下假定是对于表格数据的建模。
内容
1 数据探查
拿到数据的第一步一般需要了解有多少变量,什么类型,缺失率这些。
中间还可以有一些定制化的查看,这步的结果是把一些不合适的变量剔除掉。
一般来说缺失率较高的变量,字符型但是唯一值较多的,以及其他明显是无关的变量可以剔除。
# Variables to drop: mostly the building-attribute columns with very high
# missing rates, including the near-duplicate AVG/MODE/MEDI aggregates of the
# same underlying fields.
discard_vars = 'OWN_CAR_AGE,EXT_SOURCE_1,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,WALLSMATERIAL_MODE'.split(',')
# Continuous variables that are string-typed according to the profiling table
# (describe_df) cannot be used directly and are dropped as well.
str_c_vars = list(describe_df[(describe_df['str_or_num'] == 'Str')
                              & (describe_df['discrete_or_continuous'] == 'C')]['varname'])
# Index column left over from a previous CSV round-trip — pure junk.
manual_remove_vars = ['Unnamed: 0']
# Set difference, then sort so the kept column order is deterministic.
keep_cols = sorted(set(df.columns)
                   - set(discard_vars)
                   - set(str_c_vars)
                   - set(manual_remove_vars))
# BUG FIX: the original saved a CSV ('../data/train_df_001.csv'), but step 2
# reloads this dataset with from_pickle('train_df_001', ...). Save it as a
# pickle so the write format matches the read format.
to_pickle(df[keep_cols], 'train_df_001', path='../data/')
保存元数据
# Persist the profiling table and the step-1 variable lists under '../meta/'
# so later steps can reload them without re-running the exploration.
to_pickle(describe_df, 'describe_df_001', path='../meta/')
meta_dict_001 = {
    'discard_vars': discard_vars,
    'str_c_vars': str_c_vars,
    'manual_remove_vars': manual_remove_vars,
    'keep_cols': keep_cols,
}
to_pickle(meta_dict_001, 'meta_dict_001', '../meta/')
2 数据字典
这一步读取之前的元数据,然后进行入模前的变量的编号和映射。后续在这一步会加入更多的元数据,用于后续的自动化处理。
# Reload the step-1 metadata and build the data dictionary: one row per kept
# variable, left-joined with its profiling stats, plus a surrogate id column
# 'x' of the form x_000000, x_000001, ...
describe_df_001 = from_pickle('describe_df_001', path=meta_path)
meta_dict_001 = from_pickle('meta_dict_001', path=meta_path)
keep_cols = meta_dict_001['keep_cols']
data_dict_df = pd.DataFrame({'varname': keep_cols})
data_dict_df1 = data_dict_df.merge(describe_df_001, how='left', on='varname')
data_dict_df1.insert(1, 'x', [f'x_{i:06d}' for i in range(len(keep_cols))])
生成新的元数据后保存
# Persist the enriched data dictionary as the step-2 metadata artifact.
to_pickle(data_dict_df1, 'data_dict_df_002', path=meta_path)
将建模的变量进行重命名后保存。
# Reload the saved training frame, restrict it to the dictionary's variables
# (in dictionary order), and relabel the columns with their surrogate ids
# before saving the step-2 dataset.
train_df = from_pickle('train_df_001', data_path)
keep_cols = list(data_dict_df1['varname'])
new_cols = list(data_dict_df1['x'])
train_df1 = train_df[keep_cols].set_axis(new_cols, axis=1)
to_pickle(train_df1, 'train_df_002', data_path)
完成变量名的改头换面
保存变量名称的映射
# Keep the original-name -> surrogate-id mapping so model results can be
# translated back to the business variable names later.
varname_dict = dict(zip(data_dict_df1['varname'], data_dict_df1['x']))
to_pickle(varname_dict, 'varname_dict_002', meta_path)
3 数据集分割
将数据集分为训练和验证(validate)两部分
# Split parameters: x_000081 is the target column; 70/30 train/validate
# split with a fixed seed for reproducibility. Saved first so the split
# is recorded alongside its outputs.
split_para = {
    'target_varname': 'x_000081',
    'random_seed': 123,
    'train_ratio': 0.7,
}
to_pickle(split_para, 'split_para_003', meta_path)
data_df = from_pickle('train_df_002', data_path)
train_df, validate_df = train_validate_split(data_df, **split_para)
>>> train df (21544, 87)
>>> validate df (9207, 87)
将这两部分数据分别存储
# Persist both partitions of the split as the step-3 data artifacts.
for part, label in ((train_df, 'train_df_003'), (validate_df, 'validate_df_003')):
    to_pickle(part, label, data_path)
下一步就是变量的清洗和数值化,下回分解。