背景:核心任务执行优化前90min,优化后25min
案例一:
create table tmp.tmp_test_d as
select
a.sku
,a.barcode
,a.tu_code
,a.buyer_tel
,a.pay_type
,a.deliver_company
,a.need_sync
,a.import_status
,a.source
,a.source_type
,coalesce(cst.tax_price*a.buy_count,a.cost_price) cost_price
from(select /*+ mapjoin(dpb)*/ t0.*,dpb.cid1,dpb.cid2,dpb.cid3,dpb.cname1,dpb.cname2,dpb.cname3,dpb.package_id,dpb.package_type,dpb.item_name item_name2
,dpb.item_oms_cid1,dpb.item_oms_cname1,dpb.item_oms_cid2,dpb.item_oms_cname2,dpb.item_oms_cid3,dpb.item_oms_cname3,dpb.item_oms_cid4,dpb.item_oms_cname4
,case when t0.fullcoupon_id<=0 or t0.fullcoupon_id is null then cast(ceil(rand()*-1000) as bigint) else t0.fullcoupon_id end fullcoupon_id2
from dw.dw_trd_order_sku_star_d t0
left join (select * from dw.dw_prd_barcode_d where stat_day = '20200219') dpb on t0.barcode=dpb.barcode
where t0.stat_day = '20200219'
) a
left join (select * from dw.dw_prd_barcode_d where stat_day = '20200219') dpb_sku
on a.sku=dpb_sku.barcode
left join(select yj_order_detail_id,sku_no,
max(case when rn =1 then delivery_time end) delivery_time,
max(case when rn =1 then delivery_status end) delivery_status,
max(case when rn =1 then receive_time end) receive_time,
max(case when rn =1 then supplier_code end) supplier_code,
max(case when rn =1 then supplier_name end) supplier_name,
max(case when rn =1 then src_type end) src_type,
sum(case when receive_time is not null then delivery_num end) receive_num,
sum(case when delivery_time is not null then delivery_num end) delivery_num,
sum(case when delivery_time is not null then delivery_money end) delivery_money
from(
select yj_order_detail_id,sku_no,
system_time delivery_time,
status as delivery_status,
qty as delivery_num,
total_price delivery_money,
sign_time receive_time,
supplier_code,supplier_name,src_type,
row_number() over(partition by yj_order_detail_id,sku_no order by system_time desc,order_id desc) rn
from dw.dw_lgs_logistics_control_sku_d where stat_day = '20200219'
)lgc0
group by yj_order_detail_id,sku_no
) lgc on a.sub_order_id = lgc.yj_order_detail_id and a.sku = lgc.sku_no
left join (select shop_id,user_id,user_tel,consumer_id from dw.dw_usr_user_shop_d where stat_day = '20200219') usr on a.shop_id = usr.shop_id
left join (select consumer_id,user_id from dw.dw_usr_consumer_d where stat_day = '20200219') cor on a.consumer_id = cor.consumer_id
left join (select fullcoupon_id,max(pop_shop_id) pop_shop_id,max(use_scope_desc) use_scope_desc from ods.ods_cpn_fullcoupon_d where stat_day='20200219' group by fullcoupon_id
) fc on a.fullcoupon_id2 = fc.fullcoupon_id
left join(select department_id,max(department_name_secd) department_name_secd from dw.dw_usr_department_org_d where stat_day = '20200219' group by department_id) org on dpb_sku.dept_id = org.department_id
left join(select barcode,date_format(cost_date,'yyyy-MM-dd') cost_date,max(tax_price) tax_price from dw.dw_prd_day_barcode_cost_d where stat_day='20200219' group by barcode,date_format(cost_date,'yyyy-MM-dd')) cst on a.sku = cst.barcode and date_format(a.pay_time,'yyyy-MM-dd') = cst.cost_date
;
原始sql如上,省略了部分字段,运行时长平均90min以上。
观察stage的运行时长。
发现有几个stage比较慢,进去看下是否倾斜了
DAG:
从DAG图能看到是跑到了如下一段join的语句,可以看到有两个task特别大,估计是数据倾斜了,通过sql分析是barcode这个字段会有倾斜,可以去数据库验证下barcode数量分布。
select /*+ mapjoin(dpb)*/ t0.*,dpb.cid1,dpb.cid2,dpb.cid3,dpb.cname1,dpb.cname2,dpb.cname3,dpb.package_id,dpb.package_type,dpb.item_name item_name2
,dpb.item_oms_cid1,dpb.item_oms_cname1,dpb.item_oms_cid2,dpb.item_oms_cname2,dpb.item_oms_cid3,dpb.item_oms_cname3,dpb.item_oms_cid4,dpb.item_oms_cname4
,case when t0.fullcoupon_id<=0 or t0.fullcoupon_id is null then cast(ceil(rand()*-1000) as bigint) else t0.fullcoupon_id end fullcoupon_id2
from dw.dw_trd_order_sku_star_d t0
left join (select * from dw.dw_prd_barcode_d where stat_day = '20200219') dpb on t0.barcode=dpb.barcode
where t0.stat_day = '20200219'
原始sql用了mapjoin,sql作者本意应该是想走mapjoin的,但实际通过dag或者执行计划可以看到是走sortmergejoin的,mapjoin再spark是broadcastHashJoin,看下数据量大小大概再400M左右,这是一个可以优化的点。
另外那个24min的stage进去也发现是数据倾斜,分析原因实在sql
left join (select shop_