Commit 4d11f769 by 王玉龙

Merge branch 'FEATURE-SEARCH-813-seachFeature' into 'master'

1.商品特征,用户商品交叉特征

See merge request tianchuan/search-model-data!10
parents 6afe09c3 2ce837d7
......@@ -9,11 +9,11 @@ create external table if not exists secoo_search.search_data_cross_feature_user_
prodanduserpurchpromotion string comment '购买过的商品是促销品 & 商品本身是促销品',
prodanduserpurchbig100 string comment '购买过百大品牌 & 商品是百大品牌',
prodpriceinuserbrowsing string comment '商品的价格是否在用户浏览的价格段中',
prodpriceinuserpurch string comment '商品的价格是否在用户购买的价格段中'
prodpriceinuserpurch string comment '商品的价格是否在用户购买的价格段中',
user_product_buy_is_match string comment '用户购买商品二级类目价格等级是否匹配商品二级类目价格等级:1/0',
user_product_buy_price_level string comment '用户购买商品二级类目价格等级是否匹配商品二级类目价格等级:price_level/0'
) comment '用户商品交叉特征'
partitioned by (
p_day date comment '分区日期'
)
partitioned by (p_day date comment '分区日期')
row format delimited fields terminated by '\t'
stored as textfile;
create external table if not exists secoo_search.search_data_cross_feature_user_product_buy
(
device_id string comment '设备id',
product_id string comment '商品id',
category2_id string comment '二级类目id',
cate_2_price_level string comment '商品二级类目价格段1-10',
user_product_buy_is_match string comment '用户购买商品二级类目价格等级是否匹配商品二级类目价格等级:1/0',
user_product_buy_price_level string comment '用户购买商品二级类目价格等级是否匹配商品二级类目价格等级:price_level/0'
) comment '用户商品购买交叉特征'
partitioned by (p_day date comment '分区日期');
\ No newline at end of file
insert overwrite table secoo_search.search_data_cross_feature_user_product_buy partition(p_day=${yesterday})
select
UPPER(T1.uuid) as device_id,
T1.product_id as product_id,
T1.category_id_2 as category2_id,
T1.buy_sku_cat2_price_level as cate_2_price_level,
case
when T1.buy_sku_cat2_price_level = T2.cate_2_price_level then 1
else 0
end as user_product_buy_is_match,
case
when T1.buy_sku_cat2_price_level = T2.cate_2_price_level then T1.buy_sku_cat2_price_level
else 0
end as user_product_buy_price_level
from secoo_app.app_search_uuid_sku_price_level T1
left join
(select
W.product_id as product_id,
split(W.category_org_code,'_')[2] as category_id_2,
P.cate_2_price_level as cate_2_price_level
from secoo_fact.fact_search_product_wide_p_day W
join secoo_fact.fact_search_product_55_30d_click_product_p_day P on W.main_id = P.product_main_id and P.p_day = ${yesterday}
where W.p_day = ${yesterday}
) T2 on T1.product_id = T2.product_id
where T1.p_day = ${yesterday}
work_dir="/data/zhaoyanchao/java/shell/cross_feature/user_product/"
# 用户商品交叉特征-购买
delta_day=1
today_param=$1
yesterday=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
hive -f "$work_dir"create_cross_feature_user_product_buy.sql
hive --hivevar yesterday="'${yesterday}'" -f "$work_dir"insert_cross_feature_user_product_buy.sql
......@@ -2,11 +2,11 @@
work_dir="/data/zhaoyanchao/java/shell/cross_feature/user_product/"
# 检查推荐最新表名
delta_day=0
delta_day=1
today_param=$1
yesterday=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
date_str=`echo ${yesterday}|sed 's/\-/_/g'`
date_str=`echo ${today_param}|sed 's/\-/_/g'`
table_name="secoo_rcmd_features.userProductCrossFeatures_"${date_str}
echo ${table_name}
......@@ -16,9 +16,9 @@ while [[ ${result} -ne 0 ]];
do
delta_day=$(($delta_day+1))
yesterday=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
today_param_1=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
date_str=`echo ${yesterday}|sed 's/\-/_/g'`
date_str=`echo ${today_param_1}|sed 's/\-/_/g'`
table_name="secoo_rcmd_features.userProductCrossFeatures_"${date_str}
hive -e "desc $table_name"
......@@ -30,19 +30,24 @@ done
# 如果无表,建表
hive -f "$work_dir"create_cross_feature_user_product.sql
hive -e "insert overwrite table secoo_search.search_data_cross_feature_user_product partition(p_day=date'$today_param')
hive -e "insert overwrite table secoo_search.search_data_cross_feature_user_product partition(p_day=date'${today_param}')
select
device_id,
product_id,
prodanduserpurchspecial,
prodinusercat1,
prodinusercat2,
prodanduserpurchnew,
prodanduserpurchpromotion,
prodanduserpurchbig100,
prodpriceinuserbrowsing,
prodpriceinuserpurch
from $table_name;"
T1.device_id,
T1.product_id,
T1.prodanduserpurchspecial,
T1.prodinusercat1,
T1.prodinusercat2,
T1.prodanduserpurchnew,
T1.prodanduserpurchpromotion,
T1.prodanduserpurchbig100,
T1.prodpriceinuserbrowsing,
T1.prodpriceinuserpurch,
T2.user_product_buy_is_match,
T2.user_product_buy_price_level
from ${table_name} T1
left join secoo_search.search_data_cross_feature_user_product_buy T2 on T1.device_id = T2.device_id and T1.product_id = T2.product_id
where T2.p_day = '${yesterday}';"
hive -e "SELECT
sum(prodanduserpurchspecial),
sum(prodinusercat1),
......@@ -51,9 +56,10 @@ hive -e "SELECT
sum(prodanduserpurchpromotion),
sum(prodanduserpurchbig100),
sum(prodpriceinuserbrowsing),
sum(prodpriceinuserpurch)
FROM secoo_search.search_data_cross_feature_user_product
WHERE p_day = '$today_param'"
sum(prodpriceinuserpurch),
sum(user_product_buy_is_match)
FROM secoo_search.search_data_cross_feature_user_product WHERE p_day = '${today_param}'"
hive -e "SELECT
assert_true(sum(prodanduserpurchspecial) > 0),
assert_true(sum(prodinusercat1) > 0),
......@@ -62,6 +68,6 @@ hive -e "SELECT
assert_true(sum(prodanduserpurchpromotion) > 0),
assert_true(sum(prodanduserpurchbig100) > 0),
assert_true(sum(prodpriceinuserbrowsing) > 0),
assert_true(sum(prodpriceinuserpurch) > 0)
FROM secoo_search.search_data_cross_feature_user_product
WHERE p_day = '$today_param'"
\ No newline at end of file
assert_true(sum(prodpriceinuserpurch) > 0),
assert_true(sum(user_product_buy_is_match) > 0)
FROM secoo_search.search_data_cross_feature_user_product WHERE p_day = '${today_param}'"
\ No newline at end of file
-- 商品特征扩展表
create external table if not exists secoo_search.search_data_product_feature_ext
(
`product_id` bigint COMMENT '商品ID',
`product_id` bigint comment '商品ID',
`area_type` tinyint comment '货源地,(0大陆 1香港 2美国 3日本 4意大利)',
`sale_qty_180` bigint comment '前180天至今销售数量',
`sale_qty_90` bigint comment '前90天至今销售数量',
......
......@@ -51,9 +51,8 @@ create external table if not exists secoo_search.search_data_product_feature
sale_qty_180 bigint comment '前180天至今销售数量',
sale_qty_90 bigint comment '前90天至今销售数量',
sale_qty_30 bigint comment '前30天至今销售数量',
sale_qty_15 bigint comment '前15天至今销售数量'
sale_qty_15 bigint comment '前15天至今销售数量',
product_cate2_price_level tinyint comment '商品二级类目价格段1-10'
) comment '商品特征'
partitioned by (
p_day date comment '分区日期'
)
partitioned by (p_day date comment '分区日期')
stored as parquet;
......@@ -2,10 +2,10 @@
# 检查推荐最新表名
work_dir="/data/zhaoyanchao/java/shell/product_feature/"
delta_day=0
delta_day=1
today_param=$1
yesterday=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
date_str=`echo ${yesterday}|sed 's/\-/_/g'`
date_str=`echo ${today_param}|sed 's/\-/_/g'`
table_name="secoo_rcmd_features.productFeatures_"${date_str}
echo ${table_name}
......@@ -14,8 +14,8 @@ result=$?
while [[ ${result} -ne 0 ]];
do
delta_day=$(($delta_day+1))
yesterday=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
date_str=`echo ${yesterday}|sed 's/\-/_/g'`
param_delta_1=`date -d "${today_param} -$delta_day day" "+%Y-%m-%d"`
date_str=`echo ${param_delta_1}|sed 's/\-/_/g'`
table_name="secoo_rcmd_features.productFeatures_"${date_str}
hive -e "desc $table_name"
result=$?
......@@ -24,11 +24,11 @@ done
# 如果无表,建表
hive -f $"work_dir"create_search_product_feature_table.sql
hive -f "${work_dir}"create_search_product_feature_table.sql
hive -e "insert overwrite table secoo_search.search_data_product_feature partition(p_day=date'$today_param')
select
R.product_id,
R.product_id as product_id,
R.isbrand100 as isBrand100,
R.productpopularity as productPopularity,
R.ismemberproduct as isMemberProduct ,
......@@ -80,14 +80,17 @@ select
R.7daysaddcart as 7DaysAddCart,
R.30daysaddcart as 30DaysAddCart,
W.area_type,
W.sale_qty_180,
W.sale_qty_90,
W.sale_qty_30,
W.sale_qty_15
W.area_type as area_type,
W.sale_qty_180 as sale_qty_180,
W.sale_qty_90 as sale_qty_90,
W.sale_qty_30 as sale_qty_30,
W.sale_qty_15 as sale_qty_15,
P.cate_2_price_level as product_cate2_price_level
from $table_name R
left join secoo_fact.fact_search_product_wide_p_day W
on R.product_id = cast(W.product_id AS string) and W.p_day = '$yesterday';"
left join secoo_fact.fact_search_product_wide_p_day W on R.product_id = cast(W.product_id AS string) and W.p_day = '${yesterday}'
left join secoo_fact.fact_search_product_55_30d_click_product_p_day P on W.main_id = P.product_main_id and P.p_day = '${yesterday}'
;"
......
-- 商品特征扩展表
create external table if not exists secoo_search.user_brand_category2_favorite
(
device_id string comment '用户设备id',
click_category2_id1 string comment '用户点击偏好二级类目1',
click_category2_id2 string comment '用户点击偏好二级类目2',
click_category2_id3 string comment '用户点击偏好二级类目3',
click_category2_id4 string comment '用户点击偏好二级类目4',
click_category2_id5 string comment '用户点击偏好二级类目5',
click_category2_id6 string comment '用户点击偏好二级类目6',
click_brand_id1 string comment '用户点击偏好品牌1',
click_brand_id2 string comment '用户点击偏好品牌2',
click_brand_id3 string comment '用户点击偏好品牌3',
click_brand_id4 string comment '用户点击偏好品牌4',
click_brand_id5 string comment '用户点击偏好品牌5',
click_brand_id6 string comment '用户点击偏好品牌6',
add_category2_id1 string comment '用户加购偏好二级类目1',
add_category2_id2 string comment '用户加购偏好二级类目2',
add_category2_id3 string comment '用户加购偏好二级类目3',
add_category2_id4 string comment '用户加购偏好二级类目4',
add_category2_id5 string comment '用户加购偏好二级类目5',
add_category2_id6 string comment '用户加购偏好二级类目6',
add_brand_id1 string comment '用户加购偏好品牌1',
add_brand_id2 string comment '用户加购偏好品牌2',
add_brand_id3 string comment '用户加购偏好品牌3',
add_brand_id4 string comment '用户加购偏好品牌4',
add_brand_id5 string comment '用户加购偏好品牌5',
add_brand_id6 string comment '用户加购偏好品牌6',
pay_category2_id1 string comment '用户购买偏好二级类目1',
pay_category2_id2 string comment '用户购买偏好二级类目2',
pay_category2_id3 string comment '用户购买偏好二级类目3',
pay_category2_id4 string comment '用户购买偏好二级类目4',
pay_category2_id5 string comment '用户购买偏好二级类目5',
pay_category2_id6 string comment '用户购买偏好二级类目6',
pay_brand_id1 string comment '用户购买偏好品牌1',
pay_brand_id2 string comment '用户购买偏好品牌2',
pay_brand_id3 string comment '用户购买偏好品牌3',
pay_brand_id4 string comment '用户购买偏好品牌4',
pay_brand_id5 string comment '用户购买偏好品牌5',
pay_brand_id6 string comment '用户购买偏好品牌6'
) comment '搜索用户偏好品牌品类特征表'
partitioned by (p_day date comment '分区日期');
--点击偏好二级类目--
drop table tmp.tmp_user_click_product_category2_id;
create table if not exists tmp.tmp_user_click_product_category2_id as
select
device_id,
nvl(split(category2_ids, ',')[0], 0) click_category2_id1,
nvl(split(category2_ids, ',')[1], 0) click_category2_id2,
nvl(split(category2_ids, ',')[2], 0) click_category2_id3,
nvl(split(category2_ids, ',')[3], 0) click_category2_id4,
nvl(split(category2_ids, ',')[4], 0) click_category2_id5,
nvl(split(category2_ids, ',')[5], 0) click_category2_id6
from
(select
device_id as device_id,
concat_ws(',',collect_set(cast(category2_id as string))) as category2_ids
from
(select
device_id,
category2_id,
row_number() over(partition by device_id order by click_pv desc) rank
from
(select
click_device_id as device_id,
product_category_id_2 as category2_id,
sum(is_click) as click_pv
from secoo_fact_hour.fact_search_detail_union_p_hour_inrc
where p_day >= date_sub(${today}, 30) AND p_day < ${today}
and is_click = 1 and click_device_id is not null
group by click_device_id, product_category_id_2
) T1
) T2
where rank < 7 group by device_id
)T3;
--点击偏好品牌--
drop table tmp.tmp_user_click_product_brand_id;
create table if not exists tmp.tmp_user_click_product_brand_id as
select
device_id,
nvl(split(product_brand_ids, ',')[0], 0) click_brand_id1,
nvl(split(product_brand_ids, ',')[1], 0) click_brand_id2,
nvl(split(product_brand_ids, ',')[2], 0) click_brand_id3,
nvl(split(product_brand_ids, ',')[3], 0) click_brand_id4,
nvl(split(product_brand_ids, ',')[4], 0) click_brand_id5,
nvl(split(product_brand_ids, ',')[5], 0) click_brand_id6
from
(select
device_id as device_id,
concat_ws(',',collect_set(cast(product_brand_id as string))) as product_brand_ids
from
(select
device_id,
product_brand_id,
row_number() over(partition by device_id order by click_pv desc) rank
from
(select
click_device_id as device_id,
product_brand_id as product_brand_id,
sum(is_click) as click_pv
from secoo_fact_hour.fact_search_detail_union_p_hour_inrc
where p_day >= date_sub(${today}, 30) AND p_day < ${today}
and is_click = 1 and click_device_id is not null
group by click_device_id, product_brand_id
) T1
) T2
where rank < 7 group by device_id
)T3;
--加购偏好类目--
drop table tmp.tmp_user_add_product_category2_id;
create table if not exists tmp.tmp_user_add_product_category2_id as
select
device_id,
nvl(split(category2_ids, ',')[0], 0) add_category2_id1,
nvl(split(category2_ids, ',')[1], 0) add_category2_id2,
nvl(split(category2_ids, ',')[2], 0) add_category2_id3,
nvl(split(category2_ids, ',')[3], 0) add_category2_id4,
nvl(split(category2_ids, ',')[4], 0) add_category2_id5,
nvl(split(category2_ids, ',')[5], 0) add_category2_id6
from
(select
device_id as device_id,
concat_ws(',',collect_set(cast(category2_id as string))) as category2_ids
from
(select
device_id,
category2_id,
row_number() over(partition by device_id order by add_pv desc) rank
from
(select
add_cart_device_id as device_id,
product_category_id_2 as category2_id,
sum(is_action_add_cart) as add_pv
from secoo_fact_hour.fact_search_detail_union_p_hour_inrc
where p_day >= date_sub(${today}, 90) AND p_day < ${today}
and is_action_add_cart = 1 and add_cart_device_id is not null
group by add_cart_device_id, product_category_id_2
) T1
) T2
where rank < 7 group by device_id
)T3;
--加购偏好品牌--
drop table tmp.tmp_user_add_product_brand_id;
create table if not exists tmp.tmp_user_add_product_brand_id as
select
device_id,
nvl(split(product_brand_ids, ',')[0], 0) add_brand_id1,
nvl(split(product_brand_ids, ',')[1], 0) add_brand_id2,
nvl(split(product_brand_ids, ',')[2], 0) add_brand_id3,
nvl(split(product_brand_ids, ',')[3], 0) add_brand_id4,
nvl(split(product_brand_ids, ',')[4], 0) add_brand_id5,
nvl(split(product_brand_ids, ',')[5], 0) add_brand_id6
from
(select
device_id as device_id,
concat_ws(',',collect_set(cast(product_brand_id as string))) as product_brand_ids
from
(select
device_id,
product_brand_id,
row_number() over(partition by device_id order by add_pv desc) rank
from
(select
add_cart_device_id as device_id,
product_brand_id as product_brand_id,
sum(is_action_add_cart) as add_pv
from secoo_fact_hour.fact_search_detail_union_p_hour_inrc
where p_day >= date_sub(${today}, 90) AND p_day < ${today}
and is_action_add_cart = 1 and add_cart_device_id is not null
group by add_cart_device_id, product_brand_id
) T1
) T2
where rank < 7 group by device_id
)T3;
--购买偏好类目--
drop table tmp.tmp_user_pay_product_category2_id;
create table if not exists tmp.tmp_user_pay_product_category2_id as
select
device_id,
nvl(split(category2_ids, ',')[0], 0) pay_category2_id1,
nvl(split(category2_ids, ',')[1], 0) pay_category2_id2,
nvl(split(category2_ids, ',')[2], 0) pay_category2_id3,
nvl(split(category2_ids, ',')[3], 0) pay_category2_id4,
nvl(split(category2_ids, ',')[4], 0) pay_category2_id5,
nvl(split(category2_ids, ',')[5], 0) pay_category2_id6
from
(select
device_id as device_id,
concat_ws(',',collect_set(cast(category2_id as string))) as category2_ids
from
(select
device_id,
category2_id,
row_number() over(partition by device_id order by pay_pv desc) rank
from
(select
pay_device_id as device_id,
product_category_id_2 as category2_id,
sum(is_pay_success) as pay_pv
from secoo_fact_hour.fact_search_detail_union_p_hour_inrc
where p_day >= date_sub(${today}, 365) AND p_day < ${today}
and is_pay_success = 1 and pay_device_id is not null
group by pay_device_id, product_category_id_2
) T1
) T2
where rank < 7 group by device_id
)T3;
--购买偏好品牌--
drop table tmp.tmp_user_pay_product_brand_id;
create table if not exists tmp.tmp_user_pay_product_brand_id as
select
device_id,
nvl(split(product_brand_ids, ',')[0], 0) pay_brand_id1,
nvl(split(product_brand_ids, ',')[1], 0) pay_brand_id2,
nvl(split(product_brand_ids, ',')[2], 0) pay_brand_id3,
nvl(split(product_brand_ids, ',')[3], 0) pay_brand_id4,
nvl(split(product_brand_ids, ',')[4], 0) pay_brand_id5,
nvl(split(product_brand_ids, ',')[5], 0) pay_brand_id6
from
(select
device_id as device_id,
concat_ws(',',collect_set(cast(product_brand_id as string))) as product_brand_ids
from
(select
device_id,
product_brand_id,
row_number() over(partition by device_id order by pay_pv desc) rank
from
(select
pay_device_id as device_id,
product_brand_id as product_brand_id,
sum(is_pay_success) as pay_pv
from secoo_fact_hour.fact_search_detail_union_p_hour_inrc
where p_day >= date_sub(${today}, 365) AND p_day < ${today}
and is_pay_success = 1 and pay_device_id is not null
group by pay_device_id, product_brand_id
) T1
) T2
where rank < 7 group by device_id
)T3;
insert overwrite table secoo_search.user_brand_category2_favorite partition(p_day=${today})
select
C1.device_id as device_id,
nvl(C1.click_category2_id1,0) as click_category2_id1,
nvl(C1.click_category2_id2,0) as click_category2_id2,
nvl(C1.click_category2_id3,0) as click_category2_id3,
nvl(C1.click_category2_id4,0) as click_category2_id4,
nvl(C1.click_category2_id5,0) as click_category2_id5,
nvl(C1.click_category2_id6,0) as click_category2_id6,
nvl(B1.click_brand_id1,0) as click_brand_id1,
nvl(B1.click_brand_id2,0) as click_brand_id2,
nvl(B1.click_brand_id3,0) as click_brand_id3,
nvl(B1.click_brand_id4,0) as click_brand_id4,
nvl(B1.click_brand_id5,0) as click_brand_id5,
nvl(B1.click_brand_id6,0) as click_brand_id6,
nvl(C2.add_category2_id1,0) as add_category2_id1,
nvl(C2.add_category2_id2,0) as add_category2_id2,
nvl(C2.add_category2_id3,0) as add_category2_id3,
nvl(C2.add_category2_id4,0) as add_category2_id4,
nvl(C2.add_category2_id5,0) as add_category2_id5,
nvl(C2.add_category2_id6,0) as add_category2_id6,
nvl(B2.add_brand_id1,0) as add_brand_id1,
nvl(B2.add_brand_id2,0) as add_brand_id2,
nvl(B2.add_brand_id3,0) as add_brand_id3,
nvl(B2.add_brand_id4,0) as add_brand_id4,
nvl(B2.add_brand_id5,0) as add_brand_id5,
nvl(B2.add_brand_id6,0) as add_brand_id6,
nvl(C3.pay_category2_id1,0) as pay_category2_id1,
nvl(C3.pay_category2_id2,0) as pay_category2_id2,
nvl(C3.pay_category2_id3,0) as pay_category2_id3,
nvl(C3.pay_category2_id4,0) as pay_category2_id4,
nvl(C3.pay_category2_id5,0) as pay_category2_id5,
nvl(C3.pay_category2_id6,0) as pay_category2_id6,
nvl(B3.pay_brand_id1,0) as pay_brand_id1,
nvl(B3.pay_brand_id2,0) as pay_brand_id2,
nvl(B3.pay_brand_id3,0) as pay_brand_id3,
nvl(B3.pay_brand_id4,0) as pay_brand_id4,
nvl(B3.pay_brand_id5,0) as pay_brand_id5,
nvl(B3.pay_brand_id6,0) as pay_brand_id6
from tmp.tmp_user_click_product_category2_id C1
left join tmp.tmp_user_click_product_brand_id B1 on C1.device_id = B1.device_id
left join tmp.tmp_user_add_product_category2_id C2 on C1.device_id = C2.device_id
left join tmp.tmp_user_add_product_brand_id B2 on C1.device_id = B2.device_id
left join tmp.tmp_user_pay_product_category2_id C3 on C1.device_id = C3.device_id
left join tmp.tmp_user_pay_product_brand_id B3 on C1.device_id = B3.device_id;
\ No newline at end of file
work_dir="/data/zhaoyanchao/java/shell/user_feature/"
delta_day=0
today_param=$1
today=`date -d "${today_param} -${delta_day} day" "+%Y-%m-%d"`
echo ${today}
hive -f "$work_dir"create_user_brand_category_favorite_table.sql
hive --hivevar today="'${today}'" -f "$work_dir"insert_user_brand_category_favorite_table.sql
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment