Commit b0896276 authored by 宋柯

模型上线

parent 62c71856
...@@ -153,6 +153,20 @@ def getPredictItemStaticFeatures(itemStatisticDays): ...@@ -153,6 +153,20 @@ def getPredictItemStaticFeatures(itemStatisticDays):
itemStatisticDF = spark.sql(itemStatisticSql) itemStatisticDF = spark.sql(itemStatisticSql)
# itemStatisticDF.show(100, False) # itemStatisticDF.show(100, False)
partitionDatas = generatePredictPartitionDates(itemStatisticDays)
partitionDatasBC = spark.sparkContext.broadcast(partitionDatas)
def splitPatitionDatasFlatMapFunc(row):
    """Expand one aggregated row into one tuple per broadcast partition date.

    ``row`` must expose ``card_id``, ``label`` and
    ``partition_date_label_count_list``; every entry of that list is a
    ``"<partition_date>_<label_count>"`` string. The result holds one
    ``(card_id, partition_date, label, label_count)`` tuple for each date in
    ``partitionDatasBC.value``; dates missing from the row's list get the
    string ``'0'`` as their count.
    """
    card, lbl = row.card_id, row.label

    # Index the collected "date_count" strings by their date part.
    counts_by_date = {}
    for entry in row.partition_date_label_count_list:
        fields = entry.split('_')
        counts_by_date[fields[0]] = fields[1]

    return [
        (card, day, lbl, counts_by_date.get(day, '0'))
        for day in partitionDatasBC.value
    ]
itemStatisticDF = itemStatisticDF.rdd.flatMap(splitPatitionDatasFlatMapFunc).toDF(["card_id", "partition_date", "label", "label_count"])
itemStatisticDF.createOrReplaceTempView("predictItemStatisticDF") itemStatisticDF.createOrReplaceTempView("predictItemStatisticDF")
...@@ -761,100 +775,106 @@ def getItemStatisticSql(start, end): ...@@ -761,100 +775,106 @@ def getItemStatisticSql(start, end):
def getPredictItemStatisticSql(start, end):
    """Build the Spark SQL that aggregates per-day label counts for each card.

    The query unions two sources, both restricted to devices that pass the
    channel filter (t2) and are absent from yesterday's abnormal-device
    list (t3):

    * label 1 - ``welfare_detail`` page views with ``page_stay >= 2``
    * label 0 - precise card exposures of ``service`` content

    Rows are counted per (partition_date, card_id, label) and then folded
    per (card_id, label) into ``partition_date_label_count_list``, a list of
    ``"<partition_date>_<label_count>"`` strings.

    Args:
        start: Inclusive lower bound compared against ``partition_date``.
        end: Inclusive upper bound compared against ``partition_date``.
            (Presumably both are ``%Y%m%d`` strings like the ones addDays
            produces by default - confirm with the caller.)

    Returns:
        The formatted SQL text. The text is also printed for debugging.
    """
    # Both inner t1 subqueries must project partition_date: the outer selects
    # reference t1.partition_date, and a column that is only grouped on (not
    # selected) is not visible outside the subquery.
    #
    # The LIKE pattern is written with doubled backslashes so the emitted SQL
    # still contains 'promotion\_jf\_%' without relying on an invalid Python
    # escape sequence.
    #
    # NOTE(review): UNION (not UNION ALL) de-duplicates identical
    # (partition_date, card_id, label) rows before count(1) runs, so
    # label_count looks like it will always be 1. Left unchanged here to stay
    # aligned with however the training-side query is written - confirm which
    # behaviour is intended and change both together.
    sql = """
    SELECT TTT.card_id, TTT.label, COLLECT_LIST(CONCAT(TTT.partition_date, '_', TTT.label_count)) partition_date_label_count_list
    FROM
    (
        SELECT TT.partition_date, TT.card_id, TT.label, count(1) as label_count
        FROM
        (
            SELECT T.partition_date, T.card_id, T.label
            FROM
            (
                SELECT t1.partition_date, t1.card_id, 1 as label
                FROM
                (
                    select partition_date, cl_id, business_id as card_id
                    from online.bl_hdfs_maidian_updates
                    where action = 'page_view'
                    AND partition_date>='{startDay}' and partition_date<='{endDay}'
                    AND page_name='welfare_detail'
                    AND page_stay >= 2
                    AND cl_id is not null
                    AND cl_id != ''
                    AND business_id is not null
                    AND business_id != ''
                    group by partition_date, city_id, cl_id, business_id, cl_type
                ) AS t1
                join
                ( --渠道,新老
                    SELECT distinct device_id
                    FROM online.ml_device_day_active_status
                    where partition_date>='{startDay}' and partition_date<='{endDay}'
                    AND active_type in ('1','2','4')
                    and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
                    ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
                    ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
                    ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
                    ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
                    ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
                    ,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown')
                    AND first_channel_source_type not like 'promotion\\_jf\\_%'
                ) t2
                on t1.cl_id = t2.device_id
                LEFT JOIN
                ( --去除黑名单
                    select distinct device_id
                    from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D
                    where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','')
                    AND is_abnormal_device = 'true'
                )t3
                on t3.device_id=t2.device_id
                WHERE t3.device_id is null
                UNION
                SELECT t1.partition_date, t1.card_id, 0 as label
                from
                ( --新首页卡片曝光
                    SELECT partition_date, cl_id, card_id
                    FROM online.ml_community_precise_exposure_detail
                    where partition_date>='{startDay}' and partition_date<='{endDay}'
                    and action in ('page_precise_exposure','home_choiceness_card_exposure')
                    and cl_id IS NOT NULL
                    and card_id IS NOT NULL
                    and is_exposure='1'
                    --and page_name='home'
                    --and tab_name='精选'
                    --and page_name in ('home','search_result_more')
                    --and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品'))
                    and card_type in ('card','video')
                    and card_content_type in ('service')
                    and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
                    group by partition_date, city_id, cl_type, cl_id, card_id, app_session_id
                ) t1
                join
                ( --渠道,新老
                    SELECT distinct device_id
                    FROM online.ml_device_day_active_status
                    where partition_date>='{startDay}' and partition_date<='{endDay}'
                    AND active_type in ('1','2','4')
                    and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
                    ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
                    ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
                    ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
                    ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
                    ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
                    ,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown')
                    AND first_channel_source_type not like 'promotion\\_jf\\_%'
                ) t2
                on t1.cl_id = t2.device_id
                LEFT JOIN
                ( --去除黑名单
                    select distinct device_id
                    from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D
                    where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','')
                    AND is_abnormal_device = 'true'
                )t3
                on t3.device_id=t2.device_id
                WHERE t3.device_id is null
            ) T
        ) TT
        GROUP BY TT.partition_date, TT.card_id, TT.label
    ) TTT
    GROUP BY TTT.card_id, TTT.label
    """.format(startDay = start,endDay = end)
    print(sql)
    return sql
...@@ -1010,6 +1030,9 @@ def addDays(n, format="%Y%m%d"): ...@@ -1010,6 +1030,9 @@ def addDays(n, format="%Y%m%d"):
def generatePartitionDates(partitionDates):
    """Return ``partitionDates`` values from addDays, for 1..partitionDates days back.

    Each offset is passed to ``addDays`` as a negative number, so the first
    element corresponds to ``addDays(-1)`` and the last to
    ``addDays(-partitionDates)``.
    """
    return [addDays(-daysBack) for daysBack in range(1, partitionDates + 1)]
def generatePredictPartitionDates(partitionDates):
    """Return ``partitionDates`` values from addDays, for 0..partitionDates-1 days back.

    Unlike generatePartitionDates, the window starts at offset 0, so the
    first element corresponds to ``addDays(0)``.
    """
    predictDates = []
    for daysBack in range(partitionDates):
        predictDates.append(addDays(-daysBack))
    return predictDates
# Show all columns
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
# Show all rows
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.