Commit 7d47408e authored by 宋柯

模型bug修复

parent be1db75b
...@@ -558,102 +558,102 @@ def getItemStatisticSql(start, end): ...@@ -558,102 +558,102 @@ def getItemStatisticSql(start, end):
FROM FROM
( (
SELECT TT.card_id, TT.partition_date, TT.label, count(1) as label_count SELECT TT.card_id, TT.partition_date, TT.label, count(1) as label_count
FROM
(
SELECT T.partition_date, T.card_id, T.label
FROM FROM
( (
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,t1.city_id as user_city_id, 1 as label SELECT T.partition_date, T.card_id, T.label
FROM FROM
( (
select partition_date,city_id,cl_id,business_id as card_id,time_stamp,page_stay,cl_type SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,t1.city_id as user_city_id, 1 as label
from online.bl_hdfs_maidian_updates FROM
where action = 'page_view' (
AND partition_date>='{startDay}' and partition_date<='{endDay}' select partition_date,city_id,cl_id,business_id as card_id,time_stamp,page_stay,cl_type
AND page_name='welfare_detail' from online.bl_hdfs_maidian_updates
-- AND page_stay>=1 where action = 'page_view'
AND cl_id is not null AND partition_date>='{startDay}' and partition_date<='{endDay}'
AND cl_id != '' AND page_name='welfare_detail'
AND business_id is not null -- AND page_stay>=1
AND business_id != '' AND cl_id is not null
group by partition_date,city_id,cl_id,business_id,time_stamp,page_stay,cl_type AND cl_id != ''
) AS t1 AND business_id is not null
join AND business_id != ''
( --渠道,新老 group by partition_date,city_id,cl_id,business_id,time_stamp,page_stay,cl_type
SELECT distinct device_id ) AS t1
FROM online.ml_device_day_active_status join
where partition_date>='{startDay}' and partition_date<='{endDay}' ( --渠道,新老
AND active_type in ('1','2','4') SELECT distinct device_id
and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3' FROM online.ml_device_day_active_status
,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang' where partition_date>='{startDay}' and partition_date<='{endDay}'
,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1' AND active_type in ('1','2','4')
,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4' and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100' ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ' ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown') ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
AND first_channel_source_type not like 'promotion\_jf\_%' ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
) t2 ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
on t1.cl_id = t2.device_id ,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown')
AND first_channel_source_type not like 'promotion\_jf\_%'
LEFT JOIN ) t2
( --去除黑名单 on t1.cl_id = t2.device_id
select distinct device_id
from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D LEFT JOIN
where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','') ( --去除黑名单
AND is_abnormal_device = 'true' select distinct device_id
)t3 from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D
on t3.device_id=t2.device_id where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','')
WHERE t3.device_id is null AND is_abnormal_device = 'true'
UNION )t3
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,t1.city_id as user_city_id, 0 as label on t3.device_id=t2.device_id
from WHERE t3.device_id is null
( --新首页卡片曝光 UNION
SELECT partition_date,city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,t1.city_id as user_city_id, 0 as label
FROM online.ml_community_precise_exposure_detail from
where partition_date>='{startDay}' and partition_date<='{endDay}' ( --新首页卡片曝光
and action in ('page_precise_exposure','home_choiceness_card_exposure') SELECT partition_date,city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp
and cl_id IS NOT NULL FROM online.ml_community_precise_exposure_detail
and card_id IS NOT NULL where partition_date>='{startDay}' and partition_date<='{endDay}'
and is_exposure='1' and action in ('page_precise_exposure','home_choiceness_card_exposure')
--and page_name='home' and cl_id IS NOT NULL
--and tab_name='精选' and card_id IS NOT NULL
--and page_name in ('home','search_result_more') and is_exposure='1'
--and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品')) --and page_name='home'
and card_type in ('card','video') --and tab_name='精选'
and card_content_type in ('service') --and page_name in ('home','search_result_more')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill') --and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品'))
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id and card_type in ('card','video')
and card_content_type in ('service')
) t1 and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
join group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
( --渠道,新老
SELECT distinct device_id ) t1
FROM online.ml_device_day_active_status join
where partition_date>='{startDay}' and partition_date<='{endDay}' ( --渠道,新老
AND active_type in ('1','2','4') SELECT distinct device_id
and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3' FROM online.ml_device_day_active_status
,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang' where partition_date>='{startDay}' and partition_date<='{endDay}'
,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1' AND active_type in ('1','2','4')
,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4' and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100' ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ' ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown') ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
AND first_channel_source_type not like 'promotion\_jf\_%' ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
) t2 ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
on t1.cl_id = t2.device_id ,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown')
AND first_channel_source_type not like 'promotion\_jf\_%'
LEFT JOIN ) t2
( --去除黑名单 on t1.cl_id = t2.device_id
select distinct device_id
from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D LEFT JOIN
where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','') ( --去除黑名单
AND is_abnormal_device = 'true' select distinct device_id
)t3 from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D
on t3.device_id=t2.device_id where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','')
WHERE t3.device_id is null AND is_abnormal_device = 'true'
) T )t3
) TT on t3.device_id=t2.device_id
GROUP BY TT.card_id, TT.partition_date, TT.label WHERE t3.device_id is null
) T
) TT
GROUP BY TT.card_id, TT.partition_date, TT.label
) TTT ) TTT
GROUP BY TTT.card_id, TTT.label GROUP BY TTT.card_id, TTT.label
""".format(startDay=start,endDay=end) """.format(startDay=start,endDay=end)
...@@ -927,8 +927,8 @@ def get_service_feature_df(): ...@@ -927,8 +927,8 @@ def get_service_feature_df():
def addDays(n, format="%Y%m%d"): def addDays(n, format="%Y%m%d"):
return (date.today() + timedelta(days=n)).strftime(format) return (date.today() + timedelta(days=n)).strftime(format)
def generatePartitionDates(trainDays): def generatePartitionDates(partitionDates):
return [addDays(-trainDay) for trainDay in range(trainDays)] return [addDays(-trainDay) for trainDay in range(partitionDates)]
#显示所有列 #显示所有列
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
...@@ -942,11 +942,12 @@ if __name__ == '__main__': ...@@ -942,11 +942,12 @@ if __name__ == '__main__':
start = time.time() start = time.time()
#入参 #入参
trainDays = int(sys.argv[1]) trainDays = int(sys.argv[1])
itemStatisticStartDays = int(sys.argv[2])
print('trainDays:{}'.format(trainDays),flush=True) print('trainDays:{}'.format(trainDays),flush=True)
endDay = addDays(0) endDay = addDays(0)
startDay = addDays(-int(trainDays)) startDay = addDays(-int(trainDays))
itemStatisticStartDay = addDays(-int(trainDays + 31)) itemStatisticStartDay = addDays(-int(trainDays + itemStatisticStartDays))
print("train_data start:{} end:{}".format(startDay,endDay)) print("train_data start:{} end:{}".format(startDay,endDay))
...@@ -968,7 +969,7 @@ if __name__ == '__main__': ...@@ -968,7 +969,7 @@ if __name__ == '__main__':
itemStatisticDF = spark.sql(itemStatisticSql) itemStatisticDF = spark.sql(itemStatisticSql)
itemStatisticDF.show(100, False) itemStatisticDF.show(100, False)
partitionDatas = generatePartitionDates(trainDays) partitionDatas = generatePartitionDates(trainDays + itemStatisticStartDays)
partitionDatasBC = spark.sparkContext.broadcast(partitionDatas) partitionDatasBC = spark.sparkContext.broadcast(partitionDatas)
def splitPatitionDatasFlatMapFunc(row): def splitPatitionDatasFlatMapFunc(row):
...@@ -1064,7 +1065,7 @@ if __name__ == '__main__': ...@@ -1064,7 +1065,7 @@ if __name__ == '__main__':
# item统计特征处理 # item统计特征处理
itemStaticDF = addItemStaticFeatures(ratingDF,itemDF_spark,dataVocab) itemStaticDF = addItemStaticFeatures(ratingDF,itemDF_spark,dataVocab)
sys.exit(1) # sys.exit(1)
# 统计数据处理 # 统计数据处理
# ratingSamplesWithLabel = addStaticsFeatures(ratingSamplesWithLabel,dataVocab) # ratingSamplesWithLabel = addStaticsFeatures(ratingSamplesWithLabel,dataVocab)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment