Commit 2d6bb864 authored by 宋柯

Model debugging

parent e4e1f131
......@@ -86,6 +86,7 @@ def addItemStaticFeatures(samples,itemDF,dataVocab):
ctrUdf = F.udf(wilson_ctr, FloatType())
# No over() window for item features: an item can stay live indefinitely, so the latest statistics are sufficient
print("Processing item statistic features...")
# TODO: item features have a feature leakage (time travel) problem
staticFeatures = samples.groupBy('item_id').agg(F.count(F.lit(1)).alias('itemRatingCount'),
F.avg(F.col('rating')).alias('itemRatingAvg'),
F.stddev(F.col('rating')).alias('itemRatingStddev'),
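The wilson_ctr wrapped by ctrUdf above is defined elsewhere in this module; for reference, a minimal sketch of a Wilson lower-bound CTR, assuming (clicks, exposures) inputs (the name wilson_ctr_sketch and its signature are illustrative, not the commit's actual code):

```python
import math

def wilson_ctr_sketch(clicks, exposures, z=1.96):
    """Lower bound of the Wilson score interval: a conservative CTR estimate
    that shrinks toward 0 when an item has only a handful of exposures."""
    if not exposures or exposures <= 0:
        return 0.0
    n = float(exposures)
    p = float(clicks) / n
    centre = p + z * z / (2.0 * n)
    margin = z * math.sqrt((p * (1.0 - p) + z * z / (4.0 * n)) / n)
    return float((centre - margin) / (1.0 + z * z / n))
```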
......@@ -551,6 +552,107 @@ def getExposureSql(start, end):
print(sql)
return sql
def getItemStatisticSql(start, end):
sql = """
SELECT TT.card_id, TT.partition_date, TT.label, count(1) as label_count
FROM
(
SELECT T.partition_date, T.card_id, T.label
FROM
(
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,t1.city_id as user_city_id, 1 as label
FROM
(
select partition_date,city_id,cl_id,business_id as card_id,time_stamp,page_stay,cl_type
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
AND page_name='welfare_detail'
-- AND page_stay>=1
AND cl_id is not null
AND cl_id != ''
AND business_id is not null
AND business_id != ''
group by partition_date,city_id,cl_id,business_id,time_stamp,page_stay,cl_type
) AS t1
join
( -- channel filter, new vs. returning users
SELECT distinct device_id
FROM online.ml_device_day_active_status
where partition_date>='{startDay}' and partition_date<='{endDay}'
AND active_type in ('1','2','4')
and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown')
AND first_channel_source_type not like 'promotion\_jf\_%'
) t2
on t1.cl_id = t2.device_id
LEFT JOIN
( -- exclude blacklisted devices
select distinct device_id
from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D
where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','')
AND is_abnormal_device = 'true'
)t3
on t3.device_id=t2.device_id
WHERE t3.device_id is null
UNION
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,t1.city_id as user_city_id, 0 as label
from
( -- new home page card exposures
SELECT partition_date,city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp
FROM online.ml_community_precise_exposure_detail
where partition_date>='{startDay}' and partition_date<='{endDay}'
and action in ('page_precise_exposure','home_choiceness_card_exposure')
and cl_id IS NOT NULL
and card_id IS NOT NULL
and is_exposure='1'
--and page_name='home'
--and tab_name='精选'
--and page_name in ('home','search_result_more')
--and ((page_name='home' and tab_name='精选') or (page_name='category' and tab_name = '商品'))
and card_type in ('card','video')
and card_content_type in ('service')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
) t1
join
( -- channel filter, new vs. returning users
SELECT distinct device_id
FROM online.ml_device_day_active_status
where partition_date>='{startDay}' and partition_date<='{endDay}'
AND active_type in ('1','2','4')
and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
,'promotion_shike','promotion_julang_jl03','promotion_zuimei','','unknown')
AND first_channel_source_type not like 'promotion\_jf\_%'
) t2
on t1.cl_id = t2.device_id
LEFT JOIN
( -- exclude blacklisted devices
select distinct device_id
from ML.ML_D_CT_DV_DEVICECLEAN_DIMEN_D
where PARTITION_DAY =regexp_replace(DATE_SUB(current_date,1) ,'-','')
AND is_abnormal_device = 'true'
)t3
on t3.device_id=t2.device_id
WHERE t3.device_id is null
) T
) TT
GROUP BY TT.card_id, TT.partition_date, TT.label
""".format(startDay=start,endDay=end)
def getClickSql2(start, end):
sql = """
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.business_id card_id,t1.time_stamp time_stamp,t1.page_stay as page_stay
......@@ -837,6 +939,9 @@ if __name__ == '__main__':
print("train_data start:{} end:{}".format(startDay,endDay))
spark = get_spark("service_feature_csv_export")
spark.sparkContext.setLogLevel("ERROR")
......@@ -847,7 +952,10 @@ if __name__ == '__main__':
# behavior data (clicks and exposures)
clickSql = getClickSql(startDay,endDay)
expSql = getExposureSql(startDay,endDay)
itemStatisticSql = getItemStatisticSql(startDay, endDay)
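# debug: preview the item statistics query results, then stop the job here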
spark.sql(itemStatisticSql).show(100, False)
sys.exit(1)
clickDF = spark.sql(clickSql)
clickDF.createOrReplaceTempView("clickDF")
clickDF.cache()
......@@ -918,21 +1026,21 @@ if __name__ == '__main__':
print("dataVocab:")
for k, v in dataVocab.items():
print(k, len(v), v)
sys.exit(1)
itemDF_spark = spark.createDataFrame(itemDF)
itemDF_spark.printSchema()
itemDF_spark.show(10, truncate=False)
sys.exit(1)
# item statistic feature processing
itemStaticDF = addItemStaticFeatures(ratingSamplesWithLabel,itemDF_spark,dataVocab)
# item statistic feature processing
itemStaticDF = addItemStaticFeatures(ratingDF,itemDF_spark,dataVocab)
sys.exit(1)
# statistics processing
# ratingSamplesWithLabel = addStaticsFeatures(ratingSamplesWithLabel,dataVocab)
samples = ratingSamplesWithLabel.join(itemStaticDF, on=['item_id'], how='inner')
samples = ratingDF.join(itemStaticDF, on=['item_id'], how='inner')
print("处理user特征...")
samplesWithUserFeatures = addUserFeatures(samples,dataVocab,multiVocab)
......