Commit 01a55b5e authored by 宋柯

模型调试

parent c39b62ac
......@@ -511,17 +511,17 @@ def getClickSql(start, end):
SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, t1.cl_type as os, t1.city_id as user_city_id
FROM
(
select partition_date,city_id,cl_id,business_id as card_id,page_stay,cl_type
select partition_date, city_id, cl_id, business_id as card_id, cl_type
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
AND page_name='welfare_detail'
-- AND page_stay>=1
AND page_stay >= 2
AND cl_id is not null
AND cl_id != ''
AND business_id is not null
AND business_id != ''
group by partition_date,city_id,cl_id,business_id,page_stay,cl_type
group by partition_date, city_id, cl_id, business_id, cl_type
) AS t1
join
( --渠道,新老
......@@ -614,20 +614,20 @@ def getItemStatisticSql(start, end):
SELECT T.partition_date, T.card_id, T.label
FROM
(
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,t1.city_id as user_city_id, 1 as label
SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, t1.cl_type as os, t1.city_id as user_city_id, 1 as label
FROM
(
select partition_date,city_id,cl_id,business_id as card_id,time_stamp,page_stay,cl_type
select partition_date, city_id, cl_id, business_id as card_id, cl_type
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
AND page_name='welfare_detail'
-- AND page_stay>=1
AND page_stay >= 2
AND cl_id is not null
AND cl_id != ''
AND business_id is not null
AND business_id != ''
group by partition_date,city_id,cl_id,business_id,time_stamp,page_stay,cl_type
group by partition_date, city_id, cl_id, business_id, cl_type
) AS t1
join
( --渠道,新老
......@@ -656,10 +656,10 @@ def getItemStatisticSql(start, end):
on t3.device_id=t2.device_id
WHERE t3.device_id is null
UNION
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,t1.city_id as user_city_id, 0 as label
SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, cl_type as os, t1.city_id as user_city_id, 0 as label
from
( --新首页卡片曝光
SELECT partition_date,city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp
SELECT partition_date, city_id, cl_type, cl_id, card_id
FROM online.ml_community_precise_exposure_detail
where partition_date>='{startDay}' and partition_date<='{endDay}'
and action in ('page_precise_exposure','home_choiceness_card_exposure')
......@@ -673,8 +673,7 @@ def getItemStatisticSql(start, end):
and card_type in ('card','video')
and card_content_type in ('service')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
group by partition_date, city_id, cl_type, cl_id, card_id, app_session_id
) t1
join
( --渠道,新老
......@@ -692,7 +691,6 @@ def getItemStatisticSql(start, end):
AND first_channel_source_type not like 'promotion\_jf\_%'
) t2
on t1.cl_id = t2.device_id
LEFT JOIN
( --去除黑名单
select distinct device_id
......@@ -894,7 +892,7 @@ def get_click_exp_rating_df(trainDays, spark):
#曝光数据过滤掉点击数据
print("expDF 过滤点击数据前 count: ", expDF.count())
expDF = spark.sql("""
SELECT t1.partition_date, t1.device_id, t1.card_id, t1.time_stamp, t1.os, t1.user_city_id
SELECT t1.partition_date, t1.device_id, t1.card_id, t1.os, t1.user_city_id
FROM expDF t1
LEFT JOIN clickDF t2
ON t1.partition_date = t2.partition_date
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment