Commit c39b62ac authored by 宋柯

模型调试

parent 3c2d3779
...@@ -508,10 +508,10 @@ def getEsConn(): ...@@ -508,10 +508,10 @@ def getEsConn():
def getClickSql(start, end): def getClickSql(start, end):
sql = """ sql = """
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,t1.city_id as user_city_id SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, t1.cl_type as os, t1.city_id as user_city_id
FROM FROM
( (
select partition_date,city_id,cl_id,business_id as card_id,time_stamp,page_stay,cl_type select partition_date,city_id,cl_id,business_id as card_id,page_stay,cl_type
from online.bl_hdfs_maidian_updates from online.bl_hdfs_maidian_updates
where action = 'page_view' where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}' AND partition_date>='{startDay}' and partition_date<='{endDay}'
...@@ -521,7 +521,7 @@ def getClickSql(start, end): ...@@ -521,7 +521,7 @@ def getClickSql(start, end):
AND cl_id != '' AND cl_id != ''
AND business_id is not null AND business_id is not null
AND business_id != '' AND business_id != ''
group by partition_date,city_id,cl_id,business_id,time_stamp,page_stay,cl_type group by partition_date,city_id,cl_id,business_id,page_stay,cl_type
) AS t1 ) AS t1
join join
( --渠道,新老 ( --渠道,新老
...@@ -539,7 +539,6 @@ def getClickSql(start, end): ...@@ -539,7 +539,6 @@ def getClickSql(start, end):
AND first_channel_source_type not like 'promotion\_jf\_%' AND first_channel_source_type not like 'promotion\_jf\_%'
) t2 ) t2
on t1.cl_id = t2.device_id on t1.cl_id = t2.device_id
LEFT JOIN LEFT JOIN
( --去除黑名单 ( --去除黑名单
select distinct device_id select distinct device_id
...@@ -556,10 +555,10 @@ def getClickSql(start, end): ...@@ -556,10 +555,10 @@ def getClickSql(start, end):
def getExposureSql(start, end): def getExposureSql(start, end):
# t1.partition_date, t1.cl_id device_id, t1.card_id, t1.time_stamp, t1.cl_type as os, t1.city_id as user_city_id # t1.partition_date, t1.cl_id device_id, t1.card_id, t1.time_stamp, t1.cl_type as os, t1.city_id as user_city_id
sql = """ sql = """
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,t1.city_id as user_city_id SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, cl_type as os, t1.city_id as user_city_id
from from
( --新首页卡片曝光 ( --新首页卡片曝光
SELECT partition_date,city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp SELECT partition_date,city_id,cl_type,cl_id,card_id
FROM online.ml_community_precise_exposure_detail FROM online.ml_community_precise_exposure_detail
where partition_date>='{startDay}' and partition_date<='{endDay}' where partition_date>='{startDay}' and partition_date<='{endDay}'
and action in ('page_precise_exposure','home_choiceness_card_exposure') and action in ('page_precise_exposure','home_choiceness_card_exposure')
...@@ -574,7 +573,6 @@ def getExposureSql(start, end): ...@@ -574,7 +573,6 @@ def getExposureSql(start, end):
and card_content_type in ('service') and card_content_type in ('service')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill') and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
) t1 ) t1
join join
( --渠道,新老 ( --渠道,新老
...@@ -592,7 +590,6 @@ def getExposureSql(start, end): ...@@ -592,7 +590,6 @@ def getExposureSql(start, end):
AND first_channel_source_type not like 'promotion\_jf\_%' AND first_channel_source_type not like 'promotion\_jf\_%'
) t2 ) t2
on t1.cl_id = t2.device_id on t1.cl_id = t2.device_id
LEFT JOIN LEFT JOIN
( --去除黑名单 ( --去除黑名单
select distinct device_id select distinct device_id
...@@ -913,9 +910,7 @@ def get_click_exp_rating_df(trainDays, spark): ...@@ -913,9 +910,7 @@ def get_click_exp_rating_df(trainDays, spark):
clickDF = clickDF.withColumn("label", F.lit(1)) clickDF = clickDF.withColumn("label", F.lit(1))
expDF = expDF.withColumn("label", F.lit(0)) expDF = expDF.withColumn("label", F.lit(0))
ratingDF = clickDF.union(expDF) ratingDF = clickDF.union(expDF)
ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\ ratingDF = ratingDF.withColumn("user_city_id", F.when(F.col("user_city_id").isNull(), "-1").otherwise(F.col("user_city_id")))
.withColumn("user_city_id", F.when(F.col("user_city_id").isNull(), "-1").otherwise(F.col("user_city_id")))\
.withColumn("timestamp",F.col("timestamp").cast("long"))
ratingDF.cache() ratingDF.cache()
print("ratingDF.columns: {}".format(ratingDF.columns)) print("ratingDF.columns: {}".format(ratingDF.columns))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment