Commit c39b62ac authored by 宋柯

模型调试

parent 3c2d3779
......@@ -508,10 +508,10 @@ def getEsConn():
def getClickSql(start, end):
sql = """
SELECT DISTINCT t1.partition_date, t1.cl_id device_id, t1.card_id,t1.time_stamp,t1.cl_type as os,t1.city_id as user_city_id
SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, t1.cl_type as os, t1.city_id as user_city_id
FROM
(
select partition_date,city_id,cl_id,business_id as card_id,time_stamp,page_stay,cl_type
select partition_date,city_id,cl_id,business_id as card_id,page_stay,cl_type
from online.bl_hdfs_maidian_updates
where action = 'page_view'
AND partition_date>='{startDay}' and partition_date<='{endDay}'
......@@ -521,7 +521,7 @@ def getClickSql(start, end):
AND cl_id != ''
AND business_id is not null
AND business_id != ''
group by partition_date,city_id,cl_id,business_id,time_stamp,page_stay,cl_type
group by partition_date,city_id,cl_id,business_id,page_stay,cl_type
) AS t1
join
( --渠道,新老
......@@ -539,7 +539,6 @@ def getClickSql(start, end):
AND first_channel_source_type not like 'promotion\_jf\_%'
) t2
on t1.cl_id = t2.device_id
LEFT JOIN
( --去除黑名单
select distinct device_id
......@@ -556,10 +555,10 @@ def getClickSql(start, end):
def getExposureSql(start, end):
# t1.partition_date, t1.cl_id device_id, t1.card_id, t1.time_stamp, t1.cl_type as os, t1.city_id as user_city_id
sql = """
SELECT DISTINCT t1.partition_date,t1.cl_id device_id,t1.card_id,t1.time_stamp,cl_type as os,t1.city_id as user_city_id
SELECT t1.partition_date, t1.cl_id device_id, t1.card_id, cl_type as os, t1.city_id as user_city_id
from
( --新首页卡片曝光
SELECT partition_date,city_id,cl_type,cl_id,card_id,max(time_stamp) as time_stamp
SELECT partition_date,city_id,cl_type,cl_id,card_id
FROM online.ml_community_precise_exposure_detail
where partition_date>='{startDay}' and partition_date<='{endDay}'
and action in ('page_precise_exposure','home_choiceness_card_exposure')
......@@ -574,7 +573,6 @@ def getExposureSql(start, end):
and card_content_type in ('service')
and (get_json_object(exposure_card,'$.in_page_pos') is null or get_json_object(exposure_card,'$.in_page_pos') != 'seckill')
group by partition_date,city_id,cl_type,cl_id,card_id,app_session_id
) t1
join
( --渠道,新老
......@@ -592,7 +590,6 @@ def getExposureSql(start, end):
AND first_channel_source_type not like 'promotion\_jf\_%'
) t2
on t1.cl_id = t2.device_id
LEFT JOIN
( --去除黑名单
select distinct device_id
......@@ -913,9 +910,7 @@ def get_click_exp_rating_df(trainDays, spark):
clickDF = clickDF.withColumn("label", F.lit(1))
expDF = expDF.withColumn("label", F.lit(0))
ratingDF = clickDF.union(expDF)
ratingDF = ratingDF.withColumnRenamed("time_stamp", "timestamp")\
.withColumn("user_city_id", F.when(F.col("user_city_id").isNull(), "-1").otherwise(F.col("user_city_id")))\
.withColumn("timestamp",F.col("timestamp").cast("long"))
ratingDF = ratingDF.withColumn("user_city_id", F.when(F.col("user_city_id").isNull(), "-1").otherwise(F.col("user_city_id")))
ratingDF.cache()
print("ratingDF.columns: {}".format(ratingDF.columns))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment