Commit bee2d7ff authored by 张彦钊

在esmm_train_data中增加首页精选之外的点击样本

parent d925b533
......@@ -4,7 +4,7 @@ package com.gmei
import java.io.Serializable
import java.time.LocalDate
import org.apache.spark.sql.{SaveMode, SparkSession, TiContext}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
......@@ -139,8 +139,9 @@ object EsmmData {
// println(cvr_data_filter.count())
val clk_data_filter =clk_data.except(cvr_data).withColumn("y",lit(1)).withColumn("z",lit(0))
val other_click = get_other_click(sc,stat_date_not)
val all_click = clk_data.union(other_click)
val clk_data_filter =all_click.except(cvr_data).withColumn("y",lit(1)).withColumn("z",lit(0))
// clk_data_filter.createOrReplaceTempView("clk_data_filter")
// clk_data_filter.show()
// println("clk_data_filter.count()")
......@@ -222,7 +223,6 @@ object EsmmData {
""".stripMargin
)
GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id2, table="esmm_train_data",SaveMode.Append)
} else {
......@@ -233,6 +233,103 @@ object EsmmData {
}
}
/**
  * Collects diary-card click events that happen OUTSIDE the home page's
  * featured ("精选") tab, so they can be merged into the esmm training data
  * as additional positive click samples.
  *
  * Sources unioned together:
  *   1. home-page diary-card clicks on non-featured tabs
  *   2. clicks on the "recommended diaries" block of a diary detail page
  *   3. diary clicks from the combined search result page
  *   4. diary-card clicks on any page other than home
  *
  * Blacklisted promotion channels and spam devices (pv_ratio >= 0.95) are
  * filtered out, then each click is joined to its diary's service id.
  *
  * @param spark     active SparkSession
  * @param yesterday partition date in 'yyyyMMdd' form (e.g. "20190101")
  * @return distinct rows (stat_date, device_id, ucity_id, cid_id,
  *         diary_service_id)
  */
def get_other_click(spark:SparkSession,yesterday:String): DataFrame ={
  // 1. Home-page diary-card clicks on tabs other than "精选".
  val home_non_featured = spark.sql(
    s"""
       |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
       |device["device_id"] as device_id,channel as device_type,
       |city_id,params['business_id'] as cid
       |from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
       |and action = 'on_click_diary_card' and params['tab_name'] != '精选'
       |and params['page_name'] = 'home'
      """.stripMargin
  )
  // 2. Clicks on the recommended-diaries block inside a diary detail page.
  val recommend = spark.sql(
    s"""
       |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
       |device["device_id"] as device_id,channel as device_type,
       |city_id,params["business_id"] as cid
       |from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
       |and action = 'diarybook_detail_click_recommend_block' and params["business_type"] = "diary"
      """.stripMargin
  )
  // 3. Diary clicks from the combined ("综合") search result page.
  val search_zonghe = spark.sql(
    s"""
       |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
       |device["device_id"] as device_id,channel as device_type,city_id,params["business_id"] as cid
       |from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
       |and action = 'search_result_click_infomation_item' and params["business_type"] = "diary"
      """.stripMargin
  )
  // 4. Diary-card clicks anywhere except the home page.
  // NOTE(review): this one reads params["diary_id"] while the others read
  // params["business_id"] — presumably intentional for this event type;
  // confirm against the tracking schema.
  val non_home = spark.sql(
    s"""
       |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
       |device["device_id"] as device_id,channel as device_type,city_id,params["diary_id"] as cid
       |from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
       |and action = 'on_click_diary_card' and params['page_name'] != 'home'
      """.stripMargin
  )

  // Union all click sources (replaces the previous var + reassignment).
  val all_clicks = home_non_featured
    .union(recommend)
    .union(search_zonghe)
    .union(non_home)
  all_clicks.createOrReplaceTempView("temp_result")

  // Drop blacklisted promotion channels and devices flagged as hospital
  // spam (pv_ratio >= 0.95 in the daily or monthly spam tables).
  val filtered = spark.sql(
    s"""
       |select * from temp_result
       |where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
       |     ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
       |     ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
       |     ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
       |     ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
       |     and device_id not in
       |     (SELECT cl_id
       |      FROM online.ml_hospital_spam_pv_day
       |      WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
       |      AND pv_ratio>=0.95
       |      UNION ALL
       |      SELECT cl_id
       |      FROM online.ml_hospital_spam_pv_month
       |      WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
       |      AND pv_ratio>=0.95
       |     )
      """.stripMargin
  )
  filtered.createOrReplaceTempView("temp_result02")

  // Attach each diary's service id from yesterday's diary snapshot.
  // NOTE(review): the partition filter on the join's right side sits in the
  // WHERE clause, which effectively turns the left join into an inner join
  // for unmatched rows — kept as-is to preserve existing behavior.
  val result_diary = spark.sql(
    s"""
       |select
       |  re.stat_date as stat_date,
       |  re.device_id as device_id,
       |  re.city_id as ucity_id,
       |  re.cid as cid_id,
       |  da.service_id as diary_service_id
       |from temp_result02 re
       |left join online.ml_community_diary_updates da
       |on re.cid = da.diary_id
       |where da.partition_date='${yesterday}'
      """.stripMargin
  ).distinct()
  result_diary
}
}
......
tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
gold.jdbcuri=jdbc:mysql://rm-m5ey2s823bq0lc616.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
gaia.jdbcuri=jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
jerry.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
tispark.pd.addresses=10.66.157.22:2379
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment