Commit e4d0eb0f authored by 张彦钊's avatar 张彦钊

增加esmm click,生成新的esmm训练集

parent 11212cb7
@@ -4,7 +4,7 @@ package com.gmei
import java.io.Serializable
import java.time.LocalDate
import org.apache.spark.sql.{SaveMode, SparkSession, TiContext}
import org.apache.log4j.{Level, Logger}
import scopt.OptionParser
import com.gmei.lib.AbstractParams
@@ -888,14 +888,14 @@ object EsmmDataTest {
    GmeiConfig.setup(param.env)
    val spark_env = GmeiConfig.getSparkSession()
    val sc = spark_env._2
    val ti = new TiContext(sc)
    ti.tidbMapTable(dbName = "eagle",tableName = "src_mimas_prod_api_diary_tags")
    ti.tidbMapTable(dbName = "eagle",tableName = "src_zhengxing_api_tag")
    ti.tidbMapTable(dbName = "jerry_test",tableName = "esmm_click")
    ti.tidbMapTable(dbName = "jerry_prod",tableName = "data_feed_exposure_precise")
    ti.tidbMapTable(dbName = "jerry_test", tableName = "train_data")
    click(sc)
    val max_stat_date = sc.sql(
      s"""
@@ -935,11 +935,9 @@ object EsmmDataTest {
    val clk_data = sc.sql(
      s"""
        |select distinct stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
        |from esmm_click
        |where stat_date ='${stat_date}'
      """.stripMargin
    )
    // clk_data.show()
@@ -1071,5 +1069,78 @@ object EsmmDataTest {
    }
  }
def click(spark:SparkSession): Unit ={
  // Incrementally builds jerry_test.esmm_click: pulls yesterday's diary-card
  // clicks from the maidian event view, filters out promotion channels and
  // spam devices, joins the diary table for the service id, and appends the
  // result. Skips the run when yesterday's rows are already present.
  val yesterday = LocalDate.now().minusDays(1).toString.replace("-","")  // yyyyMMdd — format used by Hive partition_date
  println(yesterday)
  val stat_yesterday = LocalDate.now().minusDays(1).toString             // yyyy-MM-dd — format stored in esmm_click.stat_date

  val max_stat_date = spark.sql(
    s"""
       |select max(stat_date) from esmm_click
    """.stripMargin
  )
  // max(stat_date) always yields exactly one row, but the cell is NULL when
  // the table is empty. Wrap it in Option before calling toString: the
  // original `max == null` test was dead code, since `s(0).toString` would
  // have thrown a NullPointerException before the check could ever run.
  val max = max_stat_date.collect().headOption
    .flatMap(row => Option(row.get(0)))
    .map(_.toString)
    .orNull
  println("max_stat_date",max)

  // Null check first so the short-circuit keeps the comparison safe.
  if (max == null || max != stat_yesterday){
    // Raw click events: diary-card clicks on the home tab "精选" (featured).
    val result01 = spark.sql(
      s"""
         |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
         |device["device_id"] as device_id,channel as device_type,
         |city_id,params['diary_id'] as cid
         |from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
         |and action = 'on_click_diary_card' and params['tab_name'] = '精选'
         |and params['page_name'] = 'home'
      """.stripMargin
    )
    result01.createOrReplaceTempView("temp_result")

    // Drop promotion/ad channels and devices flagged as spam by page-view
    // ratio (daily and monthly spam tables).
    val result02 = spark.sql(
      s"""
         |select * from temp_result
         |where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
         |  ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
         |  ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
         |  ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
         |  ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
         |  and device_id not in
         |    (SELECT cl_id
         |    FROM online.ml_hospital_spam_pv_day
         |    WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
         |    AND pv_ratio>=0.95
         |    UNION ALL
         |    SELECT cl_id
         |    FROM online.ml_hospital_spam_pv_month
         |    WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
         |    AND pv_ratio>=0.95
         |    )
      """.stripMargin
    )
    result02.createOrReplaceTempView("temp_result02")

    // Attach the diary's service id.
    // NOTE(review): the WHERE on da.partition_date filters out rows where
    // the LEFT JOIN found no match (their partition_date is NULL), so this
    // behaves as an inner join — confirm that is intended.
    val result_diary = spark.sql(
      s"""
         |select
         |  re.stat_date as stat_date,
         |  re.device_id as device_id,
         |  re.device_type as device_type,
         |  re.cid as cid_id,
         |  re.city_id as city_id,
         |  da.service_id as diary_service_id
         |from temp_result02 re
         |left join online.ml_community_diary_updates da
         |on re.cid = da.diary_id
         |where da.partition_date='${yesterday}'
      """.stripMargin
    )
    // NOTE(review): credentials are hard-coded in the JDBC URI; move them to
    // configuration / a secrets store instead of source control.
    val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
    GmeiConfig.writeToJDBCTable(jdbcuri,result_diary, table="esmm_click",SaveMode.Append)
    println("data insert")
  }else{
    println("data already exists")
  }
}
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment