Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
e4d0eb0f
Commit
e4d0eb0f
authored
Feb 22, 2019
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加esmm click,生成新的esmm训练集
parent
11212cb7
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
80 additions
and
8 deletions
+80
-8
EsmmData.scala
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
+80
-8
No files found.
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
View file @
e4d0eb0f
...
...
@@ -4,7 +4,7 @@ package com.gmei
import
java.io.Serializable
import
java.time.LocalDate
import
org.apache.spark.sql.
{
SaveMode
,
TiContext
}
import
org.apache.spark.sql.
{
SaveMode
,
SparkSession
,
TiContext
}
import
org.apache.log4j.
{
Level
,
Logger
}
import
scopt.OptionParser
import
com.gmei.lib.AbstractParams
...
...
@@ -888,14 +888,14 @@ object EsmmDataTest {
GmeiConfig
.
setup
(
param
.
env
)
val
spark_env
=
GmeiConfig
.
getSparkSession
()
val
sc
=
spark_env
.
_2
val
ti
=
new
TiContext
(
sc
)
ti
.
tidbMapTable
(
dbName
=
"eagle"
,
tableName
=
"src_mimas_prod_api_diary_tags"
)
ti
.
tidbMapTable
(
dbName
=
"eagle"
,
tableName
=
"src_zhengxing_api_tag"
)
ti
.
tidbMapTable
(
dbName
=
"jerry_test"
,
tableName
=
"
diary
_click"
)
ti
.
tidbMapTable
(
dbName
=
"jerry_test"
,
tableName
=
"
esmm
_click"
)
ti
.
tidbMapTable
(
dbName
=
"jerry_prod"
,
tableName
=
"data_feed_exposure_precise"
)
ti
.
tidbMapTable
(
dbName
=
"jerry_test"
,
tableName
=
"train_data"
)
click
(
sc
)
val
max_stat_date
=
sc
.
sql
(
s
"""
...
...
@@ -935,11 +935,9 @@ object EsmmDataTest {
val
clk_data
=
sc
.
sql
(
s
"""
|select distinct stat_date,device_id,city_id as ucity_id,
| cid_id,diary_service_id
|from diary_click
|where cid_type = 'diary'
|and stat_date ='${stat_date}'
|select distinct stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
|from esmm_click
|where stat_date ='${stat_date}'
"""
.
stripMargin
)
// clk_data.show()
...
...
@@ -1071,5 +1069,78 @@ object EsmmDataTest {
}
}
/**
  * Appends yesterday's home-feed diary-card clicks to the `esmm_click` table,
  * unless that table already holds rows whose `stat_date` is yesterday.
  *
  * Steps:
  *  1. read `max(stat_date)` from `esmm_click`;
  *  2. if it is missing (empty table) or differs from yesterday, select yesterday's
  *     `on_click_diary_card` events from `online.tl_hdfs_maidian_view`;
  *  3. drop rows from the excluded channels and from devices listed in the
  *     `ml_hospital_spam_pv_day` / `ml_hospital_spam_pv_month` tables with `pv_ratio >= 0.95`;
  *  4. attach `service_id` from `online.ml_community_diary_updates`;
  *  5. append the result to `esmm_click` through `GmeiConfig.writeToJDBCTable`.
  *
  * @param spark session used to run every query; `esmm_click` must already be
  *              resolvable as a table in it.
  */
def click(spark: SparkSession): Unit = {
  // Resolve "yesterday" exactly once so both renderings always describe the same day,
  // even when the job happens to run across midnight.
  val statYesterday = LocalDate.now().minusDays(1).toString // ISO form, yyyy-MM-dd
  val yesterday = statYesterday.replace("-", "")            // compact form, yyyyMMdd
  println(yesterday)

  // max() over an empty table yields a single row holding NULL; model that as None
  // instead of calling toString on a null reference.
  val latestStatDate: Option[String] = spark.sql(
    """
      |select max(stat_date) from esmm_click
    """.stripMargin)
    .collect()
    .headOption
    .flatMap(row => Option(row.get(0)))
    .map(_.toString)
  // Explicit tuple keeps the historical log format "(max_stat_date,<value>)".
  println(("max_stat_date", latestStatDate.orNull))

  if (!latestStatDate.contains(statYesterday)) {
    // Raw click events for yesterday's partition.
    val rawClicks = spark.sql(
      s"""
         |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
         |device["device_id"] as device_id,channel as device_type,
         |city_id,params['diary_id'] as cid
         |from online.tl_hdfs_maidian_view where partition_date = '$yesterday'
         |and action = 'on_click_diary_card' and params['tab_name'] = '精选'
         |and params['page_name'] = 'home'
       """.stripMargin
    )
    rawClicks.createOrReplaceTempView("temp_result")

    // Remove excluded channels and devices flagged by the spam tables.
    val filteredClicks = spark.sql(
      s"""
         |select * from temp_result
         |where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
         | ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
         | ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
         | ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
         | ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
         | and device_id not in
         | (SELECT cl_id
         | FROM online.ml_hospital_spam_pv_day
         | WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
         | AND pv_ratio>=0.95
         | UNION ALL
         | SELECT cl_id
         | FROM online.ml_hospital_spam_pv_month
         | WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
         | AND pv_ratio>=0.95
         | )
       """.stripMargin
    )
    filteredClicks.createOrReplaceTempView("temp_result02")

    // NOTE(review): the `where da.partition_date = ...` filter on the right-hand table
    // discards clicks that have no matching diary row, so this left join behaves like an
    // inner join. Confirm that dropping those clicks is intended.
    val clicksWithService = spark.sql(
      s"""
         |select
         | re.stat_date as stat_date,
         | re.device_id as device_id,
         | re.device_type as device_type,
         | re.cid as cid_id,
         | re.city_id as city_id,
         | da.service_id as diary_service_id
         |from temp_result02 re
         |left join online.ml_community_diary_updates da
         |on re.cid = da.diary_id
         |where da.partition_date='${yesterday}'
       """.stripMargin
    )

    // NOTE(review): database credentials are embedded in source control here. They should
    // be loaded from configuration or a secret store and the exposed password rotated.
    val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
    GmeiConfig.writeToJDBCTable(jdbcuri, clicksWithService, table = "esmm_click", SaveMode.Append)
    println("data insert")
  } else {
    println("data already exists")
  }
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment