Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
bee2d7ff
Commit
bee2d7ff
authored
Mar 20, 2019
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
在esmm_train_data中增加首页精选之外的点击样本
parent
d925b533
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
101 additions
and
10 deletions
+101
-10
EsmmData.scala
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
+101
-4
application.properties
tensnsorflow/application.properties
+0
-6
No files found.
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
View file @
bee2d7ff
...
...
@@ -4,7 +4,7 @@ package com.gmei
import
java.io.Serializable
import
java.time.LocalDate
import
org.apache.spark.sql.
{
SaveMode
,
SparkSession
,
TiContext
}
import
org.apache.spark.sql.
{
DataFrame
,
SaveMode
,
SparkSession
,
TiContext
}
import
org.apache.log4j.
{
Level
,
Logger
}
import
scopt.OptionParser
import
com.gmei.lib.AbstractParams
...
...
@@ -139,8 +139,9 @@ object EsmmData {
// println(cvr_data_filter.count())
val
clk_data_filter
=
clk_data
.
except
(
cvr_data
).
withColumn
(
"y"
,
lit
(
1
)).
withColumn
(
"z"
,
lit
(
0
))
val
other_click
=
get_other_click
(
sc
,
stat_date_not
)
val
all_click
=
clk_data
.
union
(
other_click
)
val
clk_data_filter
=
all_click
.
except
(
cvr_data
).
withColumn
(
"y"
,
lit
(
1
)).
withColumn
(
"z"
,
lit
(
0
))
// clk_data_filter.createOrReplaceTempView("clk_data_filter")
// clk_data_filter.show()
// println("clk_data_filter.count()")
...
...
@@ -222,7 +223,6 @@ object EsmmData {
"""
.
stripMargin
)
GmeiConfig
.
writeToJDBCTable
(
"jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
,
union_data_scity_id2
,
table
=
"esmm_train_data"
,
SaveMode
.
Append
)
}
else
{
...
...
@@ -233,6 +233,103 @@ object EsmmData {
}
}
/** Builds the set of diary-click samples that do not come from the home page '精选' tab.
  *
  * Four click sources are read from `online.tl_hdfs_maidian_view` for the given partition,
  * unioned, filtered against a fixed list of `device_type` (channel) values and against
  * device ids found in the `online.ml_hospital_spam_pv_day` / `_month` tables with
  * `pv_ratio >= 0.95`, and finally joined with `online.ml_community_diary_updates` to
  * attach the diary's `service_id`.
  *
  * Side effect: registers (or replaces) the temp views `temp_result` and `temp_result02`
  * in the given session.
  *
  * @param spark     session used to run every query
  * @param yesterday partition date as a `yyyyMMdd` string; it is interpolated directly into
  *                  the SQL text, so it must come from a trusted source
  * @return distinct rows with columns
  *         `stat_date, device_id, ucity_id, cid_id, diary_service_id`
  */
def get_other_click(spark: SparkSession, yesterday: String): DataFrame = {

  // All four sources share the same projection and table; only the `params` key that holds
  // the diary id and the event predicate differ. Keeping the projection in one place also
  // guarantees an identical column order, which matters because `union` matches by position.
  def clickEvents(cidKey: String, condition: String): DataFrame =
    spark.sql(
      s"""
         |select from_unixtime(unix_timestamp('${yesterday}', 'yyyyMMdd'), 'yyyy-MM-dd') as stat_date,
         |device["device_id"] as device_id,channel as device_type,
         |city_id,params['${cidKey}'] as cid
         |from online.tl_hdfs_maidian_view where partition_date = '${yesterday}'
         |and ${condition}
       """.stripMargin)

  // Diary cards clicked on the home page in any tab other than '精选'.
  val homeOtherTabs = clickEvents(
    "business_id",
    "action = 'on_click_diary_card' and params['tab_name'] != '精选' and params['page_name'] = 'home'")

  // Diaries clicked from the recommend block of a diary-book detail page.
  val recommend = clickEvents(
    "business_id",
    "action = 'diarybook_detail_click_recommend_block' and params[\"business_type\"] = \"diary\"")

  // Diaries clicked from search results (the event name is spelled as logged).
  val searchZonghe = clickEvents(
    "business_id",
    "action = 'search_result_click_infomation_item' and params[\"business_type\"] = \"diary\"")

  // Diary cards clicked on any page other than home; this event carries the id as `diary_id`.
  val nonHome = clickEvents(
    "diary_id",
    "action = 'on_click_diary_card' and params['page_name'] != 'home'")

  val allClicks = homeOtherTabs.union(recommend).union(searchZonghe).union(nonHome)
  allClicks.createOrReplaceTempView("temp_result")

  // NOTE(review): `NOT IN` yields no rows at all if the sub-query returns a NULL cl_id —
  // confirm cl_id is never NULL in the spam tables.
  val filteredClicks = spark.sql(
    s"""
       |select * from temp_result
       |where device_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
       |       ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
       |       ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
       |       ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
       |       ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5')
       |       and device_id not in
       |       (SELECT cl_id
       |        FROM online.ml_hospital_spam_pv_day
       |        WHERE partition_date>='20180402' AND partition_date<'${yesterday}'
       |        AND pv_ratio>=0.95
       |        UNION ALL
       |        SELECT cl_id
       |        FROM online.ml_hospital_spam_pv_month
       |        WHERE partition_date>='20171101' AND partition_date<'${yesterday}'
       |        AND pv_ratio>=0.95
       |       )
     """.stripMargin)
  filteredClicks.createOrReplaceTempView("temp_result02")

  // NOTE(review): the WHERE clause tests a column of the right-hand table, so clicks without
  // a matching diary row are dropped and this LEFT JOIN behaves like an inner join. Left
  // unchanged to preserve the current output — confirm which behaviour is intended.
  spark.sql(
    s"""
       |select
       |  re.stat_date as stat_date,
       |  re.device_id as device_id,
       |  re.city_id as ucity_id,
       |  re.cid as cid_id,
       |  da.service_id as diary_service_id
       |from temp_result02 re
       |left join online.ml_community_diary_updates da
       |on re.cid = da.diary_id
       |where da.partition_date='${yesterday}'
     """.stripMargin).distinct()
}
}
...
...
tensnsorflow/application.properties
deleted
100644 → 0
View file @
d925b533
tidb.jdbcuri
=
jdbc:mysql://10.66.157.22:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
gold.jdbcuri
=
jdbc:mysql://rm-m5ey2s823bq0lc616.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
mimas.jdbcuri
=
jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
gaia.jdbcuri
=
jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
jerry.jdbcuri
=
jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
tispark.pd.addresses
=
10.66.157.22:2379
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment