ML / ffm-baseline / Commits

Commit a458bb53, authored Apr 19, 2019 by 王志伟

    迁移到腾讯 (Migrate to Tencent)

Parents: 25b39fd1, 6bb8533b

Showing 11 changed files with 184 additions and 225 deletions (+184 -225)
eda/esmm/Model_pipline/feature.py (+9 -14)
eda/esmm/Model_pipline/submit.sh (+2 -2)
eda/esmm/Model_pipline/to_database.py (+12 -10)
eda/esmm/Model_pipline/to_tfrecord.py (+0 -2)
eda/esmm/Model_pipline/train.py (+0 -2)
eda/feededa/src/main/resources/application.properties (+22 -6)
eda/feededa/src/main/scala/com/gmei/EsmmData.scala (+36 -77)
eda/feededa/src/main/scala/com/gmei/GmeiConfig.scala (+4 -14)
.../src/main/scala/com/gmei/Recommendation_strategy_all.scala (+5 -4)
eda/feededa/src/main/scala/com/gmei/esmm_feature.scala (+3 -12)
eda/feededa/src/main/scala/com/gmei/temp_analysis.scala (+91 -82)
eda/esmm/Model_pipline/feature.py
...
@@ -6,14 +6,9 @@ import datetime
 def con_sql(db, sql):
     cursor = db.cursor()
     try:
         cursor.execute(sql)
         result = cursor.fetchall()
         df = pd.DataFrame(list(result))
     except Exception:
         print("发生异常", Exception)  # "an exception occurred"
         df = pd.DataFrame()
     finally:
         db.close()
     return df
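For orientation, a minimal sketch of how this helper is called in the rest of the file; the connection values below are placeholders, not the project's real host or credentials:

    import pymysql

    # Placeholder connection for illustration only.
    db = pymysql.connect(host='127.0.0.1', port=4000, user='example',
                         passwd='example', db='jerry_test')
    df = con_sql(db, "select max(stat_date) from esmm_train_data")
    # con_sql closes the connection in its finally block, so a fresh connection
    # is needed for every query, which is why the functions below keep calling
    # pymysql.connect before each con_sql.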
...
@@ -32,14 +27,14 @@ def multi_hot(df,column,n):
 def get_data():
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select max(stat_date) from {}".format(train_data_set)
     validate_date = con_sql(db, sql)[0].values.tolist()[0]
     print("validate_date:" + validate_date)
     temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
-    start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
+    start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
     print(start)
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name,u.device_type,u.manufacturer," \
           "u.channel,c.top,e.device_id,cut.time,dl.app_list,e.diary_service_id,feat.level3_ids,feat.level2 " \
           "from {} e left join user_feature u on e.device_id = u.device_id " \
...
@@ -55,7 +50,7 @@ def get_data():
                             6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                             11: "time", 12: "app_list", 13: "service_id", 14: "level3_ids", 15: "level2"})
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
           "from train_Knowledge_network_data"
     knowledge = con_sql(db, sql)
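The integer keys in these rename calls come from con_sql building its frame with pd.DataFrame(list(result)), which yields positional column labels. A small self-contained sketch of that behavior, with made-up rows and column names:

    import pandas as pd

    rows = [(0, 1, "beijing"), (1, 0, "shanghai")]   # made-up fetchall() output
    df = pd.DataFrame(rows)                           # columns are 0, 1, 2
    df = df.rename(columns={0: "y", 1: "z", 2: "ucity_id"})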
...
@@ -67,7 +62,7 @@ def get_data():
     df = df.drop("level2", axis=1)
     service_id = tuple(df["service_id"].unique())
-    db = pymysql.connect(host='rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com', port=3306, user='work',
+    db = pymysql.connect(host='172.16.30.143', port=3306, user='work',
                          passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
     sql = "select s.id,d.hospital_id from api_service s left join api_doctor d on s.doctor_id = d.id " \
           "where s.id in {}".format(service_id)
...
@@ -152,7 +147,7 @@ def write_csv(df,name,n):
 def get_predict(date, value_map, app_list_map, level2_map, level3_map):
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time," \
           "dl.app_list,e.hospital_id,feat.level3_ids,feat.level2 " \
...
@@ -160,14 +155,14 @@ def get_predict(date,value_map,app_list_map,level2_map,level3_map):
           "left join cid_type_top c on e.device_id = c.device_id " \
           "left join cid_time_cut cut on e.cid_id = cut.cid " \
           "left join device_app_list dl on e.device_id = dl.device_id " \
-          "left join diary_feat feat on e.cid_id = feat.diary_id"
+          "left join diary_feat feat on e.cid_id = feat.diary_id limit 600"
     df = con_sql(db, sql)
     df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                             6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                             11: "cid_id", 12: "time", 13: "app_list", 14: "hospital_id", 15: "level3_ids",
                             16: "level2"})
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
           "from train_Knowledge_network_data"
     knowledge = con_sql(db, sql)
...
@@ -232,7 +227,7 @@ def get_predict(date,value_map,app_list_map,level2_map,level3_map):
 if __name__ == '__main__':
     train_data_set = "esmm_train_data"
-    path = "/data/esmm/"
+    path = "/home/gmuser/esmm/"
     date, value, app_list, level2, level3 = get_data()
     get_predict(date, value, app_list, level2, level3)
...

eda/esmm/Model_pipline/submit.sh

 #! /bin/bash
 git checkout master
-PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
+PYTHON_PATH=/opt/anaconda3/envs/esmm/bin/python
 MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm/Model_pipline
-DATA_PATH=/data/esmm
+DATA_PATH=/home/gmuser/esmm
 echo "rm leave tfrecord"
 rm ${DATA_PATH}/tr/*
...

eda/esmm/Model_pipline/to_database.py

...
@@ -3,14 +3,14 @@
 from sqlalchemy import create_engine
 import pandas as pd
 import pymysql
 import time
 import datetime

 def con_sql(sql):
     """
     :type sql : str
     :rtype : tuple
     """
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     cursor = db.cursor()
     cursor.execute(sql)
     result = cursor.fetchall()
...
@@ -36,10 +36,10 @@ def native_set_join(lst):
 def main():
     # native queue
-    df2 = pd.read_csv('/data/esmm/native.csv')
+    df2 = pd.read_csv(path + '/native.csv')
     df2['cid_id'] = df2['cid_id'].astype(str)
-    df1 = pd.read_csv("/data/esmm/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
+    df1 = pd.read_csv(path + "/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
     df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
     df3 = df2.groupby(by=["uid", "city"]).apply(lambda x: x.sort_values(by="ctcvr", ascending=False)).reset_index(drop=True).groupby(by=["uid", "city"]).agg({'cid_id': native_set_join}).reset_index(drop=False)
     df3.columns = ["device_id", "city_id", "native_queue"]
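The chain above sorts each (uid, city) group by predicted ctcvr and then collapses the ranked diary ids into one queue string. A self-contained sketch of the same pattern, with toy data and a plain ",".join standing in for native_set_join:

    import pandas as pd

    df = pd.DataFrame({"uid": [1, 1, 2], "city": ["bj", "bj", "sh"],
                       "cid_id": ["a", "b", "c"], "ctcvr": [0.2, 0.9, 0.5]})
    queue = (df.groupby(by=["uid", "city"])
               .apply(lambda x: x.sort_values(by="ctcvr", ascending=False))
               .reset_index(drop=True)
               .groupby(by=["uid", "city"])
               .agg({"cid_id": ",".join})      # stand-in for native_set_join
               .reset_index(drop=False))
    # queue holds one row per (uid, city): (1, bj, "b,a") and (2, sh, "c")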
...
@@ -47,10 +47,10 @@ def main():
     # nearby queue
-    df2 = pd.read_csv('/data/esmm/nearby.csv')
+    df2 = pd.read_csv(path + '/nearby.csv')
     df2['cid_id'] = df2['cid_id'].astype(str)
-    df1 = pd.read_csv("/data/esmm/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
+    df1 = pd.read_csv(path + "/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
     df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
     df4 = df2.groupby(by=["uid", "city"]).apply(lambda x: x.sort_values(by="ctcvr", ascending=False)).reset_index(drop=True).groupby(by=["uid", "city"]).agg({'cid_id': nearby_set_join}).reset_index(drop=False)
     df4.columns = ["device_id", "city_id", "nearby_queue"]
...
@@ -60,11 +60,10 @@ def main():
     df_all = pd.merge(df3, df4, on=['device_id', 'city_id'], how='outer').fillna("")
     df_all['device_id'] = df_all['device_id'].astype(str)
     df_all['city_id'] = df_all['city_id'].astype(str)
-    ctime = int(time.time())
-    df_all["time"] = ctime
+    df_all["time"] = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))
     print("union_device_count", df_all.shape)
-    host = '10.66.157.22'
+    host = '172.16.40.158'
     port = 4000
     user = 'root'
     password = '3SYz54LS9#^9sBvC'
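Note that the replaced time column changes type as well as format; a quick sketch of the two values (the str() wrapper in the diff is redundant, since strftime already returns a string):

    import time
    import datetime

    int(time.time())                                  # e.g. 1555651200, epoch seconds
    datetime.datetime.now().strftime('%Y%m%d%H%M')    # e.g. '201904190800'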
...
@@ -78,7 +77,7 @@ def main():
     # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
     delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
-    con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    con = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     cur = con.cursor()
     cur.execute(delete_str)
     con.commit()
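The delete above interpolates values straight into the SQL text. A sketch of the same statement using driver-side parameters instead, assuming df_merge is an iterable of the concatenated device_id+city_id keys that feeds df_merge_str:

    keys = list(df_merge)  # hypothetical; mirrors the df_merge_str input above
    placeholders = ",".join(["%s"] * len(keys))
    cur.execute("delete from esmm_device_diary_queue "
                "where concat(device_id,city_id) in ({0})".format(placeholders), keys)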
...
@@ -88,5 +87,7 @@ def main():
     print("done")

 if __name__ == '__main__':
+    path = "/home/gmuser/esmm"
     main()
\ No newline at end of file
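The newly assigned path works because main() reads it from module globals at call time; a minimal sketch of that pattern:

    def main():
        # `path` resolves from module globals when main() runs.
        print(path + "/native.csv")

    if __name__ == '__main__':
        path = "/home/gmuser/esmm"  # value added in this commit
        main()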
eda/esmm/Model_pipline/to_tfrecord.py

...
@@ -4,13 +4,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import pandas as pd
 import sys
 import os
 import glob
 import tensorflow as tf
 import numpy as np
 import re
 from multiprocessing import Pool as ThreadPool

 flags = tf.app.flags
...
eda/esmm/Model_pipline/train.py

...
@@ -6,12 +6,10 @@
 #import argparse
 import shutil
 #import sys
 import os
 import json
 import glob
 from datetime import date, timedelta
 from time import time
 import random
 import tensorflow as tf
...
eda/feededa/src/main/resources/application.properties

-dev.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
-dev.tispark.pd.addresses=10.66.157.22:2379
-dev.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/mimas_test?user=work&password=workwork&rewriteBatchedStatements=true
+dev.tidb.jdbcuri=jdbc:mysql://192.168.15.12:4000/eagle?user=root&password=&rewriteBatchedStatements=true
+dev.tispark.pd.addresses=192.168.15.11:2379
+dev.mimas.jdbcuri=jdbc:mysql://rm-2zenowgrn4i5p0j7txo.mysql.rds.aliyuncs.com/mimas_test?user=work&password=Gengmei1&rewriteBatchedStatements=true
 dev.gaia.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/zhengxing_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.gold.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/doris_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.redis.host=10.30.50.58
 dev.redis.port=6379
 dev.jerry.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/jerry_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.test.jdbcuri=jdbc:mysql://rm-2ze0v6uua2hl9he8edo.mysql.rds.aliyuncs.com/mimas_test?user=work&password=Gengmei1&rewriteBatchedStatements=true
 pre.tidb.jdbcuri=jdbc:mysql://192.168.16.11:4000/eagle?user=root&password=&rewriteBatchedStatements=true
 pre.tispark.pd.addresses=192.168.16.11:2379
 pre.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com:3308/mimas_prod?user=mimas&password=workwork&rewriteBatchedStatements=true
+<<<<<<< HEAD
 #prod.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
 #prod.gold.jdbcuri=jdbc:mysql://rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
 #prod.mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
...
@@ -19,6 +19,22 @@ pre.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com:3308/mimas
 #prod.redis.host=10.30.50.58
 #prod.redis.port=6379
+=======
+# 阿里云线上配置 (Alibaba Cloud production config)
+#prod.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.gold.jdbcuri=jdbc:mysql://rm-m5ey2s823bq0lc616.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
+#prod.mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
+#prod.gaia.jdbcuri=jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
+#prod.jerry.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.tispark.pd.addresses=10.66.157.22:2379
+#
+#prod.tidb.jdbcuri_new=jdbc:mysql://152.136.44.138:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.jerry.jdbcuri_new=jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+# 腾讯云线上配置 (Tencent Cloud production config)
+>>>>>>> 6bb8533b68efef7c647251ef08479560d5e1216a
 prod.gold.jdbcuri=jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
 prod.mimas.jdbcuri=jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
 prod.gaia.jdbcuri=jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
...
eda/feededa/src/main/scala/com/gmei/EsmmData.scala

...
@@ -4,7 +4,7 @@ package com.gmei
 import java.io.Serializable
 import java.time.LocalDate
-import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
 import org.apache.log4j.{Level, Logger}
 import scopt.OptionParser
 import com.gmei.lib.AbstractParams
...
@@ -51,17 +51,9 @@ object EsmmData {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "eagle", tableName = "src_mimas_prod_api_diary_tags")
-    ti.tidbMapTable(dbName = "eagle", tableName = "src_zhengxing_api_tag")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "esmm_train_data")

     val max_stat_date = sc.sql(
       s"""
-        |select max(stat_date) from esmm_train_data
+        |select max(stat_date) from jerry_test.esmm_train_data
      """.stripMargin)
     val max_stat_date_str = max_stat_date.collect().map(s => s(0).toString).head
...
@@ -74,7 +66,7 @@ object EsmmData {
 //      s"""
 //        |select distinct stat_date,device_id,city_id as ucity_id,
 //        |  cid_id,diary_service_id
-//        |from data_feed_exposure
+//        |from jerry_prod.data_feed_exposure
 //        |where cid_type = 'diary'
 //        |and stat_date ='${stat_date}'
 //      """.stripMargin
...
@@ -84,7 +76,7 @@ object EsmmData {
       s"""
         |select * from
         |(select stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
-        |from data_feed_exposure
+        |from jerry_prod.data_feed_exposure
         |where cid_type = 'diary'
         |and stat_date ='${stat_date}'
         |group by stat_date,device_id,city_id,cid_id,diary_service_id having count(*) > 1) a
...
@@ -99,7 +91,7 @@ object EsmmData {
       s"""
         |select distinct stat_date,device_id,city_id as ucity_id,
         |  cid_id,diary_service_id
-        |from data_feed_click
+        |from jerry_prod.data_feed_click
         |where cid_type = 'diary'
         |and stat_date ='${stat_date}'
      """.stripMargin
...
@@ -190,8 +182,8 @@ object EsmmData {
         |select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,
         |  c.name as ccity_name
         |from union_data_slabel a
-        |left join src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
-        |left join src_zhengxing_api_tag c on b.tag_id=c.id
+        |left join eagle.src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
+        |left join eagle.src_zhengxing_api_tag c on b.tag_id=c.id
         | where c.tag_type=4
      """.stripMargin)
...
@@ -222,12 +214,11 @@ object EsmmData {
         |group by device_id,cid_id
      """.stripMargin)

       union_data_scity_id2.persist()
-      GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", union_data_scity_id2, table = "esmm_train_data", SaveMode.Append)
+      GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", union_data_scity_id2, table = "esmm_train_data", SaveMode.Append)
       union_data_scity_id2.unpersist()
     } else {
-      println("esmm_train_data already have param.date data")
+      println("jerry_test.esmm_train_data already have param.date data")
     }
     sc.stop()
...
@@ -368,18 +359,6 @@ object EsmmPredData {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "eagle", tableName = "src_mimas_prod_api_diary_tags")
-    ti.tidbMapTable(dbName = "eagle", tableName = "src_zhengxing_api_tag")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
-    ti.tidbMapTable("jerry_prod", "nd_device_cid_similarity_matrix")
-    ti.tidbMapTable("eagle", "ffm_diary_queue")
-    ti.tidbMapTable("eagle", "search_queue")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "esmm_train_data")
-    ti.tidbMapTable("eagle", "biz_feed_diary_queue")
-    ti.tidbMapTable("jerry_prod", "data_feed_exposure_precise")

     import sc.implicits._
     val yesteday_have_seq = GmeiConfig.getMinusNDate(1)
...
@@ -388,7 +367,7 @@ object EsmmPredData {
       s"""
         |select concat(t.device_id,",",t.city_id) from
         |(select distinct device_id,city_id
-        |from data_feed_exposure where stat_date='${yesteday_have_seq}') t
+        |from jerry_prod.data_feed_exposure where stat_date='${yesteday_have_seq}') t
      """.stripMargin).collect().map(x => x(0).toString)
     println("target_user", target_user.length)
...
@@ -396,11 +375,12 @@ object EsmmPredData {
     val raw_data = sc.sql(
       s"""
         |select concat(tmp1.device_id,",",tmp1.city_id) as device_city, tmp1.merge_queue from
-        |(select device_id,if(city_id='world','worldwide',city_id) city_id,similarity_cid as merge_queue from nd_device_cid_similarity_matrix
+        |(select device_id,if(city_id='world','worldwide',city_id) city_id,similarity_cid as merge_queue
+        |from jerry_prod.nd_device_cid_similarity_matrix
         |union
-        |select device_id,if(city_id='world','worldwide',city_id) city_id,native_queue as merge_queue from ffm_diary_queue
+        |select device_id,if(city_id='world','worldwide',city_id) city_id,native_queue as merge_queue from eagle.ffm_diary_queue
         |union
-        |select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1
+        |select device_id,city_id,search_queue as merge_queue from eagle.search_queue) as tmp1
      """.stripMargin)
     // raw_data.show()
...
@@ -421,7 +401,7 @@ object EsmmPredData {
     import sc.implicits._
     val sql = s"""
-        |select distinct device_id,cid_id from data_feed_exposure_precise
+        |select distinct device_id,cid_id from jerry_prod.data_feed_exposure_precise
         |where stat_date >= "$start" and cid_type = "diary"
      """.stripMargin
     val history = sc.sql(sql).repartition(200).rdd
...
@@ -458,8 +438,8 @@ object EsmmPredData {
     // native_data
     val native_data = sc.sql(
       s"""
-        |select distinct a.device_id,a.city_id,b.native_queue from data_feed_exposure a
-        |left join (select if(city_id='world','worldwide',city_id) city_id,native_queue from biz_feed_diary_queue) b
+        |select distinct a.device_id,a.city_id,b.native_queue from jerry_prod.data_feed_exposure a
+        |left join (select if(city_id='world','worldwide',city_id) city_id,native_queue from eagle.biz_feed_diary_queue) b
         |on a.city_id = b.city_id
         |where a.stat_date='${yesteday_have_seq}' and b.native_queue != ""
      """.stripMargin
...
@@ -558,8 +538,8 @@ object EsmmPredData {
         |select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.label,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,
         |  c.name as ccity_name
         |from union_data_slabel a
-        |left join src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
-        |left join src_zhengxing_api_tag c on b.tag_id=c.id
+        |left join eagle.src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
+        |left join eagle.src_zhengxing_api_tag c on b.tag_id=c.id
         | where c.tag_type=4
      """.stripMargin)
...
@@ -614,10 +594,9 @@ object EsmmPredData {
     // union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
     // println(union_data_scity_id2.count())
     union_data_scity_id2.persist()
-    GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", union_data_scity_id2, table = "esmm_pre_data", SaveMode.Overwrite)
+    GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", union_data_scity_id2, table = "esmm_pre_data", SaveMode.Overwrite)
     union_data_scity_id2.unpersist()
...
@@ -665,9 +644,6 @@ object GetDiaryPortrait {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")

     val stat_date = param.date.replace("-", "")
     val diary_tag = sc.sql(
...
@@ -693,7 +669,7 @@ object GetDiaryPortrait {
         |select diary_id,level1_ids,level2_ids,level3_ids,split(level2_ids,",")[0] as level2 from t
      """.stripMargin)

-    val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
+    val jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
     GmeiConfig.writeToJDBCTable(jdbc, result, "diary_feat", SaveMode.Overwrite)
...
@@ -742,9 +718,6 @@ object GetDevicePortrait {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_feat")

     import sc.implicits._
     val stat_date = param.date.replace("-", "")
...
@@ -757,7 +730,7 @@ object GetDevicePortrait {
         |  COALESCE(a.params['diary_id'], a.params['business_id'], 0) as cid_id,
         |  b.level1_ids as level1_id
         |  from online.tl_hdfs_maidian_view a
-        |  left join diary_feat b
+        |  left join jerry_prod.diary_feat b
         |  on COALESCE(a.params['diary_id'], a.params['business_id'], 0) = b.diary_id
         |  where
         |    b.level1_ids is not null and
...
@@ -844,11 +817,6 @@ object GetLevelCount {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_feat")

     import sc.implicits._
     val stat_date = GmeiConfig.getMinusNDate(1).replace("-", "")
...
@@ -856,7 +824,7 @@ object GetLevelCount {
     val diary_queue = "16215222,16204965,15361235,16121397,16277565,15491159,16299587,16296887,15294642,16204934,15649199,16122580,16122580,16122580,16122580,16122580,16122580"
     val diary_level1 = sc.sql(
       s"""
-        |select diary_id,explode(split(level1_ids,';')) level1_id from diary_feat
+        |select diary_id,explode(split(level1_ids,';')) level1_id from jerry_prod.diary_feat
         |where diary_id in (${diary_queue})
      """.stripMargin)
...
@@ -924,9 +892,6 @@ object GetDeviceDuration {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_feat")

     import sc.implicits._
     val stat_date = param.date
...
@@ -935,8 +900,8 @@ object GetDeviceDuration {
       s"""
         |select a.device_id,coalesce(a.start_time,a.ndiary_in,0) in_time,coalesce(a.end_time,a.ndiary_out,0) out_time,
         |explode(split(b.level1_ids,';')) level1_id
-        |from data_feed_click a
-        |left join diary_feat b on a.cid_id = b.diary_id
+        |from jerry_prod.data_feed_click a
+        |left join jerry_prod.diary_feat b on a.cid_id = b.diary_id
         |where a.stat_date > '2018-12-12'
      """.stripMargin)
...
@@ -973,8 +938,8 @@ object GetDeviceDuration {
         |  (select a.device_id,
         |  coalesce(a.end_time,a.ndiary_out,0)-coalesce(a.start_time,a.ndiary_in,0) as duration,
         |  explode(split(b.level1_ids,';')) level1_id
-        |  from data_feed_click a
-        |  left join diary_feat b on a.cid_id = b.diary_id where a.stat_date > '2018-12-12') c
+        |  from jerry_prod.data_feed_click a
+        |  left join jerry_prod.diary_feat b on a.cid_id = b.diary_id where a.stat_date > '2018-12-12') c
         |  group by c.device_id,c.level1_id) d
         |group by d.device_id
      """.stripMargin
...
@@ -1026,18 +991,12 @@ object EsmmDataTest {
     GmeiConfig.setup(param.env)
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "eagle", tableName = "src_mimas_prod_api_diary_tags")
-    ti.tidbMapTable(dbName = "eagle", tableName = "src_zhengxing_api_tag")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "esmm_click")
-    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure_precise")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "train_data")

     click(sc)
     val max_stat_date = sc.sql(
       s"""
-        |select max(stat_date) from train_data
+        |select max(stat_date) from jerry_test.train_data
      """.stripMargin)
     val max_stat_date_str = max_stat_date.collect().map(s => s(0).toString).head
...
@@ -1050,7 +1009,7 @@ object EsmmDataTest {
 //      s"""
 //        |select distinct stat_date,device_id,city_id as ucity_id,
 //        |  cid_id,diary_service_id
-//        |from data_feed_exposure
+//        |from jerry_prod.data_feed_exposure
 //        |where cid_type = 'diary'
 //        |and stat_date ='${stat_date}'
 //      """.stripMargin
...
@@ -1060,7 +1019,7 @@ object EsmmDataTest {
       s"""
         |select * from
         |(select stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
-        |from data_feed_exposure_precise
+        |from jerry_prod.data_feed_exposure_precise
         |where cid_type = 'diary'
         |and stat_date ='${stat_date}'
         |group by stat_date,device_id,city_id,cid_id,diary_service_id) a
...
@@ -1074,7 +1033,7 @@ object EsmmDataTest {
     val clk_data = sc.sql(
       s"""
         |select distinct stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
-        |from esmm_click
+        |from jerry_test.esmm_click
         |where stat_date ='${stat_date}'
      """.stripMargin)
...
@@ -1163,8 +1122,8 @@ object EsmmDataTest {
         |select a.stat_date,a.device_id,a.ucity_id,a.cid_id,a.diary_service_id,a.y,a.z,a.clevel1_id,a.slevel1_id,
         |  c.name as ccity_name
         |from union_data_slabel a
-        |left join src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
-        |left join src_zhengxing_api_tag c on b.tag_id=c.id
+        |left join eagle.src_mimas_prod_api_diary_tags b on a.cid_id=b.diary_id
+        |left join eagle.src_zhengxing_api_tag c on b.tag_id=c.id
         | where c.tag_type=4
      """.stripMargin)
...
@@ -1213,7 +1172,7 @@ object EsmmDataTest {
     val stat_yesterday = LocalDate.now().minusDays(1).toString
     val max_stat_date = spark.sql(
       s"""
-        |select max(stat_date) from esmm_click
+        |select max(stat_date) from jerry_test.esmm_click
      """.stripMargin)
     val max = max_stat_date.collect().map(s => s(0).toString).head
...

eda/feededa/src/main/scala/com/gmei/GmeiConfig.scala

...
@@ -37,27 +37,17 @@ object GmeiConfig extends Serializable {
     sparkConf.set("spark.debug.maxToStringFields", "130")
     sparkConf.set("spark.sql.broadcastTimeout", "6000")

     if (!sparkConf.contains("""spark.master""")) {
       sparkConf.setMaster("local[3]")
     }
-    if (!sparkConf.contains("spark.tispark.pd.addresses")) {
-      sparkConf.set("spark.tispark.pd.addresses", this.config.getString("tispark.pd.addresses"))
-    }
-    println(sparkConf.get("spark.tispark.pd.addresses"))

     val spark = SparkSession.builder()
       //      .config(sparkConf)
-      .appName("feededa")
-      .enableHiveSupport()
+      .config(sparkConf)
+      .config("spark.tispark.pd.addresses", "172.16.40.158:2379")
+      .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
+      .appName("feededa")
+      .enableHiveSupport()
       .getOrCreate()

     spark.sql("SET mapreduce.job.queuename=data")
     spark.sql("SET mapred.input.dir.recursive=true")
     spark.sql("SET hive.mapred.supports.subdirectories=true")
     spark.sql("use online")
     spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
     spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
     spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
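The builder above wires TiSpark in through spark.tispark.pd.addresses and spark.sql.extensions, which is what lets the rest of this commit drop the TiContext/tidbMapTable calls and reference TiDB tables by database-qualified name (jerry_prod.data_feed_click, eagle.ffm_diary_queue, and so on). A rough PySpark sketch of the same setup, using the PD address from this diff; everything else is illustrative:

    from pyspark.sql import SparkSession

    # Sketch only: mirrors the Scala builder in GmeiConfig.getSparkSession.
    spark = (SparkSession.builder
             .appName("feededa")
             .config("spark.tispark.pd.addresses", "172.16.40.158:2379")
             .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
             .enableHiveSupport()
             .getOrCreate())

    # With the extension loaded, TiDB tables resolve by qualified name:
    df = spark.sql("select max(stat_date) from jerry_test.esmm_train_data")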
...

eda/feededa/src/main/scala/com/gmei/Recommendation_strategy_all.scala

...
@@ -52,7 +52,7 @@ object Recommendation_strategy_all {
     val stat_date = GmeiConfig.getMinusNDate(1)
     //    val stat_date = param.date
     //println(param.date)
     val partition_date = stat_date.replace("-", "")
     val decive_id_oldUser = sc.sql(
...
@@ -119,7 +119,7 @@ object Recommendation_strategy_all {
      """.stripMargin)
     //获取策略命中用户device_id (collect device_ids of users hit by the strategy)
     val device_id_cover = sc.sql(
       s"""
         |select distinct(device_id) as device_id
...
@@ -287,7 +287,7 @@ object Recommendation_strategy_all {
     GmeiConfig.writeToJDBCTable(result2, "strategy_other", SaveMode.Append)
     //统计新用户点击率 (click-through rate of new users)
     val devicee_id_newUser = sc.sql(
       s"""
         |select distinct(device_id) as device_id
...
@@ -442,7 +442,7 @@ object Gini_coefficient {
      """.stripMargin)
     agency_id.createOrReplaceTempView("agency_id")
     //统计次数 (count occurrences)
     val diary_clk_num = sc.sql(
       s"""
         |select temp1.diary_id as diary_id,count(ov.cl_id) as diary_clk_num
...
@@ -468,3 +468,4 @@
eda/feededa/src/main/scala/com/gmei/esmm_feature.scala

...
@@ -6,7 +6,7 @@ import java.time.LocalDate
 import com.gmei.lib.AbstractParams
 import org.apache.log4j.{Level, Logger}
-import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
 import scopt.OptionParser
 import scala.util.parsing.json.JSON
...
@@ -46,9 +46,6 @@ object esmm_feature {
     GmeiConfig.setup(param.env)
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "device_app_list")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "user_feature")

     user_feature(sc)
     get_applist(sc)
...
@@ -67,7 +64,7 @@ object esmm_feature {
      """.stripMargin).dropDuplicates("device_id")
     df.persist()
-    val old = spark.sql("select device_id from device_app_list").collect().map(x => x(0).toString)
+    val old = spark.sql("select device_id from jerry_test.device_app_list").collect().map(x => x(0).toString)
     import spark.implicits._
     val android = df.rdd.map(x => (x(0).toString, x(1).toString, x(2).toString))
...
@@ -81,8 +78,6 @@ object esmm_feature {
     val new_user = rdd.filter(x => old.indexOf(x._1) == -1)
       .toDF("device_id", "os", "app_list", "update_date")
     if (new_user.take(1).nonEmpty) {
-      val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
-      GmeiConfig.writeToJDBCTable(jdbc, new_user, "device_app_list", SaveMode.Append)
       val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
       GmeiConfig.writeToJDBCTable(tecent_jdbc, new_user, "device_app_list", SaveMode.Append)
...
@@ -114,7 +109,7 @@ object esmm_feature {
   def user_feature(spark: SparkSession): Unit = {
     val yesterday = LocalDate.now().minusDays(1).toString.replace("-", "")
     println(yesterday)
-    val sql_exist = "select device_id from user_feature"
+    val sql_exist = "select device_id from jerry_test.user_feature"
     val old = spark.sql(sql_exist).collect().map(x => x(0).toString)
     val sql_yesterday =
...
@@ -130,12 +125,8 @@ object esmm_feature {
     val df_new = rdd.filter(x => old.indexOf(x._1) == -1)
       .toDF("device_id", "device_type", "manufacturer", "city_id", "channel", "date")
     if (df_new.take(1).nonEmpty) {
       df_new.persist()
-      val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
-      GmeiConfig.writeToJDBCTable(jdbcuri, df_new, "user_feature", SaveMode.Append)
       val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
       GmeiConfig.writeToJDBCTable(tecent_jdbc, df_new, "user_feature", SaveMode.Append)
       df_new.unpersist()
     } else {
       println("no need to insert into user feature")
     }
...

eda/feededa/src/main/scala/com/gmei/temp_analysis.scala

...
@@ -47,14 +47,14 @@ object temp_analysis {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
     //    val ti = new TiContext(sc)
     sc.sql("use jerry_prod")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
     import sc.implicits._
...
@@ -81,23 +81,23 @@ object temp_analysis {
     agency_id.createOrReplaceTempView("agency_id")
     //    //每日新用户 (daily new users)
     //    val device_id_newUser = sc.sql(
     //      s"""
     //        |select distinct(device_id) as device_id
     //        |from online.ml_device_day_active_status
     //        |where active_type != '4'
     //        |and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
     //        |  ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
     //        |  ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
     //        |  ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
     //        |  ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
     //        |  ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
     //        |  ,'promotion_shike','promotion_julang_jl03')
     //        |and partition_date ='${partition_date}'
     //      """.stripMargin
     //    )
     //    device_id_newUser.createOrReplaceTempView("device_id_new")
     val blacklist_id = sc.sql(
       s"""
...
@@ -136,7 +136,7 @@ object temp_analysis {
     diary_clk_all.show(80)
     //日记本点击 (diary clicks)
     val referrer = List("about_me_message_list", "all_case_service_comment", "all_cases", "diary_detail", "diary_list",
       "diary_listof_related_service", "answer_detail", "community_home", "conversation_detail", "create_diary_title",
       "diary_listof_related_service", "doctor_all_cases", "hospital_all_cases", "my_favor", "my_order", "order_detail",
       "personal_store_diary_list", "received_votes",
...
@@ -212,14 +212,14 @@ object ARPU_COM {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
     //    val ti = new TiContext(sc)
     sc.sql("use jerry_prod")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
     import sc.implicits._
...
@@ -335,14 +335,14 @@ object hospital_gengmei {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
     //    val ti = new TiContext(sc)
     sc.sql("use jerry_prod")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
     import sc.implicits._
...
@@ -407,19 +407,19 @@ object meigou_xiaofei_renshu {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
     //    val ti = new TiContext(sc)
     sc.sql("use jerry_prod")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
     import sc.implicits._
     //    val stat_date = GmeiConfig.getMinusNDate(1)
     val stat_date = param.date
     //println(param.date)
     val partition_date = stat_date.replace("-", "")
...
@@ -461,21 +461,21 @@ object meigou_xiaofei_renshu {
     final_id.createOrReplaceTempView("final_id")
     //    val meigou_price = sc.sql(
     //      s"""
     //        |select md.user_id,sum(md.gengmei_price) as pay_all
     //        |from online.ml_meigou_order_detail md left join final_id
     //        |on md.device_id = final_id.device_id
     //        |where md.status= 2
     //        |and final_id.device_id is null
     //        |and md.partition_date = '20181223'
     //        |and md.pay_time is not null
     //        |and md.validate_time>'2017-01-01 00:00:00.0'
     //        |group by md.user_id
     //        |order by sum(md.gengmei_price)
     //      """.stripMargin
     //    )
     //    meigou_price.show(80)
     val meigou_price = sc.sql(
...
@@ -500,9 +500,9 @@ object meigou_xiaofei_renshu {
         |order by sum(md.gengmei_price)
      """.stripMargin)
     //    meigou_price.show(80)
     //    GmeiConfig.writeToJDBCTable(meigou_price, "meigou_price", SaveMode.Overwrite)
   }
...
@@ -549,18 +549,18 @@ object alpha_ctr {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
     //    val ti = new TiContext(sc)
     sc.sql("use jerry_prod")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "bl_device_list")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
     import sc.implicits._
     //    val stat_date = GmeiConfig.getMinusNDate(1)
     val stat_date = param.date
     //println(param.date)
     val partition_date = stat_date.replace("-", "")
...
@@ -638,12 +638,17 @@ object alpha_ctr {
<<<<<<< HEAD
     //    GmeiConfig.writeToJDBCTable(result, "alpha_ctr", SaveMode.Append)
     //    GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",result, table="alpha_ctr",SaveMode.Append)
     println("开始写入")  // "starting write"
     GmeiConfig.writeToJDBCTable("jerry.jdbcuri", result, table = "alpha_ctr", SaveMode.Append)
     println("写入完成")  // "write finished"
=======
     //    GmeiConfig.writeToJDBCTable(result, "alpha_ctr", SaveMode.Append)
     GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", result, table = "alpha_ctr", SaveMode.Append)
>>>>>>> 6bb8533b68efef7c647251ef08479560d5e1216a
...
@@ -667,12 +672,17 @@ object alpha_ctr {
     )
     val result3 = device_num_count.join(duration_device, "stat_date")
<<<<<<< HEAD
     //    GmeiConfig.writeToJDBCTable(result3, "alpha_duration", SaveMode.Append)
     //    GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",result3, table="alpha_duration",SaveMode.Append)
     println("开始写入")  // "starting write"
     GmeiConfig.writeToJDBCTable("jerry.jdbcuri", result3, table = "alpha_duration", SaveMode.Append)
     println("写入完成")  // "write finished"
=======
     //    GmeiConfig.writeToJDBCTable(result3, "alpha_duration", SaveMode.Append)
     GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", result3, table = "alpha_duration", SaveMode.Append)
>>>>>>> 6bb8533b68efef7c647251ef08479560d5e1216a
...
@@ -723,19 +733,19 @@ object copy_database {
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
     //    val ti = new TiContext(sc)
     sc.sql("use jerry_prod")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "diary_video")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
     //    ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "tl_hdfs_wiki_item_tag_view")
     //    ti.tidbMapTable(dbName = "jerry_test", tableName = "Knowledge_network")
     //    ti.tidbMapTable(dbName = "eagle", tableName = "src_mimas_prod_api_diary")
     import sc.implicits._
     val stat_date = GmeiConfig.getMinusNDate(1)
     //    val stat_date=param.date
     val partition_date = stat_date.replace("-", "")
     val new_data = sc.sql(
...
@@ -753,7 +763,7 @@ object copy_database {
      """.stripMargin)

-    GmeiConfig.writeToJDBCTable(new_data, "train_Knowledge_network_data", SaveMode.Overwrite)
+    GmeiConfig.writeToJDBCTable("jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true", new_data, "train_Knowledge_network_data", SaveMode.Overwrite)
   }
...
@@ -763,4 +773,3 @@ object copy_database {
 }