ML / ffm-baseline · Commits
Commit a458bb53
Authored Apr 19, 2019 by 王志伟
Parents: 25b39fd1, 6bb8533b

迁移到腾讯 (Migrate to Tencent Cloud)
Showing 11 changed files with 57 additions and 66 deletions (+57 −66)
eda/esmm/Model_pipline/feature.py                                      +9  −14
eda/esmm/Model_pipline/submit.sh                                       +2  −2
eda/esmm/Model_pipline/to_database.py                                  +12 −10
eda/esmm/Model_pipline/to_tfrecord.py                                  +0  −2
eda/esmm/Model_pipline/train.py                                        +0  −2
eda/feededa/src/main/resources/application.properties                  +22 −6
eda/feededa/src/main/scala/com/gmei/EsmmData.scala                     +0  −0
eda/feededa/src/main/scala/com/gmei/GmeiConfig.scala                   +4  −14
eda/feededa/src/main/scala/com/gmei/Recommendation_strategy_all.scala  +5  −4
eda/feededa/src/main/scala/com/gmei/esmm_feature.scala                 +3  −12
eda/feededa/src/main/scala/com/gmei/temp_analysis.scala                +0  −0
eda/esmm/Model_pipline/feature.py
@@ -6,14 +6,9 @@ import datetime

def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("发生异常", Exception)  # "an exception occurred"
        df = pd.DataFrame()
    finally:
        db.close()
    return df
@@ -32,14 +27,14 @@ def multi_hot(df,column,n):

 def get_data():
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select max(stat_date) from {}".format(train_data_set)
     validate_date = con_sql(db, sql)[0].values.tolist()[0]
     print("validate_date:" + validate_date)
     temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
-    start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
+    start = (temp - datetime.timedelta(days=3)).strftime("%Y-%m-%d")
     print(start)
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name,u.device_type,u.manufacturer," \
           "u.channel,c.top,e.device_id,cut.time,dl.app_list,e.diary_service_id,feat.level3_ids,feat.level2 " \
           "from {} e left join user_feature u on e.device_id = u.device_id " \
@@ -55,7 +50,7 @@ def get_data():
               6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
               11: "time", 12: "app_list", 13: "service_id", 14: "level3_ids", 15: "level2"})
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
           "from train_Knowledge_network_data"
     knowledge = con_sql(db, sql)
@@ -67,7 +62,7 @@ def get_data():
     df = df.drop("level2", axis=1)
     service_id = tuple(df["service_id"].unique())
-    db = pymysql.connect(host='rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com', port=3306, user='work',
+    db = pymysql.connect(host='172.16.30.143', port=3306, user='work', passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
     sql = "select s.id,d.hospital_id from api_service s left join api_doctor d on s.doctor_id = d.id " \
           "where s.id in {}".format(service_id)
@@ -152,7 +147,7 @@ def write_csv(df,name,n):

 def get_predict(date, value_map, app_list_map, level2_map, level3_map):
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time," \
           "dl.app_list,e.hospital_id,feat.level3_ids,feat.level2 " \
@@ -160,14 +155,14 @@ def get_predict(date,value_map,app_list_map,level2_map,level3_map):
           "left join cid_type_top c on e.device_id = c.device_id " \
           "left join cid_time_cut cut on e.cid_id = cut.cid " \
           "left join device_app_list dl on e.device_id = dl.device_id " \
-          "left join diary_feat feat on e.cid_id = feat.diary_id"
+          "left join diary_feat feat on e.cid_id = feat.diary_id limit 600"
     df = con_sql(db, sql)
     df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                             6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                             11: "cid_id", 12: "time", 13: "app_list", 14: "hospital_id", 15: "level3_ids",
                             16: "level2"})
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
           "from train_Knowledge_network_data"
     knowledge = con_sql(db, sql)
@@ -232,7 +227,7 @@ def get_predict(date,value_map,app_list_map,level2_map,level3_map):

 if __name__ == '__main__':
     train_data_set = "esmm_train_data"
-    path = "/data/esmm/"
+    path = "/home/gmuser/esmm/"
     date, value, app_list, level2, level3 = get_data()
     get_predict(date, value, app_list, level2, level3)
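Every change in feature.py is the same edit repeated: the TiDB host 10.66.157.22 swapped for 172.16.40.158 in six separate pymysql.connect calls, plus the Aliyun zhengxing RDS host swapped for 172.16.30.143. A migration like this shrinks to a one-line edit if the endpoint lives in one place; a minimal sketch using the credentials shown in the diff (the helper name connect_jerry_test is illustrative, not part of this repo):

import pymysql

# Hypothetical single source of truth for the jerry_test TiDB endpoint.
JERRY_TEST = dict(host='172.16.40.158', port=4000, user='root',
                  passwd='3SYz54LS9#^9sBvC', db='jerry_test')

def connect_jerry_test():
    # Each caller gets a fresh connection; con_sql() closes it when done.
    return pymysql.connect(**JERRY_TEST)

With such a helper, moving clouds again would touch one constant instead of six call sites.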
eda/esmm/Model_pipline/submit.sh
#! /bin/bash
git checkout master
-PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
+PYTHON_PATH=/opt/anaconda3/envs/esmm/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm/Model_pipline
-DATA_PATH=/data/esmm
+DATA_PATH=/home/gmuser/esmm
echo "rm leave tfrecord"
rm ${DATA_PATH}/tr/*
eda/esmm/Model_pipline/to_database.py
@@ -3,14 +3,14 @@

 from sqlalchemy import create_engine
 import pandas as pd
 import pymysql
 import time
 import datetime

 def con_sql(sql):
     """
     :type sql : str
     :rtype : tuple
     """
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     cursor = db.cursor()
     cursor.execute(sql)
     result = cursor.fetchall()
@@ -36,10 +36,10 @@ def native_set_join(lst):

 def main():
     # native queue
-    df2 = pd.read_csv('/data/esmm/native.csv')
+    df2 = pd.read_csv(path + '/native.csv')
     df2['cid_id'] = df2['cid_id'].astype(str)
-    df1 = pd.read_csv("/data/esmm/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
+    df1 = pd.read_csv(path + "/native/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
     df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
     df3 = df2.groupby(by=["uid", "city"]).apply(lambda x: x.sort_values(by="ctcvr", ascending=False)) \
         .reset_index(drop=True).groupby(by=["uid", "city"]).agg({'cid_id': native_set_join}).reset_index(drop=False)
     df3.columns = ["device_id", "city_id", "native_queue"]
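The queue construction above ranks each (uid, city) group by predicted CTCVR and concatenates the diary ids into one string per device. A self-contained toy run of the same pandas pattern (values are made up; a plain ",".join stands in for native_set_join):

import pandas as pd

df = pd.DataFrame({"uid": ["u1", "u1", "u1"],
                   "city": ["bj", "bj", "bj"],
                   "cid_id": ["9", "7", "8"],
                   "ctcvr": [0.1, 0.3, 0.2]})

queue = (df.groupby(by=["uid", "city"])
           .apply(lambda x: x.sort_values(by="ctcvr", ascending=False))
           .reset_index(drop=True)
           .groupby(by=["uid", "city"])
           .agg({"cid_id": lambda s: ",".join(s)})
           .reset_index(drop=False))

print(queue)  # cid_id for (u1, bj) becomes "7,8,9" — highest ctcvr first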
@@ -47,10 +47,10 @@ def main():

     # nearby queue
-    df2 = pd.read_csv('/data/esmm/nearby.csv')
+    df2 = pd.read_csv(path + '/nearby.csv')
     df2['cid_id'] = df2['cid_id'].astype(str)
-    df1 = pd.read_csv("/data/esmm/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
+    df1 = pd.read_csv(path + "/nearby/pred.txt", sep='\t', header=None, names=["ctr", "cvr", "ctcvr"])
     df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
     df4 = df2.groupby(by=["uid", "city"]).apply(lambda x: x.sort_values(by="ctcvr", ascending=False)) \
         .reset_index(drop=True).groupby(by=["uid", "city"]).agg({'cid_id': nearby_set_join}).reset_index(drop=False)
     df4.columns = ["device_id", "city_id", "nearby_queue"]
@@ -60,11 +60,10 @@ def main():

     df_all = pd.merge(df3, df4, on=['device_id', 'city_id'], how='outer').fillna("")
     df_all['device_id'] = df_all['device_id'].astype(str)
     df_all['city_id'] = df_all['city_id'].astype(str)
-    ctime = int(time.time())
-    df_all["time"] = ctime
+    df_all["time"] = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))
     print("union_device_count", df_all.shape)
-    host = '10.66.157.22'
+    host = '172.16.40.158'
     port = 4000
     user = 'root'
     password = '3SYz54LS9#^9sBvC'
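Note that the time column changes representation here: the old code stored Unix epoch seconds, the new code stores a minute-resolution string, so any downstream consumer comparing or parsing the column sees a different type. For illustration:

import time
import datetime

old_style = int(time.time())                                     # e.g. 1555670400
new_style = str(datetime.datetime.now().strftime('%Y%m%d%H%M'))  # e.g. '201904191200'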
@@ -78,7 +77,7 @@ def main():
     # df_merge = df_all[['device_id','city_id']].apply(lambda x: ''.join(x),axis=1)
     delete_str = 'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'.format(df_merge_str)
-    con = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    con = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     cur = con.cursor()
     cur.execute(delete_str)
     con.commit()
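Since df_merge_str is interpolated directly into the DELETE statement, a hedged alternative is to let the driver bind the values instead; a sketch with made-up keys, using only standard pymysql calls:

import pymysql

con = pymysql.connect(host='172.16.40.158', port=4000, user='root',
                      passwd='3SYz54LS9#^9sBvC', db='jerry_test')
try:
    keys = ['dev1beijing', 'dev2shanghai']  # concat(device_id, city_id) values, illustrative
    placeholders = ','.join(['%s'] * len(keys))
    sql = ('delete from esmm_device_diary_queue '
           'where concat(device_id,city_id) in ({})').format(placeholders)
    with con.cursor() as cur:
        cur.execute(sql, keys)  # pymysql escapes each bound value
    con.commit()
finally:
    con.close()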
@@ -88,5 +87,7 @@ def main():
     print("done")

 if __name__ == '__main__':
+    path = "/home/gmuser/esmm"
     main()
\ No newline at end of file
eda/esmm/Model_pipline/to_tfrecord.py
@@ -4,13 +4,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import pandas as pd
 import sys
 import os
 import glob
 import tensorflow as tf
 import numpy as np
 import re
 from multiprocessing import Pool as ThreadPool

 flags = tf.app.flags
eda/esmm/Model_pipline/train.py
@@ -6,12 +6,10 @@
 #import argparse
 import shutil
 #import sys
 import os
 import json
 import glob
 from datetime import date, timedelta
 from time import time
 import random
 import tensorflow as tf
eda/feededa/src/main/resources/application.properties
-dev.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
-dev.tispark.pd.addresses=10.66.157.22:2379
-dev.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/mimas_test?user=work&password=workwork&rewriteBatchedStatements=true
+dev.tidb.jdbcuri=jdbc:mysql://192.168.15.12:4000/eagle?user=root&password=&rewriteBatchedStatements=true
+dev.tispark.pd.addresses=192.168.15.11:2379
+dev.mimas.jdbcuri=jdbc:mysql://rm-2zenowgrn4i5p0j7txo.mysql.rds.aliyuncs.com/mimas_test?user=work&password=Gengmei1&rewriteBatchedStatements=true
 dev.gaia.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/zhengxing_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.gold.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/doris_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.redis.host=10.30.50.58
 dev.redis.port=6379
 dev.jerry.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com/jerry_test?user=work&password=workwork&rewriteBatchedStatements=true
 dev.test.jdbcuri=jdbc:mysql://rm-2ze0v6uua2hl9he8edo.mysql.rds.aliyuncs.com/mimas_test?user=work&password=Gengmei1&rewriteBatchedStatements=true
 pre.tidb.jdbcuri=jdbc:mysql://192.168.16.11:4000/eagle?user=root&password=&rewriteBatchedStatements=true
 pre.tispark.pd.addresses=192.168.16.11:2379
 pre.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com:3308/mimas_prod?user=mimas&password=workwork&rewriteBatchedStatements=true
+<<<<<<< HEAD
 #prod.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
 #prod.gold.jdbcuri=jdbc:mysql://rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
 #prod.mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
@@ -19,6 +19,22 @@ pre.mimas.jdbcuri=jdbc:mysql://rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com:3308/mimas
 #prod.redis.host=10.30.50.58
 #prod.redis.port=6379
+=======
+#阿里云线上配置 (Alibaba Cloud production configuration)
+#prod.tidb.jdbcuri=jdbc:mysql://10.66.157.22:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.gold.jdbcuri=jdbc:mysql://rm-m5ey2s823bq0lc616.mysql.rds.aliyuncs.com/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
+#prod.mimas.jdbcuri=jdbc:mysql://rm-m5emg41za2w7l6au3.mysql.rds.aliyuncs.com/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
+#prod.gaia.jdbcuri=jdbc:mysql://rdsfewzdmf0jfjp9un8xj.mysql.rds.aliyuncs.com/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
+#prod.jerry.jdbcuri=jdbc:mysql://10.66.157.22:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.tispark.pd.addresses=10.66.157.22:2379
+#
+#prod.tidb.jdbcuri_new=jdbc:mysql://152.136.44.138:4000/eagle?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#prod.jerry.jdbcuri_new=jdbc:mysql://152.136.44.138:4000/jerry_prod?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true
+#腾讯云线上配置 (Tencent Cloud production configuration)
+>>>>>>> 6bb8533b68efef7c647251ef08479560d5e1216a
+prod.gold.jdbcuri=jdbc:mysql://172.16.30.136/doris_prod?user=doris&password=o5gbA27hXHHm&rewriteBatchedStatements=true
+prod.mimas.jdbcuri=jdbc:mysql://172.16.30.138/mimas_prod?user=mimas&password=GJL3UJe1Ck9ggL6aKnZCq4cRvM&rewriteBatchedStatements=true
+prod.gaia.jdbcuri=jdbc:mysql://172.16.30.143/zhengxing?user=work&password=BJQaT9VzDcuPBqkd&rewriteBatchedStatements=true
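The <<<<<<< HEAD, =======, and >>>>>>> 6bb8533b… lines above are unresolved merge-conflict markers committed into application.properties; a Java properties loader will not reject them but will silently parse them as bogus keys. A small check that could gate commits like this (script name and usage are illustrative):

import sys

CONFLICT_PREFIXES = ("<<<<<<<", "=======", ">>>>>>>")

def has_conflict_markers(path):
    with open(path, encoding="utf-8") as f:
        return any(line.startswith(CONFLICT_PREFIXES) for line in f)

if __name__ == "__main__":
    # e.g. python check_conflicts.py application.properties
    sys.exit(1 if has_conflict_markers(sys.argv[1]) else 0)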
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
(diff collapsed)
eda/feededa/src/main/scala/com/gmei/GmeiConfig.scala
@@ -37,27 +37,17 @@ object GmeiConfig extends Serializable {
     sparkConf.set("spark.debug.maxToStringFields", "130")
     sparkConf.set("spark.sql.broadcastTimeout", "6000")
-    if (!sparkConf.contains("""spark.master""")) {
-      sparkConf.setMaster("local[3]")
-    }
-    if (!sparkConf.contains("spark.tispark.pd.addresses")) {
-      sparkConf.set("spark.tispark.pd.addresses", this.config.getString("tispark.pd.addresses"))
-    }
-    println(sparkConf.get("spark.tispark.pd.addresses"))
     val spark = SparkSession.builder()
-//      .config(sparkConf)
-      .appName("feededa")
-      .enableHiveSupport()
+      .config(sparkConf)
+      .config("spark.tispark.pd.addresses", "172.16.40.158:2379")
+      .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
+      .appName("feededa")
+      .enableHiveSupport()
       .getOrCreate()

     spark.sql("SET mapreduce.job.queuename=data")
     spark.sql("SET mapred.input.dir.recursive=true")
     spark.sql("SET hive.mapred.supports.subdirectories=true")
     spark.sql("use online")
     spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/brickhouse-0.7.1-SNAPSHOT.jar")
     spark.sql("ADD JAR hdfs:///user/hive/share/lib/udf/hive-udf-1.0-SNAPSHOT.jar")
     spark.sql("CREATE TEMPORARY FUNCTION json_map AS 'brickhouse.udf.json.JsonMapUDF'")
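The rewritten builder drops the conditional TiContext-era wiring and configures TiSpark directly through spark.sql.extensions plus a hard-coded PD address. For reference, a hedged PySpark sketch of an equivalently configured session (Python for consistency with the rest of this pipeline; the Scala above is the authoritative version, and TiExtensions only loads if the TiSpark jar is on the classpath):

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("feededa")
         # Mirrors the two TiSpark settings added in the Scala diff.
         .config("spark.tispark.pd.addresses", "172.16.40.158:2379")
         .config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
         .enableHiveSupport()
         .getOrCreate())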
eda/feededa/src/main/scala/com/gmei/Recommendation_strategy_all.scala
@@ -52,7 +52,7 @@ object Recommendation_strategy_all {
     val stat_date = GmeiConfig.getMinusNDate(1)
-    // val stat_date = param.date
+    // val stat_date = param.date
     //println(param.date)
     val partition_date = stat_date.replace("-", "")
     val decive_id_oldUser = sc.sql(

@@ -119,7 +119,7 @@ object Recommendation_strategy_all {
       """.stripMargin)
-    //获取策略命中用户device_id
+    //获取策略命中用户device_id  (get the device_ids of users hit by the strategy)
     val device_id_cover = sc.sql(
       s"""
          |select distinct(device_id) as device_id

@@ -287,7 +287,7 @@ object Recommendation_strategy_all {
     GmeiConfig.writeToJDBCTable(result2, "strategy_other", SaveMode.Append)
-    //统计新用户点击率
+    //统计新用户点击率  (compute the click-through rate of new users)
     val devicee_id_newUser = sc.sql(
       s"""
          |select distinct(device_id) as device_id

@@ -442,7 +442,7 @@ object Gini_coefficient {
       """.stripMargin)
     agency_id.createOrReplaceTempView("agency_id")
-    //统计次数
+    //统计次数  (count occurrences)
     val diary_clk_num = sc.sql(
       s"""
          |select temp1.diary_id as diary_id,count(ov.cl_id) as diary_clk_num

@@ -468,3 +468,4 @@
eda/feededa/src/main/scala/com/gmei/esmm_feature.scala
@@ -6,7 +6,7 @@ import java.time.LocalDate

 import com.gmei.lib.AbstractParams
 import org.apache.log4j.{Level, Logger}
-import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, TiContext}
+import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
 import scopt.OptionParser
 import scala.util.parsing.json.JSON

@@ -46,9 +46,6 @@ object esmm_feature {
     GmeiConfig.setup(param.env)
     val spark_env = GmeiConfig.getSparkSession()
     val sc = spark_env._2
-    val ti = new TiContext(sc)
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "device_app_list")
-    ti.tidbMapTable(dbName = "jerry_test", tableName = "user_feature")
     user_feature(sc)
     get_applist(sc)

@@ -67,7 +64,7 @@ object esmm_feature {
       """.stripMargin).dropDuplicates("device_id")
     df.persist()
-    val old = spark.sql("select device_id from device_app_list").collect().map(x => x(0).toString)
+    val old = spark.sql("select device_id from jerry_test.device_app_list").collect().map(x => x(0).toString)
     import spark.implicits._
     val android = df.rdd.map(x => (x(0).toString, x(1).toString, x(2).toString))

@@ -81,8 +78,6 @@ object esmm_feature {
     val new_user = rdd.filter(x => old.indexOf(x._1) == -1)
       .toDF("device_id", "os", "app_list", "update_date")
     if (new_user.take(1).nonEmpty) {
-      val jdbc = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
-      GmeiConfig.writeToJDBCTable(jdbc, new_user, "device_app_list", SaveMode.Append)
       val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
       GmeiConfig.writeToJDBCTable(tecent_jdbc, new_user, "device_app_list", SaveMode.Append)

@@ -114,7 +109,7 @@ object esmm_feature {
   def user_feature(spark: SparkSession): Unit = {
     val yesterday = LocalDate.now().minusDays(1).toString.replace("-", "")
     println(yesterday)
-    val sql_exist = "select device_id from user_feature"
+    val sql_exist = "select device_id from jerry_test.user_feature"
     val old = spark.sql(sql_exist).collect().map(x => x(0).toString)
     val sql_yesterday =

@@ -130,12 +125,8 @@ object esmm_feature {
     val df_new = rdd.filter(x => old.indexOf(x._1) == -1)
       .toDF("device_id", "device_type", "manufacturer", "city_id", "channel", "date")
     if (df_new.take(1).nonEmpty) {
       df_new.persist()
-      val jdbcuri = "jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
-      GmeiConfig.writeToJDBCTable(jdbcuri, df_new, "user_feature", SaveMode.Append)
       val tecent_jdbc = "jdbc:mysql://152.136.44.138:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true"
       GmeiConfig.writeToJDBCTable(tecent_jdbc, df_new, "user_feature", SaveMode.Append)
       df_new.unpersist()
     } else {
       println("no need to insert into user feature")
     }
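With TiExtensions registered in GmeiConfig, the explicit TiContext/tidbMapTable mapping becomes unnecessary: TiDB tables resolve through database-qualified names, which is why the queries above gain the jerry_test. prefix. A hedged PySpark illustration of the same access pattern (assumes a TiSpark-enabled session configured as in GmeiConfig.scala):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumes TiSpark is already configured
old_ids = [row[0] for row in
           spark.sql("select device_id from jerry_test.device_app_list").collect()]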
eda/feededa/src/main/scala/com/gmei/temp_analysis.scala
(diff collapsed)