Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
0a84be4b
Commit
0a84be4b
authored
Sep 10, 2019
by
张彦钊
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'zhao' into 'master'
把tidb 地址回滚 See merge request
!37
parents
e589ec80
1468fcf0
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
31 additions
and
31 deletions
+31
-31
feature_engineering.py
eda/esmm/Model_pipline/feature_engineering.py
+21
-21
rerank_esmm.py
eda/esmm/Model_pipline/rerank_esmm.py
+6
-6
to_database.py
eda/esmm/Model_pipline/to_database.py
+3
-3
train.py
eda/esmm/Model_pipline/train.py
+1
-1
No files found.
eda/esmm/Model_pipline/feature_engineering.py
View file @
0a84be4b
...
...
@@ -37,19 +37,19 @@ def get_list(db,sql,n):
def
get_map
():
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select app_list from device_app_list"
a
=
time
.
time
()
apps_number
,
app_list_map
=
get_list
(
db
,
sql
,
16
)
print
(
"applist"
)
print
((
time
.
time
()
-
a
)
/
60
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select level2_ids from diary_feat"
b
=
time
.
time
()
leve2_number
,
leve2_map
=
get_list
(
db
,
sql
,
16
+
apps_number
)
print
(
"leve2"
)
print
((
time
.
time
()
-
b
)
/
60
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select level3_ids from diary_feat"
c
=
time
.
time
()
leve3_number
,
leve3_map
=
get_list
(
db
,
sql
,
16
+
leve2_number
+
apps_number
)
...
...
@@ -77,7 +77,7 @@ def con_sql(db,sql):
def
get_pre_number
():
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select count(*) from esmm_pre_data"
cursor
=
db
.
cursor
()
cursor
.
execute
(
sql
)
...
...
@@ -103,65 +103,65 @@ def feature_engineer():
leve2_map
[
"search_tag2"
]
=
27
unique_values
=
[]
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct stat_date from esmm_train_data_dwell"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct ucity_id from esmm_train_data_dwell"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct ccity_name from esmm_train_data_dwell"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct time from cid_time_cut"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct device_type from user_feature"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct manufacturer from user_feature"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct channel from user_feature"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct top from cid_type_top"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct price_min from knowledge"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct treatment_method from knowledge"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct price_max from knowledge"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct treatment_time from knowledge"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct maintain_time from knowledge"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select distinct recover_time from knowledge"
unique_values
.
extend
(
get_unique
(
db
,
sql
))
# unique_values.append("video")
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select max(stat_date) from esmm_train_data_dwell"
validate_date
=
con_sql
(
db
,
sql
)[
0
]
.
values
.
tolist
()[
0
]
print
(
"validate_date:"
+
validate_date
)
...
...
@@ -169,7 +169,7 @@ def feature_engineer():
start
=
(
temp
-
datetime
.
timedelta
(
days
=
180
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
print
(
start
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
)
sql
=
"select distinct doctor.hospital_id from jerry_test.esmm_train_data_dwell e "
\
"left join eagle.src_zhengxing_api_service service on e.diary_service_id = service.id "
\
"left join eagle.src_zhengxing_api_doctor doctor on service.doctor_id = doctor.id "
\
...
...
@@ -374,7 +374,7 @@ if __name__ == '__main__':
.
set
(
"spark.tispark.plan.allow_index_double_read"
,
"false"
)
\
.
set
(
"spark.tispark.plan.allow_index_read"
,
"true"
)
\
.
set
(
"spark.sql.extensions"
,
"org.apache.spark.sql.TiExtensions"
)
\
.
set
(
"spark.tispark.pd.addresses"
,
"172.16.40.1
70
:2379"
)
.
set
(
"spark.io.compression.codec"
,
"lzf"
)
\
.
set
(
"spark.tispark.pd.addresses"
,
"172.16.40.1
58
:2379"
)
.
set
(
"spark.io.compression.codec"
,
"lzf"
)
\
.
set
(
"spark.driver.maxResultSize"
,
"8g"
)
.
set
(
"spark.sql.avro.compression.codec"
,
"snappy"
)
spark
=
SparkSession
.
builder
.
config
(
conf
=
sparkConf
)
.
enableHiveSupport
()
.
getOrCreate
()
...
...
eda/esmm/Model_pipline/rerank_esmm.py
View file @
0a84be4b
...
...
@@ -20,7 +20,7 @@ def get_esmm_users():
stat_date
=
(
datetime
.
date
.
today
()
-
datetime
.
timedelta
(
days
=
1
))
.
strftime
(
"
%
Y-
%
m-
%
d"
)
sql
=
"select distinct device_id,city_id from data_feed_exposure_precise "
\
"where stat_date = '{}'"
.
format
(
stat_date
)
result
=
get_mysql_data
(
'172.16.40.1
70
'
,
4000
,
'root'
,
'3SYz54LS9#^9sBvC'
,
'jerry_prod'
,
sql
)
result
=
get_mysql_data
(
'172.16.40.1
58
'
,
4000
,
'root'
,
'3SYz54LS9#^9sBvC'
,
'jerry_prod'
,
sql
)
result
=
list
(
result
)
return
result
except
:
...
...
@@ -70,7 +70,7 @@ def get_searchworlds_to_tagid():
def
get_queues
(
device_id
,
city_id
):
try
:
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
cursor
=
db
.
cursor
()
sql
=
"select native_queue, nearby_queue, nation_queue, megacity_queue from esmm_device_diary_queue "
\
...
...
@@ -95,7 +95,7 @@ def tag_boost(cid_str, tag_list):
"(select a.diary_id,b.id from src_mimas_prod_api_diary_tags a left join src_zhengxing_api_tag b "
\
"on a.tag_id = b.id where b.tag_type < '4' and a.diary_id in {}) tmp "
\
"where id in {} group by id"
.
format
(
tuple
(
cids
),
tuple
(
tag_list
))
result
=
get_mysql_data
(
'172.16.40.1
70
'
,
4000
,
'root'
,
'3SYz54LS9#^9sBvC'
,
'eagle'
,
sql
)
result
=
get_mysql_data
(
'172.16.40.1
58
'
,
4000
,
'root'
,
'3SYz54LS9#^9sBvC'
,
'eagle'
,
sql
)
if
len
(
result
)
>
0
:
tag_cids
=
{}
left_cids
=
[]
...
...
@@ -147,13 +147,13 @@ def tag_boost(cid_str, tag_list):
def
to_data_base
(
df
):
sql
=
"select distinct device_id from esmm_resort_diary_queue"
result
=
get_mysql_data
(
'172.16.40.1
70
'
,
4000
,
'root'
,
'3SYz54LS9#^9sBvC'
,
'jerry_test'
,
sql
)
result
=
get_mysql_data
(
'172.16.40.1
58
'
,
4000
,
'root'
,
'3SYz54LS9#^9sBvC'
,
'jerry_test'
,
sql
)
old_uid
=
[
i
[
0
]
for
i
in
result
]
if
len
(
old_uid
)
>
0
:
old_uid
=
set
(
df
[
"device_id"
]
.
values
)
&
set
(
old_uid
)
old_number
=
len
(
old_uid
)
if
old_number
>
0
:
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"delete from esmm_resort_diary_queue where device_id in {}"
.
format
(
tuple
(
old_uid
))
...
...
@@ -163,7 +163,7 @@ def to_data_base(df):
cursor
.
close
()
db
.
close
()
yconnect
=
create_engine
(
'mysql+pymysql://root:3SYz54LS9#^9sBvC@172.16.40.1
70
:4000/jerry_test?charset=utf8'
)
yconnect
=
create_engine
(
'mysql+pymysql://root:3SYz54LS9#^9sBvC@172.16.40.1
58
:4000/jerry_test?charset=utf8'
)
pd
.
io
.
sql
.
to_sql
(
df
,
"esmm_resort_diary_queue"
,
yconnect
,
schema
=
'jerry_test'
,
if_exists
=
'append'
,
index
=
False
,
chunksize
=
200
)
print
(
"insert done"
)
...
...
eda/esmm/Model_pipline/to_database.py
View file @
0a84be4b
...
...
@@ -11,7 +11,7 @@ def con_sql(sql):
:type sql : str
:rtype : tuple
"""
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
db
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
cursor
=
db
.
cursor
()
cursor
.
execute
(
sql
)
result
=
cursor
.
fetchall
()
...
...
@@ -58,7 +58,7 @@ def main():
df_all
[
"time"
]
=
str
(
datetime
.
datetime
.
now
()
.
strftime
(
'
%
Y
%
m
%
d
%
H
%
M'
))
print
(
"union_device_count"
,
df_all
.
shape
)
host
=
'172.16.40.1
70
'
host
=
'172.16.40.1
58
'
port
=
4000
user
=
'root'
password
=
'3SYz54LS9#^9sBvC'
...
...
@@ -78,7 +78,7 @@ def main():
try
:
for
i
in
df_merge_str
:
delete_str
=
'delete from esmm_device_diary_queue where concat(device_id,city_id) in ({0})'
.
format
(
i
)
con
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
con
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
cur
=
con
.
cursor
()
cur
.
execute
(
delete_str
)
con
.
commit
()
...
...
eda/esmm/Model_pipline/train.py
View file @
0a84be4b
...
...
@@ -396,7 +396,7 @@ def df_sort(result,queue_name):
def
update_or_insert
(
df2
,
queue_name
):
device_count
=
df2
.
shape
[
0
]
con
=
pymysql
.
connect
(
host
=
'172.16.40.1
70
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
,
charset
=
'utf8'
)
con
=
pymysql
.
connect
(
host
=
'172.16.40.1
58
'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
,
charset
=
'utf8'
)
cur
=
con
.
cursor
()
try
:
for
i
in
range
(
0
,
device_count
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment