ML / ffm-baseline / Commits / f154eb8d

Commit f154eb8d: Merge branch 'master' into gyz
Authored Sep 26, 2018 by 高雅喆
Parents: eae303de, ae152f48

Showing 10 changed files with 433 additions and 167 deletions (+433 −167)
config.py                      +29  −11
ctr.py                         +92  −0
diaryUpdateOnlineOffline.py    +28  −26
local/ctr.py                   +41  −0
local/testCases.py             +55  −12
local/utils.py                 +64  −62
prepareData.py                 +12  −8
train.py                       +1   −2
userProfile.py                 +67  −33
utils.py                       +44  −13
config.py

The stale hardcoded date constants (VALIDATION_DATE, TEST_DATE, DATA_START_DATE, DATA_END_DATE) and the note that the test date must be later than the validation date are removed; the per-environment database settings are collected into config dicts at the top of the file:

```python
# Online path
DIRECTORY_PATH = '/data/models/'
# Local path
LOCAL_DIRCTORY = "/Users/mac/utils/"

# Online active-user table DB
ACTIVE_USER_DB_ONLINE = {"host": '10.66.157.22', "port": 4000, "user": 'root',
                         "passwd": '3SYz54LS9#^9sBvC', "db": 'jerry_prod'}
# Offline active-user table DB
ACTIVE_USER_DB_LOCAL = {"host": '192.168.15.12', "port": 4000, "user": 'root',
                        "db": 'jerry_test'}
# Online diary queue DB
QUEUE_DB_ONLINE = {"host": 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', "port": 3306,
                   "user": 'doris', "passwd": 'o5gbA27hXHHm', "db": 'doris_prod'}
# Local diary queue DB
QUEUE_DB_LOCAL = {"host": 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', "port": 3306,
                  "user": 'work', "passwd": 'workwork', "db": 'doris_test'}
# Online diary score DB
SCORE_DB_ONLINE = {"host": '10.66.157.22', "port": 4000, "user": 'root',
                   "passwd": '3SYz54LS9#^9sBvC', "db": 'eagle'}
# Local diary score DB
SCORE_DB_LOCAL = {"host": 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', "port": 3306,
                  "user": 'work', "passwd": 'workwork', "db": 'zhengxing_test'}

MODEL_VERSION = ''

lr = 0.03
l2_lambda = 0.002
```

@@ -18,9 +32,6 @@ ONLINE_EAGLE_HOST = '10.66.157.22'

The local-path constant that used to sit here moves to the top of the file; the surrounding context is unchanged:

```python
# IP hosting the test diary videos
LOCAL_EAGLE_HOST = "192.168.15.12"
# Online diary queue domain
QUEUE_ONLINE_HOST = 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com'
```

@@ -28,3 +39,10 @@ QUEUE_ONLINE_HOST = 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com'

```python
LOCAL_HOST = 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com'
```
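Since pymysql.connect accepts host, port, user, passwd, and db keywords that match the keys of these dicts, a call site can unpack one directly instead of repeating every field. A minimal sketch, not part of this commit:

```python
import pymysql
from config import SCORE_DB_ONLINE

# Equivalent to spelling out host=..., port=..., user=..., passwd=..., db=...
db = pymysql.connect(**SCORE_DB_ONLINE)
```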
ctr.py (new file, mode 100644)

New script that measures click-through rate for users whose device_id ends in 8 (pulled live from TiDB) and for users whose device_id ends in 6 (read back from the day's prediction dump). Chinese log strings are translated in the listing below:

```python
import pandas as pd
import pymysql
from datetime import datetime
from datetime import timedelta


def get_tail8():
    sql = "select distinct device_id from data_feed_click \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id regexp '8$';".format(stat_date, cid_type)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    print("fetched successfully")
    result = cursor.fetchall()
    db.close()
    user = pd.DataFrame(list(result))[0].values.tolist()
    user = tuple(user)
    print("number of users with tail digit 8")
    print(len(user))
    return user


def get_ctr(user_tuple):
    sql = "select count(device_id) from data_feed_click \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id in {}".format(stat_date, cid_type, user_tuple)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    click = cursor.fetchall()[0][0]
    print(click)
    sql = "select count(device_id) from data_feed_exposure \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id in {}".format(stat_date, cid_type, user_tuple)
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    exp = cursor.fetchall()[0][0]
    db.close()
    print(exp)
    print(click / exp)


def get_tail6():
    df = pd.read_csv(path + "{}predictTail6Unique.csv".format(stat_date))
    pre_list = tuple(eval(df.loc[0, "list"]))
    print(len(pre_list))
    print(pre_list[:2])
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct device_id from data_feed_click \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id in {}".format(stat_date, cid_type, pre_list)
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    print("fetched successfully")
    result = cursor.fetchall()
    db.close()
    print(pd.DataFrame(list(result)).empty)
    user = pd.DataFrame(list(result))[0].values.tolist()
    user = tuple(user)
    print("number of users")
    print(len(user))
    return user


if __name__ == "__main__":
    path = "/data/models/"
    cid_type = "diary"
    now = datetime.now()
    year = now.year
    month = now.month
    day = now.day
    stat_date = datetime(year, month, day)
    stat_date = (stat_date - timedelta(days=1)).strftime("%Y-%m-%d")
    print(stat_date)
    tail6 = get_tail6()
    get_ctr(tail6)
    tail8 = get_tail8()
    get_ctr(tail8)
```
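get_ctr prints click / exp, i.e. CTR = clicks ÷ exposures for the selected cohort. A hypothetical refactor, not in this commit, that returns the ratio instead of printing it and avoids a ZeroDivisionError on a day with no exposures:

```python
def compute_ctr(clicks: int, exposures: int) -> float:
    """CTR = clicks / exposures; defined as 0.0 when there were no exposures."""
    return clicks / exposures if exposures else 0.0
```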
diaryUpdateOnlineOffline.py

@@ -13,7 +13,7 @@ from userProfile import get_active_users

`import socket` is replaced by helpers from utils:

```python
from sklearn.preprocessing import MinMaxScaler
import time
from config import *
from utils import judge_online, con_sql


def get_video_id(cache_video_id):
```

@@ -38,6 +38,8 @@ def get_video_id(cache_video_id):

Two debug prints are added:

```python
        return cache_video_id
    else:
        video_id = df[0].values.tolist()
        print("videoid")
        print(video_id[:2])
        return video_id
```
@@ -110,18 +112,18 @@ def save_result(queue_name,queue_arg,device_id):

The hardcoded connections (the offline branch even pointed at a misspelled 'zhengxing_tes' database) are replaced by the config dicts, and the manual cursor/fetchall/DataFrame/dropna sequence by con_sql:

```python
def get_score(queue_arg):
    if flag:
        db = pymysql.connect(host=SCORE_DB_ONLINE["host"], port=SCORE_DB_ONLINE["port"],
                             user=SCORE_DB_ONLINE["user"], passwd=SCORE_DB_ONLINE["passwd"],
                             db=SCORE_DB_ONLINE["db"])
    else:
        db = pymysql.connect(host=SCORE_DB_LOCAL["host"], port=SCORE_DB_LOCAL["port"],
                             user=SCORE_DB_LOCAL["user"], passwd=SCORE_DB_LOCAL["passwd"],
                             db=SCORE_DB_LOCAL["db"])
    # Strip the "diary|" prefix from each diary_id
    diary_list = tuple(list(map(lambda x: x[6:], queue_arg[2])))
    sql = "select score,diary_id from biz_feed_diary_score where diary_id in {};".format(diary_list)
    score_df = con_sql(db, sql)
    print("get score")
    return score_df
```
@@ -177,7 +179,6 @@ def update_sql_dairy_queue(queue_name, diary_id,device_id, city_id):

A duplicated `cursor = db.cursor()` line is dropped; the rest is unchanged context:

```python
                             db='doris_prod')
    else:
        db = pymysql.connect(host=LOCAL_HOST, port=3306, user='work',
                             passwd='workwork', db='doris_test')
    cursor = db.cursor()
    id_str = str(diary_id[0])
    for i in range(1, len(diary_id)):
```
@@ -206,27 +207,28 @@ def queue_compare(old_list, new_list):

queue_compare now reports whether anything changed instead of only printing:

```python
    for key in x_dict.keys():
        if x_dict[key] != y_dict[key]:
            i += 1
    if i > 0:
        # (the committed line had the ", 2" outside round(); fixed here so the
        # change rate is actually rounded to two decimals)
        print("diary queue before update: {} diaries in total, {} changed position, change rate {}%"
              .format(len(old_list), i, round(i / len(old_list) * 100, 2)))
        return True
    else:
        return False


def get_queue(device_id, city_id, queue_name):
    if flag:
        db = pymysql.connect(host=QUEUE_ONLINE_HOST, port=3306, user='doris',
                             passwd='o5gbA27hXHHm', db='doris_prod')
    else:
        db = pymysql.connect(host=LOCAL_HOST, port=3306, user='work',
                             passwd='workwork', db='doris_test')
    cursor = db.cursor()
    sql = "select {} from device_diary_queue " \
          "where device_id = '{}' and city_id = '{}';".format(queue_name, device_id, city_id)
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(list(result))
    if df.empty:
        print("this user's diary queue is empty")
        return False
```
@@ -260,12 +262,11 @@ def user_update(device_id, city_id, queue_name,data_set_cid,total_video_id):

Previously the queue was written unconditionally and queue_compare only ran afterwards; now the comparison gates the write:

```python
        queue_arg = [queue_predict, queue_not_predict, queue_list]
        if queue_predict != []:
            diary_queue = pipe_line(queue_name, queue_arg, device_id, total_video_id)
            if diary_queue and queue_compare(queue_list, diary_queue):
                update_sql_dairy_queue(queue_name, diary_queue, device_id, city_id)
                # print("update finished")
            else:
                print("the fetched diary list is empty or the queue order is unchanged, so the diary queue is not updated")
        else:
            print("the prediction set is empty, no prediction needed")
    else:
```
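To make queue_compare's gating concrete, a hypothetical run (ids invented for illustration):

```python
# Positions of 102 and 103 swap between the old and new queues.
old_list = [101, 102, 103]
new_list = [101, 103, 102]
# queue_compare builds id -> position dicts, counts i == 2 mismatches,
# prints a change rate of round(2 / 3 * 100, 2) == 66.67%, and returns True,
# so user_update goes ahead and writes the new queue back to MySQL.
```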
@@ -274,6 +275,7 @@ def user_update(device_id, city_id, queue_name,data_set_cid,total_video_id):

```python
def multi_proecess_update(device_id, city_id, data_set_cid, total_video_id):
    queue_name_list = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue"]
    pool = Pool(4)
    for queue_name in queue_name_list:
        pool.apply_async(user_update, (device_id, city_id, queue_name, data_set_cid, total_video_id,))
```

@@ -283,31 +285,31 @@ def multi_proecess_update(device_id, city_id, data_set_cid,total_video_id):

The inline socket check for the local machine's IP (172.30.8.160) is replaced by judge_online(), and the data_set_cid / video-id refresh moves inside the loop so it only runs when there are users to update:

```python
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    flag, path = judge_online()
    # Cache of the diary video list
    cache_video_id = []
    cache_device_city_list = []
    differ = 0
    while True:
        start = time.time()
        device_city_list = get_active_users(flag, path, differ)
        time1 = time.time()
        print("fetching the active-user table took {} seconds".format(time1 - start))
        # Filter out users already predicted within the last 5 minutes
        device_city_list = list(set(tuple(device_city_list)) - set(tuple(cache_device_city_list)))
        print("device_city_list")
        print(device_city_list)
        if datetime.now().minute % 5 == 0:
            cache_device_city_list = []
        if device_city_list != []:
            data_set_cid = pd.read_csv(path + "data_set_cid.csv")["cid"].values.tolist()
            total_video_id = get_video_id(cache_video_id)
            cache_video_id = total_video_id
            cache_device_city_list.extend(device_city_list)
            for device_city in device_city_list:
                multi_proecess_update(device_city[0], device_city[1], data_set_cid, total_video_id)
        differ = time.time() - start
        print("differ: {} seconds".format(differ))
```
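The 5-minute dedup step relies on (device_id, city_id) pairs being hashable tuples, so plain set difference works. A tiny illustration (values invented):

```python
current = [("dev1", "beijing"), ("dev2", "shanghai")]
cached = [("dev1", "beijing")]
# Set difference keeps only pairs not yet predicted in the current window
todo = list(set(tuple(current)) - set(tuple(cached)))  # [("dev2", "shanghai")]
```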
local/ctr.py (new file, mode 100644)

New one-off script that merges two prediction dumps and compares them against the exposure table:

```python
import pandas as pd
import pymysql

df = pd.read_csv(r"/data2/models/2018-09-02predictTail6Unique.csv")
a = eval(df.loc[0, "list"])
a = list(map(lambda x: x[0], a))
print(len(a))
print(a[:2])
cf = pd.read_csv(r"/data2/models/nvwa-2018-09-02predictTail6Unique.csv")
b = eval(cf.loc[0, "list"])
print(len(b))
print(b[:2])
a.extend(b)
print("count")
print(len(set(a)))
pre_list = list(set(a))
print(pre_list[:2])
stat_date = "2018-09-02"
cid_type = "diary"
db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                     passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# NOTE: both queries below read data_feed_exposure2; the first result is stored
# in `click`, so it was presumably meant to count from the click table instead.
sql = "select count(device_id) from data_feed_exposure2 \
       where stat_date='{}' \
       and cid_type='{}' \
       and device_id in {}".format(stat_date, cid_type, pre_list)
cursor = db.cursor()
print("start fetching")
cursor.execute(sql)
print("fetched successfully")
click = cursor.fetchall()[0][0]
print(click)
sql = "select count(device_id) from data_feed_exposure2 \
       where stat_date='{}' \
       and cid_type='{}' \
       and device_id in {}".format(stat_date, cid_type, pre_list)
cursor = db.cursor()
print("start fetching")
cursor.execute(sql)
exp = cursor.fetchall()[0][0]
print(exp)
print(click / exp)
```
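One caveat: formatting a Python list into `device_id in {}` produces square brackets, which MySQL rejects; the sibling scripts pass a tuple instead. A hypothetical, safer alternative (not in the commit) lets the driver quote each value:

```python
# Build one %s placeholder per id and pass the values as query parameters.
placeholders = ",".join(["%s"] * len(pre_list))
sql = ("select count(device_id) from data_feed_exposure2 "
       "where stat_date=%s and cid_type=%s and device_id in ({})".format(placeholders))
cursor.execute(sql, [stat_date, cid_type] + pre_list)
```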
local/testCases.py

@@ -39,23 +39,66 @@ def get_local_device():

The old inline __main__ (a doris_test connection, a hardcoded list of 24 diary_ids flattened into an id_str, and a raw "insert into device_diary_queue values (...,89)" write) is replaced by three helpers plus a new __main__ that runs a save / delete / insert round trip against the production queue table:

```python
    df.to_csv('/Users/mac/utils/test_device_city_id.csv', index=None)
    print(1)


LOCAL_HOST = 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com'


def save_queue():
    queue_name_list = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue"]
    for i in queue_name_list:
        sql = "select {} from device_diary_queue " \
              "where device_id = '{}' and city_id = '{}';".format(i, device_id, city_id)
        db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306,
                             user='doris', passwd='o5gbA27hXHHm', db='doris_prod')
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
        print(df.shape)
        df.to_csv("/data/models/{}.csv".format(i), index=None)
    print("end")


def delete():
    db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306,
                         user='doris', passwd='o5gbA27hXHHm', db='doris_prod')
    cursor = db.cursor()
    sql = "delete from device_diary_queue where device_id = '{}' and city_id = '{}';".format(device_id, city_id)
    cursor.execute(sql)
    # NOTE: no db.commit() before close; with pymysql's default autocommit=False
    # the delete may never be applied.
    db.close()
    print("deleted successfully")


def insert():
    queue_name_list = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue"]
    a = {}
    for i in queue_name_list:
        # NOTE: every queue reads native_queue.csv; "{}.csv".format(i) was presumably intended
        a[i] = pd.read_csv("/data/models/native_queue.csv")["0"].values.tolist()[0]
    # columns = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue","id","device_id","city_id"]
    db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306,
                         user='doris', passwd='o5gbA27hXHHm', db='doris_prod')
    sql = "INSERT INTO device_diary_queue (native_queue, nearby_queue, nation_queue, " \
          "megacity_queue,id,device_id,city_id) VALUES ('{}','{}','{}','{}','{}','{}','{}');".format \
        (a["native_queue"], a["nearby_queue"], a["nation_queue"], a["megacity_queue"], id, device_id, city_id)
    cursor = db.cursor()
    cursor.execute(sql)
    db.commit()
    db.close()
    print("end")


if __name__ == "__main__":
    # Save the current queues first (save_queue), then delete the row (delete),
    # then insert it back (insert)
    id = 334
    device_id = '00CA20EB-2719-4518-85CC-60E765AC526F'
    city_id = 'beijing'
    save_queue()
    delete()
    insert()
```
local/utils.py

@@ -165,65 +165,67 @@ class multiFFMFormatPandas:

The FFMFormatPandas class, previously present only as a fully commented-out block, is restored to live code (comments translated from Chinese):

```python
        return False


# FFM format conversion class
class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            # Maps every column except y to an index, e.g. field_index = {name: 0, age: 1}
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))
        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        # The loop below includes y among the features; it should not. This is a bug.
        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    # feature_index = {name_tom: 0, name_lily: 1, name: 2, age_18: 3, age_19: 4, age: 5}
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y is not None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

    # The method below is not part of the original class; it was added to check
    # whether a user already exists in the training data set.
    def is_feature_index_exist(self, name):
        if name in self.feature_index_:
            return True
        else:
            return False
```
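For context on what transform_row_ emits: each output line follows the libffm text format, `label field:feature:value ...`. A hypothetical row, with column names invented for illustration:

```python
# Suppose df has columns y, city, age and field_index_ == {"city": 0, "age": 1}.
# A row (y=1, city="beijing", age=18) serializes roughly as
#   "1 0:<feature_index_['city_beijing']>:1 1:<feature_index_['age']>:18"
# i.e. categorical (dtype kind 'O') columns emit value 1 at a one-hot feature id,
# while integer (kind 'i') columns reuse a per-column feature id with the raw value.
```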
prepareData.py

con_sql now takes the connection as an argument, so each query opens its own handle; the queries also move from the *2 staging tables (data_feed_click2, data_feed_exposure2) back to data_feed_click and data_feed_exposure:

```python
from utils import con_sql
import datetime
import time
import pymysql


def fetch_data(start_date, end_date):
    # Fetch device_id from the click table
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct device_id from data_feed_click"
    click_device_id = con_sql(db, sql)[0].values.tolist()
    print("fetched device_id from the click table")
    # Fetch the click table data
    sql = "select cid,device_id,time,stat_date from data_feed_click " \
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    # The db above has already been closed, so it has to be created again
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    click = con_sql(db, sql)
    click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
    print("fetched the click table data")
    # Extract hour from the time feature
```

@@ -22,10 +24,12 @@ def fetch_data(start_date, end_date):

```python
    click = click.drop("time_date", axis=1)
    # Fetch the exposure table data
    sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    start = time.time()
    # The db above has already been closed, so it has to be created again
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    exposure = con_sql(db, sql)
    end = time.time()
    print("fetching the exposure table took {} minutes".format((end - start) / 60))
    exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
```
train.py

@@ -18,8 +18,7 @@ if __name__ == "__main__":

The trailing restart_process() call is commented out:

```python
    end_train = time.time()
    print("training the model took {} minutes".format((end_train - start_train) / 60))
    move_file()
    # TODO: if the hand-written keepProcess file is used to daemonize this, delete the
    # restart line inside that function, otherwise the process may be started twice
    # restart_process()
```
userProfile.py

@@ -8,6 +8,37 @@ import pymysql

Two counting helpers are added above get_active_users:

```python
import time


# Count unique active users whose device_id ends in 6
def unique_user_count(file_path, temp_list, now):
    if os.path.exists(file_path):
        # Active users with tail digit 6
        tail_6_list = eval(pd.read_csv(file_path).loc[0, "list"])
    else:
        tail_6_list = []
    tail_6_list.extend(list(filter(lambda x: (str(x)[-1] == "6"), temp_list)))
    if tail_6_list != []:
        df_tail_6 = pd.DataFrame({"number": [len(set(tail_6_list))], "time": [str(now)[:16]],
                                  "list": [list(set(tail_6_list))]})
        df_tail_6.to_csv(file_path, index=None)
    print("unique active users with tail digit 6 so far: {}".format(len(set(tail_6_list))))


# Count unique users that have already been predicted
def predict_user_count(predict_file_path, device_list, now):
    if os.path.exists(predict_file_path):
        # Predicted users with tail digit 6
        all_predict_list = eval(pd.read_csv(predict_file_path).loc[0, "list"])
    else:
        all_predict_list = []
    all_predict_list.extend(device_list)
    if all_predict_list != []:
        df_predict = pd.DataFrame({"number": [len(set(all_predict_list))], "time": [str(now)[:16]],
                                   "list": [list(set(all_predict_list))]})
        df_predict.to_csv(predict_file_path, index=None)
    print("unique predicted users with tail digit 6 so far: {}".format(len(set(all_predict_list))))


# Get the users active within the current minute
def get_active_users(flag, path, differ):
    if differ == 0:
```

@@ -23,18 +54,18 @@ def get_active_users(flag,path,differ):

The online branch now connects through ACTIVE_USER_DB_ONLINE and the offline branch through ACTIVE_USER_DB_LOCAL (previously a hardcoded 192.168.15.12 connection with a manual cursor/fetchall/dropna sequence); both go through con_sql(db, sql):

```python
        start = end - differ
    end_datetime = str(datetime.fromtimestamp(end))
    start_datetime = str(datetime.fromtimestamp(start))
    if flag:
        sql = "select device_id,city_id from user_active_time " \
              "where active_time <= '{}' and active_time >= '{}'".format(end_datetime, start_datetime)
        db = pymysql.connect(host=ACTIVE_USER_DB_ONLINE["host"], port=ACTIVE_USER_DB_ONLINE["port"],
                             user=ACTIVE_USER_DB_ONLINE["user"], passwd=ACTIVE_USER_DB_ONLINE["passwd"],
                             db=ACTIVE_USER_DB_ONLINE["db"])
        df = con_sql(db, sql)
    else:
        db = pymysql.connect(host=ACTIVE_USER_DB_LOCAL["host"], port=ACTIVE_USER_DB_LOCAL["port"],
                             user=ACTIVE_USER_DB_LOCAL["user"], db=ACTIVE_USER_DB_LOCAL["db"])
        sql = "select device_id,city_id from user_active_time"
        df = con_sql(db, sql)
    if df.empty:
        print("no active users right now")
```

@@ -44,19 +75,20 @@ def get_active_users(flag,path,differ):

The inline tail-6 bookkeeping is replaced by a call to unique_user_count; the old block stays behind as comments in the file:

```python
    temp_list = df[0].values.tolist()
    now = datetime.now()
    tail6_file_path = path + "{}tail6Unique.csv".format(str(now)[:10])
    unique_user_count(tail6_file_path, temp_list, now)
    old_device_id_list = pd.read_csv(path + "data_set_device_id.csv")["device_id"].values.tolist()
    # Intersect active users with known users, i.e. only predict users already in the data set
    df = df.loc[df[0].isin(old_device_id_list)]
```

@@ -83,23 +115,25 @@ def get_active_users(flag,path,differ):

Likewise for the predicted-user bookkeeping, and fetch_user_profile adopts the new con_sql signature:

```python
    # Count predicted users with tail digit 6
    predict_file_path = path + "{}predictTail6Unique.csv".format(str(now)[:10])
    predict_user_count(predict_file_path, device_list, now)
    return device_city_list


def fetch_user_profile(device_id):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,city_id from data_feed_click where device_id = '{0}' limit 1".format(device_id)
    user_profile = con_sql(db, sql)
    if user_profile.empty:
        print("could not get this user's city_id")
        return None, True
```
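The tail-digit filter used by both helpers is a plain string check on the last character, so it works for numeric and string ids alike. A tiny illustration (ids invented):

```python
temp_list = [1236, 778, "ab6", 904]
tail_6 = [x for x in temp_list if str(x)[-1] == "6"]  # [1236, "ab6"]
```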
utils.py

@@ -10,6 +10,18 @@ from multiprocessing import Pool

judge_online centralizes the online/offline detection that diaryUpdateOnlineOffline.py used to do inline:

```python
import os
import signal
from config import *
import socket


def judge_online():
    # The IP below is the local development machine's IP
    if socket.gethostbyname(socket.gethostname()) == '172.30.8.160':
        flag = False
        path = LOCAL_DIRCTORY
    else:
        flag = True
        path = DIRECTORY_PATH
    return flag, path


def get_date():
```

@@ -19,11 +31,11 @@ def get_date():

The hardcoded end/validation dates ("2018-08-30" / "2018-08-29") become rolling dates computed from today:

```python
    day = now.day
    date = datetime(year, month, day)
    data_start_date = "2018-07-15"
    # data_end_date = "2018-09-02"
    # validation_date = "2018-09-01"
    # data_start_date = (date - timedelta(days=3)).strftime("%Y-%m-%d")
    data_end_date = (date - timedelta(days=1)).strftime("%Y-%m-%d")
    validation_date = (date - timedelta(days=2)).strftime("%Y-%m-%d")
    # The validation and test dates must be exactly one day apart, otherwise
    # splitting the data set raises an error
    test_date = data_end_date
    print("data_start_date,data_end_date,validation_date,test_date:")
```

@@ -40,14 +52,33 @@ def get_roc_curve(y, pred, pos_label):

con_sql no longer opens its own hardcoded TiDB connection: it takes the connection as an argument, wraps the query in try/except/finally, and drops null rows; a new sql_df variant keeps them:

```python
    print(AUC)


# Fetch data from a TiDB table into a DataFrame, dropping null rows
def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result)).dropna()
    except Exception:
        print("an exception occurred", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df


# Same as con_sql above, except null rows are kept
def sql_df(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("an exception occurred", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
```

@@ -70,7 +101,7 @@ def restart_process():

The log message in the OSError branch is reworded:

```python
    except OSError:
        print('no such process!!!')
        os.popen('python diaryUpdateOnlineOffline.py')
        print("diaryUpdateOnlineOffline.py has already been restarted successfully")
    else:
        os.popen('python diaryUpdateOnlineOffline.py')
        print("successfully restarted diaryUpdateOnlineOffline.py")
```

@@ -252,7 +283,7 @@ class multiFFMFormatPandas:

Inside the commented-out block, a comment whose `#` had been split from its text is rejoined onto one line:

```python
    # t = df.dtypes.to_dict()
    # return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
    #
    # The method below is not part of the original class; it was added to check
    # whether a user already exists in the training data set.
    # def is_feature_index_exist(self, name):
    #     if name in self.feature_index_:
    #         return True
```
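Taken together with config.py, the caller now owns the connection. A minimal usage sketch, assuming the config dicts above:

```python
import pymysql
from config import ACTIVE_USER_DB_ONLINE
from utils import con_sql

db = pymysql.connect(**ACTIVE_USER_DB_ONLINE)  # caller opens the connection
df = con_sql(db, "select device_id, city_id from user_active_time")
# con_sql closes db in its finally block, so each handle is single-use;
# this is why prepareData.py reconnects before every query.
```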