ML / ffm-baseline · Commits · 5fc6569c

Commit 5fc6569c authored 6 years ago by 张彦钊
parent 0f05cbd8

update test file

Showing 1 changed file with 317 additions and 0 deletions (+317 −0)

local/diary2.0.py · 0 → 100644
import pickle
import xlearn as xl
import pandas as pd
import pymysql
from datetime import datetime
# The utils package must be imported; otherwise pickle cannot find utils during the ffm transform and raises an error
import utils
import warnings
from multiprocessing import Pool
from config import *
import json
from sklearn.preprocessing import MinMaxScaler
import time
from userProfile import get_active_users
import os
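The comment above matters because pickle serializes only a reference (module path plus class name) to the transformer's class, not its code, so any process that unpickles ffm.pkl must be able to import that module. A minimal sketch of the failure mode; the class name is illustrative, not taken from this repo:

import pickle

# Suppose utils.py defines the transformer class that was pickled into ffm.pkl:
#     class FFMFormatPandas: ...
# The pickle file stores the reference "utils.FFMFormatPandas", so loading it
# in another script only works if "import utils" succeeds in that script.
with open("ffm.pkl", "rb") as f:
    transformer = pickle.load(f)  # otherwise: ModuleNotFoundError: No module named 'utils'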
def get_video_id():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='eagle')
    cursor = db.cursor()
    sql = "select diary_id from feed_diary_boost;"
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(list(result))
    video_id = df[0].values.tolist()
    print(video_id[:10])
    db.close()
    return video_id
# Attach device_id and city_id to the corresponding city's hot-diary table.
# Note: the feature order of the prediction set below must match the training set
def feature_en(x_list, device_id):
    data = pd.DataFrame(x_list)
    # The column below must be named "cid", not "diaryid", because the ffm transformer used by the prediction model was fitted on "cid"
    data = data.rename(columns={0: "cid"})
    data["device_id"] = device_id
    now = datetime.now()
    data["hour"] = now.hour
    data["minute"] = now.minute
    data.loc[data["hour"] == 0, ["hour"]] = 24
    data.loc[data["minute"] == 0, ["minute"]] = 60
    data["hour"] = data["hour"].astype("category")
    data["minute"] = data["minute"].astype("category")
    # Although y is what gets predicted, the ffm transform requires a y column; it does not affect the prediction result
    data["y"] = 0
    # print("done feature engineering")
    return data
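For a concrete sense of the midnight edge case handled above, here is the same hour transformation re-run on fixed values (a standalone illustrative snippet, not part of the file):

from datetime import datetime
import pandas as pd

now = datetime(2018, 8, 1, 0, 7)            # a run just after midnight
df = pd.DataFrame([{"cid": "diary|123", "hour": now.hour, "minute": now.minute}])
df.loc[df["hour"] == 0, ["hour"]] = 24      # hour 0 is remapped to category 24
df["hour"] = df["hour"].astype("category")  # categorical, so the ffm encoder one-hot encodes it
print(df["hour"].tolist())                  # [24]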
# Load ffm.pkl and convert the table above into ffm format
def transform_ffm_format(df, queue_name):
    with open(DIRECTORY_PATH + "ffm.pkl", "rb") as f:
        ffm_format_pandas = pickle.load(f)
    data = ffm_format_pandas.native_transform(df)
    predict_file_name = DIRECTORY_PATH + "result/{0}_{1}.csv".format(device_city[0], queue_name)
    data.to_csv(predict_file_name, index=False, header=None)
    # print("done ffm")
    return predict_file_name
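For reference, the file written here should hold libffm-format rows, which is what xLearn's FFM expects: the label first, then space-separated field:feature:value triples. A representative line with made-up indices, assuming each one-hot-encoded column from feature_en (cid, device_id, hour, minute) maps to one field:

0 0:27:1 1:4183:1 2:24:1 3:7:1

The leading 0 is the dummy y column added in feature_en.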
# Load the model and predict
def predict(queue_name, name_dict):
    data = feature_en(name_dict[queue_name][0], device_city[0])
    data_file_path = transform_ffm_format(data, queue_name)
    ffm_model = xl.create_ffm()
    ffm_model.setTest(data_file_path)
    ffm_model.setSigmoid()
    ffm_model.predict(DIRECTORY_PATH + "model.out",
                      DIRECTORY_PATH + "result/output{0}_{1}.csv".format(device_city[0], queue_name))
    return save_result(queue_name, name_dict)
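model.out is consumed here but produced by a separate training job. With xLearn's Python API the training side would look roughly like the sketch below; the training-file name and hyperparameters are placeholders, not values from this repo:

import xlearn as xl

ffm_model = xl.create_ffm()
ffm_model.setTrain(DIRECTORY_PATH + "train_ffm.csv")    # libffm-format training data (assumed name)
param = {"task": "binary", "lr": 0.2, "lambda": 0.002}  # illustrative hyperparameters
ffm_model.fit(param, DIRECTORY_PATH + "model.out")      # writes the model file predict() loads

setSigmoid() in predict() then maps the raw model outputs through a sigmoid so the written scores land in (0, 1).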
def save_result(queue_name, name_dict):
    score_df = pd.read_csv(DIRECTORY_PATH + "result/output{0}_{1}.csv".format(device_city[0], queue_name),
                           header=None)
    # print(score_df)
    mm_scaler = MinMaxScaler()
    mm_scaler.fit(score_df)
    score_df = pd.DataFrame(mm_scaler.transform(score_df))
    score_df = score_df.rename(columns={0: "score"})
    score_df["cid"] = name_dict[queue_name][0]
    # Strip the "diary|" prefix from cid
    score_df["cid"] = score_df["cid"].apply(lambda x: x[6:])
    print("score_df:")
    print(score_df.head(1))
    print(score_df.shape)
    df_temp = pd.DataFrame(name_dict[queue_name][1]).rename(columns={0: "cid"})
    df_temp["score"] = 0
    df_temp = df_temp.sort_index(axis=1, ascending=False)
    df_temp["cid"] = df_temp["cid"].apply(lambda x: x[6:])
    print("temp_df:")
    print(df_temp.head(1))
    print(df_temp.shape)
    predict_score_df = score_df.append(df_temp)
    print("score_df:")
    print(predict_score_df.head(1))
    print(predict_score_df.shape)
    return merge_score(queue_name, name_dict, predict_score_df)
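The MinMaxScaler step rescales the raw sigmoid scores column-wise to [0, 1] via x' = (x - min) / (max - min), which preserves the ranking while normalizing the scale before the database scores are added on top later. A two-line check:

from sklearn.preprocessing import MinMaxScaler
import pandas as pd

scores = pd.DataFrame([0.2, 0.5, 0.8])
print(MinMaxScaler().fit_transform(scores).ravel())  # [0.  0.5 1. ]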
def merge_score(queue_name, name_dict, predict_score_df):
    db = pymysql.connect(host='rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', port=3306, user='work',
                         passwd='workwork', db='zhengxing_test')
    cursor = db.cursor()
    # Strip the "diary|" prefix from diary_id
    diary_list = tuple(list(map(lambda x: x[6:], name_dict[queue_name][2])))
    sql = "select score,diary_id from biz_feed_diary_score where diary_id in {};".format(diary_list)
    cursor.execute(sql)
    result = cursor.fetchall()
    score_df = pd.DataFrame(list(result)).rename(columns={0: "score", 1: "cid"})
    print("diary score table")
    print(score_df.head(1))
    db.close()
    return update_dairy_queue(score_df, predict_score_df)
def update_dairy_queue(score_df, predict_score_df):
    diary_id = score_df["cid"].values.tolist()
    video_id = []
    x = 1
    while x < len(diary_id):
        video_id.append(diary_id[x])
        x += 5
    if len(video_id) > 0:
        not_video = list(set(diary_id) - set(video_id))
        # So the cids match when the scores are added, make cid the index first, then restore it to a column after the addition
        not_video_df = score_df.loc[score_df["cid"].isin(not_video)].set_index(["cid"])
        not_video_predict_df = predict_score_df.loc[predict_score_df["cid"].isin(not_video)].set_index(["cid"])
        not_video_df["score"] = not_video_df["score"] + not_video_predict_df["score"]
        not_video_df = not_video_df.reset_index().sort_values(by="score", ascending=False)
        video_df = score_df.loc[score_df["cid"].isin(video_id)].set_index(["cid"])
        video_predict_df = predict_score_df.loc[predict_score_df["cid"].isin(video_id)].set_index(["cid"])
        video_df["score"] = video_df["score"] + video_predict_df["score"]
        video_df = video_df.reset_index().sort_values(by="score", ascending=False)
        not_video_id = not_video_df["cid"].values.tolist()
        video_id = video_df["cid"].values.tolist()
        diary_id = not_video_id
        i = 1
        for j in video_id:
            diary_id.insert(i, j)
            # TODO the 3 below is for testing; change it to 5 after going live
            i += 3
        return diary_id
    # if there are no video diaries
    else:
        score_df = score_df.set_index(["cid"])
        predict_score_df = predict_score_df.set_index(["cid"])
        score_df["score"] = score_df["score"] + predict_score_df["score"]
        score_df = score_df.reset_index().sort_values(by="score", ascending=False)
        return score_df["cid"].values.tolist()
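The tail of update_dairy_queue rebuilds the queue by pulling video diaries out of every 5th slot of the old queue (indices 1, 6, 11, ...) and re-inserting them into the re-ranked non-video list at every 3rd slot while testing. A standalone sketch of that insert pattern with illustrative ids:

non_video = ["n1", "n2", "n3", "n4", "n5"]
video = ["v1", "v2"]
merged = list(non_video)
i = 1
for j in video:
    merged.insert(i, j)
    i += 3  # 5 in production, per the TODO above
print(merged)  # ['n1', 'v1', 'n2', 'n3', 'v2', 'n4', 'n5']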
def update_sql_dairy_queue(queue_name, diary_id, device_city):
    db = pymysql.connect(host='rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', port=3306, user='work',
                         passwd='workwork', db='doris_test')
    cursor = db.cursor()
    print("before writing")
    print(diary_id)
    sql = "update device_diary_queue set {}='{}' where device_id = '{}' and city_id = '{}'".format \
        (queue_name, diary_id, device_city[0], device_city[1])
    cursor.execute(sql)
    db.commit()  # pymysql does not autocommit by default; without this the UPDATE is discarded on close
    db.close()
    print("diary_id written successfully")
# Fetch the latest native_queue before updating
def get_native_queue(device_id, city_id):
    db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306, user='doris',
                         passwd='o5gbA27hXHHm', db='doris_prod')
    cursor = db.cursor()
    sql = "select native_queue from device_diary_queue where device_id = '{}' and city_id = '{}';".format(device_id, city_id)
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(list(result))
    if not df.empty:
        native_queue = df.loc[0, 0].split(",")
        native_queue = list(map(lambda x: "diary|" + str(x), native_queue))
        db.close()
        # print("native_queue fetched successfully")
        return native_queue
    else:
        db.close()
        return None
def multi_update(queue_name, name_dict, native_queue):
    if name_dict[queue_name] != []:
        diary_id = predict(queue_name, name_dict)
        if get_native_queue(device_city[0], device_city[1]) == native_queue:
            update_sql_dairy_queue(queue_name, diary_id, device_city)
            print("update finished")
        else:
            print("no need to update the diary queue")
def get_queue(device_id, city_id):
    db = pymysql.connect(host='rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', port=3306, user='work',
                         passwd='workwork', db='doris_test')
    cursor = db.cursor()
    sql = "select native_queue,nearby_queue,nation_queue,megacity_queue from device_diary_queue " \
          "where device_id = '{}' and city = '{}';".format(device_id, city_id)
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(list(result))
    if not df.empty:
        df = df.rename(columns={0: "native_queue", 1: "nearby_queue",
                                2: "nation_queue", 3: "megacity_queue"})
        native_queue = df.loc[0, "native_queue"].split(",")
        native_queue = list(map(lambda x: "diary|" + str(x), native_queue))
        nearby_queue = df.loc[0, "nearby_queue"].split(",")
        nearby_queue = list(map(lambda x: "diary|" + str(x), nearby_queue))
        nation_queue = df.loc[0, "nation_queue"].split(",")
        nation_queue = list(map(lambda x: "diary|" + str(x), nation_queue))
        megacity_queue = df.loc[0, "megacity_queue"].split(",")
        megacity_queue = list(map(lambda x: "diary|" + str(x), megacity_queue))
        db.close()
        return True, native_queue, nearby_queue, nation_queue, megacity_queue
    else:
        print("the diary queue for this user is empty")
        return False, [], [], [], []
def user_update(device_id, city_id):
    exist, native_queue, nearby_queue, nation_queue, megacity_queue = get_queue(device_id, city_id)
    if exist:
        native_queue_predict = list(set(native_queue) & set(data_set_cid))
        nearby_queue_predict = list(set(nearby_queue) & set(data_set_cid))
        nation_queue_predict = list(set(nation_queue) & set(data_set_cid))
        megacity_queue_predict = list(set(megacity_queue) & set(data_set_cid))
        native_queue_not_predict = list(set(native_queue) - set(data_set_cid))
        nearby_queue_not_predict = list(set(nearby_queue) - set(data_set_cid))
        nation_queue_not_predict = list(set(nation_queue) - set(data_set_cid))
        megacity_queue_not_predict = list(set(megacity_queue) - set(data_set_cid))
        name_dict = {"native_queue": [native_queue_predict, native_queue_not_predict, native_queue],
                     "nearby_queue": [nearby_queue_predict, nearby_queue_not_predict, nearby_queue],
                     "nation_queue": [nation_queue_predict, nation_queue_not_predict, nation_queue],
                     "megacity_queue": [megacity_queue_predict, megacity_queue_not_predict, megacity_queue]}
        # TODO after going live, change the 1 below to 4
        pool = Pool(1)
        for queue_name in name_dict.keys():
            pool.apply_async(multi_update, (queue_name, name_dict, native_queue,))
        pool.close()
        pool.join()
    else:
        print("the diary queue is empty")
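One caveat with the Pool usage above: apply_async discards worker exceptions unless the AsyncResult objects are collected, so a failing multi_update would go unnoticed. A defensive sketch of the same loop (same call shape, plus error propagation):

pool = Pool(1)
results = [pool.apply_async(multi_update, (queue_name, name_dict, native_queue,))
           for queue_name in name_dict.keys()]
pool.close()
pool.join()
for r in results:
    r.get()  # re-raises any exception raised inside the worker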
if __name__ == "__main__":
    # while True:
    # TODO before deploying to production, change get_active_users: it does not return city_id yet; make it return a df of city_id and device_id
    # empty, df = get_active_users()
    # if empty:
    #     for eachFile in os.listdir("/tmp"):
    #         if "xlearn" in eachFile:
    #             os.remove("/tmp" + "/" + eachFile)
    #     time.sleep(58)
    # else:
    #     old_device_id_list = pd.read_csv(DIRECTORY_PATH + "data_set_device_id.csv")["device_id"].values.tolist()
    #     device_id_list = df["device_id"].values.tolist()
    #     # Take the intersection of active users and old users, i.e. only predict for old users
    #     predict_list = list(set(device_id_list) & set(old_device_id_list))
    #
    #     # Only predict IDs ending in 6; this could also be filtered when fetching from the database
    #     # predict_list = list(filter(lambda x: str(x)[-1] == "6", predict_list))
    #     df = df.loc[df["device_id"].isin(predict_list)]
    #     device_list = df["device_id"].values.tolist()
    #     city_list = df["city_id"].values.tolist()
    #     device_city_list = list(zip(device_list, city_list))
    #     start = time.time()
    warnings.filterwarnings("ignore")
    data_set_cid = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
    device_city_list = [("356156075348110", "tainjin")]
    if device_city_list != []:
        for device_city in device_city_list:
            user_update(device_city[0], device_city[1])
    else:
        print("the users in this list are new; no prediction needed")
    end = time.time()
    # TODO after going live, switch user prediction to multiprocessing