Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
a892d9e7
Commit
a892d9e7
authored
Oct 15, 2019
by
高雅喆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
增加分数指数衰减的函数
parent
04f0591c
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
81 additions
and
119 deletions
+81
-119
evaluation_metrics.py
eda/smart_rank/evaluation_metrics.py
+39
-104
tool.py
eda/smart_rank/tool.py
+42
-15
No files found.
eda/smart_rank/evaluation_metrics.py
View file @
a892d9e7
...
...
@@ -31,14 +31,18 @@ def setup_logger(logger_name, log_file, level=logging.INFO):
my_log
.
addHandler
(
stream_handler
)
def
get_user_service_portrait_not_alipay
(
cl_id
,
all_word_tags
,
all_tag_tag_type
,
pay_time
,
all_3tag_2tag
,
size
=
10
):
def
get_user_service_portrait_not_alipay
(
cl_id
,
all_word_tags
,
all_tag_tag_type
,
pay_time
,
all_3tag_2tag
,
version
=
1
,
exponential
=
exponential
,
normalization_size
=
normalization_size
,
decay_days
=
decay_days
,
size
=
10
):
"""
:param cl_id:
:param all_word_tags:
:param all_tag_tag_type:
:param pay_time 用户下订单的timestamp
:param all_3tag_2tag:
:param version: 0:翔宇版; 1:英赫版
:param size:
:return:
英赫版
画像(去掉支付行为)
:return: 画像(去掉支付行为)
"""
try
:
db_jerry_test
=
pymysql
.
connect
(
host
=
'172.16.40.158'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
...
...
@@ -89,110 +93,43 @@ def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type,
if
x
.
tag_type
==
'3'
else
x
.
tag_id
,
axis
=
1
)
user_df_service
[
"tag2_type"
]
=
user_df_service
.
apply
(
lambda
x
:
all_tag_tag_type
.
get
(
x
[
"tag2"
]),
axis
=
1
)
# 算分及比例
if
version
==
1
:
user_df_service
[
"tag_score"
]
=
user_df_service
.
apply
(
lambda
x
:
compute_henqiang
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"henqiang"
else
(
compute_jiaoqiang
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"jiaoqiang"
else
(
compute_ai_scan
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"ai_scan"
else
(
compute_ruoyixiang
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"ruoyixiang"
else
compute_validate
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)))),
axis
=
1
)
tag_score_sum
=
user_df_service
.
groupby
(
by
=
[
"tag2"
,
"tag2_type"
])
.
agg
(
lambda
x
:
compute_henqiang
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"henqiang"
else
(
compute_jiaoqiang
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"jiaoqiang"
else
(
compute_ai_scan
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"ai_scan"
else
(
compute_ruoyixiang
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"ruoyixiang"
else
compute_validate
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)))),
axis
=
1
)
finally_score
=
user_df_service
.
groupby
(
by
=
[
"tag2"
,
"tag2_type"
])
.
agg
(
{
'tag_score'
:
'sum'
,
'cl_id'
:
'first'
,
'action'
:
get_count
})
.
reset_index
()
.
sort_values
(
by
=
[
"tag_score"
],
ascending
=
False
)
tag_score_sum
[
'weight'
]
=
100
*
tag_score_sum
[
'tag_score'
]
/
tag_score_sum
[
'tag_score'
]
.
sum
()
tag_score_sum
[
"pay_type"
]
=
tag_score_sum
.
apply
(
finally_score
[
'weight'
]
=
100
*
finally_score
[
'tag_score'
]
/
finally_score
[
'tag_score'
]
.
sum
()
finally_score
[
"pay_type"
]
=
finally_score
.
apply
(
lambda
x
:
3
if
x
.
action
==
"api/order/validate"
else
(
2
if
x
.
action
==
"api/settlement/alipay_callback"
else
1
),
axis
=
1
)
gmkv_tag_score_sum_list
=
tag_score_sum
[
"tag2"
]
.
to_list
()[:
size
]
gmkv_tag_score_sum_list
=
finally_score
[
"tag2"
]
.
to_list
()[:
size
]
# 获取tag的得分来源(action信息)
debug_tag_score_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
,
"action"
]][:
size
]
.
to_dict
(
'record'
)
debug_tag_score_sum
=
finally_score
[[
"tag2"
,
"tag_score"
,
"action"
]][:
size
]
.
to_dict
(
'record'
)
debug_tag_score_sum_dict
=
{
info
[
"tag2"
]:
info
for
info
in
debug_tag_score_sum
}
# 没有用户的画像
else
:
gmkv_tag_score_sum_list
=
list
()
debug_tag_score_sum_dict
=
dict
()
return
gmkv_tag_score_sum_list
,
debug_tag_score_sum_dict
except
Exception
as
e
:
print
(
e
)
return
list
(),
dict
()
def
get_user_service_portrait_not_alipay2
(
cl_id
,
all_word_tags
,
all_tag_tag_type
,
pay_time
,
all_3tag_2tag
,
size
=
10
):
"""
:param cl_id:
:param all_word_tags:
:param all_tag_tag_type:
:param all_3tag_2tag:
:param size:
:return: 翔宇版画像(去掉支付行为)
"""
try
:
db_jerry_test
=
pymysql
.
connect
(
host
=
'172.16.40.158'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
,
charset
=
'utf8'
)
cur_jerry_test
=
db_jerry_test
.
cursor
()
# 用户的非搜索、支付的行为
user_df_service_sql
=
"select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log "
\
"where cl_id ='{cl_id}' and time < {pay_time} and action not in "
\
"('api/settlement/alipay_callback','do_search')"
.
format
(
cl_id
=
cl_id
,
pay_time
=
pay_time
)
cur_jerry_test
.
execute
(
user_df_service_sql
)
data
=
list
(
cur_jerry_test
.
fetchall
())
if
data
:
user_df_service
=
pd
.
DataFrame
(
data
)
user_df_service
.
columns
=
[
"time"
,
"cl_id"
,
"score_type"
,
"tag_id"
,
"tag_referrer"
,
"action"
]
else
:
user_df_service
=
pd
.
DataFrame
(
columns
=
[
"time"
,
"cl_id"
,
"score_type"
,
"tag_id"
,
"tag_referrer"
,
"action"
])
# 用户的搜索行为
user_df_search_sql
=
"select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log "
\
"where cl_id ='{cl_id}' and time < {pay_time} and "
\
"action = 'do_search'"
.
format
(
cl_id
=
cl_id
,
pay_time
=
pay_time
)
cur_jerry_test
.
execute
(
user_df_search_sql
)
data_search
=
list
(
cur_jerry_test
.
fetchall
())
db_jerry_test
.
close
()
if
data_search
:
user_df_search
=
pd
.
DataFrame
(
data_search
)
user_df_search
.
columns
=
[
"time"
,
"cl_id"
,
"score_type"
,
"tag_id"
,
"tag_referrer"
,
"action"
]
else
:
user_df_search
=
pd
.
DataFrame
(
columns
=
[
"time"
,
"cl_id"
,
"score_type"
,
"tag_id"
,
"tag_referrer"
,
"action"
])
# 搜索词转成tag
# user_df_search_2_tag = pd.DataFrame(columns=list(user_df_service.columns))
for
index
,
row
in
user_df_search
.
iterrows
():
if
row
[
'tag_referrer'
]
in
all_word_tags
:
for
search_tag
in
all_word_tags
[
row
[
'tag_referrer'
]]:
row
[
'tag_id'
]
=
int
(
search_tag
)
user_df_service
=
user_df_service
.
append
(
row
,
ignore_index
=
True
)
break
# 增加df字段(days_diff_now, tag_type, tag2)
if
not
user_df_service
.
empty
:
user_df_service
[
"days_diff_now"
]
=
round
((
int
(
time
.
time
())
-
user_df_service
[
"time"
]
.
astype
(
float
))
/
(
24
*
60
*
60
))
user_df_service
[
"tag_type"
]
=
user_df_service
.
apply
(
lambda
x
:
all_tag_tag_type
.
get
(
x
[
"tag_id"
]),
axis
=
1
)
user_df_service
=
user_df_service
[
user_df_service
[
'tag_type'
]
.
isin
([
'2'
,
'3'
])]
user_log_df_tag2_list
=
user_df_service
[
user_df_service
[
'tag_type'
]
==
'2'
][
'tag_id'
]
.
unique
()
.
tolist
()
user_df_service
[
"tag2"
]
=
user_df_service
.
apply
(
lambda
x
:
get_tag2_from_tag3
(
x
.
tag_id
,
all_3tag_2tag
,
user_log_df_tag2_list
)
if
x
.
tag_type
==
'3'
else
x
.
tag_id
,
axis
=
1
)
user_df_service
[
"tag2_type"
]
=
user_df_service
.
apply
(
lambda
x
:
all_tag_tag_type
.
get
(
x
[
"tag2"
]),
axis
=
1
)
# 算分及比例
elif
version
==
0
:
user_df_service
[
"tag_score"
]
=
user_df_service
.
apply
(
lambda
x
:
compute_henqiang
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"henqiang"
else
(
compute_jiaoqiang
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"jiaoqiang"
else
(
compute_ai_scan
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"ai_scan"
else
(
compute_ruoyixiang
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)
if
x
.
score_type
==
"ruoyixiang"
else
compute_validate
(
x
.
days_diff_now
)
/
get_action_tag_count
(
user_df_service
,
x
.
time
)))),
axis
=
1
)
lambda
x
:
compute_henqiang
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
if
x
.
score_type
==
"henqiang"
else
(
compute_jiaoqiang
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
if
x
.
score_type
==
"jiaoqiang"
else
(
compute_ai_scan
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
if
x
.
score_type
==
"ai_scan"
else
(
compute_ruoyixiang
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)
if
x
.
score_type
==
"ruoyixiang"
else
compute_validate
(
x
.
days_diff_now
,
decay_days
,
normalization_size
,
exponential
)))),
axis
=
1
)
finally_score
=
user_df_service
.
sort_values
(
by
=
[
"tag_score"
,
"time"
],
ascending
=
False
)
finally_score
.
drop_duplicates
(
subset
=
"tag2"
,
inplace
=
True
,
keep
=
"first"
)
finally_score
[
"weight"
]
=
100
*
finally_score
[
'tag_score'
]
/
finally_score
[
'tag_score'
]
.
sum
()
gmkv_tag_score_sum_list
=
finally_score
[
"tag2"
]
.
to_list
()[:
size
]
# 获取tag的得分来源(action信息)
debug_tag_score_sum
=
finally_score
[[
"tag2"
,
"tag_score"
,
"action"
,
"time"
]][:
size
]
.
to_dict
(
'record'
)
debug_tag_score_sum_dict
=
{
info
[
"tag2"
]:
str
(
datetime
.
datetime
.
fromtimestamp
(
int
(
info
[
"time"
])))
for
info
in
debug_tag_score_sum
}
debug_tag_score_sum_dict
=
{
info
[
"tag2"
]:
str
(
datetime
.
datetime
.
fromtimestamp
(
int
(
info
[
"time"
])))
for
info
in
debug_tag_score_sum
}
# 没有用户的画像
else
:
gmkv_tag_score_sum_list
=
list
()
...
...
@@ -254,12 +191,19 @@ if __name__ == '__main__':
parser
.
add_argument
(
"-t"
,
"--top"
,
type
=
int
,
dest
=
"portrait_top_n"
,
default
=
3
,
help
=
"选取画像的前n个tag去统计匹配度"
)
parser
.
add_argument
(
"-c"
,
"--coincide"
,
type
=
int
,
dest
=
"coincide_n"
,
default
=
1
,
help
=
"选取n个tag重合个数作为判断是否匹配的阈值"
)
parser
.
add_argument
(
"-v"
,
"--version"
,
type
=
int
,
dest
=
"version"
,
default
=
1
,
help
=
"选取翔宇(0),英赫(1)版本进行统计"
)
parser
.
add_argument
(
"-e"
,
"--exponential"
,
type
=
int
,
dest
=
"exponential"
,
default
=
0
,
help
=
"是否采用指数衰减"
)
parser
.
add_argument
(
"-n"
,
"--normalization_size"
,
type
=
int
,
dest
=
"normalization_size"
,
default
=
7
,
help
=
"天数差归一化的区间"
)
parser
.
add_argument
(
"-d"
,
"--decay_days"
,
type
=
int
,
dest
=
"decay_days"
,
default
=
180
,
help
=
"分数衰减的天数"
)
args
=
parser
.
parse_args
()
portrait_stat_log_path
=
args
.
portrait_stat_log_path
debug_portrait_stat_log_path
=
args
.
debug_portrait_stat_log_path
cmd_portrait_top_n
=
args
.
portrait_top_n
cmd_coincide_n
=
args
.
coincide_n
version
=
args
.
version
exponential
=
args
.
exponential
normalization_size
=
args
.
normalization_size
decay_days
=
args
.
decay_days
LOG_DIR
=
"/home/gmuser/gyz/log/"
my_today
=
str
(
datetime
.
date
.
today
())
...
...
@@ -338,28 +282,19 @@ if __name__ == '__main__':
# 昨天下单了的用户的去除支付行为的画像
all_device_portrait_result
=
dict
()
debug_all_device_portrait_result
=
dict
()
if
version
==
1
:
for
order_info
in
device_ids_lst
:
device
=
order_info
[
0
]
pay_time
=
order_info
[
1
]
portrait_result
,
debug_portrait_result
=
get_user_service_portrait_not_alipay
(
device
,
all_word_tags
,
all_tag_tag_type
,
pay_time
,
all_3tag_2tag
,
size
=-
1
)
all_device_portrait_result
[
device
]
=
portrait_result
debug_all_device_portrait_result
[
device
]
=
debug_portrait_result
elif
version
==
0
:
for
order_info
in
device_ids_lst
:
device
=
order_info
[
0
]
pay_time
=
order_info
[
1
]
portrait_result
,
debug_portrait_result
=
get_user_service_portrait_not_alipay2
(
device
,
all_word_tags
,
all_tag_tag_type
,
pay_time
,
all_3tag_2tag
,
all_tag_tag_type
,
pay_time
,
all_3tag_2tag
,
version
=
version
,
exponential
=
exponential
,
normalization_size
=
normalization_size
,
decay_days
=
decay_days
,
size
=-
1
)
all_device_portrait_result
[
device
]
=
portrait_result
debug_all_device_portrait_result
[
device
]
=
debug_portrait_result
else
:
pass
# 比较两个tag列表的重合率
result
=
get_2_tags_coincide_rate
(
all_device_order_tags2
,
all_device_portrait_result
,
cmd_portrait_top_n
,
...
...
eda/smart_rank/tool.py
View file @
a892d9e7
...
...
@@ -174,33 +174,53 @@ def get_tag2_from_tag3(tag3, all_3tag_2tag, user_log_df_tag2_list):
print
(
e
)
def compute_henqiang(x, decay_days=180, normalization_size=7, exponential=1):
    """Score a "henqiang" action that happened ``x`` days ago.

    The score starts from 15 and shrinks with age; it never goes below 0.5.
    ("henqiang" is presumably pinyin for "very strong" intent -- confirm with
    the data owners.)

    :param x: age of the action in days.
    :param decay_days: number of days across which the (15 - 0.5) span is
        divided to obtain the per-day step.
    :param normalization_size: upper bound handed to ``exponential_decay``
        when rescaling ``x``; only used when ``exponential`` is truthy.
    :param exponential: truthy -> subtract ``2 ** alpha`` per-day steps, with
        ``alpha = exponential_decay(x, decay_days, normalization_size)``;
        falsy -> subtract ``x`` per-day steps (linear decay).
    :return: the decayed score, or 0.5 when the computed value is not above 0.5.
    """
    if exponential:
        alpha = exponential_decay(x, decay_days, normalization_size)
        score = 15 - 2 ** alpha * ((15 - 0.5) / decay_days)
    else:
        score = 15 - x * ((15 - 0.5) / decay_days)
    # A comparison (not max()) so that a NaN score also falls back to the floor.
    return score if score > 0.5 else 0.5
def compute_jiaoqiang(x, decay_days=180, normalization_size=7, exponential=1):
    """Score a "jiaoqiang" action that happened ``x`` days ago.

    The score starts from 12 and shrinks with age; it never goes below 0.5.
    ("jiaoqiang" is presumably pinyin for "fairly strong" intent -- confirm
    with the data owners.)

    :param x: age of the action in days.
    :param decay_days: number of days across which the (12 - 0.5) span is
        divided to obtain the per-day step.
    :param normalization_size: upper bound handed to ``exponential_decay``
        when rescaling ``x``; only used when ``exponential`` is truthy.
    :param exponential: truthy -> subtract ``2 ** alpha`` per-day steps, with
        ``alpha = exponential_decay(x, decay_days, normalization_size)``;
        falsy -> subtract ``x`` per-day steps (linear decay).
    :return: the decayed score, or 0.5 when the computed value is not above 0.5.
    """
    if exponential:
        alpha = exponential_decay(x, decay_days, normalization_size)
        score = 12 - 2 ** alpha * ((12 - 0.5) / decay_days)
    else:
        score = 12 - x * ((12 - 0.5) / decay_days)
    # A comparison (not max()) so that a NaN score also falls back to the floor.
    return score if score > 0.5 else 0.5
def compute_ruoyixiang(x, decay_days=180, normalization_size=7, exponential=1):
    """Score a "ruoyixiang" action that happened ``x`` days ago.

    The score starts from 5 and shrinks with age; it never goes below 0.5.
    ("ruoyixiang" is presumably pinyin for "weak intent" -- confirm with the
    data owners.)

    :param x: age of the action in days.
    :param decay_days: number of days across which the (5 - 0.5) span is
        divided to obtain the per-day step.
    :param normalization_size: upper bound handed to ``exponential_decay``
        when rescaling ``x``; only used when ``exponential`` is truthy.
    :param exponential: truthy -> subtract ``2 ** alpha`` per-day steps, with
        ``alpha = exponential_decay(x, decay_days, normalization_size)``;
        falsy -> subtract ``x`` per-day steps (linear decay).
    :return: the decayed score, or 0.5 when the computed value is not above 0.5.
    """
    if exponential:
        alpha = exponential_decay(x, decay_days, normalization_size)
        score = 5 - 2 ** alpha * ((5 - 0.5) / decay_days)
    else:
        score = 5 - x * ((5 - 0.5) / decay_days)
    # A comparison (not max()) so that a NaN score also falls back to the floor.
    return score if score > 0.5 else 0.5
def compute_validate(x, decay_days=180, normalization_size=7, exponential=1):
    """Score a "validate" action that happened ``x`` days ago.

    The score starts from 10 and shrinks with age; it never goes below 0.5.

    :param x: age of the action in days.
    :param decay_days: number of days across which the (10 - 0.5) span is
        divided to obtain the per-day step.
    :param normalization_size: upper bound handed to ``exponential_decay``
        when rescaling ``x``; only used when ``exponential`` is truthy.
    :param exponential: truthy -> subtract ``2 ** alpha`` per-day steps, with
        ``alpha = exponential_decay(x, decay_days, normalization_size)``;
        falsy -> subtract ``x`` per-day steps (linear decay).
    :return: the decayed score, or 0.5 when the computed value is not above 0.5.
    """
    if exponential:
        alpha = exponential_decay(x, decay_days, normalization_size)
        score = 10 - 2 ** alpha * ((10 - 0.5) / decay_days)
    else:
        score = 10 - x * ((10 - 0.5) / decay_days)
    # A comparison (not max()) so that a NaN score also falls back to the floor.
    return score if score > 0.5 else 0.5
def compute_ai_scan(x, decay_days=180, normalization_size=7, exponential=1):
    """Score an "ai_scan" action that happened ``x`` days ago.

    The score starts from 2 and shrinks with age; it never goes below 0.5.

    :param x: age of the action in days.
    :param decay_days: number of days across which the (2 - 0.5) span is
        divided to obtain the per-day step.
    :param normalization_size: upper bound handed to ``exponential_decay``
        when rescaling ``x``; only used when ``exponential`` is truthy.
    :param exponential: truthy -> subtract ``2 ** alpha`` per-day steps, with
        ``alpha = exponential_decay(x, decay_days, normalization_size)``;
        falsy -> subtract ``x`` per-day steps (linear decay).
    :return: the decayed score, or 0.5 when the computed value is not above 0.5.
    """
    if exponential:
        alpha = exponential_decay(x, decay_days, normalization_size)
        score = 2 - 2 ** alpha * ((2 - 0.5) / decay_days)
    else:
        score = 2 - x * ((2 - 0.5) / decay_days)
    # A comparison (not max()) so that a NaN score also falls back to the floor.
    return score if score > 0.5 else 0.5
...
...
@@ -212,3 +232,10 @@ def get_action_tag_count(df, action_time):
return
1
except
Exception
as
e
:
print
(
e
)
def exponential_decay(days_diff, decay_days=180, normalization_size=7):
    """Linearly rescale ``days_diff`` from [1, decay_days] to [0, normalization_size].

    Despite its name this function performs only the min-max normalisation;
    the ``compute_*`` scorers in this file raise 2 to the returned value.

    :param days_diff: age in days. Values outside [1, decay_days] are not
        clipped, so the result may fall outside [0, normalization_size].
    :param decay_days: upper end of the source interval; expected to be an
        integer >= 1.
    :param normalization_size: upper end of the target interval.
    :return: ``normalization_size * (days_diff - 1) / (decay_days - 1)``, or
        0.0 when ``decay_days == 1`` (the source interval is a single point).
    :raises ValueError: if ``decay_days`` is smaller than 1.
    """
    if decay_days < 1:
        # The previous implementation called min() on an empty
        # np.arange(1, decay_days + 1) here, which raised ValueError too.
        raise ValueError("decay_days must be >= 1, got {!r}".format(decay_days))

    # min/max of np.arange(1, decay_days + 1) are simply the interval ends, so
    # no array has to be allocated and scanned on every call.
    lowest, highest = 1, decay_days
    if highest == lowest:
        # Avoid the 0-width division that used to produce inf/NaN.
        return 0.0
    # 0 is the lower end of the target interval; kept explicit for readability.
    return (normalization_size - 0) * (days_diff - lowest) / (highest - lowest)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment