Commit a892d9e7 authored by 高雅喆

Add an exponential decay function for the scores

parent 04f0591c
@@ -31,14 +31,18 @@ def setup_logger(logger_name, log_file, level=logging.INFO):
my_log.addHandler(stream_handler)
def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type, pay_time, all_3tag_2tag, size=10):
def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type, pay_time, all_3tag_2tag, version=1,
                                          exponential=0, normalization_size=7, decay_days=180, size=10):
"""
:param cl_id:
:param all_word_tags:
:param all_tag_tag_type:
:param pay_time: timestamp of the user's order
:param all_3tag_2tag:
:param version: 0: 翔宇 version; 1: 英赫 version
:param size:
:return: 英赫 version of the portrait (payment behavior removed)
:return: portrait (payment behavior removed)
"""
try:
db_jerry_test = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
@@ -89,110 +93,43 @@ def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type,
if x.tag_type == '3' else x.tag_id, axis=1)
user_df_service["tag2_type"] = user_df_service.apply(lambda x: all_tag_tag_type.get(x["tag2"]), axis=1)
# Compute the scores and their proportions
if version == 1:
user_df_service["tag_score"] = user_df_service.apply(
lambda x: compute_henqiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "henqiang" else (
compute_jiaoqiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "jiaoqiang" else (
compute_ai_scan(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ai_scan" else (
compute_ruoyixiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ruoyixiang" else
compute_validate(x.days_diff_now)/get_action_tag_count(user_df_service, x.time)))), axis=1)
tag_score_sum = user_df_service.groupby(by=["tag2", "tag2_type"]).agg(
lambda x: compute_henqiang(x.days_diff_now, decay_days, normalization_size, exponential)/get_action_tag_count(user_df_service, x.time) if x.score_type == "henqiang" else (
compute_jiaoqiang(x.days_diff_now, decay_days, normalization_size, exponential)/get_action_tag_count(user_df_service, x.time) if x.score_type == "jiaoqiang" else (
compute_ai_scan(x.days_diff_now, decay_days, normalization_size, exponential)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ai_scan" else (
compute_ruoyixiang(x.days_diff_now, decay_days, normalization_size, exponential)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ruoyixiang" else
compute_validate(x.days_diff_now, decay_days, normalization_size, exponential)/get_action_tag_count(user_df_service, x.time)))), axis=1)
finally_score = user_df_service.groupby(by=["tag2", "tag2_type"]).agg(
{'tag_score': 'sum', 'cl_id': 'first', 'action': get_count}).reset_index().sort_values(by=["tag_score"],
ascending=False)
tag_score_sum['weight'] = 100 * tag_score_sum['tag_score'] / tag_score_sum['tag_score'].sum()
tag_score_sum["pay_type"] = tag_score_sum.apply(
finally_score['weight'] = 100 * finally_score['tag_score'] / finally_score['tag_score'].sum()
finally_score["pay_type"] = finally_score.apply(
lambda x: 3 if x.action == "api/order/validate" else (
2 if x.action == "api/settlement/alipay_callback" else 1
), axis=1
)
gmkv_tag_score_sum_list = tag_score_sum["tag2"].to_list()[:size]
gmkv_tag_score_sum_list = finally_score["tag2"].to_list()[:size]
# Get the source of each tag's score (action info)
debug_tag_score_sum = tag_score_sum[["tag2", "tag_score", "action"]][:size].to_dict('record')
debug_tag_score_sum = finally_score[["tag2", "tag_score", "action"]][:size].to_dict('record')
debug_tag_score_sum_dict = {info["tag2"]: info for info in debug_tag_score_sum}
# The user has no portrait
else:
gmkv_tag_score_sum_list = list()
debug_tag_score_sum_dict = dict()
return gmkv_tag_score_sum_list, debug_tag_score_sum_dict
except Exception as e:
print(e)
return list(), dict()
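In the `version == 1` branch above, the per-event scores are summed for each second-level tag with a `groupby`/`agg` and then converted into percentage weights. A minimal sketch of that aggregation pattern on made-up data (the repo's `get_count` helper is replaced with `'first'` here, and all values are invented):

```python
import pandas as pd

# Toy stand-in for the per-event scores computed above (invented values).
df = pd.DataFrame({
    "tag2": [101, 101, 202],
    "tag2_type": ["2", "2", "2"],
    "cl_id": ["dev-1", "dev-1", "dev-1"],
    "action": ["api/order/validate", "page_view", "page_view"],
    "tag_score": [7.2, 3.1, 1.4],
})

# Sum scores per (tag2, tag2_type), keep one cl_id/action per tag, rank by score.
finally_score = (df.groupby(by=["tag2", "tag2_type"])
                   .agg({"tag_score": "sum", "cl_id": "first", "action": "first"})
                   .reset_index()
                   .sort_values(by=["tag_score"], ascending=False))
finally_score["weight"] = 100 * finally_score["tag_score"] / finally_score["tag_score"].sum()
print(finally_score[["tag2", "tag_score", "weight"]])
```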
def get_user_service_portrait_not_alipay2(cl_id, all_word_tags, all_tag_tag_type, pay_time, all_3tag_2tag, size=10):
"""
:param cl_id:
:param all_word_tags:
:param all_tag_tag_type:
:param all_3tag_2tag:
:param size:
:return: 翔宇 version of the portrait (payment behavior removed)
"""
try:
db_jerry_test = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
db='jerry_test', charset='utf8')
cur_jerry_test = db_jerry_test.cursor()
# The user's non-search, non-payment behaviors
user_df_service_sql = "select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log " \
"where cl_id ='{cl_id}' and time < {pay_time} and action not in " \
"('api/settlement/alipay_callback','do_search')".format(cl_id=cl_id, pay_time=pay_time)
cur_jerry_test.execute(user_df_service_sql)
data = list(cur_jerry_test.fetchall())
if data:
user_df_service = pd.DataFrame(data)
user_df_service.columns = ["time", "cl_id", "score_type", "tag_id", "tag_referrer", "action"]
else:
user_df_service = pd.DataFrame(columns=["time", "cl_id", "score_type", "tag_id", "tag_referrer", "action"])
# The user's search behaviors
user_df_search_sql = "select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log " \
"where cl_id ='{cl_id}' and time < {pay_time} and " \
"action = 'do_search'".format(cl_id=cl_id, pay_time=pay_time)
cur_jerry_test.execute(user_df_search_sql)
data_search = list(cur_jerry_test.fetchall())
db_jerry_test.close()
if data_search:
user_df_search = pd.DataFrame(data_search)
user_df_search.columns = ["time", "cl_id", "score_type", "tag_id", "tag_referrer", "action"]
else:
user_df_search = pd.DataFrame(columns=["time", "cl_id", "score_type", "tag_id", "tag_referrer", "action"])
# Convert search terms into tags
# user_df_search_2_tag = pd.DataFrame(columns=list(user_df_service.columns))
for index, row in user_df_search.iterrows():
if row['tag_referrer'] in all_word_tags:
for search_tag in all_word_tags[row['tag_referrer']]:
row['tag_id'] = int(search_tag)
user_df_service = user_df_service.append(row, ignore_index=True)
break
# Add DataFrame columns (days_diff_now, tag_type, tag2)
if not user_df_service.empty:
user_df_service["days_diff_now"] = round((int(time.time()) - user_df_service["time"].astype(float)) / (24 * 60 * 60))
user_df_service["tag_type"] = user_df_service.apply(lambda x: all_tag_tag_type.get(x["tag_id"]), axis=1)
user_df_service = user_df_service[user_df_service['tag_type'].isin(['2','3'])]
user_log_df_tag2_list = user_df_service[user_df_service['tag_type'] == '2']['tag_id'].unique().tolist()
user_df_service["tag2"] = user_df_service.apply(lambda x:
get_tag2_from_tag3(x.tag_id, all_3tag_2tag, user_log_df_tag2_list)
if x.tag_type == '3' else x.tag_id, axis=1)
user_df_service["tag2_type"] = user_df_service.apply(lambda x: all_tag_tag_type.get(x["tag2"]), axis=1)
# Compute the scores and their proportions
elif version == 0:
user_df_service["tag_score"] = user_df_service.apply(
lambda x: compute_henqiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "henqiang" else (
compute_jiaoqiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "jiaoqiang" else (
compute_ai_scan(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ai_scan" else (
compute_ruoyixiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ruoyixiang" else
compute_validate(x.days_diff_now)/get_action_tag_count(user_df_service, x.time)))), axis=1)
lambda x: compute_henqiang(x.days_diff_now, decay_days, normalization_size, exponential) if x.score_type == "henqiang" else (
compute_jiaoqiang(x.days_diff_now, decay_days, normalization_size, exponential) if x.score_type == "jiaoqiang" else (
compute_ai_scan(x.days_diff_now, decay_days, normalization_size, exponential) if x.score_type == "ai_scan" else (
compute_ruoyixiang(x.days_diff_now, decay_days, normalization_size, exponential) if x.score_type == "ruoyixiang" else
compute_validate(x.days_diff_now, decay_days, normalization_size, exponential)))),
axis=1)
finally_score = user_df_service.sort_values(by=["tag_score", "time"], ascending=False)
finally_score.drop_duplicates(subset="tag2", inplace=True, keep="first")
finally_score["weight"] = 100 * finally_score['tag_score'] / finally_score['tag_score'].sum()
gmkv_tag_score_sum_list = finally_score["tag2"].to_list()[:size]
# Get the source of each tag's score (action info)
debug_tag_score_sum = finally_score[["tag2", "tag_score", "action", "time"]][:size].to_dict('record')
debug_tag_score_sum_dict = {info["tag2"]: str(datetime.datetime.fromtimestamp(int(info["time"]))) for info
in debug_tag_score_sum}
debug_tag_score_sum_dict = {info["tag2"]: str(datetime.datetime.fromtimestamp(int(info["time"]))) for
info in debug_tag_score_sum}
# The user has no portrait
else:
gmkv_tag_score_sum_list = list()
@@ -254,12 +191,19 @@ if __name__ == '__main__':
parser.add_argument("-t", "--top", type=int, dest="portrait_top_n", default=3, help="选取画像的前n个tag去统计匹配度")
parser.add_argument("-c", "--coincide", type=int, dest="coincide_n", default=1, help="选取n个tag重合个数作为判断是否匹配的阈值")
parser.add_argument("-v", "--version", type=int, dest="version", default=1, help="选取翔宇(0),英赫(1)版本进行统计")
parser.add_argument("-e", "--exponential", type=int, dest="exponential", default=0, help="是否采用指数衰减")
parser.add_argument("-n", "--normalization_size", type=int, dest="normalization_size", default=7,
help="天数差归一化的区间")
parser.add_argument("-d", "--decay_days", type=int, dest="decay_days", default=180, help="分数衰减的天数")
args = parser.parse_args()
portrait_stat_log_path = args.portrait_stat_log_path
debug_portrait_stat_log_path = args.debug_portrait_stat_log_path
cmd_portrait_top_n = args.portrait_top_n
cmd_coincide_n = args.coincide_n
version = args.version
exponential = args.exponential
normalization_size = args.normalization_size
decay_days = args.decay_days
LOG_DIR = "/home/gmuser/gyz/log/"
my_today = str(datetime.date.today())
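The three new options end up in `exponential`, `normalization_size` and `decay_days` and are forwarded to the portrait function below. A quick check of how they parse, using a stripped-down copy of the parser (rebuilt here just for the example rather than imported from the script):

```python
import argparse

# Only the options added in this commit, with the same defaults as above.
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--exponential", type=int, dest="exponential", default=0)
parser.add_argument("-n", "--normalization_size", type=int, dest="normalization_size", default=7)
parser.add_argument("-d", "--decay_days", type=int, dest="decay_days", default=180)

# Equivalent of running the script with: -e 1 -n 7 -d 90
args = parser.parse_args(["-e", "1", "-n", "7", "-d", "90"])
print(args.exponential, args.normalization_size, args.decay_days)  # 1 7 90
```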
@@ -338,28 +282,19 @@ if __name__ == '__main__':
# Portraits, with payment behavior removed, of users who placed an order yesterday
all_device_portrait_result = dict()
debug_all_device_portrait_result = dict()
if version == 1:
for order_info in device_ids_lst:
device = order_info[0]
pay_time = order_info[1]
portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags,
all_tag_tag_type,
pay_time, all_3tag_2tag,
size=-1)
all_device_portrait_result[device] = portrait_result
debug_all_device_portrait_result[device] = debug_portrait_result
elif version == 0:
for order_info in device_ids_lst:
device = order_info[0]
pay_time = order_info[1]
portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay2(device, all_word_tags,
all_tag_tag_type,
pay_time, all_3tag_2tag,
all_tag_tag_type, pay_time,
all_3tag_2tag,
version=version,
exponential=exponential,
normalization_size=normalization_size,
decay_days=decay_days,
size=-1)
all_device_portrait_result[device] = portrait_result
debug_all_device_portrait_result[device] = debug_portrait_result
else:
pass
# Compare the overlap rate of the two tag lists
result = get_2_tags_coincide_rate(all_device_order_tags2, all_device_portrait_result, cmd_portrait_top_n,
......
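`get_2_tags_coincide_rate` itself is not part of this diff; purely as an illustration of what the `--top`/`--coincide` thresholds mean, a match check could look like the hypothetical sketch below (the function name and logic here are assumptions, not the repo's implementation):

```python
def tags_coincide(order_tags, portrait_tags, top_n=3, coincide_n=1):
    # Hypothetical: do the top-n portrait tags overlap the order tags in at least coincide_n tags?
    overlap = set(portrait_tags[:top_n]) & set(order_tags)
    return len(overlap) >= coincide_n

# Invented tag ids, for illustration only.
print(tags_coincide(order_tags=[11, 42], portrait_tags=[42, 7, 99, 11]))  # True
```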
@@ -174,33 +174,53 @@ def get_tag2_from_tag3(tag3, all_3tag_2tag, user_log_df_tag2_list):
print(e)
def compute_henqiang(x):
score = 15-x*((15-0.5)/180)
if score>0.5:
def compute_henqiang(x, decay_days=180, normalization_size=7, exponential=1):
if exponential:
alpha = exponential_decay(x, decay_days, normalization_size)
score = 15 - 2**alpha * ((15-0.5)/decay_days)
else:
score = 15 - x * ((15-0.5)/decay_days)
if score > 0.5:
return score
else:
return 0.5
def compute_jiaoqiang(x):
score = 12-x*(12/180)
if score>0.5:
def compute_jiaoqiang(x, decay_days=180, normalization_size=7, exponential=1):
if exponential:
alpha = exponential_decay(x, decay_days, normalization_size)
score = 12 - 2**alpha * ((12-0.5)/decay_days)
else:
score = 12 - x * ((12-0.5)/decay_days)
if score > 0.5:
return score
else:
return 0.5
def compute_ruoyixiang(x):
score = 5-x*((5-0.5)/180)
if score>0.5:
def compute_ruoyixiang(x, decay_days=180, normalization_size=7, exponential=1):
if exponential:
alpha = exponential_decay(x, decay_days, normalization_size)
score = 5 - 2**alpha * ((5-0.5)/decay_days)
else:
score = 5 - x * ((5-0.5)/decay_days)
if score > 0.5:
return score
else:
return 0.5
def compute_validate(x):
score = 10-x*((10-0.5)/180)
if score>0.5:
def compute_validate(x, decay_days=180, normalization_size=7, exponential=1):
if exponential:
alpha = exponential_decay(x, decay_days, normalization_size)
score = 10 - 2**alpha * ((10-0.5)/decay_days)
else:
score = 10 - x * ((10-0.5)/decay_days)
if score > 0.5:
return score
else:
return 0.5
def compute_ai_scan(x):
score = 2 - x * ((2 - 0.5) / 180)
if score>0.5:
def compute_ai_scan(x, decay_days=180, normalization_size=7, exponential=1):
if exponential:
alpha = exponential_decay(x, decay_days, normalization_size)
score = 2 - 2**alpha * ((2-0.5)/decay_days)
else:
score = 2 - x * ((2-0.5)/decay_days)
if score > 0.5:
return score
else:
return 0.5
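Taken together, the rewritten `compute_*` functions keep the old linear ramp as the `exponential == 0` path and add a `2**alpha` variant, both floored at 0.5. A self-contained restatement of `compute_henqiang` (formulas copied from the diff, with the normalization from `exponential_decay` inlined) to spot-check a few day offsets:

```python
def compute_henqiang(x, decay_days=180, normalization_size=7, exponential=1):
    if exponential:
        # exponential_decay(x): map the day difference from [1, decay_days] onto [0, normalization_size]
        alpha = normalization_size * (x - 1) / (decay_days - 1)
        score = 15 - 2 ** alpha * ((15 - 0.5) / decay_days)
    else:
        score = 15 - x * ((15 - 0.5) / decay_days)
    return score if score > 0.5 else 0.5

for d in (1, 90, 180):
    print(d, round(compute_henqiang(d, exponential=0), 2), round(compute_henqiang(d, exponential=1), 2))
# day 1:   linear 14.92, exponential 14.92
# day 90:  linear 7.75,  exponential 14.1
# day 180: linear 0.5,   exponential 4.69
```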
@@ -212,3 +232,10 @@ def get_action_tag_count(df, action_time):
return 1
except Exception as e:
print(e)
def exponential_decay(days_diff, decay_days=180, normalization_size=7):
x = np.arange(1, decay_days+1, 1)
# Normalize the day difference onto [0, normalization_size]
a = (normalization_size - 0) * (days_diff - min(x)) / (max(x) - min(x))
return a
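Despite its name, `exponential_decay` only rescales the day difference onto `[0, normalization_size]`; the exponentiation itself happens in the `compute_*` callers via `2**alpha`. It also needs `numpy` to be available as `np` in the module (the import is not shown in this diff). A quick check of the output range, with the function repeated only to keep the snippet self-contained:

```python
import numpy as np

def exponential_decay(days_diff, decay_days=180, normalization_size=7):
    x = np.arange(1, decay_days + 1, 1)
    # Normalize the day difference from [1, decay_days] onto [0, normalization_size].
    return (normalization_size - 0) * (days_diff - min(x)) / (max(x) - min(x))

print(exponential_decay(1))    # 0.0
print(exponential_decay(90))   # ~3.48
print(exponential_decay(180))  # 7.0
```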