Commit ffcd5172 authored by 高雅喆

增加翔宇版本的画像计算规则评估逻辑

parent 144517eb
......@@ -32,6 +32,14 @@ def setup_logger(logger_name, log_file, level=logging.INFO):
def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2tag, size=10):
"""
:param cl_id:
:param all_word_tags:
:param all_tag_tag_type:
:param all_3tag_2tag:
:param size:
:return: 英赫版画像(去掉支付行为)
"""
try:
db_jerry_test = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
db='jerry_test', charset='utf8')
......@@ -110,6 +118,89 @@ def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type,
return list(), dict()
# Column order shared by both tag-log queries and by the DataFrame built from their rows.
_TAG_LOG_COLUMNS = ("time", "cl_id", "score_type", "tag_id", "tag_referrer", "action")


def _fetch_user_tag_log_rows(cl_id):
    """Read one device's rows from ``user_new_tag_log``, split into non-search and search actions.

    Payment callbacks (``api/settlement/alipay_callback``) never appear in either result set.

    :param cl_id: device id matched against ``user_new_tag_log.cl_id``
    :return: ``(service_rows, search_rows)``; each is a list of rows in ``_TAG_LOG_COLUMNS`` order
    """
    select_clause = "select " + ",".join(_TAG_LOG_COLUMNS) + " from user_new_tag_log "
    # ``%s`` placeholders let the driver quote/escape cl_id instead of splicing it into the SQL text.
    service_sql = select_clause + ("where cl_id = %s and action not in "
                                   "('api/settlement/alipay_callback','do_search')")
    search_sql = select_clause + "where cl_id = %s and action = 'do_search'"

    # NOTE(review): connection credentials are hard-coded here; consider moving them to configuration.
    db_jerry_test = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                                    db='jerry_test', charset='utf8')
    try:
        cur_jerry_test = db_jerry_test.cursor()
        cur_jerry_test.execute(service_sql, (cl_id,))
        service_rows = list(cur_jerry_test.fetchall())
        cur_jerry_test.execute(search_sql, (cl_id,))
        search_rows = list(cur_jerry_test.fetchall())
    finally:
        # Close even when a query fails so the connection is not leaked.
        db_jerry_test.close()
    return service_rows, search_rows


def get_user_service_portrait_not_alipay2(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2tag, size=10):
    """Build the Xiangyu-version user portrait for one device, with payment actions excluded.

    Search log rows are converted to tag rows through ``all_word_tags`` (only the first tag of a
    matching search word is used), merged with the other non-payment log rows, restricted to tags
    whose type is ``'2'`` or ``'3'``, scored per row, and reduced to the best-scoring row per
    second-level tag.

    :param cl_id: device id whose log rows are read
    :param all_word_tags: mapping of search word -> iterable of tag ids
    :param all_tag_tag_type: mapping of tag id -> tag type (``'2'`` / ``'3'`` are kept)
    :param all_3tag_2tag: lookup passed through to ``get_tag2_from_tag3`` for type-``'3'`` tags
    :param size: upper bound of the ``[:size]`` slice applied to both results
    :return: ``(tag2_list, debug_dict)`` where ``tag2_list`` holds the second-level tags ordered by
        descending ``tag_score`` then ``time``, and ``debug_dict`` maps each of those tags to the
        formatted timestamp of the log row that produced its score. Any exception is printed and
        ``([], {})`` is returned.
    """
    # NOTE(review): the __main__ caller passes size=-1; as a slice bound that drops the last tag
    # rather than meaning "no limit" - confirm this is intended.
    try:
        service_rows, search_rows = _fetch_user_tag_log_rows(cl_id)

        # Turn search words into tag rows; collect plain records first and build the frame once
        # instead of growing a DataFrame row by row.
        records = list(service_rows)
        tag_id_pos = _TAG_LOG_COLUMNS.index("tag_id")
        referrer_pos = _TAG_LOG_COLUMNS.index("tag_referrer")
        for row in search_rows:
            referrer = row[referrer_pos]
            if referrer not in all_word_tags:
                continue
            for search_tag in all_word_tags[referrer]:
                converted = list(row)
                converted[tag_id_pos] = int(search_tag)
                records.append(tuple(converted))
                break  # only the first tag of a search word is used

        tag_log = pd.DataFrame(records, columns=list(_TAG_LOG_COLUMNS))
        # No log rows at all: the user has no portrait.
        if tag_log.empty:
            return list(), dict()

        # Derived columns: days_diff_now, tag_type, tag2, tag2_type.
        tag_log["days_diff_now"] = ((int(time.time()) - tag_log["time"].astype(float)) / (24 * 60 * 60)).round()
        tag_log["tag_type"] = tag_log["tag_id"].map(all_tag_tag_type.get)
        # .copy() so the column assignments below write to an independent frame, not a slice view.
        tag_log = tag_log[tag_log['tag_type'].isin(['2', '3'])].copy()
        # Nothing left after the tag-type filter: return the empty portrait explicitly.
        if tag_log.empty:
            return list(), dict()

        user_log_df_tag2_list = tag_log[tag_log['tag_type'] == '2']['tag_id'].unique().tolist()
        tag_log["tag2"] = tag_log.apply(
            lambda x: get_tag2_from_tag3(x.tag_id, all_3tag_2tag, user_log_df_tag2_list)
            if x.tag_type == '3' else x.tag_id,
            axis=1)
        tag_log["tag2_type"] = tag_log["tag2"].map(all_tag_tag_type.get)

        # Score each row: the score function is chosen by score_type (compute_validate for any
        # other value) and the result is divided by get_action_tag_count for that row's time.
        score_funcs = {
            "henqiang": compute_henqiang,
            "jiaoqiang": compute_jiaoqiang,
            "ai_scan": compute_ai_scan,
            "ruoyixiang": compute_ruoyixiang,
        }

        def _row_score(row):
            score_func = score_funcs.get(row["score_type"], compute_validate)
            return score_func(row["days_diff_now"]) / get_action_tag_count(tag_log, row["time"])

        tag_log["tag_score"] = tag_log.apply(_row_score, axis=1)

        # Keep only the best-scoring (then most recent) row for every second-level tag.
        finally_score = tag_log.sort_values(by=["tag_score", "time"], ascending=False)
        finally_score = finally_score.drop_duplicates(subset="tag2", keep="first")
        gmkv_tag_score_sum_list = finally_score["tag2"].to_list()[:size]

        # Where each tag's score came from: the timestamp of the winning log row.
        debug_tag_score_sum = finally_score[["tag2", "tag_score", "action", "time"]][:size].to_dict('records')
        debug_tag_score_sum_dict = {
            info["tag2"]: str(datetime.datetime.fromtimestamp(int(info["time"])))
            for info in debug_tag_score_sum
        }
        return gmkv_tag_score_sum_list, debug_tag_score_sum_dict
    except Exception as e:
        # Best effort: one failing device must not abort the caller's loop over all devices.
        print(e)
        return list(), dict()
def get_2_tags_coincide_rate(device_order_tags, device_portrait_result, portrait_top_n, coincide_n):
"""
:param device_order_tags:
......@@ -154,16 +245,26 @@ def get_2_tags_coincide_rate(device_order_tags, device_portrait_result, portrait
if __name__ == '__main__':
try:
parser = argparse.ArgumentParser(description='画像匹配度的统计')
parser.add_argument("-log1", "--log1_file", type=str, dest="portrait_stat_log_path",
default="portrait_stat.log", help="画像统计的日志地址")
parser.add_argument("-log2", "--log2_file", type=str, dest="debug_portrait_stat_log_path",
default="debug_portrait_stat.log", help="画像统计的日志地址")
parser.add_argument("-t", "--top", type=int, dest="portrait_top_n", default=3, help="选取画像的前n个tag去统计匹配度")
parser.add_argument("-c", "--coincide", type=int, dest="coincide_n", default=1, help="选取n个tag重合个数作为判断是否匹配的阈值")
parser.add_argument("-v", "--version", type=int, dest="version", default=1, help="选取翔宇(0),英赫(1)版本进行统计")
args = parser.parse_args()
portrait_stat_log_path = args.portrait_stat_log_path
debug_portrait_stat_log_path = args.debug_portrait_stat_log_path
cmd_portrait_top_n = args.portrait_top_n
cmd_coincide_n = args.coincide_n
version = args.version
LOG_DIR = "/home/gmuser/gyz/log/"
my_today = str(datetime.date.today())
my_yesterday = str(datetime.date.today() - datetime.timedelta(days=1))
setup_logger("log1", LOG_DIR + 'portrait_stat.log')
setup_logger("log2", LOG_DIR + 'debug_portrait_stat.log')
setup_logger("log1", LOG_DIR + portrait_stat_log_path)
setup_logger("log2", LOG_DIR + debug_portrait_stat_log_path)
log1 = logging.getLogger('log1')
log2 = logging.getLogger('log2')
......@@ -230,16 +331,24 @@ if __name__ == '__main__':
# 昨天下单了的用户的去除支付行为的画像
all_device_portrait_result = dict()
debug_all_device_portrait_result = dict()
if version == 1:
for device in device_ids_lst:
portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags,
all_tag_tag_type,
all_3tag_2tag, size=-1)
all_device_portrait_result[device] = portrait_result
debug_all_device_portrait_result[device] = debug_portrait_result
elif version == 0:
for device in device_ids_lst:
portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay2(device, all_word_tags,
all_tag_tag_type,
all_3tag_2tag, size=-1)
all_device_portrait_result[device] = portrait_result
debug_all_device_portrait_result[device] = debug_portrait_result
else:
pass
# 比较两个tag列表的重合率
cmd_portrait_top_n = args.portrait_top_n
cmd_coincide_n = args.coincide_n
result = get_2_tags_coincide_rate(all_device_order_tags2, all_device_portrait_result, cmd_portrait_top_n,
cmd_coincide_n)
......@@ -247,6 +356,7 @@ if __name__ == '__main__':
no_coincide_devices = result["not_coincide_have_portrait_device_ids"]
no_coincide_devices_debug = dict()
log2.info({"统计日期": my_today})
log2.info({"版本": "英赫版" if version == 1 else "翔宇版"})
for device in no_coincide_devices:
no_coincide_devices_debug = dict()
device_portrait_n = all_device_portrait_result[device][:args.portrait_top_n]
......@@ -274,6 +384,7 @@ if __name__ == '__main__':
time_consuming = (end_datetime - start_datetime).seconds / 60
log1.info({"画像信息统计日期": my_today})
log1.info({"版本": "英赫版" if version == 1 else "翔宇版"})
log1.info({"画像更新耗时(分钟)": time_consuming})
log1.info({"画像更新的设备数": portrait_device_count[0]["count(*)"]})
log1.info("")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment