Commit d70dcc7d authored by 高雅喆's avatar 高雅喆

有画像没匹配上的用户的画像信息

parent 233a458a
......@@ -9,6 +9,26 @@ import numpy as np
import pandas as pd
from tool import *
import logging
from collections import defaultdict
def get_count(actions):
    """Tally how often each element occurs in ``actions``.

    Args:
        actions: Any iterable of hashable values (this file passes it as the
            aggregation callback for the ``action`` column of a groupby).

    Returns:
        A ``defaultdict(int)`` mapping each distinct element to its number of
        occurrences, keyed in first-seen order.
    """
    tally = defaultdict(int)
    for item in actions:
        # Missing keys start at 0 thanks to the int default factory.
        tally[item] = tally[item] + 1
    return tally
def setup_logger(logger_name, log_file, level=logging.INFO):
    """Configure a named logger that writes bare messages to a file and a stream.

    The logger obtained via ``logging.getLogger(logger_name)`` gets its level
    set to ``level`` and receives two handlers, both formatted as
    ``'%(message)s'``: a ``FileHandler`` appending to ``log_file`` and a
    ``StreamHandler`` with its default stream.

    Calling this function again for the same logger and file is safe: a
    handler is only attached when an equivalent one is not already present,
    so repeated setup no longer causes every record to be emitted twice.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_file: Path of the log file, opened in append mode.
        level: Logging level applied to the logger (default ``logging.INFO``).

    Returns:
        None. Retrieve the logger with ``logging.getLogger(logger_name)``.
    """
    import os  # local import: only needed to normalise the path for comparison

    my_log = logging.getLogger(logger_name)
    my_log.setLevel(level)
    formatter = logging.Formatter('%(message)s')

    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target_path = os.path.abspath(log_file)
    has_file_handler = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == target_path
        for handler in my_log.handlers
    )
    if not has_file_handler:
        file_handler = logging.FileHandler(log_file, mode='a')
        file_handler.setFormatter(formatter)
        my_log.addHandler(file_handler)

    # FileHandler subclasses StreamHandler, so compare the exact type to find
    # a plain console handler.
    has_stream_handler = any(
        type(handler) is logging.StreamHandler for handler in my_log.handlers
    )
    if not has_stream_handler:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        my_log.addHandler(stream_handler)
def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2tag, size=10):
......@@ -67,39 +87,50 @@ def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type,
compute_ruoyixiang(x.days_diff_now)/get_action_tag_count(user_df_service, x.time) if x.score_type == "ruoyixiang" else
compute_validate(x.days_diff_now)/get_action_tag_count(user_df_service, x.time)))), axis=1)
tag_score_sum = user_df_service.groupby(by=["tag2", "tag2_type"]).agg(
{'tag_score': 'sum', 'cl_id': 'first', 'action': 'first'}).reset_index().sort_values(by=["tag_score"],
ascending=False)
{'tag_score': 'sum', 'cl_id': 'first', 'action': get_count}).reset_index().sort_values(by=["tag_score"],
ascending=False)
tag_score_sum['weight'] = 100 * tag_score_sum['tag_score'] / tag_score_sum['tag_score'].sum()
tag_score_sum["pay_type"] = tag_score_sum.apply(
lambda x: 3 if x.action == "api/order/validate" else (
2 if x.action == "api/settlement/alipay_callback" else 1
), axis=1
)
# gmkv_tag_score_sum = tag_score_sum[["tag2", "tag_score", "weight"]][:size].to_dict('record')
gmkv_tag_score_sum_list = tag_score_sum["tag2"].to_list()[:size]
return gmkv_tag_score_sum_list
# 获取tag的得分来源(action信息)
debug_tag_score_sum = tag_score_sum[["tag2", "tag_score", "action"]][:size].to_dict('record')
debug_tag_score_sum_dict = {info["tag2"]: info for info in debug_tag_score_sum}
# 没有用户的画像
else:
gmkv_tag_score_sum_list = list()
debug_tag_score_sum_dict = dict()
return gmkv_tag_score_sum_list, debug_tag_score_sum_dict
except Exception as e:
print(e)
return list(), dict()
def get_2_tags_coincide_rate(device_order_tags, device_portrait_result, portrait_top_n, coincide_n):
device_count = len(device_order_tags)
coincide_count = 0
not_coincide_no_portrait = 0
not_coincide_no_portrait_device_ids = []
not_coincide_have_portrait_device_ids = []
not_coincide_have_portrait = 0
device_count = len(device_order_tags) # 总的下单设备数
coincide_count = 0 # 比对的上的设备数
not_coincide_no_portrait = 0 # 比对不上的且没有画像的设备数
not_coincide_no_portrait_device_ids = [] # 比对不上的且没有画像的设备
not_coincide_have_portrait_device_ids = [] # 比对不上的且有画像的设备数
not_coincide_have_portrait = 0 # 比对不上的且有画像的设备
for device in device_order_tags:
order_tags = device_order_tags[device]
portrait_tags = device_portrait_result[device]
if portrait_tags:
portrait_tags = portrait_tags[:portrait_top_n]
else:
# 没有画像的设备
not_coincide_no_portrait += 1
not_coincide_no_portrait_device_ids.append(device)
continue
# 有画像且匹配的上
if len(set(order_tags).intersection(set(portrait_tags))) >= coincide_n:
coincide_count += 1
# 有画像且匹配不上
else:
not_coincide_have_portrait += 1
not_coincide_have_portrait_device_ids.append(device)
......@@ -123,6 +154,11 @@ if __name__ == '__main__':
my_today = str(datetime.date.today())
my_yesterday = str(datetime.date.today() - datetime.timedelta(days=1))
setup_logger("log1", LOG_DIR + 'portrait_stat.log')
setup_logger("log2", LOG_DIR + 'debug_portrait_stat.log')
log1 = logging.getLogger('log1')
log2 = logging.getLogger('log2')
# 获取昨天下单的用户设备id,下单的美购,美购对应的tag
# api_order只有用户的user_id,一个user_id对应多个device_id
# 用户一次可以下多个订单(美购),一个美购对应多个tag
......@@ -185,10 +221,32 @@ if __name__ == '__main__':
# 昨天下单了的用户的去除支付行为的画像
all_device_portrait_result = dict()
debug_all_device_portrait_result = dict()
for device in device_ids_lst:
portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags, all_tag_tag_type,
all_3tag_2tag, size=10)
portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags,
all_tag_tag_type,
all_3tag_2tag, size=-1)
all_device_portrait_result[device] = portrait_result
debug_all_device_portrait_result[device] = debug_portrait_result
# 有画像没匹配上的用户的画像信息
no_coincide_devices = result["not_coincide_have_portrait_device_ids"]
no_coincide_devices_debug = dict()
log2.info({"统计日期": my_today})
for device in no_coincide_devices:
no_coincide_devices_debug = dict()
device_portrait_n = all_device_portrait_result[device][:args.portrait_top_n]
device_order_tags = all_device_order_tags2[device]
debug_device_portrait_result = debug_all_device_portrait_result[device]
no_coincide_devices_debug[device] = {
"画像的前{top_n}个tag".format(top_n=args.portrait_top_n): [debug_device_portrait_result[tag] for tag in
device_portrait_n],
"用户下单的美购对应的tag": [debug_device_portrait_result.get(tag, dict()) for tag in device_order_tags]
}
log2.info("-" * 66)
log2.info(no_coincide_devices_debug)
log2.info("\n"*6)
# 比较两个tag列表的重合率
cmd_portrait_top_n = args.portrait_top_n
......@@ -207,20 +265,18 @@ if __name__ == '__main__':
end_datetime = datetime.datetime.strptime(end_datetime_str, '%Y-%m-%d %H:%M:%S')
time_consuming = (end_datetime - start_datetime).seconds / 60
logging.basicConfig(format='%(message)s', filename=LOG_DIR + 'portrait_stat.log', filemode='a',
level=logging.INFO)
logging.info({"画像信息统计日期": my_today})
logging.info({"画像更新耗时(分钟)": time_consuming})
logging.info({"画像更新的设备数": portrait_device_count[0]["count(*)"]})
logging.info("")
logging.info({"统计画像匹配度所用数据的日期": my_yesterday})
logging.info({"统计画像的选取前n个tag": cmd_portrait_top_n})
logging.info({"重合个数": cmd_coincide_n})
logging.info({"下单人数": result["device_count"]})
logging.info({"比对的上的人数": result["coincide_count"]})
logging.info({"匹配度": result["coincide_rate"]})
logging.info({"比对不上的有画像的人数": result["not_coincide_have_portrait_count"]})
logging.info({"比对不上的无画像的人数": result["not_coincide_no_portrait_count"]})
logging.info("="*66)
log1.info({"画像信息统计日期": my_today})
log1.info({"画像更新耗时(分钟)": time_consuming})
log1.info({"画像更新的设备数": portrait_device_count[0]["count(*)"]})
log1.info("")
log1.info({"统计画像匹配度所用数据的日期": my_yesterday})
log1.info({"统计画像的选取前n个tag": cmd_portrait_top_n})
log1.info({"重合个数": cmd_coincide_n})
log1.info({"下单人数": result["device_count"]})
log1.info({"比对的上的人数": result["coincide_count"]})
log1.info({"匹配度": result["coincide_rate"]})
log1.info({"比对不上的有画像的人数": result["not_coincide_have_portrait_count"]})
log1.info({"比对不上的无画像的人数": result["not_coincide_no_portrait_count"]})
log1.info("="*66)
except Exception as e:
print(e)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment