Commit b1c9091b authored by 高雅喆

增加首页精选的日记点击和美购首页的美购点击

parent 16570e50
...@@ -180,6 +180,66 @@ def get_2_tags_coincide_rate(device_order_tags, device_portrait_result, portrait ...@@ -180,6 +180,66 @@ def get_2_tags_coincide_rate(device_order_tags, device_portrait_result, portrait
return result return result
def get_user_order_info_yesterday():
    """Fetch the devices that placed paid orders in the target day and their tags.

    The query window is taken from the module-level names ``order_date`` and
    ``order_date_tomorrow`` (both are interpolated into the SQL text), so they
    must be assigned before this function is called.

    What the SQL does:
      * ``api_order`` only carries ``user_id``; one user may own several
        devices, so a single device per user is chosen with
        ``max(statistic_device_user.device_id)`` and then resolved to the
        device id string through ``statistic_device``.
      * A user may place several orders (services) in the window and one
        service maps to several tags; service ids and tag ids are collapsed
        per user with ``GROUP_CONCAT(DISTINCT ...)``.
      * Only orders with ``status = 1`` and tags whose ``tag_type`` is
        numerically below 4 are considered.
      * Users for whom no device id can be resolved are dropped.

    Returns:
        tuple: ``(device_ids_info, all_device_order_tags)`` where
        ``device_ids_info`` is a list of ``(device_id, pay_time)`` pairs
        (``pay_time`` is the latest payment time in the window as an integer
        Unix timestamp) and ``all_device_order_tags`` maps ``device_id`` to
        the list of integer tag ids of the ordered services.
    """
    sql_order_device_info_yesterday = """
SELECT tmp1.user_id,
c.device_id,
tmp1.service_ids,
tmp1.tag_ids,
tmp1.pay_time
FROM
(SELECT tmp.user_id,
tmp.service_ids,
tmp.tag_ids,
tmp.pay_time,
max(tmp.device_id) device_id_id
FROM
(SELECT a.user_id,
a.service_ids,
a.tag_ids,
a.pay_time,
b.device_id
FROM
(SELECT user_id,
max(pay_time) AS pay_time,
group_concat(DISTINCT `service_id` separator ',') service_ids,
group_concat(DISTINCT `tag_id` separator ',') tag_ids
FROM
(SELECT d.user_id,
d.service_id,
unix_timestamp(d.pay_time) AS pay_time,
e.tag_id
FROM api_order d
LEFT JOIN api_servicetag e ON d.service_id = e.service_id
LEFT JOIN api_tag f ON e.tag_id = f.id
WHERE d.status=1
AND d.pay_time>'{order_date}'
AND d.pay_time<'{order_date_tomorrow}'
AND f.tag_type+0 <'4'+0) tmp2
GROUP BY user_id) a
LEFT JOIN statistic_device_user b ON a.user_id = b.user_id) tmp
GROUP BY tmp.user_id) tmp1
LEFT JOIN statistic_device c ON tmp1.device_id_id = c.id
WHERE c.device_id IS NOT NULL
""".format(order_date=order_date, order_date_tomorrow=order_date_tomorrow)
    # NOTE(review): database host and credentials are hard-coded here; they
    # should be moved to configuration / secrets management and rotated.
    mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing',
                                      sql_order_device_info_yesterday)

    device_ids_info = []
    all_device_order_tags = {}
    for row in mysql_results:
        device_id = row["device_id"]
        device_ids_info.append((device_id, int(row["pay_time"])))
        # GROUP_CONCAT output is capped by MySQL's group_concat_max_len, so a
        # long tag list can be cut right after a separator. Skip blank tokens
        # instead of letting int('') abort the whole run.
        # NOTE(review): a cut in the middle of an id cannot be detected here.
        all_device_order_tags[device_id] = [
            int(tag) for tag in row["tag_ids"].split(",") if tag.strip()
        ]
    return device_ids_info, all_device_order_tags
def get_user_diary_click_info_yesterday():
    """Placeholder for the "diary clicked on the featured home page" data source.

    The caller unpacks the result as ``device_ids_lst, all_device_order_tags``
    (the same pair that ``get_user_order_info_yesterday`` returns), so a bare
    ``pass`` stub would hand back ``None`` and fail there with an unrelated
    looking ``TypeError``. Fail explicitly until the query is written.

    Raises:
        NotImplementedError: always, the data source is not implemented yet.
    """
    # TODO: query the diary-click events and return
    # (list of (device_id, action_time), dict of device_id -> list of tag ids).
    raise NotImplementedError(
        "get_user_diary_click_info_yesterday is not implemented yet"
    )
def get_user_service_click_info_yesterday():
    """Placeholder for the "service clicked on the service home page" data source.

    The caller unpacks the result as ``device_ids_lst, all_device_order_tags``
    (the same pair that ``get_user_order_info_yesterday`` returns), so a bare
    ``pass`` stub would hand back ``None`` and fail there with an unrelated
    looking ``TypeError``. Fail explicitly until the query is written.

    Raises:
        NotImplementedError: always, the data source is not implemented yet.
    """
    # TODO: query the service-click events and return
    # (list of (device_id, action_time), dict of device_id -> list of tag ids).
    raise NotImplementedError(
        "get_user_service_click_info_yesterday is not implemented yet"
    )
if __name__ == '__main__': if __name__ == '__main__':
try: try:
parser = argparse.ArgumentParser(description='画像匹配度的统计') parser = argparse.ArgumentParser(description='画像匹配度的统计')
...@@ -196,6 +256,7 @@ if __name__ == '__main__': ...@@ -196,6 +256,7 @@ if __name__ == '__main__':
parser.add_argument("-n", "--normalization_size", type=int, dest="normalization_size", default=7, parser.add_argument("-n", "--normalization_size", type=int, dest="normalization_size", default=7,
help="天数差归一化的区间") help="天数差归一化的区间")
parser.add_argument("-d", "--decay_days", type=int, dest="decay_days", default=180, help="分数衰减的天数") parser.add_argument("-d", "--decay_days", type=int, dest="decay_days", default=180, help="分数衰减的天数")
parser.add_argument("-a", "--action_type", type=list, dest="action_type", default=["order"], help="计算匹配度的行为")
args = parser.parse_args() args = parser.parse_args()
order_date = args.order_date order_date = args.order_date
order_date_tomorrow = str(datetime.datetime.strptime(order_date, '%Y-%m-%d') + datetime.timedelta(days=1)) order_date_tomorrow = str(datetime.datetime.strptime(order_date, '%Y-%m-%d') + datetime.timedelta(days=1))
...@@ -207,6 +268,7 @@ if __name__ == '__main__': ...@@ -207,6 +268,7 @@ if __name__ == '__main__':
exponential = args.exponential exponential = args.exponential
normalization_size = args.normalization_size normalization_size = args.normalization_size
decay_days = args.decay_days decay_days = args.decay_days
action_type = args.action_type
LOG_DIR = "/home/gmuser/gyz/log/" LOG_DIR = "/home/gmuser/gyz/log/"
my_today = str(datetime.date.today()) my_today = str(datetime.date.today())
...@@ -216,55 +278,6 @@ if __name__ == '__main__': ...@@ -216,55 +278,6 @@ if __name__ == '__main__':
log1 = logging.getLogger('log1') log1 = logging.getLogger('log1')
log2 = logging.getLogger('log2') log2 = logging.getLogger('log2')
# 获取昨天下单的用户设备id,下单的美购,美购对应的tag
# api_order只有用户的user_id,一个user_id对应多个device_id
# 用户一次可以下多个订单(美购),一个美购对应多个tag
sql_order_device_info_yesterday = """
SELECT tmp1.user_id,
c.device_id,
tmp1.service_ids,
tmp1.tag_ids,
tmp1.pay_time
FROM
(SELECT tmp.user_id,
tmp.service_ids,
tmp.tag_ids,
tmp.pay_time,
max(tmp.device_id) device_id_id
FROM
(SELECT a.user_id,
a.service_ids,
a.tag_ids,
a.pay_time,
b.device_id
FROM
(SELECT user_id,
max(pay_time) AS pay_time,
group_concat(DISTINCT `service_id` separator ',') service_ids,
group_concat(DISTINCT `tag_id` separator ',') tag_ids
FROM
(SELECT d.user_id,
d.service_id,
unix_timestamp(d.pay_time) AS pay_time,
e.tag_id
FROM api_order d
LEFT JOIN api_servicetag e ON d.service_id = e.service_id
LEFT JOIN api_tag f ON e.tag_id = f.id
WHERE d.status=1
AND d.pay_time>'{order_date}'
AND d.pay_time<'{order_date_tomorrow}'
AND f.tag_type+0 <'4'+0) tmp2
GROUP BY user_id) a
LEFT JOIN statistic_device_user b ON a.user_id = b.user_id) tmp
GROUP BY tmp.user_id) tmp1
LEFT JOIN statistic_device c ON tmp1.device_id_id = c.id
WHERE c.device_id IS NOT NULL
""".format(order_date=order_date, order_date_tomorrow=order_date_tomorrow)
mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing',
sql_order_device_info_yesterday)
device_ids_lst = [(i["device_id"], int(i["pay_time"])) for i in mysql_results]
all_device_order_tags = {i["device_id"]: [int(tag) for tag in i["tag_ids"].split(",")] for i in mysql_results}
# 获取搜索词及其近义词对应的tag # 获取搜索词及其近义词对应的tag
all_word_tags = get_all_word_tags() all_word_tags = get_all_word_tags()
all_tag_tag_type = get_all_tag_tag_type() all_tag_tag_type = get_all_tag_tag_type()
...@@ -272,82 +285,106 @@ if __name__ == '__main__': ...@@ -272,82 +285,106 @@ if __name__ == '__main__':
# 3级tag对应的2级tag # 3级tag对应的2级tag
all_3tag_2tag = get_all_3tag_2tag() all_3tag_2tag = get_all_3tag_2tag()
# 昨天下单了的用户的美购tags(转成2级tags) for action in action_type:
all_device_order_tags2 = dict() # 获取昨天产生行为的设备id、以及行为对应的tag
for device in all_device_order_tags: device_ids_lst = list()
tags = all_device_order_tags[device] all_device_order_tags = dict()
for tag in tags: if "order" in action_type:
tags2 = all_3tag_2tag.get(tag, []) device_ids_lst, all_device_order_tags = get_user_order_info_yesterday()
tags += tags2 elif "diary" in action_type:
all_device_order_tags2[device] = tags device_ids_lst, all_device_order_tags = get_user_diary_click_info_yesterday()
elif "service" in action_type:
# 昨天下单了的用户的去除支付行为的画像 device_ids_lst, all_device_order_tags = get_user_service_click_info_yesterday()
all_device_portrait_result = dict() else:
debug_all_device_portrait_result = dict() break
for order_info in device_ids_lst:
device = order_info[0] # tags扩展2级tags
pay_time = order_info[1] all_device_order_tags2 = dict()
portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags, for device in all_device_order_tags:
all_tag_tag_type, pay_time, tags = all_device_order_tags[device]
all_3tag_2tag, for tag in tags:
version=version, tags2 = all_3tag_2tag.get(tag, [])
exponential=exponential, tags += tags2
normalization_size=normalization_size, all_device_order_tags2[device] = tags
decay_days=decay_days,
size=-1) # 用户的去除支付行为的画像
all_device_portrait_result[device] = portrait_result all_device_portrait_result = dict()
debug_all_device_portrait_result[device] = debug_portrait_result debug_all_device_portrait_result = dict()
for order_info in device_ids_lst:
# 比较两个tag列表的重合率 device = order_info[0]
result = get_2_tags_coincide_rate(all_device_order_tags2, all_device_portrait_result, cmd_portrait_top_n, pay_time = order_info[1]
cmd_coincide_n) portrait_result, debug_portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags,
all_tag_tag_type, pay_time,
# 有画像没匹配上的用户的画像信息 all_3tag_2tag,
no_coincide_devices = result["not_coincide_have_portrait_device_ids"] version=version,
no_coincide_devices_debug = dict() exponential=exponential,
log2.info({"统计日期": my_today}) normalization_size=normalization_size,
log2.info({"参数信息": args}) decay_days=decay_days,
log2.info({"版本": "英赫版" if version == 1 else "翔宇版"}) size=-1)
for device in no_coincide_devices: all_device_portrait_result[device] = portrait_result
debug_all_device_portrait_result[device] = debug_portrait_result
# 比较两个tag列表的重合率
result = get_2_tags_coincide_rate(all_device_order_tags2, all_device_portrait_result, cmd_portrait_top_n,
cmd_coincide_n)
# 有画像没匹配上的用户的画像信息
no_coincide_devices = result["not_coincide_have_portrait_device_ids"]
no_coincide_devices_debug = dict() no_coincide_devices_debug = dict()
device_portrait_n = all_device_portrait_result[device][:args.portrait_top_n] log2.info({"统计日期": my_today})
device_order_tags = all_device_order_tags2[device] log2.info({"参数信息": args})
debug_device_portrait_result = debug_all_device_portrait_result[device] log2.info({"版本": "英赫版" if version == 1 else "翔宇版"})
no_coincide_devices_debug[device] = { action_type_detail = ""
"画像的前{top_n}个tag".format(top_n=args.portrait_top_n): [debug_device_portrait_result[tag] for tag in if action_type == "order":
device_portrait_n], action_type_detail = "昨天下单了的用户"
"用户下单的美购对应的tag": [debug_device_portrait_result.get(tag, dict()) for tag in device_order_tags] elif action_type == "diary":
} action_type_detail = "昨天在首页精选点击了日记的用户"
log2.info("-" * 66) elif action_type == "service":
log2.info(no_coincide_devices_debug) action_type_detail = "昨天在美购首页点击了美购的用户"
log2.info("\n"*6) else:
pass
log2.info({"统计用户": action_type_detail})
# 统计画像更新的耗时和更新的设备数 for device in no_coincide_devices:
sql = "select count(*) from user_service_portrait_tags where stat_date='{my_today}'".format(my_today=my_today) no_coincide_devices_debug = dict()
portrait_device_count = get_data_by_mysql('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_test', sql) device_portrait_n = all_device_portrait_result[device][:args.portrait_top_n]
with open(LOG_DIR + "dist_portrait.log", 'r') as f: device_order_tags = all_device_order_tags2[device]
lines = f.readlines() debug_device_portrait_result = debug_all_device_portrait_result[device]
start_datetime_str = lines[0][:19] no_coincide_devices_debug[device] = {
end_datetime_str = lines[-1][:19] "画像的前{top_n}个tag".format(top_n=args.portrait_top_n): [debug_device_portrait_result[tag] for tag in
start_datetime = datetime.datetime.strptime(start_datetime_str, '%Y-%m-%d %H:%M:%S') device_portrait_n],
end_datetime = datetime.datetime.strptime(end_datetime_str, '%Y-%m-%d %H:%M:%S') "用户下单的美购对应的tag": [debug_device_portrait_result.get(tag, dict()) for tag in device_order_tags]
time_consuming = (end_datetime - start_datetime).seconds / 60 }
log2.info("-" * 66)
log1.info({"画像信息统计日期": my_today}) log2.info(no_coincide_devices_debug)
log1.info({"参数信息": args}) log2.info("\n"*6)
log1.info({"版本": "英赫版" if version == 1 else "翔宇版"})
log1.info({"画像更新耗时(分钟)": time_consuming})
log1.info({"画像更新的设备数": portrait_device_count[0]["count(*)"]}) # 统计画像更新的耗时和更新的设备数
log1.info("") sql = "select count(*) from user_service_portrait_tags where stat_date='{my_today}'".format(my_today=my_today)
log1.info({"统计画像匹配度所用数据的日期": order_date}) portrait_device_count = get_data_by_mysql('172.16.40.158', 4000, 'root', '3SYz54LS9#^9sBvC', 'jerry_test', sql)
log1.info({"统计画像的选取前n个tag": cmd_portrait_top_n}) with open(LOG_DIR + "dist_portrait.log", 'r') as f:
log1.info({"重合个数": cmd_coincide_n}) lines = f.readlines()
log1.info({"下单人数": result["device_count"]}) start_datetime_str = lines[0][:19]
log1.info({"比对的上的人数": result["coincide_count"]}) end_datetime_str = lines[-1][:19]
log1.info({"匹配度": result["coincide_rate"]}) start_datetime = datetime.datetime.strptime(start_datetime_str, '%Y-%m-%d %H:%M:%S')
log1.info({"比对不上的有画像的人数": result["not_coincide_have_portrait_count"]}) end_datetime = datetime.datetime.strptime(end_datetime_str, '%Y-%m-%d %H:%M:%S')
log1.info({"比对不上的无画像的人数": result["not_coincide_no_portrait_count"]}) time_consuming = (end_datetime - start_datetime).seconds / 60
log1.info("="*66)
log1.info({"画像信息统计日期": my_today})
log1.info({"参数信息": args})
log1.info({"版本": "英赫版" if version == 1 else "翔宇版"})
log1.info({"统计用户": action_type_detail})
log1.info({"画像更新耗时(分钟)": time_consuming})
log1.info({"画像更新的设备数": portrait_device_count[0]["count(*)"]})
log1.info("")
log1.info({"统计画像匹配度所用数据的日期": order_date})
log1.info({"统计画像的选取前n个tag": cmd_portrait_top_n})
log1.info({"重合个数": cmd_coincide_n})
log1.info({"下单人数": result["device_count"]})
log1.info({"比对的上的人数": result["coincide_count"]})
log1.info({"匹配度": result["coincide_rate"]})
log1.info({"比对不上的有画像的人数": result["not_coincide_have_portrait_count"]})
log1.info({"比对不上的无画像的人数": result["not_coincide_no_portrait_count"]})
log1.info("="*66)
except Exception as e: except Exception as e:
print(e) print(e)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment