Commit 08e45a4e authored by 高雅喆

计算两个tag列表的重合率

parent a4c31b88
......@@ -282,6 +282,7 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
cur_jerry_test.execute(replace_sql)
db_jerry_test.commit()
# 写tidb 用户分层营销
# todo 不准确,因为聚合后,一个标签会有多个来源,即多个pay_type
score_result = tag_score_sum[["tag2", "cl_id", "tag_score", "weight", "pay_type"]]
score_result.rename(columns={"tag2": "tag_id", "cl_id": "device_id", "tag_score": "score"}, inplace=True)
delete_sql = "delete from api_market_personas where device_id='{}'".format(cl_id)
......
......@@ -72,55 +72,88 @@ def get_user_service_portrait_not_alipay(cl_id, all_word_tags, all_tag_tag_type,
2 if x.action == "api/settlement/alipay_callback" else 1
), axis=1
)
gmkv_tag_score_sum = tag_score_sum[["tag2", "tag_score", "weight"]][:size].to_dict('record')
# 写gmkv
gm_kv_cli = redis.Redis(host="172.16.40.135", port=5379, db=2, socket_timeout=2000)
cl_id_portrait_key = "user:service_portrait_tags:cl_id:" + str(cl_id)
tag_id_list_json = json.dumps(gmkv_tag_score_sum)
gm_kv_cli.set(cl_id_portrait_key, tag_id_list_json)
gm_kv_cli.expire(cl_id_portrait_key, time=30 * 24 * 60 * 60)
# 写tidb,redis同步
stat_date = datetime.datetime.today().strftime('%Y-%m-%d')
replace_sql = """replace into user_service_portrait_tags (stat_date, cl_id, tag_list) values("{stat_date}","{cl_id}","{tag_list}")"""\
.format(stat_date=stat_date, cl_id=cl_id, tag_list=gmkv_tag_score_sum)
cur_jerry_test.execute(replace_sql)
db_jerry_test.commit()
# 写tidb 用户分层营销
score_result = tag_score_sum[["tag2", "cl_id", "tag_score", "weight", "pay_type"]]
score_result.rename(columns={"tag2": "tag_id", "cl_id": "device_id", "tag_score": "score"}, inplace=True)
delete_sql = "delete from api_market_personas where device_id='{}'".format(cl_id)
cur_jerry_test.execute(delete_sql)
db_jerry_test.commit()
for index, row in score_result.iterrows():
insert_sql = "insert into api_market_personas values (null, {}, '{}', {}, {}, {})".format(
row['tag_id'], row['device_id'], row['score'], row['weight'], row['pay_type'])
cur_jerry_test.execute(insert_sql)
db_jerry_test.commit()
db_jerry_test.close()
return "sucess"
# gmkv_tag_score_sum = tag_score_sum[["tag2", "tag_score", "weight"]][:size].to_dict('record')
gmkv_tag_score_sum_list = tag_score_sum["tag2"].to_list()[:size]
return gmkv_tag_score_sum_list
except Exception as e:
print(e)
def get_2_tags_coincide_rate(device_order_tags, device_portrait_result, portrait_top_n, coincide_n):
    """Return the fraction of devices whose order tags overlap their portrait tags.

    A device counts as "coinciding" when at least ``coincide_n`` distinct tags
    are shared between its order tags and the first ``portrait_top_n`` entries
    of its portrait tag list.

    Args:
        device_order_tags: Mapping of device id -> iterable of order tag ids.
        device_portrait_result: Mapping of device id -> list of portrait tag
            ids. A missing device, ``None`` or an empty list is treated as
            "no portrait tags".
        portrait_top_n: Number of leading portrait tags to compare against.
        coincide_n: Minimum number of shared tags for a device to count.

    Returns:
        ``coincide_count / device_count`` as a float, or ``0.0`` when
        ``device_order_tags`` is empty (avoids dividing by zero).
    """
    device_count = len(device_order_tags)
    if device_count == 0:
        # Nothing to compare: report a rate of zero instead of raising.
        return 0.0

    coincide_count = 0
    for device, order_tags in device_order_tags.items():
        # A device without a portrait entry (or with a None/empty one)
        # simply has no tags to match against.
        portrait_tags = device_portrait_result.get(device) or []
        top_portrait_tags = set(portrait_tags[:portrait_top_n])
        if len(top_portrait_tags.intersection(order_tags)) >= coincide_n:
            coincide_count += 1
    return coincide_count / device_count
# Script entry point: collects device ids and their order tags, builds a
# portrait for each device via get_user_service_portrait_not_alipay, and
# computes the coincidence rate between order tags and portrait tags.
# NOTE(review): this text looks like diff residue (indentation is lost and
# "\ No newline at end of file" markers appear inside it) - confirm against
# the real file before relying on statement order.
if __name__ == '__main__':
try:
# NOTE(review): database credentials are hard-coded in this call.
db_jerry_test = pymysql.connect(host='172.16.40.170', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
db='jerry_test', charset='utf8')
cur_jerry_test = db_jerry_test.cursor()
# Fetch the device ids (cl_id) of users seen within the last 30 days
sql_device_ids = "select distinct cl_id from user_new_tag_log " \
"where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 30 day))"
cur_jerry_test.execute(sql_device_ids)
device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()]
db_jerry_test.close()
# Fetch, for users who placed an order yesterday, their device id, the
# services they ordered and the tags attached to those services
# (the query below filters on pay_time > yesterday's date).
# api_order only carries user_id, and one user_id can map to several device_ids
# A user can place several orders (services) at once, and one service can have several tags
my_yesterday = str(datetime.date.today() - datetime.timedelta(days=1))
sql_order_device_info_yesterday = """
SELECT tmp1.user_id,
c.device_id,
tmp1.service_ids,
tmp1.tag_ids
FROM
(SELECT tmp.user_id,
tmp.service_ids,
tmp.tag_ids,
max(tmp.device_id) device_id_id
FROM
(SELECT a.user_id,
a.service_ids,
a.tag_ids,
b.device_id
FROM
(SELECT user_id,
group_concat(DISTINCT `service_id` separator ',') service_ids,
group_concat(DISTINCT `tag_id` separator ',') tag_ids
FROM
(SELECT d.user_id,
d.service_id,
e.tag_id
FROM api_order d
LEFT JOIN api_servicetag e ON d.service_id = e.service_id
LEFT JOIN api_tag f ON e.tag_id = f.id
WHERE d.status=1
AND d.pay_time>'{my_yesterday}'
AND f.tag_type+0 <'4'+0) tmp2
GROUP BY user_id) a
LEFT JOIN statistic_device_user b ON a.user_id = b.user_id) tmp
GROUP BY tmp.user_id) tmp1
LEFT JOIN statistic_device c ON tmp1.device_id_id = c.id
WHERE c.device_id IS NOT NULL
""".format(my_yesterday=my_yesterday)
# NOTE(review): database credentials are hard-coded in this call as well.
mysql_results = get_data_by_mysql('172.16.30.141', 3306, 'work', 'BJQaT9VzDcuPBqkd', 'zhengxing',
sql_order_device_info_yesterday)
# NOTE(review): this overwrites the device_ids_lst built from the 30-day query above.
device_ids_lst = [i["device_id"] for i in mysql_results]
# Map each device id to the list of integer tag ids parsed from the comma-separated tag_ids column
all_device_order_tags = {i["device_id"]: [int(tag) for tag in i["tag_ids"].split(",")] for i in mysql_results}
# Load the tags associated with search words and their synonyms
all_word_tags = get_all_word_tags()
all_tag_tag_type = get_all_tag_tag_type()
# Mapping from level-3 tags to their level-2 tags
all_3tag_2tag = get_all_3tag_2tag()
\ No newline at end of file
all_3tag_2tag = get_all_3tag_2tag()
# Single hard-coded device id; the return value of this call is not used.
device_id = "9C5E7C73-380C-4623-8F48-A64C8034E315"
get_user_service_portrait_not_alipay(device_id, all_word_tags, all_tag_tag_type, all_3tag_2tag)
# Portraits, with payment actions excluded, for the users who ordered yesterday
all_device_portrait_result = dict()
for device in device_ids_lst:
portrait_result = get_user_service_portrait_not_alipay(device, all_word_tags, all_tag_tag_type, all_3tag_2tag, size=10)
all_device_portrait_result[device] = portrait_result
# Compute the coincidence rate between the two tag lists (top 3 portrait tags, at least 1 shared tag)
rate = get_2_tags_coincide_rate(all_device_order_tags, all_device_portrait_result, 3, 1)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment