Commit 637fd6ce authored by 高雅喆's avatar 高雅喆

test

parent a594fcd6
......@@ -77,14 +77,8 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
gmkv_tag_score_sum = tag_score_sum[["tag2", "tag_score", "weight"]][:size].to_dict('record')
gmkv_tag_score2_sum = tag_score_sum[["tag2", "tag_score"]][:size].to_dict('record')
gmkv_tag_score2_sum_dict = {i["tag2"]: i["tag_score"] for i in gmkv_tag_score2_sum}
# 写gmkv
gm_kv_cli = redis.Redis(host="172.16.40.135", port=5379, db=2, socket_timeout=2000)
cl_id_portrait_key = "user:service_portrait_tags:cl_id:" + str(cl_id)
tag_id_list_json = json.dumps(gmkv_tag_score_sum)
gm_kv_cli.set(cl_id_portrait_key, tag_id_list_json)
gm_kv_cli.expire(cl_id_portrait_key, time=30 * 24 * 60 * 60)
redis
# 写redis
redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')
cl_id_portrait_key2 = "user:service_portrait_tags2:cl_id:" + str(cl_id)
# 如果画像的tag个数小于5,则补充热搜词
......@@ -137,87 +131,83 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
if __name__ == '__main__':
    try:
        # --- Fetch device ids active in the last 3 days from TiDB ---
        # NOTE(review): credentials are hard-coded in source; move to config/env.
        db_jerry_test = pymysql.connect(host='172.16.40.170', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                                        db='jerry_test', charset='utf8')
        cur_jerry_test = db_jerry_test.cursor()
        # The original comment said "last 30 days", but the SQL filters on the
        # last 3 days; the SQL is kept as-is and documented truthfully here.
        sql_device_ids = "select distinct cl_id from user_new_tag_log " \
                         "where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 3 day))"
        cur_jerry_test.execute(sql_device_ids)
        device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()]
        # Bug fix: close the cursor before its connection (the original closed
        # the connection first, then the cursor of an already-closed connection).
        cur_jerry_test.close()
        db_jerry_test.close()

        redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')
        # Tags for search words and their synonyms.
        all_word_tags = get_all_word_tags()
        all_tag_tag_type = get_all_tag_tag_type()
        # Mapping from level-3 tags to their level-2 parent tag.
        all_3tag_2tag = get_all_3tag_2tag()
        # Mapping from tag id to its display name.
        all_tags_name = get_all_tags_name()

        # --- Portrait cold start: seed new users with hot search words ---
        hot_search_words = get_hot_search_words_tag()
        hot_search_words_portrait = list()
        for tag_info in hot_search_words:
            tmp = dict()
            tmp["tag_score"] = 0.2
            tmp["weight"] = 10
            tmp["tag2"] = tag_info["id"]
            hot_search_words_portrait.append(tmp)
        # NOTE(review): redis-py deprecates hmset() in favor of
        # hset(name, mapping=...); kept as-is to stay compatible with the
        # redis-py version deployed here — confirm before upgrading.
        hot_search_words_portrait_portrait_key2 = "user:service_coldstart_tags2"
        hot_search_words_portrait_dict = {i["id"]: 0.2 for i in hot_search_words}
        redis_client.hmset(hot_search_words_portrait_portrait_key2, hot_search_words_portrait_dict)
        hot_search_words_portrait_portrait_key2 = "user:service_coldstart_tags2_name"
        hot_search_words_portrait_dict = {i["keywords"]: 0.2 for i in hot_search_words}
        redis_client.hmset(hot_search_words_portrait_portrait_key2, hot_search_words_portrait_dict)
        # Hard-coded fallback hot-word list (runtime data — kept verbatim).
        hot_search_words = ["明星娱乐", "网红扒一扒", "明星颜值打call", "颜商", "颜值高光时刻", "瘦脸针", "水光针", "光子嫩肤", "热玛吉", "瘦腿针", "超声刀", "瘦肩针", "皮秒", "果酸焕肤",
                           "热拉提", "微针", "超皮秒", "点阵激光", "小气泡", "玻尿酸丰下巴", "埋线双眼皮", "纹眉", "嗨体", "溶脂针瘦脸", "黄金微针", "点痣", "激光祛斑", "白瓷娃娃",
                           "除皱针注射", "微针祛痘坑", "玻尿酸", "大分子玻尿酸", "胶原蛋白", "肉毒素", "水杨酸", "果酸", "杏仁酸", "黑脸娃娃", "童颜针", "祛斑", "祛痣", "祛黑头", "祛疤",
                           "祛痘", "蜂巢皮秒", "深蓝射频", "美瞳", "孕睫", "婴儿针", "三文鱼针", "少女针", "素颜针", "熊猫针", "脱毛", "面部提升", "嫩肤", "镭射净肤", "红蓝光", "清洁",
                           "补水", "DPL", "抗衰", "针清", "美白", "冷光美白", "非剥落点阵", "网红抗衰", "网红整形", "网红颜值", "网红婚恋", "明星抗衰", "明星整形", "明星婚恋", "明星颜值"]
        hot_search_words_portrait_portrait_key3 = "user:service_coldstart_tags3"
        hot_search_words_portrait3_dict = {i: 0.2 for i in hot_search_words}
        redis_client.hmset(hot_search_words_portrait_portrait_key3, hot_search_words_portrait3_dict)
        # Search-word -> synonym-tag mapping consumed by the search service.
        search_words_synonym_tags_key = "search:words:synonym:tags"
        search_words_synonym_tags_json = json.dumps(all_word_tags)
        redis_client.set(search_words_synonym_tags_key, search_words_synonym_tags_json)

        # --- Distribute per-device portrait computation over Spark ---
        sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
            .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
            .set("spark.tispark.plan.allow_index_double_read", "false") \
            .set("spark.tispark.plan.allow_index_read", "true") \
            .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
            .set("spark.tispark.pd.addresses", "172.16.40.170:2379").set("spark.io.compression.codec", "lzf") \
            .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
        spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
        spark.sparkContext.setLogLevel("WARN")
        # Ship the helper module to the executors.
        spark.sparkContext.addPyFile("/srv/apps/ffm-baseline_git/eda/smart_rank/tool.py")
        device_ids_lst_rdd = spark.sparkContext.parallelize(device_ids_lst, numSlices=1000)
        result = device_ids_lst_rdd.repartition(100).map(
            lambda x: get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag, all_tags_name))
        # collect() forces evaluation; the map is executed for its Redis side effects.
        result.collect()
    except Exception as e:
        # Bug fix: the original discarded the exception entirely; include it in
        # the alert body so the failure is diagnosable from the email alone.
        send_email("dist_update_user_portrait_service",
                   "dist_update_user_portrait_service",
                   "dist_update_user_portrait_service: %s" % e)
# Driver script: build user service portraits for recently active devices.
# NOTE(review): credentials are hard-coded in source; move to config/env.
db_jerry_test = pymysql.connect(host='172.16.40.170', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
                                db='jerry_test', charset='utf8')
cur_jerry_test = db_jerry_test.cursor()
# The original comment said "last 30 days", but the SQL filters on the
# last 3 days; the SQL is kept as-is and documented truthfully here.
sql_device_ids = "select distinct cl_id from user_new_tag_log " \
                 "where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 3 day))"
cur_jerry_test.execute(sql_device_ids)
device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()]
# Bug fix: close the cursor before its connection (the original closed the
# connection first, then the cursor of an already-closed connection).
cur_jerry_test.close()
db_jerry_test.close()

redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')
# Tags for search words and their synonyms.
all_word_tags = get_all_word_tags()
all_tag_tag_type = get_all_tag_tag_type()
# Mapping from level-3 tags to their level-2 parent tag.
all_3tag_2tag = get_all_3tag_2tag()
# Mapping from tag id to its display name.
all_tags_name = get_all_tags_name()

# --- Portrait cold start: seed new users with hot search words ---
hot_search_words = get_hot_search_words_tag()
hot_search_words_portrait = list()
for tag_info in hot_search_words:
    tmp = dict()
    tmp["tag_score"] = 0.2
    tmp["weight"] = 10
    tmp["tag2"] = tag_info["id"]
    hot_search_words_portrait.append(tmp)
# NOTE(review): redis-py deprecates hmset() in favor of hset(name, mapping=...);
# kept as-is to stay compatible with the deployed redis-py version.
hot_search_words_portrait_portrait_key2 = "user:service_coldstart_tags2"
hot_search_words_portrait_dict = {i["id"]: 0.2 for i in hot_search_words}
redis_client.hmset(hot_search_words_portrait_portrait_key2, hot_search_words_portrait_dict)
hot_search_words_portrait_portrait_key2 = "user:service_coldstart_tags2_name"
hot_search_words_portrait_dict = {i["keywords"]: 0.2 for i in hot_search_words}
redis_client.hmset(hot_search_words_portrait_portrait_key2, hot_search_words_portrait_dict)
# Hard-coded fallback hot-word list (runtime data — kept verbatim).
hot_search_words = ["明星娱乐", "网红扒一扒", "明星颜值打call", "颜商", "颜值高光时刻", "瘦脸针", "水光针", "光子嫩肤", "热玛吉", "瘦腿针", "超声刀", "瘦肩针", "皮秒", "果酸焕肤",
                   "热拉提", "微针", "超皮秒", "点阵激光", "小气泡", "玻尿酸丰下巴", "埋线双眼皮", "纹眉", "嗨体", "溶脂针瘦脸", "黄金微针", "点痣", "激光祛斑", "白瓷娃娃",
                   "除皱针注射", "微针祛痘坑", "玻尿酸", "大分子玻尿酸", "胶原蛋白", "肉毒素", "水杨酸", "果酸", "杏仁酸", "黑脸娃娃", "童颜针", "祛斑", "祛痣", "祛黑头", "祛疤",
                   "祛痘", "蜂巢皮秒", "深蓝射频", "美瞳", "孕睫", "婴儿针", "三文鱼针", "少女针", "素颜针", "熊猫针", "脱毛", "面部提升", "嫩肤", "镭射净肤", "红蓝光", "清洁",
                   "补水", "DPL", "抗衰", "针清", "美白", "冷光美白", "非剥落点阵", "网红抗衰", "网红整形", "网红颜值", "网红婚恋", "明星抗衰", "明星整形", "明星婚恋", "明星颜值"]
hot_search_words_portrait_portrait_key3 = "user:service_coldstart_tags3"
hot_search_words_portrait3_dict = {i: 0.2 for i in hot_search_words}
redis_client.hmset(hot_search_words_portrait_portrait_key3, hot_search_words_portrait3_dict)
# Search-word -> synonym-tag mapping consumed by the search service.
search_words_synonym_tags_key = "search:words:synonym:tags"
search_words_synonym_tags_json = json.dumps(all_word_tags)
redis_client.set(search_words_synonym_tags_key, search_words_synonym_tags_json)

# --- Distribute per-device portrait computation over Spark ---
sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
    .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
    .set("spark.tispark.plan.allow_index_double_read", "false") \
    .set("spark.tispark.plan.allow_index_read", "true") \
    .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
    .set("spark.tispark.pd.addresses", "172.16.40.170:2379").set("spark.io.compression.codec", "lzf") \
    .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec", "snappy")
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel("WARN")
# Ship the helper module to the executors.
spark.sparkContext.addPyFile("/srv/apps/ffm-baseline_git/eda/smart_rank/tool.py")
device_ids_lst_rdd = spark.sparkContext.parallelize(device_ids_lst, numSlices=1000)
result = device_ids_lst_rdd.repartition(100).map(
    lambda x: get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag, all_tags_name))
# collect() forces evaluation; the map is executed for its Redis side effects.
result.collect()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment