Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
637fd6ce
Commit
637fd6ce
authored
Nov 04, 2019
by
高雅喆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
test
parent
a594fcd6
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
82 additions
and
93 deletions
+82
-93
dist_update_user_portrait_service.py
eda/smart_rank/dist_update_user_portrait_service.py
+82
-93
No files found.
eda/smart_rank/dist_update_user_portrait_service.py
View file @
637fd6ce
...
@@ -77,14 +77,8 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
...
@@ -77,14 +77,8 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
gmkv_tag_score_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
,
"weight"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
,
"weight"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score2_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score2_sum
=
tag_score_sum
[[
"tag2"
,
"tag_score"
]][:
size
]
.
to_dict
(
'record'
)
gmkv_tag_score2_sum_dict
=
{
i
[
"tag2"
]:
i
[
"tag_score"
]
for
i
in
gmkv_tag_score2_sum
}
gmkv_tag_score2_sum_dict
=
{
i
[
"tag2"
]:
i
[
"tag_score"
]
for
i
in
gmkv_tag_score2_sum
}
# 写gmkv
gm_kv_cli
=
redis
.
Redis
(
host
=
"172.16.40.135"
,
port
=
5379
,
db
=
2
,
socket_timeout
=
2000
)
# 写redis
cl_id_portrait_key
=
"user:service_portrait_tags:cl_id:"
+
str
(
cl_id
)
tag_id_list_json
=
json
.
dumps
(
gmkv_tag_score_sum
)
gm_kv_cli
.
set
(
cl_id_portrait_key
,
tag_id_list_json
)
gm_kv_cli
.
expire
(
cl_id_portrait_key
,
time
=
30
*
24
*
60
*
60
)
写
redis
redis_client
=
redis
.
StrictRedis
.
from_url
(
'redis://:ReDis!GmTx*0aN9@172.16.40.173:6379'
)
redis_client
=
redis
.
StrictRedis
.
from_url
(
'redis://:ReDis!GmTx*0aN9@172.16.40.173:6379'
)
cl_id_portrait_key2
=
"user:service_portrait_tags2:cl_id:"
+
str
(
cl_id
)
cl_id_portrait_key2
=
"user:service_portrait_tags2:cl_id:"
+
str
(
cl_id
)
# 如果画像的tag个数小于5,则补充热搜词
# 如果画像的tag个数小于5,则补充热搜词
...
@@ -137,87 +131,83 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
...
@@ -137,87 +131,83 @@ def get_user_service_portrait(cl_id, all_word_tags, all_tag_tag_type, all_3tag_2
if __name__ == '__main__':
    try:
        # Collect the device ids (cl_id) seen in the tag log over the last 3 days.
        db_jerry_test = pymysql.connect(host='172.16.40.170', port=4000, user='root',
                                        passwd='3SYz54LS9#^9sBvC',
                                        db='jerry_test', charset='utf8')
        cur_jerry_test = db_jerry_test.cursor()
        sql_device_ids = "select distinct cl_id from user_new_tag_log " \
                         "where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 3 day))"
        cur_jerry_test.execute(sql_device_ids)
        device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()]
        # Close the cursor before its connection (reverse order of creation).
        cur_jerry_test.close()
        db_jerry_test.close()

        redis_client = redis.StrictRedis.from_url('redis://:ReDis!GmTx*0aN9@172.16.40.173:6379')

        # Tags for search words and their synonyms.
        all_word_tags = get_all_word_tags()
        all_tag_tag_type = get_all_tag_tag_type()
        # Mapping from level-3 tags to their level-2 tags.
        all_3tag_2tag = get_all_3tag_2tag()
        # Chinese display name for each tag id.
        all_tags_name = get_all_tags_name()

        # Cold-start portrait built from hot search words: every hot word gets a
        # fixed score of 0.2 and weight 10.
        hot_search_words = get_hot_search_words_tag()
        hot_search_words_portrait = []
        for tag_info in hot_search_words:
            hot_search_words_portrait.append({
                "tag_score": 0.2,
                "weight": 10,
                "tag2": tag_info["id"],
            })

        # Cold-start hashes keyed by tag id and by keyword text.
        coldstart_by_id_key = "user:service_coldstart_tags2"
        coldstart_by_id = {i["id"]: 0.2 for i in hot_search_words}
        redis_client.hmset(coldstart_by_id_key, coldstart_by_id)
        coldstart_by_name_key = "user:service_coldstart_tags2_name"
        coldstart_by_name = {i["keywords"]: 0.2 for i in hot_search_words}
        redis_client.hmset(coldstart_by_name_key, coldstart_by_name)

        # Hand-curated cold-start word list (overwrites the fetched one on purpose:
        # only this list feeds the "tags3" hash below).
        hot_search_words = [
            "明星娱乐", "网红扒一扒", "明星颜值打call", "颜商", "颜值高光时刻",
            "瘦脸针", "水光针", "光子嫩肤", "热玛吉", "瘦腿针", "超声刀", "瘦肩针",
            "皮秒", "果酸焕肤", "热拉提", "微针", "超皮秒", "点阵激光", "小气泡",
            "玻尿酸丰下巴", "埋线双眼皮", "纹眉", "嗨体", "溶脂针瘦脸", "黄金微针",
            "点痣", "激光祛斑", "白瓷娃娃", "除皱针注射", "微针祛痘坑", "玻尿酸",
            "大分子玻尿酸", "胶原蛋白", "肉毒素", "水杨酸", "果酸", "杏仁酸",
            "黑脸娃娃", "童颜针", "祛斑", "祛痣", "祛黑头", "祛疤", "祛痘",
            "蜂巢皮秒", "深蓝射频", "美瞳", "孕睫", "婴儿针", "三文鱼针", "少女针",
            "素颜针", "熊猫针", "脱毛", "面部提升", "嫩肤", "镭射净肤", "红蓝光",
            "清洁", "补水", "DPL", "抗衰", "针清", "美白", "冷光美白", "非剥落点阵",
            "网红抗衰", "网红整形", "网红颜值", "网红婚恋", "明星抗衰", "明星整形",
            "明星婚恋", "明星颜值",
        ]
        coldstart_words_key = "user:service_coldstart_tags3"
        coldstart_words = {i: 0.2 for i in hot_search_words}
        redis_client.hmset(coldstart_words_key, coldstart_words)

        # Publish the search-word -> synonym-tags mapping as JSON.
        search_words_synonym_tags_key = "search:words:synonym:tags"
        search_words_synonym_tags_json = json.dumps(all_word_tags)
        redis_client.set(search_words_synonym_tags_key, search_words_synonym_tags_json)

        # Distribute the per-device portrait computation over Spark.
        sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
            .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
            .set("spark.tispark.plan.allow_index_double_read", "false") \
            .set("spark.tispark.plan.allow_index_read", "true") \
            .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
            .set("spark.tispark.pd.addresses", "172.16.40.170:2379") \
            .set("spark.io.compression.codec", "lzf") \
            .set("spark.driver.maxResultSize", "8g") \
            .set("spark.sql.avro.compression.codec", "snappy")
        spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
        spark.sparkContext.setLogLevel("WARN")
        spark.sparkContext.addPyFile("/srv/apps/ffm-baseline_git/eda/smart_rank/tool.py")
        device_ids_lst_rdd = spark.sparkContext.parallelize(device_ids_lst, numSlices=1000)
        result = device_ids_lst_rdd.repartition(100).map(
            lambda x: get_user_service_portrait(x, all_word_tags, all_tag_tag_type,
                                                all_3tag_2tag, all_tags_name))
        # Trigger the lazy map; per-device results are written to Redis inside
        # get_user_service_portrait, so the collected values are discarded.
        result.collect()
    except Exception as e:
        # Include the actual error in the alert mail so failures are diagnosable
        # (the original discarded `e` entirely).
        send_email("dist_update_user_portrait_service",
                   "dist_update_user_portrait_service",
                   "dist_update_user_portrait_service failed: %s" % e)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment