diff --git a/eda/smart_rank/stat_device_order_portrait_score.py b/eda/smart_rank/stat_device_order_portrait_score.py
index ad3bbdfa45b4bebe5cec0e107c27251a010e9e3b..93e953429c9782e7ab677afae37fa8ec3cdb31c2 100644
--- a/eda/smart_rank/stat_device_order_portrait_score.py
+++ b/eda/smart_rank/stat_device_order_portrait_score.py
@@ -25,6 +25,7 @@ def get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag,
     cl_id = x[1]
     order_tag_id = x[2]
     order_tag_id_score = 0.0
+    tag_position = -1
     user_df_service = get_user_log(cl_id, all_word_tags, pay_time=pay_time)
     # add DataFrame fields (days_diff_now, tag_type, tag2)

@@ -56,10 +57,12 @@ def get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag,
         )
         gmkv_tag_score2_sum = tag_score_sum[["tag2", "tag_score"]][:size].to_dict('record')
         gmkv_tag_score2_sum_dict = {i["tag2"]: i["tag_score"] for i in gmkv_tag_score2_sum}
+        tag_list = [i['tag2'] for i in gmkv_tag_score2_sum]
         order_tag_id_score = gmkv_tag_score2_sum_dict.get(int(order_tag_id), 0.0)
-        return pay_time, cl_id, order_tag_id, order_tag_id_score
+        tag_position = tag_list.index(int(order_tag_id)) + 1 if int(order_tag_id) in tag_list else -1
+        return pay_time, cl_id, order_tag_id, order_tag_id_score, tag_position
     else:
-        return pay_time, cl_id, order_tag_id, 0.0
+        return pay_time, cl_id, order_tag_id, 0.0, -1


# Collect the order time, device id, and tag id for devices that placed orders in the past month
@@ -105,6 +108,6 @@ device_ids_lst_rdd = spark.sparkContext.parallelize(device_info)
 result = device_ids_lst_rdd.repartition(100).map(lambda x: get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag, all_tags_name, size=None)).filter(lambda x: x is not None)
 print(result.count())
 print(result.take(10))
-df = spark.createDataFrame(result).na.drop().toDF("pay_time", "cl_id", "order_tag_id", "order_tag_id_score").na.drop().toPandas()
-df.to_csv("~/gyz/log/stat_device_order_portrait_score_1106_1206.csv", index=False)
+df = spark.createDataFrame(result).na.drop().toDF("pay_time", "cl_id", "order_tag_id", "order_tag_id_score", "tag_index").na.drop().toPandas()
+df.to_csv("~/gyz/log/stat_device_order_portrait_score_1106_1206_v2.csv", index=False)
 spark.stop()
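
For review context, here is a minimal standalone sketch of the ranking logic this patch adds: `tag_position` is the 1-based rank of the ordered tag within the user's score-sorted portrait tags, or -1 when the tag is absent, which is what lands in the new `tag_index` CSV column. The helper name `rank_of_order_tag` and the demo data below are illustrative, not part of the patch; the sketch also spells the orient as `to_dict("records")`, since the `'record'` spelling on the context line relies on the orient-abbreviation matching that older pandas allowed and current pandas rejects.

```python
import pandas as pd

def rank_of_order_tag(tag_score_sum: pd.DataFrame, order_tag_id: int, size=None):
    """Return (score, 1-based rank) of order_tag_id within the top-`size`
    portrait tags; (0.0, -1) when the tag is not present."""
    # Mirrors the patched code: take the top-`size` rows of the
    # score-sorted frame ([:None] keeps every row, as in the script).
    top = tag_score_sum[["tag2", "tag_score"]][:size].to_dict("records")
    score_by_tag = {row["tag2"]: row["tag_score"] for row in top}
    tag_list = [row["tag2"] for row in top]
    score = score_by_tag.get(order_tag_id, 0.0)
    # list.index is 0-based, hence the +1 to get a human-readable rank.
    position = tag_list.index(order_tag_id) + 1 if order_tag_id in tag_list else -1
    return score, position

# Example: tag 12 ranks 2nd in the portrait; tag 99 is absent.
demo = pd.DataFrame({"tag2": [7, 12, 3], "tag_score": [0.9, 0.6, 0.1]})
print(rank_of_order_tag(demo, 12))  # (0.6, 2)
print(rank_of_order_tag(demo, 99))  # (0.0, -1)
```

The sentinel -1 on both early-exit paths matches the `tag_position = -1` initializer the patch adds at the top of `get_user_service_portrait`, so every returned tuple has a well-defined fifth field before `createDataFrame` names it `tag_index`.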