update

729383ca · 高雅喆 · 82ed7086 · 729383ca
Commit 729383ca authored Dec 06, 2019 by 高雅喆
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 4 deletions

stat_device_order_portrait_score.py eda/smart_rank/stat_device_order_portrait_score.py +7 -4

No files found.
--- a/eda/smart_rank/stat_device_order_portrait_score.py
+++ b/eda/smart_rank/stat_device_order_portrait_score.py
@@ -25,6 +25,7 @@ def get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag,
    cl_id = x[1]
    order_tag_id = x[2]
    order_tag_id_score = 0.0
+    tag_position = -1
    user_df_service = get_user_log(cl_id, all_word_tags, pay_time=pay_time)

    # 增加df字段(days_diff_now, tag_type, tag2)
@@ -56,10 +57,12 @@ def get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag,
            )
            gmkv_tag_score2_sum = tag_score_sum[["tag2", "tag_score"]][:size].to_dict('record')
            gmkv_tag_score2_sum_dict = {i["tag2"]: i["tag_score"] for i in gmkv_tag_score2_sum}
+            tag_list = [i['tag2'] for i in gmkv_tag_score2_sum]
            order_tag_id_score = gmkv_tag_score2_sum_dict.get(int(order_tag_id), 0.0)
-            return pay_time, cl_id, order_tag_id, order_tag_id_score
+            tag_position = tag_list.index(int(order_tag_id))+1 if int(order_tag_id) in tag_list else -1
+            return pay_time, cl_id, order_tag_id, order_tag_id_score, tag_position
    else:
-        return pay_time, cl_id, order_tag_id, 0.0
+        return pay_time, cl_id, order_tag_id, 0.0, -1


 # 获取近一个月设备下单的时间、设备id、标签id
@@ -105,6 +108,6 @@ device_ids_lst_rdd = spark.sparkContext.parallelize(device_info)
 result = device_ids_lst_rdd.repartition(100).map(lambda x: get_user_service_portrait(x, all_word_tags, all_tag_tag_type, all_3tag_2tag, all_tags_name, size=None)).filter(lambda x: x is not None)
 print(result.count())
 print(result.take(10))
-df = spark.createDataFrame(result).na.drop().toDF("pay_time", "cl_id", "order_tag_id", "order_tag_id_score").na.drop().toPandas()
-df.to_csv("~/gyz/log/stat_device_order_portrait_score_1106_1206.csv", index=False)
+df = spark.createDataFrame(result).na.drop().toDF("pay_time", "cl_id", "order_tag_id", "order_tag_id_score", "tag_index").na.drop().toPandas()
+df.to_csv("~/gyz/log/stat_device_order_portrait_score_1106_1206_v2.csv", index=False)
 spark.stop()