get feature data

a5881d91 · 赵威 · 223dcbc2 · a5881d91 · a5881d91
Commit a5881d91 authored Nov 05, 2020 by 赵威
Hide whitespace changes
Inline Side-by-side

Showing with 128 additions and 3 deletions

get_tractate_data.py dssm/get_tractate_data.py +126 -2

tractate.py word_vector/tractate.py +2 -1

No files found.
--- a/dssm/get_tractate_data.py
+++ b/dssm/get_tractate_data.py
@@ -35,7 +35,7 @@ def get_spark(app_name=""):
        "org.apache.spark.sql.TiExtensions").config("spark.tispark.pd.addresses",
                                                    "172.16.40.170:2379").appName(app_name).enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
-    sc.setLogLevel("WARN")
+    sc.setLogLevel("ERROR")
    # sc.addPyFile("/srv/apps/strategy_embedding/utils/date.py")
    ti = pti.TiContext(spark)
    ti.tidbMapDatabase("jerry_test")
@@ -45,7 +45,7 @@ def get_spark(app_name=""):
 def get_click_data(spark, card_type, start, end):
    reg = r"""^\\d+$"""
    sql = """
-        SELECT DISTINCT t1.cl_id device_id, cast(t1.business_id as int) card_id,t1.partition_date,t1.time_stamp
+        SELECT DISTINCT t1.cl_id device_id, cast(t1.business_id as int) card_id,t1.partition_date
          FROM
          (select partition_date,cl_id,business_id,action,page_name,page_stay,time_stamp
          from online.bl_hdfs_maidian_updates
@@ -236,6 +236,127 @@ def get_exposure_data(spark, card_type, start, end):
      """.format(reg, card_type, end, start, end, end, end, end, end, start, end))


+def get_card_feature_df(spark, card_type, yesterday):  # Build the card-feature SQL for one partition_date / card type and return spark.sql(sql).
+    reg = r"""^\\d+$"""  # Pattern substituted into the `card_id rlike` filter (last format placeholder).
+    sql = """
+          SELECT CAST(card_id as INT) as card_id,
+               partition_date,
+               is_pure_author,
+               is_have_pure_reply,
+               is_have_reply,
+               CAST(card_feature.content_level as FLOAT) as content_level,
+               topic_seven_click_num,
+               topic_thirty_click_num,
+               topic_num,
+               seven_transform_num,
+               thirty_transform_num,
+               favor_num,
+               favor_pure_num,
+               vote_num,
+               vote_display_num,
+               reply_num,
+               reply_pure_num,
+               one_click_num,
+               three_click_num,
+               seven_click_num,
+               fifteen_click_num,
+               thirty_click_num,
+               sixty_click_num,
+               ninety_click_num,
+               history_click_num,
+               one_precise_exposure_num,
+               three_precise_exposure_num,
+               seven_precise_exposure_num,
+               fifteen_precise_exposure_num,
+               thirty_precise_exposure_num,
+               sixty_precise_exposure_num,
+               ninety_precise_exposure_num,
+               history_precise_exposure_num,
+               one_vote_user_num,
+               three_vote_user_num,
+               seven_vote_user_num,
+               fifteen_vote_user_num,
+               thirty_vote_user_num,
+               sixty_vote_user_num,
+               ninety_vote_user_num,
+               history_vote_user_num,
+               one_reply_user_num,
+               three_reply_user_num,
+               seven_reply_user_num,
+               fifteen_reply_user_num,
+               thirty_reply_user_num,
+               sixty_reply_user_num,
+               ninety_reply_user_num,
+               history_reply_user_num,
+               one_browse_user_num,
+               three_browse_user_num,
+               seven_browse_user_num,
+               fifteen_browse_user_num,
+               thirty_browse_user_num,
+               sixty_browse_user_num,
+               ninety_browse_user_num,
+               history_browse_user_num,
+               one_reply_num,
+               three_reply_num,
+               seven_reply_num,
+               fifteen_reply_num,
+               thirty_reply_num,
+               sixty_reply_num,
+               ninety_reply_num,
+               history_reply_num,
+               one_ctr,
+               three_ctr,
+               seven_ctr,
+               fifteen_ctr,
+               thirty_ctr,
+               sixty_ctr,
+               ninety_ctr,
+               history_ctr,
+               one_vote_pure_rate,
+               three_vote_pure_rate,
+               seven_vote_pure_rate,
+               fifteen_vote_pure_rate,
+               thirty_vote_pure_rate,
+               sixty_vote_pure_rate,
+               ninety_vote_pure_rate,
+               history_vote_pure_rate,
+               one_reply_pure_rate,
+               three_reply_pure_rate,
+               seven_reply_pure_rate,
+               fifteen_reply_pure_rate,
+               thirty_reply_pure_rate,
+               sixty_reply_pure_rate,
+               ninety_reply_pure_rate,
+               history_reply_pure_rate,
+               CAST(IFNULL(content2.goodclick_7, 0.0) AS DOUBLE) AS goodclick_7,
+               CAST(IFNULL(content2.goodclick_14, 0.0) AS DOUBLE) AS goodclick_14,
+               CAST(IFNULL(content2.goodclick_30, 0.0) AS DOUBLE) AS goodclick_30,
+               CAST(IFNULL(content2.goodclick_60, 0.0) AS DOUBLE) AS goodclick_60,
+               CAST(IFNULL(content2.goodclick_90, 0.0) AS DOUBLE) AS goodclick_90,
+               content_tag3.smart_rank_score smart_rank_score,
+               IFNULL(content_tag3.first_demands, "") AS card_first_demands,
+               IFNULL(content_tag3.second_demands, "") AS card_second_demands,
+               IFNULL(content_tag3.first_solutions, "") AS card_first_solutions,
+               IFNULL(content_tag3.second_solutions, "") AS card_second_solutions,
+               IFNULL(content_tag3.first_positions, "") AS card_first_positions,
+               IFNULL(content_tag3.second_positions, "") AS card_second_positions,
+               IFNULL(content_tag3.projects, "") AS card_projects
+          FROM
+          online.al_community_forecast_character_day_v3 card_feature
+          JOIN content_tag3
+          ON card_feature.card_id = content_tag3.id
+          LEFT JOIN
+          (select * from online.al_community_detail_page_goodclick_ctr
+          where page_name = '{}'
+          ) AS content2
+          ON card_feature.card_id = content2.business_id
+          where partition_date = '{}'
+           and card_content_type = '{}'
+           and card_id rlike '{}'
+       """.format(card_type, yesterday, card_type, reg)  # Placeholder order: page_name, partition_date, card_content_type, rlike pattern. NOTE(review): card_type fills both page_name and card_content_type - confirm both columns use the same values.
+    return spark.sql(sql)  # NOTE(review): partition_date and card_id are unqualified in a three-way join that includes a `select *` subquery - confirm they cannot be ambiguous.
+
+
 if __name__ == "__main__":
    spark = get_spark("dssm_tractate_data")
    card_type = "user_post"
@@ -247,4 +368,7 @@ if __name__ == "__main__":
    exposure_df = get_exposure_data(spark, card_type, start, end)
    exposure_df.show(5, False)

+    tractate_feature_df = get_card_feature_df(spark, card_type, end)
+    tractate_feature_df.show(5, False)
+
 # spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/dssm/get_tractate_data.py
--- a/word_vector/tractate.py
+++ b/word_vector/tractate.py
@@ -35,7 +35,8 @@ def get_spark(app_name=""):
        "spark.sql.extensions",
        "org.apache.spark.sql.TiExtensions").config("spark.tispark.pd.addresses",
                                                    "172.16.40.170:2379").appName(app_name).enableHiveSupport().getOrCreate()
-    # sc = spark.sparkContext
+    sc = spark.sparkContext
+    sc.setLogLevel("ERROR")
    # sc.addPyFile("/srv/apps/strategy_embedding/utils/date.py")
    ti = pti.TiContext(spark)
    ti.tidbMapDatabase("jerry_test")