Commit a5881d91 authored by 赵威's avatar 赵威

get feature data

parent 223dcbc2
......@@ -35,7 +35,7 @@ def get_spark(app_name=""):
"org.apache.spark.sql.TiExtensions").config("spark.tispark.pd.addresses",
"172.16.40.170:2379").appName(app_name).enableHiveSupport().getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("WARN")
sc.setLogLevel("ERROR")
# sc.addPyFile("/srv/apps/strategy_embedding/utils/date.py")
ti = pti.TiContext(spark)
ti.tidbMapDatabase("jerry_test")
......@@ -45,7 +45,7 @@ def get_spark(app_name=""):
def get_click_data(spark, card_type, start, end):
reg = r"""^\\d+$"""
sql = """
SELECT DISTINCT t1.cl_id device_id, cast(t1.business_id as int) card_id,t1.partition_date,t1.time_stamp
SELECT DISTINCT t1.cl_id device_id, cast(t1.business_id as int) card_id,t1.partition_date
FROM
(select partition_date,cl_id,business_id,action,page_name,page_stay,time_stamp
from online.bl_hdfs_maidian_updates
......@@ -236,6 +236,127 @@ def get_exposure_data(spark, card_type, start, end):
""".format(reg, card_type, end, start, end, end, end, end, end, start, end))
def get_card_feature_df(spark, card_type, yesterday):
    """Return a DataFrame of per-card features for one content type and day.

    Joins the daily card-feature table with the content tag view
    (``content_tag3``, expected to be registered elsewhere) and the
    detail-page good-click CTR table, keeping only cards whose id is
    purely numeric.

    Args:
        spark: active SparkSession (with Hive support enabled).
        card_type: content type / page name filter, e.g. "user_post".
        yesterday: partition date string to read, e.g. "20210601".

    Returns:
        pyspark.sql.DataFrame produced by ``spark.sql``.
    """
    # Numeric-id filter; '\\d' passes through Spark SQL string parsing as '\d'.
    id_pattern = r"""^\\d+$"""
    # Placeholders are filled in order: page_name, partition_date,
    # card_content_type, card_id regex.
    query_template = """
SELECT CAST(card_id as INT) as card_id,
partition_date,
is_pure_author,
is_have_pure_reply,
is_have_reply,
CAST(card_feature.content_level as FLOAT) as content_level,
topic_seven_click_num,
topic_thirty_click_num,
topic_num,
seven_transform_num,
thirty_transform_num,
favor_num,
favor_pure_num,
vote_num,
vote_display_num,
reply_num,
reply_pure_num,
one_click_num,
three_click_num,
seven_click_num,
fifteen_click_num,
thirty_click_num,
sixty_click_num,
ninety_click_num,
history_click_num,
one_precise_exposure_num,
three_precise_exposure_num,
seven_precise_exposure_num,
fifteen_precise_exposure_num,
thirty_precise_exposure_num,
sixty_precise_exposure_num,
ninety_precise_exposure_num,
history_precise_exposure_num,
one_vote_user_num,
three_vote_user_num,
seven_vote_user_num,
fifteen_vote_user_num,
thirty_vote_user_num,
sixty_vote_user_num,
ninety_vote_user_num,
history_vote_user_num,
one_reply_user_num,
three_reply_user_num,
seven_reply_user_num,
fifteen_reply_user_num,
thirty_reply_user_num,
sixty_reply_user_num,
ninety_reply_user_num,
history_reply_user_num,
one_browse_user_num,
three_browse_user_num,
seven_browse_user_num,
fifteen_browse_user_num,
thirty_browse_user_num,
sixty_browse_user_num,
ninety_browse_user_num,
history_browse_user_num,
one_reply_num,
three_reply_num,
seven_reply_num,
fifteen_reply_num,
thirty_reply_num,
sixty_reply_num,
ninety_reply_num,
history_reply_num,
one_ctr,
three_ctr,
seven_ctr,
fifteen_ctr,
thirty_ctr,
sixty_ctr,
ninety_ctr,
history_ctr,
one_vote_pure_rate,
three_vote_pure_rate,
seven_vote_pure_rate,
fifteen_vote_pure_rate,
thirty_vote_pure_rate,
sixty_vote_pure_rate,
ninety_vote_pure_rate,
history_vote_pure_rate,
one_reply_pure_rate,
three_reply_pure_rate,
seven_reply_pure_rate,
fifteen_reply_pure_rate,
thirty_reply_pure_rate,
sixty_reply_pure_rate,
ninety_reply_pure_rate,
history_reply_pure_rate,
CAST(IFNULL(content2.goodclick_7, 0.0) AS DOUBLE) AS goodclick_7,
CAST(IFNULL(content2.goodclick_14, 0.0) AS DOUBLE) AS goodclick_14,
CAST(IFNULL(content2.goodclick_30, 0.0) AS DOUBLE) AS goodclick_30,
CAST(IFNULL(content2.goodclick_60, 0.0) AS DOUBLE) AS goodclick_60,
CAST(IFNULL(content2.goodclick_90, 0.0) AS DOUBLE) AS goodclick_90,
content_tag3.smart_rank_score smart_rank_score,
IFNULL(content_tag3.first_demands, "") AS card_first_demands,
IFNULL(content_tag3.second_demands, "") AS card_second_demands,
IFNULL(content_tag3.first_solutions, "") AS card_first_solutions,
IFNULL(content_tag3.second_solutions, "") AS card_second_solutions,
IFNULL(content_tag3.first_positions, "") AS card_first_positions,
IFNULL(content_tag3.second_positions, "") AS card_second_positions,
IFNULL(content_tag3.projects, "") AS card_projects
FROM
online.al_community_forecast_character_day_v3 card_feature
JOIN content_tag3
ON card_feature.card_id = content_tag3.id
LEFT JOIN
(select * from online.al_community_detail_page_goodclick_ctr
where page_name = '{}'
) AS content2
ON card_feature.card_id = content2.business_id
where partition_date = '{}'
and card_content_type = '{}'
and card_id rlike '{}'
"""
    query = query_template.format(card_type, yesterday, card_type, id_pattern)
    return spark.sql(query)
if __name__ == "__main__":
spark = get_spark("dssm_tractate_data")
card_type = "user_post"
......@@ -247,4 +368,7 @@ if __name__ == "__main__":
exposure_df = get_exposure_data(spark, card_type, start, end)
exposure_df.show(5, False)
tractate_feature_df = get_card_feature_df(spark, card_type, end)
tractate_feature_df.show(5, False)
# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/dssm/get_tractate_data.py
......@@ -35,7 +35,8 @@ def get_spark(app_name=""):
"spark.sql.extensions",
"org.apache.spark.sql.TiExtensions").config("spark.tispark.pd.addresses",
"172.16.40.170:2379").appName(app_name).enableHiveSupport().getOrCreate()
# sc = spark.sparkContext
sc = spark.sparkContext
sc.setLogLevel("ERROR")
# sc.addPyFile("/srv/apps/strategy_embedding/utils/date.py")
ti = pti.TiContext(spark)
ti.tidbMapDatabase("jerry_test")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment