Commit c58ac417 authored by 高雅喆

update

parent 970f1871
...@@ -146,14 +146,25 @@ def compute_ai_scan(x): ...@@ -146,14 +146,25 @@ def compute_ai_scan(x):
return 0.5 return 0.5
def get_user_tag_score(cl_id, all_log_df, all_word_tags, size=10): def get_user_tag_score(cl_id, all_word_tags, size=10):
try: try:
db_jerry_test = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db_jerry_test = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC',
db='jerry_test', charset='utf8') db='jerry_test', charset='utf8')
cur_jerry_test = db_jerry_test.cursor() cur_jerry_test = db_jerry_test.cursor()
user_log_df = all_log_df.loc[(all_log_df['cl_id'] == cl_id) & (all_log_df['action'] != 'do_search')] # 用户的非搜索行为
user_df_search = all_log_df.loc[(all_log_df['cl_id'] == cl_id) & (all_log_df['action'] == 'do_search')] user_df_service_sql = "select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log " \
"where cl_id ='{}' and action != 'do_search' ".format(cl_id)
cur_jerry_test.execute(user_df_service_sql)
user_log_df = pd.DataFrame(list(cur_jerry_test.fetchall()))
user_log_df.columns = ["time", "cl_id", "score_type", "tag_id", "tag_referrer", "action"]
# 用户的搜索行为
user_df_search_sql = "select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log " \
"where cl_id ='{}' and action = 'do_search'".format(cl_id)
cur_jerry_test.execute(user_df_search_sql)
user_df_search = pd.DataFrame(list(cur_jerry_test.fetchall()))
user_df_search.columns = ["time", "cl_id", "score_type", "tag_id", "tag_referrer", "action"]
# 搜索词转成tag # 搜索词转成tag
for index, row in user_df_search.iterrows(): for index, row in user_df_search.iterrows():
if row['tag_referrer'] in all_word_tags: if row['tag_referrer'] in all_word_tags:
...@@ -201,26 +212,13 @@ if __name__ == '__main__': ...@@ -201,26 +212,13 @@ if __name__ == '__main__':
db='jerry_test', charset='utf8') db='jerry_test', charset='utf8')
cur_jerry_test = db_jerry_test.cursor() cur_jerry_test = db_jerry_test.cursor()
# 获取所有用户的设备id
# sql_device_ids = "select distinct cl_id from user_new_tag_log"
# 获取最近30天内的用户设备id # 获取最近30天内的用户设备id
sql_device_ids = "select distinct cl_id from user_new_tag_log " \ sql_device_ids = "select distinct cl_id from user_new_tag_log " \
"where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 30 day))" "where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 30 day))"
cur_jerry_test.execute(sql_device_ids) cur_jerry_test.execute(sql_device_ids)
device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()] device_ids_lst = [i[0] for i in cur_jerry_test.fetchall()]
# 获取所有用户的行为日志
# sql_all_log = "select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log"
# 获取最近30天内的用户的所有行为
sql_all_log = "select time,cl_id,score_type,tag_id,tag_referrer,action from user_new_tag_log where cl_id in " \
"(select distinct cl_id from user_new_tag_log " \
"where time > UNIX_TIMESTAMP(DATE_SUB(NOW(), INTERVAL 30 day)))"
cur_jerry_test.execute(sql_all_log)
all_log = cur_jerry_test.fetchall()
db_jerry_test.close() db_jerry_test.close()
all_log_df = pd.DataFrame(list(all_log))
all_log_df.columns = ["time", "cl_id", "score_type","tag_id","tag_referrer","action"]
stat_date = datetime.datetime.today().strftime('%Y-%m-%d')
#搜索词及其同义词匹配tag #搜索词及其同义词匹配tag
all_word_tags = get_all_word_tags() all_word_tags = get_all_word_tags()
# rdd # rdd
...@@ -235,18 +233,9 @@ if __name__ == '__main__': ...@@ -235,18 +233,9 @@ if __name__ == '__main__':
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate() spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel("WARN") spark.sparkContext.setLogLevel("WARN")
device_ids_lst_rdd = spark.sparkContext.parallelize(device_ids_lst) device_ids_lst_rdd = spark.sparkContext.parallelize(device_ids_lst)
result = device_ids_lst_rdd.repartition(100).map(lambda x: get_user_tag_score(x, all_log_df, all_word_tags)) result = device_ids_lst_rdd.repartition(100).map(lambda x: get_user_tag_score(x, all_word_tags))
result.collect() result.collect()
# result_last = result_rename.withColumn("stat_date", lit(stat_date))
# result_last.show()
# df = result_last.select("stat_date", "cl_id", concat_ws(',', 'tag_list').alias("tag_list"))
# df.show()
# df.write.jdbc(
# mode="overwrite",
# url="jdbc:mysql://172.16.40.158:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&useSSL=true",
# table="user_portrait_tags",
# properties={"driver": 'com.mysql.jdbc.Driver'})
except Exception as e: except Exception as e:
send_email("dist_update_portrait_market", "dist_update_portrait_market", "dist_update_portrait_market") send_email("dist_update_portrait_market", "dist_update_portrait_market", "dist_update_portrait_market")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment