Commit 0eacdc06 authored by 张彦钊

增加日记视频集合是空的判断

parent 9ef83045
...@@ -21,10 +21,13 @@ def get_video_id(): ...@@ -21,10 +21,13 @@ def get_video_id():
result = cursor.fetchall() result = cursor.fetchall()
df = pd.DataFrame(list(result)) df = pd.DataFrame(list(result))
print("videio_id 预览") print("videio_id 预览")
print(df.head(2)) print(df.head(1))
video_id = df[0].values.tolist()
db.close() db.close()
return video_id if df.empty:
return False
else:
video_id = df[0].values.tolist()
return video_id
# 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致 # 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致
...@@ -109,42 +112,50 @@ def get_score(queue_arg): ...@@ -109,42 +112,50 @@ def get_score(queue_arg):
def update_dairy_queue(score_df, predict_score_df, total_video_id):
    """Build a diary queue ordered by combined score, interleaving video diaries.

    Args:
        score_df: DataFrame with at least the columns ``cid`` and ``score``.
        predict_score_df: DataFrame with at least the columns ``cid`` and
            ``score``; its scores are added to those of ``score_df`` per cid.
        total_video_id: collection of cids that are video diaries. Any falsy
            value (empty list, ``False``, ``None``) means "no video diaries".

    Returns:
        list: cids from ``score_df``. When none of them is a video diary, the
        cids are simply sorted by descending combined score. Otherwise the
        non-video cids are sorted by descending combined score and the video
        cids (also sorted by descending combined score) are inserted at
        positions 1, 6, 11, ...; positions past the end of the list append.
    """
    diary_id = score_df["cid"].values.tolist()

    # A falsy total_video_id and an empty intersection both mean there is
    # nothing to interleave, so they share one fallback instead of two
    # duplicated branches.
    video_id = set(diary_id) & set(total_video_id) if total_video_id else set()
    if not video_id:
        return _rank_cids_by_combined_score(score_df, predict_score_df)

    not_video = set(diary_id) - video_id
    new_queue = _rank_cids_by_combined_score(
        score_df, predict_score_df, list(not_video))
    ranked_video = _rank_cids_by_combined_score(
        score_df, predict_score_df, list(video_id))

    # Place one video diary every five slots, starting right after the
    # top-ranked non-video diary.
    for offset, cid in enumerate(ranked_video):
        new_queue.insert(1 + 5 * offset, cid)
    return new_queue


def _rank_cids_by_combined_score(score_df, predict_score_df, cids=None):
    """Return cids sorted by descending ``score + predicted score``.

    When ``cids`` is given, both frames are first restricted to those cids.
    The input frames are not modified.
    """
    if cids is not None:
        score_df = score_df.loc[score_df["cid"].isin(cids)]
        predict_score_df = predict_score_df.loc[predict_score_df["cid"].isin(cids)]
    # Index both frames by cid so the addition aligns rows by cid rather than
    # by row position.
    combined = score_df.set_index(["cid"])
    predicted = predict_score_df.set_index(["cid"])
    combined["score"] = combined["score"] + predicted["score"]
    combined = combined.sort_values(by="score", ascending=False)
    return combined.index.tolist()
def update_sql_dairy_queue(queue_name, diary_id,device_id, city_id): def update_sql_dairy_queue(queue_name, diary_id,device_id, city_id):
db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306, user='doris', db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306, user='doris',
passwd='o5gbA27hXHHm', db='doris_prod') passwd='o5gbA27hXHHm', db='doris_prod')
......
...@@ -6,12 +6,12 @@ import time ...@@ -6,12 +6,12 @@ import time
def fetch_data(start_date, end_date): def fetch_data(start_date, end_date):
# 获取点击表里的device_id # 获取点击表里的device_id
sql = "select distinct device_id from data_feed_click" sql = "select distinct device_id from data_feed_click2"
click_device_id = con_sql(sql)[0].values.tolist() click_device_id = con_sql(sql)[0].values.tolist()
print("成功获取点击表里的device_id") print("成功获取点击表里的device_id")
# 获取点击表里的数据 # 获取点击表里的数据
sql = "select cid,device_id,time,stat_date from data_feed_click " \ sql = "select cid,device_id,time,stat_date from data_feed_click2 " \
"where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date) "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
click = con_sql(sql) click = con_sql(sql)
click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"}) click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
...@@ -22,7 +22,7 @@ def fetch_data(start_date, end_date): ...@@ -22,7 +22,7 @@ def fetch_data(start_date, end_date):
click = click.drop("time_date", axis=1) click = click.drop("time_date", axis=1)
# 获取曝光表里的数据 # 获取曝光表里的数据
sql = "select cid,device_id,time,stat_date from data_feed_exposure " \ sql = "select cid,device_id,time,stat_date from data_feed_exposure2 " \
"where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date) "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
start = time.time() start = time.time()
exposure = con_sql(sql) exposure = con_sql(sql)
......
...@@ -48,7 +48,7 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date): ...@@ -48,7 +48,7 @@ def feature_en(data_start_date, data_end_date, validation_date, test_date):
data["hour"] = data["hour"].astype("category") data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category") data["minute"] = data["minute"].astype("category")
# 持久化候选cid # 持久化候选cid,选预测候选集时用这个过滤
data_set_cid = data["cid"].unique() data_set_cid = data["cid"].unique()
cid_df = pd.DataFrame() cid_df = pd.DataFrame()
cid_df['cid'] = data_set_cid cid_df['cid'] = data_set_cid
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment