Commit 3ac71436 authored by 张彦钊

Fix a bug when the sample size before filtering is 0

parent c9ffad87
@@ -9,9 +9,10 @@ def filter_cid(df):
     data_set_cid = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
     print("Sample size before filtering:")
     print(df.shape)
-    df = df.loc[df["cid"].isin(data_set_cid)]
-    print("Sample size after filtering:")
-    print(df.shape)
+    if not df.empty:
+        df = df.loc[df["cid"].isin(data_set_cid)]
+        print("Sample size after filtering:")
+        print(df.shape)
     return df
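For context, a minimal standalone sketch of the guarded filter above. DIRECTORY_PATH and the data_set_cid.csv load are replaced here by an in-memory list; that stand-in is an assumption for illustration only.

```python
import pandas as pd

# Hypothetical stand-in for the cids normally loaded from data_set_cid.csv.
data_set_cid = ["diary|1", "diary|2"]

def filter_cid(df):
    print("Sample size before filtering:")
    print(df.shape)
    if not df.empty:  # the new guard: skip filtering and the "after" prints for empty input
        df = df.loc[df["cid"].isin(data_set_cid)]
        print("Sample size after filtering:")
        print(df.shape)
    return df

filter_cid(pd.DataFrame(columns=["cid"]))            # empty input: only the "before" size prints
filter_cid(pd.DataFrame({"cid": ["diary|1", "x"]}))  # non-empty input: rows filtered to known cids
```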
@@ -101,6 +101,6 @@ def ffm_transform(data, test_number, validation_number):
 if __name__ == "__main__":
     data_fe = feature_en()
-    # ffm_transform(data_fe)
+    ffm_transform(data_fe)
@@ -10,18 +10,21 @@ from userProfile import fetch_user_profile
 # Join device_id and city_id onto the corresponding city's top-diary table. Note: the feature order of the prediction set below must match the training set.
-def device_id_merge(user_profile):
+def feature_en(user_profile):
     file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(user_profile['city_id'])
     data = pd.read_csv(file_name)
     data["device_id"] = user_profile['device_id']
     now = datetime.datetime.now()
     data["hour"] = now.hour
     data["minute"] = now.minute
     data.loc[data["hour"] == 0, ["hour"]] = 24
     data.loc[data["minute"] == 0, ["minute"]] = 60
     data["hour"] = data["hour"].astype("category")
     data["minute"] = data["minute"].astype("category")
     data["y"] = 0
     data = data.drop("city_id", axis=1)
     print(data.head(2))
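A small standalone illustration of the time-feature handling above (hour 0 becomes 24, minute 0 becomes 60, then both are cast to category). The motivation is not stated in the commit; the values below are made up.

```python
import pandas as pd

data = pd.DataFrame({"hour": [0, 13], "minute": [0, 45]})
data.loc[data["hour"] == 0, ["hour"]] = 24      # midnight hour is recoded as 24
data.loc[data["minute"] == 0, ["minute"]] = 60  # minute 0 is recoded as 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
print(data)         # first row now reads hour 24, minute 60
print(data.dtypes)  # both columns are categorical
```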
@@ -37,7 +40,7 @@ def transform_ffm_format(df, device_id):
     data = ffm_format_pandas.transform(df)
     now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
-    predict_file_name = DIRECTORY_PATH + "diaryPredictSet/{0}_{1}DiaryTop2000.csv".format(device_id, now)
+    predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop2000.csv".format(device_id, now)
     data.to_csv(predict_file_name)
-    user_instance_file_path = ''
+    user_instance_file_path = predict_file_name
     return user_instance_file_path
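A quick sketch of the timestamped prediction-file path built above; DIRECTORY_PATH and device_id here are placeholders, not the project's real values.

```python
import datetime

DIRECTORY_PATH = "/data/"  # placeholder for the project's configured directory
device_id = "dev-1"        # hypothetical device id

now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop2000.csv".format(device_id, now)
print(predict_file_name)  # e.g. /data/result/dev-1_2018-07-05-14-30DiaryTop2000.csv
```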
@@ -45,15 +48,33 @@ def transform_ffm_format(df, device_id):
 # Load the model, run prediction, sort the predicted diary probabilities in descending order, and store the result in a table.
 def predict(user_profile):
-    ffm_model = xl.create_ffm()
-    user_instance = device_id_merge(user_profile)
-    user_instance_file_path = transform_ffm_format(user_instance)
+    user_instance = feature_en(user_profile)
+    user_instance_file_path = transform_ffm_format(user_instance, user_profile['device_id'])
+    ffm_model = xl.create_ffm()
     ffm_model.setTest(user_instance_file_path)
     ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DATA_START_DATE,
                                                                                   DATA_END_DATE, lr, l2_lambda),
-                      DIRECTORY_PATH + "/{0}_output.txt".format(user_profile['device_id']))
+                      DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id']))
+    upload_predict(user_profile, user_instance)
+
+
+def upload_predict(user_profile, instance):
+    probabilities = pd.read_csv(DIRECTORY_PATH +
+                                "result/{0}_output.txt".format(user_profile['device_id']), header=None)
+    probabilities = probabilities.rename(columns={0: "prob"})
+    probabilities["cid"] = instance['cid']
+    probabilities = probabilities.sort_values(by="prob", ascending=False)
+    wrapper_result(probabilities, user_profile['device_id'])
+
+
+def wrapper_result(prob, device_id):
+    prob = prob.head(500)
+    prob["url"] = prob["cid"].apply(lambda x: "http://m.igengmei.com/diary_book/" + x[x.index('|') + 1:] + '/')
+    prob.to_csv(DIRECTORY_PATH + "result/{}_feed".format(device_id))


 def router(device_id):
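A hedged sketch of the cid-to-URL mapping added in wrapper_result, assuming cids look like "diary|12345"; the exact cid format is not shown in this commit.

```python
import pandas as pd

# Assumed cid format: "<type>|<id>", e.g. "diary|12345".
prob = pd.DataFrame({"prob": [0.9, 0.7], "cid": ["diary|12345", "diary|67890"]})
prob["url"] = prob["cid"].apply(
    lambda x: "http://m.igengmei.com/diary_book/" + x[x.index('|') + 1:] + '/')
print(prob["url"].tolist())
# ['http://m.igengmei.com/diary_book/12345/', 'http://m.igengmei.com/diary_book/67890/']
```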
@@ -61,8 +82,9 @@ def router(device_id):
     if is_exist:
         predict(user_profile)
     else:
-        print('Sorry, we don\'t have you')
+        print('Sorry, we don\'t have you.')


 if __name__ == "__main__":
@@ -5,5 +5,4 @@ def fetch_user_profile(device_id):
     # TODO: the device_id in the SQL statement may correspond to multiple city_ids
     sql = "select device_id,city_id from data_feed_click limit 1"
     user_profile = con_sql(sql)
-    is_exist = user_profile.empty
-    return user_profile, is_exist
+    return user_profile, not user_profile.empty
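A sketch of the tuple contract this return value feeds into router; con_sql below is a stand-in that fakes a one-row result, not the project's real helper.

```python
import pandas as pd

def con_sql(sql):
    # Stand-in: pretend the query matched exactly one row.
    return pd.DataFrame({"device_id": ["dev-1"], "city_id": ["beijing"]})

def fetch_user_profile(device_id):
    sql = "select device_id,city_id from data_feed_click limit 1"
    user_profile = con_sql(sql)
    return user_profile, not user_profile.empty

user_profile, is_exist = fetch_user_profile("dev-1")
print(is_exist)  # True when a profile was found, matching router's `if is_exist:` branch
```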