Commit 3ac71436 authored by 张彦钊's avatar 张彦钊

解决过滤前样本是0的bug

parent c9ffad87
......@@ -9,9 +9,10 @@ def filter_cid(df):
data_set_cid = pd.read_csv(DIRECTORY_PATH + "data_set_cid.csv")["cid"].values.tolist()
print("过滤前样本大小:")
print(df.shape)
df = df.loc[df["cid"].isin(data_set_cid)]
print("过滤后样本大小:")
print(df.shape)
if not df.empty:
df = df.loc[df["cid"].isin(data_set_cid)]
print("过滤后样本大小:")
print(df.shape)
return df
......
......@@ -101,6 +101,6 @@ def ffm_transform(data, test_number, validation_number):
if __name__ == "__main__":
data_fe = feature_en()
# ffm_transform(data_fe)
ffm_transform(data_fe)
......@@ -10,18 +10,21 @@ from userProfile import fetch_user_profile
# 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致
def device_id_merge(user_profile):
def feature_en(user_profile):
file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(user_profile['city_id'])
data = pd.read_csv(file_name)
data["device_id"] = user_profile['device_id']
now = datetime.datetime.now()
data["hour"] = now.hour
data["minute"] = now.minute
data.loc[data["hour"] == 0, ["hour"]] = 24
data.loc[data["minute"] == 0, ["minute"]] = 60
data["hour"] = data["hour"].astype("category")
data["minute"] = data["minute"].astype("category")
data["y"] = 0
data = data.drop("city_id", axis=1)
print(data.head(2))
......@@ -37,7 +40,7 @@ def transform_ffm_format(df, device_id):
data = ffm_format_pandas.transform(df)
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
predict_file_name = DIRECTORY_PATH + "diaryPredictSet/{0}_{1}DiaryTop2000.csv".format(device_id, now)
predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop2000.csv".format(device_id, now)
data.to_csv(predict_file_name)
user_instance_file_path = ''
return user_instance_file_path
......@@ -45,15 +48,33 @@ def transform_ffm_format(df, device_id):
# Load the model, predict, and store the predicted diary probabilities
# sorted in descending order.
def predict(user_profile):
    """Score the candidate diaries for one user and publish the ranking.

    Builds the user's feature frame, converts it to the FFM input file,
    runs the trained model against that file and passes the scores on to
    ``upload_predict``.

    Args:
        user_profile: Mapping-like profile; ``user_profile['device_id']``
            is read here, and the whole object is forwarded to
            ``feature_en`` and ``upload_predict``.
    """
    device_id = user_profile['device_id']
    user_instance = feature_en(user_profile)
    # transform_ffm_format is declared as (df, device_id); the device id is
    # required, so it must be passed explicitly.
    user_instance_file_path = transform_ffm_format(user_instance, device_id)
    ffm_model = xl.create_ffm()
    ffm_model.setTest(user_instance_file_path)
    # Scores are written under "result/", the same location that
    # upload_predict reads them back from.
    ffm_model.predict(
        DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(
            DATA_START_DATE, DATA_END_DATE, lr, l2_lambda),
        DIRECTORY_PATH + "result/{0}_output.txt".format(device_id))
    upload_predict(user_profile, user_instance)
def upload_predict(user_profile, instance):
    """Read the model output for one device, rank it and pass it on.

    Loads ``result/<device_id>_output.txt`` (one value per row, no header),
    names that column ``prob``, attaches the ``cid`` column from
    ``instance``, sorts by ``prob`` in descending order and calls
    ``wrapper_result`` with the sorted frame.

    Args:
        user_profile: Mapping-like profile; only ``device_id`` is read.
        instance: Frame holding a ``cid`` column for the scored rows.
    """
    device_id = user_profile['device_id']
    probabilities = pd.read_csv(
        DIRECTORY_PATH + "result/{0}_output.txt".format(device_id),
        header=None)
    probabilities = probabilities.rename(columns={0: "prob"})
    # Column assignment aligns on index labels.
    # NOTE(review): presumably both frames carry the default positional
    # index so row i of the output matches row i of `instance` - confirm.
    probabilities["cid"] = instance['cid']
    # Column 0 was renamed above, so the sort key is "prob"; the previous
    # key "0" matched no column.
    probabilities = probabilities.sort_values(by="prob", ascending=False)
    wrapper_result(probabilities, device_id)
def wrapper_result(prob, device_id):
    """Keep the first 500 rows, add a diary URL per row and write the feed.

    The URL is built from the part of each ``cid`` value that follows its
    first ``'|'``. The result is written as CSV to
    ``result/<device_id>_feed`` under ``DIRECTORY_PATH``.

    Args:
        prob: Frame with a string ``cid`` column; rows are taken in the
            order given.
        device_id: Identifier used in the output file name.

    Raises:
        ValueError: If a ``cid`` value contains no ``'|'``.
    """
    # Copy so the new column is added to an independent frame rather than
    # to a slice of the caller's frame.
    prob = prob.head(500).copy()
    # Slice the cid string itself (x), not the frame, to get the diary id.
    prob["url"] = prob["cid"].apply(
        lambda x: "http://m.igengmei.com/diary_book/" + x[x.index('|') + 1:] + '/')
    prob.to_csv(DIRECTORY_PATH + "result/{}_feed".format(device_id))
def router(device_id):
......@@ -61,8 +82,9 @@ def router(device_id):
if is_exist:
predict(user_profile)
else:
print('Sorry, we don\'t have you')
print('Sorry, we don\'t have you.')
if __name__ == "__main__":
......
......@@ -5,5 +5,4 @@ def fetch_user_profile(device_id):
# TODO sql语句中的device_id可能对应多个city_id
sql = "select device_id,city_id from data_feed_click limit 1"
user_profile = con_sql(sql)
is_exist = user_profile.empty
return user_profile, is_exist
return user_profile, user_profile.empty
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment