from config import * import pandas as pd import pickle import xlearn as xl from userProfile import * import time from utils import * # 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致 def feature_en(user_profile): file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(user_profile['city_id']) data = pd.read_csv(file_name) data["device_id"] = user_profile['device_id'] now = datetime.now() data["hour"] = now.hour data["minute"] = now.minute data.loc[data["hour"] == 0, ["hour"]] = 24 data.loc[data["minute"] == 0, ["minute"]] = 60 data["hour"] = data["hour"].astype("category") data["minute"] = data["minute"].astype("category") # 虽然预测y,但ffm转化需要y,并不影响预测结果 data["y"] = 0 data = data.drop("city_id", axis=1) print(data.head(1)) print("特征工程处理结束") return data # 把ffm.pkl load进来,将上面的表转化为ffm格式 def transform_ffm_format(df, device_id): file_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE) with open(file_path, "rb") as f: ffm_format_pandas = pickle.load(f) data = ffm_format_pandas.transform(df) now = datetime.now().strftime("%Y-%m-%d-%H-%M") print("ffm格式转化结束") predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop3000.csv".format(device_id, now) data.to_csv(predict_file_name, index=False,header=None) return predict_file_name # 将模型加载,预测,把预测日记的概率值按照降序排序,存到一个表里 def predict(user_profile): instance = feature_en(user_profile) instance_file_path = transform_ffm_format(instance, user_profile["device_id"]) ffm_model = xl.create_ffm() ffm_model.setTest(instance_file_path) ffm_model.setSigmoid() ffm_model.predict(DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(DATA_START_DATE, DATA_END_DATE, lr, l2_lambda), DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id'])) print("预测结束") predict_save_to_local(user_profile, instance) #TODO 没有提供生产环境的redis地址,所以这个函数先不运行 # predict_save_to_redis(user_profile, instance) # 将预测结果与device_id 进行拼接,并按照概率降序排序 def wrapper_result(user_profile, instance): proba = pd.read_csv(DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id']), header=None) proba = proba.rename(columns={0: "prob"}) proba["cid"] = instance['cid'] proba = proba.sort_values(by="prob", ascending=False) proba = proba.head(50) return proba # 预测候选集保存到本地 def predict_save_to_local(user_profile, instance): proba = wrapper_result(user_profile, instance) proba.loc[:, "url"] = proba["cid"].apply(lambda x: "http://m.igengmei.com/diary_book/" + str(x[6:]) + '/') proba.to_csv(DIRECTORY_PATH + "result/feed_{}".format(user_profile['device_id']), index=False) print("成功将预测候选集保存到本地") # 预测候选集保存到redis def predict_save_to_redis(user_profile, instance): device_id = user_profile['device_id'] cid_list = wrapper_result(user_profile, instance)["cid"].values.tolist() add_data_to_redis(device_id,cid_list) print("成功将预测候选集保存到redis") def router(device_id): user_profile, not_exist = fetch_user_profile(device_id) if not_exist: print('Sorry, we don\'t have you.') else: predict(user_profile) if __name__ == "__main__": # TODO 如果耗时小于一分钟,下一次取到的device_id和上一次相同 while True: start = time.time() empty,device_id_list = get_active_users() if empty: time.sleep(60) else: old_device_id_list = pd.read_csv(DIRECTORY_PATH + "data_set_device_id.csv")["device_id"].values.tolist() for device_id in device_id_list: if device_id in old_device_id_list: router(device_id) else: print("该用户不是老用户,不能预测") end = time.time() time_cost = (end - start) print("耗时{}秒".format(time_cost))