from config import * import pandas as pd import pickle import xlearn as xl from userProfile import * import time from utils import * import os # 将device_id、city_id拼接到对应的城市热门日记表。注意:下面预测集特征顺序要与训练集保持一致 def feature_en(user_profile): file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(user_profile['city_id']) data = pd.read_csv(file_name) data["device_id"] = user_profile['device_id'] now = datetime.now() data["hour"] = now.hour data["minute"] = now.minute data.loc[data["hour"] == 0, ["hour"]] = 24 data.loc[data["minute"] == 0, ["minute"]] = 60 data["hour"] = data["hour"].astype("category") data["minute"] = data["minute"].astype("category") # 虽然预测y,但ffm转化需要y,并不影响预测结果 data["y"] = 0 data = data.drop("city_id", axis=1) print(data.head(1)) print("特征工程处理结束") return data # 把ffm.pkl load进来,将上面的表转化为ffm格式 def transform_ffm_format(df, device_id): with open(DIRECTORY_PATH+"ffm.pkl","rb") as f: ffm_format_pandas = pickle.load(f) data = ffm_format_pandas.transform(df) now = datetime.now().strftime("%Y-%m-%d-%H-%M") predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop3000.csv".format(device_id, now) data.to_csv(predict_file_name, index=False,header=None) print("成功将ffm预测文件写到服务器") return predict_file_name # 将模型加载,预测,把预测日记的概率值按照降序排序,存到一个表里 def predict(user_profile): instance = feature_en(user_profile) instance_file_path = transform_ffm_format(instance, user_profile["device_id"]) ffm_model = xl.create_ffm() ffm_model.setTest(instance_file_path) ffm_model.setSigmoid() ffm_model.predict(DIRECTORY_PATH + "model.out", DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id'])) print("该用户预测结束") predict_save_to_local(user_profile, instance) #TODO 没有提供生产环境的redis地址,所以这个函数先不运行 # predict_save_to_redis(user_profile, instance) # 将预测结果与device_id 进行拼接,并按照概率降序排序 def wrapper_result(user_profile, instance): proba = pd.read_csv(DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id']), header=None) proba = proba.rename(columns={0: "prob"}) proba["cid"] = instance['cid'] proba = proba.sort_values(by="prob", ascending=False) proba = proba.head(50) return proba # 预测候选集保存到本地 def predict_save_to_local(user_profile, instance): proba = wrapper_result(user_profile, instance) proba.loc[:, "url"] = proba["cid"].apply(lambda x: "http://m.igengmei.com/diary_book/" + str(x[6:]) + '/') proba.to_csv(DIRECTORY_PATH + "result/feed_{}".format(user_profile['device_id']), index=False) print("成功将预测候选集保存到本地") # 预测候选集保存到redis def predict_save_to_redis(user_profile, instance): device_id = user_profile['device_id'] cid_list = wrapper_result(user_profile, instance)["cid"].values.tolist() add_data_to_redis(device_id,cid_list) print("成功将预测候选集保存到redis") def router(device_id): user_profile, not_exist = fetch_user_profile(device_id) if not_exist: print('Sorry, we don\'t have you.') else: predict(user_profile) if __name__ == "__main__": # TODO 如果耗时小于一分钟,下一次取到的device_id和上一次相同 while True: empty,device_id_list = get_active_users() if empty: for eachFile in os.listdir("/tmp"): if "xlearn" in eachFile: os.remove("/tmp" + "/" + eachFile) time.sleep(58) else: old_device_id_list = pd.read_csv(DIRECTORY_PATH + "data_set_device_id.csv")["device_id"].values.tolist() for device_id in device_id_list: if device_id in old_device_id_list: start = time.time() router(device_id) end = time.time() print("该用户预测耗时{}秒".format(end - start)) else: print("该用户不是老用户,不能预测")