Commit 17f114c6 authored by 张彦钊

add codes in test file

parent eeae786a
import pymysql import pymysql
import pandas as pd import pandas as pd
from datetime import datetime from datetime import datetime
import time
from config import *
import pickle
import xlearn as xl
def con_sql(sql): def con_sql(sql):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
...@@ -31,6 +35,104 @@ def get_active_users(): ...@@ -31,6 +35,104 @@ def get_active_users():
# 为了debug supervisor,修改了下面的return参数 # 为了debug supervisor,修改了下面的return参数
return "0",device_id_list return "0",device_id_list
def fetch_user_profile(device_id):
    """Fetch the device_id / city_id row for one device.

    Issues a single-row query against ``data_feed_click`` via ``con_sql``.

    Returns a ``(profile, status)`` tuple:

    * no row found: ``({1: 2}, 1)`` -- a placeholder dict plus the integer
      ``1``.
    * row found: a dict mapping each column label of the renamed result
      frame to its value in row 0, plus the string ``"0"``.
    """
    # NOTE(review): device_id is interpolated straight into the SQL text;
    # confirm that callers only ever pass trusted ids.
    query = "select device_id,city_id from data_feed_click where device_id = '{0}' limit 1".format(device_id)
    result = con_sql(query)

    if not result.empty:
        result = result.rename(columns={0: "device_id", 1: "city_id"})
        print("成功获取该用户对应的city_id")
        profile = {label: result.loc[0, label] for label in result.columns}
        # The return values were changed in order to debug supervisor.
        return profile, "0"

    print("没有获取到该用户对应的city_id")
    # The return values were changed in order to debug supervisor.
    return {1: 2}, 1
def feature_en(user_profile):
    """Build the prediction feature frame for one user.

    Reads ``diaryTestSet/<city_id>DiaryTop3000.csv`` under DIRECTORY_PATH,
    then adds the columns ``device_id``, ``hour``, ``minute`` and ``y`` and
    drops ``city_id``.

    Args:
        user_profile: mapping that provides ``'city_id'`` and ``'device_id'``.

    Returns:
        The resulting DataFrame; ``hour`` and ``minute`` are categorical.
    """
    candidates_path = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(user_profile['city_id'])
    frame = pd.read_csv(candidates_path)
    frame["device_id"] = user_profile['device_id']

    # A zero hour / minute is stored as 24 / 60 instead of 0.
    current = datetime.now()
    hour_value = 24 if current.hour == 0 else current.hour
    minute_value = 60 if current.minute == 0 else current.minute
    frame["hour"] = hour_value
    frame["minute"] = minute_value
    for time_column in ("hour", "minute"):
        frame[time_column] = frame[time_column].astype("category")

    # y is what gets predicted, but the FFM conversion still needs a y
    # column; this placeholder does not affect the prediction result.
    frame["y"] = 0
    frame = frame.drop("city_id", axis=1)

    print(frame.head(1))
    print("特征工程处理结束")
    return frame
def transform_ffm_format(df, device_id):
    """Convert a feature frame to FFM format and write it to a CSV file.

    Loads the pickled converter ``ffm_<DATA_START_DATE>_<DATA_END_DATE>.pkl``
    from DIRECTORY_PATH, applies its ``transform`` to ``df`` and writes the
    result, without index or header, under ``result/``.

    Args:
        df: feature frame to convert.
        device_id: used, together with a minute-resolution timestamp, to
            name the output file.

    Returns:
        The path of the file that was written.
    """
    converter_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE)
    # NOTE(review): pickle.load runs whatever the file contains; this is
    # only safe while the .pkl is produced by a trusted job.
    with open(converter_path, "rb") as handle:
        converter = pickle.load(handle)

    ffm_rows = converter.transform(df)
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
    print("ffm格式转化结束")

    output_path = DIRECTORY_PATH + "result/{0}_{1}DiaryTop3000.csv".format(device_id, timestamp)
    ffm_rows.to_csv(output_path, index=False,header=None)
    print("ffm写到服务器")
    return output_path
def wrapper_result(user_profile, instance):
    """Pair predicted probabilities with their cids and keep the top 50.

    Reads ``result/<device_id>_output.txt`` under DIRECTORY_PATH (no header
    row), names column 0 ``prob``, attaches ``instance['cid']`` as ``cid``
    and sorts by ``prob`` in descending order.

    Returns:
        A DataFrame holding at most the 50 highest-probability rows.
    """
    output_path = DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id'])
    scores = pd.read_csv(output_path, header=None).rename(columns={0: "prob"})
    scores["cid"] = instance['cid']
    return scores.sort_values(by="prob", ascending=False).head(50)
def predict_save_to_local(user_profile, instance):
    """Save the top predicted candidates, with a URL column, to a local file.

    Takes the frame returned by ``wrapper_result``, derives a ``url`` column
    from ``cid`` and writes the frame (without index) to
    ``result/feed_<device_id>`` under DIRECTORY_PATH.
    """
    def build_url(cid):
        # Only the part of cid after its first six characters goes into
        # the URL -- presumably a fixed-length prefix; confirm the format.
        return "http://m.igengmei.com/diary_book/" + str(cid[6:]) + '/'

    top_candidates = wrapper_result(user_profile, instance)
    top_candidates.loc[:, "url"] = top_candidates["cid"].apply(build_url)

    feed_path = DIRECTORY_PATH + "result/feed_{}".format(user_profile['device_id'])
    top_candidates.to_csv(feed_path, index=False)
    print("成功将预测候选集保存到本地")
def predict(user_profile):
    """Run the FFM model for one user and save the ranked candidates.

    Steps: build the feature frame, convert it to an FFM-format file, run
    xlearn prediction with sigmoid output into
    ``result/<device_id>_output.txt``, then hand over to
    ``predict_save_to_local``.

    Args:
        user_profile: mapping that provides ``'device_id'`` (and whatever
            ``feature_en`` reads from it).
    """
    features = feature_en(user_profile)
    ffm_input_path = transform_ffm_format(features, user_profile["device_id"])

    model = xl.create_ffm()
    model.setTest(ffm_input_path)
    model.setSigmoid()

    # Model file name encodes the training date range and hyper-parameters.
    model_path = DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(
        DATA_START_DATE, DATA_END_DATE, lr, l2_lambda)
    output_path = DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id'])
    model.predict(model_path, output_path)
    print("预测结束")

    predict_save_to_local(user_profile, features)
def router(device_id):
    """Predict for ``device_id`` if a profile exists, else print a notice.

    ``fetch_user_profile`` signals a missing profile by returning the
    integer ``1`` as its second value.
    """
    profile, status = fetch_user_profile(device_id)
    if status == 1:
        print('Sorry, we don\'t have you.')
        return
    predict(profile)
if __name__ == "__main__":
    # Poll forever: fetch the currently active devices and run prediction
    # for every device that is listed in data_set_device_id.csv.
    while True:
        start = time.time()
        empty, device_id_list = get_active_users()
        if empty == 1:
            # Wait a minute before polling again.
            time.sleep(60)
        else:
            # Build the set once per cycle so each membership test below is
            # a hash lookup instead of a scan over the whole list.
            old_device_ids = set(
                pd.read_csv(DIRECTORY_PATH + "data_set_device_id.csv")["device_id"].values.tolist()
            )
            for device_id in device_id_list:
                if device_id in old_device_ids:
                    router(device_id)
                else:
                    print("该用户不是老用户,不能预测")
        # NOTE(review): indentation was lost in the source this block was
        # recovered from; the timing report is assumed to run once per loop
        # iteration (including the sleep branch) -- confirm.
        end = time.time()
        time_cost = end - start
        print("耗时{}秒".format(time_cost))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment