add codes in test file

17f114c6 · 张彦钊 · eeae786a · 17f114c6
Commit 17f114c6 authored Aug 14, 2018 by 张彦钊
Show whitespace changes
Inline Side-by-side

Showing with 103 additions and 1 deletion

test_supervisor.py local/test_supervisor.py +103 -1

No files found.
--- a/local/test_supervisor.py
+++ b/local/test_supervisor.py
 import pymysql
 import pandas as pd
 from datetime import datetime
+import time
+from config import *
+import pickle
+import xlearn as xl
 def con_sql(sql):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
@@ -31,6 +35,104 @@ def get_active_users():
        # 为了debug supervisor，修改了下面的return参数
        return "0",device_id_list
+def fetch_user_profile(device_id):
+    """Look up the device_id/city_id row stored for one device.
+
+    Runs a one-row query against ``data_feed_click`` through ``con_sql``.
+
+    Args:
+        device_id: Device identifier formatted into the WHERE clause.
+
+    Returns:
+        A ``(profile, flag)`` tuple. When the query result is empty the
+        placeholder ``({1: 2}, 1)`` is returned. Otherwise ``profile`` maps
+        each column label (positional labels 0/1 are renamed to
+        ``device_id``/``city_id``) to the value in row 0, and ``flag`` is the
+        string ``"0"``.
+    """
+    # NOTE(review): device_id is formatted straight into the SQL text, so a
+    # crafted value could change the query. con_sql takes a finished string,
+    # so confirm that device_id always comes from a trusted source.
+    query = "select device_id,city_id from data_feed_click where device_id = '{0}' limit 1".format(device_id)
+    rows = con_sql(query)
+    if rows.empty:
+        print("没有获取到该用户对应的city_id")
+        # The return values below were changed to help debug supervisor.
+        return {1: 2}, 1
+
+    rows = rows.rename(columns={0: "device_id", 1: "city_id"})
+    print("成功获取该用户对应的city_id")
+    # The return values below were changed to help debug supervisor.
+    return {column: rows.loc[0, column] for column in rows.columns}, "0"
+def feature_en(user_profile):
+    """Build the prediction feature frame for one user.
+
+    Reads the per-city ``DiaryTop3000`` CSV, stamps every row with the user's
+    device_id and the current hour/minute, and adds a constant label column.
+
+    Args:
+        user_profile: Mapping that provides ``city_id`` and ``device_id``.
+
+    Returns:
+        The DataFrame read from the city file with ``device_id``, ``hour``,
+        ``minute`` (both categorical) and ``y`` set, and ``city_id`` dropped.
+    """
+    source_path = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(user_profile['city_id'])
+    frame = pd.read_csv(source_path)
+    frame["device_id"] = user_profile['device_id']
+
+    current = datetime.now()
+    # A zero hour/minute is stored as 24/60 rather than 0.
+    hour_value = 24 if current.hour == 0 else current.hour
+    minute_value = 60 if current.minute == 0 else current.minute
+    frame["hour"] = hour_value
+    frame["minute"] = minute_value
+    for column in ("hour", "minute"):
+        frame[column] = frame[column].astype("category")
+
+    # y is what gets predicted, but the FFM conversion still needs a y column;
+    # the constant placed here does not affect the prediction result.
+    frame["y"] = 0
+    frame = frame.drop("city_id", axis=1)
+    print(frame.head(1))
+    print("特征工程处理结束")
+    return frame
+def transform_ffm_format(df, device_id):
+    """Convert a feature frame with the pickled FFM transformer and save it.
+
+    Args:
+        df: Feature frame passed to the unpickled object's ``transform``.
+        device_id: Used, with a minute-resolution timestamp, in the output
+            file name.
+
+    Returns:
+        Path of the header-less CSV written under ``result/``.
+    """
+    encoder_path = DIRECTORY_PATH + "ffm_{0}_{1}.pkl".format(DATA_START_DATE, DATA_END_DATE)
+    # NOTE(review): pickle.load runs whatever the file contains; make sure
+    # this pickle is only ever produced by trusted code.
+    with open(encoder_path, "rb") as handle:
+        encoder = pickle.load(handle)
+
+    converted = encoder.transform(df)
+    stamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
+    print("ffm格式转化结束")
+    predict_file_name = DIRECTORY_PATH + "result/{0}_{1}DiaryTop3000.csv".format(device_id, stamp)
+    converted.to_csv(predict_file_name, index=False, header=None)
+    print("ffm写到服务器")
+    return predict_file_name
+def wrapper_result(user_profile, instance):
+    """Attach candidate ids to the saved scores and keep the 50 highest.
+
+    Args:
+        user_profile: Mapping that provides ``device_id``; it selects the
+            ``result/<device_id>_output.txt`` file to read.
+        instance: Frame whose ``cid`` column is copied next to the scores
+            (aligned on index).
+
+    Returns:
+        DataFrame with ``prob`` and ``cid`` columns, sorted by ``prob`` in
+        descending order and truncated to 50 rows.
+    """
+    output_path = DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id'])
+    scores = pd.read_csv(output_path, header=None).rename(columns={0: "prob"})
+    scores["cid"] = instance['cid']
+    return scores.sort_values(by="prob", ascending=False).head(50)
+def predict_save_to_local(user_profile, instance):
+    """Save the top-ranked candidates, each with a diary URL, to a local file.
+
+    Args:
+        user_profile: Mapping that provides ``device_id``; it names the
+            ``result/feed_<device_id>`` output file.
+        instance: Feature frame forwarded to ``wrapper_result``.
+    """
+    def diary_url(cid):
+        # Everything from index 6 of cid onward becomes the last URL segment.
+        return "http://m.igengmei.com/diary_book/" + str(cid[6:]) + '/'
+
+    ranked = wrapper_result(user_profile, instance)
+    ranked.loc[:, "url"] = ranked["cid"].apply(diary_url)
+    feed_path = DIRECTORY_PATH + "result/feed_{}".format(user_profile['device_id'])
+    ranked.to_csv(feed_path, index=False)
+    print("成功将预测候选集保存到本地")
+def predict(user_profile):
+    """Score one user's candidates with the trained FFM model and save them.
+
+    Builds the features, writes them in FFM format, runs the xLearn model
+    with sigmoid output into ``result/<device_id>_output.txt``, then hands
+    off to ``predict_save_to_local``.
+
+    Args:
+        user_profile: Mapping that provides ``device_id`` (and whatever
+            ``feature_en`` reads from it).
+    """
+    features = feature_en(user_profile)
+    ffm_input_path = transform_ffm_format(features, user_profile["device_id"])
+
+    ffm_model = xl.create_ffm()
+    ffm_model.setTest(ffm_input_path)
+    ffm_model.setSigmoid()
+
+    # The model file name encodes the training date range and hyper-parameters.
+    model_path = DIRECTORY_PATH + "model_{0}-{1}_lr{2}_lambda{3}.out".format(
+        DATA_START_DATE, DATA_END_DATE, lr, l2_lambda)
+    output_path = DIRECTORY_PATH + "result/{0}_output.txt".format(user_profile['device_id'])
+    ffm_model.predict(model_path, output_path)
+    print("预测结束")
+
+    predict_save_to_local(user_profile, features)
+def router(device_id):
+    """Run prediction for a device unless its profile lookup reports a miss.
+
+    Args:
+        device_id: Device identifier passed to ``fetch_user_profile``.
+    """
+    user_profile, not_exist = fetch_user_profile(device_id)
+    # fetch_user_profile signals a missing profile with the integer 1.
+    if not_exist != 1:
+        predict(user_profile)
+        return
+    print('Sorry, we don\'t have you.')
 if __name__ == "__main__":
    while True:
-        get_active_users()
+        # Endless polling loop: fetch the active devices, then run prediction
+        # for those listed in the known-device CSV.
+        start = time.time()
+        empty,device_id_list = get_active_users()
+        # NOTE(review): the visible return of get_active_users yields the
+        # string "0" as its first element; confirm that its other return path
+        # really produces the integer 1 that is tested here.
+        if empty==1:
+            # Wait a minute before polling again.
+            time.sleep(60)
+        else:
+            # The known-device list is re-read from the CSV on every pass.
+            old_device_id_list = pd.read_csv(DIRECTORY_PATH + "data_set_device_id.csv")["device_id"].values.tolist()
+            for device_id in device_id_list:
+                # Only devices present in the CSV are routed to prediction.
+                if device_id in old_device_id_list:
+                    router(device_id)
+                else:
+                    print("该用户不是老用户，不能预测")
+            # Elapsed time is reported only for passes that took this branch.
+            end = time.time()
+            time_cost = (end - start)
+            print("耗时{}秒".format(time_cost))