ML / ffm-baseline / Commits / f154eb8d

Commit f154eb8d: Merge branch 'master' into gyz
Authored Sep 26, 2018 by 高雅喆
Parents: eae303de, ae152f48

Showing 10 changed files with 433 additions and 167 deletions (+433 −167)
config.py                      +29  −11
ctr.py                         +92  −0
diaryUpdateOnlineOffline.py    +28  −26
local/ctr.py                   +41  −0
local/testCases.py             +55  −12
local/utils.py                 +64  −62
prepareData.py                 +12  −8
train.py                       +1   −2
userProfile.py                 +67  −33
utils.py                       +44  −13
config.py

The stale hardcoded date constants (VALIDATION_DATE, TEST_DATE, DATA_START_DATE, DATA_END_DATE) and the note that the test date must be later than the validation date are removed; the per-environment database settings are collected into config dicts at the top of the file:

```python
# Online path
DIRECTORY_PATH = '/data/models/'
# Local path
LOCAL_DIRCTORY = "/Users/mac/utils/"

# Online active-user table DB
ACTIVE_USER_DB_ONLINE = {"host": '10.66.157.22', "port": 4000, "user": 'root',
                         "passwd": '3SYz54LS9#^9sBvC', "db": 'jerry_prod'}
# Offline active-user table DB
ACTIVE_USER_DB_LOCAL = {"host": '192.168.15.12', "port": 4000, "user": 'root',
                        "db": 'jerry_test'}
# Online diary queue DB
QUEUE_DB_ONLINE = {"host": 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', "port": 3306,
                   "user": 'doris', "passwd": 'o5gbA27hXHHm', "db": 'doris_prod'}
# Local diary queue DB
QUEUE_DB_LOCAL = {"host": 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', "port": 3306,
                  "user": 'work', "passwd": 'workwork', "db": 'doris_test'}
# Online diary score DB
SCORE_DB_ONLINE = {"host": '10.66.157.22', "port": 4000, "user": 'root',
                   "passwd": '3SYz54LS9#^9sBvC', "db": 'eagle'}
# Local diary score DB
SCORE_DB_LOCAL = {"host": 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com', "port": 3306,
                  "user": 'work', "passwd": 'workwork', "db": 'zhengxing_test'}

MODEL_VERSION = ''

lr = 0.03
l2_lambda = 0.002
```

@@ -18,9 +32,6 @@ ONLINE_EAGLE_HOST = '10.66.157.22'

The local-path constant that used to sit here moves to the top of the file; the surrounding context is unchanged:

```python
# IP hosting the test diary videos
LOCAL_EAGLE_HOST = "192.168.15.12"
# Online diary queue domain
QUEUE_ONLINE_HOST = 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com'
```

@@ -28,3 +39,10 @@ QUEUE_ONLINE_HOST = 'rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com'

```python
LOCAL_HOST = 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com'
```
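Since pymysql.connect accepts host, port, user, passwd, and db keywords that match the keys of these dicts, a call site can unpack one directly instead of repeating every field. A minimal sketch, not part of this commit:

```python
import pymysql
from config import SCORE_DB_ONLINE

# Equivalent to spelling out host=..., port=..., user=..., passwd=..., db=...
db = pymysql.connect(**SCORE_DB_ONLINE)
```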
ctr.py (new file, mode 100644)

New script that measures click-through rate for users whose device_id ends in 8 (pulled live from TiDB) and for users whose device_id ends in 6 (read back from the day's prediction dump). Chinese log strings are translated in the listing below:

```python
import pandas as pd
import pymysql
from datetime import datetime
from datetime import timedelta


def get_tail8():
    sql = "select distinct device_id from data_feed_click \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id regexp '8$';".format(stat_date, cid_type)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    print("fetched successfully")
    result = cursor.fetchall()
    db.close()
    user = pd.DataFrame(list(result))[0].values.tolist()
    user = tuple(user)
    print("number of users with tail digit 8")
    print(len(user))
    return user


def get_ctr(user_tuple):
    sql = "select count(device_id) from data_feed_click \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id in {}".format(stat_date, cid_type, user_tuple)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    click = cursor.fetchall()[0][0]
    print(click)
    sql = "select count(device_id) from data_feed_exposure \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id in {}".format(stat_date, cid_type, user_tuple)
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    exp = cursor.fetchall()[0][0]
    db.close()
    print(exp)
    print(click / exp)


def get_tail6():
    df = pd.read_csv(path + "{}predictTail6Unique.csv".format(stat_date))
    pre_list = tuple(eval(df.loc[0, "list"]))
    print(len(pre_list))
    print(pre_list[:2])
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct device_id from data_feed_click \
           where stat_date='{}' \
           and cid_type='{}' \
           and device_id in {}".format(stat_date, cid_type, pre_list)
    cursor = db.cursor()
    print("start fetching")
    cursor.execute(sql)
    print("fetched successfully")
    result = cursor.fetchall()
    db.close()
    print(pd.DataFrame(list(result)).empty)
    user = pd.DataFrame(list(result))[0].values.tolist()
    user = tuple(user)
    print("number of users")
    print(len(user))
    return user


if __name__ == "__main__":
    path = "/data/models/"
    cid_type = "diary"
    now = datetime.now()
    year = now.year
    month = now.month
    day = now.day
    stat_date = datetime(year, month, day)
    stat_date = (stat_date - timedelta(days=1)).strftime("%Y-%m-%d")
    print(stat_date)
    tail6 = get_tail6()
    get_ctr(tail6)
    tail8 = get_tail8()
    get_ctr(tail8)
```
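get_ctr prints click / exp, i.e. CTR = clicks ÷ exposures for the selected cohort. A hypothetical refactor, not in this commit, that returns the ratio instead of printing it and avoids a ZeroDivisionError on a day with no exposures:

```python
def compute_ctr(clicks: int, exposures: int) -> float:
    """CTR = clicks / exposures; defined as 0.0 when there were no exposures."""
    return clicks / exposures if exposures else 0.0
```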
diaryUpdateOnlineOffline.py

@@ -13,7 +13,7 @@ from userProfile import get_active_users

`import socket` is replaced by helpers from utils:

```python
from sklearn.preprocessing import MinMaxScaler
import time
from config import *
from utils import judge_online, con_sql


def get_video_id(cache_video_id):
```

@@ -38,6 +38,8 @@ def get_video_id(cache_video_id):

Two debug prints are added:

```python
        return cache_video_id
    else:
        video_id = df[0].values.tolist()
        print("videoid")
        print(video_id[:2])
        return video_id
```
@@ -110,18 +112,18 @@ def save_result(queue_name,queue_arg,device_id):

The hardcoded connections (the offline branch even pointed at a misspelled 'zhengxing_tes' database) are replaced by the config dicts, and the manual cursor/fetchall/DataFrame/dropna sequence by con_sql:

```python
def get_score(queue_arg):
    if flag:
        db = pymysql.connect(host=SCORE_DB_ONLINE["host"], port=SCORE_DB_ONLINE["port"],
                             user=SCORE_DB_ONLINE["user"], passwd=SCORE_DB_ONLINE["passwd"],
                             db=SCORE_DB_ONLINE["db"])
    else:
        db = pymysql.connect(host=SCORE_DB_LOCAL["host"], port=SCORE_DB_LOCAL["port"],
                             user=SCORE_DB_LOCAL["user"], passwd=SCORE_DB_LOCAL["passwd"],
                             db=SCORE_DB_LOCAL["db"])
    # Strip the "diary|" prefix from each diary_id
    diary_list = tuple(list(map(lambda x: x[6:], queue_arg[2])))
    sql = "select score,diary_id from biz_feed_diary_score where diary_id in {};".format(diary_list)
    score_df = con_sql(db, sql)
    print("get score")
    return score_df
```
@@ -177,7 +179,6 @@ def update_sql_dairy_queue(queue_name, diary_id,device_id, city_id):

A duplicated `cursor = db.cursor()` line is dropped; the rest is unchanged context:

```python
                             db='doris_prod')
    else:
        db = pymysql.connect(host=LOCAL_HOST, port=3306, user='work',
                             passwd='workwork', db='doris_test')
    cursor = db.cursor()
    id_str = str(diary_id[0])
    for i in range(1, len(diary_id)):
```
@@ -206,27 +207,28 @@ def queue_compare(old_list, new_list):

queue_compare now reports whether anything changed instead of only printing:

```python
    for key in x_dict.keys():
        if x_dict[key] != y_dict[key]:
            i += 1
    if i > 0:
        # (the committed line had the ", 2" outside round(); fixed here so the
        # change rate is actually rounded to two decimals)
        print("diary queue before update: {} diaries in total, {} changed position, change rate {}%"
              .format(len(old_list), i, round(i / len(old_list) * 100, 2)))
        return True
    else:
        return False


def get_queue(device_id, city_id, queue_name):
    if flag:
        db = pymysql.connect(host=QUEUE_ONLINE_HOST, port=3306, user='doris',
                             passwd='o5gbA27hXHHm', db='doris_prod')
    else:
        db = pymysql.connect(host=LOCAL_HOST, port=3306, user='work',
                             passwd='workwork', db='doris_test')
    cursor = db.cursor()
    sql = "select {} from device_diary_queue " \
          "where device_id = '{}' and city_id = '{}';".format(queue_name, device_id, city_id)
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(list(result))
    if df.empty:
        print("this user's diary queue is empty")
        return False
```
@@ -260,12 +262,11 @@ def user_update(device_id, city_id, queue_name,data_set_cid,total_video_id):

Previously the queue was written unconditionally and queue_compare only ran afterwards; now the comparison gates the write:

```python
        queue_arg = [queue_predict, queue_not_predict, queue_list]
        if queue_predict != []:
            diary_queue = pipe_line(queue_name, queue_arg, device_id, total_video_id)
            if diary_queue and queue_compare(queue_list, diary_queue):
                update_sql_dairy_queue(queue_name, diary_queue, device_id, city_id)
                # print("update finished")
            else:
                print("the fetched diary list is empty or the queue order is unchanged, so the diary queue is not updated")
        else:
            print("the prediction set is empty, no prediction needed")
    else:
```
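To make queue_compare's gating concrete, a hypothetical run (ids invented for illustration):

```python
# Positions of 102 and 103 swap between the old and new queues.
old_list = [101, 102, 103]
new_list = [101, 103, 102]
# queue_compare builds id -> position dicts, counts i == 2 mismatches,
# prints a change rate of round(2 / 3 * 100, 2) == 66.67%, and returns True,
# so user_update goes ahead and writes the new queue back to MySQL.
```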
@@ -274,6 +275,7 @@ def user_update(device_id, city_id, queue_name,data_set_cid,total_video_id):

```python
def multi_proecess_update(device_id, city_id, data_set_cid, total_video_id):
    queue_name_list = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue"]
    pool = Pool(4)
    for queue_name in queue_name_list:
        pool.apply_async(user_update, (device_id, city_id, queue_name, data_set_cid, total_video_id,))
```

@@ -283,31 +285,31 @@ def multi_proecess_update(device_id, city_id, data_set_cid,total_video_id):

The inline socket check for the local machine's IP (172.30.8.160) is replaced by judge_online(), and the data_set_cid / video-id refresh moves inside the loop so it only runs when there are users to update:

```python
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    flag, path = judge_online()
    # Cache of the diary video list
    cache_video_id = []
    cache_device_city_list = []
    differ = 0
    while True:
        start = time.time()
        device_city_list = get_active_users(flag, path, differ)
        time1 = time.time()
        print("fetching the active-user table took {} seconds".format(time1 - start))
        # Filter out users already predicted within the last 5 minutes
        device_city_list = list(set(tuple(device_city_list)) - set(tuple(cache_device_city_list)))
        print("device_city_list")
        print(device_city_list)
        if datetime.now().minute % 5 == 0:
            cache_device_city_list = []
        if device_city_list != []:
            data_set_cid = pd.read_csv(path + "data_set_cid.csv")["cid"].values.tolist()
            total_video_id = get_video_id(cache_video_id)
            cache_video_id = total_video_id
            cache_device_city_list.extend(device_city_list)
            for device_city in device_city_list:
                multi_proecess_update(device_city[0], device_city[1], data_set_cid, total_video_id)
        differ = time.time() - start
        print("differ: {} seconds".format(differ))
```
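The 5-minute dedup step relies on (device_id, city_id) pairs being hashable tuples, so plain set difference works. A tiny illustration (values invented):

```python
current = [("dev1", "beijing"), ("dev2", "shanghai")]
cached = [("dev1", "beijing")]
# Set difference keeps only pairs not yet predicted in the current window
todo = list(set(tuple(current)) - set(tuple(cached)))  # [("dev2", "shanghai")]
```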
local/ctr.py (new file, mode 100644)

New one-off script that merges two prediction dumps and compares them against the exposure table:

```python
import pandas as pd
import pymysql

df = pd.read_csv(r"/data2/models/2018-09-02predictTail6Unique.csv")
a = eval(df.loc[0, "list"])
a = list(map(lambda x: x[0], a))
print(len(a))
print(a[:2])
cf = pd.read_csv(r"/data2/models/nvwa-2018-09-02predictTail6Unique.csv")
b = eval(cf.loc[0, "list"])
print(len(b))
print(b[:2])
a.extend(b)
print("count")
print(len(set(a)))
pre_list = list(set(a))
print(pre_list[:2])
stat_date = "2018-09-02"
cid_type = "diary"
db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                     passwd='3SYz54LS9#^9sBvC', db='jerry_test')
# NOTE: both queries below read data_feed_exposure2; the first result is stored
# in `click`, so it was presumably meant to count from the click table instead.
sql = "select count(device_id) from data_feed_exposure2 \
       where stat_date='{}' \
       and cid_type='{}' \
       and device_id in {}".format(stat_date, cid_type, pre_list)
cursor = db.cursor()
print("start fetching")
cursor.execute(sql)
print("fetched successfully")
click = cursor.fetchall()[0][0]
print(click)
sql = "select count(device_id) from data_feed_exposure2 \
       where stat_date='{}' \
       and cid_type='{}' \
       and device_id in {}".format(stat_date, cid_type, pre_list)
cursor = db.cursor()
print("start fetching")
cursor.execute(sql)
exp = cursor.fetchall()[0][0]
print(exp)
print(click / exp)
```
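One caveat: formatting a Python list into `device_id in {}` produces square brackets, which MySQL rejects; the sibling scripts pass a tuple instead. A hypothetical, safer alternative (not in the commit) lets the driver quote each value:

```python
# Build one %s placeholder per id and pass the values as query parameters.
placeholders = ",".join(["%s"] * len(pre_list))
sql = ("select count(device_id) from data_feed_exposure2 "
       "where stat_date=%s and cid_type=%s and device_id in ({})".format(placeholders))
cursor.execute(sql, [stat_date, cid_type] + pre_list)
```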
local/testCases.py

@@ -39,23 +39,66 @@ def get_local_device():

The old inline __main__ (a doris_test connection, a hardcoded list of 24 diary_ids flattened into an id_str, and a raw "insert into device_diary_queue values (...,89)" write) is replaced by three helpers plus a new __main__ that runs a save / delete / insert round trip against the production queue table:

```python
    df.to_csv('/Users/mac/utils/test_device_city_id.csv', index=None)
    print(1)


LOCAL_HOST = 'rdsmaqevmuzj6jy.mysql.rds.aliyuncs.com'


def save_queue():
    queue_name_list = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue"]
    for i in queue_name_list:
        sql = "select {} from device_diary_queue " \
              "where device_id = '{}' and city_id = '{}';".format(i, device_id, city_id)
        db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306,
                             user='doris', passwd='o5gbA27hXHHm', db='doris_prod')
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
        print(df.shape)
        df.to_csv("/data/models/{}.csv".format(i), index=None)
    print("end")


def delete():
    db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306,
                         user='doris', passwd='o5gbA27hXHHm', db='doris_prod')
    cursor = db.cursor()
    sql = "delete from device_diary_queue where device_id = '{}' and city_id = '{}';".format(device_id, city_id)
    cursor.execute(sql)
    # NOTE: no db.commit() before close; with pymysql's default autocommit=False
    # the delete may never be applied.
    db.close()
    print("deleted successfully")


def insert():
    queue_name_list = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue"]
    a = {}
    for i in queue_name_list:
        # NOTE: every queue reads native_queue.csv; "{}.csv".format(i) was presumably intended
        a[i] = pd.read_csv("/data/models/native_queue.csv")["0"].values.tolist()[0]
    # columns = ["native_queue", "nearby_queue", "nation_queue", "megacity_queue","id","device_id","city_id"]
    db = pymysql.connect(host='rm-m5e842126ng59jrv6.mysql.rds.aliyuncs.com', port=3306,
                         user='doris', passwd='o5gbA27hXHHm', db='doris_prod')
    sql = "INSERT INTO device_diary_queue (native_queue, nearby_queue, nation_queue, " \
          "megacity_queue,id,device_id,city_id) VALUES ('{}','{}','{}','{}','{}','{}','{}');".format \
        (a["native_queue"], a["nearby_queue"], a["nation_queue"], a["megacity_queue"], id, device_id, city_id)
    cursor = db.cursor()
    cursor.execute(sql)
    db.commit()
    db.close()
    print("end")


if __name__ == "__main__":
    # Save the current queues first (save_queue), then delete the row (delete),
    # then insert it back (insert)
    id = 334
    device_id = '00CA20EB-2719-4518-85CC-60E765AC526F'
    city_id = 'beijing'
    save_queue()
    delete()
    insert()
```
local/utils.py

@@ -165,65 +165,67 @@ class multiFFMFormatPandas:

The FFMFormatPandas class, previously present only as a fully commented-out block, is restored to live code (comments translated from Chinese):

```python
        return False


# FFM format conversion class
class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            # Maps every column except y to an index, e.g. field_index = {name: 0, age: 1}
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))
        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        # The loop below includes y among the features; it should not. This is a bug.
        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    # feature_index = {name_tom: 0, name_lily: 1, name: 2, age_18: 3, age_19: 4, age: 5}
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        if self.y is not None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

    # The method below is not part of the original class; it was added to check
    # whether a user already exists in the training data set.
    def is_feature_index_exist(self, name):
        if name in self.feature_index_:
            return True
        else:
            return False
```
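For context on what transform_row_ emits: each output line follows the libffm text format, `label field:feature:value ...`. A hypothetical row, with column names invented for illustration:

```python
# Suppose df has columns y, city, age and field_index_ == {"city": 0, "age": 1}.
# A row (y=1, city="beijing", age=18) serializes roughly as
#   "1 0:<feature_index_['city_beijing']>:1 1:<feature_index_['age']>:18"
# i.e. categorical (dtype kind 'O') columns emit value 1 at a one-hot feature id,
# while integer (kind 'i') columns reuse a per-column feature id with the raw value.
```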
prepareData.py

con_sql now takes the connection as an argument, so each query opens its own handle; the queries also move from the *2 staging tables (data_feed_click2, data_feed_exposure2) back to data_feed_click and data_feed_exposure:

```python
from utils import con_sql
import datetime
import time
import pymysql


def fetch_data(start_date, end_date):
    # Fetch device_id from the click table
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select distinct device_id from data_feed_click"
    click_device_id = con_sql(db, sql)[0].values.tolist()
    print("fetched device_id from the click table")
    # Fetch the click table data
    sql = "select cid,device_id,time,stat_date from data_feed_click " \
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    # The db above has already been closed, so it has to be created again
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    click = con_sql(db, sql)
    click = click.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
    print("fetched the click table data")
    # Extract hour from the time feature
```

@@ -22,10 +24,12 @@ def fetch_data(start_date, end_date):

```python
    click = click.drop("time_date", axis=1)
    # Fetch the exposure table data
    sql = "select cid,device_id,time,stat_date from data_feed_exposure " \
          "where stat_date >= '{0}' and stat_date <= '{1}'".format(start_date, end_date)
    start = time.time()
    # The db above has already been closed, so it has to be created again
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    exposure = con_sql(db, sql)
    end = time.time()
    print("fetching the exposure table took {} minutes".format((end - start) / 60))
    exposure = exposure.rename(columns={0: "cid", 1: "device_id", 2: "time_date", 3: "stat_date"})
```
train.py

@@ -18,8 +18,7 @@ if __name__ == "__main__":

The trailing restart_process() call is commented out:

```python
    end_train = time.time()
    print("training the model took {} minutes".format((end_train - start_train) / 60))
    move_file()
    # TODO: if the hand-written keepProcess file is used to daemonize this, delete the
    # restart line inside that function, otherwise the process may be started twice
    # restart_process()
```
userProfile.py

@@ -8,6 +8,37 @@ import pymysql

Two counting helpers are added above get_active_users:

```python
import time


# Count unique active users whose device_id ends in 6
def unique_user_count(file_path, temp_list, now):
    if os.path.exists(file_path):
        # Active users with tail digit 6
        tail_6_list = eval(pd.read_csv(file_path).loc[0, "list"])
    else:
        tail_6_list = []
    tail_6_list.extend(list(filter(lambda x: (str(x)[-1] == "6"), temp_list)))
    if tail_6_list != []:
        df_tail_6 = pd.DataFrame({"number": [len(set(tail_6_list))], "time": [str(now)[:16]],
                                  "list": [list(set(tail_6_list))]})
        df_tail_6.to_csv(file_path, index=None)
    print("unique active users with tail digit 6 so far: {}".format(len(set(tail_6_list))))


# Count unique users that have already been predicted
def predict_user_count(predict_file_path, device_list, now):
    if os.path.exists(predict_file_path):
        # Predicted users with tail digit 6
        all_predict_list = eval(pd.read_csv(predict_file_path).loc[0, "list"])
    else:
        all_predict_list = []
    all_predict_list.extend(device_list)
    if all_predict_list != []:
        df_predict = pd.DataFrame({"number": [len(set(all_predict_list))], "time": [str(now)[:16]],
                                   "list": [list(set(all_predict_list))]})
        df_predict.to_csv(predict_file_path, index=None)
    print("unique predicted users with tail digit 6 so far: {}".format(len(set(all_predict_list))))


# Get the users active within the current minute
def get_active_users(flag, path, differ):
    if differ == 0:
```

@@ -23,18 +54,18 @@ def get_active_users(flag,path,differ):

The online branch now connects through ACTIVE_USER_DB_ONLINE and the offline branch through ACTIVE_USER_DB_LOCAL (previously a hardcoded 192.168.15.12 connection with a manual cursor/fetchall/dropna sequence); both go through con_sql(db, sql):

```python
        start = end - differ
    end_datetime = str(datetime.fromtimestamp(end))
    start_datetime = str(datetime.fromtimestamp(start))
    if flag:
        sql = "select device_id,city_id from user_active_time " \
              "where active_time <= '{}' and active_time >= '{}'".format(end_datetime, start_datetime)
        db = pymysql.connect(host=ACTIVE_USER_DB_ONLINE["host"], port=ACTIVE_USER_DB_ONLINE["port"],
                             user=ACTIVE_USER_DB_ONLINE["user"], passwd=ACTIVE_USER_DB_ONLINE["passwd"],
                             db=ACTIVE_USER_DB_ONLINE["db"])
        df = con_sql(db, sql)
    else:
        db = pymysql.connect(host=ACTIVE_USER_DB_LOCAL["host"], port=ACTIVE_USER_DB_LOCAL["port"],
                             user=ACTIVE_USER_DB_LOCAL["user"], db=ACTIVE_USER_DB_LOCAL["db"])
        sql = "select device_id,city_id from user_active_time"
        df = con_sql(db, sql)
    if df.empty:
        print("no active users right now")
```

@@ -44,19 +75,20 @@ def get_active_users(flag,path,differ):

The inline tail-6 bookkeeping is replaced by a call to unique_user_count; the old block stays behind as comments in the file:

```python
    temp_list = df[0].values.tolist()
    now = datetime.now()
    tail6_file_path = path + "{}tail6Unique.csv".format(str(now)[:10])
    unique_user_count(tail6_file_path, temp_list, now)
    old_device_id_list = pd.read_csv(path + "data_set_device_id.csv")["device_id"].values.tolist()
    # Intersect active users with known users, i.e. only predict users already in the data set
    df = df.loc[df[0].isin(old_device_id_list)]
```

@@ -83,23 +115,25 @@ def get_active_users(flag,path,differ):

Likewise for the predicted-user bookkeeping, and fetch_user_profile adopts the new con_sql signature:

```python
    # Count predicted users with tail digit 6
    predict_file_path = path + "{}predictTail6Unique.csv".format(str(now)[:10])
    predict_user_count(predict_file_path, device_list, now)
    return device_city_list


def fetch_user_profile(device_id):
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,city_id from data_feed_click where device_id = '{0}' limit 1".format(device_id)
    user_profile = con_sql(db, sql)
    if user_profile.empty:
        print("could not get this user's city_id")
        return None, True
```
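The tail-digit filter used by both helpers is a plain string check on the last character, so it works for numeric and string ids alike. A tiny illustration (ids invented):

```python
temp_list = [1236, 778, "ab6", 904]
tail_6 = [x for x in temp_list if str(x)[-1] == "6"]  # [1236, "ab6"]
```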
utils.py

@@ -10,6 +10,18 @@ from multiprocessing import Pool

judge_online centralizes the online/offline detection that diaryUpdateOnlineOffline.py used to do inline:

```python
import os
import signal
from config import *
import socket


def judge_online():
    # The IP below is the local development machine's IP
    if socket.gethostbyname(socket.gethostname()) == '172.30.8.160':
        flag = False
        path = LOCAL_DIRCTORY
    else:
        flag = True
        path = DIRECTORY_PATH
    return flag, path


def get_date():
```

@@ -19,11 +31,11 @@ def get_date():

The hardcoded end/validation dates ("2018-08-30" / "2018-08-29") become rolling dates computed from today:

```python
    day = now.day
    date = datetime(year, month, day)
    data_start_date = "2018-07-15"
    # data_end_date = "2018-09-02"
    # validation_date = "2018-09-01"
    # data_start_date = (date - timedelta(days=3)).strftime("%Y-%m-%d")
    data_end_date = (date - timedelta(days=1)).strftime("%Y-%m-%d")
    validation_date = (date - timedelta(days=2)).strftime("%Y-%m-%d")
    # The validation and test dates must be exactly one day apart, otherwise
    # splitting the data set raises an error
    test_date = data_end_date
    print("data_start_date,data_end_date,validation_date,test_date:")
```

@@ -40,14 +52,33 @@ def get_roc_curve(y, pred, pos_label):

con_sql no longer opens its own hardcoded TiDB connection: it takes the connection as an argument, wraps the query in try/except/finally, and drops null rows; a new sql_df variant keeps them:

```python
    print(AUC)


# Fetch data from a TiDB table into a DataFrame, dropping null rows
def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result)).dropna()
    except Exception:
        print("an exception occurred", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df


# Same as con_sql above, except null rows are kept
def sql_df(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("an exception occurred", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df
```

@@ -70,7 +101,7 @@ def restart_process():

The log message in the OSError branch is reworded:

```python
    except OSError:
        print('no such process!!!')
        os.popen('python diaryUpdateOnlineOffline.py')
        print("diaryUpdateOnlineOffline.py has already been restarted successfully")
    else:
        os.popen('python diaryUpdateOnlineOffline.py')
        print("successfully restarted diaryUpdateOnlineOffline.py")
```

@@ -252,7 +283,7 @@ class multiFFMFormatPandas:

Inside the commented-out block, a comment whose `#` had been split from its text is rejoined onto one line:

```python
    # t = df.dtypes.to_dict()
    # return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
    #
    # The method below is not part of the original class; it was added to check
    # whether a user already exists in the training data set.
    # def is_feature_index_exist(self, name):
    #     if name in self.feature_index_:
    #         return True
```
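Taken together with config.py, the caller now owns the connection. A minimal usage sketch, assuming the config dicts above:

```python
import pymysql
from config import ACTIVE_USER_DB_ONLINE
from utils import con_sql

db = pymysql.connect(**ACTIVE_USER_DB_ONLINE)  # caller opens the connection
df = con_sql(db, "select device_id, city_id from user_active_time")
# con_sql closes db in its finally block, so each handle is single-use;
# this is why prepareData.py reconnects before every query.
```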