
import pandas as pd
import pymysql
import datetime


def con_sql(db, sql):
    """Run *sql* on connection *db* and return all rows as a DataFrame.

    The connection is always closed, even when the query raises, so a
    failing statement no longer leaks the pymysql connection.

    Args:
        db: an open DB-API connection (pymysql).
        sql: the SQL statement to execute.

    Returns:
        pandas.DataFrame with positional integer column labels.
    """
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    finally:
        # Close in all cases — the original leaked the connection on error.
        db.close()
    return df


def multi_hot(df, column, n):
    """Assign integer ids to the comma-separated tokens of df[column].

    Ids are allocated contiguously starting at *n*.  The column is
    rewritten in place to comma-joined id strings via ``app_list_func``;
    missing values are first replaced with the sentinel "lost_na".

    Returns:
        (token_count, token -> id dict)
    """
    df[column] = df[column].fillna("lost_na")
    tokens = set()
    for value in df[column].unique():
        tokens.update(value.split(","))
    token_map = dict(zip(tokens, range(n, n + len(tokens))))
    df[column] = df[column].apply(app_list_func, args=(token_map,))
    return len(token_map), token_map


def get_data():
    """Build the ESMM training ("tr") and validation ("va") csv files.

    Reads raw training rows (table name taken from the module-level
    ``train_data_set``) joined with user / content / tag side features
    from the jerry_test TiDB, knowledge-network features, and hospital
    ids from the zhengxing MySQL instance; encodes every categorical
    column to integer ids and writes chunked csvs via ``write_csv``.

    Returns:
        (validate_date, value_map, app_list_map, level2_map, level3_map)
        so ``get_predict`` can encode the prediction set with the exact
        same vocabularies.
    """
    # The latest stat_date in the training table is the validation day.
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from {}".format(train_data_set)
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    # Training window: the 100 days leading up to the validation day.
    start = (temp - datetime.timedelta(days=100)).strftime("%Y-%m-%d")
    print(start)

    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name,u.device_type,u.manufacturer," \
          "u.channel,c.top,e.device_id,cut.time,dl.app_list,e.diary_service_id,feat.level3_ids,feat.level2," \
          "wiki.tag,question.tag,search.tag,budan.tag,order_tag.tag,sixin.tag,cart.tag " \
          "from {} e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id " \
          "left join wiki_tag wiki on e.device_id = wiki.device_id " \
          "left join question_tag question on e.device_id = question.device_id " \
          "left join search_tag search on e.device_id = search.device_id " \
          "left join budan_tag budan on e.device_id = budan.device_id " \
          "left join order_tag on e.device_id = order_tag.device_id " \
          "left join sixin_tag sixin on e.device_id = sixin.device_id " \
          "left join cart_tag cart on e.device_id = cart.device_id " \
          "where e.stat_date >= '{}'".format(train_data_set, start)
    # Do not alias the order_tag table as "order": ORDER is a MySQL
    # reserved word (as in ORDER BY).
    df = con_sql(db, sql)
    # con_sql returns positional integer columns; give them real names.
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                            11: "time", 12: "app_list", 13: "service_id", 14: "level3_ids", 15: "level2",
                            16: "tag1", 17: "tag2", 18: "tag3", 19: "tag4", 20: "tag5", 21: "tag6", 22: "tag7"})

    # Attach knowledge-network features keyed by level2 id.
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
          "from train_Knowledge_network_data"
    knowledge = con_sql(db, sql)
    knowledge = knowledge.rename(columns={0: "level2", 1: "method", 2: "min", 3: "max",
                                          4: "treatment_time", 5: "maintain_time", 6: "recover_time"})
    knowledge["level2"] = knowledge["level2"].astype("str")
    df = pd.merge(df, knowledge, on='level2', how='left')
    df = df.drop("level2", axis=1)

    # Attach hospital_id via the service -> doctor -> hospital relation.
    # NOTE(review): a single distinct service_id yields "in ('x',)" which
    # MySQL rejects — confirm the id set always has more than one element.
    service_id = tuple(df["service_id"].unique())
    db = pymysql.connect(host='172.16.30.143', port=3306, user='work',
                         passwd='BJQaT9VzDcuPBqkd', db='zhengxing')
    sql = "select s.id,d.hospital_id from api_service s left join api_doctor d on s.doctor_id = d.id " \
          "where s.id in {}".format(service_id)
    hospital = con_sql(db, sql)
    hospital = hospital.rename(columns={0: "service_id", 1: "hospital_id"})
    hospital["service_id"] = hospital["service_id"].astype("str")
    df = pd.merge(df, hospital, on='service_id', how='left')
    df = df.drop("service_id", axis=1)

    print(df.count())
    print("before")
    print(df.shape)
    df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
    print("after")
    print(df.shape)

    # Multi-hot id spaces are allocated back to back, starting at 2.
    app_list_number, app_list_map = multi_hot(df, "app_list", 2)
    level2_number, level2_map = multi_hot(df, "clevel2_id", 2 + app_list_number)
    level3_number, level3_map = multi_hot(df, "level3_ids", 2 + app_list_number + level2_number)

    # The seven behaviour-tag columns share the level2 vocabulary.
    for i in ["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"]:
        df[i] = df[i].fillna("lost_na")
        df[i] = df[i].apply(app_list_func, args=(level2_map,))

    unique_values = []
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date", "hospital_id",
                "method", "min", "max", "treatment_time", "maintain_time", "recover_time"]
    for i in features:
        df[i] = df[i].astype("str")
        # NOTE(review): fillna after astype("str") is a no-op — NaN has
        # already become the string "nan", so "lost" is never produced.
        # Kept as-is because get_predict builds its keys the same way;
        # any fix must change both functions together.
        df[i] = df[i].fillna("lost")
        # Suffix with the column name so identical raw values appearing
        # in different columns receive distinct ids.
        df[i] = df[i] + i
        unique_values.extend(list(df[i].unique()))

    # Single-value categorical ids continue right after the multi-hot ranges.
    temp = list(range(2 + app_list_number + level2_number + level3_number,
                      2 + app_list_number + level2_number + level3_number + len(unique_values)))
    value_map = dict(zip(unique_values, temp))

    df = df.drop("device_id", axis=1)
    # TODO: after going live, also put the most recent day into the training
    # set so user positive/negative feedback is picked up promptly.
    train = df
    # stat_date values were suffixed with the literal "stat_date" above,
    # hence the comparison value.  .copy() so the mapping below does not
    # trigger SettingWithCopy chained-assignment warnings on a view.
    test = df[df["stat_date"] == validate_date + "stat_date"].copy()
    for i in features:
        train[i] = train[i].map(value_map)
        test[i] = test[i].map(value_map)

    print("test shape")
    print(test.shape)
    write_csv(train, "tr", 100000)
    write_csv(test, "va", 80000)
    return validate_date, value_map, app_list_map, level2_map, level3_map


def app_list_func(x, l):
    """Translate a comma-separated token string into id form.

    Each token of *x* is looked up in mapping *l*; unknown tokens are
    encoded as 0.  Returns the ids re-joined with commas.
    """
    return ",".join(str(l.get(token, 0)) for token in x.split(","))


def write_csv(df, name, n):
    """Write *df* into `path + name + /` as csv chunks of *n* rows.

    Files are named "<name>_<row offset>.csv".  iloc slicing truncates
    past the end of the frame, so a single slice covers first, middle
    and final (possibly short) chunks alike.
    """
    for offset in range(0, df.shape[0], n):
        chunk = df.iloc[offset:offset + n]
        chunk.to_csv(path + name + "/{}_{}.csv".format(name, offset), index=False)


def get_predict(date, value_map, app_list_map, level2_map, level3_map):
    """Encode the prediction candidates and write the native/nearby csvs.

    Reads esmm_pre_data joined with the same side-feature tables as
    get_data, encodes every column with the vocabularies produced by
    get_data, splits by label, and writes the id csvs plus chunked
    feature csvs under the module-level ``path``.

    Args:
        date: validation date string; stamped onto every row as stat_date.
        value_map / app_list_map / level2_map / level3_map: the id
            mappings returned by ``get_data``.
    """
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time," \
          "dl.app_list,e.hospital_id,feat.level3_ids,feat.level2," \
          "wiki.tag,question.tag,search.tag,budan.tag,order_tag.tag,sixin.tag,cart.tag " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "left join device_app_list dl on e.device_id = dl.device_id " \
          "left join diary_feat feat on e.cid_id = feat.diary_id " \
          "left join wiki_tag wiki on e.device_id = wiki.device_id " \
          "left join question_tag question on e.device_id = question.device_id " \
          "left join search_tag search on e.device_id = search.device_id " \
          "left join budan_tag budan on e.device_id = budan.device_id " \
          "left join order_tag on e.device_id = order_tag.device_id " \
          "left join sixin_tag sixin on e.device_id = sixin.device_id " \
          "left join cart_tag cart on e.device_id = cart.device_id"
    df = con_sql(db, sql)
    # con_sql returns positional integer columns; give them real names.
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
                            11: "cid_id", 12: "time", 13: "app_list", 14: "hospital_id", 15: "level3_ids",
                            16: "level2", 17: "tag1", 18: "tag2", 19: "tag3", 20: "tag4", 21: "tag5",
                            22: "tag6", 23: "tag7"})

    # Attach knowledge-network features keyed by level2 id (same join as
    # in get_data so train and predict rows carry identical columns).
    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select level2_id,treatment_method,price_min,price_max,treatment_time,maintain_time,recover_time " \
          "from train_Knowledge_network_data"
    knowledge = con_sql(db, sql)
    knowledge = knowledge.rename(columns={0: "level2", 1: "method", 2: "min", 3: "max",
                                          4: "treatment_time", 5: "maintain_time", 6: "recover_time"})
    knowledge["level2"] = knowledge["level2"].astype("str")
    df = pd.merge(df, knowledge, on='level2', how='left')
    df = df.drop("level2", axis=1)
    df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "time", "app_list", "hospital_id", "level3_ids"])

    df["stat_date"] = date
    print(df.head(6))
    df["app_list"] = df["app_list"].fillna("lost_na")
    df["app_list"] = df["app_list"].apply(app_list_func, args=(app_list_map,))
    df["clevel2_id"] = df["clevel2_id"].fillna("lost_na")
    df["clevel2_id"] = df["clevel2_id"].apply(app_list_func, args=(level2_map,))
    df["level3_ids"] = df["level3_ids"].fillna("lost_na")
    df["level3_ids"] = df["level3_ids"].apply(app_list_func, args=(level3_map,))

    # The seven behaviour-tag columns share the level2 vocabulary.
    for i in ["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"]:
        df[i] = df[i].fillna("lost_na")
        df[i] = df[i].apply(app_list_func, args=(level2_map,))

    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]
    features = ["ucity_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date", "hospital_id",
                "method", "min", "max", "treatment_time", "maintain_time", "recover_time"]
    for i in features:
        df[i] = df[i].astype("str")
        # NOTE(review): fillna after astype("str") is a no-op (NaN is
        # already the string "nan") — kept to match get_data's encoding.
        df[i] = df[i].fillna("lost")
        # Suffix with the column name, mirroring get_data's key scheme.
        df[i] = df[i] + i

    # label 0 = native candidates, label 1 = nearby candidates.
    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)

    for i in features:
        native_pre[i] = native_pre[i].map(value_map)
        # TODO: categories unseen at training time map to NaN; filled with
        # 0 ("unknown") for now — handle more carefully later.
        native_pre[i] = native_pre[i].fillna(0)

        nearby_pre[i] = nearby_pre[i].map(value_map)
        # TODO: same unseen-category fallback as above.
        nearby_pre[i] = nearby_pre[i].fillna(0)

    print("native")
    print(native_pre.shape)
    native_pre[["uid", "city", "cid_id"]].to_csv(path + "native.csv", index=False)
    write_csv(native_pre, "native", 200000)

    print("nearby")
    print(nearby_pre.shape)
    nearby_pre[["uid", "city", "cid_id"]].to_csv(path + "nearby.csv", index=False)
    write_csv(nearby_pre, "nearby", 160000)


if __name__ == '__main__':
    # Module-level configuration read as globals by get_data, write_csv
    # and get_predict.
    train_data_set = "esmm_train_data_dur"
    path = "/home/gmuser/esmm/"
    validate_date, value_map, app_list_map, level2_map, level3_map = get_data()
    get_predict(validate_date, value_map, app_list_map, level2_map, level3_map)