# feature.py

import pandas as pd
import pymysql
import datetime


def con_sql(db, sql):
    """Execute `sql` on connection `db` and return the rows as a DataFrame.

    The connection is always closed afterwards (con_sql owns it), so the
    caller must open a fresh connection for every query. On any error an
    empty DataFrame is returned so callers can degrade gracefully.

    Args:
        db: an open pymysql connection (consumed and closed here).
        sql: the query string to execute.

    Returns:
        pd.DataFrame with the fetched rows, or an empty DataFrame on error.
    """
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        # Bug fix: the original printed the bare `Exception` class, which
        # carries no information about what actually went wrong.
        print("发生异常", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df


def get_data():
    """Build the ESMM train/validation feature sets and write them as CSVs.

    Pulls the last 60 days of esmm_train_data joined with user / cid side
    features, encodes every categorical value to a unique positive integer
    id, and shards the encoded rows to CSV via write_csv ("tr" = train,
    "va" = validation, where validation is the most recent stat_date).

    Returns:
        (validate_date, value_map): the latest stat_date string and the
        {raw_value: int} encoding, which get_predict reuses so that
        prediction rows are encoded consistently with training rows.
    """
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select max(stat_date) from esmm_train_data"
    validate_date = con_sql(db, sql)[0].values.tolist()[0]
    print("validate_date:" + validate_date)
    temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
    start = (temp - datetime.timedelta(days=60)).strftime("%Y-%m-%d")
    print(start)
    # con_sql closes the connection it is given, so open a fresh one.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \
          "from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
          "left join cid_time_cut cut on e.cid_id = cut.cid " \
          "where e.stat_date >= '{}'".format(start)
    df = con_sql(db, sql)
    # con_sql returns positional columns; give them their real names.
    df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1", 11: "l2",
                            12: "device_id", 13: "time"})
    print("esmm data ok")
    print("before")
    print(df.shape)
    df = df.drop_duplicates()
    # A second, keyed pass: rows identical on every feature column are one
    # sample even if y/z/device_id differ.
    df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                             "channel", "top", "l1", "l2", "time", "stat_date"])
    # Bug fix: "after" was printed before the deduplication actually ran.
    print("after")
    print(df.shape)

    unique_values = []
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for col in features:
        # Bug fix: fillna must run BEFORE astype("str") — once cast, missing
        # values are already the literal string "nan" and fillna is a no-op.
        df[col] = df[col].fillna("lost").astype("str")
        # Suffix each value with its column name so the same raw value in
        # two different columns maps to two distinct ids.
        df[col] = df[col] + col
        unique_values.extend(list(df[col].unique()))
    for col in ["l1", "l2"]:
        df[col] = df[col].fillna("lost").astype("str")
        # l1/l2 values share the "top" vocabulary, so they get that suffix.
        df[col] = df[col] + "top"
        unique_values.extend(list(df[col].unique()))

    print("features:")
    print(len(unique_values))
    print(df.head(2))

    # Encode every distinct value as a positive integer; 0 stays free for
    # unseen categories at predict time (see get_predict).
    value_map = dict(zip(unique_values, range(1, len(unique_values) + 1)))

    df = df.drop("device_id", axis=1)
    train = df
    # .copy() so that mapping columns below never writes through a view
    # of `train` (avoids SettingWithCopy hazards).
    test = df[df["stat_date"] == validate_date + "stat_date"].copy()
    for col in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "l1", "time", "stat_date", "l2"]:
        train[col] = train[col].map(value_map)
        test[col] = test[col].map(value_map)

    print("train shape")
    print(train.shape)
    print("test shape")
    print(test.shape)

    write_csv(train, "tr", 100000)
    write_csv(test, "va", 80000)

    return validate_date, value_map


def write_csv(df, name, n, base_path=None):
    """Shard `df` into CSV files of at most `n` rows each.

    Files are written as <base_path><name>/<name>_<row offset>.csv, e.g.
    tr/tr_0.csv, tr/tr_100000.csv, ...

    Args:
        df: the DataFrame to shard.
        name: subdirectory and file-name prefix for the shards.
        n: maximum rows per shard.
        base_path: output root; defaults to the module-level `path`
            (set in the __main__ block) for backward compatibility.
    """
    if base_path is None:
        base_path = path  # global set in __main__ — TODO confirm it always is
    for i in range(0, df.shape[0], n):
        # .iloc slicing clamps out-of-range ends, so the final partial
        # shard needs no special-casing.
        df.iloc[i:i + n].to_csv(base_path + name + "/{}_{}.csv".format(name, i), index=False)


def get_predict(date, value_map):
    """Build the prediction feature set and write native/nearby CSV shards.

    Mirrors the encoding done in get_data: the same column-name suffixes
    are applied, then values are mapped through `value_map` so prediction
    rows use the identical integer ids as training rows.

    Args:
        date: stat_date string stamped onto every prediction row (the
            validation date returned by get_data).
        value_map: {raw_value: int} encoding produced by get_data.
    """
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
          "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,e.cid_id,cut.time " \
          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
          "left join cid_type_top c on e.device_id = c.device_id " \
          "left join cid_level2 cl on e.cid_id = cl.cid " \
          "left join cid_time_cut cut on e.cid_id = cut.cid"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
                            6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "l1", 11: "l2",
                            12: "device_id", 13: "cid_id", 14: "time"})

    df["stat_date"] = date

    print("predict shape")
    print(df.shape)
    # Preserve the raw ids for the output join keys before encoding
    # overwrites the feature columns.
    df["uid"] = df["device_id"]
    df["city"] = df["ucity_id"]
    features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "time", "stat_date"]
    for col in features:
        # Bug fix: fillna must run BEFORE astype("str") — once cast, missing
        # values are already the literal string "nan" and fillna is a no-op.
        df[col] = df[col].fillna("lost").astype("str")
        # Same column-name suffixing as get_data, so value_map keys match.
        df[col] = df[col] + col
    for col in ["l1", "l2"]:
        df[col] = df[col].fillna("lost").astype("str")
        # l1/l2 values share the "top" vocabulary, so they get that suffix.
        df[col] = df[col] + "top"

    # label 0 = native queue, label 1 = nearby queue.
    native_pre = df[df["label"] == 0]
    native_pre = native_pre.drop("label", axis=1)
    nearby_pre = df[df["label"] == 1]
    nearby_pre = nearby_pre.drop("label", axis=1)

    for col in ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
                "channel", "top", "l1", "time", "stat_date", "l2"]:
        native_pre[col] = native_pre[col].map(value_map)
        # TODO categories unseen at training time map to NaN; fill with 0
        # for now — improve later.
        native_pre[col] = native_pre[col].fillna(0)

        nearby_pre[col] = nearby_pre[col].map(value_map)
        # TODO same unseen-category fallback as above.
        nearby_pre[col] = nearby_pre[col].fillna(0)

    print("native")
    print(native_pre.shape)
    print(native_pre.head())
    native_pre[["uid", "city", "cid_id"]].to_csv(path + "native.csv", index=False)
    write_csv(native_pre, "native", 200000)

    print("nearby")
    print(nearby_pre.shape)
    print(nearby_pre.head())
    nearby_pre[["uid", "city", "cid_id"]].to_csv(path + "nearby.csv", index=False)
    write_csv(nearby_pre, "nearby", 160000)


if __name__ == '__main__':
    # Output root for every CSV produced by write_csv / get_predict.
    # path = "/home/gmuser/esmm_data/"
    path = "/data/esmm/"
    validate_date, value_map = get_data()
    get_predict(validate_date, value_map)