exp_channel.py 2.35 KB
Newer Older
张彦钊's avatar
张彦钊 committed
1 2 3
import pandas as pd
import pymysql

张彦钊's avatar
张彦钊 committed
4

张彦钊's avatar
张彦钊 committed
5 6 7 8 9 10 11 12 13 14 15 16 17
def con_sql(db,sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception:
        print("发生异常", Exception)
        df = pd.DataFrame()
    finally:
        db.close()
    return df

张彦钊's avatar
张彦钊 committed
18

张彦钊's avatar
张彦钊 committed
19 20 21 22
def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select manufacturer,channel from user_feature"
    df = con_sql(db, sql)
张彦钊's avatar
张彦钊 committed
23 24 25
    n = df.shape[0]
    manufacturer = df[0].unique()
    manufacturer_map = {}
张彦钊's avatar
张彦钊 committed
26
    print("manufacturer unique")
张彦钊's avatar
张彦钊 committed
27 28 29 30 31 32 33 34 35 36 37 38 39
    print(len(manufacturer))
    for i in manufacturer:
        manufacturer_map[i] = df.loc[df[0]==i].shape[0]/n
    print(sorted(manufacturer_map.items(),key = lambda x:x[1]))
    channel = df[1].unique()
    channel_map = {}
    print("channel unique")
    print(len(channel))
    for i in channel:
        channel_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(channel_map.items(), key=lambda x: x[1]))


张彦钊's avatar
张彦钊 committed
40 41 42 43 44 45 46 47 48 49 50 51 52 53
def clean():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,device_type,manufacturer,channel,city_id from user_feature"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "device_id",1: "device_type", 2: "manufacturer", 3: "channel", 4: "city_id"})
    n = df.shape[0]
    manufacturer = df["manufacturer"].unique()
    for i in manufacturer:
       if df.loc[df["manufacturer"]==i].shape[0]/n < 0.0005:
           df.loc[df["manufacturer"] == i,["manufacturer"]] = "other"
    channel = df["channel"].unique()
    for i in channel:
        if df.loc[df["channel"] == i].shape[0] / n < 0.0001:
            df.loc[df["channel"] == i, ["channel"]] = "other"
张彦钊's avatar
张彦钊 committed
54 55 56 57 58 59 60 61 62 63 64 65 66 67
    from sqlalchemy import create_engine
    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    n = 200000
    for i in range(0,df.shape[0],n):
        print(i)
        if i == 0:
            temp = df.loc[0:n]
        elif i+n > df.shape[0]:
            temp = df.loc[i+1:]
        else:
            temp = df.loc[i+1:i+n]
        pd.io.sql.to_sql(temp, "user_feature_clean", yconnect, schema='jerry_test', if_exists='append', index=False)
        print("insert done")

张彦钊's avatar
张彦钊 committed
68

张彦钊's avatar
张彦钊 committed
69
if __name__ == "__main__":
张彦钊's avatar
张彦钊 committed
70
    clean()
张彦钊's avatar
张彦钊 committed
71 72