import pandas as pd import pymysql def con_sql(db,sql): cursor = db.cursor() try: cursor.execute(sql) result = cursor.fetchall() df = pd.DataFrame(list(result)) except Exception: print("发生异常", Exception) df = pd.DataFrame() finally: db.close() return df def exp(): db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') sql = "select manufacturer,channel from user_feature" df = con_sql(db, sql) n = df.shape[0] manufacturer = df[0].unique() manufacturer_map = {} print("manufacturer unique") print(len(manufacturer)) for i in manufacturer: manufacturer_map[i] = df.loc[df[0]==i].shape[0]/n print(sorted(manufacturer_map.items(),key = lambda x:x[1])) channel = df[1].unique() channel_map = {} print("channel unique") print(len(channel)) for i in channel: channel_map[i] = df.loc[df[1] == i].shape[0] / n print(sorted(channel_map.items(), key=lambda x: x[1])) def clean(): db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') sql = "select device_id,device_type,manufacturer,channel,city_id from user_feature" df = con_sql(db, sql) df = df.rename(columns={0: "device_id",1: "device_type", 2: "manufacturer", 3: "channel", 4: "city_id"}) n = df.shape[0] manufacturer = df["manufacturer"].unique() for i in manufacturer: if df.loc[df["manufacturer"]==i].shape[0]/n < 0.0005: df.loc[df["manufacturer"] == i,["manufacturer"]] = "other" channel = df["channel"].unique() for i in channel: if df.loc[df["channel"] == i].shape[0] / n < 0.0001: df.loc[df["channel"] == i, ["channel"]] = "other" from sqlalchemy import create_engine yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8') n = 200000 for i in range(0,df.shape[0],n): print(i) if i == 0: temp = df.loc[0:n] elif i+n > df.shape[0]: temp = df.loc[i+1:] else: temp = df.loc[i+1:i+n] pd.io.sql.to_sql(temp, "user_feature_clean", yconnect, schema='jerry_test', if_exists='append', index=False) print("insert done") if __name__ == "__main__": clean()