1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import pymysql
def con_sql(db,sql):
cursor = db.cursor()
try:
cursor.execute(sql)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
except Exception:
print("发生异常", Exception)
df = pd.DataFrame()
finally:
db.close()
return df
def exp():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select manufacturer,channel from user_feature"
df = con_sql(db, sql)
n = df.shape[0]
manufacturer = df[0].unique()
manufacturer_map = {}
print("manufacturer unique")
print(len(manufacturer))
for i in manufacturer:
manufacturer_map[i] = df.loc[df[0]==i].shape[0]/n
print(sorted(manufacturer_map.items(),key = lambda x:x[1]))
channel = df[1].unique()
channel_map = {}
print("channel unique")
print(len(channel))
for i in channel:
channel_map[i] = df.loc[df[1] == i].shape[0] / n
print(sorted(channel_map.items(), key=lambda x: x[1]))
def clean():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select device_id,device_type,manufacturer,channel,city_id from user_feature"
df = con_sql(db, sql)
df = df.rename(columns={0: "device_id",1: "device_type", 2: "manufacturer", 3: "channel", 4: "city_id"})
n = df.shape[0]
manufacturer = df["manufacturer"].unique()
for i in manufacturer:
if df.loc[df["manufacturer"]==i].shape[0]/n < 0.0005:
df.loc[df["manufacturer"] == i,["manufacturer"]] = "other"
channel = df["channel"].unique()
for i in channel:
if df.loc[df["channel"] == i].shape[0] / n < 0.0001:
df.loc[df["channel"] == i, ["channel"]] = "other"
from sqlalchemy import create_engine
yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
n = 200000
for i in range(0,df.shape[0],n):
print(i)
if i == 0:
temp = df.loc[0:n]
elif i+n > df.shape[0]:
temp = df.loc[i+1:]
else:
temp = df.loc[i+1:i+n]
pd.io.sql.to_sql(temp, "user_feature_clean", yconnect, schema='jerry_test', if_exists='append', index=False)
print("insert done")
if __name__ == "__main__":
clean()