Commit 8eca148c authored by 张彦钊

增加保存到csv

parent 280a03ce
...@@ -24,16 +24,16 @@ def get_data(): ...@@ -24,16 +24,16 @@ def get_data():
esmm = esmm.rename(columns={0:"stat_date",1: "device_id",2:"ucity_id",3:"cid_id",4:"diary_service_id",5:"y", esmm = esmm.rename(columns={0:"stat_date",1: "device_id",2:"ucity_id",3:"cid_id",4:"diary_service_id",5:"y",
6:"z",7:"clevel1_id",8:"slevel1_id"}) 6:"z",7:"clevel1_id",8:"slevel1_id"})
print("esmm data ok") print("esmm data ok")
print(esmm.head()) print(esmm.shape)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
sql = "select * from home_tab_click" sql = "select * from home_tab_click"
temp = con_sql(db,sql) temp = con_sql(db,sql)
temp = temp.rename(columns={0: "device_id"}) temp = temp.rename(columns={0: "device_id"})
print("click data ok") print("click data ok")
print(temp.head()) # print(temp.head())
df = pd.merge(esmm,temp,on = "device_id").dropna() df = pd.merge(esmm,temp,on = "device_id",how='left').fillna(0)
print(df.head()) print(df.shape)
df["diary_service_id"] = df["diary_service_id"].astype("str") df["diary_service_id"] = df["diary_service_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
...@@ -46,9 +46,9 @@ def get_data(): ...@@ -46,9 +46,9 @@ def get_data():
df = df.drop("z", axis=1) df = df.drop("z", axis=1)
print(df.head()) print(df.head())
train = df[df["stat_date"] != "2018-11-25"] train = df[df["stat_date"] != "2018-11-25"]
transform(train,"train") transform(train,"crvtrain.csv")
test = df[df["stat_date"] == "2018-11-25"] test = df[df["stat_date"] == "2018-11-25"]
transform(test, "test") transform(test, "crvtest.csv")
def transform(df,table): def transform(df,table):
...@@ -56,28 +56,30 @@ def transform(df,table): ...@@ -56,28 +56,30 @@ def transform(df,table):
df = model.fit_transform(df, y="y", n=50000, processes=20) df = model.fit_transform(df, y="y", n=50000, processes=20)
df = pd.DataFrame(df) df = pd.DataFrame(df)
df["device_id"] = df[0].apply(lambda x: x.split(",")[0]) df["device_id"] = df[0].apply(lambda x: x.split(",")[0])
df["ucity_id"] = df[0].apply(lambda x: x.split(",")[1]) df["city_id"] = df[0].apply(lambda x: x.split(",")[1])
df["cid_id"] = df[0].apply(lambda x: x.split(",")[2]) df["diary_id"] = df[0].apply(lambda x: x.split(",")[2])
df["y"] = df[0].apply(lambda x: x.split(",")[3]) # df["y"] = df[0].apply(lambda x: x.split(",")[3])
df["ffm"] = df[0].apply(lambda x: x.split(",")[4])
df["seq"] = list(range(df.shape[0])) df["seq"] = list(range(df.shape[0]))
df["seq"] = df["seq"].astype("str") df["seq"] = df["seq"].astype("str")
df["ffm"] = df["seq"].str.cat([df["y"].values.tolist(), df["ffm"].values.tolist()], sep=",") df["ffm"] = df[0].apply(lambda x: x.split(",")[3:])
df["number"] = np.random.randint(1, 2147483647, df.shape[0]) df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df = df.drop(0, axis=1) df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop(0, axis=1).drop("seq",axis=1)
print("size") print("size")
print(df.shape) print(df.shape)
yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8') df.to_csv(path+table,index=None)
for i in range(0,df.shape[0],10000): # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
print(i) # n = 100000
if i == 0: # for i in range(0,df.shape[0],n):
temp = df.loc[0:10000] # print(i)
elif i+10000 > df.shape[0]: # if i == 0:
temp = df.loc[i:] # temp = df.loc[0:n]
else: # elif i+n > df.shape[0]:
temp = df.loc[i+1:i+10000] # temp = df.loc[i+1:]
pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False) # else:
print("insert done") # temp = df.loc[i+1:i+n]
# pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
# print("insert done")
...@@ -123,11 +125,6 @@ class multiFFMFormatPandas: ...@@ -123,11 +125,6 @@ class multiFFMFormatPandas:
def transform_row_(self, row, t): def transform_row_(self, row, t):
ffm = [] ffm = []
if self.y is not None:
ffm.append(str(row.loc[row.index == self.y][0]))
if self.y is None:
ffm.append(str(0))
for col, val in row.loc[row.index != self.y].to_dict().items(): for col, val in row.loc[row.index != self.y].to_dict().items():
col_type = t[col] col_type = t[col]
name = '{}_{}'.format(col, val) name = '{}_{}'.format(col, val)
...@@ -135,7 +132,12 @@ class multiFFMFormatPandas: ...@@ -135,7 +132,12 @@ class multiFFMFormatPandas:
ffm.append('{}:{}:1'.format(self.field_index_[col]+1, self.feature_index_[name]+1)) ffm.append('{}:{}:1'.format(self.field_index_[col]+1, self.feature_index_[name]+1))
elif col_type.kind == 'i': elif col_type.kind == 'i':
ffm.append('{}:{}:{}'.format(self.field_index_[col]+1, self.feature_index_[col]+1, val)) ffm.append('{}:{}:{}'.format(self.field_index_[col]+1, self.feature_index_[col]+1, val))
return ' '.join(ffm) result = ' '.join(ffm)
if self.y is not None:
result = str(row.loc[row.index == self.y][0]) + "," + result
if self.y is None:
result = str(0) + "," + result
return result
def transform(self, df,n=1500,processes=2): def transform(self, df,n=1500,processes=2):
# n是每个线程运行最大的数据条数,processes是线程数 # n是每个线程运行最大的数据条数,processes是线程数
...@@ -172,18 +174,6 @@ class multiFFMFormatPandas: ...@@ -172,18 +174,6 @@ class multiFFMFormatPandas:
data_list.append(data.iloc[x:data.__len__()]) data_list.append(data.iloc[x:data.__len__()])
break break
'''
# 返回生成器方法,但是本地测试效率不高
x = 0
while True:
if x + step < data.__len__():
yield data.iloc[x:x + step]
x = x + step + 1
else:
yield data.iloc[x:data.__len__()]
break
'''
return data_list return data_list
# 原生转化方法,不需要多进程 # 原生转化方法,不需要多进程
...@@ -201,4 +191,5 @@ class multiFFMFormatPandas: ...@@ -201,4 +191,5 @@ class multiFFMFormatPandas:
if __name__ == "__main__": if __name__ == "__main__":
path = "/data/ffm/"
get_data() get_data()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment