Commit 8eca148c authored by 张彦钊

增加保存到csv

parent 280a03ce
......@@ -24,16 +24,16 @@ def get_data():
esmm = esmm.rename(columns={0:"stat_date",1: "device_id",2:"ucity_id",3:"cid_id",4:"diary_service_id",5:"y",
6:"z",7:"clevel1_id",8:"slevel1_id"})
print("esmm data ok")
print(esmm.head())
print(esmm.shape)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
sql = "select * from home_tab_click"
temp = con_sql(db,sql)
temp = temp.rename(columns={0: "device_id"})
print("click data ok")
print(temp.head())
df = pd.merge(esmm,temp,on = "device_id").dropna()
# print(temp.head())
df = pd.merge(esmm,temp,on = "device_id",how='left').fillna(0)
print(df.head())
print(df.shape)
df["diary_service_id"] = df["diary_service_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
......@@ -46,9 +46,9 @@ def get_data():
df = df.drop("z", axis=1)
print(df.head())
train = df[df["stat_date"] != "2018-11-25"]
transform(train,"train")
transform(train,"crvtrain.csv")
test = df[df["stat_date"] == "2018-11-25"]
transform(test, "test")
transform(test, "crvtest.csv")
def transform(df,table):
......@@ -56,28 +56,30 @@ def transform(df,table):
df = model.fit_transform(df, y="y", n=50000, processes=20)
df = pd.DataFrame(df)
df["device_id"] = df[0].apply(lambda x: x.split(",")[0])
df["ucity_id"] = df[0].apply(lambda x: x.split(",")[1])
df["cid_id"] = df[0].apply(lambda x: x.split(",")[2])
df["y"] = df[0].apply(lambda x: x.split(",")[3])
df["ffm"] = df[0].apply(lambda x: x.split(",")[4])
df["city_id"] = df[0].apply(lambda x: x.split(",")[1])
df["diary_id"] = df[0].apply(lambda x: x.split(",")[2])
# df["y"] = df[0].apply(lambda x: x.split(",")[3])
df["seq"] = list(range(df.shape[0]))
df["seq"] = df["seq"].astype("str")
df["ffm"] = df["seq"].str.cat([df["y"].values.tolist(), df["ffm"].values.tolist()], sep=",")
df["number"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop(0, axis=1)
df["ffm"] = df[0].apply(lambda x: x.split(",")[3:])
df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
df["random"] = np.random.randint(1, 2147483647, df.shape[0])
df = df.drop(0, axis=1).drop("seq",axis=1)
print("size")
print(df.shape)
yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
for i in range(0,df.shape[0],10000):
print(i)
if i == 0:
temp = df.loc[0:10000]
elif i+10000 > df.shape[0]:
temp = df.loc[i:]
else:
temp = df.loc[i+1:i+10000]
pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
print("insert done")
df.to_csv(path+table,index=None)
# yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
# n = 100000
# for i in range(0,df.shape[0],n):
# print(i)
# if i == 0:
# temp = df.loc[0:n]
# elif i+n > df.shape[0]:
# temp = df.loc[i+1:]
# else:
# temp = df.loc[i+1:i+n]
# pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
# print("insert done")
......@@ -123,11 +125,6 @@ class multiFFMFormatPandas:
def transform_row_(self, row, t):
ffm = []
if self.y is not None:
ffm.append(str(row.loc[row.index == self.y][0]))
if self.y is None:
ffm.append(str(0))
for col, val in row.loc[row.index != self.y].to_dict().items():
col_type = t[col]
name = '{}_{}'.format(col, val)
......@@ -135,7 +132,12 @@ class multiFFMFormatPandas:
ffm.append('{}:{}:1'.format(self.field_index_[col]+1, self.feature_index_[name]+1))
elif col_type.kind == 'i':
ffm.append('{}:{}:{}'.format(self.field_index_[col]+1, self.feature_index_[col]+1, val))
return ' '.join(ffm)
result = ' '.join(ffm)
if self.y is not None:
result = str(row.loc[row.index == self.y][0]) + "," + result
if self.y is None:
result = str(0) + "," + result
return result
def transform(self, df,n=1500,processes=2):
# n是每个线程运行最大的数据条数,processes是线程数
......@@ -172,18 +174,6 @@ class multiFFMFormatPandas:
data_list.append(data.iloc[x:data.__len__()])
break
'''
# 返回生成器方法,但是本地测试效率不高
x = 0
while True:
if x + step < data.__len__():
yield data.iloc[x:x + step]
x = x + step + 1
else:
yield data.iloc[x:data.__len__()]
break
'''
return data_list
# 原生转化方法,不需要多进程
......@@ -201,4 +191,5 @@ class multiFFMFormatPandas:
if __name__ == "__main__":
path = "/data/ffm/"
get_data()
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment