Commit 42b731eb authored by 张彦钊

add test

parent 8b91bf35
...@@ -38,10 +38,6 @@ class multiFFMFormatPandas: ...@@ -38,10 +38,6 @@ class multiFFMFormatPandas:
self.y = None self.y = None
def fit(self, df, y=None): def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y self.y = y
df_ffm = df[df.columns.difference([self.y])] df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None: if self.field_index_ is None:
...@@ -53,24 +49,19 @@ class multiFFMFormatPandas: ...@@ -53,24 +49,19 @@ class multiFFMFormatPandas:
if self.feature_index_ is None: if self.feature_index_ is None:
self.feature_index_ = dict() self.feature_index_ = dict()
last_idx = 1 for col in df.columns:
l = list(df.columns) self.feature_index_[col] = 1
l.remove(y) last_idx = 1
for col in l: vals = list(df[col].unique())
if f[col]=="O": vals.append("na")
vals = df[col].unique() vals = set(vals)
for val in vals: for val in vals:
if pd.isnull(val): if pd.isnull(val):
continue continue
name = '{}_{}'.format(col, val) name = '{}_{}'.format(col, val)
if name not in self.feature_index_: if name not in self.feature_index_:
self.feature_index_[name] = last_idx self.feature_index_[name] = last_idx
last_idx += 1 last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
print("last_idx")
print(last_idx-1)
return self return self
def fit_transform(self, df, y=None,n=50000,processes=4): def fit_transform(self, df, y=None,n=50000,processes=4):
...@@ -183,12 +174,11 @@ def get_data(): ...@@ -183,12 +174,11 @@ def get_data():
features = features + len(df[i].unique()) features = features + len(df[i].unique())
print("fields:{}".format(df.shape[1]-1)) print("fields:{}".format(df.shape[1]-1))
print("features:{}".format(features)) print("features:{}".format(features))
ccity_name = list(set(df["ccity_name"].values.tolist())) filter_list = ["ccity_name","ucity_id","manufacturer","channel","level2_ids"]
ucity_id = list(set(df["ucity_id"].values.tolist())) column_map = dict()
manufacturer = list(set(df["manufacturer"].values.tolist())) for i in filter_list:
channel = list(set(df["channel"].values.tolist())) column_map[i] = list(set(df[i].values.tolist()))
level2_ids = list(set(df["level2_ids"].values.tolist())) return df,validate_date,column_map
return df,validate_date,ucity_id,ccity_name,manufacturer,channel,level2_ids
def transform(a,validate_date): def transform(a,validate_date):
...@@ -220,7 +210,7 @@ def transform(a,validate_date): ...@@ -220,7 +210,7 @@ def transform(a,validate_date):
return model return model
def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids): def get_predict_set(model,columns):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \ sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id " \ "u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id " \
...@@ -232,31 +222,10 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids): ...@@ -232,31 +222,10 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids", 6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
11:"device_id",12:"cid_id"}) 11:"device_id",12:"cid_id"})
print("before filter:")
print(df.shape)
df = df[df["ucity_id"].isin(ucity_id)]
print("after ucity filter:")
print(df.shape)
df = df[df["ccity_name"].isin(ccity_name)]
print("after ccity_name filter:")
print(df.shape)
df = df[df["manufacturer"].isin(manufacturer)]
print("after manufacturer filter:")
print(df.shape)
df = df[df["channel"].isin(channel)]
print("after channel filter:")
print(df.shape) print(df.shape)
for i in columns.keys():
df = df[df["level2_ids"].isin(level2_ids)] df.loc[~df["city_id"].isin(columns[i]), [i]] = "na"
print("after level2_ids filter:")
print(df.shape) print(df.shape)
df[(df["ucity_id"] == "beijing") & (df["top"] == 66)].to_csv(path + "top66.csv", sep="\t", index=False)
df["cid_id"] = df["cid_id"].astype("str") df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str") df["top"] = df["top"].astype("str")
...@@ -309,9 +278,9 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids): ...@@ -309,9 +278,9 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
if __name__ == "__main__": if __name__ == "__main__":
path = "/home/gmuser/esmm_data/" path = "/home/gmuser/esmm_data/"
a = time.time() a = time.time()
temp, validate_date, ucity_id,ccity_name,manufacturer,channel,level2_ids = get_data() temp, validate_date, column_map = get_data()
model = transform(temp, validate_date) model = transform(temp, validate_date)
get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids) get_predict_set(model,column_map)
b = time.time() b = time.time()
print("cost(分钟)") print("cost(分钟)")
print((b-a)/60) print((b-a)/60)
#! /bin/bash #! /bin/bash
cd /srv/apps/ffm-baseline/eda/esmm cd /srv/apps/ffm-baseline/eda/esmm
git checkout test
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm
DATA_PATH=/home/gmuser/esmm_data DATA_PATH=/home/gmuser/esmm_data
......
...@@ -38,10 +38,6 @@ class multiFFMFormatPandas: ...@@ -38,10 +38,6 @@ class multiFFMFormatPandas:
self.y = None self.y = None
def fit(self, df, y=None): def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y self.y = y
df_ffm = df[df.columns.difference([self.y])] df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None: if self.field_index_ is None:
...@@ -53,24 +49,17 @@ class multiFFMFormatPandas: ...@@ -53,24 +49,17 @@ class multiFFMFormatPandas:
if self.feature_index_ is None: if self.feature_index_ is None:
self.feature_index_ = dict() self.feature_index_ = dict()
last_idx = 1 for col in df.columns:
l = list(df.columns) self.feature_index_[col] = 1
l.remove(y) last_idx = 1
for col in l: vals = df[col].unique()
if f[col]=="O": for val in vals:
vals = df[col].unique() if pd.isnull(val):
for val in vals: continue
if pd.isnull(val): name = '{}_{}'.format(col, val)
continue if name not in self.feature_index_:
name = '{}_{}'.format(col, val) self.feature_index_[name] = last_idx
if name not in self.feature_index_: last_idx += 1
self.feature_index_[name] = last_idx
last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
print("last_idx")
print(last_idx-1)
return self return self
def fit_transform(self, df, y=None,n=50000,processes=4): def fit_transform(self, df, y=None,n=50000,processes=4):
...@@ -148,6 +137,7 @@ class multiFFMFormatPandas: ...@@ -148,6 +137,7 @@ class multiFFMFormatPandas:
return False return False
def get_data(): def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data" sql = "select max(stat_date) from esmm_train_data"
...@@ -256,8 +246,6 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids): ...@@ -256,8 +246,6 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
print("after level2_ids filter:") print("after level2_ids filter:")
print(df.shape) print(df.shape)
df[(df["ucity_id"] == "beijing") & (df["top"] == 66)].to_csv(path + "top66.csv", sep="\t", index=False)
df["cid_id"] = df["cid_id"].astype("str") df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str") df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str") df["top"] = df["top"].astype("str")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment