Commit 42b731eb authored by 张彦钊

add test

parent 8b91bf35
......@@ -38,10 +38,6 @@ class multiFFMFormatPandas:
self.y = None
def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
......@@ -53,24 +49,19 @@ class multiFFMFormatPandas:
if self.feature_index_ is None:
self.feature_index_ = dict()
last_idx = 1
l = list(df.columns)
l.remove(y)
for col in l:
if f[col]=="O":
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
print("last_idx")
print(last_idx-1)
for col in df.columns:
self.feature_index_[col] = 1
last_idx = 1
vals = list(df[col].unique())
vals.append("na")
vals = set(vals)
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
return self
def fit_transform(self, df, y=None,n=50000,processes=4):
......@@ -183,12 +174,11 @@ def get_data():
features = features + len(df[i].unique())
print("fields:{}".format(df.shape[1]-1))
print("features:{}".format(features))
ccity_name = list(set(df["ccity_name"].values.tolist()))
ucity_id = list(set(df["ucity_id"].values.tolist()))
manufacturer = list(set(df["manufacturer"].values.tolist()))
channel = list(set(df["channel"].values.tolist()))
level2_ids = list(set(df["level2_ids"].values.tolist()))
return df,validate_date,ucity_id,ccity_name,manufacturer,channel,level2_ids
filter_list = ["ccity_name","ucity_id","manufacturer","channel","level2_ids"]
column_map = dict()
for i in filter_list:
column_map[i] = list(set(df[i].values.tolist()))
return df,validate_date,column_map
def transform(a,validate_date):
......@@ -220,7 +210,7 @@ def transform(a,validate_date):
return model
def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
def get_predict_set(model,columns):
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,df.level2_ids,e.device_id,e.cid_id " \
......@@ -232,31 +222,10 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "level2_ids",
11:"device_id",12:"cid_id"})
print("before filter:")
print(df.shape)
df = df[df["ucity_id"].isin(ucity_id)]
print("after ucity filter:")
print(df.shape)
df = df[df["ccity_name"].isin(ccity_name)]
print("after ccity_name filter:")
print(df.shape)
df = df[df["manufacturer"].isin(manufacturer)]
print("after manufacturer filter:")
print(df.shape)
df = df[df["channel"].isin(channel)]
print("after channel filter:")
print(df.shape)
df = df[df["level2_ids"].isin(level2_ids)]
print("after level2_ids filter:")
# Replace values that never appeared in the training data with the "na"
# placeholder, one column at a time. `columns` maps a column name to the list
# of values collected for that column by get_data().
for i in columns:
    # Each column must be checked against its OWN list of known values.
    # The previous version always tested df["city_id"], which is not one of
    # the filtered columns, so every column was masked by the same test.
    df.loc[~df[i].isin(columns[i]), i] = "na"
print(df.shape)
df[(df["ucity_id"] == "beijing") & (df["top"] == 66)].to_csv(path + "top66.csv", sep="\t", index=False)
df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str")
......@@ -309,9 +278,9 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
if __name__ == "__main__":
path = "/home/gmuser/esmm_data/"
a = time.time()
temp, validate_date, ucity_id,ccity_name,manufacturer,channel,level2_ids = get_data()
temp, validate_date, column_map = get_data()
model = transform(temp, validate_date)
get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids)
get_predict_set(model,column_map)
b = time.time()
print("cost(分钟)")
print((b-a)/60)
#! /bin/bash
cd /srv/apps/ffm-baseline/eda/esmm
git checkout test
PYTHON_PATH=/home/gaoyazhe/miniconda3/bin/python
MODEL_PATH=/srv/apps/ffm-baseline/eda/esmm
DATA_PATH=/home/gmuser/esmm_data
......
......@@ -38,10 +38,6 @@ class multiFFMFormatPandas:
self.y = None
def fit(self, df, y=None):
b = df.dtypes
c = list(b.values)
d =tuple(df.dtypes.to_dict())
f = dict(zip(d,c))
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
......@@ -53,24 +49,17 @@ class multiFFMFormatPandas:
if self.feature_index_ is None:
self.feature_index_ = dict()
last_idx = 1
l = list(df.columns)
l.remove(y)
for col in l:
if f[col]=="O":
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
else:
self.feature_index_[col] = last_idx
last_idx += 1
print("last_idx")
print(last_idx-1)
for col in df.columns:
self.feature_index_[col] = 1
last_idx = 1
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
return self
def fit_transform(self, df, y=None,n=50000,processes=4):
......@@ -148,6 +137,7 @@ class multiFFMFormatPandas:
return False
def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data"
......@@ -256,8 +246,6 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel,level2_ids):
print("after level2_ids filter:")
print(df.shape)
df[(df["ucity_id"] == "beijing") & (df["top"] == 66)].to_csv(path + "top66.csv", sep="\t", index=False)
df["cid_id"] = df["cid_id"].astype("str")
df["clevel1_id"] = df["clevel1_id"].astype("str")
df["top"] = df["top"].astype("str")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment