Commit 64803aa2 authored by 张彦钊's avatar 张彦钊

把esmm训练集改成train_data

parent e4d0eb0f
...@@ -20,20 +20,20 @@ def con_sql(db,sql): ...@@ -20,20 +20,20 @@ def con_sql(db,sql):
def get_data(): def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data" sql = "select max(stat_date) from {}".format(train_data_set)
validate_date = con_sql(db, sql)[0].values.tolist()[0] validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:" + validate_date) print("validate_date:" + validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d") temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
start = (temp - datetime.timedelta(days=60)).strftime("%Y-%m-%d") start = (temp - datetime.timedelta(days=30)).strftime("%Y-%m-%d")
print(start) print(start)
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test') db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \ sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
"u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \ "u.device_type,u.manufacturer,u.channel,c.top,cl.l1,cl.l2,e.device_id,cut.time " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \ "from {} e left join user_feature u on e.device_id = u.device_id " \
"left join cid_type_top c on e.device_id = c.device_id " \ "left join cid_type_top c on e.device_id = c.device_id " \
"left join cid_level2 cl on e.cid_id = cl.cid " \ "left join cid_level2 cl on e.cid_id = cl.cid " \
"left join cid_time_cut cut on e.cid_id = cut.cid " \ "left join cid_time_cut cut on e.cid_id = cut.cid " \
"where e.stat_date >= '{}'".format(start) "where e.stat_date >= '{}'".format(train_data_set,start)
df = con_sql(db, sql) df = con_sql(db, sql)
# print(df.shape) # print(df.shape)
df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name", df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
...@@ -48,6 +48,12 @@ def get_data(): ...@@ -48,6 +48,12 @@ def get_data():
df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", df = df.drop_duplicates(["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
"channel", "top", "l1","l2", "time", "stat_date"]) "channel", "top", "l1","l2", "time", "stat_date"])
print(df.shape) print(df.shape)
print("exp numbers:")
print(df[df["y"] == 0].shape)
print("click numbers")
print(df[(df["y"] == 1)&(df["z"] == 0)].shape)
print("buy numbers")
print(df[(df["y"] == 1) & (df["z"] == 1)].shape)
unique_values = [] unique_values = []
features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer", features = ["ucity_id", "clevel1_id", "ccity_name", "device_type", "manufacturer",
...@@ -163,7 +169,7 @@ def get_predict(date,value_map): ...@@ -163,7 +169,7 @@ def get_predict(date,value_map):
if __name__ == '__main__': if __name__ == '__main__':
# path = "/home/gmuser/esmm_data/" train_data_set = "train_data"
path = "/data/esmm/" path = "/data/esmm/"
date,value = get_data() date,value = get_data()
get_predict(date, value) get_predict(date, value)
......
...@@ -32,7 +32,7 @@ rm ${DATA_PATH}/nearby/nearby_* ...@@ -32,7 +32,7 @@ rm ${DATA_PATH}/nearby/nearby_*
echo "train..." echo "train..."
${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=2 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train ${PYTHON_PATH} ${MODEL_PATH}/train.py --ctr_task_wgt=0.9 --learning_rate=0.0001 --deep_layers=512,256,128,64,32 --dropout=0.3,0.3,0.3,0.3,0.3 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=1460 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train
echo "infer native..." echo "infer native..."
......
...@@ -66,7 +66,6 @@ def main(): ...@@ -66,7 +66,6 @@ def main():
print("union_device_count",df_all.shape) print("union_device_count",df_all.shape)
host='10.66.157.22' host='10.66.157.22'
port=4000 port=4000
user='root' user='root'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment