Commit 8998037a authored by 王志伟
parents 638d3d63 651f26a2
......@@ -56,11 +56,11 @@ currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "infer native..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=23 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log
echo "infer nearby..."
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=23 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log
echo "sort and 2sql"
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/sort_and_2sql.py
......
......@@ -69,13 +69,24 @@ object EsmmData {
if (max_stat_date_str != param.date){
val stat_date = param.date
println(stat_date)
// val imp_data = sc.sql(
// s"""
// |select distinct stat_date,device_id,city_id as ucity_id,
// | cid_id,diary_service_id
// |from data_feed_exposure
// |where cid_type = 'diary'
// |and stat_date ='${stat_date}'
// """.stripMargin
// )
val imp_data = sc.sql(
s"""
|select distinct stat_date,device_id,city_id as ucity_id,
| cid_id,diary_service_id
|select * from
|(select stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
|from data_feed_exposure
|where cid_type = 'diary'
|and stat_date ='${stat_date}'
|group by stat_date,device_id,city_id,cid_id,diary_service_id having count(*) > 1) a
""".stripMargin
)
// imp_data.show()
......@@ -262,7 +273,7 @@ object EsmmPredData {
import sc.implicits._
val yesteday_have_seq = GmeiConfig.getMinusNDate(1)
val yesteday_have_seq = GmeiConfig.getMinusNDate(7)
//nearby_data
val raw_data = sc.sql(
......@@ -273,7 +284,7 @@ object EsmmPredData {
|select device_id,city_id,native_queue as merge_queue from ffm_diary_queue
|union
|select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1
|where tmp1.device_id in (select distinct device_id from data_feed_click where stat_date='${yesteday_have_seq}')
|where tmp1.device_id in (select distinct device_id from data_feed_click where stat_date>'${yesteday_have_seq}')
""".stripMargin
)
raw_data.show()
......@@ -303,7 +314,7 @@ object EsmmPredData {
s"""
|select distinct a.device_id,a.city_id,b.native_queue from data_feed_click a
|left join biz_feed_diary_queue b on a.city_id = b.city_id
|where a.stat_date='${yesteday_have_seq}' and b.native_queue != ""
|where a.stat_date>'${yesteday_have_seq}' and b.native_queue != ""
""".stripMargin
)
native_data.createOrReplaceTempView("native_data")
......@@ -334,7 +345,7 @@ object EsmmPredData {
//join feat
val yesteday = yesteday_have_seq.replace("-","")
val yesteday = GmeiConfig.getMinusNDate(1).replace("-","")
val sid_data = sc.sql(
s"""
|select distinct
......
......@@ -138,7 +138,7 @@ class multiFFMFormatPandas:
def get_data():
db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
sql = "select max(stat_date) from esmm_train_data"
sql = "select max(stat_date) from esmm_train_test"
validate_date = con_sql(db, sql)[0].values.tolist()[0]
print("validate_date:" + validate_date)
temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
......@@ -149,7 +149,7 @@ def get_data():
"u.device_type,u.manufacturer,u.channel," \
"home.jingxuan,home.zhibo,home.nose,home.eyes,home.weizheng,home.teeth,home.lunkuo," \
"home.meifu,home.xizhi,home.zhifang,home.longxiong,home.simi,home.maofa,home.gongli,home.korea " \
"from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
"from esmm_train_test e left join user_feature u on e.device_id = u.device_id " \
"left join home_tab_click home on e.device_id = home.device_id " \
"where e.stat_date >= '{}'".format(start)
df = con_sql(db, sql)
......@@ -174,7 +174,7 @@ def get_data():
def transform(a,validate_date):
model = multiFFMFormatPandas()
df = model.fit_transform(a, y="y", n=160000, processes=26)
df = model.fit_transform(a, y="y", n=160000, processes=22)
df = pd.DataFrame(df)
df["stat_date"] = df[0].apply(lambda x: x.split(",")[0])
df["device_id"] = df[0].apply(lambda x: x.split(",")[1])
......@@ -194,8 +194,8 @@ def transform(a,validate_date):
test = test.drop("stat_date",axis=1)
# print("train shape")
# print(train.shape)
# train.to_csv(path + "train.csv", sep="\t", index=False)
# test.to_csv(path + "test.csv", sep="\t", index=False)
train.to_csv(path + "tr.csv", sep="\t", index=False)
test.to_csv(path + "va.csv", sep="\t", index=False)
return model
......@@ -245,20 +245,20 @@ def get_predict_set(ucity_id, cid,model):
native_pre = df[df["label"] == "0"]
native_pre = native_pre.drop("label", axis=1)
native_pre.to_csv(path+"native_pre.csv",sep="\t",index=False)
native_pre.to_csv(path+"native.csv",sep="\t",index=False)
# print("native_pre shape")
# print(native_pre.shape)
nearby_pre = df[df["label"] == "1"]
nearby_pre = nearby_pre.drop("label", axis=1)
nearby_pre.to_csv(path + "nearby_pre.csv", sep="\t", index=False)
nearby_pre.to_csv(path + "nearby.csv", sep="\t", index=False)
# print("nearby_pre shape")
# print(nearby_pre.shape)
if __name__ == "__main__":
path = "/home/gmuser/ffm/"
path = "/home/gaoyazhe/esmm/data/"
a = time.time()
df, validate_date, ucity_id, cid = get_data()
model = transform(df, validate_date)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment