Commit 13f4ccb4 authored by 张彦钊

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

change path
parents 1d4172ba 53c0bf7d
#!/usr/bin/env python
#coding=utf-8
from __future__ import absolute_import
@@ -26,8 +25,8 @@ tf.app.flags.DEFINE_integer("threads", 16, "threads num")
#User_Fileds = set(['101','109_14','110_14','127_14','150_14','121','122','124','125','126','127','128','129'])
#Ad_Fileds = set(['205','206','207','210','216'])
#Context_Fileds = set(['508','509','702','853','301'])
-#Common_Fileds = {'1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','10':'10','11':'11','12':'12','13':'13','14':'14','15':'15','16':'16','17':'17','18':'18','19':'19','20':'20','21':'21','22':'22','23':'23','24':'24','25':'25','26':'26','27':'27','28':'28','29':'29','30':'30'}
+Common_Fileds = {'1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','10':'10','11':'11','12':'12','13':'13','14':'14','15':'15','16':'16','17':'17','18':'18','19':'19','20':'20','21':'21','22':'22','23':'23'}
-Common_Fileds = {'1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','10':'10','11':'11'}
+#Common_Fileds = {'1':'1','2':'2','3':'3','4':'4','5':'5','6':'6','7':'7','8':'8','9':'9','10':'10','11':'11'}
UMH_Fileds = {'109_14':('u_cat','12'),'110_14':('u_shop','13'),'127_14':('u_brand','14'),'150_14':('u_int','15')} #user multi-hot feature
Ad_Fileds = {'206':('a_cat','16'),'207':('a_shop','17'),'210':('a_int','18'),'216':('a_brand','19')} #ad feature for DIN
...
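The active Common_Fileds map above now spans feature ids '1' through '23', which is what the --field_size=23 flag passed to training in run.sh further down has to agree with. A minimal sketch, assuming a libffm-style field:feature:value encoding and a hypothetical encode_common helper (not this repo's actual encoder), of how such a field map is typically consumed:

# Hypothetical sketch: turn {feature_id: feature_index} pairs into
# libffm-style "field:feature:1" terms via the Common_Fileds map.
Common_Fileds = {str(i): str(i) for i in range(1, 24)}  # '1'..'23', as in this commit

def encode_common(features):
    # features: dict mapping a feature id ('1'..'23') to a hashed feature index
    terms = []
    for fid, feat_idx in features.items():
        field = Common_Fileds.get(fid)
        if field is not None:
            terms.append("%s:%s:1" % (field, feat_idx))
    return " ".join(terms)

print(encode_common({"1": 17, "2": 4031}))  # -> "1:17:1 2:4031:1"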
#!/usr/bin/env python
#coding=utf-8
#from __future__ import absolute_import
@@ -346,7 +345,7 @@ def main(_):
        print("-"*100)
        with open(FLAGS.data_dir + "/pred.txt", "w") as fo:
            for prob in preds:
-                fo.write("%f\t%f\n" % (prob['pctr'], prob['pcvr']))
+                fo.write("%f\t%f\t%f\n" % (prob['pctr'], prob['pcvr'], prob['pctcvr']))
    elif FLAGS.task_type == 'export':
        print("Not Implemented, Do It Yourself!")
        #feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
...
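For context on the extra column: DeepCvrMTL is an ESMM-style multi-task model, and in ESMM the CTCVR head is the product of the CTR and CVR heads (pCTCVR = pCTR * pCVR). A small sanity-check sketch for the three-column pred.txt written above, assuming the file sits in the current directory:

# Sanity-check sketch: the third column should equal the product of the
# first two, up to the 6-decimal rounding that "%f" applies on write.
import pandas as pd

pred = pd.read_csv("pred.txt", sep="\t", header=None, names=["pctr", "pcvr", "pctcvr"])
assert ((pred["pctr"] * pred["pcvr"] - pred["pctcvr"]).abs() < 1e-4).all()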
-# -*- coding: utf-8 -*-
+#coding=utf-8
import smtplib
from email.mime.text import MIMEText
...
+#coding=utf-8
from sqlalchemy import create_engine
import pandas as pd
import pymysql
@@ -17,39 +19,30 @@ def con_sql(sql):
    return result

def set_join(lst):
-    return ','.join(set(lst))
+    return ','.join([str(i) for i in set(lst)])

def main():
sql = "select device_id,city_id,cid from esmm_data2ffm_infer_native"
result = con_sql(sql) # native queue
dct = {"uid":[],"city":[],"cid_id":[]} df2 = pd.read_csv('/home/gaoyazhe/data/native.csv',usecols=[0,1,2],header=0,names=['uid','city','cid_id'],sep='\t')
for i in result: df2['cid_id'] = df2['cid_id'].astype('object')
dct["uid"].append(i[0])
dct["city"].append(i[1]) df1 = pd.read_csv("/home/gaoyazhe/data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
dct["cid_id"].append(i[2]) df2["ctr"],df2["cvr"],df2["ctcvr"] = df1["ctr"],df1["cvr"],df1["ctcvr"]
df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
df1 = pd.read_csv("/home/gaoyazhe/data/native/pred.txt",sep='\t',header=None,names=["ctr","cvr"])
df2 = pd.DataFrame(dct)
df2["ctr"],df2["cvr"] = df1["ctr"],df1["cvr"]
df3 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="cvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
ctime = int(time.time()) ctime = int(time.time())
df3["time"] = ctime df3["time"] = ctime
df3.columns = ["device_id","city_id","native_queue","time"] df3.columns = ["device_id","city_id","native_queue","time"]
print("native_device_count",df3.shape) print("native_device_count",df3.shape)
sql_nearby = "select device_id,city_id,cid from esmm_data2ffm_infer_nearby" # nearby queue
result = con_sql(sql_nearby) df2 = pd.read_csv('/home/gaoyazhe/data/nearby.csv',usecols=[0,1,2],header=0,names=['uid','city','cid_id'],sep='\t')
dct = {"uid":[],"city":[],"cid_id":[]} df2['cid_id'] = df2['cid_id'].astype('object')
for i in result:
dct["uid"].append(i[0])
dct["city"].append(i[1])
dct["cid_id"].append(i[2])
df1 = pd.read_csv("/home/gaoyazhe/data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr"]) df1 = pd.read_csv("/home/gaoyazhe/data/nearby/pred.txt",sep='\t',header=None,names=["ctr","cvr","ctcvr"])
df2 = pd.DataFrame(dct) df2["ctr"], df2["cvr"], df2["ctcvr"] = df1["ctr"], df1["cvr"], df1["ctcvr"]
df2["ctr"],df2["cvr"] = df1["ctr"],df1["cvr"] df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="ctcvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
df4 = df2.groupby(by=["uid","city"]).apply(lambda x: x.sort_values(by="cvr",ascending=False)).reset_index(drop=True).groupby(by=["uid","city"]).agg({'cid_id':set_join}).reset_index(drop=False)
df4.columns = ["device_id","city_id","nearby_queue"] df4.columns = ["device_id","city_id","nearby_queue"]
print("nearby_device_count",df4.shape) print("nearby_device_count",df4.shape)
......
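Two details of the rewritten main() worth keeping in mind: the scores in pred.txt are attached to the candidate rows purely by position, so native.csv (and nearby.csv) must be in exactly the order the model saw at inference time; and set_join() routes the sorted cids through set(), which does not guarantee the ctcvr ordering survives into the joined string. A self-contained toy sketch of the rank-and-collapse step:

# Toy sketch of the rank-and-collapse step used for both queues.
import pandas as pd

def set_join(lst):
    return ','.join([str(i) for i in set(lst)])  # NB: set() may reorder

df = pd.DataFrame({"uid": ["u1", "u1", "u2"],
                   "city": ["bj", "bj", "sh"],
                   "cid_id": [101, 102, 103],
                   "ctcvr": [0.2, 0.5, 0.1]})
queue = (df.sort_values("ctcvr", ascending=False)
           .groupby(["uid", "city"])
           .agg({"cid_id": set_join})
           .reset_index())
print(queue)  # one comma-joined cid queue per (uid, city)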
@@ -15,11 +15,8 @@ rm ${DATA_PATH}/va/*
rm ${DATA_PATH}/native/*
rm ${DATA_PATH}/nearby/*
-echo "mysql to csv"
-mysql -u root -p3SYz54LS9#^9sBvC -h 10.66.157.22 -P 4000 -D jerry_test -e "select number,data from esmm_data2ffm_train" > ${DATA_PATH}/tr.csv
-mysql -u root -p3SYz54LS9#^9sBvC -h 10.66.157.22 -P 4000 -D jerry_test -e "select number,data from esmm_data2ffm_cv" > ${DATA_PATH}/va.csv
-mysql -u root -p3SYz54LS9#^9sBvC -h 10.66.157.22 -P 4000 -D jerry_test -e "select number,data from esmm_data2ffm_infer_native" > ${DATA_PATH}/native.csv
-mysql -u root -p3SYz54LS9#^9sBvC -h 10.66.157.22 -P 4000 -D jerry_test -e "select number,data from esmm_data2ffm_infer_nearby" > ${DATA_PATH}/nearby.csv
+echo "data2ffm"
+${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/data2ffm.py > ${DATA_PATH}/infer.log
echo "split data"
split -l $((`wc -l < ${DATA_PATH}/tr.csv`/15)) ${DATA_PATH}/tr.csv -d -a 4 ${DATA_PATH}/tr/tr_ --additional-suffix=.csv
@@ -50,7 +47,7 @@ currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "train..."
-${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir="${DATA_PATH}" --task_type="train"
+${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=23 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH} --task_type=train
echo "train time"
current=$(date "+%Y-%m-%d %H:%M:%S")
@@ -59,11 +56,11 @@ currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo $current
echo "infer native..."
-${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir="${DATA_PATH}/native" --task_type="infer" > ${DATA_PATH}/infer.log
+${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/native --task_type=infer > ${DATA_PATH}/infer.log
echo "infer nearby..."
-${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir="${DATA_PATH}/nearby" --task_type="infer" > ${DATA_PATH}/infer.log
+${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/DeepCvrMTL.py --ctr_task_wgt=0.3 --learning_rate=0.0001 --deep_layers=256,128 --dropout=0.8,0.5 --optimizer=Adam --num_epochs=1 --embedding_size=16 --batch_size=1024 --field_size=11 --feature_size=354332 --l2_reg=0.005 --log_steps=100 --num_threads=36 --model_dir=${DATA_PATH}/model_ckpt/DeepCvrMTL/ --data_dir=${DATA_PATH}/nearby --task_type=infer > ${DATA_PATH}/infer.log
echo "sort and 2sql"
${PYTHON_PATH} ${MODEL_PATH}/Model_pipline/sort_and_2sql.py
...
@@ -69,13 +69,24 @@ object EsmmData {
    if (max_stat_date_str != param.date){
      val stat_date = param.date
      println(stat_date)
+//      val imp_data = sc.sql(
+//        s"""
+//           |select distinct stat_date,device_id,city_id as ucity_id,
+//           |  cid_id,diary_service_id
+//           |from data_feed_exposure
+//           |where cid_type = 'diary'
+//           |and stat_date ='${stat_date}'
+//        """.stripMargin
+//      )
      val imp_data = sc.sql(
        s"""
-           |select distinct stat_date,device_id,city_id as ucity_id,
-           | cid_id,diary_service_id
+           |select * from
+           |(select stat_date,device_id,city_id as ucity_id,cid_id,diary_service_id
           |from data_feed_exposure
           |where cid_type = 'diary'
           |and stat_date ='${stat_date}'
+           |group by stat_date,device_id,city_id,cid_id,diary_service_id having count(*) > 1) a
        """.stripMargin
      )
      // imp_data.show()
@@ -200,7 +211,7 @@ object EsmmData {
      )
      // union_data_scity_id.createOrReplaceTempView("union_data_scity_id")
      union_data_scity_id.show()
-      GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id, table="esmm_train_data",SaveMode.Append)
+      GmeiConfig.writeToJDBCTable("jdbc:mysql://10.66.157.22:4000/jerry_test?user=root&password=3SYz54LS9#^9sBvC&rewriteBatchedStatements=true",union_data_scity_id, table="esmm_train_test",SaveMode.Append)
    } else {
      println("esmm_train_data already have param.date data")
...
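The reworked imp_data query changes semantics: instead of all distinct exposures it keeps only (stat_date, device_id, city_id, cid_id, diary_service_id) combinations that occur more than once on the day, i.e. repeated exposures. A pandas sketch of the same "group by ... having count(*) > 1" filter, with a hypothetical stand-in for data_feed_exposure:

# Pandas equivalent (sketch) of the new HAVING-count filter.
import pandas as pd

exposure = pd.DataFrame({  # hypothetical stand-in for data_feed_exposure
    "stat_date": ["d1", "d1", "d1"],
    "device_id": ["a", "a", "b"],
    "ucity_id": ["bj", "bj", "sh"],
    "cid_id": [1, 1, 2],
    "diary_service_id": [9, 9, 8],
})
keys = ["stat_date", "device_id", "ucity_id", "cid_id", "diary_service_id"]
counts = exposure.groupby(keys).size().reset_index(name="n")
imp = counts.loc[counts["n"] > 1, keys]  # only repeatedly exposed keys survive
print(imp)  # -> the (d1, a, bj, 1, 9) row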
@@ -76,23 +76,23 @@ object temp_analysis {
    agency_id.createOrReplaceTempView("agency_id")

-    // daily new users
-    val device_id_newUser = sc.sql(
-      s"""
-         |select distinct(device_id) as device_id
-         |from online.ml_device_day_active_status
-         |where active_type != '4'
-         |and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
-         |    ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
-         |    ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
-         |    ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
-         |    ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
-         |    ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
-         |    ,'promotion_shike','promotion_julang_jl03')
-         |and partition_date ='${partition_date}'
-       """.stripMargin
-    )
-    device_id_newUser.createOrReplaceTempView("device_id_new")
+    // // daily new users
+    // val device_id_newUser = sc.sql(
+    //   s"""
+    //      |select distinct(device_id) as device_id
+    //      |from online.ml_device_day_active_status
+    //      |where active_type != '4'
+    //      |and first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
+    //      |    ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
+    //      |    ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
+    //      |    ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
+    //      |    ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
+    //      |    ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
+    //      |    ,'promotion_shike','promotion_julang_jl03')
+    //      |and partition_date ='${partition_date}'
+    //    """.stripMargin
+    // )
+    // device_id_newUser.createOrReplaceTempView("device_id_new")

    val blacklist_id = sc.sql(
      s"""
@@ -108,16 +108,34 @@ object temp_analysis {
         |from agency_id
         |UNION ALL
         |select device_id
-         |from device_id_new
-         |UNION ALL
-         |select device_id
         |from blacklist_id
       """.stripMargin
    )
    final_id.createOrReplaceTempView("final_id")
+    val diary_clk_all = sc.sql(
+      s"""
+         |select ov.partition_date,count(ov.cl_id) as clk_num,count(distinct(ov.cl_id)),count(ov.cl_id)/count(distinct(ov.cl_id))
+         |from online.tl_hdfs_maidian_view ov left join final_id
+         |on ov.cl_id = final_id.device_id
+         |where ov.action = "page_view"
+         |and params['page_name']="diary_detail"
+         |and ov.cl_id != "NULL"
+         |and ov.partition_date >='20181201'
+         |and final_id.device_id is null
+         |group by ov.partition_date
+         |order by ov.partition_date
+      """.stripMargin
+    )
+    diary_clk_all.show(80)
    // diary clicks
-    val referrer=List("all_case_service_comment","all_cases","diary_detail","diary_list","diary_listof_related_service",
+    val referrer=List("about_me_message_list","all_case_service_comment","all_cases","diary_detail","diary_list"
+      ,"diary_listof_related_service","answer_detail","community_home","conversation_detail","create_diary_title","diary_listof_related_service",
+      "doctor_all_cases","hospital_all_cases","my_favor","my_order","order_detail","personal_store_diary_list","received_votes",
+      "topic_detail","welfare_detail","welfare_list","welfare_special","wiki_detail","zone_detail",
      "expert_detail","free_activity_detail","home","message_home","my_diary","organization_detail","other_homepage","question_detail",
      "search_result_diary","search_result_more","welfare_detail","zone_v3")
    for( a <- referrer ){
@@ -130,7 +148,7 @@ object temp_analysis {
         |and params['page_name']="diary_detail"
         |and params['referrer']='${a}'
         |and ov.cl_id != "NULL"
-         |and ov.partition_date >='20181101'
+         |and ov.partition_date >='20181201'
         |and final_id.device_id is null
         |group by ov.partition_date
         |order by ov.partition_date
@@ -141,6 +159,8 @@ object temp_analysis {
    }

    // 5. login user count
    val log_device_temp = sc.sql(
      s"""
...
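The diary_clk_all query added above reports, per day, total diary_detail page views, distinct clicking devices, and their ratio (average clicks per device), excluding agency and blacklist devices via the left join plus "final_id.device_id is null" anti-join. The same anti-join pattern in a pandas sketch, with hypothetical maidian and final_id frames:

# Sketch of the left anti-join used above: keep click rows whose device is
# NOT in final_id, then compute daily clicks per distinct device.
import pandas as pd

maidian = pd.DataFrame({"partition_date": ["20181201"] * 3,
                        "cl_id": ["d1", "d1", "d2"]})  # hypothetical click log
final_id = pd.DataFrame({"device_id": ["d2"]})         # devices to exclude

m = maidian.merge(final_id, left_on="cl_id", right_on="device_id", how="left")
clicks = m[m["device_id"].isna()]                      # anti-join: unmatched only
daily = clicks.groupby("partition_date")["cl_id"].agg(clk_num="size", devices="nunique")
daily["clk_per_device"] = daily["clk_num"] / daily["devices"]
print(daily)  # 20181201: clk_num=2, devices=1, clk_per_device=2.0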
@@ -399,3 +399,4 @@ object testt {
}