Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
8998037a
Commit
8998037a
authored
Dec 19, 2018
by
王志伟
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master' of
http://git.wanmeizhensuo.com/ML/ffm-baseline
parents
638d3d63
651f26a2
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
16 deletions
+27
-16
submit.sh
eda/esmm/Model_pipline/submit.sh
+2
-2
EsmmData.scala
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
+17
-6
ffm.py
tensnsorflow/ffm.py
+8
-8
No files found.
eda/esmm/Model_pipline/submit.sh
View file @
8998037a
...
...
@@ -56,11 +56,11 @@ currentTimeStamp=$((timeStamp*1000+`date "+%N"`/1000000))
echo
$current
echo
"infer native..."
${
PYTHON_PATH
}
${
MODEL_PATH
}
/Model_pipline/DeepCvrMTL.py
--ctr_task_wgt
=
0.3
--learning_rate
=
0.0001
--deep_layers
=
256,128
--dropout
=
0.8,0.5
--optimizer
=
Adam
--num_epochs
=
1
--embedding_size
=
16
--batch_size
=
1024
--field_size
=
11
--feature_size
=
354332
--l2_reg
=
0.005
--log_steps
=
100
--num_threads
=
36
--model_dir
=
${
DATA_PATH
}
/model_ckpt/DeepCvrMTL/
--data_dir
=
${
DATA_PATH
}
/native
--task_type
=
infer
>
${
DATA_PATH
}
/infer.log
${
PYTHON_PATH
}
${
MODEL_PATH
}
/Model_pipline/DeepCvrMTL.py
--ctr_task_wgt
=
0.3
--learning_rate
=
0.0001
--deep_layers
=
256,128
--dropout
=
0.8,0.5
--optimizer
=
Adam
--num_epochs
=
1
--embedding_size
=
16
--batch_size
=
1024
--field_size
=
23
--feature_size
=
354332
--l2_reg
=
0.005
--log_steps
=
100
--num_threads
=
36
--model_dir
=
${
DATA_PATH
}
/model_ckpt/DeepCvrMTL/
--data_dir
=
${
DATA_PATH
}
/native
--task_type
=
infer
>
${
DATA_PATH
}
/infer.log
echo
"infer nearby..."
${
PYTHON_PATH
}
${
MODEL_PATH
}
/Model_pipline/DeepCvrMTL.py
--ctr_task_wgt
=
0.3
--learning_rate
=
0.0001
--deep_layers
=
256,128
--dropout
=
0.8,0.5
--optimizer
=
Adam
--num_epochs
=
1
--embedding_size
=
16
--batch_size
=
1024
--field_size
=
11
--feature_size
=
354332
--l2_reg
=
0.005
--log_steps
=
100
--num_threads
=
36
--model_dir
=
${
DATA_PATH
}
/model_ckpt/DeepCvrMTL/
--data_dir
=
${
DATA_PATH
}
/nearby
--task_type
=
infer
>
${
DATA_PATH
}
/infer.log
${
PYTHON_PATH
}
${
MODEL_PATH
}
/Model_pipline/DeepCvrMTL.py
--ctr_task_wgt
=
0.3
--learning_rate
=
0.0001
--deep_layers
=
256,128
--dropout
=
0.8,0.5
--optimizer
=
Adam
--num_epochs
=
1
--embedding_size
=
16
--batch_size
=
1024
--field_size
=
23
--feature_size
=
354332
--l2_reg
=
0.005
--log_steps
=
100
--num_threads
=
36
--model_dir
=
${
DATA_PATH
}
/model_ckpt/DeepCvrMTL/
--data_dir
=
${
DATA_PATH
}
/nearby
--task_type
=
infer
>
${
DATA_PATH
}
/infer.log
echo
"sort and 2sql"
${
PYTHON_PATH
}
${
MODEL_PATH
}
/Model_pipline/sort_and_2sql.py
...
...
eda/feededa/src/main/scala/com/gmei/EsmmData.scala
View file @
8998037a
...
...
@@ -69,13 +69,24 @@ object EsmmData {
if
(
max_stat_date_str
!=
param
.
date
){
val
stat_date
=
param
.
date
println
(
stat_date
)
// val imp_data = sc.sql(
// s"""
// |select distinct stat_date,device_id,city_id as ucity_id,
// | cid_id,diary_service_id
// |from data_feed_exposure
// |where cid_type = 'diary'
// |and stat_date ='${stat_date}'
// """.stripMargin
// )
val
imp_data
=
sc
.
sql
(
s
"""
|select
distinct stat_date,device_id,city_id as ucity_id,
|
cid_id,diary_service_id
|select
* from
|
(select stat_date,device_id,city_id as ucity_id,
cid_id,diary_service_id
|from data_feed_exposure
|where cid_type = 'diary'
|and stat_date ='${stat_date}'
|group by stat_date,device_id,city_id,cid_id,diary_service_id having count(*) > 1) a
"""
.
stripMargin
)
// imp_data.show()
...
...
@@ -262,7 +273,7 @@ object EsmmPredData {
import
sc.implicits._
val
yesteday_have_seq
=
GmeiConfig
.
getMinusNDate
(
1
)
val
yesteday_have_seq
=
GmeiConfig
.
getMinusNDate
(
7
)
//nearby_data
val
raw_data
=
sc
.
sql
(
...
...
@@ -273,7 +284,7 @@ object EsmmPredData {
|select device_id,city_id,native_queue as merge_queue from ffm_diary_queue
|union
|select device_id,city_id,search_queue as merge_queue from search_queue) as tmp1
|where tmp1.device_id in (select distinct device_id from data_feed_click where stat_date
=
'${yesteday_have_seq}')
|where tmp1.device_id in (select distinct device_id from data_feed_click where stat_date
>
'${yesteday_have_seq}')
"""
.
stripMargin
)
raw_data
.
show
()
...
...
@@ -303,7 +314,7 @@ object EsmmPredData {
s
"""
|select distinct a.device_id,a.city_id,b.native_queue from data_feed_click a
|left join biz_feed_diary_queue b on a.city_id = b.city_id
|where a.stat_date
=
'${yesteday_have_seq}' and b.native_queue != ""
|where a.stat_date
>
'${yesteday_have_seq}' and b.native_queue != ""
"""
.
stripMargin
)
native_data
.
createOrReplaceTempView
(
"native_data"
)
...
...
@@ -334,7 +345,7 @@ object EsmmPredData {
//join feat
val
yesteday
=
yesteday_have_seq
.
replace
(
"-"
,
""
)
val
yesteday
=
GmeiConfig
.
getMinusNDate
(
1
)
.
replace
(
"-"
,
""
)
val
sid_data
=
sc
.
sql
(
s
"""
|select distinct
...
...
tensnsorflow/ffm.py
View file @
8998037a
...
...
@@ -138,7 +138,7 @@ class multiFFMFormatPandas:
def
get_data
():
db
=
pymysql
.
connect
(
host
=
'10.66.157.22'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select max(stat_date) from esmm_train_
data
"
sql
=
"select max(stat_date) from esmm_train_
test
"
validate_date
=
con_sql
(
db
,
sql
)[
0
]
.
values
.
tolist
()[
0
]
print
(
"validate_date:"
+
validate_date
)
temp
=
datetime
.
datetime
.
strptime
(
validate_date
,
"
%
Y-
%
m-
%
d"
)
...
...
@@ -149,7 +149,7 @@ def get_data():
"u.device_type,u.manufacturer,u.channel,"
\
"home.jingxuan,home.zhibo,home.nose,home.eyes,home.weizheng,home.teeth,home.lunkuo,"
\
"home.meifu,home.xizhi,home.zhifang,home.longxiong,home.simi,home.maofa,home.gongli,home.korea "
\
"from esmm_train_
data
e left join user_feature u on e.device_id = u.device_id "
\
"from esmm_train_
test
e left join user_feature u on e.device_id = u.device_id "
\
"left join home_tab_click home on e.device_id = home.device_id "
\
"where e.stat_date >= '{}'"
.
format
(
start
)
df
=
con_sql
(
db
,
sql
)
...
...
@@ -174,7 +174,7 @@ def get_data():
def
transform
(
a
,
validate_date
):
model
=
multiFFMFormatPandas
()
df
=
model
.
fit_transform
(
a
,
y
=
"y"
,
n
=
160000
,
processes
=
2
6
)
df
=
model
.
fit_transform
(
a
,
y
=
"y"
,
n
=
160000
,
processes
=
2
2
)
df
=
pd
.
DataFrame
(
df
)
df
[
"stat_date"
]
=
df
[
0
]
.
apply
(
lambda
x
:
x
.
split
(
","
)[
0
])
df
[
"device_id"
]
=
df
[
0
]
.
apply
(
lambda
x
:
x
.
split
(
","
)[
1
])
...
...
@@ -194,8 +194,8 @@ def transform(a,validate_date):
test
=
test
.
drop
(
"stat_date"
,
axis
=
1
)
# print("train shape")
# print(train.shape)
# train.to_csv(path + "train
.csv", sep="\t", index=False)
# test.to_csv(path + "test
.csv", sep="\t", index=False)
train
.
to_csv
(
path
+
"tr
.csv"
,
sep
=
"
\t
"
,
index
=
False
)
test
.
to_csv
(
path
+
"va
.csv"
,
sep
=
"
\t
"
,
index
=
False
)
return
model
...
...
@@ -245,20 +245,20 @@ def get_predict_set(ucity_id, cid,model):
native_pre
=
df
[
df
[
"label"
]
==
"0"
]
native_pre
=
native_pre
.
drop
(
"label"
,
axis
=
1
)
native_pre
.
to_csv
(
path
+
"native
_pre
.csv"
,
sep
=
"
\t
"
,
index
=
False
)
native_pre
.
to_csv
(
path
+
"native.csv"
,
sep
=
"
\t
"
,
index
=
False
)
# print("native_pre shape")
# print(native_pre.shape)
nearby_pre
=
df
[
df
[
"label"
]
==
"1"
]
nearby_pre
=
nearby_pre
.
drop
(
"label"
,
axis
=
1
)
nearby_pre
.
to_csv
(
path
+
"nearby
_pre
.csv"
,
sep
=
"
\t
"
,
index
=
False
)
nearby_pre
.
to_csv
(
path
+
"nearby.csv"
,
sep
=
"
\t
"
,
index
=
False
)
# print("nearby_pre shape")
# print(nearby_pre.shape)
if
__name__
==
"__main__"
:
path
=
"/home/g
muser/ffm
/"
path
=
"/home/g
aoyazhe/esmm/data
/"
a
=
time
.
time
()
df
,
validate_date
,
ucity_id
,
cid
=
get_data
()
model
=
transform
(
df
,
validate_date
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment