ML / ffm-baseline · Commits

Commit de8f872a, authored Dec 27, 2018 by 王志伟

Merge branch 'master' of http://git.wanmeizhensuo.com/ML/ffm-baseline

Parents: f27d8217, 971a0d0e
Showing 6 changed files with 186 additions and 35 deletions:

  eda/esmm/Feature_pipline/data2ffm.py   +5   -5
  eda/esmm/Model_pipline/submit.sh       +5   -0
  tensnsorflow/applist.py                +10  -2
  tensnsorflow/exp_channel.py            +72  -0
  tensnsorflow/ffm.py                    +20  -17
  tensnsorflow/test.py                   +74  -11
eda/esmm/Feature_pipline/data2ffm.py

@@ -114,10 +114,10 @@ class multiFFMFormatPandas:
         x = 0
         while True:
             if x + step < data.__len__():
-                data_list.append(data.loc[x:x + step])
-                x = x + step + 1
+                data_list.append(data.iloc[x:x + step])
+                x = x + step
             else:
-                data_list.append(data.loc[x:data.__len__()])
+                data_list.append(data.iloc[x:data.__len__()])
                 break
         return data_list
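The loc → iloc change above is behavioral, not cosmetic: pandas .loc slices by index label and includes the stop bound, while .iloc slices by position and excludes it, which is also why the accompanying step increment drops its "+ 1". A minimal standalone sketch of the difference (not from this repo):

import pandas as pd

df = pd.DataFrame({"v": range(10)})
step = 3

# .loc is label-based and end-inclusive: labels 0..3 -> 4 rows
print(len(df.loc[0:step]))   # 4
# .iloc is position-based and end-exclusive: positions 0..2 -> 3 rows
print(len(df.iloc[0:step]))  # 3

# With .iloc, advancing x by exactly `step` tiles the frame with
# no overlap and no gaps, so the old "+ 1" bookkeeping goes away.
chunks = [df.iloc[x:x + step] for x in range(0, len(df), step)]
print(sum(len(c) for c in chunks))  # 10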
@@ -147,7 +147,7 @@ def get_data():
     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id " \
-          "from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
+          "from esmm_train_data e left join user_feature_clean u on e.device_id = u.device_id " \
           "left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id " \
           "where e.stat_date >= '{}'".format(start)
     df = con_sql(db, sql)
@@ -208,7 +208,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id,e.cid_id " \
-          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
+          "from esmm_pre_data e left join user_feature_clean u on e.device_id = u.device_id " \
           "left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id"
     df = con_sql(db, sql)
     df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
eda/esmm/Model_pipline/submit.sh

@@ -19,6 +19,11 @@ rm -r ${DATA_PATH}/model_ckpt/DeepCvrMTL/2018*
 echo "data2ffm"
 ${PYTHON_PATH} ${MODEL_PATH}/Feature_pipline/data2ffm.py > ${DATA_PATH}/infer.log
+all_sample=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$2$3$4}' | sort | uniq | wc -l`))
+uniq_feat=$((`cat ${DATA_PATH}/tr.csv | awk -F '\t' '{print$5}' | awk -F ',' '{print$4}' | sort | uniq -u | wc -l`))
+repe_feat=$((all_sample-uniq_feat))
+echo "Bayes Error Rate": $((repe_feat*100/all_sample))%
 echo "split data"
 split -l $((`wc -l < ${DATA_PATH}/tr.csv`/15)) ${DATA_PATH}/tr.csv -d -a 4 ${DATA_PATH}/tr/tr_ --additional-suffix=.csv
 split -l $((`wc -l < ${DATA_PATH}/va.csv`/5)) ${DATA_PATH}/va.csv -d -a 4 ${DATA_PATH}/va/va_ --additional-suffix=.csv
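The added lines estimate label noise in the training data: all_sample counts the distinct concatenations of comma-fields 2–4 inside the 5th tab-column of tr.csv, uniq_feat counts the field-4 values that occur exactly once (uniq -u), and the difference is echoed as a rough "Bayes Error Rate" percentage. A hedged pandas rendering of the same counting, assuming a local tr.csv with that tab/comma layout:

import pandas as pd

# Assumed local copy of tr.csv: tab-separated, 5th column holding
# comma-separated feature fields (mirrors `awk -F '\t' '{print$5}'`)
col5 = pd.read_csv("tr.csv", sep="\t", header=None, usecols=[4])[4]
fields = col5.str.split(",", expand=True)

# Distinct concatenations of comma-fields 2,3,4 (awk '{print$2$3$4}' | sort | uniq)
all_sample = (fields[1] + fields[2] + fields[3]).nunique()

# Field-4 values that occur exactly once (sort | uniq -u)
uniq_feat = (fields[3].value_counts() == 1).sum()

repe_feat = all_sample - uniq_feat
print("Bayes Error Rate: {}%".format(repe_feat * 100 // all_sample))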
tensnsorflow/applist.py

@@ -68,12 +68,20 @@ def sort_app():
                 "job": {"智联招聘", "前程无忧", "斗米", "拉勾", "Boss直聘", "猎聘同道", "智联招聘"}
                 }
     df["app_list"] = df["app_list"].apply(json_format)
     n = df.shape[0]
     df["sum"] = 0
     for i in category.keys():
         df[i] = df["app_list"].apply(lambda x: 1 if len(x & category[i]) > 0 else 0)
         print(i)
         print(df[i].value_counts())
         df["sum"] = df["sum"] + df[i]
         # print(i)
         # print(df.loc[df[i]==1].shape[0]/n)
     df = df.drop("app_list", axis=1)
     # for i in df["sum"].unique():
     #     print(i)
     #     a = df.loc[df["sum"] == i].shape[0]/n
     #     print(a)
     yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
     print(df.shape)
     n = 200000
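The category flags rely on app_list already being a Python set, so `x & category[i]` is a set intersection that is non-empty whenever the device has at least one app in that category. A small sketch of the pattern with a hypothetical json_format (the real one is defined elsewhere in applist.py):

import json
import pandas as pd

def json_format(raw):
    # Hypothetical parser: the stored value is assumed to be a JSON array
    # of {"appName": ...} objects; anything unparsable becomes an empty set
    try:
        return {item["appName"] for item in json.loads(raw)}
    except (TypeError, ValueError, KeyError):
        return set()

category = {"job": {"智联招聘", "前程无忧", "Boss直聘"}}
df = pd.DataFrame({"app_list": ['[{"appName": "Boss直聘"}]', 'not json']})
df["app_list"] = df["app_list"].apply(json_format)

for name, apps in category.items():
    # Flag is 1 when the device's app set intersects the category set
    df[name] = df["app_list"].apply(lambda x: 1 if len(x & apps) > 0 else 0)

print(df["job"].tolist())  # [1, 0]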
tensnsorflow/exp_channel.py (new file, 0 → 100644)

import pandas as pd
import pymysql


def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df


def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select manufacturer,channel from user_feature"
    df = con_sql(db, sql)
    n = df.shape[0]
    manufacturer = df[0].unique()
    manufacturer_map = {}
    print("manufacturer unique")
    print(len(manufacturer))
    for i in manufacturer:
        manufacturer_map[i] = df.loc[df[0] == i].shape[0] / n
    print(sorted(manufacturer_map.items(), key=lambda x: x[1]))
    channel = df[1].unique()
    channel_map = {}
    print("channel unique")
    print(len(channel))
    for i in channel:
        channel_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(channel_map.items(), key=lambda x: x[1]))


def clean():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select device_id,device_type,manufacturer,channel,city_id from user_feature"
    df = con_sql(db, sql)
    df = df.rename(columns={0: "device_id", 1: "device_type", 2: "manufacturer", 3: "channel", 4: "city_id"})
    n = df.shape[0]
    manufacturer = df["manufacturer"].unique()
    for i in manufacturer:
        # Bucket manufacturers below a 0.05% share into "other"
        if df.loc[df["manufacturer"] == i].shape[0] / n < 0.0005:
            df.loc[df["manufacturer"] == i, ["manufacturer"]] = "other"
    channel = df["channel"].unique()
    for i in channel:
        # Bucket channels below a 0.01% share into "other"
        if df.loc[df["channel"] == i].shape[0] / n < 0.0001:
            df.loc[df["channel"] == i, ["channel"]] = "other"
    from sqlalchemy import create_engine
    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    n = 200000
    # Write the cleaned frame to user_feature_clean in chunks of n rows
    for i in range(0, df.shape[0], n):
        print(i)
        if i == 0:
            temp = df.loc[0:n]
        elif i + n > df.shape[0]:
            temp = df.loc[i + 1:]
        else:
            temp = df.loc[i + 1:i + n]
        pd.io.sql.to_sql(temp, "user_feature_clean", yconnect, schema='jerry_test', if_exists='append', index=False)
    print("insert done")


if __name__ == "__main__":
    clean()
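clean() relabels rare values with one boolean scan per unique manufacturer/channel, which gets expensive when a column has many distinct values. A hedged vectorized alternative using value_counts, same thresholds, on a standalone frame:

import pandas as pd

def bucket_rare(df, col, min_share):
    # Share of each value in the column, computed in one pass
    share = df[col].value_counts(normalize=True)
    rare = share[share < min_share].index
    # Relabel every rare value at once instead of filtering per value
    df.loc[df[col].isin(rare), col] = "other"
    return df

df = pd.DataFrame({"manufacturer": ["a"] * 9998 + ["b", "c"]})
df = bucket_rare(df, "manufacturer", 0.0005)
print(df["manufacturer"].value_counts())  # "b" and "c" become "other"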
tensnsorflow/ffm.py

@@ -99,6 +99,7 @@ class multiFFMFormatPandas:
         result_map = {}
         for i in data_list:
             result_map.update(i.get())
         pool.close()
         pool.join()
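The i.get() calls indicate data_list holds AsyncResult handles from a multiprocessing pool: each worker returns a partial dict and the parent merges them before close()/join(). A minimal sketch of that pattern, assuming workers return {index: value} dicts:

from multiprocessing import Pool

def transform_chunk(chunk):
    # Stand-in worker: return a partial {index: value} dict
    return {i: v * 2 for i, v in chunk}

if __name__ == "__main__":
    chunks = [[(0, 1), (1, 2)], [(2, 3), (3, 4)]]
    pool = Pool(processes=2)
    data_list = [pool.apply_async(transform_chunk, (c,)) for c in chunks]
    result_map = {}
    for i in data_list:
        result_map.update(i.get())  # .get() blocks until that worker finishes
    pool.close()
    pool.join()
    print(result_map)  # {0: 2, 1: 4, 2: 6, 3: 8}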
@@ -114,10 +115,10 @@ class multiFFMFormatPandas:
         x = 0
         while True:
             if x + step < data.__len__():
-                data_list.append(data.loc[x:x + step])
-                x = x + step + 1
+                data_list.append(data.iloc[x:x + step])
+                x = x + step
             else:
-                data_list.append(data.loc[x:data.__len__()])
+                data_list.append(data.iloc[x:data.__len__()])
                 break
         return data_list
@@ -147,7 +148,7 @@ def get_data():
     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.stat_date,e.ucity_id,e.clevel1_id,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id " \
-          "from esmm_train_data e left join user_feature u on e.device_id = u.device_id " \
+          "from esmm_train_data e left join user_feature_clean u on e.device_id = u.device_id " \
           "left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id " \
           "where e.stat_date >= '{}'".format(start)
     df = con_sql(db, sql)
@@ -174,6 +175,7 @@ def get_data():
     manufacturer = list(set(df["manufacturer"].values.tolist()))
     channel = list(set(df["channel"].values.tolist()))
     return df, validate_date, ucity_id, ccity_name, manufacturer, channel
@@ -197,8 +199,8 @@ def transform(a,validate_date):
     train = train.drop("stat_date", axis=1)
     test = df[df["stat_date"] == validate_date]
     test = test.drop("stat_date", axis=1)
-    # print("train shape")
-    # print(train.shape)
+    print("train shape")
+    print(train.shape)
     train.to_csv(path + "tr.csv", sep="\t", index=False)
     test.to_csv(path + "va.csv", sep="\t", index=False)
@@ -209,7 +211,7 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
     db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select e.y,e.z,e.label,e.ucity_id,e.clevel1_id,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,cid_time.time,e.device_id,e.cid_id " \
-          "from esmm_pre_data e left join user_feature u on e.device_id = u.device_id " \
+          "from esmm_pre_data e left join user_feature_clean u on e.device_id = u.device_id " \
           "left join cid_type_top c on e.device_id = c.device_id left join cid_time on e.cid_id = cid_time.cid_id"
     df = con_sql(db, sql)
     df = df.rename(columns={0: "y", 1: "z", 2: "label", 3: "ucity_id", 4: "clevel1_id", 5: "ccity_name",
@@ -218,23 +220,23 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
     print("before filter:")
     print(df.shape)
     print(df.loc[df["device_id"] == "358035085192742"].shape)
     df = df[df["ucity_id"].isin(ucity_id)]
     print("after ucity filter:")
     print(df.shape)
     print(df.loc[df["device_id"] == "358035085192742"].shape)
     df = df[df["ccity_name"].isin(ccity_name)]
     print("after ccity_name filter:")
     print(df.shape)
     print(df.loc[df["device_id"] == "358035085192742"].shape)
     df = df[df["manufacturer"].isin(manufacturer)]
     print("after manufacturer filter:")
     print(df.shape)
     print(df.loc[df["device_id"] == "358035085192742"].shape)
     df = df[df["channel"].isin(channel)]
     print("after channel filter:")
     print(df.shape)
     print(df.loc[df["device_id"] == "358035085192742"].shape)
     df["cid_id"] = df["cid_id"].astype("str")
     df["clevel1_id"] = df["clevel1_id"].astype("str")
     df["top"] = df["top"].astype("str")
@@ -245,9 +247,10 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
                        [df["device_id"].values.tolist(), df["ucity_id"].values.tolist(),
                         df["cid_id"].values.tolist(), df["y"].values.tolist(),
                         df["z"].values.tolist()], sep=",")
     df = df.drop(["z", "label", "device_id", "cid_id"], axis=1).fillna(0.0)
     print(df.head(2))
-    df = model.transform(df, n=160000, processes=22)
-    df = pd.DataFrame(df)
+    print("before transform")
+    print(df.shape)
+    temp_series = model.transform(df, n=160000, processes=22)
+    df = pd.DataFrame(temp_series)
+    print("after transform")
+    print(df.shape)
     df["label"] = df[0].apply(lambda x: x.split(",")[0])
@@ -286,8 +289,8 @@ def get_predict_set(ucity_id,model,ccity_name,manufacturer,channel):
 if __name__ == "__main__":
     path = "/home/gmuser/ffm/"
     a = time.time()
-    df, validate_date, ucity_id, ccity_name, manufacturer, channel = get_data()
-    model = transform(df, validate_date)
+    temp, validate_date, ucity_id, ccity_name, manufacturer, channel = get_data()
+    model = transform(temp, validate_date)
     get_predict_set(ucity_id, model, ccity_name, manufacturer, channel)
     b = time.time()
     print("cost (minutes)")
tensnsorflow/test.py

The previous contents, a Spark connectivity smoke test, are removed:

-import time
-from pyspark.context import SparkContext
-from pyspark.conf import SparkConf
-
-conf = SparkConf().setMaster("spark://10.30.181.88:7077").setAppName("My app")
-sc = SparkContext(conf=conf)
-sc.setLogLevel("WARN")
-for i in range(1, 100):
-    print(i)
-    time.sleep(5)
 \ No newline at end of file

and replaced with an ad-hoc analysis of the diary queue and click logs for one device:

import pandas as pd
import pymysql


def con_sql(db, sql):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        df = pd.DataFrame(list(result))
    except Exception as e:
        print("exception occurred", e)
        df = pd.DataFrame()
    finally:
        db.close()
    return df


def exp():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    sql = "select native_queue from esmm_device_diary_queue where device_id = '358035085192742'"
    cursor = db.cursor()
    cursor.execute(sql)
    result = cursor.fetchone()[0]
    native = tuple(result.split(","))
    print("total")
    print(len(native))
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select diary_id,level1_ids,level2_ids,level3_ids from diary_feat where diary_id in {}".format(native)
    df = con_sql(db, sql)
    n = df.shape[0]
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1]))
    two = df[2].unique()
    two_map = {}
    print("separator")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1]))


def click():
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
    sql = "select d.cid_id,f.level1_ids,f.level2_ids from data_feed_click d left join diary_feat f " \
          "on d.cid_id = f.diary_id where d.device_id = '358035085192742' " \
          "and (d.cid_type = 'diary' or d.cid_type = 'diary_video') and d.stat_date > '2018-12-20'"
    df = con_sql(db, sql)
    n = df.shape[0]
    print(n)
    one = df[1].unique()
    one_map = {}
    for i in one:
        one_map[i] = df.loc[df[1] == i].shape[0] / n
    print(sorted(one_map.items(), key=lambda x: x[1], reverse=True))
    two = df[2].unique()
    two_map = {}
    print("separator")
    for i in two:
        two_map[i] = df.loc[df[2] == i].shape[0] / n
    print(sorted(two_map.items(), key=lambda x: x[1], reverse=True))


if __name__ == "__main__":
    click()
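con_sql builds a frame from raw cursor tuples, which is why columns end up as integer positions (df[1], df[2]) and need rename() elsewhere in this commit. A hedged sketch of the same query path via pandas.read_sql, which keeps the SQL column names instead:

import pandas as pd
import pymysql

# Same connection parameters used throughout this commit
db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                     passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
try:
    # read_sql names columns after the select list, so positional lookups
    # like df[1] / df[2] and later rename() calls become unnecessary
    df = pd.read_sql("select diary_id,level1_ids,level2_ids from diary_feat limit 10", db)
finally:
    db.close()

print(df["level1_ids"].value_counts(normalize=True))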