郭羽 / serviceRec · Commits

Commit 5e1859bf
Authored 3 years ago by 郭羽
Parent commit: 9dcf2b68

Commit message: service model 优化 (service model optimization)
Showing 4 changed files, with 52 additions and 52 deletions:

  spark/featureEng.py            +0   -0
  spark/featureEng_copy.py       +0   -0
  train/train_service.py         +13  -39
  train/train_service_copy.py    +39  -13
spark/featureEng.py (view file @ 5e1859bf)
This diff is collapsed.

spark/featureEng2.py → spark/featureEng_copy.py (renamed; view file @ 5e1859bf)
This diff is collapsed.
train/train_service.py (view file @ 5e1859bf)

@@ -10,16 +10,6 @@ from datetime import date, timedelta
 sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
 import utils.configUtils as configUtils
-ITEM_NUMBER_COLUMNS = ["item_" + c for c in ["smart_rank2"]]
-embedding_columns = ["itemid", "userid"] + ["item_" + c for c in ["doctor_id", "hospital_id", "merchant_id"]]
-multi_columns = ["tags_v3", "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions"]
-one_hot_columns = ["user_os"] + ["item_" + c for c in ["service_type", "doctor_type", "doctor_famous", "hospital_city_tag_id", "hospital_type", "hospital_is_high_quality"]]
-# history_columns = ["userRatedHistory"]
-# 数据加载 (data loading)
-# data_path_train = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
-# data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
 VERSION = configUtils.SERVICE_VERSION
 trainDay = time.strftime("%Y%m%d%H", time.localtime())
 data_path_train = "/data/files/service_feature_{}_train.csv".format(VERSION)

@@ -84,43 +74,25 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128):
 def getTrainColumns(train_columns, data_vocab):
     emb_columns = []
     number_columns = []
-    oneHot_columns = []
-    dataColumns = []
     inputs = {}
     # 离散特征 (discrete features)
     for feature in train_columns:
         if data_vocab.get(feature):
-            if feature.count("__") > 0:
-                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
-                col = tf.feature_column.embedding_column(cat_col, 10)
-                emb_columns.append(col)
-                dataColumns.append(feature)
-                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-            elif feature in one_hot_columns or feature.count("Bucket") > 0:
-                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
-                # col = tf.feature_column.indicator_column(cat_col)
-                col = tf.feature_column.embedding_column(cat_col, 3)
-                oneHot_columns.append(col)
-                dataColumns.append(feature)
-                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-            else:
-                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
-                col = tf.feature_column.embedding_column(cat_col, 10)
-                emb_columns.append(col)
-                dataColumns.append(feature)
-                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+            cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
+            col = tf.feature_column.embedding_column(cat_col, 5)
+            emb_columns.append(col)
+            inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+        elif feature.endswith("_number"):
+            inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
         elif feature in ITEM_NUMBER_COLUMNS:
             col = tf.feature_column.numeric_column(feature)
             number_columns.append(col)
-            dataColumns.append(feature)
             inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')
-    return emb_columns, number_columns, oneHot_columns, dataColumns, inputs
+    return emb_columns, number_columns, inputs

-def train(emb_columns, number_columns, oneHot_columns, inputs, train_dataset):
-    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns + oneHot_columns)(inputs)
+def train(emb_columns, number_columns, inputs, train_dataset):
+    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns)(inputs)
     deep = tf.keras.layers.Dense(64, activation='relu')(wide)
     deep = tf.keras.layers.Dropout(0.2)(deep)
     concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1)

@@ -193,6 +165,7 @@ if __name__ == '__main__':
     timestmp1 = int(round(time.time()))
     df_train = loadData(data_path_train)
     print(df_train.dtypes)
+    print("训练数据列:", df_train.columns)
     df_test = df_train.loc[df_train['timestamp'] >= splitTimestamp]
     df_train = df_train.loc[df_train['timestamp'] < splitTimestamp]

@@ -204,8 +177,9 @@ if __name__ == '__main__':
     columns = df_train.columns.tolist()
     print("原始数据列:")
     print(columns)
-    emb_columns, number_columns, oneHot_columns, datasColumns, inputs = getTrainColumns(columns, data_vocab)
+    emb_columns, number_columns, inputs = getTrainColumns(columns, data_vocab)
     print("训练列:")
+    datasColumns = list(inputs.keys())
     print(datasColumns)
     df_train = df_train[datasColumns + ["label"]]

@@ -226,7 +200,7 @@ if __name__ == '__main__':
     print("train start...")
     timestmp3 = int(round(time.time()))
-    model = train(emb_columns, number_columns, oneHot_columns, inputs, train_data)
+    model = train(emb_columns, number_columns, inputs, train_data)
     timestmp4 = int(round(time.time()))
     print("train end...耗时h:{}".format((timestmp4 - timestmp3) / 60 / 60))
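Taken together, the train_service.py changes collapse the old three-way feature branching into a single vocabulary-to-embedding path (embedding dimension 5 instead of 10) and derive the training columns directly from inputs.keys(). Below is a minimal, self-contained sketch of the resulting wide-and-deep wiring, assuming TF 2.x with tf.feature_column still available; the feature names and vocabularies are invented, and the sigmoid head plus compile step are assumptions, since the diff truncates after concat_layer.

import tensorflow as tf

# Illustrative stand-in for the script's data_vocab (feature -> vocabulary list).
data_vocab = {"userid": ["u1", "u2"], "itemid": ["i1", "i2", "i3"]}

emb_columns, inputs = [], {}
for feature, vocab in data_vocab.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    # The commit shrinks the embedding size for these columns from 10 to 5.
    emb_columns.append(tf.feature_column.embedding_column(cat_col, 5))
    inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')

# Wide part: all embeddings densified; deep part stacked on top, then concatenated.
wide = tf.keras.layers.DenseFeatures(emb_columns)(inputs)
deep = tf.keras.layers.Dense(64, activation='relu')(wide)
deep = tf.keras.layers.Dropout(0.2)(deep)
concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1)
# Output head assumed: the diff stops at concat_layer.
output = tf.keras.layers.Dense(1, activation='sigmoid')(concat_layer)
model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')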
train/train_service2.py → train/train_service_copy.py (renamed; view file @ 5e1859bf)

@@ -10,6 +10,16 @@ from datetime import date, timedelta
 sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
 import utils.configUtils as configUtils
+ITEM_NUMBER_COLUMNS = ["item_" + c for c in ["smart_rank2"]]
+embedding_columns = ["itemid", "userid"] + ["item_" + c for c in ["doctor_id", "hospital_id", "merchant_id"]]
+multi_columns = ["tags_v3", "first_demands", "second_demands", "first_solutions", "second_solutions", "first_positions", "second_positions"]
+one_hot_columns = ["user_os"] + ["item_" + c for c in ["service_type", "doctor_type", "doctor_famous", "hospital_city_tag_id", "hospital_type", "hospital_is_high_quality"]]
+# history_columns = ["userRatedHistory"]
+# 数据加载 (data loading)
+# data_path_train = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
+# data_path_test = "/Users/zhigangzheng/Desktop/work/guoyu/service_sort/train/part-00000-a61205d1-ad4e-4fa7-895d-ad8db41189e6-c000.csv"
 VERSION = configUtils.SERVICE_VERSION
 trainDay = time.strftime("%Y%m%d%H", time.localtime())
 data_path_train = "/data/files/service_feature_{}_train.csv".format(VERSION)

@@ -74,25 +84,43 @@ def getDataSet(df,shuffleSize = 10000,batchSize=128):
 def getTrainColumns(train_columns, data_vocab):
     emb_columns = []
     number_columns = []
+    oneHot_columns = []
+    dataColumns = []
     inputs = {}
     # 离散特征 (discrete features)
     for feature in train_columns:
         if data_vocab.get(feature):
-            cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
-            col = tf.feature_column.embedding_column(cat_col, 5)
-            emb_columns.append(col)
-            inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
-        elif feature.endswith("_number"):
-            inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+            if feature.count("__") > 0:
+                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
+                col = tf.feature_column.embedding_column(cat_col, 10)
+                emb_columns.append(col)
+                dataColumns.append(feature)
+                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+            elif feature in one_hot_columns or feature.count("Bucket") > 0:
+                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
+                # col = tf.feature_column.indicator_column(cat_col)
+                col = tf.feature_column.embedding_column(cat_col, 3)
+                oneHot_columns.append(col)
+                dataColumns.append(feature)
+                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
+            else:
+                cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list=data_vocab[feature])
+                col = tf.feature_column.embedding_column(cat_col, 10)
+                emb_columns.append(col)
+                dataColumns.append(feature)
+                inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')
         elif feature in ITEM_NUMBER_COLUMNS:
             col = tf.feature_column.numeric_column(feature)
             number_columns.append(col)
+            dataColumns.append(feature)
             inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='float32')
-    return emb_columns, number_columns, inputs
+    return emb_columns, number_columns, oneHot_columns, dataColumns, inputs

-def train(emb_columns, number_columns, inputs, train_dataset):
-    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns)(inputs)
+def train(emb_columns, number_columns, oneHot_columns, inputs, train_dataset):
+    wide = tf.keras.layers.DenseFeatures(emb_columns + number_columns + oneHot_columns)(inputs)
     deep = tf.keras.layers.Dense(64, activation='relu')(wide)
     deep = tf.keras.layers.Dropout(0.2)(deep)
     concat_layer = tf.keras.layers.concatenate([wide, deep], axis=1)

@@ -165,7 +193,6 @@ if __name__ == '__main__':
     timestmp1 = int(round(time.time()))
     df_train = loadData(data_path_train)
     print(df_train.dtypes)
-    print("训练数据列:", df_train.columns)
     df_test = df_train.loc[df_train['timestamp'] >= splitTimestamp]
     df_train = df_train.loc[df_train['timestamp'] < splitTimestamp]

@@ -177,9 +204,8 @@ if __name__ == '__main__':
     columns = df_train.columns.tolist()
     print("原始数据列:")
     print(columns)
-    emb_columns, number_columns, inputs = getTrainColumns(columns, data_vocab)
+    emb_columns, number_columns, oneHot_columns, datasColumns, inputs = getTrainColumns(columns, data_vocab)
     print("训练列:")
-    datasColumns = list(inputs.keys())
     print(datasColumns)
     df_train = df_train[datasColumns + ["label"]]

@@ -200,7 +226,7 @@ if __name__ == '__main__':
     print("train start...")
     timestmp3 = int(round(time.time()))
-    model = train(emb_columns, number_columns, inputs, train_data)
+    model = train(emb_columns, number_columns, oneHot_columns, inputs, train_data)
     timestmp4 = int(round(time.time()))
     print("train end...耗时h:{}".format((timestmp4 - timestmp3) / 60 / 60))
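The renamed copy file thus carries the older, more granular column builder that the main script just dropped. For reference, a condensed sketch of that branching under the same assumptions (TF 2.x; vocabularies and the "__" crossed-feature naming convention are illustrative, not taken from the repo): crossed features and the default case get dimension-10 embeddings, while one_hot_columns and bucketized "Bucket" features get a small dimension-3 embedding in place of the commented-out indicator column.

import tensorflow as tf

# Globals as defined at the top of the script (abridged).
one_hot_columns = ["user_os"]
data_vocab = {                              # invented vocabularies for illustration
    "user_os": ["ios", "android"],
    "first_demands__tags_v3": ["a__b"],     # assumed "__" crossed-feature name
    "itemid": ["i1", "i2"],
}

emb_columns, oneHot_cols, inputs = [], [], {}
for feature, vocab in data_vocab.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    if feature.count("__") > 0:
        # Crossed features keep the larger dim-10 embedding.
        emb_columns.append(tf.feature_column.embedding_column(cat_col, 10))
    elif feature in one_hot_columns or feature.count("Bucket") > 0:
        # Low-cardinality / bucketized features get a dim-3 embedding instead
        # of the indicator_column that is commented out in the source.
        oneHot_cols.append(tf.feature_column.embedding_column(cat_col, 3))
    else:
        # Everything else in the vocabulary also gets a dim-10 embedding.
        emb_columns.append(tf.feature_column.embedding_column(cat_col, 10))
    inputs[feature] = tf.keras.layers.Input(name=feature, shape=(), dtype='string')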