郭羽 / serviceRec / Commits / abc9ec79

Commit abc9ec79, authored 3 years ago by 宋柯
Parent: 73a98c5e

Commit message: Model debugging (模型调试)
Showing 1 changed file with 15 additions and 5 deletions.

spark/featureEngSk.py (+15, -5)
@@ -45,10 +45,10 @@ DATA_PATH_TRAIN = "/data/files/service_feature_{}_train.csv".format(VERSION)
 def getRedisConn():
-    pool = redis.ConnectionPool(host="172.16.50.145",password="XfkMCCdWDIU%ls$h",port=6379,db=0)
-    conn = redis.Redis(connection_pool=pool)
+    # pool = redis.ConnectionPool(host="172.16.50.145",password="XfkMCCdWDIU%ls$h",port=6379,db=0)
+    # conn = redis.Redis(connection_pool=pool)
     # conn = redis.Redis(host="172.16.50.145", port=6379, password="XfkMCCdWDIU%ls$h",db=0)
-    # conn = redis.Redis(host="172.18.51.10", port=6379,db=0) #test
+    conn = redis.Redis(host="172.18.51.10", port=6379,db=0) #test
     return conn

 def parseTags(tags, i):
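The hunk above retires the password-protected connection pool at 172.16.50.145 in favour of a direct connection to the test instance at 172.18.51.10. As a hedged sketch only (not part of this commit), the endpoint could be chosen through environment variables so that switching between the production and test Redis does not require editing getRedisConn(); the variable names below are illustrative assumptions.

    import os
    import redis

    def getRedisConnFromEnv():
        # Illustrative only: REDIS_HOST / REDIS_PORT / REDIS_PASSWORD are assumed
        # environment variables, not names used anywhere in this repository.
        host = os.environ.get("REDIS_HOST", "172.18.51.10")   # test host from this commit
        port = int(os.environ.get("REDIS_PORT", "6379"))
        password = os.environ.get("REDIS_PASSWORD")           # unset for the test instance
        conn = redis.Redis(host=host, port=port, password=password, db=0)
        conn.ping()  # fail fast if the endpoint is unreachable
        return conn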
@@ -433,6 +433,12 @@ def dataVocabToRedis(dataVocab):
     conn.set(FEATURE_VOCAB_KEY, dataVocab)
     conn.expire(FEATURE_VOCAB_KEY, 60*60*24*7)

+def saveVocab(key, vocab):
+    conn = getRedisConn()
+    conn.lpush(key, vocab)
+    conn.expire(FEATURE_VOCAB_KEY, 60*60*24)
+
 def featureColumnsToRedis(columns):
     conn = getRedisConn()
     conn.set(FEATURE_COLUMN_KEY, json.dumps(columns))
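Two details of the newly added saveVocab are worth noting: redis-py's lpush(name, *values) takes individual values, so passing the whole Python list as a single argument is typically rejected by the client's encoder rather than stored element by element, and the expiry is set on FEATURE_VOCAB_KEY rather than on the key that was just written. A minimal sketch of what is presumably intended (one Redis list element per vocabulary entry, TTL on the pushed key) could look like the following; this is an assumption about intent, not the committed code.

    def saveVocabSketch(key, vocab):
        # Assumed intent: store each vocabulary entry as its own Redis list
        # element and expire that same key. Not part of the commit shown above.
        conn = getRedisConn()
        if vocab:
            conn.delete(key)           # drop any stale vocabulary first
            conn.lpush(key, *vocab)    # unpack: one list element per entry
        conn.expire(key, 60 * 60 * 24)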
@@ -1016,10 +1022,14 @@ if __name__ == '__main__':
     write_time_start = time.time()
     for categoty_field in categoty_fields:
         output_file = "file:///home/gmuser/" + categoty_field + "_vocab"
-        train_samples.select(categoty_field).where(F.col(categoty_field) != '-1').where(F.col(categoty_field) != '').distinct().write.mode("overwrite").options(header="false").csv(output_file)
+        # train_samples.select(categoty_field).where(F.col(categoty_field) != '-1').where(F.col(categoty_field) != '').distinct().write.mode("overwrite").options(header="false").csv(output_file)
+        categoty_field_rows = train_samples.select(categoty_field).where(F.col(categoty_field) != '-1').where(F.col(categoty_field) != '').distinct().collect()
+        saveVocab("strategy:" + categoty_field + ":vocab", list(map(lambda row: row[categoty_field], categoty_field_rows)))

     for multi_categoty_field in multi_categoty_fields:
         output_file = "file:///home/gmuser/" + multi_categoty_field + "_vocab"
-        train_samples.selectExpr("explode(split({multi_categoty_field},','))".format(multi_categoty_field=multi_categoty_field)).where(F.col(multi_categoty_field) != '-1').distinct().write.mode("overwrite").options(header="false").csv(output_file)
+        # train_samples.selectExpr("explode(split({multi_categoty_field},','))".format(multi_categoty_field=multi_categoty_field)).where(F.col(multi_categoty_field) != '-1').distinct().write.mode("overwrite").options(header="false").csv(output_file)
+        multi_categoty_field_rows = train_samples.selectExpr("explode(split({multi_categoty_field},',')) as {multi_categoty_field}".format(multi_categoty_field=multi_categoty_field)).where(F.col(multi_categoty_field) != '-1').distinct().collect()
+        saveVocab("strategy:" + multi_categoty_field + ":vocab", list(map(lambda row: row[multi_categoty_field], multi_categoty_field_rows)))

     output_file = "file:///home/gmuser/train_samples"
     train_samples.write.mode("overwrite").options(header="false", sep='|').csv(output_file)
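This hunk replaces writing the distinct vocabulary values to per-field CSV files with collecting them on the driver and pushing them to Redis under "strategy:<field>:vocab" keys via saveVocab. For completeness, a hypothetical consumer of those keys might read a vocabulary back with lrange; the helper and the field name below are illustrative and do not appear in the repository.

    def loadVocab(field):
        # Hypothetical reader for the "strategy:<field>:vocab" keys written above.
        conn = getRedisConn()
        raw = conn.lrange("strategy:" + field + ":vocab", 0, -1)  # whole list
        return [v.decode("utf-8") for v in raw]                   # bytes -> str

    # Example (field name is illustrative): vocab = loadVocab("category_id")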