Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
6122a37d
Commit
6122a37d
authored
Apr 25, 2019
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
修改测试文件
parent
2f8b47e9
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
12 deletions
+23
-12
multi.py
tensnsorflow/multi.py
+23
-12
No files found.
tensnsorflow/multi.py
View file @
6122a37d
...
...
@@ -7,6 +7,28 @@ from pyspark.sql import SparkSession
import
datetime
import
pandas
as
pd
def app_list_func(x, l):
    """Encode a comma-separated token string as comma-separated mapped values.

    ``x`` is split on "," and each resulting token is looked up in the
    dict ``l``. Tokens that are not keys of ``l`` are encoded as 0. The
    encoded values are converted with ``str`` and joined back together with
    ",", keeping the original token order.
    """
    encoded = [str(l.get(token, 0)) for token in x.split(",")]
    return ",".join(encoded)
def multi_hot(df, column, n):
    """Replace a comma-separated multi-value column with integer-coded strings.

    Missing values in ``df[column]`` are first filled with "lost_na". Every
    distinct token found by splitting the column's values on "," is assigned
    a consecutive integer code starting at ``n``, and each cell is rewritten
    through ``app_list_func`` as the comma-joined codes of its tokens.
    ``df`` is modified in place.

    Tokens are numbered in sorted order so that the mapping is identical on
    every run; iterating a set of strings directly yields an order that can
    change between interpreter runs because of string hash randomization.

    Args:
        df: DataFrame holding ``column``; the column is overwritten.
        column: Name of the column whose cells are comma-separated tokens.
        n: First integer code to assign.

    Returns:
        A ``(number, app_list_map)`` tuple: the count of distinct tokens and
        the token -> code dict used for the encoding.
    """
    df[column] = df[column].fillna("lost_na")

    # Collect the vocabulary from the unique cell values only, so repeated
    # cells are split once.
    distinct_tokens = set()
    for cell in df[column].unique():
        distinct_tokens.update(cell.split(","))

    app_list_unique = sorted(distinct_tokens)
    number = len(app_list_unique)
    app_list_map = dict(zip(app_list_unique, range(n, number + n)))

    df[column] = df[column].apply(app_list_func, args=(app_list_map,))
    return number, app_list_map
def
feature_engineer
():
db
=
pymysql
.
connect
(
host
=
'172.16.40.158'
,
port
=
4000
,
user
=
'root'
,
passwd
=
'3SYz54LS9#^9sBvC'
,
db
=
'jerry_test'
)
sql
=
"select max(stat_date) from esmm_train_data"
...
...
@@ -54,7 +76,7 @@ def feature_engineer():
df
=
df
.
join
(
hospital
,
"diary_service_id"
,
"left_outer"
)
.
fillna
(
"na"
)
print
(
df
.
count
())
df
=
df
.
drop
(
[
"level2"
,
"diary_service_id"
]
)
df
=
df
.
drop
(
"level2"
)
.
drop
(
"diary_service_id"
)
df
=
df
.
drop_duplicates
([
"ucity_id"
,
"level2_ids"
,
"ccity_name"
,
"device_type"
,
"manufacturer"
,
"channel"
,
"top"
,
"time"
,
"stat_date"
,
"app_list"
,
"hospital_id"
,
"level3_ids"
])
print
(
df
.
count
())
...
...
@@ -62,18 +84,7 @@ def feature_engineer():
# df = df.rename(columns={0: "y", 1: "z", 2: "stat_date", 3: "ucity_id", 4: "clevel2_id", 5: "ccity_name",
# 6: "device_type", 7: "manufacturer", 8: "channel", 9: "top", 10: "device_id",
# 11: "time", 12: "app_list", 13: "service_id", 14: "level3_ids", 15: "level2"})
#
#
# df = df.drop_duplicates(["ucity_id", "clevel2_id", "ccity_name", "device_type", "manufacturer",
# "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids"])
#
# print("after")
# print(df.shape)
# app_list_number, app_list_map = multi_hot(df, "app_list", 2)
# level2_number, level2_map = multi_hot(df, "clevel2_id", 2 + app_list_number)
# level3_number, level3_map = multi_hot(df, "level3_ids", 2 + app_list_number + level2_number)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment