ML / ffm-baseline · Commits

Commit 4ba42d54
authored May 24, 2019 by 张彦钊

    change test file

Parent: 75951d44
Showing 1 changed file with 118 additions and 77 deletions.

tensnsorflow/feature_engineering.py  (+118, −77)
@@ -19,22 +19,6 @@ def app_list_func(x,l):
         return e

-def multi_hot(df, column, n):
-    a = time.time()
-    v = list(set(df.select(column).rdd.map(lambda x: x[0]).collect()))
-    b = time.time()
-    print(column)
-    print("cost time 分钟")
-    print((b - a) / 60)
-    app_list_value = [str(i).split(",") for i in v]
-    app_list_unique = []
-    for i in app_list_value:
-        app_list_unique.extend(i)
-    app_list_unique = list(set(app_list_unique))
-    number = len(app_list_unique)
-    app_list_map = dict(zip(app_list_unique, list(range(n, number + n))))
-    return number, app_list_map

 def get_list(db, sql, n):
     cursor = db.cursor()
     cursor.execute(sql)
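For context on what the deleted helper did: multi_hot scanned one dataframe column of comma-separated ids and gave every distinct id a consecutive integer index starting at n. A minimal sketch of the same idea on plain Python lists (no Spark; build_index_map and the sample values are illustrative, not part of the repository):

# Illustrative stand-in for the deleted multi_hot: build an id -> index map
# from comma-separated id strings, with indices starting at `start`.
def build_index_map(values, start):
    tokens = set()
    for v in values:
        tokens.update(str(v).split(","))          # split "1,5" into {"1", "5"}
    index_map = dict(zip(sorted(tokens), range(start, start + len(tokens))))
    return len(tokens), index_map

number, index_map = build_index_map(["1,5", "5,9"], start=1)
# number == 3, index_map == {"1": 1, "5": 2, "9": 3}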
@@ -61,30 +45,114 @@ def get_map():
     db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level2_ids from diary_feat"
     b = time.time()
-    leve2_number, leve2_map = get_list(db, sql, apps_number)
+    leve2_number, leve2_map = get_list(db, sql, 1 + apps_number)
     print("leve2")
     print((time.time() - b) / 60)
     db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
     sql = "select level3_ids from diary_feat"
     c = time.time()
-    leve3_number, leve3_map = get_list(db, sql, leve2_number)
+    leve3_number, leve3_map = get_list(db, sql, 1 + leve2_number + apps_number)
     print((time.time() - c) / 60)
     return apps_number, app_list_map, leve2_number, leve2_map, leve3_number, leve3_map

+def get_unique(db, sql):
+    cursor = db.cursor()
+    cursor.execute(sql)
+    result = cursor.fetchall()
+    v = list(set([i[0] for i in result]))
+    db.close()
+    print(sql)
+    print(len(v))
+    return v

 def feature_engineer():
+    apps_number, app_list_map, level2_number, level2_map, level3_number, level3_map = get_map()
+    unique_values = []
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct stat_date from esmm_train_data_dur"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct ucity_id from esmm_train_data_dur"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct ccity_name from esmm_train_data_dur"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct time from cid_time_cut"
+    unique_values.extend(get_unique(db, sql))
     db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
-    sql = "select max(stat_date) from esmm_train_data"
+    sql = "select distinct device_type from user_feature"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct manufacturer from user_feature"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct channel from user_feature"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct top from cid_type_top"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct price_min from train_Knowledge_network_data"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct treatment_method from train_Knowledge_network_data"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct price_max from train_Knowledge_network_data"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct treatment_time from train_Knowledge_network_data"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct maintain_time from train_Knowledge_network_data"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select distinct recover_time from train_Knowledge_network_data"
+    unique_values.extend(get_unique(db, sql))
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
+    sql = "select max(stat_date) from esmm_train_data_dur"
     validate_date = con_sql(db, sql)[0].values.tolist()[0]
     print("validate_date:" + validate_date)
     temp = datetime.datetime.strptime(validate_date, "%Y-%m-%d")
     start = (temp - datetime.timedelta(days=300)).strftime("%Y-%m-%d")
     print(start)
+    db = pymysql.connect(host='172.16.40.158', port=4000, user='root', passwd='3SYz54LS9#^9sBvC')
+    sql = "select doctor.hospital_id from jerry_test.esmm_train_data_dur e " \
+          "left join eagle.src_zhengxing_api_service service on e.diary_service_id = service.id " \
+          "left join eagle.src_zhengxing_api_doctor doctor on service.doctor_id = doctor.id " \
+          "where e.stat_date >= '{}'".format(start)
+    unique_values.extend(get_unique(db, sql))
+    features = ["ucity_id", "ccity_name", "device_type", "manufacturer", "channel", "top", "time", "stat_date",
+                "hospital_id", "treatment_method", "price_min", "price_max", "treatment_time", "maintain_time",
+                "recover_time"]
+    unique_values.extend(features)
+    print("unique_values length")
+    print(len(unique_values))
+    temp = list(range(2 + apps_number + level2_number + level3_number,
+                      2 + apps_number + level2_number + level3_number + len(unique_values)))
+    value_map = dict(zip(unique_values, temp))
     sql = "select e.y,e.z,e.stat_date,e.ucity_id,feat.level2_ids,e.ccity_name,u.device_type,u.manufacturer," \
           "u.channel,c.top,cut.time,dl.app_list,feat.level3_ids,doctor.hospital_id," \
           "wiki.tag as tag1,question.tag as tag2,search.tag as tag3,budan.tag as tag4," \
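The new code above packs every categorical vocabulary into one shared integer index space: the app_list ids presumably start at 1, level2 ids start at 1 + apps_number, level3 ids follow those, and all remaining distinct values pulled via get_unique take the range starting at 2 + apps_number + level2_number + level3_number. A small sketch of that layout with made-up sizes (the numbers and feature values below are illustrative only):

# Illustrative only: how the disjoint index ranges in value_map line up.
apps_number, level2_number, level3_number = 1000, 200, 500   # hypothetical vocabulary sizes
unique_values = ["beijing", "ios", "wifi", "stat_date"]      # hypothetical remaining values

offset = 2 + apps_number + level2_number + level3_number     # = 1702
temp = list(range(offset, offset + len(unique_values)))
value_map = dict(zip(unique_values, temp))
# value_map == {"beijing": 1702, "ios": 1703, "wifi": 1704, "stat_date": 1705}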
@@ -113,36 +181,17 @@ def feature_engineer():
                    "channel", "top", "time", "stat_date", "app_list", "hospital_id", "level3_ids",
                    "tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"])

-    features = ["ucity_id", "ccity_name", "device_type", "manufacturer", "channel", "top", "time", "stat_date",
-                "hospital_id", "treatment_method", "price_min", "price_max", "treatment_time", "maintain_time",
-                "recover_time"]
     df = df.na.fill(dict(zip(features, features)))

-    apps_number, app_list_map = multi_hot(df, "app_list", 1)
-    level2_number, leve2_map = multi_hot(df, "level2_ids", 1 + apps_number)
-    level3_number, leve3_map = multi_hot(df, "level3_ids", 1 + apps_number + level2_number)
-
-    unique_values = []
-    for i in features:
-        a = time.time()
-        unique_values.extend(list(set(df.select(i).rdd.map(lambda x: x[0]).collect())))
-        b = time.time()
-        print(i)
-        print((b - a) / 60)
-
-    temp = list(range(2 + apps_number + level2_number + level3_number,
-                      2 + apps_number + level2_number + level3_number + len(unique_values)))
-    value_map = dict(zip(unique_values, temp))

     c = time.time()
     rdd = df.select("stat_date", "y", "z", "app_list", "level2_ids", "level3_ids",
                     "tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7",
                     "ucity_id", "ccity_name", "device_type", "manufacturer", "channel", "top", "time",
                     "hospital_id", "treatment_method", "price_min", "price_max", "treatment_time",
                     "maintain_time", "recover_time").rdd.repartition(200).map(lambda x: (x[0], float(x[1]), float(x[2]), app_list_func(x[3], app_list_map), app_list_func(x[4], leve2_map),
-                   app_list_func(x[5], leve3_map), app_list_func(x[6], leve2_map), app_list_func(x[7], leve2_map),
-                   app_list_func(x[8], leve2_map), app_list_func(x[9], leve2_map), app_list_func(x[10], leve2_map),
-                   app_list_func(x[11], leve2_map), app_list_func(x[12], leve2_map),
+                   app_list_func(x[5], level3_map), app_list_func(x[6], level2_map), app_list_func(x[7], level2_map),
+                   app_list_func(x[8], level2_map), app_list_func(x[9], level2_map), app_list_func(x[10], level2_map),
+                   app_list_func(x[11], level2_map), app_list_func(x[12], level2_map),
                    [value_map[x[0]], value_map[x[13]], value_map[x[14]], value_map[x[15]], value_map[x[16]],
                     value_map[x[17]], value_map[x[18]], value_map[x[19]], value_map[x[20]], value_map[x[21]],
                     value_map[x[22]], value_map[x[23]], value_map[x[24]], value_map[x[25]], value_map[x[26]]]))
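app_list_func itself is not shown in this diff (only its trailing "return e" is visible above), so the following is a hedged stand-in for how a comma-separated id string is presumably turned into integer indices through one of these maps; the function name and the fallback value 0 are assumptions made for illustration only:

# Hypothetical stand-in for app_list_func; the real body is not part of this diff.
def app_list_func_sketch(x, id_map):
    # map each comma-separated token to its integer index, 0 for unknown tokens
    return [id_map.get(token, 0) for token in str(x).split(",")]

app_list_func_sketch("1,5,9", {"1": 1, "5": 2, "9": 3})   # -> [1, 2, 3]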
@@ -151,14 +200,6 @@ def feature_engineer():
     print((d - c) / 60)
     rdd.persist()

     # TODO: after going live, delete the train filter below, because the most recent day's data should also be used as training data
-    # train = rdd.filter(lambda x: x[0] != validate_date) \
-    #     .map(lambda x: (float(x[1]),float(x[2]),app_list_func(x[3], app_list_map), app_list_func(x[4], leve2_map),
-    #                     app_list_func(x[5], leve3_map), app_list_func(x[6], leve2_map),app_list_func(x[7], leve2_map),
-    #                     app_list_func(x[8], leve2_map), app_list_func(x[9], leve2_map),app_list_func(x[10], leve2_map),
-    #                     app_list_func(x[11], leve2_map),app_list_func(x[12], leve2_map),
-    #                     [value_map[x[0]], value_map[x[13]],value_map[x[14]], value_map[x[15]], value_map[x[16]],
-    #                     value_map[x[17]],value_map[x[18]], value_map[x[19]], value_map[x[20]],value_map[x[21]],
-    #                     value_map[x[22]], value_map[x[23]], value_map[x[24]],value_map[x[25]],value_map[x[26]]]))
     train = rdd.filter(lambda x: x[0] != validate_date).map(lambda x: (x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9],
                                                                        x[10], x[11], x[12], x[13]))
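The filter above keeps every row whose stat_date is not the validation day as training data; presumably the held-out day becomes the validation set in the elided code that follows. A plain-Python sketch of that date-based split (the sample rows are invented):

# Sketch of the stat_date-based split, using plain lists instead of the RDD.
rows = [("2019-05-23", 0.0, 1.0), ("2019-05-24", 1.0, 0.0)]   # (stat_date, y, z) samples
validate_date = "2019-05-24"
train = [r for r in rows if r[0] != validate_date]
test = [r for r in rows if r[0] == validate_date]
# train keeps the earlier day; test holds only the most recent (validation) day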
@@ -183,10 +224,10 @@ def feature_engineer():
     rdd.unpersist()

-    return validate_date, value_map, app_list_map, leve2_map, leve3_map
+    return validate_date, value_map, app_list_map, level2_map, level3_map

-def get_predict(date, value_map, app_list_map, leve2_map, leve3_map):
+def get_predict():
     sql = "select e.y,e.z,e.label,e.ucity_id,feat.level2_ids,e.ccity_name," \
           "u.device_type,u.manufacturer,u.channel,c.top,e.device_id,e.cid_id,cut.time," \
           "dl.app_list,e.hospital_id,feat.level3_ids," \
@@ -222,12 +263,12 @@ def get_predict(date,value_map,app_list_map,leve2_map,leve3_map):
                     "hospital_id", "treatment_method", "price_min", "price_max", "treatment_time",
                     "maintain_time", "recover_time") \
         .rdd.repartition(200).map(lambda x: (x[0], float(x[1]), float(x[2]), x[3], x[4], x[5],
-                                             app_list_func(x[6], app_list_map), app_list_func(x[7], leve2_map),
-                                             app_list_func(x[8], leve3_map), app_list_func(x[9], leve2_map),
-                                             app_list_func(x[10], leve2_map), app_list_func(x[11], leve2_map),
-                                             app_list_func(x[12], leve2_map), app_list_func(x[13], leve2_map),
-                                             app_list_func(x[14], leve2_map), app_list_func(x[15], leve2_map),
-                                             [value_map.get(date, 299999), value_map.get(x[16], 299998),
+                                             app_list_func(x[6], app_list_map), app_list_func(x[7], level2_map),
+                                             app_list_func(x[8], level3_map), app_list_func(x[9], level2_map),
+                                             app_list_func(x[10], level2_map), app_list_func(x[11], level2_map),
+                                             app_list_func(x[12], level2_map), app_list_func(x[13], level2_map),
+                                             app_list_func(x[14], level2_map), app_list_func(x[15], level2_map),
+                                             [value_map.get(validate_date, 299999), value_map.get(x[16], 299998),
                                               value_map.get(x[17], 299997), value_map.get(x[18], 299996),
                                               value_map.get(x[19], 299995), value_map.get(x[20], 299994),
                                               value_map.get(x[21], 299993), value_map.get(x[22], 299992),
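At predict time the lookups switch from value_map[...] to value_map.get(..., default): a value that never appeared in training falls back to a reserved per-column id (299999, 299998, ...) instead of raising KeyError. A tiny sketch of that behaviour (the map contents below are invented):

# Sketch of the reserved fallback ids used at predict time.
value_map = {"beijing": 1702, "2019-05-24": 1705}   # hypothetical training-time map
city = "some_city_never_seen_in_training"
city_index = value_map.get(city, 299996)            # -> 299996, one of the reserved defaults
date_index = value_map.get("2019-05-24", 299999)    # -> 1705, found in the map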
@@ -286,25 +327,25 @@ def con_sql(db,sql):

 if __name__ == '__main__':
-    # sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
-    #     .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
-    #     .set("spark.tispark.plan.allow_index_double_read", "false") \
-    #     .set("spark.tispark.plan.allow_index_read", "true") \
-    #     .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
-    #     .set("spark.tispark.pd.addresses", "172.16.40.158:2379").set("spark.io.compression.codec", "lzf")\
-    #     .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec","snappy")
-    #
-    # spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
-    # ti = pti.TiContext(spark)
-    # ti.tidbMapDatabase("jerry_test")
-    # ti.tidbMapDatabase("eagle")
-    # spark.sparkContext.setLogLevel("WARN")
-    # path = "hdfs:///strategy/esmm/"
-    # local_path = "/home/gmuser/esmm/"
-    #
-    # validate_date, value_map, app_list_map, leve2_map, leve3_map = feature_engineer()
-    # get_predict(validate_date, value_map, app_list_map, leve2_map, leve3_map)
-    get_map()
+    sparkConf = SparkConf().set("spark.hive.mapred.supports.subdirectories", "true") \
+        .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") \
+        .set("spark.tispark.plan.allow_index_double_read", "false") \
+        .set("spark.tispark.plan.allow_index_read", "true") \
+        .set("spark.sql.extensions", "org.apache.spark.sql.TiExtensions") \
+        .set("spark.tispark.pd.addresses", "172.16.40.158:2379").set("spark.io.compression.codec", "lzf")\
+        .set("spark.driver.maxResultSize", "8g").set("spark.sql.avro.compression.codec","snappy")
+
+    spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
+    ti = pti.TiContext(spark)
+    ti.tidbMapDatabase("jerry_test")
+    ti.tidbMapDatabase("eagle")
+    spark.sparkContext.setLogLevel("WARN")
+    path = "hdfs:///strategy/esmm/"
+    local_path = "/home/gmuser/esmm/"
+
+    validate_date, value_map, app_list_map, level2_map, level3_map = feature_engineer()
+    get_predict()