Project ffm-baseline · Commit 8e34ccfe
authored Dec 06, 2018 by 张彦钊
add test file
parent 9a00fb62
Showing 2 changed files with 228 additions and 2 deletions:
  tensnsorflow/ffm.py   +2 -2
  tensnsorflow/test.py  +226 -0
tensnsorflow/ffm.py

@@ -135,9 +135,9 @@ class multiFFMFormatPandas:
             col_type = t[col]
             name = '{}_{}'.format(col, val)
             if col_type.kind == 'O':
-                ffm.append('{}:{}:1'.format(self.field_index_[col] + 1, self.feature_index_[name] + 1))
+                ffm.append('{}:{}:1'.format(self.field_index_[col] + 1, self.feature_index_[name]))
             elif col_type.kind == 'i':
-                ffm.append('{}:{}:{}'.format(self.field_index_[col] + 1, self.feature_index_[col] + 1, val))
+                ffm.append('{}:{}:{}'.format(self.field_index_[col] + 1, self.feature_index_[col], val))
         result = ' '.join(ffm)
         if self.y is not None:
             result = str(row.loc[row.index == self.y][0]) + "," + result
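The two changed lines drop the "+ 1" offset from the feature index, so each emitted libffm-style field:feature:value term keeps a 1-based field id but now uses a 0-based feature id. A minimal sketch of the effect (toy indices invented for illustration, not taken from the repository):

# Toy field/feature indices, invented for illustration only.
field_index_ = {'city': 0}
feature_index_ = {'city_beijing': 0}

col, name = 'city', 'city_beijing'
old = '{}:{}:1'.format(field_index_[col] + 1, feature_index_[name] + 1)  # before: '1:1:1'
new = '{}:{}:1'.format(field_index_[col] + 1, feature_index_[name])      # after:  '1:0:1'
print(old, new)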
tensnsorflow/test.py (new file, 0 → 100644)
import json
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd
from pyspark import SparkConf
from multiprocessing import Pool


class multiFFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None
        self.z = None

    def fit(self, df, y=None, z=None):
        self.y = y
        self.z = z
        df_ffm = df[df.columns.difference([self.y, self.z])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))
        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0
        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None, z=None, n=50000, processes=4):
        # n is the maximum number of rows each worker handles; processes is the number of worker processes
        self.fit(df, y, z)
        return self.transform(df, n, processes)

    def transform_row_(self, row, t):
        ffm = []
        ffm.append('1,')
        if self.y is not None:
            ffm.append(str(row.loc[row.index == self.y][0]) + ',')
        if self.y is None:
            ffm.append(str(0) + ',')
        if self.z is not None:
            ffm.append(str(row.loc[row.index == self.z][0]) + ',')
        if self.z is None:
            ffm.append(str(0) + ',')
        ffm1 = []
        row11 = []
        for index in row.index:
            if index != self.y and index != self.z:
                row11.append(index)
        for col, val in row.loc[row11].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':
                ffm1.append('{}:{}:1.0'.format(self.field_index_[col] + 1, self.feature_index_[name] + 1))
            elif col_type.kind == 'i':
                ffm1.append('{}:{}:{}'.format(self.field_index_[col] + 1, self.feature_index_[col] + 1, val))
            elif col_type.kind == 'f':
                ffm1.append('{}:{}:{}'.format(self.field_index_[col] + 1, self.feature_index_[col] + 1, val))
        return ''.join(ffm) + ' '.join(ffm1)

    def transform(self, df, n=1500, processes=2):
        # n is the maximum number of rows each worker handles; processes is the number of worker processes
        t = df.dtypes.to_dict()
        data_list = self.data_split_line(df, n)
        # set the number of worker processes
        pool = Pool(processes)
        print("total batches: " + str(len(data_list)))
        for i in range(len(data_list)):
            data_list[i] = pool.apply_async(self.pool_function, (data_list[i], t,))
        result_map = {}
        for i in data_list:
            result_map.update(i.get())
        pool.close()
        pool.join()
        return pd.Series(result_map)

    # worker function run inside each process of the multiprocessing pool
    def pool_function(self, df, t):
        return {idx: self.transform_row_(row, t) for idx, row in df.iterrows()}

    # split the data: given a dataframe and a step size, return a list of dataframes,
    # each holding at most `step` rows
    def data_split_line(self, data, step):
        data_list = []
        x = 0
        while True:
            if x + step < len(data):
                data_list.append(data.iloc[x:x + step])
                x += step  # start the next chunk at the first unprocessed row
            else:
                data_list.append(data.iloc[x:len(data)])
                break
        return data_list

    # plain transform method; does not need multiprocessing
    def native_transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

    # The following method is not part of the original class; it was newly added
    # to check whether a given user exists in the training data set.
    def is_feature_index_exist(self, name):
        return name in self.feature_index_


sc = SparkContext(conf=SparkConf().setMaster("spark://nvwa:7077").setAppName("mnist_streaming")
                  .set("spark.streaming.kafka.maxRatePerPartition", 100))
ssc = SQLContext(sc)
data1 = ssc.read.format("jdbc").options(
    url="jdbc:mysql://10.66.157.22:4000",
    driver="com.mysql.jdbc.Driver",
    dbtable="(select device_id,app_list from jerry_prod.device_id_applist where stat_date = '20181119') tmp",
    user="root",
    password="3SYz54LS9#^9sBvC").load()
data1.show(1)


def is_json(myjson):
    try:
        json.loads(myjson)
    except ValueError:
        return False
    return True


appnamelist = ssc.read.format("jdbc").options(
    url="jdbc:mysql://10.66.157.22:4000",
    driver="com.mysql.jdbc.Driver",
    dbtable="(select * from jerry_test.esmm_lqd_dict) tmp",
    user="root",
    password="3SYz54LS9#^9sBvC").load()
appnamelist = appnamelist.toPandas()
appnamelist = appnamelist.iloc[:, 0].tolist()
appNamelist = ['device_id']
for i in appnamelist:
    appNamelist.append(i)


# turn one (device_id, app_list JSON) pair into a 0/1 indicator row over appnamelist
def app1(y, x, appnamelist):
    if is_json(x):
        app = json.loads(x, strict=False)
        length = len(appnamelist)
        appNamelist = [0 for n in range(length)]
        appNamelist[0] = y
        for i in range(1, length):
            for j in app:
                if appnamelist[i] == j["appName"]:
                    appNamelist[i] = 1
        return appNamelist
    else:
        return None


applist = data1.rdd.map(lambda x: app1(x[0], x[1], appNamelist)).filter(lambda x: x).toDF(appNamelist)
# applist.write.format('jdbc').options(
#     url='jdbc:mysql://192.168.15.12:4000',
#     driver='com.mysql.jdbc.Driver',
#     dbtable='jerry_prod.applist_esmm',
#     user='root',
#     password='').mode('append').save()
applistPD = applist.toPandas()
# userlist = pd.read_csv('esmm_data.csv', sep="\t")
data2 = ssc.read.format("jdbc").options(
    url="jdbc:mysql://10.66.157.22:4000",
    driver="com.mysql.jdbc.Driver",
    dbtable="(select device_id,date,y,z,ucity_id,clevel1_id,slevel1_id,ccity_name,scity_id,channel from jerry_test.newesmmdata where date ='2018-11-19') tmp",
    user="root",
    password="3SYz54LS9#^9sBvC").load()
userlist = data2.toPandas()
df = pd.merge(userlist, applistPD, how='left', on='device_id')
dfwithoutid = df.iloc[:, 2:-1]
device_id = df.iloc[:, 0:2]
dffill = dfwithoutid.fillna(0)
print(dffill.head(1))
model = multiFFMFormatPandas()
df = model.fit_transform(dffill, y="y", z="z", n=80000, processes=20)
# print(df.head(1))
df = pd.concat([device_id, df], axis=1)
df = df.astype(str)
df1 = ssc.createDataFrame(df, schema=['device_id', 'stat_date', 'data'])
df1.repartition(200).write.format('jdbc').options(
    url='jdbc:mysql://10.66.157.22:4000',
    driver='com.mysql.jdbc.Driver',
    dbtable='jerry_test.esmm_lqd_data',
    user='root',
    password='3SYz54LS9#^9sBvC').mode('append').save()
\ No newline at end of file
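As a quick local sanity check of the encoder, without the Spark and MySQL dependencies, the following minimal sketch (toy DataFrame and values invented for illustration; it assumes the multiFFMFormatPandas class above is in scope) exercises fit and native_transform:

import pandas as pd

toy = pd.DataFrame({
    'y': [1, 0],                          # click label
    'z': [0, 1],                          # conversion label
    'ucity_id': ['beijing', 'shanghai'],  # object dtype -> kind 'O'
    'clevel1_id': [3, 7],                 # integer dtype -> kind 'i'
})

model = multiFFMFormatPandas()
model.fit(toy, y='y', z='z')
print(model.native_transform(toy))
# Each row is encoded as "1,<y>,<z>," followed by space-separated
# field:feature:value terms; here the first row comes out roughly as
# "1,1,0,2:7:1.0 1:12:3" (exact ids depend on column and value order).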