ML / ffm-baseline · Commits · 8eca148c

Commit 8eca148c, authored Nov 30, 2018 by 张彦钊
Add saving to CSV (增加保存到csv)
parent 280a03ce

Showing 1 changed file with 34 additions and 42 deletions

tensnsorflow/ffm.py   +34   -42
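Per the commit message, the change adds writing the transformed data to CSV. A minimal sketch of the kind of call involved, assuming the `path` set in the `__main__` block and a `table` file name as passed to `transform()` in this diff; the toy DataFrame contents are made up:

import pandas as pd

path = "/data/ffm/"                      # as set in the __main__ block of the diff
table = "crvtrain.csv"                   # file name passed to transform() in the diff
df = pd.DataFrame({"device_id": ["dev_001"], "ffm": ["0,1:3:1 2:7:1"]})  # toy data, illustrative only
df.to_csv(path + table, index=None)      # writes /data/ffm/crvtrain.csv without the index column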
@@ -24,16 +24,16 @@ def get_data():
    esmm = esmm.rename(columns={0: "stat_date", 1: "device_id", 2: "ucity_id", 3: "cid_id", 4: "diary_service_id", 5: "y", 6: "z", 7: "clevel1_id", 8: "slevel1_id"})
    print("esmm data ok")
    print(esmm.head())
    print(esmm.shape)
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='eagle')
    sql = "select * from home_tab_click"
    temp = con_sql(db, sql)
    temp = temp.rename(columns={0: "device_id"})
    print("click data ok")
    print(temp.head())
    df = pd.merge(esmm, temp, on="device_id").dropna()
    # print(temp.head())
    df = pd.merge(esmm, temp, on="device_id", how='left').fillna(0)
    print(df.head())
    print(df.shape)
    df["diary_service_id"] = df["diary_service_id"].astype("str")
    df["clevel1_id"] = df["clevel1_id"].astype("str")
@@ -46,9 +46,9 @@ def get_data():
    df = df.drop("z", axis=1)
    print(df.head())
    train = df[df["stat_date"] != "2018-11-25"]
    transform(train, "train")
    transform(train, "crvtrain.csv")
    test = df[df["stat_date"] == "2018-11-25"]
    transform(test, "test")
    transform(test, "crvtest.csv")
def transform(df, table):
@@ -56,28 +56,30 @@ def transform(df,table):
    df = model.fit_transform(df, y="y", n=50000, processes=20)
    df = pd.DataFrame(df)
    df["device_id"] = df[0].apply(lambda x: x.split(",")[0])
    df["ucity_id"] = df[0].apply(lambda x: x.split(",")[1])
    df["cid_id"] = df[0].apply(lambda x: x.split(",")[2])
    df["y"] = df[0].apply(lambda x: x.split(",")[3])
    df["ffm"] = df[0].apply(lambda x: x.split(",")[4])
    df["city_id"] = df[0].apply(lambda x: x.split(",")[1])
    df["diary_id"] = df[0].apply(lambda x: x.split(",")[2])
    # df["y"] = df[0].apply(lambda x: x.split(",")[3])
    df["seq"] = list(range(df.shape[0]))
    df["seq"] = df["seq"].astype("str")
    df["ffm"] = df["seq"].str.cat([df["y"].values.tolist(), df["ffm"].values.tolist()], sep=",")
    df["number"] = np.random.randint(1, 2147483647, df.shape[0])
    df = df.drop(0, axis=1)
    df["ffm"] = df[0].apply(lambda x: x.split(",")[3:])
    df["ffm"] = df["seq"].str.cat(df["ffm"], sep=",")
    df["random"] = np.random.randint(1, 2147483647, df.shape[0])
    df = df.drop(0, axis=1).drop("seq", axis=1)
    print("size")
    print(df.shape)
    yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    for i in range(0, df.shape[0], 10000):
        print(i)
        if i == 0:
            temp = df.loc[0:10000]
        elif i + 10000 > df.shape[0]:
            temp = df.loc[i:]
        else:
            temp = df.loc[i + 1:i + 10000]
        pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
        print("insert done")
    df.to_csv(path + table, index=None)
    # yconnect = create_engine('mysql+pymysql://root:3SYz54LS9#^9sBvC@10.66.157.22:4000/jerry_test?charset=utf8')
    # n = 100000
    # for i in range(0, df.shape[0], n):
    #     print(i)
    #     if i == 0:
    #         temp = df.loc[0:n]
    #     elif i + n > df.shape[0]:
    #         temp = df.loc[i+1:]
    #     else:
    #         temp = df.loc[i+1:i+n]
    #     pd.io.sql.to_sql(temp, table, yconnect, schema='jerry_test', if_exists='append', index=False)
    #     print("insert done")
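One detail worth noting about the batched write above: on the default integer RangeIndex, `df.loc[a:b]` is label-based and inclusive of both endpoints, which is why the else branch slices with `i + 1`. A small illustrative snippet (names are illustrative, not from the diff):

import pandas as pd

d = pd.DataFrame({"v": range(5)})
d.loc[0:2]    # label-based slice: rows 0, 1 and 2 (both endpoints included)
d.iloc[0:2]   # position-based slice: rows 0 and 1 only (end excluded)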
@@ -123,11 +125,6 @@ class multiFFMFormatPandas:
    def transform_row_(self, row, t):
        ffm = []
        if self.y is not None:
            ffm.append(str(row.loc[row.index == self.y][0]))
        if self.y is None:
            ffm.append(str(0))
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
@@ -135,7 +132,12 @@ class multiFFMFormatPandas:
                ffm.append('{}:{}:1'.format(self.field_index_[col] + 1, self.feature_index_[name] + 1))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col] + 1, self.feature_index_[col] + 1, val))
        return ' '.join(ffm)
        result = ' '.join(ffm)
        if self.y is not None:
            result = str(row.loc[row.index == self.y][0]) + "," + result
        if self.y is None:
            result = str(0) + "," + result
        return result

    def transform(self, df, n=1500, processes=2):
        # n is the maximum number of rows each thread handles; processes is the number of threads
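The strings assembled in transform_row_ follow the libffm text layout: a label followed by space-separated field:feature:value triples with 1-based indices (hence the +1 offsets). A toy sketch of the categorical branch, with made-up index maps that only illustrate the shape of the output:

# Hypothetical index maps: one field per column, one feature per (column, value) pair.
field_index_ = {"ucity_id": 0, "clevel1_id": 1}
feature_index_ = {"ucity_id_beijing": 11, "clevel1_id_7": 42}

col, val = "ucity_id", "beijing"
name = '{}_{}'.format(col, val)
token = '{}:{}:1'.format(field_index_[col] + 1, feature_index_[name] + 1)
# token == "1:12:1": field 1, feature 12, value 1 for a one-hot style categorical column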
@@ -172,18 +174,6 @@ class multiFFMFormatPandas:
                data_list.append(data.iloc[x:data.__len__()])
                break
        '''
        # Generator-based version; local testing showed it was not efficient
        x = 0
        while True:
            if x + step < data.__len__():
                yield data.iloc[x:x + step]
                x = x + step + 1
            else:
                yield data.iloc[x:data.__len__()]
                break
        '''
        return data_list

    # Native transform method, no multiprocessing needed
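For orientation, both the kept data_list approach and the removed generator variant split the frame into fixed-size pieces so each piece can be transformed in its own process. A simplified sketch of that chunking idea using iloc (not the class's exact method; the step size and names are illustrative):

import pandas as pd

def split_frame(data, step):
    # Position-based, end-exclusive slices cover every row exactly once.
    return [data.iloc[x:x + step] for x in range(0, len(data), step)]

parts = split_frame(pd.DataFrame({"v": range(10)}), 3)   # 4 chunks: sizes 3, 3, 3, 1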
@@ -201,4 +191,5 @@ class multiFFMFormatPandas:
if __name__ == "__main__":
    path = "/data/ffm/"
    get_data()
\ No newline at end of file