Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
e72f288f
Commit
e72f288f
authored
Aug 16, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add multiprocess get_eachCityDiaryTop3000
parent
d0a857c9
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
71 additions
and
63 deletions
+71
-63
diaryCandidateSet.py
diaryCandidateSet.py
+36
-48
testCases.py
local/testCases.py
+35
-15
No files found.
diaryCandidateSet.py
View file @
e72f288f
...
...
@@ -3,6 +3,7 @@ import pandas as pd
from
utils
import
*
from
config
import
*
import
numpy
as
np
import
time
# 候选集cid只能从训练数据集cid中选择
...
...
@@ -36,53 +37,6 @@ def get_cityList():
return
cityList
def pool_method(i, sql, allCitiesTop3000):
    """Build the diary Top-3000 candidate set for one city and save it to CSV.

    Fetches the city's (city_id, cid) rows via `sql`; when fewer than 3000
    diaries remain after filtering, pads with nationwide Top-3000 diaries
    from other cities.

    :param i: city id; used to exclude this city from the nationwide padding
        set and to name the output file.
    :param sql: SQL string returning (city_id, cid) rows for this city.
    :param allCitiesTop3000: DataFrame of nationwide Top-3000 diaries with
        columns "city_id" and "cid".
    """
    data = con_sql(sql)
    data = data.rename(columns={0: "city_id", 1: "cid"})
    data = filter_cid(data)
    if data.shape[0] < 3000:
        n = 3000 - data.shape[0]
        # Pad with nationwide Top-3000 diaries, excluding this city's own.
        # BUG FIX: the original used `.loc[:n - 1]`, which is label-based;
        # after the boolean filter the index labels are non-contiguous, so it
        # could select more (or fewer) than n rows. `.head(n)` takes exactly n.
        temp = allCitiesTop3000[allCitiesTop3000["city_id"] != i].head(n)
        # pd.concat replaces DataFrame.append, removed in pandas 2.0.
        data = pd.concat([data, temp])
    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(i)
    data.to_csv(file_name, index=False)
    print("成功保存{}地区DiaryTop3000".format(i))
# 把城市列表切分成n份,然后拼接成一个列表
# def split_cityList(cityList,n):
# l = len(cityList)
# step = np.rint(l/n)
# new_list = []
# x = 0
# while True:
# if x + step < :
# data_list.append(data.iloc[x:x + step])
# x = x + step + 1
# else:
# data_list.append(data.iloc[x:data.__len__()])
# break
# 多线程方法获取全国城市热门日记
# def multi_get_eachCityDiaryTop3000(processes):
# cityList = get_cityList()
# allCitiesTop3000 = get_allCitiesDiaryTop3000()
#
# pool = Pool(processes)
# for i in range(len(data_list)):
# data_list[i] = pool.apply_async(self.pool_function, (data_list[i], t,))
#
# result_map = {}
# for i in data_list:
# result_map.update(i.get())
# pool.close()
# pool.join()
def
get_eachCityDiaryTop3000
():
# 获取每个城市点击量TOP3000日记,如果数量小于3000,用全国点击量TOP3000日记补充
cityList
=
get_cityList
()
...
...
@@ -106,6 +60,40 @@ def get_eachCityDiaryTop3000():
data
.
to_csv
(
file_name
,
index
=
False
)
print
(
"成功保存{}地区DiaryTop3000"
.
format
(
i
))
def pool_method(city, sql, allCitiesTop3000):
    """Build the diary Top-3000 candidate set for one city and save it to CSV.

    Worker function for the multiprocessing pool. Fetches the city's
    (city_id, cid) rows via `sql`; when fewer than 3000 diaries remain after
    filtering, pads with nationwide Top-3000 diaries from other cities.

    :param city: city id; used to exclude this city from the nationwide
        padding set and to name the output file.
    :param sql: SQL string returning (city_id, cid) rows for this city.
    :param allCitiesTop3000: DataFrame of nationwide Top-3000 diaries with
        columns "city_id" and "cid".
    """
    data = con_sql(sql)
    data = data.rename(columns={0: "city_id", 1: "cid"})
    data = filter_cid(data)
    if data.shape[0] < 3000:
        n = 3000 - data.shape[0]
        # Pad with nationwide Top-3000 diaries, excluding this city's own.
        # BUG FIX: the original used `.loc[:n - 1]`, which is label-based;
        # after the boolean filter the index labels are non-contiguous, so it
        # could select more (or fewer) than n rows. `.head(n)` takes exactly n.
        temp = allCitiesTop3000[allCitiesTop3000["city_id"] != city].head(n)
        # pd.concat replaces DataFrame.append, removed in pandas 2.0.
        data = pd.concat([data, temp])
    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(city)
    data.to_csv(file_name, index=False)
    print("成功保存{}地区DiaryTop3000".format(city))
# Fetch every city's hot-diary candidate set using a process pool.
def multi_get_eachCityDiaryTop3000(processes):
    """Run pool_method for every city in parallel with `processes` workers.

    :param processes: number of worker processes for the Pool.

    BUG FIX: the original discarded the AsyncResult objects returned by
    apply_async, so any exception raised inside a worker was silently
    swallowed. Results are now collected and `.get()` is called after
    join(), which re-raises worker exceptions in the parent process.
    """
    city_list = get_cityList()
    allCitiesTop3000 = get_allCitiesDiaryTop3000()
    pool = Pool(processes)
    results = []
    for city in city_list:
        sql = "select city_id,cid from data_feed_click " \
              "where cid_type = 'diary' and city_id = '{0}' group by cid " \
              "order by max(click_count_choice) desc limit 3000".format(city)
        results.append(pool.apply_async(pool_method, (city, sql, allCitiesTop3000,)))
    pool.close()
    pool.join()
    # Surface worker errors; get() re-raises anything a worker raised.
    for r in results:
        r.get()
if __name__ == "__main__":
    # Serial pass first, then the multiprocess version (only the latter is
    # timed). NOTE(review): running both recomputes every city's candidate
    # set twice — confirm the serial call is intentional.
    get_eachCityDiaryTop3000()
    t0 = time.time()
    multi_get_eachCityDiaryTop3000(6)
    elapsed_minutes = (time.time() - t0) / 60
    print("获取各城市热门日记耗时{}分".format(elapsed_minutes))
local/testCases.py
View file @
e72f288f
...
...
@@ -2,20 +2,40 @@ from utils import *
import
datetime
import
pickle
if
__name__
==
'__main__'
:
data
=
pd
.
read_csv
(
"../data/test-data/raw-exposure.csv"
)[[
"cid"
,
"device_id"
]]
data
[
"y"
]
=
1
test_data
=
data
.
tail
(
1
)
ffm
=
FFMFormatPandas
()
data
=
ffm
.
fit_transform
(
data
,
y
=
'y'
)
data
.
to_csv
(
"../data/ffm_data.csv"
,
index
=
False
)
def split_cityList(cityList, n):
    """Split cityList into exactly n contiguous chunks of near-equal size.

    :param cityList: sequence of city ids.
    :param n: desired number of chunks.
    :return: a list of n sub-sequences covering cityList in order; when
        len(cityList) <= n the original sequence is returned unchanged
        (kept for backward compatibility with existing callers).
    """
    l = len(cityList)
    if l <= n:
        return cityList
    # BUG FIX: the original stepped by round(l / n), which can yield n+1 or
    # n-1 chunks (e.g. 22 items with n=5 produced 6 chunks). divmod spreads
    # the remainder over the first `extra` chunks so exactly n come back.
    step, extra = divmod(l, n)
    chunks = []
    start = 0
    for idx in range(n):
        size = step + (1 if idx < extra else 0)
        chunks.append(cityList[start:start + size])
        start += size
    return chunks
if __name__ == '__main__':
    # Smoke test: chunk 22 sequential ids into 5 parts and show the result.
    sample_ids = list(range(22))
    chunks = split_cityList(sample_ids, 5)
    print(chunks)
with
open
(
"../data/ffm.object"
,
"wb"
)
as
f
:
pickle
.
dump
(
ffm
,
f
)
with
open
(
"../data/ffm.object"
,
"rb"
)
as
f
:
ffm
=
pickle
.
load
(
f
)
result
=
ffm
.
transform
(
test_data
)
print
(
result
)
data_1
=
pd
.
read_csv
(
"../data/ffm_data.csv"
,
header
=
None
)
.
tail
(
5
)
print
(
data_1
)
# data = pd.read_csv("../data/test-data/raw-exposure.csv")[["cid", "device_id"]]
# data["y"] = 1
# test_data = data.tail(1)
#
# ffm = FFMFormatPandas()
# data = ffm.fit_transform(data, y='y')
# data.to_csv("../data/ffm_data.csv", index=False)
#
# with open("../data/ffm.object", "wb") as f:
# pickle.dump(ffm, f)
# with open("../data/ffm.object", "rb") as f:
# ffm = pickle.load(f)
# result = ffm.transform(test_data)
# print(result)
# data_1 = pd.read_csv("../data/ffm_data.csv", header=None).tail(5)
# print(data_1)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment