ffm-baseline / Commit 3fcaa79f
Authored Aug 09, 2018 by 张彦钊
Diary candidate set changed from top 2000 to top 3000
Parent: aeddd470
Showing 3 changed files with 22 additions and 22 deletions:
diaryCandidateSet.py  +19 -19
predictDiary.py       +2 -2
utils.py              +1 -1
diaryCandidateSet.py
@@ -16,17 +16,17 @@ def filter_cid(df):
     return df


-def get_allCitiesDiaryTop2000():  # fetch the nationwide top 2000 diaries by click count
+def get_allCitiesDiaryTop3000():  # fetch the nationwide top 3000 diaries by click count
     sql = "select city_id,cid from data_feed_click " \
-          "where cid_type = 'diary' group by cid order by max(click_count_choice) desc limit 2000"
-    allCitiesTop2000 = con_sql(sql)
-    allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
-    allCitiesTop2000 = filter_cid(allCitiesTop2000)
-    allCitiesTop2000.to_csv(DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop2000.csv", index=False)
-    print("成功获取全国日记点击量TOP2000")
-    return allCitiesTop2000
+          "where cid_type = 'diary' group by cid order by max(click_count_choice) desc limit 3000"
+    allCitiesTop3000 = con_sql(sql)
+    allCitiesTop3000 = allCitiesTop3000.rename(columns={0: "city_id", 1: "cid"})
+    allCitiesTop3000 = filter_cid(allCitiesTop3000)
+    allCitiesTop3000.to_csv(DIRECTORY_PATH + "diaryTestSet/allCitiesDiaryTop3000.csv", index=False)
+    print("成功获取全国日记点击量TOP3000")
+    return allCitiesTop3000


 def get_cityList():
@@ -40,28 +40,28 @@ def get_cityList():
     return cityList


-def get_eachCityDiaryTop2000():  # fetch each city's top 2000 diaries by click count; if a city has fewer than 2000, pad with the nationwide top 2000
+def get_eachCityDiaryTop3000():  # fetch each city's top 3000 diaries by click count; if a city has fewer than 3000, pad with the nationwide top 3000
     cityList = get_cityList()
-    allCitiesTop2000 = get_allCitiesDiaryTop2000()
+    allCitiesTop3000 = get_allCitiesDiaryTop3000()
     for i in cityList:
         sql = "select city_id,cid from data_feed_click " \
               "where cid_type = 'diary' and city_id = '{0}' group by cid " \
-              "order by max(click_count_choice) desc limit 2000".format(i)
+              "order by max(click_count_choice) desc limit 3000".format(i)
         data = con_sql(sql)
         data = data.rename(columns={0: "city_id", 1: "cid"})
         data = filter_cid(data)
-        if data.shape[0] < 2000:
-            n = 2000 - data.shape[0]
-            # drop this city's own diaries from the nationwide top 2000
-            temp = allCitiesTop2000[allCitiesTop2000["city_id"] != i].loc[:n - 1]
+        if data.shape[0] < 3000:
+            n = 3000 - data.shape[0]
+            # drop this city's own diaries from the nationwide top 3000
+            temp = allCitiesTop3000[allCitiesTop3000["city_id"] != i].loc[:n - 1]
             data = data.append(temp)
         else:
             pass
-        file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(i)
+        file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(i)
         data.to_csv(file_name, index=False)


 if __name__ == "__main__":
-    get_eachCityDiaryTop2000()
+    get_eachCityDiaryTop3000()
predictDiary.py
@@ -11,7 +11,7 @@ from userProfile import fetch_user_profile
 # Join device_id and city_id onto the corresponding city's hot-diary table. Note: the feature order of the prediction set below must stay consistent with the training set.
 def feature_en(user_profile):
-    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop2000.csv".format(user_profile['city_id'])
+    file_name = DIRECTORY_PATH + "diaryTestSet/{0}DiaryTop3000.csv".format(user_profile['city_id'])
     data = pd.read_csv(file_name)
     data["device_id"] = user_profile['device_id']
@@ -24,7 +24,7 @@ def feature_en(user_profile):
     data.loc[data["minute"] == 0, ["minute"]] = 60
     data["hour"] = data["hour"].astype("category")
     data["minute"] = data["minute"].astype("category")
     # y is the target being predicted, but the ffm conversion still needs a y column; it does not affect the prediction results
     data["y"] = 0
     data = data.drop("city_id", axis=1)
     print(data.head(2))
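The comment in the last hunk explains why feature_en sets a dummy y of 0 at prediction time: the FFM text format starts every row with a label, so the converter needs some y column even though its value is ignored when scoring. A rough illustration with a hand-rolled row formatter (to_ffm_row and the index dicts below are made up for illustration; the repo's actual conversion is done by FFMFormatPandas in utils.py):

# libffm rows look like "<label> <field>:<feature_index>:<value> ...", so a label slot
# always has to be filled, even for rows we only want to score.
def to_ffm_row(y, features, field_index, feature_index):
    parts = [str(y)]
    for field, value in features.items():
        key = "{}_{}".format(field, value)
        parts.append("{}:{}:1".format(field_index[field], feature_index[key]))
    return " ".join(parts)

# Hypothetical indices; the real ones are learned when the converter is fitted on the training set.
field_index = {"device_id": 0, "hour": 1, "minute": 2}
feature_index = {"device_id_abc123": 0, "hour_14": 1, "minute_60": 2}

row = {"device_id": "abc123", "hour": 14, "minute": 60}
print(to_ffm_row(0, row, field_index, feature_index))  # "0 0:0:1 1:1:1 2:2:1"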
utils.py
@@ -71,7 +71,7 @@ class FFMFormatPandas:
     def transform(self, df):
         t = df.dtypes.to_dict()
         return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

     # The method below is not one of this class's original methods; it was newly added to check whether a given user exists in the training data set.
     def is_feature_index_exist(self, name):
         if name in self.feature_index_:
             return True
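The newly added is_feature_index_exist gives callers a way to check, before transforming, whether a key (for example a device_id seen only at prediction time) was present in the fitted feature_index_ mapping. A sketch of the guard it enables; the FakeConverter class and the "<column>_<value>" key format are assumptions for illustration, not the repo's actual encoding:

# Stand-in for a fitted FFMFormatPandas-style converter: feature_index_ maps feature keys
# to integer indices learned from the training data.
class FakeConverter:
    def __init__(self, feature_index_):
        self.feature_index_ = feature_index_

    def is_feature_index_exist(self, name):
        if name in self.feature_index_:
            return True
        return False  # the hunk above only shows the True branch; a False fallback is implied

converter = FakeConverter({"device_id_abc123": 0, "hour_14": 1})

device_key = "device_id_" + "new_device_999"  # hypothetical key format
if converter.is_feature_index_exist(device_key):
    print("device seen during training, safe to transform")
else:
    print("cold-start device, fall back to a default candidate list")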