Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
879adfa0
Commit
879adfa0
authored
Aug 06, 2018
by
张彦钊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
第一次提交日记测试集脚本文件
parent
5ad54b62
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
50 additions
and
0 deletions
+50
-0
diaryTestSet.py
diaryTestSet.py
+50
-0
No files found.
diaryTestSet.py
0 → 100644
View file @
879adfa0
import
pymysql
import
pandas
as
pd
# Shared connection used by every query in this script.
# NOTE(review): host and credentials are hard-coded in source control;
# consider loading them from the environment or an untracked config file.
# `password` / `database` are the documented keyword names; `passwd` / `db`
# are deprecated MySQLdb-compatibility aliases.
db = pymysql.connect(
    host='10.66.157.22',
    port=4000,
    user='root',
    password='3SYz54LS9#^9sBvC',
    database='jerry_test',
)
# Fetch rows from the database and convert them to a DataFrame
def get_data(sql, params=None):
    """Run a query on the module-level ``db`` connection and return a DataFrame.

    Args:
        sql: SQL statement to execute.
        params: Optional values bound by the driver to ``%s`` placeholders
            in ``sql``. Leave as ``None`` for a statement without
            placeholders.

    Returns:
        A DataFrame built from the fetched rows, with every row that
        contains a null value dropped.
    """
    # The context manager closes the cursor even when the query raises.
    with db.cursor() as cursor:
        cursor.execute(sql, params)
        rows = cursor.fetchall()
    return pd.DataFrame(list(rows)).dropna()
# Fetch the 2000 most-clicked diaries nationwide.
# The statement needs an explicit FROM clause; data_feed_click is the table
# the other queries in this script read the same columns from.
sql = (
    "select city_id,cid from data_feed_click "
    "where cid_type = 'diary' "
    "order by click_count_choice desc limit 2000"
)
allCitiesTop2000 = get_data(sql)
allCitiesTop2000 = allCitiesTop2000.rename(columns={0: "city_id", 1: "cid"})
# Forward slashes: in a normal string literal "\a" is the BEL control
# character, so the backslash-separated path never named the intended file.
allCitiesTop2000.to_csv("/home/zhangyanzhao/diaryTestSet/allCitiesTop2000.csv")
print("成功获取全国日记点击量TOP2000")
# Fetch the list of all distinct cities
sql = "select distinct city_id from data_feed_click"
cityList = get_data(sql)
# Forward slashes: backslashes in a normal string literal are escape
# introducers, not portable path separators.
cityList.to_csv("/home/zhangyanzhao/diaryTestSet/cityList.csv")
# An empty result has no column 0; fall back to an empty list instead of
# raising KeyError.
cityList = [] if cityList.empty else cityList[0].values.tolist()
print("成功获取城市列表")
# For every city, export its 2000 most-clicked diaries; when a city has fewer
# than 2000, pad the list with diaries from the nationwide top 2000.
for i in cityList:
    # NOTE(review): the city id is interpolated unquoted, which only yields
    # valid SQL for numeric ids -- confirm the column type, or bind the value
    # through the driver instead.
    sql = (
        "select city_id,cid from data_feed_click "
        "where cid_type = 'diary' and city_id = {0} "
        "order by click_count_choice desc limit 2000"
    ).format(i)
    data = get_data(sql)
    data = data.rename(columns={0: "city_id", 1: "cid"})

    n = 2000 - data.shape[0]
    if n > 0:
        # Drop this city's own diaries from the nationwide list, then take
        # the first n remaining rows by position. A label slice (.loc[:n-1])
        # would come up short because filtering leaves gaps in the index.
        temp = allCitiesTop2000[allCitiesTop2000["city_id"] != i].head(n)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        data = pd.concat([data, temp])

    # Forward slashes: backslashes in a normal string literal are escape
    # introducers, not portable path separators.
    file_name = "/home/zhangyanzhao/diaryTestSet/{0}DiaryTop2000.csv".format(i)
    data.to_csv(file_name)
print("end")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment