Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
F
ffm-baseline
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ML
ffm-baseline
Commits
652bfcb4
Commit
652bfcb4
authored
Aug 09, 2018
by
高雅喆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add a test dir in eda
parent
634de7a7
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
1 deletion
+177
-1
getTop100Diary.py
eda/recommended_indexs/code/getTop100Diary.py
+0
-1
config.py
eda/test/config.py
+2
-0
getTopFeatures.py
eda/test/getTopFeatures.py
+138
-0
utils.py
eda/test/utils.py
+37
-0
No files found.
eda/recommended_indexs/code/getTop100Diary.py
View file @
652bfcb4
...
...
@@ -21,7 +21,6 @@ def tuple2dict(tuple_result):
def
result2file
(
result_lst
,
fpath
):
with
open
(
fpath
,
'w'
)
as
f
:
tplt
=
"{0:
\u3000
<4}
\t
{1:
\u3000
<12}
\t
{2:
\u3000
^6}
\t
{3:
\u3000
^6}
\t
{4:
\u3000
<8}
\t
{5:
\u3000
^15}
\n
"
tplt
=
"{0:<6}
\t
{1:<10}
\t
{2:^10}
\t
{3:^10}
\t
{4:^10}
\t
{5:<10}
\n
"
f
.
write
(
"Top 100 diary
\n
"
)
f
.
write
(
"=================================================================
\n
"
)
f
.
write
(
tplt
.
format
(
"平台"
,
"diary_id"
,
"点击数"
,
"曝光数"
,
"点击率"
,
"diary链接"
))
...
...
eda/test/config.py
0 → 100644
View file @
652bfcb4
# Directory prefix for generated report files. Output filenames are appended
# to it with plain string concatenation, so the trailing "/" is required.
DIRECTORY_PATH = "/data2/models/eda/recommended_indexs/"
\ No newline at end of file
eda/test/getTopFeatures.py
0 → 100644
View file @
652bfcb4
# -*- coding: UTF-8 -*-
from
utils
import
con_sql
,
tuple2dict
,
get_yesterday_date
from
config
import
DIRECTORY_PATH
class TopFeatures(object):
    """Rank feed items of one content type by clicks, impressions or CTR.

    Click counts are read from ``data_feed_click`` and impression counts from
    ``data_feed_exposure``; both are grouped by ``cid``.
    """

    # One template for both tables.  The '%' signs belong to MySQL's
    # from_unixtime() format and are untouched by str.format().
    # NOTE(review): values are interpolated directly into the SQL text, so
    # callers must only pass trusted ndays / cid_type values.
    _SQL_TEMPLATE = (
        "select cid,count(cid) from {table} "
        "where from_unixtime(time,'%Y-%m-%d')"
        "=date_add(curdate(), interval -{ndays} day) "
        "and device_type{platform} and cid_type='{cid_type}' "
        "group by cid order by count(cid) desc"
    )

    def __init__(self, ndays, platform, cid_type, top_n=-1):
        """
        ndays : 1;2;3;4..  (how many days back from today to query)
        platform : 'all';'ios';'android'
        cid_type : 'diary';'answer';'question'...
        top_n : the rows of the result; a negative value means "all rows"
        """
        self.ndays = ndays
        # The platform is stored as the SQL condition applied to device_type.
        if platform == "ios":
            self.platform = "='AppStore'"
        elif platform == "android":
            self.platform = "!='AppStore'"
        else:
            self.platform = " is not null"
        self.cid_type = cid_type
        self.top_n = top_n

    def _build_sql(self, table):
        """Return the per-cid count query for *table*."""
        return self._SQL_TEMPLATE.format(
            table=table,
            ndays=self.ndays,
            platform=self.platform,
            cid_type=self.cid_type,
        )

    def _has_e_marker(self):
        """Tell whether the second-to-last character of cid_type is 'e'."""
        # Length guard: indexing [-2] on a shorter string raises IndexError.
        return len(self.cid_type) >= 2 and self.cid_type[-2] == 'e'

    def _url_for(self, cid):
        """Build the mobile-site URL for *cid* (the id follows the '|')."""
        item_id = cid[cid.index('|') + 1:]
        if self.cid_type == "diary":
            return "http://m.igengmei.com/diary_book/" + item_id + '/'
        # Strip the type so surrounding spaces never end up in the URL path.
        return ("http://m.igengmei.com/{0}/".format(self.cid_type.strip())
                + item_id + '/')

    def _limit(self, rows):
        """Keep the first top_n rows; a negative top_n keeps every row."""
        limit = int(self.top_n)
        return rows if limit < 0 else rows[:limit]

    def get_click_times(self):
        """Return {cid: click count} for the configured day/platform/type.

        Side effect: spaces are removed from ``self.cid_type`` when its
        second-to-last character is 'e'.
        rtype : dict
        """
        if self._has_e_marker():
            self.cid_type = self.cid_type.replace(' ', '')
        return tuple2dict(con_sql(self._build_sql("data_feed_click")))

    def get_impression_times(self):
        """Return {cid: impression count} for the configured day/platform/type.

        Side effect: a space is inserted before the last six characters of
        ``self.cid_type`` when its second-to-last character is 'e'.
        rtype : dict
        """
        if self._has_e_marker():
            # NOTE(review): presumably the exposure table stores these types
            # with a space before the final six characters - confirm.
            self.cid_type = self.cid_type[:-6] + ' ' + self.cid_type[-6:]
        return tuple2dict(con_sql(self._build_sql("data_feed_exposure")))

    def get_result(self, result_types="ctr", clk=None, imp=None, clk_n=2):
        """Build the ranked rows.

        result_types : "clk";"imp";"ctr"
        clk : dict of {cid: click count}
        imp : dict of {cid: impression count}
        clk_n : for the CTR ranking, only items with more than clk_n clicks
                are kept
        rtype : list of (cid_type, cid, clicks, impressions, ctr, url),
                sorted descending by the requested metric and cut to top_n
        """
        clk = {} if clk is None else clk
        imp = {} if imp is None else imp
        label = self.cid_type.strip()

        if not imp or result_types == "clk":
            # Top N by clicks (also the fallback when there are no impressions).
            rows = [(label, cid, clk[cid], 0, 0, self._url_for(cid))
                    for cid in clk]
            sort_col = 2
        elif not clk or result_types == "imp":
            # Top N by impressions (also the fallback when there are no clicks).
            rows = [(label, cid, 0, imp[cid], 0, self._url_for(cid))
                    for cid in imp]
            sort_col = 3
        else:
            # Top N by click-through rate.
            rows = []
            for cid, clicks in clk.items():
                shown = imp.get(cid)
                # Skip items never shown (avoids division by zero) and items
                # with too few clicks.
                if not shown or not clicks > clk_n:
                    continue
                ctr = round(float(clicks) / float(shown), 4)
                rows.append((label, cid, clicks, shown, ctr,
                             self._url_for(cid)))
            sort_col = 4

        rows.sort(key=lambda row: row[sort_col], reverse=True)
        return self._limit(rows)

    def result2file(self, result_lst, fpath):
        """Write the ranked groups to *fpath* as a tab-separated text table.

        result_lst : [all,ios,android] - one list of result rows per group
        fpath : output filename
        rtype : none
        """
        # U+3000 (ideographic space) is the fill character so that columns
        # holding Chinese text stay aligned.
        tplt = ("{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t"
                "{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n")
        sep = "=================================================================\n"
        header = tplt.format("平台", "{}_id".format(self.cid_type), "点击数",
                             "曝光数", "点击率",
                             "{}链接".format(self.cid_type))
        last = len(result_lst) - 1
        # Explicit encoding: the table contains non-ASCII text.
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write("Top {0} {1}\n".format(self.top_n, self.cid_type))
            f.write(sep)
            f.write(header)
            for idx, group in enumerate(result_lst):
                for row in group:
                    f.write(tplt.format(*row))
                f.write(sep)
                # Repeat the header before every group except after the last;
                # compare positions, not contents, so equal groups still work.
                if idx != last:
                    f.write(header)
            f.write("\n\n")
def main():
    """Write yesterday's top-100 diary CTR report for all/ios/android.

    For each platform the click and impression counts are queried, ranked by
    click-through rate, and the three rankings are written to one text file
    under DIRECTORY_PATH whose name carries yesterday's date.
    """
    result_lst = []
    top_diary = None
    for platform in ("all", "ios", "android"):
        # top_n is a constructor argument; get_result() has no such parameter.
        top_diary = TopFeatures(1, platform, "diary", top_n=100)
        clk_times = top_diary.get_click_times()
        imp_times = top_diary.get_impression_times()
        result_lst.append(top_diary.get_result("ctr", clk_times, imp_times))

    # ndays=1 queries yesterday's data, so the file is stamped with that date.
    output_path = (DIRECTORY_PATH
                   + "5top100_ctr_diary_%s.txt" % get_yesterday_date())
    # The report title only uses top_n and cid_type, which are identical for
    # every platform, so any of the instances can write the file.
    top_diary.result2file(result_lst, output_path)


if __name__ == '__main__':
    main()
\ No newline at end of file
eda/test/utils.py
0 → 100644
View file @
652bfcb4
# -*- coding: UTF-8 -*-
import
pymysql
import
datetime
def con_sql(sql):
    """Run *sql* against the database and return every fetched row.

    The connection and cursor are always closed, even when the query fails.

    :type sql : str
    :rtype : tuple
    """
    # NOTE(review): the connection settings, including the password, are
    # hard-coded here; they should be moved to configuration / environment.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root',
                         passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        db.close()
def tuple2dict(tuple_result):
    """Convert SQL rows from tuple(tuple, tuple, ...) form into a dict.

    The first column of each row becomes the key and the second column the
    value; when a key repeats, the later row wins.  Any iterable of rows is
    accepted.

    :type tuple_result : tuple
    :rtype : dict
    """
    return {row[0]: row[1] for row in tuple_result}
def get_yesterday_date():
    """Return yesterday's local date as a compact string such as "20180808".

    :rtype : str
    """
    one_day = datetime.timedelta(days=1)
    return (datetime.date.today() - one_day).strftime("%Y%m%d")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment