1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# -*- coding: UTF-8 -*-
from utils import con_sql,tuple2dict,get_yesterday_date
from config import DIRECTORY_PATH
class TopFeatures(object):
    """Rank feed content ids of one content type by clicks, impressions or CTR.

    Counts are fetched for the date returned by ``get_yesterday_date()`` from
    the ``data_feed_click`` and ``data_feed_exposure`` tables through
    ``con_sql`` / ``tuple2dict``.
    """

    # SQL predicate fragments that are appended directly after ``device_type``.
    _PLATFORM_FILTERS = {
        "ios": "='App Store'",
        "android": "!='App Store'",
    }
    _ANY_PLATFORM_FILTER = " is not null"

    # NOTE: values are interpolated into the statement as plain text, so
    # ``cid_type`` and the platform filter must never come from untrusted input.
    _COUNT_SQL = (
        "select cid,count(cid) from {table} "
        "where stat_date = '{date}' "
        "and device_type{platform} and cid_type='{cid_type}' "
        "group by cid "
        "order by count(cid) desc"
    )

    def __init__(self, platform, cid_type, top_n=-1):
        """
        platform : 'ios' or 'android'; any other value (e.g. 'all') selects
                   every row whose device_type is not null
        cid_type : 'diary'; 'answer'; 'question'...
        top_n    : maximum number of rows returned by get_result; a negative
                   value (the default) means "no limit"
        """
        self.platform = self._PLATFORM_FILTERS.get(
            platform, self._ANY_PLATFORM_FILTER)
        self.cid_type = cid_type
        self.top_n = top_n

    def _count_by_cid(self, table, platform_filter):
        """Run the per-cid count query on `table` and return tuple2dict's result."""
        sql = self._COUNT_SQL.format(
            table=table,
            date=get_yesterday_date(),
            platform=platform_filter,
            cid_type=self.cid_type,
        )
        return tuple2dict(con_sql(sql))

    def get_click_times(self):
        """Return yesterday's click count per cid (rtype : dict)."""
        platform_filter = self.platform
        # NOTE(review): for the App Store filters the click query compares
        # against 'AppStore' (space removed) while the exposure query keeps
        # 'App Store'. Presumably the two tables spell the value differently;
        # the original behaviour is preserved here -- confirm against the data.
        if "App Store" in platform_filter:
            platform_filter = platform_filter.replace(" ", "")
        return self._count_by_cid("data_feed_click", platform_filter)

    def get_impression_times(self):
        """Return yesterday's impression count per cid (rtype : dict)."""
        return self._count_by_cid("data_feed_exposure", self.platform)

    def _content_url(self, cid):
        """Build the mobile-site URL for `cid`, using the part after the first '|'."""
        # Diaries live under /diary_book/; every other type uses its own name.
        section = "diary_book" if self.cid_type == "diary" else self.cid_type
        # split() tolerates a cid without '|' (the whole cid is used) instead
        # of raising ValueError like str.index would.
        content_id = cid.split("|", 1)[-1]
        return "http://m.igengmei.com/{0}/{1}/".format(section, content_id)

    def _top_rows(self, rows, sort_index):
        """Sort `rows` descending on column `sort_index` and apply the top_n limit."""
        rows.sort(key=lambda row: row[sort_index], reverse=True)
        limit = int(self.top_n)
        # A negative limit means "everything"; slicing with it would silently
        # drop rows from the end of the list.
        if limit < 0:
            return rows
        return rows[:limit]

    def get_result(self, platform, clk_n=2, result_types="ctr"):
        """
        platform     : display label stored in every row ("所有"; "苹果"; "安卓")
        clk_n        : in CTR mode, only cids with more than clk_n clicks are kept
        result_types : sort key, one of "clk"; "imp"; "ctr"
        rtype        : list of (platform, cid, clicks, impressions, ctr, url)

        Falls back to click ranking when there are no impressions and to
        impression ranking when there are no clicks; the columns that are not
        available in those modes are filled with 0.
        """
        clk = self.get_click_times()
        imp = self.get_impression_times()

        # Top N by clicks.
        if not imp or result_types == "clk":
            rows = [(platform, cid, clk[cid], 0, 0, self._content_url(cid))
                    for cid in clk]
            return self._top_rows(rows, 2)

        # Top N by impressions.
        if not clk or result_types == "imp":
            rows = [(platform, cid, 0, imp[cid], 0, self._content_url(cid))
                    for cid in imp]
            return self._top_rows(rows, 3)

        # Top N by click-through rate, restricted to cids seen in both tables.
        rows = []
        for cid, clicks in clk.items():
            if cid in imp and clicks > clk_n:
                rows.append((platform, cid, clicks, imp[cid],
                             round(clicks / imp[cid], 4),
                             self._content_url(cid)))
        return self._top_rows(rows, 4)

    def result2file(self, result_lst, fpath):
        """
        result_lst : [all_result, ios_result, android_result], each a list of
                     rows as returned by get_result
        fpath      : output filename
        rtype      : none
        """
        # Columns are padded with U+3000 (full-width space).
        tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
        sep = "=================================================================\n"
        header = tplt.format("平台", "{0}_id".format(self.cid_type), "点击数",
                             "曝光数", "点击率", "{0}链接".format(self.cid_type))
        last_index = len(result_lst) - 1
        # The report always contains CJK text, so do not depend on the locale's
        # default encoding.
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write("Top {0} {1}\n".format(self.top_n, self.cid_type))
            f.write(sep)
            f.write(header)
            for index, rows in enumerate(result_lst):
                for row in rows:
                    f.write(tplt.format(*row))
                f.write(sep)
                # Repeat the header before every following group. Compare by
                # position: comparing by value would skip the header whenever
                # an earlier group happens to equal the last one.
                if index != last_index:
                    f.write(header)
            f.write("\n\n")
def main():
    """Write the top-100 CTR report for diary, answer and question content.

    For each content type the ranking is computed for all platforms, iOS and
    Android (in that order) and the three result lists are written to one
    file under DIRECTORY_PATH whose name carries yesterday's date.
    """
    # (platform key for TopFeatures, display label stored in the rows)
    platforms = (
        ("all", "所有"),
        ("ios", "苹果"),
        ("android", "安卓"),
    )
    # (cid_type, minimum-click threshold, file-name template, completion message)
    jobs = (
        ("diary", 4, "top100_ctr_diary_{}.txt", "已获取 Top diary 特征"),
        ("answer", 2, "top100_ctr_answer_{}.txt", "已获取 Top answer 特征"),
        ("question", 2, "top100_ctr_question_{}.txt", "已获取 Top question 特征"),
    )

    for cid_type, min_clicks, name_tplt, done_msg in jobs:
        extractors = []
        result_lst = []
        for platform_key, label in platforms:
            extractor = TopFeatures(platform_key, cid_type, 100)
            result_lst.append(extractor.get_result(label, min_clicks, "ctr"))
            extractors.append(extractor)

        date_tag = get_yesterday_date().replace('-', '')
        output_path = DIRECTORY_PATH + name_tplt.format(date_tag)
        # The "all platforms" instance writes the combined report.
        extractors[0].result2file(result_lst, output_path)
        print(done_msg)
if __name__ == "__main__":
    # Run the export only when executed as a script, not when imported.
    main()