1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
# -*- coding: UTF-8 -*-
from utils import get_yesterday_date
from config import DIRECTORY_PATH
from cidRate import CidRate
from clkCidUidRate import ClkCidUidRate
from topFeatures import TopFeatures
from func import *
import time
# Wall-clock start; the elapsed collection time is printed at the end of this section.
start = time.time()
print("开始获取比例特征数据...")

# (platform key, display label) pairs, in the order every per-platform list uses.
_PLATFORMS = (("all", "所有"), ("ios", "苹果"), ("android", "安卓"))


def _cid_imp_rates(cid_type):
    """Return the CidRate impression-share results for all / ios / android."""
    return [
        CidRate(platform, cid_type).get_cid_imp_rate(label)
        for platform, label in _PLATFORMS
    ]


def _clk_uid_rates(cid_type):
    """Return the ClkCidUidRate clicking-user-share results for all / ios / android."""
    return [
        ClkCidUidRate(platform, cid_type).get_clk_cid_uid_rate(label)
        for platform, label in _PLATFORMS
    ]


def _top_features(cid_type, result_arg):
    """Return the TopFeatures results (limit 100, key "ctr") for all / ios / android.

    ``result_arg`` is forwarded unchanged as the second positional argument of
    ``TopFeatures.get_result``; its meaning is defined by that class.
    """
    return [
        TopFeatures(platform, cid_type, 100).get_result(label, result_arg, "ctr")
        for platform, label in _PLATFORMS
    ]


# 1.0 Question impression share (= question impressions / total cid impressions)
question_imp_rate_result = _cid_imp_rates("question")
(question_imp_rate_all,
 question_imp_rate_ios,
 question_imp_rate_android) = question_imp_rate_result
print("已获取question曝光占比")

# 1.1 Answer impression share (= answer impressions / total cid impressions)
answer_imp_rate_result = _cid_imp_rates("answer")
(answer_imp_rate_all,
 answer_imp_rate_ios,
 answer_imp_rate_android) = answer_imp_rate_result
print("已获取answer曝光占比")

# 1.2 Diary impression share (= diary impressions / total cid impressions)
diary_imp_rate_result = _cid_imp_rates("diary")
(diary_imp_rate_all,
 diary_imp_rate_ios,
 diary_imp_rate_android) = diary_imp_rate_result
print("已获取diary曝光占比")

# 1.3 Active-user click-through rate (= active-user clicks / active-user impressions)
activate_uid_ctr_result = [
    get_activate_uid_ctr(platform) for platform, _label in _PLATFORMS
]
(activate_uid_ctr_all,
 activate_uid_ctr_ios,
 activate_uid_ctr_android) = activate_uid_ctr_result
print("已获取活跃用户点击率")

# 1.4 Average daily impressions per active user
#     (= active-user impressions / distinct active users); split by region, not platform.
activate_uid_imp_result = [
    get_activate_uid_imp_times(region) for region in ("all", "beijing")
]
activate_uid_imp_all, activate_uid_imp_beijing = activate_uid_imp_result
print("已获取活跃用户平均每天曝光次数")

# 1.5 Share of users who clicked an answer (= users clicking / users shown an answer)
click_answer_result = _clk_uid_rates("answer")
(click_answer_all,
 click_answer_ios,
 click_answer_android) = click_answer_result
print("已获取点击answer用户占比")

# 1.6 Share of users who clicked a question (= users clicking / users shown a question)
click_question_result = _clk_uid_rates("question")
(click_question_all,
 click_question_ios,
 click_question_android) = click_question_result
print("已获取点击question用户占比")

# 1.7 Share of users who clicked a diary (= users clicking / users shown a diary)
click_diary_result = _clk_uid_rates("diary")
(click_diary_all,
 click_diary_ios,
 click_diary_android) = click_diary_result
print("已获取点击diary用户占比")

# 1.8 Share of users with any click (= users with a click / users with an impression)
click_everything_result = _clk_uid_rates("everything")
(click_everything_all,
 click_everything_ios,
 click_everything_android) = click_everything_result
print("已获取有点击用户占比")

# 1.9 Distribution of zero-click users by activation age
#     (= zero-click users ∩ activated users / activated users).
#     Note: the numbers in the "(]" buckets are days before the current time.
#     This section is currently disabled; the original code is kept below for reference.
# try:
# click_zero_uid_detail_all = get_click_zero_uid_rate_detail("all")
# click_zero_uid_detail_all["platform"] = "所有"
# click_zero_uid_detail_ios = get_click_zero_uid_rate_detail("ios")
# click_zero_uid_detail_ios["platform"] = "苹果"
# click_zero_uid_detail_android = get_click_zero_uid_rate_detail("android")
# click_zero_uid_detail_android["platform"] = "安卓"
# click_zero_uid_detail_result = [click_zero_uid_detail_all,click_zero_uid_detail_ios,click_zero_uid_detail_android]
# print("已获取无点击用户数激活日期分布")
# except:
# click_zero_uid_detail_result = []
# print("GC life time is shorter than transaction duration")
# ==========================================================================================

# 2.1 Distribution of per-user click counts
#     (column 1: clicks per user; column 2: number of distinct users)
print("开始获取Top特征数据...")
click_times_to_count_uid = get_click_times_to_count_uid()
print("已获取用户点击次数分布")

# 2.2 Top 100 diaries (requested with key "ctr")
top_diary_result = _top_features("diary", 4)
top_diary_all, top_diary_ios, top_diary_android = top_diary_result
print("已获取 Top diary 特征")

# 2.3 Top 100 answers (requested with key "ctr")
top_answer_result = _top_features("answer", 2)
top_answer_all, top_answer_ios, top_answer_android = top_answer_result
print("已获取 Top answer 特征")

# 2.4 Top 100 questions (requested with key "ctr", like the other Top lists)
top_question_result = _top_features("question", 2)
top_question_all, top_question_ios, top_question_android = top_question_result
print("已获取 Top question 特征")

print("done")
end = time.time()
print('程序执行时间: {}s'.format(end - start))
def _write_rate_table(f, tplt, title, header, rows, as_percent=True):
    """Write one ratio-feature table: title line, header row, then one line per row.

    Args:
        f: Open text file to write to.
        tplt: Four-column ``str.format`` template (newline-terminated).
        title: Section title line, already newline-terminated.
        header: The four column captions.
        rows: Indexable rows; items 0-3 of each row are written.
        as_percent: When true, item 3 is rendered as a percentage rounded to
            two decimals; otherwise item 3 is written unchanged.
    """
    f.write(title)
    f.write(tplt.format(*header))
    for row in rows:
        last_cell = "{}%".format(round(row[3] * 100, 2)) if as_percent else row[3]
        f.write(tplt.format(row[0], row[1], row[2], last_cell))


def _write_top_table(f, tplt, title, header_cells, platform_rows):
    """Write one "Top 100" section: a header plus one row group per platform.

    Each platform's rows are followed by a separator line; the column header is
    repeated before every group except the first, which gets it once up front.

    Args:
        f: Open text file to write to.
        tplt: Six-column ``str.format`` template (newline-terminated).
        title: Section title line, already newline-terminated.
        header_cells: The six column captions.
        platform_rows: Sequence of per-platform row collections; items 0-5 of
            each row are written, item 4 as a percentage rounded to two decimals.
    """
    sep = "=================================================================\n"
    header = tplt.format(*header_cells)
    f.write(title)
    f.write(sep)
    f.write(header)
    last_index = len(platform_rows) - 1
    for index, rows in enumerate(platform_rows):
        for row in rows:
            f.write(tplt.format(row[0], row[1], row[2], row[3],
                                "{}%".format(round(row[4] * 100, 2)), row[5]))
        f.write(sep)
        # Decide "last group" by position: comparing the groups by value would
        # also drop the header whenever an earlier group equals the last one
        # (for example when both are empty).
        if index != last_index:
            f.write(header)
    f.write("\n\n")


def result2file():
    """Write yesterday's full feature report to ``result_<yyyymmdd>.txt``.

    The file is created (or overwritten) under ``DIRECTORY_PATH`` and contains an
    overview, the ratio-feature tables (1.0-1.8) and the Top-feature tables
    (2.1-2.4), all read from the module-level result variables.

    The file is written as UTF-8 explicitly, because the report is Chinese text
    and the locale's default encoding may not be able to represent it.
    """
    # Resolve the date once so the file name and the header always agree.
    date_tag = get_yesterday_date().replace('-', '')
    output_path = DIRECTORY_PATH + "result_{}.txt".format(date_tag)
    with open(output_path, 'w', encoding='utf-8') as f:
        # Columns are padded with U+3000 (ideographic space).
        tplt = "{0:\u3000<6}\t{1:\u3000<15}\t{2:\u3000<15}\t{3:\u3000<15}\n"
        line = """数据日期:{}
内容概览:以下所有数据都是昨天一天的首页的
说明:
(1)红色标记的为比较重要的特征
(2)[A,+B]格式说明:A表示该特征在当天的数值,+B/-B表示该数值相对于昨天的差值
1. 比例特征
1.0 question曝光占比(=question被曝光数/总cid被曝光数) [,]
1.1 answer曝光占比(=answer被曝光数/总cid被曝光数) [,]
1.2 diary曝光占比(=diary被曝光数/总cid被曝光数) [,]
1.3 活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数) [,]
1.4 活跃用户平均每天曝光次数(=活跃用户曝光次数/独立活跃用户数) [,]
1.5 点击answer用户占比(=点击answer用户数/曝光answer用户数) [,]
1.6 点击question用户占比(=点击question用户数/曝光question用户数) [,]
1.7 点击diary用户占比(=点击diary用户数/曝光diary用户数) [,]
1.8 有点击用户占比(=有点击用户数/有曝光用户数) [,]
2.Top特征
2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)
2.2 Top 100 diary (sorted by ctr)
2.3 Top 100 Answer (sorted by ctr)
2.4 Top 100 Question (sorted by ctr)
具体内容:以下所有数据都是昨天一天的首页的
""".format(date_tag)
        f.write(line)

        # ---- 1. Ratio features ------------------------------------------------
        f.write("#1. 比例特征\n")
        f.write("=================================================================\n")
        # (title, column captions, rows, render last column as a percentage?)
        rate_sections = (
            ("#1.0question曝光占比(=question被曝光数/总cid被曝光数)\n",
             ("平台", "question被曝光数", "总cid被曝光数", "question被曝光数占比"),
             question_imp_rate_result, True),
            ("#1.1answer曝光占比(=answer被曝光数/总cid被曝光数)\n",
             ("平台", "answer被曝光数", "总cid被曝光数", "answer被曝光数占比"),
             answer_imp_rate_result, True),
            ("#1.2diary曝光占比(=diary被曝光数/总cid被曝光数)\n",
             ("平台", "diary被曝光数", "总cid被曝光数", "diary被曝光数占比"),
             diary_imp_rate_result, True),
            ("#1.3活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数)\n",
             ("平台", "active用户点击次数", "active用户曝光次数", "active用户点击率"),
             activate_uid_ctr_result, True),
            # 1.4 is an average count, not a ratio, so it is written unscaled.
            ("#1.4活跃用户平均每天曝光次数(=活跃用户曝光次数/独立活跃用户数)\n",
             ("地区", "active独立用户数", "active用户曝光次数", "activate用户平均曝光数"),
             activate_uid_imp_result, False),
            ("#1.5点击answer用户占比(=点击answer用户数/曝光answer用户数)\n",
             ("平台", "点击answer用户数", "曝光answer用户数", "点击answer用户占比"),
             click_answer_result, True),
            ("#1.6点击question用户占比(=点击question用户数/曝光question用户数)\n",
             ("平台", "点击question用户数", "曝光question用户数", "点击question用户占比"),
             click_question_result, True),
            ("#1.7点击diary用户占比(=点击diary用户数/曝光diary用户数)\n",
             ("平台", "点击diary用户数", "曝光diary用户数", "点击diary用户占比"),
             click_diary_result, True),
            ("#1.8有点击用户占比(=有点击用户数/有曝光用户数)\n",
             ("平台", "have点击用户数", "have曝光用户数", "have点击用户占比"),
             click_everything_result, True),
        )
        final_section = len(rate_sections) - 1
        for position, (title, header, rows, as_percent) in enumerate(rate_sections):
            _write_rate_table(f, tplt, title, header, rows, as_percent)
            # One blank line between tables; the last table is followed by the
            # larger gap written below instead.
            if position != final_section:
                f.write('\n')
        # Section 1.9 (zero-click user distribution) is disabled, matching the
        # disabled data collection for it at module level.
        f.write('\n\n\n')

        # ---- 2. Top features --------------------------------------------------
        tplt = "{0:^10}\t{1:^10}\n"
        f.write("#2. Top特征\n")
        f.write("=================================================================\n")
        f.write("2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)\n")
        f.write(tplt.format("click_times", "count_uid"))
        for row in click_times_to_count_uid:
            f.write(tplt.format(row[0], row[1]))
        f.write("\n\n")

        tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
        _write_top_table(
            f, tplt, "2.2 Top 100 Diary\n",
            ("平台", "diary_id", "点击数", "曝光数", "点击率", "diary链接"),
            top_diary_result)
        _write_top_table(
            f, tplt, "2.3 Top 100 Answer\n",
            ("平台", "answer_id", "点击数", "曝光数", "点击率", "answer链接"),
            top_answer_result)
        _write_top_table(
            f, tplt, "2.4 Top 100 Question\n",
            ("平台", "question_id", "点击数", "曝光数", "点击率", "question链接"),
            top_question_result)
def rate2file():
    """Append yesterday's ratio values to ``rate.csv`` as one comma-separated line.

    The line starts with yesterday's date (dashes removed) followed by item 3 of
    each module-level result, in the fixed column order listed below. The file
    lives under ``DIRECTORY_PATH`` and is opened in append mode.
    """
    csv_path = DIRECTORY_PATH + "rate.csv"
    with open(csv_path, 'a+') as f:
        fields = [get_yesterday_date().replace('-', '')]
        # Column order of the CSV row; do not reorder, existing rows depend on it.
        ordered_results = (
            answer_imp_rate_all, answer_imp_rate_ios, answer_imp_rate_android,
            diary_imp_rate_all, diary_imp_rate_ios, diary_imp_rate_android,
            activate_uid_ctr_all, activate_uid_ctr_ios, activate_uid_ctr_android,
            activate_uid_imp_all, activate_uid_imp_beijing,
            click_answer_all, click_answer_ios, click_answer_android,
            click_diary_all, click_diary_ios, click_diary_android,
            click_everything_all, click_everything_ios, click_everything_android,
            question_imp_rate_all, question_imp_rate_ios, question_imp_rate_android,
            click_question_all, click_question_ios, click_question_android,
        )
        fields.extend(str(result[3]) for result in ordered_results)
        f.write(','.join(fields) + '\n')
# Script entry point: write the detailed text report, then append the CSV summary row.
if __name__ == "__main__":
    result2file()
    rate2file()