1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#! -*- coding: utf8 -*-
import pandas as pd
from scipy.stats import ttest_ind
from scipy.stats import levene
import datetime
from utils import con_sql
from decimal import *
#########推荐策略前后统计指标假设检验(t检验)###############
#自动获取昨日日期
def get_yesterday_date():
#自动获取昨天的日期,如"2018-08-08"
"""
:rtype : str
"""
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
yesterday = yesterday.strftime("%Y-%m-%d")
return yesterday
yesterday=get_yesterday_date()
print(yesterday)
#自动获取10日前的日期
def get_somedate():
#自动获取10日前的日期,如"2018-07-28"
"""
:rtype : str
"""
today = datetime.date.today()
someday = today - datetime.timedelta(days=10)
someday = someday.strftime("%Y-%m-%d")
return someday
ten_days=get_somedate()
print(ten_days)
#获取最近10天的数据
def DATA_recently(x,y,z,q,t):
ten_days = get_somedate()
sql_cid = "select {0}/{1} as {2} from {3} \
where stat_date >='{4}' ".format(x,y,z,q,t)
CVR_DATA_recently = con_sql(sql_cid)
return CVR_DATA_recently
#
# #获取固定时间的10天的数据
def DATA_fixed(x,y,z,q):
sql_cid = "select {0}/{1} as {2} from {3} \
where stat_date >='2018-11-17' and stat_date<='2018-11-26' group by stat_date".format(x,y,z,q)
CVR_DATA_fixed = con_sql(sql_cid)
return CVR_DATA_fixed
#
# #新用户cvr
x_crv_new_temp=DATA_recently("diary_meigou_newUser","diary_clk_newUser","CVR_new","diary_meigou_crv",ten_days)
x_crv_new=[float(str(Decimal(x_crv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_crv_new_temp))]
y_crv_new_temp=DATA_fixed("diary_meigou_newUser","diary_clk_newUser","CVR_new","diary_meigou_crv")
y_crv_new=[float(str(Decimal(y_crv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_crv_new_temp))]
# #老用户cvr
x_crv_old_temp=DATA_recently("diary_meigou_oldUser","diary_clk_oldUser","CVR_old","diary_meigou_crv",ten_days)
x_crv_old=[float(str(Decimal(x_crv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_crv_old_temp))]
y_crv_old_temp=DATA_fixed("diary_meigou_oldUser","diary_clk_oldUser","CVR_old","diary_meigou_crv")
y_crv_old=[float(str(Decimal(y_crv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_crv_old_temp))]
#
# #新用户ct-cvr
x_ctcrv_new_temp=DATA_recently("diary_meigou_newUser","diary_exp_newUser","CT_CVR_new","diary_meigou_crv",ten_days)
x_ctcrv_new=[float(str(Decimal(x_ctcrv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctcrv_new_temp))]
y_ctcrv_new_temp=DATA_fixed("diary_meigou_newUser","diary_exp_newUser","CT_CVR_new","diary_meigou_crv")
y_ctcrv_new=[float(str(Decimal(y_ctcrv_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctcrv_new_temp))]
#
# #老用户ct-cvr
x_ctcrv_old_temp=DATA_recently("diary_meigou_oldUser","diary_exp_oldUser","CT_CVR_old","diary_meigou_crv",ten_days)
x_ctcrv_old =[float(str(Decimal(x_ctcrv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctcrv_old_temp))]
y_ctcrv_old_temp=DATA_fixed("diary_meigou_oldUser","diary_exp_oldUser","CT_CVR_old","diary_meigou_crv")
y_ctcrv_old=[float(str(Decimal(y_ctcrv_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctcrv_old_temp))]
#
# #新用户ctr(page_view)
x_ctr_new_temp=DATA_recently("clk_count_newUser_all","imp_count_newUser_all","ctr_new","bug_Recommendation_strategy_newUser",ten_days)
x_ctr_new=[float(str(Decimal(x_ctr_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_new_temp))]
y_ctr_new_temp=DATA_fixed("clk_count_newUser_all","imp_count_newUser_all","ctr_new","bug_Recommendation_strategy_newUser")
y_ctr_new=[float(str(Decimal(y_ctr_new_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_new_temp))]
# #
# #老用户ctr(page_view)
x_ctr_old_temp=DATA_recently("clk_count_oldUser_all","imp_count_oldUser_all","ctr_old","bug_Recommendation_strategy_temp",ten_days)
x_ctr_old=[float(str(Decimal(x_ctr_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_old_temp))]
y_ctr_old_temp=DATA_fixed("clk_count_oldUser_all","imp_count_oldUser_all","ctr_old","bug_Recommendation_strategy_temp")
y_ctr_old=[float(str(Decimal(y_ctr_old_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_old_temp))]
#
# #新用户ctr(on_click_diary_card)
x_ctr_new_o_temp=DATA_recently("clk_count_newUser_all_a","imp_count_newUser_all","ctr_new","on_click_diary_card",ten_days)
x_ctr_new_o=[float(str(Decimal(x_ctr_new_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_new_o_temp))]
y_ctr_new_o_temp=DATA_fixed("clk_count_newUser_all_a","imp_count_newUser_all","ctr_new","on_click_diary_card")
y_ctr_new_o=[float(str(Decimal(y_ctr_new_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_new_o_temp))]
#
# #老用户ctr(on_click_diary_card)
x_ctr_old_o_temp=DATA_recently("clk_count_oldUser_all_a","imp_count_oldUser_all","ctr_old","on_click_diary_card",ten_days)
x_ctr_old_o=[float(str(Decimal(x_ctr_old_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(x_ctr_old_o_temp))]
y_ctr_old_o_temp=DATA_fixed("clk_count_oldUser_all_a","imp_count_oldUser_all","ctr_old","on_click_diary_card")
y_ctr_old_o=[float(str(Decimal(y_ctr_old_o_temp[i][0]).quantize(Decimal('0.0000')))) for i in range(len(y_ctr_old_o_temp))]
# #
#
#
def t_test(x,y): #进行t检验
#策略前的数据,赋值给x,策略后的数据赋值给y,均采用10日内数据
#检验数据方差是否齐性
a=levene(x,y)
p_value=a[1] #结果若p_value>0.05,则认为两组数据方差是相等的,否则两组数据方差是不等的
if p_value>0.05: #认为数据方差具有齐性,equal_var=ture
t_test=ttest_ind(x,y,equal_var=True)
t_p_value=t_test[1]
# print(t_p_value)
if t_p_value>0.05:
print("策略前后两组数据无显著性差异,即该指标没有显著提升,p_value:{}" .format(t_p_value))
else:
print("策略前后两组数据有显著性差异,即该指标获得显著提升,p_value:{}" .format(t_p_value))
else: #认为数据方差不具有齐性,equal_var=false
t_test = ttest_ind(x, y, equal_var=False)
t_p_value = t_test[1]
# print(t_p_value)
if t_p_value > 0.05:
print("策略前后两组数据无显著性差异,即该指标没有显著提升,p_value:{}" .format(t_p_value))
else:
print("策略前后两组数据有显著性差异,即该指标获得显著提升,p_value:{}" .format(t_p_value))
#
# ###假设检验,判断是否具有显著性
#
#新用户cvr假设检验
print("【1】新用户CVR假设检验结果:")
crv_new_ttest=t_test(x_crv_new,y_crv_new)
#老用户cvr假设检验
print("【2】老用户CVR假设检验结果:")
crv_old_ttest=t_test(x_crv_old,y_crv_old)
#
#新用户ct_cvr假设检验
print("【3】新用户CT-CVR假设检验结果:")
ctcrv_new_ttest=t_test(x_ctcrv_new,y_ctcrv_new)
# #老用户ct_cvr假设检验
print("【4】老用户CT-CVR假设检验结果:")
ctcrv_old_ttest=t_test(x_ctcrv_old,y_ctcrv_old)
#
#
#新用户ctr假设检验
print("【5】新用户CTR假设检验结果:")
ctr_new_ttest=t_test(x_ctr_new,y_ctr_new)
#老用户ctr假设检验
print("【6】老用户CTR假设检验结果:")
ctr_old_ttest=t_test(x_ctr_old,y_ctr_old)
#新用户ctr(on_click_diary_card)假设检验
print("【7】新用户CTR假设检验(on_click_diary_card)结果:")
ctr_new_o_ttest=t_test(x_ctr_new_o,y_ctr_new_o)
#老用户ctr(on_click_diary_card)假设检验
print("【8】老用户CTR假设检验(on_click_diary_card)结果:")
ctr_old_o_ttest=t_test(x_ctr_old_o,y_ctr_old_o)
#
# ###############推荐策略不变的情况下数据假设检验##############
#
# #1 计算每日指标卡方检验
#
# #自动获取5日前的日期
def get_fivedate():
#自动获取10日前的日期,如"2018-07-28"
"""
:rtype : str
"""
today = datetime.date.today()
someday = today - datetime.timedelta(days=5)
someday = someday.strftime("%Y-%m-%d")
return someday
five_days=get_fivedate()
#获取最近5天的数据
def chi_DATA_recently(x,y,z,q,t1,t2):
sql_cid = "select AVG({0}+{1}),AVG({2}) from {3} \
where stat_date >= '{4}' and stat_date < '{5}' ".format(x,y,z,q,t1,t2)
CVR_DATA_recently = con_sql(sql_cid)[0]
return CVR_DATA_recently
def chi_DATA_yesterday(x,y,z,q,t1):
sql_cid = "select {0}+{1},{2} from {3} where stat_date='{4}' ".format(x,y,z,q,t1)
CVR_DATA_yesterday = con_sql(sql_cid)[0]
return CVR_DATA_yesterday
#整理数据
def data_cal(x,y):
x_a = [x[0], x[1] - x[0]]
y_a=[y[0], y[1] - y[0]]
a_df=pd.DataFrame({'原':x_a,'测':y_a})
return a_df
def chi_cal(data):
data['共计'] = data.apply(lambda x: x.sum(), axis=1)
# print(data)
data.loc['共计'] = data.apply(lambda x: x.sum())
print(data)
t1=data.iloc[0]
t2=data.iloc[1]
t11_count=t1[0]
print("t11:{}".format(t11_count))
t12_count=t1[1]
print("t12:{}".format(t12_count))
t21_count=t2[0]
print("t21:{}".format(t21_count))
t22_count=t2[1]
print("t22:{}".format(t22_count))
###理论值计算
temp1=data.loc['共计']
print("共计:{}".format(temp1))
rate1=temp1[0]/temp1[2]
print("rate1:{}".format(rate1))
rate2=temp1[1]/temp1[2]
print("rate2:{}".format(rate2))
temp2=data.iloc[2]
t11_theory=temp2[0]*rate1
print("t11_theory:{}".format(t11_theory))
t12_theory=temp2[1]*rate1
print("t12_theory:{}".format(t12_theory))
t21_theory = temp2[0]*rate2
print("t21_theory:{}".format(t21_theory))
t22_theory = temp2[1]*rate2
print("t22_theory:{}".format(t22_theory))
#计算卡方值
X=(((t11_count-t11_theory)**2)/t11_theory)+(((t12_count-t12_theory)**2)/t12_theory)+(((t21_count-t21_theory)**2)/t21_theory)+(((t22_count-t22_theory)**2)/t22_theory)
print("卡方值为:{}".format(X))
#计算自由度
v=(len(data)-1)*(data.columns.size-1)
#查表发现阈值为3.84
if X>3.84:
print("数据波动较大,超出正常波动范围,95%可能性属于指标显著变化,请关注")
else:
print("数据波动较小,95%可能性属于正常波动范围")
#精准点击曝光数据(首页精选日记本列表on_click_diary_card)
chi_ctr_precise_recently=chi_DATA_recently("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",five_days,yesterday)
temp1=[float(str(Decimal(chi_ctr_precise_recently[i]).quantize(Decimal('0.0')))) for i in range(len(chi_ctr_precise_recently))]
# print(temp1)
chi_ctr_precise_yesterday=chi_DATA_yesterday("clk_count_oldUser_all_a","clk_count_oldUser_all_b","imp_count_oldUser_all_precise","on_click_diary_card",yesterday)
temp2=[float(chi_ctr_precise_yesterday[i]) for i in range(len(chi_ctr_precise_yesterday))]
# print(temp2)
tst=data_cal(temp1,temp2)
print(tst)
chi_cal(tst)
# print(chi_ctr_precise_recently)
# print(chi_ctr_precise_yesterday)
# chi_cvr_new=
# chi_cvr_old=
#
# chi_ctcvr_new=
# chi_ctcvr_old=
#
#
#
# def chi_cal(data):