# main.py 14.7 KB
# -*- coding: UTF-8 -*-
from utils import get_yesterday_date
from config import DIRECTORY_PATH
from cidRate import CidRate
from clkCidUidRate import ClkCidUidRate
from topFeatures import TopFeatures
from func import *
import time


start = time.time()

# (platform key handed to the query classes, platform label handed to the result getter)
_PLATFORMS = (("all", "所有"), ("ios", "苹果"), ("android", "安卓"))


def _cid_imp_rates(cid_type):
	"""Fetch the exposure-share record of ``cid_type`` for each platform, in _PLATFORMS order."""
	return [CidRate(key, cid_type).get_cid_imp_rate(label) for key, label in _PLATFORMS]


def _clk_uid_rates(cid_type):
	"""Fetch the clicking-user-share record of ``cid_type`` for each platform, in _PLATFORMS order."""
	return [ClkCidUidRate(key, cid_type).get_clk_cid_uid_rate(label) for key, label in _PLATFORMS]


def _top_features(cid_type, result_arg):
	"""Fetch the Top-100 rows of ``cid_type`` for each platform, in _PLATFORMS order.

	``result_arg`` is forwarded as the second argument of ``TopFeatures.get_result``;
	its meaning is defined there and is not visible in this file.
	"""
	return [
		TopFeatures(key, cid_type, 100).get_result(label, result_arg, "ctr")
		for key, label in _PLATFORMS
	]


print("开始获取比例特征数据...")

# 1.0 question exposure share (= question impressions / total cid impressions)
question_imp_rate_result = _cid_imp_rates("question")
question_imp_rate_all, question_imp_rate_ios, question_imp_rate_android = question_imp_rate_result
print("已获取question曝光占比")

# 1.1 answer exposure share (= answer impressions / total cid impressions)
answer_imp_rate_result = _cid_imp_rates("answer")
answer_imp_rate_all, answer_imp_rate_ios, answer_imp_rate_android = answer_imp_rate_result
print("已获取answer曝光占比")

# 1.2 diary exposure share (= diary impressions / total cid impressions)
diary_imp_rate_result = _cid_imp_rates("diary")
diary_imp_rate_all, diary_imp_rate_ios, diary_imp_rate_android = diary_imp_rate_result
print("已获取diary曝光占比")

# 1.3 active-user click-through rate (= active-user clicks / active-user impressions)
activate_uid_ctr_result = [get_activate_uid_ctr(key) for key, _label in _PLATFORMS]
activate_uid_ctr_all, activate_uid_ctr_ios, activate_uid_ctr_android = activate_uid_ctr_result
print("已获取活跃用户点击率")

# 1.4 average daily impressions per active user (= active-user impressions / distinct active users)
activate_uid_imp_result = [get_activate_uid_imp_times(region) for region in ("all", "beijing")]
activate_uid_imp_all, activate_uid_imp_beijing = activate_uid_imp_result
print("已获取活跃用户平均每天曝光次数")

# 1.5 share of users who clicked an answer (= users clicking answer / users exposed to answer)
click_answer_result = _clk_uid_rates("answer")
click_answer_all, click_answer_ios, click_answer_android = click_answer_result
print("已获取点击answer用户占比")

# 1.6 share of users who clicked a question (= users clicking question / users exposed to question)
click_question_result = _clk_uid_rates("question")
click_question_all, click_question_ios, click_question_android = click_question_result
print("已获取点击question用户占比")

# 1.7 share of users who clicked a diary (= users clicking diary / users exposed to diary)
click_diary_result = _clk_uid_rates("diary")
click_diary_all, click_diary_ios, click_diary_android = click_diary_result
print("已获取点击diary用户占比")

# 1.8 share of users with any click (= users with a click / users with an impression)
click_everything_result = _clk_uid_rates("everything")
click_everything_all, click_everything_ios, click_everything_android = click_everything_result
print("已获取有点击用户占比")

# 1.9 distribution of zero-click users (= zero-click users ∩ activated users / activated users).
# Note: the numbers in the (] buckets are days counted back from the current time.
# This section is currently disabled; the original code is kept below for reference.
# try:
# 	click_zero_uid_detail_all = get_click_zero_uid_rate_detail("all")
# 	click_zero_uid_detail_all["platform"] = "所有"
# 	click_zero_uid_detail_ios = get_click_zero_uid_rate_detail("ios")
# 	click_zero_uid_detail_ios["platform"] = "苹果"
# 	click_zero_uid_detail_android = get_click_zero_uid_rate_detail("android")
# 	click_zero_uid_detail_android["platform"] = "安卓"
# 	click_zero_uid_detail_result = [click_zero_uid_detail_all,click_zero_uid_detail_ios,click_zero_uid_detail_android]
# 	print("已获取无点击用户数激活日期分布")
# except:
# 	click_zero_uid_detail_result = []
# 	print("GC life time is shorter than transaction duration")


#==========================================================================================


# 2.1 distribution of user click counts (column 1: clicks per user; column 2: distinct users)
print("开始获取Top特征数据...")
click_times_to_count_uid = get_click_times_to_count_uid()
print("已获取用户点击次数分布")

# 2.2 Top 100 diary (requested with the "ctr" key)
top_diary_result = _top_features("diary", 4)
top_diary_all, top_diary_ios, top_diary_android = top_diary_result
print("已获取 Top diary 特征")

# 2.3 Top 100 answer (requested with the "ctr" key)
top_answer_result = _top_features("answer", 2)
top_answer_all, top_answer_ios, top_answer_android = top_answer_result
print("已获取 Top answer 特征")

# 2.4 Top 100 question (requested with the "ctr" key, same as diary and answer)
top_question_result = _top_features("question", 2)
top_question_all, top_question_ios, top_question_android = top_question_result
print("已获取 Top question 特征")
print("done")


# Report how long the data collection above took.
end = time.time()
print('程序执行时间: {}s'.format(end - start))





# Horizontal rule written between report sections.
_REPORT_RULE = "=================================================================\n"


def _percent(ratio):
	"""Render ``ratio`` as a percentage string rounded to two decimals (0.1234 -> "12.34%")."""
	return "{}%".format(round(ratio*100,2))


def _write_ratio_table(f, tplt, title, header, rows, as_percent=True):
	"""Write one section-1 table: the title line, the header row, then one line per row.

	Each row is indexed at positions 0-3. Position 3 is rendered through
	``_percent`` unless ``as_percent`` is False, in which case it is written as-is.
	"""
	f.write(title)
	f.write(tplt.format(*header))
	for row in rows:
		last_cell = _percent(row[3]) if as_percent else row[3]
		f.write(tplt.format(row[0], row[1], row[2], last_cell))


def _write_top_table(f, tplt, title, header_cells, groups):
	"""Write one Top-100 table made of several row groups (one group per platform).

	The column header is written before the first group and repeated before every
	following group. Whether a group is the last one is decided by its position,
	not by comparing its contents, so groups with equal contents (for example two
	empty platforms) still get their header.
	"""
	header = tplt.format(*header_cells)
	f.write(title)
	f.write(_REPORT_RULE)
	f.write(header)
	last_index = len(groups) - 1
	for index, group in enumerate(groups):
		for row in group:
			f.write(tplt.format(row[0], row[1], row[2], row[3], _percent(row[4]), row[5]))
		f.write(_REPORT_RULE)
		if index != last_index:
			f.write(header)
	f.write("\n\n")


def result2file():
	"""Write the full daily report to ``DIRECTORY_PATH + "result_<YYYYMMDD>.txt"``.

	The report consists of an overview, the ratio tables 1.0-1.8 and the Top tables
	2.1-2.4, all taken from the module-level ``*_result`` lists. The file is
	overwritten on every run and is always encoded as UTF-8, because the content is
	Chinese text padded with U+3000 and must not depend on the locale encoding.
	"""
	# Resolve the date once so the file name and the date printed inside always agree.
	report_date = get_yesterday_date().replace('-','')
	output_path = DIRECTORY_PATH + "result_{}.txt".format(report_date)
	with open(output_path, 'w', encoding='utf-8') as f:
		overview = """数据日期:{}
内容概览:以下所有数据都是昨天一天的首页的
说明:
	(1)红色标记的为比较重要的特征
	(2)[A,+B]格式说明:A表示该特征在当天的数值,+B/-B表示该数值相对于昨天的差值
1. 比例特征
	1.0 question曝光占比(=question被曝光数/总cid被曝光数) [,]
	1.1 answer曝光占比(=answer被曝光数/总cid被曝光数) [,]
	1.2 diary曝光占比(=diary被曝光数/总cid被曝光数) [,]
	1.3 活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数) [,]
	1.4 活跃用户平均每天曝光次数(=活跃用户曝光次数/独立活跃用户数) [,]
	1.5 点击answer用户占比(=点击answer用户数/曝光answer用户数) [,]
	1.6 点击question用户占比(=点击question用户数/曝光question用户数) [,]
	1.7 点击diary用户占比(=点击diary用户数/曝光diary用户数) [,]
	1.8 有点击用户占比(=有点击用户数/有曝光用户数) [,]
2.Top特征
	2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)
	2.2 Top 100 diary (sorted by ctr)
	2.3 Top 100 Answer (sorted by ctr)
	2.4 Top 100 Question (sorted by ctr)



具体内容:以下所有数据都是昨天一天的首页的
""".format(report_date)
		f.write(overview)

		# ---- Section 1: ratio tables -------------------------------------------------
		f.write("#1. 比例特征\n")
		f.write(_REPORT_RULE)
		# Columns are left-aligned and padded with the full-width space U+3000.
		tplt = "{0:\u3000<6}\t{1:\u3000<15}\t{2:\u3000<15}\t{3:\u3000<15}\n"
		# (title line, header cells, rows, render 4th column as a percentage?)
		ratio_tables = [
			("#1.0question曝光占比(=question被曝光数/总cid被曝光数)\n",
				("平台","question被曝光数","总cid被曝光数","question被曝光数占比"),
				question_imp_rate_result, True),
			("#1.1answer曝光占比(=answer被曝光数/总cid被曝光数)\n",
				("平台","answer被曝光数","总cid被曝光数","answer被曝光数占比"),
				answer_imp_rate_result, True),
			("#1.2diary曝光占比(=diary被曝光数/总cid被曝光数)\n",
				("平台","diary被曝光数","总cid被曝光数","diary被曝光数占比"),
				diary_imp_rate_result, True),
			("#1.3活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数)\n",
				("平台","active用户点击次数","active用户曝光次数","active用户点击率"),
				activate_uid_ctr_result, True),
			# 1.4 is an average count, not a ratio, so its last column is written unchanged.
			("#1.4活跃用户平均每天曝光次数(=活跃用户曝光次数/独立活跃用户数)\n",
				("地区","active独立用户数","active用户曝光次数","activate用户平均曝光数"),
				activate_uid_imp_result, False),
			("#1.5点击answer用户占比(=点击answer用户数/曝光answer用户数)\n",
				("平台","点击answer用户数","曝光answer用户数","点击answer用户占比"),
				click_answer_result, True),
			("#1.6点击question用户占比(=点击question用户数/曝光question用户数)\n",
				("平台","点击question用户数","曝光question用户数","点击question用户占比"),
				click_question_result, True),
			("#1.7点击diary用户占比(=点击diary用户数/曝光diary用户数)\n",
				("平台","点击diary用户数","曝光diary用户数","点击diary用户占比"),
				click_diary_result, True),
			("#1.8有点击用户占比(=有点击用户数/有曝光用户数)\n",
				("平台","have点击用户数","have曝光用户数","have点击用户占比"),
				click_everything_result, True),
		]
		for index, (title, header, rows, as_percent) in enumerate(ratio_tables):
			if index:
				# One blank line between consecutive tables.
				f.write('\n')
			_write_ratio_table(f, tplt, title, header, rows, as_percent)
		# Section 1.9 (zero-click user distribution) is disabled; original code kept for reference.
		# f.write('\n')
		# if click_zero_uid_detail_result != []:
		# 	f.write("#1.9无点击用户数分布(=无点击用户∩激活用户 / 激活用户数)   #注意:(]里面的数字指的是距离当前时间的天数\n")
		# 	f.write("平台"+'\t\t'+"0-7"+'\t\t'+"7-14"+'\t\t'+ \
		# 		"14-30"+'\t\t'+"30-60"+'\t\t'+"60-90"+'\t\t'+"90+"+'\n')
		# 	for i in click_zero_uid_detail_result:
		# 		f.write(i["platform"]+'\t\t'+\
		# 			"{}%".format(round(i["0-7"]*100,2))+'\t\t'+\
		# 			"{}%".format(round(i["7-14"]*100,2))+'\t\t'+\
		# 			"{}%".format(round(i["14-30"]*100,2))+'\t\t'+\
		# 			"{}%".format(round(i["30-60"]*100,2))+'\t\t'+\
		# 			"{}%".format(round(i["60-90"]*100,2))+'\t\t'+\
		# 			"{}%".format(round(i["90+"]*100,2))+'\n')
		f.write('\n\n\n')

		# ---- Section 2: Top tables ---------------------------------------------------
		tplt = "{0:^10}\t{1:^10}\n"
		f.write("#2. Top特征\n")
		f.write(_REPORT_RULE)
		f.write("2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)\n")
		f.write(tplt.format("click_times","count_uid"))
		for row in click_times_to_count_uid:
			f.write(tplt.format(row[0],row[1]))
		f.write("\n\n")

		tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
		_write_top_table(f, tplt, "2.2 Top 100 Diary\n",
			("平台","diary_id","点击数","曝光数","点击率","diary链接"), top_diary_result)
		_write_top_table(f, tplt, "2.3 Top 100 Answer\n",
			("平台","answer_id","点击数","曝光数","点击率","answer链接"), top_answer_result)
		_write_top_table(f, tplt, "2.4 Top 100 Question\n",
			("平台","question_id","点击数","曝光数","点击率","question链接"), top_question_result)


def rate2file():
	"""Append one comma-separated row of yesterday's headline values to ``rate.csv``.

	The row starts with yesterday's date (dashes removed) followed by element 3 of
	each per-platform record, in the fixed column order listed below.
	"""
	with open(DIRECTORY_PATH + "rate.csv", 'a+') as f:
		cells = [get_yesterday_date().replace('-','')]
		# Column order of the history file; existing rows depend on it, so keep it stable.
		ordered_records = (
			answer_imp_rate_all, answer_imp_rate_ios, answer_imp_rate_android,
			diary_imp_rate_all, diary_imp_rate_ios, diary_imp_rate_android,
			activate_uid_ctr_all, activate_uid_ctr_ios, activate_uid_ctr_android,
			activate_uid_imp_all, activate_uid_imp_beijing,
			click_answer_all, click_answer_ios, click_answer_android,
			click_diary_all, click_diary_ios, click_diary_android,
			click_everything_all, click_everything_ios, click_everything_android,
			question_imp_rate_all, question_imp_rate_ios, question_imp_rate_android,
			click_question_all, click_question_ios, click_question_android,
		)
		for record in ordered_records:
			cells.append(str(record[3]))
		f.write(','.join(cells) + '\n')




if __name__ == "__main__":
	# Write the detailed daily report first, then append the summary row to the CSV history.
	for write_output in (result2file, rate2file):
		write_output()