# -*- coding: UTF-8 -*-
"""Export top-100 click-through-rate reports per content type and platform.

For each content type the script queries click and impression counts for the
date returned by ``get_yesterday_date()`` and writes one text report under
``DIRECTORY_PATH``.
"""

from utils import con_sql, tuple2dict, get_yesterday_date
from config import DIRECTORY_PATH


# (platform key understood by TopFeatures, label printed in the report)
_PLATFORMS = (("all", "所有"), ("ios", "苹果"), ("android", "安卓"))

# (content type, minimum-click threshold applied when ranking by CTR)
_REPORTS = (("diary", 4), ("answer", 2), ("question", 2))


class TopFeatures(object):
    """Rank content ids of one type by clicks, impressions or CTR.

    Counts come from the ``data_feed_click`` and ``data_feed_exposure``
    tables, restricted to one platform and one ``cid_type``.
    """

    def __init__(self, platform, cid_type, top_n=-1):
        """
        platform : "ios" or "android"; any other value (e.g. "all") selects
                   every row whose device_type is not null
        cid_type : 'diary', 'answer', 'question', ...
        top_n    : maximum number of rows returned by get_result; a negative
                   value (the default) means "no limit"
        """
        # self.platform holds a ready-made SQL comparison that is appended
        # directly after the column name ``device_type``.
        if platform == "ios":
            self.platform = "='App Store'"
        elif platform == "android":
            self.platform = "!='App Store'"
        else:
            self.platform = " is not null"
        self.cid_type = cid_type
        self.top_n = top_n

    def get_click_times(self):
        """Return the click counts per cid as a dict (via ``tuple2dict``).

        NOTE(review): the SQL is assembled by string formatting; the values
        come from the constructor arguments, so only pass trusted values.
        """
        # The click query compares against the literal without spaces
        # ('AppStore'), unlike the exposure query. Only the two quoted
        # comparisons end in "e'"; " is not null" must keep its spaces.
        if self.platform[-2] == 'e':
            device_filter = self.platform.replace(' ', '')
        else:
            device_filter = self.platform
        sql = (
            "select cid,count(cid) from data_feed_click "
            "where stat_date = '{0}' "
            "and device_type{1} and cid_type='{2}' "
            "group by cid "
            "order by count(cid) desc"
        ).format(get_yesterday_date(), device_filter, self.cid_type)
        return tuple2dict(con_sql(sql))

    def get_impression_times(self):
        """Return the impression counts per cid as a dict (via ``tuple2dict``).

        NOTE(review): the SQL is assembled by string formatting; the values
        come from the constructor arguments, so only pass trusted values.
        """
        sql = (
            "select cid,count(cid) from data_feed_exposure "
            "where stat_date = '{0}' "
            "and device_type{1} and cid_type='{2}' "
            "group by cid order by count(cid) desc"
        ).format(get_yesterday_date(), self.platform, self.cid_type)
        return tuple2dict(con_sql(sql))

    def _build_url(self, cid):
        """Return the mobile-site URL for *cid*.

        The numeric part is whatever follows the first '|' in the cid;
        a cid without '|' raises ValueError, as in the original code.
        """
        # Diaries live under /diary_book/; every other type uses its own name.
        segment = "diary_book" if self.cid_type == "diary" else self.cid_type
        return ("http://m.igengmei.com/{0}/".format(segment)
                + cid[cid.index('|') + 1:] + '/')

    def _limit(self, rows):
        """Return the first ``top_n`` rows, or all rows when top_n is negative."""
        limit = int(self.top_n)
        if limit < 0:
            # A plain rows[:-1] would silently drop the last row.
            return rows
        return rows[:limit]

    def get_result(self, platform, clk_n=2, result_types="ctr"):
        """Return the ranked rows for this platform / content type.

        platform     : label stored in each row for display
                       (e.g. "所有", "苹果", "安卓")
        clk_n        : when ranking by CTR, only ids with more than clk_n
                       clicks are kept
        result_types : sort key, one of "clk", "imp", "ctr"
        rtype        : list of tuples
                       (platform, cid, clicks, impressions, ctr, url)

        When there are no impressions the click ranking is returned, and
        when there are no clicks the impression ranking is returned,
        regardless of result_types.
        """
        clk = self.get_click_times()
        imp = self.get_impression_times()

        if imp == {} or result_types == "clk":
            # Top N by clicks.
            rows = [(platform, cid, clk[cid], 0, 0, self._build_url(cid))
                    for cid in clk]
            sort_col = 2
        elif clk == {} or result_types == "imp":
            # Top N by impressions.
            rows = [(platform, cid, 0, imp[cid], 0, self._build_url(cid))
                    for cid in imp]
            sort_col = 3
        else:
            # Top N by CTR; ids with a zero impression count are skipped
            # because their CTR is undefined (avoids ZeroDivisionError).
            rows = [
                (platform, cid, clk[cid], imp[cid],
                 round(clk[cid] / imp[cid], 4), self._build_url(cid))
                for cid in clk
                if cid in imp and clk[cid] > clk_n and imp[cid]
            ]
            sort_col = 4

        rows.sort(key=lambda row: row[sort_col], reverse=True)
        return self._limit(rows)

    def result2file(self, result_lst, fpath):
        """Write the ranked sections to a UTF-8 text file.

        result_lst : [all_result, ios_result, android_result], each a list
                     of rows as returned by get_result
        fpath      : output filename
        rtype      : None
        """
        # U+3000 (ideographic space) is the fill character of every column.
        tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
        sep = "=================================================================\n"
        header = tplt.format("平台", "{0}_id".format(self.cid_type),
                             "点击数", "曝光数", "点击率",
                             "{0}链接".format(self.cid_type))
        last_idx = len(result_lst) - 1

        # Explicit encoding: the report contains Chinese text and must not
        # depend on the locale's default encoding.
        with open(fpath, 'w', encoding='utf-8') as f:
            f.write("Top {0} {1}\n".format(self.top_n, self.cid_type))
            f.write(sep)
            f.write(header)
            for idx, rows in enumerate(result_lst):
                for row in rows:
                    f.write(tplt.format(row[0], row[1], row[2],
                                        row[3], row[4], row[5]))
                f.write(sep)
                # Compare by position, not by list equality: two sections
                # with equal content (e.g. both empty) must still each get
                # a header for the section that follows.
                if idx != last_idx:
                    f.write(header)
            # NOTE(review): the original indentation was lost; the trailing
            # blank lines are assumed to be written once, after all
            # sections - confirm against the intended layout.
            f.write("\n\n")


def main():
    """Generate the top-100 CTR report for each configured content type."""
    for cid_type, clk_n in _REPORTS:
        extractors = [TopFeatures(platform, cid_type, 100)
                      for platform, _ in _PLATFORMS]
        result_lst = [
            extractor.get_result(label, clk_n, "ctr")
            for extractor, (_, label) in zip(extractors, _PLATFORMS)
        ]
        output_path = DIRECTORY_PATH + "top100_ctr_{0}_{1}.txt".format(
            cid_type, get_yesterday_date().replace('-', ''))
        # Any instance of this content type can write the file; use "all".
        extractors[0].result2file(result_lst, output_path)
        print("已获取 Top {0} 特征".format(cid_type))


if __name__ == '__main__':
    main()