Commit 5b296bc4 authored by 张彦钊's avatar 张彦钊

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

parents a8479063 124f0557
# -*- coding: UTF-8 -*-
import pymysql import pymysql
......
# -*- coding: UTF-8 -*-
import pymysql import pymysql
......
# -*- coding: UTF-8 -*-
import pymysql import pymysql
......
# -*- coding: UTF-8 -*-
import pymysql import pymysql
......
# -*- coding: UTF-8 -*-
import pymysql import pymysql
import datetime import datetime
import pandas as pd
def con_sql(sql): def con_sql(sql):
# 从数据库的表里获取数据 # 从数据库的表里获取数据
...@@ -16,24 +18,34 @@ def get_yesterday_date(): ...@@ -16,24 +18,34 @@ def get_yesterday_date():
yesterday = yesterday.strftime("%Y%m%d") yesterday = yesterday.strftime("%Y%m%d")
return yesterday return yesterday
def get_uid_click_times():
def get_click_times_to_count_uid_df():
sql = "select device_id,count(cid_type) click_times from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) group by device_id order by click_times desc" sql = "select device_id,count(cid_type) click_times from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) group by device_id order by click_times desc"
uid_click_times = con_sql(sql) uid_click_times = con_sql(sql)
return uid_click_times uid_lst = [i[0] for i in uid_click_times]
click_times_lst = [i[1] for i in uid_click_times]
uid_click_times_df = pd.DataFrame({"uid":uid_lst,"click_times":click_times_lst})
df = uid_click_times_df.groupby(by="click_times",as_index=False).count()
return df
def result2file(result,fpath): def df2file(df,fpath):
with open(fpath,'w') as f: with open(fpath,"w") as f:
f.write("device_id"+'\t'+"click_times"+'\n') f.write("#2. Top特征\n")
for i in result: f.write("=================================================================\n")
f.write(str(i[0])+'\t'+str(i[1])+'\n') f.write("2.1用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)\n")
f.write("click_times"+"\t"+"count_uid"+"\n")
for row in df.iterrows():
line = str(row[1][0]) + "\t" + str(row[1][1]) + "\n"
f.write(line)
f.write("\n\n")
def main(): def main():
print("开始获取用户点击次数表...") print("2.开始获取用户点击次数表...")
uid_click_times = get_uid_click_times() output_path = "../data/click_times_to_count_uid_%s.csv" % get_yesterday_date()
output_path = "../data/uid_click_times_%s.txt" % get_yesterday_date() df = get_click_times_to_count_uid_df()
result2file(uid_click_times,output_path) df2file(df,output_path)
print("获取完成") print("获取完成")
......
# -*- coding: UTF-8 -*-
import pymysql import pymysql
......
# -*- coding: UTF-8 -*-
import datetime import datetime
from getAnswerImpRate import get_all_answer_imp_rate,get_ios_answer_imp_rate,get_android_answer_imp_rate from getAnswerImpRate import get_all_answer_imp_rate,get_ios_answer_imp_rate,get_android_answer_imp_rate
from getActivateUidCtr import get_all_click_one_rate,get_ios_click_one_rate,get_android_click_one_rate from getActivateUidCtr import get_all_click_one_rate,get_ios_click_one_rate,get_android_click_one_rate
...@@ -17,8 +18,8 @@ def result2file(fpath): ...@@ -17,8 +18,8 @@ def result2file(fpath):
f.write("#注意:以下数据都是首页的\n") f.write("#注意:以下数据都是首页的\n")
f.write("#1. 比例特征\n") f.write("#1. 比例特征\n")
f.write("=================================================================\n") f.write("=================================================================\n")
f.write("#1.1问答曝光占比(=问答被曝光数/总cid被曝光数)\n") f.write("#1.1answer曝光占比(=answer被曝光数/总cid被曝光数)\n")
f.write("平台"+"\t"+"问答被曝光数"+"\t"+"总cid被曝光数"+"\t"+"问答被曝光数占比\n") f.write("平台"+"\t"+"answer被曝光数"+"\t"+"总cid被曝光数"+"\t"+"answer被曝光数占比\n")
all_answer_imp_rate = get_all_answer_imp_rate() all_answer_imp_rate = get_all_answer_imp_rate()
ios_answer_imp_rate = get_ios_answer_imp_rate() ios_answer_imp_rate = get_ios_answer_imp_rate()
android_answer_imp_rate = get_android_answer_imp_rate() android_answer_imp_rate = get_android_answer_imp_rate()
...@@ -29,7 +30,7 @@ def result2file(fpath): ...@@ -29,7 +30,7 @@ def result2file(fpath):
line += str(j) + '\t' line += str(j) + '\t'
line = line[:-1]+'\n' line = line[:-1]+'\n'
f.write(line) f.write(line)
print("1.1已将问答曝光占比存入文件") print("1.1已将answer曝光占比存入文件")
f.write("#1.2有点击用户点击率(=有点击用户点击次数/有点击用户曝光次数)\n") f.write("#1.2有点击用户点击率(=有点击用户点击次数/有点击用户曝光次数)\n")
f.write("平台"+"\t"+"有点击用户点击次数"+"\t"+"有点击用户曝光次数"+"\t"+"有点击用户点击率\n") f.write("平台"+"\t"+"有点击用户点击次数"+"\t"+"有点击用户曝光次数"+"\t"+"有点击用户点击率\n")
...@@ -45,8 +46,8 @@ def result2file(fpath): ...@@ -45,8 +46,8 @@ def result2file(fpath):
f.write(line) f.write(line)
print("1.2已将有点击用户点击率存入文件") print("1.2已将有点击用户点击率存入文件")
f.write("#1.3点击问答用户占比(=点击问答用户数/曝光问答用户数)\n") f.write("#1.3点击answer用户占比(=点击answer用户数/曝光answer用户数)\n")
f.write("平台"+"\t"+"点击问答用户数"+"\t"+"曝光问答用户数"+"\t"+"点击问答用户占比\n") f.write("平台"+"\t"+"点击answer用户数"+"\t"+"曝光answer用户数"+"\t"+"点击answer用户占比\n")
all_click_answer_rate = get_all_click_answer_rate() all_click_answer_rate = get_all_click_answer_rate()
ios_click_answer_rate = get_ios_click_answer_rate() ios_click_answer_rate = get_ios_click_answer_rate()
android_click_answer_rate = get_android_click_answer_rate() android_click_answer_rate = get_android_click_answer_rate()
...@@ -57,10 +58,10 @@ def result2file(fpath): ...@@ -57,10 +58,10 @@ def result2file(fpath):
line += str(j) + '\t' line += str(j) + '\t'
line = line[:-1]+'\n' line = line[:-1]+'\n'
f.write(line) f.write(line)
print("1.3已将点击问答用户占比存入文件") print("1.3已将点击answer用户占比存入文件")
f.write("#1.4点击日记用户占比(=点击日记用户数/曝光日记用户数)\n") f.write("#1.4点击diary用户占比(=点击diary用户数/曝光diary用户数)\n")
f.write("平台"+"\t"+"点击日记用户数"+"\t"+"曝光日记用户数"+"\t"+"点击日记用户占比\n") f.write("平台"+"\t"+"点击diary用户数"+"\t"+"曝光diary用户数"+"\t"+"点击diary用户占比\n")
all_click_diary_rate = get_all_click_diary_rate() all_click_diary_rate = get_all_click_diary_rate()
ios_click_diary_rate = get_ios_click_diary_rate() ios_click_diary_rate = get_ios_click_diary_rate()
android_click_diary_rate = get_android_click_diary_rate() android_click_diary_rate = get_android_click_diary_rate()
...@@ -71,7 +72,7 @@ def result2file(fpath): ...@@ -71,7 +72,7 @@ def result2file(fpath):
line += str(j) + '\t' line += str(j) + '\t'
line = line[:-1]+'\n' line = line[:-1]+'\n'
f.write(line) f.write(line)
print("1.4已将点击日记用户占比存入文件") print("1.4已将点击diary用户占比存入文件")
f.write("#1.5无点击用户占比(=无点击用户数/有曝光用户数)\n") f.write("#1.5无点击用户占比(=无点击用户数/有曝光用户数)\n")
f.write("平台"+"\t"+"无点击用户数"+"\t"+"有曝光用户数"+"\t"+"无点击用户占比\n") f.write("平台"+"\t"+"无点击用户数"+"\t"+"有曝光用户数"+"\t"+"无点击用户占比\n")
...@@ -85,6 +86,7 @@ def result2file(fpath): ...@@ -85,6 +86,7 @@ def result2file(fpath):
line += str(j) + '\t' line += str(j) + '\t'
line = line[:-1]+'\n' line = line[:-1]+'\n'
f.write(line) f.write(line)
f.write('\n\n')
print("1.5已将无点击用户占比存入文件") print("1.5已将无点击用户占比存入文件")
......
# -*- coding: UTF-8 -*-
import pymysql import pymysql
import datetime import datetime
...@@ -19,7 +20,9 @@ def tuple2dict(tuple_result): ...@@ -19,7 +20,9 @@ def tuple2dict(tuple_result):
def result2file(result_lst,fpath): def result2file(result_lst,fpath):
with open(fpath,'w') as f: with open(fpath,'w') as f:
header = "平台"+'\t'+"问答id"+'\t'+"问答被点击数"+'\t'+"问答被曝光数"+'\t'+"问答被点击率"+'\t'+"问答链接"+'\n' header = "平台"+'\t'+"answer_id"+'\t'+"answer被点击数"+'\t'+"answer被曝光数"+'\t'+"answer被点击率"+'\t'+"answer链接"+'\n'
f.write("Top 100 Answer\n")
f.write("=================================================================\n")
f.write(header) f.write(header)
for i in result_lst: for i in result_lst:
for j in i: for j in i:
...@@ -29,24 +32,27 @@ def result2file(result_lst,fpath): ...@@ -29,24 +32,27 @@ def result2file(result_lst,fpath):
line = line[:-1] + '\n' line = line[:-1] + '\n'
f.write(line) f.write(line)
f.write("=================================================================\n") f.write("=================================================================\n")
if i != result_lst[-1]:
f.write(header)
f.write("\n\n")
#1 获取昨天所有平台的top100问答 #1 获取昨天所有平台的top100answer
#1.1 获取昨天所有平台的top100点击数的问答 #1.1 获取昨天所有平台的top100点击数的answer
def get_all_answer_count_by_click(): def get_all_answer_count_by_click():
sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='answer' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='answer' group by cid order by count(cid) desc"
all_answer_count_by_click = con_sql(sql) all_answer_count_by_click = con_sql(sql)
all_answer_count_by_click = tuple2dict(all_answer_count_by_click) all_answer_count_by_click = tuple2dict(all_answer_count_by_click)
return all_answer_count_by_click return all_answer_count_by_click
#1.2 获取昨天所有平台的top100曝光数的问答 #1.2 获取昨天所有平台的top100曝光数的answer
def get_all_answer_count_by_imp(): def get_all_answer_count_by_imp():
sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='answer' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='answer' group by cid order by count(cid) desc"
all_answer_count_by_imp = con_sql(sql) all_answer_count_by_imp = con_sql(sql)
all_answer_count_by_imp = tuple2dict(all_answer_count_by_imp) all_answer_count_by_imp = tuple2dict(all_answer_count_by_imp)
return all_answer_count_by_imp return all_answer_count_by_imp
#1.3 获取昨天所有平台的top100点击率的问答 #1.3 获取昨天所有平台的top100点击率的answer
def get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp): def get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp):
all_top100_answer_rate_by_ctr = [] all_top100_answer_rate_by_ctr = []
for i in all_answer_count_by_click: for i in all_answer_count_by_click:
...@@ -57,20 +63,20 @@ def get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count ...@@ -57,20 +63,20 @@ def get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count
return all_top100_answer_rate_by_ctr[:100] if len(all_top100_answer_rate_by_ctr) > 100 else all_top100_answer_rate_by_ctr return all_top100_answer_rate_by_ctr[:100] if len(all_top100_answer_rate_by_ctr) > 100 else all_top100_answer_rate_by_ctr
#2 获取昨天ios平台的top100问答 #2 获取昨天ios平台的top100answer
#2.1 获取昨天ios平台的top100点击数的问答 #2.1 获取昨天ios平台的top100点击数的answer
def get_ios_answer_count_by_click(): def get_ios_answer_count_by_click():
sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='answer' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='answer' group by cid order by count(cid) desc"
ios_answer_count_by_click = con_sql(sql) ios_answer_count_by_click = con_sql(sql)
ios_answer_count_by_click = tuple2dict(ios_answer_count_by_click) ios_answer_count_by_click = tuple2dict(ios_answer_count_by_click)
return ios_answer_count_by_click return ios_answer_count_by_click
#2.2 获取昨天ios平台的top100曝光数的问答 #2.2 获取昨天ios平台的top100曝光数的answer
def get_ios_answer_count_by_imp(): def get_ios_answer_count_by_imp():
sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='answer' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='answer' group by cid order by count(cid) desc"
ios_answer_count_by_imp = con_sql(sql) ios_answer_count_by_imp = con_sql(sql)
ios_answer_count_by_imp = tuple2dict(ios_answer_count_by_imp) ios_answer_count_by_imp = tuple2dict(ios_answer_count_by_imp)
return ios_answer_count_by_imp return ios_answer_count_by_imp
#2.3 获取昨天ios平台的top100点击率的问答 #2.3 获取昨天ios平台的top100点击率的answer
def get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp): def get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp):
ios_top100_answer_rate_by_ctr = [] ios_top100_answer_rate_by_ctr = []
for i in ios_answer_count_by_click: for i in ios_answer_count_by_click:
...@@ -81,20 +87,20 @@ def get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count ...@@ -81,20 +87,20 @@ def get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count
return ios_top100_answer_rate_by_ctr[:100] if len(ios_top100_answer_rate_by_ctr) > 100 else ios_top100_answer_rate_by_ctr return ios_top100_answer_rate_by_ctr[:100] if len(ios_top100_answer_rate_by_ctr) > 100 else ios_top100_answer_rate_by_ctr
#3 获取昨天安卓平台的top100问答 #3 获取昨天安卓平台的top100answer
#3.1 获取昨天安卓平台的top100点击数的问答 #3.1 获取昨天安卓平台的top100点击数的answer
def get_android_answer_rate_by_click(): def get_android_answer_rate_by_click():
sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='answer' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='answer' group by cid order by count(cid) desc"
android_answer_count_by_click = con_sql(sql) android_answer_count_by_click = con_sql(sql)
android_answer_count_by_click = tuple2dict(android_answer_count_by_click) android_answer_count_by_click = tuple2dict(android_answer_count_by_click)
return android_answer_count_by_click return android_answer_count_by_click
#3.2 获取昨天安卓平台的top100曝光数的问答 #3.2 获取昨天安卓平台的top100曝光数的answer
def get_android_answer_rate_by_imp(): def get_android_answer_rate_by_imp():
sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='answer' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='answer' group by cid order by count(cid) desc"
android_answer_count_by_imp = con_sql(sql) android_answer_count_by_imp = con_sql(sql)
android_answer_count_by_imp = tuple2dict(android_answer_count_by_imp) android_answer_count_by_imp = tuple2dict(android_answer_count_by_imp)
return android_answer_count_by_imp return android_answer_count_by_imp
#3.3 获取昨天安卓平台的top100点击率的问答 #3.3 获取昨天安卓平台的top100点击率的answer
def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp): def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp):
android_top100_answer_rate_by_ctr = [] android_top100_answer_rate_by_ctr = []
for i in android_answer_count_by_click: for i in android_answer_count_by_click:
...@@ -105,21 +111,21 @@ def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_ ...@@ -105,21 +111,21 @@ def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_
return android_top100_answer_rate_by_ctr[:100] if len(android_top100_answer_rate_by_ctr) > 100 else android_top100_answer_rate_by_ctr return android_top100_answer_rate_by_ctr[:100] if len(android_top100_answer_rate_by_ctr) > 100 else android_top100_answer_rate_by_ctr
if __name__ == "__main__": if __name__ == "__main__":
print("开始获取top100点击率的问答...") print("开始获取top100点击率的answer...")
all_answer_count_by_click = get_all_answer_count_by_click() all_answer_count_by_click = get_all_answer_count_by_click()
all_answer_count_by_imp = get_all_answer_count_by_imp() all_answer_count_by_imp = get_all_answer_count_by_imp()
all_top100_answer_rate_by_ctr = get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp) all_top100_answer_rate_by_ctr = get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp)
print("3.1已获得所有平台的top100点击率的问答") print("4.1已获得所有平台的top100点击率的answer")
ios_answer_count_by_click = get_ios_answer_count_by_click() ios_answer_count_by_click = get_ios_answer_count_by_click()
ios_answer_count_by_imp = get_ios_answer_count_by_imp() ios_answer_count_by_imp = get_ios_answer_count_by_imp()
ios_top100_answer_rate_by_ctr = get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp) ios_top100_answer_rate_by_ctr = get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp)
print("3.2已获得ios平台的top100点击率的问答") print("4.2已获得ios平台的top100点击率的answer")
android_answer_count_by_click = get_android_answer_rate_by_click() android_answer_count_by_click = get_android_answer_rate_by_click()
android_answer_count_by_imp = get_android_answer_rate_by_imp() android_answer_count_by_imp = get_android_answer_rate_by_imp()
android_top100_answer_rate_by_ctr = get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp) android_top100_answer_rate_by_ctr = get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp)
print("3.3已获得安卓平台的top100点击率的问答") print("4.3已获得安卓平台的top100点击率的answer")
result_lst = [all_top100_answer_rate_by_ctr,ios_top100_answer_rate_by_ctr,android_top100_answer_rate_by_ctr] result_lst = [all_top100_answer_rate_by_ctr,ios_top100_answer_rate_by_ctr,android_top100_answer_rate_by_ctr]
today = datetime.date.today() today = datetime.date.today()
...@@ -127,4 +133,4 @@ if __name__ == "__main__": ...@@ -127,4 +133,4 @@ if __name__ == "__main__":
yesterday = yesterday.strftime("%Y%m%d") yesterday = yesterday.strftime("%Y%m%d")
output_path = "../data/top100_ctr_answer_%s.csv" % yesterday output_path = "../data/top100_ctr_answer_%s.csv" % yesterday
result2file(result_lst,output_path) result2file(result_lst,output_path)
print("已将top100点击率的问答存入文件") print("已将top100点击率的answer存入文件")
# -*- coding: UTF-8 -*-
import pymysql import pymysql
import datetime import datetime
...@@ -19,7 +20,9 @@ def tuple2dict(tuple_result): ...@@ -19,7 +20,9 @@ def tuple2dict(tuple_result):
def result2file(result_lst,fpath): def result2file(result_lst,fpath):
with open(fpath,'w') as f: with open(fpath,'w') as f:
header = "平台"+'\t'+"日记id"+'\t'+"日记被点击数"+'\t'+"日记被曝光数"+'\t'+"日记被点击率"+'\t'+"日记链接"+'\n' header = "平台"+'\t'+"diary_id"+'\t'+"diary被点击数"+'\t'+"diary被曝光数"+'\t'+"diary被点击率"+'\t'+"diary链接"+'\n'
f.write("Top 100 diary\n")
f.write("=================================================================\n")
f.write(header) f.write(header)
for i in result_lst: for i in result_lst:
for j in i: for j in i:
...@@ -29,23 +32,26 @@ def result2file(result_lst,fpath): ...@@ -29,23 +32,26 @@ def result2file(result_lst,fpath):
line = line[:-1] + '\n' line = line[:-1] + '\n'
f.write(line) f.write(line)
f.write("=================================================================\n") f.write("=================================================================\n")
if i != result_lst[-1]:
f.write(header)
f.write("\n\n")
#1 获取昨天所有平台的top100日记((sorted by ctr)) #1 获取昨天所有平台的top100diary((sorted by ctr))
#1.1 获取昨天所有平台的日记的点击数 #1.1 获取昨天所有平台的diary的点击数
def get_all_diary_count_by_click(): def get_all_diary_count_by_click():
sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='diary' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='diary' group by cid order by count(cid) desc"
all_diary_count_by_click = con_sql(sql) all_diary_count_by_click = con_sql(sql)
all_diary_count_by_click = tuple2dict(all_diary_count_by_click) all_diary_count_by_click = tuple2dict(all_diary_count_by_click)
return all_diary_count_by_click return all_diary_count_by_click
#1.2 获取昨天所有平台的日记的曝光数 #1.2 获取昨天所有平台的diary的曝光数
def get_all_diary_count_by_imp(): def get_all_diary_count_by_imp():
sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='diary' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='diary' group by cid order by count(cid) desc"
all_diary_count_by_imp = con_sql(sql) all_diary_count_by_imp = con_sql(sql)
all_diary_count_by_imp = tuple2dict(all_diary_count_by_imp) all_diary_count_by_imp = tuple2dict(all_diary_count_by_imp)
return all_diary_count_by_imp return all_diary_count_by_imp
#1.3 获取昨天所有平台的top100点击率的日记 #1.3 获取昨天所有平台的top100点击率的diary
def get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp): def get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp):
all_top100_diary_rate_by_ctr = [] all_top100_diary_rate_by_ctr = []
for i in all_diary_count_by_click: for i in all_diary_count_by_click:
...@@ -56,20 +62,20 @@ def get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by ...@@ -56,20 +62,20 @@ def get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by
return all_top100_diary_rate_by_ctr[:100] if len(all_top100_diary_rate_by_ctr) > 100 else all_top100_diary_rate_by_ctr return all_top100_diary_rate_by_ctr[:100] if len(all_top100_diary_rate_by_ctr) > 100 else all_top100_diary_rate_by_ctr
#2 获取昨天ios平台的top100日记(sorted by ctr) #2 获取昨天ios平台的top100diary(sorted by ctr)
#2.1 获取昨天ios平台的日记的点击数 #2.1 获取昨天ios平台的diary的点击数
def get_ios_diary_count_by_click(): def get_ios_diary_count_by_click():
sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='diary' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='diary' group by cid order by count(cid) desc"
ios_diary_count_by_click = con_sql(sql) ios_diary_count_by_click = con_sql(sql)
ios_diary_count_by_click = tuple2dict(ios_diary_count_by_click) ios_diary_count_by_click = tuple2dict(ios_diary_count_by_click)
return ios_diary_count_by_click return ios_diary_count_by_click
#2.2 获取昨天ios平台的日记的曝光数 #2.2 获取昨天ios平台的diary的曝光数
def get_ios_diary_count_by_imp(): def get_ios_diary_count_by_imp():
sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='diary' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='diary' group by cid order by count(cid) desc"
ios_diary_count_by_imp = con_sql(sql) ios_diary_count_by_imp = con_sql(sql)
ios_diary_count_by_imp = tuple2dict(ios_diary_count_by_imp) ios_diary_count_by_imp = tuple2dict(ios_diary_count_by_imp)
return ios_diary_count_by_imp return ios_diary_count_by_imp
#2.3 获取昨天ios平台的top100点击率的日记 #2.3 获取昨天ios平台的top100点击率的diary
def get_ios_top100_diary_rate_by_ctr(ios_top100_diary_count_by_click,ios_top100_diary_count_by_imp): def get_ios_top100_diary_rate_by_ctr(ios_top100_diary_count_by_click,ios_top100_diary_count_by_imp):
ios_top100_diary_rate_by_ctr = [] ios_top100_diary_rate_by_ctr = []
for i in ios_diary_count_by_click: for i in ios_diary_count_by_click:
...@@ -80,20 +86,20 @@ def get_ios_top100_diary_rate_by_ctr(ios_top100_diary_count_by_click,ios_top100_ ...@@ -80,20 +86,20 @@ def get_ios_top100_diary_rate_by_ctr(ios_top100_diary_count_by_click,ios_top100_
return ios_top100_diary_rate_by_ctr[:100] if len(ios_top100_diary_rate_by_ctr) > 100 else ios_top100_diary_rate_by_ctr return ios_top100_diary_rate_by_ctr[:100] if len(ios_top100_diary_rate_by_ctr) > 100 else ios_top100_diary_rate_by_ctr
#3 获取昨天安卓平台的top100日记(sorted by ctr) #3 获取昨天安卓平台的top100diary(sorted by ctr)
#3.1 获取昨天安卓平台的日记的点击数 #3.1 获取昨天安卓平台的diary的点击数
def get_android_diary_rate_by_click(): def get_android_diary_rate_by_click():
sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='diary' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='diary' group by cid order by count(cid) desc"
android_diary_count_by_click = con_sql(sql) android_diary_count_by_click = con_sql(sql)
android_diary_count_by_click = tuple2dict(android_diary_count_by_click) android_diary_count_by_click = tuple2dict(android_diary_count_by_click)
return android_diary_count_by_click return android_diary_count_by_click
#3.2 获取昨天安卓平台的日记的曝光数 #3.2 获取昨天安卓平台的diary的曝光数
def get_android_diary_rate_by_imp(): def get_android_diary_rate_by_imp():
sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='diary' group by cid order by count(cid) desc" sql = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='diary' group by cid order by count(cid) desc"
android_diary_count_by_imp = con_sql(sql) android_diary_count_by_imp = con_sql(sql)
android_diary_count_by_imp = tuple2dict(android_diary_count_by_imp) android_diary_count_by_imp = tuple2dict(android_diary_count_by_imp)
return android_diary_count_by_imp return android_diary_count_by_imp
#3.3 获取昨天安卓平台的top100点击率的日记 #3.3 获取昨天安卓平台的top100点击率的diary
def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,android_top100_diary_count_by_imp): def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,android_top100_diary_count_by_imp):
android_top100_diary_rate_by_ctr = [] android_top100_diary_rate_by_ctr = []
for i in android_diary_count_by_click: for i in android_diary_count_by_click:
...@@ -104,21 +110,21 @@ def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,and ...@@ -104,21 +110,21 @@ def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,and
return android_top100_diary_rate_by_ctr[:100] if len(android_top100_diary_rate_by_ctr) > 100 else android_top100_diary_rate_by_ctr return android_top100_diary_rate_by_ctr[:100] if len(android_top100_diary_rate_by_ctr) > 100 else android_top100_diary_rate_by_ctr
if __name__ == "__main__": if __name__ == "__main__":
print("开始获取top100点击率的日记...") print("开始获取top100点击率的diary...")
all_diary_count_by_click = get_all_diary_count_by_click() all_diary_count_by_click = get_all_diary_count_by_click()
all_diary_count_by_imp = get_all_diary_count_by_imp() all_diary_count_by_imp = get_all_diary_count_by_imp()
all_top100_diary_rate_by_ctr = get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp) all_top100_diary_rate_by_ctr = get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp)
print("2.1已获得所有平台的top100点击率的日记") print("3.1已获得所有平台的top100点击率的diary")
ios_diary_count_by_click = get_ios_diary_count_by_click() ios_diary_count_by_click = get_ios_diary_count_by_click()
ios_diary_count_by_imp = get_ios_diary_count_by_imp() ios_diary_count_by_imp = get_ios_diary_count_by_imp()
ios_top100_diary_rate_by_ctr = get_ios_top100_diary_rate_by_ctr(ios_diary_count_by_click,ios_diary_count_by_imp) ios_top100_diary_rate_by_ctr = get_ios_top100_diary_rate_by_ctr(ios_diary_count_by_click,ios_diary_count_by_imp)
print("2.2已获得ios平台的top100点击率的日记") print("3.2已获得ios平台的top100点击率的diary")
android_diary_count_by_click = get_android_diary_rate_by_click() android_diary_count_by_click = get_android_diary_rate_by_click()
android_diary_count_by_imp = get_android_diary_rate_by_imp() android_diary_count_by_imp = get_android_diary_rate_by_imp()
android_top100_diary_rate_by_ctr = get_android_top100_diary_rate_by_ctr(android_diary_count_by_click,android_diary_count_by_imp) android_top100_diary_rate_by_ctr = get_android_top100_diary_rate_by_ctr(android_diary_count_by_click,android_diary_count_by_imp)
print("2.3已获得安卓平台的top100点击率的日记") print("3.3已获得安卓平台的top100点击率的diary")
result_lst = [all_top100_diary_rate_by_ctr,ios_top100_diary_rate_by_ctr,android_top100_diary_rate_by_ctr] result_lst = [all_top100_diary_rate_by_ctr,ios_top100_diary_rate_by_ctr,android_top100_diary_rate_by_ctr]
today = datetime.date.today() today = datetime.date.today()
...@@ -126,4 +132,4 @@ if __name__ == "__main__": ...@@ -126,4 +132,4 @@ if __name__ == "__main__":
yesterday = yesterday.strftime("%Y%m%d") yesterday = yesterday.strftime("%Y%m%d")
output_path = "../data/top100_ctr_diary_%s.csv" % yesterday output_path = "../data/top100_ctr_diary_%s.csv" % yesterday
result2file(result_lst,output_path) result2file(result_lst,output_path)
print("已将top100点击率的日记存入文件") print("已将top100点击率的diary存入文件")
# -*- coding: UTF-8 -*-
import pymysql
import datetime
def con_sql(sql):
    """Execute ``sql`` on the database configured below and return all rows.

    Args:
        sql: Complete SQL statement. It is handed to the cursor unchanged,
            so only fixed, trusted statements should be passed in.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the executed statement.

    Raises:
        Any exception raised by the driver while connecting or executing;
        the connection is closed before the exception propagates.
    """
    # NOTE(review): host, user and password are hard-coded here; consider
    # moving them to configuration that is kept out of version control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close on the failure path too, so a bad query cannot leak the
        # connection.
        db.close()
def tuple2dict(tuple_result):
    """Convert SQL result rows into a dict keyed by each row's first column.

    Args:
        tuple_result: Iterable of rows. Every row must be indexable and
            hold at least two items; items beyond the second are ignored.

    Returns:
        A dict mapping ``row[0]`` to ``row[1]``. When a key occurs more than
        once, the value from the last such row wins.
    """
    return {row[0]: row[1] for row in tuple_result}
def result2file(result_lst,fpath):
    """Write the per-platform question tables to ``fpath`` as tab-separated text.

    The file starts with a title, a separator line and the column header.
    Each group in ``result_lst`` is then written one row per line and is
    followed by a separator line; the column header is repeated after every
    group except the last one. Two newlines end the file.

    Args:
        result_lst: Sequence of groups; each group is an iterable of rows and
            each row an iterable of values that are rendered with ``str()``.
        fpath: Path of the output file; an existing file is overwritten.
    """
    separator = "=================================================================\n"
    header = '\t'.join([
        "平台",
        "question_id",
        "question被点击数",
        "question被曝光数",
        "question被点击率",
        "question链接",
    ]) + '\n'
    last_index = len(result_lst) - 1
    with open(fpath,'w') as f:
        f.write("Top 100 question\n")
        f.write(separator)
        f.write(header)
        for index, group in enumerate(result_lst):
            for row in group:
                f.write('\t'.join(str(value) for value in row) + '\n')
            f.write(separator)
            # Decide by position, not by value: comparing the group itself
            # with result_lst[-1] dropped the header after any group that
            # happened to equal the last one (for example two empty lists).
            if index != last_index:
                f.write(header)
        f.write("\n\n")
# 1   Yesterday's question statistics across all platforms
# 1.1 Click count per question cid, all platforms
def get_all_question_count_by_click():
    """Return yesterday's click count for every question cid.

    Reads ``data_feed_click`` rows whose ``cid_type`` is 'question' and whose
    date is the day before the database's ``curdate()``, grouped by cid.
    The query itself applies no top-100 limit.

    Returns:
        dict mapping cid -> number of click rows, built by ``tuple2dict``
        from rows the query orders by descending count.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='question' group by cid order by count(cid) desc"
    return tuple2dict(con_sql(query))
# 1.2 Exposure (impression) count per question cid, all platforms
def get_all_question_count_by_imp():
    """Return yesterday's exposure count for every question cid.

    Reads ``data_feed_exposure`` rows whose ``cid_type`` is 'question' and
    whose date is the day before the database's ``curdate()``, grouped by
    cid. The query itself applies no top-100 limit.

    Returns:
        dict mapping cid -> number of exposure rows, built by ``tuple2dict``
        from rows the query orders by descending count.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='question' group by cid order by count(cid) desc"
    return tuple2dict(con_sql(query))
# 1.3 Top-100 questions by click-through rate, all platforms
def get_all_top100_question_rate_by_ctr(all_question_count_by_click,all_question_count_by_imp):
    """Build the all-platform top-100 question table.

    Args:
        all_question_count_by_click: dict mapping cid -> click count.
        all_question_count_by_imp: dict mapping cid -> exposure count.

    Returns:
        A list of at most 100 tuples
        ``("all", cid, clicks, exposures, ctr, url)``.

        * When no exposure data is available, every clicked cid is listed
          with exposures and ctr set to 0, ordered by clicks descending.
        * Otherwise only cids that have exposure data and more than two
          clicks are listed, with ``ctr = round(clicks / exposures, 4)``,
          ordered by ctr descending.
    """
    def question_url(cid):
        # The link uses the part of the cid after the first '|'. A cid that
        # has no separator is used whole rather than raising ValueError and
        # aborting the whole report.
        return "http://m.igengmei.com/question/" + cid.split('|', 1)[-1] + '/'

    if not all_question_count_by_imp:
        rows = [
            ("all", cid, clicks, 0, 0, question_url(cid))
            for cid, clicks in all_question_count_by_click.items()
        ]
        rows.sort(key=lambda row: row[2], reverse=True)
        # Slicing already copes with lists shorter than 100.
        return rows[:100]

    rows = []
    for cid, clicks in all_question_count_by_click.items():
        if cid in all_question_count_by_imp and clicks > 2:
            exposures = all_question_count_by_imp[cid]
            rows.append(("all", cid, clicks, exposures, round(clicks / exposures, 4), question_url(cid)))
    rows.sort(key=lambda row: row[4], reverse=True)
    return rows[:100]
#2 获取昨天ios平台的top100question
#2.1 获取昨天ios平台的top100点击数的question
def get_ios_question_count_by_click():
    """Fetch yesterday's per-question click counts for iOS devices.

    Same grouped count over ``data_feed_click`` as the all-platform query,
    additionally restricted to rows with ``device_type='AppStore'``.

    Returns:
        The result of ``tuple2dict`` applied to the ``(cid, count)`` rows
        returned by ``con_sql`` (presumably a ``{cid: count}`` mapping --
        verify against ``tuple2dict``).
    """
    query = (
        "select cid,count(cid) from data_feed_click "
        "where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) "
        "and device_type='AppStore' and cid_type='question' "
        "group by cid order by count(cid) desc"
    )
    return tuple2dict(con_sql(query))
#2.2 获取昨天ios平台的top100曝光数的question
def get_ios_question_count_by_imp():
    """Fetch yesterday's per-question exposure counts for iOS devices.

    Grouped count over ``data_feed_exposure`` restricted to question rows
    from yesterday with ``device_type='App Store'``.

    NOTE(review): this query filters on 'App Store' (with a space) while the
    click query filters on 'AppStore' -- confirm both spellings match the
    values actually stored in each table.

    Returns:
        The result of ``tuple2dict`` applied to the ``(cid, count)`` rows
        returned by ``con_sql`` (presumably a ``{cid: count}`` mapping --
        verify against ``tuple2dict``).
    """
    query = (
        "select cid,count(cid) from data_feed_exposure "
        "where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) "
        "and device_type='App Store' and cid_type='question' "
        "group by cid order by count(cid) desc"
    )
    return tuple2dict(con_sql(query))
#2.3 获取昨天ios平台的top100点击率的question
def get_ios_top100_question_rate_by_ctr(ios_question_count_by_click, ios_question_count_by_imp):
    """Rank questions for the iOS platform and return at most the top 100.

    Args:
        ios_question_count_by_click: mapping of question cid -> click count.
        ios_question_count_by_imp: mapping of question cid -> exposure count.
            May be empty (or None) when no exposure data is available.

    Returns:
        A list of at most 100 tuples
        ``("ios", cid, clicks, exposures, ctr, url)`` sorted descending.

        * With exposure data: only cids that also appear in the exposure
          mapping and have more than 2 clicks are kept; rows are sorted by
          click-through rate (clicks / exposures, rounded to 4 decimals).
        * Without exposure data: every clicked cid is kept with exposures
          and ctr reported as 0; rows are sorted by click count.
    """
    has_exposure = bool(ios_question_count_by_imp)
    ranked = []
    for cid, clicks in ios_question_count_by_click.items():
        if has_exposure:
            if cid not in ios_question_count_by_imp or not clicks > 2:
                continue
            exposures = ios_question_count_by_imp[cid]
            # A zero exposure count would divide by zero; report it as 0,
            # the same value the no-exposure branch uses.
            ctr = round(clicks / exposures, 4) if exposures else 0
        else:
            exposures, ctr = 0, 0
        # The URL uses whatever follows the first '|' in the cid; a cid
        # without a separator is used verbatim instead of raising ValueError.
        _, sep, tail = cid.partition('|')
        url = "http://m.igengmei.com/question/" + (tail if sep else cid) + '/'
        ranked.append(("ios", cid, clicks, exposures, ctr, url))
    # Column 4 is the ctr, column 2 the raw click count.
    sort_col = 4 if has_exposure else 2
    ranked.sort(key=lambda row: row[sort_col], reverse=True)
    # Slicing already copes with lists shorter than 100 entries.
    return ranked[:100]
#3 获取昨天安卓平台的top100question
#3.1 获取昨天安卓平台的top100点击数的question
def get_android_question_rate_by_click():
    """Fetch yesterday's per-question click counts for Android devices.

    Despite "rate" in the name, this returns counts, not rates. Grouped
    count over ``data_feed_click`` restricted to question rows from
    yesterday whose ``device_type`` is anything other than ``'AppStore'``.

    Returns:
        The result of ``tuple2dict`` applied to the ``(cid, count)`` rows
        returned by ``con_sql`` (presumably a ``{cid: count}`` mapping --
        verify against ``tuple2dict``).
    """
    query = (
        "select cid,count(cid) from data_feed_click "
        "where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) "
        "and device_type!='AppStore' and cid_type='question' "
        "group by cid order by count(cid) desc"
    )
    return tuple2dict(con_sql(query))
#3.2 获取昨天安卓平台的top100曝光数的question
def get_android_question_rate_by_imp():
    """Fetch yesterday's per-question exposure counts for Android devices.

    Despite "rate" in the name, this returns counts, not rates. Grouped
    count over ``data_feed_exposure`` restricted to question rows from
    yesterday whose ``device_type`` is anything other than ``'App Store'``.

    Returns:
        The result of ``tuple2dict`` applied to the ``(cid, count)`` rows
        returned by ``con_sql`` (presumably a ``{cid: count}`` mapping --
        verify against ``tuple2dict``).
    """
    query = (
        "select cid,count(cid) from data_feed_exposure "
        "where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) "
        "and device_type!='App Store' and cid_type='question' "
        "group by cid order by count(cid) desc"
    )
    return tuple2dict(con_sql(query))
#3.3 获取昨天安卓平台的top100点击率的question
def get_android_top100_question_rate_by_ctr(android_question_count_by_click, android_question_count_by_imp):
    """Rank questions for the Android platform and return at most the top 100.

    Args:
        android_question_count_by_click: mapping of question cid -> click
            count.
        android_question_count_by_imp: mapping of question cid -> exposure
            count. May be empty (or None) when no exposure data is available.

    Returns:
        A list of at most 100 tuples
        ``("android", cid, clicks, exposures, ctr, url)`` sorted descending.

        * With exposure data: only cids that also appear in the exposure
          mapping and have more than 2 clicks are kept; rows are sorted by
          click-through rate (clicks / exposures, rounded to 4 decimals).
        * Without exposure data: every clicked cid is kept with exposures
          and ctr reported as 0; rows are sorted by click count.
    """
    has_exposure = bool(android_question_count_by_imp)
    ranked = []
    for cid, clicks in android_question_count_by_click.items():
        if has_exposure:
            if cid not in android_question_count_by_imp or not clicks > 2:
                continue
            exposures = android_question_count_by_imp[cid]
            # A zero exposure count would divide by zero; report it as 0,
            # the same value the no-exposure branch uses.
            ctr = round(clicks / exposures, 4) if exposures else 0
        else:
            exposures, ctr = 0, 0
        # The URL uses whatever follows the first '|' in the cid; a cid
        # without a separator is used verbatim instead of raising ValueError.
        _, sep, tail = cid.partition('|')
        url = "http://m.igengmei.com/question/" + (tail if sep else cid) + '/'
        ranked.append(("android", cid, clicks, exposures, ctr, url))
    # Column 4 is the ctr, column 2 the raw click count.
    sort_col = 4 if has_exposure else 2
    ranked.sort(key=lambda row: row[sort_col], reverse=True)
    # Slicing already copes with lists shorter than 100 entries.
    return ranked[:100]
if __name__ == "__main__":
    # Script entry point: build the per-platform top-100 question rankings
    # and write them to a dated file under ../data.
    print("开始获取top100点击率的question...")

    # All platforms.
    all_question_count_by_click = get_all_question_count_by_click()
    all_question_count_by_imp = get_all_question_count_by_imp()
    all_top100_question_rate_by_ctr = get_all_top100_question_rate_by_ctr(
        all_question_count_by_click,
        all_question_count_by_imp,
    )
    print("5.1已获得所有平台的top100点击率的question")

    # iOS only.
    ios_question_count_by_click = get_ios_question_count_by_click()
    ios_question_count_by_imp = get_ios_question_count_by_imp()
    ios_top100_question_rate_by_ctr = get_ios_top100_question_rate_by_ctr(
        ios_question_count_by_click,
        ios_question_count_by_imp,
    )
    print("5.2已获得ios平台的top100点击率的question")

    # Android only.
    android_question_count_by_click = get_android_question_rate_by_click()
    android_question_count_by_imp = get_android_question_rate_by_imp()
    android_top100_question_rate_by_ctr = get_android_top100_question_rate_by_ctr(
        android_question_count_by_click,
        android_question_count_by_imp,
    )
    print("5.3已获得安卓平台的top100点击率的question")

    result_lst = [
        all_top100_question_rate_by_ctr,
        ios_top100_question_rate_by_ctr,
        android_top100_question_rate_by_ctr,
    ]

    # The output file is stamped with yesterday's date (YYYYMMDD), matching
    # the day the SQL queries select.
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%Y%m%d")
    output_path = "../data/top100_ctr_question_%s.csv" % yesterday
    result2file(result_lst, output_path)
    print("已将top100点击率的question存入文件")
python getRate.py python getRate.py
python getClickTimes2CountUid.py
python getTop100Diary.py python getTop100Diary.py
python getTop100Answer.py python getTop100Answer.py
python getTop100Question.py
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment