Commit 08be0aa8 authored by 张彦钊's avatar 张彦钊

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

add test files
parents 96b96294 860d7a79
......@@ -2,5 +2,6 @@ data/
*.pyc
.DS_Store
.idea
.ipynb_checkpoints/
This source diff could not be displayed because it is too large. You can view the blob instead.
date,answer_imp_all,answer_imp_ios,answer_imp_android,diary_imp_all,diary_imp_ios,diary_imp_android,activate_uid_ctr_all,activate_uid_ctr_ios,activate_uid_ctr_android,activate_uid_ave_imp_all,activate_uid_ave_imp_beijing,click_answer_uid_all,click_answer_uid_ios,click_answer_uid_android,click_diary_uid_all,click_diary_uid_ios,click_diary_uid_android,click_one_uid_all,click_one_uid_ios,click_one_uid_android
20180817,0.0401,0.0569,0.0336,0.7272,0.8801,0.6678,0.0403,0.0624,0.0269,59.93,92.74,0.0189,0.0178,0.02,0.1094,0.1391,0.0852,0.1242,0.1537,0.1
20180818,0.0383,0.0526,0.0331,0.7134,0.8665,0.658,0.0414,0.065,0.0267,57.73,78.98,0.0153,0.0167,0.0139,0.1052,0.1397,0.0786,0.1227,0.1572,0.0972
20180819,0.0393,0.0538,0.0337,0.7211,0.8763,0.6605,0.0468,0.0726,0.0293,51.29,73.27,0.0175,0.0212,0.0136,0.1115,0.1442,0.085,0.1283,0.1598,0.1027
20180820,0.0374,0.0504,0.032,0.7443,0.8829,0.6866,0.0449,0.068,0.0281,55.88,84.81,0.0157,0.0192,0.0118,0.1142,0.149,0.0846,0.1284,0.1641,0.0981
\ No newline at end of file
**功能:**<br>
&emsp;从data_feed_click和data_feed_exposure两个表中统计一些推荐指标<br>
**用法:**<br>
&emsp;用法1(推荐):<br>
&emsp;&emsp;./start.sh<br>
&emsp;用法2(分开运行):<br>
&emsp;&emsp;python getRate.py<br>
&emsp;&emsp;python getClickTimes2CountUid.py<br>
&emsp;&emsp;python getTop100Diary.py<br>
&emsp;&emsp;python getTop100Answer.py<br>
&emsp;&emsp;python getTop100Question.py<br>
**输入输出:**<br>
&emsp;输入:无<br>
&emsp;输出:<br>
&emsp;&emsp;1rate_features_yesterday.txt<br>
&emsp;&emsp;2click_times_to_count_uid_yesterday.txt<br>
&emsp;&emsp;3top100_ctr_diary_yesterday.txt<br>
&emsp;&emsp;4top100_ctr_answer_yesterday.txt<br>
&emsp;&emsp;5top100_ctr_question_yesterday.csv<br>
&emsp;&emsp;result_all_yesterday.txt<br>
\ No newline at end of file
DIRECTORY_PATH="/data2/models/eda/recommended_indexs/"
\ No newline at end of file
# -*- coding: UTF-8 -*-
import pymysql
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
#1 Click-through rate of users who clicked at least once, all platforms
def get_all_click_one_rate():
    """Return ``["所有", clicks, exposures, rate]`` for yesterday.

    ``clicks`` counts ``data_feed_click`` rows with a non-null device_type;
    ``exposures`` counts ``data_feed_exposure`` rows whose device also appears
    in yesterday's clicks. ``rate`` is clicks / exposures rounded to 4 places,
    or 0.0 when no exposure row matched.
    """
    sql = "select count(device_id) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type is not null"
    click_one_count = con_sql(sql)[0][0]
    sql = "select count(device_id) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_id in (select device_id from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day))"
    impression_one_count = con_sql(sql)[0][0]
    if not impression_one_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["所有", click_one_count, impression_one_count, 0.0]
    all_click_one_rate = click_one_count / impression_one_count
    return ["所有", click_one_count, impression_one_count, round(all_click_one_rate, 4)]
#2 Click-through rate of users who clicked at least once, iOS
def get_ios_click_one_rate():
    """Return ``["苹果", clicks, exposures, rate]`` for yesterday on iOS.

    The click table is filtered with device_type ``'AppStore'`` and the
    exposure table with ``'App Store'``, exactly as the queries spell them.
    ``rate`` is clicks / exposures rounded to 4 places, or 0.0 when no
    exposure row matched.
    """
    sql = "select count(device_id) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore'"
    click_one_count = con_sql(sql)[0][0]
    sql = "select count(device_id) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_id in (select device_id from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore') and device_type='App Store'"
    impression_one_count = con_sql(sql)[0][0]
    if not impression_one_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["苹果", click_one_count, impression_one_count, 0.0]
    ios_click_one_rate = click_one_count / impression_one_count
    return ["苹果", click_one_count, impression_one_count, round(ios_click_one_rate, 4)]
#3 Click-through rate of users who clicked at least once, Android
def get_android_click_one_rate():
    """Return ``["安卓", clicks, exposures, rate]`` for yesterday on Android.

    "Android" is expressed in the queries as device_type different from
    ``'AppStore'`` (click table) / ``'App Store'`` (exposure table).
    ``rate`` is clicks / exposures rounded to 4 places, or 0.0 when no
    exposure row matched.
    """
    sql = "select count(device_id) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore'"
    click_one_count = con_sql(sql)[0][0]
    sql = "select count(device_id) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_id in (select device_id from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore') and device_type!='App Store'"
    impression_one_count = con_sql(sql)[0][0]
    if not impression_one_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["安卓", click_one_count, impression_one_count, 0.0]
    android_click_one_rate = click_one_count / impression_one_count
    return ["安卓", click_one_count, impression_one_count, round(android_click_one_rate, 4)]
if __name__ == "__main__":
    # Script entry: compute the three click-through rates in all/iOS/Android
    # order. The results are only bound to names, not printed or stored.
    all_click_one_rate, ios_click_one_rate, android_click_one_rate = (
        get_all_click_one_rate(),
        get_ios_click_one_rate(),
        get_android_click_one_rate(),
    )
# -*- coding: UTF-8 -*-
import pymysql
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
#1 Share of exposures that are answers, all platforms
def get_all_answer_imp_rate():
    """Return ``["所有", answer_exposures, all_exposures, rate]`` for yesterday.

    Both counts come from ``data_feed_exposure``; the first is restricted to
    ``cid_type='answer'``. ``rate`` is their quotient rounded to 4 places,
    or 0.0 when there were no exposures at all.
    """
    sql = "select count(cid) from data_feed_exposure where cid_type='answer' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    answer_imp_count = con_sql(sql)[0][0]
    sql = "select count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    all_imp_count = con_sql(sql)[0][0]
    if not all_imp_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["所有", answer_imp_count, all_imp_count, 0.0]
    all_answer_imp_rate = answer_imp_count / all_imp_count
    return ["所有", answer_imp_count, all_imp_count, round(all_answer_imp_rate, 4)]
#2 Share of exposures that are answers, iOS
def get_ios_answer_imp_rate():
    """Return ``["苹果", answer_exposures, all_exposures, rate]`` for yesterday.

    Both counts are restricted to ``device_type='App Store'``. ``rate`` is
    their quotient rounded to 4 places, or 0.0 when there were no exposures.
    """
    sql = "select count(cid) from data_feed_exposure where cid_type='answer' and device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    answer_imp_count = con_sql(sql)[0][0]
    sql = "select count(cid) from data_feed_exposure where device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    all_imp_count = con_sql(sql)[0][0]
    if not all_imp_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["苹果", answer_imp_count, all_imp_count, 0.0]
    ios_answer_imp_rate = answer_imp_count / all_imp_count
    return ["苹果", answer_imp_count, all_imp_count, round(ios_answer_imp_rate, 4)]
#3 Share of exposures that are answers, Android
def get_android_answer_imp_rate():
    """Return ``["安卓", answer_exposures, all_exposures, rate]`` for yesterday.

    Both counts are restricted to ``device_type!='App Store'``. ``rate`` is
    their quotient rounded to 4 places, or 0.0 when there were no exposures.
    """
    sql = "select count(cid) from data_feed_exposure where cid_type='answer' and device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    answer_imp_count = con_sql(sql)[0][0]
    sql = "select count(cid) from data_feed_exposure where device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    all_imp_count = con_sql(sql)[0][0]
    if not all_imp_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["安卓", answer_imp_count, all_imp_count, 0.0]
    android_answer_imp_rate = answer_imp_count / all_imp_count
    return ["安卓", answer_imp_count, all_imp_count, round(android_answer_imp_rate, 4)]
if __name__ == "__main__":
    # Script entry: compute the three answer-exposure shares in
    # all/iOS/Android order. The results are only bound to names.
    all_answer_imp_rate, ios_answer_imp_rate, android_answer_imp_rate = (
        get_all_answer_imp_rate(),
        get_ios_answer_imp_rate(),
        get_android_answer_imp_rate(),
    )
# -*- coding: UTF-8 -*-
import pymysql
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
#1 Share of answer-exposed users who clicked an answer, all platforms
def get_all_click_answer_rate():
    """Return ``["所有", clicking_users, exposed_users, rate]`` for yesterday.

    Counts distinct device_ids with ``cid_type='answer'`` in
    ``data_feed_click`` and ``data_feed_exposure`` respectively. ``rate`` is
    their quotient rounded to 4 places, or 0.0 when no device was exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where cid_type='answer' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    click_answer_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where cid_type='answer' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_answer_count = con_sql(sql)[0][0]
    if not impression_answer_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["所有", click_answer_count, impression_answer_count, 0.0]
    all_click_answer_rate = click_answer_count / impression_answer_count
    return ["所有", click_answer_count, impression_answer_count, round(all_click_answer_rate, 4)]
#2 Share of answer-exposed users who clicked an answer, iOS
def get_ios_click_answer_rate():
    """Return ``["苹果", clicking_users, exposed_users, rate]`` for yesterday.

    The click table is filtered with device_type ``'AppStore'`` and the
    exposure table with ``'App Store'``. ``rate`` is the quotient of the two
    distinct-device counts rounded to 4 places, or 0.0 when no device was
    exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where cid_type='answer' and device_type='AppStore' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    click_answer_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where cid_type='answer' and device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_answer_count = con_sql(sql)[0][0]
    if not impression_answer_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["苹果", click_answer_count, impression_answer_count, 0.0]
    ios_click_answer_rate = click_answer_count / impression_answer_count
    return ["苹果", click_answer_count, impression_answer_count, round(ios_click_answer_rate, 4)]
#3 Share of answer-exposed users who clicked an answer, Android
def get_android_click_answer_rate():
    """Return ``["安卓", clicking_users, exposed_users, rate]`` for yesterday.

    "Android" is expressed as device_type different from ``'AppStore'``
    (click table) / ``'App Store'`` (exposure table). ``rate`` is the
    quotient of the two distinct-device counts rounded to 4 places, or 0.0
    when no device was exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where cid_type='answer' and device_type!='AppStore' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    click_answer_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where cid_type='answer' and device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_answer_count = con_sql(sql)[0][0]
    if not impression_answer_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["安卓", click_answer_count, impression_answer_count, 0.0]
    android_click_answer_rate = click_answer_count / impression_answer_count
    return ["安卓", click_answer_count, impression_answer_count, round(android_click_answer_rate, 4)]
if __name__ == "__main__":
    # Script entry: compute the three click-answer user shares in
    # all/iOS/Android order. The results are only bound to names.
    all_click_answer_rate, ios_click_answer_rate, android_click_answer_rate = (
        get_all_click_answer_rate(),
        get_ios_click_answer_rate(),
        get_android_click_answer_rate(),
    )
# -*- coding: UTF-8 -*-
import pymysql
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
#1 Share of diary-exposed users who clicked a diary, all platforms
def get_all_click_diary_rate():
    """Return ``["所有", clicking_users, exposed_users, rate]`` for yesterday.

    Counts distinct device_ids with ``cid_type='diary'`` in
    ``data_feed_click`` and ``data_feed_exposure`` respectively. ``rate`` is
    their quotient rounded to 4 places, or 0.0 when no device was exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where cid_type='diary' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    click_diary_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where cid_type='diary' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_diary_count = con_sql(sql)[0][0]
    if not impression_diary_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["所有", click_diary_count, impression_diary_count, 0.0]
    all_click_diary_rate = click_diary_count / impression_diary_count
    return ["所有", click_diary_count, impression_diary_count, round(all_click_diary_rate, 4)]
#2 Share of diary-exposed users who clicked a diary, iOS
def get_ios_click_diary_rate():
    """Return ``["苹果", clicking_users, exposed_users, rate]`` for yesterday.

    The click table is filtered with device_type ``'AppStore'`` and the
    exposure table with ``'App Store'``. ``rate`` is the quotient of the two
    distinct-device counts rounded to 4 places, or 0.0 when no device was
    exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where cid_type='diary' and device_type='AppStore' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    click_diary_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where cid_type='diary' and device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_diary_count = con_sql(sql)[0][0]
    if not impression_diary_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["苹果", click_diary_count, impression_diary_count, 0.0]
    ios_click_diary_rate = click_diary_count / impression_diary_count
    return ["苹果", click_diary_count, impression_diary_count, round(ios_click_diary_rate, 4)]
#3 Share of diary-exposed users who clicked a diary, Android
def get_android_click_diary_rate():
    """Return ``["安卓", clicking_users, exposed_users, rate]`` for yesterday.

    "Android" is expressed as device_type different from ``'AppStore'``
    (click table) / ``'App Store'`` (exposure table). ``rate`` is the
    quotient of the two distinct-device counts rounded to 4 places, or 0.0
    when no device was exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where cid_type='diary' and device_type!='AppStore' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    click_diary_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where cid_type='diary' and device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_diary_count = con_sql(sql)[0][0]
    if not impression_diary_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["安卓", click_diary_count, impression_diary_count, 0.0]
    android_click_diary_rate = click_diary_count / impression_diary_count
    return ["安卓", click_diary_count, impression_diary_count, round(android_click_diary_rate, 4)]
if __name__ == "__main__":
    # Script entry: compute the three click-diary user shares in
    # all/iOS/Android order. The results are only bound to names.
    all_click_diary_rate, ios_click_diary_rate, android_click_diary_rate = (
        get_all_click_diary_rate(),
        get_ios_click_diary_rate(),
        get_android_click_diary_rate(),
    )
import datetime
import pymysql
# strftime("%s") is a platform-specific extension that Python does not
# guarantee; time.mktime on the date's timetuple gives the same local-midnight
# epoch seconds portably.
import time

# Timestamp one week ago (7 days)
my_date1 = datetime.date.today() - datetime.timedelta(days=7)
my_tm1 = int(time.mktime(my_date1.timetuple()))
# Timestamp two weeks ago (14 days)
my_date2 = datetime.date.today() - datetime.timedelta(days=14)
my_tm2 = int(time.mktime(my_date2.timetuple()))
# Timestamp one month ago (30 days)
my_date3 = datetime.date.today() - datetime.timedelta(days=30)
my_tm3 = int(time.mktime(my_date3.timetuple()))
# Timestamp two months ago (60 days)
my_date4 = datetime.date.today() - datetime.timedelta(days=60)
my_tm4 = int(time.mktime(my_date4.timetuple()))
# Timestamp three months ago (90 days)
my_date5 = datetime.date.today() - datetime.timedelta(days=90)
my_tm5 = int(time.mktime(my_date5.timetuple()))
def get_rate_detail(platform):
    """Count yesterday's clicking devices, bucketed by first-exposure age.

    Each returned row is ``(label, distinct_device_count)`` with labels
    '0-7', '7-14', '14-30', '30-60', '60-90' and '90+'. A device falls in a
    bucket according to whether ``data_feed_exposure`` holds rows for it older
    than the module-level thresholds ``my_tm1`` .. ``my_tm5``; e.g. '7-14'
    means a row exists before ``my_tm1`` but none before ``my_tm2``.

    Args:
        platform: "ios" or "android"; any other value selects every row whose
            device_type is not null.

    Returns:
        The rows from ``cursor.fetchall()``.
    """
    if platform == "ios":
        device_filter = "='AppStore'"
    elif platform == "android":
        device_filter = "!='AppStore'"
    else:
        device_filter = " is not null"
    # Continuation lines stay at column 0 so the SQL text sent to the server
    # is unchanged (leading whitespace would become part of the literal).
    sql = "select '0-7' as label,count(distinct(device_id)) from data_feed_click \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {1})) \
union all \
select '7-14' as label,count(distinct(device_id)) from data_feed_click \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {2}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {1})) \
union all \
select '14-30' as label,count(distinct(device_id)) from data_feed_click \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {3}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {2})) \
union all \
select '30-60' as label,count(distinct(device_id)) from data_feed_click \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {4}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {3})) \
union all \
select '60-90' as label,count(distinct(device_id)) from data_feed_click \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {5}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {4})) \
union all \
select '90+' as label,count(distinct(device_id)) from data_feed_click \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {5}))".format(device_filter,my_tm1,my_tm2,my_tm3,my_tm4,my_tm5)
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
def result2dict(result):
    """Turn ``(label, count)`` rows into ``{label: "count--percent%"}``.

    The percentage is each count's share of the total of all counts, rounded
    to 2 places. When the total is 0 every share is reported as 0.0 instead
    of raising ZeroDivisionError. The total is also printed as ``sum:<n>``.

    Args:
        result: iterable of 2-item rows (label, count); iterated twice, so it
            must be re-iterable (e.g. the tuple of tuples a cursor returns).

    Returns:
        dict mapping each label to its formatted string.
    """
    sum_count = sum(row[1] for row in result)
    dct = {}
    for label, count in ((row[0], row[1]) for row in result):
        share = round(count / sum_count * 100, 2) if sum_count else 0.0
        dct[label] = "{}--{}%".format(count, share)
    print("sum:{}".format(sum_count))
    return dct
if __name__ == '__main__':
    # Script entry: fetch and format the bucket breakdown for all platforms,
    # then iOS, then Android. Each result2dict call prints its own total.
    all_rows = get_rate_detail("all")
    have_click_uid_detail_all = result2dict(all_rows)
    ios_rows = get_rate_detail("ios")
    have_click_uid_detail_ios = result2dict(ios_rows)
    android_rows = get_rate_detail("android")
    have_click_uid_detail_android = result2dict(android_rows)
# -*- coding: UTF-8 -*-
import pymysql
import datetime
import pandas as pd
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
def get_yesterday_date():
    """Return yesterday's local date formatted as ``YYYYMMDD``."""
    one_day = datetime.timedelta(days=1)
    return (datetime.date.today() - one_day).strftime("%Y%m%d")
def get_click_times_to_count_uid_df():
    """Build the distribution "click count -> number of devices" for yesterday.

    Queries the per-device click count from ``data_feed_click``, then groups
    by that count so the ``uid`` column of the returned frame holds how many
    devices clicked exactly ``click_times`` times.

    Returns:
        DataFrame with columns ``click_times`` and ``uid``.
    """
    query = "select device_id,count(cid_type) click_times from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) group by device_id order by click_times desc"
    rows = con_sql(query)
    per_device = pd.DataFrame({
        "uid": [row[0] for row in rows],
        "click_times": [row[1] for row in rows],
    })
    grouped = per_device.groupby(by="click_times", as_index=False)
    return grouped.count()
def df2file(df,fpath):
    """Write section 2.1 (click-count distribution) of the report to ``fpath``.

    Args:
        df: DataFrame whose first column is the click count and whose second
            column is the number of devices with that count.
        fpath: output path; the file is overwritten.
    """
    tplt = "{0:^10}\t{1:^10}\n"
    # Explicit UTF-8: the headings contain Chinese text, so relying on the
    # locale's default encoding can fail or produce a different file.
    with open(fpath, "w", encoding="utf-8") as f:
        f.write("#2. Top特征\n")
        f.write("=================================================================\n")
        f.write("2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)\n")
        f.write(tplt.format("click_times","count_uid"))
        # itertuples yields plain positional tuples; the former row[1][0]
        # relied on positional integer lookup in a label-indexed Series,
        # which pandas has deprecated.
        for row in df.itertuples(index=False, name=None):
            f.write(tplt.format(row[0], row[1]))
        f.write("\n\n")
def main():
    """Generate yesterday's click-count distribution file."""
    print("2.开始获取Top特征...")
    # The report date (yesterday) is embedded in the output file name.
    report_date = get_yesterday_date()
    output_path = "/data2/models/eda/recommended_indexs/2click_times_to_count_uid_%s.txt" % report_date
    df2file(get_click_times_to_count_uid_df(), output_path)
    print("2.1已将用户点击次数分布存入文件")


if __name__ == '__main__':
    main()
\ No newline at end of file
# -*- coding: UTF-8 -*-
import pymysql
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
#1 Share of exposed users with zero clicks, all platforms
def get_all_click_zero_rate():
    """Return ``["所有", zero_click_users, exposed_users, rate]`` for yesterday.

    ``zero_click_users`` is the distinct-device count of
    ``data_feed_exposure`` minus that of ``data_feed_click``. ``rate`` is
    zero_click_users / exposed_users rounded to 4 places, or 0.0 when no
    device was exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    clicked_uid_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_zero_count = con_sql(sql)[0][0]
    click_zero_count = impression_zero_count - clicked_uid_count
    if not impression_zero_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["所有", click_zero_count, impression_zero_count, 0.0]
    all_click_zero_rate = click_zero_count / impression_zero_count
    return ["所有", click_zero_count, impression_zero_count, round(all_click_zero_rate, 4)]
#2 Share of exposed users with zero clicks, iOS
def get_ios_click_zero_rate():
    """Return ``["苹果", zero_click_users, exposed_users, rate]`` for yesterday.

    The click table is filtered with device_type ``'AppStore'`` and the
    exposure table with ``'App Store'``. ``zero_click_users`` is the exposed
    distinct-device count minus the clicking one; ``rate`` is its share of
    exposed users rounded to 4 places, or 0.0 when no device was exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where device_type='AppStore' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    clicked_uid_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_zero_count = con_sql(sql)[0][0]
    click_zero_count = impression_zero_count - clicked_uid_count
    if not impression_zero_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["苹果", click_zero_count, impression_zero_count, 0.0]
    ios_click_zero_rate = click_zero_count / impression_zero_count
    return ["苹果", click_zero_count, impression_zero_count, round(ios_click_zero_rate, 4)]
#3 Share of exposed users with zero clicks, Android
def get_android_click_zero_rate():
    """Return ``["安卓", zero_click_users, exposed_users, rate]`` for yesterday.

    "Android" is expressed as device_type different from ``'AppStore'``
    (click table) / ``'App Store'`` (exposure table). ``zero_click_users`` is
    the exposed distinct-device count minus the clicking one; ``rate`` is its
    share of exposed users rounded to 4 places, or 0.0 when no device was
    exposed.
    """
    sql = "select count(distinct(device_id)) from data_feed_click where device_type!='AppStore' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    clicked_uid_count = con_sql(sql)[0][0]
    sql = "select count(distinct(device_id)) from data_feed_exposure where device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    impression_zero_count = con_sql(sql)[0][0]
    click_zero_count = impression_zero_count - clicked_uid_count
    if not impression_zero_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["安卓", click_zero_count, impression_zero_count, 0.0]
    android_click_zero_rate = click_zero_count / impression_zero_count
    return ["安卓", click_zero_count, impression_zero_count, round(android_click_zero_rate, 4)]
if __name__ == "__main__":
    # Script entry: compute the three zero-click user shares in
    # all/iOS/Android order. The results are only bound to names.
    all_click_zero_rate, ios_click_zero_rate, android_click_zero_rate = (
        get_all_click_zero_rate(),
        get_ios_click_zero_rate(),
        get_android_click_zero_rate(),
    )
import datetime
import pymysql
# strftime("%s") is a platform-specific extension that Python does not
# guarantee; time.mktime on the date's timetuple gives the same local-midnight
# epoch seconds portably.
import time

# Timestamp one week ago (7 days)
my_date1 = datetime.date.today() - datetime.timedelta(days=7)
my_tm1 = int(time.mktime(my_date1.timetuple()))
# Timestamp two weeks ago (14 days)
my_date2 = datetime.date.today() - datetime.timedelta(days=14)
my_tm2 = int(time.mktime(my_date2.timetuple()))
# Timestamp one month ago (30 days)
my_date3 = datetime.date.today() - datetime.timedelta(days=30)
my_tm3 = int(time.mktime(my_date3.timetuple()))
# Timestamp two months ago (60 days)
my_date4 = datetime.date.today() - datetime.timedelta(days=60)
my_tm4 = int(time.mktime(my_date4.timetuple()))
# Timestamp three months ago (90 days)
my_date5 = datetime.date.today() - datetime.timedelta(days=90)
my_tm5 = int(time.mktime(my_date5.timetuple()))
def get_click_zero_uid_count(platform):
    """Count yesterday's exposed-but-not-clicking devices by first-exposure age.

    A device is counted when it has an exposure row yesterday and no click row
    yesterday. Buckets '0-7', '7-14', '14-30', '30-60', '60-90' and '90+' are
    decided by whether ``data_feed_exposure`` holds rows for the device older
    than the module-level thresholds ``my_tm1`` .. ``my_tm5``.

    Args:
        platform: "ios", "android" or "all" (any other value behaves as "all").

    Returns:
        dict mapping each bucket label to its distinct-device count.
    """
    # The two tables spell the iOS device_type differently ('App Store' in
    # exposures, 'AppStore' in clicks), hence one filter per table.
    if platform == "ios":
        exposure_filter, click_filter = "='App Store'", "='AppStore'"
    elif platform == "android":
        exposure_filter, click_filter = "!='App Store'", "!='AppStore'"
    else:
        exposure_filter = click_filter = " is not null"
    # Continuation lines stay at column 0 so the SQL text sent to the server
    # is unchanged (leading whitespace would become part of the literal).
    sql = "select '0-7' as label,count(distinct(device_id)) from data_feed_exposure \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id not in \
(select distinct(device_id) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_type{1}) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {2})) \
union all \
select '7-14' as label,count(distinct(device_id)) from data_feed_exposure \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id not in \
(select distinct(device_id) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_type{1}) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {3}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {2})) \
union all \
select '14-30' as label,count(distinct(device_id)) from data_feed_exposure \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id not in \
(select distinct(device_id) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_type{1}) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {4}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {3})) \
union all \
select '30-60' as label,count(distinct(device_id)) from data_feed_exposure \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id not in \
(select distinct(device_id) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_type{1}) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {5}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {4})) \
union all \
select '60-90' as label,count(distinct(device_id)) from data_feed_exposure \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id not in \
(select distinct(device_id) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_type{1}) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id not in \
(select distinct(device_id) from data_feed_exposure \
where time < {6}) \
and device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {5})) \
union all \
select '90+' as label,count(distinct(device_id)) from data_feed_exposure \
where device_type{0} \
and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_id not in \
(select distinct(device_id) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) \
and device_type{1}) \
and device_id in \
(select distinct(device_id) \
from data_feed_exposure \
where device_id in \
(select distinct(device_id) from data_feed_exposure \
where time < {6}))".format(exposure_filter,click_filter,my_tm1,my_tm2,my_tm3,my_tm4,my_tm5)
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
    return {row[0]: row[1] for row in result}
if __name__ == '__main__':
    # Script entry: compute the zero-click bucket counts for all platforms,
    # then iOS, then Android. The results are only bound to names.
    no_click_uid_detail_all, no_click_uid_detail_ios, no_click_uid_detail_android = (
        get_click_zero_uid_count("all"),
        get_click_zero_uid_count("ios"),
        get_click_zero_uid_count("android"),
    )
# -*- coding: UTF-8 -*-
import pymysql
def con_sql(sql):
    """Run ``sql`` against the ``jerry_test`` database and return all rows.

    Args:
        sql: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        The result of ``cursor.fetchall()`` for the statement.
    """
    # NOTE(review): host and root credentials are hard-coded here; consider
    # loading them from configuration instead of source control.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        db.close()
#1 Share of exposures that are diaries, all platforms
def get_all_diary_imp_rate():
    """Return ``["所有", diary_exposures, all_exposures, rate]`` for yesterday.

    Both counts come from ``data_feed_exposure``; the first is restricted to
    ``cid_type='diary'``. ``rate`` is their quotient rounded to 4 places,
    or 0.0 when there were no exposures at all.
    """
    sql = "select count(cid) from data_feed_exposure where cid_type='diary' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    diary_imp_count = con_sql(sql)[0][0]
    sql = "select count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    all_imp_count = con_sql(sql)[0][0]
    if not all_imp_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["所有", diary_imp_count, all_imp_count, 0.0]
    all_diary_imp_rate = diary_imp_count / all_imp_count
    return ["所有", diary_imp_count, all_imp_count, round(all_diary_imp_rate, 4)]
#2 Share of exposures that are diaries, iOS
def get_ios_diary_imp_rate():
    """Return ``["苹果", diary_exposures, all_exposures, rate]`` for yesterday.

    Both counts are restricted to ``device_type='App Store'``. ``rate`` is
    their quotient rounded to 4 places, or 0.0 when there were no exposures.
    """
    sql = "select count(cid) from data_feed_exposure where cid_type='diary' and device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    diary_imp_count = con_sql(sql)[0][0]
    sql = "select count(cid) from data_feed_exposure where device_type='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    all_imp_count = con_sql(sql)[0][0]
    if not all_imp_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["苹果", diary_imp_count, all_imp_count, 0.0]
    ios_diary_imp_rate = diary_imp_count / all_imp_count
    return ["苹果", diary_imp_count, all_imp_count, round(ios_diary_imp_rate, 4)]
#3 Share of exposures that are diaries, Android
def get_android_diary_imp_rate():
    """Return ``["安卓", diary_exposures, all_exposures, rate]`` for yesterday.

    Both counts are restricted to ``device_type!='App Store'``. ``rate`` is
    their quotient rounded to 4 places, or 0.0 when there were no exposures.
    """
    sql = "select count(cid) from data_feed_exposure where cid_type='diary' and device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    diary_imp_count = con_sql(sql)[0][0]
    sql = "select count(cid) from data_feed_exposure where device_type!='App Store' and from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"
    all_imp_count = con_sql(sql)[0][0]
    if not all_imp_count:
        # Empty day: avoid ZeroDivisionError and report a zero rate instead.
        return ["安卓", diary_imp_count, all_imp_count, 0.0]
    android_diary_imp_rate = diary_imp_count / all_imp_count
    return ["安卓", diary_imp_count, all_imp_count, round(android_diary_imp_rate, 4)]
if __name__ == "__main__":
    # Script entry: compute the three diary-exposure shares in
    # all/iOS/Android order. The results are only bound to names.
    all_diary_imp_rate, ios_diary_imp_rate, android_diary_imp_rate = (
        get_all_diary_imp_rate(),
        get_ios_diary_imp_rate(),
        get_android_diary_imp_rate(),
    )
# -*- coding: UTF-8 -*-
import datetime
from getAnswerImpRate import get_all_answer_imp_rate,get_ios_answer_imp_rate,get_android_answer_imp_rate
from getDiaryImpRate import get_all_diary_imp_rate,get_ios_diary_imp_rate,get_android_diary_imp_rate
from getActivateUidCtr import get_all_click_one_rate,get_ios_click_one_rate,get_android_click_one_rate
from getClickAnswerUidRate import get_all_click_answer_rate,get_ios_click_answer_rate,get_android_click_answer_rate
from getClickDiaryUidRate import get_all_click_diary_rate,get_ios_click_diary_rate,get_android_click_diary_rate
from getClickZeroUidRate import get_all_click_zero_rate,get_ios_click_zero_rate,get_android_click_zero_rate
def get_yesterday_date():
    """Return yesterday's local date formatted as ``YYYYMMDD``."""
    one_day = datetime.timedelta(days=1)
    return (datetime.date.today() - one_day).strftime("%Y%m%d")
def result2file(fpath):
    """Write yesterday's ratio-feature report (sections 1.1-1.7) to ``fpath``.

    The file starts with an overview, followed by one table per section. Each
    table has three rows (all / iOS / Android) produced by the imported getter
    functions, whose 4th item (a fraction) is rendered as a percentage. A
    progress message is printed after each section.

    Args:
        fpath: output path; the file is overwritten.
    """
    tplt = "{0:\u3000<6}\t{1:\u3000<15}\t{2:\u3000<15}\t{3:\u3000<15}\n"
    # One entry per section: (title line, column headers,
    # getters in all/ios/android order, progress message).
    sections = [
        ("#1.1answer曝光占比(=answer被曝光数/总cid被曝光数)\n",
         ("平台","answer被曝光数","总cid被曝光数","answer被曝光数占比"),
         (get_all_answer_imp_rate, get_ios_answer_imp_rate, get_android_answer_imp_rate),
         "1.1已将answer曝光占比存入文件"),
        ("#1.2diary曝光占比(=diary被曝光数/总cid被曝光数)\n",
         ("平台","diary被曝光数","总cid被曝光数","diary被曝光数占比"),
         (get_all_diary_imp_rate, get_ios_diary_imp_rate, get_android_diary_imp_rate),
         "1.2已将diary曝光占比存入文件"),
        ("#1.3活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数)\n",
         ("平台","active用户点击次数","active用户曝光次数","active用户点击率"),
         (get_all_click_one_rate, get_ios_click_one_rate, get_android_click_one_rate),
         "1.3已将活跃用户点击率存入文件"),
        # The last header of 1.5 and 1.6 previously lacked its leading "点".
        ("#1.5点击answer用户占比(=点击answer用户数/曝光answer用户数)\n",
         ("平台","点击answer用户数","曝光answer用户数","点击answer用户占比"),
         (get_all_click_answer_rate, get_ios_click_answer_rate, get_android_click_answer_rate),
         "1.5已将点击answer用户占比存入文件"),
        ("#1.6点击diary用户占比(=点击diary用户数/曝光diary用户数)\n",
         ("平台","点击diary用户数","曝光diary用户数","点击diary用户占比"),
         (get_all_click_diary_rate, get_ios_click_diary_rate, get_android_click_diary_rate),
         "1.6已将点击diary用户占比存入文件"),
        ("#1.7无点击用户占比(=无点击用户数/有曝光用户数)\n",
         ("平台","no点击用户数","have曝光用户数","no点击用户占比"),
         (get_all_click_zero_rate, get_ios_click_zero_rate, get_android_click_zero_rate),
         "1.7已将无点击用户占比存入文件"),
    ]
    # Explicit UTF-8: the report is Chinese text, so relying on the locale's
    # default encoding can fail or produce a different file.
    with open(fpath, 'w', encoding='utf-8') as f:
        line = """数据日期:{}
内容概览:以下所有数据都是昨天一天的首页的
1. 比例特征
1.1 answer曝光占比(=answer被曝光数/总cid被曝光数)
1.2 diary曝光占比(=diary被曝光数/总cid被曝光数)
1.3 活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数)
1.4 活跃用户平均每天曝光次数(=活跃用户曝光数/独立活跃用户数)
1.5 点击answer用户占比(=点击answer用户数/曝光answer用户数)
1.6 点击diary用户占比(=点击diary用户数/曝光diary用户数)
1.7 无点击用户占比(=无点击用户数/有曝光用户数)
1.8 无点击用户数分布(=无点击用户∩激活用户数 / 激活用户数) #注意:(]里面的数字指的是距离当前时间的天数
2.Top特征
2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)
2.2 Top 100 diary (sorted by ctr)
2.3 Top 100 Answer (sorted by ctr)
2.4 Top 100 Question (sorted by ctr)
具体内容:以下所有数据都昨天一天的首页的
""".format(get_yesterday_date())
        f.write(line)
        f.write("#1. 比例特征\n")
        f.write("=================================================================\n")
        for title, headers, getters, done_message in sections:
            f.write(title)
            f.write(tplt.format(*headers))
            # Query all three platforms before writing any row, so a failing
            # query never leaves a half-written table.
            rows = [getter() for getter in getters]
            for row in rows:
                f.write(tplt.format(row[0], row[1], row[2], "{}%".format(round(row[3]*100,2))))
            f.write('\n')
            print(done_message)
def main():
    """Extract every rate feature and write the report file.

    The report path is built from the suffix returned by
    ``get_yesterday_date()``; the actual extraction and writing is delegated
    to ``result2file``. Progress messages are printed before and after.
    """
    report_path = "/data2/models/eda/recommended_indexs/1rate_features_%s.txt" % get_yesterday_date()
    print("开始获取比例特征...")
    result2file(report_path)
    print("已完成所有比例特征提取")
# Script entry point: only run the extraction when executed directly.
if __name__ == '__main__':
    main()
# -*- coding: UTF-8 -*-
import pymysql
import datetime
def con_sql(sql):
    """Run *sql* against the ``jerry_test`` database and return every row.

    Args:
        sql: complete SQL statement, executed as-is (no parameters are bound).

    Returns:
        Whatever ``cursor.fetchall()`` returns for the statement.

    Raises:
        Any pymysql error raised while connecting or executing; the
        connection is closed before the error propagates.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or the environment.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        db.close()
def tuple2dict(tuple_result):
    """Turn a sequence of SQL rows into a dict.

    The first column of each row becomes the key and the second column the
    value; when a key repeats, the last row wins.
    """
    return {row[0]: row[1] for row in tuple_result}
def result2file(result_lst,fpath):
    """Write the per-platform top-100 answer tables to *fpath*.

    Args:
        result_lst: list of tables, one per platform. Each table is a list of
            rows ``(platform, answer_id, clicks, exposures, ctr, url)`` where
            ``ctr`` is a fraction; it is written as a percentage.
        fpath: destination path; an existing file is overwritten.
    """
    tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
    separator = "=================================================================\n"
    header = tplt.format("平台","answer_id","点击数","曝光数","点击率","answer链接")
    last_pos = len(result_lst) - 1
    # The report contains Chinese text, so the encoding is fixed instead of
    # depending on the platform default (which may be ASCII under cron).
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write("2.3 Top 100 Answer\n")
        f.write(separator)
        f.write(header)
        for pos, rows in enumerate(result_lst):
            for row in rows:
                f.write(tplt.format(row[0],row[1],row[2],row[3],"{}%".format(round(row[4]*100,2)),row[5]))
            f.write(separator)
            # Decide by position, not by value: two equal tables (e.g. both
            # empty) must not suppress the header of the following table.
            if pos != last_pos:
                f.write(header)
        f.write("\n\n")
#1 获取昨天所有平台的top100answer
#1.1 获取昨天所有平台的top100点击数的answer
def get_all_answer_count_by_click():
    """Return ``{cid: click count}`` for yesterday's answer clicks, all platforms.

    Runs a grouped count over ``data_feed_click`` through ``con_sql`` and
    converts the rows with ``tuple2dict``. No row limit is applied.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='answer' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#1.2 获取昨天所有平台的top100曝光数的answer
def get_all_answer_count_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's answer exposures, all platforms.

    Runs a grouped count over ``data_feed_exposure`` through ``con_sql`` and
    converts the rows with ``tuple2dict``. No row limit is applied.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='answer' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#1.3 获取昨天所有平台的top100点击率的answer
def get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp):
    """Rank answers by click-through rate and keep at most the best 100.

    Only cids present in both dicts with more than 2 clicks are ranked.

    Args:
        all_answer_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        all_answer_count_by_imp: ``{cid: exposures}``.

    Returns:
        List of ``("所有", cid, clicks, exposures, ctr, url)`` tuples sorted by
        ctr (rounded to 4 places) in descending order.
    """
    ranked = []
    for cid, clicks in all_answer_count_by_click.items():
        if cid not in all_answer_count_by_imp or clicks <= 2:
            continue
        exposures = all_answer_count_by_imp[cid]
        # The numeric id after the '|' forms the public answer link.
        link = "http://m.igengmei.com/answer/" + cid[cid.index('|')+1:] + '/'
        ranked.append(("所有", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[4], reverse=True)
    return ranked[:100]
#2 获取昨天ios平台的top100answer
#2.1 获取昨天ios平台的top100点击数的answer
def get_ios_answer_count_by_click():
    """Return ``{cid: click count}`` for yesterday's answer clicks on iOS.

    Filters ``data_feed_click`` on ``device_type='AppStore'``.
    NOTE(review): the exposure query spells the value 'App Store' -- confirm
    that each table really stores a different spelling.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='answer' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#2.2 获取昨天ios平台的top100曝光数的answer
def get_ios_answer_count_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's answer exposures on iOS.

    Filters ``data_feed_exposure`` on ``device_type='App Store'``.
    NOTE(review): the click query spells the value 'AppStore' -- confirm that
    each table really stores a different spelling.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='answer' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#2.3 获取昨天ios平台的top100点击率的answer
def get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp):
    """Rank iOS answers by click-through rate and keep at most the best 100.

    Only cids present in both dicts with more than 2 clicks are ranked.

    Args:
        ios_answer_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        ios_answer_count_by_imp: ``{cid: exposures}``.

    Returns:
        List of ``("苹果", cid, clicks, exposures, ctr, url)`` tuples sorted by
        ctr (rounded to 4 places) in descending order.
    """
    ranked = []
    for cid, clicks in ios_answer_count_by_click.items():
        if cid not in ios_answer_count_by_imp or clicks <= 2:
            continue
        exposures = ios_answer_count_by_imp[cid]
        # The numeric id after the '|' forms the public answer link.
        link = "http://m.igengmei.com/answer/" + cid[cid.index('|')+1:] + '/'
        ranked.append(("苹果", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[4], reverse=True)
    return ranked[:100]
#3 获取昨天安卓平台的top100answer
#3.1 获取昨天安卓平台的top100点击数的answer
def get_android_answer_rate_by_click():
    """Return ``{cid: click count}`` for yesterday's answer clicks off iOS.

    Every ``data_feed_click`` row whose ``device_type`` is not 'AppStore' is
    counted, i.e. "Android" here means "anything but the App Store".
    Despite the name, the values are raw counts, not rates.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='answer' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#3.2 获取昨天安卓平台的top100曝光数的answer
def get_android_answer_rate_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's answer exposures off iOS.

    Every ``data_feed_exposure`` row whose ``device_type`` is not 'App Store'
    is counted, i.e. "Android" here means "anything but the App Store".
    Despite the name, the values are raw counts, not rates.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='answer' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#3.3 获取昨天安卓平台的top100点击率的answer
def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp):
    """Rank Android answers by click-through rate and keep at most the best 100.

    Only cids present in both dicts with more than 2 clicks are ranked.

    Args:
        android_answer_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        android_answer_count_by_imp: ``{cid: exposures}``.

    Returns:
        List of ``("安卓", cid, clicks, exposures, ctr, url)`` tuples sorted by
        ctr (rounded to 4 places) in descending order.
    """
    ranked = []
    for cid, clicks in android_answer_count_by_click.items():
        if cid not in android_answer_count_by_imp or clicks <= 2:
            continue
        exposures = android_answer_count_by_imp[cid]
        # The numeric id after the '|' forms the public answer link.
        link = "http://m.igengmei.com/answer/" + cid[cid.index('|')+1:] + '/'
        ranked.append(("安卓", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[4], reverse=True)
    return ranked[:100]
if __name__ == "__main__":
    # Build the CTR ranking for all platforms, then iOS, then Android.
    all_answer_count_by_click = get_all_answer_count_by_click()
    all_answer_count_by_imp = get_all_answer_count_by_imp()
    all_top100_answer_rate_by_ctr = get_all_top100_answer_rate_by_ctr(
        all_answer_count_by_click, all_answer_count_by_imp)

    ios_answer_count_by_click = get_ios_answer_count_by_click()
    ios_answer_count_by_imp = get_ios_answer_count_by_imp()
    ios_top100_answer_rate_by_ctr = get_ios_top100_answer_rate_by_ctr(
        ios_answer_count_by_click, ios_answer_count_by_imp)

    android_answer_count_by_click = get_android_answer_rate_by_click()
    android_answer_count_by_imp = get_android_answer_rate_by_imp()
    android_top100_answer_rate_by_ctr = get_android_top100_answer_rate_by_ctr(
        android_answer_count_by_click, android_answer_count_by_imp)

    result_lst = [
        all_top100_answer_rate_by_ctr,
        ios_top100_answer_rate_by_ctr,
        android_top100_answer_rate_by_ctr,
    ]

    # The report file is named after yesterday's date (YYYYMMDD).
    today = datetime.date.today()
    yesterday = (today - datetime.timedelta(days=1)).strftime("%Y%m%d")
    output_path = "/data2/models/eda/recommended_indexs/4top100_ctr_answer_%s.txt" % yesterday
    result2file(result_lst, output_path)
    print("2.3已将top100点击率的answer存入文件")
# -*- coding: UTF-8 -*-
import pymysql
import datetime
def con_sql(sql):
    """Run *sql* against the ``jerry_test`` database and return every row.

    Args:
        sql: complete SQL statement, executed as-is (no parameters are bound).

    Returns:
        Whatever ``cursor.fetchall()`` returns for the statement.

    Raises:
        Any pymysql error raised while connecting or executing; the
        connection is closed before the error propagates.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or the environment.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        db.close()
def tuple2dict(tuple_result):
    """Turn a sequence of SQL rows into a dict.

    The first column of each row becomes the key and the second column the
    value; when a key repeats, the last row wins.
    """
    return {row[0]: row[1] for row in tuple_result}
def result2file(result_lst,fpath):
    """Write the per-platform top-100 diary tables to *fpath*.

    Args:
        result_lst: list of tables, one per platform. Each table is a list of
            rows ``(platform, diary_id, clicks, exposures, ctr, url)`` where
            ``ctr`` is a fraction; it is written as a percentage.
        fpath: destination path; an existing file is overwritten.
    """
    tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
    separator = "=================================================================\n"
    header = tplt.format("平台","diary_id","点击数","曝光数","点击率","diary链接")
    last_pos = len(result_lst) - 1
    # The report contains Chinese text, so the encoding is fixed instead of
    # depending on the platform default (which may be ASCII under cron).
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write("2.2 Top 100 diary\n")
        f.write(separator)
        f.write(header)
        for pos, rows in enumerate(result_lst):
            for row in rows:
                f.write(tplt.format(row[0],row[1],row[2],row[3],"{}%".format(round(row[4]*100,2)),row[5]))
            f.write(separator)
            # Decide by position, not by value: two equal tables (e.g. both
            # empty) must not suppress the header of the following table.
            if pos != last_pos:
                f.write(header)
        f.write("\n\n")
#1 获取昨天所有平台的top100diary((sorted by ctr))
#1.1 获取昨天所有平台的diary的点击数
def get_all_diary_count_by_click():
    """Return ``{cid: click count}`` for yesterday's diary clicks, all platforms.

    Runs a grouped count over ``data_feed_click`` through ``con_sql`` and
    converts the rows with ``tuple2dict``.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='diary' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#1.2 获取昨天所有平台的diary的曝光数
def get_all_diary_count_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's diary exposures, all platforms.

    Runs a grouped count over ``data_feed_exposure`` through ``con_sql`` and
    converts the rows with ``tuple2dict``.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='diary' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#1.3 获取昨天所有平台的top100点击率的diary
def get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp):
    """Rank diaries by click-through rate and keep at most the best 100.

    Only cids present in both dicts with more than 4 clicks are ranked.

    Args:
        all_diary_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        all_diary_count_by_imp: ``{cid: exposures}``.

    Returns:
        List of ``("所有", cid, clicks, exposures, ctr, url)`` tuples sorted by
        ctr (rounded to 4 places) in descending order.
    """
    ranked = []
    for cid, clicks in all_diary_count_by_click.items():
        if cid not in all_diary_count_by_imp or clicks <= 4:
            continue
        exposures = all_diary_count_by_imp[cid]
        # The numeric id after the '|' forms the public diary-book link.
        link = "http://m.igengmei.com/diary_book/" + cid[cid.index('|')+1:] + '/'
        ranked.append(("所有", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[4], reverse=True)
    return ranked[:100]
#2 获取昨天ios平台的top100diary(sorted by ctr)
#2.1 获取昨天ios平台的diary的点击数
def get_ios_diary_count_by_click():
    """Return ``{cid: click count}`` for yesterday's diary clicks on iOS.

    Filters ``data_feed_click`` on ``device_type='AppStore'``.
    NOTE(review): the exposure query spells the value 'App Store' -- confirm
    that each table really stores a different spelling.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='diary' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#2.2 获取昨天ios平台的diary的曝光数
def get_ios_diary_count_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's diary exposures on iOS.

    Filters ``data_feed_exposure`` on ``device_type='App Store'``.
    NOTE(review): the click query spells the value 'AppStore' -- confirm that
    each table really stores a different spelling.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='diary' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#2.3 获取昨天ios平台的top100点击率的diary
def get_ios_top100_diary_rate_by_ctr(ios_top100_diary_count_by_click,ios_top100_diary_count_by_imp):
    """Rank iOS diaries by click-through rate and keep at most the best 100.

    Only cids present in both dicts with more than 4 clicks are ranked. The
    function works purely on its arguments; it no longer reads the module
    globals that happened to be defined by the script section.

    Args:
        ios_top100_diary_count_by_click: ``{cid: clicks}``; each cid must
            contain '|'.
        ios_top100_diary_count_by_imp: ``{cid: exposures}``.

    Returns:
        List of ``("苹果", cid, clicks, exposures, ctr, url)`` tuples sorted by
        ctr (rounded to 4 places) in descending order.
    """
    ranked = []
    for cid, clicks in ios_top100_diary_count_by_click.items():
        if cid not in ios_top100_diary_count_by_imp or clicks <= 4:
            continue
        exposures = ios_top100_diary_count_by_imp[cid]
        # The numeric id after the '|' forms the public diary-book link.
        url = "http://m.igengmei.com/diary_book/" + cid[cid.index('|')+1:] + '/'
        ranked.append(("苹果", cid, clicks, exposures, round(clicks/exposures, 4), url))
    ranked.sort(key=lambda row: row[4], reverse=True)
    return ranked[:100]
#3 获取昨天安卓平台的top100diary(sorted by ctr)
#3.1 获取昨天安卓平台的diary的点击数
def get_android_diary_rate_by_click():
    """Return ``{cid: click count}`` for yesterday's diary clicks off iOS.

    Every ``data_feed_click`` row whose ``device_type`` is not 'AppStore' is
    counted, i.e. "Android" here means "anything but the App Store".
    Despite the name, the values are raw counts, not rates.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='diary' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#3.2 获取昨天安卓平台的diary的曝光数
def get_android_diary_rate_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's diary exposures off iOS.

    Every ``data_feed_exposure`` row whose ``device_type`` is not 'App Store'
    is counted, i.e. "Android" here means "anything but the App Store".
    Despite the name, the values are raw counts, not rates.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='diary' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#3.3 获取昨天安卓平台的top100点击率的diary
def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,android_top100_diary_count_by_imp):
    """Rank Android diaries by click-through rate and keep at most the best 100.

    Only cids present in both dicts with more than 4 clicks are ranked. The
    function works purely on its arguments; it no longer reads the module
    globals that happened to be defined by the script section.

    Args:
        android_top100_diary_count_by_click: ``{cid: clicks}``; each cid must
            contain '|'.
        android_top100_diary_count_by_imp: ``{cid: exposures}``.

    Returns:
        List of ``("安卓", cid, clicks, exposures, ctr, url)`` tuples sorted by
        ctr (rounded to 4 places) in descending order.
    """
    ranked = []
    for cid, clicks in android_top100_diary_count_by_click.items():
        if cid not in android_top100_diary_count_by_imp or clicks <= 4:
            continue
        exposures = android_top100_diary_count_by_imp[cid]
        # The numeric id after the '|' forms the public diary-book link.
        url = "http://m.igengmei.com/diary_book/" + cid[cid.index('|')+1:] + '/'
        ranked.append(("安卓", cid, clicks, exposures, round(clicks/exposures, 4), url))
    ranked.sort(key=lambda row: row[4], reverse=True)
    return ranked[:100]
if __name__ == "__main__":
    # Build the CTR ranking for all platforms, then iOS, then Android.
    # The count variables keep their module-level names on purpose.
    all_diary_count_by_click = get_all_diary_count_by_click()
    all_diary_count_by_imp = get_all_diary_count_by_imp()
    all_top100_diary_rate_by_ctr = get_all_top100_diary_rate_by_ctr(
        all_diary_count_by_click, all_diary_count_by_imp)

    ios_diary_count_by_click = get_ios_diary_count_by_click()
    ios_diary_count_by_imp = get_ios_diary_count_by_imp()
    ios_top100_diary_rate_by_ctr = get_ios_top100_diary_rate_by_ctr(
        ios_diary_count_by_click, ios_diary_count_by_imp)

    android_diary_count_by_click = get_android_diary_rate_by_click()
    android_diary_count_by_imp = get_android_diary_rate_by_imp()
    android_top100_diary_rate_by_ctr = get_android_top100_diary_rate_by_ctr(
        android_diary_count_by_click, android_diary_count_by_imp)

    result_lst = [
        all_top100_diary_rate_by_ctr,
        ios_top100_diary_rate_by_ctr,
        android_top100_diary_rate_by_ctr,
    ]

    # The report file is named after yesterday's date (YYYYMMDD).
    today = datetime.date.today()
    yesterday = (today - datetime.timedelta(days=1)).strftime("%Y%m%d")
    output_path = "/data2/models/eda/recommended_indexs/3top100_ctr_diary_%s.txt" % yesterday
    result2file(result_lst, output_path)
    print("2.2已将top100点击率的diary存入文件")
# -*- coding: UTF-8 -*-
import pymysql
import datetime
def con_sql(sql):
    """Run *sql* against the ``jerry_test`` database and return every row.

    Args:
        sql: complete SQL statement, executed as-is (no parameters are bound).

    Returns:
        Whatever ``cursor.fetchall()`` returns for the statement.

    Raises:
        Any pymysql error raised while connecting or executing; the
        connection is closed before the error propagates.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or the environment.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        db.close()
def tuple2dict(tuple_result):
    """Turn a sequence of SQL rows into a dict.

    The first column of each row becomes the key and the second column the
    value; when a key repeats, the last row wins.
    """
    return {row[0]: row[1] for row in tuple_result}
def result2file(result_lst,fpath):
    """Write the per-platform top-100 question tables to *fpath*.

    Args:
        result_lst: list of tables, one per platform. Each table is a list of
            rows ``(platform, question_id, clicks, exposures, ctr, url)``
            where ``ctr`` is a fraction; it is written as a percentage.
        fpath: destination path; an existing file is overwritten.
    """
    tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
    separator = "=================================================================\n"
    header = tplt.format("平台","question_id","点击数","曝光数","点击率","question链接")
    last_pos = len(result_lst) - 1
    # The report contains Chinese text, so the encoding is fixed instead of
    # depending on the platform default (which may be ASCII under cron).
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write("2.4 Top 100 Question\n")
        f.write(separator)
        f.write(header)
        for pos, rows in enumerate(result_lst):
            for row in rows:
                f.write(tplt.format(row[0],row[1],row[2],row[3],"{}%".format(round(row[4]*100,2)),row[5]))
            f.write(separator)
            # Decide by position, not by value: two equal tables (e.g. both
            # empty) must not suppress the header of the following table.
            if pos != last_pos:
                f.write(header)
        f.write("\n\n")
#1 获取昨天所有平台的top100question
#1.1 获取昨天所有平台的top100点击数的question
def get_all_question_count_by_click():
    """Return ``{cid: click count}`` for yesterday's question clicks, all platforms.

    Runs a grouped count over ``data_feed_click`` through ``con_sql`` and
    converts the rows with ``tuple2dict``. No row limit is applied.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='question' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#1.2 获取昨天所有平台的top100曝光数的question
def get_all_question_count_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's question exposures, all platforms.

    Runs a grouped count over ``data_feed_exposure`` through ``con_sql`` and
    converts the rows with ``tuple2dict``. No row limit is applied.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and cid_type='question' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#1.3 获取昨天所有平台的top100点击率的question
def get_all_top100_question_rate_by_ctr(all_question_count_by_click,all_question_count_by_imp):
    """Rank questions and keep at most the best 100.

    When the exposure dict is empty, every clicked question is listed with
    exposures and ctr set to 0 and the ranking is by click count. Otherwise
    only cids present in both dicts with more than 2 clicks are ranked, by
    click-through rate.

    Args:
        all_question_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        all_question_count_by_imp: ``{cid: exposures}``, possibly empty.

    Returns:
        List of ``("所有", cid, clicks, exposures, ctr, url)`` tuples in
        descending rank order.
    """
    ranked = []
    if all_question_count_by_imp == {}:
        # No exposure data at all: fall back to ranking by raw clicks.
        sort_column = 2
        for cid, clicks in all_question_count_by_click.items():
            link = "http://m.igengmei.com/question/" + cid[cid.index('|')+1:] + '/'
            ranked.append(("所有", cid, clicks, 0, 0, link))
    else:
        sort_column = 4
        for cid, clicks in all_question_count_by_click.items():
            if cid not in all_question_count_by_imp or clicks <= 2:
                continue
            exposures = all_question_count_by_imp[cid]
            link = "http://m.igengmei.com/question/" + cid[cid.index('|')+1:] + '/'
            ranked.append(("所有", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[sort_column], reverse=True)
    return ranked[:100]
#2 获取昨天ios平台的top100question
#2.1 获取昨天ios平台的top100点击数的question
def get_ios_question_count_by_click():
    """Return ``{cid: click count}`` for yesterday's question clicks on iOS.

    Filters ``data_feed_click`` on ``device_type='AppStore'``.
    NOTE(review): the exposure query spells the value 'App Store' -- confirm
    that each table really stores a different spelling.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='AppStore' and cid_type='question' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#2.2 获取昨天ios平台的top100曝光数的question
def get_ios_question_count_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's question exposures on iOS.

    Filters ``data_feed_exposure`` on ``device_type='App Store'``.
    NOTE(review): the click query spells the value 'AppStore' -- confirm that
    each table really stores a different spelling.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type='App Store' and cid_type='question' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#2.3 获取昨天ios平台的top100点击率的question
def get_ios_top100_question_rate_by_ctr(ios_question_count_by_click,ios_question_count_by_imp):
    """Rank iOS questions and keep at most the best 100.

    When the exposure dict is empty, every clicked question is listed with
    exposures and ctr set to 0 and the ranking is by click count. Otherwise
    only cids present in both dicts with more than 2 clicks are ranked, by
    click-through rate.

    Args:
        ios_question_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        ios_question_count_by_imp: ``{cid: exposures}``, possibly empty.

    Returns:
        List of ``("苹果", cid, clicks, exposures, ctr, url)`` tuples in
        descending rank order.
    """
    ranked = []
    if ios_question_count_by_imp == {}:
        # No exposure data at all: fall back to ranking by raw clicks.
        sort_column = 2
        for cid, clicks in ios_question_count_by_click.items():
            link = "http://m.igengmei.com/question/" + cid[cid.index('|')+1:] + '/'
            ranked.append(("苹果", cid, clicks, 0, 0, link))
    else:
        sort_column = 4
        for cid, clicks in ios_question_count_by_click.items():
            if cid not in ios_question_count_by_imp or clicks <= 2:
                continue
            exposures = ios_question_count_by_imp[cid]
            link = "http://m.igengmei.com/question/" + cid[cid.index('|')+1:] + '/'
            ranked.append(("苹果", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[sort_column], reverse=True)
    return ranked[:100]
#3 获取昨天安卓平台的top100question
#3.1 获取昨天安卓平台的top100点击数的question
def get_android_question_rate_by_click():
    """Return ``{cid: click count}`` for yesterday's question clicks off iOS.

    Every ``data_feed_click`` row whose ``device_type`` is not 'AppStore' is
    counted, i.e. "Android" here means "anything but the App Store".
    Despite the name, the values are raw counts, not rates.
    """
    query = "select cid,count(cid) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='AppStore' and cid_type='question' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#3.2 获取昨天安卓平台的top100曝光数的question
def get_android_question_rate_by_imp():
    """Return ``{cid: exposure count}`` for yesterday's question exposures off iOS.

    Every ``data_feed_exposure`` row whose ``device_type`` is not 'App Store'
    is counted, i.e. "Android" here means "anything but the App Store".
    Despite the name, the values are raw counts, not rates.
    """
    query = "select cid,count(cid) from data_feed_exposure where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day) and device_type!='App Store' and cid_type='question' group by cid order by count(cid) desc"
    rows = con_sql(query)
    return tuple2dict(rows)
#3.3 获取昨天安卓平台的top100点击率的question
def get_android_top100_question_rate_by_ctr(android_question_count_by_click,android_question_count_by_imp):
    """Rank Android questions and keep at most the best 100.

    When the exposure dict is empty, every clicked question is listed with
    exposures and ctr set to 0 and the ranking is by click count. Otherwise
    only cids present in both dicts with more than 2 clicks are ranked, by
    click-through rate.

    Args:
        android_question_count_by_click: ``{cid: clicks}``; each cid must contain '|'.
        android_question_count_by_imp: ``{cid: exposures}``, possibly empty.

    Returns:
        List of ``("安卓", cid, clicks, exposures, ctr, url)`` tuples in
        descending rank order.
    """
    ranked = []
    if android_question_count_by_imp == {}:
        # No exposure data at all: fall back to ranking by raw clicks.
        sort_column = 2
        for cid, clicks in android_question_count_by_click.items():
            link = "http://m.igengmei.com/question/" + cid[cid.index('|')+1:] + '/'
            ranked.append(("安卓", cid, clicks, 0, 0, link))
    else:
        sort_column = 4
        for cid, clicks in android_question_count_by_click.items():
            if cid not in android_question_count_by_imp or clicks <= 2:
                continue
            exposures = android_question_count_by_imp[cid]
            link = "http://m.igengmei.com/question/" + cid[cid.index('|')+1:] + '/'
            ranked.append(("安卓", cid, clicks, exposures, round(clicks/exposures, 4), link))
    ranked = sorted(ranked, key=lambda row: row[sort_column], reverse=True)
    return ranked[:100]
if __name__ == "__main__":
    # Build the ranking for all platforms, then iOS, then Android.
    all_question_count_by_click = get_all_question_count_by_click()
    all_question_count_by_imp = get_all_question_count_by_imp()
    all_top100_question_rate_by_ctr = get_all_top100_question_rate_by_ctr(
        all_question_count_by_click, all_question_count_by_imp)

    ios_question_count_by_click = get_ios_question_count_by_click()
    ios_question_count_by_imp = get_ios_question_count_by_imp()
    ios_top100_question_rate_by_ctr = get_ios_top100_question_rate_by_ctr(
        ios_question_count_by_click, ios_question_count_by_imp)

    android_question_count_by_click = get_android_question_rate_by_click()
    android_question_count_by_imp = get_android_question_rate_by_imp()
    android_top100_question_rate_by_ctr = get_android_top100_question_rate_by_ctr(
        android_question_count_by_click, android_question_count_by_imp)

    result_lst = [
        all_top100_question_rate_by_ctr,
        ios_top100_question_rate_by_ctr,
        android_top100_question_rate_by_ctr,
    ]

    # The report file is named after yesterday's date (YYYYMMDD).
    today = datetime.date.today()
    yesterday = (today - datetime.timedelta(days=1)).strftime("%Y%m%d")
    output_path = "/data2/models/eda/recommended_indexs/5top100_ctr_question_%s.txt" % yesterday
    result2file(result_lst, output_path)
    print("2.4已将top100点击率的question存入文件")
    print("已完成所有Top特征提取")
......@@ -238,42 +238,13 @@ def rate2file():
str(activate_uid_imp_all[3])+','+str(activate_uid_imp_beijing[3])+','+\
str(click_answer_all[3])+','+str(click_answer_ios[3])+','+str(click_answer_android[3])+','+\
str(click_diary_all[3])+','+str(click_diary_ios[3])+','+str(click_diary_android[3])+','+\
str(click_everything_all[3])+','+str(click_everything_ios[3])+','+str(click_everything_android[3])+','+'\n'
str(click_everything_all[3])+','+str(click_everything_ios[3])+','+str(click_everything_android[3])+'\n'
f.write(line)
if __name__ == '__main__':
result2file()
rate2file()
#1.功能:
从data_feed_click和data_feed_exposure两个表中统计一些推荐指标
#2.用法:
python main.py
#3.输出:
result_{date}.txt #发邮件使用
rate.csv #作图使用
#4.输出结果组成:
内容概览:以下所有数据都是昨天一天的首页的
1. 比例特征
1.1 answer曝光占比(=answer被曝光数/总cid被曝光数)
1.2 diary曝光占比(=diary被曝光数/总cid被曝光数)
1.3 活跃用户点击率(=有点击用户点击次数/有点击用户曝光次数)
1.4 活跃用户平均每天曝光次数(活跃用户指的是有点击的用户)
1.5 点击answer用户占比(=点击answer用户数/曝光answer用户数)
1.6 点击diary用户占比(=点击diary用户数/曝光diary用户数)
1.7 无点击用户占比(=无点击用户数/有曝光用户数)
1.8 无点击用户数分布占比(根据激活日期和平台来分)
2.Top特征
2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)
2.2 Top 100 diary (sorted by ctr)
2.3 Top 100 Answer (sorted by ctr)
2.4 Top 100 Question (sorted by ctr)
#5.requirements:
python==3.4.3
pymysql==0.9.2
# Run each extraction script in turn; every script writes its own report
# file under /data2/models/eda/recommended_indexs/.
# NOTE(review): a failing script does not stop the run -- the final cat is
# still attempted.
python getRate.py
python getClickTimes2CountUid.py
python getTop100Diary.py
python getTop100Answer.py
python getTop100Question.py
# Yesterday's date as YYYYMMDD; it is the suffix of the files joined below.
# "-d last-day" presumably requires GNU date -- confirm on the target host.
dt=$(date -d last-day +%Y%m%d)
# Concatenate the five partial reports, in numeric-prefix order, into one file.
cat /data2/models/eda/recommended_indexs/1rate_features_$dt.txt /data2/models/eda/recommended_indexs/2click_times_to_count_uid_$dt.txt /data2/models/eda/recommended_indexs/3top100_ctr_diary_$dt.txt /data2/models/eda/recommended_indexs/4top100_ctr_answer_$dt.txt /data2/models/eda/recommended_indexs/5top100_ctr_question_$dt.txt > /data2/models/eda/recommended_indexs/result_all_$dt.txt
\ No newline at end of file
# Base directory (with trailing slash). Its consumers are not visible here;
# presumably the location test outputs are written to -- TODO confirm.
DIRECTORY_PATH="/data2/models/eda/test/"
\ No newline at end of file
import datetime
import pymysql
# Cut-off dates and their Unix timestamps (local midnight), measured back
# from a single "today" so that all five stay consistent with each other.
_today = datetime.date.today()


def _local_midnight_timestamp(day):
    """Return the integer Unix timestamp of local midnight on *day*.

    ``date.strftime("%s")`` is a platform-specific extension that is not
    available everywhere; going through a naive ``datetime`` gives the same
    local-time result portably.
    """
    return int(datetime.datetime.combine(day, datetime.time.min).timestamp())


# Timestamp one week ago (7 days)
my_date1 = _today - datetime.timedelta(days=7)
my_tm1 = _local_midnight_timestamp(my_date1)
# Timestamp two weeks ago (14 days)
my_date2 = _today - datetime.timedelta(days=14)
my_tm2 = _local_midnight_timestamp(my_date2)
# Timestamp one month ago (30 days)
my_date3 = _today - datetime.timedelta(days=30)
my_tm3 = _local_midnight_timestamp(my_date3)
# Timestamp two months ago (60 days)
my_date4 = _today - datetime.timedelta(days=60)
my_tm4 = _local_midnight_timestamp(my_date4)
# Timestamp three months ago (90 days)
my_date5 = _today - datetime.timedelta(days=90)
my_tm5 = _local_midnight_timestamp(my_date5)
def get_register_uid_count():
    """Count distinct devices per "age" bucket, based on ``data_feed_exposure``.

    A device falls into a bucket according to whether it has exposure rows
    older than the module-level cut-offs ``my_tm1`` .. ``my_tm5``:
    '0-7' has none before ``my_tm1``; '7-14' has some before ``my_tm1`` but
    none before ``my_tm2``; and so on up to '90+', which has rows before
    ``my_tm5``. (Presumably the earliest exposure stands in for the
    registration date -- confirm.)

    Returns:
        dict mapping the bucket label ('0-7', '7-14', '14-30', '30-60',
        '60-90', '90+') to the distinct device count.

    Raises:
        Any pymysql error raised while connecting or executing; the
        connection is closed before the error propagates.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or the environment.
    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_test')
    try:
        cursor = db.cursor()
        # The placeholders are filled with integer timestamps computed in
        # this module, not with external input.
        sql = ("select '0-7' as label,count(distinct(device_id)) "
               "from data_feed_exposure "
               "where device_id not in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {0}) "
               "union all "
               "select '7-14' as label,count(distinct(device_id)) "
               "from data_feed_exposure "
               "where device_id not in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {1}) "
               "and device_id in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {0}) "
               "union all "
               "select '14-30' as label,count(distinct(device_id)) "
               "from data_feed_exposure "
               "where device_id not in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {2}) "
               "and device_id in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {1}) "
               "union all "
               "select '30-60' as label,count(distinct(device_id)) "
               "from data_feed_exposure "
               "where device_id not in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {3}) "
               "and device_id in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {2}) "
               "union all "
               "select '60-90' as label,count(distinct(device_id)) "
               "from data_feed_exposure "
               "where device_id not in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {4}) "
               "and device_id in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {3}) "
               "union all "
               "select '90+' as label,count(distinct(device_id)) "
               "from data_feed_exposure "
               "where device_id in "
               "(select distinct(device_id) from data_feed_exposure "
               "where time < {4})").format(my_tm1,my_tm2,my_tm3,my_tm4,my_tm5)
        cursor.execute(sql)
        result = cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        db.close()
    return {row[0]: row[1] for row in result}
# Script entry point: fetch the per-bucket device counts when run directly.
if __name__ == '__main__':
    register_uid_detail = get_register_uid_count()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment