Commit 5f50a255 authored by 张彦钊's avatar 张彦钊

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

change predict_file_name
parents c5c6968c 7ca95fde
......@@ -4,7 +4,7 @@ from utils import con_sql
class CidRate(object):
def __init__(self, ndays, platform, cid_type):
def __init__(self, platform, cid_type, ndays=1):
"""
ndays : 1;2;3;4.. #The number of days from the current time
platform : 'all';'ios';'android'
......@@ -12,9 +12,9 @@ class CidRate(object):
"""
self.ndays = ndays
if platform == "ios":
self.platform = "='AppStore'"
self.platform = "='App Store'"
elif platform == "android":
self.platform = "!='AppStore'"
self.platform = "!='App Store'"
else:
self.platform = " is not null"
self.cid_type = cid_type
......@@ -24,15 +24,14 @@ class CidRate(object):
platform : "所有";"苹果","安卓" #方便显示
rtype : list
"""
if self.platform[-2] == 'e':
self.platform = self.platform.replace(' ','')
sql_cid = "select count(cid) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
and device_type{1} and cid_type='{2}'".format(self.ndays,self.platform,self.cid_type)
and device_type{1} \
and cid_type='{2}'".format(self.ndays,self.platform.replace(' ',''),self.cid_type)
cid_clk_count = con_sql(sql_cid)[0][0]
sql_all = "select count(cid) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
and device_type{1}".format(self.ndays, self.platform)
and device_type{1}".format(self.ndays, self.platform.replace(' ',''))
all_clk_count = con_sql(sql_all)[0][0]
cid_clk_rate = round(cid_clk_count/all_clk_count,4)
return [platform,cid_clk_count,all_clk_count,cid_clk_rate]
......@@ -43,8 +42,6 @@ class CidRate(object):
platform : "所有";"苹果","安卓" #方便显示
rtype : list
"""
if self.platform[-2] == 'e':#注意:曝光表中AppStore有空格
self.platform = self.platform[:-6] + ' ' + self.platform[-6:]
sql_cid = "select count(cid) from data_feed_exposure \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
and device_type{1} and cid_type='{2}'".format(self.ndays,self.platform,self.cid_type)
......@@ -59,10 +56,10 @@ class CidRate(object):
def main():
answer_rate_all = CidRate(1,"all","answer").get_cid_imp_rate("所有")
answer_rate_ios = CidRate(1,"ios","answer").get_cid_imp_rate("苹果")
answer_rate_android = CidRate(1,"android","answer").get_cid_imp_rate("安卓")
answer_rate_result = [answer_rate_all,answer_rate_ios,answer_rate_android]
answer_imp_rate_all = CidRate("all","answer").get_cid_imp_rate("所有")
answer_imp_rate_ios = CidRate("ios","answer").get_cid_imp_rate("苹果")
answer_imp_rate_android = CidRate("android","answer").get_cid_imp_rate("安卓")
answer_imp_rate_result = [answer_imp_rate_all,answer_imp_rate_ios,answer_imp_rate_android]
if __name__ == '__main__':
......
......@@ -3,7 +3,7 @@ from utils import con_sql
class ClkCidUidRate(object):
def __init__(self, ndays, platform, cid_type):
def __init__(self, platform, cid_type, ndays=1):
"""
ndays : 1;2;3;4.. #The number of days from the current time
platform : 'all';'ios';'android'
......@@ -11,9 +11,9 @@ class ClkCidUidRate(object):
"""
self.ndays = ndays
if platform == "ios":
self.platform = "='AppStore'"
self.platform = "='App Store'"
elif platform == "android":
self.platform = "!='AppStore'"
self.platform = "!='App Store'"
else:
self.platform = " is not null"
if cid_type == "everything":
......@@ -26,12 +26,10 @@ class ClkCidUidRate(object):
platform : "所有";"苹果","安卓" #方便显示
rtype : list
"""
if self.platform[-2] == 'e':
self.platform = self.platform.replace(' ','')
sql_clk = "select count(distinct(device_id)) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
and device_type{1} \
and cid_type{2}".format(self.ndays,self.platform,self.cid_type)
and cid_type{2}".format(self.ndays,self.platform.replace(' ',''),self.cid_type)
clk_count = con_sql(sql_clk)[0][0]
if self.platform[-2] == 'e':#注意:曝光表中AppStore有空格
......@@ -53,15 +51,15 @@ class ClkCidUidRate(object):
def main():
#1.点击diary用户占比
click_diary_all = ClkCidUidRate(1,"all","diary").get_clk_cid_uid_rate("所有")
click_diary_ios = ClkCidUidRate(1,"ios","diary").get_clk_cid_uid_rate("苹果")
click_diary_android = ClkCidUidRate(1,"android","diary").get_clk_cid_uid_rate("安卓")
click_diary_all = ClkCidUidRate("all","diary").get_clk_cid_uid_rate("所有")
click_diary_ios = ClkCidUidRate("ios","diary").get_clk_cid_uid_rate("苹果")
click_diary_android = ClkCidUidRate("android","diary").get_clk_cid_uid_rate("安卓")
click_diary_result = [click_diary_all,click_diary_ios,click_diary_android]
print("已获取点击diary用户占比")
#2.点击answer用户占比
click_answer_all = ClkCidUidRate(1,"all","answer").get_clk_cid_uid_rate("所有")
click_answer_ios = ClkCidUidRate(1,"ios","answer").get_clk_cid_uid_rate("苹果")
click_answer_android = ClkCidUidRate(1,"android","answer").get_clk_cid_uid_rate("安卓")
click_answer_all = ClkCidUidRate("all","answer").get_clk_cid_uid_rate("所有")
click_answer_ios = ClkCidUidRate("ios","answer").get_clk_cid_uid_rate("苹果")
click_answer_android = ClkCidUidRate("android","answer").get_clk_cid_uid_rate("安卓")
click_answer_result = [click_answer_all,click_answer_ios,click_answer_android]
print("已获取点击answer用户占比")
#3.点击question用户占比(曝光表里cid类型没有question,因此下面的曝光数为0,0不能作分母)
......@@ -71,9 +69,9 @@ def main():
#click_question_result = [click_question_all,click_question_ios,click_question_android]
#print("已获取点击question用户占比")
#4.有点击用户占比
click_everything_all = ClkCidUidRate(1,"all","everything").get_clk_cid_uid_rate("所有")
click_everything_ios = ClkCidUidRate(1,"ios","everything").get_clk_cid_uid_rate("苹果")
click_everything_android = ClkCidUidRate(1,"android","everything").get_clk_cid_uid_rate("安卓")
click_everything_all = ClkCidUidRate("all","everything").get_clk_cid_uid_rate("所有")
click_everything_ios = ClkCidUidRate("ios","everything").get_clk_cid_uid_rate("苹果")
click_everything_android = ClkCidUidRate("android","everything").get_clk_cid_uid_rate("安卓")
click_everything_result = [click_everything_all,click_everything_ios,click_everything_android]
print("已获取有点击用户占比")
......
......@@ -4,7 +4,7 @@ from config import DIRECTORY_PATH
class TopFeatures(object):
def __init__(self, ndays, platform, cid_type, top_n=-1):
def __init__(self, platform, cid_type, top_n=-1, ndays=1):
"""
ndays : 1;2;3;4.. #The number of days from the current time
platform : 'all';'ios';'android'
......@@ -13,9 +13,9 @@ class TopFeatures(object):
"""
self.ndays = ndays
if platform == "ios":
self.platform = "='AppStore'"
self.platform = "='App Store'"
elif platform == "android":
self.platform = "!='AppStore'"
self.platform = "!='App Store'"
else:
self.platform = " is not null"
self.cid_type = cid_type
......@@ -24,20 +24,17 @@ class TopFeatures(object):
def get_click_times(self):
# rtype : dict
if self.platform[-2] == 'e':
self.platform = self.platform.replace(' ','')
sql = "select cid,count(cid) from data_feed_click \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
and device_type{1} and cid_type='{2}' \
group by cid order by count(cid) desc".format(self.ndays, self.platform, self.cid_type)
group by cid \
order by count(cid) desc".format(self.ndays, self.platform.replace(' ',''), self.cid_type)
clk_times = tuple2dict(con_sql(sql))
return clk_times
def get_impression_times(self):
# rtype : dict
if self.platform[-2] == 'e':#注意:曝光表中AppStore有空格
self.platform = self.platform[:-6] + ' ' + self.platform[-6:]
sql = "select cid,count(cid) from data_feed_exposure \
where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
and device_type{1} and cid_type='{2}' \
......@@ -45,7 +42,7 @@ class TopFeatures(object):
imp_times = tuple2dict(con_sql(sql))
return imp_times
def get_result(self, platform, clk={}, imp={}, clk_n=2, result_types="ctr"):
def get_result(self, platform, clk_n=2, result_types="ctr"):
"""
platform : "所有";"苹果","安卓" #方便显示
cid_type : 'diary';'answer';'question';"everything"... #方便显示
......@@ -55,6 +52,8 @@ class TopFeatures(object):
result_types : sorted by ["clk","imp","ctr"]
rtype : list
"""
clk = self.get_click_times()
imp = self.get_impression_times()
topn = []
#获取topN的点击
if imp == {} or result_types == "clk":
......@@ -90,7 +89,6 @@ class TopFeatures(object):
def result2file(self, result_lst, fpath):
"""
cid_type : 'diary';'answer';'question';"everything"... #方便显示
result_lst : [all_result,ios_result,android_result]
fpath : output filename
rtype : none
......@@ -117,20 +115,14 @@ class TopFeatures(object):
def main():
#1. Top diary
top_diary_all = TopFeatures(1, "all", "diary", 100)
clk_diary_times_all = top_diary_all.get_click_times()
imp_diary_times_all = top_diary_all.get_impression_times()
clk_diary_ctr_all = top_diary_all.get_result("所有",clk_diary_times_all, imp_diary_times_all, 4, "ctr")
top_diary_all = TopFeatures("all", "diary", 100)
clk_diary_ctr_all = top_diary_all.get_result("所有", 4, "ctr")
top_diary_ios = TopFeatures(1, "ios", "diary", 100)
clk_diary_times_ios = top_diary_ios.get_click_times()
imp_diary_times_ios = top_diary_ios.get_impression_times()
clk_diary_ctr_ios = top_diary_ios.get_result("苹果",clk_diary_times_ios, imp_diary_times_ios, 4, "ctr")
top_diary_ios = TopFeatures("ios", "diary", 100)
clk_diary_ctr_ios = top_diary_ios.get_result("苹果", 4, "ctr")
top_diary_android = TopFeatures(1, "android", "diary", 100)
clk_diary_times_android = top_diary_android.get_click_times()
imp_diary_times_android = top_diary_android.get_impression_times()
clk_diary_ctr_android = top_diary_android.get_result("安卓",clk_diary_times_android, imp_diary_times_android, 4, "ctr")
top_diary_android = TopFeatures("android", "diary", 100)
clk_diary_ctr_android = top_diary_android.get_result("安卓", 4, "ctr")
result_lst = [clk_diary_ctr_all, clk_diary_ctr_ios, clk_diary_ctr_android]
output_path = DIRECTORY_PATH + "top100_ctr_diary_{}.txt".format(get_yesterday_date())
......@@ -138,20 +130,14 @@ def main():
print("已获取 Top diary 特征")
#2. Top answer
top_answer_all = TopFeatures(1, "all", "answer", 100)
clk_answer_times_all = top_answer_all.get_click_times()
imp_answer_times_all = top_answer_all.get_impression_times()
clk_answer_ctr_all = top_answer_all.get_result("所有",clk_answer_times_all, imp_answer_times_all, 2, "ctr")
top_answer_all = TopFeatures("all", "answer", 100)
clk_answer_ctr_all = top_answer_all.get_result("所有", 2, "ctr")
top_answer_ios = TopFeatures(1, "ios", "answer", 100)
clk_answer_times_ios = top_answer_ios.get_click_times()
imp_answer_times_ios = top_answer_ios.get_impression_times()
clk_answer_ctr_ios = top_answer_ios.get_result("苹果",clk_answer_times_ios, imp_answer_times_ios, 2, "ctr")
top_answer_ios = TopFeatures("ios", "answer", 100)
clk_answer_ctr_ios = top_answer_ios.get_result("苹果", 2, "ctr")
top_answer_android = TopFeatures(1, "android", "answer", 100)
clk_answer_times_android = top_answer_android.get_click_times()
imp_answer_times_android = top_answer_android.get_impression_times()
clk_answer_ctr_android = top_answer_android.get_result("安卓",clk_answer_times_android, imp_answer_times_android, 2, "ctr")
top_answer_android = TopFeatures("android", "answer", 100)
clk_answer_ctr_android = top_answer_android.get_result("安卓", 2, "ctr")
result_lst = [clk_answer_ctr_all, clk_answer_ctr_ios, clk_answer_ctr_android]
output_path = DIRECTORY_PATH + "top100_ctr_answer_{}.txt".format(get_yesterday_date())
......@@ -160,20 +146,14 @@ def main():
#3. Top question
top_question_all = TopFeatures(1, "all", "question", 100)
clk_question_times_all = top_question_all.get_click_times()
imp_question_times_all = top_question_all.get_impression_times()
clk_question_ctr_all = top_question_all.get_result("所有",clk_question_times_all, imp_question_times_all, 2, "ctr")
top_question_ios = TopFeatures(1, "ios", "question", 100)
clk_question_times_ios = top_question_ios.get_click_times()
imp_question_times_ios = top_question_ios.get_impression_times()
clk_question_ctr_ios = top_question_ios.get_result("苹果",clk_question_times_ios, imp_question_times_ios, 2, "ctr")
top_question_android = TopFeatures(1, "android", "question", 100)
clk_question_times_android = top_question_android.get_click_times()
imp_question_times_android = top_question_android.get_impression_times()
clk_question_ctr_android = top_question_android.get_result("安卓",clk_question_times_android, imp_question_times_android, 2, "ctr")
top_question_all = TopFeatures("all", "question", 100)
clk_question_ctr_all = top_question_all.get_result("所有", 2, "ctr")
top_question_ios = TopFeatures("ios", "question", 100)
clk_question_ctr_ios = top_question_ios.get_result("苹果", 2, "ctr")
top_question_android = TopFeatures("android", "question", 100)
clk_question_ctr_android = top_question_android.get_result("安卓", 2, "ctr")
result_lst = [clk_question_ctr_all, clk_question_ctr_ios, clk_question_ctr_android]
output_path = DIRECTORY_PATH + "top100_ctr_question_{}.txt".format(get_yesterday_date())
......
# -*- coding: UTF-8 -*-
from utils import *
from config import DIRECTORY_PATH
from getCidRate import *
from getClkCidUidRate import *
from getTopFeatures import *
def main():
    """Collect the ratio features and the top-N features for every platform.

    Each collector is run for the "all", "ios" and "android" platforms, in
    that order, and a progress message is printed after each group. The
    collected values are only held in local variables; this function does not
    return them or write them anywhere.
    """
    # (platform key passed to the collectors, label used for display)
    platforms = (("all", "所有"), ("ios", "苹果"), ("android", "安卓"))

    print("开始获取特征数据...")

    # 1. Ratio features
    # 1.1 Answer impression share (= answer impressions / total cid impressions)
    answer_imp_rate_result = [
        CidRate(key, "answer").get_cid_imp_rate(label)
        for key, label in platforms
    ]
    print("已获取answer曝光占比")

    # 1.2 Active-user click-through rate
    #     (= clicks by active users / impressions of active users)
    activate_uid_ctr_result = [get_activate_uid_ctr(key) for key, _ in platforms]
    print("已获取活跃用户点击率")

    # 1.3 Share of users clicking an answer
    #     (= users who clicked an answer / users who were shown an answer)
    click_answer_result = [
        ClkCidUidRate(key, "answer").get_clk_cid_uid_rate(label)
        for key, label in platforms
    ]
    print("已获取点击answer用户占比")

    # 1.4 Share of users clicking a diary
    #     (= users who clicked a diary / users who were shown a diary)
    click_diary_result = [
        ClkCidUidRate(key, "diary").get_clk_cid_uid_rate(label)
        for key, label in platforms
    ]
    print("已获取点击diary用户占比")

    # 1.5 Share of users with any click
    #     (= users with a click / users with an impression)
    click_everything_result = [
        ClkCidUidRate(key, "everything").get_clk_cid_uid_rate(label)
        for key, label in platforms
    ]
    print("已获取有点击用户占比")

    # 2. Top features
    # 2.1 Click-count distribution
    #     (first column: clicks per user; second column: number of distinct users)
    click_times_df = get_click_times_to_count_uid_df()
    print("已获取用户点击次数分布")

    # 2.2 Top 100 diaries (sorted by ctr)
    top_diary_result = [
        TopFeatures(key, "diary", 100).get_result(label, 4, "ctr")
        for key, label in platforms
    ]
    print("已获取 Top diary 特征")

    # 2.3 Top 100 answers (sorted by ctr)
    top_answer_result = [
        TopFeatures(key, "answer", 100).get_result(label, 2, "ctr")
        for key, label in platforms
    ]
    print("已获取 Top answer 特征")

    # 2.4 Top 100 questions (the original note says "sorted by click times";
    #     the call itself passes "ctr")
    top_question_result = [
        TopFeatures(key, "question", 100).get_result(label, 2, "ctr")
        for key, label in platforms
    ]
    print("已获取 Top question 特征")

    print("done")


if __name__ == '__main__':
    main()
# -*- coding: UTF-8 -*-
import pymysql
import datetime
import pandas as pd
def con_sql(sql):
#从数据库的表里获取数据
......@@ -35,3 +36,51 @@ def get_yesterday_date():
yesterday = today - datetime.timedelta(days=1)
yesterday = yesterday.strftime("%Y%m%d")
return yesterday
# Click-through rate of active users for one platform.
def get_activate_uid_ctr(platform, ndays=1, query=None):
    """Return the click-through rate of active users on one platform.

    Active users are the devices that have at least one row in
    data_feed_click on the target day. Their click rows are divided by their
    rows in data_feed_exposure for the same day.

    platform : 'all';'ios';'android' (any other value is treated as 'all')
    ndays : 1;2;3;4.. #The number of days from the current time
    query : optional callable taking an SQL string and returning rows;
            defaults to con_sql
    rtype : list [display label, click count, impression count, click rate]
    """
    # The click table stores the iOS device type as 'AppStore', the exposure
    # table as 'App Store'. Each table gets its own predicate so that the
    # "all" predicate (" is not null") is never stripped of its spaces.
    if platform == "ios":
        clk_filter, imp_filter, label = "='AppStore'", "='App Store'", "苹果"
    elif platform == "android":
        clk_filter, imp_filter, label = "!='AppStore'", "!='App Store'", "安卓"
    else:
        clk_filter = imp_filter = " is not null"
        label = "所有"

    # con_sql is only looked up when no replacement runner was supplied.
    run = con_sql if query is None else query

    day_clause = (
        "from_unixtime(time,'%Y-%m-%d')"
        "=date_add(curdate(), interval -{0} day)"
    ).format(ndays)

    sql_clk = (
        "select count(device_id) from data_feed_click "
        "where {0} and device_type{1}"
    ).format(day_clause, clk_filter)
    clk_count = run(sql_clk)[0][0]

    sql_imp = (
        "select count(device_id) from data_feed_exposure "
        "where {0} and device_id in "
        "(select device_id from data_feed_click "
        "where {0} and device_type{1}) "
        "and device_type{2}"
    ).format(day_clause, clk_filter, imp_filter)
    imp_count = run(sql_imp)[0][0]

    # No impressions means the rate is undefined; report 0.0 instead of
    # raising ZeroDivisionError (the raw counts are still returned).
    clk_rate = round(clk_count / imp_count, 4) if imp_count else 0.0
    return [label, clk_count, imp_count, clk_rate]
# Build the mapping {click count : number of distinct users}.
def get_click_times_to_count_uid_df(ndays=1, query=None):
    """Return how many devices produced each per-device click count.

    One row per device is read from data_feed_click for the target day, then
    the rows are grouped by click count and counted.

    ndays : 1;2;3;4.. #The number of days from the current time
    query : optional callable taking an SQL string and returning
            (device_id, click_times) rows; defaults to con_sql
    rtype : pandas.DataFrame with columns "click_times" and "uid", where
            "uid" holds the number of devices with that click count
    """
    sql = (
        "select device_id,count(cid_type) click_times from data_feed_click "
        "where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) "
        "group by device_id order by click_times desc"
    ).format(ndays)

    # con_sql is only looked up when no replacement runner was supplied.
    run = con_sql if query is None else query
    uid_click_times = run(sql)

    uid_click_times_df = pd.DataFrame({
        "uid": [row[0] for row in uid_click_times],
        "click_times": [row[1] for row in uid_click_times],
    })
    # count() on the remaining "uid" column gives the devices per click count.
    return uid_click_times_df.groupby(by="click_times", as_index=False).count()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment