Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

change main function for test

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline
change main function for test
01b129f2 · 张彦钊 · ee6555f3 · 6a061403 · 01b129f2 · 01b129f2
Commit 01b129f2 authored Aug 11, 2018 by 张彦钊
4 changed files
--- a/eda/recommended_indexs/code/getClickZeroUidRate.py
+++ b/eda/recommended_indexs/code/getClickZeroUidRate.py
@@ -12,7 +12,6 @@ def con_sql(sql):
    return result


-
 #1 获取所有平台的0点击用户占比
 def get_all_click_zero_rate():
 	sql = "select count(distinct(device_id)) from data_feed_click where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -1 day)"

--- a/eda/test/getCidRate.py
+++ b/eda/test/getCidRate.py
+# -*- coding: UTF-8 -*-
+from utils import con_sql
+
+
+
+class CidRate(object):
+	"""
+	Share of one cid_type among all feed clicks / exposures of a single day,
+	optionally restricted to one platform.
+	"""
+
+	# Table names queried by the two public methods.
+	_CLICK_TABLE = "data_feed_click"
+	_EXPOSURE_TABLE = "data_feed_exposure"
+
+	def __init__(self, ndays, platform, cid_type):
+		"""
+		ndays : 1;2;3;4..  #The number of days from the current time
+		platform : 'all';'ios';'android'
+		cid_type : 'diary';'answer';'question'...
+		"""
+		self.ndays = ndays
+		# self.platform is a SQL fragment that is appended directly after
+		# the column name "device_type" in every query.
+		if platform == "ios":
+			self.platform = "='AppStore'"
+		elif platform == "android":
+			self.platform = "!='AppStore'"
+		else:
+			self.platform = " is not null"
+		self.cid_type = cid_type
+
+	def _device_filter(self, spaced):
+		"""
+		Return the device_type SQL fragment spelled for one table.
+		spaced : False -> 'AppStore' (click table);
+		         True  -> 'App Store' (exposure table; per the original author's
+		         note the exposure table stores the store name with a space)
+		rtype : str
+		"""
+		# Only the AppStore comparisons end in "e'"; " is not null" has no
+		# store name to respell and is returned untouched.
+		if self.platform[-2] != 'e':
+			return self.platform
+		# Always start from the space-free spelling so that repeated calls
+		# never accumulate extra spaces.
+		compact = self.platform.replace(' ', '')
+		if not spaced:
+			return compact
+		return compact[:-6] + ' ' + compact[-6:]
+
+	def _count_and_rate(self, table, device_filter):
+		"""
+		Run the cid_type-specific count and the overall count on one table.
+		table : table name to query
+		device_filter : SQL fragment appended after "device_type"
+		rtype : tuple (cid_count, all_count, rate)
+		"""
+		# NOTE(review): values are interpolated into the SQL text, so ndays and
+		# cid_type must come from trusted code, never from user input.
+		sql_all = ("select count(cid) from {0} "
+			"where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{1} day) "
+			"and device_type{2}").format(table, self.ndays, device_filter)
+		sql_cid = sql_all + " and cid_type='{0}'".format(self.cid_type)
+		cid_count = con_sql(sql_cid)[0][0]
+		all_count = con_sql(sql_all)[0][0]
+		if not all_count:
+			# No rows for that day/platform: report 0.0 instead of raising
+			# ZeroDivisionError.
+			return cid_count, all_count, 0.0
+		# float() keeps this a true division on Python 2 as well.
+		return cid_count, all_count, round(float(cid_count) / all_count, 4)
+
+	def get_cid_clk_rate(self, platform):
+		"""
+		Click share of self.cid_type among all clicks.
+		platform : display label copied into the result row,
+		           e.g. "所有" (all); "苹果" (iOS); "安卓" (Android)
+		rtype : list [platform, cid_clk_count, all_clk_count, cid_clk_rate]
+		"""
+		# self.platform is still updated to the spelling used by the last query,
+		# as earlier versions did, in case callers inspect it.
+		self.platform = self._device_filter(spaced=False)
+		cid_clk_count, all_clk_count, cid_clk_rate = self._count_and_rate(
+			self._CLICK_TABLE, self.platform)
+		return [platform,cid_clk_count,all_clk_count,cid_clk_rate]
+
+
+	def get_cid_imp_rate(self, platform):
+		"""
+		Exposure share of self.cid_type among all exposures.
+		platform : display label copied into the result row,
+		           e.g. "所有" (all); "苹果" (iOS); "安卓" (Android)
+		rtype : list [platform, cid_imp_count, all_imp_count, cid_imp_rate]
+		"""
+		# Previously a second call inserted a second space ('App  Store');
+		# the helper always rebuilds the fragment from the compact spelling.
+		self.platform = self._device_filter(spaced=True)
+		cid_imp_count, all_imp_count, cid_imp_rate = self._count_and_rate(
+			self._EXPOSURE_TABLE, self.platform)
+		return [platform,cid_imp_count,all_imp_count,cid_imp_rate]
+		
+
+
+def main():
+	"""
+	Collect yesterday's answer exposure share for all platforms, iOS and Android.
+	The collected rows are only held in a local list; nothing is printed or saved.
+	"""
+	# (constructor platform key, display label placed in the result row)
+	platform_labels = (("all", "所有"), ("ios", "苹果"), ("android", "安卓"))
+	answer_rate_result = []
+	for platform_key, display_label in platform_labels:
+		rate_row = CidRate(1, platform_key, "answer").get_cid_imp_rate(display_label)
+		answer_rate_result.append(rate_row)
+
+
+if __name__ == '__main__':
+	main()
+
--- a/eda/test/getClkCidUidRate.py
+++ b/eda/test/getClkCidUidRate.py
+# -*- coding: UTF-8 -*-
+from utils import con_sql
+
+
+class ClkCidUidRate(object):
+	"""
+	Ratio of distinct devices that clicked to distinct devices that were exposed,
+	for a single day, optionally restricted to one platform and one cid_type.
+	"""
+
+	def __init__(self, ndays, platform, cid_type):
+		"""
+		ndays : 1;2;3;4..  #The number of days from the current time
+		platform : 'all';'ios';'android'
+		cid_type : 'diary';'answer';'question';"everything"...
+		"""
+		self.ndays = ndays
+		# self.platform / self.cid_type are SQL fragments appended directly
+		# after the column names "device_type" / "cid_type".
+		if platform == "ios":
+			self.platform = "='AppStore'"
+		elif platform == "android":
+			self.platform = "!='AppStore'"
+		else:
+			self.platform = " is not null"
+		if cid_type == "everything":
+			self.cid_type = " is not null"
+		else:
+			self.cid_type = "='" + cid_type + "'"
+
+	def _device_filter(self, spaced):
+		"""
+		Return the device_type SQL fragment spelled for one table.
+		spaced : False -> 'AppStore' (click table);
+		         True  -> 'App Store' (exposure table; per the original author's
+		         note the exposure table stores the store name with a space)
+		rtype : str
+		"""
+		# Only the AppStore comparisons end in "e'"; " is not null" has no
+		# store name to respell and is returned untouched.
+		if self.platform[-2] != 'e':
+			return self.platform
+		compact = self.platform.replace(' ', '')
+		if not spaced:
+			return compact
+		return compact[:-6] + ' ' + compact[-6:]
+
+	def _count_devices(self, table):
+		"""
+		Count distinct device_id values in one table using the current filters.
+		table : table name to query
+		rtype : the single count value returned by con_sql
+		"""
+		# NOTE(review): values are interpolated into the SQL text, so ndays and
+		# cid_type must come from trusted code, never from user input.
+		sql = ("select count(distinct(device_id)) from {0} "
+			"where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{1} day) "
+			"and device_type{2} "
+			"and cid_type{3}").format(table, self.ndays, self.platform, self.cid_type)
+		return con_sql(sql)[0][0]
+
+	def get_clk_cid_uid_rate(self, platform):
+		"""
+		Clicking devices divided by exposed devices.
+		platform : display label copied into the result row,
+		           e.g. "所有" (all); "苹果" (iOS); "安卓" (Android)
+		rtype : list [platform, clk_count, imp_count, clk_rate]
+		"""
+		# As before, self.platform is rewritten to the spelling of the table
+		# being queried: compact for clicks, spaced for exposures.
+		self.platform = self._device_filter(spaced=False)
+		clk_count = self._count_devices("data_feed_click")
+
+		self.platform = self._device_filter(spaced=True)
+		imp_count = self._count_devices("data_feed_exposure")
+
+		if imp_count:
+			# float() keeps this a true division on Python 2 as well.
+			clk_rate = round(float(clk_count) / imp_count, 4)
+		else:
+			# No exposed devices (e.g. a cid_type missing from the exposure
+			# table): report 0.0 instead of raising ZeroDivisionError.
+			clk_rate = 0.0
+		return [platform,clk_count,imp_count,clk_rate]
+
+
+	def result2file(self, result_lst, fpath):
+		"""
+		Placeholder: not implemented yet, currently does nothing.
+		result_lst : unused
+		fpath : unused
+		rtype : none
+		"""
+		pass
+
+
+
+def main():
+	"""
+	Query yesterday's clicking-device ratio per platform for each reported
+	cid_type and print a progress line after each group.
+	The result rows are only held in a local list; nothing is saved.
+	"""
+	# (constructor platform key, display label placed in the result row)
+	platform_labels = (("all", "所有"), ("ios", "苹果"), ("android", "安卓"))
+	# (cid_type, progress message printed once its three rows are fetched).
+	# "question" is deliberately left out: per the original author's note the
+	# exposure table has no "question" cid_type, so the exposure count used as
+	# the denominator would be 0.
+	report_groups = (
+		("diary", "已获取点击diary用户占比"),
+		("answer", "已获取点击answer用户占比"),
+		("everything", "已获取有点击用户占比"),
+	)
+	for cid_type, done_message in report_groups:
+		group_result = []
+		for platform_key, display_label in platform_labels:
+			rate_row = ClkCidUidRate(1, platform_key, cid_type).get_clk_cid_uid_rate(display_label)
+			group_result.append(rate_row)
+		print(done_message)
+
+
+
+if __name__ == '__main__':
+	main()
+
--- a/eda/test/getTopFeatures.py
+++ b/eda/test/getTopFeatures.py
@@ -6,10 +6,10 @@ from config import DIRECTORY_PATH
 class TopFeatures(object):
 	def __init__(self, ndays, platform, cid_type, top_n=-1):
 		"""
-		ndays : 1;2;3;4..
+		ndays : 1;2;3;4..  #The number of days from the current time
 		platform : 'all';'ios';'android'
 		cid_type : 'diary';'answer';'question'...
-		top_n : the rows of the result
+		top_n : the top rows of the result
 		"""
 		self.ndays = ndays
 		if platform == "ios":
@@ -36,7 +36,7 @@ class TopFeatures(object):

 	def get_impression_times(self):
 		# rtype : dict
-		if self.platform[-2] == 'e':
+		if self.platform[-2] == 'e':#注意：曝光表中AppStore有空格
 			self.platform = self.platform[:-6] + ' ' + self.platform[-6:]
 		sql = "select cid,count(cid) from data_feed_exposure \
 		where from_unixtime(time,'%Y-%m-%d')=date_add(curdate(), interval -{0} day) \
@@ -45,12 +45,14 @@ class TopFeatures(object):
 		imp_times = tuple2dict(con_sql(sql))
 		return imp_times

-	def get_result(self, clk={}, imp={}, clk_n=2, result_types="ctr"):
+	def get_result(self, platform, clk={}, imp={}, clk_n=2, result_types="ctr"):
 		"""
+		platform : "所有";"苹果","安卓"   #方便显示
+		cid_type : 'diary';'answer';'question';"everything"...   #方便显示
 		clk : dict
 		imp : dict
 		clk_n : 获取topN点击率时，过滤的点击数
-		result_types : "clk";"imp";"ctr"
+		result_types : sorted by ["clk","imp","ctr"]
 		rtype : list
 		"""
 		topn = []
@@ -61,7 +63,7 @@ class TopFeatures(object):
 					url = "http://m.igengmei.com/diary_book/" + i[i.index('|')+1:] + '/'
 				else:
 					url = "http://m.igengmei.com/{0}/".format(self.cid_type) + i[i.index('|')+1:] + '/'
-				topn.append((self.platform.strip(),i,clk[i],0,0,url))
+				topn.append((platform,i,clk[i],0,0,url))
 			topn.sort(key=lambda x:x[2],reverse=True)
 			return topn[:int(self.top_n)]
 		#获取topN的曝光
@@ -71,7 +73,7 @@ class TopFeatures(object):
 					url = "http://m.igengmei.com/diary_book/" + i[i.index('|')+1:] + '/'
 				else:
 					url = "http://m.igengmei.com/{0}/".format(self.cid_type) + i[i.index('|')+1:] + '/'
-				topn.append((self.platform.strip(),i,0,imp[i],0,url))
+				topn.append((platform,i,0,imp[i],0,url))
 			topn.sort(key=lambda x:x[3],reverse=True)
 			return topn[:int(self.top_n)]
 		#获取topN的ctr
@@ -82,12 +84,13 @@ class TopFeatures(object):
 						url = "http://m.igengmei.com/diary_book/" + i[i.index('|')+1:] + '/'
 					else:
 						url = "http://m.igengmei.com/{0}/".format(self.cid_type) + i[i.index('|')+1:] + '/'
-					topn.append((self.platform.strip(),i,clk[i],imp[i],round(clk[i]/imp[i],4),url))
+					topn.append((platform,i,clk[i],imp[i],round(clk[i]/imp[i],4),url))
 			topn.sort(key=lambda x:x[4],reverse=True)
 			return topn[:int(self.top_n)]

 	def result2file(self, result_lst, fpath):
 		"""
+		cid_type : 'diary';'answer';'question';"everything"...   #方便显示
 		result_lst : [all_result,ios_result,android_result]
 		fpath : output filename
 		rtype : none
@@ -117,62 +120,65 @@ def main():
 	top_diary_all = TopFeatures(1, "all", "diary", 100)
 	clk_diary_times_all = top_diary_all.get_click_times()
 	imp_diary_times_all = top_diary_all.get_impression_times()
-	clk_diary_ctr_all = top_diary_all.get_result(clk_diary_times_all, imp_diary_times_all, 2, "ctr")
+	clk_diary_ctr_all = top_diary_all.get_result("所有",clk_diary_times_all, imp_diary_times_all, 4, "ctr")

 	top_diary_ios = TopFeatures(1, "ios", "diary", 100)
 	clk_diary_times_ios = top_diary_ios.get_click_times()
 	imp_diary_times_ios = top_diary_ios.get_impression_times()
-	clk_diary_ctr_ios = top_diary_ios.get_result(clk_diary_times_ios, imp_diary_times_ios, 2, "ctr")
+	clk_diary_ctr_ios = top_diary_ios.get_result("苹果",clk_diary_times_ios, imp_diary_times_ios, 4, "ctr")

 	top_diary_android = TopFeatures(1, "android", "diary", 100)
 	clk_diary_times_android = top_diary_android.get_click_times()
 	imp_diary_times_android = top_diary_android.get_impression_times()
-	clk_diary_ctr_android = top_diary_android.get_result(clk_diary_times_android, imp_diary_times_android, 2, "ctr")
+	clk_diary_ctr_android = top_diary_android.get_result("安卓",clk_diary_times_android, imp_diary_times_android, 4, "ctr")

 	result_lst = [clk_diary_ctr_all, clk_diary_ctr_ios, clk_diary_ctr_android]
 	output_path = DIRECTORY_PATH + "top100_ctr_diary_{}.txt".format(get_yesterday_date())
 	top_diary_all.result2file(result_lst, output_path)
+	print("已获取 Top diary 特征")

 	#2. Top answer
 	top_answer_all = TopFeatures(1, "all", "answer", 100)
 	clk_answer_times_all = top_answer_all.get_click_times()
 	imp_answer_times_all = top_answer_all.get_impression_times()
-	clk_answer_ctr_all = top_answer_all.get_result(clk_answer_times_all, imp_answer_times_all, 2, "ctr")
+	clk_answer_ctr_all = top_answer_all.get_result("所有",clk_answer_times_all, imp_answer_times_all, 2, "ctr")

 	top_answer_ios = TopFeatures(1, "ios", "answer", 100)
 	clk_answer_times_ios = top_answer_ios.get_click_times()
 	imp_answer_times_ios = top_answer_ios.get_impression_times()
-	clk_answer_ctr_ios = top_answer_ios.get_result(clk_answer_times_ios, imp_answer_times_ios, 2, "ctr")
+	clk_answer_ctr_ios = top_answer_ios.get_result("苹果",clk_answer_times_ios, imp_answer_times_ios, 2, "ctr")

 	top_answer_android = TopFeatures(1, "android", "answer", 100)
 	clk_answer_times_android = top_answer_android.get_click_times()
 	imp_answer_times_android = top_answer_android.get_impression_times()
-	clk_answer_ctr_android = top_answer_android.get_result(clk_answer_times_android, imp_answer_times_android, 2, "ctr")
+	clk_answer_ctr_android = top_answer_android.get_result("安卓",clk_answer_times_android, imp_answer_times_android, 2, "ctr")

 	result_lst = [clk_answer_ctr_all, clk_answer_ctr_ios, clk_answer_ctr_android]
 	output_path = DIRECTORY_PATH + "top100_ctr_answer_{}.txt".format(get_yesterday_date())
 	top_answer_all.result2file(result_lst, output_path)
+	print("已获取 Top answer 特征")


 	#3. Top question
 	top_question_all = TopFeatures(1, "all", "question", 100)
 	clk_question_times_all = top_question_all.get_click_times()
 	imp_question_times_all = top_question_all.get_impression_times()
-	clk_question_ctr_all = top_question_all.get_result(clk_question_times_all, imp_question_times_all, 2, "ctr")
+	clk_question_ctr_all = top_question_all.get_result("所有",clk_question_times_all, imp_question_times_all, 2, "ctr")

 	top_question_ios = TopFeatures(1, "ios", "question", 100)
 	clk_question_times_ios = top_question_ios.get_click_times()
 	imp_question_times_ios = top_question_ios.get_impression_times()
-	clk_question_ctr_ios = top_question_ios.get_result(clk_question_times_ios, imp_question_times_ios, 2, "ctr")
+	clk_question_ctr_ios = top_question_ios.get_result("苹果",clk_question_times_ios, imp_question_times_ios, 2, "ctr")

 	top_question_android = TopFeatures(1, "android", "question", 100)
 	clk_question_times_android = top_question_android.get_click_times()
 	imp_question_times_android = top_question_android.get_impression_times()
-	clk_question_ctr_android = top_question_android.get_result(clk_question_times_android, imp_question_times_android, 2, "ctr")
+	clk_question_ctr_android = top_question_android.get_result("安卓",clk_question_times_android, imp_question_times_android, 2, "ctr")

 	result_lst = [clk_question_ctr_all, clk_question_ctr_ios, clk_question_ctr_android]
 	output_path = DIRECTORY_PATH + "top100_ctr_question_{}.txt".format(get_yesterday_date())
 	top_question_all.result2file(result_lst, output_path)
+	print("已获取 Top question 特征")


 if __name__ == '__main__':