Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

modify add_data_to_redis

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline
modify add_data_to_redis
Commit b20353fd · 张彦钊 · parents: 5b36cb84, a6f24bd0
Commit b20353fd authored Aug 09, 2018 by 张彦钊
5 changed files
--- a/eda/recommended_indexs/code/getClickTimes2CountUid.py
+++ b/eda/recommended_indexs/code/getClickTimes2CountUid.py
@@ -43,11 +43,11 @@ def df2file(df,fpath):


 def main():
-	print("2.开始获取用户点击次数表...")
+	print("2.开始获取Top特征...")
 	output_path = "/data2/models/eda/recommended_indexs/2click_times_to_count_uid_%s.txt" % get_yesterday_date()
 	df = get_click_times_to_count_uid_df()
 	df2file(df,output_path)
-	print("获取完成")
+	print("2.1已将用户点击次数分布存入文件")




--- a/eda/recommended_indexs/code/getRate.py
+++ b/eda/recommended_indexs/code/getRate.py
@@ -16,7 +16,24 @@ def get_yesterday_date():
 def result2file(fpath):
 	with open(fpath,'w') as f:
 		tplt = "{0:\u3000<6}\t{1:\u3000<15}\t{2:\u3000<15}\t{3:\u3000<15}\n"
-		f.write("#注意：以下数据都是首页的\n")
+	    line = """内容概览：以下所有数据都是首页的
+1. 比例特征
+    1.1 answer曝光占比(=answer被曝光数/总cid被曝光数)
+    1.2 活跃用户点击率(=活跃用户点击次数/活跃用户曝光次数)
+    1.3 点击answer用户占比(=点击answer用户数/曝光answer用户数)
+    1.4 点击diary用户占比(=点击diary用户数/曝光diary用户数)
+    1.5 无点击用户占比(=无点击用户数/有曝光用户数)
+2.Top特征
+    2.1 用户点击次数分布(第一列：用户点击次数；第二列：独立用户数量)
+    2.2 Top 100 diary(sorted by ctr)
+    2.3 Top 100 Answer(sorted by ctr)
+    2.4 Top 100 Question(sorted by click times)
+ 
+
+
+具体内容：以下所有数据都是首页的
+    """
+    	f.write(line)
 		f.write("#1. 比例特征\n")
 		f.write("=================================================================\n")
 		f.write("#1.1answer曝光占比(=answer被曝光数/总cid被曝光数)\n")
@@ -84,7 +101,7 @@ def main():
 	output_path = "/data2/models/eda/recommended_indexs/1rate_features_%s.txt" % get_yesterday_date()
 	print("开始获取比例特征...")
 	result2file(output_path)
-	print("已将所有比例特征存入文件")
+	print("已完成所有比例特征提取")


 if __name__ == '__main__':

--- a/eda/recommended_indexs/code/getTop100Answer.py
+++ b/eda/recommended_indexs/code/getTop100Answer.py
@@ -107,21 +107,17 @@ def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_
    return android_top100_answer_rate_by_ctr[:100] if len(android_top100_answer_rate_by_ctr) > 100 else android_top100_answer_rate_by_ctr

 if __name__ == "__main__":
-	print("开始获取top100点击率的answer...")
 	all_answer_count_by_click = get_all_answer_count_by_click()
 	all_answer_count_by_imp = get_all_answer_count_by_imp()
 	all_top100_answer_rate_by_ctr = get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp)
-	print("4.1已获得所有平台的top100点击率的answer")

 	ios_answer_count_by_click = get_ios_answer_count_by_click()
 	ios_answer_count_by_imp = get_ios_answer_count_by_imp()
 	ios_top100_answer_rate_by_ctr = get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp)
-	print("4.2已获得ios平台的top100点击率的answer")

 	android_answer_count_by_click = get_android_answer_rate_by_click()
 	android_answer_count_by_imp = get_android_answer_rate_by_imp()
 	android_top100_answer_rate_by_ctr = get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp)
-	print("4.3已获得安卓平台的top100点击率的answer")

 	result_lst = [all_top100_answer_rate_by_ctr,ios_top100_answer_rate_by_ctr,android_top100_answer_rate_by_ctr]
 	today = datetime.date.today()
@@ -129,4 +125,4 @@ if __name__ == "__main__":
 	yesterday = yesterday.strftime("%Y%m%d")
 	output_path = "/data2/models/eda/recommended_indexs/4top100_ctr_answer_%s.txt" % yesterday
 	result2file(result_lst,output_path)
-	print("已将top100点击率的answer存入文件")
+	print("2.3已将top100点击率的answer存入文件")
--- a/eda/recommended_indexs/code/getTop100Diary.py
+++ b/eda/recommended_indexs/code/getTop100Diary.py
@@ -106,21 +106,17 @@ def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,and
    return android_top100_diary_rate_by_ctr[:100] if len(android_top100_diary_rate_by_ctr) > 100 else android_top100_diary_rate_by_ctr

 if __name__ == "__main__":
-	print("开始获取top100点击率的diary...")
 	all_diary_count_by_click = get_all_diary_count_by_click()
 	all_diary_count_by_imp = get_all_diary_count_by_imp()
 	all_top100_diary_rate_by_ctr = get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp)
-	print("3.1已获得所有平台的top100点击率的diary")

 	ios_diary_count_by_click = get_ios_diary_count_by_click()
 	ios_diary_count_by_imp = get_ios_diary_count_by_imp()
 	ios_top100_diary_rate_by_ctr = get_ios_top100_diary_rate_by_ctr(ios_diary_count_by_click,ios_diary_count_by_imp)
-	print("3.2已获得ios平台的top100点击率的diary")

 	android_diary_count_by_click = get_android_diary_rate_by_click()
 	android_diary_count_by_imp = get_android_diary_rate_by_imp()
 	android_top100_diary_rate_by_ctr = get_android_top100_diary_rate_by_ctr(android_diary_count_by_click,android_diary_count_by_imp)
-	print("3.3已获得安卓平台的top100点击率的diary")

 	result_lst = [all_top100_diary_rate_by_ctr,ios_top100_diary_rate_by_ctr,android_top100_diary_rate_by_ctr]
 	today = datetime.date.today()
@@ -128,4 +124,4 @@ if __name__ == "__main__":
 	yesterday = yesterday.strftime("%Y%m%d")
 	output_path = "/data2/models/eda/recommended_indexs/3top100_ctr_diary_%s.txt" % yesterday
 	result2file(result_lst,output_path)
-	print("已将top100点击率的diary存入文件")
+	print("2.2已将top100点击率的diary存入文件")
--- a/eda/recommended_indexs/code/getTop100Question.py
+++ b/eda/recommended_indexs/code/getTop100Question.py
@@ -21,7 +21,7 @@ def tuple2dict(tuple_result):
 def result2file(result_lst,fpath):
 	with open(fpath,'w') as f:
 		tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
-		f.write("Top 100 Question\n")
+		f.write("Top 100 Question(曝光表里cid类型没有question，因此下面的曝光数和点击率都为0)\n")
 		f.write("=================================================================\n")
 		f.write(tplt.format("平台","question_id","点击数","曝光数","点击率","question链接"))
 		for i in result_lst:
@@ -128,21 +128,17 @@ def get_android_top100_question_rate_by_ctr(android_question_count_by_click,andr
 	    return android_top100_question_rate_by_ctr[:100] if len(android_top100_question_rate_by_ctr) > 100 else android_top100_question_rate_by_ctr

 if __name__ == "__main__":
-	print("开始获取top100点击率的question...")
 	all_question_count_by_click = get_all_question_count_by_click()
 	all_question_count_by_imp = get_all_question_count_by_imp()
 	all_top100_question_rate_by_ctr = get_all_top100_question_rate_by_ctr(all_question_count_by_click,all_question_count_by_imp)
-	print("5.1已获得所有平台的top100点击率的question")

 	ios_question_count_by_click = get_ios_question_count_by_click()
 	ios_question_count_by_imp = get_ios_question_count_by_imp()
 	ios_top100_question_rate_by_ctr = get_ios_top100_question_rate_by_ctr(ios_question_count_by_click,ios_question_count_by_imp)
-	print("5.2已获得ios平台的top100点击率的question")

 	android_question_count_by_click = get_android_question_rate_by_click()
 	android_question_count_by_imp = get_android_question_rate_by_imp()
 	android_top100_question_rate_by_ctr = get_android_top100_question_rate_by_ctr(android_question_count_by_click,android_question_count_by_imp)
-	print("5.3已获得安卓平台的top100点击率的question")

 	result_lst = [all_top100_question_rate_by_ctr,ios_top100_question_rate_by_ctr,android_top100_question_rate_by_ctr]
 	today = datetime.date.today()
@@ -150,4 +146,5 @@ if __name__ == "__main__":
 	yesterday = yesterday.strftime("%Y%m%d")
 	output_path = "/data2/models/eda/recommended_indexs/5top100_ctr_question_%s.txt" % yesterday
 	result2file(result_lst,output_path)
-	print("已将top100点击率的question存入文件")
+	print("2.4已将top100点击率的question存入文件")
+	print("已完成所有Top特征提取")