Commit b20353fd authored by 张彦钊

Merge branch 'master' of git.wanmeizhensuo.com:ML/ffm-baseline

modify add_data_to_redis
parents 5b36cb84 a6f24bd0
......@@ -43,11 +43,11 @@ def df2file(df,fpath):
def main():
print("2.开始获取用户点击次数表...")
print("2.开始获取Top特征...")
output_path = "/data2/models/eda/recommended_indexs/2click_times_to_count_uid_%s.txt" % get_yesterday_date()
df = get_click_times_to_count_uid_df()
df2file(df,output_path)
print("获取完成")
print("2.1已将用户点击次数分布存入文件")
......
......@@ -16,7 +16,24 @@ def get_yesterday_date():
def result2file(fpath):
with open(fpath,'w') as f:
tplt = "{0:\u3000<6}\t{1:\u3000<15}\t{2:\u3000<15}\t{3:\u3000<15}\n"
f.write("#注意:以下数据都是首页的\n")
line = """内容概览:以下所有数据都是首页的
1. 比例特征
1.1 answer曝光占比(=answer被曝光数/总cid被曝光数)
1.2 活跃用户点击率(=活跃用户点击次数/活跃用户曝光次数)
1.3 点击answer用户占比(=点击answer用户数/曝光answer用户数)
1.4 点击diary用户占比(=点击diary用户数/曝光diary用户数)
1.5 无点击用户占比(=无点击用户数/有曝光用户数)
2.Top特征
2.1 用户点击次数分布(第一列:用户点击次数;第二列:独立用户数量)
2.2 Top 100 diary(sorted by ctr)
2.3 Top 100 Answer(sorted by ctr)
2.4 Top 100 Question(sorted by click times)
具体内容:以下所有数据都是首页的
"""
f.write(line)
f.write("#1. 比例特征\n")
f.write("=================================================================\n")
f.write("#1.1answer曝光占比(=answer被曝光数/总cid被曝光数)\n")
......@@ -84,7 +101,7 @@ def main():
output_path = "/data2/models/eda/recommended_indexs/1rate_features_%s.txt" % get_yesterday_date()
print("开始获取比例特征...")
result2file(output_path)
print("已将所有比例特征存入文件")
print("已完成所有比例特征提取")
if __name__ == '__main__':
......
......@@ -107,21 +107,17 @@ def get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_
return android_top100_answer_rate_by_ctr[:100] if len(android_top100_answer_rate_by_ctr) > 100 else android_top100_answer_rate_by_ctr
if __name__ == "__main__":
print("开始获取top100点击率的answer...")
all_answer_count_by_click = get_all_answer_count_by_click()
all_answer_count_by_imp = get_all_answer_count_by_imp()
all_top100_answer_rate_by_ctr = get_all_top100_answer_rate_by_ctr(all_answer_count_by_click,all_answer_count_by_imp)
print("4.1已获得所有平台的top100点击率的answer")
ios_answer_count_by_click = get_ios_answer_count_by_click()
ios_answer_count_by_imp = get_ios_answer_count_by_imp()
ios_top100_answer_rate_by_ctr = get_ios_top100_answer_rate_by_ctr(ios_answer_count_by_click,ios_answer_count_by_imp)
print("4.2已获得ios平台的top100点击率的answer")
android_answer_count_by_click = get_android_answer_rate_by_click()
android_answer_count_by_imp = get_android_answer_rate_by_imp()
android_top100_answer_rate_by_ctr = get_android_top100_answer_rate_by_ctr(android_answer_count_by_click,android_answer_count_by_imp)
print("4.3已获得安卓平台的top100点击率的answer")
result_lst = [all_top100_answer_rate_by_ctr,ios_top100_answer_rate_by_ctr,android_top100_answer_rate_by_ctr]
today = datetime.date.today()
......@@ -129,4 +125,4 @@ if __name__ == "__main__":
yesterday = yesterday.strftime("%Y%m%d")
output_path = "/data2/models/eda/recommended_indexs/4top100_ctr_answer_%s.txt" % yesterday
result2file(result_lst,output_path)
print("已将top100点击率的answer存入文件")
print("2.3已将top100点击率的answer存入文件")
......@@ -106,21 +106,17 @@ def get_android_top100_diary_rate_by_ctr(android_top100_diary_count_by_click,and
return android_top100_diary_rate_by_ctr[:100] if len(android_top100_diary_rate_by_ctr) > 100 else android_top100_diary_rate_by_ctr
if __name__ == "__main__":
print("开始获取top100点击率的diary...")
all_diary_count_by_click = get_all_diary_count_by_click()
all_diary_count_by_imp = get_all_diary_count_by_imp()
all_top100_diary_rate_by_ctr = get_all_top100_diary_rate_by_ctr(all_diary_count_by_click,all_diary_count_by_imp)
print("3.1已获得所有平台的top100点击率的diary")
ios_diary_count_by_click = get_ios_diary_count_by_click()
ios_diary_count_by_imp = get_ios_diary_count_by_imp()
ios_top100_diary_rate_by_ctr = get_ios_top100_diary_rate_by_ctr(ios_diary_count_by_click,ios_diary_count_by_imp)
print("3.2已获得ios平台的top100点击率的diary")
android_diary_count_by_click = get_android_diary_rate_by_click()
android_diary_count_by_imp = get_android_diary_rate_by_imp()
android_top100_diary_rate_by_ctr = get_android_top100_diary_rate_by_ctr(android_diary_count_by_click,android_diary_count_by_imp)
print("3.3已获得安卓平台的top100点击率的diary")
result_lst = [all_top100_diary_rate_by_ctr,ios_top100_diary_rate_by_ctr,android_top100_diary_rate_by_ctr]
today = datetime.date.today()
......@@ -128,4 +124,4 @@ if __name__ == "__main__":
yesterday = yesterday.strftime("%Y%m%d")
output_path = "/data2/models/eda/recommended_indexs/3top100_ctr_diary_%s.txt" % yesterday
result2file(result_lst,output_path)
print("已将top100点击率的diary存入文件")
print("2.2已将top100点击率的diary存入文件")
......@@ -21,7 +21,7 @@ def tuple2dict(tuple_result):
def result2file(result_lst,fpath):
with open(fpath,'w') as f:
tplt = "{0:\u3000<4}\t{1:\u3000<12}\t{2:\u3000^6}\t{3:\u3000^6}\t{4:\u3000<8}\t{5:\u3000^15}\n"
f.write("Top 100 Question\n")
f.write("Top 100 Question(曝光表里cid类型没有question,因此下面的曝光数和点击率都为0)\n")
f.write("=================================================================\n")
f.write(tplt.format("平台","question_id","点击数","曝光数","点击率","question链接"))
for i in result_lst:
......@@ -128,21 +128,17 @@ def get_android_top100_question_rate_by_ctr(android_question_count_by_click,andr
return android_top100_question_rate_by_ctr[:100] if len(android_top100_question_rate_by_ctr) > 100 else android_top100_question_rate_by_ctr
if __name__ == "__main__":
print("开始获取top100点击率的question...")
all_question_count_by_click = get_all_question_count_by_click()
all_question_count_by_imp = get_all_question_count_by_imp()
all_top100_question_rate_by_ctr = get_all_top100_question_rate_by_ctr(all_question_count_by_click,all_question_count_by_imp)
print("5.1已获得所有平台的top100点击率的question")
ios_question_count_by_click = get_ios_question_count_by_click()
ios_question_count_by_imp = get_ios_question_count_by_imp()
ios_top100_question_rate_by_ctr = get_ios_top100_question_rate_by_ctr(ios_question_count_by_click,ios_question_count_by_imp)
print("5.2已获得ios平台的top100点击率的question")
android_question_count_by_click = get_android_question_rate_by_click()
android_question_count_by_imp = get_android_question_rate_by_imp()
android_top100_question_rate_by_ctr = get_android_top100_question_rate_by_ctr(android_question_count_by_click,android_question_count_by_imp)
print("5.3已获得安卓平台的top100点击率的question")
result_lst = [all_top100_question_rate_by_ctr,ios_top100_question_rate_by_ctr,android_top100_question_rate_by_ctr]
today = datetime.date.today()
......@@ -150,4 +146,5 @@ if __name__ == "__main__":
yesterday = yesterday.strftime("%Y%m%d")
output_path = "/data2/models/eda/recommended_indexs/5top100_ctr_question_%s.txt" % yesterday
result2file(result_lst,output_path)
print("已将top100点击率的question存入文件")
print("2.4已将top100点击率的question存入文件")
print("已完成所有Top特征提取")
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment