Merge remote-tracking branch 'origin/master'

2c37208a · 赵晨 · ed2cb21f · 4fd9eed9 · 2c37208a
Commit 2c37208a authored Oct 15, 2018 by 赵晨
Hide whitespace changes
Inline Side-by-side

Showing with 133 additions and 63 deletions

node2vec_ctr.py eda/gray_stat/node2vec_ctr.py +133 -63

No files found.
--- a/eda/gray_stat/node2vec_ctr.py
+++ b/eda/gray_stat/node2vec_ctr.py
 # -*- coding: UTF-8 -*-
-import pymysql
+from utils import con_sql,get_yesterday_date,get_between_day
-import datetime
+import time
-import pandas as pd
+OUTPUT_PATH = "/data2/models/eda/node2vec/"
-DIRECTORY_PATH="/data2/ffm/"
-def get_yesterday_date():
+class GrayStat(object):
-    #自动获取昨天的日期,如"2018-08-08"
+	def __init__(self, cid_type, uid_type, ndays=None):
-    """
+		"""
-    :rtype : str
+		cid_type : content type to count, e.g. diary, answer, question
-    """
+		uid_type : characters placed inside the regexp '[...]$' matched against device_id: '8' = ends with 8; '6' = ends with 6; '6|8' = ends with 6 or 8; '^68' = does not end with 6 or 8
-    today = datetime.date.today()
+		ndays : stat_date string such as '2018-08-30'; defaults to get_yesterday_date() evaluated when the instance is created
-    yesterday = today - datetime.timedelta(days=1)
+		"""
-    yesterday = yesterday.strftime("%Y-%m-%d")
+		self.cid_type = cid_type
-    print(yesterday)
+		self.uid_type = uid_type
-    return yesterday
+		# Resolve the default per instance: a call placed in the signature runs only
+		# once, when the class is defined, and goes stale in a long-lived process.
+		self.ndays = get_yesterday_date() if ndays is None else ndays
-    #today = datetime.date.today().strftime("%Y%m%d")
-    #return today
+	def get_uid_count(self):
+		sql = "select count(distinct(device_id)) from data_feed_click \
+			where stat_date='{0}' \
-def get_data():
+			and (cid_type='{1}' or cid_type='diary_video') \
-    conn2db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
+			and device_id regexp '[{2}]$' \
-    cursor = conn2db.cursor()
+			and device_id in \
-    sql = "select device_id from nd_device_cid_similarity_matrix where device_id regexp '[3|4]$'"
+				(select device_id \
-    cursor.execute(sql)
+				from nd_device_cid_similarity_matrix_tmp \
-    result = cursor.fetchall()
+				where stat_date='{0}') \
-    device_id = tuple(pd.DataFrame(list(result))[0].values.tolist())
+			and device_id not in (select distinct(device_id) from jerry_test.bl_device_list) \
-    cursor.close()
+			and device_id not in (select distinct(device_id) from jerry_prod.blacklist)".format(self.ndays,\
-    return device_id
+				self.cid_type,self.uid_type)
+		uid_count = con_sql(sql)[0][0]
+		return uid_count
-def ctr(date):
-    device_id = get_data()
+	def get_uid_clk_times(self):
-    sql_click = "select count(cid) from data_feed_click " \
+		sql = "select count(device_id) from data_feed_click \
-              "where cid_type = 'diary' " \
+			where stat_date='{0}' \
-                "and stat_date = '{}' and device_id in {};".format(date,device_id)
+			and (cid_type='{1}' or cid_type='diary_video') \
-    db = pymysql.connect(host='10.66.157.22', port=4000, user='root', passwd='3SYz54LS9#^9sBvC', db='jerry_prod')
+			and device_id regexp '[{2}]$' \
-    cursor = db.cursor()
+			and device_id in \
-    cursor.execute(sql_click)
+				(select device_id \
-    click = cursor.fetchone()[0]
+				from nd_device_cid_similarity_matrix_tmp \
-    print("点击数："+str(click))
+				where stat_date='{0}') \
-    sql_exp = "select count(cid) from data_feed_exposure " \
+			and device_id not in (select distinct(device_id) from jerry_test.bl_device_list) \
-              "where cid_type = 'diary' and stat_date = '{}' and " \
+			and device_id not in (select distinct(device_id) from jerry_prod.blacklist)".format(self.ndays,\
-              "device_id in {}".format(date,device_id)
+				self.cid_type,self.uid_type)
-    cursor.execute(sql_exp)
+		uid_clk_times = con_sql(sql)[0][0]
-    exp = cursor.fetchone()[0]
+		return uid_clk_times
-    print("曝光数："+str(exp))
-    if exp != 0:
+	def get_uid_imp_times(self):
-        print("点击率："+str(click/exp))
+		sql = "select count(device_id) from data_feed_exposure \
+			where stat_date='{0}' \
-    return click,exp,click/exp
+			and cid_type='{1}' \
+			and device_id regexp '[{2}]$' \
-def rate2file():
+			and device_id in \
-    output_path = DIRECTORY_PATH + "node2vec_ctr.csv"
+				(select device_id \
-    with open(output_path,'a+') as f:
+				from nd_device_cid_similarity_matrix_tmp \
-        line = get_yesterday_date().replace('-', '')+','+str(temp_data[0])+','+str(temp_data[1])+','+str(temp_data[2])+'\n'
+				where stat_date='{0}') \
-        f.write(line)
+			and device_id not in (select distinct(device_id) from jerry_test.bl_device_list) \
+			and device_id not in (select distinct(device_id) from jerry_prod.blacklist)".format(self.ndays,\
-if __name__ == "__main__":
+				self.cid_type,self.uid_type)
-    #ctr(date)
+		uid_imp_times = con_sql(sql)[0][0]
-    date = get_yesterday_date()
+		return uid_imp_times
-    temp_data = ctr(date)
-    rate2file()
+class AllStat(object):
+	"""Daily click/exposure counters for devices whose id matches a suffix filter.
+
+	Each getter runs one COUNT query through ``con_sql`` and returns the first
+	column of the first row. Devices listed in ``jerry_test.bl_device_list`` or
+	``jerry_prod.blacklist`` are always excluded.
+	"""
+
+	def __init__(self, cid_type, uid_type, ndays=None):
+		"""
+		cid_type : content type to count, e.g. diary, answer, question
+		uid_type : characters placed inside the regexp '[...]$' matched against
+			device_id: '8' = ends with 8; '6' = ends with 6; '6|8' = ends with
+			6 or 8; '^68' = does not end with 6 or 8
+		ndays : stat_date string such as '2018-08-30'; defaults to
+			get_yesterday_date() evaluated when the instance is created
+		"""
+		self.cid_type = cid_type
+		self.uid_type = uid_type
+		# Resolve the default per instance: a call placed in the signature runs only
+		# once, when the class is defined, and goes stale in a long-lived process.
+		self.ndays = get_yesterday_date() if ndays is None else ndays
+
+	def _count(self, target, table, with_diary_video):
+		"""Run ``select count(<target>) from <table>`` with the shared filters.
+
+		target : expression placed inside count(), e.g. "device_id"
+		table : table to read from
+		with_diary_video : when true, rows whose cid_type is 'diary_video' are
+			counted in addition to rows matching self.cid_type
+
+		Returns the first column of the first row produced by con_sql.
+		"""
+		cid_filter = "cid_type='{0}'".format(self.cid_type)
+		if with_diary_video:
+			cid_filter = "({0} or cid_type='diary_video')".format(cid_filter)
+		# NOTE(review): values are interpolated straight into the SQL text, so only
+		# trusted values may be passed in; switch to bound parameters if con_sql
+		# supports them.
+		sql = (
+			"select count({target}) from {table} "
+			"where stat_date='{ndays}' "
+			"and {cid_filter} "
+			"and device_id regexp '[{uid_type}]$' "
+			"and device_id not in (select distinct(device_id) from jerry_test.bl_device_list) "
+			"and device_id not in (select distinct(device_id) from jerry_prod.blacklist)"
+		).format(
+			target=target,
+			table=table,
+			ndays=self.ndays,
+			cid_filter=cid_filter,
+			uid_type=self.uid_type,
+		)
+		return con_sql(sql)[0][0]
+
+	def get_uid_count(self):
+		"""Return the number of distinct devices with a click on self.ndays."""
+		return self._count("distinct(device_id)", "data_feed_click", True)
+
+	def get_uid_clk_times(self):
+		"""Return the number of click rows on self.ndays."""
+		return self._count("device_id", "data_feed_click", True)
+
+	def get_uid_imp_times(self):
+		"""Return the number of exposure rows on self.ndays."""
+		# NOTE(review): unlike the click queries, exposures are not widened to
+		# 'diary_video'; kept exactly as before - confirm this is intended.
+		return self._count("device_id", "data_feed_exposure", False)
+def _collect_counts(stat):
+	"""Query a stat object once and return (uid_count, imp_times, clk_times)."""
+	uid_count = stat.get_uid_count()
+	imp_times = stat.get_uid_imp_times()
+	clk_times = stat.get_uid_clk_times()
+	return uid_count, imp_times, clk_times
+
+
+def _safe_ratio(numerator, denominator):
+	"""Return numerator / denominator as a float, or None if the denominator is zero."""
+	if not denominator:
+		return None
+	# float() keeps the division exact under Python 2, where int / int truncates.
+	return float(numerator) / float(denominator)
+
+
+def _as_percent(ratio):
+	"""Format a ratio as a percentage rounded to two decimals; 'NA' when undefined."""
+	if ratio is None:
+		return "NA"
+	return str(round(ratio * 100, 2)) + '%'
+
+
+def main(start_date='2018-10-11', end_date='2018-10-14'):
+	"""Append one tab-separated CTR row per day to OUTPUT_PATH + "ctr.csv".
+
+	start_date, end_date : bounds passed to get_between_day to obtain the dates
+
+	Row layout: date, GrayStat CTR, AllStat CTR, relative difference of the two,
+	then uid count / exposure count / click count for GrayStat and for AllStat.
+	A ratio whose denominator is zero is written as 'NA' instead of raising.
+	"""
+	output = OUTPUT_PATH + "ctr.csv"
+	result = []
+	for my_date in get_between_day(start_date, end_date):
+		print("stat" + "  " + my_date)
+		# Fetch every counter exactly once so the printed counts and the CTRs are
+		# derived from the same values and no query is repeated.
+		g_uid, g_imp, g_clk = _collect_counts(GrayStat("diary", "3|4", my_date))
+		a_uid, a_imp, a_clk = _collect_counts(AllStat("diary", "3|4", my_date))
+		g_ctr = _safe_ratio(g_clk, g_imp)
+		a_ctr = _safe_ratio(a_clk, a_imp)
+		if g_ctr is None or a_ctr is None:
+			growth_rate = None
+		else:
+			growth_rate = _safe_ratio(g_ctr - a_ctr, a_ctr)
+		fields = [
+			my_date,
+			_as_percent(g_ctr),
+			_as_percent(a_ctr),
+			_as_percent(growth_rate),
+			str(g_uid), str(g_imp), str(g_clk),
+			str(a_uid), str(a_imp), str(a_clk),
+		]
+		result.append("\t".join(fields) + "\n")
+	with open(output, "a+") as f:
+		f.writelines(result)
+if __name__ == '__main__':
+	main()
\ No newline at end of file