Commit ab758003 authored by litaolemo

update

parent 227576f9
...@@ -4,18 +4,6 @@ ...@@ -4,18 +4,6 @@
# @email : litao@igengmei.com # @email : litao@igengmei.com
# @author : litao # @author : litao
# -*- coding:UTF-8 -*-
# @Time : 2020/9/14 14:53
# @File : meigou_huidu_huisu.py
# @email : litao@igengmei.com
# @author : litao
# -*- coding:UTF-8 -*-
# @Time : 2020/9/4 17:07
# @File : search_meigou_ctr.py
# @email : litao@igengmei.com
# @author : litao
import hashlib import hashlib
import json import json
...@@ -87,7 +75,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso ...@@ -87,7 +75,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'") spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")
task_list = [] task_list = []
task_days = 50 task_days = 5
for t in range(1, task_days): for t in range(1, task_days):
day_num = 0 - t day_num = 0 - t
now = (datetime.datetime.now() + datetime.timedelta(days=day_num)) now = (datetime.datetime.now() + datetime.timedelta(days=day_num))
......
...@@ -73,7 +73,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso ...@@ -73,7 +73,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'") spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")
task_list = [] task_list = []
task_days = 3 task_days = 60
for t in range(1, task_days): for t in range(1, task_days):
day_num = 0 - t day_num = 0 - t
now = (datetime.datetime.now() + datetime.timedelta(days=day_num)) now = (datetime.datetime.now() + datetime.timedelta(days=day_num))
...@@ -83,157 +83,133 @@ for t in range(1, task_days): ...@@ -83,157 +83,133 @@ for t in range(1, task_days):
one_week_age_str = (now + datetime.timedelta(days=-7)).strftime("%Y%m%d") one_week_age_str = (now + datetime.timedelta(days=-7)).strftime("%Y%m%d")
sql_search_ctr = r""" sql_search_ctr = r"""
select D.ACTIVE_TYPE,D.DEVICE_OS_TYPE,sum(T.CLICK_NUM) as CLICK_NUM,sum(C.EXPOSURE) as EXPOSURE from SELECT
exp.partition_date as partition_date
(SELECT T.DEVICE_ID as DEVICE_ID, --设备ID ,active_type
T.CARD_ID as CARD_ID, --卡片ID ,device_os_type
COUNT(T.CARD_ID) AS EXPOSURE --点击次数 ,sum(service_exp_pv) as service_exp_pv
FROM ML.MID_ML_C_ET_PE_PRECISEEXPOSURE_DIMEN_D T ,sum(neirong_exp_pv) as neirong_exp_pv
WHERE T.PARTITION_DAY = '{partition_day}' ,sum(service_click_pv) as service_click_pv
AND T.PAGE_CODE = 'search_result_diary' ,sum(neirong_click_pv) as neirong_click_pv
GROUP BY T.DEVICE_ID, FROM
T.CARD_ID) C (
left join SELECT t1.partition_day as partition_date,device_id
(SELECT T.DEVICE_ID, --设备ID ,service_exp_pv,neirong_exp_pv,service_click_pv,neirong_click_pv
T.CARD_ID, --卡片ID FROM
SUM(T.CLICK_NUM) AS CLICK_NUM --点击次数 (--搜索结果页卡片精准曝光
FROM ML.ML_C_ET_CK_CLICK_DIMEN_D T SELECT partition_day,
WHERE T.PARTITION_DAY = '{partition_day}' device_id,
AND T.PAGE_CODE = 'search_result_diary' count(CASE WHEN card_content_type='service' THEN 1 END) as service_exp_pv,
AND T.ACTION IN ('search_result_click_infomation_item','search_result_more_diary_click_item','on_click_card','on_click_diary_card') count(CASE WHEN card_content_type<>'service' THEN 1 END) as neirong_exp_pv
GROUP BY T.DEVICE_ID, FROM
T.CARD_ID) T (
on C.DEVICE_ID=T.DEVICE_ID and C.CARD_ID = T.CARD_ID SELECT device_id,partition_day,card_content_type
LEFT JOIN FROM ml.mid_ml_c_et_pe_preciseexposure_dimen_d
( WHERE partition_day >= '{partition_day}'
SELECT T.DEVICE_ID, and partition_day < '{end_date}'
T.DEVICE_OS_TYPE, and action in ('page_precise_exposure','home_choiceness_card_exposure')
T.ACTIVE_TYPE and is_exposure = '1'
FROM ML.ML_C_CT_DV_DEVICE_DIMEN_D T and page_code in ('search_result_diary','search_result_doctor','search_result_hospital','search_result_more'
WHERE T.PARTITION_DAY = '{partition_day}' ,'search_result_more_infomation','search_result_more_user','search_result_post','search_result_welfare'
AND T.ACTIVE_TYPE IN ('1', '2', '4')) ,'search_result_wiki','search_result_question_answer')
D on C.DEVICE_ID = D.DEVICE_ID AND card_content_type IN ('diary')
LEFT JOIN )a
( group by partition_day,card_content_type,device_id
SELECT DISTINCT device_id )t1
FROM ml.ml_d_ct_dv_devicespam_d --去除机构刷单设备,即作弊设备(浏览和曝光事件去除)
WHERE partition_day='{partition_day}' LEFT JOIN
(--搜索结果页卡片点击
UNION ALL SELECT cl_id,partition_date
SELECT DISTINCT device_id ,sum(CASE WHEN card_content_type='service' THEN click_pv END) as service_click_pv
FROM dim.dim_device_user_staff --去除内网用户 ,sum(CASE WHEN card_content_type='neirong' THEN click_pv END) as neirong_click_pv
)spam_pv FROM
on spam_pv.device_id=T.DEVICE_ID (
LEFT JOIN SELECT partition_date,cl_id,'service' as card_content_type,count(1) as click_pv
( FROM online.bl_hdfs_maidian_updates
SELECT partition_date,device_id WHERE partition_date >= '{partition_day}'
FROM AND partition_date < '{end_date}'
(--找出user_id当天活跃的第一个设备id AND ((action in ('search_result_click_recommend_item','search_result_welfare_click_item')
SELECT user_id,partition_date, AND page_name in ('search_result_more','search_result_welfare'))
if(size(device_list) > 0, device_list [ 0 ], '') AS device_id or (action = 'goto_welfare_detail' AND params ['from'] = 'search_result_welfare_recommend')
FROM online.ml_user_updates or (action = 'on_click_card' AND params['card_content_type'] in ('service') AND page_name in ('search_result_more','search_result_welfare')))
WHERE partition_date>='{partition_day}' AND partition_date<'{end_date}' GROUP BY partition_date,cl_id,'service'
)t1
JOIN UNION ALL
( --医生账号 SELECT partition_date,cl_id,'neirong' as card_content_type,count(1) as click_pv
SELECT distinct user_id FROM online.bl_hdfs_maidian_updates
FROM online.tl_hdfs_doctor_view WHERE partition_date >= '{partition_day}'
WHERE partition_date = '{partition_day}' AND partition_date < '{end_date}'
AND ((action in ('on_click_topic_card','on_click_diary_card','search_result_click_infomation_item')
--马甲账号/模特用户 AND page_name in ('search_result_more','search_result_diary','search_result_post'))
UNION ALL or (action = 'on_click_card' AND params['card_content_type'] in ('answer','diary') AND page_name in ('search_result_more','search_result_diary','search_result_question_answer')))
SELECT user_id GROUP BY partition_date,cl_id,'neirong'
FROM ml.ml_c_ct_ui_user_dimen_d )t2
WHERE partition_day = '{partition_day}' GROUP BY cl_id,partition_date
AND (is_puppet = 'true' or is_classifyuser = 'true') )t2
ON t1.partition_day=t2.partition_date AND t1.device_id=t2.cl_id
UNION ALL )exp
--公司内网覆盖用户
select distinct user_id JOIN
from dim.dim_device_user_staff (
SELECT partition_date,device_id,t2.active_type,t2.channel,t2.device_os_type
UNION ALL FROM
--登陆过医生设备 (
SELECT distinct t1.user_id SELECT
FROM partition_date,m.device_id
( ,array(device_os_type ,'合计') as device_os_type
SELECT user_id, v.device_id as device_id ,array(case WHEN active_type = '4' THEN '老活'
FROM online.ml_user_history_detail WHEN active_type in ('1','2') then '新增' END ,'合计') as active_type
LATERAL VIEW EXPLODE(device_history_list) v AS device_id ,array(CASE WHEN is_ai_channel = 'true' THEN 'AI' ELSE '其他' END , '合计') as channel
WHERE partition_date = '{partition_day}' FROM online.ml_device_day_active_status m
)t1 LEFT JOIN
JOIN (SELECT code,is_ai_channel,partition_day
( FROM DIM.DIM_AI_CHANNEL_ZP_NEW
SELECT device_id WHERE partition_day>= '{partition_day}' AND partition_day < '{end_date}' ) tmp
FROM online.ml_device_history_detail ON m.partition_date=tmp.partition_day AND first_channel_source_type=code
WHERE partition_date = '{partition_day}' where partition_date >= '{partition_day}'
AND is_login_doctor = '1' AND partition_date < '{end_date}'
)t2 AND active_type in ('1','2','4')
ON t1.device_id = t2.device_id ) mas
)t2 LATERAL VIEW explode(mas.channel) t2 AS channel
on t1.user_id=t2.user_id LATERAL VIEW explode(mas.device_os_type) t2 AS device_os_type
group by partition_date,device_id LATERAL VIEW explode(mas.active_type) t2 AS active_type
)dev )dev_channel
on T.DEVICE_ID=dev.device_id on dev_channel.device_id = exp.device_id
WHERE (spam_pv.device_id IS NULL or spam_pv.device_id = '') AND dev_channel.partition_date = exp.partition_date
and (dev.device_id is null or dev.device_id='') GROUP BY exp.partition_date,active_type,device_os_type
""".format(partition_day=yesterday_str, end_date=today_str)
GROUP by D.DEVICE_OS_TYPE,
D.ACTIVE_TYPE
""".format(partition_day=yesterday_str, end_date=today_str)
print(sql_search_ctr) print(sql_search_ctr)
search_ctr_df = spark.sql(sql_search_ctr) search_ctr_df = spark.sql(sql_search_ctr)
# spam_pv_df.createOrReplaceTempView("dev_view") # spam_pv_df.createOrReplaceTempView("dev_view")
search_ctr_df.show(1) search_ctr_df.show(1)
sql_res = search_ctr_df.collect() sql_res = search_ctr_df.collect()
res_dict = {
"新增": {
"ios": {"click_num": 0, "exposure": 0},
"android": {"click_num": 0, "exposure": 0}
},
"老活": {
"ios": {"click_num": 0, "exposure": 0},
"android": {"click_num": 0, "exposure": 0}
}
}
print("-------------------------------") print("-------------------------------")
db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='aqpuBLYzEV7tML5RPsN1pntUzFy',
db='jerry_prod')
cursor = db.cursor()
for res in sql_res: for res in sql_res:
print(res) print(res)
if res.ACTIVE_TYPE: device_os_type = res.device_os_type
if res.ACTIVE_TYPE in ('1', '2'): active_type = res.active_type
if res.CLICK_NUM: partition_date = yesterday_str
res_dict["新增"][res.DEVICE_OS_TYPE]["click_num"] += res.CLICK_NUM pid = hashlib.md5((partition_date + device_os_type + active_type).encode("utf8")).hexdigest()
if res.EXPOSURE: click_num = res.neirong_click_pv
res_dict["新增"][res.DEVICE_OS_TYPE]["exposure"] += res.EXPOSURE exposure = res.neirong_exp_pv
else: try:
if res.CLICK_NUM: search_ctr = round(click_num / exposure, 5)
res_dict["老活"][res.DEVICE_OS_TYPE]["click_num"] += res.CLICK_NUM except:
if res.EXPOSURE: search_ctr = 0
res_dict["老活"][res.DEVICE_OS_TYPE]["exposure"] += res.EXPOSURE instert_sql = """replace into search_diary_ctr(
for active_type in res_dict:
for device_os_type in res_dict[active_type]:
partition_date = yesterday_str
pid = hashlib.md5((partition_date + device_os_type + active_type).encode("utf8")).hexdigest()
click_num = res_dict[active_type][device_os_type]["click_num"]
exposure = res_dict[active_type][device_os_type]["exposure"]
try:
search_ctr = round(click_num / exposure, 5)
except:
search_ctr = 0
instert_sql = """replace into search_diary_ctr(
partition_date,device_os_type,active_type,pid,click_num,exposure,search_ctr) VALUES('{partition_date}','{device_os_type}','{active_type}','{pid}',{click_num},{exposure},{search_ctr});""".format( partition_date,device_os_type,active_type,pid,click_num,exposure,search_ctr) VALUES('{partition_date}','{device_os_type}','{active_type}','{pid}',{click_num},{exposure},{search_ctr});""".format(
partition_date=partition_date, device_os_type=device_os_type, active_type=active_type, pid=pid, click_num=click_num, partition_date=partition_date, device_os_type=device_os_type, active_type=active_type, pid=pid,
exposure=exposure, search_ctr=search_ctr click_num=click_num,
) exposure=exposure, search_ctr=search_ctr
print(instert_sql) )
# cursor.execute("set names 'UTF8'") print(instert_sql)
res = cursor.execute(instert_sql) # cursor.execute("set names 'UTF8'")
db.commit() db = pymysql.connect(host='172.16.40.158', port=4000, user='st_user', passwd='aqpuBLYzEV7tML5RPsN1pntUzFy',
print(res) db='jerry_prod')
# cursor.executemany() cursor = db.cursor()
db.close() res = cursor.execute(instert_sql)
db.commit()
print(res)
\ No newline at end of file
...@@ -73,7 +73,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso ...@@ -73,7 +73,7 @@ spark.sql("CREATE TEMPORARY FUNCTION is_json AS 'com.gmei.hive.common.udf.UDFJso
spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'") spark.sql("CREATE TEMPORARY FUNCTION arrayMerge AS 'com.gmei.hive.common.udf.UDFArryMerge'")
task_list = [] task_list = []
task_days = 60 task_days = 3
for t in range(1, task_days): for t in range(1, task_days):
day_num = 0 - t day_num = 0 - t
now = (datetime.datetime.now() + datetime.timedelta(days=day_num)) now = (datetime.datetime.now() + datetime.timedelta(days=day_num))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment