# Patch summary (reviewer commentary; text before the "diff --git" header is not part of the patch).
#
# Target: task/tractate_analysis_in_7000.py, the SQL template assigned to `huidu_device_id_sql`.
#
# Hunk 1 (-107,9 +107,8):
#   * The outer projection changes from `t2.device_id` to `t1.device_id`.
#   * The first derived table now selects `distinct(first_device)` instead of `first_device`,
#     and gains the extra predicate `last_active_date >= {last_30_day_str}`.
#   * One blank line after that derived table is removed.
# Hunk 2 (-176,7 +175,7):
#   * The `.format(...)` call additionally passes `last_30_day_str` to fill the new placeholder.
#
# NOTE(review): the derived table is still aliased `t2`, and the only other aliases visible in
#   these hunks are `spam_pv` and `dev`. No outer-level `t1` is visible here -- confirm that
#   `t1` resolves in the unshown part of the query (file lines ~116-175); otherwise the outer
#   select presumably should remain `t2.device_id`.
# NOTE(review): `last_30_day_str` is not defined in the visible context -- confirm it is assigned
#   before this statement runs, or `.format` will raise NameError at that line.
# NOTE(review): `{today_str}` and `{last_30_day_str}` are substituted into the SQL unquoted;
#   presumably both are digit-only date strings like the "%Y%m%d" value in the hunk header --
#   TODO confirm the comparison types against `partition_date` / `last_active_date`.
# NOTE(review): this copy of the patch has its line breaks collapsed onto a single line; it must
#   be restored to one diff line per physical line before `git apply` / `patch` can consume it.
diff --git a/task/tractate_analysis_in_7000.py b/task/tractate_analysis_in_7000.py index 5f1862f88c16dbf259969ee5e83fa8ee3d555cf0..c028b776328acbdda5f7895b04e8ae299fa62caf 100644 --- a/task/tractate_analysis_in_7000.py +++ b/task/tractate_analysis_in_7000.py @@ -107,9 +107,8 @@ one_week_age_str = (now + datetime.timedelta(days=-7)).strftime("%Y%m%d") device_id_dict = {} huidu_device_id_sql = r""" -select t2.device_id from -(select first_device as device_id from online.ml_user_history_detail where partition_date = {today_str} and substr(md5(first_device),-1) in ('8', '9', 'a', 'b')) t2 - +select t1.device_id from +(select distinct(first_device) as device_id from online.ml_user_history_detail where partition_date = {today_str} and substr(md5(first_device),-1) in ('8', '9', 'a', 'b') and last_active_date >= {last_30_day_str}) t2 LEFT JOIN ( select distinct device_id @@ -176,7 +175,7 @@ select t2.device_id from WHERE spam_pv.device_id IS NULL and dev.device_id is null -""".format(today_str=today_str) +""".format(today_str=today_str,last_30_day_str=last_30_day_str) print(huidu_device_id_sql) huidu_device_id_df = spark.sql(huidu_device_id_sql)