import os
from collections import defaultdict
from datetime import date, timedelta

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pytispark import pytispark as pti

# All paths are resolved against the directory the process was started from.
base_dir = os.getcwd()
print("base_dir: {}".format(base_dir))
# Directory that holds this script's data files: <base_dir>/_data.
data_dir = os.path.join(base_dir, "_data")


def get_ndays_before_with_format(n, format):
    """Return the date ``n`` days before today, rendered with ``format``.

    Args:
        n: Number of days to go back from ``date.today()``. A negative value
            yields a date in the future.
        format: A ``strftime`` format string, e.g. ``"%Y%m%d"``.

    Returns:
        The formatted date as a string.
    """
    offset = timedelta(days=-n)
    target_day = date.today() + offset
    return target_day.strftime(format)


def get_ndays_before_no_minus(n):
    """Return the date ``n`` days before today as a ``YYYYMMDD`` string."""
    compact_format = "%Y%m%d"  # year, month and day with no separators
    return get_ndays_before_with_format(n, compact_format)


def get_spark(app_name=""):
    """Create a Hive-enabled SparkSession configured for TiSpark.

    The session is obtained through ``SparkSession.builder...getOrCreate()``
    after applying the settings listed below, then a ``TiContext`` is created
    on it and the TiDB database ``jerry_test`` is mapped.

    Args:
        app_name: Application name passed to ``appName``.

    Returns:
        The configured ``SparkSession``.
    """
    # (key, value) pairs applied to the SparkConf in this exact order.
    settings = (
        ("spark.sql.crossJoin.enabled", True),
        ("spark.debug.maxToStringFields", "100"),
        ("spark.tispark.plan.allow_index_double_read", False),
        ("spark.tispark.plan.allow_index_read", True),
        ("spark.hive.mapred.supports.subdirectories", True),
        ("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", True),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        # NOTE(review): the two keys below carry no "spark." prefix, unlike the
        # rest -- confirm they actually reach the Hadoop configuration.
        ("mapreduce.output.fileoutputformat.compress", False),
        ("mapreduce.map.output.compress", False),
    )
    conf = SparkConf()
    for key, value in settings:
        conf.set(key, value)

    builder = SparkSession.builder.config(conf=conf)
    builder = builder.config("spark.sql.extensions", "org.apache.spark.sql.TiExtensions")
    # Address of the TiDB placement driver (PD) that TiSpark connects to.
    builder = builder.config("spark.tispark.pd.addresses", "172.16.40.170:2379")
    spark = builder.appName(app_name).enableHiveSupport().getOrCreate()

    ti = pti.TiContext(spark)
    ti.tidbMapDatabase("jerry_test")
    return spark


def get_tracate_click_data(spark, start, end):
    """Query distinct (partition_date, cl_id, card_id) post-detail page views.

    The query keeps ``page_view`` events on ``user_post_detail`` with
    ``page_stay >= 3`` and an all-digit ``business_id`` (cast to int as
    ``card_id``), joins them to the per-day active-device table while
    excluding a fixed list of channel source types, and then drops:

    * devices listed as spam devices or as staff devices, and
    * the first device of users that are doctors, puppet/classify users,
      staff, or that have logged in on a doctor device.

    Args:
        spark: Object exposing ``sql(query)``; the caller in this file passes
            a SparkSession.
        start: Lower partition-date bound, interpolated into the SQL as a
            quoted string (the caller in this file passes ``YYYYMMDD``).
        end: Upper partition-date bound, same format as ``start``. It is also
            used as the snapshot day for the spam/doctor/puppet/history
            tables.

    Returns:
        Whatever ``spark.sql`` returns for the rendered query.
    """
    # The pattern is dropped into a single-quoted SQL literal. The doubled
    # backslash is presumably there so that SQL string unescaping yields
    # ``\d`` -- confirm against the Spark parser settings.
    digits_only = r"""^\\d+$"""
    query_template = """
        SELECT DISTINCT t1.partition_date, t1.cl_id, cast(t1.business_id as int) card_id
          FROM
          (select partition_date,cl_id,business_id,action,page_name,page_stay
          from online.bl_hdfs_maidian_updates
          where action = 'page_view'
            AND partition_date BETWEEN '{}' AND '{}'
            AND page_name='user_post_detail'
            AND page_stay>=3
            AND cl_id is not null
            AND cl_id != ''
            AND business_id is not null
            AND business_id != ''
            AND business_id rlike '{}'
            ) AS t1
          JOIN
          (select partition_date,active_type,first_channel_source_type,device_id
          from online.ml_device_day_active_status
          where partition_date BETWEEN '{}' AND '{}'
            AND active_type IN ('1', '2', '4')
            AND first_channel_source_type not IN ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
                  ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
                  ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
                  ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
                  ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
                  ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
                  ,'promotion_shike','promotion_julang_jl03','promotion_zuimei')
            AND first_channel_source_type not LIKE 'promotion\\_jf\\_%') as t2
          ON t1.cl_id = t2.device_id
          AND t1.partition_date = t2.partition_date
      LEFT JOIN
    (
        SELECT DISTINCT device_id
        FROM ml.ml_d_ct_dv_devicespam_d  --去除机构刷单设备，即作弊设备（浏览和曝光事件去除）
        WHERE partition_day='{}'

        UNION ALL
        SELECT DISTINCT device_id
        FROM dim.dim_device_user_staff   --去除内网用户
    )spam_pv
    on spam_pv.device_id=t1.cl_id
        LEFT JOIN
    (
        SELECT partition_date,device_id
        FROM
        (--找出user_id当天活跃的第一个设备id
            SELECT user_id,partition_date,
                    if(size(device_list) > 0, device_list [ 0 ], '') AS device_id
              FROM online.ml_user_updates
              WHERE partition_date>='{}' AND partition_date<'{}'
        )t1
        JOIN
        (  --医生账号
            SELECT distinct user_id
            FROM online.tl_hdfs_doctor_view
            WHERE partition_date = '{}'

            --马甲账号/模特用户
            UNION ALL
            SELECT user_id
            FROM ml.ml_c_ct_ui_user_dimen_d
            WHERE partition_day = '{}'
            AND (is_puppet = 'true' or is_classifyuser = 'true')

            UNION ALL
            --公司内网覆盖用户
            select distinct user_id
            from dim.dim_device_user_staff

            UNION ALL
            --登陆过医生设备
            SELECT distinct t1.user_id
            FROM
            (
                SELECT user_id, v.device_id as device_id
                FROM online.ml_user_history_detail
                    LATERAL VIEW EXPLODE(device_history_list) v AS device_id
                WHERE partition_date = '{}'
            )t1
            JOIN
            (
                SELECT device_id
                FROM online.ml_device_history_detail
                WHERE partition_date = '{}'
                AND is_login_doctor = '1'
            )t2
                ON t1.device_id = t2.device_id
        )t2
        on t1.user_id=t2.user_id
        group by partition_date,device_id
    )dev
    on t1.partition_date=dev.partition_date and t1.cl_id=dev.device_id
    WHERE (spam_pv.device_id IS NULL or spam_pv.device_id ='')
    and (dev.device_id is null or dev.device_id ='')
     """
    # Positional values for the twelve ``{}`` slots, in template order.
    params = (
        start, end,     # click events: partition_date BETWEEN start AND end
        digits_only,    # click events: business_id rlike
        start, end,     # active devices: partition_date BETWEEN start AND end
        end,            # spam devices: partition_day snapshot
        # NOTE(review): this range is end-exclusive (``<``) while the two
        # BETWEEN filters above are inclusive, so rows dated ``end`` never
        # match the ``dev`` exclusion -- confirm whether that is intended.
        start, end,     # users' first device: partition_date >= start, < end
        end,            # doctor accounts: partition_date snapshot
        end,            # puppet / classify users: partition_day snapshot
        end,            # user device history: partition_date snapshot
        end,            # devices that logged in a doctor: partition_date snapshot
    )
    return spark.sql(query_template.format(*params))


def get_device_click_tractate_ids_dict(click_df):
    """Group clicked card ids by device id.

    The rows are sorted by ``partition_date`` descending, collected to the
    driver, and each row's ``card_id`` is appended to the list kept for its
    ``cl_id``.

    Args:
        click_df: DataFrame-like object with ``partition_date``, ``cl_id`` and
            ``card_id`` columns.

    Returns:
        A ``defaultdict(list)`` mapping ``cl_id`` to its ``card_id`` values in
        the order the sorted rows were collected.
    """
    newest_first = click_df.orderBy("partition_date", ascending=False)
    ids_by_device = defaultdict(list)
    for row in newest_first.collect():
        ids_by_device[row["cl_id"]].append(row["card_id"])
    return ids_by_device


if __name__ == "__main__":
    # Nothing else in this file creates data_dir. Create it before starting the
    # Spark job so a missing or unwritable output directory fails immediately
    # instead of after the query and collect have already run.
    os.makedirs(data_dir, exist_ok=True)

    spark = get_spark("tractate_click_ids")
    # Window: from 180 days ago up to yesterday, both as YYYYMMDD strings.
    click_df = get_tracate_click_data(spark, get_ndays_before_no_minus(180), get_ndays_before_no_minus(1))
    click_df.show(5, False)

    res_dict = get_device_click_tractate_ids_dict(click_df)

    # One line per device: "<cl_id>|<card_id>,<card_id>,...".
    with open(os.path.join(data_dir, "click_tractate_ids.csv"), "w") as f:
        for (k, v) in res_dict.items():
            if v:
                f.write("{}|{}\n".format(k, ",".join([str(x) for x in v])))

# spark-submit --master yarn --deploy-mode client --queue root.strategy --driver-memory 16g --executor-memory 1g --executor-cores 1 --num-executors 70 --conf spark.default.parallelism=100 --conf spark.storage.memoryFraction=0.5 --conf spark.shuffle.memoryFraction=0.3 --conf spark.locality.wait=0 --jars /srv/apps/tispark-core-2.1-SNAPSHOT-jar-with-dependencies.jar,/srv/apps/spark-connector_2.11-1.9.0-rc2.jar,/srv/apps/mysql-connector-java-5.1.38.jar /srv/apps/strategy_embedding/word_vector/tractate.py
