# -*- coding: utf-8 -*-

'''
    Alarm DAG that checks the Flink jobs of every jobmanager service in SERVICES.
    Each service is checked by its own operator (task),
    and the service name is used as the task id.
'''

from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
import os, json, logging, time, signal,platform,subprocess
import airflow

# Module-wide logging setup and logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# DAG identity and scheduling parameters.
DAG_ID = 'alarm_flink_job'
SCHEDULE_INTERVAL = timedelta(minutes=15)
# NOTE(review): this is re-evaluated every time the file is imported, so the
# start date moves with each parse -- confirm that a sliding start date is
# really intended here.
START_DATE = datetime.now() - timedelta(minutes=30)

# Flink jobmanager services to monitor. Each entry becomes a task id and is
# also used as the host name of the jobmanager REST endpoint.
SERVICES = [
    'cpc-cluster02-prod-flink-jobmanager',
    'mv-cluster03-prod-flink-jobmanager',
    'mv-cluster04-prod-flink-jobmanager',
]

# DingTalk robot webhook that receives the alarm.
# NOTE(review): the access token is committed in source -- consider moving it
# to a secret store / Airflow Variable.
DINGDING = 'https://oapi.dingtalk.com/robot/send?access_token=4e00d7f7b3b8686ea7d37bd01264f86e197294f9f995ef8e12cc853760a30c60'
# Alert script; the '*' is a glob that is left for the shell to expand.
DING_PATH = '/opt/bitnami/airflow/dags/*/script/ding.sh'

class TimeoutError(Exception):
    """Raised by run_command() when a command exceeds its time limit.

    Instances are created as ``TimeoutError(cmd, timeout)``.
    NOTE(review): on Python 3 this name shadows the builtin ``TimeoutError``
    inside this module; it is kept because callers catch it by this name.
    """

def run_command(cmd, timeout=60):
    """Run ``cmd`` through the shell and return its captured output.

    The output is captured in temporary files rather than pipes. With pipes,
    a child that writes more than the OS pipe buffer blocks until somebody
    reads, and because the output was only read after the child exited, such
    a command could never finish and always ended in a (false) timeout.

    :param cmd: shell command line to execute (run with ``shell=True``).
    :param timeout: maximum run time in seconds; a falsy value disables the
        limit.
    :return: tuple ``(stdout, stderr)`` of bytes.
    :raises TimeoutError: with args ``(cmd, timeout)`` when the command is
        still running after ``timeout`` seconds. The command is sent SIGTERM
        first (to its whole process group on Linux).
    """
    import tempfile

    is_linux = platform.system() == 'Linux'
    with tempfile.TemporaryFile() as out_file, tempfile.TemporaryFile() as err_file:
        p = subprocess.Popen(
            cmd,
            stdout=out_file,
            stderr=err_file,
            shell=True,
            # Own session/process group so the shell AND its children can be
            # signalled together on timeout.
            preexec_fn=os.setsid if is_linux else None,
        )
        t_beginning = time.time()
        while p.poll() is None:
            if timeout and time.time() - t_beginning > timeout:
                if is_linux:
                    os.killpg(p.pid, signal.SIGTERM)
                else:
                    p.terminate()
                raise TimeoutError(cmd, timeout)
            time.sleep(0.1)

        # The child wrote through its own file descriptors; rewind ours
        # before reading everything back.
        out_file.seek(0)
        err_file.seek(0)
        return (out_file.read(), err_file.read())

# Defaults applied to every task of the DAG: a failed check is retried
# three times, five seconds apart.
default_args = dict(
    owner='tangxianggang',
    depends_on_past=False,
    start_date=START_DATE,
    retries=3,
    retry_delay=timedelta(seconds=5),
)

# The DAG object itself, built from the module-level settings.
dag = DAG(
    dag_id=DAG_ID,
    schedule_interval=SCHEDULE_INTERVAL,
    default_args=default_args,
)

def failure_callback(context):
    """Send a DingTalk alarm describing the failed task.

    Builds a markdown message from the task instance and the exception found
    in ``context`` and hands it to the ``ding.sh`` script.

    Every argument is shell-quoted: the exception text may contain single
    quotes (or other shell metacharacters), which previously broke the
    hand-quoted command line and could inject shell code.

    :param context: callback context dict; ``context['task_instance']`` must
        provide ``task_id`` and ``dag_id``. ``context['exception']`` is used
        when present.
    """
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        from pipes import quote  # Python 2 fallback

    task_instance = context['task_instance']
    title = 'Flink job error alarm'
    message = '''SERVICE: %s
    \n#### DAG_ID:  %s 
    \n#### TASKID:  %s 
    \n#### CONTENT: 
    %s 
    \n> **For more details, please check the airflow task log.**
    ''' % (task_instance.task_id, task_instance.dag_id, task_instance.task_id, context.get('exception'))

    logger.error('message : \n' + message)
    # DING_PATH is deliberately left unquoted: it contains a glob ('*') that
    # the shell has to expand.
    cmd = 'bash %s %s %s %s' % (DING_PATH, quote(title), quote(message), quote(DINGDING))
    status = os.system(cmd)
    if status != 0:
        logger.error('ding alarm command failed, status: %s', status)

def get_name(cmd):
    """Run ``cmd`` and return the ``name`` field of its JSON output.

    :param cmd: shell command whose stdout is expected to be a JSON object
        (called with a ``curl`` of a job's ``/config`` URL).
    :return: value of the ``name`` key of the parsed response.
    :raises Exception: when the command times out, prints nothing, prints
        something that is not a JSON object, reports ``errors``, or the
        response has no ``name``. The message is formatted for the alarm.
    """
    logger.info('exec cmd:%s', cmd)
    timeout = 10

    try:
        (stdout, stderr) = run_command(cmd, timeout)
    except TimeoutError:
        msg = '\n- **error_msg**: excute command=(%s) timeout after %i s' % (cmd, timeout)
        logger.error(msg)
        raise Exception(msg)

    stdout = stdout.decode()
    stderr = stderr.decode()
    logger.info(stderr)
    logger.info(stdout)

    # An empty answer used to surface as a bare JSON decode error; report it
    # explicitly, as get_jobs() does.
    if not stdout:
        msg = '\n- **error_msg**: empty response from command=(%s)' % (cmd)
        logger.error(msg)
        raise Exception(msg)

    try:
        config_dict = json.loads(stdout)
    except ValueError:
        config_dict = None
    if not isinstance(config_dict, dict):
        msg = '\n- **error_msg**: command=(%s) did not return a JSON object' % (cmd)
        logger.error(msg)
        raise Exception(msg)

    if 'errors' in config_dict:
        msg = '\n- **errors**: ' + config_dict['errors'][0]
        logger.error(msg)
        raise Exception(msg)

    if 'name' not in config_dict:
        msg = '\n- **error_msg**: no name in response of command=(%s)' % (cmd)
        logger.error(msg)
        raise Exception(msg)
    return config_dict['name']

def get_jobs(cmd):
    """Run ``cmd`` and return its stdout parsed as JSON.

    :param cmd: shell command whose stdout is expected to be JSON (called
        with a ``curl`` of the jobmanager ``/jobs/`` URL).
    :return: the parsed response.
    :raises Exception: when the command times out, prints nothing (with a
        dedicated message if stderr reports an unresolvable host), prints
        invalid JSON, or the response contains ``errors``. The message is
        formatted for the alarm.
    """
    logger.info('exec cmd:%s', cmd)
    timeout = 10

    try:
        (stdout, stderr) = run_command(cmd, timeout)
    except TimeoutError:
        msg = '\n- **error_msg**: excute command=(%s) timeout after %i s' % (cmd, timeout)
        logger.error(msg)
        raise Exception(msg)

    stdout = stdout.decode()
    stderr = stderr.decode()
    logger.info(stderr)
    logger.info(stdout)

    if not stdout:
        if 'Could not resolve host' in stderr:
            msg = '\n- **error_msg**: Could not resolve host %s' % (cmd)
        else:
            msg = '\n- **error_msg**: connectors is empty'
        # These two failures were the only ones not written to the log.
        logger.error(msg)
        raise Exception(msg)

    # Anything that is not JSON (e.g. an HTML error page) used to surface as
    # a bare decode error without naming the command.
    try:
        jobs_dict = json.loads(stdout)
    except ValueError:
        msg = '\n- **error_msg**: command=(%s) returned invalid JSON' % (cmd)
        logger.error(msg)
        raise Exception(msg)

    if 'errors' in jobs_dict:
        msg = '\n- **errors**: ' + jobs_dict['errors'][0]
        logger.error(msg)
        raise Exception(msg)
    return jobs_dict

def judge_job_status(job, cmd):
    """Decide whether a single job entry is healthy.

    :param job: one job entry; either carries ``errors`` or ``id`` and
        ``status``.
    :param cmd: command prefix; ``<id>/config`` is appended to it to look up
        the job name through get_name().
    :return: tuple ``(ok, msg)`` where ``ok`` is True only when the status is
        ``RUNNING`` and ``msg`` is the markdown line describing the job.
    """
    if 'errors' in job:
        error_line = "\n- **errors**: %s" % (job['errors'])
        logger.error(error_line)
        return (False, error_line)

    job_id = job['id']
    status = job['status']
    # The name is resolved for every job so it can appear in the report line.
    name = get_name(cmd + job_id + "/config")
    report_line = "\n- **job_id**: %s, **job_name**: %s, **status**: %s" % (job_id, name, status)

    if status == 'RUNNING':
        logger.info(report_line)
        return (True, report_line)

    logger.error(report_line)
    return (False, report_line)


def python_callable(**kwargs):
    """Check all Flink jobs of the jobmanager this task is named after.

    The task id is used as the host name of the jobmanager; its ``/jobs/``
    endpoint on port 8081 is queried with ``curl`` and every listed job is
    judged with judge_job_status().

    :param kwargs: task context; ``kwargs['task_instance'].task_id`` is read.
    :raises Exception: when the response has no ``jobs`` list, or when at
        least one job is not healthy. In the latter case the message lists
        the failing jobs followed by a ``running_job/all_job`` summary.
    """
    # The start message used to say "kafka connect", which was misleading in
    # the logs of this Flink checker.
    logger.info('start flink job status analyze .')

    flink_jobmanager_url = 'http://' + kwargs['task_instance'].task_id + ':8081/jobs/'
    base_cmd = 'curl ' + flink_jobmanager_url

    jobs_dict = get_jobs(base_cmd)
    logger.info('exec cmd: ' + base_cmd + ' success!')
    if 'jobs' not in jobs_dict:
        # Report a response without a job list explicitly instead of
        # failing with a bare KeyError.
        msg = '\n- **error_msg**: no jobs in response of command=(%s)' % (base_cmd)
        logger.error(msg)
        raise Exception(msg)
    jobs_list = jobs_dict['jobs']

    failures = []
    running_job = 0
    for job in jobs_list:
        (isrunning, msg) = judge_job_status(job, base_cmd)
        if isrunning:
            running_job += 1
        else:
            failures.append(msg)

    if failures:
        failures.append("\n- **running_job/all_job**: %s/%s" % (running_job, len(jobs_list)))
        raise Exception(''.join(failures))
    logger.info('Flink job status ok!')

# One checking task per monitored service; the service name is the task id.
for service in SERVICES:
    task = PythonOperator(
        dag=dag,
        task_id=service,
        python_callable=python_callable,
        provide_context=True,
        on_failure_callback=failure_callback,
    )

# Allow running the DAG's command line interface directly from this file.
if __name__ == '__main__':
    dag.cli()
