CREATE EXTERNAL TABLE IF NOT EXISTS default.test (
id STRING COMMENT 'record id'
,key STRING COMMENT 'configuration key'
,value STRING COMMENT 'configuration value'
) COMMENT 'test'
PARTITIONED BY (partition_date STRING COMMENT 'partition date')
ROW FORMAT
DELIMITED FIELDS TERMINATED BY '\001'
COLLECTION ITEMS TERMINATED BY '\002'
MAP KEYS TERMINATED BY '\003'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/data/log/thirdparty/test';
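The '\001'/'\002'/'\003' delimiters declared above must match the fields_terminated value passed to the Sqoop import further down, otherwise loaded rows will not split into columns correctly. A minimal sanity check after a partition has been loaded (the partition value is illustrative):

SELECT id, `key`, `value`
FROM default.test
WHERE partition_date = '20190603'
LIMIT 10;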
SET mapred.input.dir.recursive=true;
SET hive.mapred.supports.subdirectories=true;
-- Declare the database
USE ${dbname};
-- Load the external HDFS data partition
ALTER TABLE test DROP IF EXISTS PARTITION (partition_date = '${partition_date}');
ALTER TABLE test ADD IF NOT EXISTS PARTITION (partition_date = '${partition_date}') LOCATION 'hdfs://bj-gm-test-data001:8020/data/log/thirdparty/test/${partition_date}';
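After this script runs, you can confirm that the partition was registered and where it points; a quick check from any Hive client (the date literal is illustrative):

SHOW PARTITIONS test;
DESCRIBE FORMATTED test PARTITION (partition_date = '20190603');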
<coordinator-app name="test"
frequency="0 1 * * *"
start="${start_date}" end="${end_date}" timezone="${timeZone}"
xmlns="uri:oozie:coordinator:0.2"
>
<controls>
<execution>FIFO</execution>
</controls>
<action>
<workflow>
<app-path>${wf_application_path}</app-path>
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>queueName</name>
<value>${queueName}</value>
</property>
<property>
<name>checkClient</name>
<value>${checkClient}</value>
</property>
<property>
<name>checkEXEC</name>
<value>${checkEXEC}</value>
</property>
<property>
<name>zxURL</name>
<value>${zxURL}</value>
</property>
<property>
<name>userName</name>
<value>${userName}</value>
</property>
<property>
<name>passWord</name>
<value>${passWord}</value>
</property>
<property>
<name>sourceTableName</name>
<value>${sourceTableName}</value>
</property>
<property>
<name>columns</name>
<value>${columns}</value>
</property>
<property>
<name>targetTableName</name>
<value>${targetTableName}</value>
</property>
<property>
<name>fields_terminated</name>
<value>${fields_terminated}</value>
</property>
<property>
<name>lines_terminated</name>
<value>${lines_terminated}</value>
</property>
<property>
<name>num_mappers</name>
<value>${num_mappers}</value>
</property>
<property>
<name>dbname</name>
<value>${dbname}</value>
</property>
<property>
<name>jdbcURL</name>
<value>${jdbcURL}</value>
</property>
<property>
<name>pwd</name>
<value>${pwd}</value>
</property>
<property>
<name>partition_date</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'yyyyMMdd')}</value>
</property>
<property>
<name>year</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'yyyy')}</value>
</property>
<property>
<name>day</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'dd')}</value>
</property>
<property>
<name>month</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'MM')}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>start_date</name>
<value>${start_date}</value>
</property>
<property>
<name>end_date</name>
<value>${end_date}</value>
</property>
</configuration>
</workflow>
</action>
</coordinator-app>
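The coord EL expressions above resolve against each run's nominal time, so every daily materialization processes the previous day's data. For example, for the run whose nominal time is 2019-06-04T01:00 (+0800), the properties resolve to:

partition_date = 20190603
year = 2019
month = 06
day = 03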
# job.properties template for a full data import
oozie.use.system.libpath=true
security_enabled=false
# appName
# Naming rule: must be identical to the Hive table name
appName=test
dbname=default
nameNode=hdfs://bj-gm-test-data001:8020
jobTracker=bj-gm-test-data001:8032
queueName=data
timeZone=GMT+0800
# Job execution start time
start_date=2019-06-03T15:00+0800
# Job execution end time
end_date=2019-06-20T15:00+0800
# HiveServer2 JDBC URL
jdbcURL=jdbc:hive2://bj-gm-prod-cos-datacenter006:2181,bj-gm-prod-cos-datacenter007:2181,bj-gm-prod-cos-datacenter008:2181/;serviceDiscoveryMode=zookeeper
# HiveServer2 password
pwd=data
checkClient=hdfs://bj-gm-test-data001:8020/user/hive/project/utils/data_ready_checker/client
checkEXEC=./checkclient/bin/checker.sh
# MySQL URL
# This is only an example; the actual URL depends on the database used for the import
zxURL=jdbc:mysql://172.22.30.12:3306/plutus?tinyInt1isBit=false
# Username for the database
userName=work
# Password for the database
passWord=zJnxVEhyyxeC7ciqxdMITVyWqOFc2mew
# Source table
sourceTableName=configuration
# Target (output) table
targetTableName=test
# Columns of the source table; the order must match the column order in the table
columns=id,key,value
num_mappers=1
fields_terminated=\\001
lines_terminated=\\n
oozie.coord.application.path=hdfs://bj-gm-test-data001:8020/user/hive/project/workflow/test
wf_application_path=hdfs://bj-gm-test-data001:8020/user/hive/project/workflow/test
# Workflow execution schedule
# Specified with the same rules as crontab
frequency=
# The workflow has three actions, executed in the order startAction -> checkAction -> jobAction
# These three fields hold the three action names
startAction=test_sqoop
checkAction=test_check
jobAction=test_job
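With the coordinator XML uploaded to oozie.coord.application.path and the workflow XML to wf_application_path, the coordinator can be submitted from the Oozie CLI; a sketch, assuming the Oozie server listens on the default port 11000 (the port is not specified in this template):

oozie job -oozie http://bj-gm-test-data001:11000/oozie -config job.properties -run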
<workflow-app name="test" xmlns="uri:oozie:workflow:0.5">
<start to="test_start"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="test_start">
<sqoop xmlns="uri:oozie:sqoop-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<prepare>
<delete path="${nameNode}/data/log/thirdparty/${targetTableName}/${partition_date}" />
<!--<mkdir path="${nameNode}/data/log/thirdparty/${targetTableName}/${partition_date}"/>-->
</prepare>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<command>
import --connect ${zxURL} --username ${userName} --password ${passWord} --table ${sourceTableName} --columns ${columns} --target-dir /data/log/thirdparty/${targetTableName}/${partition_date} --fields-terminated-by ${fields_terminated} --lines-terminated-by ${lines_terminated} --num-mappers ${num_mappers} --hive-drop-import-delims --null-string \\N --null-non-string \\N
</command>
</sqoop>
<ok to="test_check"/>
<error to="Kill"/>
</action>
<action name="test_check" retry-max="3" retry-interval="5">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>/bin/bash</exec>
<argument>${checkEXEC}</argument>
<env-var>TYPE=hdfs</env-var>
<env-var>URI=/data/log/thirdparty/${targetTableName}/${partition_date}/</env-var>
<archive>${checkClient}#checkclient</archive>
<capture-output/>
</shell>
<ok to="test_job"/>
<error to="Kill"/>
</action>
<action name="test_job" cred="hive2" retry-max="3" retry-interval="5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<jdbc-url>${jdbcURL}</jdbc-url>
<password>${pwd}</password>
<script>/user/hive/project/etl/test/test.sql</script>
<param>partition_date=${partition_date}</param>
<param>dbname=${dbname}</param>
</hive2>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
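Once submitted, the coordinator and the workflows it spawns can be inspected from the same CLI; <coordinator-job-id> below is a placeholder for the ID printed by the submit command:

oozie job -oozie http://bj-gm-test-data001:11000/oozie -info <coordinator-job-id>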