Import the MySQL table api_alipayaccout into Hive

--***************************************************************
--* Script name: create_test_20190712.sql
--* Purpose: test
--* Business name: tl
--* Input data: hdfs://bj-gmei-hdfs/user/hive/warehouse/tl.db/test_20190712/partition_day=${partition_day}/
--* Author: data-exchange
--* Updated: 2019-07-12 16:33:26
--***************************************************************
-- Set global variables & UDFs
SET mapreduce.job.queuename=data;
USE default;
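-- Note: the table is created in the default database; the load script below
-- selects it via ${dbname}, which job.properties also sets to default.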
CREATE EXTERNAL TABLE IF NOT EXISTS test_20190712 (
account STRING COMMENT 'Alipay account'
,created_time STRING COMMENT 'creation time'
,id BIGINT COMMENT 'ID'
,name STRING COMMENT 'user name'
,updated_time STRING COMMENT 'last updated time'
) COMMENT 'test'
PARTITIONED BY (partition_date STRING COMMENT 'partition date')
ROW FORMAT
DELIMITED FIELDS TERMINATED BY '\001'
COLLECTION ITEMS TERMINATED BY '\002'
MAP KEYS TERMINATED BY '\003'
LINES TERMINATED BY '\n'
NULL DEFINED AS ''
STORED AS TEXTFILE
LOCATION '/data/log/thirdparty/test_20190712';
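-- Optional sanity checks (a sketch, left commented out so this DDL script's
-- behavior is unchanged; '20190711' is an example value, assuming the load
-- script below has already registered that day's partition):
--   SHOW PARTITIONS test_20190712;
--   SELECT account, name, created_time FROM test_20190712
--     WHERE partition_date = '20190711' LIMIT 10;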
--***************************************************************
--* Script name: test_20190712.sql
--* Purpose: test
--* Business name: tl
--* Input data: hdfs://bj-gmei-hdfs/user/hive/warehouse/tl.db/test_20190712/partition_day=${partition_day}/
--* Author: data-exchange
--* Updated: 2019-07-12 16:33:26
--***************************************************************
SET mapred.input.dir.recursive=true;
SET hive.mapred.supports.subdirectories=true;
-- Select the database
USE ${dbname};
-- Load the HDFS external data partition
ALTER TABLE test_20190712 DROP IF EXISTS PARTITION (partition_date = '${partition_date}');
ALTER TABLE test_20190712 ADD IF NOT EXISTS PARTITION (partition_date = '${partition_date}') LOCATION 'hdfs://bj-gm-test-data001:8020/data/log/thirdparty/test_20190712/${partition_date}';
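-- Dropping and re-adding the partition makes this script idempotent:
-- re-running it for the same ${partition_date} simply repoints the partition
-- at the freshly imported HDFS directory.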
<coordinator-app name="test_20190712"
frequency="0 1 * * *"
start="${start_date}" end="${end_date}" timezone="${timeZone}"
xmlns="uri:oozie:coordinator:0.2"
>
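<!-- frequency is cron-style: fire daily at 01:00 in the configured ${timeZone}. -->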
<controls>
<execution>FIFO</execution>
</controls>
<action>
<workflow>
<app-path>${wf_application_path}</app-path>
<configuration>
<property>
<name>jobTracker</name>
<value>${jobTracker}</value>
</property>
<property>
<name>nameNode</name>
<value>${nameNode}</value>
</property>
<property>
<name>queueName</name>
<value>${queueName}</value>
</property>
<property>
<name>checkClient</name>
<value>${checkClient}</value>
</property>
<property>
<name>checkEXEC</name>
<value>${checkEXEC}</value>
</property>
<property>
<name>zxURL</name>
<value>${zxURL}</value>
</property>
<property>
<name>userName</name>
<value>${userName}</value>
</property>
<property>
<name>passWord</name>
<value>${passWord}</value>
</property>
<property>
<name>sourceTableName</name>
<value>${sourceTableName}</value>
</property>
<property>
<name>columns</name>
<value>${columns}</value>
</property>
<property>
<name>targetTableName</name>
<value>${targetTableName}</value>
</property>
<property>
<name>fields_terminated</name>
<value>${fields_terminated}</value>
</property>
<property>
<name>lines_terminated</name>
<value>${lines_terminated}</value>
</property>
<property>
<name>num_mappers</name>
<value>${num_mappers}</value>
</property>
<property>
<name>dbname</name>
<value>${dbname}</value>
</property>
<property>
<name>jdbcURL</name>
<value>${jdbcURL}</value>
</property>
<property>
<name>pwd</name>
<value>${pwd}</value>
</property>
<property>
<name>partition_date</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'yyyyMMdd')}</value>
</property>
<property>
<name>year</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'yyyy')}</value>
</property>
<property>
<name>day</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'dd')}</value>
</property>
<property>
<name>month</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'DAY'), 'MM')}</value>
</property>
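<!-- partition_date, year, month and day above are all derived from the nominal
     run time minus one day: a run nominally scheduled at 2019-07-12 01:00
     yields partition_date=20190711, year=2019, month=07, day=11. -->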
<property>
<name>oozie.use.system.libpath</name>
<value>True</value>
</property>
<property>
<name>start_date</name>
<value>${start_date}</value>
</property>
<property>
<name>end_date</name>
<value>${end_date}</value>
</property>
</configuration>
</workflow>
</action>
</coordinator-app>
# job.properties template for a full (non-incremental) data import
oozie.use.system.libpath=True
security_enabled=False
#appName
# Naming rule: must be identical to the Hive table name
appName=test_20190712
dbname=default
nameNode=hdfs://bj-gm-test-data001:8020
jobTracker=bj-gm-prod-cos-datacenter005:8032
queueName=data
timeZone=GMT+0800
# Start time of the job's execution
start_date=2019-07-09T00:00+0800
# End time of the job's execution
end_date=2019-08-28T23:59+0800
# HiveServer2 URL
jdbcURL=jdbc:hive2://bj-gm-prod-cos-datacenter006:2181,bj-gm-prod-cos-datacenter007:2181,bj-gm-prod-cos-datacenter008:2181/;serviceDiscoveryMode=zookeeper
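# serviceDiscoveryMode=zookeeper tells the JDBC driver to resolve an active
# HiveServer2 instance from the ZooKeeper ensemble listed above.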
# HiveServer2 password
pwd=data
checkClient=hdfs://bj-gm-test-data001:8020/user/hive/project/utils/data_ready_checker/client
checkEXEC=./checkclient/bin/checker.sh
# MySQL URL
# This is only an example; the actual URL depends on the database being imported from
zxURL=jdbc:mysql://172.22.30.12:3306/zhengxing?tinyInt1isBit=false
# Username for the source database
userName=work
# Password for the source database
passWord=zJnxVEhyyxeC7ciqxdMITVyWqOFc2mew
# Source table
sourceTableName=api_alipayaccout
# Target (output) table
targetTableName=test_20190712
# Columns of the source table; the order must match the column order in the table
columns="account,created_time,id,name,updated_time"
num_mappers=1
fields_terminated=\\001
lines_terminated=\\n
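# The backslashes above are doubled because the properties loader consumes one
# level of escaping; Sqoop then receives the literal \001 and \n delimiters,
# matching the FIELDS/LINES TERMINATED BY clauses of the Hive DDL.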
oozie.coord.application.path=hdfs://bj-gm-test-data001:8020/user/hive/project/workflow/test_20190712
wf_application_path=hdfs://bj-gm-test-data001:8020/user/hive/project/workflow/test_20190712
<workflow-app name="test_20190712" xmlns="uri:oozie:workflow:0.5">
<start to="test_20190712_start"/>
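<!-- Pipeline: Sqoop imports the MySQL table to HDFS, a shell action verifies
     the output directory, then a Hive2 action registers the partition. -->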
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="test_20190712_start">
<sqoop xmlns="uri:oozie:sqoop-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<prepare>
<delete path="${nameNode}/data/log/thirdparty/${targetTableName}/${partition_date}" />
<!--<mkdir path="${nameNode}/data/log/thirdparty/${targetTableName}/${partition_date}"/>-->
</prepare>
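<!-- Deleting the target directory up front makes the import rerunnable:
     a retried or backfilled run always starts from an empty directory. -->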
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<command>import --connect ${zxURL} --username ${userName} --password ${passWord} --table ${sourceTableName} --columns ${columns} --escaped-by '\\' --target-dir /data/log/thirdparty/${targetTableName}/${partition_date} --fields-terminated-by ${fields_terminated} --lines-terminated-by ${lines_terminated} --num-mappers ${num_mappers} --hive-drop-import-delims --null-string "" --null-non-string ""</command>
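<!-- The hive-drop-import-delims flag strips \n, \r and \01 from string fields
     so rows cannot break on embedded delimiters; the null-string and
     null-non-string options write NULL as the empty string, matching
     NULL DEFINED AS '' in the DDL. -->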
</sqoop>
<ok to="test_20190712_check"/>
<error to="Kill"/>
</action>
<action name="test_20190712_check" retry-max="3" retry-interval="5">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>/bin/bash</exec>
<argument>${checkEXEC}</argument>
<env-var>TYPE=hdfs</env-var>
<env-var>URI=/data/log/thirdparty/${targetTableName}/${partition_date}/</env-var>
<archive>${checkClient}#checkclient</archive>
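<!-- The checker client archive is unpacked into the working directory as
     ./checkclient, which is why checkEXEC points at ./checkclient/bin/checker.sh;
     the TYPE and URI env-vars tell it to verify the HDFS output of the Sqoop import. -->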
<capture-output/>
</shell>
<ok to="test_20190712_job"/>
<error to="Kill"/>
</action>
<action name="test_20190712_job" cred="hive2" retry-max="3" retry-interval="5">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<jdbc-url>${jdbcURL}</jdbc-url>
<password>${pwd}</password>
<script>/user/hive/project/etl/test_20190712/test_20190712.sql</script>
<param>partition_date=${partition_date}</param>
<param>dbname=${dbname}</param>
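<!-- These two params fill ${partition_date} and ${dbname} in the
     partition-load script shown earlier. -->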
</hive2>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>