Pengfei Xue / data-dqmonitor · Commits

Commit 5365f98c, authored Jun 20, 2019 by Pengfei Xue

Commit message: init

Showing 8 changed files with 612 additions and 0 deletions
.gitignore                                      +3    −0
README.md                                       +2    −0
pom.xml                                         +152  −0
src/main/resources/core-site.xml                +145  −0
src/main/resources/hive-site.xml                +243  −0
src/main/scala/com/gmei/data/dq/Main.scala      +34   −0
src/main/scala/com/gmei/data/dq/Utils.scala     +22   −0
src/main/scala/com/gmei/data/dq/pvCheker.scala  +11   −0
.gitignore (new file, mode 100644)

*.class
*.log
target/
README.md (new file, mode 100644)

# spark-starter
pom.xml (new file, mode 100644)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <artifactId>dq</artifactId>
    <groupId>com.gmei.data</groupId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <spark.version>2.1.2</spark.version>
        <scala.version>2.11</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>

    <repositories>
        <repository>
            <id>my-local-repo</id>
            <url>file:/Users/pengfei/.m2/repository/</url>
        </repository>
        <repository>
            <id>gengmei-data-release</id>
            <name>Gengmei Data Libraries</name>
            <url>http://maven.gengmei/nexus/content/repositories/gengmei-data/</url>
        </repository>
        <repository>
            <id>gengmei-data-snapshots</id>
            <name>gengmei-data-snapshots</name>
            <url>http://maven.gengmei/nexus/content/repositories/gengmei-data-snapshots/</url>
        </repository>
        <repository>
            <id>ali</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public</url>
        </repository>
        <repository>
            <id>phoenix</id>
            <url>http://repo1.maven.org/maven2/</url>
        </repository>
        <repository>
            <id>apache-release</id>
            <url>https://repository.apache.org/content/repositories/releases/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>sonatype-snapshot</id>
            <name>Sonatype Snapshot Repository</name>
            <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.19</version>
                <configuration>
                    <!-- Tests are skipped during the build. -->
                    <skip>true</skip>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <createDependencyReducedPom>false</createDependencyReducedPom>
                            <createSourcesJar>false</createSourcesJar>
                            <promoteTransitiveDependencies>true</promoteTransitiveDependencies>
                            <!-- Only com.gmei.data:daybits is bundled into the shaded jar;
                                 the Spark dependencies are expected on the cluster. -->
                            <artifactSet>
                                <includes>
                                    <include>com.gmei.data:daybits</include>
                                </includes>
                            </artifactSet>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.gmei.data.dq.Main</mainClass>
                                </transformer>
                            </transformers>
                            <minimizeJar>true</minimizeJar>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
src/main/resources/core-site.xml (new file, mode 100755)

<?xml version="1.0" encoding="UTF-8"?>
<!-- Autogenerated by Cloudera Manager -->
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://bj-gmei-hdfs</value>
  </property>
  <property>
    <name>fs.trash.interval</name>
    <value>1</value>
  </property>
  <property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
  </property>
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>true</value>
  </property>
  <property>
    <name>hadoop.rpc.protection</name>
    <value>authentication</value>
  </property>
  <property>
    <name>hadoop.security.auth_to_local</name>
    <value>DEFAULT</value>
  </property>
  <!-- Wildcard proxy-user (impersonation) grants for the cluster service accounts. -->
  <property>
    <name>hadoop.proxyuser.oozie.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.oozie.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mapred.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mapred.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.flume.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.flume.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.HTTP.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.HTTP.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.httpfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.httpfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.yarn.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.yarn.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.security.group.mapping</name>
    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
  </property>
  <property>
    <name>hadoop.security.instrumentation.requires.admin</name>
    <value>false</value>
  </property>
  <property>
    <name>net.topology.script.file.name</name>
    <value>/etc/hadoop/conf.cloudera.yarn/topology.py</value>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>262144</value>
  </property>
  <property>
    <name>hadoop.ssl.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>hadoop.ssl.require.client.cert</name>
    <value>false</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.keystores.factory.class</name>
    <value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.server.conf</name>
    <value>ssl-server.xml</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.client.conf</name>
    <value>ssl-client.xml</value>
    <final>true</final>
  </property>
</configuration>
src/main/resources/hive-site.xml (new file, mode 100644)

<?xml version="1.0" encoding="UTF-8"?>
<!-- Autogenerated by Cloudera Manager -->
<configuration>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://bj-gm-prod-cos-datacenter001:9083,thrift://bj-gm-prod-cos-datacenter002:9083</value>
  </property>
  <property>
    <name>hive.metastore.client.socket.timeout</name>
    <value>300</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
  </property>
  <property>
    <name>hive.warehouse.subdir.inherit.perms</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.auto.convert.join</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.auto.convert.join.noconditionaltask.size</name>
    <value>20971520</value>
  </property>
  <property>
    <name>hive.optimize.bucketmapjoin.sortedmerge</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.smbjoin.cache.rows</name>
    <value>10000</value>
  </property>
  <!-- 'hive.server2.logging.operation.enabled', originally set to 'true' (non-final), is overridden below by a safety valve -->
  <!-- 'hive.server2.logging.operation.log.location', originally set to '/opt/hive/operation_logs' (non-final), is overridden below by a safety valve -->
  <property>
    <name>mapred.reduce.tasks</name>
    <value>-1</value>
  </property>
  <property>
    <name>hive.exec.reducers.bytes.per.reducer</name>
    <value>1073741824</value>
  </property>
  <property>
    <name>hive.exec.copyfile.maxsize</name>
    <value>33554432</value>
  </property>
  <property>
    <name>hive.exec.reducers.max</name>
    <value>999</value>
  </property>
  <property>
    <name>hive.vectorized.groupby.checkinterval</name>
    <value>4096</value>
  </property>
  <property>
    <name>hive.vectorized.groupby.flush.percent</name>
    <value>0.1</value>
  </property>
  <property>
    <name>hive.compute.query.using.stats</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.vectorized.execution.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.vectorized.execution.reduce.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.merge.mapfiles</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.merge.mapredfiles</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.cbo.enable</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.fetch.task.conversion</name>
    <value>minimal</value>
  </property>
  <property>
    <name>hive.fetch.task.conversion.threshold</name>
    <value>268435456</value>
  </property>
  <property>
    <name>hive.limit.pushdown.memory.usage</name>
    <value>0.1</value>
  </property>
  <property>
    <name>hive.merge.sparkfiles</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.merge.smallfiles.avgsize</name>
    <value>16777216</value>
  </property>
  <property>
    <name>hive.merge.size.per.task</name>
    <value>268435456</value>
  </property>
  <property>
    <name>hive.optimize.reducededuplication</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.optimize.reducededuplication.min.reducer</name>
    <value>4</value>
  </property>
  <property>
    <name>hive.map.aggr</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.map.aggr.hash.percentmemory</name>
    <value>0.5</value>
  </property>
  <property>
    <name>hive.optimize.sort.dynamic.partition</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>mr</value>
  </property>
  <property>
    <name>spark.executor.memory</name>
    <value>5888304742</value>
  </property>
  <property>
    <name>spark.driver.memory</name>
    <value>3865470566</value>
  </property>
  <property>
    <name>spark.executor.cores</name>
    <value>6</value>
  </property>
  <property>
    <name>spark.yarn.driver.memoryOverhead</name>
    <value>409</value>
  </property>
  <property>
    <name>spark.yarn.executor.memoryOverhead</name>
    <value>990</value>
  </property>
  <property>
    <name>spark.dynamicAllocation.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>spark.dynamicAllocation.initialExecutors</name>
    <value>1</value>
  </property>
  <property>
    <name>spark.dynamicAllocation.minExecutors</name>
    <value>1</value>
  </property>
  <property>
    <name>spark.dynamicAllocation.maxExecutors</name>
    <value>2147483647</value>
  </property>
  <property>
    <name>hive.metastore.execute.setugi</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.support.concurrency</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.zookeeper.quorum</name>
    <value>bj-gm-prod-cos-datacenter008,bj-gm-prod-cos-datacenter006,bj-gm-prod-cos-datacenter007</value>
  </property>
  <property>
    <name>hive.zookeeper.client.port</name>
    <value>2181</value>
  </property>
  <property>
    <name>hive.zookeeper.namespace</name>
    <value>hive_zookeeper_namespace_hive</value>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>bj-gm-prod-cos-datacenter008,bj-gm-prod-cos-datacenter006,bj-gm-prod-cos-datacenter007</value>
  </property>
  <property>
    <name>hbase.zookeeper.property.clientPort</name>
    <value>2181</value>
  </property>
  <property>
    <name>hive.cluster.delegation.token.store.class</name>
    <value>org.apache.hadoop.hive.thrift.MemoryTokenStore</value>
  </property>
  <property>
    <name>hive.server2.enable.doAs</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.server2.use.SSL</name>
    <value>false</value>
  </property>
  <property>
    <name>spark.shuffle.service.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.security.command.whitelist</name>
    <value>set,reset,dfs,delete,add</value>
  </property>
  <property>
    <name>hive.security.temporary.function.need.admin</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.reloadable.aux.jars.path</name>
    <value>/opt/cloudera/parcels/CDH/lib/hive/auxlib</value>
  </property>
  <property>
    <name>phoenix.schema.isNamespaceMappingEnabled</name>
    <value>true</value>
  </property>
  <property>
    <name>phoenix.schema.mapSystemTablesToNamespace</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.server2.logging.operation.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>hive.server2.logging.operation.log.location</name>
    <value>/opt/hive/operation_logs</value>
  </property>
</configuration>
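The two resource files above are classpath copies of the cluster client configs; with them bundled, a SparkSession only needs Hive support enabled to reach the metastore listed in hive.metastore.uris. A minimal sketch of that wiring (the local[*] master and app name here are illustrative assumptions, not part of the commit):

package com.gmei.data.dq

import org.apache.spark.sql.SparkSession

object HiveSessionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("HiveSessionSketch")
      .master("local[*]")   // illustrative only; the real job gets its master from spark-submit
      .enableHiveSupport()  // reads hive.metastore.uris from the bundled hive-site.xml
      .getOrCreate()

    spark.sql("show databases").show() // should list the metastore databases, e.g. `online`
    spark.stop()
  }
}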
src/main/scala/com/gmei/data/dq/Main.scala (new file, mode 100644)

package com.gmei.data.dq

import org.apache.spark.sql.SparkSession

/**
  * Created by pengfei.x on 2019/3/22.
  */
object Main {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("DataQualityMonitor")
      // Required for sql("use online") in pvCheker to resolve against the
      // Hive metastore configured in the bundled hive-site.xml.
      .enableHiveSupport()
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    import spark.implicits._

    // The single expected argument is a partition date in yyyyMMdd form.
    var partition_date = ""
    if (args.length == 1 && Utils.is_date_string(args(0))) {
      partition_date = args(0)
    } else {
      throw new IllegalArgumentException("no partition date given (expected yyyyMMdd)!")
    }

    // check pv
    pvCheker.check(spark, partition_date)

    spark.stop()
  }
}
src/main/scala/com/gmei/data/dq/Utils.scala (new file, mode 100644)

package com.gmei.data.dq

import java.text.SimpleDateFormat

object Utils {

  /** Returns true when `time_str` is a valid yyyyMMdd date strictly between
    * 20100101 and 20500101. */
  def is_date_string(time_str: String): Boolean = {
    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
    try {
      val time_date = dateFormat.parse(time_str)
      // The parse-then-format round trip rejects strings such as "20190232"
      // that lenient SimpleDateFormat parsing would silently normalize.
      dateFormat.format(time_date).equals(time_str) &&
        time_str > "20100101" && time_str < "20500101"
    } catch {
      case _: Exception => false
    }
  }
}
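A quick sanity sketch of how is_date_string behaves on representative inputs (not part of the commit; the expected results follow directly from the round-trip and range checks above):

package com.gmei.data.dq

object UtilsDemo {
  def main(args: Array[String]): Unit = {
    assert(Utils.is_date_string("20190620"))    // well-formed yyyyMMdd inside the range
    assert(!Utils.is_date_string("2019-06-20")) // wrong format, fails to parse
    assert(!Utils.is_date_string("20190232"))   // Feb 32 normalizes to 20190304, failing the round trip
    assert(!Utils.is_date_string("20000101"))   // before the 20100101 lower bound
    println("all Utils.is_date_string checks passed")
  }
}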
src/main/scala/com/gmei/data/dq/pvCheker.scala (new file, mode 100644)

package com.gmei.data.dq

import org.apache.spark.sql.SparkSession

object pvCheker {

  /** PV check stub in this init commit: it only switches to the `online`
    * database; `partition_date` is accepted but not yet used. */
  def check(spark: SparkSession, partition_date: String): Unit = {
    spark.sql("use online")
  }
}
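For orientation, a hedged sketch of what a fleshed-out PV check might look like once the stub grows. The table name some_pv_table, its partition_date column, and the println alerting are all assumptions for illustration, not part of this commit:

package com.gmei.data.dq

import org.apache.spark.sql.SparkSession

object pvCheckSketch {
  def check(spark: SparkSession, partition_date: String): Unit = {
    spark.sql("use online")
    // Hypothetical table and column: count the day's page views and flag an empty partition.
    val pv = spark
      .sql(s"select count(1) as pv from some_pv_table where partition_date = '$partition_date'")
      .first()
      .getLong(0)
    if (pv == 0L) {
      // Placeholder for real alerting (mail/IM hook) in a monitoring job.
      println(s"[dq] WARNING: zero PV rows for partition_date=$partition_date")
    }
  }
}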