Commit 5d0d30e2 authored by Pengfei Xue

check pv write to tidb

parent 496d3108
@@ -21,9 +21,11 @@ object Main {
     var partition_date = ""
     var yesterday = ""
+    var cmd = ""

-    if (args.length == 1 && Utils.is_date_string(args(0))) {
+    if (args.length == 2 && Utils.is_date_string(args(0))) {
       partition_date = args(0)
+      cmd = args(1)
     }
     else {
       throw new IllegalArgumentException("have no partition date!")
@@ -31,9 +33,13 @@ object Main {
     // check pv
     // pvReferCheker.check(spark, partition_date)
+    if (cmd == "actionCheck") {
       actionCheck.check(spark, partition_date)
+    } else if (cmd == "pvCheck") {
       pvChecker.check(spark, partition_date)
+    } else {
+      println("unknown cmd; supported: actionCheck|pvCheck")
+    }

     spark.stop()
   }
 }
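The entry point now takes two arguments, a partition date and a command selecting which checker to run. A minimal sketch of the resulting invocations; the date value is a placeholder, not from this commit:

// Hypothetical calls; "20190101" is a placeholder yyyyMMdd partition date.
Main.main(Array("20190101", "actionCheck")) // dispatches to actionCheck.check
Main.main(Array("20190101", "pvCheck"))     // dispatches to pvChecker.check
Main.main(Array("20190101", "other"))       // falls through to the unknown-cmd message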
@@ -28,7 +28,7 @@ object Utils {
   def getDateByStringAndFormatStr(date: String, formatString: String = "yyyyMMdd") = {
     val dateFormat = new SimpleDateFormat(formatString)
-    dateFormat.parse(partition_date)
+    dateFormat.parse(date)
   }

   def getYesterday(date: Date) = {
......
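The new pvChecker code below calls Utils.getYesterDayStrByTodayStr, which is not shown in this diff. A minimal sketch of what such a helper could look like, assuming yyyyMMdd strings as elsewhere in Utils; the local names are illustrative:

import java.text.SimpleDateFormat
import java.util.Calendar

def getYesterDayStrByTodayStr(today: String): String = {
  val fmt = new SimpleDateFormat("yyyyMMdd") // same default format as above
  val cal = Calendar.getInstance()
  cal.setTime(fmt.parse(today))              // parse today's partition date
  cal.add(Calendar.DAY_OF_MONTH, -1)         // step back one day
  fmt.format(cal.getTime)                    // back to a yyyyMMdd string
}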
 package com.gmei.data.dq

+import java.text.SimpleDateFormat
+import java.util.Date
+import javax.rmi.CORBA.Util

 import org.apache.spark.sql.SaveMode
 import org.apache.spark.sql.SparkSession
......
 package com.gmei.data.dq

-import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.{Row, SaveMode, SparkSession}

-object pvReferCheker {
-  def check(sc: SparkSession, partition_date: String) = {
-    import sc.implicits._
-    val x = sc.sql(
+object pvChecker {
+  private[this] def getSql(date: String): String = {
     s"""
        |select
-       | z.partition_date,
-       | z.cl_type,
-       | z.name,
-       | z.active_type,
-       | z.referrer,
-       | sum(z.c)
+       | z.partition_date as partition_date,
+       | z.cl_type as cl_type,
+       | z.name as name,
+       | z.active_type as active_type,
+       | sum(z.c) as duration
        |from
        |(
        | select
@@ -24,7 +20,6 @@ object pvReferCheker {
        | y.cl_type as cl_type,
        | x.active_type as active_type,
        | y.name as name,
-       | y.referrer as referrer,
        | y.c as c
        | from
        | (
@@ -33,25 +28,53 @@ object pvReferCheker {
        | partition_date,
        | case when active_type = '1' or active_type = '2' or active_type = '3' then 'new' else 'old' end as active_type
        | from online.ml_device_day_active_status
-       | where partition_date = '${partition_date}'
+       | where partition_date = '${date}'
        | ) x left join (
        | select
        | cl_id,
        | partition_date,
        | cl_type,
        | params['page_name'] as name,
-       | params['referrer'] as referrer,
        | (params['out'] - params['in']) as c
        | from
        | online.bl_hdfs_maidian_updates
        | where
-       | partition_date = '${partition_date}' and action = 'page_view'
+       | partition_date = '${date}' and action = 'page_view'
        | ) y on x.device_id = y.cl_id
        | where y.cl_id is not null
        |) z
-       |group by z.partition_date, z.cl_type, z.name, z.active_type, z.referrer
-      """
-      .stripMargin
-    )
+       |group by z.partition_date, z.cl_type, z.name, z.active_type
+     """.stripMargin
+  }
+
+  def check(sc: SparkSession, partition_date: String) = {
+    import sc.implicits._
+    val x = sc.sql(getSql(partition_date))
+    x.createTempView("x")
+    val yesterday = Utils.getYesterDayStrByTodayStr(partition_date)
+    val y = sc.sql(getSql(yesterday))
+    y.createTempView("y")
+    val z = sc.sql(
+      """
+        | select
+        | x.partition_date as date,
+        | y.partition_date as yesterday,
+        | x.name,
+        | x.cl_type,
+        | x.active_type,
+        | x.duration as todayCount,
+        | y.duration as yesterdayCount,
+        | case when y.duration = 0 then 1.0 else (x.duration - y.duration) * 1.0 / y.duration end as chainRate
+        | from x left join y on x.cl_type = y.cl_type and x.name = y.name and x.active_type = y.active_type
+      """.stripMargin)
+    val tidb = Utils.getTidbConnectionInfo
+    // z is the dataframe containing the data to write
+    z.write.mode(SaveMode.Append).jdbc(tidb._1, "pv_check", tidb._2)
+    // TODO: delete data before 14 days ago
   }
 }
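Utils.getTidbConnectionInfo is consumed here as a (url, properties) pair passed straight into DataFrameWriter.jdbc. A minimal sketch of such a helper, assuming TiDB is reached through its MySQL-compatible JDBC endpoint; the host, database, and credentials are placeholders, not values from this commit:

import java.util.Properties

def getTidbConnectionInfo: (String, Properties) = {
  val url = "jdbc:mysql://tidb-host:4000/dq"   // hypothetical host and database
  val props = new Properties()
  props.put("user", "dq_writer")               // hypothetical credentials
  props.put("password", "********")
  props.put("driver", "com.mysql.jdbc.Driver") // MySQL driver also speaks to TiDB
  (url, props)
}

Since the write uses SaveMode.Append, re-running the job for the same date would duplicate rows in pv_check; the TODO about purging data older than 14 days could be handled with a plain JDBC DELETE before each append.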