拆分新老用户重复曝光指标

6aa7d071 · 王志伟 · 8ab1cc5c · 6aa7d071
Commit 6aa7d071 authored Feb 27, 2019 by 王志伟
Hide whitespace changes
Inline Side-by-side

Showing with 133 additions and 2 deletions

temp_count.scala eda/feededa/src/main/scala/com/gmei/temp_count.scala +133 -2

No files found.
--- a/eda/feededa/src/main/scala/com/gmei/temp_count.scala
+++ b/eda/feededa/src/main/scala/com/gmei/temp_count.scala
@@ -218,8 +218,8 @@ object Repeated_content_recommendation {
      ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
-//      val stat_date = GmeiConfig.getMinusNDate(1)
+      val stat_date = GmeiConfig.getMinusNDate(1)
-      val stat_date = param.date
+//      val stat_date = param.date
      val partition_date = stat_date.replace("-","")
@@ -352,6 +352,137 @@ object Repeated_content_recommendation {
+/**
+  * Spark job that measures repeated diary exposure for one statistics date, restricted to
+  * the devices selected into the `device_id_old` view (presumably the "old user" cohort,
+  * i.e. `active_type = '4'` -- confirm against the table definition), and appends the
+  * counts to the `Repeated_evaluation_indicator_old` table.
+  *
+  * Pipeline:
+  *  1. `agency_id`: device ids to exclude (hospital spam day/month tables plus `blacklist`).
+  *  2. `device_id_old`: active devices of the cohort, outside the listed channels and not excluded.
+  *  3. Exposure count per (device_id, cid_id) pair for `cid_type = 'diary'` on `stat_date`.
+  *  4. Output row: (stat_date, exposures of pairs seen at least twice, all exposures).
+  */
+object Repeated_content_recommendation_old {
+  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
+  Logger.getLogger("org.apache.eclipse.jetty.server").setLevel(Level.OFF)
+  /** Command-line parameters: target environment and the statistics date (yyyy-MM-dd). */
+  case class Params(env: String = "dev",
+                    date: String = "2018-08-01"
+                   ) extends AbstractParams[Params] with Serializable
+  val defaultParams = Params()
+  // NOTE(review): the program name and usage text below still mention "WeafareStat";
+  // presumably copied from another job -- confirm before changing the strings.
+  val parser = new OptionParser[Params]("Feed_EDA") {
+    head("WeafareStat")
+    opt[String]("env")
+      .text(s"the databases environment you used")
+      .action((x, c) => c.copy(env = x))
+    opt[String] ("date")
+      .text(s"the date you used")
+      .action((x,c) => c.copy(date = x))
+    note(
+      """
+        |For example, the following command runs this app on a tidb dataset:
+        |
+        | spark-submit --class com.gmei.WeafareStat ./target/scala-2.11/feededa-assembly-0.1.jar \
+      """.stripMargin +
+        s"|   --env ${defaultParams.env}"
+    )
+  }
+  /**
+    * Entry point: parses `--env` / `--date`, runs the pipeline and writes one result row.
+    *
+    * @param args command-line arguments understood by `parser`
+    */
+  def main(args: Array[String]): Unit = {
+    parser.parse(args, defaultParams).map { param =>
+      GmeiConfig.setup(param.env)
+      val spark_env = GmeiConfig.getSparkSession()
+      val sc = spark_env._2
+      val ti = new TiContext(sc)
+      ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure_precise")
+      ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_click")
+      ti.tidbMapTable(dbName = "jerry_prod", tableName = "blacklist")
+      ti.tidbMapTable(dbName = "jerry_prod", tableName = "data_feed_exposure")
+      ti.tidbMapTable(dbName = "jerry_prod", tableName = "merge_queue_table")
+      //      val stat_date = GmeiConfig.getMinusNDate(1)
+      val stat_date = param.date
+      // Partition columns use the compact yyyyMMdd form.
+      val partition_date = stat_date.replace("-","")
+      // Devices to exclude: flagged by the hospital spam tables or present in the blacklist.
+      val agency_id = sc.sql(
+        s"""
+           |SELECT DISTINCT(cl_id) as device_id
+           |FROM online.ml_hospital_spam_pv_day
+           |WHERE partition_date >= '20180402'
+           |AND partition_date <= '${partition_date}'
+           |AND pv_ratio >= 0.95
+           |UNION ALL
+           |SELECT DISTINCT(cl_id) as device_id
+           |FROM online.ml_hospital_spam_pv_month
+           |WHERE partition_date >= '20171101'
+           |AND partition_date <= '${partition_date}'
+           |AND pv_ratio >= 0.95
+           |UNION ALL
+           |select distinct(device_id)
+           |from blacklist
+         """.stripMargin
+      )
+      agency_id.createOrReplaceTempView("agency_id")
+      // Left join + "is null" acts as an anti-join: keep only devices absent from agency_id.
+      val device_id_oldUser = sc.sql(
+        s"""
+           |select distinct(om.device_id) as device_id
+           |from online.ml_device_day_active_status om left join agency_id
+           |on om.device_id = agency_id.device_id
+           |where om.active_type = '4'
+           |and om.first_channel_source_type not in ('yqxiu1','yqxiu2','yqxiu3','yqxiu4','yqxiu5','mxyc1','mxyc2','mxyc3'
+           |    ,'wanpu','jinshan','jx','maimai','zhuoyi','huatian','suopingjingling','mocha','mizhe','meika','lamabang'
+           |    ,'js-az1','js-az2','js-az3','js-az4','js-az5','jfq-az1','jfq-az2','jfq-az3','jfq-az4','jfq-az5','toufang1'
+           |    ,'toufang2','toufang3','toufang4','toufang5','toufang6','TF-toufang1','TF-toufang2','TF-toufang3','TF-toufang4'
+           |    ,'TF-toufang5','tf-toufang1','tf-toufang2','tf-toufang3','tf-toufang4','tf-toufang5','benzhan','promotion_aso100'
+           |    ,'promotion_qianka','promotion_xiaoyu','promotion_dianru','promotion_malioaso','promotion_malioaso-shequ'
+           |    ,'promotion_shike','promotion_julang_jl03','','unknown')
+           |and om.partition_date ='${partition_date}'
+           |and agency_id.device_id is null
+         """.stripMargin
+      )
+      device_id_oldUser.createOrReplaceTempView("device_id_old")
+      device_id_oldUser.show()
+      // The join must be keyed on device_id: without an ON clause this is a Cartesian
+      // product, which neither restricts exposures to the cohort nor keeps counts correct.
+      val exp_diary_old = sc.sql(
+        s"""
+           |select concat_ws('|',de.device_id,de.cid_id)
+           |from data_feed_exposure de inner join device_id_old
+           |on de.device_id = device_id_old.device_id
+           |where de.cid_type = 'diary'
+           |and de.stat_date ='${stat_date}'
+         """.stripMargin
+      )
+      // Exposure count per "device_id|cid_id" key. No sort is needed: only sums are taken.
+      val get_result_old = exp_diary_old.rdd.map((_, 1)).reduceByKey(_ + _)
+      val exposure_counts_old = get_result_old.map(_._2)
+      // fold(0) instead of reduce: reduce throws on an empty RDD (e.g. no repeated pairs).
+      val more_than2_old = exposure_counts_old.filter(_ >= 2).fold(0)(_ + _)
+      println(more_than2_old)
+      val all_old = exposure_counts_old.fold(0)(_ + _)
+      println(all_old)
+      // Guard the ratio so a day without exposures prints 0.0 rather than NaN.
+      val repeated_rate_old = if (all_old == 0) 0.0 else more_than2_old / all_old.toDouble
+      println(repeated_rate_old)
+      val result2=List((stat_date,more_than2_old,all_old))
+      val df2 = sc.createDataFrame(result2).toDF("stat_date","old_rep_count","old_imp_all")
+      GmeiConfig.writeToJDBCTable(df2, table = "Repeated_evaluation_indicator_old", SaveMode.Append)
+    }
+  }
+}
 object Repeated_content_recommendation_moreday {
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)