Commit bc002afe authored by Pengfei Xue's avatar Pengfei Xue

use df.rdd.map

parent 8c384665
...@@ -4,7 +4,6 @@ import org.apache.spark.sql.{Row, SparkSession} ...@@ -4,7 +4,6 @@ import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
case class Record( case class Record(
cl_id: String, action: String, app_version: String, page_name: String, cl_id: String, action: String, app_version: String, page_name: String,
extra: Int, referrer: Int, is_push: Int, in: Int, out: Int, extra: Int, referrer: Int, is_push: Int, in: Int, out: Int,
...@@ -51,9 +50,9 @@ object pvCheker { ...@@ -51,9 +50,9 @@ object pvCheker {
import sc.implicits._ import sc.implicits._
import sc.sqlContext.implicits._ import sc.sqlContext.implicits._
val y = df.as[Record].map { val y = df.rdd.map {
case r => Seq(r.extra, r.referrer, r.is_push, r.in, r.out, r.referrer_id, r.referrer_tab_name, r.bz_id) case r: Record => Seq(r.extra, r.referrer, r.is_push, r.in, r.out, r.referrer_id, r.referrer_tab_name, r.bz_id)
}.rdd }
val z = y map {i => Vectors.dense(i.toArray[Double])} val z = y map {i => Vectors.dense(i.toArray[Double])}
val summary: MultivariateStatisticalSummary = Statistics.colStats(z) val summary: MultivariateStatisticalSummary = Statistics.colStats(z)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment