From d8fd40f816a6f03bd3c6483efbd4059bc26e8436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E9=9B=85=E5=96=86?= <gaoyazhe@igengmei.com> Date: Mon, 8 Oct 2018 17:02:04 +0800 Subject: [PATCH] change similar queue length --- eda/node2vec/src/main/scala/com/gmei/Main.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eda/node2vec/src/main/scala/com/gmei/Main.scala b/eda/node2vec/src/main/scala/com/gmei/Main.scala index d989a49e..8214b12a 100644 --- a/eda/node2vec/src/main/scala/com/gmei/Main.scala +++ b/eda/node2vec/src/main/scala/com/gmei/Main.scala @@ -149,7 +149,7 @@ object Main { val matrix = new IndexedRowMatrix(rows) val lsh = new Lsh( minCosineSimilarity = 0.5, - dimensions = 128, + dimensions = 20, numNeighbours = 200, numPermutations = 10, partitions = 200, @@ -173,12 +173,12 @@ object Main { // group by neighbours to get a list of similar words and then take top k - val result = remapSecond.groupBy(_._1).map { + val result = remapSecond.filter(_._1.startsWith("diary")).groupBy(_._1).map { case (word1, similarWords) => - // sort by score desc. and take top 10 entries - val similar = similarWords.toSeq.sortBy(-1 * _._3).filter(_._2.startsWith("diary")).take(50).map(_._2).mkString(",") + // sort by score desc. and take top 20 entries + val similar = Try(similarWords.toSeq.sortBy(-1 * _._3).filter(_._2.startsWith("diary")).take(20).map(_._2).mkString(",")).getOrElse(null) (word1,s"$similar") - } + }.filter(_._2.split(",").length > 9) result.take(20).foreach(println) val similar_result = result.toDF("cid","similarity_cid") -- 2.18.0