From d8fd40f816a6f03bd3c6483efbd4059bc26e8436 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=9B=85=E5=96=86?= <gaoyazhe@igengmei.com>
Date: Mon, 8 Oct 2018 17:02:04 +0800
Subject: [PATCH] change similar queue length

---
 eda/node2vec/src/main/scala/com/gmei/Main.scala | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/eda/node2vec/src/main/scala/com/gmei/Main.scala b/eda/node2vec/src/main/scala/com/gmei/Main.scala
index d989a49e..8214b12a 100644
--- a/eda/node2vec/src/main/scala/com/gmei/Main.scala
+++ b/eda/node2vec/src/main/scala/com/gmei/Main.scala
@@ -149,7 +149,7 @@ object Main {
       val matrix = new IndexedRowMatrix(rows)
       val lsh = new Lsh(
         minCosineSimilarity = 0.5,
-        dimensions = 128,
+        dimensions = 20,
         numNeighbours = 200,
         numPermutations = 10,
         partitions = 200,
@@ -173,12 +173,12 @@ object Main {
 
 
       // group by neighbours to get a list of similar words and then take top k
-      val result = remapSecond.groupBy(_._1).map {
+      val result = remapSecond.filter(_._1.startsWith("diary")).groupBy(_._1).map {
         case (word1, similarWords) =>
-          // sort by score desc. and take top 10 entries
-          val similar = similarWords.toSeq.sortBy(-1 * _._3).filter(_._2.startsWith("diary")).take(50).map(_._2).mkString(",")
+          // sort by score desc., keep only neighbours starting with "diary", then take the top 20
+          val similar = Try(similarWords.toSeq.sortBy(-1 * _._3).filter(_._2.startsWith("diary")).take(20).map(_._2).mkString(",")).getOrElse(null)
           (word1,s"$similar")
-      }
+      }.filter(_._2.split(",").length > 9)
       result.take(20).foreach(println)
 
       val similar_result = result.toDF("cid","similarity_cid")
-- 
2.18.0