From 8c02daaa93cf3629fb540e12ab0bc5b94ddaa593 Mon Sep 17 00:00:00 2001
From: Maxim Sukharev
Date: Fri, 15 Feb 2019 15:49:49 +0100
Subject: [PATCH 1/4] Remove empty files from hashing

On a real test dataset it produces 1500 duplicates and is not valuable
for a user.

Signed-off-by: Maxim Sukharev
---
 src/main/scala/tech/sourced/gemini/Hash.scala | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/tech/sourced/gemini/Hash.scala b/src/main/scala/tech/sourced/gemini/Hash.scala
index 0d3ffe05..8c54e64e 100644
--- a/src/main/scala/tech/sourced/gemini/Hash.scala
+++ b/src/main/scala/tech/sourced/gemini/Hash.scala
@@ -8,6 +8,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.cassandra._
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.functions.{udf => sparkUdf} // udf name conflicts with engine
 import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.storage.StorageLevel
 import org.bblfsh.client.BblfshClient
@@ -120,13 +121,17 @@ class Hash(session: SparkSession,
   protected def filesForRepos(repos: DataFrame): DataFrame = {
     log.warn("Listing files")
 
+    val fileSizeUdf = sparkUdf { (content: Array[Byte]) => content.size }
+
     repos
       .getHEAD
       .getCommits
       .getTreeEntries
       .getBlobs
-      .filter(r => !Enry.isVendor(r.getAs[String]("path")))
       .filter('is_binary === false)
+      .filter(r => !Enry.isVendor(r.getAs[String]("path")))
+      .withColumn("content_size", fileSizeUdf('content))
+      .filter('content_size !== 0) // empty files only pollute results
   }
 
   protected def extractUast(files: DataFrame): DataFrame = {

From bdffa6298f9c73f3011a4047de05a6ad8a0ae38c Mon Sep 17 00:00:00 2001
From: Maxim Sukharev
Date: Fri, 15 Feb 2019 16:06:52 +0100
Subject: [PATCH 2/4] Exclude very small files from similarity hashing

On a real dataset, too-small files produce too many false positives.
Also, very small files as duplicates aren't very valuable; it doesn't
make sense to abstract common code for them.

Signed-off-by: Maxim Sukharev
---
 src/main/scala/tech/sourced/gemini/Hash.scala | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/main/scala/tech/sourced/gemini/Hash.scala b/src/main/scala/tech/sourced/gemini/Hash.scala
index 8c54e64e..cd4b75ae 100644
--- a/src/main/scala/tech/sourced/gemini/Hash.scala
+++ b/src/main/scala/tech/sourced/gemini/Hash.scala
@@ -43,6 +43,9 @@ class Hash(session: SparkSession,
            mode: String = Gemini.fileSimilarityMode,
            docFreqPath: String = "") {
 
+  // very small files produce too much false positives
+  val fileSizeThresholdBytes = 500
+
   import session.implicits._
 
   def report(header: String, countProcessed: Long, skipped: MapAccumulator): Unit = {
@@ -139,6 +142,7 @@ class Hash(session: SparkSession,
 
     files
       .dropDuplicates("blob_id")
+      .filter('content_size > fileSizeThresholdBytes)
       .classifyLanguages
       .filter('lang.isNotNull)
       .extractUASTs

From f1131f5f03d3af85dda8bdf9c8c64acdc0eb9d4e0a Mon Sep 17 00:00:00 2001
From: Maxim Sukharev
Date: Tue, 19 Feb 2019 12:52:44 +0100
Subject: [PATCH 3/4] apply file limit only in file-similarity mode

Signed-off-by: Maxim Sukharev
---
 src/main/scala/tech/sourced/gemini/Hash.scala | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Hash.scala b/src/main/scala/tech/sourced/gemini/Hash.scala
index cd4b75ae..aeeba3d5 100644
--- a/src/main/scala/tech/sourced/gemini/Hash.scala
+++ b/src/main/scala/tech/sourced/gemini/Hash.scala
@@ -140,9 +140,15 @@ class Hash(session: SparkSession,
   protected def extractUast(files: DataFrame): DataFrame = {
     log.warn("Extracting UASTs")
 
-    files
-      .dropDuplicates("blob_id")
-      .filter('content_size > fileSizeThresholdBytes)
+    val blobs = files.dropDuplicates("blob_id")
+
+    val filteredBlobs = if (mode == Gemini.fileSimilarityMode) {
+      blobs.filter('content_size > fileSizeThresholdBytes)
+    } else {
+      blobs
+    }
+
+    filteredBlobs
       .classifyLanguages
       .filter('lang.isNotNull)
       .extractUASTs

From 62566ea078033a3fa9c0598cfc8c7f6fbaabc82a Mon Sep 17 00:00:00 2001
From: Maxim Sukharev
Date: Tue, 19 Feb 2019 13:02:36 +0100
Subject: [PATCH 4/4] update test due to changed output of similarity

Without too-small files, Gemini is able to find one more similar file.
I validated the content of all results manually.

Signed-off-by: Maxim Sukharev
---
 src/test/scala/tech/sourced/gemini/ReportSpec.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/test/scala/tech/sourced/gemini/ReportSpec.scala b/src/test/scala/tech/sourced/gemini/ReportSpec.scala
index e453604b..882e4ccd 100644
--- a/src/test/scala/tech/sourced/gemini/ReportSpec.scala
+++ b/src/test/scala/tech/sourced/gemini/ReportSpec.scala
@@ -104,12 +104,12 @@ class ReportSpec extends FlatSpec
     val similarGroups = report.findSimilarItems("/tmp/report-files-test", Gemini.fileSimilarityMode)
     println("Done")
 
-    similarGroups should have size 6
+    similarGroups should have size 7
 
     val files = similarGroups.head.map(_.toString)
     files.toSeq should contain theSameElementsAs Seq(
-      "https://github.com/erizocosmico/borges/blob/b1fcd3bf0ba810c05cb418babc09cc7f7783cc03/fixtures_test.go",
-      "https://github.com/src-d/borges/blob/e784f9d5f59d5c081c5f8f71b6c517918b899df0/fixtures_test.go"
+      "https://github.com/erizocosmico/borges/blob/b1fcd3bf0ba810c05cb418babc09cc7f7783cc03/consumer_test.go",
+      "https://github.com/src-d/borges/blob/e784f9d5f59d5c081c5f8f71b6c517918b899df0/consumer_test.go"
     )
   }
