diff --git a/src/main/scala/tech/sourced/gemini/Hash.scala b/src/main/scala/tech/sourced/gemini/Hash.scala index 0d3ffe05..aeeba3d5 100644 --- a/src/main/scala/tech/sourced/gemini/Hash.scala +++ b/src/main/scala/tech/sourced/gemini/Hash.scala @@ -8,6 +8,7 @@ import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.cassandra._ import org.apache.spark.sql.functions._ +import org.apache.spark.sql.functions.{udf => sparkUdf} // udf name conflicts with engine import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.storage.StorageLevel import org.bblfsh.client.BblfshClient @@ -42,6 +43,9 @@ class Hash(session: SparkSession, mode: String = Gemini.fileSimilarityMode, docFreqPath: String = "") { + // very small files produce too many false positives + val fileSizeThresholdBytes = 500 + import session.implicits._ def report(header: String, countProcessed: Long, skipped: MapAccumulator): Unit = { @@ -120,20 +124,31 @@ class Hash(session: SparkSession, protected def filesForRepos(repos: DataFrame): DataFrame = { log.warn("Listing files") + val fileSizeUdf = sparkUdf { (content: Array[Byte]) => content.size } + repos .getHEAD .getCommits .getTreeEntries .getBlobs - .filter(r => !Enry.isVendor(r.getAs[String]("path"))) .filter('is_binary === false) + .filter(r => !Enry.isVendor(r.getAs[String]("path"))) + .withColumn("content_size", fileSizeUdf('content)) + .filter('content_size !== 0) // empty files only pollute results } protected def extractUast(files: DataFrame): DataFrame = { log.warn("Extracting UASTs") - files - .dropDuplicates("blob_id") + val blobs = files.dropDuplicates("blob_id") + + val filteredBlobs = if (mode == Gemini.fileSimilarityMode) { + blobs.filter('content_size > fileSizeThresholdBytes) + } else { + blobs + } + + filteredBlobs .classifyLanguages .filter('lang.isNotNull) .extractUASTs diff --git a/src/test/scala/tech/sourced/gemini/ReportSpec.scala 
b/src/test/scala/tech/sourced/gemini/ReportSpec.scala index e453604b..882e4ccd 100644 --- a/src/test/scala/tech/sourced/gemini/ReportSpec.scala +++ b/src/test/scala/tech/sourced/gemini/ReportSpec.scala @@ -104,12 +104,12 @@ class ReportSpec extends FlatSpec val similarGroups = report.findSimilarItems("/tmp/report-files-test", Gemini.fileSimilarityMode) println("Done") - similarGroups should have size 6 + similarGroups should have size 7 val files = similarGroups.head.map(_.toString) files.toSeq should contain theSameElementsAs Seq( - "https://github.com/erizocosmico/borges/blob/b1fcd3bf0ba810c05cb418babc09cc7f7783cc03/fixtures_test.go", - "https://github.com/src-d/borges/blob/e784f9d5f59d5c081c5f8f71b6c517918b899df0/fixtures_test.go" + "https://github.com/erizocosmico/borges/blob/b1fcd3bf0ba810c05cb418babc09cc7f7783cc03/consumer_test.go", + "https://github.com/src-d/borges/blob/e784f9d5f59d5c081c5f8f71b6c517918b899df0/consumer_test.go" ) }