From 315bf54d09447e55d2ac8ac5537ca1823fcc57fc Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Thu, 14 Feb 2019 13:44:12 +0100 Subject: [PATCH] filter files with null language The extractUASTs UDF sends files whose language is null to bblfsh, leaving bblfsh to guess the language itself; this doesn't make sense and slows down hashing. Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/Hash.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/tech/sourced/gemini/Hash.scala b/src/main/scala/tech/sourced/gemini/Hash.scala index 6a265d30..0d3ffe05 100644 --- a/src/main/scala/tech/sourced/gemini/Hash.scala +++ b/src/main/scala/tech/sourced/gemini/Hash.scala @@ -135,6 +135,7 @@ class Hash(session: SparkSession, files .dropDuplicates("blob_id") .classifyLanguages + .filter('lang.isNotNull) .extractUASTs .select("repository_id", "path", "blob_id", "uast") .filter(_.getAs[Seq[Array[Byte]]]("uast").nonEmpty)