From d20e2412735e33e40be1cf4ed549a4416919c889 Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Fri, 15 Feb 2019 15:23:28 +0100 Subject: [PATCH] Remove duplicated repositories In case of bare or standard repositories the engine can return more than 1 repository for the same directory. The most common case is when you just "git clone" a remote repository and run gemini on it. The engine would produce 2 different repositories: - local one: file://path/to/directory - remote one: https://github.com/... As a result all files in the repository will have duplicates, which is incorrect from the user's point of view. Signed-off-by: Maxim Sukharev --- .../scala/tech/sourced/gemini/Gemini.scala | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala index 95b799d1..2d98444f 100644 --- a/src/main/scala/tech/sourced/gemini/Gemini.scala +++ b/src/main/scala/tech/sourced/gemini/Gemini.scala @@ -9,8 +9,8 @@ import org.eclipse.jgit.lib.Constants.OBJ_BLOB import org.eclipse.jgit.lib.ObjectInserter import org.slf4j.{Logger => Slf4jLogger} import tech.sourced.engine.Engine +import tech.sourced.engine._ import tech.sourced.featurext.generated.service.FeatureExtractorGrpc.FeatureExtractor -import tech.sourced.gemini.cmd.ReportApp import tech.sourced.gemini.util.{Logger, URLFormatter} import scala.io.Source @@ -61,19 +61,52 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini. 
def getRepos(reposPath: String, limit: Int = 0, format: String = "siva"): DataFrame = { val engine = Engine(session, reposPath, format) val repos = engine.getRepositories + // siva files contain only 1 remote per repository + // no need for deduplication + val onlyUnique = format != "siva" + + // get repository ids to filter jgit-results + if (onlyUnique || limit > 0) { + val reposIdsSubset = if (onlyUnique) { + uniqueRepoIds(repos) + } else { + repos.select("id") + } - // engine.getRepositories.limit(n)...getFiles - doesn't work in engine now - // https://github.com/src-d/engine/issues/267 - // use workaround with filter - if (limit <= 0) { - repos - } else { - log.info(s"Using only $limit repositories") - val repoIds = repos.limit(limit).select($"id").collect().map(_ (0)) + val limitedReposIds = if (limit > 0) { + log.info(s"Using only $limit repositories") + reposIdsSubset.limit(limit) + } else { + reposIdsSubset + } + + val repoIds = limitedReposIds.collect().map(_ (0)) repos.filter($"id".isin(repoIds: _*)) + } else { + repos } } + def uniqueRepoIds(repos: DataFrame): DataFrame = { + repos + .getHEAD + .groupByKey(row => row.getAs[String]("hash")) + .reduceGroups {(r1, r2) => + // In case multiple repositories have the same HEAD we keep only one + // we prefer the remote one over the local one + // in case of multiple remotes we choose a random one + val v1 = r1.getAs[String]("repository_id") + if (v1.isEmpty || v1.startsWith("file://")) { + r2 + } else { + r1 + } + } + // FIXME there must be a better way to do this but I couldn't find it + .withColumnRenamed("ReduceAggregator(org.apache.spark.sql.Row)", "tmp") + .select("tmp.repository_id") + } + val fileWithFuncPattern: Regex = "(.+):(.+):([0-9]+)".r /**