diff --git a/README.md b/README.md index d4345180..6ae1263d 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,11 @@ Hash command specific arguments: * `-l/--limit` - limit the number of repositories to be processed. All repositories will be processed by default * `-f/--format` - format of the stored repositories. Supported input data formats that repositories could be stored in are `siva`, `bare` or `standard`, default `siva` +Report specific arguments: + + * `--output-format` - output format: `text` or `json` + * `--cassandra` - enable advanced CQL queries for the Apache Cassandra database + ## Development ### Compile & Run diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala index 0ac8a327..d6cf3059 100644 --- a/src/main/scala/tech/sourced/gemini/Gemini.scala +++ b/src/main/scala/tech/sourced/gemini/Gemini.scala @@ -108,18 +108,17 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini. * It is used one query per distinct file * * @param conn Database connections - * @param format Duplicated items mode + * @param advancedCql whether to use advanced CQL queries (supported only by Apache Cassandra) * @param ccDirPath directory for connected components * @return */ - def report(conn: Session, format: String, ccDirPath: String): ReportResult = { + def report(conn: Session, advancedCql: Boolean, ccDirPath: String): ReportResult = { val report = new Report(conn, log, keyspace, tables) log.info(s"Report duplicate items from DB $keyspace") - val duplicates = format match { - case ReportApp.defaultFmt => ReportExpandedGroup(report.findAllDuplicateItems()) - case ReportApp.defaultFmtGroupBy => ReportExpandedGroup(report.reportCassandraGroupBy()) - case ReportApp.condensedFmt => ReportGrouped(report.reportCassandraCondensed()) + val duplicates = advancedCql match { + case false => ReportDuplicates(report.findAllDuplicateItems()) + case true => ReportDuplicates(report.reportCassandraGroupBy()) } 
log.info(s"${duplicates.size} duplicate SHA1s") diff --git a/src/main/scala/tech/sourced/gemini/Report.scala b/src/main/scala/tech/sourced/gemini/Report.scala index a2bd527e..0260848e 100644 --- a/src/main/scala/tech/sourced/gemini/Report.scala +++ b/src/main/scala/tech/sourced/gemini/Report.scala @@ -21,7 +21,7 @@ import scala.sys.process._ */ case class ReportResult(duplicates: ReportDuplicates, similar: Iterable[Iterable[SimilarItem]]) -sealed abstract class ReportDuplicates(v: Iterable[Any]) { +case class ReportDuplicates(v: Iterable[Iterable[RepoFile]]) { def empty(): Boolean = { v.isEmpty } @@ -29,25 +29,8 @@ sealed abstract class ReportDuplicates(v: Iterable[Any]) { def size(): Int = v.size } -case class ReportByLine(v: Iterable[RepoFile]) extends ReportDuplicates(v) - -case class ReportGrouped(v: Iterable[DuplicateBlobHash]) extends ReportDuplicates(v) - -case class ReportExpandedGroup(v: Iterable[Iterable[RepoFile]]) extends ReportDuplicates(v) - class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) { - /** - * Finds duplicate files among hashed repositories - * It is used only one query - * (Only supported by Apache Cassandra databases) - * - * @return - */ - def reportCassandraCondensed(): Iterable[DuplicateBlobHash] = { - findAllDuplicateBlobHashes() - } - /** * Finds duplicate files among hashed repositories * It is used one query per unique duplicate file, plus an extra one @@ -56,7 +39,7 @@ class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) * @return */ def reportCassandraGroupBy(): Iterable[Iterable[RepoFile]] = { - reportCassandraCondensed() + findAllDuplicateBlobHashes() .map { item => Database.findFilesByHash(item.sha, conn, keyspace, tables) } diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 64aac13d..2d266dd4 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ 
b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -1,6 +1,9 @@ package tech.sourced.gemini.cmd import com.datastax.driver.core.Cluster +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper import tech.sourced.gemini._ import tech.sourced.gemini.util.Logger @@ -9,16 +12,21 @@ case class ReportAppConfig( host: String = Gemini.defaultCassandraHost, port: Int = Gemini.defaultCassandraPort, keyspace: String = Gemini.defautKeyspace, - format: String = ReportApp.defaultFmt, ccDirPath: String = ".", - verbose: Boolean = false, - mode: String = Gemini.fileSimilarityMode + mode: String = Gemini.fileSimilarityMode, + output: String = ReportApp.defaultOutput, + cassandra: Boolean = false, + verbose: Boolean = false ) object ReportApp extends App { + val outputText = "text" + val outputJson = "json" + val outputs = Array(outputText, outputJson) + val defaultOutput = outputText + val defaultFmt = "" val defaultFmtGroupBy = "use-group-by" - val condensedFmt = "condensed" val parser = new Parser[ReportAppConfig]("./report") { head("Gemini Report") @@ -40,12 +48,19 @@ object ReportApp extends App { opt[Unit]('v', "verbose") .action((_, c) => c.copy(verbose = true)) .text("producing more verbose debug output") - opt[String]("format") - .valueName("use-group-by or condensed") - .action((x, c) => c.copy(format = x)) - .text("Only for Apache Cassandra database\n" + - "use-group-by - use as many queries as unique duplicate files are found, plus one.\n" + - "condensed - use only one query to find the duplicates.") + opt[String]("output-format") + .valueName(outputs.mkString(" | ")) + .validate(x => + if (outputs contains x) { + success + } else { + failure(s"output must be one of: " + outputs.mkString(" | ")) + }) + .action((x, c) => c.copy(output = x)) + .text("output format") + opt[Boolean]("cassandra") + .action((x, c) => c.copy(cassandra = 
x)) + .text("Enable advanced cql queries for Apache Cassandra database") } parser.parseWithEnv(args, ReportAppConfig()) match { @@ -64,10 +79,12 @@ object ReportApp extends App { log.info("Checking DB schema") gemini.applySchema(cassandra) - val ReportResult(duplicates, similarities) = gemini.report(cassandra, config.format, config.ccDirPath) + val result = gemini.report(cassandra, config.cassandra, config.ccDirPath) - print(duplicates) - printCommunities(similarities) + config.output match { + case `outputText` => printAsText(result) + case `outputJson` => printAsJson(result) + } log.info("Closing DB connection") cassandra.close() @@ -77,23 +94,22 @@ object ReportApp extends App { System.exit(2) } - def print(report: ReportDuplicates): Unit = { - report match { - case e if e.empty() => println(s"No duplicated files found.") - case ReportGrouped(v) => println(s"Duplicated files found:\n\t" + (v mkString "\n\t")) - case ReportExpandedGroup(v) => - v.foreach { item => - val count = item.size - println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n") - } + def printAsText(result: ReportResult): Unit = { + val ReportResult(duplicates, similarities) = result + + if (duplicates.empty()) { + println(s"No duplicated files found.") + } else { + duplicates.v.foreach { item => + val count = item.size + println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n") + } } - } - def printCommunities(report: Iterable[Iterable[SimilarItem]]): Unit = { - if (report.isEmpty) { + if (similarities.isEmpty) { println(s"No similarities found.") } else { - report.foreach { community => + similarities.foreach { community => val count = community.size val typeName = community.head match { case SimilarFunc(_, _, _) => "functions" @@ -103,4 +119,18 @@ object ReportApp extends App { } } } + + def printAsJson(result: ReportResult): Unit = { + val ReportResult(duplicates, similarities) = result + + val mapper = new ObjectMapper() with ScalaObjectMapper + 
mapper.registerModule(DefaultScalaModule) + + val str = mapper.writeValueAsString(Map( + "duplicates" -> duplicates.v, + "similarities" -> similarities + )) + println(str) + } + } diff --git a/src/test/scala/tech/sourced/gemini/ReportSpec.scala b/src/test/scala/tech/sourced/gemini/ReportSpec.scala index 85df6fc7..c1402a97 100644 --- a/src/test/scala/tech/sourced/gemini/ReportSpec.scala +++ b/src/test/scala/tech/sourced/gemini/ReportSpec.scala @@ -65,17 +65,6 @@ class ReportSpec extends FlatSpec super.afterAll() } - "Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in { - val report = new Report(session, logger, DUPLICATES, Gemini.tables) - - println("Query") - val result = report.reportCassandraCondensed() - println("Done") - - result should have size expectedDuplicateFiles.size - result foreach (_.count should be(2)) - } - "Detailed Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in { val report = new Report(session, logger, DUPLICATES, Gemini.tables)