From 4bf3abafe16f50aa066230d87361cae7d5497a18 Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 15:54:05 +0100 Subject: [PATCH 1/7] add json output to report command Signed-off-by: Maxim Sukharev --- .../tech/sourced/gemini/cmd/ReportApp.scala | 55 +++++++++++++++---- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 64aac13d..8ca6dbd5 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -1,6 +1,9 @@ package tech.sourced.gemini.cmd import com.datastax.driver.core.Cluster +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper import tech.sourced.gemini._ import tech.sourced.gemini.util.Logger @@ -12,10 +15,16 @@ case class ReportAppConfig( format: String = ReportApp.defaultFmt, ccDirPath: String = ".", verbose: Boolean = false, - mode: String = Gemini.fileSimilarityMode + mode: String = Gemini.fileSimilarityMode, + output: String = ReportApp.defaultOutput ) object ReportApp extends App { + val outputText = "text" + val outputJson = "json" + val outputs = Array(outputText, outputJson) + val defaultOutput = outputText + val defaultFmt = "" val defaultFmtGroupBy = "use-group-by" val condensedFmt = "condensed" @@ -40,6 +49,16 @@ object ReportApp extends App { opt[Unit]('v', "verbose") .action((_, c) => c.copy(verbose = true)) .text("producing more verbose debug output") + opt[String]("output") + .valueName(outputs.mkString(" | ")) + .validate(x => + if (outputs contains x) { + success + } else { + failure(s"output must be one of: " + outputs.mkString(" | ")) + }) + .action((x, c) => c.copy(output = x)) + .text("output format") opt[String]("format") .valueName("use-group-by or condensed") .action((x, c) => 
c.copy(format = x)) @@ -64,10 +83,12 @@ object ReportApp extends App { log.info("Checking DB schema") gemini.applySchema(cassandra) - val ReportResult(duplicates, similarities) = gemini.report(cassandra, config.format, config.ccDirPath) + val result = gemini.report(cassandra, config.format, config.ccDirPath) - print(duplicates) - printCommunities(similarities) + config.output match { + case `outputText` => printAsText(result) + case `outputJson` => printAsJson(result) + } log.info("Closing DB connection") cassandra.close() @@ -77,8 +98,10 @@ object ReportApp extends App { System.exit(2) } - def print(report: ReportDuplicates): Unit = { - report match { + def printAsText(result: ReportResult): Unit = { + val ReportResult(duplicates, similarities) = result + + duplicates match { case e if e.empty() => println(s"No duplicated files found.") case ReportGrouped(v) => println(s"Duplicated files found:\n\t" + (v mkString "\n\t")) case ReportExpandedGroup(v) => @@ -87,13 +110,11 @@ object ReportApp extends App { println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n") } } - } - def printCommunities(report: Iterable[Iterable[SimilarItem]]): Unit = { - if (report.isEmpty) { + if (similarities.isEmpty) { println(s"No similarities found.") } else { - report.foreach { community => + similarities.foreach { community => val count = community.size val typeName = community.head match { case SimilarFunc(_, _, _) => "functions" @@ -103,4 +124,18 @@ object ReportApp extends App { } } } + + def printAsJson(result: ReportResult): Unit = { + val ReportResult(duplicates, similarities) = result + + val mapper = new ObjectMapper() with ScalaObjectMapper + mapper.registerModule(DefaultScalaModule) + + val str = mapper.writeValueAsString(Map( + "duplicates" -> duplicates, + "similarities" -> similarities + )) + println(str) + } + } From 035d3a1e4ac0dd98820b8641376e8b5f3f088b48 Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 16:08:57 +0100 Subject: [PATCH 2/7] 
Remove report condensed format [fix #23] Replace the --format param with a --cassandra flag, which enables advanced CQL queries. The functionality with Cassandra and ScyllaDB is now the same. Signed-off-by: Maxim Sukharev --- .../scala/tech/sourced/gemini/Gemini.scala | 11 +++++------ .../scala/tech/sourced/gemini/Report.scala | 15 +-------------- .../tech/sourced/gemini/cmd/ReportApp.scala | 19 +++++++------------ .../tech/sourced/gemini/ReportSpec.scala | 11 ----------- 4 files changed, 13 insertions(+), 43 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala index 0ac8a327..588a77a6 100644 --- a/src/main/scala/tech/sourced/gemini/Gemini.scala +++ b/src/main/scala/tech/sourced/gemini/Gemini.scala @@ -108,18 +108,17 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini. * It is used one query per distinct file * * @param conn Database connections - * @param format Duplicated items mode + * @param advancedCql use advanced cql or not (supported only by Apache Cassandra) * @param ccDirPath directory for connected components * @return */ - def report(conn: Session, format: String, ccDirPath: String): ReportResult = { + def report(conn: Session, advancedCql: Boolean, ccDirPath: String): ReportResult = { val report = new Report(conn, log, keyspace, tables) log.info(s"Report duplicate items from DB $keyspace") - val duplicates = format match { - case ReportApp.defaultFmt => ReportExpandedGroup(report.findAllDuplicateItems()) - case ReportApp.defaultFmtGroupBy => ReportExpandedGroup(report.reportCassandraGroupBy()) - case ReportApp.condensedFmt => ReportGrouped(report.reportCassandraCondensed()) + val duplicates = advancedCql match { + case false => ReportExpandedGroup(report.findAllDuplicateItems()) + case true => ReportExpandedGroup(report.reportCassandraGroupBy()) } log.info(s"${duplicates.size} duplicate SHA1s") diff --git a/src/main/scala/tech/sourced/gemini/Report.scala
b/src/main/scala/tech/sourced/gemini/Report.scala index a2bd527e..cb4293c0 100644 --- a/src/main/scala/tech/sourced/gemini/Report.scala +++ b/src/main/scala/tech/sourced/gemini/Report.scala @@ -31,23 +31,10 @@ sealed abstract class ReportDuplicates(v: Iterable[Any]) { case class ReportByLine(v: Iterable[RepoFile]) extends ReportDuplicates(v) -case class ReportGrouped(v: Iterable[DuplicateBlobHash]) extends ReportDuplicates(v) - case class ReportExpandedGroup(v: Iterable[Iterable[RepoFile]]) extends ReportDuplicates(v) class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) { - /** - * Finds duplicate files among hashed repositories - * It is used only one query - * (Only supported by Apache Cassandra databases) - * - * @return - */ - def reportCassandraCondensed(): Iterable[DuplicateBlobHash] = { - findAllDuplicateBlobHashes() - } - /** * Finds duplicate files among hashed repositories * It is used one query per unique duplicate file, plus an extra one @@ -56,7 +43,7 @@ class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) * @return */ def reportCassandraGroupBy(): Iterable[Iterable[RepoFile]] = { - reportCassandraCondensed() + findAllDuplicateBlobHashes() .map { item => Database.findFilesByHash(item.sha, conn, keyspace, tables) } diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 8ca6dbd5..6a5a543b 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -12,11 +12,11 @@ case class ReportAppConfig( host: String = Gemini.defaultCassandraHost, port: Int = Gemini.defaultCassandraPort, keyspace: String = Gemini.defautKeyspace, - format: String = ReportApp.defaultFmt, ccDirPath: String = ".", - verbose: Boolean = false, mode: String = Gemini.fileSimilarityMode, - output: String = ReportApp.defaultOutput + output: String = ReportApp.defaultOutput, + cassandra: Boolean 
= false, + verbose: Boolean = false ) object ReportApp extends App { @@ -27,7 +27,6 @@ object ReportApp extends App { val defaultFmt = "" val defaultFmtGroupBy = "use-group-by" - val condensedFmt = "condensed" val parser = new Parser[ReportAppConfig]("./report") { head("Gemini Report") @@ -59,12 +58,9 @@ object ReportApp extends App { }) .action((x, c) => c.copy(output = x)) .text("output format") - opt[String]("format") - .valueName("use-group-by or condensed") - .action((x, c) => c.copy(format = x)) - .text("Only for Apache Cassandra database\n" + - "use-group-by - use as many queries as unique duplicate files are found, plus one.\n" + - "condensed - use only one query to find the duplicates.") + opt[Boolean]("cassandra") + .action((x, c) => c.copy(cassandra = x)) + .text("Enable advanced cql queries for Apache Cassandra database") } parser.parseWithEnv(args, ReportAppConfig()) match { @@ -83,7 +79,7 @@ object ReportApp extends App { log.info("Checking DB schema") gemini.applySchema(cassandra) - val result = gemini.report(cassandra, config.format, config.ccDirPath) + val result = gemini.report(cassandra, config.cassandra, config.ccDirPath) config.output match { case `outputText` => printAsText(result) @@ -103,7 +99,6 @@ object ReportApp extends App { duplicates match { case e if e.empty() => println(s"No duplicated files found.") - case ReportGrouped(v) => println(s"Duplicated files found:\n\t" + (v mkString "\n\t")) case ReportExpandedGroup(v) => v.foreach { item => val count = item.size diff --git a/src/test/scala/tech/sourced/gemini/ReportSpec.scala b/src/test/scala/tech/sourced/gemini/ReportSpec.scala index 85df6fc7..c1402a97 100644 --- a/src/test/scala/tech/sourced/gemini/ReportSpec.scala +++ b/src/test/scala/tech/sourced/gemini/ReportSpec.scala @@ -65,17 +65,6 @@ class ReportSpec extends FlatSpec super.afterAll() } - "Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in { - val report = new Report(session, logger, 
DUPLICATES, Gemini.tables) - - println("Query") - val result = report.reportCassandraCondensed() - println("Done") - - result should have size expectedDuplicateFiles.size - result foreach (_.count should be(2)) - } - "Detailed Report from Cassandra using GROUP BY" should "return duplicate files" taggedAs Cassandra in { val report = new Report(session, logger, DUPLICATES, Gemini.tables) From 53b8f731d63af309e1db35d462fe3b21a4a1fdef Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 16:29:59 +0100 Subject: [PATCH 3/7] duplicates in json output should be array Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 6a5a543b..223df070 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -127,7 +127,7 @@ object ReportApp extends App { mapper.registerModule(DefaultScalaModule) val str = mapper.writeValueAsString(Map( - "duplicates" -> duplicates, + "duplicates" -> duplicates.asInstanceOf[ReportExpandedGroup].v, "similarities" -> similarities )) println(str) From c2ba2b18dcad46473b06866ca066ee3a01a90388 Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 16:31:20 +0100 Subject: [PATCH 4/7] rename output flag to output-format Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 223df070..37b30cdd 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -48,7 +48,7 @@ object ReportApp extends App { opt[Unit]('v', "verbose") .action((_, c) => c.copy(verbose = 
true)) .text("producing more verbose debug output") - opt[String]("output") + opt[String]("output-format") .valueName(outputs.mkString(" | ")) .validate(x => if (outputs contains x) { From 279c56f92ed3209e5e258a8663ba974079e11bfd Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 16:35:45 +0100 Subject: [PATCH 5/7] remove unused class ReportByLine Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/Report.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/Report.scala b/src/main/scala/tech/sourced/gemini/Report.scala index cb4293c0..cc5813a0 100644 --- a/src/main/scala/tech/sourced/gemini/Report.scala +++ b/src/main/scala/tech/sourced/gemini/Report.scala @@ -29,8 +29,6 @@ sealed abstract class ReportDuplicates(v: Iterable[Any]) { def size(): Int = v.size } -case class ReportByLine(v: Iterable[RepoFile]) extends ReportDuplicates(v) - case class ReportExpandedGroup(v: Iterable[Iterable[RepoFile]]) extends ReportDuplicates(v) class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) { From 6ca10dcadaf0784dc417ae09ac51eab16377fbf7 Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 16:47:59 +0100 Subject: [PATCH 6/7] simplify report code a little Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/Gemini.scala | 4 ++-- src/main/scala/tech/sourced/gemini/Report.scala | 4 +--- .../tech/sourced/gemini/cmd/ReportApp.scala | 16 ++++++++-------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala index 588a77a6..d6cf3059 100644 --- a/src/main/scala/tech/sourced/gemini/Gemini.scala +++ b/src/main/scala/tech/sourced/gemini/Gemini.scala @@ -117,8 +117,8 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini. 
log.info(s"Report duplicate items from DB $keyspace") val duplicates = advancedCql match { - case false => ReportExpandedGroup(report.findAllDuplicateItems()) - case true => ReportExpandedGroup(report.reportCassandraGroupBy()) + case false => ReportDuplicates(report.findAllDuplicateItems()) + case true => ReportDuplicates(report.reportCassandraGroupBy()) } log.info(s"${duplicates.size} duplicate SHA1s") diff --git a/src/main/scala/tech/sourced/gemini/Report.scala b/src/main/scala/tech/sourced/gemini/Report.scala index cc5813a0..0260848e 100644 --- a/src/main/scala/tech/sourced/gemini/Report.scala +++ b/src/main/scala/tech/sourced/gemini/Report.scala @@ -21,7 +21,7 @@ import scala.sys.process._ */ case class ReportResult(duplicates: ReportDuplicates, similar: Iterable[Iterable[SimilarItem]]) -sealed abstract class ReportDuplicates(v: Iterable[Any]) { +case class ReportDuplicates(v: Iterable[Iterable[RepoFile]]) { def empty(): Boolean = { v.isEmpty } @@ -29,8 +29,6 @@ sealed abstract class ReportDuplicates(v: Iterable[Any]) { def size(): Int = v.size } -case class ReportExpandedGroup(v: Iterable[Iterable[RepoFile]]) extends ReportDuplicates(v) - class Report(conn: Session, log: Slf4jLogger, keyspace: String, tables: Tables) { /** diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 37b30cdd..2d266dd4 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -97,13 +97,13 @@ object ReportApp extends App { def printAsText(result: ReportResult): Unit = { val ReportResult(duplicates, similarities) = result - duplicates match { - case e if e.empty() => println(s"No duplicated files found.") - case ReportExpandedGroup(v) => - v.foreach { item => - val count = item.size - println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n") - } + if (duplicates.empty()) { + println(s"No duplicated files found.") + } else { + 
duplicates.v.foreach { item => + val count = item.size + println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n") + } } if (similarities.isEmpty) { @@ -127,7 +127,7 @@ object ReportApp extends App { mapper.registerModule(DefaultScalaModule) val str = mapper.writeValueAsString(Map( - "duplicates" -> duplicates.asInstanceOf[ReportExpandedGroup].v, + "duplicates" -> duplicates.v, "similarities" -> similarities )) println(str) From 701202e6a4dab9697c0924be16c38774c7350c4e Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Tue, 22 Jan 2019 18:24:02 +0100 Subject: [PATCH 7/7] add new cli arguments for report to README Signed-off-by: Maxim Sukharev --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index d4345180..6ae1263d 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,11 @@ Hash command specific arguments: * `-l/--limit` - limit the number of repositories to be processed. All repositories will be processed by default * `-f/--format` - format of the stored repositories. Supported input data formats that repositories could be stored in are `siva`, `bare` or `standard`, default `siva` +Report specific arguments: + + * `--output-format` - format of the report output, either `text` or `json`, default `text` + * `--cassandra` - enable advanced CQL queries, supported only by Apache Cassandra, default `false` + ## Development ### Compile & Run