From 830d1285fc92943f244f1144fdded9441d5cd973 Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Fri, 25 Jan 2019 13:28:24 +0100 Subject: [PATCH 1/4] require empty db keyspace to run hash Hashing can't be executed incrementally due to calculation of document frequencies which require full input. this commit checks if hashtables and docfreq tables are empty and gemini exits with error if they are not. it also introduces new flag --replace which would clean up db for current hashing mode. Signed-off-by: Maxim Sukharev --- .../scala/tech/sourced/gemini/Gemini.scala | 19 +++++++++++++++++++ .../sourced/gemini/cmd/HashSparkApp.scala | 16 +++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala index d6cf3059..7de7d45c 100644 --- a/src/main/scala/tech/sourced/gemini/Gemini.scala +++ b/src/main/scala/tech/sourced/gemini/Gemini.scala @@ -129,6 +129,25 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini. ReportResult(duplicates, similarities) } + def isDBEmpty(session: Session, mode: String): Boolean = { + var row = session.execute(s"select count(*) from $keyspace.${tables.docFreq} where id='$mode'").one() + if (row.getLong(0) > 0) { + return false + } + + row = session.execute(s"select count(*) from $keyspace.${tables.hashtables}_$mode").one() + if (row.getLong(0) > 0) { + return false + } + + true + } + + def cleanDB(session: Session, mode: String): Unit = { + session.execute(s"delete from $keyspace.${tables.docFreq} where id='$mode'") + session.execute(s"truncate table $keyspace.${tables.hashtables}_$mode") + } + def applySchema(session: Session): Unit = { log.debug("CQL: creating schema") Source diff --git a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala index 180ad45f..f6326bb0 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala @@ -25,7 +25,8 @@ case class HashAppConfig( docFreqFile: String = "", verbose: Boolean = false, mode: String = Gemini.fileSimilarityMode, - gcsKeyFile: String = "" + gcsKeyFile: String = "", + replace: Boolean = false ) /** @@ -98,6 +99,9 @@ object HashSparkApp extends App with Logging { opt[String]("gcs-keyfile") .action((x, c) => c.copy(gcsKeyFile = x)) .text("path to JSON keyfile for authentication in Google Cloud Storage") + opt[Unit]("replace") + .action((x, c) => c.copy(replace = true)) + .text("replace results of previous hashing") arg[String]("") .required() .action((x, c) => c.copy(reposPath = x)) @@ -136,6 +140,16 @@ object HashSparkApp extends App with Logging { log.info("Checking DB schema") CassandraConnector(spark.sparkContext).withSessionDo { cassandra => gemini.applySchema(cassandra) + + if (config.replace) { + gemini.cleanDB(cassandra, config.mode) + } + + if (!gemini.isDBEmpty(cassandra, config.mode)) { + println("Database keyspace is not empty! Hashing may produce wrong results. " + + "Please choose another keyspace or pass the --replace option") + System.exit(2) + } } gemini.hash(reposPath, config.limit, config.format, config.mode, config.docFreqFile) From 268ad64296684a879528b0419fb85bbbd3e4fd9f Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Fri, 25 Jan 2019 15:02:38 +0100 Subject: [PATCH 2/4] change type of cassandra flag to unit It allows to pass just `--cassandra` instead of `--cassandra=true` Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala index 2d266dd4..0690c9e8 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala @@ -58,8 +58,8 @@ object ReportApp extends App { }) .action((x, c) => c.copy(output = x)) .text("output format") - opt[Boolean]("cassandra") - .action((x, c) => c.copy(cassandra = x)) + opt[Unit]("cassandra") + .action((x, c) => c.copy(cassandra = true)) .text("Enable advanced cql queries for Apache Cassandra database") } From 5743f2d68d267dfb451e688517baa30d063b9a5a Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Mon, 28 Jan 2019 14:54:36 +0100 Subject: [PATCH 3/4] use limit in the check for empty database Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/Gemini.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala index 7de7d45c..a04d520a 100644 --- a/src/main/scala/tech/sourced/gemini/Gemini.scala +++ b/src/main/scala/tech/sourced/gemini/Gemini.scala @@ -130,7 +130,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini. } def isDBEmpty(session: Session, mode: String): Boolean = { - var row = session.execute(s"select count(*) from $keyspace.${tables.docFreq} where id='$mode'").one() + var row = session.execute(s"select count(*) from $keyspace.${tables.docFreq} where id='$mode' limit 1").one() if (row.getLong(0) > 0) { return false } From 16379ef3bd89c0333f663822adf3728534b3907e Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Mon, 28 Jan 2019 17:30:49 +0100 Subject: [PATCH 4/4] check db emptiness only without replace flag Signed-off-by: Maxim Sukharev --- src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala index f6326bb0..b4a87d4b 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala @@ -143,9 +143,7 @@ object HashSparkApp extends App with Logging { if (config.replace) { gemini.cleanDB(cassandra, config.mode) - } - - if (!gemini.isDBEmpty(cassandra, config.mode)) { + } else if (!gemini.isDBEmpty(cassandra, config.mode)) { println("Database keyspace is not empty! Hashing may produce wrong results. " + "Please choose another keyspace or pass the --replace option") System.exit(2)