diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index d6cf3059..a04d520a 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -129,6 +129,25 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
     ReportResult(duplicates, similarities)
   }
 
+  def isDBEmpty(session: Session, mode: String): Boolean = {
+    var row = session.execute(s"select count(*) from $keyspace.${tables.docFreq} where id='$mode' limit 1").one()
+    if (row.getLong(0) > 0) {
+      return false
+    }
+
+    row = session.execute(s"select count(*) from $keyspace.${tables.hashtables}_$mode").one()
+    if (row.getLong(0) > 0) {
+      return false
+    }
+
+    true
+  }
+
+  def cleanDB(session: Session, mode: String): Unit = {
+    session.execute(s"delete from $keyspace.${tables.docFreq} where id='$mode'")
+    session.execute(s"truncate table $keyspace.${tables.hashtables}_$mode")
+  }
+
   def applySchema(session: Session): Unit = {
     log.debug("CQL: creating schema")
     Source
diff --git a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala
index 180ad45f..b4a87d4b 100644
--- a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala
+++ b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala
@@ -25,7 +25,8 @@ case class HashAppConfig(
   docFreqFile: String = "",
   verbose: Boolean = false,
   mode: String = Gemini.fileSimilarityMode,
-  gcsKeyFile: String = ""
+  gcsKeyFile: String = "",
+  replace: Boolean = false
 )
 
 /**
@@ -98,6 +99,9 @@ object HashSparkApp extends App with Logging {
     opt[String]("gcs-keyfile")
       .action((x, c) => c.copy(gcsKeyFile = x))
       .text("path to JSON keyfile for authentication in Google Cloud Storage")
+    opt[Unit]("replace")
+      .action((x, c) => c.copy(replace = true))
+      .text("replace results of previous hashing")
     arg[String]("<path-to-git-repos>")
       .required()
       .action((x, c) => c.copy(reposPath = x))
@@ -136,6 +140,14 @@ object HashSparkApp extends App with Logging {
     log.info("Checking DB schema")
     CassandraConnector(spark.sparkContext).withSessionDo { cassandra =>
       gemini.applySchema(cassandra)
+
+      if (config.replace) {
+        gemini.cleanDB(cassandra, config.mode)
+      } else if (!gemini.isDBEmpty(cassandra, config.mode)) {
+        println("Database keyspace is not empty! Hashing may produce wrong results. " +
+          "Please choose another keyspace or pass the --replace option")
+        System.exit(2)
+      }
     }
 
     gemini.hash(reposPath, config.limit, config.format, config.mode, config.docFreqFile)
diff --git a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala
index 2d266dd4..0690c9e8 100644
--- a/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala
+++ b/src/main/scala/tech/sourced/gemini/cmd/ReportApp.scala
@@ -58,8 +58,8 @@ object ReportApp extends App {
       })
       .action((x, c) => c.copy(output = x))
       .text("output format")
-    opt[Boolean]("cassandra")
-      .action((x, c) => c.copy(cassandra = x))
+    opt[Unit]("cassandra")
+      .action((x, c) => c.copy(cassandra = true))
       .text("Enable advanced cql queries for Apache Cassandra database")
 
   }