From 20bc70d55ea421466d7dfe968e5ad4a53bbdf77c Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Thu, 24 Jan 2019 12:41:36 +0100 Subject: [PATCH] add support for Google Cloud Storage - Allows using a GCS path for files as: gcs://bucket/path - Adds new flag --gcs-keyfile to hash command for authorization I added gcs-connector to the deps jar instead of sending it separately to spark due to conflicts in guava between spark 2.2 and gcs-connector. I also updated scalapb to avoid multiple shading of guava for gcs-connector and grpc separately. Now both use compatible versions. Signed-off-by: Maxim Sukharev --- README.md | 1 + build.sbt | 1 + project/Dependencies.scala | 5 +++-- .../scala/tech/sourced/gemini/cmd/HashSparkApp.scala | 10 +++++++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d4345180..4bda9dcc 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,7 @@ Hash command specific arguments: * `-l/--limit` - limit the number of repositories to be processed. All repositories will be processed by default * `-f/--format` - format of the stored repositories. 
Supported input data formats that repositories could be stored in are `siva`, `bare` or `standard`, default `siva` + * `--gcs-keyfile` - path to [JSON keyfile](https://cloud.google.com/storage/docs/authentication) for authentication in Google Cloud Storage ## Development diff --git a/build.sbt b/build.sbt index 1659fa7c..d9d39cb9 100644 --- a/build.sbt +++ b/build.sbt @@ -22,6 +22,7 @@ libraryDependencies ++= Seq( scalapbGrpc % Compile, engine % Compile, jgit % Compile, + gcs % Compile, fixNetty, cassandraDriverMetrics % Provided, //needed for using Driver \wo Spark from SparkConnector cassandraSparkConnector % Compile, diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 65f3d13a..c4321bf4 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -24,10 +24,11 @@ object Dependencies { lazy val hadoopCommon = ("org.apache.hadoop" % "hadoop-common" % "2.6.5") .exclude("com.sun.jersey", "jersey-server") .exclude("commons-beanutils", "commons-beanutils-core") - lazy val scalapb = "com.thesamet.scalapb" %% "scalapb-runtime" % "0.7.1" - lazy val scalapbGrpc = "com.thesamet.scalapb" %% "scalapb-runtime-grpc" % "0.7.1" + lazy val scalapb = "com.thesamet.scalapb" %% "scalapb-runtime" % "0.8.4" + lazy val scalapbGrpc = "com.thesamet.scalapb" %% "scalapb-runtime-grpc" % "0.8.4" lazy val ioGrpc = "io.grpc" % "grpc-netty" % "1.10.0" lazy val commonsMath = "org.apache.commons" % "commons-math3" % "3.6.1" lazy val bblfshClient = "org.bblfsh" % "bblfsh-client" % "1.8.2" lazy val scalaJsonParser = "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.6.7" + lazy val gcs = "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop2-1.9.11" } diff --git a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala index bc8c5bdf..180ad45f 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala @@ -24,7 
+24,8 @@ case class HashAppConfig( sparkParallelism: Int = 8, docFreqFile: String = "", verbose: Boolean = false, - mode: String = Gemini.fileSimilarityMode + mode: String = Gemini.fileSimilarityMode, + gcsKeyFile: String = "" ) /** @@ -94,6 +95,9 @@ object HashSparkApp extends App with Logging { opt[String]("doc-freq-file") .action((x, c) => c.copy(docFreqFile = x)) .text("path to file with feature frequencies") + opt[String]("gcs-keyfile") + .action((x, c) => c.copy(gcsKeyFile = x)) + .text("path to JSON keyfile for authentication in Google Cloud Storage") arg[String]("") .required() .action((x, c) => c.copy(reposPath = x)) @@ -120,6 +124,10 @@ object HashSparkApp extends App with Logging { .config("spark.tech.sourced.featurext.grpc.port", config.fePort) .getOrCreate() + if (config.gcsKeyFile.nonEmpty) { + spark.sparkContext.hadoopConfiguration.set("google.cloud.auth.service.account.json.keyfile", config.gcsKeyFile) + } + val reposPath = config.reposPath val repos = listRepositories(reposPath, config.format, spark, config.limit) printRepositories(reposPath, repos)