From 20bc70d55ea421466d7dfe968e5ad4a53bbdf77c Mon Sep 17 00:00:00 2001 From: Maxim Sukharev Date: Thu, 24 Jan 2019 12:41:36 +0100 Subject: [PATCH] add support for Google Cloud Storage - Allows using a GCS path for files as: gcs://bucket/path - Adds new flag --gcs-keyfile to hash command for authorization I added gcs-connector to the deps jar instead of sending it separately to spark due to conflicts in guava between spark 2.2 and gcs-connector. I also updated scalapb to avoid multiple shading of guava for gcs-connector and grpc separately. Now both use compatible versions. Signed-off-by: Maxim Sukharev --- README.md | 1 + build.sbt | 1 + project/Dependencies.scala | 5 +++-- .../scala/tech/sourced/gemini/cmd/HashSparkApp.scala | 10 +++++++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d4345180..4bda9dcc 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,7 @@ Hash command specific arguments: * `-l/--limit` - limit the number of repositories to be processed. All repositories will be processed by default * `-f/--format` - format of the stored repositories. 
Supported input data formats that repositories could be stored in are `siva`, `bare` or `standard`, default `siva` + * `--gcs-keyfile` - path to [JSON keyfile](https://cloud.google.com/storage/docs/authentication) for authentication in Google Cloud Storage ## Development diff --git a/build.sbt b/build.sbt index 1659fa7c..d9d39cb9 100644 --- a/build.sbt +++ b/build.sbt @@ -22,6 +22,7 @@ libraryDependencies ++= Seq( scalapbGrpc % Compile, engine % Compile, jgit % Compile, + gcs % Compile, fixNetty, cassandraDriverMetrics % Provided, //needed for using Driver \wo Spark from SparkConnector cassandraSparkConnector % Compile, diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 65f3d13a..c4321bf4 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -24,10 +24,11 @@ object Dependencies { lazy val hadoopCommon = ("org.apache.hadoop" % "hadoop-common" % "2.6.5") .exclude("com.sun.jersey", "jersey-server") .exclude("commons-beanutils", "commons-beanutils-core") - lazy val scalapb = "com.thesamet.scalapb" %% "scalapb-runtime" % "0.7.1" - lazy val scalapbGrpc = "com.thesamet.scalapb" %% "scalapb-runtime-grpc" % "0.7.1" + lazy val scalapb = "com.thesamet.scalapb" %% "scalapb-runtime" % "0.8.4" + lazy val scalapbGrpc = "com.thesamet.scalapb" %% "scalapb-runtime-grpc" % "0.8.4" lazy val ioGrpc = "io.grpc" % "grpc-netty" % "1.10.0" lazy val commonsMath = "org.apache.commons" % "commons-math3" % "3.6.1" lazy val bblfshClient = "org.bblfsh" % "bblfsh-client" % "1.8.2" lazy val scalaJsonParser = "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.6.7" + lazy val gcs = "com.google.cloud.bigdataoss" % "gcs-connector" % "hadoop2-1.9.11" } diff --git a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala index bc8c5bdf..180ad45f 100644 --- a/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala +++ b/src/main/scala/tech/sourced/gemini/cmd/HashSparkApp.scala @@ -24,7 
+24,8 @@ case class HashAppConfig( sparkParallelism: Int = 8, docFreqFile: String = "", verbose: Boolean = false, - mode: String = Gemini.fileSimilarityMode + mode: String = Gemini.fileSimilarityMode, + gcsKeyFile: String = "" ) /** @@ -94,6 +95,9 @@ object HashSparkApp extends App with Logging { opt[String]("doc-freq-file") .action((x, c) => c.copy(docFreqFile = x)) .text("path to file with feature frequencies") + opt[String]("gcs-keyfile") + .action((x, c) => c.copy(gcsKeyFile = x)) + .text("path to JSON keyfile for authentication in Google Cloud Storage") arg[String]("") .required() .action((x, c) => c.copy(reposPath = x)) @@ -120,6 +124,10 @@ object HashSparkApp extends App with Logging { .config("spark.tech.sourced.featurext.grpc.port", config.fePort) .getOrCreate() + if (config.gcsKeyFile.nonEmpty) { + spark.sparkContext.hadoopConfiguration.set("google.cloud.auth.service.account.json.keyfile", config.gcsKeyFile) + } + val reposPath = config.reposPath val repos = listRepositories(reposPath, config.format, spark, config.limit) printRepositories(reposPath, repos)