[SPARK-26595][core] Allow credential renewal based on kerberos ticket cache. #23525
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits: 7fa03ec, f7e6734, df85d68, ec2d2c1, 331bb6b, 57aed47.
HadoopDelegationTokenManager.scala:

```diff
@@ -38,7 +38,7 @@ import org.apache.spark.util.ThreadUtils
 /**
  * Manager for delegation tokens in a Spark application.
  *
- * When configured with a principal and a keytab, this manager will make sure long-running apps can
+ * When delegation token renewal is enabled, this manager will make sure long-running apps can
  * run without interruption while accessing secured services. It periodically logs in to the KDC
  * with user-provided credentials, and contacts all the configured secure services to obtain
  * delegation tokens to be distributed to the rest of the application.
@@ -47,6 +47,11 @@ import org.apache.spark.util.ThreadUtils
  * elapsed. The new tokens are sent to the Spark driver endpoint. The driver is tasked with
  * distributing the tokens to other processes that might need them.
  *
+ * Renewal can be enabled in two different ways: by providing a principal and keytab to Spark, or by
+ * enabling renewal based on the local credential cache. The latter has the drawback that Spark
+ * can't create new TGTs by itself, so the user has to manually update the Kerberos ticket cache
+ * externally.
+ *
  * This class can also be used just to create delegation tokens, by calling the
  * `obtainDelegationTokens` method. This option does not require calling the `start` method nor
  * providing a driver reference, but leaves it up to the caller to distribute the tokens that were
```
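The rewritten Scaladoc above describes the two renewal modes in prose. As a small illustrative sketch (not part of the diff), cache-based renewal is opted into through the configuration key introduced later in this PR, while keytab mode stays the default:

```scala
import org.apache.spark.SparkConf

// Sketch only: select ticket-cache based renewal. Keytab-based renewal remains
// the default ("keytab") and is driven by the --principal/--keytab arguments
// passed to spark-submit, as before.
val conf = new SparkConf()
  .set("spark.kerberos.renewal.credentials", "ccache")
```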
```diff
@@ -78,7 +83,11 @@ private[spark] class HadoopDelegationTokenManager(
   private var renewalExecutor: ScheduledExecutorService = _

   /** @return Whether delegation token renewal is enabled. */
-  def renewalEnabled: Boolean = principal != null
+  def renewalEnabled: Boolean = sparkConf.get(KERBEROS_RENEWAL_CREDENTIALS) match {
+    case "keytab" => principal != null
+    case "ccache" => UserGroupInformation.getCurrentUser().hasKerberosCredentials()
+    case _ => false
+  }

   /**
    * Start the token renewer. Requires a principal and keytab. Upon start, the renewer will
```
```diff
@@ -118,7 +127,7 @@ private[spark] class HadoopDelegationTokenManager(

   def stop(): Unit = {
     if (renewalExecutor != null) {
-      renewalExecutor.shutdown()
+      renewalExecutor.shutdownNow()
     }
   }
```
```diff
@@ -179,7 +188,7 @@ private[spark] class HadoopDelegationTokenManager(

   private def scheduleRenewal(delay: Long): Unit = {
     val _delay = math.max(0, delay)
-    logInfo(s"Scheduling login from keytab in ${UIUtils.formatDuration(delay)}.")
+    logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(delay)}.")

     val renewalTask = new Runnable() {
       override def run(): Unit = {
```
```diff
@@ -203,6 +212,9 @@ private[spark] class HadoopDelegationTokenManager(
         schedulerRef.send(UpdateDelegationTokens(tokens))
         tokens
       } catch {
+        case _: InterruptedException =>
+          // Ignore, may happen if shutting down.
+          null
         case e: Exception =>
           val delay = TimeUnit.SECONDS.toMillis(sparkConf.get(CREDENTIALS_RENEWAL_RETRY_WAIT))
           logWarning(s"Failed to update tokens, will try again in ${UIUtils.formatDuration(delay)}!" +
```
```diff
@@ -236,11 +248,19 @@ private[spark] class HadoopDelegationTokenManager(
   }

   private def doLogin(): UserGroupInformation = {
-    logInfo(s"Attempting to login to KDC using principal: $principal")
-    require(new File(keytab).isFile(), s"Cannot find keytab at $keytab.")
-    val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab)
-    logInfo("Successfully logged into KDC.")
-    ugi
+    if (principal != null) {
+      logInfo(s"Attempting to login to KDC using principal: $principal")
+      require(new File(keytab).isFile(), s"Cannot find keytab at $keytab.")
+      val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab)
+      logInfo("Successfully logged into KDC.")
+      ugi
+    } else {
+      logInfo(s"Attempting to load user's ticket cache.")
+      val ccache = sparkConf.getenv("KRB5CCNAME")
+      val user = Option(sparkConf.getenv("KRB5PRINCIPAL")).getOrElse(
+        UserGroupInformation.getCurrentUser().getUserName())
+      UserGroupInformation.getUGIFromTicketCache(ccache, user)
+    }
   }

   private def loadProviders(): Map[String, HadoopDelegationTokenProvider] = {
```

Review thread on `val ccache = sparkConf.getenv("KRB5CCNAME")`:

Contributor: I was wondering if adding an additional optional configuration parameter with the path of the KRB5CC file could also be useful? Possibly more useful when using this in cluster mode?

Author: I'm not sure how you'd use this in cluster mode; but the biggest issue is that the Hadoop libraries only use the env variable (which is also recognized by all Kerberos tools). So we can't really add a Spark-specific option.

Review thread on `val user = Option(sparkConf.getenv("KRB5PRINCIPAL")).getOrElse(`:

Contributor: Would it make sense to also check/use the value of `spark.yarn.principal` (or an ad-hoc config parameter if "reusing" this one is not OK) if provided by the user?
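For readers unfamiliar with the Hadoop API used in the new `doLogin()` branch, here is a self-contained sketch of the ccache path. The environment variable names and the `getUGIFromTicketCache` call come from the diff above; everything else is illustrative:

```scala
import org.apache.hadoop.security.UserGroupInformation

// Resolve the ticket cache the same way the new branch does: KRB5CCNAME names
// the cache (the same variable honored by the Kerberos CLI tools and the Hadoop
// libraries), and KRB5PRINCIPAL optionally overrides the principal name.
val ccache = System.getenv("KRB5CCNAME")  // may be null
val user = Option(System.getenv("KRB5PRINCIPAL"))
  .getOrElse(UserGroupInformation.getCurrentUser().getUserName())
val ugi = UserGroupInformation.getUGIFromTicketCache(ccache, user)
println(s"Logged in from ticket cache as ${ugi.getUserName()}")
```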
Core configuration (`package object config`):

```diff
@@ -333,6 +333,16 @@ package object config
     .timeConf(TimeUnit.SECONDS)
     .createWithDefaultString("1m")

+  private[spark] val KERBEROS_RENEWAL_CREDENTIALS =
+    ConfigBuilder("spark.kerberos.renewal.credentials")
+      .doc(
+        "Which credentials to use when renewing delegation tokens for executors. Can be either " +
+        "'keytab', the default, which requires a keytab to be provided, or 'ccache', which uses " +
+        "the local credentials cache.")
+      .stringConf
+      .checkValues(Set("keytab", "ccache"))
+      .createWithDefault("keytab")
+
   private[spark] val EXECUTOR_INSTANCES = ConfigBuilder("spark.executor.instances")
     .intConf
     .createOptional
```

Review thread on the new `spark.kerberos.renewal.credentials` entry:

Contributor: Wondering why an additional config is needed rather than just falling back to the cache when no keytab is provided and the cluster is secure.

Author: That would be a change in behavior; e.g. in YARN it would increase the number of delegation tokens an app creates when it's run without a keytab. Second, if your TGT is not renewable (…). Also, a config opens the path for other ways of providing tokens (e.g. renewing based on a delegation token cache, which some k8s guys were interested in).

Contributor: Now I see, it's fine. A little bit off-topic here, but I've considered applying this to the Kafka area: if no global JVM security config is provided, then use this config, or something like that.
Security documentation:

```diff
@@ -776,16 +776,32 @@ The following options provides finer-grained control for this feature:
 Long-running applications may run into issues if their run time exceeds the maximum delegation
 token lifetime configured in services it needs to access.

-Spark supports automatically creating new tokens for these applications when running in YARN mode.
-Kerberos credentials need to be provided to the Spark application via the `spark-submit` command,
-using the `--principal` and `--keytab` parameters.
+This feature is not available everywhere. In particular, it's only implemented
+on YARN and Kubernetes (both client and cluster modes), and on Mesos when using client mode.

-The provided keytab will be copied over to the machine running the Application Master via the Hadoop
-Distributed Cache. For this reason, it's strongly recommended that both YARN and HDFS be secured
-with encryption, at least.
+Spark supports automatically creating new tokens for these applications. There are two ways to
+enable this functionality.

-The Kerberos login will be periodically renewed using the provided credentials, and new delegation
-tokens for supported will be created.
+### Using a Keytab
+
+By providing Spark with a principal and keytab (e.g. using `spark-submit` with `--principal`
+and `--keytab` parameters), the application will maintain a valid Kerberos login that can be
+used to retrieve delegation tokens indefinitely.
+
+Note that when using a keytab in cluster mode, it will be copied over to the machine running the
+Spark driver. In the case of YARN, this means using HDFS as a staging area for the keytab, so it's
+strongly recommended that both YARN and HDFS be secured with encryption, at least.
+
+### Using a ticket cache
+
+By setting `spark.kerberos.renewal.credentials` to `ccache` in Spark's configuration, the local
+Kerberos ticket cache will be used for authentication. Spark will keep the ticket renewed during its
+renewable life, but after it expires a new ticket needs to be acquired (e.g. by running `kinit`).
+
+It's up to the user to maintain an updated ticket cache that Spark can use.
+
+The location of the ticket cache can be customized by setting the `KRB5CCNAME` environment
+variable.

 ## Secure Interaction with Kubernetes
```

Review thread on the "Using a ticket cache" section:

Contributor: Very nice improvement in this PR. I guess it is worth documenting it also in docs/running-on-yarn.md.

Author: This document is linked from the YARN doc. No need to duplicate documentation.
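As a hedged companion to the documentation above: before relying on `ccache` mode it can be worth verifying that the submitting process actually sees valid Kerberos credentials, which is the same `hasKerberosCredentials()` check the token manager performs when deciding whether renewal is enabled. A minimal sketch:

```scala
import org.apache.hadoop.security.UserGroupInformation

// Quick sanity check (sketch): after running `kinit`, the current user should
// report Kerberos credentials; otherwise ccache-based renewal will not engage.
val ugi = UserGroupInformation.getCurrentUser()
println(s"user=${ugi.getUserName()} kerberosCredentials=${ugi.hasKerberosCredentials()}")
```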
KubernetesClusterSchedulerBackend.scala:

```diff
@@ -144,17 +144,15 @@ private[spark] class KubernetesClusterSchedulerBackend(
     // Don't do anything else - let event handling from the Kubernetes API do the Spark changes
   }

-  override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
-    new KubernetesDriverEndpoint(sc.env.rpcEnv, properties)
+  override def createDriverEndpoint(): DriverEndpoint = {
+    new KubernetesDriverEndpoint()
   }

-  override protected def createTokenManager(
-      schedulerRef: RpcEndpointRef): Option[HadoopDelegationTokenManager] = {
-    Some(new HadoopDelegationTokenManager(conf, sc.hadoopConfiguration, schedulerRef))
+  override protected def createTokenManager(): Option[HadoopDelegationTokenManager] = {
+    Some(new HadoopDelegationTokenManager(conf, sc.hadoopConfiguration, driverEndpoint))
   }

-  private class KubernetesDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
-    extends DriverEndpoint(rpcEnv, sparkProperties) {
+  private class KubernetesDriverEndpoint extends DriverEndpoint {

     override def onDisconnected(rpcAddress: RpcAddress): Unit = {
       // Don't do anything besides disabling the executor - allow the Kubernetes API events to
```

Review thread on `override def createDriverEndpoint(): DriverEndpoint = {`:

Contributor: Nit: curly braces can be dropped.

Author: Doesn't fit on the same line, so the braces remain.
Review thread on the change from `shutdown()` to `shutdownNow()` in `stop()`:

Comment: Why is force needed?

Reply: `shutdown()` doesn't stop the executor. Any scheduled tasks will still be run, so the executor might stay up for about a day if there's a renewal task scheduled.

Comment: OK, thanks for the info.
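To make the point above concrete, here is a standalone sketch (plain JDK, not Spark code) of why `shutdown()` alone is not enough: with the default `ScheduledThreadPoolExecutor` policy, an already-queued delayed task still runs after `shutdown()`, so the pool lingers until the task fires, whereas `shutdownNow()` cancels it.

```scala
import java.util.concurrent.{Executors, TimeUnit}

// A renewal-like task scheduled far in the future.
val pool = Executors.newSingleThreadScheduledExecutor()
pool.schedule(new Runnable {
  override def run(): Unit = println("renewing tokens")
}, 1, TimeUnit.HOURS)

pool.shutdown()             // the delayed task is kept; the pool won't terminate for ~1 hour
// pool.shutdownNow()       // would cancel the pending task so the pool terminates promptly
println(pool.isTerminated)  // false: still waiting on the scheduled task
```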