Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 109 additions & 39 deletions airflow/providers/databricks/hooks/databricks_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
TOKEN_REFRESH_LEAD_TIME = 120
AZURE_MANAGEMENT_ENDPOINT = "https://management.core.windows.net/"
DEFAULT_DATABRICKS_SCOPE = "2ff814a6-3304-4ab8-85cb-cd0e6f879c1d"
OIDC_TOKEN_SERVICE_URL = "{}/oidc/v1/token"


class BaseDatabricksHook(BaseHook):
Expand Down Expand Up @@ -89,6 +90,7 @@ class BaseDatabricksHook(BaseHook):
"azure_ad_endpoint",
"azure_resource_id",
"azure_tenant_id",
"service_principal_oauth",
]

def __init__(
Expand All @@ -107,8 +109,8 @@ def __init__(
raise ValueError("Retry limit must be greater than or equal to 1")
self.retry_limit = retry_limit
self.retry_delay = retry_delay
self.aad_tokens: dict[str, dict] = {}
self.aad_timeout_seconds = 10
self.oauth_tokens: dict[str, dict] = {}
self.token_timeout_seconds = 10
self.caller = caller

def my_after_func(retry_state):
Expand Down Expand Up @@ -210,6 +212,75 @@ def _a_get_retry_object(self) -> AsyncRetrying:
"""
return AsyncRetrying(**self.retry_args)

def _get_sp_token(self, resource: str) -> str:
"""Function to get Service Principal token."""
sp_token = self.oauth_tokens.get(resource)
if sp_token and self._is_oauth_token_valid(sp_token):
return sp_token["access_token"]

self.log.info("Existing Service Principal token is expired, or going to expire soon. Refreshing...")
try:
for attempt in self._get_retry_object():
with attempt:
resp = requests.post(
resource,
auth=HTTPBasicAuth(self.databricks_conn.login, self.databricks_conn.password),
data="grant_type=client_credentials&scope=all-apis",
headers={
**self.user_agent_header,
"Content-Type": "application/x-www-form-urlencoded",
},
timeout=self.token_timeout_seconds,
)

resp.raise_for_status()
jsn = resp.json()
jsn["expires_on"] = int(time.time() + jsn["expires_in"])

self._is_oauth_token_valid(jsn)
self.oauth_tokens[resource] = jsn
break
except RetryError:
raise AirflowException(f"API requests to Databricks failed {self.retry_limit} times. Giving up.")
except requests_exceptions.HTTPError as e:
raise AirflowException(f"Response: {e.response.content}, Status Code: {e.response.status_code}")

return jsn["access_token"]

async def _a_get_sp_token(self, resource: str) -> str:
"""Async version of `_get_sp_token()`."""
sp_token = self.oauth_tokens.get(resource)
if sp_token and self._is_oauth_token_valid(sp_token):
return sp_token["access_token"]

self.log.info("Existing Service Principal token is expired, or going to expire soon. Refreshing...")
try:
async for attempt in self._a_get_retry_object():
with attempt:
async with self._session.post(
resource,
auth=HTTPBasicAuth(self.databricks_conn.login, self.databricks_conn.password),
data="grant_type=client_credentials&scope=all-apis",
headers={
**self.user_agent_header,
"Content-Type": "application/x-www-form-urlencoded",
},
timeout=self.token_timeout_seconds,
) as resp:
resp.raise_for_status()
jsn = await resp.json()
jsn["expires_on"] = int(time.time() + jsn["expires_in"])

self._is_oauth_token_valid(jsn)
self.oauth_tokens[resource] = jsn
break
except RetryError:
raise AirflowException(f"API requests to Databricks failed {self.retry_limit} times. Giving up.")
except requests_exceptions.HTTPError as e:
raise AirflowException(f"Response: {e.response.content}, Status Code: {e.response.status_code}")

return jsn["access_token"]

def _get_aad_token(self, resource: str) -> str:
"""
Function to get AAD token for given resource.
Expand All @@ -218,9 +289,9 @@ def _get_aad_token(self, resource: str) -> str:
:param resource: resource to issue token to
:return: AAD token, or raise an exception
"""
aad_token = self.aad_tokens.get(resource)
if aad_token and self._is_aad_token_valid(aad_token):
return aad_token["token"]
aad_token = self.oauth_tokens.get(resource)
if aad_token and self._is_oauth_token_valid(aad_token):
return aad_token["access_token"]

self.log.info("Existing AAD token is expired, or going to expire soon. Refreshing...")
try:
Expand All @@ -235,7 +306,7 @@ def _get_aad_token(self, resource: str) -> str:
AZURE_METADATA_SERVICE_TOKEN_URL,
params=params,
headers={**self.user_agent_header, "Metadata": "true"},
timeout=self.aad_timeout_seconds,
timeout=self.token_timeout_seconds,
)
else:
tenant_id = self.databricks_conn.extra_dejson["azure_tenant_id"]
Expand All @@ -255,27 +326,21 @@ def _get_aad_token(self, resource: str) -> str:
**self.user_agent_header,
"Content-Type": "application/x-www-form-urlencoded",
},
timeout=self.aad_timeout_seconds,
timeout=self.token_timeout_seconds,
)

resp.raise_for_status()
jsn = resp.json()
if (
"access_token" not in jsn
or jsn.get("token_type") != "Bearer"
or "expires_on" not in jsn
):
raise AirflowException(f"Can't get necessary data from AAD token: {jsn}")

token = jsn["access_token"]
self.aad_tokens[resource] = {"token": token, "expires_on": int(jsn["expires_on"])}

self._is_oauth_token_valid(jsn)
self.oauth_tokens[resource] = jsn
break
except RetryError:
raise AirflowException(f"API requests to Azure failed {self.retry_limit} times. Giving up.")
except requests_exceptions.HTTPError as e:
raise AirflowException(f"Response: {e.response.content}, Status Code: {e.response.status_code}")

return token
return jsn["access_token"]

async def _a_get_aad_token(self, resource: str) -> str:
"""
Expand All @@ -284,9 +349,9 @@ async def _a_get_aad_token(self, resource: str) -> str:
:param resource: resource to issue token to
:return: AAD token, or raise an exception
"""
aad_token = self.aad_tokens.get(resource)
if aad_token and self._is_aad_token_valid(aad_token):
return aad_token["token"]
aad_token = self.oauth_tokens.get(resource)
if aad_token and self._is_oauth_token_valid(aad_token):
return aad_token["access_token"]

self.log.info("Existing AAD token is expired, or going to expire soon. Refreshing...")
try:
Expand All @@ -301,7 +366,7 @@ async def _a_get_aad_token(self, resource: str) -> str:
url=AZURE_METADATA_SERVICE_TOKEN_URL,
params=params,
headers={**self.user_agent_header, "Metadata": "true"},
timeout=self.aad_timeout_seconds,
timeout=self.token_timeout_seconds,
) as resp:
resp.raise_for_status()
jsn = await resp.json()
Expand All @@ -323,26 +388,20 @@ async def _a_get_aad_token(self, resource: str) -> str:
**self.user_agent_header,
"Content-Type": "application/x-www-form-urlencoded",
},
timeout=self.aad_timeout_seconds,
timeout=self.token_timeout_seconds,
) as resp:
resp.raise_for_status()
jsn = await resp.json()
if (
"access_token" not in jsn
or jsn.get("token_type") != "Bearer"
or "expires_on" not in jsn
):
raise AirflowException(f"Can't get necessary data from AAD token: {jsn}")

token = jsn["access_token"]
self.aad_tokens[resource] = {"token": token, "expires_on": int(jsn["expires_on"])}

self._is_oauth_token_valid(jsn)
self.oauth_tokens[resource] = jsn
break
except RetryError:
raise AirflowException(f"API requests to Azure failed {self.retry_limit} times. Giving up.")
except aiohttp.ClientResponseError as err:
raise AirflowException(f"Response: {err.message}, Status Code: {err.status}")

return token
return jsn["access_token"]

def _get_aad_headers(self) -> dict:
"""
Expand Down Expand Up @@ -375,17 +434,18 @@ async def _a_get_aad_headers(self) -> dict:
return headers

@staticmethod
def _is_aad_token_valid(aad_token: dict) -> bool:
def _is_oauth_token_valid(token: dict, time_key="expires_on") -> bool:
"""
Utility function to check AAD token hasn't expired yet.
Utility function to check if an OAuth token is valid and hasn't expired yet.

:param aad_token: dict with properties of AAD token
:param token: dict with properties of OAuth token
:param time_key: name of the key that holds the time of expiration
:return: true if token is valid, false otherwise
"""
now = int(time.time())
if aad_token["expires_on"] > (now + TOKEN_REFRESH_LEAD_TIME):
return True
return False
if "access_token" not in token or token.get("token_type", "") != "Bearer" or time_key not in token:
raise AirflowException(f"Can't get necessary data from OAuth token: {token}")

return int(token[time_key]) > (int(time.time()) + TOKEN_REFRESH_LEAD_TIME)

@staticmethod
def _check_azure_metadata_service() -> None:
Expand Down Expand Up @@ -443,6 +503,11 @@ def _get_token(self, raise_error: bool = False) -> str | None:
self.log.info("Using AAD Token for managed identity.")
self._check_azure_metadata_service()
return self._get_aad_token(DEFAULT_DATABRICKS_SCOPE)
elif self.databricks_conn.extra_dejson.get("service_principal_oauth", False):
if self.databricks_conn.login == "" or self.databricks_conn.password == "":
raise AirflowException("Service Principal credentials aren't provided")
self.log.info("Using Service Principal Token.")
return self._get_sp_token(OIDC_TOKEN_SERVICE_URL.format(self.databricks_conn.host))
elif raise_error:
raise AirflowException("Token authentication isn't configured")

Expand All @@ -466,6 +531,11 @@ async def _a_get_token(self, raise_error: bool = False) -> str | None:
self.log.info("Using AAD Token for managed identity.")
await self._a_check_azure_metadata_service()
return await self._a_get_aad_token(DEFAULT_DATABRICKS_SCOPE)
elif self.databricks_conn.extra_dejson.get("service_principal_oauth", False):
if self.databricks_conn.login == "" or self.databricks_conn.password == "":
raise AirflowException("Service Principal credentials aren't provided")
self.log.info("Using Service Principal Token.")
return await self._a_get_sp_token(OIDC_TOKEN_SERVICE_URL.format(self.databricks_conn.host))
elif raise_error:
raise AirflowException("Token authentication isn't configured")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,15 @@ Host (required)

Login (optional)
* If authentication with *Databricks login credentials* is used then specify the ``username`` used to login to Databricks.
* If *authentication with Azure Service Principal* is used then specify the ID of the Azure Service Principal
* If authentication with *Azure Service Principal* is used then specify the ID of the Azure Service Principal
* If authentication with *PAT* is used then either leave this field empty or use 'token' as login (both work, the only difference is that if login is empty then token will be sent in request header as Bearer token, if login is 'token' then it will be sent using Basic Auth which is allowed by Databricks API, this may be useful if you plan to reuse this connection with e.g. SimpleHttpOperator)
* If authentication with *Databricks Service Principal OAuth* is used then specify the ID of the Service Principal (Databricks on AWS)

Password (optional)
* If authentication with *Databricks login credentials* is used then specify the ``password`` used to login to Databricks.
* If authentication with *Databricks login credentials* is used then specify the ``password`` used to login to Databricks.
* If authentication with *Azure Service Principal* is used then specify the secret of the Azure Service Principal
* If authentication with *PAT* is used, then specify PAT (recommended)
* If authentication with *Databricks Service Principal OAuth* is used then specify the secret of the Service Principal (Databricks on AWS)

Extra (optional)
Specify the extra parameter (as json dictionary) that can be used in the Databricks connection.
Expand All @@ -70,6 +72,10 @@ Extra (optional)

* ``token``: Specify the PAT to use. Consider specifying the PAT in the Password field instead, as it's more secure.

The following parameters are necessary if using authentication with an OAuth token for a Databricks Service Principal on AWS:

* ``service_principal_oauth``: required boolean flag. If specified as ``true``, use the Client ID and Client Secret as the Username and Password. See `Authentication using OAuth for service principals <https://docs.databricks.com/en/dev-tools/authentication-oauth.html>`_.

The following parameters are necessary if using authentication with an AAD token:

* ``azure_tenant_id``: ID of the Azure Active Directory tenant
Expand Down
Loading