From 027e1d1e114ab92410bb3e1a8c2edfa9a9b5134b Mon Sep 17 00:00:00 2001 From: Jed Cunningham Date: Thu, 20 Jan 2022 13:55:04 -0700 Subject: [PATCH 001/250] Bump version to 2.2.4 --- README.md | 14 +++++++------- .../extending/add-apt-packages/Dockerfile | 2 +- .../add-build-essential-extend/Dockerfile | 2 +- .../extending/add-providers/Dockerfile | 2 +- .../extending/add-pypi-packages/Dockerfile | 2 +- .../extending/embedding-dags/Dockerfile | 2 +- .../extending/writable-directory/Dockerfile | 2 +- .../restricted/restricted_environments.sh | 4 ++-- docs/docker-stack/entrypoint.rst | 16 ++++++++-------- setup.py | 2 +- 10 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index cc2582102205e..2534c94e9ff8d 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ Airflow is not a streaming solution, but it is often used to process real-time d Apache Airflow is tested with: -| | Main version (dev) | Stable version (2.2.3) | +| | Main version (dev) | Stable version (2.2.4) | | -------------------- | ------------------------- | ------------------------ | | Python | 3.6, 3.7, 3.8, 3.9 | 3.6, 3.7, 3.8, 3.9 | | Kubernetes | 1.18, 1.19, 1.20 | 1.18, 1.19, 1.20 | @@ -153,15 +153,15 @@ them to the appropriate format and workflow that your tool requires. ```bash -pip install 'apache-airflow==2.2.3' \ - --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.7.txt" +pip install 'apache-airflow==2.2.4' \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.4/constraints-3.7.txt" ``` 2. Installing with extras (i.e., postgres, google) ```bash -pip install 'apache-airflow[postgres,google]==2.2.3' \ - --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.7.txt" +pip install 'apache-airflow[postgres,google]==2.2.4' \ + --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.2.4/constraints-3.7.txt" ``` For information on installing provider packages, check @@ -263,7 +263,7 @@ Apache Airflow version life cycle: | Version | Current Patch/Minor | State | First Release | Limited Support | EOL/Terminated | |---------|---------------------|-----------|---------------|-----------------|----------------| -| 2 | 2.2.3 | Supported | Dec 17, 2020 | TBD | TBD | +| 2 | 2.2.4 | Supported | Dec 17, 2020 | TBD | TBD | | 1.10 | 1.10.15 | EOL | Aug 27, 2018 | Dec 17, 2020 | June 17, 2021 | | 1.9 | 1.9.0 | EOL | Jan 03, 2018 | Aug 27, 2018 | Aug 27, 2018 | | 1.8 | 1.8.2 | EOL | Mar 19, 2017 | Jan 03, 2018 | Jan 03, 2018 | @@ -290,7 +290,7 @@ They are based on the official release schedule of Python and Kubernetes, nicely 2. The "oldest" supported version of Python/Kubernetes is the default one until we decide to switch to later version. "Default" is only meaningful in terms of "smoke tests" in CI PRs, which are run using this default version and the default reference image available. Currently `apache/airflow:latest` - and `apache/airflow:2.2.3` images are Python 3.7 images as we are preparing for 23.12.2021 when will + and `apache/airflow:2.2.4` images are Python 3.7 images as we are preparing for 23.12.2021 when will Python 3.6 reaches end of life. 3. 
We support a new version of Python/Kubernetes in main after they are officially released, as soon as we diff --git a/docs/docker-stack/docker-examples/extending/add-apt-packages/Dockerfile b/docs/docker-stack/docker-examples/extending/add-apt-packages/Dockerfile index de55cd6eff59e..18d54617713ae 100644 --- a/docs/docker-stack/docker-examples/extending/add-apt-packages/Dockerfile +++ b/docs/docker-stack/docker-examples/extending/add-apt-packages/Dockerfile @@ -15,7 +15,7 @@ # This is an example Dockerfile. It is not intended for PRODUCTION use # [START Dockerfile] -FROM apache/airflow:2.2.3 +FROM apache/airflow:2.2.4 USER root RUN apt-get update \ && apt-get install -y --no-install-recommends \ diff --git a/docs/docker-stack/docker-examples/extending/add-build-essential-extend/Dockerfile b/docs/docker-stack/docker-examples/extending/add-build-essential-extend/Dockerfile index 220b917020a92..b5d5cd17875fa 100644 --- a/docs/docker-stack/docker-examples/extending/add-build-essential-extend/Dockerfile +++ b/docs/docker-stack/docker-examples/extending/add-build-essential-extend/Dockerfile @@ -15,7 +15,7 @@ # This is an example Dockerfile. It is not intended for PRODUCTION use # [START Dockerfile] -FROM apache/airflow:2.2.3 +FROM apache/airflow:2.2.4 USER root RUN apt-get update \ && apt-get install -y --no-install-recommends \ diff --git a/docs/docker-stack/docker-examples/extending/add-providers/Dockerfile b/docs/docker-stack/docker-examples/extending/add-providers/Dockerfile index bb17c3a333d03..1786f2e30e2a6 100644 --- a/docs/docker-stack/docker-examples/extending/add-providers/Dockerfile +++ b/docs/docker-stack/docker-examples/extending/add-providers/Dockerfile @@ -15,6 +15,6 @@ # This is an example Dockerfile. It is not intended for PRODUCTION use # [START Dockerfile] -FROM apache/airflow:2.2.3 +FROM apache/airflow:2.2.4 RUN pip install --no-cache-dir apache-airflow-providers-docker==2.1.0 # [END Dockerfile] diff --git a/docs/docker-stack/docker-examples/extending/add-pypi-packages/Dockerfile b/docs/docker-stack/docker-examples/extending/add-pypi-packages/Dockerfile index b487d6ea0b70a..feaf714c199ce 100644 --- a/docs/docker-stack/docker-examples/extending/add-pypi-packages/Dockerfile +++ b/docs/docker-stack/docker-examples/extending/add-pypi-packages/Dockerfile @@ -15,6 +15,6 @@ # This is an example Dockerfile. It is not intended for PRODUCTION use # [START Dockerfile] -FROM apache/airflow:2.2.3 +FROM apache/airflow:2.2.4 RUN pip install --no-cache-dir lxml # [END Dockerfile] diff --git a/docs/docker-stack/docker-examples/extending/embedding-dags/Dockerfile b/docs/docker-stack/docker-examples/extending/embedding-dags/Dockerfile index e5562effddb2b..9342faed01a37 100644 --- a/docs/docker-stack/docker-examples/extending/embedding-dags/Dockerfile +++ b/docs/docker-stack/docker-examples/extending/embedding-dags/Dockerfile @@ -15,7 +15,7 @@ # This is an example Dockerfile. It is not intended for PRODUCTION use # [START Dockerfile] -FROM apache/airflow:2.2.3 +FROM apache/airflow:2.2.4 COPY --chown=airflow:root test_dag.py /opt/airflow/dags diff --git a/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile b/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile index 42f1c069bf78f..ffcb8adb60c46 100644 --- a/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile +++ b/docs/docker-stack/docker-examples/extending/writable-directory/Dockerfile @@ -15,7 +15,7 @@ # This is an example Dockerfile. 
It is not intended for PRODUCTION use # [START Dockerfile] -FROM apache/airflow:2.2.3 +FROM apache/airflow:2.2.4 RUN umask 0002; \ mkdir -p ~/writeable-directory # [END Dockerfile] diff --git a/docs/docker-stack/docker-examples/restricted/restricted_environments.sh b/docs/docker-stack/docker-examples/restricted/restricted_environments.sh index 4eefc69a167eb..3a87f43b20b73 100755 --- a/docs/docker-stack/docker-examples/restricted/restricted_environments.sh +++ b/docs/docker-stack/docker-examples/restricted/restricted_environments.sh @@ -25,7 +25,7 @@ cd "${AIRFLOW_SOURCES}" rm docker-context-files/*.whl docker-context-files/*.tar.gz docker-context-files/*.txt || true curl -Lo "docker-context-files/constraints-3.7.txt" \ - https://raw.githubusercontent.com/apache/airflow/constraints-2.2.3/constraints-3.7.txt + https://raw.githubusercontent.com/apache/airflow/constraints-2.2.4/constraints-3.7.txt # For Airflow pre 2.1 you need to use PIP 20.2.4 to install/download Airflow packages. pip install pip==20.2.4 @@ -39,7 +39,7 @@ pip download --dest docker-context-files \ docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.2.3" \ + --build-arg AIRFLOW_VERSION="2.2.4" \ --build-arg INSTALL_MYSQL_CLIENT="false" \ --build-arg INSTALL_MSSQL_CLIENT="false" \ --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES="false" \ diff --git a/docs/docker-stack/entrypoint.rst b/docs/docker-stack/entrypoint.rst index e63a2306a69a9..542e7e54884ac 100644 --- a/docs/docker-stack/entrypoint.rst +++ b/docs/docker-stack/entrypoint.rst @@ -132,7 +132,7 @@ if you specify extra arguments. For example: .. code-block:: bash - docker run -it apache/airflow:2.2.3-python3.6 bash -c "ls -la" + docker run -it apache/airflow:2.2.4-python3.6 bash -c "ls -la" total 16 drwxr-xr-x 4 airflow root 4096 Jun 5 18:12 . drwxr-xr-x 1 root root 4096 Jun 5 18:12 .. @@ -144,7 +144,7 @@ you pass extra parameters. For example: .. code-block:: bash - > docker run -it apache/airflow:2.2.3-python3.6 python -c "print('test')" + > docker run -it apache/airflow:2.2.4-python3.6 python -c "print('test')" test If first argument equals to "airflow" - the rest of the arguments is treated as an airflow command @@ -152,13 +152,13 @@ to execute. Example: .. code-block:: bash - docker run -it apache/airflow:2.2.3-python3.6 airflow webserver + docker run -it apache/airflow:2.2.4-python3.6 airflow webserver If there are any other arguments - they are simply passed to the "airflow" command .. code-block:: bash - > docker run -it apache/airflow:2.2.3-python3.6 help + > docker run -it apache/airflow:2.2.4-python3.6 help usage: airflow [-h] GROUP_OR_COMMAND ... positional arguments: @@ -258,7 +258,7 @@ And then you can run this script by running the command: .. code-block:: bash - docker run -it apache/airflow:2.2.3-python3.6 bash -c "/my_after_entrypoint_script.sh" + docker run -it apache/airflow:2.2.4-python3.6 bash -c "/my_after_entrypoint_script.sh" Signal propagation @@ -363,7 +363,7 @@ database and creating an ``admin/admin`` Admin user with the following command: --env "_AIRFLOW_DB_UPGRADE=true" \ --env "_AIRFLOW_WWW_USER_CREATE=true" \ --env "_AIRFLOW_WWW_USER_PASSWORD=admin" \ - apache/airflow:2.2.3-python3.8 webserver + apache/airflow:2.2.4-python3.8 webserver .. 
code-block:: bash @@ -372,7 +372,7 @@ database and creating an ``admin/admin`` Admin user with the following command: --env "_AIRFLOW_DB_UPGRADE=true" \ --env "_AIRFLOW_WWW_USER_CREATE=true" \ --env "_AIRFLOW_WWW_USER_PASSWORD_CMD=echo admin" \ - apache/airflow:2.2.3-python3.8 webserver + apache/airflow:2.2.4-python3.8 webserver The commands above perform initialization of the SQLite database, create admin user with admin password and Admin role. They also forward local port ``8080`` to the webserver port and finally start the webserver. @@ -412,6 +412,6 @@ Example: --env "_AIRFLOW_DB_UPGRADE=true" \ --env "_AIRFLOW_WWW_USER_CREATE=true" \ --env "_AIRFLOW_WWW_USER_PASSWORD_CMD=echo admin" \ - apache/airflow:2.2.3-python3.8 webserver + apache/airflow:2.2.4-python3.8 webserver This method is only available starting from Docker image of Airflow 2.1.1 and above. diff --git a/setup.py b/setup.py index 90a2037efb36f..d1ac695d335f8 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ logger = logging.getLogger(__name__) -version = '2.2.3' +version = '2.2.4' my_dir = dirname(__file__) From fd5558fe9eb4c82f1aa49244deb75a6d500334bc Mon Sep 17 00:00:00 2001 From: ignaski <53809919+ignaski@users.noreply.github.com> Date: Fri, 29 Oct 2021 12:10:49 +0300 Subject: [PATCH 002/250] Fixing ses email backend (#18042) (cherry picked from commit 1543dc28f4a2f1631dfaedd948e646a181ccf7ee) --- airflow/config_templates/config.yml | 8 ++++ airflow/config_templates/default_airflow.cfg | 5 +++ airflow/providers/amazon/aws/utils/emailer.py | 3 +- airflow/utils/email.py | 10 ++++- docs/apache-airflow/howto/email-config.rst | 7 ++++ .../amazon/aws/utils/test_emailer.py | 42 ++++++++++--------- tests/utils/test_email.py | 14 +++++++ 7 files changed, 66 insertions(+), 23 deletions(-) diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 6fa38d7a64884..a70854e6bf5f9 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -1353,6 +1353,14 @@ example: "/path/to/my_html_content_template_file" default: ~ see_also: ":doc:`Email Configuration `" + - name: from_email + description: | + Email address that will be used as sender address. + It can either be raw email or the complete address in a format ``Sender Name `` + version_added: 2.3.0 + type: string + example: "Airflow " + default: ~ - name: smtp description: | diff --git a/airflow/config_templates/default_airflow.cfg b/airflow/config_templates/default_airflow.cfg index 7124d95fead1c..6a5449b76d29a 100644 --- a/airflow/config_templates/default_airflow.cfg +++ b/airflow/config_templates/default_airflow.cfg @@ -681,6 +681,11 @@ default_email_on_failure = True # Example: html_content_template = /path/to/my_html_content_template_file # html_content_template = +# Email address that will be used as sender address. 
+# It can either be raw email or the complete address in a format ``Sender Name `` +# Example: from_email = Airflow +# from_email = + [smtp] # If you want airflow to send emails on retries, failure, and you want to use diff --git a/airflow/providers/amazon/aws/utils/emailer.py b/airflow/providers/amazon/aws/utils/emailer.py index d098892d224a2..fc34835993304 100644 --- a/airflow/providers/amazon/aws/utils/emailer.py +++ b/airflow/providers/amazon/aws/utils/emailer.py @@ -23,6 +23,7 @@ def send_email( + from_email: str, to: Union[List[str], str], subject: str, html_content: str, @@ -37,7 +38,7 @@ def send_email( """Email backend for SES.""" hook = SESHook(aws_conn_id=conn_id) hook.send_email( - mail_from=None, + mail_from=from_email, to=to, subject=subject, html_content=html_content, diff --git a/airflow/utils/email.py b/airflow/utils/email.py index 7d17027be4307..50f24150ec56c 100644 --- a/airflow/utils/email.py +++ b/airflow/utils/email.py @@ -49,6 +49,8 @@ def send_email( """Send email using backend specified in EMAIL_BACKEND.""" backend = conf.getimport('email', 'EMAIL_BACKEND') backend_conn_id = conn_id or conf.get("email", "EMAIL_CONN_ID") + from_email = conf.get('email', 'from_email', fallback=None) + to_list = get_email_address_list(to) to_comma_separated = ", ".join(to_list) @@ -63,6 +65,7 @@ def send_email( mime_subtype=mime_subtype, mime_charset=mime_charset, conn_id=backend_conn_id, + from_email=from_email, **kwargs, ) @@ -78,6 +81,7 @@ def send_email_smtp( mime_subtype: str = 'mixed', mime_charset: str = 'utf-8', conn_id: str = "smtp_default", + from_email: str = None, **kwargs, ): """ @@ -87,8 +91,10 @@ def send_email_smtp( """ smtp_mail_from = conf.get('smtp', 'SMTP_MAIL_FROM') + mail_from = smtp_mail_from or from_email + msg, recipients = build_mime_message( - mail_from=smtp_mail_from, + mail_from=mail_from, to=to, subject=subject, html_content=html_content, @@ -99,7 +105,7 @@ def send_email_smtp( mime_charset=mime_charset, ) - send_mime_email(e_from=smtp_mail_from, e_to=recipients, mime_msg=msg, conn_id=conn_id, dryrun=dryrun) + send_mime_email(e_from=mail_from, e_to=recipients, mime_msg=msg, conn_id=conn_id, dryrun=dryrun) def build_mime_message( diff --git a/docs/apache-airflow/howto/email-config.rst b/docs/apache-airflow/howto/email-config.rst index 67e26a7ca8a59..af7b3cf877818 100644 --- a/docs/apache-airflow/howto/email-config.rst +++ b/docs/apache-airflow/howto/email-config.rst @@ -29,6 +29,8 @@ in the ``[email]`` section. subject_template = /path/to/my_subject_template_file html_content_template = /path/to/my_html_content_template_file +You can configure sender's email address by setting ``from_email`` in the ``[email]`` section. + To configure SMTP settings, checkout the :ref:`SMTP ` section in the standard configuration. If you do not want to store the SMTP credentials in the config or in the environment variables, you can create a connection called ``smtp_default`` of ``Email`` type, or choose a custom connection name and set the ``email_conn_id`` with it's name in @@ -91,6 +93,9 @@ or name and set it in ``email_conn_id`` of 'Email' type. Only login and password are used from the connection. +4. Configure sender's email address and name either by exporting the environment variables ``SENDGRID_MAIL_FROM`` and ``SENDGRID_MAIL_SENDER`` or + in your ``airflow.cfg`` by setting ``from_email`` in the ``[email]`` section. + .. _email-configuration-ses: Send email using AWS SES @@ -116,3 +121,5 @@ Follow the steps below to enable it: 3. 
Create a connection called ``aws_default``, or choose a custom connection name and set it in ``email_conn_id``. The type of connection should be ``Amazon Web Services``. + +4. Configure sender's email address in your ``airflow.cfg`` by setting ``from_email`` in the ``[email]`` section. diff --git a/tests/providers/amazon/aws/utils/test_emailer.py b/tests/providers/amazon/aws/utils/test_emailer.py index 3d9957393fa41..bcbbd4ebe6899 100644 --- a/tests/providers/amazon/aws/utils/test_emailer.py +++ b/tests/providers/amazon/aws/utils/test_emailer.py @@ -16,27 +16,29 @@ # specific language governing permissions and limitations # under the License. # - -from unittest import mock +from unittest import TestCase, mock from airflow.providers.amazon.aws.utils.emailer import send_email -@mock.patch("airflow.providers.amazon.aws.utils.emailer.SESHook") -def test_send_email(mock_hook): - send_email( - to="to@test.com", - subject="subject", - html_content="content", - ) - mock_hook.return_value.send_email.assert_called_once_with( - mail_from=None, - to="to@test.com", - subject="subject", - html_content="content", - bcc=None, - cc=None, - files=None, - mime_charset="utf-8", - mime_subtype="mixed", - ) +class TestSendEmailSes(TestCase): + @mock.patch("airflow.providers.amazon.aws.utils.emailer.SESHook") + def test_send_ses_email(self, mock_hook): + send_email( + from_email="From Test ", + to="to@test.com", + subject="subject", + html_content="content", + ) + + mock_hook.return_value.send_email.assert_called_once_with( + mail_from="From Test ", + to="to@test.com", + subject="subject", + html_content="content", + bcc=None, + cc=None, + files=None, + mime_charset="utf-8", + mime_subtype="mixed", + ) diff --git a/tests/utils/test_email.py b/tests/utils/test_email.py index 28d43284ee8d7..b458bbdcc8fa7 100644 --- a/tests/utils/test_email.py +++ b/tests/utils/test_email.py @@ -99,9 +99,23 @@ def test_custom_backend(self, mock_send_email): mime_charset='utf-8', mime_subtype='mixed', conn_id='smtp_default', + from_email=None, ) assert not mock_send_email.called + @mock.patch('airflow.utils.email.send_email_smtp') + @conf_vars( + { + ('email', 'email_backend'): 'tests.utils.test_email.send_email_test', + ('email', 'from_email'): 'from@test.com', + } + ) + def test_custom_backend_sender(self, mock_send_email_smtp): + utils.email.send_email('to', 'subject', 'content') + _, call_kwargs = send_email_test.call_args + assert call_kwargs['from_email'] == 'from@test.com' + assert not mock_send_email_smtp.called + def test_build_mime_message(self): mail_from = 'from@example.com' mail_to = 'to@example.com' From d2ae684a09c85fc557b24e2fb4421df7da79a9b0 Mon Sep 17 00:00:00 2001 From: Josh Fell <48934154+josh-fell@users.noreply.github.com> Date: Mon, 10 Jan 2022 11:14:22 -0500 Subject: [PATCH 003/250] Enhance `multiple_outputs` inference of dict typing (#19608) (cherry picked from commit 4198550bba474e7942705a4c6df2ad916fb76561) --- .pre-commit-config.yaml | 1 + airflow/decorators/base.py | 27 ++++++++++++++++--------- airflow/decorators/python.py | 16 ++++++--------- airflow/decorators/python_virtualenv.py | 10 ++++----- tests/decorators/test_python.py | 23 ++++++++++++++++----- 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 00030710df791..08d60abc3d517 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -195,6 +195,7 @@ repos: - "4" files: ^chart/values\.schema\.json$|^chart/values_schema\.schema\.json$ pass_filenames: true + # TODO: 
Bump to Python 3.7 when support for Python 3.6 is dropped in Airflow 2.3. - repo: https://github.com/asottile/pyupgrade rev: v2.29.0 hooks: diff --git a/airflow/decorators/base.py b/airflow/decorators/base.py index 229a114fc9cfb..cd7683988c68f 100644 --- a/airflow/decorators/base.py +++ b/airflow/decorators/base.py @@ -18,6 +18,7 @@ import functools import inspect import re +import sys from inspect import signature from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, cast @@ -91,9 +92,8 @@ class DecoratedOperator(BaseOperator): :param op_args: a list of positional arguments that will get unpacked when calling your callable (templated) :type op_args: list - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. Dict will unroll to xcom values with keys as keys. - Defaults to False. + :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool :param kwargs_to_upstream: For certain operators, we might need to upstream certain arguments that would otherwise be absorbed by the DecoratedOperator (for example python_callable for the @@ -189,10 +189,8 @@ def task_decorator_factory( :param python_callable: Function to decorate :type python_callable: Optional[Callable] - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. List/Tuples will unroll to xcom values - with index as key. Dict will unroll to xcom values with keys as XCom keys. - Defaults to False. + :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool :param decorated_operator_class: The operator that executes the logic needed to run the python function in the correct environment @@ -201,10 +199,19 @@ def task_decorator_factory( """ # try to infer from type annotation if python_callable and multiple_outputs is None: - sig = signature(python_callable).return_annotation - ttype = getattr(sig, "__origin__", None) + return_type = signature(python_callable).return_annotation + + # If the return type annotation is already the builtins ``dict`` type, use it for the inference. + if return_type == dict: + ttype = return_type + # Checking if Python 3.6, ``__origin__`` attribute does not exist until 3.7; need to use ``__extra__`` + # TODO: Remove check when support for Python 3.6 is dropped in Airflow 2.3. + elif sys.version_info < (3, 7): + ttype = getattr(return_type, "__extra__", None) + else: + ttype = getattr(return_type, "__origin__", None) - multiple_outputs = sig != inspect.Signature.empty and ttype in (dict, Dict) + multiple_outputs = return_type != inspect.Signature.empty and ttype in (dict, Dict) def wrapper(f: T): """ diff --git a/airflow/decorators/python.py b/airflow/decorators/python.py index 7dc6c1bff088b..2411761c05509 100644 --- a/airflow/decorators/python.py +++ b/airflow/decorators/python.py @@ -33,9 +33,8 @@ class _PythonDecoratedOperator(DecoratedOperator, PythonOperator): :param op_args: a list of positional arguments that will get unpacked when calling your callable (templated) :type op_args: list - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. Dict will unroll to xcom values with keys as keys. - Defaults to False. 
+ :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool """ @@ -85,9 +84,8 @@ def python( :param python_callable: Function to decorate :type python_callable: Optional[Callable] - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. List/Tuples will unroll to xcom values - with index as key. Dict will unroll to xcom values with keys as XCom keys. + :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool """ @@ -109,10 +107,8 @@ def python_task( :param python_callable: Function to decorate :type python_callable: Optional[Callable] - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. List/Tuples will unroll to xcom values - with index as key. Dict will unroll to xcom values with keys as XCom keys. - Defaults to False. + :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool """ return task_decorator_factory( diff --git a/airflow/decorators/python_virtualenv.py b/airflow/decorators/python_virtualenv.py index 8024e5a99ca5a..d412344b23746 100644 --- a/airflow/decorators/python_virtualenv.py +++ b/airflow/decorators/python_virtualenv.py @@ -36,9 +36,8 @@ class _PythonVirtualenvDecoratedOperator(DecoratedOperator, PythonVirtualenvOper :param op_args: a list of positional arguments that will get unpacked when calling your callable (templated) :type op_args: list - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. Dict will unroll to xcom values with keys as keys. - Defaults to False. + :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool """ @@ -88,9 +87,8 @@ def virtualenv( :param python_callable: Function to decorate :type python_callable: Optional[Callable] - :param multiple_outputs: if set, function return value will be - unrolled to multiple XCom values. List/Tuples will unroll to xcom values - with index as key. Dict will unroll to xcom values with keys as XCom keys. + :param multiple_outputs: If set to True, the decorated function's return value will be unrolled to + multiple XCom values. Dict will unroll to XCom values with its keys as XCom keys. Defaults to False. :type multiple_outputs: bool """ diff --git a/tests/decorators/test_python.py b/tests/decorators/test_python.py index 8782999d8a671..798d87713f428 100644 --- a/tests/decorators/test_python.py +++ b/tests/decorators/test_python.py @@ -15,12 +15,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import sys import unittest.mock from collections import namedtuple from datetime import date, timedelta from typing import Dict, Tuple import pytest +from parameterized import parameterized from airflow.decorators import task as task_decorator from airflow.exceptions import AirflowException @@ -112,13 +114,24 @@ def test_python_operator_python_callable_is_callable(self): with pytest.raises(AirflowException): task_decorator(not_callable, dag=self.dag) - def test_infer_multiple_outputs_using_typing(self): - @task_decorator - def identity_dict(x: int, y: int) -> Dict[str, int]: - return {"x": x, "y": y} + @parameterized.expand([["dict"], ["dict[str, int]"], ["Dict"], ["Dict[str, int]"]]) + def test_infer_multiple_outputs_using_dict_typing(self, test_return_annotation): + if sys.version_info < (3, 9) and test_return_annotation == "dict[str, int]": + self.skipTest("dict[...] not a supported typing prior to Python 3.9") + + @task_decorator + def identity_dict(x: int, y: int) -> eval(test_return_annotation): + return {"x": x, "y": y} + + assert identity_dict(5, 5).operator.multiple_outputs is True + + @task_decorator + def identity_dict_stringified(x: int, y: int) -> test_return_annotation: + return {"x": x, "y": y} - assert identity_dict(5, 5).operator.multiple_outputs is True + assert identity_dict_stringified(5, 5).operator.multiple_outputs is True + def test_infer_multiple_outputs_using_other_typing(self): @task_decorator def identity_tuple(x: int, y: int) -> Tuple[int, int]: return x, y From b05722e8b98963f65cd5f615d4390bb6377ba660 Mon Sep 17 00:00:00 2001 From: Lutz Ostkamp <35694434+lostkamp@users.noreply.github.com> Date: Wed, 15 Dec 2021 11:42:14 +0100 Subject: [PATCH 004/250] Correctly send timing metrics when using dogstatsd (fix schedule_delay metric) (#19973) (cherry picked from commit 5d405d9cda0b88909e6b726769381044477f4678) --- airflow/stats.py | 9 ++++++--- tests/core/test_stats.py | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/airflow/stats.py b/airflow/stats.py index 0a7004d10a595..d1c4da6361c4a 100644 --- a/airflow/stats.py +++ b/airflow/stats.py @@ -16,13 +16,14 @@ # specific language governing permissions and limitations # under the License. 
+import datetime import logging import socket import string import textwrap import time from functools import wraps -from typing import TYPE_CHECKING, Callable, Optional, TypeVar, cast +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar, Union, cast from airflow.configuration import conf from airflow.exceptions import AirflowConfigException, InvalidStatsNameException @@ -65,7 +66,7 @@ def gauge(cls, stat: str, value: float, rate: int = 1, delta: bool = False) -> N """Gauge stat""" @classmethod - def timing(cls, stat: str, dt) -> None: + def timing(cls, stat: str, dt: Union[float, datetime.timedelta]) -> None: """Stats timing""" @classmethod @@ -331,10 +332,12 @@ def gauge(self, stat, value, rate=1, delta=False, tags=None): return None @validate_stat - def timing(self, stat, dt, tags=None): + def timing(self, stat, dt: Union[float, datetime.timedelta], tags: Optional[List[str]] = None): """Stats timing""" if self.allow_list_validator.test(stat): tags = tags or [] + if isinstance(dt, datetime.timedelta): + dt = dt.total_seconds() return self.dogstatsd.timing(metric=stat, value=dt, tags=tags) return None diff --git a/tests/core/test_stats.py b/tests/core/test_stats.py index 83169e2935b24..c401a2f61ef95 100644 --- a/tests/core/test_stats.py +++ b/tests/core/test_stats.py @@ -181,9 +181,14 @@ def test_empty_timer(self): self.dogstatsd_client.timed.assert_not_called() def test_timing(self): + import datetime + self.dogstatsd.timing("dummy_timer", 123) self.dogstatsd_client.timing.assert_called_once_with(metric='dummy_timer', value=123, tags=[]) + self.dogstatsd.timing("dummy_timer", datetime.timedelta(seconds=123)) + self.dogstatsd_client.timing.assert_called_with(metric='dummy_timer', value=123.0, tags=[]) + def test_gauge(self): self.dogstatsd.gauge("dummy", 123) self.dogstatsd_client.gauge.assert_called_once_with(metric='dummy', sample_rate=1, value=123, tags=[]) From 03be5a186520d78afb4ac15368594d0e0c7c4468 Mon Sep 17 00:00:00 2001 From: adaezebestow <87827898+adaezebestow@users.noreply.github.com> Date: Sun, 19 Dec 2021 12:26:04 -0500 Subject: [PATCH 005/250] Update upgrading.rst with detailed code example of how to resolve post-upgrade warning (#19993) (cherry picked from commit 4ac35d723b73d02875d56bf000aafd2235ef0f4a) --- docs/apache-airflow/installation/upgrading.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/apache-airflow/installation/upgrading.rst b/docs/apache-airflow/installation/upgrading.rst index cd29e32466b27..929c6044e08d9 100644 --- a/docs/apache-airflow/installation/upgrading.rst +++ b/docs/apache-airflow/installation/upgrading.rst @@ -79,6 +79,21 @@ table or rename it or move it to another database using those tools. If you don' can use the ``airflow db shell`` command - this will drop you in the db shell tool for your database and you will be able to both inspect and delete the table. +How to drop the table using Kubernetes: + + +1. Exec into any of the Airflow pods - webserver or scheduler: ``kubectl exec -it python`` + +2. Run the following commands in the python shell: + + .. code-block:: python + + from airflow.settings import Session + + session = Session() + session.execute("DROP TABLE _airflow_moved__2_2__task_instance") + session.commit() + Please replace ```` in the examples with the actual table name as printed in the warning message. 
Inspecting a table: From 94865f9c6b780ab80bc78f6287752c426e769c60 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 7 Dec 2021 16:05:47 +0100 Subject: [PATCH 006/250] Adds retry on taskinstance retrieval lock (#20030) Fixes: #19832 Co-authored-by: Jaroslaw Potiuk (cherry picked from commit 78c815e22b67e442982b53f41d7d899723d5de9f) --- airflow/models/taskinstance.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index 6e9862ef8073f..f37cada726ae3 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -91,6 +91,7 @@ from airflow.utils.net import get_hostname from airflow.utils.operator_helpers import context_to_airflow_vars from airflow.utils.platform import getuser +from airflow.utils.retries import run_with_db_retries from airflow.utils.session import create_session, provide_session from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime from airflow.utils.state import DagRunState, State @@ -723,7 +724,9 @@ def refresh_from_db(self, session=None, lock_for_update=False) -> None: ) if lock_for_update: - ti: Optional[TaskInstance] = qry.with_for_update().first() + for attempt in run_with_db_retries(logger=self.log): + with attempt: + ti: Optional[TaskInstance] = qry.with_for_update().first() else: ti = qry.first() if ti: From 128521548614294e267fe134ffb8a8418fee0376 Mon Sep 17 00:00:00 2001 From: PApostol <50751110+PApostol@users.noreply.github.com> Date: Mon, 13 Dec 2021 11:54:11 +0000 Subject: [PATCH 007/250] Docs for multiple pool slots (#20257) Co-authored-by: Tzu-ping Chung (cherry picked from commit 7c4bed095e15a5ecb3320aa9e68a468d67832a70) --- docs/apache-airflow/concepts/pools.rst | 42 ++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/concepts/pools.rst b/docs/apache-airflow/concepts/pools.rst index f6c97d7625f4c..5395ccfe3866b 100644 --- a/docs/apache-airflow/concepts/pools.rst +++ b/docs/apache-airflow/concepts/pools.rst @@ -39,13 +39,51 @@ Tasks can then be associated with one of the existing pools by using the ``pool` Tasks will be scheduled as usual while the slots fill up. The number of slots occupied by a task can be configured by -``pool_slots``. Once capacity is reached, runnable tasks get queued and their state will show as such in the UI. +``pool_slots`` (see section below). Once capacity is reached, runnable tasks get queued and their state will show as such in the UI. As slots free up, queued tasks start running based on the :ref:`concepts:priority-weight` of the task and its descendants. -Note that if tasks are not given a pool, they are assigned to a default pool ``default_pool``. ``default_pool`` is +Note that if tasks are not given a pool, they are assigned to a default pool ``default_pool``, which is initialized with 128 slots and can be modified through the UI or CLI (but cannot be removed). +Using multiple pool slots +------------------------- + +Airflow tasks will each occupy a single pool slot by default, but they can be configured to occupy more with the ``pool_slots`` argument if required. +This is particularly useful when several tasks that belong to the same pool don't carry the same "computational weight". + +For instance, consider a pool with 2 slots, ``Pool(pool='maintenance', slots=2)``, and the following tasks: + +.. 
code-block:: python + + BashOperator( + task_id="heavy_task", + bash_command="bash backup_data.sh", + pool_slots=2, + pool="maintenance", + ) + + BashOperator( + task_id="light_task1", + bash_command="bash check_files.sh", + pool_slots=1, + pool="maintenance", + ) + + BashOperator( + task_id="light_task2", + bash_command="bash remove_files.sh", + pool_slots=1, + pool="maintenance", + ) + +Since the heavy task is configured to use 2 pool slots, it depletes the pool when running. Therefore, any of the light tasks must queue and wait +for the heavy task to complete before they are executed. Here, in terms of resource usage, the heavy task is equivalent to two light tasks running concurrently. + +This implementation can prevent overwhelming system resources, which (in this example) could occur when a heavy and a light task are running concurrently. +On the other hand, both light tasks can run concurrently since they only occupy one pool slot each, while the heavy task would have to wait for two pool +slots to become available before getting executed. + .. warning:: Pools and SubDAGs do not interact as you might first expect. SubDAGs will *not* honor any pool you set on them at From c7256dcac9af524d4f058e60cb2779db8a1ff5dd Mon Sep 17 00:00:00 2001 From: Roberto Li <1402927+robertoea@users.noreply.github.com> Date: Thu, 16 Dec 2021 01:11:02 +0100 Subject: [PATCH 008/250] Doc: Fix incorrect filename references (#20277) Minor typo corrections. I changed the filenames in the example folder structure instead of the later references to be consistent with the other examples in the documentation. (cherry picked from commit d11087c22ef509831379fa6730496f3a4d4c9eed) --- docs/apache-airflow/modules_management.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/apache-airflow/modules_management.rst b/docs/apache-airflow/modules_management.rst index 8488a58a2cb09..4bc16f656774c 100644 --- a/docs/apache-airflow/modules_management.rst +++ b/docs/apache-airflow/modules_management.rst @@ -103,8 +103,8 @@ This is an example structure that you might have in your ``dags`` folder: | | my_custom_dags | __init__.py - | my_dag_1.py - | my_dag_2.py + | my_dag1.py + | my_dag2.py | base_dag.py In the case above, there are the ways you could import the python files: @@ -123,7 +123,7 @@ shared code in the other folders, not the actual DAGs). In the example above the dags are only in ``my_custom_dags`` folder, the ``common_package`` should not be scanned by scheduler when searching for DAGS, so we should ignore ``common_package`` folder. You also -want to ignore the ``base_dag`` if you keep a base DAG there that ``my_dag1.py`` and ``my_dag1.py`` derives +want to ignore the ``base_dag.py`` if you keep a base DAG there that ``my_dag1.py`` and ``my_dag2.py`` derives from. Your ``.airflowignore`` should look then like this: .. code-block:: none @@ -186,7 +186,7 @@ You should import such shared dag using full path (starting from the directory w The relative imports are counter-intuitive, and depending on how you start your python code, they can behave differently. In Airflow the same DAG file might be parsed in different contexts (by schedulers, by workers -or during tests) and in those cases, relatives imports might behave differently. Always use full +or during tests) and in those cases, relative imports might behave differently. Always use full python package paths when you import anything in Airflow DAGs, this will save you a lot of troubles. 
You can read more about relative import caveats in `this Stack Overflow thread `_. From d6466ee8b4ea7bbd20adc47283efdca3ffaa97cb Mon Sep 17 00:00:00 2001 From: jon-fearer Date: Tue, 14 Dec 2021 20:50:00 -0700 Subject: [PATCH 009/250] fix(dag-dependencies): fix arrow styling (#20303) (cherry picked from commit 28045696dd3ea7207b1162c2343ba142e1f75e5d) --- airflow/www/static/js/dag_dependencies.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/airflow/www/static/js/dag_dependencies.js b/airflow/www/static/js/dag_dependencies.js index 02f83f67c7337..4e342288efcd9 100644 --- a/airflow/www/static/js/dag_dependencies.js +++ b/airflow/www/static/js/dag_dependencies.js @@ -200,7 +200,10 @@ const renderGraph = () => { // Set edges edges.forEach((edge) => { - g.setEdge(edge.u, edge.v); + g.setEdge(edge.u, edge.v, { + curve: d3.curveBasis, + arrowheadClass: 'arrowhead', + }); }); innerSvg.call(render, g); From 1a7f94389274a689590c718d1e4ae800b50bbfa9 Mon Sep 17 00:00:00 2001 From: Matt Rixman <5834582+MatrixManAtYrService@users.noreply.github.com> Date: Wed, 15 Dec 2021 08:13:19 -0700 Subject: [PATCH 010/250] Add docs about ``.airflowignore`` (#20311) This was deleted in an [earlier refactor](https://github.com/apache/airflow/pull/15444/files) (see `concepts.rst`). This PR brings it back. I added it under the "DAGs" section because even though it's file-based and not dag-based, excluding files that define dags is the most likely use case for this feature (I think). (cherry picked from commit 6eac2e0807a8be5f39178f079db28ebcd2f83621) --- docs/apache-airflow/concepts/dags.rst | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/apache-airflow/concepts/dags.rst b/docs/apache-airflow/concepts/dags.rst index b8718956aba5a..83d1cbd7f6dcb 100644 --- a/docs/apache-airflow/concepts/dags.rst +++ b/docs/apache-airflow/concepts/dags.rst @@ -616,6 +616,35 @@ Note that packaged DAGs come with some caveats: In general, if you have a complex set of compiled dependencies and modules, you are likely better off using the Python ``virtualenv`` system and installing the necessary packages on your target systems with ``pip``. +``.airflowignore`` +------------------ + +A ``.airflowignore`` file specifies the directories or files in ``DAG_FOLDER`` +or ``PLUGINS_FOLDER`` that Airflow should intentionally ignore. +Each line in ``.airflowignore`` specifies a regular expression pattern, +and directories or files whose names (not DAG id) match any of the patterns +would be ignored (under the hood, ``Pattern.search()`` is used to match the pattern). +Overall it works like a ``.gitignore`` file. +Use the ``#`` character to indicate a comment; all characters +on a line following a ``#`` will be ignored. + +``.airflowignore`` file should be put in your ``DAG_FOLDER``. +For example, you can prepare a ``.airflowignore`` file with content + +.. code-block:: + + project_a + tenant_[\d] + +Then files like ``project_a_dag_1.py``, ``TESTING_project_a.py``, ``tenant_1.py``, +``project_a/dag_1.py``, and ``tenant_1/dag_1.py`` in your ``DAG_FOLDER`` would be ignored +(If a directory's name matches any of the patterns, this directory and all its subfolders +would not be scanned by Airflow at all. This improves efficiency of DAG finding). + +The scope of a ``.airflowignore`` file is the directory it is in plus all its subfolders. +You can also prepare ``.airflowignore`` file for a subfolder in ``DAG_FOLDER`` and it +would only be applicable for that subfolder. 
+ DAG Dependencies ================ From 614cd3c7eed52711001a16b7afba02678c081635 Mon Sep 17 00:00:00 2001 From: Tanguy <35039373+tanguymartinez@users.noreply.github.com> Date: Wed, 15 Dec 2021 13:15:24 +0100 Subject: [PATCH 011/250] Fix typo (#20314) Build should be built. (cherry picked from commit de36616e1e3b578d9a5b6727daf7a32fe15c4c32) --- docs/apache-airflow/installation/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/installation/index.rst b/docs/apache-airflow/installation/index.rst index e2f23adc580f6..dfb18f5134fd9 100644 --- a/docs/apache-airflow/installation/index.rst +++ b/docs/apache-airflow/installation/index.rst @@ -155,7 +155,7 @@ This installation method is useful when you are familiar with Container/Docker s running Airflow components in isolation from other software running on the same physical or virtual machines with easy maintenance of dependencies. -The images are build by Apache Airflow release managers and they use officially released packages from PyPI +The images are built by Apache Airflow release managers and they use officially released packages from PyPI and official constraint files- same that are used for installing Airflow from PyPI. **Intended users** From c836e71b2f39e069ae880eb7a2eacfb420672fd6 Mon Sep 17 00:00:00 2001 From: Bas Harenslak Date: Thu, 16 Dec 2021 00:54:31 +0100 Subject: [PATCH 012/250] Bugfix: Deepcopying Kubernetes Secrets attributes causing issues (#20318) Encountered a nasty bug where somebody basically implemented their own KubernetesPodSensor, which failed after more than one attempt when using mode="poke" + a volume + a secret. Root cause turned out to be in `secret.attach_to_pod()`. In here, a volume and volumemount is created to mount the secret. A deepcopy() is made of the given Pod spec. In order to avoid appending to None, there is this line: `cp_pod.spec.volumes = pod.spec.volumes or []`. In case a volume is set on the Pod spec, a reference is created to the original pod spec volumes, which in turn was a reference to `self.volumes`. As a result, each secret resulted in a volume added to `self.volumes`, which resulted in an error when running the sensor a second time because the secret volume was already mounted during the first sensor attempt. This PR references the deepcopied object instead, and creates a new list if pod.spec.volumes is None. 
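The aliasing can be reproduced outside Kubernetes with plain lists standing in
for `V1Pod.spec.volumes` (a minimal sketch, not the real client objects):

    import copy

    original = {"volumes": ["dag-volume"]}

    # Old behaviour: the `or []` fallback keeps a reference to the original
    # list, so appending to the "copy" also mutates the original.
    cp = copy.deepcopy(original)
    cp["volumes"] = original["volumes"] or []
    cp["volumes"].append("secret-volume")
    assert original["volumes"] == ["dag-volume", "secret-volume"]

    # Fixed behaviour: only ever touch the deep copy.
    original = {"volumes": ["dag-volume"]}
    cp = copy.deepcopy(original)
    if cp["volumes"] is None:
        cp["volumes"] = []
    cp["volumes"].append("secret-volume")
    assert original["volumes"] == ["dag-volume"]

Appending only to the copied spec keeps `attach_to_pod()` idempotent, which is
what poke-mode sensors rely on when they attach the same secret on every
attempt.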
Co-authored-by: Bas Harenslak (cherry picked from commit 2409760694b668213a111712bb1162884c23dd2d) --- airflow/kubernetes/secret.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/airflow/kubernetes/secret.py b/airflow/kubernetes/secret.py index 20ed27b1ffb31..1ca26111303dd 100644 --- a/airflow/kubernetes/secret.py +++ b/airflow/kubernetes/secret.py @@ -91,20 +91,28 @@ def to_volume_secret(self) -> Tuple[k8s.V1Volume, k8s.V1VolumeMount]: def attach_to_pod(self, pod: k8s.V1Pod) -> k8s.V1Pod: """Attaches to pod""" cp_pod = copy.deepcopy(pod) + if self.deploy_type == 'volume': volume, volume_mount = self.to_volume_secret() - cp_pod.spec.volumes = pod.spec.volumes or [] + if cp_pod.spec.volumes is None: + cp_pod.spec.volumes = [] cp_pod.spec.volumes.append(volume) - cp_pod.spec.containers[0].volume_mounts = pod.spec.containers[0].volume_mounts or [] + if cp_pod.spec.containers[0].volume_mounts is None: + cp_pod.spec.containers[0].volume_mounts = [] cp_pod.spec.containers[0].volume_mounts.append(volume_mount) + if self.deploy_type == 'env' and self.key is not None: env = self.to_env_secret() - cp_pod.spec.containers[0].env = cp_pod.spec.containers[0].env or [] + if cp_pod.spec.containers[0].env is None: + cp_pod.spec.containers[0].env = [] cp_pod.spec.containers[0].env.append(env) + if self.deploy_type == 'env' and self.key is None: env_from = self.to_env_from_secret() - cp_pod.spec.containers[0].env_from = cp_pod.spec.containers[0].env_from or [] + if cp_pod.spec.containers[0].env_from is None: + cp_pod.spec.containers[0].env_from = [] cp_pod.spec.containers[0].env_from.append(env_from) + return cp_pod def __eq__(self, other): From a25d7cef7f10be25b2446abe641c0b5822e9d9dc Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Tue, 21 Dec 2021 18:00:46 +0800 Subject: [PATCH 013/250] Un-ignore DeprecationWarning (#20322) (cherry picked from commit 9876e19273cd56dc53d3a4e287db43acbfa65c4b) --- airflow/models/taskinstance.py | 41 ++++----- airflow/operators/datetime.py | 2 +- airflow/operators/python.py | 26 +++--- airflow/operators/weekday.py | 2 +- airflow/providers/http/operators/http.py | 10 +-- airflow/providers/http/sensors/http.py | 7 +- airflow/sensors/external_task.py | 24 +++--- airflow/sensors/weekday.py | 2 +- airflow/utils/context.py | 33 ++++++++ airflow/utils/context.pyi | 6 +- airflow/utils/helpers.py | 2 +- .../log/task_handler_with_custom_formatter.py | 4 +- airflow/utils/operator_helpers.py | 84 ++++++++++++++----- .../ci/kubernetes/ci_run_kubernetes_tests.sh | 7 +- scripts/in_container/entrypoint_ci.sh | 2 - tests/cli/commands/test_task_command.py | 2 + tests/core/test_core.py | 21 +++-- tests/operators/test_email.py | 2 +- tests/operators/test_python.py | 9 +- tests/operators/test_trigger_dagrun.py | 2 +- tests/providers/http/sensors/test_http.py | 4 +- tests/sensors/test_external_task_sensor.py | 8 +- tests/utils/test_log_handlers.py | 6 +- 23 files changed, 195 insertions(+), 111 deletions(-) diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index f37cada726ae3..716167c20f9b8 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -86,7 +86,7 @@ from airflow.utils import timezone from airflow.utils.context import ConnectionAccessor, Context, VariableAccessor from airflow.utils.email import send_email -from airflow.utils.helpers import is_container +from airflow.utils.helpers import is_container, render_template_to_string from airflow.utils.log.logging_mixin import LoggingMixin from 
airflow.utils.net import get_hostname from airflow.utils.operator_helpers import context_to_airflow_vars @@ -2016,7 +2016,7 @@ def render_k8s_pod_yaml(self) -> Optional[dict]: sanitized_pod = ApiClient().sanitize_for_serialization(pod) return sanitized_pod - def get_email_subject_content(self, exception): + def get_email_subject_content(self, exception: BaseException) -> Tuple[str, str, str]: """Get the email subject content for exceptions.""" # For a ti from DB (without ti.task), return the default value # Reuse it for smart sensor to send default email alert @@ -2043,18 +2043,18 @@ def get_email_subject_content(self, exception): 'Mark success: Link
' ) + # This function is called after changing the state from State.RUNNING, + # so we need to subtract 1 from self.try_number here. + current_try_number = self.try_number - 1 + additional_context = { + "exception": exception, + "exception_html": exception_html, + "try_number": current_try_number, + "max_tries": self.max_tries, + } + if use_default: - jinja_context = {'ti': self} - # This function is called after changing the state - # from State.RUNNING so need to subtract 1 from self.try_number. - jinja_context.update( - dict( - exception=exception, - exception_html=exception_html, - try_number=self.try_number - 1, - max_tries=self.max_tries, - ) - ) + jinja_context = {"ti": self, **additional_context} jinja_env = jinja2.Environment( loader=jinja2.FileSystemLoader(os.path.dirname(__file__)), autoescape=True ) @@ -2064,24 +2064,15 @@ def get_email_subject_content(self, exception): else: jinja_context = self.get_template_context() - - jinja_context.update( - dict( - exception=exception, - exception_html=exception_html, - try_number=self.try_number - 1, - max_tries=self.max_tries, - ) - ) - + jinja_context.update(additional_context) jinja_env = self.task.get_template_env() - def render(key, content): + def render(key: str, content: str) -> str: if conf.has_option('email', key): path = conf.get('email', key) with open(path) as f: content = f.read() - return jinja_env.from_string(content).render(**jinja_context) + return render_template_to_string(jinja_env.from_string(content), jinja_context) subject = render('subject_template', default_subject) html_content = render('html_content_template', default_html_content) diff --git a/airflow/operators/datetime.py b/airflow/operators/datetime.py index 6b1acf72b4b41..15d4300372ff3 100644 --- a/airflow/operators/datetime.py +++ b/airflow/operators/datetime.py @@ -72,7 +72,7 @@ def __init__( def choose_branch(self, context: Dict) -> Union[str, Iterable[str]]: if self.use_task_execution_date is True: - now = timezone.make_naive(context["execution_date"], self.dag.timezone) + now = timezone.make_naive(context["logical_date"], self.dag.timezone) else: now = timezone.make_naive(timezone.utcnow(), self.dag.timezone) diff --git a/airflow/operators/python.py b/airflow/operators/python.py index 5b552b8192c46..8e51536d617be 100644 --- a/airflow/operators/python.py +++ b/airflow/operators/python.py @@ -24,7 +24,7 @@ import warnings from tempfile import TemporaryDirectory from textwrap import dedent -from typing import Callable, Dict, Iterable, List, Optional, Union +from typing import Any, Callable, Collection, Dict, Iterable, List, Mapping, Optional, Union import dill @@ -33,7 +33,7 @@ from airflow.models.skipmixin import SkipMixin from airflow.models.taskinstance import _CURRENT_CONTEXT from airflow.utils.context import Context -from airflow.utils.operator_helpers import determine_kwargs +from airflow.utils.operator_helpers import KeywordParameters from airflow.utils.process_utils import execute_in_subprocess from airflow.utils.python_virtualenv import prepare_virtualenv, write_python_script @@ -142,8 +142,8 @@ def __init__( self, *, python_callable: Callable, - op_args: Optional[List] = None, - op_kwargs: Optional[Dict] = None, + op_args: Optional[Collection[Any]] = None, + op_kwargs: Optional[Mapping[str, Any]] = None, templates_dict: Optional[Dict] = None, templates_exts: Optional[List[str]] = None, **kwargs, @@ -159,7 +159,7 @@ def __init__( if not callable(python_callable): raise AirflowException('`python_callable` param must be callable') 
self.python_callable = python_callable - self.op_args = op_args or [] + self.op_args = op_args or () self.op_kwargs = op_kwargs or {} self.templates_dict = templates_dict if templates_exts: @@ -169,12 +169,15 @@ def execute(self, context: Dict): context.update(self.op_kwargs) context['templates_dict'] = self.templates_dict - self.op_kwargs = determine_kwargs(self.python_callable, self.op_args, context) + self.op_kwargs = self.determine_kwargs(context) return_value = self.execute_callable() self.log.info("Done. Returned value was: %s", return_value) return return_value + def determine_kwargs(self, context: Mapping[str, Any]) -> Mapping[str, Any]: + return KeywordParameters.determine(self.python_callable, self.op_args, context).unpacking() + def execute_callable(self): """ Calls the python callable with the given arguments. @@ -241,11 +244,11 @@ def execute(self, context: Dict): self.log.info('Skipping downstream tasks...') - downstream_tasks = context['task'].get_flat_relatives(upstream=False) + downstream_tasks = context["task"].get_flat_relatives(upstream=False) self.log.debug("Downstream task_ids %s", downstream_tasks) if downstream_tasks: - self.skip(context['dag_run'], context['ti'].execution_date, downstream_tasks) + self.skip(context["dag_run"], context["logical_date"], downstream_tasks) self.log.info("Done.") @@ -345,8 +348,8 @@ def __init__( python_version: Optional[Union[str, int, float]] = None, use_dill: bool = False, system_site_packages: bool = True, - op_args: Optional[List] = None, - op_kwargs: Optional[Dict] = None, + op_args: Optional[Collection[Any]] = None, + op_kwargs: Optional[Mapping[str, Any]] = None, string_args: Optional[Iterable[str]] = None, templates_dict: Optional[Dict] = None, templates_exts: Optional[List[str]] = None, @@ -392,6 +395,9 @@ def execute(self, context: Context): serializable_context = context.copy_only(serializable_keys) return super().execute(context=serializable_context) + def determine_kwargs(self, context: Mapping[str, Any]) -> Mapping[str, Any]: + return KeywordParameters.determine(self.python_callable, self.op_args, context).serializing() + def execute_callable(self): with TemporaryDirectory(prefix='venv') as tmp_dir: if self.templates_dict: diff --git a/airflow/operators/weekday.py b/airflow/operators/weekday.py index e1167a5137d98..2e4e656fae1f2 100644 --- a/airflow/operators/weekday.py +++ b/airflow/operators/weekday.py @@ -67,7 +67,7 @@ def __init__( def choose_branch(self, context: Dict) -> Union[str, Iterable[str]]: if self.use_task_execution_day: - now = context["execution_date"] + now = context["logical_date"] else: now = timezone.make_naive(timezone.utcnow(), self.dag.timezone) diff --git a/airflow/providers/http/operators/http.py b/airflow/providers/http/operators/http.py index b6295185d8ba8..d36ceb21b73aa 100644 --- a/airflow/providers/http/operators/http.py +++ b/airflow/providers/http/operators/http.py @@ -104,7 +104,7 @@ def __init__( raise AirflowException("'xcom_push' was deprecated, use 'BaseOperator.do_xcom_push' instead") def execute(self, context: Dict[str, Any]) -> Any: - from airflow.utils.operator_helpers import make_kwargs_callable + from airflow.utils.operator_helpers import determine_kwargs http = HttpHook(self.method, http_conn_id=self.http_conn_id, auth_type=self.auth_type) @@ -114,10 +114,10 @@ def execute(self, context: Dict[str, Any]) -> Any: if self.log_response: self.log.info(response.text) if self.response_check: - kwargs_callable = make_kwargs_callable(self.response_check) - if not 
kwargs_callable(response, **context): + kwargs = determine_kwargs(self.response_check, [response], context) + if not self.response_check(response, **kwargs): raise AirflowException("Response check returned False.") if self.response_filter: - kwargs_callable = make_kwargs_callable(self.response_filter) - return kwargs_callable(response, **context) + kwargs = determine_kwargs(self.response_filter, [response], context) + return self.response_filter(response, **kwargs) return response.text diff --git a/airflow/providers/http/sensors/http.py b/airflow/providers/http/sensors/http.py index 6ef55ea5a5641..e052c014cc851 100644 --- a/airflow/providers/http/sensors/http.py +++ b/airflow/providers/http/sensors/http.py @@ -96,7 +96,7 @@ def __init__( self.hook = HttpHook(method=method, http_conn_id=http_conn_id) def poke(self, context: Dict[Any, Any]) -> bool: - from airflow.utils.operator_helpers import make_kwargs_callable + from airflow.utils.operator_helpers import determine_kwargs self.log.info('Poking: %s', self.endpoint) try: @@ -107,9 +107,8 @@ def poke(self, context: Dict[Any, Any]) -> bool: extra_options=self.extra_options, ) if self.response_check: - kwargs_callable = make_kwargs_callable(self.response_check) - return kwargs_callable(response, **context) - + kwargs = determine_kwargs(self.response_check, [response], context) + return self.response_check(response, **kwargs) except AirflowException as exc: if str(exc).startswith("404"): return False diff --git a/airflow/sensors/external_task.py b/airflow/sensors/external_task.py index c4510015138e0..32336d3fa381a 100644 --- a/airflow/sensors/external_task.py +++ b/airflow/sensors/external_task.py @@ -47,7 +47,7 @@ def get_link(self, operator, dttm): class ExternalTaskSensor(BaseSensorOperator): """ Waits for a different DAG or a task in a different DAG to complete for a - specific execution_date + specific logical date. :param external_dag_id: The dag_id that contains the task you want to wait for @@ -65,14 +65,14 @@ class ExternalTaskSensor(BaseSensorOperator): :param failed_states: Iterable of failed or dis-allowed states, default is ``None`` :type failed_states: Iterable :param execution_delta: time difference with the previous execution to - look at, the default is the same execution_date as the current task or DAG. + look at, the default is the same logical date as the current task or DAG. For yesterday, use [positive!] datetime.timedelta(days=1). Either execution_delta or execution_date_fn can be passed to ExternalTaskSensor, but not both. :type execution_delta: Optional[datetime.timedelta] - :param execution_date_fn: function that receives the current execution date as the first + :param execution_date_fn: function that receives the current execution's logical date as the first positional argument and optionally any number of keyword arguments available in the - context dictionary, and returns the desired execution dates to query. + context dictionary, and returns the desired logical dates to query. Either execution_delta or execution_date_fn can be passed to ExternalTaskSensor, but not both. 
:type execution_date_fn: Optional[Callable] @@ -157,11 +157,11 @@ def __init__( @provide_session def poke(self, context, session=None): if self.execution_delta: - dttm = context['execution_date'] - self.execution_delta + dttm = context['logical_date'] - self.execution_delta elif self.execution_date_fn: dttm = self._handle_execution_date_fn(context=context) else: - dttm = context['execution_date'] + dttm = context['logical_date'] dttm_filter = dttm if isinstance(dttm, list) else [dttm] serialized_dttm_filter = ','.join(dt.isoformat() for dt in dttm_filter) @@ -260,14 +260,14 @@ def _handle_execution_date_fn(self, context) -> Any: """ from airflow.utils.operator_helpers import make_kwargs_callable - # Remove "execution_date" because it is already a mandatory positional argument - execution_date = context["execution_date"] - kwargs = {k: v for k, v in context.items() if k != "execution_date"} + # Remove "logical_date" because it is already a mandatory positional argument + logical_date = context["logical_date"] + kwargs = {k: v for k, v in context.items() if k not in {"execution_date", "logical_date"}} # Add "context" in the kwargs for backward compatibility (because context used to be # an acceptable argument of execution_date_fn) kwargs["context"] = context kwargs_callable = make_kwargs_callable(self.execution_date_fn) - return kwargs_callable(execution_date, **kwargs) + return kwargs_callable(logical_date, **kwargs) class ExternalTaskMarker(DummyOperator): @@ -281,7 +281,7 @@ class ExternalTaskMarker(DummyOperator): :type external_dag_id: str :param external_task_id: The task_id of the dependent task that needs to be cleared. :type external_task_id: str - :param execution_date: The execution_date of the dependent task that needs to be cleared. + :param execution_date: The logical date of the dependent task execution that needs to be cleared. :type execution_date: str or datetime.datetime :param recursion_depth: The maximum level of transitive dependencies allowed. Default is 10. This is mostly used for preventing cyclic dependencies. 
It is fine to increase @@ -300,7 +300,7 @@ def __init__( *, external_dag_id: str, external_task_id: str, - execution_date: Optional[Union[str, datetime.datetime]] = "{{ execution_date.isoformat() }}", + execution_date: Optional[Union[str, datetime.datetime]] = "{{ logical_date.isoformat() }}", recursion_depth: int = 10, **kwargs, ): diff --git a/airflow/sensors/weekday.py b/airflow/sensors/weekday.py index 03e3221493b9c..741e1660251db 100644 --- a/airflow/sensors/weekday.py +++ b/airflow/sensors/weekday.py @@ -84,6 +84,6 @@ def poke(self, context): WeekDay(timezone.utcnow().isoweekday()).name, ) if self.use_task_execution_day: - return context['execution_date'].isoweekday() in self._week_day_num + return context['logical_date'].isoweekday() in self._week_day_num else: return timezone.utcnow().isoweekday() in self._week_day_num diff --git a/airflow/utils/context.py b/airflow/utils/context.py index 61f9319f2bed0..d8eee04599ad0 100644 --- a/airflow/utils/context.py +++ b/airflow/utils/context.py @@ -20,6 +20,7 @@ import contextlib import copy +import functools import warnings from typing import ( AbstractSet, @@ -28,12 +29,15 @@ Dict, Iterator, List, + Mapping, MutableMapping, Optional, Tuple, ValuesView, ) +import lazy_object_proxy + _NOT_SET: Any = object() @@ -194,3 +198,32 @@ def copy_only(self, keys: Container[str]) -> "Context": new = type(self)({k: v for k, v in self._context.items() if k in keys}) new._deprecation_replacements = self._deprecation_replacements.copy() return new + + +def lazy_mapping_from_context(source: Context) -> Mapping[str, Any]: + """Create a mapping that wraps deprecated entries in a lazy object proxy. + + This further delays deprecation warning to until when the entry is actually + used, instead of when it's accessed in the context. The result is useful for + passing into a callable with ``**kwargs``, which would unpack the mapping + too eagerly otherwise. + + This is implemented as a free function because the ``Context`` type is + "faked" as a ``TypedDict`` in ``context.pyi``, which cannot have custom + functions. + + :meta private: + """ + + def _deprecated_proxy_factory(k: str, v: Any) -> Any: + replacements = source._deprecation_replacements[k] + warnings.warn(_create_deprecation_warning(k, replacements)) + return v + + def _create_value(k: str, v: Any) -> Any: + if k not in source._deprecation_replacements: + return v + factory = functools.partial(_deprecated_proxy_factory, k, v) + return lazy_object_proxy.Proxy(factory) + + return {k: _create_value(k, v) for k, v in source._context.items()} diff --git a/airflow/utils/context.pyi b/airflow/utils/context.pyi index 0921d79affd8e..44b152c429ce9 100644 --- a/airflow/utils/context.pyi +++ b/airflow/utils/context.pyi @@ -25,7 +25,7 @@ # undefined attribute errors from Mypy. Hopefully there will be a mechanism to # declare "these are defined, but don't error if others are accessed" someday. -from typing import Any, Optional +from typing import Any, Mapping, Optional from pendulum import DateTime @@ -80,3 +80,7 @@ class Context(TypedDict, total=False): var: _VariableAccessors yesterday_ds: str yesterday_ds_nodash: str + +class AirflowContextDeprecationWarning(DeprecationWarning): ... + +def lazy_mapping_from_context(source: Context) -> Mapping[str, Any]: ... 
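
The ``lazy_mapping_from_context`` helper introduced in the hunk above defers each deprecation warning until the deprecated context value is actually dereferenced, by wrapping it in ``lazy_object_proxy.Proxy``. A minimal, self-contained sketch of that pattern follows; the ``DEPRECATED_KEYS`` table and the ``demo`` callable are illustrative stand-ins, not part of the patch.

    # Illustrative sketch of the lazy-proxy pattern used by
    # lazy_mapping_from_context() above -- not part of the patch itself.
    import functools
    import warnings

    import lazy_object_proxy

    # Hypothetical deprecated-key table; Airflow keeps the real one on Context.
    DEPRECATED_KEYS = {"execution_date": "logical_date"}


    def wrap_deprecated(context: dict) -> dict:
        """Return a copy of ``context`` whose deprecated entries warn on first use."""

        def warn_and_return(key: str, value):
            warnings.warn(
                f"{key!r} is deprecated, use {DEPRECATED_KEYS[key]!r} instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            return value

        def wrap(key: str, value):
            if key not in DEPRECATED_KEYS:
                return value
            # The proxy resolves (and warns) only when the value is first used.
            return lazy_object_proxy.Proxy(functools.partial(warn_and_return, key, value))

        return {k: wrap(k, v) for k, v in context.items()}


    def demo(**kwargs) -> str:
        # Unpacking with ** copied only the proxies, so no warning has fired yet.
        return str(kwargs["execution_date"])  # the warning is emitted here, on use


    if __name__ == "__main__":
        demo(**wrap_deprecated({"execution_date": "2021-09-06", "ds": "2021-09-06"}))

Running the sketch emits a single ``DeprecationWarning`` for ``execution_date`` only at the ``str(...)`` call, which mirrors why ``KeywordParameters.unpacking()`` can hand the whole Context to callables that declare ``**kwargs`` without triggering warnings eagerly.
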
diff --git a/airflow/utils/helpers.py b/airflow/utils/helpers.py index c5f9f27fd0e1b..2215c4c3ee71b 100644 --- a/airflow/utils/helpers.py +++ b/airflow/utils/helpers.py @@ -167,7 +167,7 @@ def render_log_filename(ti: "TaskInstance", try_number, filename_template) -> st if filename_jinja_template: jinja_context = ti.get_template_context() jinja_context['try_number'] = try_number - return filename_jinja_template.render(**jinja_context) + return render_template_to_string(filename_jinja_template, jinja_context) return filename_template.format( dag_id=ti.dag_id, diff --git a/airflow/utils/log/task_handler_with_custom_formatter.py b/airflow/utils/log/task_handler_with_custom_formatter.py index 5034d00fe16e9..b7b431b63222a 100644 --- a/airflow/utils/log/task_handler_with_custom_formatter.py +++ b/airflow/utils/log/task_handler_with_custom_formatter.py @@ -20,7 +20,7 @@ from logging import StreamHandler from airflow.configuration import conf -from airflow.utils.helpers import parse_template_string +from airflow.utils.helpers import parse_template_string, render_template_to_string class TaskHandlerWithCustomFormatter(StreamHandler): @@ -52,6 +52,6 @@ def set_context(self, ti): def _render_prefix(self, ti): if self.prefix_jinja_template: jinja_context = ti.get_template_context() - return self.prefix_jinja_template.render(**jinja_context) + return render_template_to_string(self.prefix_jinja_template, jinja_context) logging.warning("'task_log_prefix_template' is in invalid format, ignoring the variable value") return "" diff --git a/airflow/utils/operator_helpers.py b/airflow/utils/operator_helpers.py index 8c5125bd403ac..05c050cad95f7 100644 --- a/airflow/utils/operator_helpers.py +++ b/airflow/utils/operator_helpers.py @@ -17,7 +17,9 @@ # under the License. # from datetime import datetime -from typing import Callable, Dict, List, Mapping, Tuple, Union +from typing import Any, Callable, Collection, Mapping + +from airflow.utils.context import Context, lazy_mapping_from_context AIRFLOW_VAR_NAME_FORMAT_MAPPING = { 'AIRFLOW_CONTEXT_DAG_ID': {'default': 'airflow.ctx.dag_id', 'env_var_format': 'AIRFLOW_CTX_DAG_ID'}, @@ -88,7 +90,67 @@ def context_to_airflow_vars(context, in_env_var_format=False): return params -def determine_kwargs(func: Callable, args: Union[Tuple, List], kwargs: Mapping) -> Dict: +class KeywordParameters: + """Wrapper representing ``**kwargs`` to a callable. + + The actual ``kwargs`` can be obtained by calling either ``unpacking()`` or + ``serializing()``. They behave almost the same and are only different if + the containing ``kwargs`` is an Airflow Context object, and the calling + function uses ``**kwargs`` in the argument list. + + In this particular case, ``unpacking()`` uses ``lazy-object-proxy`` to + prevent the Context from emitting deprecation warnings too eagerly when it's + unpacked by ``**``. ``serializing()`` does not do this, and will allow the + warnings to be emitted eagerly, which is useful when you want to dump the + content and use it somewhere else without needing ``lazy-object-proxy``. 
+ """ + + def __init__(self, kwargs: Mapping[str, Any], *, wildcard: bool) -> None: + self._kwargs = kwargs + self._wildcard = wildcard + + @classmethod + def determine( + cls, + func: Callable[..., Any], + args: Collection[Any], + kwargs: Mapping[str, Any], + ) -> "KeywordParameters": + import inspect + import itertools + + signature = inspect.signature(func) + has_wildcard_kwargs = any(p.kind == p.VAR_KEYWORD for p in signature.parameters.values()) + + for name in itertools.islice(signature.parameters.keys(), len(args)): + # Check if args conflict with names in kwargs. + if name in kwargs: + raise ValueError(f"The key {name!r} in args is a part of kwargs and therefore reserved.") + + if has_wildcard_kwargs: + # If the callable has a **kwargs argument, it's ready to accept all the kwargs. + return cls(kwargs, wildcard=True) + + # If the callable has no **kwargs argument, it only wants the arguments it requested. + kwargs = {key: kwargs[key] for key in signature.parameters if key in kwargs} + return cls(kwargs, wildcard=False) + + def unpacking(self) -> Mapping[str, Any]: + """Dump the kwargs mapping to unpack with ``**`` in a function call.""" + if self._wildcard and isinstance(self._kwargs, Context): + return lazy_mapping_from_context(self._kwargs) + return self._kwargs + + def serializing(self) -> Mapping[str, Any]: + """Dump the kwargs mapping for serialization purposes.""" + return self._kwargs + + +def determine_kwargs( + func: Callable[..., Any], + args: Collection[Any], + kwargs: Mapping[str, Any], +) -> Mapping[str, Any]: """ Inspect the signature of a given callable to determine which arguments in kwargs need to be passed to the callable. @@ -99,23 +161,7 @@ def determine_kwargs(func: Callable, args: Union[Tuple, List], kwargs: Mapping) :param kwargs: The keyword arguments that need to be filtered before passing to the callable. :return: A dictionary which contains the keyword arguments that are compatible with the callable. """ - import inspect - import itertools - - signature = inspect.signature(func) - has_kwargs = any(p.kind == p.VAR_KEYWORD for p in signature.parameters.values()) - - for name in itertools.islice(signature.parameters.keys(), len(args)): - # Check if args conflict with names in kwargs - if name in kwargs: - raise ValueError(f"The key {name} in args is part of kwargs and therefore reserved.") - - if has_kwargs: - # If the callable has a **kwargs argument, it's ready to accept all the kwargs. - return kwargs - - # If the callable has no **kwargs argument, it only wants the arguments it requested. 
- return {key: kwargs[key] for key in signature.parameters if key in kwargs} + return KeywordParameters.determine(func, args, kwargs).unpacking() def make_kwargs_callable(func: Callable) -> Callable: diff --git a/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh b/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh index a97f6929e1716..e586c300be37d 100755 --- a/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh +++ b/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh @@ -52,10 +52,7 @@ function parse_tests_to_run() { else tests_to_run=("${@}") fi - pytest_args=( - "--pythonwarnings=ignore::DeprecationWarning" - "--pythonwarnings=ignore::PendingDeprecationWarning" - ) + pytest_args=() else tests_to_run=("kubernetes_tests") pytest_args=( @@ -64,8 +61,6 @@ function parse_tests_to_run() { "--durations=100" "--color=yes" "--maxfail=50" - "--pythonwarnings=ignore::DeprecationWarning" - "--pythonwarnings=ignore::PendingDeprecationWarning" ) fi diff --git a/scripts/in_container/entrypoint_ci.sh b/scripts/in_container/entrypoint_ci.sh index 29f5210814248..5d7aca057b4ec 100755 --- a/scripts/in_container/entrypoint_ci.sh +++ b/scripts/in_container/entrypoint_ci.sh @@ -209,8 +209,6 @@ EXTRA_PYTEST_ARGS=( "--cov-report=xml:/files/coverage-${TEST_TYPE}-${BACKEND}.xml" "--color=yes" "--maxfail=50" - "--pythonwarnings=ignore::DeprecationWarning" - "--pythonwarnings=ignore::PendingDeprecationWarning" "--junitxml=${RESULT_LOG_FILE}" # timeouts in seconds for individual tests "--timeouts-order" diff --git a/tests/cli/commands/test_task_command.py b/tests/cli/commands/test_task_command.py index 7d246c732dafe..201af16bb75be 100644 --- a/tests/cli/commands/test_task_command.py +++ b/tests/cli/commands/test_task_command.py @@ -84,6 +84,7 @@ def test_cli_list_tasks(self): args = self.parser.parse_args(['tasks', 'list', 'example_bash_operator', '--tree']) task_command.task_list(args) + @pytest.mark.filterwarnings("ignore::airflow.utils.context.AirflowContextDeprecationWarning") def test_test(self): """Test the `airflow test` command""" args = self.parser.parse_args( @@ -96,6 +97,7 @@ def test_test(self): # Check that prints, and log messages, are shown assert "'example_python_operator__print_the_context__20180101'" in stdout.getvalue() + @pytest.mark.filterwarnings("ignore::airflow.utils.context.AirflowContextDeprecationWarning") def test_test_with_existing_dag_run(self): """Test the `airflow test` command""" task_id = 'print_the_context' diff --git a/tests/core/test_core.py b/tests/core/test_core.py index cae311d30b962..02162e9ba0694 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -218,7 +218,7 @@ def test_timeout(self, dag_maker): op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) def test_python_op(self, dag_maker): - def test_py_op(templates_dict, ds, **kwargs): + def test_py_op(templates_dict, ds): if not templates_dict['ds'] == ds: raise Exception("failure") @@ -246,10 +246,6 @@ def test_task_get_template(self, session): assert context['ds'] == '2015-01-01' assert context['ds_nodash'] == '20150101' - # next_ds is 2015-01-02 as the dag schedule is daily. - assert context['next_ds'] == '2015-01-02' - assert context['next_ds_nodash'] == '20150102' - assert context['ts'] == '2015-01-01T00:00:00+00:00' assert context['ts_nodash'] == '20150101T000000' assert context['ts_nodash_with_tz'] == '20150101T000000+0000' @@ -259,6 +255,8 @@ def test_task_get_template(self, session): # Test deprecated fields. 
expected_deprecated_fields = [ + ("next_ds", "2015-01-02"), + ("next_ds_nodash", "20150102"), ("prev_ds", "2014-12-31"), ("prev_ds_nodash", "20141231"), ("yesterday_ds", "2014-12-31"), @@ -267,14 +265,17 @@ def test_task_get_template(self, session): ("tomorrow_ds_nodash", "20150102"), ] for key, expected_value in expected_deprecated_fields: - message = ( + message_beginning = ( f"Accessing {key!r} from the template is deprecated and " f"will be removed in a future version." ) with pytest.deprecated_call() as recorder: value = str(context[key]) # Simulate template evaluation to trigger warning. assert value == expected_value - assert [str(m.message) for m in recorder] == [message] + + recorded_message = [str(m.message) for m in recorder] + assert len(recorded_message) == 1 + assert recorded_message[0].startswith(message_beginning) def test_bad_trigger_rule(self, dag_maker): with pytest.raises(AirflowException): @@ -338,8 +339,10 @@ def test_externally_triggered_dagrun(self, dag_maker): context = ti.get_template_context() # next_ds should be the execution date for manually triggered runs - assert context['next_ds'] == execution_ds - assert context['next_ds_nodash'] == execution_ds_nodash + with pytest.deprecated_call(): + assert context['next_ds'] == execution_ds + with pytest.deprecated_call(): + assert context['next_ds_nodash'] == execution_ds_nodash def test_dag_params_and_task_params(self, dag_maker): # This test case guards how params of DAG and Operator work together. diff --git a/tests/operators/test_email.py b/tests/operators/test_email.py index 5419796fbadbf..ba2acda44e3ba 100644 --- a/tests/operators/test_email.py +++ b/tests/operators/test_email.py @@ -50,7 +50,7 @@ def _run_as_operator(self, **kwargs): html_content='The quick brown fox jumps over the lazy dog', task_id='task', dag=self.dag, - files=["/tmp/Report-A-{{ execution_date.strftime('%Y-%m-%d') }}.csv"], + files=["/tmp/Report-A-{{ ds }}.csv"], **kwargs, ) task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) diff --git a/tests/operators/test_python.py b/tests/operators/test_python.py index 172468b1d5aab..ac3446803248b 100644 --- a/tests/operators/test_python.py +++ b/tests/operators/test_python.py @@ -19,6 +19,7 @@ import logging import sys import unittest.mock +import warnings from collections import namedtuple from datetime import date, datetime, timedelta from subprocess import CalledProcessError @@ -39,6 +40,7 @@ get_current_context, ) from airflow.utils import timezone +from airflow.utils.context import AirflowContextDeprecationWarning from airflow.utils.dates import days_ago from airflow.utils.session import create_session from airflow.utils.state import State @@ -850,6 +852,7 @@ def f(templates_dict): # This tests might take longer than default 60 seconds as it is serializing a lot of # context using dill (which is slow apparently). 
@pytest.mark.execution_timeout(120) + @pytest.mark.filterwarnings("ignore::airflow.utils.context.AirflowContextDeprecationWarning") def test_airflow_context(self): def f( # basic @@ -890,6 +893,7 @@ def f( self._run_as_operator(f, use_dill=True, system_site_packages=True, requirements=None) + @pytest.mark.filterwarnings("ignore::airflow.utils.context.AirflowContextDeprecationWarning") def test_pendulum_context(self): def f( # basic @@ -923,6 +927,7 @@ def f( self._run_as_operator(f, use_dill=True, system_site_packages=False, requirements=['pendulum']) + @pytest.mark.filterwarnings("ignore::airflow.utils.context.AirflowContextDeprecationWarning") def test_base_context(self): def f( # basic @@ -1026,7 +1031,9 @@ def execute(self, context): def get_all_the_context(**context): current_context = get_current_context() - assert context == current_context._context + with warnings.catch_warnings(): + warnings.simplefilter("ignore", AirflowContextDeprecationWarning) + assert context == current_context._context @pytest.fixture() diff --git a/tests/operators/test_trigger_dagrun.py b/tests/operators/test_trigger_dagrun.py index ea61687db1656..9ff87358d0db2 100644 --- a/tests/operators/test_trigger_dagrun.py +++ b/tests/operators/test_trigger_dagrun.py @@ -152,7 +152,7 @@ def test_trigger_dagrun_with_templated_execution_date(self): task = TriggerDagRunOperator( task_id="test_trigger_dagrun_with_str_execution_date", trigger_dag_id=TRIGGERED_DAG_ID, - execution_date="{{ execution_date }}", + execution_date="{{ logical_date }}", dag=self.dag, ) task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) diff --git a/tests/providers/http/sensors/test_http.py b/tests/providers/http/sensors/test_http.py index 3fc61bb5295a5..dc3b41f5f5b15 100644 --- a/tests/providers/http/sensors/test_http.py +++ b/tests/providers/http/sensors/test_http.py @@ -125,8 +125,8 @@ def test_poke_context(self, mock_session_send, create_task_instance_of_operator) response.status_code = 200 mock_session_send.return_value = response - def resp_check(_, execution_date): - if execution_date == DEFAULT_DATE: + def resp_check(_, logical_date): + if logical_date == DEFAULT_DATE: return True raise AirflowException('AirflowException raised here!') diff --git a/tests/sensors/test_external_task_sensor.py b/tests/sensors/test_external_task_sensor.py index d1e150bf5d916..28018b9bbc8dc 100644 --- a/tests/sensors/test_external_task_sensor.py +++ b/tests/sensors/test_external_task_sensor.py @@ -174,7 +174,7 @@ def test_external_dag_sensor(self): def test_external_task_sensor_fn_multiple_execution_dates(self): bash_command_code = """ -{% set s=execution_date.time().second %} +{% set s=logical_date.time().second %} echo "second is {{ s }}" if [[ $(( {{ s }} % 60 )) == 1 ]] then @@ -292,7 +292,7 @@ def test_external_task_sensor_fn_multiple_args(self): self.test_time_sensor() def my_func(dt, context): - assert context['execution_date'] == dt + assert context['logical_date'] == dt return dt + timedelta(0) op1 = ExternalTaskSensor( @@ -541,7 +541,7 @@ def dag_bag_parent_child(): task_id="task_1", external_dag_id=dag_0.dag_id, external_task_id=task_0.task_id, - execution_date_fn=lambda execution_date: day_1 if execution_date == day_1 else [], + execution_date_fn=lambda logical_date: day_1 if logical_date == day_1 else [], mode='reschedule', ) @@ -884,7 +884,7 @@ def dag_bag_head_tail(): task_id="tail", external_dag_id=dag.dag_id, external_task_id=head.task_id, - execution_date="{{ tomorrow_ds_nodash }}", + execution_date="{{ 
macros.ds_add(ds, 1) }}", ) head >> body >> tail diff --git a/tests/utils/test_log_handlers.py b/tests/utils/test_log_handlers.py index 4503dd80303e3..78166a8b27fbe 100644 --- a/tests/utils/test_log_handlers.py +++ b/tests/utils/test_log_handlers.py @@ -62,7 +62,7 @@ def test_default_task_logging_setup(self): assert handler.name == FILE_TASK_HANDLER def test_file_task_handler_when_ti_value_is_invalid(self): - def task_callable(ti, **kwargs): + def task_callable(ti): ti.log.info("test") dag = DAG('dag_for_testing_file_task_handler', start_date=DEFAULT_DATE) @@ -114,7 +114,7 @@ def task_callable(ti, **kwargs): os.remove(log_filename) def test_file_task_handler(self): - def task_callable(ti, **kwargs): + def task_callable(ti): ti.log.info("test") dag = DAG('dag_for_testing_file_task_handler', start_date=DEFAULT_DATE) @@ -168,7 +168,7 @@ def task_callable(ti, **kwargs): os.remove(log_filename) def test_file_task_handler_running(self): - def task_callable(ti, **kwargs): + def task_callable(ti): ti.log.info("test") dag = DAG('dag_for_testing_file_task_handler', start_date=DEFAULT_DATE) From 2ee63511f05622874ca81a540df354bc8dc379cc Mon Sep 17 00:00:00 2001 From: Tanguy <35039373+tanguymartinez@users.noreply.github.com> Date: Thu, 16 Dec 2021 16:47:03 +0100 Subject: [PATCH 014/250] Fix grammar mistakes (#20341) (cherry picked from commit 0163495d7193aee7be86cebcc4116b279d460004) --- docs/apache-airflow/concepts/tasks.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/concepts/tasks.rst b/docs/apache-airflow/concepts/tasks.rst index 23c7713882c95..f6c6619735a72 100644 --- a/docs/apache-airflow/concepts/tasks.rst +++ b/docs/apache-airflow/concepts/tasks.rst @@ -47,7 +47,7 @@ Or the more explicit ``set_upstream`` and ``set_downstream`` methods:: These both do exactly the same thing, but in general we recommend you use the bitshift operators, as they are easier to read in most cases. -By default, a Task will run when all of its upstream (parent) tasks have succeeded, but there are many ways of modifying this behaviour to add branching, only wait for some upstream tasks, or change behaviour based on where the current run is in history. For more, see :ref:`concepts:control-flow`. +By default, a Task will run when all of its upstream (parent) tasks have succeeded, but there are many ways of modifying this behaviour to add branching, to only wait for some upstream tasks, or to change behaviour based on where the current run is in history. For more, see :ref:`concepts:control-flow`. Tasks don't pass information to each other by default, and run entirely independently. If you want to pass information from one Task to another, you should use :doc:`xcoms`. @@ -153,7 +153,7 @@ If you merely want to be notified if a task runs over but still let it run to co SLAs ---- -An SLA, or a Service Level Agreement, is an expectation for the maximum time a Task should take. If a task takes longer than this to run, then it visible in the "SLA Misses" part of the user interface, as well going out in an email of all tasks that missed their SLA. +An SLA, or a Service Level Agreement, is an expectation for the maximum time a Task should take. If a task takes longer than this to run, it is then visible in the "SLA Misses" part of the user interface, as well as going out in an email of all tasks that missed their SLA. Tasks over their SLA are not cancelled, though - they are allowed to run to completion. 
If you want to cancel a task after a certain runtime is reached, you want :ref:`concepts:timeouts` instead. From f1a2e5024cdfeac4cbe80437991d9c44a8ac7850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Miranda?= Date: Thu, 16 Dec 2021 15:58:52 +0000 Subject: [PATCH 015/250] Correct typo (#20345) (cherry picked from commit c4d2e16197c5f49493c142bfd9b754ea3c816f48) --- docs/apache-airflow/modules_management.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/modules_management.rst b/docs/apache-airflow/modules_management.rst index 4bc16f656774c..00e031c6e1480 100644 --- a/docs/apache-airflow/modules_management.rst +++ b/docs/apache-airflow/modules_management.rst @@ -107,7 +107,7 @@ This is an example structure that you might have in your ``dags`` folder: | my_dag2.py | base_dag.py -In the case above, there are the ways you could import the python files: +In the case above, these are the ways you could import the python files: .. code-block:: python From 581fcfdd58705a44c24cacbe364786b0422a0d1d Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Fri, 17 Dec 2021 20:47:49 +0000 Subject: [PATCH 016/250] Remove unnecssary logging in experimental API (#20356) The `execution_data` does not need to be passed to log. We send enough details to the API user in the response. (cherry picked from commit 790bc784435646c043d8def7096917a4ce0a62f7) --- airflow/www/api/experimental/endpoints.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/airflow/www/api/experimental/endpoints.py b/airflow/www/api/experimental/endpoints.py index 91528e9387669..30c2728a0deaf 100644 --- a/airflow/www/api/experimental/endpoints.py +++ b/airflow/www/api/experimental/endpoints.py @@ -103,11 +103,11 @@ def trigger_dag(dag_id): try: execution_date = timezone.parse(execution_date) except ValueError: + log.error("Given execution date could not be identified as a date.") error_message = ( 'Given execution date, {}, could not be identified ' 'as a date. Example date format: 2015-11-16T14:34:15+00:00'.format(execution_date) ) - log.error(error_message) response = jsonify({'error': error_message}) response.status_code = 400 @@ -253,11 +253,11 @@ def task_instance_info(dag_id, execution_date, task_id): try: execution_date = timezone.parse(execution_date) except ValueError: + log.error("Given execution date could not be identified as a date.") error_message = ( 'Given execution date, {}, could not be identified ' 'as a date. Example date format: 2015-11-16T14:34:15+00:00'.format(execution_date) ) - log.error(error_message) response = jsonify({'error': error_message}) response.status_code = 400 @@ -289,11 +289,11 @@ def dag_run_status(dag_id, execution_date): try: execution_date = timezone.parse(execution_date) except ValueError: + log.error("Given execution date could not be identified as a date.") error_message = ( 'Given execution date, {}, could not be identified ' 'as a date. Example date format: 2015-11-16T14:34:15+00:00'.format(execution_date) ) - log.error(error_message) response = jsonify({'error': error_message}) response.status_code = 400 @@ -402,11 +402,11 @@ def get_lineage(dag_id: str, execution_date: str): try: execution_dt = timezone.parse(execution_date) except ValueError: + log.error("Given execution date could not be identified as a date.") error_message = ( 'Given execution date, {}, could not be identified ' 'as a date. 
Example date format: 2015-11-16T14:34:15+00:00'.format(execution_date) ) - log.error(error_message) response = jsonify({'error': error_message}) response.status_code = 400 From 330c3657922cbabc8ca096fc0ad29f2f5bf42a99 Mon Sep 17 00:00:00 2001 From: Daniel van der Ende Date: Fri, 17 Dec 2021 13:38:19 +0100 Subject: [PATCH 017/250] Fix typo in docs (#20371) Minor typo in the task decorator documentation (cherry picked from commit 7f6ab06d218973900ead79b74b7c9dca4734ee06) --- docs/apache-airflow/howto/create-custom-decorator.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/howto/create-custom-decorator.rst b/docs/apache-airflow/howto/create-custom-decorator.rst index b6a9a7b8bbef0..ebb6c1502325c 100644 --- a/docs/apache-airflow/howto/create-custom-decorator.rst +++ b/docs/apache-airflow/howto/create-custom-decorator.rst @@ -53,7 +53,7 @@ tasks. The steps to create and register ``@task.foo`` are: 3. Register your new decorator in get_provider_info of your provider - Finally, add a key-value ``task-decortor`` to the dict returned from the provider entrypoint. This should be + Finally, add a key-value ``task-decorators`` to the dict returned from the provider entrypoint. This should be a list with each item containing ``name`` and ``class-name`` keys. When Airflow starts, the ``ProviderManager`` class will automatically import this value and ``task.foo`` will work as a new decorator! From b19dfdbc946254cd570b14e032ae9b63042d9339 Mon Sep 17 00:00:00 2001 From: Jonas Strassel Date: Tue, 21 Dec 2021 16:17:00 +0100 Subject: [PATCH 018/250] fix(standalone): Remove hardcoded Webserver port (#20429) Port 8080 is the default port for webserver (https://airflow.apache.org/docs/apache-airflow/stable/cli-and-env-variables-ref.html?highlight=webserver#webserver). By setting it here again explicitly, we forbid users to override it using AIRFLOW__WEBSERVER__WEB_SERVER_PORT. Removing it IMO is not a breaking change, since it will still default to 8080. (cherry picked from commit 9d36b1fdac16d8db8907d4b792fdbe13a6e80f7e) --- airflow/cli/commands/standalone_command.py | 2 +- docs/apache-airflow/start/local.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/airflow/cli/commands/standalone_command.py b/airflow/cli/commands/standalone_command.py index 41c1684fda3f6..37d38cb5d0204 100644 --- a/airflow/cli/commands/standalone_command.py +++ b/airflow/cli/commands/standalone_command.py @@ -72,7 +72,7 @@ def run(self): self.subcommands["webserver"] = SubCommand( self, name="webserver", - command=["webserver", "--port", "8080"], + command=["webserver"], env=env, ) self.subcommands["triggerer"] = SubCommand( diff --git a/docs/apache-airflow/start/local.rst b/docs/apache-airflow/start/local.rst index b80a9333c3486..644130601c25b 100644 --- a/docs/apache-airflow/start/local.rst +++ b/docs/apache-airflow/start/local.rst @@ -62,6 +62,7 @@ constraint files to enable reproducible installation, so using ``pip`` and const Upon running these commands, Airflow will create the ``$AIRFLOW_HOME`` folder and create the "airflow.cfg" file with defaults that will get you going fast. +You can override defaults using environment variables, see :doc:`/configurations-ref`. You can inspect the file either in ``$AIRFLOW_HOME/airflow.cfg``, or through the UI in the ``Admin->Configuration`` menu. 
The PID file for the webserver will be stored in ``$AIRFLOW_HOME/airflow-webserver.pid`` or in ``/run/airflow/webserver.pid`` From b43882ca1a8f26a582f13a5e4443e9d95c8842f5 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Wed, 29 Dec 2021 03:12:21 +0800 Subject: [PATCH 019/250] Avoid calling DAG.following_schedule() for TaskInstance.get_template_context() (#20486) This can use a more modern mechanism since get_template_context() has enough context (namely, the current data interval). (cherry picked from commit 9e315ff7caec7fd3d4c0dfe8b89ee2a1c7b5fe3a) --- airflow/models/taskinstance.py | 25 +++++++++++++------------ tests/conftest.py | 12 +++++++++++- tests/models/test_taskinstance.py | 25 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index 716167c20f9b8..281d067861f44 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -1839,14 +1839,14 @@ def get_prev_start_date_success() -> Optional[pendulum.DateTime]: @cache def get_yesterday_ds() -> str: - return (self.execution_date - timedelta(1)).strftime('%Y-%m-%d') + return (logical_date - timedelta(1)).strftime('%Y-%m-%d') def get_yesterday_ds_nodash() -> str: return get_yesterday_ds().replace('-', '') @cache def get_tomorrow_ds() -> str: - return (self.execution_date + timedelta(1)).strftime('%Y-%m-%d') + return (logical_date + timedelta(1)).strftime('%Y-%m-%d') def get_tomorrow_ds_nodash() -> str: return get_tomorrow_ds().replace('-', '') @@ -1854,18 +1854,15 @@ def get_tomorrow_ds_nodash() -> str: @cache def get_next_execution_date() -> Optional[pendulum.DateTime]: # For manually triggered dagruns that aren't run on a schedule, - # next/previous execution dates don't make sense, and should be set + # the "next" execution date doesn't make sense, and should be set # to execution date for consistency with how execution_date is set # for manually triggered tasks, i.e. triggered_date == execution_date. if dag_run.external_trigger: - next_execution_date = dag_run.execution_date - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - next_execution_date = dag.following_schedule(self.execution_date) - if next_execution_date is None: + return logical_date + next_info = dag.next_dagrun_info(data_interval, restricted=False) + if next_info is None: return None - return timezone.coerce_datetime(next_execution_date) + return timezone.coerce_datetime(next_info.logical_date) def get_next_ds() -> Optional[str]: execution_date = get_next_execution_date() @@ -1881,11 +1878,15 @@ def get_next_ds_nodash() -> Optional[str]: @cache def get_prev_execution_date(): + # For manually triggered dagruns that aren't run on a schedule, + # the "previous" execution date doesn't make sense, and should be set + # to execution date for consistency with how execution_date is set + # for manually triggered tasks, i.e. triggered_date == execution_date. 
if dag_run.external_trigger: - return timezone.coerce_datetime(self.execution_date) + return logical_date with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) - return dag.previous_schedule(self.execution_date) + return dag.previous_schedule(logical_date) @cache def get_prev_ds() -> Optional[str]: diff --git a/tests/conftest.py b/tests/conftest.py index f7248d1d73dff..9e72d37a5ea83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -709,7 +709,15 @@ def create_task_instance(dag_maker, create_dummy_dag): Uses ``create_dummy_dag`` to create the dag structure. """ - def maker(execution_date=None, dagrun_state=None, state=None, run_id=None, run_type=None, **kwargs): + def maker( + execution_date=None, + dagrun_state=None, + state=None, + run_id=None, + run_type=None, + data_interval=None, + **kwargs, + ): if execution_date is None: from airflow.utils import timezone @@ -721,6 +729,8 @@ def maker(execution_date=None, dagrun_state=None, state=None, run_id=None, run_t dagrun_kwargs["run_id"] = run_id if run_type is not None: dagrun_kwargs["run_type"] = run_type + if data_interval is not None: + dagrun_kwargs["data_interval"] = data_interval dagrun = dag_maker.create_dagrun(**dagrun_kwargs) (ti,) = dagrun.task_instances ti.state = state diff --git a/tests/models/test_taskinstance.py b/tests/models/test_taskinstance.py index 8458ea91540a6..d1113714071fd 100644 --- a/tests/models/test_taskinstance.py +++ b/tests/models/test_taskinstance.py @@ -30,6 +30,7 @@ from freezegun import freeze_time from airflow import models, settings +from airflow.example_dags.plugins.workday import AfterWorkdayTimetable from airflow.exceptions import ( AirflowException, AirflowFailException, @@ -1630,6 +1631,30 @@ def test_template_with_json_variable_missing(self, create_task_instance): with pytest.raises(KeyError): ti.task.render_template('{{ var.json.get("missing_variable") }}', context) + def test_tempalte_with_custom_timetable_deprecated_context(self, create_task_instance): + ti = create_task_instance( + start_date=DEFAULT_DATE, + timetable=AfterWorkdayTimetable(), + run_type=DagRunType.SCHEDULED, + execution_date=timezone.datetime(2021, 9, 6), + data_interval=(timezone.datetime(2021, 9, 6), timezone.datetime(2021, 9, 7)), + ) + context = ti.get_template_context() + with pytest.deprecated_call(): + assert context["execution_date"] == pendulum.DateTime(2021, 9, 6, tzinfo=timezone.TIMEZONE) + with pytest.deprecated_call(): + assert context["next_ds"] == "2021-09-07" + with pytest.deprecated_call(): + assert context["next_ds_nodash"] == "20210907" + with pytest.deprecated_call(): + assert context["next_execution_date"] == pendulum.DateTime(2021, 9, 7, tzinfo=timezone.TIMEZONE) + with pytest.deprecated_call(): + assert context["prev_ds"] is None, "Does not make sense for custom timetable" + with pytest.deprecated_call(): + assert context["prev_ds_nodash"] is None, "Does not make sense for custom timetable" + with pytest.deprecated_call(): + assert context["prev_execution_date"] is None, "Does not make sense for custom timetable" + def test_execute_callback(self, create_task_instance): called = False From 36920077f790f4202238f1bf27608714f0794f25 Mon Sep 17 00:00:00 2001 From: Kanthi Date: Tue, 28 Dec 2021 16:15:31 -0500 Subject: [PATCH 020/250] 20496 fix port standalone mode (#20505) (cherry picked from commit f743e46c5a4fdd0b76fea2d07729b744644fc416) --- airflow/cli/commands/standalone_command.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git 
a/airflow/cli/commands/standalone_command.py b/airflow/cli/commands/standalone_command.py index 37d38cb5d0204..82a082e853f6b 100644 --- a/airflow/cli/commands/standalone_command.py +++ b/airflow/cli/commands/standalone_command.py @@ -81,6 +81,8 @@ def run(self): command=["triggerer"], env=env, ) + + self.web_server_port = conf.getint('webserver', 'WEB_SERVER_PORT', fallback=8080) # Run subcommand threads for command in self.subcommands.values(): command.start() @@ -206,7 +208,11 @@ def is_ready(self): Detects when all Airflow components are ready to serve. For now, it's simply time-based. """ - return self.port_open(8080) and self.job_running(SchedulerJob) and self.job_running(TriggererJob) + return ( + self.port_open(self.web_server_port) + and self.job_running(SchedulerJob) + and self.job_running(TriggererJob) + ) def port_open(self, port): """ From cb6891b9cf929633e805b458043e836015b22b33 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Thu, 30 Dec 2021 17:16:03 +0000 Subject: [PATCH 021/250] Doc: Update Supported column for 1.10.x series (#20592) 1.10.x is EOL (cherry picked from commit dcd4c492f0b869b2e7dd80756da5695036d70758) --- docs/apache-airflow/installation/supported-versions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/installation/supported-versions.rst b/docs/apache-airflow/installation/supported-versions.rst index 2e1ee97223efe..c4a2a26df18dc 100644 --- a/docs/apache-airflow/installation/supported-versions.rst +++ b/docs/apache-airflow/installation/supported-versions.rst @@ -26,9 +26,9 @@ Apache Airflow version life cycle: +---------+-----------------+---------------+-----------------+----------------+ | Version | State | First Release | Limited Support | EOL/Terminated | +---------+-----------------+---------------+-----------------+----------------+ -| 2 | Supported | Dec 17, 2020 | Dec 2021 | TBD | +| 2 | Supported | Dec 17, 2020 | TBD | TBD | +---------+-----------------+---------------+-----------------+----------------+ -| 1.10 | Limited Support | Aug 27, 2018 | Dec 17, 2020 | June 2021 | +| 1.10 | EOL | Aug 27, 2018 | Dec 17, 2020 | June 2021 | +---------+-----------------+---------------+-----------------+----------------+ | 1.9 | EOL | Jan 03, 2018 | Aug 27, 2018 | Aug 2018 | +---------+-----------------+---------------+-----------------+----------------+ From 9912cf1de91e9e39ba53adc08168b575ce8b0300 Mon Sep 17 00:00:00 2001 From: rustikk Date: Mon, 3 Jan 2022 20:39:51 -0700 Subject: [PATCH 022/250] Docs: Changed macros to correct classes and modules (#20637) closes: #20545 Fixed docs for time and random macros as the reference to what they are was incorrect. 
(cherry picked from commit 8b2299b284ac15900f54bf8c84976cc01f4d597c) --- docs/apache-airflow/templates-ref.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/templates-ref.rst b/docs/apache-airflow/templates-ref.rst index 28aba68b04e9a..f3c4a46978a33 100644 --- a/docs/apache-airflow/templates-ref.rst +++ b/docs/apache-airflow/templates-ref.rst @@ -166,9 +166,9 @@ Variable Description ``macros.datetime`` The standard lib's :class:`datetime.datetime` ``macros.timedelta`` The standard lib's :class:`datetime.timedelta` ``macros.dateutil`` A reference to the ``dateutil`` package -``macros.time`` The standard lib's :class:`datetime.time` +``macros.time`` The standard lib's :mod:`time` ``macros.uuid`` The standard lib's :mod:`uuid` -``macros.random`` The standard lib's :mod:`random` +``macros.random`` The standard lib's :class:`random.random` ================================= ============================================== Some airflow specific macros are also defined: From 713a807f0f7b948387945e763cab8a7646fd35a8 Mon Sep 17 00:00:00 2001 From: Alan Ma Date: Wed, 5 Jan 2022 00:51:00 -0800 Subject: [PATCH 023/250] Docs: Clarify ``sentry_on`` value is not quoted with example (#20639) Clarify the value for ``sentry_on`` is not quoted by providing an example. (cherry picked from commit e8b5ab9efe93f826a2b521b5d1cac0404354c3b4) --- docs/apache-airflow/logging-monitoring/errors.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/logging-monitoring/errors.rst b/docs/apache-airflow/logging-monitoring/errors.rst index 7a5df129585fa..9f4256ae04c0e 100644 --- a/docs/apache-airflow/logging-monitoring/errors.rst +++ b/docs/apache-airflow/logging-monitoring/errors.rst @@ -29,13 +29,14 @@ First you must install sentry requirement: pip install 'apache-airflow[sentry]' -After that, you need to enable the integration by set ``sentry_on`` option in ``[sentry]`` section to ``"True"``. +After that, you need to enable the integration by set ``sentry_on`` option in ``[sentry]`` section to ``True``. -Add your ``SENTRY_DSN`` to your configuration file e.g. ``airflow.cfg`` in ``[sentry]`` section. Its template resembles the following: ``'{PROTOCOL}://{PUBLIC_KEY}@{HOST}/{PROJECT_ID}'`` +Add your ``SENTRY_DSN`` to your configuration file e.g. ``airflow.cfg`` in ``[sentry]`` section. Its template resembles the following: ``{PROTOCOL}://{PUBLIC_KEY}@{HOST}/{PROJECT_ID}`` .. code-block:: ini [sentry] + sentry_on = True sentry_dsn = http://foo@sentry.io/123 .. 
note:: From 97d90c1edd65019e27824ba162e01c0e568d8e5b Mon Sep 17 00:00:00 2001 From: Bowrna Date: Tue, 12 Oct 2021 02:24:24 +0530 Subject: [PATCH 024/250] breeze setup-autocomplete zshrc reload (#18893) (cherry picked from commit ec31b2049e7c3b9f9694913031553f2d7eb66265) --- breeze | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/breeze b/breeze index 428e275a3a364..755133dad2813 100755 --- a/breeze +++ b/breeze @@ -396,6 +396,16 @@ EOF echo echo "Please exit and re-enter your shell or run:" echo + if [[ "${OSTYPE}" == "darwin"* ]]; then + if grep "${breeze_comment}" "${HOME}/.zshrc" >/dev/null 2>&1; then + echo " source ~/.zshrc" + echo + echo " source ~/.bash_completion.d/breeze-complete" + echo + exec zsh + exit 0 + fi + fi echo " source ~/.bash_completion.d/breeze-complete" echo exit 0 From 01edc7e66dd880e2f2316893403f8e80aad78762 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 12 Oct 2021 00:18:49 +0200 Subject: [PATCH 025/250] Workaround docker-compose-v2 env passing (#18887) * Workaround docker-compose-v2 env passing Docker Compose v2 has environment parsing broken in many ways. Until this is fixed, we cannot use env files, instead we must set all the variables directly, because parsing variables without values or parsing variables which have empty values is broken in several ways. Some of the issues are closed but not released, and until this is fixed, some extra code duplication and explicitly setting all default variables to "" when needed should solve the problem for both Docker-Compose v1 and Docker-Compose v2 (cherry picked from commit ab5b2bfca6b5be57a19bdbfe2206319bd2e71ac9) --- breeze | 29 ++++------- scripts/ci/docker-compose/_docker_compose.env | 49 ----------------- scripts/ci/docker-compose/base.yml | 52 ++++++++++++++++++- scripts/ci/libraries/_initialization.sh | 27 ++++++++++ scripts/in_container/entrypoint_ci.sh | 3 +- 5 files changed, 90 insertions(+), 70 deletions(-) delete mode 100644 scripts/ci/docker-compose/_docker_compose.env diff --git a/breeze b/breeze index 755133dad2813..3d814dc0f315a 100755 --- a/breeze +++ b/breeze @@ -567,24 +567,6 @@ EOF # Those are a convenience scripts that you might use to debug command execution although # In most cases they are used internally by Breeze. 
# -# Used Globals: -# BRANCH_NAME -# PYTHON_MAJOR_MINOR_VERSION -# BACKEND -# AIRFLOW_VERSION -# INSTALL_AIRFLOW_VERSION -# SSH_PORT -# WEBSERVER_HOST_PORT -# POSTGRES_HOST_PORT -# POSTGRES_VERSION -# MYSQL_HOST_PORT -# MYSQL_VERSION -# AIRFLOW_SOURCES -# AIRFLOW_CI_IMAGE -# AIRFLOW_PROD_IMAGE -# AIRFLOW_IMAGE_KUBERNETES -# SQLITE_URL -# # Arguments: # # file to prepare @@ -657,6 +639,17 @@ export SQLITE_URL="${SQLITE_URL}" export USE_AIRFLOW_VERSION="${USE_AIRFLOW_VERSION}" export USE_PACKAGES_FROM_DIST="${USE_PACKAGES_FROM_DIST}" export EXECUTOR="${EXECUTOR}" +export START_AIRFLOW="${START_AIRFLOW}" +export ENABLED_INTEGRATIONS="${ENABLED_INTEGRATIONS}" +export ENABLED_SYSTEMS="${ENABLED_SYSTEMS}" +export GITHUB_ACTIONS="${GITHUB_ACTIONS}" +export ISSUE_ID="${ISSUE_ID}" +export NUM_RUNS="${NUM_RUNS}" +export RUN_TESTS="${RUN_TESTS}" +export RUN_INTEGRATION_TESTS="${RUN_INTEGRATION_TESTS}" +export RUN_SYSTEM_TESTS="${RUN_SYSTEM_TESTS}" +export VERSION_SUFFIX_FOR_SVN="${VERSION_SUFFIX_FOR_SVN}" +export VERSION_SUFFIX_FOR_PYPI="${VERSION_SUFFIX_FOR_PYPI}" docker-compose ${command} EOF chmod u+x "${file}" diff --git a/scripts/ci/docker-compose/_docker_compose.env b/scripts/ci/docker-compose/_docker_compose.env deleted file mode 100644 index d206af47d0f1d..0000000000000 --- a/scripts/ci/docker-compose/_docker_compose.env +++ /dev/null @@ -1,49 +0,0 @@ -AIRFLOW_CI_IMAGE="${AIRFLOW_CI_IMAGE}" -AIRFLOW_EXTRAS="${AIRFLOW_EXTRAS}" -BACKEND="${BACKEND}" -BREEZE="${BREEZE}" -CI="${CI}" -CI_BUILD_ID="${CI_BUILD_ID}" -CI_JOB_ID="${CI_JOB_ID}" -CI_EVENT_TYPE="${CI_EVENT_TYPE}" -CI_TARGET_REPO="${CI_TARGET_REPO}" -CI_TARGET_BRANCH="${CI_TARGET_BRANCH}" -COMMIT_SHA="${COMMIT_SHA}" -DB_RESET="${DB_RESET}" -DEFAULT_BRANCH="${DEFAULT_BRANCH}" -DEFAULT_CONSTRAINTS_BRANCH="${DEFAULT_CONSTRAINTS_BRANCH}" -ENABLED_INTEGRATIONS="${ENABLED_INTEGRATIONS}" -ENABLED_SYSTEMS="${ENABLED_SYSTEMS}" -GITHUB_ACTIONS="${GITHUB_ACTIONS}" -GITHUB_REGISTRY_PULL_IMAGE_TAG="${GITHUB_REGISTRY_PULL_IMAGE_TAG}" -HOST_USER_ID="${HOST_USER_ID}" -HOST_GROUP_ID="${HOST_GROUP_ID}" -HOST_OS="${HOST_OS}" -HOST_HOME="${HOST_HOME}" -INIT_SCRIPT_FILE="${INIT_SCRIPT_FILE}" -INSTALL_AIRFLOW_VERSION="${INSTALL_AIRFLOW_VERSION}" -GENERATE_CONSTRAINTS_MODE="${GENERATE_CONSTRAINTS_MODE}" -INSTALL_PROVIDERS_FROM_SOURCES="${INSTALL_PROVIDERS_FROM_SOURCES}" -USE_AIRFLOW_VERSION="${USE_AIRFLOW_VERSION}" -USE_PACKAGES_FROM_DIST="${USE_PACKAGES_FROM_DIST}" -ISSUE_ID="${ISSUE_ID}" -LOAD_DEFAULT_CONNECTIONS="${LOAD_DEFAULT_CONNECTIONS}" -LOAD_EXAMPLES="${LOAD_EXAMPLES}" -MYSQL_VERSION="${MYSQL_VERSION}" -NUM_RUNS="${NUM_RUNS}" -PACKAGE_FORMAT="${PACKAGE_FORMAT}" -POSTGRES_VERSION="${POSTGRES_VERSION}" -PRINT_INFO_FROM_SCRIPTS="${PRINT_INFO_FROM_SCRIPTS}" -PYTHONDONTWRITEBYTECODE="${PYTHONDONTWRITEBYTECODE}" -PYTHON_MAJOR_MINOR_VERSION="${PYTHON_MAJOR_MINOR_VERSION}" -RUN_TESTS="${RUN_TESTS}" -RUN_INTEGRATION_TESTS="${RUN_INTEGRATION_TESTS}" -RUN_SYSTEM_TESTS="${RUN_SYSTEM_TESTS}" -START_AIRFLOW="${START_AIRFLOW}" -TEST_TYPE="${TEST_TYPE}" -UPGRADE_TO_NEWER_DEPENDENCIES="${UPGRADE_TO_NEWER_DEPENDENCIES}" -VERBOSE="${VERBOSE}" -VERBOSE_COMMANDS="${VERBOSE_COMMANDS}" -VERSION_SUFFIX_FOR_PYPI="${VERSION_SUFFIX_FOR_PYPI}" -VERSION_SUFFIX_FOR_SVN="${VERSION_SUFFIX_FOR_SVN}" -WHEEL_VERSION="${WHEEL_VERSION}" diff --git a/scripts/ci/docker-compose/base.yml b/scripts/ci/docker-compose/base.yml index c9fed6afca1e7..4ecf6ee3bcb64 100644 --- a/scripts/ci/docker-compose/base.yml +++ b/scripts/ci/docker-compose/base.yml @@ -26,8 +26,56 @@ services: - 
CELERY_BROKER_URLS=amqp://guest:guest@rabbitmq:5672,redis://redis:6379/0 - KUBECONFIG=/files/.kube/config - HOST_HOME=${HOME} - env_file: - - _docker_compose.env + # We need all those env variables here because docker-compose-v2 does not really work well + # With env files and there are many problems with it: + - AIRFLOW_CI_IMAGE=${AIRFLOW_CI_IMAGE} + - AIRFLOW_EXTRAS=${AIRFLOW_EXTRAS} + - BACKEND=${BACKEND} + - BREEZE=${BREEZE} + - CI=${CI} + - CI_BUILD_ID=${CI_BUILD_ID} + - CI_JOB_ID=${CI_JOB_ID} + - CI_EVENT_TYPE=${CI_EVENT_TYPE} + - CI_TARGET_REPO=${CI_TARGET_REPO} + - CI_TARGET_BRANCH=${CI_TARGET_BRANCH} + - COMMIT_SHA=${COMMIT_SHA} + - DB_RESET=${DB_RESET} + - DEFAULT_BRANCH=${DEFAULT_BRANCH} + - DEFAULT_CONSTRAINTS_BRANCH=${DEFAULT_CONSTRAINTS_BRANCH} + - ENABLED_INTEGRATIONS=${ENABLED_INTEGRATIONS} + - ENABLED_SYSTEMS=${ENABLED_SYSTEMS} + - GITHUB_ACTIONS=${GITHUB_ACTIONS} + - GITHUB_REGISTRY_PULL_IMAGE_TAG=${GITHUB_REGISTRY_PULL_IMAGE_TAG} + - HOST_USER_ID=${HOST_USER_ID} + - HOST_GROUP_ID=${HOST_GROUP_ID} + - HOST_OS=${HOST_OS} + - INIT_SCRIPT_FILE=${INIT_SCRIPT_FILE} + - INSTALL_AIRFLOW_VERSION=${INSTALL_AIRFLOW_VERSION} + - GENERATE_CONSTRAINTS_MODE=${GENERATE_CONSTRAINTS_MODE} + - INSTALL_PROVIDERS_FROM_SOURCES=${INSTALL_PROVIDERS_FROM_SOURCES} + - USE_AIRFLOW_VERSION=${USE_AIRFLOW_VERSION} + - USE_PACKAGES_FROM_DIST=${USE_PACKAGES_FROM_DIST} + - ISSUE_ID=${ISSUE_ID} + - LOAD_DEFAULT_CONNECTIONS=${LOAD_DEFAULT_CONNECTIONS} + - LOAD_EXAMPLES=${LOAD_EXAMPLES} + - MYSQL_VERSION=${MYSQL_VERSION} + - NUM_RUNS=${NUM_RUNS} + - PACKAGE_FORMAT=${PACKAGE_FORMAT} + - POSTGRES_VERSION=${POSTGRES_VERSION} + - PRINT_INFO_FROM_SCRIPTS=${PRINT_INFO_FROM_SCRIPTS} + - PYTHONDONTWRITEBYTECODE=${PYTHONDONTWRITEBYTECODE} + - PYTHON_MAJOR_MINOR_VERSION=${PYTHON_MAJOR_MINOR_VERSION} + - RUN_TESTS=${RUN_TESTS} + - RUN_INTEGRATION_TESTS=${RUN_INTEGRATION_TESTS} + - RUN_SYSTEM_TESTS=${RUN_SYSTEM_TESTS} + - START_AIRFLOW=${START_AIRFLOW} + - TEST_TYPE=${TEST_TYPE} + - UPGRADE_TO_NEWER_DEPENDENCIES=${UPGRADE_TO_NEWER_DEPENDENCIES} + - VERBOSE=${VERBOSE} + - VERBOSE_COMMANDS=${VERBOSE_COMMANDS} + - VERSION_SUFFIX_FOR_PYPI=${VERSION_SUFFIX_FOR_PYPI} + - VERSION_SUFFIX_FOR_SVN=${VERSION_SUFFIX_FOR_SVN} + - WHEEL_VERSION=${WHEEL_VERSION} volumes: # Pass docker to inside of the container so that Kind and Moto tests can use it. 
- /var/run/docker.sock:/var/run/docker.sock diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 82701656b6932..1b8ac6e4b8b9c 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -311,6 +311,19 @@ function initialization::initialize_force_variables() { # Can be set to true to skip if the image is newer in registry export SKIP_CHECK_REMOTE_IMAGE=${SKIP_CHECK_REMOTE_IMAGE:="false"} + + # integrations are disabled by default + export ENABLED_INTEGRATIONS=${ENABLED_INTEGRATIONS:=""} + + # systems are disabled by default + export ENABLED_SYSTEMS=${ENABLED_SYSTEMS:=""} + + # no issue id by default (quarantined builds only) + export ISSUE_ID=${ISSUE_ID:=""} + + # no NUM_RUNS by default (quarantined builds only) + export NUM_RUNS=${NUM_RUNS:=""} + } # Determine information about the host @@ -576,10 +589,24 @@ function initialization::initialize_test_variables() { # In case we want to force certain test type to run, this variable should be set to this type # Otherwise TEST_TYPEs to run will be derived from TEST_TYPES space-separated string export FORCE_TEST_TYPE=${FORCE_TEST_TYPE:=""} + + # Do not run tests by default + export RUN_TESTS=${RUN_TESTS:="false"} + + # Do not run integration tests by default + export RUN_INTEGRATION_TESTS=${RUN_INTEGRATION_TESTS:="false"} + + # Do not run system tests by default + export RUN_SYSTEM_TESTS=${RUN_SYSTEM_TESTS:="false"} + } function initialization::initialize_package_variables() { + # default package format export PACKAGE_FORMAT=${PACKAGE_FORMAT:="wheel"} + # default version suffixes + export VERSION_SUFFIX_FOR_PYPI=${VERSION_SUFFIX_FOR_PYPI:=""} + export VERSION_SUFFIX_FOR_SVN=${VERSION_SUFFIX_FOR_SVN:=""} } diff --git a/scripts/in_container/entrypoint_ci.sh b/scripts/in_container/entrypoint_ci.sh index 5d7aca057b4ec..c77f5a5d0bea8 100755 --- a/scripts/in_container/entrypoint_ci.sh +++ b/scripts/in_container/entrypoint_ci.sh @@ -52,7 +52,7 @@ else export RUN_AIRFLOW_1_10="false" fi -if [[ -z ${USE_AIRFLOW_VERSION=} ]]; then +if [[ ${USE_AIRFLOW_VERSION} == "" ]]; then export PYTHONPATH=${AIRFLOW_SOURCES} echo echo "Using already installed airflow version" @@ -184,6 +184,7 @@ ssh-keyscan -H localhost >> ~/.ssh/known_hosts 2>/dev/null cd "${AIRFLOW_SOURCES}" +echo "START_AIRFLOW:=${START_AIRFLOW}" if [[ ${START_AIRFLOW:="false"} == "true" ]]; then export AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=${LOAD_DEFAULT_CONNECTIONS} export AIRFLOW__CORE__LOAD_EXAMPLES=${LOAD_EXAMPLES} From ef817be1346d28490f85e53ecabf8e938579cd03 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 12 Oct 2021 12:19:31 +0200 Subject: [PATCH 026/250] Fix comparision of docker versions (#18902) In some shells the comparable string with version was too long. The number leading with 0 was interpreted as octal number and it had too many digits for octal number to handle. 
This change; 1) decreases the length of the string by using 3-digit numbers 2) strips leading 0s during comparision making comparision work in decimal (cherry picked from commit a05f0c37951a222c4764b4c910f78733eee0556f) --- scripts/ci/libraries/_initialization.sh | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 1b8ac6e4b8b9c..d337ff4e441ce 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -925,3 +925,37 @@ function initialization::ga_env() { echo "${1}=${2}" >>"${GITHUB_ENV}" fi } + +function initialization::ver() { + # convert SemVer number to comparable string (strips pre-release version) + # shellcheck disable=SC2086,SC2183 + printf "%03d%03d%03d%.0s" ${1//[.-]/} +} + +function initialization::check_docker_version() { + local docker_version + # In GitHub Code QL, the version of docker has +azure suffix which we should remove + docker_version=$(docker version --format '{{.Client.Version}}' | sed 's/\+.*$//' || true) + if [ "${docker_version}" == "" ]; then + echo + echo "${COLOR_YELLOW}Your version of docker is unknown. If the scripts faill, please make sure to install docker at least: ${min_docker_version} version.${COLOR_RESET}" + echo + return + fi + local comparable_docker_version + comparable_docker_version=$(initialization::ver "${docker_version}") + local min_docker_version="20.10.0" + local min_comparable_docker_version + min_comparable_docker_version=$(initialization::ver "${min_docker_version}") + # The #0 Strips leading zeros + if [[ ${comparable_docker_version#0} -lt ${min_comparable_docker_version#0} ]]; then + echo + echo "${COLOR_RED}Your version of docker is too old: ${docker_version}. Please upgrade to at least ${min_docker_version}.${COLOR_RESET}" + echo + exit 1 + else + if [[ ${PRINT_INFO_FROM_SCRIPTS} != "false" ]]; then + echo "${COLOR_GREEN}Good version of docker ${docker_version}.${COLOR_RESET}" + fi + fi +} From 49e582aa6cd70d31891eed6dce71bd6c7557e6e7 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 12 Oct 2021 12:19:45 +0200 Subject: [PATCH 027/250] Remove adding of "test-run" variables to dc_ci script (#18903) The RUN_*TEST variables are not part of the environment so they are not set when the dc_ci is generated they are overridden by Breeze when particular commands are executed. 
Therefore we should not hard-code those values in dc_ci script (this is useful for debugging to have the script but it is only there for environment configuration) (cherry picked from commit 7d3b6b51c0227f6251fd5b0023970c19fcc3c402) --- breeze | 5 ++--- scripts/ci/docker-compose/_docker.env | 2 +- scripts/ci/docker-compose/base.yml | 2 +- scripts/ci/libraries/_initialization.sh | 6 +++--- scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh | 4 ++-- scripts/in_container/entrypoint_ci.sh | 4 ++-- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/breeze b/breeze index 3d814dc0f315a..73ca2ad5b1bc0 100755 --- a/breeze +++ b/breeze @@ -645,9 +645,6 @@ export ENABLED_SYSTEMS="${ENABLED_SYSTEMS}" export GITHUB_ACTIONS="${GITHUB_ACTIONS}" export ISSUE_ID="${ISSUE_ID}" export NUM_RUNS="${NUM_RUNS}" -export RUN_TESTS="${RUN_TESTS}" -export RUN_INTEGRATION_TESTS="${RUN_INTEGRATION_TESTS}" -export RUN_SYSTEM_TESTS="${RUN_SYSTEM_TESTS}" export VERSION_SUFFIX_FOR_SVN="${VERSION_SUFFIX_FOR_SVN}" export VERSION_SUFFIX_FOR_PYPI="${VERSION_SUFFIX_FOR_PYPI}" docker-compose ${command} @@ -3474,6 +3471,8 @@ function breeze::run_breeze_command() { docker_engine_resources::check_all_resources export RUN_TESTS="true" readonly RUN_TESTS + export ENABLED_INTEGRATIONS="${INTEGRATIONS[*]}" + export LIST_OF_INTEGRATION_TESTS_TO_RUN="${INTEGRATIONS[*]}" ${run_command} "${BUILD_CACHE_DIR}/${DOCKER_COMPOSE_RUN_SCRIPT_FOR_CI}" run --service-ports --rm airflow "$@" ;; run_docker_compose) diff --git a/scripts/ci/docker-compose/_docker.env b/scripts/ci/docker-compose/_docker.env index a4e017872c53b..08fb37cf076ee 100644 --- a/scripts/ci/docker-compose/_docker.env +++ b/scripts/ci/docker-compose/_docker.env @@ -53,7 +53,7 @@ PRINT_INFO_FROM_SCRIPTS PYTHONDONTWRITEBYTECODE PYTHON_MAJOR_MINOR_VERSION RUN_TESTS -RUN_INTEGRATION_TESTS +LIST_OF_INTEGRATION_TESTS_TO_RUN RUN_SYSTEM_TESTS START_AIRFLOW TEST_TYPE diff --git a/scripts/ci/docker-compose/base.yml b/scripts/ci/docker-compose/base.yml index 4ecf6ee3bcb64..809ba0887c9c4 100644 --- a/scripts/ci/docker-compose/base.yml +++ b/scripts/ci/docker-compose/base.yml @@ -66,7 +66,7 @@ services: - PYTHONDONTWRITEBYTECODE=${PYTHONDONTWRITEBYTECODE} - PYTHON_MAJOR_MINOR_VERSION=${PYTHON_MAJOR_MINOR_VERSION} - RUN_TESTS=${RUN_TESTS} - - RUN_INTEGRATION_TESTS=${RUN_INTEGRATION_TESTS} + - LIST_OF_INTEGRATION_TESTS_TO_RUN=${LIST_OF_INTEGRATION_TESTS_TO_RUN} - RUN_SYSTEM_TESTS=${RUN_SYSTEM_TESTS} - START_AIRFLOW=${START_AIRFLOW} - TEST_TYPE=${TEST_TYPE} diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index d337ff4e441ce..cdfbfcdbab1ad 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -594,10 +594,10 @@ function initialization::initialize_test_variables() { export RUN_TESTS=${RUN_TESTS:="false"} # Do not run integration tests by default - export RUN_INTEGRATION_TESTS=${RUN_INTEGRATION_TESTS:="false"} + export LIST_OF_INTEGRATION_TESTS_TO_RUN=${LIST_OF_INTEGRATION_TESTS_TO_RUN:=""} - # Do not run system tests by default - export RUN_SYSTEM_TESTS=${RUN_SYSTEM_TESTS:="false"} + # Do not run system tests by default (they can be enabled by setting the RUN_SYSTEM_TESTS variable to "true") + export RUN_SYSTEM_TESTS=${RUN_SYSTEM_TESTS:=""} } diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index f2af0b92c3c8f..615ce21437664 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ 
b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -51,10 +51,10 @@ function prepare_tests() { if [[ ${TEST_TYPE:=} == "Integration" ]]; then export ENABLED_INTEGRATIONS="${AVAILABLE_INTEGRATIONS}" - export RUN_INTEGRATION_TESTS="${AVAILABLE_INTEGRATIONS}" + export LIST_OF_INTEGRATION_TESTS_TO_RUN="${AVAILABLE_INTEGRATIONS}" else export ENABLED_INTEGRATIONS="" - export RUN_INTEGRATION_TESTS="" + export LIST_OF_INTEGRATION_TESTS_TO_RUN="" fi for _INT in ${ENABLED_INTEGRATIONS} diff --git a/scripts/in_container/entrypoint_ci.sh b/scripts/in_container/entrypoint_ci.sh index c77f5a5d0bea8..78909f72fb532 100755 --- a/scripts/in_container/entrypoint_ci.sh +++ b/scripts/in_container/entrypoint_ci.sh @@ -324,9 +324,9 @@ fi readonly SELECTED_TESTS CLI_TESTS API_TESTS PROVIDERS_TESTS CORE_TESTS WWW_TESTS \ ALL_TESTS ALL_PRESELECTED_TESTS -if [[ -n ${RUN_INTEGRATION_TESTS=} ]]; then +if [[ -n ${LIST_OF_INTEGRATION_TESTS_TO_RUN=} ]]; then # Integration tests - for INT in ${RUN_INTEGRATION_TESTS} + for INT in ${LIST_OF_INTEGRATION_TESTS_TO_RUN} do EXTRA_PYTEST_ARGS+=("--integration" "${INT}") done From 40bf532395d5a3daa985117562747341af1b976c Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 12 Oct 2021 19:21:43 +0200 Subject: [PATCH 028/250] Decrease likelihood of memory issue in CI (#18852) This PR attempts to decrease the likelihood of memory issues for CI for non-committers. The MSSQL and MYSQL Provider and Integration tests when run together with other tests in parallel (for MSSQL even standalone) might cause memory problems (143 or 137 exit code). This PR changes the approach slightly for low-memory conditions: 1) MSSQL - both Integration and Providers tests are skipped entirely (they will be run in High-Mem case so we will see if there are any problems anyway) 2) MySQL - both Integration and Providers tests are run separately which will lead to slightly longer test runs but likely this will save us from the occasional memory issues. (cherry picked from commit 5d9e5f69b9d9c7d4f4e5e5c040ace0589b541a91) --- scripts/ci/testing/ci_run_airflow_testing.sh | 32 ++++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index 8d7440cfce8c3..73c3a813564c0 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -77,6 +77,7 @@ function run_all_test_types_in_parallel() { echo local run_integration_tests_separately="false" + local run_providers_tests_separately="false" # shellcheck disable=SC2153 local test_types_to_run=${TEST_TYPES} @@ -92,14 +93,28 @@ function run_all_test_types_in_parallel() { echo "" echo "${COLOR_YELLOW}Integration tests will be run separately at the end after cleaning up docker${COLOR_RESET}" echo "" - # Remove Integration from list of tests to run in parallel - test_types_to_run="${test_types_to_run//Integration/}" - run_integration_tests_separately="true" if [[ ${BACKEND} == "mssql" ]]; then - # Skip running "Integration" tests for low memory condition for mssql - run_integration_tests_separately="false" + # Skip running "Integration" and "Providers" tests for low memory condition for mssql + # Both might lead to memory issues even in run on their own. 
We have no need to + # Test those specifically for MSSQL (and they will be tested in `main` as there + # We have no memory limits + test_types_to_run="${test_types_to_run//Integration/}" + run_integration_tests_separately="false" + test_types_to_run="${test_types_to_run//Providers/}" + run_providers_tests_separately="false" + elif [[ ${BACKEND} == "mysql" ]]; then + # Separate "Integration" and "Providers" tests for low memory condition for mysql + # To not run them in parallel with other tests as this often leads to memory issue + # (Error 137 or 143). + test_types_to_run="${test_types_to_run//Integration/}" + run_integration_tests_separately="true" + test_types_to_run="${test_types_to_run//Providers/}" + run_providers_tests_separately="true" else - run_integration_tests_separately="true" + # Remove Integration from list of tests to run in parallel + # and run them separately for all other backends + test_types_to_run="${test_types_to_run//Integration/}" + run_integration_tests_separately="true" fi fi fi @@ -109,6 +124,11 @@ function run_all_test_types_in_parallel() { parallel::initialize_monitoring run_test_types_in_parallel "${@}" + if [[ ${run_providers_tests_separately} == "true" ]]; then + parallel::cleanup_runner + test_types_to_run="Providers" + run_test_types_in_parallel "${@}" + fi if [[ ${run_integration_tests_separately} == "true" ]]; then parallel::cleanup_runner test_types_to_run="Integration" From ebf1ff7100cf2ad60a4e72d1fc06654a821749cb Mon Sep 17 00:00:00 2001 From: Dmitriy Fishman Date: Wed, 13 Oct 2021 11:13:43 +0300 Subject: [PATCH 029/250] Fix typos ``build.rst`` (#18935) (cherry picked from commit 2f38c6f5bdb102fa5a6d00fcb977e9ca90e88cb0) --- docs/docker-stack/build.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst index 03569dffd045e..335b9adc0581f 100644 --- a/docs/docker-stack/build.rst +++ b/docs/docker-stack/build.rst @@ -20,7 +20,7 @@ Building the image ================== -Before you dive-deeply in the way how the Airflow Image is build, let us first explain why you might need +Before you dive-deeply in the way how the Airflow Image is built, let us first explain why you might need to build the custom container image and we show a few typical ways you can do it. Why custom image ? @@ -235,7 +235,7 @@ You should be aware, about a few things: in runtime, will have ``GID=0`` and will be group-writable. .. note:: - When you build image for Airflow version < ``2.1`` (for example 2.0.2 or 1.10.15) the image is build with + When you build image for Airflow version < ``2.1`` (for example 2.0.2 or 1.10.15) the image is built with PIP 20.2.4 because ``PIP21+`` is only supported for ``Airflow 2.1+`` .. note:: @@ -442,7 +442,7 @@ The following example adds ``mpi4py`` package which requires both ``build-essent :start-after: [START build] :end-before: [END build] -The above image is equivalent of the "extended" image from previous chapter but it's size is only +The above image is equivalent of the "extended" image from previous chapter but its size is only 874 MB. Comparing to 1.1 GB of the "extended image" this is about 230 MB less, so you can achieve ~20% improvement in size of the image by using "customization" vs. extension. The saving can increase in case you have more complex dependencies to build. @@ -559,7 +559,7 @@ The following - rather complex - example shows capabilities of: Build images in security restricted environments ................................................ 
-You can also make sure your image is only build using local constraint file and locally downloaded +You can also make sure your image is only built using local constraint file and locally downloaded wheel files. This is often useful in Enterprise environments where the binary files are verified and vetted by the security teams. It is also the most complex way of building the image. You should be an expert of building and using Dockerfiles in order to use it and have to have specific needs of security if @@ -585,7 +585,7 @@ of Airflow when needed on an air-gaped system. Example of preparing the constraint files and wheel files. Note that ``mysql`` dependency is removed as ``mysqlclient`` is installed from Oracle's ``apt`` repository and if you want to add it, you need -to provide this library from you repository if you want to build Airflow image in an "air-gaped" system. +to provide this library from your repository if you want to build Airflow image in an "air-gaped" system. .. exampleinclude:: docker-examples/restricted/restricted_environments.sh :language: bash @@ -613,7 +613,7 @@ where you can build the image using the packages downloaded by passing those bui Note, that the solution we have for installing python packages from local packages, only solves the problem of "air-gaped" python installation. The Docker image also downloads ``apt`` dependencies and ``node-modules``. -Those type of dependencies are however more likely to be available in your "air-gaped" system via transparent +Those types of dependencies are however more likely to be available in your "air-gaped" system via transparent proxies and it should automatically reach out to your private registries, however in the future the solution might be applied to both of those installation steps. @@ -647,7 +647,7 @@ There are a few things to remember when you modify the ``Dockerfile``: and only the required folders are added through exclusion (!). This allows to keep docker context small because there are many binary artifacts generated in the sources of Airflow and if they are added to the context, the time of building the image would increase significantly. If you want to add any new - folders to be available in the image you must add it here with leading ``!`` + folders to be available in the image you must add them here with leading ``!`` .. code-block:: text From 56aa22e685b34c2121cc7bebb130c524374bcdd6 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 14 Oct 2021 16:25:11 +0200 Subject: [PATCH 030/250] Fix --github-image-id flag for Breeze (#18882) (#18946) When we moved to the GitHub registry, the --github-image-id flag was broken as it had pulled the "latest" image when run right after pulling the tagged image (and it ran that image instead). This change fixes it and uses GITHUB_REGISTRY_PULL_IMAGE_TAG (latest if not specified) everywhere where the image is used for running. This flag is not persistent, so its value is not remembered between runs. 
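As a rough sketch of the naming scheme this fix relies on (the registry path, branch and Python version below are example values only, mirroring the examples quoted in the diff that follows, and the tag falls back to "latest" when no specific image id is requested):

# Rough sketch only: how the tagged CI image name is composed after this change.
# The registry path, branch and Python version are example values.
GITHUB_REGISTRY_PULL_IMAGE_TAG="${GITHUB_REGISTRY_PULL_IMAGE_TAG:-latest}"
AIRFLOW_CI_IMAGE="ghcr.io/apache/airflow/main/ci/python3.8"
AIRFLOW_CI_IMAGE_WITH_TAG="${AIRFLOW_CI_IMAGE}:${GITHUB_REGISTRY_PULL_IMAGE_TAG}"
# e.g. ghcr.io/apache/airflow/main/ci/python3.8:latest, or the requested tag when --github-image-id is used
echo "CI image used for running: ${AIRFLOW_CI_IMAGE_WITH_TAG}"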
(cherry picked from commit 0a82a422e42072db459f527db976e0621ccab9fb) --- breeze | 3 +- scripts/ci/docker-compose/base.yml | 2 +- scripts/ci/libraries/_build_images.sh | 10 ++++- .../ci/libraries/_docker_engine_resources.sh | 2 +- .../ci/libraries/_push_pull_remove_images.sh | 39 +++++++------------ scripts/ci/libraries/_runs.sh | 10 ++--- .../ci_install_and_test_provider_packages.sh | 2 +- .../static_checks/in_container_bats_tests.sh | 4 +- scripts/ci/static_checks/mypy.sh | 2 +- scripts/ci/static_checks/ui_lint.sh | 2 +- scripts/ci/static_checks/www_lint.sh | 2 +- .../ci_run_single_airflow_test_in_docker.sh | 2 +- scripts/ci/tools/fix_ownership.sh | 6 +-- 13 files changed, 41 insertions(+), 45 deletions(-) diff --git a/breeze b/breeze index 73ca2ad5b1bc0..87701516452d8 100755 --- a/breeze +++ b/breeze @@ -502,7 +502,7 @@ EOF Use CI image. Branch name: ${BRANCH_NAME} - Docker image: ${AIRFLOW_CI_IMAGE} + Docker image: ${AIRFLOW_CI_IMAGE_WITH_TAG} Airflow source version: ${AIRFLOW_VERSION} EOF fi @@ -633,6 +633,7 @@ export MYSQL_HOST_PORT="${MYSQL_HOST_PORT}" export MYSQL_VERSION="${MYSQL_VERSION}" export AIRFLOW_SOURCES="${AIRFLOW_SOURCES}" export AIRFLOW_CI_IMAGE="${AIRFLOW_CI_IMAGE}" +export AIRFLOW_CI_IMAGE_WITH_TAG="${AIRFLOW_CI_IMAGE_WITH_TAG}" export AIRFLOW_PROD_IMAGE="${AIRFLOW_PROD_IMAGE}" export AIRFLOW_IMAGE_KUBERNETES="${AIRFLOW_IMAGE_KUBERNETES}" export SQLITE_URL="${SQLITE_URL}" diff --git a/scripts/ci/docker-compose/base.yml b/scripts/ci/docker-compose/base.yml index 809ba0887c9c4..9c68c18fac16c 100644 --- a/scripts/ci/docker-compose/base.yml +++ b/scripts/ci/docker-compose/base.yml @@ -18,7 +18,7 @@ version: "3.7" services: airflow: - image: ${AIRFLOW_CI_IMAGE} + image: ${AIRFLOW_CI_IMAGE_WITH_TAG} pull_policy: never environment: - USER=root diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index b2273a8b17345..af038db61a9c4 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -271,7 +271,7 @@ function build_images::get_local_build_cache_hash() { local_image_build_cache_file="${AIRFLOW_SOURCES}/manifests/local-build-cache-hash-${PYTHON_MAJOR_MINOR_VERSION}" # Remove the container just in case docker_v rm --force "local-airflow-ci-container" 2>/dev/null >/dev/null - if ! docker_v inspect "${AIRFLOW_CI_IMAGE}" 2>/dev/null >/dev/null; then + if ! 
docker_v inspect "${AIRFLOW_CI_IMAGE_WITH_TAG}" 2>/dev/null >/dev/null; then verbosity::print_info verbosity::print_info "Local airflow CI image not available" verbosity::print_info @@ -282,7 +282,7 @@ function build_images::get_local_build_cache_hash() { return fi - docker_v create --name "local-airflow-ci-container" "${AIRFLOW_CI_IMAGE}" 2>/dev/null >/dev/null + docker_v create --name "local-airflow-ci-container" "${AIRFLOW_CI_IMAGE_WITH_TAG}" 2>/dev/null >/dev/null docker_v cp "local-airflow-ci-container:/build-cache-hash" \ "${local_image_build_cache_file}" 2>/dev/null || touch "${local_image_build_cache_file}" @@ -399,6 +399,11 @@ function build_images::get_docker_cache_image_names() { # ghcr.io/apache/airflow/main/ci/python3.8 export AIRFLOW_CI_IMAGE="${image_name}/${BRANCH_NAME}/ci/python${PYTHON_MAJOR_MINOR_VERSION}" + # Example: + # ghcr.io/apache/airflow/main/ci/python3.8:latest + # ghcr.io/apache/airflow/main/ci/python3.8: + export AIRFLOW_CI_IMAGE_WITH_TAG="${image_name}/${BRANCH_NAME}/ci/python${PYTHON_MAJOR_MINOR_VERSION}:${GITHUB_REGISTRY_PULL_IMAGE_TAG}" + # Example: # local-airflow-ci-manifest/main/python3.8 export AIRFLOW_CI_LOCAL_MANIFEST_IMAGE="local-airflow-ci-manifest/${BRANCH_NAME}/python${PYTHON_MAJOR_MINOR_VERSION}" @@ -455,6 +460,7 @@ function build_images::login_to_docker_registry() { else verbosity::print_info "Skip Login to GitHub Container Registry as token is missing" fi + start_end::group_end fi } diff --git a/scripts/ci/libraries/_docker_engine_resources.sh b/scripts/ci/libraries/_docker_engine_resources.sh index 7bcf427e40f64..75daf175e29f9 100644 --- a/scripts/ci/libraries/_docker_engine_resources.sh +++ b/scripts/ci/libraries/_docker_engine_resources.sh @@ -45,6 +45,6 @@ function docker_engine_resources::get_available_memory_in_docker() { function docker_engine_resources::check_all_resources() { docker_v run -t "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/bin/bash" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ -c "/opt/airflow/scripts/in_container/run_resource_check.sh" } diff --git a/scripts/ci/libraries/_push_pull_remove_images.sh b/scripts/ci/libraries/_push_pull_remove_images.sh index 51611ae235434..9741e32e81771 100644 --- a/scripts/ci/libraries/_push_pull_remove_images.sh +++ b/scripts/ci/libraries/_push_pull_remove_images.sh @@ -108,26 +108,15 @@ function push_pull_remove_images::pull_base_python_image() { echo -n "Docker pull base python image. Upgrade to newer deps: ${UPGRADE_TO_NEWER_DEPENDENCIES} " > "${DETECTED_TERMINAL}" fi - if [[ ${GITHUB_REGISTRY_PULL_IMAGE_TAG} != "latest" ]]; then - push_pull_remove_images::pull_image_if_not_present_or_forced \ - "${AIRFLOW_PYTHON_BASE_IMAGE}${GITHUB_REGISTRY_PULL_IMAGE_TAG}" - if [[ ${CHECK_IF_BASE_PYTHON_IMAGE_UPDATED} == "true" ]] ; then - echo - echo "${COLOR_RED}ERROR: You cannot check for base python image if you pull specific tag: ${GITHUB_REGISTRY_PULL_IMAGE_TAG}.${COLOR_RESET}" - echo - return 1 - fi - else - set +e - push_pull_remove_images::pull_image_if_not_present_or_forced "${AIRFLOW_PYTHON_BASE_IMAGE}" - local res="$?" 
- set -e - if [[ ${CHECK_IF_BASE_PYTHON_IMAGE_UPDATED} == "true" || ${res} != "0" ]] ; then - # Rebuild the base python image using DockerHub - either when we explicitly want it - # or when there is no image available yet in ghcr.io (usually when you build it for the - # first time in your repository - push_pull_remove_images::check_and_rebuild_python_base_image_if_needed - fi + set +e + push_pull_remove_images::pull_image_if_not_present_or_forced "${AIRFLOW_PYTHON_BASE_IMAGE}" + local res="$?" + set -e + if [[ ${CHECK_IF_BASE_PYTHON_IMAGE_UPDATED} == "true" || ${res} != "0" ]] ; then + # Rebuild the base python image using DockerHub - either when we explicitly want it + # or when there is no image available yet in ghcr.io (usually when you build it for the + # first time in your repository + push_pull_remove_images::check_and_rebuild_python_base_image_if_needed fi } @@ -144,8 +133,7 @@ function push_pull_remove_images::pull_ci_images_if_needed() { fi if [[ "${DOCKER_CACHE}" == "pulled" ]]; then set +e - push_pull_remove_images::pull_image_if_not_present_or_forced \ - "${AIRFLOW_CI_IMAGE}:${GITHUB_REGISTRY_PULL_IMAGE_TAG}" + push_pull_remove_images::pull_image_if_not_present_or_forced "${AIRFLOW_CI_IMAGE_WITH_TAG}" local res="$?" set -e if [[ ${res} != "0" ]]; then @@ -228,18 +216,19 @@ function push_pull_remove_images::push_python_image_to_github() { # Pushes Ci images and their tags to registry in GitHub function push_pull_remove_images::push_ci_images_to_github() { + start_end::group_start "Push image" if [[ "${PUSH_PYTHON_BASE_IMAGE=}" != "false" ]]; then push_pull_remove_images::push_python_image_to_github fi - local airflow_ci_tagged_image="${AIRFLOW_CI_IMAGE}:${GITHUB_REGISTRY_PUSH_IMAGE_TAG}" - docker_v tag "${AIRFLOW_CI_IMAGE}" "${airflow_ci_tagged_image}" - push_pull_remove_images::push_image_with_retries "${airflow_ci_tagged_image}" + docker_v tag "${AIRFLOW_CI_IMAGE}" "${AIRFLOW_CI_IMAGE}:${GITHUB_REGISTRY_PUSH_IMAGE_TAG}" + push_pull_remove_images::push_image_with_retries "${AIRFLOW_CI_IMAGE}:${GITHUB_REGISTRY_PUSH_IMAGE_TAG}" # Also push ci manifest image if GITHUB_REGISTRY_PUSH_IMAGE_TAG is "latest" if [[ ${GITHUB_REGISTRY_PUSH_IMAGE_TAG} == "latest" ]]; then local airflow_ci_manifest_tagged_image="${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}:latest" docker_v tag "${AIRFLOW_CI_LOCAL_MANIFEST_IMAGE}" "${airflow_ci_manifest_tagged_image}" push_pull_remove_images::push_image_with_retries "${airflow_ci_manifest_tagged_image}" fi + start_end::group_end } # Pushes PROD image to registry in GitHub diff --git a/scripts/ci/libraries/_runs.sh b/scripts/ci/libraries/_runs.sh index 16cb1749fca77..84b31f4ef84ae 100644 --- a/scripts/ci/libraries/_runs.sh +++ b/scripts/ci/libraries/_runs.sh @@ -23,7 +23,7 @@ function runs::run_docs() { -e "GITHUB_ACTIONS=${GITHUB_ACTIONS="false"}" \ --entrypoint "/usr/local/bin/dumb-init" \ --pull never \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" "/opt/airflow/scripts/in_container/run_docs_build.sh" "${@}" start_end::group_end } @@ -34,7 +34,7 @@ function runs::run_generate_constraints() { docker_v run "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/usr/local/bin/dumb-init" \ --pull never \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" "/opt/airflow/scripts/in_container/run_generate_constraints.sh" start_end::group_end } @@ -47,7 +47,7 @@ function runs::run_prepare_airflow_packages() { -t \ -v "${AIRFLOW_SOURCES}:/opt/airflow" \ --pull never \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" 
"/opt/airflow/scripts/in_container/run_prepare_airflow_packages.sh" start_end::group_end } @@ -61,7 +61,7 @@ function runs::run_prepare_provider_packages() { -t \ -v "${AIRFLOW_SOURCES}:/opt/airflow" \ --pull never \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" "/opt/airflow/scripts/in_container/run_prepare_provider_packages.sh" "${@}" } @@ -80,6 +80,6 @@ function runs::run_prepare_provider_documentation() { -e "GENERATE_PROVIDERS_ISSUE" \ -e "GITHUB_TOKEN" \ --pull never \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" "/opt/airflow/scripts/in_container/run_prepare_provider_documentation.sh" "${@}" } diff --git a/scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh b/scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh index 51575ebe80cdf..b8228851f4dd9 100755 --- a/scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh +++ b/scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh @@ -38,7 +38,7 @@ function run_test_package_import_all_classes() { -v "${AIRFLOW_SOURCES}/empty:/opt/airflow/airflow:cached" \ -v "${AIRFLOW_SOURCES}/scripts/in_container:/opt/airflow/scripts/in_container:cached" \ -v "${AIRFLOW_SOURCES}/dev/import_all_classes.py:/opt/airflow/dev/import_all_classes.py:cached" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" "/opt/airflow/scripts/in_container/run_install_and_test_provider_packages.sh" } diff --git a/scripts/ci/static_checks/in_container_bats_tests.sh b/scripts/ci/static_checks/in_container_bats_tests.sh index fa4eacd86ab21..4778c6012f113 100644 --- a/scripts/ci/static_checks/in_container_bats_tests.sh +++ b/scripts/ci/static_checks/in_container_bats_tests.sh @@ -23,13 +23,13 @@ function run_in_container_bats_tests() { docker_v run "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/opt/bats/bin/bats" \ "-v" "$(pwd):/airflow" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ --tap "tests/bats/in_container/" else docker_v run "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/opt/bats/bin/bats" \ "-v" "$(pwd):/airflow" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ --tap "${@}" fi } diff --git a/scripts/ci/static_checks/mypy.sh b/scripts/ci/static_checks/mypy.sh index 7ebbd6340ff06..7b5879e1e3d13 100755 --- a/scripts/ci/static_checks/mypy.sh +++ b/scripts/ci/static_checks/mypy.sh @@ -29,7 +29,7 @@ function run_mypy() { docker_v run "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/usr/local/bin/dumb-init" \ "-v" "${AIRFLOW_SOURCES}/.mypy_cache:/opt/airflow/.mypy_cache" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ "--" "/opt/airflow/scripts/in_container/run_mypy.sh" "${files[@]}" } diff --git a/scripts/ci/static_checks/ui_lint.sh b/scripts/ci/static_checks/ui_lint.sh index 3722e54b93d42..d5f722c1354b5 100755 --- a/scripts/ci/static_checks/ui_lint.sh +++ b/scripts/ci/static_checks/ui_lint.sh @@ -27,5 +27,5 @@ build_images::rebuild_ci_image_if_needed docker run "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/bin/bash" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ -c 'cd airflow/ui && yarn --frozen-lockfile --non-interactive && yarn run lint "${@}"' "${@#airflow/ui/}" diff --git a/scripts/ci/static_checks/www_lint.sh b/scripts/ci/static_checks/www_lint.sh index 7ae56204274a5..fec51516d5a24 100755 --- a/scripts/ci/static_checks/www_lint.sh +++ b/scripts/ci/static_checks/www_lint.sh @@ -27,5 +27,5 @@ build_images::rebuild_ci_image_if_needed docker run "${EXTRA_DOCKER_FLAGS[@]}" \ --entrypoint "/bin/bash" \ - 
"${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ -c 'cd airflow/www && yarn --frozen-lockfile --non-interactive && yarn run lint "${@}"' "${@#airflow/www/static/js/}" diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index 615ce21437664..d97310ac7b260 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -170,7 +170,7 @@ function run_airflow_testing_in_docker() { echo "${COLOR_BLUE}*${COLOR_RESET}" echo "${COLOR_BLUE}***********************************************************************************************${COLOR_RESET}" echo - curl "${constraints_url}" | grep -ve "^#" | diff --color=always - <( docker run --entrypoint /bin/bash "${AIRFLOW_CI_IMAGE}" -c 'pip freeze' \ + curl "${constraints_url}" | grep -ve "^#" | diff --color=always - <( docker run --entrypoint /bin/bash "${AIRFLOW_CI_IMAGE_WITH_TAG}" -c 'pip freeze' \ | sort | grep -v "apache_airflow" | grep -v "@" | grep -v "/opt/airflow" | grep -ve "^#") echo fi diff --git a/scripts/ci/tools/fix_ownership.sh b/scripts/ci/tools/fix_ownership.sh index de1562122a779..1dc7b92e4ec56 100755 --- a/scripts/ci/tools/fix_ownership.sh +++ b/scripts/ci/tools/fix_ownership.sh @@ -33,12 +33,12 @@ sanity_checks::sanitize_mounted_files read -r -a EXTRA_DOCKER_FLAGS <<<"$(local_mounts::convert_local_mounts_to_docker_params)" -if docker image inspect "${AIRFLOW_CI_IMAGE}" >/dev/null 2>&1; then +if docker image inspect "${AIRFLOW_CI_IMAGE_WITH_TAG}" >/dev/null 2>&1; then docker_v run --entrypoint /bin/bash "${EXTRA_DOCKER_FLAGS[@]}" \ --rm \ --env-file "${AIRFLOW_SOURCES}/scripts/ci/docker-compose/_docker.env" \ - "${AIRFLOW_CI_IMAGE}" \ + "${AIRFLOW_CI_IMAGE_WITH_TAG}" \ -c /opt/airflow/scripts/in_container/run_fix_ownership.sh || true else - echo "Skip fixing ownership as seems that you do not have the ${AIRFLOW_CI_IMAGE} image yet" + echo "Skip fixing ownership as seems that you do not have the ${AIRFLOW_CI_IMAGE_WITH_TAG} image yet" fi From 31eea570f58fe57a5714ab7a63f4c40fc532fdcf Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 14 Oct 2021 23:41:58 +0200 Subject: [PATCH 031/250] Add decription on how you can customize image entrypoint (#18915) (cherry picked from commit cc627b3d8aab2b7282821280ef9c8912e1e0fab2) --- docs/docker-stack/entrypoint.rst | 66 -------------------------------- 1 file changed, 66 deletions(-) diff --git a/docs/docker-stack/entrypoint.rst b/docs/docker-stack/entrypoint.rst index 542e7e54884ac..7cd3a15949f0a 100644 --- a/docs/docker-stack/entrypoint.rst +++ b/docs/docker-stack/entrypoint.rst @@ -194,72 +194,6 @@ If there are any other arguments - they are simply passed to the "airflow" comma optional arguments: -h, --help show this help message and exit -Execute custom code before the Airflow entrypoint -------------------------------------------------- - -If you want to execute some custom code before Airflow's entrypoint you can by using -a custom script and calling Airflow's entrypoint as the -last ``exec`` instruction in your custom one. However you have to remember to use ``dumb-init`` in the same -way as it is used with Airflow's entrypoint, otherwise you might have problems with proper signal -propagation (See the next chapter). - - -.. 
code-block:: Dockerfile - - FROM airflow::2.3.0.dev0 - COPY my_entrypoint.sh / - ENTRYPOINT ["/usr/bin/dumb-init", "--", "/my_entrypoint.sh"] - -Your entrypoint might for example modify or add variables on the fly. For example the below -entrypoint sets max count of DB checks from the first parameter passed as parameter of the image -execution (A bit useless example but should give the reader an example of how you could use it). - -.. code-block:: bash - - #!/bin/bash - export CONNECTION_CHECK_MAX_COUNT=${1} - shift - exec /entrypoint "${@}" - -Make sure Airflow's entrypoint is run with ``exec /entrypoint "${@}"`` as the last command in your -custom entrypoint. This way signals will be properly propagated and arguments will be passed -to the entrypoint as usual (you can use ``shift`` as above if you need to pass some extra -arguments. Note that passing secret values this way or storing secrets inside the image is a bad -idea from security point of view - as both image and parameters to run the image with are accessible -to anyone who has access to logs of your Kubernetes or image registry. - -Also be aware that code executed before Airflow's entrypoint should not create any files or -directories inside the container and everything might not work the same way when it is executed. -Before Airflow entrypoint is executed, the following functionalities are not available: - -* umask is not set properly to allow ``group`` write access -* user is not yet created in ``/etc/passwd`` if an arbitrary user is used to run the image -* the database and brokers might not be available yet - -Adding custom image behaviour ------------------------------ - -The Airflow image executes a lot of steps in the entrypoint, and sets the right environment, but -you might want to run additional code after the entrypoint creates the user, sets the umask, sets -variables and checks that database is running. - -Rather than running regular commands - ``scheduler``, ``webserver`` you can run *custom* script that -you can embed into the image. You can even execute the usual components of airflow - -``scheduler``, ``webserver`` in your custom script when you finish your custom setup. -Similarly to custom entrypoint, it can be added to the image by extending it. - -.. code-block:: Dockerfile - - FROM airflow::2.3.0.dev0 - COPY my_after_entrypoint_script.sh / - - -And then you can run this script by running the command: - -.. code-block:: bash - - docker run -it apache/airflow:2.2.4-python3.6 bash -c "/my_after_entrypoint_script.sh" - Signal propagation ------------------ From ecdadf5ee5b36d324b4833e95ad2d992a81499a1 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Fri, 15 Oct 2021 01:32:45 -0600 Subject: [PATCH 032/250] CI: Increase parallel test timeout for Helm Chart tests (#18993) The helm tests are now regularly taking right around 35 minutes on public GitHub Actions workers, so we will increase the timeout. 
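The timeout is enforced through GNU parallel's semaphore mode; a minimal sketch of that pattern, with placeholder semaphore name, job count, job script and test types, looks roughly like this:

# Minimal sketch of the GNU parallel semaphore pattern used by this script;
# names and values below are placeholders. --timeout makes parallel terminate
# (TERM, then KILL) any job that runs longer than the limit, here 2700 seconds
# (the 45 minutes referred to above).
SEMAPHORE_NAME="example-tests"
for TEST_TYPE in Core Providers Helm; do
    parallel --ungroup --bg --semaphore --semaphorename "${SEMAPHORE_NAME}" \
        --jobs 2 --timeout 2700 \
        ./run_single_test_type.sh "${TEST_TYPE}"
done
# Wait until every job started under the semaphore has finished
parallel --semaphore --semaphorename "${SEMAPHORE_NAME}" --wait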
(cherry picked from commit 3545a2c4f399b02dfcd03b1ecdd965f480cb67e3) --- scripts/ci/testing/ci_run_airflow_testing.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index 73c3a813564c0..c65a0580a62db 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -45,10 +45,10 @@ function run_test_types_in_parallel() { mkdir -p "${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}" export JOB_LOG="${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}/stdout" export PARALLEL_JOB_STATUS="${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}/status" - # Each test job will get SIGTERM followed by SIGTERM 200ms later and SIGKILL 200ms later after 35 mins + # Each test job will get SIGTERM followed by SIGTERM 200ms later and SIGKILL 200ms later after 45 mins # shellcheck disable=SC2086 parallel --ungroup --bg --semaphore --semaphorename "${SEMAPHORE_NAME}" \ - --jobs "${MAX_PARALLEL_TEST_JOBS}" --timeout 2100 \ + --jobs "${MAX_PARALLEL_TEST_JOBS}" --timeout 2700 \ "$( dirname "${BASH_SOURCE[0]}" )/ci_run_single_airflow_test_in_docker.sh" "${@}" >"${JOB_LOG}" 2>&1 done parallel --semaphore --semaphorename "${SEMAPHORE_NAME}" --wait From 7b1fb7eb596e08343b05a2a175ae4f772371555f Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sat, 16 Oct 2021 20:43:31 +0200 Subject: [PATCH 033/250] Skip updating constraints when only datetime changes (#19023) After adding datetime to generated constraints it became possible that constraints remained the same but generated constraint files changed (because of the datetime). It was a rare occurrence because we rarely had an "all-green" build in `main`, but since we managed to fix most of the flaky tests it became more probable (and happened already several times). This PR ignores comment lines when comparing generated constraints, and commits should only happen if something other than the generated comment changes. (cherry picked from commit 603a3b28c3733415a9eeb7c8271f122ec92b08d5) --- scripts/ci/constraints/ci_commit_constraints.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/ci/constraints/ci_commit_constraints.sh b/scripts/ci/constraints/ci_commit_constraints.sh index 58afbd94ab795..e3da5e20e26df 100755 --- a/scripts/ci/constraints/ci_commit_constraints.sh +++ b/scripts/ci/constraints/ci_commit_constraints.sh @@ -22,7 +22,8 @@ cp -v ./files/constraints-*/constraints*.txt repo/ cd repo || exit 1 git config --local user.email "dev@airflow.apache.org" git config --local user.name "Automated GitHub Actions commit" -git diff --color --exit-code || git commit --all --message "Updating constraints. Build id:${CI_BUILD_ID} +git diff --color --exit-code --ignore-matching-lines="^#.*" || \ +git commit --all --message "Updating constraints. 
Build id:${CI_BUILD_ID} This update in constraints is automatically committed by the CI 'constraints-push' step based on HEAD of '${CI_REF}' in '${CI_TARGET_REPO}' From dd1a3fa5c3888bdb829eed0078b578be5d648535 Mon Sep 17 00:00:00 2001 From: Oyinkansola Awosan Date: Wed, 20 Oct 2021 19:53:33 +0100 Subject: [PATCH 034/250] Fix wrong commands in docs/breeze (cherry picked from commit 326e2b1272688e55fd76a85de399d0672aabe92c) --- CONTRIBUTORS_QUICK_START.rst | 4 ++-- INSTALL | 4 ++-- breeze | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTORS_QUICK_START.rst b/CONTRIBUTORS_QUICK_START.rst index 895458c6b517a..545f7cf9cf47e 100644 --- a/CONTRIBUTORS_QUICK_START.rst +++ b/CONTRIBUTORS_QUICK_START.rst @@ -136,8 +136,8 @@ Pyenv and setting up virtual-env libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ xz-utils tk-dev libffi-dev liblzma-dev python-openssl git - $ sudo apt install build-essentials python3.6-dev python3.7-dev python3.8-dev python3.9-dev python-dev openssl \ - sqlite sqlite-dev default-libmysqlclient-dev libmysqld-dev postgresql + $ sudo apt install openssl \ + sqlite default-libmysqlclient-dev libmysqlclient-dev postgresql 2. Install pyenv diff --git a/INSTALL b/INSTALL index cc2025f578588..96edb059faea7 100644 --- a/INSTALL +++ b/INSTALL @@ -7,8 +7,8 @@ systems/prerequisites are known to work: Linux (Debian Buster and Linux Mint Tricia): -sudo apt install build-essentials python3.6-dev python3.7-dev python-dev openssl \ - sqlite sqlite-dev default-libmysqlclient-dev libmysqld-dev postgresq +sudo apt install build-essential python3-dev libsqlite3-dev openssl \ + sqlite default-libmysqlclient-dev libmysqlclient-dev postgresql MacOS (Mojave/Catalina): diff --git a/breeze b/breeze index 87701516452d8..32653a95d2b66 100755 --- a/breeze +++ b/breeze @@ -273,8 +273,8 @@ function breeze::initialize_virtualenv() { echo " export LDFLAGS=\"-L/usr/local/opt/openssl/lib\"" echo " export CPPFLAGS=\"-I/usr/local/opt/openssl/include\"" else - echo " sudo apt install build-essentials python3.6-dev python3.7-dev python3.8-dev python-dev openssl \\" - echo " sqlite sqlite-dev default-libmysqlclient-dev libmysqld-dev postgresql" + echo " sudo apt install build-essential python3-dev libsqlite3-dev openssl \\" + echo " sqlite default-libmysqlclient-dev libmysqlclient-dev postgresql" fi echo echo "#######################################################################" From c8f492bfd81f694c0c4b66f73bcda56c1f92a4f5 Mon Sep 17 00:00:00 2001 From: Niko <65743084+o-nikolas@users.noreply.github.com> Date: Fri, 22 Oct 2021 17:30:33 -0700 Subject: [PATCH 035/250] Allow specifying extras when using breeze initialize_local_virtualenv (#19178) (cherry picked from commit f47d7b95fe68fe4c0c9db6503ddbf2ed13ea43dd) --- LOCAL_VIRTUALENV.rst | 7 +++++++ breeze | 5 ++++- scripts/ci/libraries/_initialization.sh | 6 ++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/LOCAL_VIRTUALENV.rst b/LOCAL_VIRTUALENV.rst index b389442170a7c..a53f9bae4e111 100644 --- a/LOCAL_VIRTUALENV.rst +++ b/LOCAL_VIRTUALENV.rst @@ -205,6 +205,13 @@ Activate your virtualenv, e.g. by using ``workon``, and once you are in it, run: ./breeze initialize-local-virtualenv +By default Breeze installs the ``devel`` extra only. You can optionally control which extras are installed by exporting ``VIRTUALENV_EXTRAS`` before calling Breeze: + +.. code-block:: bash + + export VIRTUALENV_EXTRAS="devel,google,postgres" + ./breeze initialize-local-virtualenv + 5. 
(optionally) run yarn build if you plan to run the webserver .. code-block:: bash diff --git a/breeze b/breeze index 32653a95d2b66..99b076f8bcb22 100755 --- a/breeze +++ b/breeze @@ -230,6 +230,7 @@ function breeze::setup_default_breeze_constants() { # PYTHON_MAJOR_MINOR_VERSION # AIRFLOW_HOME_DIR # AIRFLOW_SOURCES +# VIRTUALENV_EXTRAS # DEFAULT_CONSTRAINTS_BRANCH # OSTYPE # @@ -252,13 +253,15 @@ function breeze::initialize_virtualenv() { echo echo "Initializing the virtualenv: $(command -v python)!" echo + echo "Extras to be installed: ${VIRTUALENV_EXTRAS}" + echo echo "This will wipe out ${AIRFLOW_HOME_DIR} and reset all the databases!" echo "${AIRFLOW_SOURCES}/confirm" "Proceeding with the initialization" echo pushd "${AIRFLOW_SOURCES}" >/dev/null 2>&1 || exit 1 set +e - pip install -e ".[devel]" \ + pip install -e ".[${VIRTUALENV_EXTRAS}]" \ --constraint "https://raw.githubusercontent.com/${CONSTRAINTS_GITHUB_REPOSITORY}/${DEFAULT_CONSTRAINTS_BRANCH}/constraints-source-providers-${PYTHON_MAJOR_MINOR_VERSION}.txt" res=$? set -e diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index cdfbfcdbab1ad..9b320299f4ffc 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -564,6 +564,11 @@ function initialization::initialize_kubernetes_variables() { readonly API_SERVER_PORT } +function initialization::initialize_virtualenv_variables() { + # The extras to install when initializing a virtual env with breeze + export VIRTUALENV_EXTRAS=${VIRTUALENV_EXTRAS:="devel"} +} + function initialization::initialize_git_variables() { # SHA of the commit for the current sources COMMIT_SHA="$(git rev-parse HEAD 2>/dev/null || echo "Unknown")" @@ -638,6 +643,7 @@ function initialization::initialize_common_environment() { initialization::initialize_image_build_variables initialization::initialize_provider_package_building initialization::initialize_kubernetes_variables + initialization::initialize_virtualenv_variables initialization::initialize_git_variables initialization::initialize_github_variables initialization::initialize_test_variables From 80f4e5f8b374efb6454a8a659ee4a3a277b3d3d1 Mon Sep 17 00:00:00 2001 From: Niko <65743084+o-nikolas@users.noreply.github.com> Date: Sat, 23 Oct 2021 02:40:59 -0700 Subject: [PATCH 036/250] Fix breeze docker version parsing (#19182) (cherry picked from commit d70909422ec7ec2f6e5311d26c1b15e31c3bc188) --- scripts/ci/libraries/_initialization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 9b320299f4ffc..36cc29ed50fd8 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -935,7 +935,7 @@ function initialization::ga_env() { function initialization::ver() { # convert SemVer number to comparable string (strips pre-release version) # shellcheck disable=SC2086,SC2183 - printf "%03d%03d%03d%.0s" ${1//[.-]/} + printf "%03d%03d%03d%.0s" ${1//[.-]/ } } function initialization::check_docker_version() { From d5ee47150366c96a798296e6a7239896025fa065 Mon Sep 17 00:00:00 2001 From: Bowrna Date: Fri, 29 Oct 2021 00:24:23 +0530 Subject: [PATCH 037/250] pyenv related docs added, warning message in breeze initialize-local-virtualenv command (#19100) (cherry picked from commit b5df0f96c07504489850f8b3d99c9d4e320e6935) --- CONTRIBUTORS_QUICK_START.rst | 2 +- breeze | 6 ++++++ docs/spelling_wordlist.txt | 1 + 3 files changed, 8 insertions(+), 1 
deletion(-) diff --git a/CONTRIBUTORS_QUICK_START.rst b/CONTRIBUTORS_QUICK_START.rst index 545f7cf9cf47e..62465420d4861 100644 --- a/CONTRIBUTORS_QUICK_START.rst +++ b/CONTRIBUTORS_QUICK_START.rst @@ -145,7 +145,7 @@ Pyenv and setting up virtual-env $ curl https://pyenv.run | bash -3. Add the lines suggested at the end of installation to ~/.bashrc +3. Configure your shell's environment for Pyenv as suggested in Pyenv `README `_ 4. Restart your shell so the path changes take effect and verifying installation diff --git a/breeze b/breeze index 99b076f8bcb22..72ac0c4552a20 100755 --- a/breeze +++ b/breeze @@ -286,6 +286,12 @@ function breeze::initialize_virtualenv() { echo echo "Wiping and recreating ${AIRFLOW_HOME_DIR}" echo + if [[ "${AIRFLOW_SOURCES}" == "${AIRFLOW_HOME_DIR}" ]]; then + echo "AIRFLOW_HOME and Source code for Apache Airflow resides in the same path ${AIRFLOW_HOME_DIR}" + echo "When running this command it will delete all the files in the path ${AIRFLOW_HOME_DIR} to clear dynamic files like config/logs/db" + echo "Move your source code for Apache Airflow to different folder to avoid deletion" + exit 1 + fi rm -rvf "${AIRFLOW_HOME_DIR}" mkdir -p "${AIRFLOW_HOME_DIR}" echo diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index fe949728a937b..91d0471faa871 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -1293,6 +1293,7 @@ sudo sudoers summarization superclass +sur svg swp symlink From 6f6e9c2b76b1db79d2fc968bce71e14d6ee612a4 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Fri, 29 Oct 2021 18:53:49 +0800 Subject: [PATCH 038/250] Skip triggerer in 'breeze start-airflow' if on 3.6 (#19305) The triggerer does not work on 3.6, so there's no point showing a dead pane. (cherry picked from commit 8e124aed600eb82a00f3f7fdde15d1620732012b) --- scripts/in_container/bin/run_tmux | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/in_container/bin/run_tmux b/scripts/in_container/bin/run_tmux index 38bdd44434d56..7efe863a2ed62 100755 --- a/scripts/in_container/bin/run_tmux +++ b/scripts/in_container/bin/run_tmux @@ -57,9 +57,11 @@ if [[ -z "${USE_AIRFLOW_VERSION=}" ]]; then tmux send-keys 'cd /opt/airflow/airflow/www/; yarn install --frozen-lockfile; yarn dev' C-m fi -tmux select-pane -t 0 -tmux split-window -h -tmux send-keys 'airflow triggerer' C-m +if python -c 'import sys; sys.exit(sys.version_info < (3, 7))'; then + tmux select-pane -t 0 + tmux split-window -h + tmux send-keys 'airflow triggerer' C-m +fi # Attach Session, on the Main window tmux select-pane -t 0 From 70c58699f17612ecb02ae713d92a58e2748827cb Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 1 Nov 2021 12:17:06 +0100 Subject: [PATCH 039/250] Temporarily remove mypy checks to stop PRs from failing (#19345) After we moved to Python 3.7 as default, it had a ripple effect that MyPy checks started failing. We aim to fix it permanently in #19334 but this needs a bit more changes, so for the moment we skip the checks. 
(cherry picked from commit 1f0db2885c338d51fe9b8d39b56d74cc376a6c8f) --- .pre-commit-config.yaml | 2 +- scripts/ci/pre_commit/pre_commit_flake8.sh | 2 +- scripts/ci/pre_commit/pre_commit_mypy.sh | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 08d60abc3d517..ad985d108315b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -675,7 +675,7 @@ repos: # The below pre-commits are those requiring CI image to be built - id: build name: Check if image build is needed - entry: ./scripts/ci/pre_commit/pre_commit_ci_build.sh 3.6 false + entry: ./scripts/ci/pre_commit/pre_commit_ci_build.sh 3.7 false language: system always_run: true pass_filenames: false diff --git a/scripts/ci/pre_commit/pre_commit_flake8.sh b/scripts/ci/pre_commit/pre_commit_flake8.sh index a2fe9b907f992..cbef9d08bce23 100755 --- a/scripts/ci/pre_commit/pre_commit_flake8.sh +++ b/scripts/ci/pre_commit/pre_commit_flake8.sh @@ -15,7 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -export PYTHON_MAJOR_MINOR_VERSION="3.6" +export PYTHON_MAJOR_MINOR_VERSION="3.7" export FORCE_ANSWER_TO_QUESTIONS=${FORCE_ANSWER_TO_QUESTIONS:="quit"} export REMEMBER_LAST_ANSWER="true" export PRINT_INFO_FROM_SCRIPTS="false" diff --git a/scripts/ci/pre_commit/pre_commit_mypy.sh b/scripts/ci/pre_commit/pre_commit_mypy.sh index f202f8a7ee8bb..7e2b4f6223f77 100755 --- a/scripts/ci/pre_commit/pre_commit_mypy.sh +++ b/scripts/ci/pre_commit/pre_commit_mypy.sh @@ -15,10 +15,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -export PYTHON_MAJOR_MINOR_VERSION="3.6" +export PYTHON_MAJOR_MINOR_VERSION="3.7" export FORCE_ANSWER_TO_QUESTIONS=${FORCE_ANSWER_TO_QUESTIONS:="quit"} export REMEMBER_LAST_ANSWER="true" export PRINT_INFO_FROM_SCRIPTS="false" +# Temporarily remove mypy checks until we fix them for Python 3.7 +exit 0 + # shellcheck source=scripts/ci/static_checks/mypy.sh . "$( dirname "${BASH_SOURCE[0]}" )/../static_checks/mypy.sh" "${@}" From e166a371f81bf5c96e04798d5b5fb830d9d94a26 Mon Sep 17 00:00:00 2001 From: Niko <65743084+o-nikolas@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:22:51 -0700 Subject: [PATCH 040/250] Enable mouse mode by default in start_airflow tmux session (#19325) (cherry picked from commit 42997514cb8cc40b16520ea2269f719c98ababac) --- scripts/in_container/bin/run_tmux | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/in_container/bin/run_tmux b/scripts/in_container/bin/run_tmux index 7efe863a2ed62..f405b6d9ecf48 100755 --- a/scripts/in_container/bin/run_tmux +++ b/scripts/in_container/bin/run_tmux @@ -39,6 +39,11 @@ export TMUX_SESSION="Airflow" # Start New Session with our name tmux new-session -d -s "${TMUX_SESSION}" +# Enable mouse interaction with tmux. This allows selecting between the panes +# by clicking with the mouse and also allows scrolling back through terminal +# output with the mouse wheel. 
+tmux set mouse on + # Name first Pane and start bash tmux rename-window -t 0 'Main' tmux send-keys -t 'Main' 'bash' C-m 'clear' C-m From cfa1bae5396609980bb974fc4f06c1047595440a Mon Sep 17 00:00:00 2001 From: Melodie Ezeani <78446940+Melodie97@users.noreply.github.com> Date: Tue, 2 Nov 2021 17:31:29 +0100 Subject: [PATCH 041/250] Make scripts/in_container/check_environment.sh Google Shell Guide Compliant (#19350) (cherry picked from commit 7b293c548a92d2cd0eea4f9571c007057aa06482) --- scripts/in_container/check_environment.sh | 58 ++++++++++++----------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/scripts/in_container/check_environment.sh b/scripts/in_container/check_environment.sh index 6bbd702a4bcd3..ca1a365bd4b89 100755 --- a/scripts/in_container/check_environment.sh +++ b/scripts/in_container/check_environment.sh @@ -40,67 +40,69 @@ function run_nc() { } function check_service { - LABEL=$1 - CALL=$2 - MAX_CHECK=${3:=1} + local label=$1 + local call=$2 + local max_check=${3:=1} - echo -n "${LABEL}: " + echo -n "${label}: " while true do set +e - LAST_CHECK_RESULT=$(eval "${CALL}" 2>&1) - RES=$? + local last_check_result + last_check_result=$(eval "${call}" 2>&1) + local res=$? set -e - if [[ ${RES} == 0 ]]; then + if [[ ${res} == 0 ]]; then echo "${COLOR_GREEN}OK. ${COLOR_RESET}" break else echo -n "." - MAX_CHECK=$((MAX_CHECK-1)) + max_check=$((max_check-1)) fi - if [[ ${MAX_CHECK} == 0 ]]; then + if [[ ${max_check} == 0 ]]; then echo "${COLOR_RED}ERROR: Maximum number of retries while checking service. Exiting ${COLOR_RESET}" break else sleep 1 fi done - if [[ ${RES} != 0 ]]; then + if [[ ${res} != 0 ]]; then echo "Service could not be started!" echo - echo "$ ${CALL}" - echo "${LAST_CHECK_RESULT}" + echo "$ ${call}" + echo "${last_check_result}" echo - EXIT_CODE=${RES} + EXIT_CODE=${res} fi } function check_integration { - INTEGRATION_LABEL=$1 - INTEGRATION_NAME=$2 - CALL=$3 - MAX_CHECK=${4:=1} - - ENV_VAR_NAME=INTEGRATION_${INTEGRATION_NAME^^} - if [[ ${!ENV_VAR_NAME:=} != "true" ]]; then - if [[ ! ${DISABLED_INTEGRATIONS} == *" ${INTEGRATION_NAME}"* ]]; then - DISABLED_INTEGRATIONS="${DISABLED_INTEGRATIONS} ${INTEGRATION_NAME}" + local integration_label=$1 + local integration_name=$2 + local call=$3 + local max_check=${4:=1} + + local env_var_name + env_var_name=INTEGRATION_${integration_name^^} + if [[ ${!env_var_name:=} != "true" ]]; then + if [[ ! 
${DISABLED_INTEGRATIONS} == *" ${integration_name}"* ]]; then + DISABLED_INTEGRATIONS="${DISABLED_INTEGRATIONS} ${integration_name}" fi return fi - check_service "${INTEGRATION_LABEL}" "${CALL}" "${MAX_CHECK}" + check_service "${integration_label}" "${call}" "${max_check}" } function check_db_backend { - MAX_CHECK=${1:=1} + local max_check=${1:=1} if [[ ${BACKEND} == "postgres" ]]; then - check_service "PostgreSQL" "run_nc postgres 5432" "${MAX_CHECK}" + check_service "PostgreSQL" "run_nc postgres 5432" "${max_check}" elif [[ ${BACKEND} == "mysql" ]]; then - check_service "MySQL" "run_nc mysql 3306" "${MAX_CHECK}" + check_service "MySQL" "run_nc mysql 3306" "${max_check}" elif [[ ${BACKEND} == "mssql" ]]; then - check_service "MSSQL" "run_nc mssql 1433" "${MAX_CHECK}" - check_service "MSSQL Login Check" "airflow db check" "${MAX_CHECK}" + check_service "MSSQL" "run_nc mssql 1433" "${max_check}" + check_service "MSSQL Login Check" "airflow db check" "${max_check}" elif [[ ${BACKEND} == "sqlite" ]]; then return else From 581e974f567db091ed308d16cccfb3fb20b4c25f Mon Sep 17 00:00:00 2001 From: Brian Bagdasarian Date: Thu, 4 Nov 2021 00:26:17 -0700 Subject: [PATCH 042/250] Fix --disable-mssql-client-installation error (#19295) * Fix --disable-mssql-client-installation error * Add flag to documentation * Fix documentation with hook (cherry picked from commit c6aed34feb9321757eeaaaf2f3c055d51786a4f9) --- BREEZE.rst | 12 ++++++++++++ breeze | 4 ++++ breeze-complete | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/BREEZE.rst b/BREEZE.rst index 8ab16cb721f1f..b10fd6c6cdc9c 100644 --- a/BREEZE.rst +++ b/BREEZE.rst @@ -1385,6 +1385,10 @@ This is the current syntax for `./breeze <./breeze>`_: Disables installation of the mysql client which might be problematic if you are building image in controlled environment. Only valid for production image. + --disable-mssql-client-installation + Disables installation of the mssql client which might be problematic if you are building + image in controlled environment. Only valid for production image. + --constraints-location Url to the constraints file. In case of the production image it can also be a path to the constraint file placed in 'docker-context-files' folder, in which case it has to be @@ -1984,6 +1988,10 @@ This is the current syntax for `./breeze <./breeze>`_: Disables installation of the mysql client which might be problematic if you are building image in controlled environment. Only valid for production image. + --disable-mssql-client-installation + Disables installation of the mssql client which might be problematic if you are building + image in controlled environment. Only valid for production image. + --constraints-location Url to the constraints file. In case of the production image it can also be a path to the constraint file placed in 'docker-context-files' folder, in which case it has to be @@ -2573,6 +2581,10 @@ This is the current syntax for `./breeze <./breeze>`_: Disables installation of the mysql client which might be problematic if you are building image in controlled environment. Only valid for production image. + --disable-mssql-client-installation + Disables installation of the mssql client which might be problematic if you are building + image in controlled environment. Only valid for production image. + --constraints-location Url to the constraints file. 
In case of the production image it can also be a path to the constraint file placed in 'docker-context-files' folder, in which case it has to be diff --git a/breeze b/breeze index 72ac0c4552a20..dd72e6e1f1aaf 100755 --- a/breeze +++ b/breeze @@ -2730,6 +2730,10 @@ Build options: Disables installation of the mysql client which might be problematic if you are building image in controlled environment. Only valid for production image. +--disable-mssql-client-installation + Disables installation of the mssql client which might be problematic if you are building + image in controlled environment. Only valid for production image. + --constraints-location Url to the constraints file. In case of the production image it can also be a path to the constraint file placed in 'docker-context-files' folder, in which case it has to be diff --git a/breeze-complete b/breeze-complete index 0b7e8abf9384e..1d673e04222b1 100644 --- a/breeze-complete +++ b/breeze-complete @@ -179,7 +179,7 @@ github-repository: github-image-id: generate-constraints-mode: postgres-version: mysql-version: mssql-version: version-suffix-for-pypi: version-suffix-for-svn: additional-extras: additional-python-deps: additional-dev-deps: additional-runtime-deps: image-tag: -disable-mysql-client-installation constraints-location: disable-pip-cache install-from-docker-context-files +disable-mysql-client-installation disable-mssql-client-installation constraints-location: disable-pip-cache install-from-docker-context-files additional-extras: additional-python-deps: disable-pypi-when-building skip-installing-airflow-providers-from-sources dev-apt-deps: additional-dev-apt-deps: dev-apt-command: additional-dev-apt-command: additional-dev-apt-env: runtime-apt-deps: additional-runtime-apt-deps: runtime-apt-command: additional-runtime-apt-command: additional-runtime-apt-env: From 74cba25b23d3f440d8052a35620fa5c4787267ad Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 7 Nov 2021 23:47:00 +0100 Subject: [PATCH 043/250] Updates version of airflow in docker examples (#19455) Unfortunately those cannot be updated at release time as they need released version of Airflow to run. (cherry picked from commit d451fc3a0945b0649c97b97f3af3fe5171a527cd) --- .../customizing/add-build-essential-custom.sh | 4 +++- .../docker-examples/customizing/custom-sources.sh | 6 ++++-- .../docker-examples/customizing/pypi-dev-runtime-deps.sh | 4 +++- .../docker-examples/customizing/pypi-extras-and-deps.sh | 4 +++- .../docker-examples/customizing/pypi-selected-version.sh | 4 +++- 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh b/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh index 05004593a67f3..7cf1dae5f42ec 100755 --- a/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh +++ b/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh @@ -22,9 +22,11 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] +export AIRFLOW_VERSION=2.2.1 + docker build . 
\ --build-arg PYTHON_BASE_IMAGE="python:3.6-slim-buster" \ - --build-arg AIRFLOW_VERSION="2.0.2" \ + --build-arg AIRFLOW_VERSION="${AIRFLOW_VERSION}" \ --build-arg ADDITIONAL_PYTHON_DEPS="mpi4py" \ --build-arg ADDITIONAL_DEV_APT_DEPS="libopenmpi-dev" \ --build-arg ADDITIONAL_RUNTIME_APT_DEPS="openmpi-common" \ diff --git a/docs/docker-stack/docker-examples/customizing/custom-sources.sh b/docs/docker-stack/docker-examples/customizing/custom-sources.sh index 8f087b3f3eb2d..2aecf4ec39bba 100755 --- a/docs/docker-stack/docker-examples/customizing/custom-sources.sh +++ b/docs/docker-stack/docker-examples/customizing/custom-sources.sh @@ -22,12 +22,14 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] +export AIRFLOW_VERSION=2.2.1 + docker build . -f Dockerfile \ --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg AIRFLOW_VERSION="2.0.2" \ + --build-arg AIRFLOW_VERSION="${AIRFLOW_VERSION}" \ --build-arg ADDITIONAL_AIRFLOW_EXTRAS="slack,odbc" \ --build-arg ADDITIONAL_PYTHON_DEPS=" \ - azure-storage-blob \ + azure-storage-blob<12.9.0 \ oauth2client \ beautifulsoup4 \ dateparser \ diff --git a/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh b/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh index 32bd1fcfac338..9ba93c1de7ca8 100755 --- a/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh +++ b/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh @@ -22,9 +22,11 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] +export AIRFLOW_VERSION=2.2.1 + docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.6-slim-buster" \ - --build-arg AIRFLOW_VERSION="2.0.2" \ + --build-arg AIRFLOW_VERSION="${AIRFLOW_VERSION}" \ --build-arg ADDITIONAL_AIRFLOW_EXTRAS="jdbc" \ --build-arg ADDITIONAL_PYTHON_DEPS="pandas" \ --build-arg ADDITIONAL_DEV_APT_DEPS="gcc g++" \ diff --git a/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh b/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh index 43731216ced90..804eacf0ae3e6 100755 --- a/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh +++ b/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh @@ -22,9 +22,11 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] +export AIRFLOW_VERSION=2.2.1 + docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.8-slim-buster" \ - --build-arg AIRFLOW_VERSION="2.0.2" \ + --build-arg AIRFLOW_VERSION="${AIRFLOW_VERSION}" \ --build-arg ADDITIONAL_AIRFLOW_EXTRAS="mssql,hdfs" \ --build-arg ADDITIONAL_PYTHON_DEPS="oauth2client" \ --tag "my-pypi-extras-and-deps:0.0.1" diff --git a/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh b/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh index c8e1f395ee6d6..77045a40016c7 100755 --- a/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh +++ b/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh @@ -22,9 +22,11 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] +export AIRFLOW_VERSION=2.2.1 + docker build . 
\ --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ - --build-arg AIRFLOW_VERSION="2.0.2" \ + --build-arg AIRFLOW_VERSION="${AIRFLOW_VERSION}" \ --tag "my-pypi-selected-version:0.0.1" # [END build] docker rmi --force "my-pypi-selected-version:0.0.1" From 9f0271b32f5ef03eadf8e56818219606b1b57486 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 9 Nov 2021 14:02:57 -0700 Subject: [PATCH 044/250] Fix docker "after entrypoint" custom script example (#19495) (cherry picked from commit 4d14885e84fa4e4545d398585dcb99b5fbd64247) --- docs/docker-stack/entrypoint.rst | 66 ++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/docs/docker-stack/entrypoint.rst b/docs/docker-stack/entrypoint.rst index 7cd3a15949f0a..a37c916e26f34 100644 --- a/docs/docker-stack/entrypoint.rst +++ b/docs/docker-stack/entrypoint.rst @@ -194,6 +194,72 @@ If there are any other arguments - they are simply passed to the "airflow" comma optional arguments: -h, --help show this help message and exit +Execute custom code before the Airflow entrypoint +------------------------------------------------- + +If you want to execute some custom code before Airflow's entrypoint you can by using +a custom script and calling Airflow's entrypoint as the +last ``exec`` instruction in your custom one. However you have to remember to use ``dumb-init`` in the same +way as it is used with Airflow's entrypoint, otherwise you might have problems with proper signal +propagation (See the next chapter). + + +.. code-block:: Dockerfile + + FROM airflow:2.3.0.dev0 + COPY my_entrypoint.sh / + ENTRYPOINT ["/usr/bin/dumb-init", "--", "/my_entrypoint.sh"] + +Your entrypoint might for example modify or add variables on the fly. For example the below +entrypoint sets max count of DB checks from the first parameter passed as parameter of the image +execution (A bit useless example but should give the reader an example of how you could use it). + +.. code-block:: bash + + #!/bin/bash + export CONNECTION_CHECK_MAX_COUNT=${1} + shift + exec /entrypoint "${@}" + +Make sure Airflow's entrypoint is run with ``exec /entrypoint "${@}"`` as the last command in your +custom entrypoint. This way signals will be properly propagated and arguments will be passed +to the entrypoint as usual (you can use ``shift`` as above if you need to pass some extra +arguments. Note that passing secret values this way or storing secrets inside the image is a bad +idea from security point of view - as both image and parameters to run the image with are accessible +to anyone who has access to logs of your Kubernetes or image registry. + +Also be aware that code executed before Airflow's entrypoint should not create any files or +directories inside the container and everything might not work the same way when it is executed. +Before Airflow entrypoint is executed, the following functionalities are not available: + +* umask is not set properly to allow ``group`` write access +* user is not yet created in ``/etc/passwd`` if an arbitrary user is used to run the image +* the database and brokers might not be available yet + +Adding custom image behaviour +----------------------------- + +The Airflow image executes a lot of steps in the entrypoint, and sets the right environment, but +you might want to run additional code after the entrypoint creates the user, sets the umask, sets +variables and checks that database is running. 
+ +Rather than running regular commands - ``scheduler``, ``webserver`` you can run *custom* script that +you can embed into the image. You can even execute the usual components of airflow - +``scheduler``, ``webserver`` in your custom script when you finish your custom setup. +Similarly to custom entrypoint, it can be added to the image by extending it. + +.. code-block:: Dockerfile + + FROM airflow:2.3.0.dev0 + COPY my_after_entrypoint_script.sh / + +Build your image and then you can run this script by running the command: + +.. code-block:: bash + + docker build . --tag my-image:0.0.1 + docker run -it my-image:0.0.1 bash -c "/my_after_entrypoint_script.sh" + Signal propagation ------------------ From 7f41a049737c864fb807593f330578df62094e43 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 10 Nov 2021 14:38:26 +0100 Subject: [PATCH 045/250] Optimizes running tests for public GitHub Runners. (#19512) We started to get more (and almost consistent) OOM failures when we tried to run all tests in parallel for the public GitHub runners. This could previously happen for Providers and Integration tests but it started to happen for Core tests. This PR optimizes this to also make Core tests sequentially run and refactors the code to make it much more readable and easy to understand what's going on there. (cherry picked from commit a1b7f98ff371bea42188a189f848675b348b977c) --- scripts/ci/libraries/_testing.sh | 2 +- scripts/ci/testing/ci_run_airflow_testing.sh | 84 +++++++++----------- 2 files changed, 37 insertions(+), 49 deletions(-) diff --git a/scripts/ci/libraries/_testing.sh b/scripts/ci/libraries/_testing.sh index 11220a8727ce2..3c66a203469d3 100644 --- a/scripts/ci/libraries/_testing.sh +++ b/scripts/ci/libraries/_testing.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -export MEMORY_REQUIRED_FOR_INTEGRATION_TEST_PARALLEL_RUN=33000 +export MEMORY_REQUIRED_FOR_HEAVY_TEST_PARALLEL_RUN=33000 function testing::skip_tests_if_requested(){ if [[ -f ${BUILD_CACHE_DIR}/.skip_tests ]]; then diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index c65a0580a62db..27ded3ecec2e2 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -29,8 +29,6 @@ export SEMAPHORE_NAME # shellcheck source=scripts/ci/libraries/_script_init.sh . "$( dirname "${BASH_SOURCE[0]}" )/../libraries/_script_init.sh" - - # Starts test types in parallel # test_types_to_run - list of test types (it's not an array, it is space-separate list) # ${@} - additional arguments to pass to test execution @@ -59,13 +57,13 @@ function run_test_types_in_parallel() { # Runs all test types in parallel depending on the number of CPUs available # We monitors their progress, display the progress and summarize the result when finished. # -# In case there is not enough memory (MEMORY_REQUIRED_FOR_INTEGRATION_TEST_PARALLEL_RUN) available for +# In case there is not enough memory (MEMORY_REQUIRED_FOR_HEAVY_TEST_PARALLEL_RUN) available for # the docker engine, the integration tests (which take a lot of memory for all the integrations) # are run sequentially after all other tests were run in parallel. 
# # Input: # * TEST_TYPES - contains all test types that should be executed -# * MEMORY_REQUIRED_FOR_INTEGRATION_TEST_PARALLEL_RUN - memory in bytes required to run integration tests +# * MEMORY_REQUIRED_FOR_HEAVY_TEST_PARALLEL_RUN - memory in bytes required to run integration tests # in parallel to other tests # function run_all_test_types_in_parallel() { @@ -75,46 +73,38 @@ function run_all_test_types_in_parallel() { echo echo "${COLOR_YELLOW}Running maximum ${MAX_PARALLEL_TEST_JOBS} test types in parallel${COLOR_RESET}" echo - - local run_integration_tests_separately="false" - local run_providers_tests_separately="false" + local sequential_tests=() # shellcheck disable=SC2153 local test_types_to_run=${TEST_TYPES} - if [[ ${test_types_to_run} == *"Integration"* ]]; then - if (( MEMORY_AVAILABLE_FOR_DOCKER < MEMORY_REQUIRED_FOR_INTEGRATION_TEST_PARALLEL_RUN )) ; then - # In case of Integration tests - they need more resources (Memory) thus we only run them in - # parallel if we have more than 32 GB memory available. Otherwise we run them sequentially - # after cleaning up the memory and stopping all docker instances - echo "" - echo "${COLOR_YELLOW}There is not enough memory to run Integration test in parallel${COLOR_RESET}" - echo "${COLOR_YELLOW} Available memory: ${MEMORY_AVAILABLE_FOR_DOCKER}${COLOR_RESET}" - echo "${COLOR_YELLOW} Required memory: ${MEMORY_REQUIRED_FOR_INTEGRATION_TEST_PARALLEL_RUN}${COLOR_RESET}" - echo "" - echo "${COLOR_YELLOW}Integration tests will be run separately at the end after cleaning up docker${COLOR_RESET}" - echo "" - if [[ ${BACKEND} == "mssql" ]]; then - # Skip running "Integration" and "Providers" tests for low memory condition for mssql - # Both might lead to memory issues even in run on their own. We have no need to - # Test those specifically for MSSQL (and they will be tested in `main` as there - # We have no memory limits - test_types_to_run="${test_types_to_run//Integration/}" - run_integration_tests_separately="false" - test_types_to_run="${test_types_to_run//Providers/}" - run_providers_tests_separately="false" - elif [[ ${BACKEND} == "mysql" ]]; then - # Separate "Integration" and "Providers" tests for low memory condition for mysql - # To not run them in parallel with other tests as this often leads to memory issue - # (Error 137 or 143). - test_types_to_run="${test_types_to_run//Integration/}" - run_integration_tests_separately="true" + if (( MEMORY_AVAILABLE_FOR_DOCKER < MEMORY_REQUIRED_FOR_HEAVY_TEST_PARALLEL_RUN )) ; then + # In case of Heavy tests - they need more resources (Memory) thus we only run them in + # parallel if we have more than 32 GB memory available. 
Otherwise we run them sequentially + # after cleaning up the memory and stopping all docker instances + echo "" + echo "${COLOR_YELLOW}There is not enough memory to run heavy test in parallel${COLOR_RESET}" + echo "${COLOR_YELLOW} Available memory: ${MEMORY_AVAILABLE_FOR_DOCKER}${COLOR_RESET}" + echo "${COLOR_YELLOW} Required memory: ${MEMORY_REQUIRED_FOR_HEAVY_TEST_PARALLEL_RUN}${COLOR_RESET}" + echo "" + echo "${COLOR_YELLOW}Heavy tests will be run sequentially after parallel tests including cleaning up docker between tests${COLOR_RESET}" + echo "" + if [[ ${test_types_to_run} == *"Integration"* ]]; then + echo "${COLOR_YELLOW}Remove Integration from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" + test_types_to_run="${test_types_to_run//Integration/}" + sequential_tests+=("Integration") + fi + if [[ ${test_types_to_run} == *"Core"* ]]; then + echo "${COLOR_YELLOW}Remove Core from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" + test_types_to_run="${test_types_to_run//Core/}" + sequential_tests+=("Core") + fi + if [[ ${BACKEND} == "mssql" || ${BACKEND} == "mysql" ]]; then + # For mssql/mysql - they take far more memory than postgres (or sqlite) - we skip the Provider + # tests altogether as they take too much memory even if run sequentially. + # Those tests will run in `main` anyway. + if [[ ${test_types_to_run} == *"Providers"* ]]; then + echo "${COLOR_YELLOW}Remove Providers from tests_types_to_run and skip running them altogether (mysql/mssql case).${COLOR_RESET}" test_types_to_run="${test_types_to_run//Providers/}" - run_providers_tests_separately="true" - else - # Remove Integration from list of tests to run in parallel - # and run them separately for all other backends - test_types_to_run="${test_types_to_run//Integration/}" - run_integration_tests_separately="true" fi fi fi @@ -123,17 +113,15 @@ function run_all_test_types_in_parallel() { parallel::initialize_monitoring + # Run all tests that should run in parallel (from test_types_to_run variable) run_test_types_in_parallel "${@}" - if [[ ${run_providers_tests_separately} == "true" ]]; then - parallel::cleanup_runner - test_types_to_run="Providers" - run_test_types_in_parallel "${@}" - fi - if [[ ${run_integration_tests_separately} == "true" ]]; then + + # if needed run remaining tests sequentially + for sequential_test in "${sequential_tests[@]}"; do parallel::cleanup_runner - test_types_to_run="Integration" + test_types_to_run="${sequential_test}" run_test_types_in_parallel "${@}" - fi + done set -e # this will exit with error code in case some of the non-Quarantined tests failed parallel::print_job_summary_and_return_status_code From 94b923d766630e16804cd5dd7bc090dd0e63542a Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 10 Nov 2021 22:38:33 +0100 Subject: [PATCH 046/250] Disable test code coverage for PRs (#19523) The test code coverage took a lot of memory (2-3GB) when core tests were running (specifically test_kubernetes_executor.py) and the memory was kept for the duration of whole test. This caused intermittent memory issues on public GitHub runners. The change only uses test coverage for `main` builds where we have a lot of memory available. 
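For contributors who still want a coverage report, the switch can be flipped back on explicitly. A minimal sketch, assuming the CI test scripts are invoked from a local checkout roughly the way the workflow does it (the direct invocation below is illustrative only, not part of this patch):

```bash
# ENABLE_TEST_COVERAGE is forwarded into the test container (see the
# _docker.env and base.yml changes below); entrypoint_ci.sh only appends
# the --cov pytest flags when it is set to "true".
export ENABLE_TEST_COVERAGE="true"
./scripts/ci/testing/ci_run_airflow_testing.sh
```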
(cherry picked from commit 83011b7f87eef81ddd34722f1bc9842237491cad) --- .github/workflows/ci.yml | 1 + scripts/ci/docker-compose/_docker.env | 1 + scripts/ci/docker-compose/base.yml | 1 + scripts/in_container/entrypoint_ci.sh | 15 +++++++++++---- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f40ac7b88aa51..51ef0374c26f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,6 +51,7 @@ env: GITHUB_REGISTRY_PUSH_IMAGE_TAG: "latest" INSTALL_PROVIDERS_FROM_SOURCES: "true" AIRFLOW_LOGIN_TO_GITHUB_REGISTRY: "true" + ENABLE_TEST_COVERAGE: "${{ github.event_name == 'push' }}" # You can switch between building the image in "Build Images" workflow or building them in CI workflow # Separately for each job. diff --git a/scripts/ci/docker-compose/_docker.env b/scripts/ci/docker-compose/_docker.env index 08fb37cf076ee..72a1afc2d8b8c 100644 --- a/scripts/ci/docker-compose/_docker.env +++ b/scripts/ci/docker-compose/_docker.env @@ -30,6 +30,7 @@ DEFAULT_BRANCH DEFAULT_CONSTRAINTS_BRANCH ENABLED_INTEGRATIONS ENABLED_SYSTEMS +ENABLE_TEST_COVERAGE GITHUB_ACTIONS GITHUB_REGISTRY_PULL_IMAGE_TAG HOST_USER_ID diff --git a/scripts/ci/docker-compose/base.yml b/scripts/ci/docker-compose/base.yml index 9c68c18fac16c..8413179a665a5 100644 --- a/scripts/ci/docker-compose/base.yml +++ b/scripts/ci/docker-compose/base.yml @@ -44,6 +44,7 @@ services: - DEFAULT_CONSTRAINTS_BRANCH=${DEFAULT_CONSTRAINTS_BRANCH} - ENABLED_INTEGRATIONS=${ENABLED_INTEGRATIONS} - ENABLED_SYSTEMS=${ENABLED_SYSTEMS} + - ENABLE_TEST_COVERAGE=${ENABLE_TEST_COVERAGE} - GITHUB_ACTIONS=${GITHUB_ACTIONS} - GITHUB_REGISTRY_PULL_IMAGE_TAG=${GITHUB_REGISTRY_PULL_IMAGE_TAG} - HOST_USER_ID=${HOST_USER_ID} diff --git a/scripts/in_container/entrypoint_ci.sh b/scripts/in_container/entrypoint_ci.sh index 78909f72fb532..1416149d2db50 100755 --- a/scripts/in_container/entrypoint_ci.sh +++ b/scripts/in_container/entrypoint_ci.sh @@ -205,11 +205,10 @@ EXTRA_PYTEST_ARGS=( "--verbosity=0" "--strict-markers" "--durations=100" - "--cov=airflow/" - "--cov-config=.coveragerc" - "--cov-report=xml:/files/coverage-${TEST_TYPE}-${BACKEND}.xml" - "--color=yes" "--maxfail=50" + "--color=yes" + "--pythonwarnings=ignore::DeprecationWarning" + "--pythonwarnings=ignore::PendingDeprecationWarning" "--junitxml=${RESULT_LOG_FILE}" # timeouts in seconds for individual tests "--timeouts-order" @@ -240,6 +239,14 @@ else ) fi +if [[ ${ENABLE_TEST_COVERAGE:="false"} == "true" ]]; then + EXTRA_PYTEST_ARGS+=( + "--cov=airflow/" + "--cov-config=.coveragerc" + "--cov-report=xml:/files/coverage-${TEST_TYPE}-${BACKEND}.xml" + ) +fi + declare -a SELECTED_TESTS CLI_TESTS API_TESTS PROVIDERS_TESTS CORE_TESTS WWW_TESTS \ ALL_TESTS ALL_PRESELECTED_TESTS ALL_OTHER_TESTS From 75dc1c0a3936d463d1ccb3e577c2a834cbb58327 Mon Sep 17 00:00:00 2001 From: Ash Berlin-Taylor Date: Fri, 12 Nov 2021 12:41:52 +0000 Subject: [PATCH 047/250] Upload KinD logs on cancell too (#19554) If the job times out, it is "cancelled", rather than failed, which means that the logs were not uploaded. 
This will likely also catch a few cases where the job is cancelled cos of another push to the branch/PR, but it's better to have too many logs than not enough to debug problems (cherry picked from commit 4f1e66d0227fba7a1378e895b9186711c03ead61) --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51ef0374c26f5..d4e06a62325e3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1058,7 +1058,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" PR_LABELS: "${{ needs.build-info.outputs.pullRequestLabels }}" - name: "Upload KinD logs" uses: actions/upload-artifact@v2 - if: failure() + if: failure() || cancelled() with: name: > kind-logs-${{matrix.executor}} @@ -1126,7 +1126,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" PR_LABELS: "${{ needs.build-info.outputs.pullRequestLabels }}" - name: "Upload KinD logs" uses: actions/upload-artifact@v2 - if: failure() + if: failure() || cancelled() with: name: > kind-logs-KubernetesExecutor From 68f729cb3cab4c1daef8d20a24068030accc0576 Mon Sep 17 00:00:00 2001 From: Ash Berlin-Taylor Date: Mon, 15 Nov 2021 18:46:33 +0000 Subject: [PATCH 048/250] Stop polling when Webserver doesn't start up in Kube tests (#19598) (cherry picked from commit 6c20444cc688621795dc46a640b3885a9e735e47) --- scripts/ci/libraries/_kind.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/libraries/_kind.sh b/scripts/ci/libraries/_kind.sh index 1fb77ebe2e1e1..5b3ae69a11746 100644 --- a/scripts/ci/libraries/_kind.sh +++ b/scripts/ci/libraries/_kind.sh @@ -298,6 +298,7 @@ function kind::wait_for_webserver_healthy() { echo echo "${COLOR_RED}ERROR: Timeout while waiting for the webserver health check ${COLOR_RESET}" echo + return 1 fi done echo From e8a8566c7867e9e378d006fe6d196eed15136af6 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 15 Nov 2021 22:34:12 +0100 Subject: [PATCH 049/250] Improve automation for docker image release (#19573) The "latest" tags for docker images were not applied in recent releases - mainly because the process of doing it was not followed, but this was also not obvious as preparing the rc image and final images was different - the latest images required extra manual step. This PR modifies the "image preparation" script to ask a question whether the latest images should be tagged when non-RC build is being prepared. Fixes: #19569 (cherry picked from commit 4a072725cbe63bff8f69b05dfb960134783ee17e) --- dev/README_RELEASE_AIRFLOW.md | 14 +++----- .../ci/tools/prepare_prod_docker_images.sh | 33 +++++++++++++++++-- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 8abc4234494a8..13855279d1464 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -654,15 +654,11 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": ./scripts/ci/tools/prepare_prod_docker_images.sh ${VERSION} ``` -This will wipe Breeze cache and docker-context-files in order to make sure the build is "clean". It -also performs image verification before pushing the images. - -If this is the newest image released, push the latest image as well. 
- -```shell script -docker tag "apache/airflow:${VERSION}" "apache/airflow:latest" -docker push "apache/airflow:latest" -``` +If you release 'official' (non-rc) version you will be asked if you want to +tag the images as latest - if you are releasing the latest stable branch, you +should answer y and tags will be created and pushed. If you are releasing a +patch release from an older branch, you should answer n and creating tags will +be skipped. ## Publish documentation diff --git a/scripts/ci/tools/prepare_prod_docker_images.sh b/scripts/ci/tools/prepare_prod_docker_images.sh index 928f282e3ee41..bc04d349a5ee3 100755 --- a/scripts/ci/tools/prepare_prod_docker_images.sh +++ b/scripts/ci/tools/prepare_prod_docker_images.sh @@ -18,6 +18,8 @@ AIRFLOW_SOURCES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../../../ && pwd)" export AIRFLOW_SOURCES_DIR +CURRENT_PYTHON_MAJOR_MINOR_VERSIONS=("3.7" "3.8" "3.9" "3.6") + usage() { local cmdname cmdname="$(basename -- "$0")" @@ -38,8 +40,35 @@ fi export INSTALL_AIRFLOW_VERSION="${1}" -for python_version in "3.6" "3.7" "3.8" "3.9" +for python_version in "${CURRENT_PYTHON_MAJOR_MINOR_VERSIONS[@]}" do export PYTHON_MAJOR_MINOR_VERSION=${python_version} - "${AIRFLOW_SOURCES_DIR}/scripts/ci/tools/build_dockerhub.sh" + echo "${AIRFLOW_SOURCES_DIR}/scripts/ci/tools/build_dockerhub.sh" +done + +if [[ ${INSTALL_AIRFLOW_VERSION} =~ .*rc.* ]]; then + echo + echo "Skipping tagging latest as this is an rc version" + echo + exit +fi + +echo "Should we tag version ${1} with latest tag [y/N]" +read -r RESPONSE + +if [[ ${RESPONSE} == 'n' || ${RESPONSE} = 'N' ]]; then + echo + echo "Skip tagging the image with latest tag." + echo + exit +fi + +for python_version in "${CURRENT_PYTHON_MAJOR_MINOR_VERSIONS[@]}" +do + echo docker tag "apache/airflow:${INSTALL_AIRFLOW_VERSION}-python${python_version}" \ + "apache/airflow:latest-python${python_version}" + echo docker push "apache/airflow:latest-python${python_version}" done + +echo docker tag "apache/airflow:${INSTALL_AIRFLOW_VERSION}" "apache/airflow:latest" +echo docker push "apache/airflow:latest" From 97c00695d4860760de02e47cde85421d9314c1dd Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 16 Nov 2021 18:06:41 +0100 Subject: [PATCH 050/250] Move scripts for prod image preparation to dev (#19623) The script belongs to dev. Also it had `echo` debug commands left after testing that are removed now. (cherry picked from commit d02c11780adfb56a788799c56ae23d875b7610dd) --- dev/README_RELEASE_AIRFLOW.md | 4 ++-- .../ci/tools => dev}/prepare_prod_docker_images.sh | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) rename {scripts/ci/tools => dev}/prepare_prod_docker_images.sh (80%) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 13855279d1464..5b7f8e38e1b3c 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -238,7 +238,7 @@ is not supposed to be used by and advertised to the end-users who do not read th Production Docker images should be manually prepared and pushed by the release manager. ```shell script -./scripts/ci/tools/prepare_prod_docker_images.sh ${VERSION} +./dev/prepare_prod_docker_images.sh ${VERSION} ``` This will wipe Breeze cache and docker-context-files in order to make sure the build is "clean". 
It @@ -651,7 +651,7 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": ```shell script -./scripts/ci/tools/prepare_prod_docker_images.sh ${VERSION} +./dev/prepare_prod_docker_images.sh ${VERSION} ``` If you release 'official' (non-rc) version you will be asked if you want to diff --git a/scripts/ci/tools/prepare_prod_docker_images.sh b/dev/prepare_prod_docker_images.sh similarity index 80% rename from scripts/ci/tools/prepare_prod_docker_images.sh rename to dev/prepare_prod_docker_images.sh index bc04d349a5ee3..c2a3ad2f13f27 100755 --- a/scripts/ci/tools/prepare_prod_docker_images.sh +++ b/dev/prepare_prod_docker_images.sh @@ -15,7 +15,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -AIRFLOW_SOURCES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../../../ && pwd)" +AIRFLOW_SOURCES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)" export AIRFLOW_SOURCES_DIR CURRENT_PYTHON_MAJOR_MINOR_VERSIONS=("3.7" "3.8" "3.9" "3.6") @@ -43,7 +43,7 @@ export INSTALL_AIRFLOW_VERSION="${1}" for python_version in "${CURRENT_PYTHON_MAJOR_MINOR_VERSIONS[@]}" do export PYTHON_MAJOR_MINOR_VERSION=${python_version} - echo "${AIRFLOW_SOURCES_DIR}/scripts/ci/tools/build_dockerhub.sh" + "${AIRFLOW_SOURCES_DIR}/scripts/ci/tools/build_dockerhub.sh" done if [[ ${INSTALL_AIRFLOW_VERSION} =~ .*rc.* ]]; then @@ -65,10 +65,10 @@ fi for python_version in "${CURRENT_PYTHON_MAJOR_MINOR_VERSIONS[@]}" do - echo docker tag "apache/airflow:${INSTALL_AIRFLOW_VERSION}-python${python_version}" \ + docker tag "apache/airflow:${INSTALL_AIRFLOW_VERSION}-python${python_version}" \ "apache/airflow:latest-python${python_version}" - echo docker push "apache/airflow:latest-python${python_version}" + docker push "apache/airflow:latest-python${python_version}" done -echo docker tag "apache/airflow:${INSTALL_AIRFLOW_VERSION}" "apache/airflow:latest" -echo docker push "apache/airflow:latest" +docker tag "apache/airflow:${INSTALL_AIRFLOW_VERSION}" "apache/airflow:latest" +docker push "apache/airflow:latest" From 82964ef3be722cb6898dd0226af75fb821166b19 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 16 Nov 2021 22:12:25 +0100 Subject: [PATCH 051/250] Disable yarn-dev in start-airflow command (#19626) When you run start-airflow, by default it also run `yarn dev` command, however if you've never built assets before, yarn dev is very slow first time and the webserver started before the dist folder was even created which caused asset-less airflow experience. There was another race condition even if you did build the assets before. If you run start-airflow on MacOS or Windows when the filesystem was slow, there could be a case that yarn dev cleaned up the dist folder while webserver was starting and it could lead again to asset-less experience if you were unlucky. Also running `yarn dev` has the side effect of removing the checksum file which is used to see if any of the assets changed and whether they need recompilation. As the result after running `start-airflow` you always got the warning that the assets need recompilation. This PR disables automated start of `yarn dev` and suggests to run it manually instead if there is a need for dynamic asset recompilation. Also when `start-airflow` is run and we are starting airflow from sources rather than PyPI, asset compilation is executed if the checksum is missing or does not match the source files. 
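Since the commit only points at running the watcher manually, a short sketch of the manual replacement for the removed tmux pane, using the same commands the scripts below reference (paths are relative to the Airflow sources inside the Breeze container):

```bash
# One-off production build of the www assets (start-airflow now triggers this
# automatically when the md5 checksum file is missing or stale):
./airflow/www/compile_assets.sh

# Continuous recompilation while editing JS/CSS - this is what the removed
# "yarn dev" tmux pane used to run:
cd airflow/www
yarn install --frozen-lockfile
yarn dev
```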
Related to: #19566 (cherry picked from commit 510ff6277585d8de411f83b7f0ec0b5d52ce685c) --- .../www/ask_for_recompile_assets_if_needed.sh | 19 +++++++++++++------ scripts/in_container/bin/run_tmux | 6 ------ scripts/in_container/run_tmux_welcome.sh | 4 +++- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/airflow/www/ask_for_recompile_assets_if_needed.sh b/airflow/www/ask_for_recompile_assets_if_needed.sh index 0d8f7800db9eb..4fba5bcaa1e29 100755 --- a/airflow/www/ask_for_recompile_assets_if_needed.sh +++ b/airflow/www/ask_for_recompile_assets_if_needed.sh @@ -30,12 +30,19 @@ NO_COLOR='\033[0m' md5sum=$(find package.json yarn.lock static/css static/js -type f | sort | xargs md5sum) old_md5sum=$(cat "${MD5SUM_FILE}" 2>/dev/null || true) if [[ ${old_md5sum} != "${md5sum}" ]]; then - echo - echo -e "${YELLOW}WARNING: It seems that the generated assets files do not match the content of the sources.${NO_COLOR}" - echo "To recompile assets, run:" - echo "" - echo " ./airflow/www/compile_assets.sh" - echo "" + if [[ ${START_AIRFLOW} == "true" && ${USE_AIRFLOW_VERSION} == "" ]]; then + echo + echo -e "${YELLOW}Recompiling assets as they have changed and you need them for 'start_airflow' command${NO_COLOR}" + echo + ./compile_assets.sh + else + echo + echo -e "${YELLOW}WARNING: It seems that the generated assets files do not match the content of the sources.${NO_COLOR}" + echo "To recompile assets, run:" + echo "" + echo " ./airflow/www/compile_assets.sh" + echo "" + fi else echo echo -e "${GREEN}No need for www assets recompilation.${NO_COLOR}" diff --git a/scripts/in_container/bin/run_tmux b/scripts/in_container/bin/run_tmux index f405b6d9ecf48..4fa757b8e21b4 100755 --- a/scripts/in_container/bin/run_tmux +++ b/scripts/in_container/bin/run_tmux @@ -56,12 +56,6 @@ tmux split-window -h tmux select-pane -t 2 tmux send-keys 'airflow webserver' C-m -if [[ -z "${USE_AIRFLOW_VERSION=}" ]]; then - tmux select-pane -t 0 - tmux split-window -h - tmux send-keys 'cd /opt/airflow/airflow/www/; yarn install --frozen-lockfile; yarn dev' C-m -fi - if python -c 'import sys; sys.exit(sys.version_info < (3, 7))'; then tmux select-pane -t 0 tmux split-window -h diff --git a/scripts/in_container/run_tmux_welcome.sh b/scripts/in_container/run_tmux_welcome.sh index 68360c6a7c12f..91d69406e2dcf 100755 --- a/scripts/in_container/run_tmux_welcome.sh +++ b/scripts/in_container/run_tmux_welcome.sh @@ -19,5 +19,7 @@ cd /opt/airflow/ || exit clear echo "Welcome to your tmux based running Airflow environment (courtesy of Breeze)." echo -echo " To stop Airflow and exit tmux just type 'stop_airflow'." +echo " To stop Airflow and exit tmux, just type 'stop_airflow'." +echo +echo " If you want to rebuild webserver assets dynamically, run 'cd airflow/www; yarn && yarn dev' and restart airflow webserver with '-d' flag." 
echo From 4b09facbf9186ca6b60abf58b57e95468c7c6de9 Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Wed, 17 Nov 2021 13:41:52 +0000 Subject: [PATCH 052/250] Fix failing CI phase with unhealthy container issue (#19633) Fix failing CI phase with unhealthy container issue * Add post cleanup * Pin pinot to stable version * Pin grafana to stable version Co-authored-by: Jarek Potiuk (cherry picked from commit fcf90c5970aaf7043b1a57d58296d7fd80d6ebf9) --- scripts/ci/docker-compose/integration-pinot.yml | 2 +- scripts/ci/docker-compose/integration-statsd.yml | 2 +- scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/ci/docker-compose/integration-pinot.yml b/scripts/ci/docker-compose/integration-pinot.yml index 923afdf237601..70262e1ad7832 100644 --- a/scripts/ci/docker-compose/integration-pinot.yml +++ b/scripts/ci/docker-compose/integration-pinot.yml @@ -18,7 +18,7 @@ version: "3.7" services: pinot: - image: apachepinot/pinot:latest + image: apachepinot/pinot:0.8.0 ports: - "9080:9080" volumes: diff --git a/scripts/ci/docker-compose/integration-statsd.yml b/scripts/ci/docker-compose/integration-statsd.yml index e7847597f20e6..458d29a06d1da 100644 --- a/scripts/ci/docker-compose/integration-statsd.yml +++ b/scripts/ci/docker-compose/integration-statsd.yml @@ -25,7 +25,7 @@ services: - "29102:9102" grafana: - image: grafana/grafana + image: grafana/grafana:8.2.4 ports: - "23000:3000" diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index d97310ac7b260..90e5b0d0f3483 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -113,6 +113,7 @@ function run_airflow_testing_in_docker() { echo "Making sure docker-compose is down and remnants removed" echo docker-compose -f "${SCRIPTS_CI_DIR}/docker-compose/base.yml" \ + "${INTEGRATIONS[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ down --remove-orphans \ --volumes --timeout 10 @@ -123,8 +124,10 @@ function run_airflow_testing_in_docker() { "${DOCKER_COMPOSE_LOCAL[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ run airflow "${@}" + docker ps exit_code=$? docker-compose --log-level INFO -f "${SCRIPTS_CI_DIR}/docker-compose/base.yml" \ + "${INTEGRATIONS[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ down --remove-orphans \ --volumes --timeout 10 From 6561e7b212a2c4f48a9541240546c75406ad235f Mon Sep 17 00:00:00 2001 From: Josh Fell <48934154+josh-fell@users.noreply.github.com> Date: Wed, 17 Nov 2021 13:15:59 -0500 Subject: [PATCH 053/250] Clean up dynamic `start_date` values from docs (#19607) (cherry picked from commit 26e4e114683c23e600b7c5b2d062309449c5087c) --- UPDATING.md | 6 ++++-- airflow/smart_sensor_dags/smart_sensor_group.py | 10 ++-------- docs/apache-airflow/best-practices.rst | 7 ++++++- docs/apache-airflow/concepts/dags.rst | 8 +++++--- docs/apache-airflow/concepts/operators.rst | 3 ++- docs/apache-airflow/dag-run.rst | 10 ++++++++-- docs/apache-airflow/lineage.rst | 9 ++++----- docs/apache-airflow/tutorial.rst | 3 ++- .../extending/embedding-dags/test_dag.py | 10 +++++++--- 9 files changed, 40 insertions(+), 26 deletions(-) diff --git a/UPDATING.md b/UPDATING.md index ca342d0c917c0..718da8b5dea03 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -175,8 +175,9 @@ Similarly, `DAG.concurrency` has been renamed to `DAG.max_active_tasks`. 
```python dag = DAG( dag_id="example_dag", + start_date=datetime(2021, 1, 1), + catchup=False, concurrency=3, - start_date=days_ago(2), ) ``` @@ -185,8 +186,9 @@ dag = DAG( ```python dag = DAG( dag_id="example_dag", + start_date=datetime(2021, 1, 1), + catchup=False, max_active_tasks=3, - start_date=days_ago(2), ) ``` diff --git a/airflow/smart_sensor_dags/smart_sensor_group.py b/airflow/smart_sensor_dags/smart_sensor_group.py index b9b6989ad1167..df6329c407567 100644 --- a/airflow/smart_sensor_dags/smart_sensor_group.py +++ b/airflow/smart_sensor_dags/smart_sensor_group.py @@ -17,16 +17,11 @@ # under the License. """Smart sensor DAGs managing all smart sensor tasks.""" -from datetime import timedelta +from datetime import datetime, timedelta from airflow.configuration import conf from airflow.models import DAG from airflow.sensors.smart_sensor import SmartSensorOperator -from airflow.utils.dates import days_ago - -args = { - 'owner': 'airflow', -} num_smart_sensor_shard = conf.getint("smart_sensor", "shards") shard_code_upper_limit = conf.getint('smart_sensor', 'shard_code_upper_limit') @@ -38,13 +33,12 @@ dag_id = f'smart_sensor_group_shard_{i}' dag = DAG( dag_id=dag_id, - default_args=args, schedule_interval=timedelta(minutes=5), max_active_tasks=1, max_active_runs=1, catchup=False, dagrun_timeout=timedelta(hours=24), - start_date=days_ago(2), + start_date=datetime(2021, 1, 1), ) SmartSensorOperator( diff --git a/docs/apache-airflow/best-practices.rst b/docs/apache-airflow/best-practices.rst index 72ef492e488d3..5ebed3be03095 100644 --- a/docs/apache-airflow/best-practices.rst +++ b/docs/apache-airflow/best-practices.rst @@ -239,7 +239,12 @@ Then you can import and use the ``ALL_TASKS`` constant in all your DAGs like tha from my_company_utils.common import ALL_TASKS - with DAG(dag_id="my_dag", schedule_interval=None, start_date=days_ago(2)) as dag: + with DAG( + dag_id="my_dag", + schedule_interval=None, + start_date=datetime(2021, 1, 1), + catchup=False, + ) as dag: for task in ALL_TASKS: # create your operators and relations here pass diff --git a/docs/apache-airflow/concepts/dags.rst b/docs/apache-airflow/concepts/dags.rst index 83d1cbd7f6dcb..563264e7bb3f5 100644 --- a/docs/apache-airflow/concepts/dags.rst +++ b/docs/apache-airflow/concepts/dags.rst @@ -37,18 +37,20 @@ Declaring a DAG There are three ways to declare a DAG - either you can use a context manager, which will add the DAG to anything inside it implicitly:: - with DAG("my_dag_name") as dag: + with DAG( + "my_dag_name", start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False + ) as dag: op = DummyOperator(task_id="task") Or, you can use a standard constructor, passing the dag into any operators you use:: - my_dag = DAG("my_dag_name") + my_dag = DAG("my_dag_name", start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False) op = DummyOperator(task_id="task", dag=my_dag) Or, you can use the ``@dag`` decorator to :ref:`turn a function into a DAG generator `:: - @dag(start_date=days_ago(2)) + @dag(start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False) def generate_dag(): op = DummyOperator(task_id="task") diff --git a/docs/apache-airflow/concepts/operators.rst b/docs/apache-airflow/concepts/operators.rst index 695e9d39c4cd3..c66c2ebef322a 100644 --- a/docs/apache-airflow/concepts/operators.rst +++ b/docs/apache-airflow/concepts/operators.rst @@ -175,7 +175,8 @@ you can pass ``render_template_as_native_obj=True`` to the DAG as follows: dag = DAG( 
dag_id="example_template_as_python_object", schedule_interval=None, - start_date=days_ago(2), + start_date=datetime(2021, 1, 1), + catchup=False, render_template_as_native_obj=True, ) diff --git a/docs/apache-airflow/dag-run.rst b/docs/apache-airflow/dag-run.rst index d4d2b7752e25a..39bd9d2122fbb 100644 --- a/docs/apache-airflow/dag-run.rst +++ b/docs/apache-airflow/dag-run.rst @@ -229,11 +229,17 @@ Example of a parameterized DAG: .. code-block:: python + from datetime import datetime + from airflow import DAG from airflow.operators.bash import BashOperator - from airflow.utils.dates import days_ago - dag = DAG("example_parameterized_dag", schedule_interval=None, start_date=days_ago(2)) + dag = DAG( + "example_parameterized_dag", + schedule_interval=None, + start_date=datetime(2021, 1, 1), + catchup=False, + ) parameterized_task = BashOperator( task_id="parameterized_task", diff --git a/docs/apache-airflow/lineage.rst b/docs/apache-airflow/lineage.rst index 3680010a08b9c..f0b79aac1aada 100644 --- a/docs/apache-airflow/lineage.rst +++ b/docs/apache-airflow/lineage.rst @@ -30,22 +30,21 @@ works. .. code-block:: python + from datetime import datetime, timedelta + from airflow.operators.bash import BashOperator from airflow.operators.dummy import DummyOperator from airflow.lineage import AUTO from airflow.lineage.entities import File from airflow.models import DAG - from airflow.utils.dates import days_ago - from datetime import timedelta FILE_CATEGORIES = ["CAT1", "CAT2", "CAT3"] - args = {"owner": "airflow", "start_date": days_ago(2)} - dag = DAG( dag_id="example_lineage", - default_args=args, + start_date=datetime(2021, 1, 1), schedule_interval="0 0 * * *", + catchup=False, dagrun_timeout=timedelta(minutes=60), ) diff --git a/docs/apache-airflow/tutorial.rst b/docs/apache-airflow/tutorial.rst index 818db39e237c2..df87f969df25a 100644 --- a/docs/apache-airflow/tutorial.rst +++ b/docs/apache-airflow/tutorial.rst @@ -470,7 +470,8 @@ Lets look at our DAG: @dag( schedule_interval="0 0 * * *", - start_date=datetime.today() - timedelta(days=2), + start_date=datetime(2021, 1, 1), + catchup=False, dagrun_timeout=timedelta(minutes=60), ) def Etl(): diff --git a/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py b/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py index 467c8c3e6539e..25ceeba6d3e8e 100644 --- a/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py +++ b/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py @@ -21,15 +21,19 @@ from airflow.models.dag import DAG from airflow.operators.dummy import DummyOperator -from airflow.utils.dates import days_ago now = datetime.now() now_to_the_hour = (now - timedelta(0, 0, 0, 0, 0, 3)).replace(minute=0, second=0, microsecond=0) START_DATE = now_to_the_hour DAG_NAME = 'test_dag_v1' -default_args = {'owner': 'airflow', 'depends_on_past': True, 'start_date': days_ago(2)} -dag = DAG(DAG_NAME, schedule_interval='*/10 * * * *', default_args=default_args) +dag = DAG( + DAG_NAME, + schedule_interval='*/10 * * * *', + default_args={'depends_on_past': True}, + start_date=datetime(2021, 1, 1), + catchup=False, +) run_this_1 = DummyOperator(task_id='run_this_1', dag=dag) run_this_2 = DummyOperator(task_id='run_this_2', dag=dag) From 6aacc6647ead714891da2ad5cef8a51b953af44a Mon Sep 17 00:00:00 2001 From: Juho Lauri <9369863+jplauri@users.noreply.github.com> Date: Wed, 17 Nov 2021 20:12:17 +0200 Subject: [PATCH 054/250] Misc. 
documentation typos and language improvements (#19599) (cherry picked from commit 355dec8fea5e2ef1a9b88363f201fce4f022fef3) --- airflow/hooks/dbapi.py | 2 +- airflow/operators/generic_transfer.py | 2 +- airflow/providers/google/cloud/hooks/workflows.py | 4 ++-- airflow/providers/google/cloud/operators/workflows.py | 4 ++-- airflow/providers/postgres/hooks/postgres.py | 4 ++-- airflow/providers/sqlite/hooks/sqlite.py | 2 +- scripts/in_container/run_generate_constraints.sh | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/airflow/hooks/dbapi.py b/airflow/hooks/dbapi.py index 3c51e6f8a3572..1cd1444099fc6 100644 --- a/airflow/hooks/dbapi.py +++ b/airflow/hooks/dbapi.py @@ -262,7 +262,7 @@ def get_cursor(self): @staticmethod def _generate_insert_sql(table, values, target_fields, replace, **kwargs): """ - Static helper method that generate the INSERT SQL statement. + Static helper method that generates the INSERT SQL statement. The REPLACE variant is specific to MySQL syntax. :param table: Name of the target table diff --git a/airflow/operators/generic_transfer.py b/airflow/operators/generic_transfer.py index 1bdfa79c0e3e2..15ba9cb8a4a04 100644 --- a/airflow/operators/generic_transfer.py +++ b/airflow/operators/generic_transfer.py @@ -36,7 +36,7 @@ class GenericTransfer(BaseOperator): :type destination_table: str :param source_conn_id: source connection :type source_conn_id: str - :param destination_conn_id: source connection + :param destination_conn_id: destination connection :type destination_conn_id: str :param preoperator: sql statement or list of statements to be executed prior to loading the data. (templated) diff --git a/airflow/providers/google/cloud/hooks/workflows.py b/airflow/providers/google/cloud/hooks/workflows.py index 87bc924f44e12..ed3716a8c944c 100644 --- a/airflow/providers/google/cloud/hooks/workflows.py +++ b/airflow/providers/google/cloud/hooks/workflows.py @@ -212,8 +212,8 @@ def list_workflows( :param filter_: Filter to restrict results to specific workflows. :type filter_: str - :param order_by: Comma-separated list of fields that that - specify the order of the results. Default sorting order for a field is ascending. + :param order_by: Comma-separated list of fields that + specifies the order of the results. Default sorting order for a field is ascending. To specify descending order for a field, append a "desc" suffix. If not specified, the results will be returned in an unspecified order. :type order_by: str diff --git a/airflow/providers/google/cloud/operators/workflows.py b/airflow/providers/google/cloud/operators/workflows.py index 8b1b49ca428de..1c434b5af4974 100644 --- a/airflow/providers/google/cloud/operators/workflows.py +++ b/airflow/providers/google/cloud/operators/workflows.py @@ -299,8 +299,8 @@ class WorkflowsListWorkflowsOperator(BaseOperator): :param filter_: Filter to restrict results to specific workflows. :type filter_: str - :param order_by: Comma-separated list of fields that that - specify the order of the results. Default sorting order for a field is ascending. + :param order_by: Comma-separated list of fields that + specifies the order of the results. Default sorting order for a field is ascending. To specify descending order for a field, append a "desc" suffix. If not specified, the results will be returned in an unspecified order. 
:type order_by: str diff --git a/airflow/providers/postgres/hooks/postgres.py b/airflow/providers/postgres/hooks/postgres.py index 67cc8b37209ae..a2608ef4151b9 100644 --- a/airflow/providers/postgres/hooks/postgres.py +++ b/airflow/providers/postgres/hooks/postgres.py @@ -227,8 +227,8 @@ def _generate_insert_sql( table: str, values: Tuple[str, ...], target_fields: Iterable[str], replace: bool, **kwargs ) -> str: """ - Static helper method that generate the INSERT SQL statement. - The REPLACE variant is specific to MySQL syntax. + Static helper method that generates the INSERT SQL statement. + The REPLACE variant is specific to PostgreSQL syntax. :param table: Name of the target table :type table: str diff --git a/airflow/providers/sqlite/hooks/sqlite.py b/airflow/providers/sqlite/hooks/sqlite.py index e4a43317859b5..47a5457097737 100644 --- a/airflow/providers/sqlite/hooks/sqlite.py +++ b/airflow/providers/sqlite/hooks/sqlite.py @@ -39,7 +39,7 @@ def get_conn(self) -> sqlite3.dbapi2.Connection: @staticmethod def _generate_insert_sql(table, values, target_fields, replace, **kwargs): """ - Static helper method that generate the INSERT SQL statement. + Static helper method that generates the INSERT SQL statement. The REPLACE variant is specific to MySQL syntax. :param table: Name of the target table diff --git a/scripts/in_container/run_generate_constraints.sh b/scripts/in_container/run_generate_constraints.sh index dbb349cee5647..9e44c1c0ee827 100755 --- a/scripts/in_container/run_generate_constraints.sh +++ b/scripts/in_container/run_generate_constraints.sh @@ -79,7 +79,7 @@ elif [[ ${GENERATE_CONSTRAINTS_MODE} == "pypi-providers" ]]; then # This variant of constraints install uses the HEAD of the branch version for 'apache-airflow' but installs # the providers from PIP-released packages at the moment of the constraint generation. # -# Those constraints are actually those that that regular users use to install released version of Airflow. +# Those constraints are actually those that regular users use to install released version of Airflow. # We also use those constraints after "apache-airflow" is released and the constraints are tagged with # "constraints-X.Y.Z" tag to build the production image for that version. # From 319d32768c988dfcfff51893be692f5ae39ca811 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 17 Nov 2021 19:49:46 +0100 Subject: [PATCH 055/250] Fix dumping container logs on error (#19645) When we optimized tests for memory use we added cleanup of all containers after each test suite. Unfortunately it caused dumping container logs to stop working because this dumping was done only only when the script was exiting. This PR moves dumping container logs to between the test run and cleanup, so that we can see the logs when there is a test failure. Related to: #19633 where the logs were not dumped and it made the analysis much more difficult. 
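When debugging a failed run outside CI, the same information can be captured by hand. A rough sketch of what the relocated testing::dump_container_logs loop does (file naming simplified here for illustration):

```bash
# Grab logs from every container that is still around after a failed test
# run - including already-exited ones - before docker-compose tears them down.
mkdir -p files
for container in $(docker ps --all --format '{{.Names}}'); do
    docker logs "${container}" > "files/container_logs_${container}.log" 2>&1
done
```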
(cherry picked from commit 7cda7d4b5e413925bf639976e77ebf2442b4bff9) --- scripts/ci/libraries/_start_end.sh | 22 ------------------- scripts/ci/libraries/_testing.sh | 14 ++++++++++++ .../ci_run_single_airflow_test_in_docker.sh | 9 ++++++++ 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/scripts/ci/libraries/_start_end.sh b/scripts/ci/libraries/_start_end.sh index 35d9e1ccd7f61..fbca97dcea22c 100644 --- a/scripts/ci/libraries/_start_end.sh +++ b/scripts/ci/libraries/_start_end.sh @@ -71,21 +71,6 @@ function start_end::script_start { fi } -function start_end::dump_container_logs() { - start_end::group_start "${COLOR_BLUE}Dumping container logs ${container}${COLOR_RESET}" - local container="${1}" - local dump_file - dump_file=${AIRFLOW_SOURCES}/files/container_logs_${container}_$(date "+%Y-%m-%d")_${CI_BUILD_ID}_${CI_JOB_ID}.log - echo "${COLOR_BLUE}###########################################################################################${COLOR_RESET}" - echo " Dumping logs from ${container} container" - echo "${COLOR_BLUE}###########################################################################################${COLOR_RESET}" - docker_v logs "${container}" > "${dump_file}" - echo " Container ${container} logs dumped to ${dump_file}" - echo "${COLOR_BLUE}###########################################################################################${COLOR_RESET}" - start_end::group_end -} - - # # Trap function executed always at the end of the script. In case of verbose output it also # Prints the exit code that the script exits with. Removes verbosity of commands in case it was run with @@ -106,13 +91,6 @@ function start_end::script_end { echo echo "${COLOR_RED}ERROR: The previous step completed with error. Please take a look at output above ${COLOR_RESET}" echo - if [[ ${CI} == "true" ]]; then - local container - for container in $(docker ps --format '{{.Names}}') - do - start_end::dump_container_logs "${container}" - done - fi verbosity::print_info "${COLOR_RED}###########################################################################################${COLOR_RESET}" verbosity::print_info "${COLOR_RED} EXITING WITH STATUS CODE ${exit_code}${COLOR_RESET}" verbosity::print_info "${COLOR_RED}###########################################################################################${COLOR_RESET}" diff --git a/scripts/ci/libraries/_testing.sh b/scripts/ci/libraries/_testing.sh index 3c66a203469d3..6b387e70d53eb 100644 --- a/scripts/ci/libraries/_testing.sh +++ b/scripts/ci/libraries/_testing.sh @@ -114,3 +114,17 @@ function testing::get_test_types_to_run() { fi readonly TEST_TYPES } + +function testing::dump_container_logs() { + start_end::group_start "${COLOR_BLUE}Dumping container logs ${container}${COLOR_RESET}" + local container="${1}" + local dump_file + dump_file=${AIRFLOW_SOURCES}/files/container_logs_${container}_$(date "+%Y-%m-%d")_${CI_BUILD_ID}_${CI_JOB_ID}.log + echo "${COLOR_BLUE}###########################################################################################${COLOR_RESET}" + echo " Dumping logs from ${container} container" + echo "${COLOR_BLUE}###########################################################################################${COLOR_RESET}" + docker_v logs "${container}" > "${dump_file}" + echo " Container ${container} logs dumped to ${dump_file}" + echo "${COLOR_BLUE}###########################################################################################${COLOR_RESET}" + start_end::group_end +} diff --git 
a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index 90e5b0d0f3483..c05466f5ee020 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -126,6 +126,15 @@ function run_airflow_testing_in_docker() { run airflow "${@}" docker ps exit_code=$? + if [[ ${exit_code} != "0" && ${CI} == "true" ]]; then + docker ps --all + local container + for container in $(docker ps --all --format '{{.Names}}') + do + testing::dump_container_logs "${container}" + done + fi + docker-compose --log-level INFO -f "${SCRIPTS_CI_DIR}/docker-compose/base.yml" \ "${INTEGRATIONS[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ From cd558102f7e1cc712d557b1876ecbbf4288714b8 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 18 Nov 2021 09:17:35 +0100 Subject: [PATCH 056/250] Add more complete instruction for reproducing failed integration tests (#19646) When integration tests are failing, breeze prints the exact reproduction step to recreate the same environment. However when integration tests were enabled it missed the --integration flags that were necessary to enable the integrations. This PR adds the --integration flags to the instructions and also adds the comment that Kerberos integration currently does not work with docker-compose v2. (cherry picked from commit 7b700bbe32e18708bd0affbc59c43ce9b3420e28) --- .../ci/testing/ci_run_single_airflow_test_in_docker.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index c05466f5ee020..596bea14d5838 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -25,6 +25,7 @@ export PRINT_INFO_FROM_SCRIPTS DOCKER_COMPOSE_LOCAL=() INTEGRATIONS=() +INTEGRATION_BREEZE_FLAGS=() function prepare_tests() { DOCKER_COMPOSE_LOCAL+=("-f" "${SCRIPTS_CI_DIR}/docker-compose/files.yml") @@ -59,8 +60,8 @@ function prepare_tests() { for _INT in ${ENABLED_INTEGRATIONS} do - INTEGRATIONS+=("-f") - INTEGRATIONS+=("${SCRIPTS_CI_DIR}/docker-compose/integration-${_INT}.yml") + INTEGRATIONS+=("-f" "${SCRIPTS_CI_DIR}/docker-compose/integration-${_INT}.yml") + INTEGRATION_BREEZE_FLAGS+=("--integration" "${_INT}") done readonly INTEGRATIONS @@ -158,8 +159,8 @@ function run_airflow_testing_in_docker() { echo "${COLOR_RED}***********************************************************************************************${COLOR_RESET}" echo echo "${COLOR_BLUE}***********************************************************************************************${COLOR_RESET}" - echo "${COLOR_BLUE}Reproduce the failed tests on your local machine:${COLOR_RESET}" - echo "${COLOR_YELLOW}./breeze --github-image-id ${GITHUB_REGISTRY_PULL_IMAGE_TAG=} --backend ${BACKEND} ${EXTRA_ARGS}--python ${PYTHON_MAJOR_MINOR_VERSION} --db-reset --skip-mounting-local-sources --test-type ${TEST_TYPE} shell${COLOR_RESET}" + echo "${COLOR_BLUE}Reproduce the failed tests on your local machine (note that you need to use docker-compose v1 rather than v2 to enable Kerberos integration):${COLOR_RESET}" + echo "${COLOR_YELLOW}./breeze --github-image-id ${GITHUB_REGISTRY_PULL_IMAGE_TAG=} --backend ${BACKEND} ${EXTRA_ARGS}--python ${PYTHON_MAJOR_MINOR_VERSION} --db-reset --skip-mounting-local-sources --test-type ${TEST_TYPE} ${INTEGRATION_BREEZE_FLAGS[*]} 
shell${COLOR_RESET}" echo "${COLOR_BLUE}Then you can run failed tests with:${COLOR_RESET}" echo "${COLOR_YELLOW}pytest [TEST_NAME]${COLOR_RESET}" echo "${COLOR_BLUE}***********************************************************************************************${COLOR_RESET}" From 8a79d7ecfeefa946567675a3c9ee8568bcf7d64c Mon Sep 17 00:00:00 2001 From: Ash Berlin-Taylor Date: Thu, 18 Nov 2021 12:50:14 +0000 Subject: [PATCH 057/250] Fix CI tests so they correctly fail in case of error! (#19678) (cherry picked from commit 889f1571259ae5ce83fb8723ac2d10cd21dc9d50) --- scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index 596bea14d5838..dd5c27a9aefe4 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -125,8 +125,8 @@ function run_airflow_testing_in_docker() { "${DOCKER_COMPOSE_LOCAL[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ run airflow "${@}" - docker ps exit_code=$? + docker ps if [[ ${exit_code} != "0" && ${CI} == "true" ]]; then docker ps --all local container From 32768645e291e7dfcac2618028f9b4d5b7bd787d Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 19 Nov 2021 10:38:28 +0100 Subject: [PATCH 058/250] Fix speed of yarn installation (#19697) The --network-concurrency=1 is very slow and even if this has been added in #17293 to battle connection refused, it slows regular builds far too much. There is a new optimisation in progress that should significantly reduce the yarn installations on kind-cluster deploy: #19210 and it should solve the problem much better. (cherry picked from commit 43e84ccb8c909ad78862a9411ab72dbac6c7169c) --- scripts/docker/compile_www_assets.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker/compile_www_assets.sh b/scripts/docker/compile_www_assets.sh index 50e1318c548a4..59a7017fd157f 100755 --- a/scripts/docker/compile_www_assets.sh +++ b/scripts/docker/compile_www_assets.sh @@ -35,7 +35,7 @@ function compile_www_assets() { www_dir="$(python -m site --user-site)/airflow/www" fi pushd ${www_dir} || exit 1 - yarn install --frozen-lockfile --no-cache --network-concurrency=1 + yarn install --frozen-lockfile --no-cache yarn run prod find package.json yarn.lock static/css static/js -type f | sort | xargs md5sum > "${md5sum_file}" rm -rf "${www_dir}/node_modules" From 84c523d841347f7c4b10ff32869d2792c14077fa Mon Sep 17 00:00:00 2001 From: Ash Berlin-Taylor Date: Fri, 19 Nov 2021 20:16:47 +0000 Subject: [PATCH 059/250] Speed up webserver start up in Kube tests (#19710) Thanks to a previous change to not load provider hooks too early we can take advantage of the "preload-app" feature of Gunicorn to load the application once in the main gunicorn process before the workers are forked off. This change makes the webserver start up (time to serving first request) go from 20s to 5s. (The reason we don't just do this blindly everywhere is that it would mean plugins are loaded at start only, and is a change in behaviour. But in tests this is fine.) 
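The same two settings can be tried outside the test image as well. A minimal sketch, assuming a shell where the webserver is started directly (both variables are exactly the ones baked into the test image by the change below):

```bash
# Gunicorn picks up GUNICORN_CMD_ARGS as extra command-line options, so the
# Airflow app is imported once in the master process before workers fork.
export GUNICORN_CMD_ARGS='--preload'
# Disabling periodic worker refresh avoids re-forking (and re-importing the
# app in) workers during the short-lived test deployment.
export AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL=0
airflow webserver
```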
(cherry picked from commit 17d8656a0745a76925a51de6f6e7ce2488d1d2f4) --- scripts/ci/libraries/_kind.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/ci/libraries/_kind.sh b/scripts/ci/libraries/_kind.sh index 5b3ae69a11746..1322a1c5d1ba0 100644 --- a/scripts/ci/libraries/_kind.sh +++ b/scripts/ci/libraries/_kind.sh @@ -270,6 +270,8 @@ COPY airflow/example_dags/ \${AIRFLOW_HOME}/dags/ COPY airflow/kubernetes_executor_templates/ \${AIRFLOW_HOME}/pod_templates/ +ENV GUNICORN_CMD_ARGS='--preload' AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL=0 + EOF echo "The ${AIRFLOW_IMAGE_KUBERNETES}:${image_tag} is prepared for test kubernetes deployment." } From 01de1b7b7e3cc86ebd119cb3d04c47a210644610 Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Sun, 21 Nov 2021 09:04:08 +0000 Subject: [PATCH 060/250] Remove duplicate line call in CI (#19728) (cherry picked from commit c6c662798b3cdee88a760cfeccd1be83ec7922f2) --- scripts/ci/testing/ci_run_airflow_testing.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index 27ded3ecec2e2..9412c56473c70 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -40,7 +40,6 @@ function run_test_types_in_parallel() { do export TEST_TYPE mkdir -p "${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}" - mkdir -p "${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}" export JOB_LOG="${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}/stdout" export PARALLEL_JOB_STATUS="${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${TEST_TYPE}/status" # Each test job will get SIGTERM followed by SIGTERM 200ms later and SIGKILL 200ms later after 45 mins From 871271065b67c39aa6ae1d94148ac522984502d9 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 23 Nov 2021 15:16:15 +0100 Subject: [PATCH 061/250] Lower the recommended disk space requirements (#19775) The recommended disk space requirements for Breeze were set to 40GB which is way to high (and our Public Runners do not have that much of a disk space - this generated false warnings). Lowering it to 20GB should be quite enough for most "casual" users. (cherry picked from commit 5901f79bb61d3e56f1350d43448edb7e2c28961e) --- scripts/in_container/run_resource_check.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/in_container/run_resource_check.sh b/scripts/in_container/run_resource_check.sh index f4739cdaeba47..584af350d324b 100755 --- a/scripts/in_container/run_resource_check.sh +++ b/scripts/in_container/run_resource_check.sh @@ -46,9 +46,9 @@ function resource_check() { else echo "* CPUs available ${cpus_available}. ${COLOR_GREEN}OK.${COLOR_RESET}" fi - if (( disk_available < one_meg*40 )); then + if (( disk_available < one_meg*20 )); then echo "${COLOR_YELLOW}WARNING!!!: Not enough Disk space available for Docker.${COLOR_RESET}" - echo "At least 40 GBs recommended. You have ${human_readable_disk}" + echo "At least 20 GBs recommended. You have ${human_readable_disk}" warning_resources="true" else echo "* Disk available ${human_readable_disk}. ${COLOR_GREEN}OK.${COLOR_RESET}" From 500946ad7f939d4e19fe37f84bcc020d58095b74 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 23 Nov 2021 15:17:16 +0100 Subject: [PATCH 062/250] Run Other tests sequentially for Public GitHub runners (#19766) The Other tests take a lot of memory (> 1GB when tests of webserver are running). 
This causes OOM issues for Public GitHub runners when those tests are run in parallel to other tests. This PR add Other to sequentially run tests which will make sure they are not run in parallel with any other tests. (cherry picked from commit ba6909494c4de94b41d4abebea8d4f66e05c9c64) --- scripts/ci/testing/ci_run_airflow_testing.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index 9412c56473c70..46c191ac89cf4 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -97,6 +97,11 @@ function run_all_test_types_in_parallel() { test_types_to_run="${test_types_to_run//Core/}" sequential_tests+=("Core") fi + if [[ ${test_types_to_run} == *"Other"* ]]; then + echo "${COLOR_YELLOW}Remove Other from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" + test_types_to_run="${test_types_to_run//Other/}" + sequential_tests+=("Other") + fi if [[ ${BACKEND} == "mssql" || ${BACKEND} == "mysql" ]]; then # For mssql/mysql - they take far more memory than postgres (or sqlite) - we skip the Provider # tests altogether as they take too much memory even if run sequentially. From da2f943ab304580eb7d3b2c40c42055b96eba2ff Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 23 Nov 2021 15:30:39 +0100 Subject: [PATCH 063/250] Add option to run PRs on public runners by maintainers. (#19772) When `use public runners` label is applied to a PR, that PR will run on Public Runners even if it is created by the maintainer. (cherry picked from commit 9963c9cdd08487eaf264ae4c7b32ecdd1d4cdd54) --- .github/workflows/ci.yml | 29 ++++++++++++++++++----------- PULL_REQUEST_WORKFLOW.rst | 10 +++------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d4e06a62325e3..4283f8115236a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -170,17 +170,6 @@ jobs: pullRequestLabels: ${{ steps.source-run-info.outputs.pullRequestLabels }} runsOn: ${{ steps.set-runs-on.outputs.runsOn }} steps: - # Avoid having to specify the runs-on logic every time. We use the custom - # env var AIRFLOW_SELF_HOSTED_RUNNER set only on our runners, but never - # on the public runners - - name: Set runs-on - id: set-runs-on - run: | - if [[ ${AIRFLOW_SELF_HOSTED_RUNNER} != "" ]]; then - echo "::set-output name=runsOn::\"self-hosted\"" - else - echo "::set-output name=runsOn::\"ubuntu-20.04\"" - fi - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" uses: actions/checkout@v2 with: @@ -218,6 +207,24 @@ jobs: # Run all checks ./scripts/ci/selective_ci_checks.sh fi + # Avoid having to specify the runs-on logic every time. 
We use the custom + # env var AIRFLOW_SELF_HOSTED_RUNNER set only on our runners, but never + # on the public runners + - name: Set runs-on + id: set-runs-on + env: + PR_LABELS: "${{ steps.source-run-info.outputs.pullRequestLabels }}" + run: | + if [[ ${PR_LABELS=} == *"use public runners"* ]]; then + echo "Forcing running on Public Runners via `use public runners` label" + echo "::set-output name=runsOn::\"ubuntu-20.04\"" + elif [[ ${AIRFLOW_SELF_HOSTED_RUNNER} == "" ]]; then + echo "Regular PR running with Public Runner" + echo "::set-output name=runsOn::\"ubuntu-20.04\"" + else + echo "Maintainer or main run running with self-hosted runner" + echo "::set-output name=runsOn::\"self-hosted\"" + fi tests-ui: timeout-minutes: 10 diff --git a/PULL_REQUEST_WORKFLOW.rst b/PULL_REQUEST_WORKFLOW.rst index db3006360ec30..b18b98039391b 100644 --- a/PULL_REQUEST_WORKFLOW.rst +++ b/PULL_REQUEST_WORKFLOW.rst @@ -81,13 +81,9 @@ We approached the problem by: More about it can be found in `Approval workflow and Matrix tests <#approval-workflow-and-matrix-tests>`_ chapter. -4) We've also applied (and received) funds to run self-hosted runners. This is not yet implemented, due to - discussions about security of self-hosted runners for public repositories. Running self-hosted runners by - public repositories is currently (as of end of October 2020) - `Discouraged by GitHub `_ - and we are working on solving the problem - also involving Apache Software Foundation infrastructure team. - This document does not describe this part of the approach. Most likely we will add soon a document - describing details of the approach taken there. +4) We've also applied (and received) funds to run self-hosted runners. They are used for ``main`` runs + and whenever the PRs are done by one of the maintainers. Maintainers can force using Public GitHub runners + by applying "use public runners" label to the PR before submitting it. 
Selective CI Checks ------------------- From c931b9c872a48b8b364f61ad95a00bdfd6a7443c Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 23 Nov 2021 13:55:12 -0700 Subject: [PATCH 064/250] Sync committers in ci config for self-hosted runners (#19786) (cherry picked from commit f865c6187ea2906258b7cb25cf0a70a100c6bcdf) --- .github/workflows/ci.yml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4283f8115236a..3910e6b5b0843 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -99,12 +99,20 @@ jobs: "aoen", "artwr", "ashb", + "bbovenzi", "bolkedebruin", "criccomini", "dimberman", + "dstandish", + "eladkal", + "ephraimbuddy", + "feluelle", "feng-tao", "houqp", + "jedcunningham", + "jgao54", "jghoman", + "jhtimmins", "jmcarp", "kaxil", "leahecole", @@ -119,13 +127,11 @@ jobs: "saguziel", "sekikn", "turbaszek", - "zhongjiajie", - "ephraimbuddy", - "jhtimmins", - "dstandish", + "uranusjr", + "vikramkoka", "xinbinhuang", - "yuqian", - "eladkal" + "yuqian90", + "zhongjiajie" ]'), github.event.pull_request.user.login) ) && github.repository == 'apache/airflow' ) && 'self-hosted' || 'ubuntu-20.04' }} From 069158e543c85c903b7bd7ab9f5927c20d5def9b Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 24 Nov 2021 03:14:49 -0700 Subject: [PATCH 065/250] Add note to restart runners when updating committers (#19795) Also link to a script that can generate the list of committers (cherry picked from commit d69b4c9dc82b6c35c387bb819b95cf41fb974ab8) --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3910e6b5b0843..06b4be1d3f57f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -84,6 +84,10 @@ jobs: # When changing this list, ensure that it is kept in sync with the # /runners/apache/airflow/configOverlay # parameter in AWS SSM ParameterStore (which is what the runner uses) + # and restart the self-hosted runners. 
+ # + # This list of committers can be generated with: + # https://github.com/apache/airflow-ci-infra/blob/main/scripts/list_committers runs-on: >- ${{ ( ( From f9c444cbaa540582b1d986ed1555626e90d422ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Bregu=C5=82a?= Date: Wed, 24 Nov 2021 17:59:47 +0100 Subject: [PATCH 066/250] Tests for Docker images in Python (#19737) (cherry picked from commit 621d17bb77e3160c1a927803e5d190c0e2aade3c) --- .github/CODEOWNERS | 5 +- .github/boring-cyborg.yml | 2 +- .pre-commit-config.yaml | 1 + docker_tests/__init__.py | 16 + docker_tests/ci_image.py | 41 ++ docker_tests/docker_tests_utils.py | 116 ++++++ docker_tests/prod_image.py | 206 ++++++++++ scripts/ci/docker-compose/local.yml | 1 + scripts/ci/images/ci_run_docker_tests.py | 101 +++++ .../ci_wait_for_and_verify_all_ci_images.sh | 2 + .../ci_wait_for_and_verify_all_prod_images.sh | 2 + scripts/ci/installed_providers.txt | 22 ++ scripts/ci/libraries/_build_images.sh | 3 +- scripts/ci/libraries/_initialization.sh | 24 -- scripts/ci/libraries/_local_mounts.sh | 1 + scripts/ci/libraries/_verify_image.sh | 366 +----------------- scripts/ci/tools/verify_docker_image.sh | 57 --- 17 files changed, 519 insertions(+), 447 deletions(-) create mode 100644 docker_tests/__init__.py create mode 100644 docker_tests/ci_image.py create mode 100644 docker_tests/docker_tests_utils.py create mode 100644 docker_tests/prod_image.py create mode 100755 scripts/ci/images/ci_run_docker_tests.py create mode 100644 scripts/ci/installed_providers.txt delete mode 100755 scripts/ci/tools/verify_docker_image.sh diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f993eac40fef7..92b5ce2fbf627 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -50,8 +50,9 @@ /.github/workflows/ @potiuk @ashb @kaxil breeze @potiuk breeze-complete @potiuk -Dockerfile @potiuk @ashb +Dockerfile @potiuk @ashb @mik-laj Dockerfile.ci @potiuk @ashb /dev/ @potiuk @ashb @kaxil /provider_packages/ @potiuk @ashb -/scripts/ @potiuk @ashb +/scripts/ @potiuk @ashb @mik-laj +/docker_tests/ @potiuk @ashb @mik-laj diff --git a/.github/boring-cyborg.yml b/.github/boring-cyborg.yml index f1a6c4b51dde1..765caa995be8e 100644 --- a/.github/boring-cyborg.yml +++ b/.github/boring-cyborg.yml @@ -190,8 +190,8 @@ labelPRBasedOnFilePath: - Dockerfile - docs/docker-stack/**/* - scripts/in_container/prod/* - - scripts/ci/tools/verify_docker_image.sh - scripts/ci/libraries/_verify_image.sh + - docker_tests/**/* # Various Flags to control behaviour of the "Labeler" labelerFlags: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad985d108315b..5ba9cc682f1f4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -239,6 +239,7 @@ repos: ^scripts/.*\.py$| ^dev| ^provider_packages| + ^docker_tests| ^kubernetes_tests| .*example_dags/.*| ^chart/.*\.py$| diff --git a/docker_tests/__init__.py b/docker_tests/__init__.py new file mode 100644 index 0000000000000..13a83393a9124 --- /dev/null +++ b/docker_tests/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/docker_tests/ci_image.py b/docker_tests/ci_image.py new file mode 100644 index 0000000000000..25a9bfb73ac5d --- /dev/null +++ b/docker_tests/ci_image.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import subprocess + +from docker_tests.docker_tests_utils import ( + display_dependency_conflict_message, + docker_image, + run_bash_in_docker, + run_command, +) + + +class TestFiles: + def test_dist_folder_should_exists(self): + run_bash_in_docker('[ -f /opt/airflow/airflow/www/static/dist/manifest.json ] || exit 1') + + +class TestPythonPackages: + def test_pip_dependencies_conflict(self): + try: + run_command( + ["docker", "run", "--rm", "--entrypoint", "/bin/bash", docker_image, "-c", 'pip check'] + ) + except subprocess.CalledProcessError as ex: + display_dependency_conflict_message() + raise ex diff --git a/docker_tests/docker_tests_utils.py b/docker_tests/docker_tests_utils.py new file mode 100644 index 0000000000000..b0155b0fb7964 --- /dev/null +++ b/docker_tests/docker_tests_utils.py @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os +import shlex +import subprocess +from pathlib import Path +from typing import List + +docker_image = os.environ.get('DOCKER_IMAGE') +SOURCE_ROOT = Path(__file__).resolve().parents[1] + +if not docker_image: + raise Exception("The DOCKER_IMAGE environment variable is required") + + +def run_command(cmd: List[str], print_output_on_error: bool = True, **kwargs): + print(f"$ {' '.join(shlex.quote(c) for c in cmd)}") + try: + return subprocess.check_output(cmd, **kwargs).decode() + except subprocess.CalledProcessError as ex: + if print_output_on_error: + print("========================= OUTPUT start ============================") + print(ex.stderr) + print(ex.stdout) + print("========================= OUTPUT end ============================") + raise + + +def run_bash_in_docker(bash_script, **kwargs): + docker_command = [ + "docker", + "run", + "--rm", + "-e", + "COLUMNS=180", + "--entrypoint", + "/bin/bash", + docker_image, + "-c", + bash_script, + ] + return run_command(docker_command, **kwargs) + + +def run_python_in_docker(python_script, **kwargs): + docker_command = [ + "docker", + "run", + "--rm", + "-e", + "COLUMNS=180", + "-e", + "PYTHONDONTWRITEBYTECODE=true", + docker_image, + "python", + "-c", + python_script, + ] + return run_command(docker_command, **kwargs) + + +def display_dependency_conflict_message(): + print( + """ +***** Beginning of the instructions **** + +The image did not pass 'pip check' verification. This means that there are some conflicting dependencies +in the image. + +It can mean one of those: + +1) The main is currently broken (other PRs will fail with the same error) +2) You changed some dependencies in setup.py or setup.cfg and they are conflicting. + + + +In case 1) - apologies for the trouble.Please let committers know and they will fix it. You might +be asked to rebase to the latest main after the problem is fixed. + +In case 2) - Follow the steps below: + +* try to build CI and then PROD image locally with breeze, adding --upgrade-to-newer-dependencies flag + (repeat it for all python versions) + +CI image: + + ./breeze build-image --upgrade-to-newer-dependencies --python 3.6 + +Production image: + + ./breeze build-image --production-image --upgrade-to-newer-dependencies --python 3.6 + +* You will see error messages there telling which requirements are conflicting and which packages caused the + conflict. Add the limitation that caused the conflict to EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS + variable in Dockerfile.ci. Note that the limitations might be different for Dockerfile.ci and Dockerfile + because not all packages are installed by default in the PROD Dockerfile. So you might find that you + only need to add the limitation to the Dockerfile.ci + +***** End of the instructions **** +""" + ) diff --git a/docker_tests/prod_image.py b/docker_tests/prod_image.py new file mode 100644 index 0000000000000..cac517c80ed99 --- /dev/null +++ b/docker_tests/prod_image.py @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +import subprocess +import tempfile +from pathlib import Path + +import pytest + +from docker_tests.docker_tests_utils import ( + SOURCE_ROOT, + display_dependency_conflict_message, + docker_image, + run_bash_in_docker, + run_command, + run_python_in_docker, +) + +INSTALLED_PROVIDER_PATH = SOURCE_ROOT / "scripts" / "ci" / "installed_providers.txt" + + +class TestCommands: + def test_without_command(self): + """Checking the image without a command. It should return non-zero exit code.""" + with pytest.raises(subprocess.CalledProcessError) as ctx: + run_command(["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image]) + assert 2 == ctx.value.returncode + + def test_airflow_command(self): + """Checking 'airflow' command It should return non-zero exit code.""" + with pytest.raises(subprocess.CalledProcessError) as ctx: + run_command(["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "airflow"]) + assert 2 == ctx.value.returncode + + def test_airflow_version(self): + """Checking 'airflow version' command It should return zero exit code.""" + output = run_command( + ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "airflow", "version"] + ) + assert "2." in output + + def test_python_version(self): + """Checking 'python --version' command It should return zero exit code.""" + output = run_command( + ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "python", "--version"] + ) + assert "Python 3." in output + + def test_bash_version(self): + """Checking 'bash --version' command It should return zero exit code.""" + output = run_command( + ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "bash", "--version"] + ) + assert "GNU bash," in output + + +class TestPythonPackages: + def test_required_providers_are_installed(self): + lines = (d.strip() for d in INSTALLED_PROVIDER_PATH.read_text().splitlines()) + lines = (d for d in lines) + packages_to_install = {f"apache-airflow-providers-{d.replace('.', '-')}" for d in lines} + assert len(packages_to_install) != 0 + + output = run_bash_in_docker("airflow providers list --output json", stderr=subprocess.DEVNULL) + providers = json.loads(output) + packages_installed = {d['package_name'] for d in providers} + assert len(packages_installed) != 0 + + assert packages_to_install == packages_installed, ( + f"List of expected installed packages and image content mismatch. " + f"Check {INSTALLED_PROVIDER_PATH} file." 
+ ) + + def test_pip_dependencies_conflict(self): + try: + run_bash_in_docker("pip check") + except subprocess.CalledProcessError as ex: + display_dependency_conflict_message() + raise ex + + PACKAGE_IMPORTS = { + "amazon": ["boto3", "botocore", "watchtower"], + "async": ["gevent", "eventlet", "greenlet"], + "azure": [ + 'azure.batch', + 'azure.cosmos', + 'azure.datalake.store', + 'azure.identity', + 'azure.keyvault', + 'azure.kusto.data', + 'azure.mgmt.containerinstance', + 'azure.mgmt.datalake.store', + 'azure.mgmt.resource', + 'azure.storage', + ], + "celery": ["celery", "flower", "vine"], + "cncf.kubernetes": ["kubernetes", "cryptography"], + "dask": ["cloudpickle", "distributed"], + "docker": ["docker"], + "elasticsearch": ["elasticsearch", "es.elastic", "elasticsearch_dsl"], + "google": [ + 'OpenSSL', + 'google.ads', + 'googleapiclient', + 'google.auth', + 'google_auth_httplib2', + 'google.cloud.automl', + 'google.cloud.bigquery_datatransfer', + 'google.cloud.bigtable', + 'google.cloud.container', + 'google.cloud.datacatalog', + 'google.cloud.dataproc', + 'google.cloud.dlp', + 'google.cloud.kms', + 'google.cloud.language', + 'google.cloud.logging', + 'google.cloud.memcache', + 'google.cloud.monitoring', + 'google.cloud.oslogin', + 'google.cloud.pubsub', + 'google.cloud.redis', + 'google.cloud.secretmanager', + 'google.cloud.spanner', + 'google.cloud.speech', + 'google.cloud.storage', + 'google.cloud.tasks', + 'google.cloud.texttospeech', + 'google.cloud.translate', + 'google.cloud.videointelligence', + 'google.cloud.vision', + ], + "grpc": ["grpc", "google.auth", "google_auth_httplib2"], + "hashicorp": ["hvac"], + "ldap": ["ldap"], + "mysql": ["mysql"], + "postgres": ["psycopg2"], + "pyodbc": ["pyodbc"], + "redis": ["redis"], + "sendgrid": ["sendgrid"], + "sftp/ssh": ["paramiko", "pysftp", "sshtunnel"], + "slack": ["slack_sdk"], + "statsd": ["statsd"], + "virtualenv": ["virtualenv"], + } + + @pytest.mark.parametrize("package_name,import_names", PACKAGE_IMPORTS.items()) + def test_check_dependencies_imports(self, package_name, import_names): + run_python_in_docker(f"import {','.join(import_names)}") + + +class TestExecuteAsRoot: + def test_execute_airflow_as_root(self): + run_command( + [ + "docker", + "run", + "--rm", + "--user", + "0", + "-e", + "PYTHONDONTWRITEBYTECODE=true", + docker_image, + "airflow", + "info", + ] + ) + + def test_run_custom_python_packages_as_root(self): + with tempfile.TemporaryDirectory() as tmp_dir: + (Path(tmp_dir) / "__init__.py").write_text('') + (Path(tmp_dir) / "awesome.py").write_text('print("Awesome")') + + run_command( + [ + "docker", + "run", + "--rm", + "-e", + f"PYTHONPATH={tmp_dir}", + "-e", + "PYTHONDONTWRITEBYTECODE=true", + "-v", + f"{tmp_dir}:{tmp_dir}", + "--user", + "0", + docker_image, + "python", + "-c", + "import awesome", + ] + ) diff --git a/scripts/ci/docker-compose/local.yml b/scripts/ci/docker-compose/local.yml index 0f144b5c3cf25..d5c183683f113 100644 --- a/scripts/ci/docker-compose/local.yml +++ b/scripts/ci/docker-compose/local.yml @@ -54,6 +54,7 @@ services: - ../../../setup.py:/opt/airflow/setup.py:cached - ../../../tests:/opt/airflow/tests:cached - ../../../kubernetes_tests:/opt/airflow/kubernetes_tests:cached + - ../../../docker_tests:/opt/airflow/docker_tests:cached - ../../../chart:/opt/airflow/chart:cached - ../../../metastore_browser:/opt/airflow/metastore_browser:cached # END automatically generated volumes from LOCAL_MOUNTS in _local_mounts.sh diff --git a/scripts/ci/images/ci_run_docker_tests.py 
b/scripts/ci/images/ci_run_docker_tests.py new file mode 100755 index 0000000000000..af81cb4964ed3 --- /dev/null +++ b/scripts/ci/images/ci_run_docker_tests.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +import shlex +import subprocess +import sys +from pathlib import Path +from typing import List + +AIRFLOW_SOURCE = Path(__file__).resolve().parent.parent.parent +BUILD_CACHE_DIR = AIRFLOW_SOURCE / ".build" + +CBLUE = '\033[94m' +CEND = '\033[0m' + + +def get_parser(): + parser = argparse.ArgumentParser( + prog="ci_run_docker_tests", + description="Running Docker tests using pytest", + epilog="Unknown arguments are passed unchanged to Pytest.", + ) + parser.add_argument( + "--interactive", + "-i", + action='store_true', + help="Activates virtual environment ready to run tests and drops you in", + ) + parser.add_argument("--initialize", action="store_true", help="Initialize virtual environment and exit") + parser.add_argument("pytestopts", nargs=argparse.REMAINDER, help="Tests to run") + return parser + + +def run_verbose(cmd: List[str], **kwargs): + print(f"{CBLUE}$ {' '.join(shlex.quote(c) for c in cmd)}{CEND}") + subprocess.run(cmd, **kwargs) + + +def create_virtualenv(): + virtualenv_path = ( + BUILD_CACHE_DIR / ".docker_venv" / f"host_python_{sys.version_info[0]}.{sys.version_info[1]}" + ) + virtualenv_path.parent.mkdir(parents=True, exist_ok=True) + if not virtualenv_path.exists(): + print("Creating virtualenv environment") + run_verbose([sys.executable, "-m", "venv", str(virtualenv_path)]) + + python_bin = virtualenv_path / "bin" / "python" + run_verbose([str(python_bin), "-m", "pip", "install", "pytest", "pytest-xdist"]) + return python_bin + + +def main(): + parser = get_parser() + args = parser.parse_args() + + python_bin = create_virtualenv() + + if args.initialize: + return + if args.interactive: + activate_bin = python_bin.parent / "activate" + bash_trampoline = f"source {shlex.quote(str(activate_bin))}" + print("To enter virtual environment, run:") + print(f" {bash_trampoline}") + return + + extra_pytest_args = ( + args.pytestopts[1:] if args.pytestopts and args.pytestopts[0] == "--" else args.pytestopts + ) + if not extra_pytest_args: + raise SystemExit("You must select the tests to run.") + + pytest_args = ( + "--pythonwarnings=ignore::DeprecationWarning", + "--pythonwarnings=ignore::PendingDeprecationWarning", + "-n", + "auto", + ) + + run_verbose([str(python_bin), "-m", "pytest", *pytest_args, *extra_pytest_args]) + + +if __name__ == "__main__": + main() diff --git a/scripts/ci/images/ci_wait_for_and_verify_all_ci_images.sh b/scripts/ci/images/ci_wait_for_and_verify_all_ci_images.sh index cb45c1ff3f5da..0a31ad22f4f6f 100755 --- 
a/scripts/ci/images/ci_wait_for_and_verify_all_ci_images.sh +++ b/scripts/ci/images/ci_wait_for_and_verify_all_ci_images.sh @@ -23,6 +23,8 @@ LIBRARIES_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../libraries/" && pwd) # shellcheck source=scripts/ci/libraries/_all_libs.sh source "${LIBRARIES_DIR}/_all_libs.sh" +python3 "$( dirname "${BASH_SOURCE[0]}" )/ci_run_docker_tests.py" "--initialize" + initialization::set_output_color_variables export PARALLEL_TAIL_LENGTH=5 diff --git a/scripts/ci/images/ci_wait_for_and_verify_all_prod_images.sh b/scripts/ci/images/ci_wait_for_and_verify_all_prod_images.sh index 5df66a31ce81c..bd6c336b0d3a0 100755 --- a/scripts/ci/images/ci_wait_for_and_verify_all_prod_images.sh +++ b/scripts/ci/images/ci_wait_for_and_verify_all_prod_images.sh @@ -25,6 +25,8 @@ source "${LIBRARIES_DIR}/_all_libs.sh" initialization::set_output_color_variables +python3 "$( dirname "${BASH_SOURCE[0]}" )/ci_run_docker_tests.py" "--initialize" + export PARALLEL_TAIL_LENGTH=5 parallel::make_sure_gnu_parallel_is_installed diff --git a/scripts/ci/installed_providers.txt b/scripts/ci/installed_providers.txt new file mode 100644 index 0000000000000..c6b02bfae16b3 --- /dev/null +++ b/scripts/ci/installed_providers.txt @@ -0,0 +1,22 @@ +amazon +celery +cncf.kubernetes +docker +elasticsearch +ftp +google +grpc +hashicorp +http +imap +microsoft.azure +mysql +odbc +postgres +redis +sendgrid +sftp +slack +sqlite +sqlite +ssh diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index af038db61a9c4..8dc24d00fe084 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -938,7 +938,8 @@ function build_images::build_prod_images_from_locally_built_airflow_packages() { build_images::cleanup_docker_context_files # Build necessary provider packages - runs::run_prepare_provider_packages "${INSTALLED_PROVIDERS[@]}" + IFS=$'\n' read -d '' -r -a installed_providers < "${AIRFLOW_SOURCES}/scripts/ci/installed_providers.txt" + runs::run_prepare_provider_packages "${installed_providers[@]}" mv "${AIRFLOW_SOURCES}/dist/"* "${AIRFLOW_SOURCES}/docker-context-files/" # Build apache airflow packages diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 36cc29ed50fd8..87a64aba33c1a 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -416,30 +416,6 @@ function initialization::initialize_image_build_variables() { INSTALL_PROVIDERS_FROM_SOURCES=${INSTALL_PROVIDERS_FROM_SOURCES:="true"} export INSTALL_PROVIDERS_FROM_SOURCES - INSTALLED_PROVIDERS+=( - "amazon" - "celery" - "cncf.kubernetes" - "docker" - "elasticsearch" - "ftp" - "grpc" - "hashicorp" - "http" - "imap" - "google" - "microsoft.azure" - "mysql" - "postgres" - "redis" - "sendgrid" - "sqlite" - "sftp" - "slack" - "sqlite" - "ssh" - ) - export INSTALLED_PROVIDERS export INSTALLED_EXTRAS="async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,imap,ldap,google,microsoft.azure,mysql,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv" AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION:="21.2.4"} diff --git a/scripts/ci/libraries/_local_mounts.sh b/scripts/ci/libraries/_local_mounts.sh index c9cc70957e029..c74a71ff61bc8 100644 --- a/scripts/ci/libraries/_local_mounts.sh +++ b/scripts/ci/libraries/_local_mounts.sh @@ -50,6 +50,7 @@ function local_mounts::generate_local_mounts_list { "$prefix"setup.py:/opt/airflow/setup.py:cached "$prefix"tests:/opt/airflow/tests:cached 
"$prefix"kubernetes_tests:/opt/airflow/kubernetes_tests:cached + "$prefix"docker_tests:/opt/airflow/docker_tests:cached "$prefix"chart:/opt/airflow/chart:cached "$prefix"metastore_browser:/opt/airflow/metastore_browser:cached ) diff --git a/scripts/ci/libraries/_verify_image.sh b/scripts/ci/libraries/_verify_image.sh index 1b6b2700d9840..dfe507dde8eed 100644 --- a/scripts/ci/libraries/_verify_image.sh +++ b/scripts/ci/libraries/_verify_image.sh @@ -15,373 +15,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -function verify_image::run_command_in_image() { - docker_v run --rm \ - -e COLUMNS=180 \ - --entrypoint /bin/bash "${DOCKER_IMAGE}" \ - -c "${@}" -} - -IMAGE_VALID="true" - -function verify_image::check_command() { - DESCRIPTION="${1}" - COMMAND=${2} - set +e - echo -n "Feature: ${DESCRIPTION} " - local output - output=$(verify_image::run_command_in_image "${COMMAND}" 2>&1) - local res=$? - if [[ ${res} == "0" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - set -e -} - -function verify_image::verify_prod_image_commands() { - start_end::group_start "Checking command supports" - set +e - - echo -n "Feature: Checking the image without a command. It should return non-zero exit code." - local output - output=$(docker_v run --rm \ - -e COLUMNS=180 \ - "${DOCKER_IMAGE}" \ - 2>&1) - local res=$? - if [[ ${res} == "2" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - echo -n "Feature: Checking 'airflow' command It should return non-zero exit code." - output=$(docker_v run --rm \ - -e COLUMNS=180 \ - "${DOCKER_IMAGE}" \ - "airflow" 2>&1) - local res=$? - if [[ ${res} == "2" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - - echo -n "Feature: Checking 'airflow version' command It should return zero exit code." - output=$(docker_v run --rm \ - -e COLUMNS=180 \ - "${DOCKER_IMAGE}" \ - "airflow" "version" 2>&1) - local res=$? - if [[ ${res} == "0" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - - echo -n "Feature: Checking 'python --version' command It should return zero exit code." - output=$(docker_v run --rm \ - -e COLUMNS=180 \ - "${DOCKER_IMAGE}" \ - python --version | grep "Python 3." 2>&1) - local res=$? 
- if [[ ${res} == "0" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - echo -n "Feature: Checking 'bash --version' command It should return zero exit code." - output=$(docker_v run --rm \ - -e COLUMNS=180 \ - "${DOCKER_IMAGE}" \ - bash --version | grep "GNU bash, " 2>&1) - local res=$? - if [[ ${res} == "0" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - - set -e -} - -function verify_image::verify_prod_image_has_airflow_and_providers() { - start_end::group_start "Verify prod image: ${DOCKER_IMAGE}" - echo - echo "Checking if Providers are installed" - echo - - all_providers_installed_in_image=$(verify_image::run_command_in_image "airflow providers list --output table") - - echo - echo "Installed providers:" - echo - echo "${all_providers_installed_in_image}" - echo - local error="false" - for provider in "${INSTALLED_PROVIDERS[@]}"; do - echo -n "Verifying if provider ${provider} installed: " - if [[ ${all_providers_installed_in_image} == *"apache-airflow-providers-${provider//./-}"* ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - error="true" - fi - done - if [[ ${error} == "true" ]]; then - echo - echo "${COLOR_RED}ERROR: Some expected providers are not installed!${COLOR_RESET}" - echo - IMAGE_VALID="false" - else - echo - echo "${COLOR_GREEN}OK. All expected providers installed!${COLOR_RESET}" - echo - fi - start_end::group_end -} - -function verify_image::verify_ci_image_dependencies() { - start_end::group_start "Checking if Airflow dependencies are non-conflicting in ${DOCKER_IMAGE} image." - set +e - docker_v run --rm --entrypoint /bin/bash "${DOCKER_IMAGE}" -c 'pip check' - local res=$? - if [[ ${res} != "0" ]]; then - echo "${COLOR_RED}ERROR: ^^^ Some dependencies are conflicting. See instructions below on how to deal with it. ${COLOR_RESET}" - echo - build_images::inform_about_pip_check "" - IMAGE_VALID="false" - else - echo - echo "${COLOR_GREEN}OK. The ${DOCKER_IMAGE} image dependencies are consistent. ${COLOR_RESET}" - echo - fi - set -e - start_end::group_end -} - -function verify_image::verify_ci_image_has_dist_folder() { - start_end::group_start "Verify CI image dist folder (compiled www assets): ${DOCKER_IMAGE}" - - verify_image::check_command "Dist folder" '[ -f /opt/airflow/airflow/www/static/dist/manifest.json ] || exit 1' - - start_end::group_end -} - - -function verify_image::verify_prod_image_dependencies() { - start_end::group_start "Checking if Airflow dependencies are non-conflicting in ${DOCKER_IMAGE} image." - - set +e - verify_image::run_command_in_image 'pip check' - local res=$? - if [[ ${res} != "0" ]]; then - echo "${COLOR_RED}ERROR: ^^^ Some dependencies are conflicting. See instructions below on how to deal with it. ${COLOR_RESET}" - echo - build_images::inform_about_pip_check "--production " - IMAGE_VALID="false" - else - echo - echo "${COLOR_GREEN}OK. The ${DOCKER_IMAGE} image dependencies are consistent. 
${COLOR_RESET}" - echo - fi - set -e - start_end::group_end -} - -GOOGLE_IMPORTS=( - 'OpenSSL' - 'google.ads' - 'googleapiclient' - 'google.auth' - 'google_auth_httplib2' - 'google.cloud.automl' - 'google.cloud.bigquery_datatransfer' - 'google.cloud.bigtable' - 'google.cloud.container' - 'google.cloud.datacatalog' - 'google.cloud.dataproc' - 'google.cloud.dlp' - 'google.cloud.kms' - 'google.cloud.language' - 'google.cloud.logging' - 'google.cloud.memcache' - 'google.cloud.monitoring' - 'google.cloud.oslogin' - 'google.cloud.pubsub' - 'google.cloud.redis' - 'google.cloud.secretmanager' - 'google.cloud.spanner' - 'google.cloud.speech' - 'google.cloud.storage' - 'google.cloud.tasks' - 'google.cloud.texttospeech' - 'google.cloud.translate' - 'google.cloud.videointelligence' - 'google.cloud.vision' -) - -AZURE_IMPORTS=( - 'azure.batch' - 'azure.cosmos' - 'azure.datalake.store' - 'azure.identity' - 'azure.keyvault' - 'azure.kusto.data' - 'azure.mgmt.containerinstance' - 'azure.mgmt.datalake.store' - 'azure.mgmt.resource' - 'azure.storage' -) - -function verify_image::verify_production_image_python_modules() { - start_end::group_start "Verify prod image features: ${DOCKER_IMAGE}" - - verify_image::check_command "Import: async" "python -c 'import gevent, eventlet, greenlet'" - verify_image::check_command "Import: amazon" "python -c 'import boto3, botocore, watchtower'" - verify_image::check_command "Import: celery" "python -c 'import celery, flower, vine'" - verify_image::check_command "Import: cncf.kubernetes" "python -c 'import kubernetes, cryptography'" - verify_image::check_command "Import: docker" "python -c 'import docker'" - verify_image::check_command "Import: dask" "python -c 'import cloudpickle, distributed'" - verify_image::check_command "Import: elasticsearch" "python -c 'import elasticsearch,es.elastic, elasticsearch_dsl'" - verify_image::check_command "Import: grpc" "python -c 'import grpc, google.auth, google_auth_httplib2'" - verify_image::check_command "Import: hashicorp" "python -c 'import hvac'" - verify_image::check_command "Import: ldap" "python -c 'import ldap'" - for google_import in "${GOOGLE_IMPORTS[@]}" - do - verify_image::check_command "Import google: ${google_import}" "python -c 'import ${google_import}'" - done - for azure_import in "${AZURE_IMPORTS[@]}" - do - verify_image::check_command "Import azure: ${azure_import}" "python -c 'import ${azure_import}'" - done - verify_image::check_command "Import: mysql" "python -c 'import mysql'" - verify_image::check_command "Import: postgres" "python -c 'import psycopg2'" - verify_image::check_command "Import: redis" "python -c 'import redis'" - verify_image::check_command "Import: sendgrid" "python -c 'import sendgrid'" - verify_image::check_command "Import: sftp/ssh" "python -c 'import paramiko, pysftp, sshtunnel'" - verify_image::check_command "Import: slack" "python -c 'import slack_sdk'" - verify_image::check_command "Import: statsd" "python -c 'import statsd'" - verify_image::check_command "Import: virtualenv" "python -c 'import virtualenv'" - verify_image::check_command "Import: pyodbc" "python -c 'import pyodbc'" - - start_end::group_end -} - -function verify_image::verify_prod_image_as_root() { - start_end::group_start "Checking if the image can be run as root." - set +e - echo "Checking airflow as root" - local output - local res - output=$(docker_v run --rm --user 0 "${DOCKER_IMAGE}" "airflow" "info" 2>&1) - res=$? 
- if [[ ${res} == "0" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - - echo "Checking root container with custom PYTHONPATH" - local tmp_dir - tmp_dir="$(mktemp -d)" - touch "${tmp_dir}/__init__.py" - echo 'print("Awesome")' >> "${tmp_dir}/awesome.py" - output=$(docker_v run \ - --rm \ - -e "PYTHONPATH=${tmp_dir}" \ - -v "${tmp_dir}:${tmp_dir}" \ - --user 0 "${DOCKER_IMAGE}" \ - "python" "-c" "import awesome" \ - 2>&1) - res=$? - if [[ ${res} == "0" ]]; then - echo "${COLOR_GREEN}OK${COLOR_RESET}" - else - echo "${COLOR_RED}NOK${COLOR_RESET}" - echo "${COLOR_BLUE}========================= OUTPUT start ============================${COLOR_RESET}" - echo "${output}" - echo "${COLOR_BLUE}========================= OUTPUT end ===========================${COLOR_RESET}" - IMAGE_VALID="false" - fi - rm -rf "${tmp_dir}" - set -e -} - -function verify_image::verify_production_image_has_dist_folder() { - start_end::group_start "Verify prod image has dist folder (compiled www assets): ${DOCKER_IMAGE}" - # shellcheck disable=SC2016 - verify_image::check_command "Dist folder" '[ -f $(python -m site --user-site)/airflow/www/static/dist/manifest.json ] || exit 1' - - start_end::group_end -} - -function verify_image::display_result { - if [[ ${IMAGE_VALID} == "true" ]]; then - echo - echo "${COLOR_GREEN}OK. The ${DOCKER_IMAGE} features are all OK. ${COLOR_RESET}" - echo - else - echo - echo "${COLOR_RED}ERROR: Some features were not ok!${COLOR_RESET}" - echo - exit 1 - fi -} function verify_image::verify_prod_image { - IMAGE_VALID="true" DOCKER_IMAGE="${1}" - verify_image::verify_prod_image_commands - - verify_image::verify_prod_image_has_airflow_and_providers - - verify_image::verify_production_image_python_modules - - verify_image::verify_prod_image_dependencies - - verify_image::verify_prod_image_as_root - - verify_image::verify_production_image_has_dist_folder - - verify_image::display_result + export DOCKER_IMAGE + python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/prod_image.py" } function verify_image::verify_ci_image { - IMAGE_VALID="true" DOCKER_IMAGE="${1}" - verify_image::verify_ci_image_dependencies - - verify_image::verify_ci_image_has_dist_folder - - verify_image::display_result + export DOCKER_IMAGE + python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/ci_image.py" } diff --git a/scripts/ci/tools/verify_docker_image.sh b/scripts/ci/tools/verify_docker_image.sh deleted file mode 100755 index 3ef5e3e0e03e1..0000000000000 --- a/scripts/ci/tools/verify_docker_image.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# shellcheck source=scripts/ci/libraries/_script_init.sh -. "$(dirname "${BASH_SOURCE[0]}")/../libraries/_script_init.sh" - -usage() { -local cmdname -cmdname="$(basename -- "$0")" - -cat << EOF -Usage: ${cmdname} - -Verify the user-specified docker image. - -Image Type can be one of the two values: CI or PROD - -EOF -} - - -if [[ "$#" -ne 2 ]]; then - >&2 echo "You must provide two argument - image type [PROD/CI] and image name." - usage - exit 1 -fi - -IMAGE_TYPE="${1}" -IMAGE_NAME="${2}" - -if ! docker image inspect "${IMAGE_NAME}" &>/dev/null; then - >&2 echo "Image '${IMAGE_NAME}' doesn't exists in local registry." - exit 1 -fi - -if [ "$(echo "${IMAGE_TYPE}" | tr '[:lower:]' '[:upper:]')" = "PROD" ]; then - verify_image::verify_prod_image "${IMAGE_NAME}" -elif [ "$(echo "${IMAGE_TYPE}" | tr '[:lower:]' '[:upper:]')" = "CI" ]; then - verify_image::verify_ci_image "${IMAGE_NAME}" -else - >&2 echo "Unsupported image type. Supported values: PROD, CI" - exit 1 -fi From 92b8ac3679e37a485d5dcfb7a8675a417c4f6f9f Mon Sep 17 00:00:00 2001 From: Malthe Borch Date: Wed, 24 Nov 2021 18:54:38 +0100 Subject: [PATCH 067/250] Upload provider distribution artifacts during CI (#19807) (cherry picked from commit 5e78c2c3cc94da2328c3f664f80f00b86489e512) --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06b4be1d3f57f..db07fe1916208 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -582,6 +582,12 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" env: USE_AIRFLOW_VERSION: "sdist" PACKAGE_FORMAT: "sdist" + - name: "Upload provider distribution artifacts" + uses: actions/upload-artifact@v2 + with: + name: airflow-provider-packages + path: "./dist/apache-airflow-providers-*.tar.gz" + retention-days: 1 tests-helm: timeout-minutes: 40 From 2e3712402ee606c28cbcd17c433f395bc32026fb Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 24 Nov 2021 19:34:12 +0100 Subject: [PATCH 068/250] Fixes failure of image building (#19813) The read command introduced in #19737 returned non zer error code on encountering EOF, and our bash script fail on that. This PR makes sure that the return code is not taken into account for that command. 
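A short sketch of the failure mode, assuming any plain text file as input (providers.txt below is only a placeholder): read -d '' returns zero only when it actually sees its NUL delimiter, so reading a whole text file makes it hit EOF and report a non-zero status even though the array was filled, which aborts a script running under set -e unless that status is discarded:

    set -e
    # read hits EOF (there is no NUL delimiter in a text file) and exits non-zero,
    # so the '|| true' is what keeps the script from aborting at this line.
    IFS=$'\n' read -d '' -r -a installed_providers < providers.txt || true
    echo "Read ${#installed_providers[@]} providers"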
(cherry picked from commit 0dc51ea80ecbbe37e5679f5454c48c42700d9706) --- scripts/ci/libraries/_build_images.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index 8dc24d00fe084..9d5096bb64aff 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -938,7 +938,7 @@ function build_images::build_prod_images_from_locally_built_airflow_packages() { build_images::cleanup_docker_context_files # Build necessary provider packages - IFS=$'\n' read -d '' -r -a installed_providers < "${AIRFLOW_SOURCES}/scripts/ci/installed_providers.txt" + IFS=$'\n' read -d '' -r -a installed_providers < "${AIRFLOW_SOURCES}/scripts/ci/installed_providers.txt" || true runs::run_prepare_provider_packages "${installed_providers[@]}" mv "${AIRFLOW_SOURCES}/dist/"* "${AIRFLOW_SOURCES}/docker-context-files/" From 85cdc4da1a1f041a17f9cb9526ce57dd145c6eba Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 24 Nov 2021 19:36:26 +0100 Subject: [PATCH 069/250] Bring back Core and Other tests to be run in parallel (#19812) After merging #19809 we can very likely come back to parallel running of Core and Other tests as we separated them out thinking that the parallel runs were the cause of the problems. Those tests should be perfectly fine to run in parallel now. (cherry picked from commit 6c80149d0abf84caec8f4c1b4e8795ea5923f89a) --- scripts/ci/testing/ci_run_airflow_testing.sh | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index 46c191ac89cf4..c3bee5e131d30 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -92,16 +92,6 @@ function run_all_test_types_in_parallel() { test_types_to_run="${test_types_to_run//Integration/}" sequential_tests+=("Integration") fi - if [[ ${test_types_to_run} == *"Core"* ]]; then - echo "${COLOR_YELLOW}Remove Core from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" - test_types_to_run="${test_types_to_run//Core/}" - sequential_tests+=("Core") - fi - if [[ ${test_types_to_run} == *"Other"* ]]; then - echo "${COLOR_YELLOW}Remove Other from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" - test_types_to_run="${test_types_to_run//Other/}" - sequential_tests+=("Other") - fi if [[ ${BACKEND} == "mssql" || ${BACKEND} == "mysql" ]]; then # For mssql/mysql - they take far more memory than postgres (or sqlite) - we skip the Provider # tests altogether as they take too much memory even if run sequentially. 
From 3a878d505f19d28b1be42cb6c5ab758c7bcd0012 Mon Sep 17 00:00:00 2001 From: Niko Date: Thu, 25 Nov 2021 01:27:10 -0800 Subject: [PATCH 070/250] Fix PATH export in breeze tmux sessions (#19818) - The PATH which is exported in the CI Dockerfile was not making it's way into the tmux session, as it was being overwritten by /etc/profile (cherry picked from commit 1cf90230349a8dcc930cdc1711b2b390e3f7c047) --- scripts/in_container/bin/run_tmux | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/in_container/bin/run_tmux b/scripts/in_container/bin/run_tmux index 4fa757b8e21b4..723ef278d632c 100755 --- a/scripts/in_container/bin/run_tmux +++ b/scripts/in_container/bin/run_tmux @@ -33,6 +33,11 @@ fi mkdir -p ~/.tmux/tmp chmod 777 -R ~/.tmux/tmp +# Creating a new tmux session (below) will start a new login shell and /etc/profile +# will overwrite the custom Dockerfile PATH variable. Adding the custom PATH export +# to home directory profile here will take precedence. +echo "export PATH=$PATH" >> ~/.profile + # Set Session Name export TMUX_SESSION="Airflow" From 66ae46c43e0ccd0ef6e413e8c63084cd130b4ba9 Mon Sep 17 00:00:00 2001 From: Malthe Borch Date: Thu, 25 Nov 2021 10:34:53 +0100 Subject: [PATCH 071/250] Use hyphen instead of underscore to match other artifacts (#19820) (cherry picked from commit 4bf85cf7a1a5c05d8cadc031b2d8d9a7e04895ea) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db07fe1916208..d6985d35a6230 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -924,7 +924,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" uses: actions/upload-artifact@v2 if: always() with: - name: quarantined_tests + name: quarantined-tests path: "files/test_result-*.xml" retention-days: 7 - name: "Upload airflow logs" From 7e019778defa7d59055f2eff2e627c0cd5c74e3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Bregu=C5=82a?= Date: Fri, 26 Nov 2021 18:29:36 +0100 Subject: [PATCH 072/250] Rewrite image building tests to Python (#19819) (cherry picked from commit 20dc5b9aef66a9f2bed4e3ba652b385fb94b7e24) --- .github/workflows/ci.yml | 15 ++++ docker_tests/command_utils.py | 36 ++++++++++ docker_tests/constants.py | 20 ++++++ docker_tests/docker_tests_utils.py | 20 +----- .../{ci_image.py => test_ci_image.py} | 2 +- .../test_examples_of_prod_image_building.py | 64 +++++++++++++++++ .../{prod_image.py => test_prod_image.py} | 13 ++-- docs/docker-stack/build.rst | 4 +- .../customizing/add-build-essential-custom.sh | 2 +- .../customizing/custom-sources.sh | 2 +- ...ithub-v2-1-test.sh => github-v2-2-test.sh} | 8 +-- .../customizing/pypi-dev-runtime-deps.sh | 2 +- .../customizing/pypi-extras-and-deps.sh | 2 +- .../customizing/pypi-selected-version.sh | 2 +- .../restricted/restricted_environments.sh | 12 ++-- scripts/ci/images/ci_run_docker_tests.py | 2 +- ...ci_test_examples_of_prod_image_building.sh | 70 +------------------ scripts/ci/libraries/_verify_image.sh | 4 +- .../pre_commit/pre_commit_update_versions.py | 25 +++---- 19 files changed, 177 insertions(+), 128 deletions(-) create mode 100644 docker_tests/command_utils.py create mode 100644 docker_tests/constants.py rename docker_tests/{ci_image.py => test_ci_image.py} (96%) create mode 100644 docker_tests/test_examples_of_prod_image_building.py rename docker_tests/{prod_image.py => test_prod_image.py} (96%) rename docs/docker-stack/docker-examples/customizing/{github-v2-1-test.sh => github-v2-2-test.sh} (85%) diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d6985d35a6230..bb86648aafde2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -297,6 +297,11 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{needs.build-info.outputs.defaultPythonVersion}} + - name: "Cache virtualenv environmnent" + uses: actions/cache@v2 + with: + path: '.build/.docker_venv' + key: ${{ runner.os }}-docker-venv-${{ hashFiles('scripts/ci/images/ci_run_docker_tests.py') }} - name: "Test examples of PROD image building" run: ./scripts/ci/images/ci_test_examples_of_prod_image_building.sh @@ -326,6 +331,11 @@ jobs: run: ./scripts/ci/tools/free_space.sh if: | needs.build-info.outputs.waitForImage == 'true' + - name: "Cache virtualenv environmnent" + uses: actions/cache@v2 + with: + path: '.build/.docker_venv' + key: ${{ runner.os }}-docker-venv-${{ hashFiles('scripts/ci/images/ci_run_docker_tests.py') }} - name: > Wait for CI images ${{ needs.build-info.outputs.pythonVersions }}:${{ env.GITHUB_REGISTRY_PULL_IMAGE_TAG }} @@ -1010,6 +1020,11 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" run: ./scripts/ci/tools/free_space.sh if: | needs.build-info.outputs.waitForImage == 'true' + - name: "Cache virtualenv environmnent" + uses: actions/cache@v2 + with: + path: '.build/.docker_venv' + key: ${{ runner.os }}-docker-venv-${{ hashFiles('scripts/ci/images/ci_run_docker_tests.py') }} - name: > Wait for PROD images ${{ needs.build-info.outputs.pythonVersions }}:${{ env.GITHUB_REGISTRY_PULL_IMAGE_TAG }} diff --git a/docker_tests/command_utils.py b/docker_tests/command_utils.py new file mode 100644 index 0000000000000..d1e793a2c0ebb --- /dev/null +++ b/docker_tests/command_utils.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import shlex +import subprocess +from typing import List + + +def run_command(cmd: List[str], *, print_output_on_error: bool = True, return_output: bool = False, **kwargs): + print(f"$ {' '.join(shlex.quote(c) for c in cmd)}") + try: + if return_output: + return subprocess.check_output(cmd, **kwargs).decode() + else: + subprocess.run(cmd, check=True, **kwargs) + except subprocess.CalledProcessError as ex: + if print_output_on_error: + print("========================= OUTPUT start ============================") + print(ex.stderr) + print(ex.stdout) + print("========================= OUTPUT end ============================") + raise diff --git a/docker_tests/constants.py b/docker_tests/constants.py new file mode 100644 index 0000000000000..054825b73ba69 --- /dev/null +++ b/docker_tests/constants.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pathlib import Path + +SOURCE_ROOT = Path(__file__).resolve().parents[1] diff --git a/docker_tests/docker_tests_utils.py b/docker_tests/docker_tests_utils.py index b0155b0fb7964..96afced33a061 100644 --- a/docker_tests/docker_tests_utils.py +++ b/docker_tests/docker_tests_utils.py @@ -16,31 +16,15 @@ # under the License. import os -import shlex -import subprocess -from pathlib import Path -from typing import List + +from docker_tests.command_utils import run_command docker_image = os.environ.get('DOCKER_IMAGE') -SOURCE_ROOT = Path(__file__).resolve().parents[1] if not docker_image: raise Exception("The DOCKER_IMAGE environment variable is required") -def run_command(cmd: List[str], print_output_on_error: bool = True, **kwargs): - print(f"$ {' '.join(shlex.quote(c) for c in cmd)}") - try: - return subprocess.check_output(cmd, **kwargs).decode() - except subprocess.CalledProcessError as ex: - if print_output_on_error: - print("========================= OUTPUT start ============================") - print(ex.stderr) - print(ex.stdout) - print("========================= OUTPUT end ============================") - raise - - def run_bash_in_docker(bash_script, **kwargs): docker_command = [ "docker", diff --git a/docker_tests/ci_image.py b/docker_tests/test_ci_image.py similarity index 96% rename from docker_tests/ci_image.py rename to docker_tests/test_ci_image.py index 25a9bfb73ac5d..00823328b8b6c 100644 --- a/docker_tests/ci_image.py +++ b/docker_tests/test_ci_image.py @@ -17,11 +17,11 @@ import subprocess +from docker_tests.command_utils import run_command from docker_tests.docker_tests_utils import ( display_dependency_conflict_message, docker_image, run_bash_in_docker, - run_command, ) diff --git a/docker_tests/test_examples_of_prod_image_building.py b/docker_tests/test_examples_of_prod_image_building.py new file mode 100644 index 0000000000000..1fd2676bb5008 --- /dev/null +++ b/docker_tests/test_examples_of_prod_image_building.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import glob +import os +import re +from functools import lru_cache +from pathlib import Path + +import pytest +import requests + +from docker_tests.command_utils import run_command +from docker_tests.constants import SOURCE_ROOT + +DOCKER_EXAMPLES_DIR = SOURCE_ROOT / "docs" / "docker-stack" / "docker-examples" + + +@lru_cache(maxsize=None) +def get_latest_airflow_version_released(): + response = requests.get('https://pypi.org/pypi/apache-airflow/json') + response.raise_for_status() + return response.json()['info']['version'] + + +@pytest.mark.skipif( + os.environ.get('CI') == "true", + reason="Skipping the script builds on CI! They take very long time to build.", +) +@pytest.mark.parametrize("script_file", glob.glob(f"{DOCKER_EXAMPLES_DIR}/**/*.sh", recursive=True)) +def test_shell_script_example(script_file): + run_command(["bash", script_file]) + + +@pytest.mark.parametrize("dockerfile", glob.glob(f"{DOCKER_EXAMPLES_DIR}/**/Dockerfile", recursive=True)) +def test_dockerfile_example(dockerfile): + rel_dockerfile_path = Path(dockerfile).relative_to(DOCKER_EXAMPLES_DIR) + image_name = str(rel_dockerfile_path).lower().replace("/", "-") + content = Path(dockerfile).read_text() + new_content = re.sub( + r'FROM apache/airflow:.*', fr'FROM apache/airflow:{get_latest_airflow_version_released()}', content + ) + try: + run_command( + ["docker", "build", ".", "--tag", image_name, '-f', '-'], + cwd=str(Path(dockerfile).parent), + input=new_content.encode(), + ) + finally: + run_command(["docker", "rmi", "--force", image_name]) diff --git a/docker_tests/prod_image.py b/docker_tests/test_prod_image.py similarity index 96% rename from docker_tests/prod_image.py rename to docker_tests/test_prod_image.py index cac517c80ed99..c8a5729b5e1b5 100644 --- a/docker_tests/prod_image.py +++ b/docker_tests/test_prod_image.py @@ -22,12 +22,12 @@ import pytest +from docker_tests.command_utils import run_command +from docker_tests.constants import SOURCE_ROOT from docker_tests.docker_tests_utils import ( - SOURCE_ROOT, display_dependency_conflict_message, docker_image, run_bash_in_docker, - run_command, run_python_in_docker, ) @@ -50,21 +50,24 @@ def test_airflow_command(self): def test_airflow_version(self): """Checking 'airflow version' command It should return zero exit code.""" output = run_command( - ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "airflow", "version"] + ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "airflow", "version"], + return_output=True, ) assert "2." in output def test_python_version(self): """Checking 'python --version' command It should return zero exit code.""" output = run_command( - ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "python", "--version"] + ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "python", "--version"], + return_output=True, ) assert "Python 3." 
in output def test_bash_version(self): """Checking 'bash --version' command It should return zero exit code.""" output = run_command( - ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "bash", "--version"] + ["docker", "run", "--rm", "-e", "COLUMNS=180", docker_image, "bash", "--version"], + return_output=True, ) assert "GNU bash," in output diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst index 335b9adc0581f..a09f879835977 100644 --- a/docs/docker-stack/build.rst +++ b/docs/docker-stack/build.rst @@ -491,11 +491,11 @@ constraints are taken from latest version of the constraints-main branch in GitH The following example builds the production image with default extras from the latest ``v2-*-test`` version and constraints are taken from the latest version of -the ``constraints-2-*`` branch in GitHub (for example ``v2-1-test`` branch matches ``constraints-2-1``). +the ``constraints-2-*`` branch in GitHub (for example ``v2-2-test`` branch matches ``constraints-2-2``). Note that this command might fail occasionally as only the "released version" constraints when building a version and "main" constraints when building main are guaranteed to work. -.. exampleinclude:: docker-examples/customizing/github-v2-1-test.sh +.. exampleinclude:: docker-examples/customizing/github-v2-2-test.sh :language: bash :start-after: [START build] :end-before: [END build] diff --git a/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh b/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh index 7cf1dae5f42ec..e6d8ddc12b555 100755 --- a/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh +++ b/docs/docker-stack/docker-examples/customizing/add-build-essential-custom.sh @@ -22,7 +22,7 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] -export AIRFLOW_VERSION=2.2.1 +export AIRFLOW_VERSION=2.2.2 docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.6-slim-buster" \ diff --git a/docs/docker-stack/docker-examples/customizing/custom-sources.sh b/docs/docker-stack/docker-examples/customizing/custom-sources.sh index 2aecf4ec39bba..3fcb72c595aec 100755 --- a/docs/docker-stack/docker-examples/customizing/custom-sources.sh +++ b/docs/docker-stack/docker-examples/customizing/custom-sources.sh @@ -22,7 +22,7 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] -export AIRFLOW_VERSION=2.2.1 +export AIRFLOW_VERSION=2.2.2 docker build . -f Dockerfile \ --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ diff --git a/docs/docker-stack/docker-examples/customizing/github-v2-1-test.sh b/docs/docker-stack/docker-examples/customizing/github-v2-2-test.sh similarity index 85% rename from docs/docker-stack/docker-examples/customizing/github-v2-1-test.sh rename to docs/docker-stack/docker-examples/customizing/github-v2-2-test.sh index f8e1fe5edce2f..cc40e12f64922 100755 --- a/docs/docker-stack/docker-examples/customizing/github-v2-1-test.sh +++ b/docs/docker-stack/docker-examples/customizing/github-v2-2-test.sh @@ -24,8 +24,8 @@ cd "${AIRFLOW_SOURCES}" # [START build] docker build . 
\ --build-arg PYTHON_BASE_IMAGE="python:3.8-slim-buster" \ - --build-arg AIRFLOW_INSTALLATION_METHOD="https://github.com/apache/airflow/archive/v2-1-test.tar.gz#egg=apache-airflow" \ - --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-1" \ - --tag "my-github-v2-1:0.0.1" + --build-arg AIRFLOW_INSTALLATION_METHOD="https://github.com/apache/airflow/archive/v2-2-test.tar.gz#egg=apache-airflow" \ + --build-arg AIRFLOW_CONSTRAINTS_REFERENCE="constraints-2-2" \ + --tag "my-github-v2-2:0.0.1" # [END build] -docker rmi --force "my-github-v2-1:0.0.1" +docker rmi --force "my-github-v2-2:0.0.1" diff --git a/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh b/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh index 9ba93c1de7ca8..c0255e670c43b 100755 --- a/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh +++ b/docs/docker-stack/docker-examples/customizing/pypi-dev-runtime-deps.sh @@ -22,7 +22,7 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] -export AIRFLOW_VERSION=2.2.1 +export AIRFLOW_VERSION=2.2.2 docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.6-slim-buster" \ diff --git a/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh b/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh index 804eacf0ae3e6..ffb0a493eb398 100755 --- a/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh +++ b/docs/docker-stack/docker-examples/customizing/pypi-extras-and-deps.sh @@ -22,7 +22,7 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] -export AIRFLOW_VERSION=2.2.1 +export AIRFLOW_VERSION=2.2.2 docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.8-slim-buster" \ diff --git a/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh b/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh index 77045a40016c7..a1e2ddafa8852 100755 --- a/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh +++ b/docs/docker-stack/docker-examples/customizing/pypi-selected-version.sh @@ -22,7 +22,7 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START build] -export AIRFLOW_VERSION=2.2.1 +export AIRFLOW_VERSION=2.2.2 docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ diff --git a/docs/docker-stack/docker-examples/restricted/restricted_environments.sh b/docs/docker-stack/docker-examples/restricted/restricted_environments.sh index 3a87f43b20b73..f4a1930f4e52f 100755 --- a/docs/docker-stack/docker-examples/restricted/restricted_environments.sh +++ b/docs/docker-stack/docker-examples/restricted/restricted_environments.sh @@ -22,28 +22,26 @@ AIRFLOW_SOURCES="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../../" && pwd)" cd "${AIRFLOW_SOURCES}" # [START download] +export AIRFLOW_VERSION="2.2.3" rm docker-context-files/*.whl docker-context-files/*.tar.gz docker-context-files/*.txt || true curl -Lo "docker-context-files/constraints-3.7.txt" \ - https://raw.githubusercontent.com/apache/airflow/constraints-2.2.4/constraints-3.7.txt - -# For Airflow pre 2.1 you need to use PIP 20.2.4 to install/download Airflow packages. 
-pip install pip==20.2.4 + "https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-3.7.txt" pip download --dest docker-context-files \ --constraint docker-context-files/constraints-3.7.txt \ - "apache-airflow[async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,postgres,redis,slack,ssh,statsd,virtualenv]==2.0.2" + "apache-airflow[async,aws,azure,celery,dask,elasticsearch,gcp,kubernetes,postgres,redis,slack,ssh,statsd,virtualenv]==${AIRFLOW_VERSION}" # [END download] # [START build] docker build . \ --build-arg PYTHON_BASE_IMAGE="python:3.7-slim-buster" \ --build-arg AIRFLOW_INSTALLATION_METHOD="apache-airflow" \ - --build-arg AIRFLOW_VERSION="2.2.4" \ + --build-arg AIRFLOW_VERSION="${AIRFLOW_VERSION}" \ --build-arg INSTALL_MYSQL_CLIENT="false" \ --build-arg INSTALL_MSSQL_CLIENT="false" \ --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES="false" \ --build-arg INSTALL_FROM_DOCKER_CONTEXT_FILES="true" \ --build-arg AIRFLOW_CONSTRAINTS_LOCATION="/docker-context-files/constraints-3.7.txt" \ - --tag my-restricted-environment:0.0.1 + --tag airflow-my-restricted-environment:0.0.1 # [END build] diff --git a/scripts/ci/images/ci_run_docker_tests.py b/scripts/ci/images/ci_run_docker_tests.py index af81cb4964ed3..90e10d94146c1 100755 --- a/scripts/ci/images/ci_run_docker_tests.py +++ b/scripts/ci/images/ci_run_docker_tests.py @@ -62,7 +62,7 @@ def create_virtualenv(): run_verbose([sys.executable, "-m", "venv", str(virtualenv_path)]) python_bin = virtualenv_path / "bin" / "python" - run_verbose([str(python_bin), "-m", "pip", "install", "pytest", "pytest-xdist"]) + run_verbose([str(python_bin), "-m", "pip", "install", "pytest", "pytest-xdist", "requests"]) return python_bin diff --git a/scripts/ci/images/ci_test_examples_of_prod_image_building.sh b/scripts/ci/images/ci_test_examples_of_prod_image_building.sh index be37e8291c4fa..f0653f26961b8 100755 --- a/scripts/ci/images/ci_test_examples_of_prod_image_building.sh +++ b/scripts/ci/images/ci_test_examples_of_prod_image_building.sh @@ -18,72 +18,4 @@ # shellcheck source=scripts/ci/libraries/_script_init.sh . "$(dirname "${BASH_SOURCE[0]}")/../libraries/_script_init.sh" -SEMAPHORE_NAME="image_tests" -export SEMAPHORE_NAME - -DOCKER_EXAMPLES_DIR=${AIRFLOW_SOURCES}/docs/docker-stack/docker-examples/ -export DOCKER_EXAMPLES_DIR - -# Launches parallel building of images. Redirects output to log set the right directories -# $1 - name of the job -# $2 - bash file to execute in parallel -function run_image_test_job() { - local file=$1 - - local job_name=$2 - mkdir -p "${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${job_name}" - export JOB_LOG="${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${job_name}/stdout" - export PARALLEL_JOB_STATUS="${PARALLEL_MONITORED_DIR}/${SEMAPHORE_NAME}/${job_name}/status" - parallel --ungroup --bg --semaphore --semaphorename "${SEMAPHORE_NAME}" \ - --jobs "${MAX_PARALLEL_IMAGE_JOBS}" \ - "$(dirname "${BASH_SOURCE[0]}")/ci_run_prod_image_test.sh" "${job_name}" "${file}" >"${JOB_LOG}" 2>&1 -} - - -function test_images() { - if [[ ${CI=} == "true" ]]; then - echo - echo "Skipping the script builds on CI! " - echo "They take very long time to build." 
- echo - else - local scripts_to_test - scripts_to_test=$(find "${DOCKER_EXAMPLES_DIR}" -type f -name '*.sh' ) - for file in ${scripts_to_test} - do - local job_name - job_name=$(basename "${file}") - run_image_test_job "${file}" "${job_name}" - done - fi - local dockerfiles_to_test - dockerfiles_to_test=$(find "${DOCKER_EXAMPLES_DIR}" -type f -name 'Dockerfile' ) - for file in ${dockerfiles_to_test} - do - local job_name - job_name="$(basename "$(dirname "${file}")")" - run_image_test_job "${file}" "${job_name}" - done - -} - -cd "${AIRFLOW_SOURCES}" || exit 1 - -# Building max for images in parallel helps to conserve docker image space -MAX_PARALLEL_IMAGE_JOBS=4 -export MAX_PARALLEL_IMAGE_JOBS - -parallel::make_sure_gnu_parallel_is_installed -parallel::kill_stale_semaphore_locks -parallel::initialize_monitoring - -start_end::group_start "Testing image building" - -parallel::monitor_progress - -test_images - -parallel --semaphore --semaphorename "${SEMAPHORE_NAME}" --wait -start_end::group_end - -parallel::print_job_summary_and_return_status_code +python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/test_examples_of_prod_image_building.py" diff --git a/scripts/ci/libraries/_verify_image.sh b/scripts/ci/libraries/_verify_image.sh index dfe507dde8eed..eb038fc35821f 100644 --- a/scripts/ci/libraries/_verify_image.sh +++ b/scripts/ci/libraries/_verify_image.sh @@ -19,11 +19,11 @@ function verify_image::verify_prod_image { DOCKER_IMAGE="${1}" export DOCKER_IMAGE - python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/prod_image.py" + python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/test_prod_image.py" } function verify_image::verify_ci_image { DOCKER_IMAGE="${1}" export DOCKER_IMAGE - python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/ci_image.py" + python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/test_ci_image.py" } diff --git a/scripts/ci/pre_commit/pre_commit_update_versions.py b/scripts/ci/pre_commit/pre_commit_update_versions.py index ee12bb647510a..2af06980755fc 100755 --- a/scripts/ci/pre_commit/pre_commit_update_versions.py +++ b/scripts/ci/pre_commit/pre_commit_update_versions.py @@ -29,31 +29,28 @@ from setup import version # isort:skip -def update_version(pattern, v: str, file_path: str): +def update_version(pattern: re.Pattern, v: str, file_path: str): print(f"Replacing {pattern} to {version} in {file_path}") with open(file_path, "r+") as f: - file_contents = f.read() - lines = file_contents.splitlines(keepends=True) - for i in range(0, len(lines)): - lines[i] = re.sub(pattern, fr'\g<1>{v}\g<2>', lines[i]) - file_contents = "".join(lines) + file_content = f.read() + if not pattern.search(file_content): + raise Exception(f"Pattern {pattern!r} doesn't found in {file_path!r} file") + new_content = pattern.sub(fr'\g<1>{v}\g<2>', file_content) + if file_content == new_content: + return f.seek(0) f.truncate() - f.write(file_contents) + f.write(new_content) REPLACEMENTS = { - r'(FROM apache/airflow:).*($)': "docs/docker-stack/docker-examples/extending/*/Dockerfile", - r'(apache/airflow:)[^-]*(\-)': "docs/docker-stack/entrypoint.rst", - r'(/constraints-)[^-]*(/constraints)': "docs/docker-stack/docker-examples/" - "restricted/restricted_environments.sh", - r'(AIRFLOW_VERSION=")[^"]*(" \\)': "docs/docker-stack/docker-examples/" - "restricted/restricted_environments.sh", + r'^(FROM 
apache\/airflow:).*($)': "docs/docker-stack/docker-examples/extending/*/Dockerfile", + r'(apache\/airflow:)[^-]*(\-)': "docs/docker-stack/entrypoint.rst", } if __name__ == '__main__': for regexp, p in REPLACEMENTS.items(): - text_pattern = re.compile(regexp) + text_pattern = re.compile(regexp, flags=re.MULTILINE) files = glob.glob(join(AIRFLOW_SOURCES_DIR, p), recursive=True) if not files: print(f"ERROR! No files matched on {p}") From ab752e75534e9f96856ceb569df534b2162abe63 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 29 Nov 2021 12:13:40 +0100 Subject: [PATCH 073/250] Fix race condition when running mssql tests (#19863) There is a race condition where initialization of the Airflow DB for mssql might be executed when the server is started but it is not yet initialized with a model db needed to create the airflow db. In such a case, mssql database initialization will fail as it will not be able to obtain a lock on the `model` database. The error in the mssqlsetup container will be similar to: ``` Msg 1807, Level 16, State 3, Server d2888dd467fe, Line 20 Could not obtain exclusive lock on database 'model'. Retry the operation later. Msg 1802, Level 16, State 4, Server d2888dd467fe, Line 20 CREATE DATABASE failed. Some file names listed could not be created. Check related errors. Msg 5011, Level 14, State 5, Server d2888dd467fe, Line 21 User does not have permission to alter database 'airflow', the database does not exist, or the database is not in a state that allows access checks. Msg 5069, Level 16, State 1, Server d2888dd467fe, Line 21 ALTER DATABASE statement failed. ``` This PR alters the setup job to try to create the airflow db several times and wait a second before every retry. (cherry picked from commit f1c333f3b3b2ce93be9ad3710c81dcb6cd320913) --- scripts/ci/docker-compose/backend-mssql.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/ci/docker-compose/backend-mssql.yml b/scripts/ci/docker-compose/backend-mssql.yml index 69b4aa5fb6e5a..7fc5540e467a0 100644 --- a/scripts/ci/docker-compose/backend-mssql.yml +++ b/scripts/ci/docker-compose/backend-mssql.yml @@ -49,6 +49,12 @@ services: entrypoint: - bash - -c - - opt/mssql-tools/bin/sqlcmd -S mssql -U sa -P Airflow123 -i /mssql_create_airflow_db.sql || true + - > + for i in {1..10}; + do + /opt/mssql-tools/bin/sqlcmd -S mssql -U sa -P Airflow123 -i /mssql_create_airflow_db.sql && + exit 0; + sleep 1; + done volumes: - ./mssql_create_airflow_db.sql:/mssql_create_airflow_db.sql:ro From 601d00c61dab6b12b925d76602347fd7ae2d51b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Bregu=C5=82a?= Date: Tue, 30 Nov 2021 07:56:40 +0100 Subject: [PATCH 074/250] Add tests for docker-compose quick start (#19874) (cherry picked from commit 0df50f42dde4bd9b4c99cb6646416dde6fd4961e) --- .github/workflows/ci.yml | 28 +++ .../test_docker_compose_quick_start.py | 161 ++++++++++++++++++ docker_tests/test_prod_image.py | 4 +- .../ci_run_docker_compose_quick_start_test.sh | 28 +++ scripts/ci/images/ci_run_docker_tests.py | 4 +- 5 files changed, 222 insertions(+), 3 deletions(-) create mode 100644 docker_tests/test_docker_compose_quick_start.py create mode 100755 scripts/ci/images/ci_run_docker_compose_quick_start_test.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb86648aafde2..0015a9062ce3c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -305,6 +305,34 @@ jobs: - name: "Test examples of PROD image building" run: ./scripts/ci/images/ci_test_examples_of_prod_image_building.sh + 
test-docker-compose-quick-start: + timeout-minutes: 60 + name: "Test docker-compose quick start" + runs-on: ${{ fromJson(needs.build-info.outputs.runsOn) }} + needs: [build-info, prod-images] + if: needs.build-info.outputs.image-build == 'true' + steps: + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + fetch-depth: 2 + persist-credentials: false + - name: "Free space" + run: ./scripts/ci/tools/free_space.sh + if: | + needs.build-info.outputs.waitForImage == 'true' + - name: "Setup python" + uses: actions/setup-python@v2 + with: + python-version: ${{needs.build-info.outputs.defaultPythonVersion}} + - name: "Cache virtualenv environmnent" + uses: actions/cache@v2 + with: + path: '.build/.docker_venv' + key: ${{ runner.os }}-docker-venv-${{ hashFiles('scripts/ci/images/ci_run_docker_tests.py') }} + - name: "Test docker-compose quick start" + run: ./scripts/ci/images/ci_run_docker_compose_quick_start_test.sh + ci-images: timeout-minutes: 120 name: "Wait for CI images" diff --git a/docker_tests/test_docker_compose_quick_start.py b/docker_tests/test_docker_compose_quick_start.py new file mode 100644 index 0000000000000..bf6382778390a --- /dev/null +++ b/docker_tests/test_docker_compose_quick_start.py @@ -0,0 +1,161 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import contextlib +import os +import subprocess +import tempfile +from pathlib import Path +from pprint import pprint +from shutil import copyfile +from time import monotonic, sleep +from typing import Dict +from unittest import mock + +import requests + +from docker_tests.command_utils import run_command +from docker_tests.constants import SOURCE_ROOT +from docker_tests.docker_tests_utils import docker_image + +AIRFLOW_WWW_USER_USERNAME = os.environ.get("_AIRFLOW_WWW_USER_USERNAME", "airflow") +AIRFLOW_WWW_USER_PASSWORD = os.environ.get("_AIRFLOW_WWW_USER_PASSWORD", "airflow") +DAG_ID = "example_bash_operator" +DAG_RUN_ID = "test_dag_run_id" + + +def api_request(method: str, path: str, base_url: str = "http://localhost:8080/api/v1", **kwargs) -> Dict: + response = requests.request( + method=method, + url=f"{base_url}/{path}", + auth=(AIRFLOW_WWW_USER_USERNAME, AIRFLOW_WWW_USER_PASSWORD), + headers={"Content-Type": "application/json"}, + **kwargs, + ) + response.raise_for_status() + return response.json() + + +@contextlib.contextmanager +def tmp_chdir(path): + current_cwd = os.getcwd() + try: + os.chdir(path) + yield current_cwd + finally: + os.chdir(current_cwd) + + +def wait_for_container(container_id: str, timeout: int = 300): + container_name = ( + subprocess.check_output(["docker", "inspect", container_id, "--format", '{{ .Name }}']) + .decode() + .strip() + ) + print(f"Waiting for container: {container_name} [{container_id}]") + waiting_done = False + start_time = monotonic() + while not waiting_done: + container_state = ( + subprocess.check_output(["docker", "inspect", container_id, "--format", '{{ .State.Status }}']) + .decode() + .strip() + ) + if container_state in ("running", 'restarting'): + health_status = ( + subprocess.check_output( + [ + "docker", + "inspect", + container_id, + "--format", + "{{ if .State.Health }}{{ .State.Health.Status }}{{ else }}no-check{{ end }}", + ] + ) + .decode() + .strip() + ) + print(f"{container_name}: container_state={container_state}, health_status={health_status}") + + if health_status == "healthy" or health_status == "no-check": + waiting_done = True + else: + print(f"{container_name}: container_state={container_state}") + waiting_done = True + if timeout != 0 and monotonic() - start_time > timeout: + raise Exception(f"Timeout. 
The operation takes longer than the maximum waiting time ({timeout}s)") + sleep(1) + + +def wait_for_terminal_dag_state(dag_id, dag_run_id): + # Wait 30 seconds + for _ in range(30): + dag_state = api_request("GET", f"dags/{dag_id}/dagRuns/{dag_run_id}").get("state") + print(f"Waiting for DAG Run: dag_state={dag_state}") + sleep(1) + if dag_state in ("success", "failed"): + break + + +def test_trigger_dag_and_wait_for_result(): + compose_file_path = SOURCE_ROOT / "docs" / "apache-airflow" / "start" / "docker-compose.yaml" + + with tempfile.TemporaryDirectory() as tmp_dir, tmp_chdir(tmp_dir), mock.patch.dict( + 'os.environ', AIRFLOW_IMAGE_NAME=docker_image + ): + copyfile(str(compose_file_path), f"{tmp_dir}/docker-compose.yaml") + os.mkdir(f"{tmp_dir}/dags") + os.mkdir(f"{tmp_dir}/logs") + os.mkdir(f"{tmp_dir}/plugins") + (Path(tmp_dir) / ".env").write_text(f"AIRFLOW_UID={subprocess.check_output(['id', '-u']).decode()}\n") + print(".emv=", (Path(tmp_dir) / ".env").read_text()) + copyfile( + str(SOURCE_ROOT / "airflow" / "example_dags" / "example_bash_operator.py"), + f"{tmp_dir}/dags/example_bash_operator.py", + ) + + run_command(["docker-compose", "config"]) + run_command(["docker-compose", "down", "--volumes", "--remove-orphans"]) + try: + run_command(["docker-compose", "up", "-d"]) + # The --wait condition was released in docker-compose v2.1.1, but we want to support + # docker-compose v1 yet. + # See: + # https://github.com/docker/compose/releases/tag/v2.1.1 + # https://github.com/docker/compose/pull/8777 + for container_id in ( + subprocess.check_output(["docker-compose", 'ps', '-q']).decode().strip().splitlines() + ): + wait_for_container(container_id) + api_request("PATCH", path=f"dags/{DAG_ID}", json={"is_paused": False}) + api_request("POST", path=f"dags/{DAG_ID}/dagRuns", json={"dag_run_id": DAG_RUN_ID}) + try: + wait_for_terminal_dag_state(dag_id=DAG_ID, dag_run_id=DAG_RUN_ID) + dag_state = api_request("GET", f"dags/{DAG_ID}/dagRuns/{DAG_RUN_ID}").get("state") + assert dag_state == "success" + except Exception: + print(f"HTTP: GET dags/{DAG_ID}/dagRuns/{DAG_RUN_ID}") + pprint(api_request("GET", f"dags/{DAG_ID}/dagRuns/{DAG_RUN_ID}")) + print(f"HTTP: GET dags/{DAG_ID}/dagRuns/{DAG_RUN_ID}/taskInstances") + pprint(api_request("GET", f"dags/{DAG_ID}/dagRuns/{DAG_RUN_ID}/taskInstances")) + raise + except Exception: + run_command(["docker", "ps"]) + run_command(["docker-compose", "logs"]) + raise + finally: + run_command(["docker-compose", "down", "--volumes"]) diff --git a/docker_tests/test_prod_image.py b/docker_tests/test_prod_image.py index c8a5729b5e1b5..d09673961656d 100644 --- a/docker_tests/test_prod_image.py +++ b/docker_tests/test_prod_image.py @@ -79,7 +79,9 @@ def test_required_providers_are_installed(self): packages_to_install = {f"apache-airflow-providers-{d.replace('.', '-')}" for d in lines} assert len(packages_to_install) != 0 - output = run_bash_in_docker("airflow providers list --output json", stderr=subprocess.DEVNULL) + output = run_bash_in_docker( + "airflow providers list --output json", stderr=subprocess.DEVNULL, return_output=True + ) providers = json.loads(output) packages_installed = {d['package_name'] for d in providers} assert len(packages_installed) != 0 diff --git a/scripts/ci/images/ci_run_docker_compose_quick_start_test.sh b/scripts/ci/images/ci_run_docker_compose_quick_start_test.sh new file mode 100755 index 0000000000000..1b6b90d62a95d --- /dev/null +++ b/scripts/ci/images/ci_run_docker_compose_quick_start_test.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env 
bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# shellcheck source=scripts/ci/libraries/_script_init.sh +. "$(dirname "${BASH_SOURCE[0]}")/../libraries/_script_init.sh" + + +DOCKER_IMAGE="${AIRFLOW_PROD_IMAGE}:${GITHUB_REGISTRY_PULL_IMAGE_TAG}" +export DOCKER_IMAGE + +build_images::prepare_prod_build +push_pull_remove_images::wait_for_image "${DOCKER_IMAGE}" + +python3 "${SCRIPTS_CI_DIR}/images/ci_run_docker_tests.py" "${AIRFLOW_SOURCES}/docker_tests/test_docker_compose_quick_start.py" diff --git a/scripts/ci/images/ci_run_docker_tests.py b/scripts/ci/images/ci_run_docker_tests.py index 90e10d94146c1..c9c8a0571396c 100755 --- a/scripts/ci/images/ci_run_docker_tests.py +++ b/scripts/ci/images/ci_run_docker_tests.py @@ -47,9 +47,9 @@ def get_parser(): return parser -def run_verbose(cmd: List[str], **kwargs): +def run_verbose(cmd: List[str], *, check=True, **kwargs): print(f"{CBLUE}$ {' '.join(shlex.quote(c) for c in cmd)}{CEND}") - subprocess.run(cmd, **kwargs) + subprocess.run(cmd, check=check, **kwargs) def create_virtualenv(): From f9ff33f1a68ad7b956079e18b4d90eaa5f4541e0 Mon Sep 17 00:00:00 2001 From: "Ryan, Siu Long Wa" Date: Tue, 12 Oct 2021 19:54:43 +0800 Subject: [PATCH 075/250] Remove the docker timeout workaround (#18872) (cherry picked from commit 3154935138748a8ac89aa4c8fde848e31610941b) --- .../docker/operators/docker_swarm.py | 7 --- setup.py | 2 +- .../docker/operators/test_docker_swarm.py | 48 ------------------- 3 files changed, 1 insertion(+), 56 deletions(-) diff --git a/airflow/providers/docker/operators/docker_swarm.py b/airflow/providers/docker/operators/docker_swarm.py index 2d5373c840f17..a1f3f0b2bcf2d 100644 --- a/airflow/providers/docker/operators/docker_swarm.py +++ b/airflow/providers/docker/operators/docker_swarm.py @@ -17,7 +17,6 @@ """Run ephemeral Docker Swarm services""" from typing import List, Optional, Union -import requests from docker import types from airflow.exceptions import AirflowException @@ -204,12 +203,6 @@ def _stream_logs_to_output(self) -> None: while True: try: log = next(logs) - # TODO: Remove this clause once https://github.com/docker/docker-py/issues/931 is fixed - except requests.exceptions.ConnectionError: - # If the service log stream stopped sending messages, check if it the service has - # terminated. - if self._has_service_terminated(): - break except StopIteration: # If the service log stream terminated, stop fetching logs further. 
break diff --git a/setup.py b/setup.py index d1ac695d335f8..8b9237ea9f341 100644 --- a/setup.py +++ b/setup.py @@ -258,7 +258,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'sphinxcontrib-spelling==7.2.1', ] docker = [ - 'docker', + 'docker>=5.0.3', ] drill = ['sqlalchemy-drill>=1.1.0', 'sqlparse>=0.4.1'] druid = [ diff --git a/tests/providers/docker/operators/test_docker_swarm.py b/tests/providers/docker/operators/test_docker_swarm.py index 8523644888de4..09207b425dbb8 100644 --- a/tests/providers/docker/operators/test_docker_swarm.py +++ b/tests/providers/docker/operators/test_docker_swarm.py @@ -20,7 +20,6 @@ from unittest import mock import pytest -import requests from docker import APIClient, types from parameterized import parameterized @@ -184,53 +183,6 @@ def test_non_complete_service_raises_error(self, status, types_mock, client_clas operator.execute(None) assert str(ctx.value) == msg - @mock.patch('airflow.providers.docker.operators.docker.APIClient') - @mock.patch('airflow.providers.docker.operators.docker_swarm.types') - def test_logging_with_requests_timeout(self, types_mock, client_class_mock): - - mock_obj = mock.Mock() - - def _client_tasks_side_effect(): - for _ in range(2): - yield [{'Status': {'State': 'pending'}}] - while True: - yield [{'Status': {'State': 'complete'}}] - - def _client_service_logs_effect(): - yield b'Testing is awesome.' - raise requests.exceptions.ConnectionError('') - - client_mock = mock.Mock(spec=APIClient) - client_mock.create_service.return_value = {'ID': 'some_id'} - client_mock.service_logs.return_value = _client_service_logs_effect() - client_mock.images.return_value = [] - client_mock.pull.return_value = [b'{"status":"pull log"}'] - client_mock.tasks.side_effect = _client_tasks_side_effect() - types_mock.TaskTemplate.return_value = mock_obj - types_mock.ContainerSpec.return_value = mock_obj - types_mock.RestartPolicy.return_value = mock_obj - types_mock.Resources.return_value = mock_obj - - client_class_mock.return_value = client_mock - - operator = DockerSwarmOperator( - api_version='1.19', - command='env', - environment={'UNIT': 'TEST'}, - image='ubuntu:latest', - mem_limit='128m', - user='unittest', - task_id='unittest', - auto_remove=True, - tty=True, - enable_logging=True, - ) - operator.execute(None) - - client_mock.service_logs.assert_called_once_with( - 'some_id', follow=True, stdout=True, stderr=True, is_tty=True - ) - def test_on_kill(self): client_mock = mock.Mock(spec=APIClient) From 1a80dcd0c37cb3ef2ff90f2e56b2b1ca085cecc7 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 13 Oct 2021 13:35:30 -0600 Subject: [PATCH 076/250] Add ``semver`` to devel deps (#18818) We have a new dev script, `validate_version_added_fields_in_config.py`, that uses it. 
(cherry picked from commit 306d0601246b43a4fcf1f21c6e30a917e6d18c28) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 8b9237ea9f341..561c582200831 100644 --- a/setup.py +++ b/setup.py @@ -540,6 +540,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'qds-sdk>=1.9.6', 'pytest-httpx', 'requests_mock', + 'semver', 'wheel', 'yamllint', ] From 60e2b65a8abce533faa5648dfb4925c24b13e520 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sat, 16 Oct 2021 20:21:31 +0200 Subject: [PATCH 077/250] Add pandas requirements for providers that use pandas (#18997) As we removed pandas as a core airflow requirement, the providers that need it should get pandas explicitly as an installation requirement. Fixes: #18901 (cherry picked from commit de98976581294e080967e2aa52043176dffb644f) --- setup.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 561c582200831..b742e2825243c 100644 --- a/setup.py +++ b/setup.py @@ -176,6 +176,8 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version file.write(text) +pandas_requirement = 'pandas>=0.17.1, <2.0' + # 'Start dependencies group' and 'Start dependencies group' are mark for ./scripts/ci/check_order_setup.py # If you change this mark you should also change ./scripts/ci/check_order_setup.py # Start dependencies group @@ -186,6 +188,9 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'boto3>=1.15.0,<1.19.0', 'watchtower~=1.0.6', 'jsonpath_ng>=1.5.3', + 'redshift_connector~=2.0.888', + 'sqlalchemy_redshift~=0.8.6', + pandas_requirement, ] apache_beam = [ 'apache-beam>=2.20.0', ] @@ -269,9 +274,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'elasticsearch-dbapi', 'elasticsearch-dsl>=5.0.0', ] -exasol = [ - 'pyexasol>=0.5.1,<1.0.0', -] +exasol = ['pyexasol>=0.5.1,<1.0.0', pandas_requirement] facebook = [ 'facebook-business>=6.0.2', ] @@ -328,6 +331,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version # pandas-gbq 0.15.0 release broke google provider's bigquery import # _check_google_client_version (airflow/providers/google/cloud/hooks/bigquery.py:49) 'pandas-gbq<0.15.0', + pandas_requirement, ] grpc = [ 'google-auth>=1.0.0, <3.0.0', @@ -344,6 +348,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'hmsclient>=0.1.0', 'pyhive[hive]>=0.6.0;python_version<"3.9"', 'thrift>=0.9.2', + pandas_requirement, ] http = [ # The 2.26.0 release of requests got rid of the chardet LGPL mandatory dependency, allowing us to @@ -353,7 +358,10 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version http_provider = [ 'apache-airflow-providers-http', ] -influxdb = ['pandas>=0.17.1, <2.0', 'influxdb-client>=1.19.0'] +influxdb = [ + 'influxdb-client>=1.19.0', + pandas_requirement, +] jdbc = [ 'jaydebeapi>=1.1.1', ] @@ -400,7 +408,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'pdpyras>=4.1.2,<5', ] pandas = [ - 'pandas>=0.17.1, <2.0', + pandas_requirement, ] papermill = [ 'papermill[all]>=1.2.1', @@ -421,7 +429,10 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version postgres = [ 'psycopg2-binary>=2.7.4', ] -presto = ['presto-python-client>=0.7.0,<0.8'] +presto = [ + 'presto-python-client>=0.7.0,<0.8', + pandas_requirement, +] psrp = [ 'pypsrp~=0.5', ] @@ -434,10 +445,7 @@ def 
write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version redis = [ 'redis~=3.2', ] -salesforce = [ - 'simple-salesforce>=1.0.0', - 'tableauserverclient', -] +salesforce = ['simple-salesforce>=1.0.0', 'tableauserverclient', pandas_requirement] samba = [ 'smbprotocol>=1.5.0', ] @@ -479,7 +487,10 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version telegram = [ 'python-telegram-bot~=13.0', ] -trino = ['trino>=0.301.0'] +trino = [ + 'trino>=0.301.0', + pandas_requirement, +] vertica = [ 'vertica-python>=0.5.1', ] From a5ee60e5f1f3e74b7f809c50b85ecb1b3d44f303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Wyszomirski?= Date: Thu, 21 Oct 2021 10:57:49 +0200 Subject: [PATCH 078/250] Upgrade the Dataproc package to 3.0.0 and migrate from v1beta2 to v1 api (#18879) (cherry picked from commit 4fae04a47119c9f2319ae5e533edcf457e4df003) --- airflow/providers/google/cloud/hooks/dataproc.py | 2 +- airflow/providers/google/cloud/operators/dataproc.py | 8 ++++---- airflow/providers/google/cloud/sensors/dataproc.py | 2 +- setup.py | 2 +- tests/providers/google/cloud/hooks/test_dataproc.py | 2 +- tests/providers/google/cloud/sensors/test_dataproc.py | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/airflow/providers/google/cloud/hooks/dataproc.py b/airflow/providers/google/cloud/hooks/dataproc.py index e353ef7f83ae5..16aed3617e8a9 100644 --- a/airflow/providers/google/cloud/hooks/dataproc.py +++ b/airflow/providers/google/cloud/hooks/dataproc.py @@ -25,7 +25,7 @@ from google.api_core.exceptions import ServerError from google.api_core.retry import Retry -from google.cloud.dataproc_v1beta2 import ( +from google.cloud.dataproc_v1 import ( Cluster, ClusterControllerClient, Job, diff --git a/airflow/providers/google/cloud/operators/dataproc.py b/airflow/providers/google/cloud/operators/dataproc.py index fb4d8ed037e27..537cc69650314 100644 --- a/airflow/providers/google/cloud/operators/dataproc.py +++ b/airflow/providers/google/cloud/operators/dataproc.py @@ -30,7 +30,7 @@ from google.api_core.exceptions import AlreadyExists, NotFound from google.api_core.retry import Retry, exponential_sleep_generator -from google.cloud.dataproc_v1beta2 import Cluster +from google.cloud.dataproc_v1 import Cluster from google.protobuf.duration_pb2 import Duration from google.protobuf.field_mask_pb2 import FieldMask @@ -1909,7 +1909,7 @@ class DataprocSubmitJobOperator(BaseOperator): :type location: str :param job: Required. The job resource. If a dict is provided, it must be of the same form as the protobuf message - :class:`~google.cloud.dataproc_v1beta2.types.Job` + :class:`~google.cloud.dataproc_v1.types.Job` :type job: Dict :param request_id: Optional. A unique id used to identify the request. If the server receives two ``SubmitJobRequest`` requests with the same id, then the second request will be ignored and the first @@ -2050,8 +2050,8 @@ class DataprocUpdateClusterOperator(BaseOperator): :param cluster: Required. The changes to the cluster. If a dict is provided, it must be of the same form as the protobuf message - :class:`~google.cloud.dataproc_v1beta2.types.Cluster` - :type cluster: Union[Dict, google.cloud.dataproc_v1beta2.types.Cluster] + :class:`~google.cloud.dataproc_v1.types.Cluster` + :type cluster: Union[Dict, google.cloud.dataproc_v1.types.Cluster] :param update_mask: Required. Specifies the path, relative to ``Cluster``, of the field to update. 
For example, to change the number of workers in a cluster to 5, the ``update_mask`` parameter would be specified as ``config.worker_config.num_instances``, and the ``PATCH`` request body would specify the diff --git a/airflow/providers/google/cloud/sensors/dataproc.py b/airflow/providers/google/cloud/sensors/dataproc.py index 2bcfbe138a7be..fd5ead0ac7729 100644 --- a/airflow/providers/google/cloud/sensors/dataproc.py +++ b/airflow/providers/google/cloud/sensors/dataproc.py @@ -20,7 +20,7 @@ import warnings from typing import Optional -from google.cloud.dataproc_v1beta2.types import JobStatus +from google.cloud.dataproc_v1.types import JobStatus from airflow.exceptions import AirflowException from airflow.providers.google.cloud.hooks.dataproc import DataprocHook diff --git a/setup.py b/setup.py index b742e2825243c..c3014fd2a4e52 100644 --- a/setup.py +++ b/setup.py @@ -303,7 +303,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'google-cloud-build>=3.0.0,<4.0.0', 'google-cloud-container>=0.1.1,<2.0.0', 'google-cloud-datacatalog>=3.0.0,<4.0.0', - 'google-cloud-dataproc>=2.2.0,<2.6.0', + 'google-cloud-dataproc>=2.2.0,<4.0.0', 'google-cloud-dlp>=0.11.0,<2.0.0', 'google-cloud-kms>=2.0.0,<3.0.0', 'google-cloud-language>=1.1.1,<2.0.0', diff --git a/tests/providers/google/cloud/hooks/test_dataproc.py b/tests/providers/google/cloud/hooks/test_dataproc.py index 32e61817fd462..c81c39cc6eb49 100644 --- a/tests/providers/google/cloud/hooks/test_dataproc.py +++ b/tests/providers/google/cloud/hooks/test_dataproc.py @@ -20,7 +20,7 @@ from unittest import mock import pytest -from google.cloud.dataproc_v1beta2 import JobStatus +from google.cloud.dataproc_v1 import JobStatus from airflow.exceptions import AirflowException from airflow.providers.google.cloud.hooks.dataproc import DataprocHook, DataProcJobBuilder diff --git a/tests/providers/google/cloud/sensors/test_dataproc.py b/tests/providers/google/cloud/sensors/test_dataproc.py index 0a5b8f5bed7f6..0f9f09638a222 100644 --- a/tests/providers/google/cloud/sensors/test_dataproc.py +++ b/tests/providers/google/cloud/sensors/test_dataproc.py @@ -19,7 +19,7 @@ from unittest import mock import pytest -from google.cloud.dataproc_v1beta2.types import JobStatus +from google.cloud.dataproc_v1.types import JobStatus from airflow import AirflowException from airflow.providers.google.cloud.sensors.dataproc import DataprocJobSensor From 2ad02ef545769912eb17e5047020be5a9ca94f91 Mon Sep 17 00:00:00 2001 From: Wojciech Januszek Date: Mon, 8 Nov 2021 00:37:21 +0100 Subject: [PATCH 079/250] Add dataproc metastore operators (#18945) (cherry picked from commit 26ad55beb00f5a0915ba4bec541e3d67044834e9) --- .../example_dataproc_metastore.py | 216 ++++ .../google/cloud/hooks/dataproc_metastore.py | 676 +++++++++++ .../cloud/operators/dataproc_metastore.py | 1068 +++++++++++++++++ airflow/providers/google/provider.yaml | 11 + .../operators/cloud/dataproc_metastore.rst | 196 +++ setup.py | 1 + .../cloud/hooks/test_dataproc_metastore.py | 489 ++++++++ .../operators/test_dataproc_metastore.py | 396 ++++++ .../test_dataproc_metastore_system.py | 40 + 9 files changed, 3093 insertions(+) create mode 100644 airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py create mode 100644 airflow/providers/google/cloud/hooks/dataproc_metastore.py create mode 100644 airflow/providers/google/cloud/operators/dataproc_metastore.py create mode 100644 docs/apache-airflow-providers-google/operators/cloud/dataproc_metastore.rst create mode 100644 
tests/providers/google/cloud/hooks/test_dataproc_metastore.py create mode 100644 tests/providers/google/cloud/operators/test_dataproc_metastore.py create mode 100644 tests/providers/google/cloud/operators/test_dataproc_metastore_system.py diff --git a/airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py b/airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py new file mode 100644 index 0000000000000..563a0443bc70f --- /dev/null +++ b/airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py @@ -0,0 +1,216 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Example Airflow DAG that show how to use various Dataproc Metastore +operators to manage a service. +""" + +import datetime +import os + +from airflow import models +from airflow.models.baseoperator import chain +from airflow.providers.google.cloud.operators.dataproc_metastore import ( + DataprocMetastoreCreateBackupOperator, + DataprocMetastoreCreateMetadataImportOperator, + DataprocMetastoreCreateServiceOperator, + DataprocMetastoreDeleteBackupOperator, + DataprocMetastoreDeleteServiceOperator, + DataprocMetastoreExportMetadataOperator, + DataprocMetastoreGetServiceOperator, + DataprocMetastoreListBackupsOperator, + DataprocMetastoreRestoreServiceOperator, + DataprocMetastoreUpdateServiceOperator, +) + +PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "") +SERVICE_ID = os.environ.get("GCP_DATAPROC_METASTORE_SERVICE_ID", "dataproc-metastore-system-tests-service-1") +BACKUP_ID = os.environ.get("GCP_DATAPROC_METASTORE_BACKUP_ID", "dataproc-metastore-system-tests-backup-1") +REGION = os.environ.get("GCP_REGION", "") +BUCKET = os.environ.get("GCP_DATAPROC_METASTORE_BUCKET", "INVALID BUCKET NAME") +METADATA_IMPORT_FILE = os.environ.get("GCS_METADATA_IMPORT_FILE", None) +GCS_URI = os.environ.get("GCS_URI", f"gs://{BUCKET}/data/hive.sql") +METADATA_IMPORT_ID = "dataproc-metastore-system-tests-metadata-import-1" +TIMEOUT = 1200 +DB_TYPE = "MYSQL" +DESTINATION_GCS_FOLDER = f"gs://{BUCKET}/>" + +# Service definition +# Docs: https://cloud.google.com/dataproc-metastore/docs/reference/rest/v1/projects.locations.services#Service +# [START how_to_cloud_dataproc_metastore_create_service] +SERVICE = { + "name": "test-service", +} +# [END how_to_cloud_dataproc_metastore_create_service] + +# Update service +# [START how_to_cloud_dataproc_metastore_update_service] +SERVICE_TO_UPDATE = { + "labels": { + "mylocalmachine": "mylocalmachine", + "systemtest": "systemtest", + } +} +UPDATE_MASK = {"paths": ["labels"]} +# [END how_to_cloud_dataproc_metastore_update_service] + +# Backup definition +# [START how_to_cloud_dataproc_metastore_create_backup] +BACKUP = { + "name": "test-backup", +} +# [END how_to_cloud_dataproc_metastore_create_backup] + +# Metadata import 
definition +# [START how_to_cloud_dataproc_metastore_create_metadata_import] +METADATA_IMPORT = { + "name": "test-metadata-import", + "database_dump": { + "gcs_uri": GCS_URI, + "database_type": DB_TYPE, + }, +} +# [END how_to_cloud_dataproc_metastore_create_metadata_import] + + +with models.DAG( + "example_gcp_dataproc_metastore", start_date=datetime.datetime(2021, 1, 1), schedule_interval="@once" +) as dag: + # [START how_to_cloud_dataproc_metastore_create_service_operator] + create_service = DataprocMetastoreCreateServiceOperator( + task_id="create_service", + region=REGION, + project_id=PROJECT_ID, + service=SERVICE, + service_id=SERVICE_ID, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_create_service_operator] + + # [START how_to_cloud_dataproc_metastore_get_service_operator] + get_service_details = DataprocMetastoreGetServiceOperator( + task_id="get_service", + region=REGION, + project_id=PROJECT_ID, + service_id=SERVICE_ID, + ) + # [END how_to_cloud_dataproc_metastore_get_service_operator] + + # [START how_to_cloud_dataproc_metastore_update_service_operator] + update_service = DataprocMetastoreUpdateServiceOperator( + task_id="update_service", + project_id=PROJECT_ID, + service_id=SERVICE_ID, + region=REGION, + service=SERVICE_TO_UPDATE, + update_mask=UPDATE_MASK, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_update_service_operator] + + # [START how_to_cloud_dataproc_metastore_create_metadata_import_operator] + import_metadata = DataprocMetastoreCreateMetadataImportOperator( + task_id="create_metadata_import", + project_id=PROJECT_ID, + region=REGION, + service_id=SERVICE_ID, + metadata_import=METADATA_IMPORT, + metadata_import_id=METADATA_IMPORT_ID, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_create_metadata_import_operator] + + # [START how_to_cloud_dataproc_metastore_export_metadata_operator] + export_metadata = DataprocMetastoreExportMetadataOperator( + task_id="export_metadata", + destination_gcs_folder=DESTINATION_GCS_FOLDER, + project_id=PROJECT_ID, + region=REGION, + service_id=SERVICE_ID, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_export_metadata_operator] + + # [START how_to_cloud_dataproc_metastore_create_backup_operator] + backup_service = DataprocMetastoreCreateBackupOperator( + task_id="create_backup", + project_id=PROJECT_ID, + region=REGION, + service_id=SERVICE_ID, + backup=BACKUP, + backup_id=BACKUP_ID, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_create_backup_operator] + + # [START how_to_cloud_dataproc_metastore_list_backups_operator] + list_backups = DataprocMetastoreListBackupsOperator( + task_id="list_backups", + project_id=PROJECT_ID, + region=REGION, + service_id=SERVICE_ID, + ) + # [END how_to_cloud_dataproc_metastore_list_backups_operator] + + # [START how_to_cloud_dataproc_metastore_delete_backup_operator] + delete_backup = DataprocMetastoreDeleteBackupOperator( + task_id="delete_backup", + project_id=PROJECT_ID, + region=REGION, + service_id=SERVICE_ID, + backup_id=BACKUP_ID, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_delete_backup_operator] + + # [START how_to_cloud_dataproc_metastore_restore_service_operator] + restore_service = DataprocMetastoreRestoreServiceOperator( + task_id="restore_metastore", + region=REGION, + project_id=PROJECT_ID, + service_id=SERVICE_ID, + backup_id=BACKUP_ID, + backup_region=REGION, + backup_project_id=PROJECT_ID, + backup_service_id=SERVICE_ID, + timeout=TIMEOUT, + ) + # [END 
how_to_cloud_dataproc_metastore_restore_service_operator] + + # [START how_to_cloud_dataproc_metastore_delete_service_operator] + delete_service = DataprocMetastoreDeleteServiceOperator( + task_id="delete_service", + region=REGION, + project_id=PROJECT_ID, + service_id=SERVICE_ID, + timeout=TIMEOUT, + ) + # [END how_to_cloud_dataproc_metastore_delete_service_operator] + + chain( + create_service, + update_service, + get_service_details, + backup_service, + list_backups, + restore_service, + delete_backup, + export_metadata, + import_metadata, + delete_service, + ) diff --git a/airflow/providers/google/cloud/hooks/dataproc_metastore.py b/airflow/providers/google/cloud/hooks/dataproc_metastore.py new file mode 100644 index 0000000000000..7a645ff49742c --- /dev/null +++ b/airflow/providers/google/cloud/hooks/dataproc_metastore.py @@ -0,0 +1,676 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +"""This module contains a Google Cloud Dataproc Metastore hook.""" + +from typing import Dict, Optional, Sequence, Tuple, Union + +from google.api_core.operation import Operation +from google.api_core.retry import Retry +from google.cloud.metastore_v1 import DataprocMetastoreClient +from google.cloud.metastore_v1.types import Backup, MetadataImport, Service +from google.cloud.metastore_v1.types.metastore import DatabaseDumpSpec, Restore +from google.protobuf.field_mask_pb2 import FieldMask + +from airflow.exceptions import AirflowException +from airflow.providers.google.common.hooks.base_google import GoogleBaseHook + + +class DataprocMetastoreHook(GoogleBaseHook): + """Hook for Google Cloud Dataproc Metastore APIs.""" + + def get_dataproc_metastore_client(self) -> DataprocMetastoreClient: + """Returns DataprocMetastoreClient.""" + client_options = {'api_endpoint': 'metastore.googleapis.com:443'} + + return DataprocMetastoreClient( + credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + ) + + def wait_for_operation(self, timeout: float, operation: Operation): + """Waits for long-lasting operation to complete.""" + try: + return operation.result(timeout=timeout) + except Exception: + error = operation.exception(timeout=timeout) + raise AirflowException(error) + + @GoogleBaseHook.fallback_to_default_project_id + def create_backup( + self, + project_id: str, + region: str, + service_id: str, + backup: Backup, + backup_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Creates a new backup in a given project and location. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. 
The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param backup: Required. The backup to create. The ``name`` field is ignored. The ID of the created + backup must be provided in the request's ``backup_id`` field. + + This corresponds to the ``backup`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type backup: google.cloud.metastore_v1.types.Backup + :param backup_id: Required. The ID of the backup, which is used as the final component of the + backup's name. This value must be between 1 and 64 characters long, begin with a letter, end with + a letter or number, and consist of alphanumeric ASCII characters or hyphens. + + This corresponds to the ``backup_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type backup_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + parent = f'projects/{project_id}/locations/{region}/services/{service_id}' + + client = self.get_dataproc_metastore_client() + result = client.create_backup( + request={ + 'parent': parent, + 'backup': backup, + 'backup_id': backup_id, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def create_metadata_import( + self, + project_id: str, + region: str, + service_id: str, + metadata_import: MetadataImport, + metadata_import_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Creates a new MetadataImport in a given project and location. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param metadata_import: Required. The metadata import to create. The ``name`` field is ignored. The + ID of the created metadata import must be provided in the request's ``metadata_import_id`` field. 
+ + This corresponds to the ``metadata_import`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type metadata_import: google.cloud.metastore_v1.types.MetadataImport + :param metadata_import_id: Required. The ID of the metadata import, which is used as the final + component of the metadata import's name. This value must be between 1 and 64 characters long, + begin with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``metadata_import_id`` field on the ``request`` instance; if ``request`` + is provided, this should not be set. + :type metadata_import_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + parent = f'projects/{project_id}/locations/{region}/services/{service_id}' + + client = self.get_dataproc_metastore_client() + result = client.create_metadata_import( + request={ + 'parent': parent, + 'metadata_import': metadata_import, + 'metadata_import_id': metadata_import_id, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def create_service( + self, + region: str, + project_id: str, + service: Union[Dict, Service], + service_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + ): + """ + Creates a metastore service in a project and location. + + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param service: Required. The Metastore service to create. The ``name`` field is ignored. The ID of + the created metastore service must be provided in the request's ``service_id`` field. + + This corresponds to the ``service`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type service: google.cloud.metastore_v1.types.Service + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. 
+ :type metadata: Sequence[Tuple[str, str]] + """ + parent = f'projects/{project_id}/locations/{region}' + + client = self.get_dataproc_metastore_client() + result = client.create_service( + request={ + 'parent': parent, + 'service_id': service_id, + 'service': service if service else {}, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def delete_backup( + self, + project_id: str, + region: str, + service_id: str, + backup_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Deletes a single backup. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param backup_id: Required. The ID of the backup, which is used as the final component of the + backup's name. This value must be between 1 and 64 characters long, begin with a letter, end with + a letter or number, and consist of alphanumeric ASCII characters or hyphens. + + This corresponds to the ``backup_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type backup_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + name = f'projects/{project_id}/locations/{region}/services/{service_id}/backups/{backup_id}' + + client = self.get_dataproc_metastore_client() + result = client.delete_backup( + request={ + 'name': name, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def delete_service( + self, + project_id: str, + region: str, + service_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Deletes a single service. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. 
+ + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + name = f'projects/{project_id}/locations/{region}/services/{service_id}' + + client = self.get_dataproc_metastore_client() + result = client.delete_service( + request={ + 'name': name, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def export_metadata( + self, + destination_gcs_folder: str, + project_id: str, + region: str, + service_id: str, + request_id: Optional[str] = None, + database_dump_type: Optional[DatabaseDumpSpec] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Exports metadata from a service. + + :param destination_gcs_folder: A Cloud Storage URI of a folder, in the format + ``gs:///``. A sub-folder + ```` containing exported files will be + created below it. + :type destination_gcs_folder: str + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param database_dump_type: Optional. The type of the database dump. If unspecified, + defaults to ``MYSQL``. + :type database_dump_type: google.cloud.metastore_v1.types.DatabaseDumpSpec.Type + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + service = f'projects/{project_id}/locations/{region}/services/{service_id}' + + client = self.get_dataproc_metastore_client() + result = client.export_metadata( + request={ + 'destination_gcs_folder': destination_gcs_folder, + 'service': service, + 'request_id': request_id, + 'database_dump_type': database_dump_type, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def get_service( + self, + project_id: str, + region: str, + service_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Gets the details of a single service. + + :param project_id: Required. 
The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + name = f'projects/{project_id}/locations/{region}/services/{service_id}' + + client = self.get_dataproc_metastore_client() + result = client.get_service( + request={ + 'name': name, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def list_backups( + self, + project_id: str, + region: str, + service_id: str, + page_size: Optional[int] = None, + page_token: Optional[str] = None, + filter: Optional[str] = None, + order_by: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Lists backups in a service. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param page_size: Optional. The maximum number of backups to + return. The response may contain less than the + maximum number. If unspecified, no more than 500 + backups are returned. The maximum value is 1000; + values above 1000 are changed to 1000. + :type page_size: int + :param page_token: Optional. A page token, received from a previous + [DataprocMetastore.ListBackups][google.cloud.metastore.v1.DataprocMetastore.ListBackups] + call. Provide this token to retrieve the subsequent page. + To retrieve the first page, supply an empty page token. + When paginating, other parameters provided to + [DataprocMetastore.ListBackups][google.cloud.metastore.v1.DataprocMetastore.ListBackups] + must match the call that provided the page token. + :type page_token: str + :param filter: Optional. The filter to apply to list + results. + :type filter: str + :param order_by: Optional. Specify the ordering of results as described in + `Sorting + Order `__. + If not specified, the results will be sorted in the default + order. + :type order_by: str + :param retry: Designation of what errors, if any, should be retried. 
+ :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + """ + parent = f'projects/{project_id}/locations/{region}/services/{service_id}/backups' + + client = self.get_dataproc_metastore_client() + result = client.list_backups( + request={ + 'parent': parent, + 'page_size': page_size, + 'page_token': page_token, + 'filter': filter, + 'order_by': order_by, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def restore_service( + self, + project_id: str, + region: str, + service_id: str, + backup_project_id: str, + backup_region: str, + backup_service_id: str, + backup_id: str, + restore_type: Optional[Restore] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Restores a service from a backup. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param backup_project_id: Required. The ID of the Google Cloud project that the metastore service + backup to restore from. + :type backup_project_id: str + :param backup_region: Required. The ID of the Google Cloud region that the metastore + service backup to restore from. + :type backup_region: str + :param backup_service_id: Required. The ID of the metastore service backup to restore from, + which is used as the final component of the metastore service's name. This value must be + between 2 and 63 characters long inclusive, begin with a letter, end with a letter or number, + and consist of alphanumeric ASCII characters or hyphens. + :type backup_service_id: str + :param backup_id: Required. The ID of the metastore service backup to restore from + :type backup_id: str + :param restore_type: Optional. The type of restore. If unspecified, defaults to + ``METADATA_ONLY`` + :type restore_type: google.cloud.metastore_v1.types.Restore.RestoreType + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. 
+ :type metadata: Sequence[Tuple[str, str]] + """ + service = f'projects/{project_id}/locations/{region}/services/{service_id}' + backup = ( + f'projects/{backup_project_id}/locations/{backup_region}/services/' + f'{backup_service_id}/backups/{backup_id}' + ) + + client = self.get_dataproc_metastore_client() + result = client.restore_service( + request={ + 'service': service, + 'backup': backup, + 'restore_type': restore_type, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def update_service( + self, + project_id: str, + region: str, + service_id: str, + service: Union[Dict, Service], + update_mask: FieldMask, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Updates the parameters of a single service. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param service: Required. The metastore service to update. The server only merges fields in the + service if they are specified in ``update_mask``. + + The metastore service's ``name`` field is used to identify the metastore service to be updated. + + This corresponds to the ``service`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type service: Union[Dict, google.cloud.metastore_v1.types.Service] + :param update_mask: Required. A field mask used to specify the fields to be overwritten in the + metastore service resource by the update. Fields specified in the ``update_mask`` are relative to + the resource (not to the full request). A field is overwritten if it is in the mask. + + This corresponds to the ``update_mask`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type update_mask: google.protobuf.field_mask_pb2.FieldMask + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. 
+ :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_dataproc_metastore_client() + + service_name = f'projects/{project_id}/locations/{region}/services/{service_id}' + + service["name"] = service_name + + result = client.update_service( + request={ + 'service': service, + 'update_mask': update_mask, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result diff --git a/airflow/providers/google/cloud/operators/dataproc_metastore.py b/airflow/providers/google/cloud/operators/dataproc_metastore.py new file mode 100644 index 0000000000000..2823b72cd4422 --- /dev/null +++ b/airflow/providers/google/cloud/operators/dataproc_metastore.py @@ -0,0 +1,1068 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +"""This module contains Google Dataproc Metastore operators.""" + +from time import sleep +from typing import Dict, Optional, Sequence, Tuple, Union + +from google.api_core.retry import Retry, exponential_sleep_generator +from google.cloud.metastore_v1 import MetadataExport, MetadataManagementActivity +from google.cloud.metastore_v1.types import Backup, MetadataImport, Service +from google.cloud.metastore_v1.types.metastore import DatabaseDumpSpec, Restore +from google.protobuf.field_mask_pb2 import FieldMask +from googleapiclient.errors import HttpError + +from airflow import AirflowException +from airflow.models import BaseOperator +from airflow.providers.google.cloud.hooks.dataproc_metastore import DataprocMetastoreHook + + +class DataprocMetastoreCreateBackupOperator(BaseOperator): + """ + Creates a new backup in a given project and location. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param backup: Required. The backup to create. The ``name`` field is ignored. The ID of the created + backup must be provided in the request's ``backup_id`` field. + + This corresponds to the ``backup`` field on the ``request`` instance; if ``request`` is provided, this + should not be set. + :type backup: google.cloud.metastore_v1.types.Backup + :param backup_id: Required. The ID of the backup, which is used as the final component of the backup's + name. 
This value must be between 1 and 64 characters long, begin with a letter, end with a letter or + number, and consist of alphanumeric ASCII characters or hyphens. + + This corresponds to the ``backup_id`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type backup_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Optional. Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: Optional. The timeout for this request. + :type timeout: float + :param metadata: Optional. Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'backup', + 'impersonation_chain', + ) + template_fields_renderers = {'backup': 'json'} + + def __init__( + self, + *, + project_id: str, + region: str, + service_id: str, + backup: Union[Dict, Backup], + backup_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.service_id = service_id + self.backup = backup + self.backup_id = backup_id + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: dict) -> dict: + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Creating Dataproc Metastore backup: %s", self.backup_id) + + try: + operation = hook.create_backup( + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + backup=self.backup, + backup_id=self.backup_id, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + backup = hook.wait_for_operation(self.timeout, operation) + self.log.info("Backup %s created successfully", self.backup_id) + except HttpError as err: + if err.resp.status not in (409, '409'): + raise + self.log.info("Backup %s already exists", self.backup_id) + backup = hook.get_backup( + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + backup_id=self.backup_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return Backup.to_dict(backup) + + +class DataprocMetastoreCreateMetadataImportOperator(BaseOperator): + """ + Creates a new MetadataImport in a 
given project and location. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param metadata_import: Required. The metadata import to create. The ``name`` field is ignored. The ID of + the created metadata import must be provided in the request's ``metadata_import_id`` field. + + This corresponds to the ``metadata_import`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type metadata_import: google.cloud.metastore_v1.types.MetadataImport + :param metadata_import_id: Required. The ID of the metadata import, which is used as the final component + of the metadata import's name. This value must be between 1 and 64 characters long, begin with a + letter, end with a letter or number, and consist of alphanumeric ASCII characters or hyphens. + + This corresponds to the ``metadata_import_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type metadata_import_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Optional. Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: Optional. The timeout for this request. + :type timeout: float + :param metadata: Optional. Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). 
+ :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'metadata_import', + 'impersonation_chain', + ) + template_fields_renderers = {'metadata_import': 'json'} + + def __init__( + self, + *, + project_id: str, + region: str, + service_id: str, + metadata_import: MetadataImport, + metadata_import_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.service_id = service_id + self.metadata_import = metadata_import + self.metadata_import_id = metadata_import_id + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: dict): + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Creating Dataproc Metastore metadata import: %s", self.metadata_import_id) + operation = hook.create_metadata_import( + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + metadata_import=self.metadata_import, + metadata_import_id=self.metadata_import_id, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + metadata_import = hook.wait_for_operation(self.timeout, operation) + self.log.info("Metadata import %s created successfully", self.metadata_import_id) + return MetadataImport.to_dict(metadata_import) + + +class DataprocMetastoreCreateServiceOperator(BaseOperator): + """ + Creates a metastore service in a project and location. + + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param service: Required. The Metastore service to create. The ``name`` field is ignored. The ID of + the created metastore service must be provided in the request's ``service_id`` field. + + This corresponds to the ``service`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type service: google.cloud.metastore_v1.types.Service + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. 
+ :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'service', + 'impersonation_chain', + ) + template_fields_renderers = {'service': 'json'} + + def __init__( + self, + *, + region: str, + project_id: str, + service: Optional[Union[Dict, Service]] = None, + service_id: str, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.region = region + self.project_id = project_id + self.service = service + self.service_id = service_id + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context) -> dict: + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Creating Dataproc Metastore service: %s", self.project_id) + try: + operation = hook.create_service( + region=self.region, + project_id=self.project_id, + service=self.service, + service_id=self.service_id, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + service = hook.wait_for_operation(self.timeout, operation) + self.log.info("Service %s created successfully", self.service_id) + except HttpError as err: + if err.resp.status not in (409, '409'): + raise + self.log.info("Instance %s already exists", self.service_id) + service = hook.get_service( + region=self.region, + project_id=self.project_id, + service_id=self.service_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return Service.to_dict(service) + + +class DataprocMetastoreDeleteBackupOperator(BaseOperator): + """ + Deletes a single backup. + + :param project_id: Required. The ID of the Google Cloud project that the backup belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the backup belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param backup_id: Required. The ID of the backup, which is used as the final component of the backup's + name. 
This value must be between 1 and 64 characters long, begin with a letter, end with a letter or
+        number, and consist of alphanumeric ASCII characters or hyphens.
+
+        This corresponds to the ``backup_id`` field on the ``request`` instance; if ``request`` is provided,
+        this should not be set.
+    :type backup_id: str
+    :param request_id: Optional. A unique id used to identify the request.
+    :type request_id: str
+    :param retry: Optional. Designation of what errors, if any, should be retried.
+    :type retry: google.api_core.retry.Retry
+    :param timeout: Optional. The timeout for this request.
+    :type timeout: float
+    :param metadata: Optional. Strings which should be sent along with the request as metadata.
+    :type metadata: Sequence[Tuple[str, str]]
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :type gcp_conn_id: str
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :type impersonation_chain: Union[str, Sequence[str]]
+    """
+
+    template_fields = (
+        'project_id',
+        'impersonation_chain',
+    )
+
+    def __init__(
+        self,
+        *,
+        project_id: str,
+        region: str,
+        service_id: str,
+        backup_id: str,
+        request_id: Optional[str] = None,
+        retry: Optional[Retry] = None,
+        timeout: Optional[float] = None,
+        metadata: Optional[Sequence[Tuple[str, str]]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.project_id = project_id
+        self.region = region
+        self.service_id = service_id
+        self.backup_id = backup_id
+        self.request_id = request_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context: dict) -> None:
+        hook = DataprocMetastoreHook(
+            gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain
+        )
+        self.log.info("Deleting Dataproc Metastore backup: %s", self.backup_id)
+        operation = hook.delete_backup(
+            project_id=self.project_id,
+            region=self.region,
+            service_id=self.service_id,
+            backup_id=self.backup_id,
+            request_id=self.request_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        hook.wait_for_operation(self.timeout, operation)
+        self.log.info("Backup %s deleted successfully", self.backup_id)
+
+
+class DataprocMetastoreDeleteServiceOperator(BaseOperator):
+    """
+    Deletes a single service.
+
+    :param region: Required. The ID of the Google Cloud region that the service belongs to.
+    :type region: str
+    :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+    :type project_id: str
+    :param service_id: Required. The ID of the metastore service, which is used as the final component of
+        the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin
+        with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or
+        hyphens.
+    :type service_id: str
+    :param retry: Designation of what errors, if any, should be retried.
+    :type retry: google.api_core.retry.Retry
+    :param timeout: The timeout for this request.
+    :type timeout: float
+    :param metadata: Strings which should be sent along with the request as metadata.
+    :type metadata: Sequence[Tuple[str, str]]
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :type gcp_conn_id: str
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :type impersonation_chain: Union[str, Sequence[str]]
+    """
+
+    template_fields = (
+        'project_id',
+        'impersonation_chain',
+    )
+
+    def __init__(
+        self,
+        *,
+        region: str,
+        project_id: str,
+        service_id: str,
+        retry: Optional[Retry] = None,
+        timeout: Optional[float] = None,
+        metadata: Optional[Sequence[Tuple[str, str]]] = (),
+        gcp_conn_id: str = "google_cloud_default",
+        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.region = region
+        self.project_id = project_id
+        self.service_id = service_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self.impersonation_chain = impersonation_chain
+
+    def execute(self, context) -> None:
+        hook = DataprocMetastoreHook(
+            gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain
+        )
+        self.log.info("Deleting Dataproc Metastore service: %s", self.service_id)
+        operation = hook.delete_service(
+            region=self.region,
+            project_id=self.project_id,
+            service_id=self.service_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+        hook.wait_for_operation(self.timeout, operation)
+        self.log.info("Service %s deleted successfully", self.service_id)
+
+
+class DataprocMetastoreExportMetadataOperator(BaseOperator):
+    """
+    Exports metadata from a service.
+
+    :param destination_gcs_folder: A Cloud Storage URI of a folder, in the format
+        ``gs://<bucket_name>/<path_inside_bucket>``. A sub-folder
+        ``<export_folder>`` containing exported files will be
+        created below it.
+    :type destination_gcs_folder: str
+    :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
+    :type project_id: str
+    :param region: Required. The ID of the Google Cloud region that the service belongs to.
+    :type region: str
+    :param service_id: Required. The ID of the metastore service, which is used as the final component of
+        the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin
+        with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or
+        hyphens.
+        This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is
+        provided, this should not be set.
+    :type service_id: str
+    :param request_id: Optional. A unique id used to identify the request.
+    :type request_id: str
+    :param retry: Optional. Designation of what errors, if any, should be retried.
+    :type retry: google.api_core.retry.Retry
+    :param timeout: Optional. The timeout for this request.
+    :type timeout: float
+    :param metadata: Optional. Strings which should be sent along with the request as metadata.
+    :type metadata: Sequence[Tuple[str, str]]
+    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
+    :type gcp_conn_id: str
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+ If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'impersonation_chain', + ) + + def __init__( + self, + *, + destination_gcs_folder: str, + project_id: str, + region: str, + service_id: str, + request_id: Optional[str] = None, + database_dump_type: Optional[DatabaseDumpSpec] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.destination_gcs_folder = destination_gcs_folder + self.project_id = project_id + self.region = region + self.service_id = service_id + self.request_id = request_id + self.database_dump_type = database_dump_type + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: Dict): + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Exporting metadata from Dataproc Metastore service: %s", self.service_id) + hook.export_metadata( + destination_gcs_folder=self.destination_gcs_folder, + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + request_id=self.request_id, + database_dump_type=self.database_dump_type, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + metadata_export = self._wait_for_export_metadata(hook) + self.log.info("Metadata from service %s exported successfully", self.service_id) + return MetadataExport.to_dict(metadata_export) + + def _wait_for_export_metadata(self, hook: DataprocMetastoreHook): + """ + Workaround to check that export was created successfully. + We discovered a issue to parse result to MetadataExport inside the SDK + """ + for time_to_wait in exponential_sleep_generator(initial=10, maximum=120): + sleep(time_to_wait) + service = hook.get_service( + region=self.region, + project_id=self.project_id, + service_id=self.service_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + activities: MetadataManagementActivity = service.metadata_management_activity + metadata_export: MetadataExport = activities.metadata_exports[0] + if metadata_export.state == MetadataExport.State.SUCCEEDED: + return metadata_export + if metadata_export.state == MetadataExport.State.FAILED: + raise AirflowException( + f"Exporting metadata from Dataproc Metastore {metadata_export.name} FAILED" + ) + + +class DataprocMetastoreGetServiceOperator(BaseOperator): + """ + Gets the details of a single service. + + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. 
+ + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param retry: Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: The timeout for this request. + :type timeout: float + :param metadata: Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'impersonation_chain', + ) + + def __init__( + self, + *, + region: str, + project_id: str, + service_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.region = region + self.project_id = project_id + self.service_id = service_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context) -> dict: + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Gets the details of a single Dataproc Metastore service: %s", self.project_id) + result = hook.get_service( + region=self.region, + project_id=self.project_id, + service_id=self.service_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return Service.to_dict(result) + + +class DataprocMetastoreListBackupsOperator(BaseOperator): + """ + Lists backups in a service. + + :param project_id: Required. The ID of the Google Cloud project that the backup belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the backup belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param retry: Optional. Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: Optional. The timeout for this request. + :type timeout: float + :param metadata: Optional. Strings which should be sent along with the request as metadata. 
+ :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'impersonation_chain', + ) + + def __init__( + self, + *, + project_id: str, + region: str, + service_id: str, + page_size: Optional[int] = None, + page_token: Optional[str] = None, + filter: Optional[str] = None, + order_by: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.service_id = service_id + self.page_size = page_size + self.page_token = page_token + self.filter = filter + self.order_by = order_by + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: dict) -> dict: + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Listing Dataproc Metastore backups: %s", self.service_id) + backups = hook.list_backups( + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + page_size=self.page_size, + page_token=self.page_token, + filter=self.filter, + order_by=self.order_by, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return [Backup.to_dict(backup) for backup in backups] + + +class DataprocMetastoreRestoreServiceOperator(BaseOperator): + """ + Restores a service from a backup. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param backup_project_id: Required. The ID of the Google Cloud project that the metastore + service backup to restore from. + :type backup_project_id: str + :param backup_region: Required. The ID of the Google Cloud region that the metastore + service backup to restore from. + :type backup_region: str + :param backup_service_id: Required. 
The ID of the metastore service backup to restore from, which is + used as the final component of the metastore service's name. This value must be between 2 and 63 + characters long inclusive, begin with a letter, end with a letter or number, and consist + of alphanumeric ASCII characters or hyphens. + :type backup_service_id: str + :param backup_id: Required. The ID of the metastore service backup to restore from + :type backup_id: str + :param restore_type: Optional. The type of restore. If unspecified, defaults to + ``METADATA_ONLY`` + :type restore_type: google.cloud.metastore_v1.types.Restore.RestoreType + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Optional. Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: Optional. The timeout for this request. + :type timeout: float + :param metadata: Optional. Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'impersonation_chain', + ) + + def __init__( + self, + *, + project_id: str, + region: str, + service_id: str, + backup_project_id: str, + backup_region: str, + backup_service_id: str, + backup_id: str, + restore_type: Optional[Restore] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.service_id = service_id + self.backup_project_id = backup_project_id + self.backup_region = backup_region + self.backup_service_id = backup_service_id + self.backup_id = backup_id + self.restore_type = restore_type + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context) -> dict: + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info( + "Restoring Dataproc Metastore service: %s from backup: %s", self.service_id, self.backup_id + ) + hook.restore_service( + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + backup_project_id=self.backup_project_id, + backup_region=self.backup_region, + backup_service_id=self.backup_service_id, + backup_id=self.backup_id, + restore_type=self.restore_type, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + 
self._wait_for_restore_service(hook) + self.log.info("Service %s restored from backup %s", self.service_id, self.backup_id) + + def _wait_for_restore_service(self, hook: DataprocMetastoreHook): + """ + Workaround to check that the restore of the service finished successfully, + since we discovered an issue with parsing the operation result into a Restore object inside the SDK. + """ + for time_to_wait in exponential_sleep_generator(initial=10, maximum=120): + sleep(time_to_wait) + service = hook.get_service( + region=self.region, + project_id=self.project_id, + service_id=self.service_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + activities: MetadataManagementActivity = service.metadata_management_activity + restore_service: Restore = activities.restores[0] + if restore_service.state == Restore.State.SUCCEEDED: + return restore_service + if restore_service.state == Restore.State.FAILED: + raise AirflowException("Restoring service FAILED") + + +class DataprocMetastoreUpdateServiceOperator(BaseOperator): + """ + Updates the parameters of a single service. + + :param project_id: Required. The ID of the Google Cloud project that the service belongs to. + :type project_id: str + :param region: Required. The ID of the Google Cloud region that the service belongs to. + :type region: str + :param service_id: Required. The ID of the metastore service, which is used as the final component of + the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin + with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or + hyphens. + + This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is + provided, this should not be set. + :type service_id: str + :param service: Required. The metastore service to update. The server only merges fields in the service + if they are specified in ``update_mask``. + + The metastore service's ``name`` field is used to identify the metastore service to be updated. + + This corresponds to the ``service`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type service: Union[Dict, google.cloud.metastore_v1.types.Service] + :param update_mask: Required. A field mask used to specify the fields to be overwritten in the metastore + service resource by the update. Fields specified in the ``update_mask`` are relative to the resource + (not to the full request). A field is overwritten if it is in the mask. + + This corresponds to the ``update_mask`` field on the ``request`` instance; if ``request`` is provided, + this should not be set. + :type update_mask: google.protobuf.field_mask_pb2.FieldMask + :param request_id: Optional. A unique id used to identify the request. + :type request_id: str + :param retry: Optional. Designation of what errors, if any, should be retried. + :type retry: google.api_core.retry.Retry + :param timeout: Optional. The timeout for this request. + :type timeout: float + :param metadata: Optional. Strings which should be sent along with the request as metadata. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request.
+ If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'impersonation_chain', + ) + + def __init__( + self, + *, + project_id: str, + region: str, + service_id: str, + service: Union[Dict, Service], + update_mask: Union[Dict, FieldMask], + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = (), + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.project_id = project_id + self.region = region + self.service_id = service_id + self.service = service + self.update_mask = update_mask + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context: Dict): + hook = DataprocMetastoreHook( + gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain + ) + self.log.info("Updating Dataproc Metastore service: %s", self.service.get("name")) + + operation = hook.update_service( + project_id=self.project_id, + region=self.region, + service_id=self.service_id, + service=self.service, + update_mask=self.update_mask, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + hook.wait_for_operation(self.timeout, operation) + self.log.info("Service %s updated successfully", self.service.get("name")) diff --git a/airflow/providers/google/provider.yaml b/airflow/providers/google/provider.yaml index 1e3cca49f6951..066ab5a7d236d 100644 --- a/airflow/providers/google/provider.yaml +++ b/airflow/providers/google/provider.yaml @@ -229,6 +229,11 @@ integrations: - /docs/apache-airflow-providers-google/operators/cloud/dataprep.rst logo: /integration-logos/gcp/Google-Dataprep.png tags: [gcp] + - integration-name: Google Dataproc Metastore + external-doc-url: https://cloud.google.com/dataproc-metastore/ + how-to-guide: + - /docs/apache-airflow-providers-google/operators/cloud/dataproc_metastore.rst + tags: [gcp] - integration-name: Google Dataproc external-doc-url: https://cloud.google.com/dataproc/ how-to-guide: @@ -367,6 +372,9 @@ operators: - integration-name: Google Dataprep python-modules: - airflow.providers.google.cloud.operators.dataprep + - integration-name: Google Dataproc Metastore + python-modules: + - airflow.providers.google.cloud.operators.dataproc_metastore - integration-name: Google Dataproc python-modules: - airflow.providers.google.cloud.operators.dataproc @@ -536,6 +544,9 @@ hooks: - integration-name: Google Dataprep python-modules: - airflow.providers.google.cloud.hooks.dataprep + - integration-name: Google Dataproc Metastore + python-modules: + - airflow.providers.google.cloud.hooks.dataproc_metastore - integration-name: Google Dataproc python-modules: - airflow.providers.google.cloud.hooks.dataproc diff --git a/docs/apache-airflow-providers-google/operators/cloud/dataproc_metastore.rst b/docs/apache-airflow-providers-google/operators/cloud/dataproc_metastore.rst new file mode 
100644 index 0000000000000..c7ff5305c96b6 --- /dev/null +++ b/docs/apache-airflow-providers-google/operators/cloud/dataproc_metastore.rst @@ -0,0 +1,196 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Google Cloud Dataproc Metastore Operators +========================================= + +Dataproc Metastore is a fully managed, highly available, auto-healing serverless +Apache Hive metastore (HMS) that runs on Google Cloud. It supports HMS, serves as +a critical component for managing the metadata of relational entities, +and provides interoperability between data processing applications in the open source data ecosystem. + +For more information about the service, visit `Dataproc Metastore production documentation `__ + +Create a Service +---------------- + +Before you create a Dataproc Metastore service you need to define the service. +For more information about the available fields to pass when creating a service, visit `Dataproc Metastore create service API. `__ + +A simple service configuration can look as follows: + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 0 + :start-after: [START how_to_cloud_dataproc_metastore_create_service] + :end-before: [END how_to_cloud_dataproc_metastore_create_service] + +With this configuration we can create the service: +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreCreateServiceOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_create_service_operator] + :end-before: [END how_to_cloud_dataproc_metastore_create_service_operator] + +Get a service +------------- + +To get a service you can use: + +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreGetServiceOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_get_service_operator] + :end-before: [END how_to_cloud_dataproc_metastore_get_service_operator] + +Update a service +---------------- +You can update the service by providing a service config and an updateMask. +In the updateMask argument you specify the path, relative to Service, of the field to update. +For more information on updateMask and other parameters, take a look at `Dataproc Metastore update service API. `__ + +An example of a new service config and the updateMask: + +..
exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 0 + :start-after: [START how_to_cloud_dataproc_metastore_update_service] + :end-before: [END how_to_cloud_dataproc_metastore_update_service] + +To update a service you can use: +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreUpdateServiceOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_update_service_operator] + :end-before: [END how_to_cloud_dataproc_metastore_update_service_operator] + +Delete a service +---------------- + +To delete a service you can use: + +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreDeleteServiceOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_delete_service_operator] + :end-before: [END how_to_cloud_dataproc_metastore_delete_service_operator] + +Export service metadata +------------------------- + +To export metadata you can use: + +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreExportMetadataOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_export_metadata_operator] + :end-before: [END how_to_cloud_dataproc_metastore_export_metadata_operator] + +Restore a service +----------------- + +To restore a service you can use: + +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreRestoreServiceOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_restore_service_operator] + :end-before: [END how_to_cloud_dataproc_metastore_restore_service_operator] + +Create a metadata import +------------------------ + +Before you create a Dataproc Metastore metadata import you need to define the metadata import. +For more information about the available fields to pass when creating a metadata import, visit `Dataproc Metastore create metadata import API. `__ + +A simple metadata import configuration can look as follows: + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 0 + :start-after: [START how_to_cloud_dataproc_metastore_create_metadata_import] + :end-before: [END how_to_cloud_dataproc_metastore_create_metadata_import] + +To create a metadata import you can use: +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreCreateMetadataImportOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_create_metadata_import_operator] + :end-before: [END how_to_cloud_dataproc_metastore_create_metadata_import_operator] + +Create a Backup +--------------- + +Before you create a Dataproc Metastore backup of the service you need to define the backup.
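+For illustration, the backup definition can be as small as a dictionary with just a name;
+the ``test-backup`` value below mirrors the test fixtures added in this change and is only
+a placeholder:
+
+.. code-block:: python
+
+    BACKUP = {"name": "test-backup"}
+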
+For more information about the available fields to pass when creating a backup, visit `Dataproc Metastore create backup API. `__ + +A simple backup configuration can look as follows: + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 0 + :start-after: [START how_to_cloud_dataproc_metastore_create_backup] + :end-before: [END how_to_cloud_dataproc_metastore_create_backup] + +With this configuration we can create the backup: +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreCreateBackupOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_create_backup_operator] + :end-before: [END how_to_cloud_dataproc_metastore_create_backup_operator] + +Delete a backup +--------------- + +To delete a backup you can use: + +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreDeleteBackupOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_delete_backup_operator] + :end-before: [END how_to_cloud_dataproc_metastore_delete_backup_operator] + +List backups +------------ + +To list backups you can use: + +:class:`~airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreListBackupsOperator` + +.. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_dataproc_metastore.py + :language: python + :dedent: 4 + :start-after: [START how_to_cloud_dataproc_metastore_list_backups_operator] + :end-before: [END how_to_cloud_dataproc_metastore_list_backups_operator] diff --git a/setup.py b/setup.py index c3014fd2a4e52..008e2ef4d9acb 100644 --- a/setup.py +++ b/setup.py @@ -304,6 +304,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'google-cloud-container>=0.1.1,<2.0.0', 'google-cloud-datacatalog>=3.0.0,<4.0.0', 'google-cloud-dataproc>=2.2.0,<4.0.0', + 'google-cloud-dataproc-metastore>=1.2.0,<2.0.0', 'google-cloud-dlp>=0.11.0,<2.0.0', 'google-cloud-kms>=2.0.0,<3.0.0', 'google-cloud-language>=1.1.1,<2.0.0', diff --git a/tests/providers/google/cloud/hooks/test_dataproc_metastore.py b/tests/providers/google/cloud/hooks/test_dataproc_metastore.py new file mode 100644 index 0000000000000..cd8602cb3f5db --- /dev/null +++ b/tests/providers/google/cloud/hooks/test_dataproc_metastore.py @@ -0,0 +1,489 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# + +from unittest import TestCase, mock + +from airflow.providers.google.cloud.hooks.dataproc_metastore import DataprocMetastoreHook +from tests.providers.google.cloud.utils.base_gcp_mock import ( + mock_base_gcp_hook_default_project_id, + mock_base_gcp_hook_no_default_project_id, +) + +TEST_GCP_CONN_ID: str = "test-gcp-conn-id" +TEST_REGION: str = "test-region" +TEST_PROJECT_ID: str = "test-project-id" +TEST_BACKUP: str = "test-backup" +TEST_BACKUP_ID: str = "test-backup-id" +TEST_METADATA_IMPORT: dict = { + "name": "test-metadata-import", + "database_dump": { + "gcs_uri": "gs://bucket_name/path_inside_bucket", + "database_type": "MYSQL", + }, +} +TEST_METADATA_IMPORT_ID: str = "test-metadata-import-id" +TEST_SERVICE: dict = {"name": "test-service"} +TEST_SERVICE_ID: str = "test-service-id" +TEST_SERVICE_TO_UPDATE = { + "labels": { + "first_key": "first_value", + "second_key": "second_value", + } +} +TEST_UPDATE_MASK: dict = {"paths": ["labels"]} +TEST_PARENT: str = "projects/{}/locations/{}" +TEST_PARENT_SERVICES: str = "projects/{}/locations/{}/services/{}" +TEST_PARENT_BACKUPS: str = "projects/{}/locations/{}/services/{}/backups" +TEST_NAME_BACKUPS: str = "projects/{}/locations/{}/services/{}/backups/{}" +TEST_DESTINATION_GCS_FOLDER: str = "gs://bucket_name/path_inside_bucket" + +BASE_STRING = "airflow.providers.google.common.hooks.base_google.{}" +DATAPROC_METASTORE_STRING = "airflow.providers.google.cloud.hooks.dataproc_metastore.{}" + + +class TestDataprocMetastoreWithDefaultProjectIdHook(TestCase): + def setUp(self): + with mock.patch( + BASE_STRING.format("GoogleBaseHook.__init__"), new=mock_base_gcp_hook_default_project_id + ): + self.hook = DataprocMetastoreHook(gcp_conn_id=TEST_GCP_CONN_ID) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_create_backup(self, mock_client) -> None: + self.hook.create_backup( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + backup=TEST_BACKUP, + backup_id=TEST_BACKUP_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.create_backup.assert_called_once_with( + request=dict( + parent=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + backup=TEST_BACKUP, + backup_id=TEST_BACKUP_ID, + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_create_metadata_import(self, mock_client) -> None: + self.hook.create_metadata_import( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + metadata_import=TEST_METADATA_IMPORT, + metadata_import_id=TEST_METADATA_IMPORT_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.create_metadata_import.assert_called_once_with( + request=dict( + parent=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + metadata_import=TEST_METADATA_IMPORT, + metadata_import_id=TEST_METADATA_IMPORT_ID, + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_create_service(self, mock_client) -> None: + self.hook.create_service( + region=TEST_REGION, + project_id=TEST_PROJECT_ID, + service=TEST_SERVICE, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.create_service.assert_called_once_with( + request=dict( + 
parent=TEST_PARENT.format(TEST_PROJECT_ID, TEST_REGION), + service_id=TEST_SERVICE_ID, + service=TEST_SERVICE, + request_id=None, + ), + metadata=(), + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_delete_backup(self, mock_client) -> None: + self.hook.delete_backup( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.delete_backup.assert_called_once_with( + request=dict( + name=TEST_NAME_BACKUPS.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID, TEST_BACKUP_ID), + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_delete_service(self, mock_client) -> None: + self.hook.delete_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.delete_service.assert_called_once_with( + request=dict( + name=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + request_id=None, + ), + retry=None, + timeout=None, + metadata=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_export_metadata(self, mock_client) -> None: + self.hook.export_metadata( + destination_gcs_folder=TEST_DESTINATION_GCS_FOLDER, + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.export_metadata.assert_called_once_with( + request=dict( + destination_gcs_folder=TEST_DESTINATION_GCS_FOLDER, + service=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + request_id=None, + database_dump_type=None, + ), + retry=None, + timeout=None, + metadata=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_get_service(self, mock_client) -> None: + self.hook.get_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.get_service.assert_called_once_with( + request=dict( + name=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_list_backups(self, mock_client) -> None: + self.hook.list_backups( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.list_backups.assert_called_once_with( + request=dict( + parent=TEST_PARENT_BACKUPS.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + page_size=None, + page_token=None, + filter=None, + order_by=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_restore_service(self, mock_client) -> None: + self.hook.restore_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + backup_project_id=TEST_PROJECT_ID, + backup_region=TEST_REGION, + backup_service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + ) + 
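+        # The hook is expected to expand the plain ids into fully-qualified service
+        # and backup resource names before calling the generated client.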
mock_client.assert_called_once() + mock_client.return_value.restore_service.assert_called_once_with( + request=dict( + service=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + backup=TEST_NAME_BACKUPS.format( + TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID, TEST_BACKUP_ID + ), + restore_type=None, + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_update_service(self, mock_client) -> None: + self.hook.update_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + service=TEST_SERVICE_TO_UPDATE, + update_mask=TEST_UPDATE_MASK, + ) + mock_client.assert_called_once() + mock_client.return_value.update_service.assert_called_once_with( + request=dict( + service=TEST_SERVICE_TO_UPDATE, + update_mask=TEST_UPDATE_MASK, + request_id=None, + ), + retry=None, + timeout=None, + metadata=None, + ) + + +class TestDataprocMetastoreWithoutDefaultProjectIdHook(TestCase): + def setUp(self): + with mock.patch( + BASE_STRING.format("GoogleBaseHook.__init__"), new=mock_base_gcp_hook_no_default_project_id + ): + self.hook = DataprocMetastoreHook(gcp_conn_id=TEST_GCP_CONN_ID) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_create_backup(self, mock_client) -> None: + self.hook.create_backup( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + backup=TEST_BACKUP, + backup_id=TEST_BACKUP_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.create_backup.assert_called_once_with( + request=dict( + parent=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + backup=TEST_BACKUP, + backup_id=TEST_BACKUP_ID, + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_create_metadata_import(self, mock_client) -> None: + self.hook.create_metadata_import( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + metadata_import=TEST_METADATA_IMPORT, + metadata_import_id=TEST_METADATA_IMPORT_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.create_metadata_import.assert_called_once_with( + request=dict( + parent=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + metadata_import=TEST_METADATA_IMPORT, + metadata_import_id=TEST_METADATA_IMPORT_ID, + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_create_service(self, mock_client) -> None: + self.hook.create_service( + region=TEST_REGION, + project_id=TEST_PROJECT_ID, + service=TEST_SERVICE, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.create_service.assert_called_once_with( + request=dict( + parent=TEST_PARENT.format(TEST_PROJECT_ID, TEST_REGION), + service_id=TEST_SERVICE_ID, + service=TEST_SERVICE, + request_id=None, + ), + metadata=(), + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_delete_backup(self, mock_client) -> None: + self.hook.delete_backup( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + 
backup_id=TEST_BACKUP_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.delete_backup.assert_called_once_with( + request=dict( + name=TEST_NAME_BACKUPS.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID, TEST_BACKUP_ID), + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_delete_service(self, mock_client) -> None: + self.hook.delete_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.delete_service.assert_called_once_with( + request=dict( + name=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + request_id=None, + ), + retry=None, + timeout=None, + metadata=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_export_metadata(self, mock_client) -> None: + self.hook.export_metadata( + destination_gcs_folder=TEST_DESTINATION_GCS_FOLDER, + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.export_metadata.assert_called_once_with( + request=dict( + destination_gcs_folder=TEST_DESTINATION_GCS_FOLDER, + service=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + request_id=None, + database_dump_type=None, + ), + retry=None, + timeout=None, + metadata=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_get_service(self, mock_client) -> None: + self.hook.get_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.get_service.assert_called_once_with( + request=dict( + name=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_list_backups(self, mock_client) -> None: + self.hook.list_backups( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.list_backups.assert_called_once_with( + request=dict( + parent=TEST_PARENT_BACKUPS.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + page_size=None, + page_token=None, + filter=None, + order_by=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_restore_service(self, mock_client) -> None: + self.hook.restore_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + backup_project_id=TEST_PROJECT_ID, + backup_region=TEST_REGION, + backup_service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + ) + mock_client.assert_called_once() + mock_client.return_value.restore_service.assert_called_once_with( + request=dict( + service=TEST_PARENT_SERVICES.format(TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID), + backup=TEST_NAME_BACKUPS.format( + TEST_PROJECT_ID, TEST_REGION, TEST_SERVICE_ID, TEST_BACKUP_ID + ), + restore_type=None, + request_id=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + + 
@mock.patch(DATAPROC_METASTORE_STRING.format("DataprocMetastoreHook.get_dataproc_metastore_client")) + def test_update_service(self, mock_client) -> None: + self.hook.update_service( + project_id=TEST_PROJECT_ID, + region=TEST_REGION, + service_id=TEST_SERVICE_ID, + service=TEST_SERVICE_TO_UPDATE, + update_mask=TEST_UPDATE_MASK, + ) + mock_client.assert_called_once() + mock_client.return_value.update_service.assert_called_once_with( + request=dict( + service=TEST_SERVICE_TO_UPDATE, + update_mask=TEST_UPDATE_MASK, + request_id=None, + ), + retry=None, + timeout=None, + metadata=None, + ) diff --git a/tests/providers/google/cloud/operators/test_dataproc_metastore.py b/tests/providers/google/cloud/operators/test_dataproc_metastore.py new file mode 100644 index 0000000000000..652b98367ba16 --- /dev/null +++ b/tests/providers/google/cloud/operators/test_dataproc_metastore.py @@ -0,0 +1,396 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from unittest import TestCase, mock + +from google.api_core.retry import Retry + +from airflow.providers.google.cloud.operators.dataproc_metastore import ( + DataprocMetastoreCreateBackupOperator, + DataprocMetastoreCreateMetadataImportOperator, + DataprocMetastoreCreateServiceOperator, + DataprocMetastoreDeleteBackupOperator, + DataprocMetastoreDeleteServiceOperator, + DataprocMetastoreExportMetadataOperator, + DataprocMetastoreGetServiceOperator, + DataprocMetastoreListBackupsOperator, + DataprocMetastoreRestoreServiceOperator, + DataprocMetastoreUpdateServiceOperator, +) + +TASK_ID: str = "task_id" +GCP_LOCATION: str = "test-location" +GCP_PROJECT_ID: str = "test-project-id" + +GCP_CONN_ID: str = "test-gcp-conn-id" +IMPERSONATION_CHAIN = ["ACCOUNT_1", "ACCOUNT_2", "ACCOUNT_3"] + +TEST_SERVICE: dict = {"name": "test-service"} +TEST_SERVICE_ID: str = "test-service-id" + +TEST_TIMEOUT = 120 +TEST_RETRY = mock.MagicMock(Retry) +TEST_METADATA = [("key", "value")] +TEST_REQUEST_ID = "request_id_uuid" + +TEST_BACKUP: dict = {"name": "test-backup"} +TEST_BACKUP_ID: str = "test-backup-id" +TEST_METADATA_IMPORT: dict = { + "name": "test-metadata-import", + "database_dump": { + "gcs_uri": "gs://bucket_name/path_inside_bucket", + "database_type": "MYSQL", + }, +} +TEST_METADATA_IMPORT_ID: str = "test-metadata-import-id" +TEST_SERVICE_TO_UPDATE = { + "labels": { + "first_key": "first_value", + "second_key": "second_value", + } +} +TEST_UPDATE_MASK: dict = {"paths": ["labels"]} +TEST_DESTINATION_GCS_FOLDER: str = "gs://bucket_name/path_inside_bucket" + + +class TestDataprocMetastoreCreateBackupOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.Backup") + def test_assert_valid_hook_call(self, 
mock_backup, mock_hook) -> None: + task = DataprocMetastoreCreateBackupOperator( + task_id=TASK_ID, + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + backup=TEST_BACKUP, + backup_id=TEST_BACKUP_ID, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + mock_backup.return_value.to_dict.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.create_backup.assert_called_once_with( + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + backup=TEST_BACKUP, + backup_id=TEST_BACKUP_ID, + service_id=TEST_SERVICE_ID, + request_id=None, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreCreateMetadataImportOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.MetadataImport") + def test_assert_valid_hook_call(self, mock_metadata_import, mock_hook) -> None: + task = DataprocMetastoreCreateMetadataImportOperator( + task_id=TASK_ID, + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + service_id=TEST_SERVICE_ID, + metadata_import=TEST_METADATA_IMPORT, + metadata_import_id=TEST_METADATA_IMPORT_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + mock_metadata_import.return_value.to_dict.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.create_metadata_import.assert_called_once_with( + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + service_id=TEST_SERVICE_ID, + metadata_import=TEST_METADATA_IMPORT, + metadata_import_id=TEST_METADATA_IMPORT_ID, + request_id=None, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreCreateServiceOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.Service") + def test_execute(self, mock_service, mock_hook) -> None: + task = DataprocMetastoreCreateServiceOperator( + task_id=TASK_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service=TEST_SERVICE, + service_id=TEST_SERVICE_ID, + request_id=TEST_REQUEST_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + mock_service.return_value.to_dict.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.create_service.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service=TEST_SERVICE, + service_id=TEST_SERVICE_ID, + request_id=TEST_REQUEST_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreDeleteBackupOperator(TestCase): + 
@mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + def test_assert_valid_hook_call(self, mock_hook) -> None: + task = DataprocMetastoreDeleteBackupOperator( + task_id=TASK_ID, + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + retry=TEST_RETRY, + service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.delete_backup.assert_called_once_with( + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + request_id=None, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreDeleteServiceOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + def test_execute(self, mock_hook) -> None: + task = DataprocMetastoreDeleteServiceOperator( + task_id=TASK_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.delete_service.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreExportMetadataOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.MetadataExport") + @mock.patch( + "airflow.providers.google.cloud.operators.dataproc_metastore" + ".DataprocMetastoreExportMetadataOperator._wait_for_export_metadata" + ) + def test_assert_valid_hook_call(self, mock_wait, mock_export_metadata, mock_hook) -> None: + task = DataprocMetastoreExportMetadataOperator( + task_id=TASK_ID, + service_id=TEST_SERVICE_ID, + destination_gcs_folder=TEST_DESTINATION_GCS_FOLDER, + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_wait.return_value = None + mock_export_metadata.return_value.to_dict.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.export_metadata.assert_called_once_with( + database_dump_type=None, + destination_gcs_folder=TEST_DESTINATION_GCS_FOLDER, + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + service_id=TEST_SERVICE_ID, + request_id=None, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreGetServiceOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.Service") + def 
test_execute(self, mock_service, mock_hook) -> None: + task = DataprocMetastoreGetServiceOperator( + task_id=TASK_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + mock_service.return_value.to_dict.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.get_service.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreListBackupsOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.Backup") + def test_assert_valid_hook_call(self, mock_backup, mock_hook) -> None: + task = DataprocMetastoreListBackupsOperator( + task_id=TASK_ID, + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_hook.return_value.wait_for_operation.return_value = None + mock_backup.return_value.to_dict.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.list_backups.assert_called_once_with( + project_id=GCP_PROJECT_ID, + region=GCP_LOCATION, + service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + filter=None, + order_by=None, + page_size=None, + page_token=None, + ) + + +class TestDataprocMetastoreRestoreServiceOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + @mock.patch( + "airflow.providers.google.cloud.operators.dataproc_metastore" + ".DataprocMetastoreRestoreServiceOperator._wait_for_restore_service" + ) + def test_assert_valid_hook_call(self, mock_wait, mock_hook) -> None: + task = DataprocMetastoreRestoreServiceOperator( + task_id=TASK_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + backup_region=GCP_LOCATION, + backup_project_id=GCP_PROJECT_ID, + backup_service_id=TEST_SERVICE_ID, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + mock_wait.return_value = None + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.restore_service.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + backup_id=TEST_BACKUP_ID, + backup_region=GCP_LOCATION, + backup_project_id=GCP_PROJECT_ID, + backup_service_id=TEST_SERVICE_ID, + restore_type=None, + request_id=None, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) + + +class TestDataprocMetastoreUpdateServiceOperator(TestCase): + @mock.patch("airflow.providers.google.cloud.operators.dataproc_metastore.DataprocMetastoreHook") + def 
test_assert_valid_hook_call(self, mock_hook) -> None: + task = DataprocMetastoreUpdateServiceOperator( + task_id=TASK_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + service=TEST_SERVICE_TO_UPDATE, + update_mask=TEST_UPDATE_MASK, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + ) + task.execute(context=mock.MagicMock()) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.update_service.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT_ID, + service_id=TEST_SERVICE_ID, + service=TEST_SERVICE_TO_UPDATE, + update_mask=TEST_UPDATE_MASK, + request_id=None, + retry=TEST_RETRY, + timeout=TEST_TIMEOUT, + metadata=TEST_METADATA, + ) diff --git a/tests/providers/google/cloud/operators/test_dataproc_metastore_system.py b/tests/providers/google/cloud/operators/test_dataproc_metastore_system.py new file mode 100644 index 0000000000000..3c1ad88ff7f76 --- /dev/null +++ b/tests/providers/google/cloud/operators/test_dataproc_metastore_system.py @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
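+# System test: creates a GCS bucket, runs the ``example_gcp_dataproc_metastore`` example
+# DAG end to end, and deletes the bucket afterwards.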
+import pytest + +from airflow.providers.google.cloud.example_dags.example_dataproc_metastore import BUCKET +from tests.providers.google.cloud.utils.gcp_authenticator import GCP_DATAPROC_KEY +from tests.test_utils.gcp_system_helpers import CLOUD_DAG_FOLDER, GoogleSystemTest, provide_gcp_context + + +@pytest.mark.backend("mysql", "postgres") +@pytest.mark.credential_file(GCP_DATAPROC_KEY) +class DataprocMetastoreExampleDagsTest(GoogleSystemTest): + @provide_gcp_context(GCP_DATAPROC_KEY) + def setUp(self): + super().setUp() + self.create_gcs_bucket(BUCKET) + + @provide_gcp_context(GCP_DATAPROC_KEY) + def tearDown(self): + self.delete_gcs_bucket(BUCKET) + super().tearDown() + + @provide_gcp_context(GCP_DATAPROC_KEY) + def test_run_example_dag(self): + self.run_dag(dag_id="example_gcp_dataproc_metastore", dag_folder=CLOUD_DAG_FOLDER) From 44f5dc569409417265d7d6db45f40fc821425994 Mon Sep 17 00:00:00 2001 From: Aakcht Date: Mon, 15 Nov 2021 02:53:43 +0300 Subject: [PATCH 080/250] Add hdfs requirement for hdfs provider (#19540) (cherry picked from commit 317953a4c7749bb74c1db750caec53f92c05f1ff) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 008e2ef4d9acb..1fc3bcc85c85a 100644 --- a/setup.py +++ b/setup.py @@ -344,6 +344,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version ] hdfs = [ 'snakebite-py3', + 'hdfs[avro,dataframe,kerberos]>=2.0.4', ] hive = [ 'hmsclient>=0.1.0', From e2dcdfd29784926470b44e3a7a127e0aa5018063 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 19 Nov 2021 02:08:07 +0100 Subject: [PATCH 081/250] Fix failures with recent moto library 2.2.15 (#19693) The recent moto library is more picky about parameters passed to it: * when you send logs that are too old, they are rejected * when you pass a CloudFormation template, it is parsed and validated for correctness Our tests had artificial values for those, which caused failures with the recent moto version.
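For example, the CloudWatch log tests below now derive event timestamps from the current
clock (in milliseconds) instead of hardcoding old epoch values; a sketch of the pattern,
not the exact test code:

    import time

    current_time = int(time.time()) * 1000  # CloudWatch expects millisecond timestamps
    events = [{'timestamp': current_time - 1000, 'message': 'First'}]
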
This PR provides realistic values in tests to pass moto validation (cherry picked from commit 49b7e751eb3cb512d138a06237116c3aec6c4290) --- setup.py | 2 +- .../amazon/aws/hooks/test_cloud_formation.py | 19 ++++++++--- tests/providers/amazon/aws/hooks/test_logs.py | 4 +-- .../aws/log/test_cloudwatch_task_handler.py | 34 ++++++++++++------- 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 1fc3bcc85c85a..c3ec20f6c35ce 100644 --- a/setup.py +++ b/setup.py @@ -532,7 +532,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'jira', 'jsondiff', 'mongomock', - 'moto~=2.2, >=2.2.7', + 'moto~=2.2, >=2.2.12', 'mypy==0.770', 'parameterized', 'paramiko', diff --git a/tests/providers/amazon/aws/hooks/test_cloud_formation.py b/tests/providers/amazon/aws/hooks/test_cloud_formation.py index 09e0bb8cd9b6c..14f03751643be 100644 --- a/tests/providers/amazon/aws/hooks/test_cloud_formation.py +++ b/tests/providers/amazon/aws/hooks/test_cloud_formation.py @@ -23,7 +23,6 @@ try: from moto import mock_cloudformation - from moto.ec2.models import NetworkInterface as some_model except ImportError: mock_cloudformation = None @@ -39,10 +38,20 @@ def create_stack(self, stack_name): { 'Resources': { "myResource": { - "Type": some_model.cloudformation_type(), - "Properties": {"myProperty": "myPropertyValue"}, + "Type": "AWS::EC2::VPC", + "Properties": { + "CidrBlock": {"Ref": "VPCCidr"}, + "Tags": [{"Key": "Name", "Value": "Primary_CF_VPC"}], + }, } - } + }, + "Parameters": { + "VPCCidr": { + "Type": "String", + "Default": "10.0.0.0/16", + "Description": "Enter the CIDR block for the VPC. Default is 10.0.0.0/16.", + } + }, } ) @@ -51,7 +60,7 @@ def create_stack(self, stack_name): params={ 'TimeoutInMinutes': timeout, 'TemplateBody': template_body, - 'Parameters': [{'ParameterKey': 'myParam', 'ParameterValue': 'myParamValue'}], + 'Parameters': [{'ParameterKey': "VPCCidr", 'ParameterValue': '10.0.0.0/16'}], }, ) diff --git a/tests/providers/amazon/aws/hooks/test_logs.py b/tests/providers/amazon/aws/hooks/test_logs.py index 48a78edcebc97..ed660f606fd91 100644 --- a/tests/providers/amazon/aws/hooks/test_logs.py +++ b/tests/providers/amazon/aws/hooks/test_logs.py @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # - +import time import unittest from airflow.providers.amazon.aws.hooks.logs import AwsLogsHook @@ -49,7 +49,7 @@ def test_get_log_events(self): conn.create_log_group(logGroupName=log_group_name) conn.create_log_stream(logGroupName=log_group_name, logStreamName=log_stream_name) - input_events = [{'timestamp': 1, 'message': 'Test Message 1'}] + input_events = [{'timestamp': int(time.time()) * 1000, 'message': 'Test Message 1'}] conn.put_log_events( logGroupName=log_group_name, logStreamName=log_stream_name, logEvents=input_events diff --git a/tests/providers/amazon/aws/log/test_cloudwatch_task_handler.py b/tests/providers/amazon/aws/log/test_cloudwatch_task_handler.py index db8ddcd7c6953..6d07f42d41428 100644 --- a/tests/providers/amazon/aws/log/test_cloudwatch_task_handler.py +++ b/tests/providers/amazon/aws/log/test_cloudwatch_task_handler.py @@ -15,8 +15,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +import time import unittest +from datetime import datetime as dt from unittest import mock from unittest.mock import ANY, call @@ -38,6 +39,11 @@ mock_logs = None +def get_time_str(time_in_milliseconds): + dt_time = dt.utcfromtimestamp(time_in_milliseconds / 1000.0) + return dt_time.strftime("%Y-%m-%d %H:%M:%S,000") + + @unittest.skipIf(mock_logs is None, "Skipping test because moto.mock_logs is not available") @mock_logs class TestCloudwatchTaskHandler(unittest.TestCase): @@ -116,16 +122,17 @@ def test_write(self): def test_event_to_str(self): handler = self.cloudwatch_task_handler + current_time = int(time.time()) * 1000 events = [ - {'timestamp': 1617400267123, 'message': 'First'}, - {'timestamp': 1617400367456, 'message': 'Second'}, - {'timestamp': 1617400467789, 'message': 'Third'}, + {'timestamp': current_time - 2000, 'message': 'First'}, + {'timestamp': current_time - 1000, 'message': 'Second'}, + {'timestamp': current_time, 'message': 'Third'}, ] assert [handler._event_to_str(event) for event in events] == ( [ - '[2021-04-02 21:51:07,123] First', - '[2021-04-02 21:52:47,456] Second', - '[2021-04-02 21:54:27,789] Third', + f'[{get_time_str(current_time-2000)}] First', + f'[{get_time_str(current_time-1000)}] Second', + f'[{get_time_str(current_time)}] Third', ] ) @@ -134,23 +141,24 @@ def test_read(self): # CloudWatch events must be ordered chronologically otherwise # boto3 put_log_event API throws InvalidParameterException # (moto does not throw this exception) + current_time = int(time.time()) * 1000 generate_log_events( self.conn, self.remote_log_group, self.remote_log_stream, [ - {'timestamp': 1617400267123, 'message': 'First'}, - {'timestamp': 1617400367456, 'message': 'Second'}, - {'timestamp': 1617400467789, 'message': 'Third'}, + {'timestamp': current_time - 2000, 'message': 'First'}, + {'timestamp': current_time - 1000, 'message': 'Second'}, + {'timestamp': current_time, 'message': 'Third'}, ], ) msg_template = '*** Reading remote log from Cloudwatch log_group: {} log_stream: {}.\n{}\n' events = '\n'.join( [ - '[2021-04-02 21:51:07,123] First', - '[2021-04-02 21:52:47,456] Second', - '[2021-04-02 21:54:27,789] Third', + f'[{get_time_str(current_time-2000)}] First', + f'[{get_time_str(current_time-1000)}] Second', + f'[{get_time_str(current_time)}] Third', ] ) assert self.cloudwatch_task_handler.read(self.ti) == ( From 67fc1d78fef6eb342f77d1536fa7d3d6dcf590dd Mon Sep 17 00:00:00 2001 From: Maksim Date: Fri, 26 Nov 2021 21:52:35 +0100 Subject: [PATCH 082/250] Create dataproc serverless spark batches operator (#19248) (cherry picked from commit bf68b9a8461eda634a7d91aa56575fb950960eaa) --- .../cloud/example_dags/example_dataproc.py | 10 +- .../providers/google/cloud/hooks/dataproc.py | 219 ++++++++++++ .../google/cloud/operators/dataproc.py | 332 +++++++++++++++++- .../apache-airflow-providers-google/index.rst | 2 +- .../operators/cloud/dataproc.rst | 1 - setup.py | 2 +- tests/always/test_project_structure.py | 4 + .../google/cloud/hooks/test_dataproc.py | 118 +++++++ .../google/cloud/operators/test_dataproc.py | 126 +++++++ .../cloud/operators/test_dataproc_system.py | 4 + 10 files changed, 811 insertions(+), 7 deletions(-) diff --git a/airflow/providers/google/cloud/example_dags/example_dataproc.py b/airflow/providers/google/cloud/example_dags/example_dataproc.py index 49594981d24a9..e5a99336b180b 100644 --- a/airflow/providers/google/cloud/example_dags/example_dataproc.py +++ b/airflow/providers/google/cloud/example_dags/example_dataproc.py @@ -149,6 +149,13 @@ }, "jobs": 
[{"step_id": "pig_job_1", "pig_job": PIG_JOB["pig_job"]}], } +BATCH_ID = "test-batch-id" +BATCH_CONFIG = { + "spark_batch": { + "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"], + "main_class": "org.apache.spark.examples.SparkPi", + }, +} with models.DAG("example_gcp_dataproc", schedule_interval='@once', start_date=days_ago(1)) as dag: @@ -249,6 +256,3 @@ scale_cluster >> pyspark_task >> delete_cluster scale_cluster >> sparkr_task >> delete_cluster scale_cluster >> hadoop_task >> delete_cluster - - # Task dependency created via `XComArgs`: - # spark_task_async >> spark_task_async_sensor diff --git a/airflow/providers/google/cloud/hooks/dataproc.py b/airflow/providers/google/cloud/hooks/dataproc.py index 16aed3617e8a9..704540aa8ab0b 100644 --- a/airflow/providers/google/cloud/hooks/dataproc.py +++ b/airflow/providers/google/cloud/hooks/dataproc.py @@ -24,8 +24,11 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union from google.api_core.exceptions import ServerError +from google.api_core.operation import Operation from google.api_core.retry import Retry from google.cloud.dataproc_v1 import ( + Batch, + BatchControllerClient, Cluster, ClusterControllerClient, Job, @@ -267,6 +270,34 @@ def get_job_client( credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options ) + def get_batch_client( + self, region: Optional[str] = None, location: Optional[str] = None + ) -> BatchControllerClient: + """Returns BatchControllerClient""" + if location is not None: + warnings.warn( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead.", + DeprecationWarning, + stacklevel=2, + ) + region = location + client_options = None + if region and region != 'global': + client_options = {'api_endpoint': f'{region}-dataproc.googleapis.com:443'} + + return BatchControllerClient( + credentials=self._get_credentials(), client_info=self.client_info, client_options=client_options + ) + + def wait_for_operation(self, timeout: float, operation: Operation): + """Waits for long-lasting operation to complete.""" + try: + return operation.result(timeout=timeout) + except Exception: + error = operation.exception(timeout=timeout) + raise AirflowException(error) + @GoogleBaseHook.fallback_to_default_project_id def create_cluster( self, @@ -1030,3 +1061,191 @@ def cancel_job( metadata=metadata, ) return job + + @GoogleBaseHook.fallback_to_default_project_id + def create_batch( + self, + region: str, + project_id: str, + batch: Union[Dict, Batch], + batch_id: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = "", + ): + """ + Creates a batch workload. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param batch: Required. The batch to create. + :type batch: google.cloud.dataproc_v1.types.Batch + :param batch_id: Optional. The ID to use for the batch, which will become the final component + of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + :type batch_id: str + :param request_id: Optional. A unique id used to identify the request. 
If the server receives two + ``CreateBatchRequest`` requests with the same id, then the second request will be ignored and + the first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_batch_client(region) + parent = f'projects/{project_id}/regions/{region}' + + result = client.create_batch( + request={ + 'parent': parent, + 'batch': batch, + 'batch_id': batch_id, + 'request_id': request_id, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def delete_batch( + self, + batch_id: str, + region: str, + project_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Deletes the batch workload resource. + + :param batch_id: Required. The ID to use for the batch, which will become the final component + of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + :type batch_id: str + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_batch_client(region) + name = f"projects/{project_id}/regions/{region}/batches/{batch_id}" + + result = client.delete_batch( + request={ + 'name': name, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def get_batch( + self, + batch_id: str, + region: str, + project_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Gets the batch workload resource representation. + + :param batch_id: Required. The ID to use for the batch, which will become the final component + of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + :type batch_id: str + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. 
Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_batch_client(region) + name = f"projects/{project_id}/regions/{region}/batches/{batch_id}" + + result = client.get_batch( + request={ + 'name': name, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result + + @GoogleBaseHook.fallback_to_default_project_id + def list_batches( + self, + region: str, + project_id: str, + page_size: Optional[int] = None, + page_token: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = None, + ): + """ + Lists batch workloads. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param page_size: Optional. The maximum number of batches to return in each response. The service may + return fewer than this value. The default page size is 20; the maximum page size is 1000. + :type page_size: int + :param page_token: Optional. A page token received from a previous ``ListBatches`` call. + Provide this token to retrieve the subsequent page. + :type page_token: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + """ + client = self.get_batch_client(region) + parent = f'projects/{project_id}/regions/{region}' + + result = client.list_batches( + request={ + 'parent': parent, + 'page_size': page_size, + 'page_token': page_token, + }, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + return result diff --git a/airflow/providers/google/cloud/operators/dataproc.py b/airflow/providers/google/cloud/operators/dataproc.py index 537cc69650314..ac78179530f96 100644 --- a/airflow/providers/google/cloud/operators/dataproc.py +++ b/airflow/providers/google/cloud/operators/dataproc.py @@ -28,9 +28,10 @@ from datetime import datetime, timedelta from typing import Dict, List, Optional, Sequence, Set, Tuple, Union +from google.api_core import operation # type: ignore from google.api_core.exceptions import AlreadyExists, NotFound from google.api_core.retry import Retry, exponential_sleep_generator -from google.cloud.dataproc_v1 import Cluster +from google.cloud.dataproc_v1 import Batch, Cluster from google.protobuf.duration_pb2 import Duration from google.protobuf.field_mask_pb2 import FieldMask @@ -2162,3 +2163,332 @@ def execute(self, context: Dict): ) operation.result() self.log.info("Updated %s cluster.", self.cluster_name) + + +class DataprocCreateBatchOperator(BaseOperator): + """ + Creates a batch workload. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param batch: Required. The batch to create. 
+ :type batch: google.cloud.dataproc_v1.types.Batch + :param batch_id: Optional. The ID to use for the batch, which will become the final component + of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + :type batch_id: str + :param request_id: Optional. A unique id used to identify the request. If the server receives two + ``CreateBatchRequest`` requests with the same id, then the second request will be ignored and + the first ``google.longrunning.Operation`` created and stored in the backend is returned. + :type request_id: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ( + 'project_id', + 'batch_id', + 'region', + 'impersonation_chain', + ) + + def __init__( + self, + *, + region: str = None, + project_id: str, + batch: Union[Dict, Batch], + batch_id: Optional[str] = None, + request_id: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = "", + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.region = region + self.project_id = project_id + self.batch = batch + self.batch_id = batch_id + self.request_id = request_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + self.operation: Optional[operation.Operation] = None + + def execute(self, context): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + self.log.info("Creating batch") + try: + self.operation = hook.create_batch( + region=self.region, + project_id=self.project_id, + batch=self.batch, + batch_id=self.batch_id, + request_id=self.request_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + result = hook.wait_for_operation(self.timeout, self.operation) + self.log.info("Batch %s created", self.batch_id) + except AlreadyExists: + self.log.info("Batch with given id already exists") + result = hook.get_batch( + batch_id=self.batch_id, + region=self.region, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return Batch.to_dict(result) + + def on_kill(self): + if 
self.operation: + self.operation.cancel() + + +class DataprocDeleteBatchOperator(BaseOperator): + """ + Deletes the batch workload resource. + + :param batch_id: Required. The ID to use for the batch, which will become the final component + of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + :type batch_id: str + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ("batch_id", "region", "project_id", "impersonation_chain") + + def __init__( + self, + *, + batch_id: str, + region: str, + project_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = "", + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.batch_id = batch_id + self.region = region + self.project_id = project_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + self.log.info("Deleting batch: %s", self.batch_id) + hook.delete_batch( + batch_id=self.batch_id, + region=self.region, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + self.log.info("Batch deleted.") + + +class DataprocGetBatchOperator(BaseOperator): + """ + Gets the batch workload resource representation. + + :param batch_id: Required. The ID to use for the batch, which will become the final component + of the batch's resource name. + This value must be 4-63 characters. Valid characters are /[a-z][0-9]-/. + :type batch_id: str + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param retry: A retry object used to retry requests. If ``None`` is specified, requests will not be + retried. 
+ :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + ``retry`` is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: Sequence[Tuple[str, str]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud. + :type gcp_conn_id: str + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + """ + + template_fields = ("batch_id", "region", "project_id", "impersonation_chain") + + def __init__( + self, + *, + batch_id: str, + region: str, + project_id: str, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = "", + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.batch_id = batch_id + self.region = region + self.project_id = project_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + self.log.info("Getting batch: %s", self.batch_id) + batch = hook.get_batch( + batch_id=self.batch_id, + region=self.region, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return Batch.to_dict(batch) + + +class DataprocListBatchesOperator(BaseOperator): + """ + Lists batch workloads. + + :param project_id: Required. The ID of the Google Cloud project that the cluster belongs to. + :type project_id: str + :param region: Required. The Cloud Dataproc region in which to handle the request. + :type region: str + :param page_size: Optional. The maximum number of batches to return in each response. The service may + return fewer than this value. The default page size is 20; the maximum page size is 1000. + :type page_size: int + :param page_token: Optional. A page token received from a previous ``ListBatches`` call. + Provide this token to retrieve the subsequent page. + :type page_token: str + :param retry: Optional, a retry object used to retry requests. If `None` is specified, requests + will not be retried. + :type retry: Optional[Retry] + :param timeout: Optional, the amount of time, in seconds, to wait for the request to complete. + Note that if `retry` is specified, the timeout applies to each individual attempt. + :type timeout: Optional[float] + :param metadata: Optional, additional metadata that is provided to the method. + :type metadata: Optional[Sequence[Tuple[str, str]]] + :param gcp_conn_id: Optional, the connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: Optional[str] + :param impersonation_chain: Optional service account to impersonate using short-term + credentials, or chained list of accounts required to get the access_token + of the last account in the list, which will be impersonated in the request. + If set as a string, the account must grant the originating account + the Service Account Token Creator IAM role. + If set as a sequence, the identities from the list must grant + Service Account Token Creator IAM role to the directly preceding identity, with first + account from the list granting this role to the originating account (templated). + :type impersonation_chain: Union[str, Sequence[str]] + + :rtype: List[dict] + """ + + template_fields = ("region", "project_id", "impersonation_chain") + + def __init__( + self, + *, + region: str, + project_id: Optional[str] = None, + page_size: Optional[int] = None, + page_token: Optional[str] = None, + retry: Optional[Retry] = None, + timeout: Optional[float] = None, + metadata: Optional[Sequence[Tuple[str, str]]] = "", + gcp_conn_id: str = "google_cloud_default", + impersonation_chain: Optional[Union[str, Sequence[str]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.region = region + self.project_id = project_id + self.page_size = page_size + self.page_token = page_token + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self.impersonation_chain = impersonation_chain + + def execute(self, context): + hook = DataprocHook(gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) + results = hook.list_batches( + region=self.region, + project_id=self.project_id, + page_size=self.page_size, + page_token=self.page_token, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + return [Batch.to_dict(result) for result in results] diff --git a/docs/apache-airflow-providers-google/index.rst b/docs/apache-airflow-providers-google/index.rst index 67eadad2479b0..baa5516f440eb 100644 --- a/docs/apache-airflow-providers-google/index.rst +++ b/docs/apache-airflow-providers-google/index.rst @@ -102,7 +102,7 @@ PIP package Version required ``google-cloud-build`` ``>=3.0.0,<4.0.0`` ``google-cloud-container`` ``>=0.1.1,<2.0.0`` ``google-cloud-datacatalog`` ``>=3.0.0,<4.0.0`` -``google-cloud-dataproc`` ``>=2.2.0,<2.6.0`` +``google-cloud-dataproc`` ``>=3.1.0,<2.6.0`` ``google-cloud-dlp`` ``>=0.11.0,<2.0.0`` ``google-cloud-kms`` ``>=2.0.0,<3.0.0`` ``google-cloud-language`` ``>=1.1.1,<2.0.0`` diff --git a/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst b/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst index 55449e9fbcd55..c4b844f10ecea 100644 --- a/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst +++ b/docs/apache-airflow-providers-google/operators/cloud/dataproc.rst @@ -199,7 +199,6 @@ Once a workflow is created users can trigger it using :start-after: [START how_to_cloud_dataproc_trigger_workflow_template] :end-before: [END how_to_cloud_dataproc_trigger_workflow_template] - References ^^^^^^^^^^ For further information, take a look at: diff --git a/setup.py b/setup.py index c3ec20f6c35ce..cea2aa8f1b75f 100644 --- a/setup.py +++ b/setup.py @@ -303,7 +303,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'google-cloud-build>=3.0.0,<4.0.0', 'google-cloud-container>=0.1.1,<2.0.0', 'google-cloud-datacatalog>=3.0.0,<4.0.0', - 'google-cloud-dataproc>=2.2.0,<4.0.0', + 
'google-cloud-dataproc>=3.1.0,<4.0.0', 'google-cloud-dataproc-metastore>=1.2.0,<2.0.0', 'google-cloud-dlp>=0.11.0,<2.0.0', 'google-cloud-kms>=2.0.0,<3.0.0', diff --git a/tests/always/test_project_structure.py b/tests/always/test_project_structure.py index 5b40d74554402..c3997e3ecfc30 100644 --- a/tests/always/test_project_structure.py +++ b/tests/always/test_project_structure.py @@ -219,6 +219,10 @@ class TestGoogleProviderProjectStructure(unittest.TestCase): 'airflow.providers.google.cloud.operators.datastore.CloudDatastoreGetOperationOperator', 'airflow.providers.google.cloud.sensors.gcs.GCSObjectUpdateSensor', 'airflow.providers.google.cloud.sensors.gcs.GCSUploadSessionCompleteSensor', + 'airflow.providers.google.cloud.operators.dataproc.DataprocGetBatchOperator', + 'airflow.providers.google.cloud.operators.dataproc.DataprocCreateBatchOperator', + 'airflow.providers.google.cloud.operators.dataproc.DataprocListBatchesOperator', + 'airflow.providers.google.cloud.operators.dataproc.DataprocDeleteBatchOperator', } def test_example_dags(self): diff --git a/tests/providers/google/cloud/hooks/test_dataproc.py b/tests/providers/google/cloud/hooks/test_dataproc.py index c81c39cc6eb49..598bb910785a0 100644 --- a/tests/providers/google/cloud/hooks/test_dataproc.py +++ b/tests/providers/google/cloud/hooks/test_dataproc.py @@ -42,6 +42,10 @@ "labels": LABELS, "project_id": GCP_PROJECT, } +BATCH = {"batch": "test-batch"} +BATCH_ID = "batch-id" +BATCH_NAME = "projects/{}/regions/{}/batches/{}" +PARENT = "projects/{}/regions/{}" BASE_STRING = "airflow.providers.google.common.hooks.base_google.{}" DATAPROC_STRING = "airflow.providers.google.cloud.hooks.dataproc.{}" @@ -179,6 +183,47 @@ def test_get_job_client_region_deprecation_warning( ) assert warning_message == str(warnings[0].message) + @mock.patch(DATAPROC_STRING.format("DataprocHook._get_credentials")) + @mock.patch(DATAPROC_STRING.format("DataprocHook.client_info"), new_callable=mock.PropertyMock) + @mock.patch(DATAPROC_STRING.format("BatchControllerClient")) + def test_get_batch_client(self, mock_client, mock_client_info, mock_get_credentials): + self.hook.get_batch_client(region=GCP_LOCATION) + mock_client.assert_called_once_with( + credentials=mock_get_credentials.return_value, + client_info=mock_client_info.return_value, + client_options=None, + ) + + @mock.patch(DATAPROC_STRING.format("DataprocHook._get_credentials")) + @mock.patch(DATAPROC_STRING.format("DataprocHook.client_info"), new_callable=mock.PropertyMock) + @mock.patch(DATAPROC_STRING.format("BatchControllerClient")) + def test_get_batch_client_region(self, mock_client, mock_client_info, mock_get_credentials): + self.hook.get_batch_client(region='region1') + mock_client.assert_called_once_with( + credentials=mock_get_credentials.return_value, + client_info=mock_client_info.return_value, + client_options={'api_endpoint': 'region1-dataproc.googleapis.com:443'}, + ) + + @mock.patch(DATAPROC_STRING.format("DataprocHook._get_credentials")) + @mock.patch(DATAPROC_STRING.format("DataprocHook.client_info"), new_callable=mock.PropertyMock) + @mock.patch(DATAPROC_STRING.format("BatchControllerClient")) + def test_get_batch_client_region_deprecation_warning( + self, mock_client, mock_client_info, mock_get_credentials + ): + warning_message = ( + "Parameter `location` will be deprecated. " + "Please provide value through `region` parameter instead." 
+ ) + with pytest.warns(DeprecationWarning) as warnings: + self.hook.get_batch_client(location='region1') + mock_client.assert_called_once_with( + credentials=mock_get_credentials.return_value, + client_info=mock_client_info.return_value, + client_options={'api_endpoint': 'region1-dataproc.googleapis.com:443'}, + ) + assert warning_message == str(warnings[0].message) + @mock.patch(DATAPROC_STRING.format("DataprocHook.get_cluster_client")) def test_create_cluster(self, mock_client): self.hook.create_cluster( @@ -615,6 +660,79 @@ def test_cancel_job_deprecation_warning_param_rename(self, mock_client): ) assert warning_message == str(warnings[0].message) + @mock.patch(DATAPROC_STRING.format("DataprocHook.get_batch_client")) + def test_create_batch(self, mock_client): + self.hook.create_batch( + project_id=GCP_PROJECT, + region=GCP_LOCATION, + batch=BATCH, + batch_id=BATCH_ID, + ) + mock_client.assert_called_once_with(GCP_LOCATION) + mock_client.return_value.create_batch.assert_called_once_with( + request=dict( + parent=PARENT.format(GCP_PROJECT, GCP_LOCATION), + batch=BATCH, + batch_id=BATCH_ID, + request_id=None, + ), + metadata="", + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_STRING.format("DataprocHook.get_batch_client")) + def test_delete_batch(self, mock_client): + self.hook.delete_batch( + batch_id=BATCH_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT, + ) + mock_client.assert_called_once_with(GCP_LOCATION) + mock_client.return_value.delete_batch.assert_called_once_with( + request=dict( + name=BATCH_NAME.format(GCP_PROJECT, GCP_LOCATION, BATCH_ID), + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_STRING.format("DataprocHook.get_batch_client")) + def test_get_batch(self, mock_client): + self.hook.get_batch( + batch_id=BATCH_ID, + region=GCP_LOCATION, + project_id=GCP_PROJECT, + ) + mock_client.assert_called_once_with(GCP_LOCATION) + mock_client.return_value.get_batch.assert_called_once_with( + request=dict( + name=BATCH_NAME.format(GCP_PROJECT, GCP_LOCATION, BATCH_ID), + ), + metadata=None, + retry=None, + timeout=None, + ) + + @mock.patch(DATAPROC_STRING.format("DataprocHook.get_batch_client")) + def test_list_batches(self, mock_client): + self.hook.list_batches( + project_id=GCP_PROJECT, + region=GCP_LOCATION, + ) + mock_client.assert_called_once_with(GCP_LOCATION) + mock_client.return_value.list_batches.assert_called_once_with( + request=dict( + parent=PARENT.format(GCP_PROJECT, GCP_LOCATION), + page_size=None, + page_token=None, + ), + metadata=None, + retry=None, + timeout=None, + ) + class TestDataProcJobBuilder(unittest.TestCase): def setUp(self) -> None: diff --git a/tests/providers/google/cloud/operators/test_dataproc.py b/tests/providers/google/cloud/operators/test_dataproc.py index f8500aa9b0080..34e63537f0c55 100644 --- a/tests/providers/google/cloud/operators/test_dataproc.py +++ b/tests/providers/google/cloud/operators/test_dataproc.py @@ -29,12 +29,16 @@ from airflow.providers.google.cloud.operators.dataproc import ( ClusterGenerator, DataprocClusterLink, + DataprocCreateBatchOperator, DataprocCreateClusterOperator, DataprocCreateWorkflowTemplateOperator, + DataprocDeleteBatchOperator, DataprocDeleteClusterOperator, + DataprocGetBatchOperator, DataprocInstantiateInlineWorkflowTemplateOperator, DataprocInstantiateWorkflowTemplateOperator, DataprocJobLink, + DataprocListBatchesOperator, DataprocScaleClusterOperator, DataprocSubmitHadoopJobOperator, DataprocSubmitHiveJobOperator, @@ -199,6 +203,13 @@ "region": GCP_LOCATION, 
"project_id": GCP_PROJECT, } +BATCH_ID = "test-batch-id" +BATCH = { + "spark_batch": { + "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"], + "main_class": "org.apache.spark.examples.SparkPi", + }, +} def assert_warning(msg: str, warnings): @@ -1661,3 +1672,118 @@ def test_location_deprecation_warning(self, mock_hook): template=WORKFLOW_TEMPLATE, ) op.execute(context={}) + + +class TestDataprocCreateBatchOperator: + @mock.patch(DATAPROC_PATH.format("Batch.to_dict")) + @mock.patch(DATAPROC_PATH.format("DataprocHook")) + def test_execute(self, mock_hook, to_dict_mock): + op = DataprocCreateBatchOperator( + task_id=TASK_ID, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + region=GCP_LOCATION, + project_id=GCP_PROJECT, + batch=BATCH, + batch_id=BATCH_ID, + request_id=REQUEST_ID, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + op.execute(context={}) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.create_batch.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT, + batch=BATCH, + batch_id=BATCH_ID, + request_id=REQUEST_ID, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + + +class TestDataprocDeleteBatchOperator: + @mock.patch(DATAPROC_PATH.format("DataprocHook")) + def test_execute(self, mock_hook): + op = DataprocDeleteBatchOperator( + task_id=TASK_ID, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + project_id=GCP_PROJECT, + region=GCP_LOCATION, + batch_id=BATCH_ID, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + op.execute(context={}) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.delete_batch.assert_called_once_with( + project_id=GCP_PROJECT, + region=GCP_LOCATION, + batch_id=BATCH_ID, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + + +class TestDataprocGetBatchOperator: + @mock.patch(DATAPROC_PATH.format("Batch.to_dict")) + @mock.patch(DATAPROC_PATH.format("DataprocHook")) + def test_execute(self, mock_hook, to_dict_mock): + op = DataprocGetBatchOperator( + task_id=TASK_ID, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + project_id=GCP_PROJECT, + region=GCP_LOCATION, + batch_id=BATCH_ID, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + op.execute(context={}) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.get_batch.assert_called_once_with( + project_id=GCP_PROJECT, + region=GCP_LOCATION, + batch_id=BATCH_ID, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + + +class TestDataprocListBatchesOperator: + @mock.patch(DATAPROC_PATH.format("DataprocHook")) + def test_execute(self, mock_hook): + page_token = "page_token" + page_size = 42 + + op = DataprocListBatchesOperator( + task_id=TASK_ID, + gcp_conn_id=GCP_CONN_ID, + impersonation_chain=IMPERSONATION_CHAIN, + region=GCP_LOCATION, + project_id=GCP_PROJECT, + page_size=page_size, + page_token=page_token, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) + op.execute(context={}) + mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN) + mock_hook.return_value.list_batches.assert_called_once_with( + region=GCP_LOCATION, + project_id=GCP_PROJECT, + page_size=page_size, + page_token=page_token, + retry=RETRY, + timeout=TIMEOUT, + metadata=METADATA, + ) diff --git 
a/tests/providers/google/cloud/operators/test_dataproc_system.py b/tests/providers/google/cloud/operators/test_dataproc_system.py index 568af28f53fa0..30f9a35d9a4c4 100644 --- a/tests/providers/google/cloud/operators/test_dataproc_system.py +++ b/tests/providers/google/cloud/operators/test_dataproc_system.py @@ -63,3 +63,7 @@ def tearDown(self): @provide_gcp_context(GCP_DATAPROC_KEY) def test_run_example_dag(self): self.run_dag(dag_id="example_gcp_dataproc", dag_folder=CLOUD_DAG_FOLDER) + + @provide_gcp_context(GCP_DATAPROC_KEY) + def test_run_batch_example_dag(self): + self.run_dag(dag_id="example_gcp_batch_dataproc", dag_folder=CLOUD_DAG_FOLDER) From 95b9d484bbeddc5115b8ce79e0e6919887b008ad Mon Sep 17 00:00:00 2001 From: Niko Date: Wed, 1 Dec 2021 09:53:30 -0800 Subject: [PATCH 083/250] Move to watchtower 2.0.1 (#19907) - This version of watchtower contains patches that fixes #15279 where empty log lines would crash Watchtower. (cherry picked from commit 2539cb44b47d78e81a88fde51087f4cc77c924c5) --- airflow/providers/amazon/aws/log/cloudwatch_task_handler.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airflow/providers/amazon/aws/log/cloudwatch_task_handler.py b/airflow/providers/amazon/aws/log/cloudwatch_task_handler.py index 0b0a103105312..1a180188b53c2 100644 --- a/airflow/providers/amazon/aws/log/cloudwatch_task_handler.py +++ b/airflow/providers/amazon/aws/log/cloudwatch_task_handler.py @@ -81,7 +81,7 @@ def set_context(self, ti): self.handler = watchtower.CloudWatchLogHandler( log_group=self.log_group, stream_name=self._render_filename(ti, ti.try_number), - boto3_session=self.hook.get_session(self.region_name), + boto3_client=self.hook.get_conn(), ) def close(self): diff --git a/setup.py b/setup.py index cea2aa8f1b75f..6f64f8f2a1add 100644 --- a/setup.py +++ b/setup.py @@ -186,7 +186,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version ] amazon = [ 'boto3>=1.15.0,<1.19.0', - 'watchtower~=1.0.6', + 'watchtower~=2.0.1', 'jsonpath_ng>=1.5.3', 'redshift_connector~=2.0.888', 'sqlalchemy_redshift~=0.8.6', From 85cce0769460829865c5f374809607ad922de715 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Fri, 3 Dec 2021 19:41:34 +0000 Subject: [PATCH 084/250] Fix ``breeze kind-cluster shell`` (#20015) This was failing with the following: ``` /Users/kaxilnaik/Documents/GitHub/astronomer/airflow/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh: line 102: constraints[@]: unbound variable Exporting logs for cluster "airflow-python-3.7-v1.20.2" to: /tmp/kind_logs_2021-12-03_0_0 ``` and was caused by https://github.com/apache/airflow/pull/17290 (cherry picked from commit 0afed43a8afde093277be2862138cb32fba8ed29) --- scripts/ci/kubernetes/ci_run_kubernetes_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh b/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh index e586c300be37d..ef920b4b5d547 100755 --- a/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh +++ b/scripts/ci/kubernetes/ci_run_kubernetes_tests.sh @@ -88,7 +88,7 @@ function create_virtualenv() { --constraint "https://raw.githubusercontent.com/${CONSTRAINTS_GITHUB_REPOSITORY}/${DEFAULT_CONSTRAINTS_BRANCH}/constraints-${HOST_PYTHON_VERSION}.txt" ) - if [[ -n ${GITHUB_REGISTRY_PULL_IMAGE_TAG=} ]]; then + if [[ ${CI:=} == "true" && -n ${GITHUB_REGISTRY_PULL_IMAGE_TAG=} ]]; then # Disable constraints when building in CI with specific version of sources # In case there will be conflicting constraints 
constraints=() From 5f1236f062a3980ad36dc24213366d1db0e00112 Mon Sep 17 00:00:00 2001 From: Tajinder Singh Date: Mon, 6 Dec 2021 09:28:14 -0500 Subject: [PATCH 085/250] update upper bound for MarkupSafe (#19953) Co-authored-by: Tzu-ping Chung (cherry picked from commit ba6b7c7424f6b5ea2c1464304be8738ea482f8c1) --- setup.cfg | 4 ++-- tests/www/views/test_views_rendered.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 65c22d2280e17..b6fe6a1d9d7a9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -123,7 +123,7 @@ install_requires = lazy-object-proxy lockfile>=0.12.2 markdown>=2.5.2, <4.0 - markupsafe>=1.1.1 + markupsafe>=1.1.1, <=2.0 marshmallow-oneofschema>=2.0.1 # Required by vendored-in connexion openapi-spec-validator>=0.2.4 @@ -206,7 +206,7 @@ ignore_errors = True line_length=110 combine_as_imports = true default_section = THIRDPARTY -known_first_party=airflow,tests +known_first_party=airflow,airflow_breeze,tests # Need to be consistent with the exclude config defined in pre-commit-config.yaml skip=build,.tox,venv profile = black diff --git a/tests/www/views/test_views_rendered.py b/tests/www/views/test_views_rendered.py index 129baf7c0e6b2..f88db34796430 100644 --- a/tests/www/views/test_views_rendered.py +++ b/tests/www/views/test_views_rendered.py @@ -157,7 +157,7 @@ def test_user_defined_filter_and_macros_raise_error(admin_client, create_dag_run assert resp.status_code == 200 resp_html: str = resp.data.decode("utf-8") - assert "echo Hello Apache Airflow" not in resp_html + assert "echo Hello Apache Airflow" in resp_html assert ( "Webserver does not have access to User-defined Macros or Filters when " "Dag Serialization is enabled. Hence for the task that have not yet " From f1c7f0689b1a4b8d5ef67086177b8c52f6276e9f Mon Sep 17 00:00:00 2001 From: philipherrmann <72450023+philipherrmann@users.noreply.github.com> Date: Wed, 29 Dec 2021 00:55:51 +0100 Subject: [PATCH 086/250] switch to follow_redirects on httpx.get call in CloudSQL provider (#20239) * switch to follow_redirects on httpx.get call in CloudSQL provider * sense for parameter as suggested in review (cherry picked from commit bfd6d45cecbc7714cea8e2ce5d8920bdb4819887) --- airflow/providers/google/cloud/hooks/cloud_sql.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/airflow/providers/google/cloud/hooks/cloud_sql.py b/airflow/providers/google/cloud/hooks/cloud_sql.py index ce77671e1ed95..4abfce182b621 100644 --- a/airflow/providers/google/cloud/hooks/cloud_sql.py +++ b/airflow/providers/google/cloud/hooks/cloud_sql.py @@ -31,6 +31,7 @@ import subprocess import time import uuid +from inspect import signature from pathlib import Path from subprocess import PIPE, Popen from typing import Any, Dict, List, Optional, Sequence, Union @@ -498,7 +499,12 @@ def _download_sql_proxy_if_needed(self) -> None: ) proxy_path_tmp = self.sql_proxy_path + ".tmp" self.log.info("Downloading cloud_sql_proxy from %s to %s", download_url, proxy_path_tmp) - response = httpx.get(download_url, allow_redirects=True) + # httpx has a breaking API change (follow_redirects vs allow_redirects) + # and this should work with both versions (cf. 
issue #20088) + if "follow_redirects" in signature(httpx.get).parameters.keys(): + response = httpx.get(download_url, follow_redirects=True) + else: + response = httpx.get(download_url, allow_redirects=True) # Downloading to .tmp file first to avoid case where partially downloaded # binary is used by parallel operator which uses the same fixed binary path with open(proxy_path_tmp, 'wb') as file: @@ -768,7 +774,7 @@ def __init__( @staticmethod def _get_bool(val: Any) -> bool: - if val == 'False': + if val == 'False' or val is False: return False return True From 7250d89f239a42482faa758a67f6afa3229ad709 Mon Sep 17 00:00:00 2001 From: Tajinder Singh Date: Tue, 7 Dec 2021 20:26:31 -0500 Subject: [PATCH 087/250] Lift off upper bound for MarkupSafe (#20113) Per discussion and guidance from #19753, opening this PR for review. Based on if all the tests pass, this could be reviewed further. Resolves #19761. (cherry picked from commit bcacc51a16697a656357c29c7a40240e422e4bf9) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b6fe6a1d9d7a9..68d1efb9ff6f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -123,7 +123,7 @@ install_requires = lazy-object-proxy lockfile>=0.12.2 markdown>=2.5.2, <4.0 - markupsafe>=1.1.1, <=2.0 + markupsafe>=1.1.1 marshmallow-oneofschema>=2.0.1 # Required by vendored-in connexion openapi-spec-validator>=0.2.4 From 27fbbea3d839d5f09e51c9db231215d5e8368a55 Mon Sep 17 00:00:00 2001 From: Malthe Borch Date: Wed, 8 Dec 2021 15:01:02 +0100 Subject: [PATCH 088/250] Upload provider artifacts before install/test step (#20137) The reasoning is that uploading is rather quick and sometimes you'll want to download the artifact regardless of whether tests pass or fail. (cherry picked from commit 35be8bdee0cdd3e5c73270b0b65e0552fb9d9946) --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0015a9062ce3c..d9a90cf44c27e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -615,17 +615,17 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" run: ./scripts/ci/build_airflow/ci_build_airflow_packages.sh env: PACKAGE_FORMAT: "sdist" - - name: "Install and test provider packages and airflow via sdist files" - run: ./scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh - env: - USE_AIRFLOW_VERSION: "sdist" - PACKAGE_FORMAT: "sdist" - name: "Upload provider distribution artifacts" uses: actions/upload-artifact@v2 with: name: airflow-provider-packages path: "./dist/apache-airflow-providers-*.tar.gz" retention-days: 1 + - name: "Install and test provider packages and airflow via sdist files" + run: ./scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh + env: + USE_AIRFLOW_VERSION: "sdist" + PACKAGE_FORMAT: "sdist" tests-helm: timeout-minutes: 40 From 96f88a6ed9b3ea8bc331e384a716bd81f95594e1 Mon Sep 17 00:00:00 2001 From: Ash Berlin-Taylor Date: Thu, 9 Dec 2021 14:02:46 +0000 Subject: [PATCH 089/250] Update Sphinx and Sphinx-AutoAPI (#20079) We were stuck on an old version of Sphinx AutoAPI for a long while as more recent versions wouldn't build Airflow's docs, but that seems to have finally been resolved. 
We can remove the run_patched_sphinx.py as that was included in sphinx-autoapi 1.1 * Fix doc rendering glitch in Google provider utils * Remove duplicated link from cncf-kubernetes provider index (cherry picked from commit fa96b093952f96449d6d328a2b9e9300b81cf08e) --- airflow/models/dag.py | 3 +- .../cloud/utils/credentials_provider.py | 10 +- .../utils/mlengine_prediction_summary.py | 4 + .../common/utils/id_token_credentials.py | 4 + airflow/sensors/base.py | 18 +-- .../index.rst | 1 - docs/apache-airflow/executor/kubernetes.rst | 6 +- docs/conf.py | 21 ++++ docs/exts/docs_build/docs_builder.py | 5 +- docs/exts/docs_build/run_patched_sphinx.py | 105 ------------------ docs/exts/exampleinclude.py | 8 +- docs/spelling_wordlist.txt | 25 ++++- setup.py | 6 +- 13 files changed, 75 insertions(+), 141 deletions(-) delete mode 100755 docs/exts/docs_build/run_patched_sphinx.py diff --git a/airflow/models/dag.py b/airflow/models/dag.py index 429e1b3adc467..2a08d269b30d7 100644 --- a/airflow/models/dag.py +++ b/airflow/models/dag.py @@ -2938,7 +2938,7 @@ def calculate_dagrun_date_fields( def dag(*dag_args, **dag_kwargs): """ Python dag decorator. Wraps a function into an Airflow DAG. - Accepts kwargs for operator kwarg. Can be used to parametrize DAGs. + Accepts kwargs for operator kwarg. Can be used to parameterize DAGs. :param dag_args: Arguments for DAG object :type dag_args: Any @@ -2998,6 +2998,7 @@ def factory(*args, **kwargs): from airflow.models.serialized_dag import SerializedDagModel DagModel.serialized_dag = relationship(SerializedDagModel) + """:sphinx-autoapi-skip:""" class DagContext: diff --git a/airflow/providers/google/cloud/utils/credentials_provider.py b/airflow/providers/google/cloud/utils/credentials_provider.py index 414c9c145588f..6b5382c11ed2c 100644 --- a/airflow/providers/google/cloud/utils/credentials_provider.py +++ b/airflow/providers/google/cloud/utils/credentials_provider.py @@ -78,8 +78,8 @@ def build_gcp_conn( @contextmanager def provide_gcp_credentials(key_file_path: Optional[str] = None, key_file_dict: Optional[Dict] = None): """ - Context manager that provides a Google Cloud credentials for application supporting `Application - Default Credentials (ADC) strategy `__. + Context manager that provides a Google Cloud credentials for application supporting + `Application Default Credentials (ADC) strategy`__. It can be used to provide credentials for external programs (e.g. gcloud) that expect authorization file in ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable. @@ -88,6 +88,8 @@ def provide_gcp_credentials(key_file_path: Optional[str] = None, key_file_dict: :type key_file_path: str :param key_file_dict: Dictionary with credentials. :type key_file_dict: Dict + + __ https://cloud.google.com/docs/authentication/production """ if not key_file_path and not key_file_dict: raise ValueError("Please provide `key_file_path` or `key_file_dict`.") @@ -145,7 +147,7 @@ def provide_gcp_conn_and_credentials( Context manager that provides both: - Google Cloud credentials for application supporting `Application Default Credentials (ADC) - strategy `__. + strategy`__. - temporary value of :envvar:`AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT` connection :param key_file_path: Path to file with Google Cloud Service Account .json file. @@ -154,6 +156,8 @@ def provide_gcp_conn_and_credentials( :type scopes: Sequence :param project_id: The id of Google Cloud project for the connection. 
:type project_id: str + + __ https://cloud.google.com/docs/authentication/production """ with ExitStack() as stack: if key_file_path: diff --git a/airflow/providers/google/cloud/utils/mlengine_prediction_summary.py b/airflow/providers/google/cloud/utils/mlengine_prediction_summary.py index bba8f38fcb6c9..482d809c6e631 100644 --- a/airflow/providers/google/cloud/utils/mlengine_prediction_summary.py +++ b/airflow/providers/google/cloud/utils/mlengine_prediction_summary.py @@ -101,6 +101,10 @@ def metric_fn(inst): "--temp_location=gs://...", ] ) + +.. spelling:: + + pcoll """ import argparse diff --git a/airflow/providers/google/common/utils/id_token_credentials.py b/airflow/providers/google/common/utils/id_token_credentials.py index c14509ede43c1..9d7a8c67f513d 100644 --- a/airflow/providers/google/common/utils/id_token_credentials.py +++ b/airflow/providers/google/common/utils/id_token_credentials.py @@ -23,6 +23,10 @@ ID_TOKEN="$(python -m airflow.providers.google.common.utils.id_token_credentials)" curl "https://www.googleapis.com/oauth2/v3/tokeninfo?id_token=${ID_TOKEN}" -v + +.. spelling:: + + RefreshError """ import json diff --git a/airflow/sensors/base.py b/airflow/sensors/base.py index 0019c41046c42..039a21ad7b697 100644 --- a/airflow/sensors/base.py +++ b/airflow/sensors/base.py @@ -18,7 +18,6 @@ import datetime import hashlib -import os import time from datetime import timedelta from typing import Any, Callable, Dict, Iterable @@ -39,7 +38,7 @@ # We need to keep the import here because GCSToLocalFilesystemOperator released in # Google Provider before 3.0.0 imported apply_defaults from here. # See https://github.com/apache/airflow/issues/16035 -from airflow.utils.decorators import apply_defaults +from airflow.utils.decorators import apply_defaults # noqa: F401 class BaseSensorOperator(BaseOperator, SkipMixin): @@ -122,13 +121,8 @@ def _validate_input_values(self) -> None: raise AirflowException("The timeout must be a non-negative number") if self.mode not in self.valid_modes: raise AirflowException( - "The mode must be one of {valid_modes}," - "'{d}.{t}'; received '{m}'.".format( - valid_modes=self.valid_modes, - d=self.dag.dag_id if self.has_dag() else "", - t=self.task_id, - m=self.mode, - ) + f"The mode must be one of {self.valid_modes},'{self.dag.dag_id if self.has_dag() else ''} " + f".{self.task_id}'; received '{self.mode}'." ) def poke(self, context: Dict) -> bool: @@ -324,9 +318,3 @@ def mode_setter(_, value): return cls_type return decorate(cls) - - -if 'BUILDING_AIRFLOW_DOCS' in os.environ: - # flake8: noqa: F811 - # Monkey patch hook to get good function headers while building docs - apply_defaults = lambda x: x diff --git a/docs/apache-airflow-providers-cncf-kubernetes/index.rst b/docs/apache-airflow-providers-cncf-kubernetes/index.rst index 1dc838d17fcd6..e37c4ffe2e3dd 100644 --- a/docs/apache-airflow-providers-cncf-kubernetes/index.rst +++ b/docs/apache-airflow-providers-cncf-kubernetes/index.rst @@ -42,7 +42,6 @@ Content Example DAGs PyPI Repository Installing from sources - Installing from sources .. THE REMAINDER OF THE FILE IS AUTOMATICALLY GENERATED. IT WILL BE OVERWRITTEN AT RELEASE TIME! diff --git a/docs/apache-airflow/executor/kubernetes.rst b/docs/apache-airflow/executor/kubernetes.rst index ab607919ed1c6..341f9fea0c03c 100644 --- a/docs/apache-airflow/executor/kubernetes.rst +++ b/docs/apache-airflow/executor/kubernetes.rst @@ -100,21 +100,21 @@ With these requirements in mind, here are some examples of basic ``pod_template_ Storing DAGs in the image: -.. 
exampleinclude:: /../../airflow/kubernetes/pod_template_file_examples/dags_in_image_template.yaml +.. literalinclude:: /../../airflow/kubernetes/pod_template_file_examples/dags_in_image_template.yaml :language: yaml :start-after: [START template_with_dags_in_image] :end-before: [END template_with_dags_in_image] Storing DAGs in a ``persistentVolume``: -.. exampleinclude:: /../../airflow/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml +.. literalinclude:: /../../airflow/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml :language: yaml :start-after: [START template_with_dags_in_volume] :end-before: [END template_with_dags_in_volume] Pulling DAGs from ``git``: -.. exampleinclude:: /../../airflow/kubernetes/pod_template_file_examples/git_sync_template.yaml +.. literalinclude:: /../../airflow/kubernetes/pod_template_file_examples/git_sync_template.yaml :language: yaml :start-after: [START git_sync_template] :end-before: [END git_sync_template] diff --git a/docs/conf.py b/docs/conf.py index 8afba7fed2601..f21a1f40e02b0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -121,6 +121,9 @@ .. |experimental| replace:: This is an :ref:`experimental feature `. """ +smartquotes_excludes = {'builders': ['man', 'text', 'spelling']} + + # -- General configuration ----------------------------------------------------- # See: https://www.sphinx-doc.org/en/master/usage/configuration.html @@ -674,6 +677,8 @@ def _get_params(root_schema: dict, prefix: str = "", default_section: str = "") ] if PACKAGE_NAME == 'apache-airflow': autoapi_ignore.append('*/airflow/providers/*') +else: + autoapi_ignore.append('*/airflow/providers/cncf/kubernetes/backcompat/*') # Keep the AutoAPI generated files on the filesystem after the run. # Useful for debugging. 
autoapi_keep_files = True @@ -696,6 +701,10 @@ def _get_params(root_schema: dict, prefix: str = "", default_section: str = "") 'special-members', ] +suppress_warnings = [ + "autoapi.python_import_resolution", +] + # -- Options for ext.exampleinclude -------------------------------------------- exampleinclude_sourceroot = os.path.abspath('..') @@ -709,6 +718,7 @@ def _get_params(root_schema: dict, prefix: str = "", default_section: str = "") if PACKAGE_NAME == 'helm-chart': spelling_exclude_patterns = ['changelog.rst'] spelling_ignore_contributor_names = False +spelling_ignore_importable_modules = True # -- Options for sphinxcontrib.redoc ------------------------------------------- # See: https://sphinxcontrib-redoc.readthedocs.io/en/stable/ @@ -730,3 +740,14 @@ def _get_params(root_schema: dict, prefix: str = "", default_section: str = "") # Options for script updater redoc_script_url = "https://cdn.jsdelivr.net/npm/redoc@2.0.0-rc.48/bundles/redoc.standalone.js" + + +def skip_util_classes(app, what, name, obj, skip, options): + if (what == "data" and "STATICA_HACK" in name) or ":sphinx-autoapi-skip:" in obj.docstring: + skip = True + return skip + + +def setup(sphinx): + if 'autoapi.extension' in extensions: + sphinx.connect("autoapi-skip-member", skip_util_classes) diff --git a/docs/exts/docs_build/docs_builder.py b/docs/exts/docs_build/docs_builder.py index 7164ac641a16a..ad343b46cfb67 100644 --- a/docs/exts/docs_build/docs_builder.py +++ b/docs/exts/docs_build/docs_builder.py @@ -30,7 +30,6 @@ CONSOLE_WIDTH, DOCS_DIR, PROCESS_TIMEOUT, - ROOT_PROJECT_DIR, pretty_format_path, ) from docs.exts.docs_build.errors import DocBuildError, parse_sphinx_warnings @@ -138,7 +137,7 @@ def check_spelling(self, verbose: bool) -> List[SpellingError]: os.makedirs(self.log_spelling_output_dir, exist_ok=True) build_cmd = [ - os.path.join(ROOT_PROJECT_DIR, "docs", "exts", "docs_build", "run_patched_sphinx.py"), + "sphinx-build", "-W", # turn warnings into errors "--color", # do emit colored output "-T", # show full traceback on exception @@ -213,7 +212,7 @@ def build_sphinx_docs(self, verbose: bool) -> List[DocBuildError]: os.makedirs(self._build_dir, exist_ok=True) build_cmd = [ - os.path.join(ROOT_PROJECT_DIR, "docs", "exts", "docs_build", "run_patched_sphinx.py"), + "sphinx-build", "-T", # show full traceback on exception "--color", # do emit colored output "-b", # builder to use diff --git a/docs/exts/docs_build/run_patched_sphinx.py b/docs/exts/docs_build/run_patched_sphinx.py deleted file mode 100755 index 887b982e5c0d8..0000000000000 --- a/docs/exts/docs_build/run_patched_sphinx.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import sys - -import autoapi -from autoapi.extension import ( - LOGGER, - ExtensionError, - bold, - darkgreen, - default_backend_mapping, - default_file_mapping, - default_ignore_patterns, -) -from sphinx.cmd.build import main - - -def run_autoapi(app): - """Load AutoAPI data from the filesystem.""" - if not app.config.autoapi_dirs: - raise ExtensionError("You must configure an autoapi_dirs setting") - - # Make sure the paths are full - normalized_dirs = [] - autoapi_dirs = app.config.autoapi_dirs - if isinstance(autoapi_dirs, str): - autoapi_dirs = [autoapi_dirs] - for path in autoapi_dirs: - if os.path.isabs(path): - normalized_dirs.append(path) - else: - normalized_dirs.append(os.path.normpath(os.path.join(app.confdir, path))) - - for _dir in normalized_dirs: - if not os.path.exists(_dir): - raise ExtensionError( - "AutoAPI Directory `{dir}` not found. " - "Please check your `autoapi_dirs` setting.".format(dir=_dir) - ) - - # Change from app.confdir to app.srcdir. - # Before: - # - normalized_root = os.path.normpath( - # - os.path.join(app.confdir, app.config.autoapi_root) - # -) - normalized_root = os.path.normpath(os.path.join(app.srcdir, app.config.autoapi_root)) - url_root = os.path.join("/", app.config.autoapi_root) - sphinx_mapper = default_backend_mapping[app.config.autoapi_type] - sphinx_mapper_obj = sphinx_mapper(app, template_dir=app.config.autoapi_template_dir, url_root=url_root) - app.env.autoapi_mapper = sphinx_mapper_obj - - if app.config.autoapi_file_patterns: - file_patterns = app.config.autoapi_file_patterns - else: - file_patterns = default_file_mapping.get(app.config.autoapi_type, []) - - if app.config.autoapi_ignore: - ignore_patterns = app.config.autoapi_ignore - else: - ignore_patterns = default_ignore_patterns.get(app.config.autoapi_type, []) - - if ".rst" in app.config.source_suffix: - out_suffix = ".rst" - elif ".txt" in app.config.source_suffix: - out_suffix = ".txt" - else: - # Fallback to first suffix listed - out_suffix = app.config.source_suffix[0] - - # Actual meat of the run. - LOGGER.info(bold("[AutoAPI] ") + darkgreen("Loading Data")) - sphinx_mapper_obj.load(patterns=file_patterns, dirs=normalized_dirs, ignore=ignore_patterns) - - LOGGER.info(bold("[AutoAPI] ") + darkgreen("Mapping Data")) - sphinx_mapper_obj.map(options=app.config.autoapi_options) - - if app.config.autoapi_generate_api_docs: - LOGGER.info(bold("[AutoAPI] ") + darkgreen("Rendering Data")) - sphinx_mapper_obj.output_rst(root=normalized_root, source_suffix=out_suffix) - - -# HACK: sphinx-auto map did not correctly use the confdir attribute instead of srcdir when specifying the -# directory to contain the generated files. -# Unfortunately we have a problem updating to a newer version of this library and we have to use -# sphinx-autoapi v1.0.0, so I am monkeypatching this library to fix this one problem. 
-autoapi.extension.run_autoapi = run_autoapi - -sys.exit(main(sys.argv[1:])) diff --git a/docs/exts/exampleinclude.py b/docs/exts/exampleinclude.py index 097ec7c4b71be..2e8126da3918a 100644 --- a/docs/exts/exampleinclude.py +++ b/docs/exts/exampleinclude.py @@ -25,8 +25,8 @@ from docutils import nodes from docutils.parsers.rst import directives -from sphinx import addnodes from sphinx.directives.code import LiteralIncludeReader +from sphinx.ext.viewcode import viewcode_anchor from sphinx.locale import _ from sphinx.pycode import ModuleAnalyzer from sphinx.util import logging, parselinenos @@ -194,11 +194,7 @@ def create_node(env, relative_path, show_button): paragraph = nodes.paragraph(relative_path, classes=header_classes) paragraph += nodes.inline("", relative_path, classes=["example-title"]) if show_button: - pending_ref = addnodes.pending_xref( - "", - reftype="viewcode", - refdomain="std", - refexplicit=False, + pending_ref = viewcode_anchor( reftarget=pagename, refid="", refdoc=env.docname, diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 91d0471faa871..46595ebd57a1c 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -1,5 +1,6 @@ Ack Acyclic +AddressesType AgentKey Airbnb Airbyte @@ -17,6 +18,7 @@ Arg Args Asana Async +AsyncResult Atlassian Auth AutoMlClient @@ -33,6 +35,7 @@ Banco BaseClient BaseOperator BaseView +BaseXCom Beauchemin Behaviour Bigquery @@ -50,6 +53,7 @@ Cassanda Catchup Celect Cgroups +Chainable Changelog CheckOperator Checklicence @@ -62,6 +66,7 @@ Cloudwatch ClusterManagerClient Codecov Colour +CommandType ComputeNodeState Computenodes Config @@ -75,6 +80,7 @@ DBs Daemonize DagFileProcessorManager DagRun +DagRunState Dagbag Dagre Dask @@ -183,6 +189,7 @@ InspectTemplate Investorise JPype Jdbc +JenkinsRequest Jinja Jinjafied Jinjafy @@ -259,6 +266,7 @@ Optimise PEM POSIX PTarget +PTransform Pagerduty Papermill Parallelize @@ -276,6 +284,7 @@ Postgresql Pre Precommit PredictionServiceClient +Preload Preprocessed Proc ProductSearchClient @@ -285,13 +294,14 @@ Pubsub Py PyPI Pyarrow -Pyspark PythonOperator Qplum Quantopian Qubole +QuboleCheckHook Quboles RBAC +ReadOnlyCredentials Readme Realtime Rebasing @@ -299,6 +309,7 @@ Rebrand RedactImageResponse Reddit Redhat +RefreshError ReidentifyContentResponse Reinitialising Remoting @@ -315,10 +326,12 @@ SecretManagerClient Seedlist Sendgrid SlackHook +SnowflakeHook Spark SparkPi SparkR SparkSQL +SparkSession SpeechClient Splunk Sql @@ -350,6 +363,7 @@ TaskFlow TaskGroup TaskGroups TaskInstance +TaskInstanceKey Taskfail Templated Templating @@ -432,6 +446,7 @@ appbuilder approle arg args +argv arn arraysize asana @@ -543,6 +558,7 @@ cloudant cloudml cloudsqldatabehook cloudwatch +cls cmake cmd cmdline @@ -735,6 +751,7 @@ faq fargate fbee fc +fd feedCard feng fernet @@ -775,6 +792,7 @@ gcpcloudsql gcs gdbm generateUploadUrl +getattr getfqdn getframe getsource @@ -1124,6 +1142,7 @@ pymysql pyodbc pypa pypsrp +pyspark pytest pythonic pythonpath @@ -1160,6 +1179,8 @@ renewer replicaSet repo repos +repr +req reqs resetdb resourceVersion @@ -1209,6 +1230,7 @@ serialise serializable serverless setMachineType +setattr setdefault setted sftp @@ -1289,6 +1311,7 @@ subscriptionId substring subtask subtasks +subtype sudo sudoers summarization diff --git a/setup.py b/setup.py index 6f64f8f2a1add..911d5f8ee078a 100644 --- a/setup.py +++ b/setup.py @@ -251,16 +251,16 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version doc = [ 'click>=7.1,<9', # Sphinx is 
limited to < 3.5.0 because of https://github.com/sphinx-doc/sphinx/issues/8880 - 'sphinx>=2.1.2, <3.5.0', + 'sphinx>=3.5.0, <5.0.0', 'sphinx-airflow-theme', 'sphinx-argparse>=0.1.13', - 'sphinx-autoapi==1.0.0', + 'sphinx-autoapi==1.8.0', 'sphinx-copybutton', 'sphinx-jinja~=1.1', 'sphinx-rtd-theme>=0.1.6', 'sphinxcontrib-httpdomain>=1.7.0', 'sphinxcontrib-redoc>=1.6.0', - 'sphinxcontrib-spelling==7.2.1', + 'sphinxcontrib-spelling~=7.3', ] docker = [ 'docker>=5.0.3', From d761affd4f7fd3812ddbc53399f6f4fbe1fbb82b Mon Sep 17 00:00:00 2001 From: Ash Berlin-Taylor Date: Fri, 10 Dec 2021 09:47:01 +0000 Subject: [PATCH 090/250] Update minimum sphinx versions after upgrading sphinx-autoapi (#20170) * Allow point releases of AutoAPI 1.8 (I used with 1.8.4 in all my testing) * Require at least Sphinx v4 A few things got deprecated in Sphinx 4, and as this dep is only for us building docs we can pick and choose what we like without impacting users, so lets stay up-to-date. (cherry picked from commit 214b62d88de90d107ea8fe6640cf150ead5784f6) --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 911d5f8ee078a..13197da7320ba 100644 --- a/setup.py +++ b/setup.py @@ -250,11 +250,10 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version ] doc = [ 'click>=7.1,<9', - # Sphinx is limited to < 3.5.0 because of https://github.com/sphinx-doc/sphinx/issues/8880 - 'sphinx>=3.5.0, <5.0.0', + 'sphinx>=4.0.0, <5.0.0', 'sphinx-airflow-theme', 'sphinx-argparse>=0.1.13', - 'sphinx-autoapi==1.8.0', + 'sphinx-autoapi~=1.8.0', 'sphinx-copybutton', 'sphinx-jinja~=1.1', 'sphinx-rtd-theme>=0.1.6', From 689ded092c0b19e7804358cb25c36907f6c5b5dd Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 12 Dec 2021 17:32:30 +0100 Subject: [PATCH 091/250] Increase limit of time for constraint job (#20230) Depending on how many changes are there from the previous images building images for constrainst (with eager upgrade) might take more time than 10 minutes which causes constraints generation job to fail. This change increases the limit to 25 minutes. (cherry picked from commit bc76126a9f6172a360fd4301eeb82372d000f70a) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d9a90cf44c27e..1daf44e844bf0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1202,7 +1202,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" constraints: permissions: contents: write - timeout-minutes: 10 + timeout-minutes: 25 name: "Constraints" runs-on: ${{ fromJson(needs.build-info.outputs.runsOn) }} needs: From 989e73cd30ed9299a65ca1f5369ebdc8ed150b3b Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 13 Dec 2021 18:25:16 +0100 Subject: [PATCH 092/250] Remove Integration tests from MSSQL on Public Runners (#20231) The Integration tests with MSSQL often fail on Public Runners without a reason. The database becomes inaccessible and no logs are explaining what's going on. Its very likely however that this is a memory-related issue (Integration tests take a lot of memory as they run a lot of extra containers. Those tests will eventually run on Self-hosted runner after merge and they are also run for Postgres/MySQL/SQlite so there is no need to run them also for MSSQL if it causes random failures. 
(cherry picked from commit d1848bcf2460fa82cd6c1fc1e9e5f9b103d95479) --- scripts/ci/testing/ci_run_airflow_testing.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index c3bee5e131d30..8ee56921a6bf4 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -88,9 +88,18 @@ function run_all_test_types_in_parallel() { echo "${COLOR_YELLOW}Heavy tests will be run sequentially after parallel tests including cleaning up docker between tests${COLOR_RESET}" echo "" if [[ ${test_types_to_run} == *"Integration"* ]]; then - echo "${COLOR_YELLOW}Remove Integration from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" test_types_to_run="${test_types_to_run//Integration/}" - sequential_tests+=("Integration") + if [[ ${BACKEND} == "mssql" ]]; then + # Also for mssql we skip Integration tests altogether on Public Runners. Mssql uses far + # too much memory and often shuts down and similarly as in case of Providers tests, + # there is no need to run them also for MsSQL engine as those integration tests + # are not really using any metadata-specific behaviour. + # Those tests will run in `main` anyway. + echo "${COLOR_YELLOW}Do not run integration tests for mssql in small systems due to memory issues.${COLOR_RESET}" + else + echo "${COLOR_YELLOW}Remove Integration from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" + sequential_tests+=("Integration") + fi fi if [[ ${BACKEND} == "mssql" || ${BACKEND} == "mysql" ]]; then # For mssql/mysql - they take far more memory than postgres (or sqlite) - we skip the Provider From 4c60fb2fd2a25ddf2a7097c8254d326f3b6078ed Mon Sep 17 00:00:00 2001 From: Peter Reznikov Date: Tue, 14 Dec 2021 18:43:54 +0300 Subject: [PATCH 093/250] YandexCloud provider: Support new Yandex SDK features: log_group_id, user-agent, maven packages (#20103) (cherry picked from commit 41c49c7ff6dfa1d6805e5f74c0a36dd549e159ea) --- .../example_yandexcloud_dataproc.py | 12 ++++-- airflow/providers/yandex/hooks/yandex.py | 16 +++++++- .../yandex/operators/yandexcloud_dataproc.py | 40 +++++++++++++++++++ docs/spelling_wordlist.txt | 1 + setup.py | 2 +- .../operators/test_yandexcloud_dataproc.py | 11 +++++ 6 files changed, 77 insertions(+), 5 deletions(-) diff --git a/airflow/providers/yandex/example_dags/example_yandexcloud_dataproc.py b/airflow/providers/yandex/example_dags/example_yandexcloud_dataproc.py index e35fae527688f..7d42946380233 100644 --- a/airflow/providers/yandex/example_dags/example_yandexcloud_dataproc.py +++ b/airflow/providers/yandex/example_dags/example_yandexcloud_dataproc.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +import uuid from datetime import datetime from airflow import DAG @@ -81,7 +81,7 @@ '-input', 's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2', '-output', - f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/dataproc/job/results', + f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/dataproc/job/results/{uuid.uuid4()}', ], properties={ 'yarn.app.mapreduce.am.resource.mb': '2048', @@ -113,6 +113,9 @@ properties={ 'spark.submit.deployMode': 'cluster', }, + packages=['org.slf4j:slf4j-simple:1.7.30'], + repositories=['https://repo1.maven.org/maven2'], + exclude_packages=['com.amazonaws:amazon-kinesis-client'], ) create_pyspark_job = DataprocCreatePysparkJobOperator( @@ -129,7 +132,7 @@ ], args=[ 's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2', - f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/jobs/results/${{JOB_ID}}', + f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/dataproc/job/results/${{JOB_ID}}', ], jar_file_uris=[ 's3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar', @@ -139,6 +142,9 @@ properties={ 'spark.submit.deployMode': 'cluster', }, + packages=['org.slf4j:slf4j-simple:1.7.30'], + repositories=['https://repo1.maven.org/maven2'], + exclude_packages=['com.amazonaws:amazon-kinesis-client'], ) delete_cluster = DataprocDeleteClusterOperator( diff --git a/airflow/providers/yandex/hooks/yandex.py b/airflow/providers/yandex/hooks/yandex.py index ee1ae0dffe5ab..f47e169029d9a 100644 --- a/airflow/providers/yandex/hooks/yandex.py +++ b/airflow/providers/yandex/hooks/yandex.py @@ -80,6 +80,20 @@ def get_connection_form_widgets() -> Dict[str, Any]: ), } + @classmethod + def provider_user_agent(cls) -> Optional[str]: + """Construct User-Agent from Airflow core & provider package versions""" + import airflow + from airflow.providers_manager import ProvidersManager + + try: + manager = ProvidersManager() + provider_name = manager.hooks[cls.conn_type].package_name + provider = manager.providers[provider_name] + return f'apache-airflow/{airflow.__version__} {provider_name}/{provider.version}' + except KeyError: + warnings.warn(f"Hook '{cls.hook_name}' info is not initialized in airflow.ProviderManager") + @staticmethod def get_ui_field_behaviour() -> Dict: """Returns custom field behaviour""" @@ -107,7 +121,7 @@ def __init__( self.connection = self.get_connection(self.connection_id) self.extras = self.connection.extra_dejson credentials = self._get_credentials() - self.sdk = yandexcloud.SDK(**credentials) + self.sdk = yandexcloud.SDK(user_agent=self.provider_user_agent(), **credentials) self.default_folder_id = default_folder_id or self._get_field('folder_id', False) self.default_public_ssh_key = default_public_ssh_key or self._get_field('public_ssh_key', False) self.client = self.sdk.client diff --git a/airflow/providers/yandex/operators/yandexcloud_dataproc.py b/airflow/providers/yandex/operators/yandexcloud_dataproc.py index 84ac354e77aec..ead144b37b367 100644 --- a/airflow/providers/yandex/operators/yandexcloud_dataproc.py +++ b/airflow/providers/yandex/operators/yandexcloud_dataproc.py @@ -93,6 +93,9 @@ class DataprocCreateClusterOperator(BaseOperator): :param computenode_decommission_timeout: Timeout to gracefully decommission nodes during downscaling. In seconds. :type computenode_decommission_timeout: int + :param log_group_id: Id of log group to write logs. By default logs will be sent to default log group. 
+ To disable cloud log sending set cluster property dataproc:disable_cloud_logging = true + :type log_group_id: str """ def __init__( @@ -127,6 +130,7 @@ def __init__( computenode_cpu_utilization_target: Optional[int] = None, computenode_decommission_timeout: Optional[int] = None, connection_id: Optional[str] = None, + log_group_id: Optional[str] = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -159,6 +163,7 @@ def __init__( self.computenode_preemptible = computenode_preemptible self.computenode_cpu_utilization_target = computenode_cpu_utilization_target self.computenode_decommission_timeout = computenode_decommission_timeout + self.log_group_id = log_group_id self.hook: Optional[DataprocHook] = None @@ -195,6 +200,7 @@ def execute(self, context) -> None: computenode_preemptible=self.computenode_preemptible, computenode_cpu_utilization_target=self.computenode_cpu_utilization_target, computenode_decommission_timeout=self.computenode_decommission_timeout, + log_group_id=self.log_group_id, ) context['task_instance'].xcom_push(key='cluster_id', value=operation_result.response.id) context['task_instance'].xcom_push(key='yandexcloud_connection_id', value=self.yandex_conn_id) @@ -399,6 +405,14 @@ class DataprocCreateSparkJobOperator(BaseOperator): :type cluster_id: Optional[str] :param connection_id: ID of the Yandex.Cloud Airflow connection. :type connection_id: Optional[str] + :param packages: List of maven coordinates of jars to include on the driver and executor classpaths. + :type packages: Optional[Iterable[str]] + :param repositories: List of additional remote repositories to search for the maven coordinates + given with --packages. + :type repositories: Optional[Iterable[str]] + :param exclude_packages: List of groupId:artifactId, to exclude while resolving the dependencies + provided in --packages to avoid dependency conflicts. + :type exclude_packages: Optional[Iterable[str]] """ template_fields = ['cluster_id'] @@ -416,6 +430,9 @@ def __init__( name: str = 'Spark job', cluster_id: Optional[str] = None, connection_id: Optional[str] = None, + packages: Optional[Iterable[str]] = None, + repositories: Optional[Iterable[str]] = None, + exclude_packages: Optional[Iterable[str]] = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -429,6 +446,9 @@ def __init__( self.name = name self.cluster_id = cluster_id self.connection_id = connection_id + self.packages = packages + self.repositories = repositories + self.exclude_packages = exclude_packages self.hook: Optional[DataprocHook] = None def execute(self, context) -> None: @@ -447,6 +467,9 @@ def execute(self, context) -> None: file_uris=self.file_uris, args=self.args, properties=self.properties, + packages=self.packages, + repositories=self.repositories, + exclude_packages=self.exclude_packages, name=self.name, cluster_id=cluster_id, ) @@ -476,6 +499,14 @@ class DataprocCreatePysparkJobOperator(BaseOperator): :type cluster_id: Optional[str] :param connection_id: ID of the Yandex.Cloud Airflow connection. :type connection_id: Optional[str] + :param packages: List of maven coordinates of jars to include on the driver and executor classpaths. + :type packages: Optional[Iterable[str]] + :param repositories: List of additional remote repositories to search for the maven coordinates + given with --packages. + :type repositories: Optional[Iterable[str]] + :param exclude_packages: List of groupId:artifactId, to exclude while resolving the dependencies + provided in --packages to avoid dependency conflicts. 
+ :type exclude_packages: Optional[Iterable[str]] """ template_fields = ['cluster_id'] @@ -493,6 +524,9 @@ def __init__( name: str = 'Pyspark job', cluster_id: Optional[str] = None, connection_id: Optional[str] = None, + packages: Optional[Iterable[str]] = None, + repositories: Optional[Iterable[str]] = None, + exclude_packages: Optional[Iterable[str]] = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -506,6 +540,9 @@ def __init__( self.name = name self.cluster_id = cluster_id self.connection_id = connection_id + self.packages = packages + self.repositories = repositories + self.exclude_packages = exclude_packages self.hook: Optional[DataprocHook] = None def execute(self, context) -> None: @@ -524,6 +561,9 @@ def execute(self, context) -> None: file_uris=self.file_uris, args=self.args, properties=self.properties, + packages=self.packages, + repositories=self.repositories, + exclude_packages=self.exclude_packages, name=self.name, cluster_id=cluster_id, ) diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 46595ebd57a1c..64d839f428865 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -449,6 +449,7 @@ args argv arn arraysize +artifactId asana asc ascii diff --git a/setup.py b/setup.py index 13197da7320ba..6c86df0b0bcc6 100644 --- a/setup.py +++ b/setup.py @@ -505,7 +505,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'pywinrm~=0.4', ] yandex = [ - 'yandexcloud>=0.97.0', + 'yandexcloud>=0.122.0', ] zendesk = [ 'zdesk', diff --git a/tests/providers/yandex/operators/test_yandexcloud_dataproc.py b/tests/providers/yandex/operators/test_yandexcloud_dataproc.py index d52607ceb8739..f54087c742af9 100644 --- a/tests/providers/yandex/operators/test_yandexcloud_dataproc.py +++ b/tests/providers/yandex/operators/test_yandexcloud_dataproc.py @@ -60,6 +60,9 @@ 'cFDe6faKCxH6iDRteo4D8L8BxwzN42uZSB0nfmjkIxFTcEU3mFSXEbWByg78aoddMrAAjatyrhH1pON6P0=' ] +# https://cloud.yandex.com/en-ru/docs/logging/concepts/log-group +LOG_GROUP_ID = 'my_log_group_id' + class DataprocClusterCreateOperatorTest(TestCase): def setUp(self): @@ -87,6 +90,7 @@ def test_create_cluster(self, create_cluster_mock, *_): connection_id=CONNECTION_ID, s3_bucket=S3_BUCKET_NAME_FOR_LOGS, cluster_image_version=CLUSTER_IMAGE_VERSION, + log_group_id=LOG_GROUP_ID, ) context = {'task_instance': MagicMock()} operator.execute(context) @@ -122,6 +126,7 @@ def test_create_cluster(self, create_cluster_mock, *_): ], subnet_id='my_subnet_id', zone='ru-central1-c', + log_group_id=LOG_GROUP_ID, ) context['task_instance'].xcom_push.assert_has_calls( [ @@ -300,6 +305,9 @@ def test_create_spark_job_operator(self, create_spark_job_mock, *_): main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar', name='Spark job', properties={'spark.submit.deployMode': 'cluster'}, + packages=None, + repositories=None, + exclude_packages=None, ) @patch('airflow.providers.yandex.hooks.yandex.YandexCloudBaseHook._get_credentials') @@ -359,4 +367,7 @@ def test_create_pyspark_job_operator(self, create_pyspark_job_mock, *_): name='Pyspark job', properties={'spark.submit.deployMode': 'cluster'}, python_file_uris=['s3a://some-in-bucket/jobs/sources/pyspark-001/geonames.py'], + packages=None, + repositories=None, + exclude_packages=None, ) From ac6b016e3f56e4e743eb24312461d5d0baa361c4 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 14 Dec 2021 17:33:21 +0100 Subject: [PATCH 094/250] Speed up Helm Upgrade tests (#20289) The Helm Upgrade tests took a long 
time on Public Runners. This is in part because we were running tests before and after upgrade, but we do not need to run them before the upgrade, simply because those tests are already run elsewhere. Also increased the timeout for the Upgrade Job - just in case it will still not be enough (cherry picked from commit 8c7bdfc6a315a73057f6bff022d0df7897cf11af) --- .github/workflows/ci.yml | 2 +- scripts/ci/kubernetes/ci_run_helm_upgrade.sh | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1daf44e844bf0..28e4493911bc4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1132,7 +1132,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" retention-days: 7 tests-helm-executor-upgrade: - timeout-minutes: 50 + timeout-minutes: 80 name: Helm Chart Executor Upgrade runs-on: ${{ fromJson(needs.build-info.outputs.runsOn) }} needs: [build-info, prod-images] diff --git a/scripts/ci/kubernetes/ci_run_helm_upgrade.sh b/scripts/ci/kubernetes/ci_run_helm_upgrade.sh index 1e9b94c7af2a0..f07455db7bb0f 100755 --- a/scripts/ci/kubernetes/ci_run_helm_upgrade.sh +++ b/scripts/ci/kubernetes/ci_run_helm_upgrade.sh @@ -18,11 +18,7 @@ # shellcheck source=scripts/ci/libraries/_script_init.sh . "$( dirname "${BASH_SOURCE[0]}" )/../libraries/_script_init.sh" -EXECUTOR=KubernetesExecutor -export EXECUTOR - -# We started with KubernetesExecutor. Let's run tests first -"$( dirname "${BASH_SOURCE[0]}" )/ci_run_kubernetes_tests.sh" +# There is no need to run tests before upgrade (other tests do that). Let's test it after. for EXECUTOR in CeleryExecutor KubernetesExecutor do kind::upgrade_airflow_with_helm "${EXECUTOR}" From 09602fcee7ccced177a78367486fbee9cc0508a3 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 14 Dec 2021 19:11:43 +0100 Subject: [PATCH 095/250] Fix race condition when flake checks run in parallel (#20294) The Flake checks run in parallel and when you had an image which required rebuild, it performed additional check on whether the image needs build or "pull+build". When it was run in parallel a temporary file containing hash of the remote image could be overwritten and emptied while another process was reading it which resulted in error when running flake command. This has been changed - the files are now stored in a temporary files - unique to each of the processes running in parallel and the file in question is moved as an atomic operation so it will never become empty. 
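A minimal bash sketch of the pattern this fix relies on - write to a per-process temporary file, then publish it with a single rename - using illustrative names only (not the exact variables used in `_build_images.sh`):

```bash
# Each parallel process gets its own private temporary file in the target
# directory, so no other process can overwrite it while it is being written.
remote_hash="example-hash-value"   # placeholder for the hash extracted from the manifest image
tmp_hash_file="$(mktemp "manifests/remote-build-cache-hash-XXXXXX")"
echo "${remote_hash}" > "${tmp_hash_file}"

# rename(2) is atomic when source and target are on the same filesystem, so a
# concurrent reader sees either the previous file or the complete new one,
# never an empty or half-written file.
mv "${tmp_hash_file}" "manifests/remote-build-cache-hash"
```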
(cherry picked from commit 0b7734ee36670a70363b5866f7a064a5a0d67be8) --- scripts/ci/libraries/_build_images.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index 9d5096bb64aff..ec3608b922f3d 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -305,9 +305,10 @@ function build_images::get_local_build_cache_hash() { function build_images::get_remote_image_build_cache_hash() { set +e local remote_image_container_id_file - remote_image_container_id_file="${AIRFLOW_SOURCES}/manifests/remote-airflow-manifest-image-${PYTHON_MAJOR_MINOR_VERSION}" + remote_image_container_id_file="$(mktemp)" local remote_image_build_cache_file - remote_image_build_cache_file="${AIRFLOW_SOURCES}/manifests/remote-build-cache-hash-${PYTHON_MAJOR_MINOR_VERSION}" + remote_image_build_cache_file=$(mktemp) + local target_remote_cache_file="${AIRFLOW_SOURCES}/manifests/remote-build-cache-hash-${PYTHON_MAJOR_MINOR_VERSION}" # Pull remote manifest image if ! docker_v pull "${AIRFLOW_CI_REMOTE_MANIFEST_IMAGE}" 2>/dev/null >/dev/null; then verbosity::print_info @@ -326,10 +327,13 @@ function build_images::get_remote_image_build_cache_hash() { # Extract manifest and store it in local file docker_v cp "$(cat "${remote_image_container_id_file}"):/build-cache-hash" \ "${remote_image_build_cache_file}" + # The `mv` is an atomic operation so even if we run it in parallel (for example in flake) it will + # never be empty (happened in the past) + mv "${remote_image_build_cache_file}" "${target_remote_cache_file}" docker_v rm --force "$(cat "${remote_image_container_id_file}")" rm -f "${remote_image_container_id_file}" verbosity::print_info - verbosity::print_info "Remote build cache hash: '$(cat "${remote_image_build_cache_file}")'" + verbosity::print_info "Remote build cache hash: '$(cat "${target_remote_cache_file}")'" verbosity::print_info } From 53d73dccc408d117a464c1a4de76fcbf218ddee6 Mon Sep 17 00:00:00 2001 From: Aakcht Date: Sat, 18 Dec 2021 00:39:34 +0300 Subject: [PATCH 096/250] Change default python version in docker image docs (#20389) (cherry picked from commit 78e4d16d970dbcb599d9c7f8df2a1a4273649ae8) --- docs/docker-stack/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docker-stack/index.rst b/docs/docker-stack/index.rst index 96ce305be4d11..37018445aa471 100644 --- a/docs/docker-stack/index.rst +++ b/docs/docker-stack/index.rst @@ -46,9 +46,9 @@ for all the supported Python versions. You can find the following images there (Assuming Airflow version |airflow-version|): -* :subst-code:`apache/airflow:latest` - the latest released Airflow image with default Python version (3.6 currently) +* :subst-code:`apache/airflow:latest` - the latest released Airflow image with default Python version (3.7 currently) * :subst-code:`apache/airflow:latest-pythonX.Y` - the latest released Airflow image with specific Python version -* :subst-code:`apache/airflow:|airflow-version|` - the versioned Airflow image with default Python version (3.6 currently) +* :subst-code:`apache/airflow:|airflow-version|` - the versioned Airflow image with default Python version (3.7 currently) * :subst-code:`apache/airflow:|airflow-version|-pythonX.Y` - the versioned Airflow image with specific Python version Those are "reference" images. 
They contain the most common set of extras, dependencies and providers that are From 793aaa939a645511a4fa3f4c37a9c7fc7675b109 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sat, 18 Dec 2021 16:14:42 +0100 Subject: [PATCH 097/250] Limit Snowflake connector to< 2.7.2 (#20395) The Snowflake connector 2.7.2 requires pyarrow to be >=6.0.0 (but it has no "install_requires" for it - it checks it dynamically and prints warning when imported. We should limit the provider until apache-beam will remove the pyarrow < 6.0.0 limitation. (cherry picked from commit 0050e44f473ad2802c882cf008846a32c83d009d) --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6c86df0b0bcc6..687153298e416 100644 --- a/setup.py +++ b/setup.py @@ -465,7 +465,9 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'slack_sdk>=3.0.0,<4.0.0', ] snowflake = [ - 'snowflake-connector-python>=2.4.1', + # Snowflake connector 2.7.2 requires pyarrow >=6.0.0 but apache-beam requires < 6.0.0 + # We should remove the limitation when apache-beam upgrades pyarrow + 'snowflake-connector-python>=2.4.1,<2.7.2', # The snowflake-alchemy 1.2.5 introduces a hard dependency on sqlalchemy>=1.4.0, but they didn't define # this requirements in setup.py, so pip cannot figure out the correct set of dependencies. # See: https://github.com/snowflakedb/snowflake-sqlalchemy/issues/234 From 2dbbe2bbac35a77c9e3dd16ea1fc6ed861ef5db1 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 20 Dec 2021 22:22:54 +0100 Subject: [PATCH 098/250] Add pre-commit that checks credentials are not persisted in CI (#20430) For security reason we should not persist credentials on checking out code during GitHub actions. This pre-commit prevents this from happening. 
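Once the hook added below is part of `.pre-commit-config.yaml`, it can be exercised locally like any other pre-commit check, for example (assuming `pre-commit` is installed):

```bash
# Run only the new check against every workflow file tracked in the repo
pre-commit run persist-credentials-disabled --all-files

# Or call the underlying script directly on selected workflow files
./scripts/ci/pre_commit/pre_commit_checkout_no_credentials.py .github/workflows/ci.yml
```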
(cherry picked from commit 3ccbd4f4ee4f27c08ab39aa61aa0cf1e631bd154) --- .pre-commit-config.yaml | 7 ++ BREEZE.rst | 16 ++-- STATIC_CODE_CHECKS.rst | 2 + breeze-complete | 1 + .../pre_commit_checkout_no_credentials.py | 87 +++++++++++++++++++ 5 files changed, 105 insertions(+), 8 deletions(-) create mode 100755 scripts/ci/pre_commit/pre_commit_checkout_no_credentials.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5ba9cc682f1f4..9d0ff65000b2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -658,6 +658,13 @@ repos: files: airflow/config_templates/config\.yml$ require_serial: true additional_dependencies: ['jsonschema==3.2.0', 'PyYAML==5.3.1', 'requests==2.25.0'] + - id: persist-credentials-disabled + name: Check that workflow files have persist-credentials disabled + entry: ./scripts/ci/pre_commit/pre_commit_checkout_no_credentials.py + language: python + pass_filenames: true + files: \.github/workflows/.*\.yml$ + additional_dependencies: ['PyYAML', 'rich'] - id: ui-lint name: ESLint against airflow/ui language: node diff --git a/BREEZE.rst b/BREEZE.rst index b10fd6c6cdc9c..b2593235af601 100644 --- a/BREEZE.rst +++ b/BREEZE.rst @@ -2196,14 +2196,14 @@ This is the current syntax for `./breeze <./breeze>`_: fix-encoding-pragma flake8 flynt forbid-tabs helm-lint identity incorrect-use-of-LoggingMixin insert-license isort json-schema language-matters lint-dockerfile lint-openapi markdownlint mermaid mixed-line-ending mypy mypy-helm - no-providers-in-core-examples no-relative-imports pre-commit-descriptions - pre-commit-hook-names pretty-format-json provide-create-sessions - providers-changelogs providers-init-file providers-subpackages-init-file - provider-yamls pydevd pydocstyle python-no-log-warn pyupgrade restrict-start_date - rst-backticks setup-order setup-extra-packages shellcheck sort-in-the-wild - sort-spelling-wordlist stylelint trailing-whitespace ui-lint update-breeze-file - update-extras update-local-yml-file update-setup-cfg-file update-versions - verify-db-migrations-documented version-sync www-lint yamllint yesqa + no-providers-in-core-examples no-relative-imports persist-credentials-disabled + pre-commit-descriptions pre-commit-hook-names pretty-format-json + provide-create-sessions providers-changelogs providers-init-file + providers-subpackages-init-file provider-yamls pydevd pydocstyle python-no-log-warn + pyupgrade restrict-start_date rst-backticks setup-order setup-extra-packages + shellcheck sort-in-the-wild sort-spelling-wordlist stylelint trailing-whitespace + ui-lint update-breeze-file update-extras update-local-yml-file update-setup-cfg-file + update-versions verify-db-migrations-documented version-sync www-lint yamllint yesqa You can pass extra arguments including options to the pre-commit framework as passed after --. For example: diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index 81bc6de62c156..5eec049786db9 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -216,6 +216,8 @@ require Breeze Docker images to be installed locally. 
------------------------------------ ---------------------------------------------------------------- ------------ ``mypy`` Runs mypy * ------------------------------------ ---------------------------------------------------------------- ------------ +``persist-credentials-disabled`` Check that workflow files have persist-credentials disabled +------------------------------------ ---------------------------------------------------------------- ------------ ``pre-commit-descriptions`` Check if all pre-commits are described in docs ------------------------------------ ---------------------------------------------------------------- ------------ ``pre-commit-hook-names`` Check that hook names are not overly long diff --git a/breeze-complete b/breeze-complete index 1d673e04222b1..fcf902321a45a 100644 --- a/breeze-complete +++ b/breeze-complete @@ -119,6 +119,7 @@ mypy mypy-helm no-providers-in-core-examples no-relative-imports +persist-credentials-disabled pre-commit-descriptions pre-commit-hook-names pretty-format-json diff --git a/scripts/ci/pre_commit/pre_commit_checkout_no_credentials.py b/scripts/ci/pre_commit/pre_commit_checkout_no_credentials.py new file mode 100755 index 0000000000000..20625c90923c7 --- /dev/null +++ b/scripts/ci/pre_commit/pre_commit_checkout_no_credentials.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +from pathlib import Path + +import yaml +from rich.console import Console + +if __name__ not in ("__main__", "__mp_main__"): + raise SystemExit( + "This file is intended to be executed as an executable program. You cannot use it as a module." + f"To run this script, run the ./{__file__} command [FILE] ..." 
+ ) + + +console = Console(color_system="standard", width=200) + + +def check_file(the_file: Path) -> int: + """Returns number of wrong checkout instructions in the workflow file""" + error_num = 0 + res = yaml.safe_load(the_file.read_text()) + console.print(f"Checking file [yellow]{the_file}[/]") + for job in res['jobs'].values(): + for step in job['steps']: + uses = step.get('uses') + pretty_step = yaml.safe_dump(step, indent=2) + if uses is not None and uses.startswith('actions/checkout'): + with_clause = step.get('with') + if with_clause is None: + console.print(f"\n[red]The `with` clause is missing in step:[/]\n\n{pretty_step}") + error_num += 1 + continue + persist_credentials = with_clause.get("persist-credentials") + if persist_credentials is None: + console.print( + "\n[red]The `with` clause does not have persist-credentials in step:[/]" + f"\n\n{pretty_step}" + ) + error_num += 1 + continue + else: + if persist_credentials: + console.print( + "\n[red]The `with` clause have persist-credentials=True in step:[/]" + f"\n\n{pretty_step}" + ) + error_num += 1 + continue + return error_num + + +if __name__ == '__main__': + total_err_num = 0 + for a_file in sys.argv[1:]: + total_err_num += check_file(Path(a_file)) + if total_err_num: + console.print( + """ +[red]There are are some checkout instructions in github workflows that have no "persist_credentials" +set to False.[/] + +For security reasons - make sure all of the checkout actions have persist_credentials set, similar to: + + - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" + uses: actions/checkout@v2 + with: + persist-credentials: false + +""" + ) + sys.exit(1) From 48f39d1277289d12b2aa6717983d5c8247fe374f Mon Sep 17 00:00:00 2001 From: Emmanuel Roux <15956441+e-roux@users.noreply.github.com> Date: Wed, 22 Dec 2021 01:01:47 +0100 Subject: [PATCH 099/250] Add custom pip.conf to docker-context-files (#20445) (cherry picked from commit 0e4d1e43fdd357f9e3aa83158b76d133fdd9ee3d) --- Dockerfile | 5 +++-- docs/docker-stack/build.rst | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 088d561af7005..b5113be0ff735 100644 --- a/Dockerfile +++ b/Dockerfile @@ -199,8 +199,9 @@ ENV PATH=${PATH}:/opt/mssql-tools/bin COPY docker-context-files /docker-context-files -RUN if [[ -f /docker-context-files/.pypirc ]]; then \ - cp /docker-context-files/.pypirc /root/.pypirc; \ +RUN if [[ -f /docker-context-files/pip.conf ]]; then \ + mkdir -p /root/.config/pip; \ + cp /docker-context-files/pip.conf /root/.config/pip/pip.conf; \ fi ENV AIRFLOW_PRE_CACHED_PIP_PACKAGES=${AIRFLOW_PRE_CACHED_PIP_PACKAGES} \ diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst index a09f879835977..165c5b0b8c889 100644 --- a/docs/docker-stack/build.rst +++ b/docs/docker-stack/build.rst @@ -522,12 +522,12 @@ described below but here is an example of rather complex command to customize th based on example in `this comment `_: In case you need to use your custom PyPI package indexes, you can also customize PYPI sources used during -image build by adding a ``docker-context-files``/``.pypirc`` file when building the image. -This ``.pypirc`` will not be committed to the repository (it is added to ``.gitignore``) and it will not be +image build by adding a ``docker-context-files``/``pip.conf`` file when building the image. +This ``pip.conf`` will not be committed to the repository (it is added to ``.gitignore``) and it will not be present in the final production image. 
It is added and used only in the build segment of the image. -Therefore this ``.pypirc`` file can safely contain list of package indexes you want to use, -usernames and passwords used for authentication. More details about ``.pypirc`` file can be found in the -`pypirc specification `_. +Therefore this ``pip.conf`` file can safely contain list of package indexes you want to use, +usernames and passwords used for authentication. More details about ``pip.conf`` file can be found in the +`pip configuration `_. Such customizations are independent of the way how airflow is installed. From 0746bc837525d7c731d7c59a2e47cdc489fba487 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 22 Dec 2021 16:10:51 +0100 Subject: [PATCH 100/250] Checks if the user running Breeze has permissions to run docker cmd (#20462) (cherry picked from commit 3ccabbcf93fa695513163903c412bf65f52e2d4e) --- scripts/ci/libraries/_initialization.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 87a64aba33c1a..197b1eec318fe 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -915,6 +915,16 @@ function initialization::ver() { } function initialization::check_docker_version() { + local permission_denied + permission_denied=$(docker info 2>/dev/null | grep "ERROR: Got permission denied while trying " || true) + if [[ ${permission_denied} != "" ]]; then + echo + echo "${COLOR_RED}ERROR: You have 'permission denied' error when trying to communicate with docker.${COLOR_RESET}" + echo + echo "${COLOR_YELLOW}Most likely you need to add your user to 'docker' group: https://docs.docker.com/engine/install/linux-postinstall/ .${COLOR_RESET}" + echo + exit 1 + fi local docker_version # In GitHub Code QL, the version of docker has +azure suffix which we should remove docker_version=$(docker version --format '{{.Client.Version}}' | sed 's/\+.*$//' || true) From ef319b06949951b144003d5362179fd8790c5ded Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 23 Dec 2021 13:38:57 +0100 Subject: [PATCH 101/250] Add autoflake precommit to automatically remove unused code (#20466) (cherry picked from commit f0cf15cfe3a2f16741063d0368216b7067f38245) --- .pre-commit-config.yaml | 5 ++++ BREEZE.rst | 23 +++++++++++-------- STATIC_CODE_CHECKS.rst | 2 ++ airflow/kubernetes/pod.py | 2 +- airflow/kubernetes/pod_launcher.py | 4 +--- airflow/kubernetes/pod_runtime_info_env.py | 4 +--- airflow/kubernetes/volume.py | 5 +--- airflow/kubernetes/volume_mount.py | 5 +--- ...2c6edca13270_resource_based_permissions.py | 1 - ...resource_based_permissions_for_default_.py | 1 - airflow/timetables/base.py | 1 - airflow/utils/db.py | 2 -- airflow/www/decorators.py | 1 - airflow/www/views.py | 8 ------- breeze-complete | 1 + docs/exts/exampleinclude.py | 2 +- tests/task/__init__.py | 2 -- 17 files changed, 28 insertions(+), 41 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d0ff65000b2d..798237f3f5883 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -262,6 +262,11 @@ repos: ^airflow/_vendor/ - repo: local hooks: + - id: autoflake + name: Remove all unused code + entry: autoflake --remove-all-unused-imports --ignore-init-module-imports --in-place + language: python + additional_dependencies: ['autoflake'] - id: lint-openapi name: Lint OpenAPI using spectral language: docker_image diff --git a/BREEZE.rst b/BREEZE.rst index b2593235af601..823ab5b7d4f48 100644 --- 
a/BREEZE.rst +++ b/BREEZE.rst @@ -115,6 +115,12 @@ Docker in WSL 2 E.g. Run ``cd ~`` and create a development folder in your Linux distro home and git pull the Airflow repo there. +- **WSL 2 Docker mount errors**: + Another reason to use Linux filesystem, is that sometimes - depending on the length of + your path, you might get strange errors when you try start ``Breeze``, such us + ``caused: mount through procfd: not a directory: unknown:``. Therefore checking out + Airflow in Windows-mounted Filesystem is strongly discouraged. + - **WSL 2 Memory Usage** : WSL 2 can consume a lot of memory under the process name "Vmmem". To reclaim the memory after development you can: @@ -125,7 +131,7 @@ Docker in WSL 2 * If no longer using WSL you can shut it down on the Windows Host with the following command: ``wsl --shutdown`` -- **Developing in WSL 2** : +- **Developing in WSL 2**: You can use all the standard Linux command line utilities to develop on WSL 2. Further VS Code supports developing in Windows but remotely executing in WSL. If VS Code is installed on the Windows host system then in the WSL Linux Distro @@ -146,7 +152,7 @@ If you use bash, run this command and re-login: .. code-block:: bash - echo 'export PATH="/usr/local/opt/gnu-getopt/bin:$PATH"' >> ~/.bash_profile + echo 'export PATH="$(brew --prefix)/opt/gnu-getopt/bin:$PATH"' >> ~/.bash_profile . ~/.bash_profile @@ -154,7 +160,7 @@ If you use zsh, run this command and re-login: .. code-block:: bash - echo 'export PATH="/usr/local/opt/gnu-getopt/bin:$PATH"' >> ~/.zprofile + echo 'export PATH="$(brew --prefix)/opt/gnu-getopt/bin:$PATH"' >> ~/.zprofile . ~/.zprofile @@ -387,11 +393,10 @@ you can also start integrations (separate Docker images) if specified as extra ` chose which backend database should be used with ``--backend`` flag and python version with ``--python`` flag. You can also have breeze launch Airflow automatically ``breeze start-airflow``, this will drop you in a -tmux session with four panes: +tmux session with three panes: - one to monitor the scheduler, - one for the webserver, - - one monitors and compiles JavaScript files, - one with a shell for additional commands. Managing Prod environment (with ``--production-image`` flag): @@ -1275,7 +1280,7 @@ This is the current syntax for `./breeze <./breeze>`_: -t, --install-airflow-reference INSTALL_AIRFLOW_REFERENCE Installs Airflow directly from reference in GitHub when building PROD image. - This can be a GitHub branch like main or v2-1-test, or a tag like 2.1.0a1. + This can be a GitHub branch like main or v2-2-test, or a tag like 2.2.0rc1. --installation-method INSTALLATION_METHOD Method of installing Airflow in PROD image - either from the sources ('.') @@ -2188,8 +2193,8 @@ This is the current syntax for `./breeze <./breeze>`_: you would like to run or 'all' to run all checks. 
One of: all airflow-config-yaml airflow-providers-available airflow-provider-yaml-files-ok - base-operator bats-tests bats-in-container-tests black blacken-docs boring-cyborg - build build-providers-dependencies check-apache-license check-builtin-literals + autoflake base-operator bats-tests bats-in-container-tests black blacken-docs + boring-cyborg build build-providers-dependencies check-apache-license check-builtin-literals check-executables-have-shebangs check-extras-order check-hooks-apply check-integrations check-merge-conflict check-xml daysago-import-check debug-statements detect-private-key doctoc dont-use-safe-filter end-of-file-fixer @@ -2448,7 +2453,7 @@ This is the current syntax for `./breeze <./breeze>`_: -t, --install-airflow-reference INSTALL_AIRFLOW_REFERENCE Installs Airflow directly from reference in GitHub when building PROD image. - This can be a GitHub branch like main or v2-1-test, or a tag like 2.1.0a1. + This can be a GitHub branch like main or v2-2-test, or a tag like 2.2.0rc1. --installation-method INSTALLATION_METHOD Method of installing Airflow in PROD image - either from the sources ('.') diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index 5eec049786db9..9afca67a9669c 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -132,6 +132,8 @@ require Breeze Docker images to be installed locally. ------------------------------------ ---------------------------------------------------------------- ------------ ``airflow-provider-yaml-files-ok`` Checks that providers YAML files are valid ------------------------------------ ---------------------------------------------------------------- ------------ +``autoflake`` Remove unused imports and unnecessary code +------------------------------------ ---------------------------------------------------------------- ------------ ``base-operator`` Checks that BaseOperator is imported properly ------------------------------------ ---------------------------------------------------------------- ------------ ``bats-tests`` Runs BATS bash unit tests diff --git a/airflow/kubernetes/pod.py b/airflow/kubernetes/pod.py index 6bced0f55da0d..733f18477779e 100644 --- a/airflow/kubernetes/pod.py +++ b/airflow/kubernetes/pod.py @@ -24,7 +24,7 @@ import warnings with warnings.catch_warnings(): - from airflow.providers.cncf.kubernetes.backcompat.pod import Port, Resources + from airflow.providers.cncf.kubernetes.backcompat.pod import Port, Resources # noqa: autoflake warnings.warn( "This module is deprecated. Please use `kubernetes.client.models for V1ResourceRequirements and Port.", diff --git a/airflow/kubernetes/pod_launcher.py b/airflow/kubernetes/pod_launcher.py index 48a90605d442a..0b9cbbe45a481 100644 --- a/airflow/kubernetes/pod_launcher.py +++ b/airflow/kubernetes/pod_launcher.py @@ -19,6 +19,4 @@ This module is deprecated. Please use :mod:`kubernetes.client.models` for V1ResourceRequirements and Port. """ -# flake8: noqa - -from airflow.kubernetes.pod_launcher_deprecated import PodLauncher, PodStatus +from airflow.kubernetes.pod_launcher_deprecated import PodLauncher, PodStatus # noqa: autoflake diff --git a/airflow/kubernetes/pod_runtime_info_env.py b/airflow/kubernetes/pod_runtime_info_env.py index 4d7bd9f8773eb..5dbbd4249d211 100644 --- a/airflow/kubernetes/pod_runtime_info_env.py +++ b/airflow/kubernetes/pod_runtime_info_env.py @@ -16,12 +16,10 @@ # specific language governing permissions and limitations # under the License. """This module is deprecated. 
Please use :mod:`kubernetes.client.models.V1EnvVar`.""" -# flake8: noqa - import warnings with warnings.catch_warnings(): - from airflow.providers.cncf.kubernetes.backcompat.pod_runtime_info_env import PodRuntimeInfoEnv + from airflow.providers.cncf.kubernetes.backcompat.pod_runtime_info_env import PodRuntimeInfoEnv # noqa warnings.warn( "This module is deprecated. Please use `kubernetes.client.models.V1EnvVar`.", diff --git a/airflow/kubernetes/volume.py b/airflow/kubernetes/volume.py index 7fd58e22998cd..90bd4c23bbd25 100644 --- a/airflow/kubernetes/volume.py +++ b/airflow/kubernetes/volume.py @@ -16,13 +16,10 @@ # specific language governing permissions and limitations # under the License. """This module is deprecated. Please use :mod:`kubernetes.client.models.V1Volume`.""" -# flake8: noqa - - import warnings with warnings.catch_warnings(): - from airflow.providers.cncf.kubernetes.backcompat.volume import Volume + from airflow.providers.cncf.kubernetes.backcompat.volume import Volume # noqa: autoflake warnings.warn( "This module is deprecated. Please use `kubernetes.client.models.V1Volume`.", diff --git a/airflow/kubernetes/volume_mount.py b/airflow/kubernetes/volume_mount.py index 08bc5d36782d2..aff5f30d5840e 100644 --- a/airflow/kubernetes/volume_mount.py +++ b/airflow/kubernetes/volume_mount.py @@ -16,13 +16,10 @@ # specific language governing permissions and limitations # under the License. """This module is deprecated. Please use :mod:`kubernetes.client.models.V1VolumeMount`.""" -# flake8: noqa - - import warnings with warnings.catch_warnings(): - from airflow.providers.cncf.kubernetes.backcompat.volume_mount import VolumeMount + from airflow.providers.cncf.kubernetes.backcompat.volume_mount import VolumeMount # noqa: autoflake warnings.warn( "This module is deprecated. Please use `kubernetes.client.models.V1VolumeMount`.", diff --git a/airflow/migrations/versions/2c6edca13270_resource_based_permissions.py b/airflow/migrations/versions/2c6edca13270_resource_based_permissions.py index fc792d3789648..85673ce7fb34c 100644 --- a/airflow/migrations/versions/2c6edca13270_resource_based_permissions.py +++ b/airflow/migrations/versions/2c6edca13270_resource_based_permissions.py @@ -317,4 +317,3 @@ def upgrade(): def downgrade(): """Unapply Resource based permissions.""" - pass diff --git a/airflow/migrations/versions/a13f7613ad25_resource_based_permissions_for_default_.py b/airflow/migrations/versions/a13f7613ad25_resource_based_permissions_for_default_.py index 294f59cb90081..917bb8d8896bf 100644 --- a/airflow/migrations/versions/a13f7613ad25_resource_based_permissions_for_default_.py +++ b/airflow/migrations/versions/a13f7613ad25_resource_based_permissions_for_default_.py @@ -169,4 +169,3 @@ def upgrade(): def downgrade(): """Unapply Resource based permissions.""" - pass diff --git a/airflow/timetables/base.py b/airflow/timetables/base.py index e97f2532b9e1b..926055d6d3080 100644 --- a/airflow/timetables/base.py +++ b/airflow/timetables/base.py @@ -145,7 +145,6 @@ def validate(self) -> None: :raises: AirflowTimetableInvalid on validation failure. 
""" - pass @property def summary(self) -> str: diff --git a/airflow/utils/db.py b/airflow/utils/db.py index 0b058af533601..f35d1659f8cb9 100644 --- a/airflow/utils/db.py +++ b/airflow/utils/db.py @@ -663,7 +663,6 @@ def check_conn_id_duplicates(session=None) -> Iterable[str]: except (exc.OperationalError, exc.ProgrammingError): # fallback if tables hasn't been created yet session.rollback() - pass if dups: yield ( 'Seems you have non unique conn_id in connection table.\n' @@ -686,7 +685,6 @@ def check_conn_type_null(session=None) -> Iterable[str]: except (exc.OperationalError, exc.ProgrammingError, exc.InternalError): # fallback if tables hasn't been created yet session.rollback() - pass if n_nulls: yield ( diff --git a/airflow/www/decorators.py b/airflow/www/decorators.py index f6f2ed0b2a3d5..080fe682991c2 100644 --- a/airflow/www/decorators.py +++ b/airflow/www/decorators.py @@ -65,7 +65,6 @@ def wrapper(*args, **kwargs): logger.exception( "Failed to parse execution_date from the request: %s", execution_date_value ) - pass session.add(log) diff --git a/airflow/www/views.py b/airflow/www/views.py index 51782832853a0..f22735c7e1f83 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -4726,22 +4726,14 @@ def class_permission_name(self, name): class CustomUserLDAPModelView(MultiResourceUserMixin, UserLDAPModelView): """Customize permission names for FAB's builtin UserLDAPModelView.""" - pass - class CustomUserOAuthModelView(MultiResourceUserMixin, UserOAuthModelView): """Customize permission names for FAB's builtin UserOAuthModelView.""" - pass - class CustomUserOIDModelView(MultiResourceUserMixin, UserOIDModelView): """Customize permission names for FAB's builtin UserOIDModelView.""" - pass - class CustomUserRemoteUserModelView(MultiResourceUserMixin, UserRemoteUserModelView): """Customize permission names for FAB's builtin UserRemoteUserModelView.""" - - pass diff --git a/breeze-complete b/breeze-complete index fcf902321a45a..b36b4880f4c0b 100644 --- a/breeze-complete +++ b/breeze-complete @@ -77,6 +77,7 @@ all airflow-config-yaml airflow-providers-available airflow-provider-yaml-files-ok +autoflake base-operator bats-tests bats-in-container-tests diff --git a/docs/exts/exampleinclude.py b/docs/exts/exampleinclude.py index 2e8126da3918a..64a2915970685 100644 --- a/docs/exts/exampleinclude.py +++ b/docs/exts/exampleinclude.py @@ -34,7 +34,7 @@ from sphinx.util.nodes import set_source_info try: - import sphinx_airflow_theme + import sphinx_airflow_theme # noqa: autoflake airflow_theme_is_available = True except ImportError: diff --git a/tests/task/__init__.py b/tests/task/__init__.py index a5912f8b5801f..9e7116e7f735c 100644 --- a/tests/task/__init__.py +++ b/tests/task/__init__.py @@ -15,7 +15,5 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- # flake8: noqa - from .task_runner import * From 58a65e1acfd462deedd0fb1f6071a484864dcc07 Mon Sep 17 00:00:00 2001 From: Ashwin Madavan Date: Sat, 25 Dec 2021 23:26:11 -0800 Subject: [PATCH 102/250] Bump PyJWT from `<2` to `<3` (#20490) * Bump pyjwt from `<2` to `<3` * Update setup.cfg (cherry picked from commit 9e3fad9209031e52d3bcca6089c7813b4d1b8408) --- setup.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 68d1efb9ff6f0..befdee356c873 100644 --- a/setup.cfg +++ b/setup.cfg @@ -132,8 +132,7 @@ install_requires = pep562~=1.0;python_version<"3.7" psutil>=4.2.0, <6.0.0 pygments>=2.0.1, <3.0 - # Required for flask-jwt-extended and msal - pyjwt<2 + pyjwt<3 # python daemon crashes with 'socket operation on non-socket' for python 3.8+ in version < 2.2.4 # https://pagure.io/python-daemon/issue/34 python-daemon>=2.2.4 From 933716b258841251b47c9c3c6d0152b19971b182 Mon Sep 17 00:00:00 2001 From: John Cheng Date: Tue, 28 Dec 2021 03:28:50 +0800 Subject: [PATCH 103/250] Fix: pin pymongo < 4.0.0 (#20511) (cherry picked from commit f85880e989d7751cfa3ae2d4665d7cc0cb3cc945) --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 687153298e416..e3fe301627ae9 100644 --- a/setup.py +++ b/setup.py @@ -389,7 +389,8 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version leveldb = ['plyvel'] mongo = [ 'dnspython>=1.13.0,<3.0.0', - 'pymongo>=3.6.0', + # pymongo 4.0.0 removes connection option `ssl_cert_reqs` which is used in providers-mongo/2.2.0 + 'pymongo>=3.6.0,<4.0.0', ] mssql = [ 'pymssql~=2.1,>=2.1.5', From 0df0dcd7fef79896229dc7637ead3e771c70aa4f Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 27 Dec 2021 21:07:51 +0100 Subject: [PATCH 104/250] Increase time limit for Helm chart unit tests (#20525) Sometimes the helm chart unit tests exceed the allocated time for the job for Public Runners by a small margin. (9X% tests successful). This change increases the limit. (cherry picked from commit f99f2c39a8729976517947d4c904d68b055b976c) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 28e4493911bc4..36d4e8252983f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -628,7 +628,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" PACKAGE_FORMAT: "sdist" tests-helm: - timeout-minutes: 40 + timeout-minutes: 80 name: "Python unit tests for helm chart" runs-on: ${{ fromJson(needs.build-info.outputs.runsOn) }} needs: [build-info, ci-images] From 95e71b731feccc6c75b3ae5494e8bd8aac3d9ba6 Mon Sep 17 00:00:00 2001 From: Ashwin Madavan Date: Tue, 28 Dec 2021 14:58:50 -0800 Subject: [PATCH 105/250] Remove PyJWT upper bound from Dockerfile (#20503) (cherry picked from commit 528baf4fd301ff0f97edbf59cb42888a71c6b95c) --- Dockerfile | 3 +-- Dockerfile.ci | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index b5113be0ff735..b3246bb955bed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -239,10 +239,9 @@ ARG INSTALL_FROM_PYPI="true" # Those are additional constraints that are needed for some extras but we do not want to # Force them on the main Airflow package. 
# * certifi<2021.0.0 required to keep snowflake happy -# * pyjwt<2.0.0: flask-jwt-extended requires it # * dill<0.3.3 required by apache-beam # * google-ads<14.0.1 required to prevent updating google-python-api>=2.0.0 -ARG EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="pyjwt<2.0.0 dill<0.3.3 certifi<2021.0.0 google-ads<14.0.1" +ARG EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="dill<0.3.3 certifi<2021.0.0 google-ads<14.0.1" ENV ADDITIONAL_PYTHON_DEPS=${ADDITIONAL_PYTHON_DEPS} \ INSTALL_FROM_DOCKER_CONTEXT_FILES=${INSTALL_FROM_DOCKER_CONTEXT_FILES} \ diff --git a/Dockerfile.ci b/Dockerfile.ci index b4eb4653c5db0..5ea6c990b3651 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -272,10 +272,9 @@ ENV AIRFLOW_REPO=${AIRFLOW_REPO}\ # force them on the main Airflow package. Those limitations are: # * certifi<2021.0.0: required by snowflake provider # * lazy-object-proxy<1.5.0: required by astroid -# * pyjwt<2.0.0: flask-jwt-extended requires it # * dill<0.3.3 required by apache-beam # * google-ads<14.0.1 required to prevent updating google-python-api>=2.0.0 -ARG EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="lazy-object-proxy<1.5.0 pyjwt<2.0.0 dill<0.3.3 certifi<2021.0.0 google-ads<14.0.1" +ARG EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS="lazy-object-proxy<1.5.0 dill<0.3.3 certifi<2021.0.0 google-ads<14.0.1" ARG UPGRADE_TO_NEWER_DEPENDENCIES="false" ENV EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS=${EAGER_UPGRADE_ADDITIONAL_REQUIREMENTS} \ UPGRADE_TO_NEWER_DEPENDENCIES=${UPGRADE_TO_NEWER_DEPENDENCIES} From bccf2f57c3ca62ad7103571d03ea18013d08f3e5 Mon Sep 17 00:00:00 2001 From: Ashwin Madavan Date: Wed, 29 Dec 2021 05:09:21 -0800 Subject: [PATCH 106/250] Bump croniter from `<1.1` to `<1.2` (#20489) (cherry picked from commit 8a03a505e1df0f9de276038c5509135ac569a667) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index befdee356c873..4698801c91da8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -92,7 +92,7 @@ install_requires = # Required by vendored-in connexion clickclick>=1.2 colorlog>=4.0.2, <6.0 - croniter>=0.3.17, <1.1 + croniter>=0.3.17 cryptography>=0.9.3 dataclasses;python_version<"3.7" dill>=0.2.2, <0.4 From e2946208cee70fb682f0aaaf12a439b03a806846 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 20 Oct 2021 21:57:12 -0600 Subject: [PATCH 107/250] Update "Release Airflow" doc (#19111) Co-authored-by: Tzu-ping Chung (cherry picked from commit 3928eecc024f99088d487957e161880f5fe75ec9) --- dev/README_RELEASE_AIRFLOW.md | 74 ++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 5b7f8e38e1b3c..98db8847e994b 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -42,6 +42,8 @@ - [Publish documentation](#publish-documentation) - [Notify developers of release](#notify-developers-of-release) - [Update Announcements page](#update-announcements-page) + - [Update the bug issue template](#update-the-bug-issue-template) + - [Update default Airflow version in the helm chart](#update-default-airflow-version-in-the-helm-chart) - [Update airflow/config_templates/config.yml file](#update-airflowconfig_templatesconfigyml-file) @@ -171,7 +173,7 @@ branches: `vX-Y-test` and `vX-Y-stable` (for example with `2.1.0rc1` release you ```shell script # First clone the repo - BRANCH_PREFIX=v2-1 + export BRANCH_PREFIX=v2-1 git branch ${BRANCH_PREFIX}-test git branch ${BRANCH_PREFIX}-stable git push origin 
${BRANCH_PREFIX}-test ${BRANCH_PREFIX}-stable @@ -252,24 +254,32 @@ also performs image verification before pushing the images. Subject: -``` -[VOTE] Release Airflow 2.0.2 from 2.0.2rc1 +```shell script +cat < +EOF ``` @@ -515,7 +526,7 @@ Once the vote has been passed, you will need to send a result vote to dev@airflo Subject: ``` -[RESULT][VOTE] Airflow 2.0.2rc3 +[RESULT][VOTE] Release Airflow 2.0.2 from 2.0.2rc3 ``` Message: @@ -572,12 +583,12 @@ export AIRFLOW_DEV_SVN=$(pwd) # svn checkout https://dist.apache.org/repos/dist/release/airflow airflow-release cd svn update +export AIRFLOW_RELEASE_SVN=$(pwd) export RC=2.0.2rc5 export VERSION=${RC/rc?/} # Create new folder for the release -cd airflow-release svn mkdir "${VERSION}" cd "${VERSION}" @@ -607,7 +618,7 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": - Verify the artifacts that would be uploaded: ```shell script - cd "${AIRFLOW_SOURCES}" + cd "${AIRFLOW_RELEASE_SVN}/${VERSION}" twine check dist/* ``` @@ -672,6 +683,7 @@ Documentation for providers can be found in the ``/docs/apache-airflow`` directo ```shell script git clone https://github.com/apache/airflow-site.git airflow-site cd airflow-site + git checkout -b ${VERSION}-docs export AIRFLOW_SITE_DIRECTORY="$(pwd)" ``` @@ -688,13 +700,15 @@ Documentation for providers can be found in the ``/docs/apache-airflow`` directo ./docs/start_doc_server.sh ``` -- Copy the documentation to the ``airflow-site`` repository, create commit and push changes. +- Copy the documentation to the ``airflow-site`` repository, create commit, push changes and open a PR. ```shell script ./docs/publish_docs.py --package-filter apache-airflow --package-filter docker-stack cd "${AIRFLOW_SITE_DIRECTORY}" + git add . git commit -m "Add documentation for Apache Airflow ${VERSION}" git push + # and finally open a PR ``` ## Notify developers of release @@ -722,14 +736,20 @@ The released sources and packages can be downloaded via https://airflow.apache.o Other installation methods are described in https://airflow.apache.org/docs/apache-airflow/stable/installation/ +We also made this version available on PyPI for convenience: +\`pip install apache-airflow\` +https://pypi.org/project/apache-airflow/${VERSION}/ + The documentation is available on: https://airflow.apache.org/ https://airflow.apache.org/docs/apache-airflow/${VERSION}/ Find the CHANGELOG here for more details: - https://airflow.apache.org/docs/apache-airflow/${VERSION}/changelog.html +Container images are published at: +https://hub.docker.com/r/apache/airflow/tags/?page=1&name=${VERSION} + Cheers, EOF @@ -739,6 +759,16 @@ EOF Update "Announcements" page at the [Official Airflow website](https://airflow.apache.org/announcements/) +## Update the bug issue template + +Make sure the version you just released is listed in the bug issue template in `.github/ISSUE_TEMPLATE/airflow_bug_report.yml`. + +## Update default Airflow version in the helm chart + +Update the values of `airflowVersion`, `defaultAirflowTag` and `appVersion` in the helm chart so the next helm chart release +will use the latest released version. You'll need to update `chart/values.yaml`, `chart/values.schema.json` and +`chart/Chart.yaml`. + ## Update airflow/config_templates/config.yml file File `airflow/config_templates/config.yml` contains documentation on all configuration options available in Airflow. The `version_added` fields must be updated when a new Airflow version is released. 
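Two of the shell idioms used in the updated release steps, shown with example values only:

```bash
# Deriving the final version from an RC tag, as done in the SVN upload steps:
export RC=2.0.2rc5
export VERSION=${RC/rc?/}   # drops the first "rc<char>" match, leaving 2.0.2

# The vote e-mail is now generated with a here-document so that variables such
# as ${VERSION} are expanded into the message body (body intentionally elided):
cat <<EOF
[VOTE] Release Airflow ${VERSION} from ${RC}
...
EOF
```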
From 59a6aca0d6ac219212d0067b3ee734ef2bde9f83 Mon Sep 17 00:00:00 2001 From: Daniel Standish <15932138+dstandish@users.noreply.github.com> Date: Tue, 26 Oct 2021 16:31:48 -0700 Subject: [PATCH 108/250] Dev: Clarify file naming in release verification doc (#19233) (cherry picked from commit efdfd15477f92da059fa86b4fa18b6f29cb97feb) --- dev/README_RELEASE_AIRFLOW.md | 10 +++++----- dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md | 16 ++++++++-------- dev/README_RELEASE_PROVIDER_PACKAGES.md | 15 +++++++-------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 98db8847e994b..b84d91c32e60c 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -354,9 +354,9 @@ The files should be present in the sub-folder of The following files should be present (9 files): -* -bin-tar.gz + .asc + .sha512 * -source.tar.gz + .asc + .sha512 -* -.whl + .asc + .sha512 +* .tar.gz + .asc + .sha512 +* -py3-none-any.whl + .asc + .sha512 As a PMC you should be able to clone the SVN repository: @@ -438,8 +438,8 @@ warning. By importing the server in the previous step and importing it via ID fr this is a valid Key already. ``` -Checking apache-airflow-2.0.2rc4-bin.tar.gz.asc -gpg: assuming signed data in 'apache-airflow-2.0.2rc4-bin.tar.gz' +Checking apache-airflow-2.0.2rc4.tar.gz.asc +gpg: assuming signed data in 'apache-airflow-2.0.2rc4.tar.gz' gpg: Signature made sob, 22 sie 2020, 20:28:28 CEST gpg: using RSA key 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B gpg: Good signature from "Kaxil Naik " [unknown] @@ -478,7 +478,7 @@ done You should get output similar to: ``` -Checking apache-airflow-2.0.2rc4-bin.tar.gz.sha512 +Checking apache-airflow-2.0.2rc4.tar.gz.sha512 Checking apache_airflow-2.0.2rc4-py2.py3-none-any.whl.sha512 Checking apache-airflow-2.0.2rc4-source.tar.gz.sha512 ``` diff --git a/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md b/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md index a144389392b9e..cb83f3f892351 100644 --- a/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md +++ b/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md @@ -120,7 +120,7 @@ official Apache releases must not include the rcN suffix. - Rename the sdist ```shell script - mv dist/apache-airflow-upgrade-check-${VERSION%rc?}.tar.gz apache-airflow-upgrade-check-${VERSION}-bin.tar.gz + mv dist/apache-airflow-upgrade-check-${VERSION%rc?}.tar.gz apache-airflow-upgrade-check-${VERSION}.tar.gz mv dist/apache_airflow_upgrade_check-${VERSION%rc?}-py2.py3-none-any.whl apache_airflow_upgrade_check-${VERSION}-py2.py3-none-any.whl ``` @@ -129,7 +129,7 @@ official Apache releases must not include the rcN suffix. ```shell script ${AIRFLOW_REPO_ROOT}/dev/sign.sh apache-airflow-upgrade-check-${VERSION}-source.tar.gz - ${AIRFLOW_REPO_ROOT}/dev/sign.sh apache-airflow-upgrade-check-${VERSION}-bin.tar.gz + ${AIRFLOW_REPO_ROOT}/dev/sign.sh apache-airflow-upgrade-check-${VERSION}.tar.gz ${AIRFLOW_REPO_ROOT}/dev/sign.sh apache_airflow_upgrade_check-${VERSION}-py2.py3-none-any.whl ``` @@ -228,7 +228,7 @@ The files can be downloaded from https://dist.apache.org/repos/dist/dev/airflow/ - apache-airflow-upgrade-check-1.3.0rc1-source.tar.gz is a source release containing the files that made up the binary and wheel releases. -- apache-airflow-upgrade-check-1.3.0rc1-bin.tar.gz is the binary +- apache-airflow-upgrade-check-1.3.0rc1.tar.gz is the binary Python "sdist" release. - apache_airflow_upgrade_check-1.3.0rc1-py2.py3-none-any.whl is the binary Python pre-compiled wheel file. 
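
Before checking signatures and checksums it can help to confirm that every artifact listed above is accompanied by its `.asc` and `.sha512` files. A minimal sketch, assuming the `1.3.0rc1` file names used in this example and that it is run inside the SVN folder of the candidate:

```shell script
for base in apache-airflow-upgrade-check-1.3.0rc1-source.tar.gz \
            apache-airflow-upgrade-check-1.3.0rc1.tar.gz \
            apache_airflow_upgrade_check-1.3.0rc1-py2.py3-none-any.whl
do
    for suffix in "" .asc .sha512
    do
        # Nine files in total are expected: each artifact plus its signature and checksum.
        if [ -f "${base}${suffix}" ]; then
            echo "OK      ${base}${suffix}"
        else
            echo "MISSING ${base}${suffix}"
        fi
    done
done
```

The `dev/check_files.py` script that appears further below performs a similar, more complete verification.
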
@@ -283,9 +283,9 @@ The files should be present in the sub-folder of The following files should be present (9 files): -* -bin-tar.gz + .asc + .sha512 * -source.tar.gz + .asc + .sha512 -* -.whl + .asc + .sha512 +* .tar.gz + .asc + .sha512 +* -py3-none-any.whl + .asc + .sha512 As a PMC you should be able to clone the SVN repository: @@ -360,8 +360,8 @@ warning. By importing the server in the previous step and importing it via ID fr this is a valid Key already. ``` -Checking apache-airflow-upgrade-check-1.3.0rc1-bin.tar.gz.asc -gpg: assuming signed data in 'apache-airflow-upgrade-check-1.3.0rc1-bin.tar.gz' +Checking apache-airflow-upgrade-check-1.3.0rc1.tar.gz.asc +gpg: assuming signed data in 'apache-airflow-upgrade-check-1.3.0rc1.tar.gz' gpg: Signature made Tue 9 Mar 23:22:24 2021 GMT gpg: using RSA key CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F gpg: Good signature from "Kaxil Naik " [ultimate] @@ -400,7 +400,7 @@ done You should get output similar to: ``` -Checking apache-airflow-upgrade-check-1.3.0rc1-bin.tar.gz.sha512 +Checking apache-airflow-upgrade-check-1.3.0rc1.tar.gz.sha512 Checking apache_airflow_upgrade_check-1.3.0rc1-py2.py3-none-any.whl.sha512 Checking apache-airflow-upgrade-check-1.3.0rc1-source.tar.gz.sha512 ``` diff --git a/dev/README_RELEASE_PROVIDER_PACKAGES.md b/dev/README_RELEASE_PROVIDER_PACKAGES.md index 3edb68ec74b08..65f94fb65e458 100644 --- a/dev/README_RELEASE_PROVIDER_PACKAGES.md +++ b/dev/README_RELEASE_PROVIDER_PACKAGES.md @@ -385,7 +385,7 @@ Consider this my (binding) +1. Airflow Providers are available at: https://dist.apache.org/repos/dist/dev/airflow/providers/ -*apache-airflow-providers--*-bin.tar.gz* are the binary +*apache-airflow-providers--*.tar.gz* are the binary Python "sdist" release - they are also official "sources" for the provider packages. *apache_airflow_providers_-*.whl are the binary @@ -443,11 +443,10 @@ Please modify the message above accordingly to clearly exclude those packages. The files should be present in [Airflow dist](https://dist.apache.org/repos/dist/dev/airflow/providers/) -The following files should be present (9 files): +The following files should be present (6 files): -* -source.tar.gz + .asc + .sha512 (one set of files) -* -bin-tar.gz + .asc + .sha512 (one set of files per provider) -* -.whl + .asc + .sha512 (one set of files per provider) +* .tar.gz + .asc + .sha512 (one set of files per provider) +* -py3-none-any.whl + .asc + .sha512 (one set of files per provider) As a PMC you should be able to clone the SVN repository: @@ -529,8 +528,8 @@ warning. By importing the server in the previous step and importing it via ID fr this is a valid Key already. 
``` -Checking apache-airflow-2.0.2rc4-bin.tar.gz.asc -gpg: assuming signed data in 'apache-airflow-2.0.2rc4-bin.tar.gz' +Checking apache-airflow-2.0.2rc4.tar.gz.asc +gpg: assuming signed data in 'apache-airflow-2.0.2rc4.tar.gz' gpg: Signature made sob, 22 sie 2020, 20:28:28 CEST gpg: using RSA key 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B gpg: Good signature from "Kaxil Naik " [unknown] @@ -569,7 +568,7 @@ done You should get output similar to: ``` -Checking apache-airflow-providers-google-1.0.0rc1-bin.tar.gz.sha512 +Checking apache-airflow-providers-google-1.0.0rc1.tar.gz.sha512 Checking apache_airflow-providers-google-1.0.0rc1-py3-none-any.whl.sha512 ``` From 23b2e7bf2eeee658e755726111e00d1535d38ef4 Mon Sep 17 00:00:00 2001 From: Daniel Standish <15932138+dstandish@users.noreply.github.com> Date: Wed, 27 Oct 2021 11:05:48 -0700 Subject: [PATCH 109/250] Fix release check script (#19238) There have been some changes to the filename conventions over time and the release check script was not updated to reflect this. This PR fixes the script and tries to simplify it a little bit. In particular, the regex approach used previously was broken by the removal of the `-bin` identifier. It is easy enough to simply compute all the expected files exactly and look for them, so that is what we do here (cherry picked from commit b49b81ac8466922cb704f989910e1811b7cb4fa9) --- dev/check_files.py | 159 ++++++++++++++++++++++++++++++--------------- 1 file changed, 105 insertions(+), 54 deletions(-) diff --git a/dev/check_files.py b/dev/check_files.py index 9117aade82052..acc14adc1173d 100644 --- a/dev/check_files.py +++ b/dev/check_files.py @@ -17,6 +17,7 @@ import os import re +from itertools import product from typing import List import click as click @@ -45,26 +46,22 @@ """ - DOCKER_CMD = """ docker build --tag local/airflow . docker local/airflow info """ - AIRFLOW = "AIRFLOW" PROVIDERS = "PROVIDERS" UPGRADE_CHECK = "UPGRADE_CHECK" -ASC = re.compile(r".*\.asc$") -SHA = re.compile(r".*\.sha512$") -NORM = re.compile(r".*\.(whl|gz)$") - def get_packages() -> List[str]: - with open("packages.txt") as file: - content = file.read() - + try: + with open("packages.txt") as file: + content = file.read() + except FileNotFoundError: + content = '' if not content: raise SystemExit("List of packages to check is empty. 
Please add packages to `packages.txt`") @@ -86,66 +83,80 @@ def create_docker(txt: str): ) -def check_all_present(prefix: str, files: List[str]): - all_present = True - for ext in [ASC, SHA, NORM]: - if any(re.match(ext, f) for f in files): - print(f" - {prefix} {ext.pattern}: [green]OK[/green]") - else: - print(f" - {prefix} {ext.pattern}: [red]MISSING[/red]") - all_present = False - return all_present +def check_providers(files: List[str], version: str): + print(f"Checking providers for version {version}:\n") + version = strip_rc_suffix(version) + missing_list = [] + for p in get_packages(): + print(p) + expected_files = expand_name_variations( + [ + f"{p}-{version}.tar.gz", + f"{p.replace('-', '_')}-{version}-py3-none-any.whl", + ] + ) + missing_list.extend(check_all_files(expected_files=expected_files, actual_files=files)) -def filter_files(files: List[str], prefix: str): - return [f for f in files if f.startswith(prefix)] + return missing_list -def check_providers(files: List[str], version: str): - name_tpl = "apache_airflow_providers_{}-{}" - pip_packages = [] - for p in get_packages(): - print(p) +def strip_rc_suffix(version): + return re.sub(r'rc\d+$', '', version) - name = name_tpl.format(p.replace(".", "_"), version) - # Check sources - check_all_present("sources", filter_files(files, name)) - # Check wheels - name = name.replace("_", "-") - if check_all_present("wheel", filter_files(files, name)): - pip_packages.append(f"{name.rpartition('-')[0]}=={version}") +def print_status(file, is_found: bool): + color, status = ('green', 'OK') if is_found else ('red', 'MISSING') + print(f" - {file}: [{color}]{status}[/{color}]") - return pip_packages +def check_all_files(actual_files, expected_files): + missing_list = [] + for file in expected_files: + is_found = file in actual_files + if not is_found: + missing_list.append(file) + print_status(file=file, is_found=is_found) + return missing_list -def check_release(files: List[str], version: str): - print(f"apache_airflow-{version}") - # Check bin - name = f"apache-airflow-{version}-bin" - check_all_present("binaries", filter_files(files, name)) +def check_release(files: List[str], version: str): + print(f"Checking airflow release for version {version}:\n") + version = strip_rc_suffix(version) + + expected_files = expand_name_variations( + [ + f"apache-airflow-{version}.tar.gz", + f"apache-airflow-{version}-source.tar.gz", + f"apache_airflow-{version}-py3-none-any.whl", + ] + ) + return check_all_files(expected_files=expected_files, actual_files=files) - # Check sources - name = f"apache-airflow-{version}-source" - check_all_present("sources", filter_files(files, name)) - # Check wheels - name = f"apache_airflow-{version}-py" - check_all_present("wheel", filter_files(files, name)) +def expand_name_variations(files): + return list(sorted(base + suffix for base, suffix in product(files, ['', '.asc', '.sha512']))) def check_upgrade_check(files: List[str], version: str): - print(f"apache_airflow-upgrade-check-{version}") + print(f"Checking upgrade_check for version {version}:\n") + version = strip_rc_suffix(version) + + expected_files = expand_name_variations( + [ + f"apache-airflow-upgrade-check-{version}-bin.tar.gz", + f"apache-airflow-upgrade-check-{version}-source.tar.gz", + f"apache_airflow_upgrade_check-{version}-py2.py3-none-any.whl", + ] + ) + return check_all_files(expected_files=expected_files, actual_files=files) - name = f"apache-airflow-upgrade-check-{version}-bin" - check_all_present("binaries", filter_files(files, name)) - name = 
f"apache-airflow-upgrade-check-{version}-source" - check_all_present("sources", filter_files(files, name)) +def warn_of_missing_files(files): + print("[red]Check failed. Here are the files we expected but did not find:[/red]\n") - name = f"apache_airflow_upgrade_check-{version}-py" - check_all_present("wheel", filter_files(files, name)) + for file in files: + print(f" - [red]{file}[/red]") @click.command() @@ -188,24 +199,31 @@ def main(check_type: str, path: str, version: str): if check_type.upper() == PROVIDERS: files = os.listdir(os.path.join(path, "providers")) - pips = check_providers(files, version) + pips = [f"{p}=={version}" for p in get_packages()] + missing_files = check_providers(files, version) create_docker(PROVIDERS_DOCKER.format("\n".join(f"RUN pip install '{p}'" for p in pips))) + if missing_files: + warn_of_missing_files(missing_files) return if check_type.upper() == AIRFLOW: files = os.listdir(os.path.join(path, version)) - check_release(files, version) + missing_files = check_release(files, version) base_version = version.split("rc")[0] prev_version = base_version[:-1] + str(int(base_version[-1]) - 1) create_docker(AIRFLOW_DOCKER.format(prev_version, version)) + if missing_files: + warn_of_missing_files(missing_files) return if check_type.upper() == UPGRADE_CHECK: files = os.listdir(os.path.join(path, "upgrade-check", version)) - check_upgrade_check(files, version) + missing_files = check_upgrade_check(files, version) create_docker(DOCKER_UPGRADE.format(version)) + if missing_files: + warn_of_missing_files(missing_files) return raise SystemExit(f"Unknown check type: {check_type}") @@ -213,3 +231,36 @@ def main(check_type: str, path: str, version: str): if __name__ == "__main__": main() + + +def test_check_release_pass(): + """Passes if all present""" + files = [ + 'apache_airflow-2.2.1-py3-none-any.whl', + 'apache_airflow-2.2.1-py3-none-any.whl.asc', + 'apache_airflow-2.2.1-py3-none-any.whl.sha512', + 'apache-airflow-2.2.1-source.tar.gz', + 'apache-airflow-2.2.1-source.tar.gz.asc', + 'apache-airflow-2.2.1-source.tar.gz.sha512', + 'apache-airflow-2.2.1.tar.gz', + 'apache-airflow-2.2.1.tar.gz.asc', + 'apache-airflow-2.2.1.tar.gz.sha512', + ] + assert check_release(files, version='2.2.1rc2') == [] + + +def test_check_release_fail(): + """Fails if missing one""" + files = [ + 'apache_airflow-2.2.1-py3-none-any.whl', + 'apache_airflow-2.2.1-py3-none-any.whl.asc', + 'apache_airflow-2.2.1-py3-none-any.whl.sha512', + 'apache-airflow-2.2.1-source.tar.gz', + 'apache-airflow-2.2.1-source.tar.gz.asc', + 'apache-airflow-2.2.1-source.tar.gz.sha512', + 'apache-airflow-2.2.1.tar.gz.asc', + 'apache-airflow-2.2.1.tar.gz.sha512', + ] + + missing_files = check_release(files, version='2.2.1rc2') + assert missing_files == ['apache-airflow-2.2.1.tar.gz'] From 35b5cd2d7cf2f13c429d15071a1d9863dbc538c5 Mon Sep 17 00:00:00 2001 From: Daniel Standish <15932138+dstandish@users.noreply.github.com> Date: Fri, 29 Oct 2021 07:03:31 -0700 Subject: [PATCH 110/250] Clarify rat test guidance in release check docs (#19296) * Clarify rat test guidance in release check docs In the context of the rat tests there are two binaries that need unpacking -- the rat test jar and the airflow release. This change clarifies the references to the airflow binaries. 
(cherry picked from commit 2fdcb8a89cd1aaf1a90657385a257e58926c21a9) --- dev/README_RELEASE_AIRFLOW.md | 10 ++++++---- dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md | 10 ++++++---- dev/README_RELEASE_HELM_CHART.md | 9 +++++---- dev/README_RELEASE_PROVIDER_PACKAGES.md | 10 ++++++---- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index b84d91c32e60c..93fc834aa6ae2 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -383,8 +383,8 @@ This can be done with the Apache RAT tool. * Download the latest jar from https://creadur.apache.org/rat/download_rat.cgi (unpack the binary, the jar is inside) -* Unpack the binary (`-bin.tar.gz`) to a folder -* Enter the folder and run the check (point to the place where you extracted the .jar) +* Unpack the release source archive (the `-source.tar.gz` file) to a folder +* Enter the sources folder run the check ```shell script java -jar ../../apache-rat-0.13/apache-rat-0.13.jar -E .rat-excludes -d . @@ -394,7 +394,7 @@ where `.rat-excludes` is the file in the root of Airflow source code. ## Signature check -Make sure you have the key of person signed imported in your GPG. You can find the valid keys in +Make sure you have imported into your GPG the PGP key of the person signing the release. You can find the valid keys in [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS). You can import the whole KEYS file: @@ -426,7 +426,7 @@ Once you have the keys, the signatures can be verified by running this: ```shell script for i in *.asc do - echo "Checking $i"; gpg --verify $i + echo -e "Checking $i\n"; gpg --verify $i done ``` @@ -446,6 +446,7 @@ gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B + Checking apache_airflow-2.0.2rc4-py2.py3-none-any.whl.asc gpg: assuming signed data in 'apache_airflow-2.0.2rc4-py2.py3-none-any.whl' gpg: Signature made sob, 22 sie 2020, 20:28:31 CEST @@ -454,6 +455,7 @@ gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B + Checking apache-airflow-2.0.2rc4-source.tar.gz.asc gpg: assuming signed data in 'apache-airflow-2.0.2rc4-source.tar.gz' gpg: Signature made sob, 22 sie 2020, 20:28:25 CEST diff --git a/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md b/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md index cb83f3f892351..1fe973cf393f5 100644 --- a/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md +++ b/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md @@ -305,8 +305,8 @@ This can be done with the Apache RAT tool. * Download the latest jar from https://creadur.apache.org/rat/download_rat.cgi (unpack the binary, the jar is inside) -* Unpack the binary (`-bin.tar.gz`) to a folder -* Enter the folder and run the check (point to the place where you extracted the .jar) +* Unpack the release source archive (the `-source.tar.gz` file) to a folder +* Enter the sources folder run the check ```shell script java -jar ../../apache-rat-0.13/apache-rat-0.13.jar -E .rat-excludes -d . @@ -316,7 +316,7 @@ where `.rat-excludes` is the file in the root of Airflow source code. 
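
When reviewing the output, the part that matters is whether any files are reported with unknown or unapproved licenses. A small follow-up sketch, assuming the same jar location as in the command above and that the plain-text report summarises such files under "Unknown Licenses" (both are assumptions; adjust paths to your local layout):

```shell script
java -jar ../../apache-rat-0.13/apache-rat-0.13.jar -E .rat-excludes -d . > /tmp/rat-report.txt
grep -i "unknown license" /tmp/rat-report.txt || echo "No unknown licenses reported"
```
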
## Signature check -Make sure you have the key of person signed imported in your GPG. You can find the valid keys in +Make sure you have imported into your GPG the PGP key of the person signing the release. You can find the valid keys in [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS). You can import the whole KEYS file: @@ -348,7 +348,7 @@ Once you have the keys, the signatures can be verified by running this: ```shell script for i in *.asc do - echo "Checking $i"; gpg --verify $i + echo -e "Checking $i\n"; gpg --verify $i done ``` @@ -368,6 +368,7 @@ gpg: Good signature from "Kaxil Naik " [ultimate] gpg: aka "Kaxil Naik " [ultimate] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. + Checking apache-airflow-upgrade-check-1.3.0rc1-source.tar.gz.asc gpg: assuming signed data in 'apache-airflow-upgrade-check-1.3.0rc1-source.tar.gz' gpg: Signature made Tue 9 Mar 23:22:21 2021 GMT @@ -376,6 +377,7 @@ gpg: Good signature from "Kaxil Naik " [ultimate] gpg: aka "Kaxil Naik " [ultimate] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. + Checking apache_airflow_upgrade_check-1.3.0rc1-py2.py3-none-any.whl.asc gpg: assuming signed data in 'apache_airflow_upgrade_check-1.3.0rc1-py2.py3-none-any.whl' gpg: Signature made Tue 9 Mar 23:22:27 2021 GMT diff --git a/dev/README_RELEASE_HELM_CHART.md b/dev/README_RELEASE_HELM_CHART.md index ecc81a8da6d9a..b595eec91e44a 100644 --- a/dev/README_RELEASE_HELM_CHART.md +++ b/dev/README_RELEASE_HELM_CHART.md @@ -328,8 +328,8 @@ This can be done with the Apache RAT tool. * Download the latest jar from https://creadur.apache.org/rat/download_rat.cgi (unpack the binary, the jar is inside) -* Unpack the binary (`-bin.tar.gz`) to a folder -* Enter the folder and run the check (point to the place where you extracted the .jar) +* Unpack the release source archive (the `-source.tar.gz` file) to a folder +* Enter the sources folder run the check ```shell java -jar $PATH_TO_RAT/apache-rat-0.13/apache-rat-0.13.jar chart -E .rat-excludes @@ -339,7 +339,7 @@ where `.rat-excludes` is the file in the root of Chart source code. ## Signature check -Make sure you have the key of person signed imported in your GPG. You can find the valid keys in +Make sure you have imported into your GPG the PGP key of the person signing the release. You can find the valid keys in [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS). You can import the whole KEYS file: @@ -371,7 +371,7 @@ Once you have the keys, the signatures can be verified by running this: ```shell for i in *.asc do - echo "Checking $i"; gpg --verify $i + echo -e "Checking $i\n"; gpg --verify $i done ``` @@ -393,6 +393,7 @@ gpg: aka "Kaxil Naik " [unknown] gpg: WARNING: The key's User ID is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: CDE1 5C6E 4D3A 8EC4 ECF4 BA4B 6674 E08A D7DE 406F + Checking airflow-chart-1.0.0-source.tar.gz.asc gpg: assuming signed data in 'airflow-chart-1.0.0-source.tar.gz' gpg: Signature made Sun 16 May 02:24:09 2021 BST diff --git a/dev/README_RELEASE_PROVIDER_PACKAGES.md b/dev/README_RELEASE_PROVIDER_PACKAGES.md index 65f94fb65e458..ed4853b382a57 100644 --- a/dev/README_RELEASE_PROVIDER_PACKAGES.md +++ b/dev/README_RELEASE_PROVIDER_PACKAGES.md @@ -473,8 +473,8 @@ This can be done with the Apache RAT tool. 
* Download the latest jar from https://creadur.apache.org/rat/download_rat.cgi (unpack the binary, the jar is inside) -* Unpack the binary (`-bin.tar.gz`) to a folder -* Enter the folder and run the check (point to the place where you extracted the .jar) +* Unpack the release source archive (the `-source.tar.gz` file) to a folder +* Enter the sources folder run the check ```shell script java -jar ../../apache-rat-0.13/apache-rat-0.13.jar -E .rat-excludes -d . @@ -484,7 +484,7 @@ where `.rat-excludes` is the file in the root of Airflow source code. ### Signature check -Make sure you have the key of person signed imported in your GPG. You can find the valid keys in +Make sure you have imported into your GPG the PGP key of the person signing the release. You can find the valid keys in [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS). You can import the whole KEYS file: @@ -516,7 +516,7 @@ Once you have the keys, the signatures can be verified by running this: ```shell script for i in *.asc do - echo "Checking $i"; gpg --verify $i + echo -e "Checking $i\n"; gpg --verify $i done ``` @@ -536,6 +536,7 @@ gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B + Checking apache_airflow-2.0.2rc4-py2.py3-none-any.whl.asc gpg: assuming signed data in 'apache_airflow-2.0.2rc4-py2.py3-none-any.whl' gpg: Signature made sob, 22 sie 2020, 20:28:31 CEST @@ -544,6 +545,7 @@ gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B + Checking apache-airflow-2.0.2rc4-source.tar.gz.asc gpg: assuming signed data in 'apache-airflow-2.0.2rc4-source.tar.gz' gpg: Signature made sob, 22 sie 2020, 20:28:25 CEST From ae184690fd27cc3f549dc427d9d70cc1c10020da Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 31 Oct 2021 10:24:45 +0100 Subject: [PATCH 111/250] Update known warnings for Python 3.7 (#19333) After seting 3.7 the default (#19317) the warning printed by Python during importing all providers (specifically apache beam) has slightly changed. Apparently collections.abc warning was a bit more "scary" - warning that it's 3.9 not 3.10 where the old collection imports will stop working (Note that actually this did not happen even in 3.10, apparently) This PR fixes the "known" warning message to match it but also a separate PR (https://github.com/apache/beam/pull/15850) was opened to Beam to get rid of the warnings altogether. Also seems 'dns` stopped generating this warning so I removed it and in case warnings are generated, they are printed outside of the folded group, so that it's immediately visible. 
(cherry picked from commit 4bb1317c892c6d65557b3d998bd8d1bd971ba96b) --- dev/provider_packages/prepare_provider_packages.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dev/provider_packages/prepare_provider_packages.py b/dev/provider_packages/prepare_provider_packages.py index a56b5299cf106..82c2236fe810c 100755 --- a/dev/provider_packages/prepare_provider_packages.py +++ b/dev/provider_packages/prepare_provider_packages.py @@ -2026,6 +2026,9 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin console.print() raise_error = True if warns: + if os.environ.get('GITHUB_ACTIONS'): + # Ends group in GitHub Actions so that the errors are immediately visible in CI log + console.print("::endgroup::") console.print() console.print("[red]Unknown warnings generated:[/]") console.print() @@ -2072,14 +2075,9 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin ), ( "Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since" - " Python 3.3, and in 3.10 it will stop working", + " Python 3.3,and in 3.9 it will stop working", "apache_beam", ), - ( - "Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since" - " Python 3.3, and in 3.10 it will stop working", - "dns", - ), ( 'pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.', "papermill", From 734ab553e3c56ef24e6ec17c5de6f811ac66b38c Mon Sep 17 00:00:00 2001 From: Daniel Standish <15932138+dstandish@users.noreply.github.com> Date: Mon, 8 Nov 2021 12:27:55 -0800 Subject: [PATCH 112/250] Clarify guidance re trust of keys in release docs (#19480) * Clarify guidance re trust of keys in release docs 1. Kaxil's key referenced in the docs is expired. I update with the current key. 2. keys.openpgp.org no longer seems to be set as the default (at least it was not on my machine). So I update the key import to specify this server i.e. 3. clarify language concerning the remote key servers * fix spelling (cherry picked from commit ae1fa4c8b2b2d91ab01697e7a201f321c5c767c3) --- dev/README_RELEASE_AIRFLOW.md | 11 ++++++----- dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md | 11 ++++++----- dev/README_RELEASE_HELM_CHART.md | 19 ++++++++++--------- dev/README_RELEASE_PROVIDER_PACKAGES.md | 11 ++++++----- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 93fc834aa6ae2..54700db0c4f52 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -408,7 +408,7 @@ retrieves it from the default GPG keyserver [OpenPGP.org](https://keys.openpgp.org): ```shell script -gpg --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +gpg --keyserver keys.openpgp.org --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` You should choose to import the key when asked. @@ -418,7 +418,7 @@ errors or timeouts. Many of the release managers also uploaded their keys to the [GNUPG.net](https://keys.gnupg.net) keyserver, and you can retrieve it from there. ```shell script -gpg --keyserver keys.gnupg.net --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +gpg --keyserver keys.gnupg.net --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` Once you have the keys, the signatures can be verified by running this: @@ -432,10 +432,11 @@ done This should produce results similar to the below. The "Good signature from ..." is indication that the signatures are correct. 
Do not worry about the "not certified with a trusted signature" -warning. Most of the certificates used by release managers are self signed, that's why you get this -warning. By importing the server in the previous step and importing it via ID from +warning. Most of the certificates used by release managers are self-signed, and that's why you get this +warning. By importing the key either from the server in the previous step or from the [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS) page, you know that -this is a valid Key already. +this is a valid key already. To suppress the warning you may edit the key's trust level +by running `gpg --edit-key trust` and entering `5` to assign trust level `ultimate`. ``` Checking apache-airflow-2.0.2rc4.tar.gz.asc diff --git a/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md b/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md index 1fe973cf393f5..ae5dd8d100d3a 100644 --- a/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md +++ b/dev/README_RELEASE_AIRFLOW_UPGRADE_CHECK.md @@ -330,7 +330,7 @@ retrieves it from the default GPG keyserver [OpenPGP.org](https://keys.openpgp.org): ```shell script -gpg --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +gpg --keyserver keys.openpgp.org --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` You should choose to import the key when asked. @@ -340,7 +340,7 @@ errors or timeouts. Many of the release managers also uploaded their keys to the [GNUPG.net](https://keys.gnupg.net) keyserver, and you can retrieve it from there. ```shell script -gpg --keyserver keys.gnupg.net --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +gpg --keyserver keys.gnupg.net --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` Once you have the keys, the signatures can be verified by running this: @@ -354,10 +354,11 @@ done This should produce results similar to the below. The "Good signature from ..." is indication that the signatures are correct. Do not worry about the "not certified with a trusted signature" -warning. Most of the certificates used by release managers are self signed, that's why you get this -warning. By importing the server in the previous step and importing it via ID from +warning. Most of the certificates used by release managers are self-signed, and that's why you get this +warning. By importing the key either from the server in the previous step or from the [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS) page, you know that -this is a valid Key already. +this is a valid key already. To suppress the warning you may edit the key's trust level +by running `gpg --edit-key trust` and entering `5` to assign trust level `ultimate`. ``` Checking apache-airflow-upgrade-check-1.3.0rc1.tar.gz.asc diff --git a/dev/README_RELEASE_HELM_CHART.md b/dev/README_RELEASE_HELM_CHART.md index b595eec91e44a..ad103243057a9 100644 --- a/dev/README_RELEASE_HELM_CHART.md +++ b/dev/README_RELEASE_HELM_CHART.md @@ -344,7 +344,7 @@ Make sure you have imported into your GPG the PGP key of the person signing the You can import the whole KEYS file: -```shell +```shell script gpg --import KEYS ``` @@ -352,8 +352,8 @@ You can also import the keys individually from a keyserver. The below one uses K retrieves it from the default GPG keyserver [OpenPGP.org](https://keys.openpgp.org): -```shell -gpg --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +```shell script +gpg --keyserver keys.openpgp.org --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` You should choose to import the key when asked. 
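
If signature verification later fails even though the import appeared to succeed, it is worth confirming that the key really is present in the local keyring. An optional, illustrative check, using the release manager fingerprint from the example above:

```shell script
gpg --fingerprint CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F
```
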
@@ -362,13 +362,13 @@ Note that by being default, the OpenPGP server tends to be overloaded often and errors or timeouts. Many of the release managers also uploaded their keys to the [GNUPG.net](https://keys.gnupg.net) keyserver, and you can retrieve it from there. -```shell -gpg --keyserver keys.gnupg.net --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +```shell script +gpg --keyserver keys.gnupg.net --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` Once you have the keys, the signatures can be verified by running this: -```shell +```shell script for i in *.asc do echo -e "Checking $i\n"; gpg --verify $i @@ -377,10 +377,11 @@ done This should produce results similar to the below. The "Good signature from ..." is indication that the signatures are correct. Do not worry about the "not certified with a trusted signature" -warning. Most of the certificates used by release managers are self signed, that's why you get this -warning. By importing the server in the previous step and importing it via ID from +warning. Most of the certificates used by release managers are self-signed, and that's why you get this +warning. By importing the key either from the server in the previous step or from the [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS) page, you know that -this is a valid Key already. +this is a valid key already. To suppress the warning you may edit the key's trust level +by running `gpg --edit-key trust` and entering `5` to assign trust level `ultimate`. ``` Checking airflow-1.0.0.tgz.asc diff --git a/dev/README_RELEASE_PROVIDER_PACKAGES.md b/dev/README_RELEASE_PROVIDER_PACKAGES.md index ed4853b382a57..f4e4bb08aa19d 100644 --- a/dev/README_RELEASE_PROVIDER_PACKAGES.md +++ b/dev/README_RELEASE_PROVIDER_PACKAGES.md @@ -498,7 +498,7 @@ retrieves it from the default GPG keyserver [OpenPGP.org](https://keys.openpgp.org): ```shell script -gpg --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +gpg --keyserver keys.openpgp.org --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` You should choose to import the key when asked. @@ -508,7 +508,7 @@ errors or timeouts. Many of the release managers also uploaded their keys to the [GNUPG.net](https://keys.gnupg.net) keyserver, and you can retrieve it from there. ```shell script -gpg --keyserver keys.gnupg.net --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B +gpg --keyserver keys.gnupg.net --receive-keys CDE15C6E4D3A8EC4ECF4BA4B6674E08AD7DE406F ``` Once you have the keys, the signatures can be verified by running this: @@ -522,10 +522,11 @@ done This should produce results similar to the below. The "Good signature from ..." is indication that the signatures are correct. Do not worry about the "not certified with a trusted signature" -warning. Most of the certificates used by release managers are self signed, that's why you get this -warning. By importing the server in the previous step and importing it via ID from +warning. Most of the certificates used by release managers are self-signed, and that's why you get this +warning. By importing the key either from the server in the previous step or from the [KEYS](https://dist.apache.org/repos/dist/release/airflow/KEYS) page, you know that -this is a valid Key already. +this is a valid key already. To suppress the warning you may edit the key's trust level +by running `gpg --edit-key trust` and entering `5` to assign trust level `ultimate`. 
``` Checking apache-airflow-2.0.2rc4.tar.gz.asc From ae28789902c048afb8e35bcadc8c2934538d7cb5 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 9 Nov 2021 00:27:16 +0100 Subject: [PATCH 113/250] Update description of release process for adding new major release (#19483) (cherry picked from commit 7b705aade6812bca5a806056fe40fb8db47f79c7) --- dev/README_RELEASE_AIRFLOW.md | 104 +++++++++++++++++++++++++++++++--- dev/retag_docker_images.py | 4 +- 2 files changed, 97 insertions(+), 11 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 54700db0c4f52..24dbd89633f80 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -24,7 +24,7 @@ - [Selecting what to cherry-pick](#selecting-what-to-cherry-pick) - [Prepare the Apache Airflow Package RC](#prepare-the-apache-airflow-package-rc) - [Build RC artifacts](#build-rc-artifacts) - - [[\Optional\] Create new release branch](#%5Coptional%5C-create-new-release-branch) + - [[\Optional\] Prepare new release branches and cache](#%5Coptional%5C-prepare-new-release-branches-and-cache) - [Prepare PyPI convenience "snapshot" packages](#prepare-pypi-convenience-snapshot-packages) - [Prepare production Docker Image](#prepare-production-docker-image) - [Prepare Vote email on the Apache Airflow release candidate](#prepare-vote-email-on-the-apache-airflow-release-candidate) @@ -164,30 +164,116 @@ For now this is done manually, example run `git log --oneline v2-2-test..HEAD - svn commit -m "Add artifacts for Airflow ${VERSION}" ``` -## [\Optional\] Create new release branch +## [\Optional\] Prepare new release branches and cache When you just released the `X.Y.0` version (first release of new minor version) you need to create release branches: `vX-Y-test` and `vX-Y-stable` (for example with `2.1.0rc1` release you need to create v2-1-test and -`v2-1-stable` branches): +`v2-1-stable` branches). You also need to configure the branch +### Create test source branch ```shell script # First clone the repo - export BRANCH_PREFIX=v2-1 - git branch ${BRANCH_PREFIX}-test - git branch ${BRANCH_PREFIX}-stable - git push origin ${BRANCH_PREFIX}-test ${BRANCH_PREFIX}-stable + export BRANCH_PREFIX=2-1 + git branch v${BRANCH_PREFIX}-test ``` -Search and replace all the vX-Y for previous branches (TODO: we should likely automate this a bit more) +### Re-tag images from main Run script to re-tag images from the ``main`` branch to the ``vX-Y-test`` branch: ```shell script - ./dev/retag_docker_images.py --source-branch main --target-branch ${BRANCH_PREFIX}-test + ./dev/retag_docker_images.py --source-branch main --target-branch v${BRANCH_PREFIX}-test ``` +### Update default branches + +In ``./scripts/ci/libraries/_intialization.sh`` update branches to reflect the new branch: + +```bash +export DEFAULT_BRANCH=${DEFAULT_BRANCH="main"} +export DEFAULT_CONSTRAINTS_BRANCH=${DEFAULT_CONSTRAINTS_BRANCH="constraints-main"} +``` + +should become this, where ``X-Y`` is your new branch version: + +```bash +export DEFAULT_BRANCH=${DEFAULT_BRANCH="vX-Y-test"} +export DEFAULT_CONSTRAINTS_BRANCH=${DEFAULT_CONSTRAINTS_BRANCH="constraints-X-Y"} +``` + +In ``./scripts/ci/libraries/_build_images.sh`` add branch to preload packages from (replace X and Y in +values for comparison and regexp): + +```bash + elif [[ ${AIRFLOW_VERSION} =~ v?X\.Y* ]]; then + AIRFLOW_BRANCH_FOR_PYPI_PRELOADING="vX-Y-stable" +``` + +### Commit the changes to the test branch + +```bash +git add -p . 
+git commit "Update default branches for ${BRANCH_PREFIX}" +``` + +### Create stable branch + +```bash +git branch v${BRANCH_PREFIX}-stable +```` + +### Push test and stable branch + +```bash +git checkout v${BRANCH_PREFIX}-test +git push --set-upstream origin v${BRANCH_PREFIX}-test +git checkout v${BRANCH_PREFIX}-stable +git push --set-upstream origin v${BRANCH_PREFIX}-stable +```` + +### Update coverage branches + +Add ``vX-Y-stable`` and ``vX-Y-test`` branches in ``codecov.yml`` (there are 2 places in the file!) + +```yaml + branches: + - main + - v2-0-stable + - v2-0-test + - v2-1-stable + - v2-1-test + - v2-2-stable + - v2-2-test +``` + +#### Add protected branches to .asf.yaml + +Add vX-Y-stable to .asf.yaml (X-Y is your new branch) + +```yaml +protected_branches: + main: + required_pull_request_reviews: + required_approving_review_count: 1 + ... + vX-Y-stable: + required_pull_request_reviews: + required_approving_review_count: 1 + +``` + +### Create constraints orphan branch + + ```shell script + # First clone the repo + export BRANCH_PREFIX=2-1 + git checkout constraints-main + git checkout -b constraints-${BRANCH_PREFIX} + git push origin constraints-${BRANCH_PREFIX} + ``` + ## Prepare PyPI convenience "snapshot" packages At this point we have the artefact that we vote on, but as a convenience to developers we also want to diff --git a/dev/retag_docker_images.py b/dev/retag_docker_images.py index f29ce1b5a13db..7ab3e0ef42825 100755 --- a/dev/retag_docker_images.py +++ b/dev/retag_docker_images.py @@ -38,8 +38,8 @@ GHCR_IO_IMAGES = [ "{prefix}/{branch}/ci-manifest/python{python_version}:latest", "{prefix}/{branch}/ci/python{python_version}:latest", - "{prefix}/{branch}/prod-build/python{python_version}-build-v2:latest", - "{prefix}/{branch}/prod/python{python_version}-build-v2:latest", + "{prefix}/{branch}/prod-build/python{python_version}-build:latest", + "{prefix}/{branch}/prod/python{python_version}-build:latest", "{prefix}/{branch}/python:{python_version}-slim-buster", ] From 8a66731c1f0fd0ab01928edc0a1670399ab4967a Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 9 Nov 2021 13:59:42 -0700 Subject: [PATCH 114/250] Update helm chart release docs (#19494) (cherry picked from commit 316632e63bf1ef79446ed2cd9587b2d3d666bf1a) --- dev/README_RELEASE_HELM_CHART.md | 97 ++++++++++++++++++++++++++------ docs/publish_docs.py | 2 +- 2 files changed, 80 insertions(+), 19 deletions(-) diff --git a/dev/README_RELEASE_HELM_CHART.md b/dev/README_RELEASE_HELM_CHART.md index ad103243057a9..54afbe129fca8 100644 --- a/dev/README_RELEASE_HELM_CHART.md +++ b/dev/README_RELEASE_HELM_CHART.md @@ -38,6 +38,11 @@ - [Publish documentation](#publish-documentation) - [Notify developers of release](#notify-developers-of-release) - [Update Announcements page](#update-announcements-page) + - [Create release on GitHub](#create-release-on-github) + - [Close the milestone](#close-the-milestone) + - [Announce the release on the community slack](#announce-the-release-on-the-community-slack) + - [Tweet about the release](#tweet-about-the-release) + - [Bump chart version in Chart.yaml](#bump-chart-version-in-chartyaml) - [Remove old releases](#remove-old-releases) @@ -499,6 +504,7 @@ svn checkout https://dist.apache.org/repos/dist/release/airflow airflow-release # Create new folder for the release cd airflow-release/helm-chart +export AIRFLOW_SVN_RELEASE_HELM=$(pwd) svn mkdir ${VERSION} cd ${VERSION} @@ -518,7 +524,7 @@ Create and push the release tag: 
```shell cd "${AIRFLOW_REPO_ROOT}" git checkout helm-chart/${RC} -git tag -s helm-chart/${VERSION} +git tag -s helm-chart/${VERSION} -m "Apache Airflow Helm Chart ${VERSION}" git push origin helm-chart/${VERSION} ``` @@ -529,11 +535,12 @@ In our cases, documentation for the released versions is published in a separate build tools are available in the `apache/airflow` repository, so you have to coordinate between the two repositories to be able to build the documentation. -- First, copy the airflow-site repository and set the environment variable ``AIRFLOW_SITE_DIRECTORY``. +- First, copy the airflow-site repository, create branch, and set the environment variable ``AIRFLOW_SITE_DIRECTORY``. ```shell git clone https://github.com/apache/airflow-site.git airflow-site cd airflow-site + git checkout -b helm-${VERSION}-docs export AIRFLOW_SITE_DIRECTORY="$(pwd)" ``` @@ -545,20 +552,6 @@ between the two repositories to be able to build the documentation. ./breeze build-docs -- --package-filter helm-chart --for-production ``` -- Update `index.yaml` - - We upload `index.yaml` to the Airflow website to allow: `helm repo add https://airflow.apache.org`. - - ```shell - cd "${AIRFLOW_SITE_DIRECTORY}" - curl https://dist.apache.org/repos/dist/dev/airflow/helm-chart/${RC}/index.yaml -o index.yaml - https://dist.apache.org/repos/dist/dev/airflow/helm-chart/${VERSION} - sed -i "s|https://dist.apache.org/repos/dist/dev/airflow/helm-chart/$RC|https://downloads.apache.org/airflow/helm-chart/$VERSION|" index.yaml - - git commit -m "Add documentation for Apache Airflow Helm Chart ${VERSION}" - git push - ``` - - Now you can preview the documentation. ```shell @@ -569,14 +562,33 @@ between the two repositories to be able to build the documentation. ```shell ./docs/publish_docs.py --package-filter helm-chart + ``` + +- Update `index.yaml` + + Regenerate `index.yaml` so it can be added to the Airflow website to allow: `helm repo add https://airflow.apache.org`. + + ```shell cd "${AIRFLOW_SITE_DIRECTORY}" + curl https://dist.apache.org/repos/dist/dev/airflow/helm-chart/$RC/index.yaml -o index.yaml + cp ${AIRFLOW_SVN_RELEASE_HELM}/${VERSION}/airflow-${VERSION}.tgz . + helm repo index --merge ./index.yaml . --url "https://downloads.apache.org/airflow/helm-chart/$VERSION" + rm airflow-${VERSION}.tgz + mv index.yaml landing-pages/site/static/index.yaml + ``` + +- Commit new docs, push, and open PR + + ```shell + git add . 
git commit -m "Add documentation for Apache Airflow Helm Chart ${VERSION}" git push + # and finally open a PR ``` ## Notify developers of release -- Notify users@airflow.apache.org (cc'ing dev@airflow.apache.org and announce@apache.org) that +- Notify users@airflow.apache.org (cc'ing dev@airflow.apache.org) that the artifacts have been published: Subject: @@ -597,7 +609,7 @@ I am pleased to announce that we have released Apache Airflow Helm chart $VERSIO The source release, as well as the "binary" Helm Chart release, are available: -📦 Official Sources: https://airflow.apache.org/helm-chart/installing-helm-chart-from-sources.html +📦 Official Sources: https://airflow.apache.org/docs/helm-chart/$VERSION/installing-helm-chart-from-sources.html 📦 ArtifactHub: https://artifacthub.io/packages/helm/apache-airflow/airflow 📚 Docs: https://airflow.apache.org/docs/helm-chart/$VERSION/ 🚀 Quick Start Installation Guide: https://airflow.apache.org/docs/helm-chart/$VERSION/quick-start.html @@ -610,10 +622,59 @@ Cheers, EOF ``` +Send the same email to announce@apache.org, except change the opening line to `Dear community,`. +It is more reliable to set it via the web ui at https://lists.apache.org/list.html?announce@apache.org + ## Update Announcements page Update "Announcements" page at the [Official Airflow website](https://airflow.apache.org/announcements/) +## Create release on GitHub + +Create a new release on GitHub with the changelog and assets from the release svn. + +## Close the milestone + +Close the milestone on GitHub. Create the next one if it hasn't been already (it probably has been). + +## Announce the release on the community slack + +Post this in the #announce channel: + +```shell +cat < Date: Tue, 9 Nov 2021 22:03:23 +0100 Subject: [PATCH 115/250] Clarify that .asf.yml and codecov.yml should be changed in main (#19496) (cherry picked from commit 2590013710af22b88a2166a5db1637ee9f789639) --- dev/README_RELEASE_AIRFLOW.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 24dbd89633f80..bb232ace005ea 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -233,7 +233,14 @@ git checkout v${BRANCH_PREFIX}-stable git push --set-upstream origin v${BRANCH_PREFIX}-stable ```` -### Update coverage branches +### Add branches in the main branch + +You have to do those steps in the `main` branch of the repository: + +```bash +git checkout main +git pull +``` Add ``vX-Y-stable`` and ``vX-Y-test`` branches in ``codecov.yml`` (there are 2 places in the file!) 
@@ -248,9 +255,7 @@ Add ``vX-Y-stable`` and ``vX-Y-test`` branches in ``codecov.yml`` (there are 2 p - v2-2-test ``` -#### Add protected branches to .asf.yaml - -Add vX-Y-stable to .asf.yaml (X-Y is your new branch) +Add vX-Y-stable to `.asf.yaml` (X-Y is your new branch) ```yaml protected_branches: @@ -264,7 +269,7 @@ protected_branches: ``` -### Create constraints orphan branch +### Create constraints branch out of the constraints-main one ```shell script # First clone the repo From 04b380ba9ba6b304098de46376ec3ad27e444344 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 15 Nov 2021 11:40:33 +0100 Subject: [PATCH 116/250] Add script to generate issue for status of testing of the rc (#19247) (cherry picked from commit 849a94b5a40c51a7344d158a02a39449ebd720f2) --- dev/ISSUE_TEMPLATE.md.jinja2 | 21 +++ dev/README_RELEASE_AIRFLOW.md | 29 ++++ dev/prepare_release_issue.py | 286 ++++++++++++++++++++++++++++++++++ 3 files changed, 336 insertions(+) create mode 100644 dev/ISSUE_TEMPLATE.md.jinja2 create mode 100755 dev/prepare_release_issue.py diff --git a/dev/ISSUE_TEMPLATE.md.jinja2 b/dev/ISSUE_TEMPLATE.md.jinja2 new file mode 100644 index 0000000000000..35be89265299a --- /dev/null +++ b/dev/ISSUE_TEMPLATE.md.jinja2 @@ -0,0 +1,21 @@ + + +We have a kind request for all the contributors to the latest [Apache Airflow RC {{version}}](https://pypi.org/project/apache-airflow/{{version}}/). + +Could you please help us to test the RC versions of Airflow? + +Please let us know in the comment if the issue is addressed in the latest RC. + +{% for pr_number in pr_list %} + {%- set pr = pull_requests[pr_number] -%} +- [ ] [{{ pr.title }} (#{{ pr.number }})]({{ pr.html_url }}): {{ user_logins[pr_number] }} + {%- if linked_issues[pr_number] %} + Linked issues: + {%- for linked_issue in linked_issues[pr_number] %} + - [{{ linked_issue.title }} (#{{ linked_issue.number }})]({{ linked_issue.html_url }}) + {%- endfor %} + {%- endif %} +{% endfor %} + +Thanks to all who contributed to the release (probably not a complete list!): +{{ all_user_logins }} diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index bb232ace005ea..63d549c3a0bfe 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -27,6 +27,7 @@ - [[\Optional\] Prepare new release branches and cache](#%5Coptional%5C-prepare-new-release-branches-and-cache) - [Prepare PyPI convenience "snapshot" packages](#prepare-pypi-convenience-snapshot-packages) - [Prepare production Docker Image](#prepare-production-docker-image) + - [Prepare issue for testing status of rc](#prepare-issue-for-testing-status-of-rc) - [Prepare Vote email on the Apache Airflow release candidate](#prepare-vote-email-on-the-apache-airflow-release-candidate) - [Verify the release candidate by PMCs](#verify-the-release-candidate-by-pmcs) - [SVN check](#svn-check) @@ -337,6 +338,32 @@ Production Docker images should be manually prepared and pushed by the release m This will wipe Breeze cache and docker-context-files in order to make sure the build is "clean". It also performs image verification before pushing the images. +## Prepare issue for testing status of rc + +For now this part works for bugfix releases only, for major/minor ones we will experiment and +see if there is a way to only extract important/not tested bugfixes and high-level changes to +make the process manageable. + + +Create an issue for testing status of the RC (PREVIOUS_RELEASE should be the previous release version +(for example 2.1.0). 
+ +```shell script +cat < \ + --current-release ${VERSION} + +``` + +Copy the URL of the issue. + ## Prepare Vote email on the Apache Airflow release candidate - Use the dev/airflow-jira script to generate a list of Airflow JIRAs that were closed in the release. @@ -363,6 +390,8 @@ until Monday, October 11, 2021 at 4:00 pm UTC, or until 3 binding +1 votes have https://www.timeanddate.com/worldclock/fixedtime.html?msg=8&iso=20211011T1600&p1=1440 +Status of testing of the release is kept in TODO:URL_OF_THE_ISSUE_HERE + Consider this my (binding) +1. Airflow ${VERSION} is available at: diff --git a/dev/prepare_release_issue.py b/dev/prepare_release_issue.py new file mode 100755 index 0000000000000..c98684d19d200 --- /dev/null +++ b/dev/prepare_release_issue.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import logging +import os +import re +import subprocess +import textwrap +from collections import defaultdict +from typing import Any, Dict, List, NamedTuple, Optional, Set + +import click +from github import Github, Issue, PullRequest, UnknownObjectException +from rich.console import Console +from rich.progress import Progress + +logger = logging.getLogger(__name__) + +console = Console(width=400, color_system="standard") + +MY_DIR_PATH = os.path.dirname(__file__) +SOURCE_DIR_PATH = os.path.abspath(os.path.join(MY_DIR_PATH, os.pardir)) +PR_PATTERN = re.compile(r".*\(#([0-9]+)\)") +ISSUE_MATCH_IN_BODY = re.compile(r" #([0-9]+)[^0-9]") + + +@click.group(context_settings={'help_option_names': ['-h', '--help'], 'max_content_width': 500}) +def cli(): + ... + + +option_verbose = click.option( + "--verbose", + is_flag=True, + help="Print verbose information about performed steps", +) + +option_previous_release = click.option( + "--previous-release", + type=str, + required=True, + help="commit reference (for example hash or tag) of the previous release.", +) + +option_current_release = click.option( + "--current-release", + type=str, + required=True, + help="commit reference (for example hash or tag) of the current release.", +) + +option_github_token = click.option( + "--github-token", + type=str, + required=True, + help=textwrap.dedent( + """ + Github token used to authenticate. + You can set omit it if you have GITHUB_TOKEN env variable set + Can be generated with: + https://github.com/settings/tokens/new?description=Read%20sssues&scopes=repo:status""" + ), + envvar='GITHUB_TOKEN', +) + +option_excluded_pr_list = click.option( + "--excluded-pr-list", type=str, default='', help="Coma-separated list of PRs to exclude from the issue." 
+) + +option_limit_pr_count = click.option( + "--limit-pr-count", + type=int, + default=None, + help="Limit PR count processes (useful for testing small subset of PRs).", +) + + +def get_git_log_command( + verbose: bool, from_commit: Optional[str] = None, to_commit: Optional[str] = None +) -> List[str]: + """ + Get git command to run for the current repo from the current folder (which is the package folder). + :param verbose: whether to print verbose info while getting the command + :param from_commit: if present - base commit from which to start the log from + :param to_commit: if present - final commit which should be the start of the log + :return: git command to run + """ + git_cmd = [ + "git", + "log", + "--pretty=format:%H %h %cd %s", + "--date=short", + ] + if from_commit and to_commit: + git_cmd.append(f"{from_commit}...{to_commit}") + elif from_commit: + git_cmd.append(from_commit) + git_cmd.extend(['--', '.']) + if verbose: + console.print(f"Command to run: '{' '.join(git_cmd)}'") + return git_cmd + + +class Change(NamedTuple): + """Stores details about commits""" + + full_hash: str + short_hash: str + date: str + message: str + message_without_backticks: str + pr: Optional[int] + + +def get_change_from_line(line: str): + split_line = line.split(" ", maxsplit=3) + message = split_line[3] + pr = None + pr_match = PR_PATTERN.match(message) + if pr_match: + pr = pr_match.group(1) + return Change( + full_hash=split_line[0], + short_hash=split_line[1], + date=split_line[2], + message=message, + message_without_backticks=message.replace("`", "'").replace("&39;", "'"), + pr=int(pr) if pr else None, + ) + + +def get_changes(verbose: bool, previous_release: str, current_release: str) -> List[Change]: + change_strings = subprocess.check_output( + get_git_log_command(verbose, from_commit=previous_release, to_commit=current_release), + cwd=SOURCE_DIR_PATH, + universal_newlines=True, + ) + return [get_change_from_line(line) for line in change_strings.split("\n")] + + +def render_template( + template_name: str, + context: Dict[str, Any], + autoescape: bool = True, + keep_trailing_newline: bool = False, +) -> str: + """ + Renders template based on it's name. Reads the template from _TEMPLATE.md.jinja2 in current dir. 
+ :param template_name: name of the template to use + :param context: Jinja2 context + :param autoescape: Whether to autoescape HTML + :param keep_trailing_newline: Whether to keep the newline in rendered output + :return: rendered template + """ + import jinja2 + + template_loader = jinja2.FileSystemLoader(searchpath=MY_DIR_PATH) + template_env = jinja2.Environment( + loader=template_loader, + undefined=jinja2.StrictUndefined, + autoescape=autoescape, + keep_trailing_newline=keep_trailing_newline, + ) + template = template_env.get_template(f"{template_name}_TEMPLATE.md.jinja2") + content: str = template.render(context) + return content + + +def print_issue_content( + current_release: str, + pull_requests: Dict[int, PullRequest.PullRequest], + linked_issues: Dict[int, List[Issue.Issue]], + users: Dict[int, Set[str]], +): + pr_list = list(pull_requests.keys()) + pr_list.sort() + user_logins: Dict[int, str] = {pr: "@" + " @".join(users[pr]) for pr in users} + all_users: Set[str] = set() + for user_list in users.values(): + all_users.update(user_list) + all_user_logins = "@" + " @".join(all_users) + content = render_template( + template_name='ISSUE', + context={ + 'version': current_release, + 'pr_list': pr_list, + 'pull_requests': pull_requests, + 'linked_issues': linked_issues, + 'users': users, + 'user_logins': user_logins, + 'all_user_logins': all_user_logins, + }, + autoescape=False, + keep_trailing_newline=True, + ) + print(content) + + +@cli.command() +@option_github_token +@option_previous_release +@option_current_release +@option_excluded_pr_list +@option_verbose +@option_limit_pr_count +def generate_issue_content( + github_token: str, + previous_release: str, + current_release: str, + excluded_pr_list: str, + verbose: bool, + limit_pr_count: Optional[int], +): + if excluded_pr_list: + excluded_prs = [int(pr) for pr in excluded_pr_list.split(",")] + else: + excluded_prs = [] + changes = get_changes(verbose, previous_release, current_release) + prs = list( + filter(lambda pr: pr is not None and pr not in excluded_prs, [change.pr for change in changes]) + ) + g = Github(github_token) + repo = g.get_repo("apache/airflow") + pull_requests: Dict[int, PullRequest.PullRequest] = {} + linked_issues: Dict[int, List[Issue.Issue]] = defaultdict(lambda: []) + users: Dict[int, Set[str]] = defaultdict(lambda: set()) + count_prs = len(prs) + if limit_pr_count: + count_prs = limit_pr_count + with Progress(console=console) as progress: + task = progress.add_task(f"Retrieving {count_prs} PRs ", total=count_prs) + for i in range(count_prs): + pr_number = prs[i] + progress.console.print( + f"Retrieving PR#{pr_number}: " f"https://github.com/apache/airflow/pull/{pr_number}" + ) + try: + pr = repo.get_pull(pr_number) + except UnknownObjectException: + # Fallback to issue if PR not found + try: + pr = repo.get_issue(pr_number) # (same fields as PR) + except UnknownObjectException: + console.print(f"[red]The PR #{pr_number} could not be found[/]") + continue + pull_requests[pr_number] = pr + # GitHub does not have linked issues in PR - but we quite rigorously add Fixes/Closes + # Relate so we can find those from the body + if pr.body: + body = pr.body.replace("\n", " ").replace("\r", " ") + for issue_match in ISSUE_MATCH_IN_BODY.finditer(body): + linked_issue_number = int(issue_match.group(1)) + progress.console.print( + f"Retrieving Linked issue PR#{linked_issue_number}: " + f"https://github.com/apache/airflow/issue/{linked_issue_number}" + ) + try: + 
linked_issues[pr_number].append(repo.get_issue(linked_issue_number)) + except UnknownObjectException: + progress.console.print( + f"Failed to retrieve linked issue #{linked_issue_number}: Unknown Issue" + ) + users[pr_number].add(pr.user.login) + for linked_issue in linked_issues[pr_number]: + users[pr_number].add(linked_issue.user.login) + progress.advance(task) + print_issue_content(current_release, pull_requests, linked_issues, users) + + +if __name__ == "__main__": + cli() From b2c90852748697c1d95740e28edc437d6b842a2e Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 29 Dec 2021 14:24:46 -0700 Subject: [PATCH 117/250] Add script to generate chart changelog annotations (#20555) (cherry picked from commit c56835304318f0695c79ac42df7a97ad05ccd21e) --- dev/README_RELEASE_HELM_CHART.md | 20 +++++ dev/chart/build_changelog_annotations.py | 108 +++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100755 dev/chart/build_changelog_annotations.py diff --git a/dev/README_RELEASE_HELM_CHART.md b/dev/README_RELEASE_HELM_CHART.md index 54afbe129fca8..eedaf156d7f20 100644 --- a/dev/README_RELEASE_HELM_CHART.md +++ b/dev/README_RELEASE_HELM_CHART.md @@ -65,6 +65,26 @@ commits between the last release, `1.1.0`, and `main`: git log --oneline helm-chart/1.1.0..main --pretty='format:- %s' -- chart/ docs/helm-chart/ ``` +### Add changelog annotations to `Chart.yaml` + +Once the changelog has been built, run the script to generate the changelog annotations. + +```shell +./dev/chart/build_changelog_annotations.py +``` + +Verify the output looks right (only entries from this release), then put them in `Chart.yaml`, for example: + +```yaml +annotations: + artifacthub.io/changes: | + - kind: added + description: Add resources for `cleanup` and `createuser` jobs + links: + - name: "#19263" + url: https://github.com/apache/airflow/pull/19263 +``` + ## Build RC artifacts The Release Candidate artifacts we vote upon should be the exact ones we vote against, diff --git a/dev/chart/build_changelog_annotations.py b/dev/chart/build_changelog_annotations.py new file mode 100755 index 0000000000000..0497588fe3eae --- /dev/null +++ b/dev/chart/build_changelog_annotations.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +''' +Take normal chart CHANGELOG entries and build ArtifactHub changelog annotations. +Only outputs the annotations for the latest release in the CHANGELOG. 
+ +e.g from: + +New Features +"""""""""""" + +- Add resources for `cleanup` and `createuser` jobs (#19263) + +to: + +- kind: added + description: Add resources for `cleanup` and `createuser` jobs + links: + - name: "#19263" + url: https://github.com/apache/airflow/pull/19263 +''' + + +import re +from typing import Dict, List, Optional, Tuple, Union + +import yaml + +TYPE_MAPPING = { + # CHANGELOG: (ArtifactHub kind, prefix for description) + # ArtifactHub kind must be one of: added, changed, deprecated, removed, fixed or security + "New Features": ("added", None), + "Improvements": ("changed", None), + "Bug Fixes": ("fixed", None), + "Doc only changes": ("changed", "Docs"), + "Misc": ("changed", "Misc"), +} + +PREFIXES_TO_STRIP = [ + # case insensitive + "Chart:", + "Chart Docs:", +] + + +def parse_line(line: str) -> Tuple[Optional[str], Optional[int]]: + match = re.search(r'^- (.*?)(?:\(#(\d+)\)){0,1}$', line) + if not match: + return None, None + desc, pr_number = match.groups() + return desc.strip(), int(pr_number) + + +def print_entry(section: str, description: str, pr_number: Optional[int]): + for unwanted_prefix in PREFIXES_TO_STRIP: + if description.lower().startswith(unwanted_prefix.lower()): + description = description[len(unwanted_prefix) :].strip() + + kind, prefix = TYPE_MAPPING[section] + if prefix: + description = f"{prefix}: {description}" + entry: Dict[str, Union[str, List]] = {"kind": kind, "description": description} + if pr_number: + entry["links"] = [ + {"name": f"#{pr_number}", "url": f"https://github.com/apache/airflow/pull/{pr_number}"} + ] + print(yaml.dump([entry])) + + +in_first_release = False +section = "" +with open("chart/CHANGELOG.txt") as f: + for line in f: + line = line.strip() + if not line: + continue + if line.startswith("Airflow Helm Chart"): + # We only want to get annotations for the "latest" release + if in_first_release: + break + in_first_release = True + continue + if line.startswith('"""') or line.startswith('----'): + continue + if not line.startswith('- '): + section = line + continue + + description, pr = parse_line(line) + if description: + print_entry(section, description, pr) From c92409c8e18e09650f317f8435f030662adb964c Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 16 Nov 2021 18:08:24 +0100 Subject: [PATCH 118/250] Update docs about releasing providersk (#19549) (cherry picked from commit dd410fd3c9de14e94034dcb4ccae52bbf5216199) --- dev/README_RELEASE_PROVIDER_PACKAGES.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/dev/README_RELEASE_PROVIDER_PACKAGES.md b/dev/README_RELEASE_PROVIDER_PACKAGES.md index f4e4bb08aa19d..c9d94b1b7b38b 100644 --- a/dev/README_RELEASE_PROVIDER_PACKAGES.md +++ b/dev/README_RELEASE_PROVIDER_PACKAGES.md @@ -38,7 +38,7 @@ - [Publish release](#publish-release) - [Summarize the voting for the Apache Airflow release](#summarize-the-voting-for-the-apache-airflow-release) - [Publish release to SVN](#publish-release-to-svn) - - [Publish the Regular convenience package to PyPI](#publish-the-regular-convenience-package-to-pypi-1) + - [Publish the packages to PyPI](#publish-the-packages-to-pypi) - [Publish documentation prepared before](#publish-documentation-prepared-before) - [Add tags in git](#add-tags-in-git-1) - [Notify developers of release](#notify-developers-of-release) @@ -738,13 +738,13 @@ Verify that the packages appear in [providers](https://dist.apache.org/repos/dist/release/airflow/providers) -## Publish the Regular convenience package to PyPI +## Publish the 
packages to PyPI -By that time the packages with proper name (renamed from rc* to final version should be in your dist -folder. +By that time the packages should be in your dist folder. ```shell script cd ${AIRFLOW_REPO_ROOT} +git checkout ``` * Verify the artifacts that would be uploaded: @@ -768,6 +768,8 @@ twine upload -r pypitest ${AIRFLOW_REPO_ROOT}/dist/*.whl ${AIRFLOW_REPO_ROOT}/di twine upload -r pypi ${AIRFLOW_REPO_ROOT}/dist/*.whl ${AIRFLOW_REPO_ROOT}/dist/*.tar.gz ``` +Copy links to updated packages. + * Again, confirm that the packages are available under the links printed. ## Publish documentation prepared before @@ -805,11 +807,13 @@ Dear Airflow community, I'm happy to announce that new versions of Airflow Providers packages were just released. +TODO: If there is just a few packages to release - paste the links to PyPI packages. Otherwise delete this TODO (too many links make the message unclear). + The source release, as well as the binary releases, are available here: https://airflow.apache.org/docs/apache-airflow-providers/installing-from-sources -You can install the providers via PyPI https://airflow.apache.org/apache-airflow-providers/installing-from-pypi +You can install the providers via PyPI https://airflow.apache.org/docs/apache-airflow-providers/installing-from-pypi The documentation is available at https://airflow.apache.org/docs/ and linked from the PyPI packages. From c7236c7389ba254f8d3ad3285ccab0062c561d5c Mon Sep 17 00:00:00 2001 From: Khalid Mammadov Date: Wed, 17 Nov 2021 13:41:52 +0000 Subject: [PATCH 119/250] Fix failing CI phase with unhealthy container issue (#19633) Fix failing CI phase with unhealthy container issue * Add post cleanup * Pin pinot to stable version * Pin grafana to stable version Co-authored-by: Jarek Potiuk (cherry picked from commit fcf90c5970aaf7043b1a57d58296d7fd80d6ebf9) --- scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index dd5c27a9aefe4..427fc5a2783cb 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -125,6 +125,7 @@ function run_airflow_testing_in_docker() { "${DOCKER_COMPOSE_LOCAL[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ run airflow "${@}" + docker ps exit_code=$? docker ps if [[ ${exit_code} != "0" && ${CI} == "true" ]]; then From 371ae8f542d48b18d02d1d85b0949882c67c0d07 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 17 Nov 2021 19:49:46 +0100 Subject: [PATCH 120/250] Fix dumping container logs on error (#19645) When we optimized tests for memory use we added cleanup of all containers after each test suite. Unfortunately it caused dumping container logs to stop working because this dumping was done only only when the script was exiting. This PR moves dumping container logs to between the test run and cleanup, so that we can see the logs when there is a test failure. Related to: #19633 where the logs were not dumped and it made the analysis much more difficult. 
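A minimal sketch of the intended ordering (illustrative only; the real logic lives in scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh, and the function name and compose arguments below are simplified placeholders):

```shell
# Illustrative ordering, not the actual CI script:
run_suite_and_collect_logs() {
    # 1. Run the test suite and remember its exit code immediately.
    docker-compose --project-name "airflow-${TEST_TYPE}-${BACKEND}" run airflow "${@}"
    local exit_code=$?

    # 2. Dump container state and logs while the containers still exist,
    #    so that failures can be analysed.
    if [[ ${exit_code} != "0" ]]; then
        docker ps --all
        docker-compose --project-name "airflow-${TEST_TYPE}-${BACKEND}" logs || true
    fi

    # 3. Only then clean the containers up to keep memory usage low.
    docker-compose --project-name "airflow-${TEST_TYPE}-${BACKEND}" down --remove-orphans --volumes
    return "${exit_code}"
}
```

The key point is that the exit code is captured before any further docker commands run, and the cleanup happens only after the logs have been dumped.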
(cherry picked from commit 7cda7d4b5e413925bf639976e77ebf2442b4bff9) --- scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index 427fc5a2783cb..596bea14d5838 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -127,7 +127,6 @@ function run_airflow_testing_in_docker() { run airflow "${@}" docker ps exit_code=$? - docker ps if [[ ${exit_code} != "0" && ${CI} == "true" ]]; then docker ps --all local container From a5ad8bbe257afc442eee3e792b9c2b75eaf7c88b Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 17 Nov 2021 16:23:37 -0700 Subject: [PATCH 121/250] Update Airflow release guide (#19663) (cherry picked from commit 9a246d3fa30439fb2240458dbb220c24214b4831) --- dev/README_RELEASE_AIRFLOW.md | 112 ++++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 18 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 63d549c3a0bfe..a209f2a4a10be 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -43,7 +43,11 @@ - [Publish documentation](#publish-documentation) - [Notify developers of release](#notify-developers-of-release) - [Update Announcements page](#update-announcements-page) - - [Update the bug issue template](#update-the-bug-issue-template) + - [Create release on GitHub](#create-release-on-github) + - [Close the milestone](#close-the-milestone) + - [Announce the release on the community slack](#announce-the-release-on-the-community-slack) + - [Tweet about the release](#tweet-about-the-release) + - [Update `main` with latest release details](#update-main-with-latest-release-details) - [Update default Airflow version in the helm chart](#update-default-airflow-version-in-the-helm-chart) - [Update airflow/config_templates/config.yml file](#update-airflowconfig_templatesconfigyml-file) @@ -81,7 +85,7 @@ The Release Candidate artifacts we vote upon should be the exact ones we vote ag # Set Version export VERSION=2.1.2rc3 export VERSION_SUFFIX=rc3 - export VERSION_CONSTRAINT_BRANCH=2-1 + export VERSION_BRANCH=2-1 export VERSION_WITHOUT_RC=${VERSION/rc?/} # Set AIRFLOW_REPO_ROOT to the path of your git repo @@ -94,17 +98,29 @@ The Release Candidate artifacts we vote upon should be the exact ones we vote ag export AIRFLOW_REPO_ROOT=$(pwd) ``` +- Check out the 'test' branch + + ```shell script + git checkout v${VERSION_BRANCH}-test + ``` + - Set your version to 2.0.N in `setup.py` (without the RC tag) - Replace the version in `README.md` and verify that installation instructions work fine. - Add a commit that updates `CHANGELOG.md` to add changes from previous version if it has not already added. For now this is done manually, example run `git log --oneline v2-2-test..HEAD --pretty='format:- %s'` and categorize them. - Add section for the release in `UPDATING.md`. If no new entries exist, put "No breaking changes" (e.g. `2.1.4`). - Commit the version change. +- PR from the 'test' branch to the 'stable' branch, and manually merge it once approved. 
+- Check out the 'stable' branch + + ```shell script + git checkout v${VERSION_BRANCH}-stable + ``` - Tag your release ```shell script - git tag -s ${VERSION} + git tag -s ${VERSION} -m "Apache Airflow ${VERSION}" ``` - Clean the checkout: the sdist step below will @@ -143,7 +159,7 @@ For now this is done manually, example run `git log --oneline v2-2-test..HEAD - - Tag & Push the latest constraints files. This pushes constraints with rc suffix (this is expected)! ```shell script - git checkout origin/constraints-${VERSION_CONSTRAINT_BRANCH} + git checkout origin/constraints-${VERSION_BRANCH} git tag -s "constraints-${VERSION}" git push origin "constraints-${VERSION}" ``` @@ -153,9 +169,11 @@ For now this is done manually, example run `git log --oneline v2-2-test..HEAD - ```shell script # First clone the repo svn checkout https://dist.apache.org/repos/dist/dev/airflow airflow-dev + cd airflow-dev + # Or move into it if you already have it cloned # Create new folder for the release - cd airflow-dev + svn update svn mkdir ${VERSION} # Move the artifacts to svn folder & commit @@ -287,6 +305,13 @@ publish "snapshots" of the RC builds to PyPI for installing via pip: To do this we need to +- Checkout the rc tag: + + ```shell script + cd "${AIRFLOW_REPO_ROOT}" + git checkout ${VERSION} + ``` + - Build the package: ```shell script @@ -309,7 +334,10 @@ To do this we need to https://test.pypi.org/project/apache-airflow/#files - Upload the package to PyPI's production environment: -`twine upload -r pypi dist/*` + + ```shell script + twine upload -r pypi dist/* + ``` - Again, confirm that the package is available here: https://pypi.python.org/pypi/apache-airflow @@ -422,7 +450,7 @@ the artifact checksums when we actually release. Full Changelog: https://github.com/apache/airflow/blob/${VERSION}/CHANGELOG.txt -Changes since ${VERSION}: +Changes since PREVIOUS_VERSION_OR_RC: *Bugs*: [AIRFLOW-3732] Fix issue when trying to edit connection in RBAC UI [AIRFLOW-2866] Fix missing CSRF token head when using RBAC UI (#3804) @@ -718,8 +746,6 @@ cd "${VERSION}" # Move the artifacts to svn folder & commit for f in ${AIRFLOW_DEV_SVN}/$RC/*; do svn cp "$f" "${$(basename $f)/}" - # Those will be used to upload to PyPI - cp "$f" "${AIRFLOW_SOURCES}/dist/${$(basename $f)/}" done svn commit -m "Release Airflow ${VERSION} from ${RC}" @@ -742,13 +768,13 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": ```shell script cd "${AIRFLOW_RELEASE_SVN}/${VERSION}" - twine check dist/* + twine check *.whl *${VERSION}.tar.gz ``` - Upload the package to PyPI's test environment: ```shell script - twine upload -r pypitest dist/* + twine upload -r pypitest *.whl *${VERSION}.tar.gz ``` - Verify that the test package looks good by downloading it and installing it into a virtual environment. @@ -757,7 +783,7 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": - Upload the package to PyPI's production environment: ```shell script - twine upload -r pypi dist/* + twine upload -r pypi *.whl *${VERSION}.tar.gz ``` - Again, confirm that the package is available here: https://pypi.python.org/pypi/apache-airflow @@ -765,8 +791,9 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": - Re-Tag & Push the constraints files with the final release version. 
```shell script + cd "${AIRFLOW_REPO_ROOT}" git checkout constraints-${RC} - git tag -s "constraints-${VERSION}" + git tag -s "constraints-${VERSION}" -m "Constraints for Apache Airflow ${VERSION}" git push origin "constraints-${VERSION}" ``` @@ -777,7 +804,8 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": (both airflow and latest provider packages). ```shell script - git checkout ${VERSION} + git checkout ${RC} + git tag -s ${VERSION} -m "Apache Airflow ${VERSION}" git push origin ${VERSION} ``` @@ -836,14 +864,14 @@ Documentation for providers can be found in the ``/docs/apache-airflow`` directo ## Notify developers of release -- Notify users@airflow.apache.org (cc'ing dev@airflow.apache.org and announce@apache.org) that +- Notify users@airflow.apache.org (cc'ing dev@airflow.apache.org) that the artifacts have been published: Subject: ```shell script cat < Date: Thu, 18 Nov 2021 12:50:14 +0000 Subject: [PATCH 122/250] Fix CI tests so they correctly fail in case of error! (#19678) (cherry picked from commit 889f1571259ae5ce83fb8723ac2d10cd21dc9d50) --- scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh index 596bea14d5838..dd5c27a9aefe4 100755 --- a/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh +++ b/scripts/ci/testing/ci_run_single_airflow_test_in_docker.sh @@ -125,8 +125,8 @@ function run_airflow_testing_in_docker() { "${DOCKER_COMPOSE_LOCAL[@]}" \ --project-name "airflow-${TEST_TYPE}-${BACKEND}" \ run airflow "${@}" - docker ps exit_code=$? + docker ps if [[ ${exit_code} != "0" && ${CI} == "true" ]]; then docker ps --all local container From 84082d3fdb56a07c641709d8327261eb8718e0b1 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 23 Nov 2021 19:04:50 +0100 Subject: [PATCH 123/250] Add retagging images accross repos (#19778) Useful to refresh cache images to a different repository - in order to speed up builds there. 
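As a usage note, with the new options a typical invocation to warm up the cache of another repository might look like this (the target repository name below is just a placeholder):

```shell
# Copy the `main` cache images from the canonical repo into a fork,
# so that CI builds running against the fork can reuse them.
./dev/retag_docker_images.py \
    --source-branch main --source-repo apache/airflow \
    --target-branch main --target-repo myorg/airflow
```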
(cherry picked from commit bccb45f5fe067ffa64f5f303bfbb6e8c1b552add) --- dev/retag_docker_images.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/dev/retag_docker_images.py b/dev/retag_docker_images.py index 7ab3e0ef42825..09faff025f82f 100755 --- a/dev/retag_docker_images.py +++ b/dev/retag_docker_images.py @@ -33,28 +33,35 @@ PYTHON_VERSIONS = ["3.6", "3.7", "3.8", "3.9"] -GHCR_IO_PREFIX = "ghcr.io/apache/airflow" +GHCR_IO_PREFIX = "ghcr.io" + GHCR_IO_IMAGES = [ - "{prefix}/{branch}/ci-manifest/python{python_version}:latest", - "{prefix}/{branch}/ci/python{python_version}:latest", - "{prefix}/{branch}/prod-build/python{python_version}-build:latest", - "{prefix}/{branch}/prod/python{python_version}-build:latest", - "{prefix}/{branch}/python:{python_version}-slim-buster", + "{prefix}/{repo}/{branch}/ci-manifest/python{python_version}:latest", + "{prefix}/{repo}/{branch}/ci/python{python_version}:latest", + "{prefix}/{repo}/{branch}/prod-build/python{python_version}:latest", + "{prefix}/{repo}/{branch}/prod/python{python_version}:latest", + "{prefix}/{repo}/{branch}/python:{python_version}-slim-buster", ] # noinspection StrFormat def pull_push_all_images( - source_prefix: str, target_prefix: str, images: List[str], source_branch: str, target_branch: str + source_prefix: str, + target_prefix: str, + images: List[str], + source_branch: str, + source_repo: str, + target_branch: str, + target_repo: str, ): for python_version in PYTHON_VERSIONS: for image in images: source_image = image.format( - prefix=source_prefix, branch=source_branch, python_version=python_version + prefix=source_prefix, branch=source_branch, repo=source_repo, python_version=python_version ) target_image = image.format( - prefix=target_prefix, branch=target_branch, python_version=python_version + prefix=target_prefix, branch=target_branch, repo=target_repo, python_version=python_version ) print(f"Copying image: {source_image} -> {target_image}") subprocess.run(["docker", "pull", source_image], check=True) @@ -65,11 +72,17 @@ def pull_push_all_images( @click.group(invoke_without_command=True) @click.option("--source-branch", type=str, default="main", help="Source branch name [main]") @click.option("--target-branch", type=str, default="main", help="Target branch name [main]") +@click.option("--source-repo", type=str, default="apache/airflow", help="Source repo") +@click.option("--target-repo", type=str, default="apache/airflow", help="Target repo") def main( source_branch: str, target_branch: str, + source_repo: str, + target_repo: str, ): - pull_push_all_images(GHCR_IO_PREFIX, GHCR_IO_PREFIX, GHCR_IO_IMAGES, source_branch, target_branch) + pull_push_all_images( + GHCR_IO_PREFIX, GHCR_IO_PREFIX, GHCR_IO_IMAGES, source_branch, source_repo, target_branch, target_repo + ) if __name__ == "__main__": From 3af9ffdf274df161e26481a9bef1d64429a9a54e Mon Sep 17 00:00:00 2001 From: Malthe Borch Date: Wed, 24 Nov 2021 18:54:38 +0100 Subject: [PATCH 124/250] Upload provider distribution artifacts during CI (#19807) (cherry picked from commit 5e78c2c3cc94da2328c3f664f80f00b86489e512) --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 36d4e8252983f..9002b284453fc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -626,6 +626,12 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" env: USE_AIRFLOW_VERSION: "sdist" PACKAGE_FORMAT: "sdist" + - name: "Upload provider distribution 
artifacts" + uses: actions/upload-artifact@v2 + with: + name: airflow-provider-packages + path: "./dist/apache-airflow-providers-*.tar.gz" + retention-days: 1 tests-helm: timeout-minutes: 80 From dfde1bac9245ac441869beb9853063fc5048ee91 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 24 Nov 2021 19:36:26 +0100 Subject: [PATCH 125/250] Bring back Core and Other tests to be run in parallel (#19812) After merging #19809 we can very likely come back to parallel running of Core and Other tests as we separated them out thinking that the parallel runs were the cause of the problems. Those tests should be perfectly fine to run in parallel now. (cherry picked from commit 6c80149d0abf84caec8f4c1b4e8795ea5923f89a) --- scripts/ci/testing/ci_run_airflow_testing.sh | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/scripts/ci/testing/ci_run_airflow_testing.sh b/scripts/ci/testing/ci_run_airflow_testing.sh index 8ee56921a6bf4..c3bee5e131d30 100755 --- a/scripts/ci/testing/ci_run_airflow_testing.sh +++ b/scripts/ci/testing/ci_run_airflow_testing.sh @@ -88,18 +88,9 @@ function run_all_test_types_in_parallel() { echo "${COLOR_YELLOW}Heavy tests will be run sequentially after parallel tests including cleaning up docker between tests${COLOR_RESET}" echo "" if [[ ${test_types_to_run} == *"Integration"* ]]; then + echo "${COLOR_YELLOW}Remove Integration from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" test_types_to_run="${test_types_to_run//Integration/}" - if [[ ${BACKEND} == "mssql" ]]; then - # Also for mssql we skip Integration tests altogether on Public Runners. Mssql uses far - # too much memory and often shuts down and similarly as in case of Providers tests, - # there is no need to run them also for MsSQL engine as those integration tests - # are not really using any metadata-specific behaviour. - # Those tests will run in `main` anyway. - echo "${COLOR_YELLOW}Do not run integration tests for mssql in small systems due to memory issues.${COLOR_RESET}" - else - echo "${COLOR_YELLOW}Remove Integration from tests_types_to_run and add them to sequential tests due to low memory.${COLOR_RESET}" - sequential_tests+=("Integration") - fi + sequential_tests+=("Integration") fi if [[ ${BACKEND} == "mssql" || ${BACKEND} == "mysql" ]]; then # For mssql/mysql - they take far more memory than postgres (or sqlite) - we skip the Provider From b96093d08f672b08d43236dfaeca6ccfc9b485ca Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 24 Nov 2021 18:25:06 +0100 Subject: [PATCH 126/250] Fix OOM error in tests when using public Github Runners. (#19809) There was a side effect caused by th TestStandardRunner test that caused broken logging configuration, which in turn created OutOfMemory condition for our Public GitHubRunners. The problem was that the test overrode the configuration of logging with some simple test configuration, but never restored the default configuration, which resulted in airflow.processor logger that was created before contain empty handlers. Since the airflow.processor logger has "propagate" set to False, empty handlers normally cause a lastResort handler call, which by default redirects everything to Stderr and this is what happened in DagFile Processor tests. However, DagFileProcessor uses stderr_redirect which replaces sys.stderr with provided stream. 
In this case however the stream set (StreamLogWriter) redirected the output to "airflow.processor" logger - which in turn (as last resort) redirected everything to sys.stderr which in turn redirected everything to "airflow.processor" logger etc. This resulted in: * OOM condition in Public GitHub Runners * DagFileProcessor failing with exceeded recursion depth when there was enough memory to get there. The condition was triggered by two preceding tests: 1) First test_plugins_manger.py initialized logging for airflow.processor and stored it in logging manager 2) The TestStandardTaskRunner test applied simpler configuration but the way configure() works - it did not remove the "airflow.processor" logger, but it REMOVED all handlers registered for it - and never restored the default configuration 3) The DagFileProcessor logs caused infinite recursion The fix is two-fold: * the TestStandardTaskRunner restores default config after test * the DagFileProcessor sets default config before starting (cherry picked from commit 14bff79bf271cd63cc6e3b98dc4aa232001472cb) --- tests/dag_processing/test_manager.py | 3 +++ tests/task/task_runner/test_standard_task_runner.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tests/dag_processing/test_manager.py b/tests/dag_processing/test_manager.py index 78921b2b66415..b549b2b1e129a 100644 --- a/tests/dag_processing/test_manager.py +++ b/tests/dag_processing/test_manager.py @@ -26,6 +26,7 @@ import threading import unittest from datetime import datetime, timedelta +from logging.config import dictConfig from tempfile import TemporaryDirectory from textwrap import dedent from unittest import mock @@ -34,6 +35,7 @@ import pytest from freezegun import freeze_time +from airflow.config_templates.airflow_local_settings import DEFAULT_LOGGING_CONFIG from airflow.configuration import conf from airflow.dag_processing.manager import ( DagFileProcessorAgent, @@ -111,6 +113,7 @@ def waitable_handle(self): class TestDagFileProcessorManager: def setup_method(self): + dictConfig(DEFAULT_LOGGING_CONFIG) clear_db_runs() clear_db_serialized_dags() clear_db_dags() diff --git a/tests/task/task_runner/test_standard_task_runner.py b/tests/task/task_runner/test_standard_task_runner.py index abd9f7103ef1e..34054106f68e8 100644 --- a/tests/task/task_runner/test_standard_task_runner.py +++ b/tests/task/task_runner/test_standard_task_runner.py @@ -24,6 +24,7 @@ import psutil import pytest +from airflow.config_templates.airflow_local_settings import DEFAULT_LOGGING_CONFIG from airflow.jobs.local_task_job import LocalTaskJob from airflow.models.dagbag import DagBag from airflow.models.taskinstance import TaskInstance @@ -70,6 +71,7 @@ def logging_and_db(self): airflow_logger = logging.getLogger('airflow') airflow_logger.handlers = [] clear_db_runs() + dictConfig(DEFAULT_LOGGING_CONFIG) def test_start_and_terminate(self): local_task_job = mock.Mock() From 59177190d01a722d3cc49d9b8c06f6daa1ac2ad3 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Sat, 11 Dec 2021 09:04:35 -0700 Subject: [PATCH 127/250] Update docs/tools for releasing core Airflow (#20211) When building the "testing status" issue, don't include things skipped on the changelog or doc-only changes. Also, don't add skipped changelog entries in the changelog. 
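For reference, the issue content itself is produced by `dev/prepare_release_issue.py` (added earlier in this series); a typical invocation, with placeholder release references, would be something like:

```shell
# Generate the body of the "testing status" issue for a release candidate.
# Doc-only PRs and PRs labelled changelog:skip are now left out.
./dev/prepare_release_issue.py generate-issue-content \
    --previous-release 2.2.2 \
    --current-release 2.2.3rc1 \
    --github-token "${GITHUB_TOKEN}"
```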
(cherry picked from commit 993ed933e95970d14e0b0b5659ad28f15a0e5fde) --- dev/README_RELEASE_AIRFLOW.md | 4 ++-- dev/airflow-github | 9 ++++++++- dev/prepare_release_issue.py | 6 ++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index a209f2a4a10be..f10306b2ee380 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -160,7 +160,7 @@ For now this is done manually, example run `git log --oneline v2-2-test..HEAD - ```shell script git checkout origin/constraints-${VERSION_BRANCH} - git tag -s "constraints-${VERSION}" + git tag -s "constraints-${VERSION}" -m "Constraints for Apache Airflow ${VERSION}" git push origin "constraints-${VERSION}" ``` @@ -300,7 +300,7 @@ protected_branches: ## Prepare PyPI convenience "snapshot" packages -At this point we have the artefact that we vote on, but as a convenience to developers we also want to +At this point we have the artifact that we vote on, but as a convenience to developers we also want to publish "snapshots" of the RC builds to PyPI for installing via pip: To do this we need to diff --git a/dev/airflow-github b/dev/airflow-github index 75a37633453d3..2a09a72f64926 100755 --- a/dev/airflow-github +++ b/dev/airflow-github @@ -72,6 +72,8 @@ def get_issue_type(issue): for label in issue.labels: if label.name.startswith(label_prefix): return label.name.replace(label_prefix, "").strip() + if label.name == "changelog:skip": + return "(skip)" return issue_type @@ -107,6 +109,8 @@ def is_pr(issue: Issue) -> bool: def print_changelog(sections): for section, lines in sections.items(): + if section == "(skip)": + continue print(section) print('"' * len(section)) for line in lines: @@ -133,7 +137,7 @@ def cli(): help="Specify the previous tag on the working branch to limit" " searching for few commits to find the cherry-picked commits", ) -@click.option('--unmerged', 'show_uncherrypicked_only', help="Show unmerged issues only", is_flag=True) +@click.option('--unmerged', 'show_uncherrypicked_only', help="Show unmerged PRs only", is_flag=True) def compare(target_version, github_token, previous_version=None, show_uncherrypicked_only=False): repo = git.Repo(".", search_parent_directories=True) @@ -174,6 +178,9 @@ def compare(target_version, github_token, previous_version=None, show_uncherrypi if show_uncherrypicked_only: continue cherrypicked = click.style("Yes".ljust(6), "green") + elif not issue_is_pr and show_uncherrypicked_only: + # Don't show issues when looking for unmerged PRs + continue elif issue_is_pr: num_uncherrypicked[status] += 1 cherrypicked = click.style("No".ljust(6), "red") diff --git a/dev/prepare_release_issue.py b/dev/prepare_release_issue.py index c98684d19d200..f744da7d5a074 100755 --- a/dev/prepare_release_issue.py +++ b/dev/prepare_release_issue.py @@ -258,6 +258,12 @@ def generate_issue_content( except UnknownObjectException: console.print(f"[red]The PR #{pr_number} could not be found[/]") continue + + # Ignore doc-only and skipped PRs + label_names = [label.name for label in pr.labels] + if "type:doc-only" in label_names or "changelog:skip" in label_names: + continue + pull_requests[pr_number] = pr # GitHub does not have linked issues in PR - but we quite rigorously add Fixes/Closes # Relate so we can find those from the body From 3162de5109f1eb25a8c419654cc9a827983eff1a Mon Sep 17 00:00:00 2001 From: Josh Fell <48934154+josh-fell@users.noreply.github.com> Date: Mon, 13 Dec 2021 14:01:25 -0500 Subject: [PATCH 128/250] Fix MyPy 
errors in `dev/*` (#20261) (cherry picked from commit 08e835729b50cac2a68fab24bf2b52a587112776) --- dev/import_all_classes.py | 4 ++-- dev/prepare_release_issue.py | 16 ++++++++----- .../prepare_provider_packages.py | 24 +++++++++++++------ ...validate_version_added_fields_in_config.py | 8 +++---- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/dev/import_all_classes.py b/dev/import_all_classes.py index 67a76c840a1bf..fa42f400b8f1d 100755 --- a/dev/import_all_classes.py +++ b/dev/import_all_classes.py @@ -22,7 +22,7 @@ import traceback import warnings from inspect import isclass -from typing import List, Set, Tuple +from typing import List, Optional, Set, Tuple from warnings import WarningMessage from rich import print @@ -31,7 +31,7 @@ def import_all_classes( paths: List[str], prefix: str, - provider_ids: List[str] = None, + provider_ids: Optional[List[str]] = None, print_imports: bool = False, print_skips: bool = False, ) -> Tuple[List[str], List[WarningMessage]]: diff --git a/dev/prepare_release_issue.py b/dev/prepare_release_issue.py index f744da7d5a074..3be78fd1cb8f5 100755 --- a/dev/prepare_release_issue.py +++ b/dev/prepare_release_issue.py @@ -22,7 +22,7 @@ import subprocess import textwrap from collections import defaultdict -from typing import Any, Dict, List, NamedTuple, Optional, Set +from typing import Any, Dict, List, NamedTuple, Optional, Set, Union import click from github import Github, Issue, PullRequest, UnknownObjectException @@ -33,6 +33,8 @@ console = Console(width=400, color_system="standard") +PullRequestOrIssue = Union[PullRequest.PullRequest, Issue.Issue] + MY_DIR_PATH = os.path.dirname(__file__) SOURCE_DIR_PATH = os.path.abspath(os.path.join(MY_DIR_PATH, os.pardir)) PR_PATTERN = re.compile(r".*\(#([0-9]+)\)") @@ -183,7 +185,7 @@ def render_template( def print_issue_content( current_release: str, - pull_requests: Dict[int, PullRequest.PullRequest], + pull_requests: Dict[int, PullRequestOrIssue], linked_issues: Dict[int, List[Issue.Issue]], users: Dict[int, Set[str]], ): @@ -231,12 +233,12 @@ def generate_issue_content( else: excluded_prs = [] changes = get_changes(verbose, previous_release, current_release) - prs = list( - filter(lambda pr: pr is not None and pr not in excluded_prs, [change.pr for change in changes]) - ) + change_prs = [change.pr for change in changes] + prs = [pr for pr in change_prs if pr is not None and pr not in excluded_prs] + g = Github(github_token) repo = g.get_repo("apache/airflow") - pull_requests: Dict[int, PullRequest.PullRequest] = {} + pull_requests: Dict[int, PullRequestOrIssue] = {} linked_issues: Dict[int, List[Issue.Issue]] = defaultdict(lambda: []) users: Dict[int, Set[str]] = defaultdict(lambda: set()) count_prs = len(prs) @@ -249,6 +251,8 @@ def generate_issue_content( progress.console.print( f"Retrieving PR#{pr_number}: " f"https://github.com/apache/airflow/pull/{pr_number}" ) + + pr: PullRequestOrIssue try: pr = repo.get_pull(pr_number) except UnknownObjectException: diff --git a/dev/provider_packages/prepare_provider_packages.py b/dev/provider_packages/prepare_provider_packages.py index 82c2236fe810c..9c60574253b0a 100755 --- a/dev/provider_packages/prepare_provider_packages.py +++ b/dev/provider_packages/prepare_provider_packages.py @@ -40,11 +40,11 @@ from os.path import dirname, relpath from pathlib import Path from shutil import copyfile -from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple, Type +from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple, 
Type, Union import click import jsonschema -from github import Github, PullRequest, UnknownObjectException +from github import Github, Issue, PullRequest, UnknownObjectException from packaging.version import Version from rich.console import Console from rich.progress import Progress @@ -1286,7 +1286,7 @@ def get_all_changes_for_package( provider_package_id: str, source_provider_package_path: str, verbose: bool, -) -> Tuple[bool, Optional[List[List[Change]]], str]: +) -> Tuple[bool, Optional[Union[List[List[Change]], Change]], str]: """ Retrieves all changes for the package. :param versions: list of versions @@ -1576,7 +1576,14 @@ def update_release_notes( return False else: if interactive and confirm("Are those changes documentation-only?"): - mark_latest_changes_as_documentation_only(provider_details, latest_change) + if isinstance(latest_change, Change): + mark_latest_changes_as_documentation_only(provider_details, latest_change) + else: + raise ValueError( + "Expected only one change to be present to mark changes " + f"in provider {provider_package_id} as docs-only. " + f"Received {len(latest_change)}." + ) return False jinja_context["DETAILED_CHANGES_RST"] = changes @@ -1773,7 +1780,7 @@ def get_all_providers() -> List[str]: return list(PROVIDERS_REQUIREMENTS.keys()) -def verify_provider_package(provider_package_id: str) -> str: +def verify_provider_package(provider_package_id: str) -> None: """ Verifies if the provider package is good. :param provider_package_id: package id to verify @@ -2383,9 +2390,12 @@ def get_prs_for_package(package_id: str) -> List[int]: return prs +PullRequestOrIssue = Union[PullRequest.PullRequest, Issue.Issue] + + class ProviderPRInfo(NamedTuple): provider_details: ProviderPackageDetails - pr_list: List[PullRequest.PullRequest] + pr_list: List[PullRequestOrIssue] @cli.command() @@ -2411,7 +2421,7 @@ def generate_issue_content(package_ids: List[str], github_token: str, suffix: st all_prs.update(provider_prs[package_id]) g = Github(github_token) repo = g.get_repo("apache/airflow") - pull_requests: Dict[int, PullRequest.PullRequest] = {} + pull_requests: Dict[int, PullRequestOrIssue] = {} with Progress(console=console) as progress: task = progress.add_task(f"Retrieving {len(all_prs)} PRs ", total=len(all_prs)) pr_list = list(all_prs) diff --git a/dev/validate_version_added_fields_in_config.py b/dev/validate_version_added_fields_in_config.py index e02e2fe2429e5..d48ff1e82125f 100755 --- a/dev/validate_version_added_fields_in_config.py +++ b/dev/validate_version_added_fields_in_config.py @@ -71,16 +71,16 @@ def read_local_config_options() -> Set[Tuple[str, str, str]]: to_check_versions: List[str] = [d for d in airflow_version if d.startswith("2.")] # 2. Compute expected options set with version added fields -computed_options: Set[Tuple[str, str, str]] = set() +expected_computed_options: Set[Tuple[str, str, str]] = set() for prev_version, curr_version in zip(to_check_versions[:-1], to_check_versions[1:]): print("Processing version:", curr_version) options_1 = fetch_config_options_for_version(prev_version) options_2 = fetch_config_options_for_version(curr_version) new_options = options_2 - options_1 - computed_options.update( + expected_computed_options.update( {(section_name, option_name, curr_version) for section_name, option_name in new_options} ) -print("Computed options count:", len(computed_options)) +print("Expected computed options count:", len(expected_computed_options)) # 3. 
Read local options set local_options = read_local_config_options() @@ -92,7 +92,7 @@ def read_local_config_options() -> Set[Tuple[str, str, str]]: } computed_options: Set[Tuple[str, str, str]] = { (section_name, option_name, version_added) - for section_name, option_name, version_added in computed_options + for section_name, option_name, version_added in expected_computed_options if (section_name, option_name) in local_options_plain } print("Visible computed options count:", len(computed_options)) From 3087dd283d98b81b96bc227b9b1b375a35c1d0f2 Mon Sep 17 00:00:00 2001 From: Josh Fell <48934154+josh-fell@users.noreply.github.com> Date: Tue, 14 Dec 2021 06:31:47 -0500 Subject: [PATCH 129/250] Fix MyPy errors in `scripts/in_container` (#20280) (cherry picked from commit 96212cb8f5e7e1e8caba18d92f58755a33b07a67) --- scripts/in_container/check_junitxml_result.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100755 scripts/in_container/check_junitxml_result.py diff --git a/scripts/in_container/check_junitxml_result.py b/scripts/in_container/check_junitxml_result.py new file mode 100755 index 0000000000000..7381904d35aba --- /dev/null +++ b/scripts/in_container/check_junitxml_result.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +import xml.etree.ElementTree as ET + +TEXT_RED = '\033[31m' +TEXT_GREEN = '\033[32m' +TEXT_RESET = '\033[0m' + +if __name__ == '__main__': + fname = sys.argv[1] + try: + with open(fname) as fh: + root = ET.parse(fh) + testsuite = root.find('.//testsuite') + if testsuite: + num_failures = testsuite.get('failures') + num_errors = testsuite.get('errors') + if num_failures == "0" and num_errors == "0": + print(f'\n{TEXT_GREEN}==== No errors, no failures. Good to go! ===={TEXT_RESET}\n') + sys.exit(0) + else: + print( + f'\n{TEXT_RED}==== Errors: {num_errors}, Failures: {num_failures}. ' + f'Failing the test! ===={TEXT_RESET}\n' + ) + sys.exit(1) + else: + print( + f'\n{TEXT_RED}==== The testsuite element does not exist in file {fname!r}. ' + f'Cannot evaluate status of the test! ===={TEXT_RESET}\n' + ) + sys.exit(1) + except Exception as e: + print( + f'\n{TEXT_RED}==== There was an error when parsing the junitxml file.' 
+ f' Likely the file was corrupted ===={TEXT_RESET}\n' + ) + print(f'\n{TEXT_RED}==== Error: {e} {TEXT_RESET}\n') + sys.exit(2) From b43fb3d9be4bac809183aef95df96c678c387664 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 15 Dec 2021 16:51:28 -0700 Subject: [PATCH 130/250] Only list linked issues once in release issues (#20299) (cherry picked from commit 58464d830eb018575033a4a88c92facc7cd41e9a) --- dev/prepare_release_issue.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dev/prepare_release_issue.py b/dev/prepare_release_issue.py index 3be78fd1cb8f5..2d4c175bb0714 100755 --- a/dev/prepare_release_issue.py +++ b/dev/prepare_release_issue.py @@ -273,8 +273,10 @@ def generate_issue_content( # Relate so we can find those from the body if pr.body: body = pr.body.replace("\n", " ").replace("\r", " ") - for issue_match in ISSUE_MATCH_IN_BODY.finditer(body): - linked_issue_number = int(issue_match.group(1)) + linked_issue_numbers = { + int(issue_match.group(1)) for issue_match in ISSUE_MATCH_IN_BODY.finditer(body) + } + for linked_issue_number in linked_issue_numbers: progress.console.print( f"Retrieving Linked issue PR#{linked_issue_number}: " f"https://github.com/apache/airflow/issue/{linked_issue_number}" From 92ef164624fd4068aa4becf2052f14d8d0dffd1b Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 21 Dec 2021 19:28:28 +0100 Subject: [PATCH 131/250] Add exiting on error in prod image script (#20447) The script did not fail but continued on error which might have resulted in one or more images missing. Adding `set -e` fixes it. (cherry picked from commit 52f3c7ca679350ccc60dfb5b0c5cd9c98e2d7e23) --- dev/prepare_prod_docker_images.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/prepare_prod_docker_images.sh b/dev/prepare_prod_docker_images.sh index c2a3ad2f13f27..dfccd9143f194 100755 --- a/dev/prepare_prod_docker_images.sh +++ b/dev/prepare_prod_docker_images.sh @@ -18,6 +18,8 @@ AIRFLOW_SOURCES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)" export AIRFLOW_SOURCES_DIR +set -e + CURRENT_PYTHON_MAJOR_MINOR_VERSIONS=("3.7" "3.8" "3.9" "3.6") usage() { From ec08a5c1a639dad70a2ca63a34b116f3e1f1c16e Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 21 Dec 2021 18:21:57 +0100 Subject: [PATCH 132/250] Add possibility to ignore common deprecated message (#20444) There are some cases where deprecation of a commonly used dependency causes deprecation message in multiple dependencies. This happened in December 2021 with distutils deprecation. The disutil deprecation started to appear as new versions of multiple packages were released. This change adds such "common" deprecation messages that should be filtered out independently where they were generated. (cherry picked from commit daeeb7d401cd30063ca3de3bf5153e8ffb3741b6) --- .../prepare_provider_packages.py | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/dev/provider_packages/prepare_provider_packages.py b/dev/provider_packages/prepare_provider_packages.py index 9c60574253b0a..0b534c25379cb 100755 --- a/dev/provider_packages/prepare_provider_packages.py +++ b/dev/provider_packages/prepare_provider_packages.py @@ -1238,7 +1238,7 @@ def validate_provider_info_with_runtime_schema(provider_info: Dict[str, Any]) -> console.print("[red]Provider info not validated against runtime schema[/]") raise Exception( "Error when validating schema. 
The schema must be compatible with " - + "airflow/provider_info.schema.json.", + "airflow/provider_info.schema.json.", ex, ) @@ -2049,6 +2049,10 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin console.print("[yellow]There are two cases that are legitimate deprecation warnings though:[/]") console.print("[yellow] 1) when you deprecate whole module or class and replace it in provider[/]") console.print("[yellow] 2) when 3rd-party module generates Deprecation and you cannot upgrade it[/]") + console.print( + "[yellow] 3) when many 3rd-party module generates same Deprecation warning that " + "comes from another common library[/]" + ) console.print() console.print( "[yellow]In case 1), add the deprecation message to " @@ -2058,6 +2062,10 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin "[yellow]In case 2), add the deprecation message together with module it generates to " "the KNOWN_DEPRECATED_MESSAGES in prepare_provider_packages.py[/]" ) + console.print( + "[yellow]In case 3), add the deprecation message to " + "the KNOWN_COMMON_DEPRECATED_MESSAGES in prepare_provider_packages.py[/]" + ) console.print() raise_error = True else: @@ -2117,11 +2125,28 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin ("The module cloudant is now deprecated. The replacement is ibmcloudant.", "cloudant"), } +KNOWN_COMMON_DEPRECATED_MESSAGES: Set[str] = { + "distutils Version classes are deprecated. Use packaging.version instead." +} + # The set of warning messages generated by direct importing of some deprecated modules. We should only # ignore those messages when the warnings are generated directly by importlib - which means that # we imported it directly during module walk by the importlib library KNOWN_DEPRECATED_DIRECT_IMPORTS: Set[str] = { + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.batch`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.container_instance`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.container_registry`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.container_volume`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.cosmos`.", "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.data_factory`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.data_lake`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.hooks.fileshare`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.operators.batch`.", + "This module is deprecated. " + "Please use `airflow.providers.microsoft.azure.operators.container_instances`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.operators.cosmos`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.secrets.key_vault`.", + "This module is deprecated. Please use `airflow.providers.microsoft.azure.sensors.cosmos`.", "This module is deprecated. Please use `airflow.providers.amazon.aws.hooks.dynamodb`.", "This module is deprecated. Please use `airflow.providers.microsoft.azure.transfers.local_to_wasb`.", "This module is deprecated. 
Please use `airflow.providers.tableau.operators.tableau_refresh_workbook`.", @@ -2131,6 +2156,24 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin "This module is deprecated. Please use `kubernetes.client.models.V1VolumeMount`.", 'numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header,' ' got 216 from PyObject', + "This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.step_function`.", + "This module is deprecated. Please use `airflow.providers.amazon.aws.operators.step_function`.", + 'This module is deprecated. Please use `airflow.providers.amazon.aws.operators.ec2`.', + 'This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.ec2`.', + "This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.s3`.", + "This module is deprecated. Please use `airflow.providers.amazon.aws.operators.s3`.", + "This module is deprecated. Please use `airflow.providers.amazon.aws.operators.dms`.", + "This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.dms`.", + 'This module is deprecated. Please use `airflow.providers.amazon.aws.operators.emr`.', + 'This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.emr`.', + "This module is deprecated. Please use `airflow.providers.amazon.aws.hooks.redshift_cluster` " + "or `airflow.providers.amazon.aws.hooks.redshift_sql` as appropriate.", + "This module is deprecated. Please use `airflow.providers.amazon.aws.operators.redshift_sql` " + "or `airflow.providers.amazon.aws.operators.redshift_cluster` as appropriate.", + "This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.redshift_cluster`.", + 'This module is deprecated. Please use `airflow.providers.amazon.aws.operators.sagemaker`.', + 'This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.sagemaker`.', + 'This module is deprecated. 
Please use `airflow.providers.amazon.aws.hooks.emr`.', } @@ -2151,6 +2194,14 @@ def filter_direct_importlib_warning(warn: warnings.WarningMessage) -> bool: return True +def filter_known_common_deprecated_messages(warn: warnings.WarningMessage) -> bool: + msg_string = str(warn.message).replace("\n", " ") + for m in KNOWN_COMMON_DEPRECATED_MESSAGES: + if msg_string == m: + return False + return True + + @cli.command() def verify_provider_classes(): """Verifies names for all provider classes.""" @@ -2172,6 +2223,7 @@ def verify_provider_classes(): bad += inc_bad warns = list(filter(filter_known_warnings, warns)) warns = list(filter(filter_direct_importlib_warning, warns)) + warns = list(filter(filter_known_common_deprecated_messages, warns)) if not summarise_total_vs_bad_and_warnings(total, bad, warns): sys.exit(1) From ce55c30a8b74894a024658cb11c3634c8bc8f429 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Wed, 22 Dec 2021 08:38:27 -0700 Subject: [PATCH 133/250] Update the "releasing Airflow" docs (#20456) (cherry picked from commit 81f92d6c321992905d239bb9e8556720218fe745) --- dev/README_RELEASE_AIRFLOW.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index f10306b2ee380..aa5d7389f1e91 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -295,7 +295,7 @@ protected_branches: export BRANCH_PREFIX=2-1 git checkout constraints-main git checkout -b constraints-${BRANCH_PREFIX} - git push origin constraints-${BRANCH_PREFIX} + git push origin tag constraints-${BRANCH_PREFIX} ``` ## Prepare PyPI convenience "snapshot" packages @@ -352,7 +352,7 @@ is not supposed to be used by and advertised to the end-users who do not read th (both airflow and latest provider packages). ```shell script - git push origin ${VERSION} + git push origin tag ${VERSION} ``` ## Prepare production Docker Image @@ -721,8 +721,8 @@ The best way of doing this is to svn cp between the two repos (this avoids havin ```shell script # GO to Airflow Sources first -cd -export AIRFLOW_SOURCES=$(pwd) +cd +export AIRFLOW_REPO_ROOT=$(pwd) # GO to Checked out DEV repo. 
Should be checked out before via: # svn checkout https://dist.apache.org/repos/dist/dev/airflow airflow-dev @@ -761,8 +761,7 @@ Verify that the packages appear in [airflow](https://dist.apache.org/repos/dist/ ## Prepare PyPI "release" packages -At this point we release an official package (they should be copied and renamed from the -previously released RC candidates in "${AIRFLOW_SOURCES}/dist": +At this point we release an official package: - Verify the artifacts that would be uploaded: @@ -794,7 +793,7 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": cd "${AIRFLOW_REPO_ROOT}" git checkout constraints-${RC} git tag -s "constraints-${VERSION}" -m "Constraints for Apache Airflow ${VERSION}" - git push origin "constraints-${VERSION}" + git push origin tag "constraints-${VERSION}" ``` - Push Tag for the final version @@ -806,7 +805,7 @@ previously released RC candidates in "${AIRFLOW_SOURCES}/dist": ```shell script git checkout ${RC} git tag -s ${VERSION} -m "Apache Airflow ${VERSION}" - git push origin ${VERSION} + git push origin tag ${VERSION} ``` ## Manually prepare production Docker Image @@ -891,8 +890,7 @@ We also made this version available on PyPI for convenience: \`pip install apache-airflow\` https://pypi.org/project/apache-airflow/${VERSION}/ -The documentation is available on: -https://airflow.apache.org/ +The documentation is available at: https://airflow.apache.org/docs/apache-airflow/${VERSION}/ Find the CHANGELOG here for more details: @@ -968,6 +966,8 @@ Update the values of `airflowVersion`, `defaultAirflowTag` and `appVersion` in t will use the latest released version. You'll need to update `chart/values.yaml`, `chart/values.schema.json` and `chart/Chart.yaml`. +Also add a note to `UPDATING.rst` that the default version of Airflow has changed. + ## Update airflow/config_templates/config.yml file File `airflow/config_templates/config.yml` contains documentation on all configuration options available in Airflow. The `version_added` fields must be updated when a new Airflow version is released. From e02aa46a5a208ec8f7b1426107b9f38216906e40 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Thu, 30 Dec 2021 22:13:10 +0100 Subject: [PATCH 134/250] Generate version documentation from single source of truth (#20594) We used to maintain supported versions separately in the docs and it led to discrepancies. 
Now we have single source of truth which is used to generate it automatically with pre-commits (cherry picked from commit 18b2ca4ecf52e8c4d0c39923aa55fe29ea2d6c07) --- .pre-commit-config.yaml | 7 +++ BREEZE.rst | 2 +- README.md | 19 +++--- STATIC_CODE_CHECKS.rst | 2 + breeze-complete | 1 + dev/README_RELEASE_AIRFLOW.md | 1 + .../installation/supported-versions.rst | 28 +++++---- scripts/ci/pre_commit/supported_versions.py | 63 +++++++++++++++++++ 8 files changed, 102 insertions(+), 21 deletions(-) create mode 100755 scripts/ci/pre_commit/supported_versions.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 798237f3f5883..14fa4830742db 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -338,6 +338,13 @@ repos: files: ^Dockerfile$ pass_filenames: false additional_dependencies: ['rich'] + - id: update-supported-versions + name: Updates supported versions in documentation + entry: ./scripts/ci/pre_commit/supported_versions.py + language: python + files: ^scripts/ci/pre_commit/supported_versions.py$|^README.md$|^docs/apache-airflow/supported-versions.rst$ + pass_filenames: false + additional_dependencies: ['tabulate'] - id: update-version name: Update version to the latest version in the documentation entry: ./scripts/ci/pre_commit/pre_commit_update_versions.py diff --git a/BREEZE.rst b/BREEZE.rst index 823ab5b7d4f48..4d809a2455551 100644 --- a/BREEZE.rst +++ b/BREEZE.rst @@ -2208,7 +2208,7 @@ This is the current syntax for `./breeze <./breeze>`_: pyupgrade restrict-start_date rst-backticks setup-order setup-extra-packages shellcheck sort-in-the-wild sort-spelling-wordlist stylelint trailing-whitespace ui-lint update-breeze-file update-extras update-local-yml-file update-setup-cfg-file - update-versions verify-db-migrations-documented version-sync www-lint yamllint yesqa + update-supported-versions update-versions verify-db-migrations-documented version-sync www-lint yamllint yesqa You can pass extra arguments including options to the pre-commit framework as passed after --. For example: diff --git a/README.md b/README.md index 2534c94e9ff8d..35243f0e03c95 100644 --- a/README.md +++ b/README.md @@ -261,13 +261,18 @@ packages: Apache Airflow version life cycle: -| Version | Current Patch/Minor | State | First Release | Limited Support | EOL/Terminated | -|---------|---------------------|-----------|---------------|-----------------|----------------| -| 2 | 2.2.4 | Supported | Dec 17, 2020 | TBD | TBD | -| 1.10 | 1.10.15 | EOL | Aug 27, 2018 | Dec 17, 2020 | June 17, 2021 | -| 1.9 | 1.9.0 | EOL | Jan 03, 2018 | Aug 27, 2018 | Aug 27, 2018 | -| 1.8 | 1.8.2 | EOL | Mar 19, 2017 | Jan 03, 2018 | Jan 03, 2018 | -| 1.7 | 1.7.1.2 | EOL | Mar 28, 2016 | Mar 19, 2017 | Mar 19, 2017 | + + + +| Version | Current Patch/Minor | State | First Release | Limited Support | EOL/Terminated | +|-----------|-----------------------|-----------|-----------------|-------------------|------------------| +| 2 | 2.2.4 | Supported | Dec 17, 2020 | TBD | TBD | +| 1.10 | 1.10.15 | EOL | Aug 27, 2018 | Dec 17, 2020 | June 17, 2021 | +| 1.9 | 1.9.0 | EOL | Jan 03, 2018 | Aug 27, 2018 | Aug 27, 2018 | +| 1.8 | 1.8.2 | EOL | Mar 19, 2017 | Jan 03, 2018 | Jan 03, 2018 | +| 1.7 | 1.7.1.2 | EOL | Mar 28, 2016 | Mar 19, 2017 | Mar 19, 2017 | + + Limited support versions will be supported with security and critical bug fix only. EOL versions will not get any fixes nor support. 
diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index 9afca67a9669c..c46d30cc95f66 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -270,6 +270,8 @@ require Breeze Docker images to be installed locally. ------------------------------------ ---------------------------------------------------------------- ------------ ``update-setup-cfg-file`` Update setup.cfg file with all licenses ------------------------------------ ---------------------------------------------------------------- ------------ +``update-supported-versions`` Updates supported versions in documentation +------------------------------------ ---------------------------------------------------------------- ------------ ``update-versions`` Updates latest versions in the documentation ------------------------------------ ---------------------------------------------------------------- ------------ ``verify-db-migrations-documented`` Verify DB Migrations have been documented diff --git a/breeze-complete b/breeze-complete index b36b4880f4c0b..8aeba8450dee6 100644 --- a/breeze-complete +++ b/breeze-complete @@ -147,6 +147,7 @@ update-breeze-file update-extras update-local-yml-file update-setup-cfg-file +update-supported-versions update-versions verify-db-migrations-documented version-sync diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index aa5d7389f1e91..41fcd200c4b1a 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -957,6 +957,7 @@ EOF This includes: +- Modify `./scripts/ci/pre-commit/supported_versions.py` and let pre-commit do the job - Sync `CHANGELOG.txt`, `UPDATING.md` and `README.md` changes - Updating issue templates in `.github/ISSUE_TEMPLATE/` with the new version diff --git a/docs/apache-airflow/installation/supported-versions.rst b/docs/apache-airflow/installation/supported-versions.rst index c4a2a26df18dc..3e48523b05dbf 100644 --- a/docs/apache-airflow/installation/supported-versions.rst +++ b/docs/apache-airflow/installation/supported-versions.rst @@ -23,19 +23,21 @@ Version Life Cycle Apache Airflow version life cycle: -+---------+-----------------+---------------+-----------------+----------------+ -| Version | State | First Release | Limited Support | EOL/Terminated | -+---------+-----------------+---------------+-----------------+----------------+ -| 2 | Supported | Dec 17, 2020 | TBD | TBD | -+---------+-----------------+---------------+-----------------+----------------+ -| 1.10 | EOL | Aug 27, 2018 | Dec 17, 2020 | June 2021 | -+---------+-----------------+---------------+-----------------+----------------+ -| 1.9 | EOL | Jan 03, 2018 | Aug 27, 2018 | Aug 2018 | -+---------+-----------------+---------------+-----------------+----------------+ -| 1.8 | EOL | Mar 19, 2017 | Jan 03, 2018 | Jan 2018 | -+---------+-----------------+---------------+-----------------+----------------+ -| 1.7 | EOL | Mar 28, 2016 | Mar 19, 2017 | Mar 2017 | -+---------+-----------------+---------------+-----------------+----------------+ + .. This table is automatically updated by pre-commit scripts/ci/pre-commit/supported_versions.py + .. 
Beginning of auto-generated table + +========= ===================== ========= =============== ================= ================ +Version Current Patch/Minor State First Release Limited Support EOL/Terminated +========= ===================== ========= =============== ================= ================ +2 2.2.4 Supported Dec 17, 2020 TBD TBD +1.10 1.10.15 EOL Aug 27, 2018 Dec 17, 2020 June 17, 2021 +1.9 1.9.0 EOL Jan 03, 2018 Aug 27, 2018 Aug 27, 2018 +1.8 1.8.2 EOL Mar 19, 2017 Jan 03, 2018 Jan 03, 2018 +1.7 1.7.1.2 EOL Mar 28, 2016 Mar 19, 2017 Mar 19, 2017 +========= ===================== ========= =============== ================= ================ + + .. End of auto-generated table + Limited support versions will be supported with security and critical bug fix only. EOL versions will not get any fixes nor support. diff --git a/scripts/ci/pre_commit/supported_versions.py b/scripts/ci/pre_commit/supported_versions.py new file mode 100755 index 0000000000000..c345006e22cdb --- /dev/null +++ b/scripts/ci/pre_commit/supported_versions.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from pathlib import Path + +from tabulate import tabulate + +AIRFLOW_SOURCES = Path(__file__).resolve().parent.parent.parent.parent + + +HEADERS = ("Version", "Current Patch/Minor", "State", "First Release", "Limited Support", "EOL/Terminated") + +SUPPORTED_VERSIONS = ( + ("2", "2.2.4", "Supported", "Dec 17, 2020", "TBD", "TBD"), + ("1.10", "1.10.15", "EOL", "Aug 27, 2018", "Dec 17, 2020", "June 17, 2021"), + ("1.9", "1.9.0", "EOL", "Jan 03, 2018", "Aug 27, 2018", "Aug 27, 2018"), + ("1.8", "1.8.2", "EOL", "Mar 19, 2017", "Jan 03, 2018", "Jan 03, 2018"), + ("1.7", "1.7.1.2", "EOL", "Mar 28, 2016", "Mar 19, 2017", "Mar 19, 2017"), +) + + +def replace_text_between(file: Path, start: str, end: str, replacement_text: str): + original_text = file.read_text() + leading_text = original_text.split(start)[0] + trailing_text = original_text.split(end)[1] + file.write_text(leading_text + start + replacement_text + end + trailing_text) + + +if __name__ == '__main__': + replace_text_between( + file=AIRFLOW_SOURCES / "README.md", + start="\n", + end="\n", + replacement_text="\n" + + tabulate( + SUPPORTED_VERSIONS, tablefmt="github", headers=HEADERS, stralign="left", disable_numparse=True + ) + + "\n\n", + ) + replace_text_between( + file=AIRFLOW_SOURCES / "docs" / "apache-airflow" / "installation" / "supported-versions.rst", + start=" .. Beginning of auto-generated table\n", + end=" .. 
End of auto-generated table\n", + replacement_text="\n" + + tabulate( + SUPPORTED_VERSIONS, tablefmt="rst", headers=HEADERS, stralign="left", disable_numparse=True + ) + + "\n\n", + ) From 8443c730621df9cfeeeedf4df1c6d3cc41c63a21 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 31 Dec 2021 11:00:30 +0100 Subject: [PATCH 135/250] Add known warning generated by snowflake new version (#20604) The new snowflake library version generates a different warning message as they bumped the pyarrow version used. This PR adds the warning to known warnings. (cherry picked from commit 80bccfded3d45220a7f6e80c4e616ab3164da198) --- dev/provider_packages/prepare_provider_packages.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dev/provider_packages/prepare_provider_packages.py b/dev/provider_packages/prepare_provider_packages.py index 0b534c25379cb..80410a4f1b656 100755 --- a/dev/provider_packages/prepare_provider_packages.py +++ b/dev/provider_packages/prepare_provider_packages.py @@ -2121,6 +2121,11 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin " adheres to: 'pyarrow<3.1.0,>=3.0.0; extra == \"pandas\"'", "snowflake", ), + ( + "You have an incompatible version of 'pyarrow' installed (6.0.1), please install a version that" + " adheres to: 'pyarrow<5.1.0,>=5.0.0; extra == \"pandas\"'", + "snowflake", + ), ("SelectableGroups dict interface is deprecated. Use select.", "kombu"), ("The module cloudant is now deprecated. The replacement is ibmcloudant.", "cloudant"), } From 8a432bbf6feb57d4da7a4f7300e423af209a6d70 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 2 Jan 2022 11:37:40 +0100 Subject: [PATCH 136/250] Add twine check for provider packages (#20619) Twine (which we use to upload packages to PyPI) has the ability to run checks of packages before uploading them. This allows us to detect cases like using forbidden directives in README.rst (which slightly delayed preparing the December 2021 provider packages and resulted in #20614). With this PR the Twine check will be run for all packages in CI before we even attempt to merge a change that could break them.
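For illustration only - a minimal local sketch of the same check, assuming the provider packages have already been built into a local `dist/` directory (the file pattern here mirrors the one used by the CI script and is an assumption; adjust it to your layout):

```shell script
# Install twine and let it validate package metadata and README rendering
# locally before anything is uploaded to PyPI - the same check now run in CI.
pip install twine
twine check dist/apache_airflow_providers_*.whl
```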
(cherry picked from commit f011f66f763cb9bfcccea085dbd2cb2b44614d20) --- .github/workflows/ci.yml | 1 + breeze | 1 + scripts/ci/docker-compose/_docker.env | 1 + scripts/ci/docker-compose/base.yml | 1 + scripts/ci/libraries/_initialization.sh | 3 +++ scripts/in_container/_in_container_utils.sh | 14 ++++++++++++ .../run_install_and_test_provider_packages.sh | 22 +++++++++++++++++++ setup.py | 1 + 8 files changed, 44 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9002b284453fc..d5d75ad4821c6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -578,6 +578,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}" run: ./scripts/ci/provider_packages/ci_install_and_test_provider_packages.sh env: USE_AIRFLOW_VERSION: "2.1.0" + SKIP_TWINE_CHECK: "true" PACKAGE_FORMAT: "wheel" prepare-test-provider-packages-sdist: diff --git a/breeze b/breeze index dd72e6e1f1aaf..c646e08068714 100755 --- a/breeze +++ b/breeze @@ -647,6 +647,7 @@ export AIRFLOW_PROD_IMAGE="${AIRFLOW_PROD_IMAGE}" export AIRFLOW_IMAGE_KUBERNETES="${AIRFLOW_IMAGE_KUBERNETES}" export SQLITE_URL="${SQLITE_URL}" export USE_AIRFLOW_VERSION="${USE_AIRFLOW_VERSION}" +export SKIP_TWINE_CHECK="${SKIP_TWINE_CHECK}" export USE_PACKAGES_FROM_DIST="${USE_PACKAGES_FROM_DIST}" export EXECUTOR="${EXECUTOR}" export START_AIRFLOW="${START_AIRFLOW}" diff --git a/scripts/ci/docker-compose/_docker.env b/scripts/ci/docker-compose/_docker.env index 72a1afc2d8b8c..0f3a940eef378 100644 --- a/scripts/ci/docker-compose/_docker.env +++ b/scripts/ci/docker-compose/_docker.env @@ -57,6 +57,7 @@ RUN_TESTS LIST_OF_INTEGRATION_TESTS_TO_RUN RUN_SYSTEM_TESTS START_AIRFLOW +SKIP_TWINE_CHECK TEST_TYPE UPGRADE_TO_NEWER_DEPENDENCIES VERBOSE diff --git a/scripts/ci/docker-compose/base.yml b/scripts/ci/docker-compose/base.yml index 8413179a665a5..5125dd691ceb8 100644 --- a/scripts/ci/docker-compose/base.yml +++ b/scripts/ci/docker-compose/base.yml @@ -70,6 +70,7 @@ services: - LIST_OF_INTEGRATION_TESTS_TO_RUN=${LIST_OF_INTEGRATION_TESTS_TO_RUN} - RUN_SYSTEM_TESTS=${RUN_SYSTEM_TESTS} - START_AIRFLOW=${START_AIRFLOW} + - SKIP_TWINE_CHECK=${SKIP_TWINE_CHECK} - TEST_TYPE=${TEST_TYPE} - UPGRADE_TO_NEWER_DEPENDENCIES=${UPGRADE_TO_NEWER_DEPENDENCIES} - VERBOSE=${VERBOSE} diff --git a/scripts/ci/libraries/_initialization.sh b/scripts/ci/libraries/_initialization.sh index 197b1eec318fe..3235f45442ea4 100644 --- a/scripts/ci/libraries/_initialization.sh +++ b/scripts/ci/libraries/_initialization.sh @@ -416,6 +416,9 @@ function initialization::initialize_image_build_variables() { INSTALL_PROVIDERS_FROM_SOURCES=${INSTALL_PROVIDERS_FROM_SOURCES:="true"} export INSTALL_PROVIDERS_FROM_SOURCES + SKIP_TWINE_CHECK=${SKIP_TWINE_CHECK:=""} + export SKIP_TWINE_CHECK + export INSTALLED_EXTRAS="async,amazon,celery,cncf.kubernetes,docker,dask,elasticsearch,ftp,grpc,hashicorp,http,imap,ldap,google,microsoft.azure,mysql,postgres,redis,sendgrid,sftp,slack,ssh,statsd,virtualenv" AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION:="21.2.4"} diff --git a/scripts/in_container/_in_container_utils.sh b/scripts/in_container/_in_container_utils.sh index ca0a84f55f643..1cf30dd550b47 100644 --- a/scripts/in_container/_in_container_utils.sh +++ b/scripts/in_container/_in_container_utils.sh @@ -301,6 +301,20 @@ function install_all_provider_packages_from_sdist() { pip install /dist/apache-airflow-*providers-*.tar.gz } +function twine_check_provider_packages_from_wheels() { + echo + echo "Twine check of all provider packages from wheels" + echo + twine check 
/dist/apache_airflow*providers_*.whl +} + +function twine_check_provider_packages_from_sdist() { + echo + echo "Twine check all provider packages from sdist" + echo + twine check /dist/apache-airflow-*providers-*.tar.gz +} + function setup_provider_packages() { export PACKAGE_TYPE="regular" export PACKAGE_PREFIX_UPPERCASE="" diff --git a/scripts/in_container/run_install_and_test_provider_packages.sh b/scripts/in_container/run_install_and_test_provider_packages.sh index 1e80dd3ded6a7..d556581adfdfa 100755 --- a/scripts/in_container/run_install_and_test_provider_packages.sh +++ b/scripts/in_container/run_install_and_test_provider_packages.sh @@ -90,6 +90,22 @@ function install_provider_packages() { group_end } +function twine_check_provider_packages() { + group_start "Twine check provider packages" + if [[ ${PACKAGE_FORMAT} == "wheel" ]]; then + twine_check_provider_packages_from_wheels + elif [[ ${PACKAGE_FORMAT} == "sdist" ]]; then + twine_check_provider_packages_from_sdist + else + echo + echo "${COLOR_RED}ERROR: Wrong package format ${PACKAGE_FORMAT}. Should be wheel or sdist${COLOR_RESET}" + echo + exit 1 + fi + group_end +} + + function discover_all_provider_packages() { group_start "Listing available providers via 'airflow providers list'" # Columns is to force it wider, so it doesn't wrap at 80 characters @@ -225,6 +241,12 @@ function ver() { setup_provider_packages verify_parameters install_airflow_as_specified + +if [[ ${SKIP_TWINE_CHECK=""} != "true" ]]; then + # Airflow 2.1.0 installs importlib_metadata version that does not work well with twine + # So we should skip twine check in this case + twine_check_provider_packages +fi install_provider_packages import_all_provider_classes diff --git a/setup.py b/setup.py index e3fe301627ae9..a95203d50b5cf 100644 --- a/setup.py +++ b/setup.py @@ -556,6 +556,7 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version 'pytest-httpx', 'requests_mock', 'semver', + 'twine', 'wheel', 'yamllint', ] From 3c3f246fdf71c52c6a9ec9ce51aec27439821a07 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 2 Jan 2022 11:38:42 +0100 Subject: [PATCH 137/250] Fix generation of "Status provider" issue (#20621) The script for generating the "Provider status" issue and the release process did not work well when only a subset of providers was released. The generated issue included some already released packages even if they were not part of the recent batch of providers (if there was not even a doc change since the last release, the package was considered as being released again). This PR fixes it by adding a flag that only considers packages that are present in the dist folder (which matches the release manager's process). The process has also been updated with a more accurate description of the steps to take - including manual execution of the script rather than using Breeze (Breeze is not needed for this script).
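As a sketch of the updated release-manager step described above (the command and the GITHUB_TOKEN requirement come from the README change in this patch; the token value is a placeholder):

```shell script
# Generate the status-tracking issue content, limited to packages actually present in ./dist
export GITHUB_TOKEN="<your-read-only-token>"
./dev/provider_packages/prepare_provider_packages.py generate-issue-content --only-available-in-dist
```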
(cherry picked from commit d823cf7406092cf5b9b7b6df46738cd95a08c485) --- dev/README_RELEASE_PROVIDER_PACKAGES.md | 26 +++-- .../PROVIDER_ISSUE_TEMPLATE.md.jinja2 | 15 +-- .../prepare_provider_packages.py | 100 ++++++++++++------ 3 files changed, 85 insertions(+), 56 deletions(-) diff --git a/dev/README_RELEASE_PROVIDER_PACKAGES.md b/dev/README_RELEASE_PROVIDER_PACKAGES.md index c9d94b1b7b38b..e0b55db5b7808 100644 --- a/dev/README_RELEASE_PROVIDER_PACKAGES.md +++ b/dev/README_RELEASE_PROVIDER_PACKAGES.md @@ -110,18 +110,6 @@ are updated, run it in non-interactive mode: ./breeze --non-interactive prepare-provider-documentation [packages] ``` -When you run the command and documentation generation is successful you will get a command that you can run to -create GitHub issue where you will be tracking status of tests for the providers you release. - -You can also trigger automated execution of the issue by running: - -```shell script -./breeze --non-interactive --generate-providers-issue prepare-provider-documentation [packages] -``` - -Once you release packages, you should create the issue with the content specified and link to it in -the email sent to the devlist. - ## Build provider packages for SVN apache upload Those packages might get promoted to "final" packages by just renaming the files, so internally they @@ -352,8 +340,18 @@ git push --set-upstream origin "${branch}" ## Prepare issue in GitHub to keep status of testing -Create GitHub issue with the content generated via prepare-provider-documentation or manual -execution of the script above. You will use link to that issue in the next step. +Create a GitHub issue with the content generated via manual +execution of the script below. You will use link to that issue in the next step. You need a GITHUB_TOKEN +set as your environment variable. + +You can also pass the token as `--github-token` option in the script. + +```shell script +./dev/provider_packages/prepare_provider_packages.py generate-issue-content --only-available-in-dist +``` + +You can also generate the token by following +[this link](https://github.com/settings/tokens/new?description=Read%20sssues&scopes=repo:status) ## Prepare voting email for Providers release candidate diff --git a/dev/provider_packages/PROVIDER_ISSUE_TEMPLATE.md.jinja2 b/dev/provider_packages/PROVIDER_ISSUE_TEMPLATE.md.jinja2 index a436bcbe84b7e..bb3c6469ac361 100644 --- a/dev/provider_packages/PROVIDER_ISSUE_TEMPLATE.md.jinja2 +++ b/dev/provider_packages/PROVIDER_ISSUE_TEMPLATE.md.jinja2 @@ -1,26 +1,17 @@ I have a kind request for all the contributors to the latest provider packages release. -Could you help us to test the RC versions of the providers and let us know in the comment, -if the issue is addressed there. +Could you please help us to test the RC versions of the providers? -## Providers that need testing +Let us know in the comment, whether the issue is addressed. 
Those are providers that require testing as there were some substantial changes introduced: {% for provider_id, provider_pr_info in interesting_providers.items() %} -### Provider [{{ provider_id }}: {{ provider_pr_info.provider_details.versions[0] }}{{ suffix }}](https://pypi.org/project/{{ provider_pr_info.provider_details.pypi_package_name }}/{{ provider_pr_info.provider_details.versions[0] }}{{ suffix }}) +## Provider [{{ provider_id }}: {{ provider_pr_info.provider_details.versions[0] }}{{ suffix }}](https://pypi.org/project/{{ provider_pr_info.provider_details.pypi_package_name }}/{{ provider_pr_info.provider_details.versions[0] }}{{ suffix }}) {%- for pr in provider_pr_info.pr_list %} - [ ] [{{ pr.title }} (#{{ pr.number }})]({{ pr.html_url }}): @{{ pr.user.login }} {%- endfor %} {%- endfor %} -## Providers that do not need testing - -Those are providers that were either doc-only or had changes that do not require testing. - -{% for provider_id, provider_pr_info in non_interesting_providers.items() %} -* Provider [{{ provider_id }}: {{ provider_pr_info.provider_details.versions[0] }}{{ suffix }}](https://pypi.org/project/{{ provider_pr_info.provider_details.pypi_package_name }}/{{ provider_pr_info.provider_details.versions[0] }}{{ suffix }}) -{%- endfor %} - +## Airflow 2.2.4 + +### Smart sensors deprecated + +Smart sensors, an "early access" feature added in Airflow 2, are now deprecated and will be removed in Airflow 2.4.0. They have been superseded by Deferable Operators, added in Airflow 2.2.0. + +See [Migrating to Deferrable Operators](https://airflow.apache.org/docs/apache-airflow/2.3.0/concepts/smart-sensors.html#migrating-to-deferrable-operators) for details on how to migrate. + ## Airflow 2.2.3 -No breaking changes. +Continuing the effort to bind TaskInstance to a DagRun, XCom entries are now also tied to a DagRun. Use the ``run_id`` argument to specify the DagRun instead. ## Airflow 2.2.2 diff --git a/airflow/jobs/scheduler_job.py b/airflow/jobs/scheduler_job.py index 44c4df7b5e3fe..2fedf807958c2 100644 --- a/airflow/jobs/scheduler_job.py +++ b/airflow/jobs/scheduler_job.py @@ -49,6 +49,7 @@ from airflow.ti_deps.dependencies_states import EXECUTION_STATES from airflow.utils import timezone from airflow.utils.callback_requests import DagCallbackRequest, TaskCallbackRequest +from airflow.utils.docs import get_docs_url from airflow.utils.event_scheduler import EventScheduler from airflow.utils.retries import MAX_DB_RETRIES, retry_db_transaction, run_with_db_retries from airflow.utils.session import create_session, provide_session @@ -146,6 +147,17 @@ def __init__( self.dagbag = DagBag(dag_folder=self.subdir, read_dags_from_db=True, load_op_links=False) + if conf.getboolean('smart_sensor', 'use_smart_sensor'): + compatible_sensors = set( + map(lambda l: l.strip(), conf.get('smart_sensor', 'sensors_enabled').split(',')) + ) + docs_url = get_docs_url('concepts/smart-sensors.html#migrating-to-deferrable-operators') + warnings.warn( + f'Smart sensors are deprecated, yet can be used for {compatible_sensors} sensors.' + f' Please use Deferrable Operators instead. 
See {docs_url} for more info.', + DeprecationWarning, + ) + def register_signals(self) -> None: """Register signals that stop child processes""" signal.signal(signal.SIGINT, self._exit_gracefully) diff --git a/airflow/sensors/base.py b/airflow/sensors/base.py index 039a21ad7b697..a2ef9c4817b23 100644 --- a/airflow/sensors/base.py +++ b/airflow/sensors/base.py @@ -19,6 +19,7 @@ import datetime import hashlib import time +import warnings from datetime import timedelta from typing import Any, Callable, Dict, Iterable @@ -39,6 +40,7 @@ # Google Provider before 3.0.0 imported apply_defaults from here. # See https://github.com/apache/airflow/issues/16035 from airflow.utils.decorators import apply_defaults # noqa: F401 +from airflow.utils.docs import get_docs_url class BaseSensorOperator(BaseOperator, SkipMixin): @@ -154,6 +156,12 @@ def register_in_sensor_service(self, ti, context): :param context: TaskInstance template context from the ti. :return: boolean """ + docs_url = get_docs_url('concepts/smart-sensors.html#migrating-to-deferrable-operators') + warnings.warn( + 'Your sensor is using Smart Sensors, which are deprecated.' + f' Please use Deferrable Operators instead. See {docs_url} for more info.', + DeprecationWarning, + ) poke_context = self.get_poke_context(context) execution_context = self.get_execution_context(context) diff --git a/docs/apache-airflow/concepts/deferring.rst b/docs/apache-airflow/concepts/deferring.rst index d9126c435a2e6..ca810d7972089 100644 --- a/docs/apache-airflow/concepts/deferring.rst +++ b/docs/apache-airflow/concepts/deferring.rst @@ -49,6 +49,7 @@ That's it; everything else will be automatically handled for you. If you're upgr Note that you cannot yet use the deferral ability from inside custom PythonOperator/TaskFlow Python functions; it is only available to traditional, class-based Operators at the moment. +.. _deferring/writing: Writing Deferrable Operators ---------------------------- @@ -163,4 +164,4 @@ Note that every extra ``triggerer`` you run will result in an extra persistent c Smart Sensors ------------- -Deferrable Operators essentially supersede :doc:`Smart Sensors `, and should be preferred for almost all situations. They do solve fundamentally the same problem; Smart Sensors, however, only work for certain Sensor workload styles, have no redundancy, and require a custom DAG to run at all times. +Deferrable Operators supersede :doc:`Smart Sensors `. They do solve fundamentally the same problem; Smart Sensors, however, only work for certain Sensor workload styles, have no redundancy, and require a custom DAG to run at all times. diff --git a/docs/apache-airflow/concepts/smart-sensors.rst b/docs/apache-airflow/concepts/smart-sensors.rst index e654d91e697a2..a188ea691f98a 100644 --- a/docs/apache-airflow/concepts/smart-sensors.rst +++ b/docs/apache-airflow/concepts/smart-sensors.rst @@ -23,15 +23,11 @@ Smart Sensors .. warning:: - This is an **early-access** feature and might change in incompatible ways in future Airflow versions. - However this feature can be considered bug-free, and Airbnb has been using this feature in production - since early 2020 and has significantly reduced their costs for heavy use of sensors. - -.. note:: - - :doc:`Deferrable Operators ` are a more flexible way to achieve efficient long-running sensors, - as well as allowing Operators to also achieve similar efficiency gains. If you are considering writing a - new Smart Sensor, you may want to instead write it as a Deferrable Operator. 
+ This is a **deprecated early-access** feature that will be removed in Airflow 2.4.0. + It is superseded by :doc:`Deferrable Operators `, which offer a more flexible way to + achieve efficient long-running sensors, as well as allowing operators to also achieve similar + efficiency gains. If you are considering writing a new Smart Sensor, you should instead write it + as a Deferrable Operator. The smart sensor is a service (run by a builtin DAG) which greatly reduces Airflow’s infrastructure cost by consolidating multiple instances of small, light-weight Sensors into a single process. @@ -96,3 +92,15 @@ Support new operators in the smart sensor service include all key names used for initializing a sensor object. * In ``airflow.cfg``, add the new operator's classname to ``[smart_sensor] sensors_enabled``. All supported sensors' classname should be comma separated. + +Migrating to Deferrable Operators +---------------------------------- + +There is not a direct migration path from Smart Sensors to :doc:`Deferrable Operators `. +You have a few paths forward, depending on your needs and situation: + +* Do nothing - your DAGs will continue to run as-is, however they will no longer get the optimization smart sensors brought +* Deferrable Operator - move to a Deferrable Operator that alleviates the need for a sensor all-together +* Deferrable Sensor - move to an async version of the sensor you are already using + +See :ref:`Writing Deferrable Operators ` for details on writing Deferrable Operators and Sensors. From 50f4f9eaef78e652afd21ee07a8dca3107c391c8 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 28 Dec 2021 21:11:07 +0100 Subject: [PATCH 187/250] Removes unnecessary --upgrade option from our examples (#20537) (cherry picked from commit 2976e6f829e727c01b9c2838e32d210d40e7a03c) --- docs/apache-airflow/installation/installing-from-pypi.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/installation/installing-from-pypi.rst b/docs/apache-airflow/installation/installing-from-pypi.rst index 95d055210259a..33a7d7c9a4918 100644 --- a/docs/apache-airflow/installation/installing-from-pypi.rst +++ b/docs/apache-airflow/installation/installing-from-pypi.rst @@ -118,7 +118,7 @@ being installed. AIRFLOW_VERSION=|version| PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)" CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" - pip install --upgrade "apache-airflow[postgres,google]==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" + pip install "apache-airflow[postgres,google]==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" Installation and upgrading of Airflow providers separately ========================================================== From e43138f48d5e6ea8af8d07f2d2c84583002a4122 Mon Sep 17 00:00:00 2001 From: Matt Rixman <5834582+MatrixManAtYrService@users.noreply.github.com> Date: Mon, 3 Jan 2022 20:40:10 -0700 Subject: [PATCH 188/250] Improve documentation on ``Params`` (#20567) I think that this doc could be improved by adding examples of how to reference the params in your dag. (Also, the current example code causes this: #20559.) While trying to find the right place to work a few reference examples in, I ended up rewriting quite a lot of it. Let me know if you think that this is an improvement. 
I haven't yet figured out how to build this and view it locally, and I'd want to do that as a sanity check before merging it, but I figured get feedback on what I've written before I do that. (cherry picked from commit 064efbeae7c2560741c5a8928799482ef795e100) --- docs/apache-airflow/concepts/params.rst | 146 +++++++++++++++++++----- 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/docs/apache-airflow/concepts/params.rst b/docs/apache-airflow/concepts/params.rst index c5082793c2993..ef266ea56dc31 100644 --- a/docs/apache-airflow/concepts/params.rst +++ b/docs/apache-airflow/concepts/params.rst @@ -15,16 +15,21 @@ specific language governing permissions and limitations under the License. +.. _concepts:params: + Params ====== -Params are Airflow's concept of providing runtime configuration to tasks when a dag gets triggered manually. -Params are configured while defining the dag & tasks, that can be altered while doing a manual trigger. The -ability to update params while triggering a DAG depends on the flag ``core.dag_run_conf_overrides_params``, -so if that flag is ``False``, params would behave like constants. +Params are how Airflow provides runtime configuration to tasks. +When you trigger a DAG manually, you can modify its Params before the dagrun starts. +If the user-supplied values don't pass validation, Airflow shows a warning instead of creating the dagrun. +(For scheduled runs, the default values are used.) + +Adding Params to a DAG +---------------------- -To use them, one can use the ``Param`` class for complex trigger-time validations or simply use primitive types, -which won't be doing any such validations. +To add Params to a :class:`~airflow.models.dag.DAG`, initialize it with the ``params`` kwarg. +Use a dictionary that maps Param names to a either a :class:`~airflow.models.param.Param` or an object indicating the parameter's default value. .. code-block:: @@ -32,33 +37,120 @@ which won't be doing any such validations. from airflow.models.param import Param with DAG( - 'my_dag', + "the_dag", params={ - 'int_param': Param(10, type='integer', minimum=0, maximum=20), # a int param with default value - 'str_param': Param(type='string', minLength=2, maxLength=4), # a mandatory str param - 'dummy_param': Param(type=['null', 'number', 'string']) # a param which can be None as well - 'old_param': 'old_way_of_passing', # i.e. no data or type validations - 'simple_param': Param('im_just_like_old_param'), # i.e. no data or type validations - 'email_param': Param( - default='example@example.com', - type='string', - format='idn-email', - minLength=5, - maxLength=255, - ), + "x": Param(5, type="integer", minimum=3), + "y": 6 }, + ) as the_dag: + +Referencing Params in a Task +---------------------------- + +Params are stored as ``params`` in the :ref:`template context `. +So you can reference them in a template. + +.. code-block:: + + PythonOperator( + task_id="from_template", + op_args=[ + "{{ params.int_param + 10 }}", + ], + python_callable=( + lambda x: print(x) + ), + ) + +Even though Params can use a variety of types, the default behavior of templates is to provide your task with a string. +You can change this by setting ``render_template_as_native_obj=True`` while initializing the :class:`~airflow.models.dag.DAG`. + +.. code-block:: + + with DAG( + "the_dag", + params={"x": Param(5, type="integer", minimum=3)}, + render_template_as_native_obj=True + ) as the_dag: + + +This way, the Param's type is respected when its provided to your task. + +.. 
code-block:: + + # prints by default + # prints if render_template_as_native_obj=True + PythonOperator( + task_id="template_type", + op_args=[ + "{{ params.int_param }}", + ], + python_callable=( + lambda x: print(type(x)) + ), ) -``Param`` make use of `json-schema `__ to define the properties and doing the -validation, so one can use the full json-schema specifications mentioned at -https://json-schema.org/draft/2020-12/json-schema-validation.html to define the construct of a ``Param`` -objects. +Another way to access your param is via a task's ``context`` kwarg. -Also, it worthwhile to note that if you have any DAG which uses a mandatory param value, i.e. a ``Param`` -object with no default value or ``null`` as an allowed type, that DAG schedule has to be ``None``. However, -if such ``Param`` has been defined at task level, Airflow has no way to restrict that & the task would be -failing at the execution time. +.. code-block:: + + def print_x(**context): + print(context["params"]["x"]) + + PythonOperator( + task_id="print_x", + python_callable=print_it, + ) + +Task-level Params +----------------- + +You can also add Params to individual tasks. + +.. code-block:: + + PythonOperator( + task_id="print_x", + params={"x": 10}, + python_callable=print_it, + ) + +If there's already a dag param with that name, the task-level default will take precedence over the dag-level default. +If a user supplies their own value when the DAG was triggered, Airflow ignores all defaults and uses the user's value. + +JSON Schema Validation +---------------------- + +:class:`~airflow.modules.param.Param` makes use of ``json-schema ``, so you can use the full json-schema specifications mentioned at https://json-schema.org/draft/2020-12/json-schema-validation.html to define ``Param`` objects. + +.. code-block:: + + with DAG( + "my_dag", + params={ + # a int with a default value + "int_param": Param(10, type="integer", minimum=0, maximum=20), + + # a required param which can be of multiple types + "dummy": Param(type=["null", "number", "string"]), + + # a param which uses json-schema formatting + "email": Param( + default="example@example.com", + type="string", + format="idn-email", + minLength=5, + maxLength=255, + ), + }, + ) as my_dag: .. note:: As of now, for security reasons, one can not use Param objects derived out of custom classes. We are planning to have a registration system for custom Param classes, just like we've for Operator ExtraLinks. + +Disabling Runtime Param Modification +------------------------------------ + +The ability to update params while triggering a DAG depends on the flag ``core.dag_run_conf_overrides_params``. +Setting this config to ``False`` will effectively turn your default params into constants. From 915054f8c8754d69ff86378c672bac2e62055993 Mon Sep 17 00:00:00 2001 From: Marcin Molak Date: Thu, 6 Jan 2022 16:36:09 +0100 Subject: [PATCH 189/250] Update operators.rst (#20640) (cherry picked from commit fa802ede6c4763c8f432100ca78a313f147a77a0) --- docs/apache-airflow/concepts/operators.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/apache-airflow/concepts/operators.rst b/docs/apache-airflow/concepts/operators.rst index c66c2ebef322a..13020f1be4ca1 100644 --- a/docs/apache-airflow/concepts/operators.rst +++ b/docs/apache-airflow/concepts/operators.rst @@ -208,3 +208,17 @@ In this case, ``order_data`` argument is passed: ``{"1001": 301.27, "1002": 433. Airflow uses Jinja's `NativeEnvironment `_ when ``render_template_as_native_obj`` is set to ``True``. 
With ``NativeEnvironment``, rendering a template produces a native Python type. + +.. _concepts:reserved-keywords: + +Reserved params keyword +----------------------- + +In Apache Airflow 2.2.0 ``params`` variable is used during DAG serialization. Please do not use that name in third party operators. +If you upgrade your environment and get the following error: + +.. code-block:: + + AttributeError: 'str' object has no attribute '__module__' + +change name from ``params`` in your operators. From 88d1da8a68e02ddae3967fbaaa4f9c4c17ca3d06 Mon Sep 17 00:00:00 2001 From: Alan Ma Date: Sun, 9 Jan 2022 13:58:26 -0800 Subject: [PATCH 190/250] Compare taskgroup and subdag (#20700) (cherry picked from commit 6b0c52898555641059e149c5ff0d9b46b2d45379) --- docs/apache-airflow/concepts/dags.rst | 43 +++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/concepts/dags.rst b/docs/apache-airflow/concepts/dags.rst index 8aa49553fbfd1..8d9b387a7163b 100644 --- a/docs/apache-airflow/concepts/dags.rst +++ b/docs/apache-airflow/concepts/dags.rst @@ -605,8 +605,47 @@ Some other tips when using SubDAGs: See ``airflow/example_dags`` for a demonstration. -Note that :doc:`pools` are *not honored* by :class:`~airflow.operators.subdag.SubDagOperator`, and so -resources could be consumed by SubdagOperators beyond any limits you may have set. + +.. note:: + + Parallelism is *not honored* by :class:`~airflow.operators.subdag.SubDagOperator`, and so resources could be consumed by SubdagOperators beyond any limits you may have set. + + + +TaskGroups vs SubDAGs +---------------------- + +SubDAGs, while serving a similar purpose as TaskGroups, introduces both performance and functional issues due to its implementation. + +* The SubDagOperator starts a BackfillJob, which ignores existing parallelism configurations potentially oversubscribing the worker environment. +* SubDAGs have their own DAG attributes. When the SubDAG DAG attributes are inconsistent with its parent DAG, unexpected behavior can occur. +* Unable to see the "full" DAG in one view as SubDAGs exists as a full fledged DAG. +* SubDAGs introduces all sorts of edge cases and caveats. This can disrupt user experience and expectation. + +TaskGroups, on the other hand, is a better option given that it is purely a UI grouping concept. All tasks within the TaskGroup still behave as any other tasks outside of the TaskGroup. + +You can see the core differences between these two constructs. 
+ ++--------------------------------------------------------+--------------------------------------------------------+ +| TaskGroup | SubDAG | ++========================================================+========================================================+ +| Repeating patterns as part of the same DAG | Repeating patterns as a separate DAG | ++--------------------------------------------------------+--------------------------------------------------------+ +| One set of views and statistics for the DAG | Separate set of views and statistics between parent | +| | and child DAGs | ++--------------------------------------------------------+--------------------------------------------------------+ +| One set of DAG configuration | Several sets of DAG configurations | ++--------------------------------------------------------+--------------------------------------------------------+ +| Honors parallelism configurations through existing | Does not honor parallelism configurations due to | +| SchedulerJob | newly spawned BackfillJob | ++--------------------------------------------------------+--------------------------------------------------------+ +| Simple construct declaration with context manager | Complex DAG factory with naming restrictions | ++--------------------------------------------------------+--------------------------------------------------------+ + +.. note:: + + SubDAG is deprecated hence TaskGroup is always the preferred choice. + Packaging DAGs From 08ddaea6499ec609b6edee91cb63961f3fab40ac Mon Sep 17 00:00:00 2001 From: humit Date: Tue, 11 Jan 2022 02:12:20 +0900 Subject: [PATCH 191/250] Update metric name in documentation (#20764) (cherry picked from commit 6825c9acf435bc0c7b6e77e552c2a33a4966d740) --- docs/apache-airflow/logging-monitoring/metrics.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/apache-airflow/logging-monitoring/metrics.rst b/docs/apache-airflow/logging-monitoring/metrics.rst index 025e310c6b5ac..c8fd1820c0ea9 100644 --- a/docs/apache-airflow/logging-monitoring/metrics.rst +++ b/docs/apache-airflow/logging-monitoring/metrics.rst @@ -101,13 +101,13 @@ Name Description section (needed to send tasks to the executor) and found it locked by another process. ``sla_email_notification_failure`` Number of failed SLA miss email notification attempts -``ti.start..`` Number of started task in a given dag. Similar to _start but for task -``ti.finish...`` Number of completed task in a given dag. Similar to _end but for task +``ti.start..`` Number of started task in a given dag. Similar to _start but for task +``ti.finish...`` Number of completed task in a given dag. Similar to _end but for task ``dag.callback_exceptions`` Number of exceptions raised from DAG callbacks. When this happens, it means DAG callback is not working. ``celery.task_timeout_error`` Number of ``AirflowTaskTimeout`` errors raised when publishing Task to Celery Broker. -``task_removed_from_dag.`` Number of tasks removed for a given dag (i.e. task no longer exists in DAG) -``task_restored_to_dag.`` Number of tasks restored for a given dag (i.e. task instance which was +``task_removed_from_dag.`` Number of tasks removed for a given dag (i.e. task no longer exists in DAG) +``task_restored_to_dag.`` Number of tasks restored for a given dag (i.e. 
task instance which was previously in REMOVED state in the DB is added to DAG file) ``task_instance_created-`` Number of tasks instances created for a given Operator ``triggers.blocked_main_thread`` Number of triggers that blocked the main thread (likely due to not being From 8b199eb8248a2bf9a033cbc11e45b7125c01a7d4 Mon Sep 17 00:00:00 2001 From: Michael Robinson <68482867+merobi-hub@users.noreply.github.com> Date: Mon, 10 Jan 2022 14:55:33 -0500 Subject: [PATCH 192/250] Python3 requisite start local (#20777) (cherry picked from commit bb5bd64948596f284dd70929010724946cbfa414) --- docs/apache-airflow/start/local.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/apache-airflow/start/local.rst b/docs/apache-airflow/start/local.rst index 644130601c25b..8d0c71e82fee7 100644 --- a/docs/apache-airflow/start/local.rst +++ b/docs/apache-airflow/start/local.rst @@ -24,6 +24,8 @@ This quick start guide will help you bootstrap an Airflow standalone instance on .. note:: + Successful installation requires a Python 3 environment. + Only ``pip`` installation is currently officially supported. While there have been successes with using other tools like `poetry `_ or From d0a9f9aeadde47221dc2554a046b2b5e33996639 Mon Sep 17 00:00:00 2001 From: Matt Rixman <5834582+MatrixManAtYrService@users.noreply.github.com> Date: Wed, 12 Jan 2022 23:48:35 -0700 Subject: [PATCH 193/250] Doc: Added an enum param example (#20841) More examples makes it easier to compare our docs with the json-schema docs and figure out how they work together. I ended up doing something similar to this in my code and figured I'd contribute an example. Co-authored-by: Matt Rixman (cherry picked from commit 8dc68d47048d559cf4b76874d8d5e7a5af6359b6) --- docs/apache-airflow/concepts/params.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/apache-airflow/concepts/params.rst b/docs/apache-airflow/concepts/params.rst index ef266ea56dc31..fd4a1670907d5 100644 --- a/docs/apache-airflow/concepts/params.rst +++ b/docs/apache-airflow/concepts/params.rst @@ -134,6 +134,9 @@ JSON Schema Validation # a required param which can be of multiple types "dummy": Param(type=["null", "number", "string"]), + # an enum param, must be one of three values + "enum_param": Param("foo", enum=["foo", "bar", 42]), + # a param which uses json-schema formatting "email": Param( default="example@example.com", From 06828d6640926e6895c3006f3a005bd9bcbbcbca Mon Sep 17 00:00:00 2001 From: aabhaschopra <51877900+aabhaschopra@users.noreply.github.com> Date: Fri, 21 Jan 2022 19:59:26 +0530 Subject: [PATCH 194/250] Fix grammar in ``dags.rst`` (#20988) grammar correction (cherry picked from commit 754d8bcb5a2d461b71bebfa261a0c41a995d79e4) --- docs/apache-airflow/concepts/dags.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/concepts/dags.rst b/docs/apache-airflow/concepts/dags.rst index 8d9b387a7163b..3edaf35d5759b 100644 --- a/docs/apache-airflow/concepts/dags.rst +++ b/docs/apache-airflow/concepts/dags.rst @@ -260,7 +260,7 @@ The task_id returned by the Python function has to reference a task directly dow .. image:: /img/branch_note.png - The paths of the branching task are ``branch_a``, ``join`` and ``branch_b``. Since ``join`` is a downstream task of ``branch_a``, it will be still be run, even though it was not returned as part of the branch decision. + The paths of the branching task are ``branch_a``, ``join`` and ``branch_b``. 
Since ``join`` is a downstream task of ``branch_a``, it will still be run, even though it was not returned as part of the branch decision. The ``BranchPythonOperator`` can also be used with XComs allowing branching context to dynamically decide what branch to follow based on upstream tasks. For example: From cf9051107865d6e3f4d2baae3bc02491a27f0fcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Bregu=C5=82a?= Date: Sun, 23 Jan 2022 14:15:45 +0100 Subject: [PATCH 195/250] Add image labels required by ArtifactHub (#21040) (cherry picked from commit 7f02b4718bc9e282c1d59b0aab5dd46972b6f79c) --- Dockerfile | 6 +- docs/docker-stack/README.md | 75 +++++++++++++++++++ docs/docker-stack/index.rst | 5 +- scripts/ci/libraries/_build_images.sh | 1 + .../pre_commit/pre_commit_update_versions.py | 7 +- 5 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 docs/docker-stack/README.md diff --git a/Dockerfile b/Dockerfile index 89ada3b5f939a..f880ec59460c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,6 +48,7 @@ ARG PYTHON_BASE_IMAGE="python:3.7-slim-buster" ARG AIRFLOW_PIP_VERSION=21.3.1 ARG AIRFLOW_IMAGE_REPOSITORY="https://github.com/apache/airflow" +ARG AIRFLOW_IMAGE_README_URL="https://raw.githubusercontent.com/apache/airflow/main/docs/docker-stack/README.md" # By default latest released version of airflow is installed (when empty) but this value can be overridden # and we can install version according to specification (For example ==2.0.2 or <3.0.0). @@ -400,6 +401,7 @@ ARG AIRFLOW_HOME # production image is prepared from sources rather than from package ARG AIRFLOW_INSTALLATION_METHOD="apache-airflow" ARG AIRFLOW_IMAGE_REPOSITORY +ARG AIRFLOW_IMAGE_README_URL ENV RUNTIME_APT_DEPS=${RUNTIME_APT_DEPS} \ ADDITIONAL_RUNTIME_APT_DEPS=${ADDITIONAL_RUNTIME_APT_DEPS} \ @@ -528,7 +530,9 @@ LABEL org.apache.airflow.distro="debian" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.ref.name="airflow" \ org.opencontainers.image.title="Production Airflow Image" \ - org.opencontainers.image.description="Reference, production-ready Apache Airflow image" + org.opencontainers.image.description="Reference, production-ready Apache Airflow image" \ + io.artifacthub.package.license='Apache-2.0' \ + io.artifacthub.package.readme-url='${AIRFLOW_IMAGE_README_URL}' ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint"] CMD [] diff --git a/docs/docker-stack/README.md b/docs/docker-stack/README.md new file mode 100644 index 0000000000000..d6f1d816d8add --- /dev/null +++ b/docs/docker-stack/README.md @@ -0,0 +1,75 @@ + + + +# Docker Image for Apache Airflow + +For the ease of deployment in production, the community releases a production-ready reference container +image. + +The Apache Airflow community, releases Docker Images which are `reference images` for Apache Airflow. +Every time a new version of Airflow is released, the images are prepared in the +[apache/airflow DockerHub](https://hub.docker.com/r/apache/airflow) +for all the supported Python versions. + +You can find the following images there (Assuming Airflow version `2.2.4`): + +* `apache/airflow:latest` - the latest released Airflow image with default Python version (3.7 currently) +* `apache/airflow:latest-pythonX.Y` - the latest released Airflow image with specific Python version +* `apache/airflow:2.2.4` - the versioned Airflow image with default Python version (3.7 currently) +* `apache/airflow:2.2.4-pythonX.Y` - the versioned Airflow image with specific Python version + +Those are "reference" images. 
They contain the most common set of extras, dependencies and providers that are +often used by the users and they are good to "try-things-out" when you want to just take Airflow for a spin, + +The Apache Airflow image provided as convenience package is optimized for size, and +it provides just a bare minimal set of the extras and dependencies installed and in most cases +you want to either extend or customize the image. You can see all possible extras in [Reference for package extras](https://airflow.apache.org/docs/apache-airflow/stable/extra-packages-ref.html). +The set of extras used in Airflow Production image are available in the +[Dockerfile](https://github.com/apache/airflow/blob/2c6c7fdb2308de98e142618836bdf414df9768c8/Dockerfile#L37). + +However, Airflow has more than 60 community-managed providers (installable via extras) and some of the +default extras/providers installed are not used by everyone, sometimes others extras/providers +are needed, sometimes (very often actually) you need to add your own custom dependencies, +packages or even custom providers. You can learn how to do it in [Building the image](https://airflow.apache.org/docs/docker-stack/build.html#build-build-image). + +The production images are build in DockerHub from released version and release candidates. There +are also images published from branches but they are used mainly for development and testing purpose. +See [Airflow Git Branching](https://github.com/apache/airflow/blob/main/CONTRIBUTING.rst#airflow-git-branches) +for details. + +## Usage + +The [`AIRFLOW_HOME`](https://airflow.apache.org/docs/apache-airflow/stable/cli-and-env-variables-ref.html#envvar-AIRFLOW_HOME) is set by default to ``/opt/airflow/`` - this means that DAGs +are in default in the ``/opt/airflow/dags`` folder and logs are in the ``/opt/airflow/logs`` + +The working directory is ``/opt/airflow`` by default. + +If no `AIRFLOW__CORE__SQL_ALCHEMY_CONN` variable is set then SQLite database is created in +``${AIRFLOW_HOME}/airflow.db``. + +For example commands that start Airflow see: [Executing commands](https://airflow.apache.org/docs/docker-stack/entrypoint.html#entrypoint-commands). + +Airflow requires many components to function as it is a distributed application. You may therefore also be interested +in launching Airflow in the Docker Compose environment, see: [Quick Start](https://airflow.apache.org/docs/apache-airflow/stable/start/index.html). + +You can use this image in [Helm Chart](https://airflow.apache.org/docs/helm-chart/stable/index.html) as well. diff --git a/docs/docker-stack/index.rst b/docs/docker-stack/index.rst index 37018445aa471..411af82776344 100644 --- a/docs/docker-stack/index.rst +++ b/docs/docker-stack/index.rst @@ -15,6 +15,9 @@ specific language governing permissions and limitations under the License. + .. WARNING: + IF YOU ARE UPDATING THIS FILE, CONSIDER UPDATING README.MD TOO. + .. image:: /img/docker-logo.png :width: 100 @@ -44,7 +47,7 @@ Every time a new version of Airflow is released, the images are prepared in the `apache/airflow DockerHub `_ for all the supported Python versions. 
-You can find the following images there (Assuming Airflow version |airflow-version|): +You can find the following images there (Assuming Airflow version :subst-code:`|airflow-version|`): * :subst-code:`apache/airflow:latest` - the latest released Airflow image with default Python version (3.7 currently) * :subst-code:`apache/airflow:latest-pythonX.Y` - the latest released Airflow image with specific Python version diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index 2d84a92f86eed..c05226171e9d3 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -673,6 +673,7 @@ function build_images::build_prod_images() { --build-arg AIRFLOW_CONSTRAINTS="${AIRFLOW_CONSTRAINTS}" \ --build-arg AIRFLOW_IMAGE_REPOSITORY="https://github.com/${GITHUB_REPOSITORY}" \ --build-arg AIRFLOW_IMAGE_DATE_CREATED="$(date -u +'%Y-%m-%dT%H:%M:%SZ')" \ + --build-arg AIRFLOW_IMAGE_README_URL="https://raw.githubusercontent.com/apache/airflow/${COMMIT_SHA}/docs/docker-stack/README.md" \ "${additional_dev_args[@]}" \ "${additional_runtime_args[@]}" \ "${docker_cache_prod_directive[@]}" \ diff --git a/scripts/ci/pre_commit/pre_commit_update_versions.py b/scripts/ci/pre_commit/pre_commit_update_versions.py index 2af06980755fc..3898d64a445cc 100755 --- a/scripts/ci/pre_commit/pre_commit_update_versions.py +++ b/scripts/ci/pre_commit/pre_commit_update_versions.py @@ -30,7 +30,7 @@ def update_version(pattern: re.Pattern, v: str, file_path: str): - print(f"Replacing {pattern} to {version} in {file_path}") + print(f"Checking {pattern} in {file_path}") with open(file_path, "r+") as f: file_content = f.read() if not pattern.search(file_content): @@ -38,6 +38,7 @@ def update_version(pattern: re.Pattern, v: str, file_path: str): new_content = pattern.sub(fr'\g<1>{v}\g<2>', file_content) if file_content == new_content: return + print(" Updated.") f.seek(0) f.truncate() f.write(new_content) @@ -46,8 +47,12 @@ def update_version(pattern: re.Pattern, v: str, file_path: str): REPLACEMENTS = { r'^(FROM apache\/airflow:).*($)': "docs/docker-stack/docker-examples/extending/*/Dockerfile", r'(apache\/airflow:)[^-]*(\-)': "docs/docker-stack/entrypoint.rst", + r'(`apache/airflow:)[0-9].*?((?:-pythonX.Y)?`)': "docs/docker-stack/README.md", + r'(\(Assuming Airflow version `).*(`\))': "docs/docker-stack/README.md", } +print(f"Current version: {version}") + if __name__ == '__main__': for regexp, p in REPLACEMENTS.items(): text_pattern = re.compile(regexp, flags=re.MULTILINE) From 72085cc735cd953d7d8db081f9cb738554ada058 Mon Sep 17 00:00:00 2001 From: aabhaschopra <51877900+aabhaschopra@users.noreply.github.com> Date: Sun, 23 Jan 2022 19:46:35 +0530 Subject: [PATCH 196/250] Update tutorial.rst (#21043) Updated bad link (cherry picked from commit cee50d99f50198d1c9beb1e3545e0e9393cfb3b1) --- docs/apache-airflow/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/tutorial.rst b/docs/apache-airflow/tutorial.rst index babb8d6397712..1c32e78e78fea 100644 --- a/docs/apache-airflow/tutorial.rst +++ b/docs/apache-airflow/tutorial.rst @@ -176,7 +176,7 @@ to use ``{{ foo }}`` in your templates. Moreover, specifying passing ``dict(hello=lambda name: 'Hello %s' % name)`` to this argument allows you to use ``{{ 'world' | hello }}`` in your templates. For more information regarding custom filters have a look at the -`Jinja Documentation `_. +`Jinja Documentation `_. 
For more information on the variables and macros that can be referenced in templates, make sure to read through the :ref:`templates-ref`. From a262d9cc84668ab6d3240525b551011fb4dfd98a Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 23 Jan 2022 19:28:11 +0100 Subject: [PATCH 197/250] Temporary limit Pandas version (#21045) This is likely only for couple of days to avoid test failures in `main`. When the 3.4.4 version of Flask Builder gets released we should be able to relax the limit as it will allow us to migrate to sqlalchemy 1.4 (cherry picked from commit eac7cbe39341ef821eca8cb472a1f8e2a3876706) --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 851af029720d0..b82eb81b32044 100644 --- a/setup.py +++ b/setup.py @@ -176,7 +176,10 @@ def write_version(filename: str = os.path.join(*[my_dir, "airflow", "git_version file.write(text) -pandas_requirement = 'pandas>=0.17.1, <2.0' +# We limit Pandas to <1.4 because Pandas 1.4 requires SQLAlchemy 1.4 which +# We should remove the limits as soon as Flask App Builder releases version 3.4.4 +# Release candidate is there: https://pypi.org/project/Flask-AppBuilder/3.4.4rc1/ +pandas_requirement = 'pandas>=0.17.1, <1.4' # 'Start dependencies group' and 'Start dependencies group' are mark for ./scripts/ci/check_order_setup.py # If you change this mark you should also change ./scripts/ci/check_order_setup.py From a88930572492f0ba0ca4996ec2b06edda61efdbb Mon Sep 17 00:00:00 2001 From: Chenglong Yan Date: Mon, 24 Jan 2022 07:59:10 +0800 Subject: [PATCH 198/250] Fix running airflow dags test results in error when run twice (#21031) related: #21023 (cherry picked from commit 515ea84335fc440fe022db2a0e3b158e0d7702da) --- airflow/cli/commands/dag_command.py | 2 +- tests/cli/commands/test_dag_command.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/airflow/cli/commands/dag_command.py b/airflow/cli/commands/dag_command.py index b94d6cf8a22e1..e04bc7381a2f0 100644 --- a/airflow/cli/commands/dag_command.py +++ b/airflow/cli/commands/dag_command.py @@ -406,7 +406,7 @@ def dag_list_dag_runs(args, dag=None): def dag_test(args, session=None): """Execute one single DagRun for a given DAG and execution date, using the DebugExecutor.""" dag = get_dag(subdir=args.subdir, dag_id=args.dag_id) - dag.clear(start_date=args.execution_date, end_date=args.execution_date, dag_run_state=State.NONE) + dag.clear(start_date=args.execution_date, end_date=args.execution_date, dag_run_state=False) try: dag.run( executor=DebugExecutor(), diff --git a/tests/cli/commands/test_dag_command.py b/tests/cli/commands/test_dag_command.py index 0044761bd58f7..a937676c9d57b 100644 --- a/tests/cli/commands/test_dag_command.py +++ b/tests/cli/commands/test_dag_command.py @@ -517,7 +517,7 @@ def test_dag_test(self, mock_get_dag, mock_executor): mock.call().clear( start_date=cli_args.execution_date, end_date=cli_args.execution_date, - dag_run_state=State.NONE, + dag_run_state=False, ), mock.call().run( executor=mock_executor.return_value, @@ -548,7 +548,7 @@ def test_dag_test_show_dag(self, mock_get_dag, mock_executor, mock_render_dag): mock.call().clear( start_date=cli_args.execution_date, end_date=cli_args.execution_date, - dag_run_state=State.NONE, + dag_run_state=False, ), mock.call().run( executor=mock_executor.return_value, From b1862ca1170a5cf0749b922b5da0f803b0c885be Mon Sep 17 00:00:00 2001 From: Bijan Soltani <46958547+soltanianalytics@users.noreply.github.com> Date: Mon, 24 Jan 2022 15:28:24 +0100 Subject: 
[PATCH 199/250] Add a link to the DAG model in the Python API reference (#21060) (cherry picked from commit 160f2e0f1e39e7fdfc56a9248ac32c3a6cb3dae6) --- docs/apache-airflow/python-api-ref.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/apache-airflow/python-api-ref.rst b/docs/apache-airflow/python-api-ref.rst index 591bb84f5ecca..e8c4dff3c499c 100644 --- a/docs/apache-airflow/python-api-ref.rst +++ b/docs/apache-airflow/python-api-ref.rst @@ -20,6 +20,12 @@ Python API Reference ==================== +.. _pythonapi:dags: + +DAGs +--------- +The DAG is Airflow's core model that represents a recurring workflow. Check out :class:`~airflow.models.dag.DAG` for details. + .. _pythonapi:operators: Operators From cc9a03461d1bcdc742e170a572e1b6e91917e847 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 24 Jan 2022 17:59:25 +0100 Subject: [PATCH 200/250] Logs in to Github Registry when preparing cache (#21069) Whe we are preparing cache on CI, we should login to the GitHub registry (using GITHUB_TOKEN) in order for --cache-to to be able to push images. --- scripts/ci/libraries/_build_images.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/ci/libraries/_build_images.sh b/scripts/ci/libraries/_build_images.sh index c05226171e9d3..470a26d462171 100644 --- a/scripts/ci/libraries/_build_images.sh +++ b/scripts/ci/libraries/_build_images.sh @@ -470,6 +470,8 @@ function build_images::build_ci_image() { exit 1 fi if [[ ${PREPARE_BUILDX_CACHE} == "true" ]]; then + # we need to login to docker registry so that we can push cache there + build_images::login_to_docker_registry docker_ci_cache_directive+=( "--cache-to=type=registry,ref=${AIRFLOW_CI_IMAGE}:cache" "--load" @@ -624,6 +626,8 @@ function build_images::build_prod_images() { exit 1 fi if [[ ${PREPARE_BUILDX_CACHE} == "true" ]]; then + # we need to login to docker registry so that we can push cache there + build_images::login_to_docker_registry # Cache for prod image contains also build stage for buildx when mode=max specified! docker_cache_prod_directive+=( "--cache-to=type=registry,ref=${AIRFLOW_PROD_IMAGE}:cache,mode=max" From f99f7df727eb2c609480e4e403d441e455529482 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 25 Jan 2022 12:16:55 +0100 Subject: [PATCH 201/250] Add documentation and release policy on "latest" constraints (#21093) (cherry picked from commit aac5a1dba8b2b1a2b66fd4b1271dc03170c64dfb) --- dev/README_RELEASE_AIRFLOW.md | 10 +++++++++- .../installation/installing-from-pypi.rst | 7 +++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index c8310642e3228..4aae94537d3ab 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -156,7 +156,7 @@ For now this is done manually, example run `git log --oneline v2-2-test..HEAD - popd ``` -- Tag & Push the latest constraints files. This pushes constraints with rc suffix (this is expected)! +- Tag & Push the constraints files. This pushes constraints with rc suffix (this is expected)! 
```shell script git checkout origin/constraints-${VERSION_BRANCH} @@ -298,6 +298,7 @@ protected_branches: git push origin tag constraints-${BRANCH_PREFIX} ``` + ## Prepare PyPI convenience "snapshot" packages At this point we have the artifact that we vote on, but as a convenience to developers we also want to @@ -796,6 +797,13 @@ At this point we release an official package: git push origin tag "constraints-${VERSION}" ``` +- In case you release the "latest stable" version, also update the "latest" constraints + + ```shell script + git tag -f -s "constraints-latest" -m "Latest constraints set to Apache Airflow ${VERSION}" + git push origin tag "constraints-latest" + ``` + - Push Tag for the final version This step should only be done now and not before, because it triggers an automated build of diff --git a/docs/apache-airflow/installation/installing-from-pypi.rst b/docs/apache-airflow/installation/installing-from-pypi.rst index 33a7d7c9a4918..5edf5e1148042 100644 --- a/docs/apache-airflow/installation/installing-from-pypi.rst +++ b/docs/apache-airflow/installation/installing-from-pypi.rst @@ -81,6 +81,13 @@ You can create the URL to the file substituting the variables in the template be https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-no-providers-${PYTHON_VERSION}.txt +You can also use "latest" as the version when you install the "latest" stable version of Airflow. The "latest" +constraints always point to the constraints of the most recently released Airflow version: + +.. code-block:: + + https://raw.githubusercontent.com/apache/airflow/constraints-latest/constraints-3.7.txt + Installation and upgrade scenarios '''''''''''''''''''''''''''''''''' From fab8b121c6906f62e28ea6485d8d8e493cb8b769 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Thu, 6 Jan 2022 01:26:21 +0530 Subject: [PATCH 202/250] Allow Viewing DagRuns and TIs if a user has DAG "read" perms (#20663) This was changed in Airflow 2.2.0 via https://github.com/apache/airflow/pull/16634, which prevents a user from even viewing the DagRun and TI records if they don't have "edit" permissions on the DAG, even though they have "read" permissions. The behaviour is inconsistent, as a user can still view those records from the Graph and Tree View of the DAG. Since we have `@action_has_dag_edit_access` on all the actions like Delete/Clear etc., the approach in this PR is better: when a user tries to perform any action from the List Dag Run view, such as deleting a record, they will get an Access Denied error.
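As a concrete illustration of the "latest" constraints URL documented earlier in this patch - this is only a sketch, and Python 3.7 is just the example interpreter used in the documented URL:

```bash
# Sketch: install the current stable Airflow using the moving "constraints-latest" tag
# instead of a version-pinned constraints tag (Python 3.7 shown, as in the docs above).
pip install apache-airflow \
  --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-latest/constraints-3.7.txt"
```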
(cherry picked from commit 05b9f3db5471e49e894e4148d8d1deb4361a9b53) --- airflow/www/views.py | 12 ++---------- tests/www/views/test_views_dagrun.py | 23 ++++++++++++++++++++--- tests/www/views/test_views_tasks.py | 4 +++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/airflow/www/views.py b/airflow/www/views.py index f22735c7e1f83..9b868f3faaccd 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -3144,14 +3144,6 @@ def apply(self, query, func): return query.filter(self.model.dag_id.in_(filter_dag_ids)) -class DagEditFilter(BaseFilter): - """Filter using DagIDs""" - - def apply(self, query, func): # pylint: disable=redefined-outer-name,unused-argument - filter_dag_ids = current_app.appbuilder.sm.get_editable_dag_ids(g.user) - return query.filter(self.model.dag_id.in_(filter_dag_ids)) - - class AirflowModelView(ModelView): """Airflow Mode View.""" @@ -3951,7 +3943,7 @@ class DagRunModelView(AirflowPrivilegeVerifierModelView): base_order = ('execution_date', 'desc') - base_filters = [['dag_id', DagEditFilter, lambda: []]] + base_filters = [['dag_id', DagFilter, lambda: []]] edit_form = DagRunEditForm @@ -4299,7 +4291,7 @@ class TaskInstanceModelView(AirflowPrivilegeVerifierModelView): base_order = ('job_id', 'asc') - base_filters = [['dag_id', DagEditFilter, lambda: []]] + base_filters = [['dag_id', DagFilter, lambda: []]] def log_url_formatter(self): """Formats log URL.""" diff --git a/tests/www/views/test_views_dagrun.py b/tests/www/views/test_views_dagrun.py index 2268db96f740c..6a194e4b974f5 100644 --- a/tests/www/views/test_views_dagrun.py +++ b/tests/www/views/test_views_dagrun.py @@ -15,6 +15,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import flask +import markupsafe import pytest import werkzeug @@ -73,6 +75,21 @@ def reset_dagrun(): session.query(TaskInstance).delete() +def test_get_dagrun_can_view_dags_without_edit_perms(session, running_dag_run, client_dr_without_dag_edit): + """Test that a user without dag_edit but with dag_read permission can view the records""" + assert session.query(DagRun).filter(DagRun.dag_id == running_dag_run.dag_id).count() == 1 + resp = client_dr_without_dag_edit.get('/dagrun/list/', follow_redirects=True) + + with client_dr_without_dag_edit.application.test_request_context(): + url = flask.url_for( + 'Airflow.graph', dag_id=running_dag_run.dag_id, execution_date=running_dag_run.execution_date + ) + dag_url_link = markupsafe.Markup('{dag_id}').format( + url=url, dag_id=running_dag_run.dag_id + ) + check_content_in_response(dag_url_link, resp) + + def test_create_dagrun_permission_denied(session, client_dr_without_dag_edit): data = { "state": "running", @@ -102,7 +119,7 @@ def running_dag_run(session): TaskInstance(dag.get_task("runme_1"), run_id=dr.run_id, state="failed"), ] session.bulk_save_objects(tis) - session.flush() + session.commit() return dr @@ -113,12 +130,12 @@ def test_delete_dagrun(session, admin_client, running_dag_run): assert session.query(DagRun).filter(DagRun.dag_id == running_dag_run.dag_id).count() == 0 -def test_delete_dagrun_permission_denied(session, client_dr_without_dag_edit, running_dag_run): +def test_delete_dagrun_permission_denied(session, running_dag_run, client_dr_without_dag_edit): composite_key = _get_appbuilder_pk_string(DagRunModelView, running_dag_run) assert session.query(DagRun).filter(DagRun.dag_id == running_dag_run.dag_id).count() == 1 resp = client_dr_without_dag_edit.post(f"/dagrun/delete/{composite_key}", follow_redirects=True) - assert resp.status_code == 404 # If it doesn't fully succeed it gives a 404. + check_content_in_response(f"Access denied for dag_id {running_dag_run.dag_id}", resp) assert session.query(DagRun).filter(DagRun.dag_id == running_dag_run.dag_id).count() == 1 diff --git a/tests/www/views/test_views_tasks.py b/tests/www/views/test_views_tasks.py index 3a4be864c99c0..e4336c5bc5769 100644 --- a/tests/www/views/test_views_tasks.py +++ b/tests/www/views/test_views_tasks.py @@ -635,13 +635,15 @@ def test_task_instance_delete_permission_denied(session, client_ti_without_dag_e task_id="test_task_instance_delete_permission_denied", execution_date=timezone.utcnow(), state=State.DEFERRED, + session=session, ) + session.commit() composite_key = _get_appbuilder_pk_string(TaskInstanceModelView, task_instance_to_delete) task_id = task_instance_to_delete.task_id assert session.query(TaskInstance).filter(TaskInstance.task_id == task_id).count() == 1 resp = client_ti_without_dag_edit.post(f"/taskinstance/delete/{composite_key}", follow_redirects=True) - assert resp.status_code == 404 # If it doesn't fully succeed it gives a 404. 
+ check_content_in_response(f"Access denied for dag_id {task_instance_to_delete.dag_id}", resp) assert session.query(TaskInstance).filter(TaskInstance.task_id == task_id).count() == 1 From a9c178e23dc918a184a99029f883fecfe2d0cd84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Rincones?= Date: Thu, 6 Jan 2022 22:50:41 +0100 Subject: [PATCH 203/250] Add Roles from Azure OAUTH Response in security manager as it is currently not able map any AD roles to airflow ones (#20707) (cherry picked from commit 088cbf2835cb6e29deb555a931bafc6b73deadef) --- airflow/www/fab_security/manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airflow/www/fab_security/manager.py b/airflow/www/fab_security/manager.py index 85341e9c58cab..e340c179c72f6 100644 --- a/airflow/www/fab_security/manager.py +++ b/airflow/www/fab_security/manager.py @@ -591,6 +591,7 @@ def get_oauth_user_info(self, provider, resp): "last_name": me.get("family_name", ""), "id": me["oid"], "username": me["oid"], + "role_keys": me.get("roles", []), } # for OpenShift if provider == "openshift": From 31c66eb9d7722ba751ecee3540b350cb685891f0 Mon Sep 17 00:00:00 2001 From: Ilia Lazebnik Date: Sun, 23 Jan 2022 21:29:52 +0200 Subject: [PATCH 204/250] Update v1.yaml (#21024) (cherry picked from commit 2af0f700857cbf7401d930ff24cdff273b501beb) --- airflow/api_connexion/openapi/v1.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/airflow/api_connexion/openapi/v1.yaml b/airflow/api_connexion/openapi/v1.yaml index 3669c66d1a3e7..e7553a3aa1ab6 100644 --- a/airflow/api_connexion/openapi/v1.yaml +++ b/airflow/api_connexion/openapi/v1.yaml @@ -2161,8 +2161,6 @@ components: The value of this field can be set only when creating the object. If you try to modify the field of an existing object, the request fails with an BAD_REQUEST error. 
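Relating to the Azure OAuth `role_keys` change in the patch above: a hypothetical `webserver_config.py` fragment showing how the returned roles could be mapped to Airflow roles. The mapping keys are invented Azure AD app role names, and `AUTH_ROLES_MAPPING` / `AUTH_ROLES_SYNC_AT_LOGIN` are assumed to be available from the Flask-AppBuilder version shipped with Airflow:

```python
# webserver_config.py -- hypothetical sketch only.
# With role_keys populated from the Azure "roles" claim, Flask-AppBuilder's role
# mapping (assumed available in the bundled FAB version) can translate Azure AD
# app roles into Airflow roles. The AD role names below are invented.
AUTH_ROLES_SYNC_AT_LOGIN = True
AUTH_ROLES_MAPPING = {
    "airflow_admin": ["Admin"],
    "airflow_viewer": ["Viewer"],
}
```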
- required: - - dag_id UpdateDagRunState: type: object From a670f8c340b9e5a21f349c8dc5a7b9ff38579df9 Mon Sep 17 00:00:00 2001 From: caxefaizan <63395276+caxefaizan@users.noreply.github.com> Date: Mon, 24 Jan 2022 16:17:23 +0530 Subject: [PATCH 205/250] name mismatch (#21055) (cherry picked from commit 4fb005ec122a1c0091db0083c2fe4305473abb49) --- .../pod_template_file_examples/dags_in_volume_template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airflow/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml b/airflow/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml index 389fe379c37d0..cc4614996c760 100644 --- a/airflow/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml +++ b/airflow/kubernetes/pod_template_file_examples/dags_in_volume_template.yaml @@ -63,7 +63,7 @@ spec: fsGroup: 50000 serviceAccountName: "RELEASE-NAME-worker-serviceaccount" volumes: - - name: dags + - name: airflow-dags persistentVolumeClaim: claimName: RELEASE-NAME-dags - emptyDir: {} From 9f6d6b9a13d808c0faff899f79a7bbcaf78fc5aa Mon Sep 17 00:00:00 2001 From: caxefaizan <63395276+caxefaizan@users.noreply.github.com> Date: Wed, 26 Jan 2022 04:15:58 +0530 Subject: [PATCH 206/250] Update logging-tasks.rst (#21088) (cherry picked from commit 156284650f20bad131f26b91061e207e2e39253e) --- docs/apache-airflow/logging-monitoring/logging-tasks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/logging-monitoring/logging-tasks.rst b/docs/apache-airflow/logging-monitoring/logging-tasks.rst index 13cb24855e017..e64905f8b7e2b 100644 --- a/docs/apache-airflow/logging-monitoring/logging-tasks.rst +++ b/docs/apache-airflow/logging-monitoring/logging-tasks.rst @@ -122,7 +122,7 @@ Serving logs from workers Most task handlers send logs upon completion of a task. In order to view logs in real time, Airflow automatically starts an HTTP server to serve the logs in the following cases: -- If ``SchedulerExecutor`` or ``LocalExecutor`` is used, then when ``airflow scheduler`` is running. +- If ``SequentialExecutor`` or ``LocalExecutor`` is used, then when ``airflow scheduler`` is running. - If ``CeleryExecutor`` is used, then when ``airflow worker`` is running. The server is running on the port specified by ``worker_log_server_port`` option in ``[logging]`` section. By default, it is ``8793``. From 680c011f3050d2858e5d648b68e59185a213709c Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 26 Jan 2022 18:04:19 +0100 Subject: [PATCH 207/250] Add back legacy .piprc customization for pip (#21124) This change brings back backwards compatibility to using .piprc to customize Airflow Image. Some older vrsions of pip used .piprc (even though documentation about is difficult to find now) and we used to support this option. With #20445, we changed to use (fully documented) ``pip.conf`` option, however if someone used .piprc before to customize their image, this change would break it. The PR brings back also the .piprc option to the image (even if it is not really clear whether current and future versions of pip will support it. 
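A minimal, hypothetical `docker-context-files/pip.conf` of the kind the change above is meant to support - the index URL and credentials are placeholders:

```ini
# docker-context-files/pip.conf -- placeholder values; only used during the image build.
[global]
index-url = https://username:password@my-private-pypi.example.com/simple/
```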
(cherry picked from commit d5a9edf25723396d17fd10bb980fb99ccac618bb) --- Dockerfile | 3 +++ docs/docker-stack/build.rst | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f880ec59460c1..aadf896a46fd8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -212,6 +212,9 @@ USER airflow RUN if [[ -f /docker-context-files/pip.conf ]]; then \ mkdir -p ${AIRFLOW_USER_HOME_DIR}/.config/pip; \ cp /docker-context-files/pip.conf "${AIRFLOW_USER_HOME_DIR}/.config/pip/pip.conf"; \ + fi; \ + if [[ -f /docker-context-files/.piprc ]]; then \ + cp /docker-context-files/.piprc "${AIRFLOW_USER_HOME_DIR}/.piprc"; \ fi ENV AIRFLOW_PIP_VERSION=${AIRFLOW_PIP_VERSION} \ diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst index 2702c667c9955..b85bf1caf522f 100644 --- a/docs/docker-stack/build.rst +++ b/docs/docker-stack/build.rst @@ -522,13 +522,21 @@ described below but here is an example of rather complex command to customize th based on example in `this comment `_: In case you need to use your custom PyPI package indexes, you can also customize PYPI sources used during -image build by adding a ``docker-context-files``/``pip.conf`` file when building the image. +image build by adding a ``docker-context-files/pip.conf`` file when building the image. This ``pip.conf`` will not be committed to the repository (it is added to ``.gitignore``) and it will not be present in the final production image. It is added and used only in the build segment of the image. Therefore this ``pip.conf`` file can safely contain list of package indexes you want to use, usernames and passwords used for authentication. More details about ``pip.conf`` file can be found in the `pip configuration `_. +If you used the ``.piprc`` before (some older versions of ``pip`` used it for customization), you can put it +in the ``docker-context-files/.piprc`` file and it will be automatically copied to ``HOME`` directory +of the ``airflow`` user. + +Note, that those customizations are only available in the ``build`` segment of the Airflow image and they +are not present in the ``final`` image. If you wish to extend the final image and add custom ``.piprc`` and +``pip.conf``, you should add them in your own Dockerfile used to extend the Airflow image. + Such customizations are independent of the way how airflow is installed. .. note:: From 5b51c41a39ee9238bc0fba0ace8c5ec27b9e8875 Mon Sep 17 00:00:00 2001 From: Omer Ginosar <94788242+omer-ginosar@users.noreply.github.com> Date: Tue, 25 Jan 2022 23:51:16 +0200 Subject: [PATCH 208/250] Improved instructions for custom image build with docker compose (#21052) * Create build.rst * Update docs/docker-stack/build.rst Co-authored-by: Jarek Potiuk * fix doc build Co-authored-by: Jarek Potiuk Co-authored-by: eladkal <45845474+eladkal@users.noreply.github.com> (cherry picked from commit 17b48e5baf09a86ea6e2036c864a882bb0c328e2) --- docs/docker-stack/build.rst | 19 +++++++++++++++++-- docs/spelling_wordlist.txt | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/docker-stack/build.rst b/docs/docker-stack/build.rst index b85bf1caf522f..6b5dc472c481f 100644 --- a/docs/docker-stack/build.rst +++ b/docs/docker-stack/build.rst @@ -81,8 +81,23 @@ In the simplest case building your image consists of those steps: 4) Once you build the image locally you have usually several options to make them available for your deployment: -* For ``docker-compose`` deployment, that's all you need. 
The image is stored in docker engine cache - and docker compose will use it from there. +* For ``docker-compose`` deployment, if you've already built your image, and want to continue + building the image manually when needed with ``docker build``, you can edit the + docker-compose.yaml and replace the "apache/airflow:" image with the + image you've just built ``my-image:0.0.1`` - it will be used from your local Docker + Engine cache. You can also simply set ``AIRFLOW_IMAGE_NAME`` variable to + point to your image and ``docker-compose`` will use it automatically without having + to modify the file. + +* Also for ``docker-compose`` deployment, you can delegate image building to the docker-compose. + To do that - open your ``docker-compose.yaml`` file and search for the phrase "In order to add custom dependencies". + Follow these instructions of commenting the "image" line and uncommenting the "build" line. + This is a standard docker-compose feature and you can read about it in + `Docker Compose build reference `_. + Run ``docker-compose build`` to build the images. Similarly as in the previous case, the + image is stored in Docker engine cache and Docker Compose will use it from there. + The ``docker-compose build`` command uses the same ``docker build`` command that + you can run manually under-the-hood. * For some - development targeted - Kubernetes deployments you can load the images directly to Kubernetes clusters. Clusters such as ``kind`` or ``minikube`` have dedicated ``load`` method to load the diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 64d839f428865..5d77e299558b9 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -1384,6 +1384,7 @@ uid umask un unarchived +uncommenting undead ungenerated unicode From 9f7d292769fd21c06ab0f222d7c75085d8aab288 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Thu, 27 Jan 2022 09:45:33 -0700 Subject: [PATCH 209/250] Update `version_added` for `[email] from_email` (#21138) (cherry picked from commit 362f397d7a3351c718b798a146f2f955a17b7074) --- airflow/config_templates/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index a70854e6bf5f9..6941f03e53550 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -1357,7 +1357,7 @@ description: | Email address that will be used as sender address. 
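Tying back to the docker-compose guidance above, a hypothetical sequence for pointing the compose file at a locally built image without editing it; `my-image:0.0.1` is the placeholder name used in that guidance:

```bash
# Hypothetical sketch: build a custom image and let docker-compose pick it up via
# the AIRFLOW_IMAGE_NAME variable instead of editing docker-compose.yaml.
docker build . -t my-image:0.0.1
export AIRFLOW_IMAGE_NAME=my-image:0.0.1
docker-compose up -d
```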
It can either be raw email or the complete address in a format ``Sender Name `` - version_added: 2.3.0 + version_added: 2.2.4 type: string example: "Airflow " default: ~ From 07102e96dfb3c9794882f562548b37738ee4a37a Mon Sep 17 00:00:00 2001 From: yuqian90 Date: Thu, 27 Jan 2022 06:47:10 +0800 Subject: [PATCH 210/250] Do not set `TaskInstance.max_tries` in `refresh_from_task` (#21018) (cherry picked from commit e3832a77a3e0d374dfdbe14f34a941d22c9c459d) --- airflow/models/taskinstance.py | 4 +++- tests/models/test_taskinstance.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index 281d067861f44..ec34156931edf 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -447,6 +447,7 @@ def __init__( self.run_id = run_id self.try_number = 0 + self.max_tries = self.task.retries self.unixname = getuser() if state: self.state = state @@ -775,7 +776,8 @@ def refresh_from_task(self, task, pool_override=None): self.pool_slots = task.pool_slots self.priority_weight = task.priority_weight_total self.run_as_user = task.run_as_user - self.max_tries = task.retries + # Do not set max_tries to task.retries here because max_tries is a cumulative + # value that needs to be stored in the db. self.executor_config = task.executor_config self.operator = task.task_type diff --git a/tests/models/test_taskinstance.py b/tests/models/test_taskinstance.py index d1113714071fd..4fec49fe2b05c 100644 --- a/tests/models/test_taskinstance.py +++ b/tests/models/test_taskinstance.py @@ -2143,6 +2143,12 @@ def test_refresh_from_task(pool_override): assert ti.executor_config == task.executor_config assert ti.operator == DummyOperator.__name__ + # Test that refresh_from_task does not reset ti.max_tries + expected_max_tries = task.retries + 10 + ti.max_tries = expected_max_tries + ti.refresh_from_task(task) + assert ti.max_tries == expected_max_tries + class TestRunRawTaskQueriesCount: """ From dda8f4356525041c5200c42d00e5dc05fd79c54b Mon Sep 17 00:00:00 2001 From: SeonghwanLee <50520567+uplsh580@users.noreply.github.com> Date: Thu, 27 Jan 2022 14:36:24 +0900 Subject: [PATCH 211/250] Fix 'airflow dags backfill --reset-dagruns' errors when run twice (#21062) Co-authored-by: uplsh (cherry picked from commit d97e2bac854f9891eb47f0c06c261e89723038ca) --- airflow/cli/commands/dag_command.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airflow/cli/commands/dag_command.py b/airflow/cli/commands/dag_command.py index e04bc7381a2f0..6e8e157e01922 100644 --- a/airflow/cli/commands/dag_command.py +++ b/airflow/cli/commands/dag_command.py @@ -47,7 +47,7 @@ ) from airflow.utils.dot_renderer import render_dag from airflow.utils.session import create_session, provide_session -from airflow.utils.state import State +from airflow.utils.state import DagRunState @cli_utils.action_logging @@ -105,7 +105,7 @@ def dag_backfill(args, dag=None): end_date=args.end_date, confirm_prompt=not args.yes, include_subdags=True, - dag_run_state=State.NONE, + dag_run_state=DagRunState.QUEUED, ) try: From 016929f782022791557d0ec1f316b338dc210566 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Tue, 7 Dec 2021 17:55:00 +0800 Subject: [PATCH 212/250] Type-annotate SkipMixin and BaseXCom (#20011) (cherry picked from commit 6dd0a0df7e6a2f025e9234bdbf97b41e9b8f6257) --- airflow/models/skipmixin.py | 15 +- airflow/models/xcom.py | 335 ++++++++++++++++++++++++------------ 2 files changed, 232 insertions(+), 118 deletions(-) diff --git 
a/airflow/models/skipmixin.py b/airflow/models/skipmixin.py index 5cd50a3165b7e..765a94712ca0e 100644 --- a/airflow/models/skipmixin.py +++ b/airflow/models/skipmixin.py @@ -17,7 +17,7 @@ # under the License. import warnings -from typing import TYPE_CHECKING, Iterable, Union +from typing import TYPE_CHECKING, Iterable, Optional, Sequence, Union from airflow.models.taskinstance import TaskInstance from airflow.utils import timezone @@ -26,6 +26,7 @@ from airflow.utils.state import State if TYPE_CHECKING: + from pendulum import DateTime from sqlalchemy import Session from airflow.models import DagRun @@ -66,9 +67,9 @@ def _set_state_to_skipped(self, dag_run: "DagRun", tasks: "Iterable[BaseOperator def skip( self, dag_run: "DagRun", - execution_date: "timezone.DateTime", - tasks: "Iterable[BaseOperator]", - session: "Session" = None, + execution_date: "DateTime", + tasks: Sequence["BaseOperator"], + session: "Session", ): """ Sets tasks instances to skipped from the same dag run. @@ -114,11 +115,7 @@ def skip( session.commit() # SkipMixin may not necessarily have a task_id attribute. Only store to XCom if one is available. - try: - task_id = self.task_id - except AttributeError: - task_id = None - + task_id: Optional[str] = getattr(self, "task_id", None) if task_id is not None: from airflow.models.xcom import XCom diff --git a/airflow/models/xcom.py b/airflow/models/xcom.py index 99c2b9aca5b2b..4bb9689e7dda6 100644 --- a/airflow/models/xcom.py +++ b/airflow/models/xcom.py @@ -16,10 +16,11 @@ # specific language governing permissions and limitations # under the License. +import datetime import json import logging import pickle -from typing import Any, Iterable, Optional, Union +from typing import TYPE_CHECKING, Any, Iterable, Optional, Type, Union, cast, overload import pendulum from sqlalchemy import Column, LargeBinary, String @@ -79,14 +80,60 @@ def init_on_load(self): def __repr__(self): return f'' + @overload @classmethod - @provide_session - def set(cls, key, value, task_id, dag_id, execution_date=None, run_id=None, session=None): + def set( + cls, + key: str, + value: Any, + *, + dag_id: str, + task_id: str, + run_id: str, + session: Optional[Session] = None, + ) -> None: + """Store an XCom value. + + A deprecated form of this function accepts ``execution_date`` instead of + ``run_id``. The two arguments are mutually exclusive. + + :param key: Key to store the XCom. + :param value: XCom value to store. + :param dag_id: DAG ID. + :param task_id: Task ID. + :param run_id: DAG run ID for the task. + :param session: Database session. If not given, a new session will be + created for this function. + :type session: sqlalchemy.orm.session.Session """ - Store an XCom value. 
- :return: None - """ + @overload + @classmethod + def set( + cls, + key: str, + value: Any, + task_id: str, + dag_id: str, + execution_date: datetime.datetime, + session: Optional[Session] = None, + ) -> None: + """:sphinx-autoapi-skip:""" + + @classmethod + @provide_session + def set( + cls, + key: str, + value: Any, + task_id: str, + dag_id: str, + execution_date: Optional[datetime.datetime] = None, + session: Session = None, + *, + run_id: Optional[str] = None, + ) -> None: + """:sphinx-autoapi-skip:""" if not (execution_date is None) ^ (run_id is None): raise ValueError("Exactly one of execution_date or run_id must be passed") @@ -94,70 +141,95 @@ def set(cls, key, value, task_id, dag_id, execution_date=None, run_id=None, sess from airflow.models.dagrun import DagRun dag_run = session.query(DagRun).filter_by(dag_id=dag_id, run_id=run_id).one() - execution_date = dag_run.execution_date - value = XCom.serialize_value(value) - - # remove any duplicate XComs + # Remove duplicate XComs and insert a new one. session.query(cls).filter( - cls.key == key, cls.execution_date == execution_date, cls.task_id == task_id, cls.dag_id == dag_id + cls.key == key, + cls.execution_date == execution_date, + cls.task_id == task_id, + cls.dag_id == dag_id, ).delete() - + new = cast(Any, cls)( # Work around Mypy complaining model not defining '__init__'. + key=key, + value=cls.serialize_value(value), + execution_date=execution_date, + task_id=task_id, + dag_id=dag_id, + ) + session.add(new) session.flush() - # insert new XCom - session.add(XCom(key=key, value=value, execution_date=execution_date, task_id=task_id, dag_id=dag_id)) + @overload + @classmethod + def get_one( + cls, + *, + run_id: str, + key: Optional[str] = None, + task_id: Optional[str] = None, + dag_id: Optional[str] = None, + include_prior_dates: bool = False, + session: Optional[Session] = None, + ) -> Optional[Any]: + """Retrieve an XCom value, optionally meeting certain criteria. + + This method returns "full" XCom values (i.e. uses ``deserialize_value`` + from the XCom backend). Use :meth:`get_many` if you want the "shortened" + value via ``orm_deserialize_value``. + + If there are no results, *None* is returned. + + A deprecated form of this function accepts ``execution_date`` instead of + ``run_id``. The two arguments are mutually exclusive. + + :param run_id: DAG run ID for the task. + :param key: A key for the XCom. If provided, only XCom with matching + keys will be returned. Pass *None* (default) to remove the filter. + :param task_id: Only XCom from task with matching ID will be pulled. + Pass *None* (default) to remove the filter. + :param dag_id: Only pull XCom from this DAG. If *None* (default), the + DAG of the calling task is used. + :param include_prior_dates: If *False* (default), only XCom from the + specified DAG run is returned. If *True*, the latest matching XCom is + returned regardless of the run it belongs to. + :param session: Database session. If not given, a new session will be + created for this function. 
+ :type session: sqlalchemy.orm.session.Session + """ - session.flush() + @overload + @classmethod + def get_one( + cls, + execution_date: pendulum.DateTime, + key: Optional[str] = None, + task_id: Optional[str] = None, + dag_id: Optional[str] = None, + include_prior_dates: bool = False, + session: Optional[Session] = None, + ) -> Optional[Any]: + """:sphinx-autoapi-skip:""" @classmethod @provide_session def get_one( cls, execution_date: Optional[pendulum.DateTime] = None, - run_id: Optional[str] = None, key: Optional[str] = None, task_id: Optional[Union[str, Iterable[str]]] = None, dag_id: Optional[Union[str, Iterable[str]]] = None, include_prior_dates: bool = False, session: Session = None, + *, + run_id: Optional[str] = None, ) -> Optional[Any]: - """ - Retrieve an XCom value, optionally meeting certain criteria. Returns None - of there are no results. - - ``run_id`` and ``execution_date`` are mutually exclusive. - - This method returns "full" XCom values (i.e. it uses ``deserialize_value`` from the XCom backend). - Please use :meth:`get_many` if you want the "shortened" value via ``orm_deserialize_value`` - - :param execution_date: Execution date for the task - :type execution_date: pendulum.datetime - :param run_id: Dag run id for the task - :type run_id: str - :param key: A key for the XCom. If provided, only XComs with matching - keys will be returned. To remove the filter, pass key=None. - :type key: str - :param task_id: Only XComs from task with matching id will be - pulled. Can pass None to remove the filter. - :type task_id: str - :param dag_id: If provided, only pulls XCom from this DAG. - If None (default), the DAG of the calling task is used. - :type dag_id: str - :param include_prior_dates: If False, only XCom from the current - execution_date are returned. If True, XCom from previous dates - are returned as well. - :type include_prior_dates: bool - :param session: database session - :type session: sqlalchemy.orm.session.Session - """ + """:sphinx-autoapi-skip:""" if not (execution_date is None) ^ (run_id is None): raise ValueError("Exactly one of execution_date or run_id must be passed") - result = ( - cls.get_many( - execution_date=execution_date, + if run_id is not None: + query = cls.get_many( run_id=run_id, key=key, task_ids=task_id, @@ -165,58 +237,88 @@ def get_one( include_prior_dates=include_prior_dates, session=session, ) - .with_entities(cls.value) - .first() - ) + elif execution_date is not None: + query = cls.get_many( + execution_date=execution_date, + key=key, + task_ids=task_id, + dag_ids=dag_id, + include_prior_dates=include_prior_dates, + session=session, + ) + else: + raise RuntimeError("Should not happen?") + + result = query.with_entities(cls.value).first() if result: return cls.deserialize_value(result) return None + @overload + @classmethod + def get_many( + cls, + *, + run_id: str, + key: Optional[str] = None, + task_ids: Union[str, Iterable[str], None] = None, + dag_ids: Union[str, Iterable[str], None] = None, + include_prior_dates: bool = False, + limit: Optional[int] = None, + session: Optional[Session] = None, + ) -> Query: + """Composes a query to get one or more XCom entries. + + This function returns an SQLAlchemy query of full XCom objects. If you + just want one stored value, use :meth:`get_one` instead. + + A deprecated form of this function accepts ``execution_date`` instead of + ``run_id``. The two arguments are mutually exclusive. + + :param run_id: DAG run ID for the task. + :param key: A key for the XComs. 
If provided, only XComs with matching + keys will be returned. Pass *None* (default) to remove the filter. + :param task_ids: Only XComs from task with matching IDs will be pulled. + Pass *None* (default) to remove the filter. + :param dag_id: Only pulls XComs from this DAG. If *None* (default), the + DAG of the calling task is used. + :param include_prior_dates: If *False* (default), only XComs from the + specified DAG run are returned. If *True*, all matching XComs are + returned regardless of the run it belongs to. + :param session: Database session. If not given, a new session will be + created for this function. + :type session: sqlalchemy.orm.session.Session + """ + + @overload + @classmethod + def get_many( + cls, + execution_date: pendulum.DateTime, + key: Optional[str] = None, + task_ids: Union[str, Iterable[str], None] = None, + dag_ids: Union[str, Iterable[str], None] = None, + include_prior_dates: bool = False, + limit: Optional[int] = None, + session: Optional[Session] = None, + ) -> Query: + """:sphinx-autoapi-skip:""" + @classmethod @provide_session def get_many( cls, execution_date: Optional[pendulum.DateTime] = None, - run_id: Optional[str] = None, key: Optional[str] = None, task_ids: Optional[Union[str, Iterable[str]]] = None, dag_ids: Optional[Union[str, Iterable[str]]] = None, include_prior_dates: bool = False, limit: Optional[int] = None, session: Session = None, + *, + run_id: Optional[str] = None, ) -> Query: - """ - Composes a query to get one or more values from the xcom table. - - ``run_id`` and ``execution_date`` are mutually exclusive. - - This function returns an SQLAlchemy query of full XCom objects. If you just want one stored value then - use :meth:`get_one`. - - :param execution_date: Execution date for the task - :type execution_date: pendulum.datetime - :param run_id: Dag run id for the task - :type run_id: str - :param key: A key for the XCom. If provided, only XComs with matching - keys will be returned. To remove the filter, pass key=None. - :type key: str - :param task_ids: Only XComs from tasks with matching ids will be - pulled. Can pass None to remove the filter. - :type task_ids: str or iterable of strings (representing task_ids) - :param dag_ids: If provided, only pulls XComs from this DAG. - If None (default), the DAG of the calling task is used. - :type dag_ids: str - :param include_prior_dates: If False, only XComs from the current - execution_date are returned. If True, XComs from previous dates - are returned as well. - :type include_prior_dates: bool - :param limit: If required, limit the number of returned objects. - XCom objects can be quite big and you might want to limit the - number of rows. 
- :type limit: int - :param session: database session - :type session: sqlalchemy.orm.session.Session - """ + """:sphinx-autoapi-skip:""" if not (execution_date is None) ^ (run_id is None): raise ValueError("Exactly one of execution_date or run_id must be passed") @@ -262,8 +364,8 @@ def get_many( @classmethod @provide_session - def delete(cls, xcoms, session=None): - """Delete Xcom""" + def delete(cls, xcoms: Union["XCom", Iterable["XCom"]], session: Session) -> None: + """Delete one or multiple XCom entries.""" if isinstance(xcoms, XCom): xcoms = [xcoms] for xcom in xcoms: @@ -272,37 +374,49 @@ def delete(cls, xcoms, session=None): session.delete(xcom) session.commit() + @overload + @classmethod + def clear(cls, *, dag_id: str, task_id: str, run_id: str, session: Optional[Session] = None) -> None: + """Clear all XCom data from the database for the given task instance. + + A deprecated form of this function accepts ``execution_date`` instead of + ``run_id``. The two arguments are mutually exclusive. + + :param dag_id: ID of DAG to clear the XCom for. + :param task_id: ID of task to clear the XCom for. + :param run_id: ID of DAG run to clear the XCom for. + :param session: Database session. If not given, a new session will be + created for this function. + :type session: sqlalchemy.orm.session.Session + """ + + @overload + @classmethod + def clear( + cls, + execution_date: pendulum.DateTime, + dag_id: str, + task_id: str, + session: Optional[Session] = None, + ) -> None: + """:sphinx-autoapi-skip:""" + @classmethod @provide_session def clear( cls, execution_date: Optional[pendulum.DateTime] = None, - dag_id: str = None, - task_id: str = None, - run_id: str = None, + dag_id: Optional[str] = None, + task_id: Optional[str] = None, + run_id: Optional[str] = None, session: Session = None, ) -> None: - """ - Clears all XCom data from the database for the task instance - - ``run_id`` and ``execution_date`` are mutually exclusive. - - :param execution_date: Execution date for the task - :type execution_date: pendulum.datetime or None - :param dag_id: ID of DAG to clear the XCom for. - :type dag_id: str - :param task_id: Only XComs from task with matching id will be cleared. - :type task_id: str - :param run_id: Dag run id for the task - :type run_id: str or None - :param session: database session - :type session: sqlalchemy.orm.session.Session - """ + """:sphinx-autoapi-skip:""" # Given the historic order of this function (execution_date was first argument) to add a new optional # param we need to add default values for everything :( - if not dag_id: + if dag_id is None: raise TypeError("clear() missing required argument: dag_id") - if not task_id: + if task_id is None: raise TypeError("clear() missing required argument: task_id") if not (execution_date is None) ^ (run_id is None): @@ -364,7 +478,7 @@ def orm_deserialize_value(self) -> Any: return BaseXCom.deserialize_value(self) -def resolve_xcom_backend(): +def resolve_xcom_backend() -> Type[BaseXCom]: """Resolves custom XCom class""" clazz = conf.getimport("core", "xcom_backend", fallback=f"airflow.models.xcom.{BaseXCom.__name__}") if clazz: @@ -376,4 +490,7 @@ def resolve_xcom_backend(): return BaseXCom -XCom = resolve_xcom_backend() +if TYPE_CHECKING: + XCom = BaseXCom # Hack to avoid Mypy "Variable 'XCom' is not valid as a type". 
+else: + XCom = resolve_xcom_backend() From dda864d585431c1c46c2705c40ed27ab9c43be72 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Tue, 7 Dec 2021 21:50:34 +0800 Subject: [PATCH 213/250] Helper for provide_session-decorated functions (#20104) * Helper for provide_session-decorated functions * Apply NEW_SESSION trick on XCom (cherry picked from commit a80ac1eecc0ea187de7984510b4ef6f981b97196) --- airflow/models/xcom.py | 24 ++++++++++++------------ airflow/settings.py | 10 ++++++---- airflow/utils/session.py | 11 +++++++++-- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/airflow/models/xcom.py b/airflow/models/xcom.py index 4bb9689e7dda6..5efaa0ac54f61 100644 --- a/airflow/models/xcom.py +++ b/airflow/models/xcom.py @@ -32,7 +32,7 @@ from airflow.utils import timezone from airflow.utils.helpers import is_container from airflow.utils.log.logging_mixin import LoggingMixin -from airflow.utils.session import provide_session +from airflow.utils.session import NEW_SESSION, provide_session from airflow.utils.sqlalchemy import UtcDateTime log = logging.getLogger(__name__) @@ -90,7 +90,7 @@ def set( dag_id: str, task_id: str, run_id: str, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> None: """Store an XCom value. @@ -116,7 +116,7 @@ def set( task_id: str, dag_id: str, execution_date: datetime.datetime, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> None: """:sphinx-autoapi-skip:""" @@ -129,7 +129,7 @@ def set( task_id: str, dag_id: str, execution_date: Optional[datetime.datetime] = None, - session: Session = None, + session: Session = NEW_SESSION, *, run_id: Optional[str] = None, ) -> None: @@ -170,7 +170,7 @@ def get_one( task_id: Optional[str] = None, dag_id: Optional[str] = None, include_prior_dates: bool = False, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> Optional[Any]: """Retrieve an XCom value, optionally meeting certain criteria. @@ -207,7 +207,7 @@ def get_one( task_id: Optional[str] = None, dag_id: Optional[str] = None, include_prior_dates: bool = False, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> Optional[Any]: """:sphinx-autoapi-skip:""" @@ -220,7 +220,7 @@ def get_one( task_id: Optional[Union[str, Iterable[str]]] = None, dag_id: Optional[Union[str, Iterable[str]]] = None, include_prior_dates: bool = False, - session: Session = None, + session: Session = NEW_SESSION, *, run_id: Optional[str] = None, ) -> Optional[Any]: @@ -265,7 +265,7 @@ def get_many( dag_ids: Union[str, Iterable[str], None] = None, include_prior_dates: bool = False, limit: Optional[int] = None, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> Query: """Composes a query to get one or more XCom entries. 
@@ -300,7 +300,7 @@ def get_many( dag_ids: Union[str, Iterable[str], None] = None, include_prior_dates: bool = False, limit: Optional[int] = None, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> Query: """:sphinx-autoapi-skip:""" @@ -314,7 +314,7 @@ def get_many( dag_ids: Optional[Union[str, Iterable[str]]] = None, include_prior_dates: bool = False, limit: Optional[int] = None, - session: Session = None, + session: Session = NEW_SESSION, *, run_id: Optional[str] = None, ) -> Query: @@ -397,7 +397,7 @@ def clear( execution_date: pendulum.DateTime, dag_id: str, task_id: str, - session: Optional[Session] = None, + session: Session = NEW_SESSION, ) -> None: """:sphinx-autoapi-skip:""" @@ -409,7 +409,7 @@ def clear( dag_id: Optional[str] = None, task_id: Optional[str] = None, run_id: Optional[str] = None, - session: Session = None, + session: Session = NEW_SESSION, ) -> None: """:sphinx-autoapi-skip:""" # Given the historic order of this function (execution_date was first argument) to add a new optional diff --git a/airflow/settings.py b/airflow/settings.py index f9b97a25c4c2a..139d6a40b18d6 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -22,7 +22,7 @@ import os import sys import warnings -from typing import Optional +from typing import TYPE_CHECKING, Callable, List, Optional import pendulum import sqlalchemy @@ -37,6 +37,9 @@ from airflow.logging_config import configure_logging from airflow.utils.orm_event_handlers import setup_event_handlers +if TYPE_CHECKING: + from airflow.www.utils import UIAlert + log = logging.getLogger(__name__) @@ -77,7 +80,7 @@ DAGS_FOLDER: str = os.path.expanduser(conf.get('core', 'DAGS_FOLDER')) engine: Optional[Engine] = None -Session: Optional[SASession] = None +Session: Callable[..., SASession] # The JSON library to use for DAG Serialization and De-Serialization json = json @@ -563,8 +566,7 @@ def initialize(): # UIAlert('Visit airflow.apache.org', html=True), # ] # -# DASHBOARD_UIALERTS: List["UIAlert"] -DASHBOARD_UIALERTS = [] +DASHBOARD_UIALERTS: List["UIAlert"] = [] # Prefix used to identify tables holding data moved during migration. AIRFLOW_MOVED_TABLE_PREFIX = "_airflow_moved" diff --git a/airflow/utils/session.py b/airflow/utils/session.py index 9636fc401e6cc..f0c31687ff1ab 100644 --- a/airflow/utils/session.py +++ b/airflow/utils/session.py @@ -18,7 +18,7 @@ import contextlib from functools import wraps from inspect import signature -from typing import Callable, Iterator, TypeVar +from typing import Callable, Iterator, TypeVar, cast from airflow import settings @@ -26,7 +26,7 @@ @contextlib.contextmanager def create_session() -> Iterator[settings.SASession]: """Contextmanager that will create and teardown a session.""" - session: settings.SASession = settings.Session() + session = settings.Session() try: yield session session.commit() @@ -105,3 +105,10 @@ def create_global_lock(session=None, pg_lock_id=1, lock_name='init', mysql_lock_ if dialect.name == 'mssql': # TODO: make locking works for MSSQL pass + + +# A fake session to use in functions decorated by provide_session. This allows +# the 'session' argument to be of type Session instead of Optional[Session], +# making it easier to type hint the function body without dealing with the None +# case that can never happen at runtime. 
+NEW_SESSION: settings.SASession = cast(settings.SASession, None) From daebc586d0aaaddaea4658734c9292dece150c6a Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Fri, 21 Jan 2022 21:44:40 +0800 Subject: [PATCH 214/250] Fix session usage in ``/rendered-k8s`` view (#21006) We can't commit the session too early because later functions need that session to fetch related objects. Fix #20534. (cherry picked from commit a665f48b606065977e0d3952bc74635ce11726d1) --- airflow/www/views.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/airflow/www/views.py b/airflow/www/views.py index 9b868f3faaccd..2182a1706aeec 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -84,7 +84,7 @@ from pygments.formatters import HtmlFormatter from sqlalchemy import Date, and_, desc, func, inspect, union_all from sqlalchemy.exc import IntegrityError -from sqlalchemy.orm import joinedload +from sqlalchemy.orm import Session, joinedload from wtforms import SelectField, validators from wtforms.validators import InputRequired @@ -116,7 +116,7 @@ from airflow.utils.helpers import alchemy_to_dict from airflow.utils.log import secrets_masker from airflow.utils.log.log_reader import TaskLogReader -from airflow.utils.session import create_session, provide_session +from airflow.utils.session import NEW_SESSION, create_session, provide_session from airflow.utils.state import State from airflow.utils.strings import to_boolean from airflow.version import version @@ -1124,7 +1124,8 @@ def rendered_templates(self, session): ] ) @action_logging - def rendered_k8s(self): + @provide_session + def rendered_k8s(self, session: Session = NEW_SESSION): """Get rendered k8s yaml.""" if not settings.IS_K8S_OR_K8SCELERY_EXECUTOR: abort(404) @@ -1135,14 +1136,15 @@ def rendered_k8s(self): form = DateTimeForm(data={'execution_date': dttm}) root = request.args.get('root', '') logging.info("Retrieving rendered templates.") - dag = current_app.dag_bag.get_dag(dag_id) + + dag: DAG = current_app.dag_bag.get_dag(dag_id) task = dag.get_task(task_id) - dag_run = dag.get_dagrun(execution_date=dttm) - ti = dag_run.get_task_instance(task_id=task.task_id) + dag_run = dag.get_dagrun(execution_date=dttm, session=session) + ti = dag_run.get_task_instance(task_id=task.task_id, session=session) pod_spec = None try: - pod_spec = ti.get_rendered_k8s_spec() + pod_spec = ti.get_rendered_k8s_spec(session=session) except AirflowException as e: msg = "Error rendering Kubernetes POD Spec: " + escape(e) if e.__cause__: From 663bb546e782748bdd315483ca2070a77046997a Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Thu, 16 Dec 2021 12:30:42 +0100 Subject: [PATCH 215/250] Deprecate some functions in the experimental API (#19931) This PR seeks to deprecate some functions in the experimental API. Some of the deprecated functions are only used in the experimental REST API, others that are valid are being moved out of the experimental package. 
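As a sketch of the import-path migration this patch describes (the DAG id below is a placeholder, and calling the function requires an initialized Airflow metadata database):

```python
# Sketch of the migration described above. The experimental module keeps working
# but now emits a DeprecationWarning; the new module is the canonical path.
from airflow.api.common.delete_dag import delete_dag        # new location
# from airflow.api.common.experimental.delete_dag import delete_dag   # deprecated shim

# Requires an initialized Airflow metadata database; the dag_id is a placeholder.
count = delete_dag(dag_id="example_dag_to_remove")
print(f"Removed {count} record(s)")
```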
(cherry picked from commit 6239ae91a4c8bfb05f053a61cb8386f2d63b8b3a) --- airflow/api/client/local_client.py | 29 ++-- airflow/api/common/delete_dag.py | 83 ++++++++++++ airflow/api/common/experimental/delete_dag.py | 70 +--------- airflow/api/common/experimental/get_code.py | 3 + .../common/experimental/get_dag_run_state.py | 3 + airflow/api/common/experimental/get_task.py | 3 + .../common/experimental/get_task_instance.py | 3 + airflow/api/common/experimental/pool.py | 6 + .../api/common/experimental/trigger_dag.py | 115 +--------------- airflow/api/common/trigger_dag.py | 127 ++++++++++++++++++ .../api_connexion/endpoints/dag_endpoint.py | 7 +- airflow/models/pool.py | 52 ++++++- airflow/operators/trigger_dagrun.py | 2 +- airflow/utils/db.py | 15 +++ airflow/www/views.py | 2 +- setup.cfg | 1 + tests/api/client/test_local_client.py | 31 ++++- .../{experimental => }/test_delete_dag.py | 2 +- .../{experimental => }/test_trigger_dag.py | 8 +- tests/models/test_pool.py | 71 ++++++++++ 20 files changed, 435 insertions(+), 198 deletions(-) create mode 100644 airflow/api/common/delete_dag.py create mode 100644 airflow/api/common/trigger_dag.py rename tests/api/common/{experimental => }/test_delete_dag.py (99%) rename tests/api/common/{experimental => }/test_trigger_dag.py (93%) diff --git a/airflow/api/client/local_client.py b/airflow/api/client/local_client.py index 7ce0d1655da6e..c0050672a8e47 100644 --- a/airflow/api/client/local_client.py +++ b/airflow/api/client/local_client.py @@ -18,8 +18,10 @@ """Local client API""" from airflow.api.client import api_client -from airflow.api.common.experimental import delete_dag, pool, trigger_dag +from airflow.api.common import delete_dag, trigger_dag from airflow.api.common.experimental.get_lineage import get_lineage as get_lineage_api +from airflow.exceptions import AirflowBadRequest, PoolNotFound +from airflow.models.pool import Pool class Client(api_client.Client): @@ -36,19 +38,30 @@ def delete_dag(self, dag_id): return f"Removed {count} record(s)" def get_pool(self, name): - the_pool = pool.get_pool(name=name) - return the_pool.pool, the_pool.slots, the_pool.description + pool = Pool.get_pool(pool_name=name) + if not pool: + raise PoolNotFound(f"Pool {name} not found") + return pool.pool, pool.slots, pool.description def get_pools(self): - return [(p.pool, p.slots, p.description) for p in pool.get_pools()] + return [(p.pool, p.slots, p.description) for p in Pool.get_pools()] def create_pool(self, name, slots, description): - the_pool = pool.create_pool(name=name, slots=slots, description=description) - return the_pool.pool, the_pool.slots, the_pool.description + if not (name and name.strip()): + raise AirflowBadRequest("Pool name shouldn't be empty") + pool_name_length = Pool.pool.property.columns[0].type.length + if len(name) > pool_name_length: + raise AirflowBadRequest(f"pool name cannot be more than {pool_name_length} characters") + try: + slots = int(slots) + except ValueError: + raise AirflowBadRequest(f"Bad value for `slots`: {slots}") + pool = Pool.create_or_update_pool(name=name, slots=slots, description=description) + return pool.pool, pool.slots, pool.description def delete_pool(self, name): - the_pool = pool.delete_pool(name=name) - return the_pool.pool, the_pool.slots, the_pool.description + pool = Pool.delete_pool(name=name) + return pool.pool, pool.slots, pool.description def get_lineage(self, dag_id, execution_date): lineage = get_lineage_api(dag_id=dag_id, execution_date=execution_date) diff --git 
a/airflow/api/common/delete_dag.py b/airflow/api/common/delete_dag.py new file mode 100644 index 0000000000000..c448127f2c484 --- /dev/null +++ b/airflow/api/common/delete_dag.py @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Delete DAGs APIs.""" +import logging + +from sqlalchemy import or_ + +from airflow import models +from airflow.exceptions import AirflowException, DagNotFound +from airflow.models import DagModel, TaskFail +from airflow.models.serialized_dag import SerializedDagModel +from airflow.utils.db import get_sqla_model_classes +from airflow.utils.session import provide_session +from airflow.utils.state import State + +log = logging.getLogger(__name__) + + +@provide_session +def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> int: + """ + :param dag_id: the dag_id of the DAG to delete + :param keep_records_in_log: whether keep records of the given dag_id + in the Log table in the backend database (for reasons like auditing). + The default value is True. + :param session: session used + :return count of deleted dags + """ + log.info("Deleting DAG: %s", dag_id) + running_tis = ( + session.query(models.TaskInstance.state) + .filter(models.TaskInstance.dag_id == dag_id) + .filter(models.TaskInstance.state == State.RUNNING) + .first() + ) + if running_tis: + raise AirflowException("TaskInstances still running") + dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first() + if dag is None: + raise DagNotFound(f"Dag id {dag_id} not found") + + # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval. + # There may be a lag, so explicitly removes serialized DAG here. 
+ if SerializedDagModel.has_dag(dag_id=dag_id, session=session): + SerializedDagModel.remove_dag(dag_id=dag_id, session=session) + + count = 0 + + for model in get_sqla_model_classes(): + if hasattr(model, "dag_id"): + if keep_records_in_log and model.__name__ == 'Log': + continue + cond = or_(model.dag_id == dag_id, model.dag_id.like(dag_id + ".%")) + count += session.query(model).filter(cond).delete(synchronize_session='fetch') + if dag.is_subdag: + parent_dag_id, task_id = dag_id.rsplit(".", 1) + for model in TaskFail, models.TaskInstance: + count += ( + session.query(model).filter(model.dag_id == parent_dag_id, model.task_id == task_id).delete() + ) + + # Delete entries in Import Errors table for a deleted DAG + # This handles the case when the dag_id is changed in the file + session.query(models.ImportError).filter(models.ImportError.filename == dag.fileloc).delete( + synchronize_session='fetch' + ) + + return count diff --git a/airflow/api/common/experimental/delete_dag.py b/airflow/api/common/experimental/delete_dag.py index 44e54e3738349..36bf7dd8c46a7 100644 --- a/airflow/api/common/experimental/delete_dag.py +++ b/airflow/api/common/experimental/delete_dag.py @@ -15,68 +15,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Delete DAGs APIs.""" -import logging +import warnings -from sqlalchemy import or_ +from airflow.api.common.delete_dag import * # noqa -from airflow import models -from airflow.exceptions import AirflowException, DagNotFound -from airflow.models import DagModel, TaskFail -from airflow.models.serialized_dag import SerializedDagModel -from airflow.utils.session import provide_session -from airflow.utils.state import State - -log = logging.getLogger(__name__) - - -@provide_session -def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> int: - """ - :param dag_id: the dag_id of the DAG to delete - :param keep_records_in_log: whether keep records of the given dag_id - in the Log table in the backend database (for reasons like auditing). - The default value is True. - :param session: session used - :return count of deleted dags - """ - log.info("Deleting DAG: %s", dag_id) - running_tis = ( - session.query(models.TaskInstance.state) - .filter(models.TaskInstance.dag_id == dag_id) - .filter(models.TaskInstance.state == State.RUNNING) - .first() - ) - if running_tis: - raise AirflowException("TaskInstances still running") - dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first() - if dag is None: - raise DagNotFound(f"Dag id {dag_id} not found") - - # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval. - # There may be a lag, so explicitly removes serialized DAG here. 
- if SerializedDagModel.has_dag(dag_id=dag_id, session=session): - SerializedDagModel.remove_dag(dag_id=dag_id, session=session) - - count = 0 - - for model in models.base.Base._decl_class_registry.values(): - if hasattr(model, "dag_id"): - if keep_records_in_log and model.__name__ == 'Log': - continue - cond = or_(model.dag_id == dag_id, model.dag_id.like(dag_id + ".%")) - count += session.query(model).filter(cond).delete(synchronize_session='fetch') - if dag.is_subdag: - parent_dag_id, task_id = dag_id.rsplit(".", 1) - for model in TaskFail, models.TaskInstance: - count += ( - session.query(model).filter(model.dag_id == parent_dag_id, model.task_id == task_id).delete() - ) - - # Delete entries in Import Errors table for a deleted DAG - # This handles the case when the dag_id is changed in the file - session.query(models.ImportError).filter(models.ImportError.filename == dag.fileloc).delete( - synchronize_session='fetch' - ) - - return count +warnings.warn( + "This module is deprecated. Please use `airflow.api.common.delete_dag` instead.", + DeprecationWarning, + stacklevel=2, +) diff --git a/airflow/api/common/experimental/get_code.py b/airflow/api/common/experimental/get_code.py index 79b0b9f492654..1a1fb621dbe48 100644 --- a/airflow/api/common/experimental/get_code.py +++ b/airflow/api/common/experimental/get_code.py @@ -16,11 +16,14 @@ # specific language governing permissions and limitations # under the License. """Get code APIs.""" +from deprecated import deprecated + from airflow.api.common.experimental import check_and_get_dag from airflow.exceptions import AirflowException, DagCodeNotFound from airflow.models.dagcode import DagCode +@deprecated(reason="Use DagCode().get_code_by_fileloc() instead", version="2.2.3") def get_code(dag_id: str) -> str: """Return python code of a given dag_id. diff --git a/airflow/api/common/experimental/get_dag_run_state.py b/airflow/api/common/experimental/get_dag_run_state.py index ca71a9afb3853..b2dedd5113ae9 100644 --- a/airflow/api/common/experimental/get_dag_run_state.py +++ b/airflow/api/common/experimental/get_dag_run_state.py @@ -19,9 +19,12 @@ from datetime import datetime from typing import Dict +from deprecated import deprecated + from airflow.api.common.experimental import check_and_get_dag, check_and_get_dagrun +@deprecated(reason="Use DagRun().get_state() instead", version="2.2.3") def get_dag_run_state(dag_id: str, execution_date: datetime) -> Dict[str, str]: """Return the Dag Run state identified by the given dag_id and execution_date. diff --git a/airflow/api/common/experimental/get_task.py b/airflow/api/common/experimental/get_task.py index 302ad6430efe9..fae5fd7ef1851 100644 --- a/airflow/api/common/experimental/get_task.py +++ b/airflow/api/common/experimental/get_task.py @@ -16,10 +16,13 @@ # specific language governing permissions and limitations # under the License. 
"""Task APIs..""" +from deprecated import deprecated + from airflow.api.common.experimental import check_and_get_dag from airflow.models import TaskInstance +@deprecated(reason="Use DAG().get_task", version="2.2.3") def get_task(dag_id: str, task_id: str) -> TaskInstance: """Return the task object identified by the given dag_id and task_id.""" dag = check_and_get_dag(dag_id, task_id) diff --git a/airflow/api/common/experimental/get_task_instance.py b/airflow/api/common/experimental/get_task_instance.py index f3ca1cf2f6380..137f8a3aef9e7 100644 --- a/airflow/api/common/experimental/get_task_instance.py +++ b/airflow/api/common/experimental/get_task_instance.py @@ -18,11 +18,14 @@ """Task Instance APIs.""" from datetime import datetime +from deprecated import deprecated + from airflow.api.common.experimental import check_and_get_dag, check_and_get_dagrun from airflow.exceptions import TaskInstanceNotFound from airflow.models import TaskInstance +@deprecated(version="2.2.3", reason="Use DagRun.get_task_instance instead") def get_task_instance(dag_id: str, task_id: str, execution_date: datetime) -> TaskInstance: """Return the task instance identified by the given dag_id, task_id and execution_date.""" dag = check_and_get_dag(dag_id, task_id) diff --git a/airflow/api/common/experimental/pool.py b/airflow/api/common/experimental/pool.py index 30950ea0026ee..0b9c3a5d4903b 100644 --- a/airflow/api/common/experimental/pool.py +++ b/airflow/api/common/experimental/pool.py @@ -16,11 +16,14 @@ # specific language governing permissions and limitations # under the License. """Pool APIs.""" +from deprecated import deprecated + from airflow.exceptions import AirflowBadRequest, PoolNotFound from airflow.models import Pool from airflow.utils.session import provide_session +@deprecated(reason="Use Pool.get_pool() instead", version="2.2.3") @provide_session def get_pool(name, session=None): """Get pool by a given name.""" @@ -34,12 +37,14 @@ def get_pool(name, session=None): return pool +@deprecated(reason="Use Pool.get_pools() instead", version="2.2.3") @provide_session def get_pools(session=None): """Get all pools.""" return session.query(Pool).all() +@deprecated(reason="Use Pool.create_pool() instead", version="2.2.3") @provide_session def create_pool(name, slots, description, session=None): """Create a pool with a given parameters.""" @@ -70,6 +75,7 @@ def create_pool(name, slots, description, session=None): return pool +@deprecated(reason="Use Pool.delete_pool() instead", version="2.2.3") @provide_session def delete_pool(name, session=None): """Delete pool by a given name.""" diff --git a/airflow/api/common/experimental/trigger_dag.py b/airflow/api/common/experimental/trigger_dag.py index 38a873ce2e013..d52631281f534 100644 --- a/airflow/api/common/experimental/trigger_dag.py +++ b/airflow/api/common/experimental/trigger_dag.py @@ -15,114 +15,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""Triggering DAG runs APIs.""" -import json -from datetime import datetime -from typing import List, Optional, Union -from airflow.exceptions import DagNotFound, DagRunAlreadyExists -from airflow.models import DagBag, DagModel, DagRun -from airflow.utils import timezone -from airflow.utils.state import State -from airflow.utils.types import DagRunType +import warnings +from airflow.api.common.trigger_dag import * # noqa -def _trigger_dag( - dag_id: str, - dag_bag: DagBag, - run_id: Optional[str] = None, - conf: Optional[Union[dict, str]] = None, - execution_date: Optional[datetime] = None, - replace_microseconds: bool = True, -) -> List[DagRun]: - """Triggers DAG run. - - :param dag_id: DAG ID - :param dag_bag: DAG Bag model - :param run_id: ID of the dag_run - :param conf: configuration - :param execution_date: date of execution - :param replace_microseconds: whether microseconds should be zeroed - :return: list of triggered dags - """ - dag = dag_bag.get_dag(dag_id) # prefetch dag if it is stored serialized - - if dag_id not in dag_bag.dags: - raise DagNotFound(f"Dag id {dag_id} not found") - - execution_date = execution_date if execution_date else timezone.utcnow() - - if not timezone.is_localized(execution_date): - raise ValueError("The execution_date should be localized") - - if replace_microseconds: - execution_date = execution_date.replace(microsecond=0) - - if dag.default_args and 'start_date' in dag.default_args: - min_dag_start_date = dag.default_args["start_date"] - if min_dag_start_date and execution_date < min_dag_start_date: - raise ValueError( - "The execution_date [{}] should be >= start_date [{}] from DAG's default_args".format( - execution_date.isoformat(), min_dag_start_date.isoformat() - ) - ) - - run_id = run_id or DagRun.generate_run_id(DagRunType.MANUAL, execution_date) - dag_run = DagRun.find_duplicate(dag_id=dag_id, execution_date=execution_date, run_id=run_id) - - if dag_run: - raise DagRunAlreadyExists( - f"A Dag Run already exists for dag id {dag_id} at {execution_date} with run id {run_id}" - ) - - run_conf = None - if conf: - run_conf = conf if isinstance(conf, dict) else json.loads(conf) - - dag_runs = [] - dags_to_run = [dag] + dag.subdags - for _dag in dags_to_run: - dag_run = _dag.create_dagrun( - run_id=run_id, - execution_date=execution_date, - state=State.QUEUED, - conf=run_conf, - external_trigger=True, - dag_hash=dag_bag.dags_hash.get(dag_id), - ) - dag_runs.append(dag_run) - - return dag_runs - - -def trigger_dag( - dag_id: str, - run_id: Optional[str] = None, - conf: Optional[Union[dict, str]] = None, - execution_date: Optional[datetime] = None, - replace_microseconds: bool = True, -) -> Optional[DagRun]: - """Triggers execution of DAG specified by dag_id - - :param dag_id: DAG ID - :param run_id: ID of the dag_run - :param conf: configuration - :param execution_date: date of execution - :param replace_microseconds: whether microseconds should be zeroed - :return: first dag run triggered - even if more than one Dag Runs were triggered or None - """ - dag_model = DagModel.get_current(dag_id) - if dag_model is None: - raise DagNotFound(f"Dag id {dag_id} not found in DagModel") - - dagbag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True) - triggers = _trigger_dag( - dag_id=dag_id, - dag_bag=dagbag, - run_id=run_id, - conf=conf, - execution_date=execution_date, - replace_microseconds=replace_microseconds, - ) - - return triggers[0] if triggers else None +warnings.warn( + "This module is deprecated. 
Please use `airflow.api.common.trigger_dag` instead.", + DeprecationWarning, + stacklevel=2, +) diff --git a/airflow/api/common/trigger_dag.py b/airflow/api/common/trigger_dag.py new file mode 100644 index 0000000000000..70bbb78312209 --- /dev/null +++ b/airflow/api/common/trigger_dag.py @@ -0,0 +1,127 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Triggering DAG runs APIs.""" +import json +from datetime import datetime +from typing import List, Optional, Union + +from airflow.exceptions import DagNotFound, DagRunAlreadyExists +from airflow.models import DagBag, DagModel, DagRun +from airflow.utils import timezone +from airflow.utils.state import State +from airflow.utils.types import DagRunType + + +def _trigger_dag( + dag_id: str, + dag_bag: DagBag, + run_id: Optional[str] = None, + conf: Optional[Union[dict, str]] = None, + execution_date: Optional[datetime] = None, + replace_microseconds: bool = True, +) -> List[DagRun]: + """Triggers DAG run. + + :param dag_id: DAG ID + :param dag_bag: DAG Bag model + :param run_id: ID of the dag_run + :param conf: configuration + :param execution_date: date of execution + :param replace_microseconds: whether microseconds should be zeroed + :return: list of triggered dags + """ + dag = dag_bag.get_dag(dag_id) # prefetch dag if it is stored serialized + + if dag_id not in dag_bag.dags: + raise DagNotFound(f"Dag id {dag_id} not found") + + execution_date = execution_date if execution_date else timezone.utcnow() + + if not timezone.is_localized(execution_date): + raise ValueError("The execution_date should be localized") + + if replace_microseconds: + execution_date = execution_date.replace(microsecond=0) + + if dag.default_args and 'start_date' in dag.default_args: + min_dag_start_date = dag.default_args["start_date"] + if min_dag_start_date and execution_date < min_dag_start_date: + raise ValueError( + f"The execution_date [{execution_date.isoformat()}] should be >= start_date " + f"[{min_dag_start_date.isoformat()}] from DAG's default_args" + ) + + run_id = run_id or DagRun.generate_run_id(DagRunType.MANUAL, execution_date) + dag_run = DagRun.find_duplicate(dag_id=dag_id, execution_date=execution_date, run_id=run_id) + + if dag_run: + raise DagRunAlreadyExists( + f"A Dag Run already exists for dag id {dag_id} at {execution_date} with run id {run_id}" + ) + + run_conf = None + if conf: + run_conf = conf if isinstance(conf, dict) else json.loads(conf) + + dag_runs = [] + dags_to_run = [dag] + dag.subdags + for _dag in dags_to_run: + dag_run = _dag.create_dagrun( + run_id=run_id, + execution_date=execution_date, + state=State.QUEUED, + conf=run_conf, + external_trigger=True, + dag_hash=dag_bag.dags_hash.get(dag_id), + ) + dag_runs.append(dag_run) + + return dag_runs + + +def trigger_dag( + dag_id: str, + 
run_id: Optional[str] = None, + conf: Optional[Union[dict, str]] = None, + execution_date: Optional[datetime] = None, + replace_microseconds: bool = True, +) -> Optional[DagRun]: + """Triggers execution of DAG specified by dag_id + + :param dag_id: DAG ID + :param run_id: ID of the dag_run + :param conf: configuration + :param execution_date: date of execution + :param replace_microseconds: whether microseconds should be zeroed + :return: first dag run triggered - even if more than one Dag Runs were triggered or None + """ + dag_model = DagModel.get_current(dag_id) + if dag_model is None: + raise DagNotFound(f"Dag id {dag_id} not found in DagModel") + + dagbag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True) + triggers = _trigger_dag( + dag_id=dag_id, + dag_bag=dagbag, + run_id=run_id, + conf=conf, + execution_date=execution_date, + replace_microseconds=replace_microseconds, + ) + + return triggers[0] if triggers else None diff --git a/airflow/api_connexion/endpoints/dag_endpoint.py b/airflow/api_connexion/endpoints/dag_endpoint.py index c164fccc37dbf..286b191601caf 100644 --- a/airflow/api_connexion/endpoints/dag_endpoint.py +++ b/airflow/api_connexion/endpoints/dag_endpoint.py @@ -110,13 +110,10 @@ def patch_dag(session, dag_id, update_mask=None): @provide_session def delete_dag(dag_id: str, session: Session): """Delete the specific DAG.""" - # TODO: This function is shared with the /delete endpoint used by the web - # UI, so we're reusing it to simplify maintenance. Refactor the function to - # another place when the experimental/legacy API is removed. - from airflow.api.common.experimental import delete_dag + from airflow.api.common import delete_dag as delete_dag_module try: - delete_dag.delete_dag(dag_id, session=session) + delete_dag_module.delete_dag(dag_id, session=session) except DagNotFound: raise NotFound(f"Dag with id: '{dag_id}' not found") except AirflowException: diff --git a/airflow/models/pool.py b/airflow/models/pool.py index 6f217c4b025a2..8ae88aabcd45f 100644 --- a/airflow/models/pool.py +++ b/airflow/models/pool.py @@ -21,11 +21,11 @@ from sqlalchemy import Column, Integer, String, Text, func from sqlalchemy.orm.session import Session -from airflow.exceptions import AirflowException +from airflow.exceptions import AirflowException, PoolNotFound from airflow.models.base import Base from airflow.ti_deps.dependencies_states import EXECUTION_STATES from airflow.typing_compat import TypedDict -from airflow.utils.session import provide_session +from airflow.utils.session import NEW_SESSION, provide_session from airflow.utils.sqlalchemy import nowait, with_row_locks from airflow.utils.state import State @@ -57,7 +57,13 @@ def __repr__(self): @staticmethod @provide_session - def get_pool(pool_name, session: Session = None): + def get_pools(session: Session = NEW_SESSION): + """Get all pools.""" + return session.query(Pool).all() + + @staticmethod + @provide_session + def get_pool(pool_name: str, session: Session = NEW_SESSION): """ Get the Pool with specific pool name from the Pools. @@ -69,7 +75,7 @@ def get_pool(pool_name, session: Session = None): @staticmethod @provide_session - def get_default_pool(session: Session = None): + def get_default_pool(session: Session = NEW_SESSION): """ Get the Pool of the default_pool from the Pools. 
@@ -78,12 +84,46 @@ def get_default_pool(session: Session = None): """ return Pool.get_pool(Pool.DEFAULT_POOL_NAME, session=session) + @staticmethod + @provide_session + def create_or_update_pool(name: str, slots: int, description: str, session: Session = NEW_SESSION): + """Create a pool with given parameters or update it if it already exists.""" + if not name: + return + pool = session.query(Pool).filter_by(pool=name).first() + if pool is None: + pool = Pool(pool=name, slots=slots, description=description) + session.add(pool) + else: + pool.slots = slots + pool.description = description + + session.commit() + + return pool + + @staticmethod + @provide_session + def delete_pool(name: str, session: Session = NEW_SESSION): + """Delete pool by a given name.""" + if name == Pool.DEFAULT_POOL_NAME: + raise AirflowException("default_pool cannot be deleted") + + pool = session.query(Pool).filter_by(pool=name).first() + if pool is None: + raise PoolNotFound(f"Pool '{name}' doesn't exist") + + session.delete(pool) + session.commit() + + return pool + @staticmethod @provide_session def slots_stats( *, lock_rows: bool = False, - session: Session = None, + session: Session = NEW_SESSION, ) -> Dict[str, PoolStats]: """ Get Pool stats (Number of Running, Queued, Open & Total tasks) @@ -210,7 +250,7 @@ def queued_slots(self, session: Session): ) @provide_session - def open_slots(self, session: Session) -> float: + def open_slots(self, session: Session = NEW_SESSION) -> float: """ Get the number of slots open at the moment. diff --git a/airflow/operators/trigger_dagrun.py b/airflow/operators/trigger_dagrun.py index 1e6cb7f6ab38f..421c7963d0d68 100644 --- a/airflow/operators/trigger_dagrun.py +++ b/airflow/operators/trigger_dagrun.py @@ -21,7 +21,7 @@ import time from typing import Dict, List, Optional, Union -from airflow.api.common.experimental.trigger_dag import trigger_dag +from airflow.api.common.trigger_dag import trigger_dag from airflow.exceptions import AirflowException, DagNotFound, DagRunAlreadyExists from airflow.models import BaseOperator, BaseOperatorLink, DagBag, DagModel, DagRun from airflow.utils import timezone diff --git a/airflow/utils/db.py b/airflow/utils/db.py index f35d1659f8cb9..023f482790d00 100644 --- a/airflow/utils/db.py +++ b/airflow/utils/db.py @@ -991,3 +991,18 @@ def check(session=None): """ session.execute('select 1 as is_alive;') log.info("Connection successful.") + + +def get_sqla_model_classes(): + """ + Get all SQLAlchemy class mappers. + + SQLAlchemy < 1.4 does not support registry.mappers so we use + try/except to handle it. + """ + from airflow.models.base import Base + + try: + return [mapper.class_ for mapper in Base.registry.mappers] + except AttributeError: + return Base._decl_class_registry.values() diff --git a/airflow/www/views.py b/airflow/www/views.py index 2182a1706aeec..f2642a73f1f0e 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -1607,7 +1607,7 @@ def run(self): @action_logging def delete(self): """Deletes DAG.""" - from airflow.api.common.experimental import delete_dag + from airflow.api.common import delete_dag from airflow.exceptions import DagNotFound dag_id = request.values.get('dag_id') diff --git a/setup.cfg b/setup.cfg index b83ef9be02826..c3cce1c0ac0c5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -95,6 +95,7 @@ install_requires = croniter>=0.3.17 cryptography>=0.9.3 dataclasses;python_version<"3.7" + deprecated>=1.2.13 dill>=0.2.2, <0.4 # Sphinx RTD theme 0.5.2. 
introduced limitation to docutils to account for some docutils markup # change: diff --git a/tests/api/client/test_local_client.py b/tests/api/client/test_local_client.py index a2af8ca245e6b..9f574e4fc657a 100644 --- a/tests/api/client/test_local_client.py +++ b/tests/api/client/test_local_client.py @@ -17,6 +17,8 @@ # under the License. import json +import random +import string import unittest from unittest.mock import ANY, patch @@ -25,7 +27,7 @@ from airflow.api.client.local_client import Client from airflow.example_dags import example_bash_operator -from airflow.exceptions import AirflowException +from airflow.exceptions import AirflowBadRequest, AirflowException, PoolNotFound from airflow.models import DAG, DagBag, DagModel, DagRun, Pool from airflow.utils import timezone from airflow.utils.session import create_session @@ -133,6 +135,10 @@ def test_get_pool(self): pool = self.client.get_pool(name='foo') assert pool == ('foo', 1, '') + def test_get_pool_non_existing_raises(self): + with pytest.raises(PoolNotFound): + self.client.get_pool(name='foo') + def test_get_pools(self): self.client.create_pool(name='foo1', slots=1, description='') self.client.create_pool(name='foo2', slots=2, description='') @@ -145,6 +151,26 @@ def test_create_pool(self): with create_session() as session: assert session.query(Pool).count() == 2 + def test_create_pool_bad_slots(self): + with pytest.raises(AirflowBadRequest, match="^Bad value for `slots`: foo$"): + self.client.create_pool( + name='foo', + slots='foo', + description='', + ) + + def test_create_pool_name_too_long(self): + long_name = ''.join(random.choices(string.ascii_lowercase, k=300)) + pool_name_length = Pool.pool.property.columns[0].type.length + with pytest.raises( + AirflowBadRequest, match=f"^pool name cannot be more than {pool_name_length} characters" + ): + self.client.create_pool( + name=long_name, + slots=5, + description='', + ) + def test_delete_pool(self): self.client.create_pool(name='foo', slots=1, description='') with create_session() as session: @@ -152,3 +178,6 @@ def test_delete_pool(self): self.client.delete_pool(name='foo') with create_session() as session: assert session.query(Pool).count() == 1 + for name in ('', ' '): + with pytest.raises(PoolNotFound, match=f"^Pool {name!r} doesn't exist$"): + Pool.delete_pool(name=name) diff --git a/tests/api/common/experimental/test_delete_dag.py b/tests/api/common/test_delete_dag.py similarity index 99% rename from tests/api/common/experimental/test_delete_dag.py rename to tests/api/common/test_delete_dag.py index 5984cd2b14f0f..0eb058a18337a 100644 --- a/tests/api/common/experimental/test_delete_dag.py +++ b/tests/api/common/test_delete_dag.py @@ -20,7 +20,7 @@ import pytest from airflow import models -from airflow.api.common.experimental.delete_dag import delete_dag +from airflow.api.common.delete_dag import delete_dag from airflow.exceptions import AirflowException, DagNotFound from airflow.operators.dummy import DummyOperator from airflow.utils.dates import days_ago diff --git a/tests/api/common/experimental/test_trigger_dag.py b/tests/api/common/test_trigger_dag.py similarity index 93% rename from tests/api/common/experimental/test_trigger_dag.py rename to tests/api/common/test_trigger_dag.py index 2f164468d085f..f79d413ed5eae 100644 --- a/tests/api/common/experimental/test_trigger_dag.py +++ b/tests/api/common/test_trigger_dag.py @@ -22,7 +22,7 @@ import pytest from parameterized import parameterized -from airflow.api.common.experimental.trigger_dag import _trigger_dag +from 
airflow.api.common.trigger_dag import _trigger_dag from airflow.exceptions import AirflowException from airflow.models import DAG, DagRun from airflow.utils import timezone @@ -42,7 +42,7 @@ def test_trigger_dag_dag_not_found(self, dag_bag_mock): with pytest.raises(AirflowException): _trigger_dag('dag_not_found', dag_bag_mock) - @mock.patch('airflow.api.common.experimental.trigger_dag.DagRun', spec=DagRun) + @mock.patch('airflow.api.common.trigger_dag.DagRun', spec=DagRun) @mock.patch('airflow.models.DagBag') def test_trigger_dag_dag_run_exist(self, dag_bag_mock, dag_run_mock): dag_id = "dag_run_exist" @@ -54,7 +54,7 @@ def test_trigger_dag_dag_run_exist(self, dag_bag_mock, dag_run_mock): _trigger_dag(dag_id, dag_bag_mock) @mock.patch('airflow.models.DAG') - @mock.patch('airflow.api.common.experimental.trigger_dag.DagRun', spec=DagRun) + @mock.patch('airflow.api.common.trigger_dag.DagRun', spec=DagRun) @mock.patch('airflow.models.DagBag') def test_trigger_dag_include_subdags(self, dag_bag_mock, dag_run_mock, dag_mock): dag_id = "trigger_dag" @@ -70,7 +70,7 @@ def test_trigger_dag_include_subdags(self, dag_bag_mock, dag_run_mock, dag_mock) assert 3 == len(triggers) @mock.patch('airflow.models.DAG') - @mock.patch('airflow.api.common.experimental.trigger_dag.DagRun', spec=DagRun) + @mock.patch('airflow.api.common.trigger_dag.DagRun', spec=DagRun) @mock.patch('airflow.models.DagBag') def test_trigger_dag_include_nested_subdags(self, dag_bag_mock, dag_run_mock, dag_mock): dag_id = "trigger_dag" diff --git a/tests/models/test_pool.py b/tests/models/test_pool.py index 00fe14039d7e3..95e585efa5974 100644 --- a/tests/models/test_pool.py +++ b/tests/models/test_pool.py @@ -16,11 +16,15 @@ # specific language governing permissions and limitations # under the License. 
+import pytest + from airflow import settings +from airflow.exceptions import AirflowException, PoolNotFound from airflow.models.pool import Pool from airflow.models.taskinstance import TaskInstance as TI from airflow.operators.dummy import DummyOperator from airflow.utils import timezone +from airflow.utils.session import create_session from airflow.utils.state import State from tests.test_utils.db import clear_db_dags, clear_db_pools, clear_db_runs, set_default_pool_slots @@ -28,6 +32,10 @@ class TestPool: + + USER_POOL_COUNT = 2 + TOTAL_POOL_COUNT = USER_POOL_COUNT + 1 # including default_pool + @staticmethod def clean_db(): clear_db_dags() @@ -36,6 +44,20 @@ def clean_db(): def setup_method(self): self.clean_db() + self.pools = [] + + def add_pools(self): + self.pools = [Pool.get_default_pool()] + for i in range(self.USER_POOL_COUNT): + name = f'experimental_{i + 1}' + pool = Pool( + pool=name, + slots=i, + description=name, + ) + self.pools.append(pool) + with create_session() as session: + session.add_all(self.pools) def teardown_method(self): self.clean_db() @@ -149,3 +171,52 @@ def test_default_pool_open_slots(self, dag_maker): "running": 1, } } == Pool.slots_stats() + + def test_get_pool(self): + self.add_pools() + pool = Pool.get_pool(pool_name=self.pools[0].pool) + assert pool.pool == self.pools[0].pool + + def test_get_pool_non_existing(self): + self.add_pools() + assert not Pool.get_pool(pool_name='test') + + def test_get_pool_bad_name(self): + for name in ('', ' '): + assert not Pool.get_pool(pool_name=name) + + def test_get_pools(self): + self.add_pools() + pools = sorted(Pool.get_pools(), key=lambda p: p.pool) + assert pools[0].pool == self.pools[0].pool + assert pools[1].pool == self.pools[1].pool + + def test_create_pool(self, session): + self.add_pools() + pool = Pool.create_or_update_pool(name='foo', slots=5, description='') + assert pool.pool == 'foo' + assert pool.slots == 5 + assert pool.description == '' + assert session.query(Pool).count() == self.TOTAL_POOL_COUNT + 1 + + def test_create_pool_existing(self, session): + self.add_pools() + pool = Pool.create_or_update_pool(name=self.pools[0].pool, slots=5, description='') + assert pool.pool == self.pools[0].pool + assert pool.slots == 5 + assert pool.description == '' + assert session.query(Pool).count() == self.TOTAL_POOL_COUNT + + def test_delete_pool(self, session): + self.add_pools() + pool = Pool.delete_pool(name=self.pools[-1].pool) + assert pool.pool == self.pools[-1].pool + assert session.query(Pool).count() == self.TOTAL_POOL_COUNT - 1 + + def test_delete_pool_non_existing(self): + with pytest.raises(PoolNotFound, match="^Pool 'test' doesn't exist$"): + Pool.delete_pool(name='test') + + def test_delete_default_pool_not_allowed(self): + with pytest.raises(AirflowException, match="^default_pool cannot be deleted$"): + Pool.delete_pool(Pool.DEFAULT_POOL_NAME) From 4dc8b909bedd04094be3079c3f7384ea044ec011 Mon Sep 17 00:00:00 2001 From: Sam Wheating Date: Mon, 10 Jan 2022 11:55:51 -0800 Subject: [PATCH 216/250] Avoid unintentional data loss when deleting DAGs (#20758) (cherry picked from commit 5980d2b05eee484256c634d5efae9410265c65e9) --- airflow/api/common/delete_dag.py | 18 +++++++++++++++--- tests/api/common/test_delete_dag.py | 14 ++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/airflow/api/common/delete_dag.py b/airflow/api/common/delete_dag.py index c448127f2c484..5e0afa81cb5c9 100644 --- a/airflow/api/common/delete_dag.py +++ b/airflow/api/common/delete_dag.py @@ -18,7 +18,7 
@@ """Delete DAGs APIs.""" import logging -from sqlalchemy import or_ +from sqlalchemy import and_, or_ from airflow import models from airflow.exceptions import AirflowException, DagNotFound @@ -54,6 +54,15 @@ def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> i if dag is None: raise DagNotFound(f"Dag id {dag_id} not found") + # deleting a DAG should also delete all of its subdags + dags_to_delete_query = session.query(DagModel.dag_id).filter( + or_( + DagModel.dag_id == dag_id, + and_(DagModel.dag_id.like(f"{dag_id}.%"), DagModel.is_subdag), + ) + ) + dags_to_delete = [dag_id for dag_id, in dags_to_delete_query] + # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval. # There may be a lag, so explicitly removes serialized DAG here. if SerializedDagModel.has_dag(dag_id=dag_id, session=session): @@ -65,8 +74,11 @@ def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> i if hasattr(model, "dag_id"): if keep_records_in_log and model.__name__ == 'Log': continue - cond = or_(model.dag_id == dag_id, model.dag_id.like(dag_id + ".%")) - count += session.query(model).filter(cond).delete(synchronize_session='fetch') + count += ( + session.query(model) + .filter(model.dag_id.in_(dags_to_delete)) + .delete(synchronize_session='fetch') + ) if dag.is_subdag: parent_dag_id, task_id = dag_id.rsplit(".", 1) for model in TaskFail, models.TaskInstance: diff --git a/tests/api/common/test_delete_dag.py b/tests/api/common/test_delete_dag.py index 0eb058a18337a..d9dc0b0a01c7f 100644 --- a/tests/api/common/test_delete_dag.py +++ b/tests/api/common/test_delete_dag.py @@ -162,3 +162,17 @@ def test_delete_subdag_successful_delete(self): self.check_dag_models_exists() delete_dag(dag_id=self.key, keep_records_in_log=False) self.check_dag_models_removed(expect_logs=0) + + def test_delete_dag_preserves_other_dags(self): + + self.setup_dag_models() + + with create_session() as session: + session.add(DM(dag_id=self.key + ".other_dag", fileloc=self.dag_file_path)) + session.add(DM(dag_id=self.key + ".subdag", fileloc=self.dag_file_path, is_subdag=True)) + + delete_dag(self.key) + + with create_session() as session: + assert session.query(DM).filter(DM.dag_id == self.key + ".other_dag").count() == 1 + assert session.query(DM).filter(DM.dag_id.like(self.key + "%")).count() == 1 From 6d8342e78c6b6546845a7d2d5ba0a761af73d5f0 Mon Sep 17 00:00:00 2001 From: hubert-pietron <94397721+hubert-pietron@users.noreply.github.com> Date: Thu, 27 Jan 2022 06:20:17 +0100 Subject: [PATCH 217/250] Removed duplicated dag_run join in Dag.get_task_instances() (#20591) Co-authored-by: hubert-pietron (cherry picked from commit 960f573615b5357677c10bd9f7ec11811a0355c6) --- airflow/models/dag.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airflow/models/dag.py b/airflow/models/dag.py index 2a08d269b30d7..477e597b49129 100644 --- a/airflow/models/dag.py +++ b/airflow/models/dag.py @@ -1343,7 +1343,6 @@ def get_task_instances( as_pk_tuple=False, session=session, ) - .join(TaskInstance.dag_run) .order_by(DagRun.execution_date) .all() ) From 55a4abbe1631f34325327d1494f1faaaa0c7e359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=90=E1=BA=B7ng=20Minh=20D=C5=A9ng?= Date: Wed, 5 Jan 2022 14:42:57 +0700 Subject: [PATCH 218/250] bugfix: deferred tasks does not cancel when DAG is marked fail (#20649) (cherry picked from commit 64c0bd50155dfdb84671ac35d645b812fafa78a1) --- airflow/api/common/experimental/mark_tasks.py | 121 ++++++++++++------ 1 file changed, 85 
insertions(+), 36 deletions(-) diff --git a/airflow/api/common/experimental/mark_tasks.py b/airflow/api/common/experimental/mark_tasks.py index 28e733dd96a89..4131cb50ace9a 100644 --- a/airflow/api/common/experimental/mark_tasks.py +++ b/airflow/api/common/experimental/mark_tasks.py @@ -17,23 +17,27 @@ # under the License. """Marks tasks APIs.""" -import datetime -from typing import Iterable +from datetime import datetime +from typing import Generator, Iterable, List, Optional -from sqlalchemy import or_ from sqlalchemy.orm import contains_eager +from sqlalchemy.orm.session import Session as SASession +from sqlalchemy.sql.expression import or_ +from airflow import DAG from airflow.models.baseoperator import BaseOperator from airflow.models.dagrun import DagRun from airflow.models.taskinstance import TaskInstance from airflow.operators.subdag import SubDagOperator from airflow.utils import timezone -from airflow.utils.session import provide_session -from airflow.utils.state import State +from airflow.utils.session import NEW_SESSION, provide_session +from airflow.utils.state import State, TaskInstanceState from airflow.utils.types import DagRunType -def _create_dagruns(dag, execution_dates, state, run_type): +def _create_dagruns( + dag: DAG, execution_dates: List[datetime], state: TaskInstanceState, run_type: DagRunType +) -> List[DagRun]: """ Infers from the dates which dag runs need to be created and does so. @@ -63,15 +67,15 @@ def _create_dagruns(dag, execution_dates, state, run_type): @provide_session def set_state( tasks: Iterable[BaseOperator], - execution_date: datetime.datetime, + execution_date: datetime, upstream: bool = False, downstream: bool = False, future: bool = False, past: bool = False, state: str = State.SUCCESS, commit: bool = False, - session=None, -): + session: SASession = NEW_SESSION, +) -> List[TaskInstance]: """ Set the state of a task instance and if needed its relatives. Can set state for future tasks (calculated from execution_date) and retroactively @@ -134,7 +138,9 @@ def set_state( return tis_altered -def all_subdag_tasks_query(sub_dag_run_ids, session, state, confirmed_dates): +def all_subdag_tasks_query( + sub_dag_run_ids: List[str], session: SASession, state: TaskInstanceState, confirmed_dates: List[datetime] +): """Get *all* tasks of the sub dags""" qry_sub_dag = ( session.query(TaskInstance) @@ -144,7 +150,13 @@ def all_subdag_tasks_query(sub_dag_run_ids, session, state, confirmed_dates): return qry_sub_dag -def get_all_dag_task_query(dag, session, state, task_ids, confirmed_dates): +def get_all_dag_task_query( + dag: DAG, + session: SASession, + state: TaskInstanceState, + task_ids: List[str], + confirmed_dates: List[datetime], +): """Get all tasks of the main dag that will be affected by a state change""" qry_dag = ( session.query(TaskInstance) @@ -160,7 +172,14 @@ def get_all_dag_task_query(dag, session, state, task_ids, confirmed_dates): return qry_dag -def get_subdag_runs(dag, session, state, task_ids, commit, confirmed_dates): +def get_subdag_runs( + dag: DAG, + session: SASession, + state: TaskInstanceState, + task_ids: List[str], + commit: bool, + confirmed_dates: List[datetime], +) -> List[str]: """Go through subdag operators and create dag runs. We will only work within the scope of the subdag. We won't propagate to the parent dag, but we will propagate from parent to subdag. 
@@ -181,7 +200,7 @@ def get_subdag_runs(dag, session, state, task_ids, commit, confirmed_dates): dag_runs = _create_dagruns( current_task.subdag, execution_dates=confirmed_dates, - state=State.RUNNING, + state=TaskInstanceState.RUNNING, run_type=DagRunType.BACKFILL_JOB, ) @@ -192,7 +211,13 @@ def get_subdag_runs(dag, session, state, task_ids, commit, confirmed_dates): return sub_dag_ids -def verify_dagruns(dag_runs, commit, state, session, current_task): +def verify_dagruns( + dag_runs: List[DagRun], + commit: bool, + state: TaskInstanceState, + session: SASession, + current_task: BaseOperator, +): """Verifies integrity of dag_runs. :param dag_runs: dag runs to verify @@ -210,7 +235,7 @@ def verify_dagruns(dag_runs, commit, state, session, current_task): session.merge(dag_run) -def verify_dag_run_integrity(dag, dates): +def verify_dag_run_integrity(dag: DAG, dates: List[datetime]) -> List[datetime]: """ Verify the integrity of the dag runs in case a task was added or removed set the confirmed execution dates as they might be different @@ -225,7 +250,9 @@ def verify_dag_run_integrity(dag, dates): return confirmed_dates -def find_task_relatives(tasks, downstream, upstream): +def find_task_relatives( + tasks: Iterable[BaseOperator], downstream: bool, upstream: bool +) -> Generator[str, None, None]: """Yield task ids and optionally ancestor and descendant ids.""" for task in tasks: yield task.task_id @@ -237,7 +264,7 @@ def find_task_relatives(tasks, downstream, upstream): yield relative.task_id -def get_execution_dates(dag, execution_date, future, past): +def get_execution_dates(dag: DAG, execution_date: datetime, future: bool, past: bool) -> List[datetime]: """Returns dates of DAG execution""" latest_execution_date = dag.get_latest_execution_date() if latest_execution_date is None: @@ -266,7 +293,9 @@ def get_execution_dates(dag, execution_date, future, past): @provide_session -def _set_dag_run_state(dag_id, execution_date, state, session=None): +def _set_dag_run_state( + dag_id: str, execution_date: datetime, state: TaskInstanceState, session: SASession = NEW_SESSION +): """ Helper method that set dag run state in the DB. @@ -279,7 +308,7 @@ def _set_dag_run_state(dag_id, execution_date, state, session=None): session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.execution_date == execution_date).one() ) dag_run.state = state - if state == State.RUNNING: + if state == TaskInstanceState.RUNNING: dag_run.start_date = timezone.utcnow() dag_run.end_date = None else: @@ -288,7 +317,12 @@ def _set_dag_run_state(dag_id, execution_date, state, session=None): @provide_session -def set_dag_run_state_to_success(dag, execution_date, commit=False, session=None): +def set_dag_run_state_to_success( + dag: Optional[DAG], + execution_date: Optional[datetime], + commit: bool = False, + session: SASession = NEW_SESSION, +) -> List[TaskInstance]: """ Set the dag run for a specific execution date and its task instances to success. @@ -306,18 +340,27 @@ def set_dag_run_state_to_success(dag, execution_date, commit=False, session=None # Mark the dag run to success. if commit: - _set_dag_run_state(dag.dag_id, execution_date, State.SUCCESS, session) + _set_dag_run_state(dag.dag_id, execution_date, TaskInstanceState.SUCCESS, session) # Mark all task instances of the dag run to success. 
for task in dag.tasks: task.dag = dag return set_state( - tasks=dag.tasks, execution_date=execution_date, state=State.SUCCESS, commit=commit, session=session + tasks=dag.tasks, + execution_date=execution_date, + state=TaskInstanceState.SUCCESS, + commit=commit, + session=session, ) @provide_session -def set_dag_run_state_to_failed(dag, execution_date, commit=False, session=None): +def set_dag_run_state_to_failed( + dag: Optional[DAG], + execution_date: Optional[datetime], + commit: bool = False, + session: SASession = NEW_SESSION, +) -> List[TaskInstance]: """ Set the dag run for a specific execution date and its running task instances to failed. @@ -335,18 +378,15 @@ def set_dag_run_state_to_failed(dag, execution_date, commit=False, session=None) # Mark the dag run to failed. if commit: - _set_dag_run_state(dag.dag_id, execution_date, State.FAILED, session) + _set_dag_run_state(dag.dag_id, execution_date, TaskInstanceState.FAILED, session) - # Mark only RUNNING task instances. + # Mark only running task instances. task_ids = [task.task_id for task in dag.tasks] - tis = ( - session.query(TaskInstance) - .filter( - TaskInstance.dag_id == dag.dag_id, - TaskInstance.execution_date == execution_date, - TaskInstance.task_id.in_(task_ids), - ) - .filter(TaskInstance.state == State.RUNNING) + tis = session.query(TaskInstance).filter( + TaskInstance.dag_id == dag.dag_id, + TaskInstance.execution_date == execution_date, + TaskInstance.task_id.in_(task_ids), + TaskInstance.state.in_(State.running), ) task_ids_of_running_tis = [task_instance.task_id for task_instance in tis] @@ -358,12 +398,21 @@ def set_dag_run_state_to_failed(dag, execution_date, commit=False, session=None) tasks.append(task) return set_state( - tasks=tasks, execution_date=execution_date, state=State.FAILED, commit=commit, session=session + tasks=tasks, + execution_date=execution_date, + state=TaskInstanceState.FAILED, + commit=commit, + session=session, ) @provide_session -def set_dag_run_state_to_running(dag, execution_date, commit=False, session=None): +def set_dag_run_state_to_running( + dag: Optional[DAG], + execution_date: Optional[datetime], + commit: bool = False, + session: SASession = NEW_SESSION, +) -> List[TaskInstance]: """ Set the dag run for a specific execution date to running. @@ -380,7 +429,7 @@ def set_dag_run_state_to_running(dag, execution_date, commit=False, session=None # Mark the dag run to running. if commit: - _set_dag_run_state(dag.dag_id, execution_date, State.RUNNING, session) + _set_dag_run_state(dag.dag_id, execution_date, TaskInstanceState.RUNNING, session) # To keep the return type consistent with the other similar functions. return res From 0ba033daead44624847cbf26a5e19962575c94d0 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 30 Jan 2022 16:59:36 +0100 Subject: [PATCH 219/250] Actually fix tuple and bool checks for black 22.1.0 (#21221) Previous two fixes in #21215 and #21216 did not really fix the problem introduced by Black 22.1.0 (they could not as they were wrong). This change was actually tested with the new black and should fix it finally. 
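For illustration only, and not part of the patch that follows: a minimal sketch of the coercion this fix applies. Values read back from pyproject.toml arrive as plain lists and ints, and the change below converts them explicitly before handing them to black's Mode; the config dict here is made up.

    # Hypothetical values, shaped like what a TOML parser returns for [tool.black].
    config = {"target_version": ["py36", "py37"], "skip_string_normalization": 1}

    # Explicit coercion to the types the patch below passes to black 22.1.0 (tuple / bool).
    target_version = tuple(config.get("target_version", ()))  # ('py36', 'py37')
    skip_string_normalization = bool(config.get("skip_string_normalization", False))  # True

    print(target_version, skip_string_normalization)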
(cherry picked from commit f9e20067e0ac593fd18ad068fcc56501c6a99f2b) --- .../prepare_provider_packages.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/dev/provider_packages/prepare_provider_packages.py b/dev/provider_packages/prepare_provider_packages.py index 7d0e1e5afc3ae..8d920d6c8572d 100755 --- a/dev/provider_packages/prepare_provider_packages.py +++ b/dev/provider_packages/prepare_provider_packages.py @@ -1334,9 +1334,14 @@ def get_all_changes_for_package( ) # Returns 66 in case of doc-only changes sys.exit(66) + if len(changes) > len(changes_since_last_doc_only_check): + # if doc-only was released after previous release - use it as starting point + # but if before - stay with the releases from last tag. + changes = changes_since_last_doc_only_check except subprocess.CalledProcessError: # ignore when the commit mentioned as last doc-only change is obsolete pass + console.print(f"[yellow]The provider {provider_package_id} has changes since last release[/]") console.print() console.print( @@ -1697,16 +1702,16 @@ def black_mode(): config = parse_pyproject_toml(os.path.join(SOURCE_DIR_PATH, "pyproject.toml")) target_versions = set( - target_version_option_callback(None, None, config.get('target_version', [])), + target_version_option_callback(None, None, tuple(config.get('target_version', ()))), ) return Mode( target_versions=target_versions, line_length=config.get('line_length', Mode.line_length), - is_pyi=config.get('is_pyi', Mode.is_pyi), - string_normalization=not config.get('skip_string_normalization', not Mode.string_normalization), - experimental_string_processing=config.get( - 'experimental_string_processing', Mode.experimental_string_processing + is_pyi=bool(config.get('is_pyi', Mode.is_pyi)), + string_normalization=not bool(config.get('skip_string_normalization', not Mode.string_normalization)), + experimental_string_processing=bool( + config.get('experimental_string_processing', Mode.experimental_string_processing) ), ) @@ -2180,6 +2185,14 @@ def summarise_total_vs_bad_and_warnings(total: int, bad: int, warns: List[warnin 'This module is deprecated. Please use `airflow.providers.amazon.aws.operators.sagemaker`.', 'This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.sagemaker`.', 'This module is deprecated. Please use `airflow.providers.amazon.aws.hooks.emr`.', + 'This module is deprecated. Please use `airflow.providers.opsgenie.hooks.opsgenie`.', + 'This module is deprecated. Please use `airflow.providers.opsgenie.operators.opsgenie`.', + 'This module is deprecated. Please use `airflow.hooks.redshift_sql` ' + 'or `airflow.hooks.redshift_cluster` as appropriate.', + 'This module is deprecated. Please use `airflow.providers.amazon.aws.operators.redshift_sql` or ' + '`airflow.providers.amazon.aws.operators.redshift_cluster` as appropriate.', + 'This module is deprecated. Please use `airflow.providers.amazon.aws.sensors.redshift_cluster`.', + "This module is deprecated. Please use airflow.providers.amazon.aws.transfers.sql_to_s3`.", } From 1b139a77d012971d147bffc74646923514db4b48 Mon Sep 17 00:00:00 2001 From: Ephraim Anierobi Date: Thu, 13 Jan 2022 22:23:10 +0100 Subject: [PATCH 220/250] Fix Scheduler crash when executing task instances of missing DAG (#20349) When executing task instances, we do not check if the dag is missing in the dagbag. 
This PR fixes it by ignoring task instances if we can't find the dag in serialized dag table Closes: #20099 (cherry picked from commit 98715760f72e5205c291293088b5e79636884491) --- airflow/jobs/scheduler_job.py | 11 +++++++++++ tests/jobs/test_scheduler_job.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/airflow/jobs/scheduler_job.py b/airflow/jobs/scheduler_job.py index 2fedf807958c2..490d5077be66e 100644 --- a/airflow/jobs/scheduler_job.py +++ b/airflow/jobs/scheduler_job.py @@ -375,6 +375,17 @@ def _executable_task_instances_to_queued(self, max_tis: int, session: Session = # Many dags don't have a task_concurrency, so where we can avoid loading the full # serialized DAG the better. serialized_dag = self.dagbag.get_dag(dag_id, session=session) + # If the dag is missing, fail the task and continue to the next task. + if not serialized_dag: + self.log.error( + "DAG '%s' for task instance %s not found in serialized_dag table", + dag_id, + task_instance, + ) + session.query(TI).filter(TI.dag_id == dag_id, TI.state == State.SCHEDULED).update( + {TI.state: State.FAILED}, synchronize_session='fetch' + ) + continue if serialized_dag.has_task(task_instance.task_id): task_concurrency_limit = serialized_dag.get_task( task_instance.task_id diff --git a/tests/jobs/test_scheduler_job.py b/tests/jobs/test_scheduler_job.py index a308029a7da1c..718572004016e 100644 --- a/tests/jobs/test_scheduler_job.py +++ b/tests/jobs/test_scheduler_job.py @@ -610,6 +610,34 @@ def test_find_executable_task_instances_in_default_pool(self, dag_maker): session.rollback() session.close() + def test_queued_task_instances_fails_with_missing_dag(self, dag_maker, session): + """Check that task instances of missing DAGs are failed""" + dag_id = 'SchedulerJobTest.test_find_executable_task_instances_not_in_dagbag' + task_id_1 = 'dummy' + task_id_2 = 'dummydummy' + + with dag_maker(dag_id=dag_id, session=session, default_args={"max_active_tis_per_dag": 1}): + DummyOperator(task_id=task_id_1) + DummyOperator(task_id=task_id_2) + + self.scheduler_job = SchedulerJob(subdir=os.devnull) + self.scheduler_job.dagbag = mock.MagicMock() + self.scheduler_job.dagbag.get_dag.return_value = None + + dr = dag_maker.create_dagrun(state=DagRunState.RUNNING) + + tis = dr.task_instances + for ti in tis: + ti.state = State.SCHEDULED + session.merge(ti) + session.flush() + res = self.scheduler_job._executable_task_instances_to_queued(max_tis=32, session=session) + session.flush() + assert 0 == len(res) + tis = dr.get_task_instances(session=session) + assert len(tis) == 2 + assert all(ti.state == State.FAILED for ti in tis) + def test_nonexistent_pool(self, dag_maker): dag_id = 'SchedulerJobTest.test_nonexistent_pool' with dag_maker(dag_id=dag_id, max_active_tasks=16): From 4ff0ab16868d7e7b765a4ab1d285088cd1f162fe Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 31 Jan 2022 19:23:58 +0100 Subject: [PATCH 221/250] Limit SQLAlchemy to < 1.4.0 for 2.2.* line (#21235) The recent release of FAB 3.4.4 has unblocked us from upgrading SQLAlchemy to 1.4.* version. We wanted to do it for quite some time however upgrading to 1.4.* of sqlalchemy and allowing our users to use it for 2.2.4 is a bit risky. We are fixing resulting "aftermath" in the main branch and as of this commit there are two fixes merged and remaining MsSQL problem. 
The MSSql problem does not affect 2.2.4 as MsSQL will be available only starting from 2.3.0, however the two other problems have shown that SQLAlchemy has a potential to break things and we might want to test it more thoroughly before releasing 2.3.0. The problems in question are #21205 and #21228. Both were only test problems but the indicate that there might be more hidden issues involved. In order to limit risks, this PR proposes to limit SQLAlchemy for 2.2.* to < 1.4.0. This will allow to upgrade FAB and related dependencies without opening up Airflow to upgrade to SQLAlchemy 1.4 (yet). --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index c3cce1c0ac0c5..7ab5c77c0688d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -145,7 +145,7 @@ install_requires = python3-openid~=3.2 rich>=9.2.0 setproctitle>=1.1.8, <2 - sqlalchemy>=1.3.18 + sqlalchemy>=1.3.18, <1.4.0 sqlalchemy_jsonfield~=1.0 tabulate>=0.7.5, <0.9 tenacity>=6.2.0 From ede6d8f1d5aa7d34368df6e03c0890fba3f20b9f Mon Sep 17 00:00:00 2001 From: fritz-astronomer <80706212+fritz-astronomer@users.noreply.github.com> Date: Tue, 1 Feb 2022 14:03:46 -0500 Subject: [PATCH 222/250] =?UTF-8?q?f=D7=9F=D7=A1=20Broken=20link=20in=20ap?= =?UTF-8?q?i.rst=20(#21165)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: eladkal <45845474+eladkal@users.noreply.github.com> (cherry picked from commit 817bec0417b291326dfd760bd85439b3ba0a728d) --- docs/apache-airflow/security/api.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/apache-airflow/security/api.rst b/docs/apache-airflow/security/api.rst index d7de78ac11941..48a5b5ad30d2b 100644 --- a/docs/apache-airflow/security/api.rst +++ b/docs/apache-airflow/security/api.rst @@ -80,8 +80,7 @@ principal exists in the keytab file. Basic authentication '''''''''''''''''''' -`Basic username password authentication `_ is currently +`Basic username password authentication `_ is currently supported for the API. This works for users created through LDAP login or within Airflow Metadata DB using password. From 2066812960018ce0d7ba774dd2f9fe5c0d8b52a4 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 1 Feb 2022 12:05:04 -0700 Subject: [PATCH 223/250] Update version to 2.2.4 for things in that release (#21196) (cherry picked from commit 093702e9f579ee028a103cdc9acf0e6acccd6d79) --- airflow/api/common/experimental/get_code.py | 2 +- airflow/api/common/experimental/get_dag_run_state.py | 2 +- airflow/api/common/experimental/get_task.py | 2 +- airflow/api/common/experimental/get_task_instance.py | 2 +- airflow/api/common/experimental/pool.py | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/airflow/api/common/experimental/get_code.py b/airflow/api/common/experimental/get_code.py index 1a1fb621dbe48..d4232b1d0903b 100644 --- a/airflow/api/common/experimental/get_code.py +++ b/airflow/api/common/experimental/get_code.py @@ -23,7 +23,7 @@ from airflow.models.dagcode import DagCode -@deprecated(reason="Use DagCode().get_code_by_fileloc() instead", version="2.2.3") +@deprecated(reason="Use DagCode().get_code_by_fileloc() instead", version="2.2.4") def get_code(dag_id: str) -> str: """Return python code of a given dag_id. 
diff --git a/airflow/api/common/experimental/get_dag_run_state.py b/airflow/api/common/experimental/get_dag_run_state.py index b2dedd5113ae9..7201186ea9331 100644 --- a/airflow/api/common/experimental/get_dag_run_state.py +++ b/airflow/api/common/experimental/get_dag_run_state.py @@ -24,7 +24,7 @@ from airflow.api.common.experimental import check_and_get_dag, check_and_get_dagrun -@deprecated(reason="Use DagRun().get_state() instead", version="2.2.3") +@deprecated(reason="Use DagRun().get_state() instead", version="2.2.4") def get_dag_run_state(dag_id: str, execution_date: datetime) -> Dict[str, str]: """Return the Dag Run state identified by the given dag_id and execution_date. diff --git a/airflow/api/common/experimental/get_task.py b/airflow/api/common/experimental/get_task.py index fae5fd7ef1851..4589cc6ce4d42 100644 --- a/airflow/api/common/experimental/get_task.py +++ b/airflow/api/common/experimental/get_task.py @@ -22,7 +22,7 @@ from airflow.models import TaskInstance -@deprecated(reason="Use DAG().get_task", version="2.2.3") +@deprecated(reason="Use DAG().get_task", version="2.2.4") def get_task(dag_id: str, task_id: str) -> TaskInstance: """Return the task object identified by the given dag_id and task_id.""" dag = check_and_get_dag(dag_id, task_id) diff --git a/airflow/api/common/experimental/get_task_instance.py b/airflow/api/common/experimental/get_task_instance.py index 137f8a3aef9e7..7361efdc4c796 100644 --- a/airflow/api/common/experimental/get_task_instance.py +++ b/airflow/api/common/experimental/get_task_instance.py @@ -25,7 +25,7 @@ from airflow.models import TaskInstance -@deprecated(version="2.2.3", reason="Use DagRun.get_task_instance instead") +@deprecated(version="2.2.4", reason="Use DagRun.get_task_instance instead") def get_task_instance(dag_id: str, task_id: str, execution_date: datetime) -> TaskInstance: """Return the task instance identified by the given dag_id, task_id and execution_date.""" dag = check_and_get_dag(dag_id, task_id) diff --git a/airflow/api/common/experimental/pool.py b/airflow/api/common/experimental/pool.py index 0b9c3a5d4903b..fe4f161bde682 100644 --- a/airflow/api/common/experimental/pool.py +++ b/airflow/api/common/experimental/pool.py @@ -23,7 +23,7 @@ from airflow.utils.session import provide_session -@deprecated(reason="Use Pool.get_pool() instead", version="2.2.3") +@deprecated(reason="Use Pool.get_pool() instead", version="2.2.4") @provide_session def get_pool(name, session=None): """Get pool by a given name.""" @@ -37,14 +37,14 @@ def get_pool(name, session=None): return pool -@deprecated(reason="Use Pool.get_pools() instead", version="2.2.3") +@deprecated(reason="Use Pool.get_pools() instead", version="2.2.4") @provide_session def get_pools(session=None): """Get all pools.""" return session.query(Pool).all() -@deprecated(reason="Use Pool.create_pool() instead", version="2.2.3") +@deprecated(reason="Use Pool.create_pool() instead", version="2.2.4") @provide_session def create_pool(name, slots, description, session=None): """Create a pool with a given parameters.""" @@ -75,7 +75,7 @@ def create_pool(name, slots, description, session=None): return pool -@deprecated(reason="Use Pool.delete_pool() instead", version="2.2.3") +@deprecated(reason="Use Pool.delete_pool() instead", version="2.2.4") @provide_session def delete_pool(name, session=None): """Delete pool by a given name.""" From 88900872e642903eff59c82f319d5c137ff5d5db Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 3 Feb 2022 08:50:25 -0800 Subject: [PATCH 
224/250] Augment xcom docs (#20755) (cherry picked from commit 40d3a76a9bce2360b951f2e990cba571c5f51a76) --- docs/apache-airflow/concepts/xcoms.rst | 40 ++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/docs/apache-airflow/concepts/xcoms.rst b/docs/apache-airflow/concepts/xcoms.rst index eb11ff707fa5d..57b9e54477fb6 100644 --- a/docs/apache-airflow/concepts/xcoms.rst +++ b/docs/apache-airflow/concepts/xcoms.rst @@ -42,8 +42,8 @@ XComs are a relative of :doc:`variables`, with the main difference being that XC Note: If the first task run is not succeeded then on every retry task XComs will be cleared to make the task run idempotent. -Custom Backends ---------------- +Custom XCom Backends +-------------------- The XCom system has interchangeable backends, and you can set which backend is being used via the ``xcom_backend`` configuration option. @@ -51,4 +51,38 @@ If you want to implement your own backend, you should subclass :class:`~airflow. There is also an ``orm_deserialize_value`` method that is called whenever the XCom objects are rendered for UI or reporting purposes; if you have large or expensive-to-retrieve values in your XComs, you should override this method to avoid calling that code (and instead return a lighter, incomplete representation) so the UI remains responsive. -You can also override the ``clear`` method and use it when clearing results for given dags and tasks. This allows the custom XCom backend process the data lifecycle easier. +You can also override the ``clear`` method and use it when clearing results for given dags and tasks. This allows the custom XCom backend to process the data lifecycle easier. + +Working with Custom XCom Backends in Containers +----------------------------------------------- + +Depending on where Airflow is deployed i.e., local, Docker, K8s, etc. it can be useful to be assured that a custom XCom backend is actually being initialized. For example, the complexity of the container environment can make it more difficult to determine if your backend is being loaded correctly during container deployment. Luckily the following guidance can be used to assist you in building confidence in your custom XCom implementation. + +Firstly, if you can exec into a terminal in the container then you should be able to do: + +.. code-block:: python + + from airflow.models.xcom import XCom + + print(XCom.__name__) + +which will print the actual class that is being used. + +You can also examine Airflow's configuration: + +.. code-block:: python + + from airflow.settings import conf + + conf.get("core", "xcom_backend") + +Working with Custom Backends in K8s via Helm +-------------------------------------------- + +Running custom XCom backends in K8s will introduce even more complexity to you Airflow deployment. Put simply, sometimes things go wrong which can be difficult to debug. + +For example, if you define a custom XCom backend in the Chart ``values.yaml`` (via the ``xcom_backend`` configuration) and Airflow fails to load the class, the entire Chart deployment will fail with each pod container attempting to restart time and time again. + +When deploying in K8s your custom XCom backend needs to be reside in a ``config`` directory otherwise it cannot be located during Chart deployment. + +An observed problem is that it is very difficult to acquire logs from the container because there is a very small window of availability where the trace can be obtained. 
The only way you can determine the root cause is if you are fortunate enough to query and acquire the container logs at the right time. This in turn prevents the entire Helm chart from deploying successfully. From 4b3fa3a99a90eff00b244a62b52a2d6c8e25d285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Bregu=C5=82a?= Date: Thu, 3 Feb 2022 19:20:12 +0100 Subject: [PATCH 225/250] Update recipe for Google Cloud SDK (#21268) (cherry picked from commit 874a22ee9b77f8f100736558723ceaf2d04b446b) --- docs/docker-stack/docker-images-recipes/gcloud.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docker-stack/docker-images-recipes/gcloud.Dockerfile b/docs/docker-stack/docker-images-recipes/gcloud.Dockerfile index b1589e167d8f5..48f7c2ddf7a11 100644 --- a/docs/docker-stack/docker-images-recipes/gcloud.Dockerfile +++ b/docs/docker-stack/docker-images-recipes/gcloud.Dockerfile @@ -36,6 +36,7 @@ RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/goo --additional-components alpha beta kubectl \ --quiet \ && rm -rf "${TMP_DIR}" \ + && rm -rf "${GCLOUD_HOME}/.install/.backup/" \ && gcloud --version USER ${AIRFLOW_UID} From a519e53ddee5675402c091c1e99fe1643a968e87 Mon Sep 17 00:00:00 2001 From: Lucia Kasman <38845383+luciakasman@users.noreply.github.com> Date: Thu, 3 Feb 2022 15:10:27 -0300 Subject: [PATCH 226/250] Docs: Fix task order in overview example (#21282) (cherry picked from commit 1ba83c01b2b466ad5a76a453e5f6ee2884081e53) --- docs/apache-airflow/concepts/overview.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/concepts/overview.rst b/docs/apache-airflow/concepts/overview.rst index fd862ea64610a..567c3c81f2927 100644 --- a/docs/apache-airflow/concepts/overview.rst +++ b/docs/apache-airflow/concepts/overview.rst @@ -65,12 +65,12 @@ Control Flow :doc:`tasks` have dependencies declared on each other. You'll see this in a DAG either using the ``>>`` and ``<<`` operators:: first_task >> [second_task, third_task] - third_task << fourth_task + fourth_task << third_task Or, with the ``set_upstream`` and ``set_downstream`` methods:: first_task.set_downstream([second_task, third_task]) - third_task.set_upstream(fourth_task) + fourth_task.set_upstream(third_task) These dependencies are what make up the "edges" of the graph, and how Airflow works out which order to run your tasks in. By default, a task will wait for all of its upstream tasks to succeed before it runs, but this can be customized using features like :ref:`Branching `, :ref:`LatestOnly `, and :ref:`Trigger Rules `. From 015c481f63825e6d5a7d044d970c563a6450040c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=20S=C3=A1nchez?= Date: Thu, 3 Feb 2022 18:12:08 +0000 Subject: [PATCH 227/250] Update stat_name_handler documentation (#21298) Previously stat_name_handler was under the scheduler section of the configuration but it was moved to the metrics section since 2.0.0. 
(cherry picked from commit 0ae31e9cb95e5061a23c2f397ab9716391c1a488) --- docs/apache-airflow/logging-monitoring/metrics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/apache-airflow/logging-monitoring/metrics.rst b/docs/apache-airflow/logging-monitoring/metrics.rst index c8fd1820c0ea9..bdbd2ec5c56a5 100644 --- a/docs/apache-airflow/logging-monitoring/metrics.rst +++ b/docs/apache-airflow/logging-monitoring/metrics.rst @@ -50,7 +50,7 @@ the metrics that start with the elements of the list: statsd_allow_list = scheduler,executor,dagrun If you want to redirect metrics to different name, you can configure ``stat_name_handler`` option -in ``[scheduler]`` section. It should point to a function that validates the statsd stat name, applies changes +in ``[metrics]`` section. It should point to a function that validates the StatsD stat name, applies changes to the stat name if necessary, and returns the transformed stat name. The function may looks as follow: .. code-block:: python From 64e0c5024b3cb13d2fc53f42b8096c2ae3441553 Mon Sep 17 00:00:00 2001 From: wano <55907021+wanlce@users.noreply.github.com> Date: Mon, 7 Feb 2022 02:02:57 +0800 Subject: [PATCH 228/250] Fix the incorrect scheduling time for the first run of dag (#21011) When Catchup_by_default is set to false and start_date in the DAG is the previous day, the first schedule time for this DAG may be incorrect Co-authored-by: wanlce (cherry picked from commit 0bcca55f4881bacc3fbe86f69e71981f5552b398) --- airflow/timetables/interval.py | 2 +- tests/timetables/test_interval_timetable.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/airflow/timetables/interval.py b/airflow/timetables/interval.py index d669cb652d153..01fac3a44e5d1 100644 --- a/airflow/timetables/interval.py +++ b/airflow/timetables/interval.py @@ -218,7 +218,7 @@ def _skip_to_latest(self, earliest: Optional[DateTime]) -> DateTime: raise AssertionError("next schedule shouldn't be earlier") if earliest is None: return new_start - return max(new_start, earliest) + return max(new_start, self._align(earliest)) def infer_manual_data_interval(self, *, run_after: DateTime) -> DataInterval: # Get the last complete period before run_after, e.g. 
if a DAG run is diff --git a/tests/timetables/test_interval_timetable.py b/tests/timetables/test_interval_timetable.py index 842cc1f234f3c..fe09e0c58969a 100644 --- a/tests/timetables/test_interval_timetable.py +++ b/tests/timetables/test_interval_timetable.py @@ -35,11 +35,32 @@ PREV_DATA_INTERVAL = DataInterval(start=PREV_DATA_INTERVAL_START, end=PREV_DATA_INTERVAL_END) CURRENT_TIME = pendulum.DateTime(2021, 9, 7, tzinfo=TIMEZONE) +YESTERDAY = CURRENT_TIME - datetime.timedelta(days=1) HOURLY_CRON_TIMETABLE = CronDataIntervalTimetable("@hourly", TIMEZONE) HOURLY_TIMEDELTA_TIMETABLE = DeltaDataIntervalTimetable(datetime.timedelta(hours=1)) HOURLY_RELATIVEDELTA_TIMETABLE = DeltaDataIntervalTimetable(dateutil.relativedelta.relativedelta(hours=1)) +CRON_TIMETABLE = CronDataIntervalTimetable("30 16 * * *", TIMEZONE) +DELTA_FROM_MIDNIGHT = datetime.timedelta(minutes=30, hours=16) + + +@pytest.mark.parametrize( + "last_automated_data_interval", + [pytest.param(None, id="first-run"), pytest.param(PREV_DATA_INTERVAL, id="subsequent")], +) +@freezegun.freeze_time(CURRENT_TIME) +def test_no_catchup_first_starts_at_current_time( + last_automated_data_interval: Optional[DataInterval], +) -> None: + """If ``catchup=False`` and start_date is a day before""" + next_info = CRON_TIMETABLE.next_dagrun_info( + last_automated_data_interval=last_automated_data_interval, + restriction=TimeRestriction(earliest=YESTERDAY, latest=None, catchup=False), + ) + expected_start = YESTERDAY + DELTA_FROM_MIDNIGHT + assert next_info == DagRunInfo.interval(start=expected_start, end=CURRENT_TIME + DELTA_FROM_MIDNIGHT) + @pytest.mark.parametrize( "timetable", From 270516cb1ac54bcf3ede888ddfffeb9154fa37d3 Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Thu, 3 Feb 2022 18:15:21 -0500 Subject: [PATCH 229/250] Update error docs to include before_send option (#21275) https://github.com/apache/airflow/pull/18261 Added support for the `before_send` option when initializing the Sentry SDK in airflow. This patch updates the documentation to reflect this change. (cherry picked from commit b38391e2f91760e64576723c876341f532a6ee2d) --- docs/apache-airflow/logging-monitoring/errors.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/logging-monitoring/errors.rst b/docs/apache-airflow/logging-monitoring/errors.rst index 9f4256ae04c0e..30bb66d5f61fb 100644 --- a/docs/apache-airflow/logging-monitoring/errors.rst +++ b/docs/apache-airflow/logging-monitoring/errors.rst @@ -42,8 +42,14 @@ Add your ``SENTRY_DSN`` to your configuration file e.g. ``airflow.cfg`` in ``[se .. note:: If this value is not provided, the SDK will try to read it from the ``SENTRY_DSN`` environment variable. -You can supply `additional configuration options `__ based on the Python platform via ``[sentry]`` section. -Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, ``ignore_errors``, ``before_breadcrumb``, ``before_send``, ``transport``. +The ``before_send`` option can be used to modify or drop events before they are sent to Sentry. To set this option, provide a dotted path to a before_send function that the sentry SDK should be configured to use. + +.. code-block:: ini + + [sentry] + before_send = path.to.my.sentry.before_send + +You can supply `additional configuration options `__ based on the Python platform via ``[sentry]`` section. Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, ``ignore_errors``, ``before_breadcrumb``, ``transport``. 
Tags ----- From 5c078cda332d42baba72c985532e0060d30a31ef Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Mon, 7 Feb 2022 11:24:31 -0700 Subject: [PATCH 230/250] Update example DAGs (#21372) (cherry picked from commit 7a38ec2ad3b3bd6fda5e1ee9fe9e644ccb8b4c12) --- .../example_passing_params_via_test_command.py | 11 ++++++----- airflow/example_dags/tutorial.py | 2 -- docs/apache-airflow/tutorial.rst | 9 ++------- tests/cli/commands/test_task_command.py | 1 - 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/airflow/example_dags/example_passing_params_via_test_command.py b/airflow/example_dags/example_passing_params_via_test_command.py index e3f04c430c609..d4781afab8d98 100644 --- a/airflow/example_dags/example_passing_params_via_test_command.py +++ b/airflow/example_dags/example_passing_params_via_test_command.py @@ -68,17 +68,18 @@ def print_env_vars(test_mode=None): ) as dag: run_this = my_py_command(params={"miff": "agg"}) - my_templated_command = dedent( + my_command = dedent( + """ + echo "'foo' was passed in via Airflow CLI Test command with value '$FOO'" + echo "'miff' was passed in via BashOperator with value '$MIFF'" """ - echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} " - echo " 'miff was passed in via BashOperator with value {{ params.miff }} " - """ ) also_run_this = BashOperator( task_id='also_run_this', - bash_command=my_templated_command, + bash_command=my_command, params={"miff": "agg"}, + env={"FOO": "{{ params.foo }}", "MIFF": "{{ params.miff }}"}, ) env_var_test_task = print_env_vars() diff --git a/airflow/example_dags/tutorial.py b/airflow/example_dags/tutorial.py index 1049772d87d93..ff2bd2fe95cf7 100644 --- a/airflow/example_dags/tutorial.py +++ b/airflow/example_dags/tutorial.py @@ -109,7 +109,6 @@ {% for i in range(5) %} echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" - echo "{{ params.my_param }}" {% endfor %} """ ) @@ -118,7 +117,6 @@ task_id='templated', depends_on_past=False, bash_command=templated_command, - params={'my_param': 'Parameter I passed in'}, ) # [END jinja_template] diff --git a/docs/apache-airflow/tutorial.rst b/docs/apache-airflow/tutorial.rst index 1c32e78e78fea..0034b2ce059af 100644 --- a/docs/apache-airflow/tutorial.rst +++ b/docs/apache-airflow/tutorial.rst @@ -151,13 +151,8 @@ stamp"). :end-before: [END jinja_template] Notice that the ``templated_command`` contains code logic in ``{% %}`` blocks, -references parameters like ``{{ ds }}``, calls a function as in -``{{ macros.ds_add(ds, 7)}}``, and references a user-defined parameter -in ``{{ params.my_param }}``. - -The ``params`` hook in ``BaseOperator`` allows you to pass a dictionary of -parameters and/or objects to your templates. Please take the time -to understand how the parameter ``my_param`` makes it through to the template. +references parameters like ``{{ ds }}``, and calls a function as in +``{{ macros.ds_add(ds, 7)}}``. 
Files can also be passed to the ``bash_command`` argument, like ``bash_command='templated_command.sh'``, where the file location is relative to diff --git a/tests/cli/commands/test_task_command.py b/tests/cli/commands/test_task_command.py index 201af16bb75be..76c6cdb01972d 100644 --- a/tests/cli/commands/test_task_command.py +++ b/tests/cli/commands/test_task_command.py @@ -263,7 +263,6 @@ def test_task_render(self): assert 'echo "2016-01-01"' in output assert 'echo "2016-01-08"' in output - assert 'echo "Parameter I passed in"' in output def test_cli_run_when_pickle_and_dag_cli_method_selected(self): """ From f41ea340769ac8d7d0eec10e4de68e446abac2e4 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Mon, 7 Feb 2022 08:55:20 -0700 Subject: [PATCH 231/250] Fix docs link for smart sensor deprecation (#21394) We are releasing the deprecation in version 2.2.4, not 2.3.0 like originally planned. (cherry picked from commit 3a780380d8f5d50ffc876c326e70ee0eee033c0d) --- UPDATING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/UPDATING.md b/UPDATING.md index a75b2d6685355..2ed4aace4f6eb 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -87,7 +87,7 @@ https://developers.google.com/style/inclusive-documentation Smart sensors, an "early access" feature added in Airflow 2, are now deprecated and will be removed in Airflow 2.4.0. They have been superseded by Deferable Operators, added in Airflow 2.2.0. -See [Migrating to Deferrable Operators](https://airflow.apache.org/docs/apache-airflow/2.3.0/concepts/smart-sensors.html#migrating-to-deferrable-operators) for details on how to migrate. +See [Migrating to Deferrable Operators](https://airflow.apache.org/docs/apache-airflow/2.2.4/concepts/smart-sensors.html#migrating-to-deferrable-operators) for details on how to migrate. ## Airflow 2.2.3 From f2fe0df6b3caa86a4315322264fad077f03b32e6 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Mon, 7 Feb 2022 20:12:05 +0100 Subject: [PATCH 232/250] Avoid deadlock when rescheduling task (#21362) The scheduler job performs scheduling after locking the "scheduled" DagRun row for writing. This should prevent from modifying DagRun and related task instances by another scheduler or "mini-scheduler" run after task is completed. However there is apparently one more case where the DagRun is being locked by "Task" processes - namely when task throws AirflowRescheduleException. In this case a new "TaskReschedule" entity is inserted into the database and it also performs lock on the DagRun (because TaskReschedule has "DagRun" relationship. This PR modifies handling the AirflowRescheduleException to obtain the very same DagRun lock before it attempts to insert TaskReschedule entity. Seems that TaskReschedule is the only one that has this relationship so likely all the misterious SchedulerJob deadlock cases we experienced might be explained (and fixed) by this one. 
It is likely that this one: * Fixes: #16982 * Fixes: #19957 (cherry picked from commit 6d110b565a505505351d1ff19592626fb24e4516) --- airflow/models/taskinstance.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index ec34156931edf..2dcc923c1e6f8 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -93,7 +93,7 @@ from airflow.utils.platform import getuser from airflow.utils.retries import run_with_db_retries from airflow.utils.session import create_session, provide_session -from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime +from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime, with_row_locks from airflow.utils.state import DagRunState, State from airflow.utils.timeout import timeout @@ -1657,11 +1657,24 @@ def _handle_reschedule(self, actual_start_date, reschedule_exception, test_mode= # Don't record reschedule request in test mode if test_mode: return + + from airflow.models.dagrun import DagRun # Avoid circular import + self.refresh_from_db(session) self.end_date = timezone.utcnow() self.set_duration() + # Lock DAG run to be sure not to get into a deadlock situation when trying to insert + # TaskReschedule which apparently also creates lock on corresponding DagRun entity + with_row_locks( + session.query(DagRun).filter_by( + dag_id=self.dag_id, + run_id=self.run_id, + ), + session=session, + ).one() + # Log reschedule request session.add( TaskReschedule( From 9b03071333cf8e40f5ee9b8aa030656df59eb83c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Wyszomirski?= Date: Fri, 28 Jan 2022 06:18:05 +0100 Subject: [PATCH 233/250] Add possibility to create user in the Remote User mode (#19963) (cherry picked from commit cdd9ea66208e3d70d1cf2a34530ba69bc3c58a50) --- airflow/www/views.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/airflow/www/views.py b/airflow/www/views.py index f2642a73f1f0e..2ed2a67b58ff1 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -4731,3 +4731,28 @@ class CustomUserOIDModelView(MultiResourceUserMixin, UserOIDModelView): class CustomUserRemoteUserModelView(MultiResourceUserMixin, UserRemoteUserModelView): """Customize permission names for FAB's builtin UserRemoteUserModelView.""" + + _class_permission_name = permissions.RESOURCE_USER + + class_permission_name_mapping = { + 'userinfoedit': permissions.RESOURCE_MY_PROFILE, + 'userinfo': permissions.RESOURCE_MY_PROFILE, + } + + method_permission_name = { + 'add': 'create', + 'userinfo': 'read', + 'download': 'read', + 'show': 'read', + 'list': 'read', + 'edit': 'edit', + 'userinfoedit': 'edit', + 'delete': 'delete', + } + + base_permissions = [ + permissions.ACTION_CAN_CREATE, + permissions.ACTION_CAN_READ, + permissions.ACTION_CAN_EDIT, + permissions.ACTION_CAN_DELETE, + ] From 95eaef37f621e8abaa30bb145866f1863471fdd6 Mon Sep 17 00:00:00 2001 From: Niko Date: Thu, 9 Dec 2021 05:46:59 -0800 Subject: [PATCH 234/250] Fix TriggerDagRunOperator extra link (#19410) The extra link provided by the operator was previously using the execution date of the triggering dag, not the triggered dag. Store the execution date of the triggered dag in xcom so that it can be read back later within the webserver when the link is being created. 
(cherry picked from commit 820e836c4a2e45239279d4d71e1db9434022fec5) --- airflow/operators/trigger_dagrun.py | 19 +++++++++- tests/operators/test_trigger_dagrun.py | 49 ++++++++++++++++++++------ 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/airflow/operators/trigger_dagrun.py b/airflow/operators/trigger_dagrun.py index 421c7963d0d68..a346db10f0fc9 100644 --- a/airflow/operators/trigger_dagrun.py +++ b/airflow/operators/trigger_dagrun.py @@ -24,11 +24,15 @@ from airflow.api.common.trigger_dag import trigger_dag from airflow.exceptions import AirflowException, DagNotFound, DagRunAlreadyExists from airflow.models import BaseOperator, BaseOperatorLink, DagBag, DagModel, DagRun +from airflow.models.xcom import XCom from airflow.utils import timezone from airflow.utils.helpers import build_airflow_url_with_query from airflow.utils.state import State from airflow.utils.types import DagRunType +XCOM_EXECUTION_DATE_ISO = "trigger_execution_date_iso" +XCOM_RUN_ID = "trigger_run_id" + class TriggerDagRunLink(BaseOperatorLink): """ @@ -39,7 +43,13 @@ class TriggerDagRunLink(BaseOperatorLink): name = 'Triggered DAG' def get_link(self, operator, dttm): - query = {"dag_id": operator.trigger_dag_id, "execution_date": dttm.isoformat()} + # Fetch the correct execution date for the triggerED dag which is + # stored in xcom during execution of the triggerING task. + trigger_execution_date_iso = XCom.get_one( + execution_date=dttm, key=XCOM_EXECUTION_DATE_ISO, task_id=operator.task_id, dag_id=operator.dag_id + ) + + query = {"dag_id": operator.trigger_dag_id, "base_date": trigger_execution_date_iso} return build_airflow_url_with_query(query) @@ -140,6 +150,7 @@ def execute(self, context: Dict): execution_date=self.execution_date, replace_microseconds=False, ) + except DagRunAlreadyExists as e: if self.reset_dag_run: self.log.info("Clearing %s on %s", self.trigger_dag_id, self.execution_date) @@ -157,6 +168,12 @@ def execute(self, context: Dict): else: raise e + # Store the execution date from the dag run (either created or found above) to + # be used when creating the extra link on the webserver. + ti = context['task_instance'] + ti.xcom_push(key=XCOM_EXECUTION_DATE_ISO, value=dag_run.execution_date.isoformat()) + ti.xcom_push(key=XCOM_RUN_ID, value=dag_run.run_id) + if self.wait_for_completion: # wait for dag to complete while True: diff --git a/tests/operators/test_trigger_dagrun.py b/tests/operators/test_trigger_dagrun.py index 9ff87358d0db2..1934c4d4174b0 100644 --- a/tests/operators/test_trigger_dagrun.py +++ b/tests/operators/test_trigger_dagrun.py @@ -19,7 +19,7 @@ import pathlib import tempfile from datetime import datetime -from unittest import TestCase +from unittest import TestCase, mock import pytest @@ -76,6 +76,25 @@ def tearDown(self): pathlib.Path(self._tmpfile).unlink() + @mock.patch('airflow.operators.trigger_dagrun.build_airflow_url_with_query') + def assert_extra_link(self, triggering_exec_date, triggered_dag_run, triggering_task, mock_build_url): + """ + Asserts whether the correct extra links url will be created. + + Specifically it tests whether the correct dag id and date are passed to + the method which constructs the final url. + Note: We can't run that method to generate the url itself because the Flask app context + isn't available within the test logic, so it is mocked here. 
+ """ + triggering_task.get_extra_links(triggering_exec_date, 'Triggered DAG') + assert mock_build_url.called + args, _ = mock_build_url.call_args + expected_args = { + 'dag_id': triggered_dag_run.dag_id, + 'base_date': triggered_dag_run.execution_date.isoformat(), + } + assert expected_args in args + def test_trigger_dagrun(self): """Test TriggerDagRunOperator.""" task = TriggerDagRunOperator(task_id="test_task", trigger_dag_id=TRIGGERED_DAG_ID, dag=self.dag) @@ -84,7 +103,9 @@ def test_trigger_dagrun(self): with create_session() as session: dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all() assert len(dagruns) == 1 - assert dagruns[0].external_trigger + triggered_dag_run = dagruns[0] + assert triggered_dag_run.external_trigger + self.assert_extra_link(DEFAULT_DATE, triggered_dag_run, task) def test_trigger_dagrun_custom_run_id(self): task = TriggerDagRunOperator( @@ -114,8 +135,10 @@ def test_trigger_dagrun_with_execution_date(self): with create_session() as session: dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all() assert len(dagruns) == 1 - assert dagruns[0].external_trigger - assert dagruns[0].execution_date == utc_now + triggered_dag_run = dagruns[0] + assert triggered_dag_run.external_trigger + assert triggered_dag_run.execution_date == utc_now + self.assert_extra_link(DEFAULT_DATE, triggered_dag_run, task) def test_trigger_dagrun_twice(self): """Test TriggerDagRunOperator with custom execution_date.""" @@ -140,12 +163,14 @@ def test_trigger_dagrun_twice(self): ) session.add(dag_run) session.commit() - task.execute(None) + task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all() assert len(dagruns) == 1 - assert dagruns[0].external_trigger - assert dagruns[0].execution_date == utc_now + triggered_dag_run = dagruns[0] + assert triggered_dag_run.external_trigger + assert triggered_dag_run.execution_date == utc_now + self.assert_extra_link(DEFAULT_DATE, triggered_dag_run, task) def test_trigger_dagrun_with_templated_execution_date(self): """Test TriggerDagRunOperator with templated execution_date.""" @@ -160,8 +185,10 @@ def test_trigger_dagrun_with_templated_execution_date(self): with create_session() as session: dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all() assert len(dagruns) == 1 - assert dagruns[0].external_trigger - assert dagruns[0].execution_date == DEFAULT_DATE + triggered_dag_run = dagruns[0] + assert triggered_dag_run.external_trigger + assert triggered_dag_run.execution_date == DEFAULT_DATE + self.assert_extra_link(DEFAULT_DATE, triggered_dag_run, task) def test_trigger_dagrun_operator_conf(self): """Test passing conf to the triggered DagRun.""" @@ -288,7 +315,9 @@ def test_trigger_dagrun_triggering_itself(self): .all() ) assert len(dagruns) == 2 - assert dagruns[1].state == State.QUEUED + triggered_dag_run = dagruns[1] + assert triggered_dag_run.state == State.QUEUED + self.assert_extra_link(execution_date, triggered_dag_run, task) def test_trigger_dagrun_triggering_itself_with_execution_date(self): """Test TriggerDagRunOperator that triggers itself with execution date, From 1c2340558b96cde92390bf1b4dc9483236675e18 Mon Sep 17 00:00:00 2001 From: David Caron Date: Thu, 3 Feb 2022 21:14:19 -0500 Subject: [PATCH 235/250] Fix mismatch in generated run_id and logical date of DAG run (#18707) Co-authored-by: Tzu-ping Chung Co-authored-by: Jed Cunningham (cherry picked from commit 
1f08d281632670aef1de8dfc62c9f63aeec18760) --- airflow/operators/trigger_dagrun.py | 20 +++++++++----------- tests/operators/test_trigger_dagrun.py | 25 ++++++++++++------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/airflow/operators/trigger_dagrun.py b/airflow/operators/trigger_dagrun.py index a346db10f0fc9..7dae1963f37b8 100644 --- a/airflow/operators/trigger_dagrun.py +++ b/airflow/operators/trigger_dagrun.py @@ -115,13 +115,13 @@ def __init__( self.allowed_states = allowed_states or [State.SUCCESS] self.failed_states = failed_states or [State.FAILED] - if not isinstance(execution_date, (str, datetime.datetime, type(None))): + if execution_date is not None and not isinstance(execution_date, (str, datetime.datetime)): raise TypeError( "Expected str or datetime.datetime type for execution_date." "Got {}".format(type(execution_date)) ) - self.execution_date: Optional[datetime.datetime] = execution_date # type: ignore + self.execution_date = execution_date try: json.dumps(self.conf) @@ -130,30 +130,28 @@ def __init__( def execute(self, context: Dict): if isinstance(self.execution_date, datetime.datetime): - execution_date = self.execution_date + parsed_execution_date = self.execution_date elif isinstance(self.execution_date, str): - execution_date = timezone.parse(self.execution_date) - self.execution_date = execution_date + parsed_execution_date = timezone.parse(self.execution_date) else: - execution_date = timezone.utcnow() + parsed_execution_date = timezone.utcnow() if self.trigger_run_id: run_id = self.trigger_run_id else: - run_id = DagRun.generate_run_id(DagRunType.MANUAL, execution_date) - + run_id = DagRun.generate_run_id(DagRunType.MANUAL, parsed_execution_date) try: dag_run = trigger_dag( dag_id=self.trigger_dag_id, run_id=run_id, conf=self.conf, - execution_date=self.execution_date, + execution_date=parsed_execution_date, replace_microseconds=False, ) except DagRunAlreadyExists as e: if self.reset_dag_run: - self.log.info("Clearing %s on %s", self.trigger_dag_id, self.execution_date) + self.log.info("Clearing %s on %s", self.trigger_dag_id, parsed_execution_date) # Get target dag object and call clear() @@ -163,7 +161,7 @@ def execute(self, context: Dict): dag_bag = DagBag(dag_folder=dag_model.fileloc, read_dags_from_db=True) dag = dag_bag.get_dag(self.trigger_dag_id) - dag.clear(start_date=self.execution_date, end_date=self.execution_date) + dag.clear(start_date=parsed_execution_date, end_date=parsed_execution_date) dag_run = DagRun.find(dag_id=dag.dag_id, run_id=run_id)[0] else: raise e diff --git a/tests/operators/test_trigger_dagrun.py b/tests/operators/test_trigger_dagrun.py index 1934c4d4174b0..180781eed6109 100644 --- a/tests/operators/test_trigger_dagrun.py +++ b/tests/operators/test_trigger_dagrun.py @@ -30,6 +30,7 @@ from airflow.utils import timezone from airflow.utils.session import create_session from airflow.utils.state import State +from airflow.utils.types import DagRunType DEFAULT_DATE = datetime(2019, 1, 1, tzinfo=timezone.utc) TEST_DAG_ID = "testdag" @@ -101,11 +102,10 @@ def test_trigger_dagrun(self): task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) with create_session() as session: - dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all() - assert len(dagruns) == 1 - triggered_dag_run = dagruns[0] - assert triggered_dag_run.external_trigger - self.assert_extra_link(DEFAULT_DATE, triggered_dag_run, task) + dagrun = session.query(DagRun).filter(DagRun.dag_id == 
TRIGGERED_DAG_ID).one() + assert dagrun.external_trigger + assert dagrun.run_id == DagRun.generate_run_id(DagRunType.MANUAL, dagrun.execution_date) + self.assert_extra_link(DEFAULT_DATE, dagrun, task) def test_trigger_dagrun_custom_run_id(self): task = TriggerDagRunOperator( @@ -123,22 +123,21 @@ def test_trigger_dagrun_custom_run_id(self): def test_trigger_dagrun_with_execution_date(self): """Test TriggerDagRunOperator with custom execution_date.""" - utc_now = timezone.utcnow() + custom_execution_date = timezone.datetime(2021, 1, 2, 3, 4, 5) task = TriggerDagRunOperator( task_id="test_trigger_dagrun_with_execution_date", trigger_dag_id=TRIGGERED_DAG_ID, - execution_date=utc_now, + execution_date=custom_execution_date, dag=self.dag, ) task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) with create_session() as session: - dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all() - assert len(dagruns) == 1 - triggered_dag_run = dagruns[0] - assert triggered_dag_run.external_trigger - assert triggered_dag_run.execution_date == utc_now - self.assert_extra_link(DEFAULT_DATE, triggered_dag_run, task) + dagrun = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).one() + assert dagrun.external_trigger + assert dagrun.execution_date == custom_execution_date + assert dagrun.run_id == DagRun.generate_run_id(DagRunType.MANUAL, custom_execution_date) + self.assert_extra_link(DEFAULT_DATE, dagrun, task) def test_trigger_dagrun_twice(self): """Test TriggerDagRunOperator with custom execution_date.""" From efc281829c6d9458ae83e026afcc753fe935ba75 Mon Sep 17 00:00:00 2001 From: KevinYanesG <75472729+KevinYanesG@users.noreply.github.com> Date: Thu, 10 Feb 2022 15:38:53 +0100 Subject: [PATCH 236/250] Fix postgres hook import pipeline tutorial (#21491) (cherry picked from commit a2abf663157aea14525e1a55eb9735ba659ae8d6) --- docs/apache-airflow/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apache-airflow/tutorial.rst b/docs/apache-airflow/tutorial.rst index 0034b2ce059af..a8f76bc8689dd 100644 --- a/docs/apache-airflow/tutorial.rst +++ b/docs/apache-airflow/tutorial.rst @@ -413,7 +413,7 @@ Let's break this down into 2 steps: get data & merge data: import requests from airflow.decorators import task - from airflow.hooks.postgres import PostgresHook + from airflow.providers.postgres.hooks.postgres import PostgresHook @task @@ -478,7 +478,7 @@ Lets look at our DAG: import requests from airflow.decorators import dag, task - from airflow.hooks.postgres import PostgresHook + from airflow.providers.postgres.hooks.postgres import PostgresHook @dag( From 79e995480822fd68c715c6ab5d83357721fa2d55 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Sat, 12 Feb 2022 11:40:29 +0800 Subject: [PATCH 237/250] Use compat data interval shim in log handlers (#21289) (cherry picked from commit 44bd211b19dcb75eeb53ced5bea2cf0c80654b1a) --- .../elasticsearch/log/es_task_handler.py | 27 +++++++++----- airflow/utils/log/file_task_handler.py | 35 ++++++++++++++----- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/airflow/providers/elasticsearch/log/es_task_handler.py b/airflow/providers/elasticsearch/log/es_task_handler.py index cd0897153dfdc..b591aef44191a 100644 --- a/airflow/providers/elasticsearch/log/es_task_handler.py +++ b/airflow/providers/elasticsearch/log/es_task_handler.py @@ -101,15 +101,25 @@ def __init__( self.context_set = False def _render_log_id(self, ti: TaskInstance, try_number: int) -> str: - 
dag_run = ti.dag_run + dag_run = ti.get_dagrun() + try: + data_interval: Tuple[datetime, datetime] = ti.task.dag.get_run_data_interval(dag_run) + except AttributeError: # ti.task is not always set. + data_interval = (dag_run.data_interval_start, dag_run.data_interval_end) if self.json_format: - data_interval_start = self._clean_date(dag_run.data_interval_start) - data_interval_end = self._clean_date(dag_run.data_interval_end) + data_interval_start = self._clean_date(data_interval[0]) + data_interval_end = self._clean_date(data_interval[1]) execution_date = self._clean_date(dag_run.execution_date) else: - data_interval_start = dag_run.data_interval_start.isoformat() - data_interval_end = dag_run.data_interval_end.isoformat() + if data_interval[0]: + data_interval_start = data_interval[0].isoformat() + else: + data_interval_start = "" + if data_interval[1]: + data_interval_end = data_interval[1].isoformat() + else: + data_interval_end = "" execution_date = dag_run.execution_date.isoformat() return self.log_id_template.format( @@ -123,14 +133,15 @@ def _render_log_id(self, ti: TaskInstance, try_number: int) -> str: ) @staticmethod - def _clean_date(value: datetime) -> str: + def _clean_date(value: Optional[datetime]) -> str: """ Clean up a date value so that it is safe to query in elasticsearch by removing reserved characters. - # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_reserved_characters - :param execution_date: execution date of the dag run. + https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#_reserved_characters """ + if value is None: + return "" return value.strftime("%Y_%m_%dT%H_%M_%S_%f") def _group_logs_by_host(self, logs): diff --git a/airflow/utils/log/file_task_handler.py b/airflow/utils/log/file_task_handler.py index 6e57c671073fa..e13b8d4a9caae 100644 --- a/airflow/utils/log/file_task_handler.py +++ b/airflow/utils/log/file_task_handler.py @@ -18,8 +18,9 @@ """File logging handler for tasks.""" import logging import os +from datetime import datetime from pathlib import Path -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Tuple import httpx from itsdangerous import TimedJSONWebSignatureSerializer @@ -82,13 +83,31 @@ def _render_filename(self, ti: "TaskInstance", try_number: int) -> str: context = Context(ti=ti, ts=ti.get_dagrun().logical_date.isoformat()) context["try_number"] = try_number return render_template_to_string(self.filename_jinja_template, context) - - return self.filename_template.format( - dag_id=ti.dag_id, - task_id=ti.task_id, - execution_date=ti.get_dagrun().logical_date.isoformat(), - try_number=try_number, - ) + elif self.filename_template: + dag_run = ti.get_dagrun() + try: + data_interval: Tuple[datetime, datetime] = ti.task.dag.get_run_data_interval(dag_run) + except AttributeError: # ti.task is not always set. 
+ data_interval = (dag_run.data_interval_start, dag_run.data_interval_end) + if data_interval[0]: + data_interval_start = data_interval[0].isoformat() + else: + data_interval_start = "" + if data_interval[1]: + data_interval_end = data_interval[1].isoformat() + else: + data_interval_end = "" + return self.filename_template.format( + dag_id=ti.dag_id, + task_id=ti.task_id, + run_id=ti.run_id, + data_interval_start=data_interval_start, + data_interval_end=data_interval_end, + execution_date=ti.get_dagrun().logical_date.isoformat(), + try_number=try_number, + ) + else: + raise RuntimeError(f"Unable to render log filename for {ti}. This should never happen") def _read_grouped_logs(self): return False From f25a58eebeb9ad3283fca2daa6666811f1a036c6 Mon Sep 17 00:00:00 2001 From: Aleksey Kirilishin <54231417+avkirilishin@users.noreply.github.com> Date: Mon, 14 Feb 2022 18:55:00 +0300 Subject: [PATCH 238/250] Show task status only for running dags or only for the last finished dag (#21352) * Show task status only for running dags or only for the last finished dag * Brought the logic of getting task statistics into a separate function (cherry picked from commit 28d7bde2750c38300e5cf70ba32be153b1a11f2c) --- airflow/www/views.py | 64 ++++++++++++++++++++++++++++------- tests/www/views/test_views.py | 35 ++++++++++++++++++- 2 files changed, 85 insertions(+), 14 deletions(-) diff --git a/airflow/www/views.py b/airflow/www/views.py index 2ed2a67b58ff1..9ebe8992708ab 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -408,6 +408,31 @@ def get_downstream(task): return result +def get_task_stats_from_query(qry): + """ + Return a dict of the task quantity, grouped by dag id and task status. + + :param qry: The data in the format (, , , ), + ordered by and + """ + data = {} + last_dag_id = None + has_running_dags = False + for dag_id, state, is_dag_running, count in qry: + if last_dag_id != dag_id: + last_dag_id = dag_id + has_running_dags = False + elif not is_dag_running and has_running_dags: + continue + + if is_dag_running: + has_running_dags = True + if dag_id not in data: + data[dag_id] = {} + data[dag_id][state] = count + return data + + ###################################################################################### # Error handlers ###################################################################################### @@ -814,7 +839,9 @@ def task_stats(self, session=None): # Select all task_instances from active dag_runs. running_task_instance_query_result = session.query( - TaskInstance.dag_id.label('dag_id'), TaskInstance.state.label('state') + TaskInstance.dag_id.label('dag_id'), + TaskInstance.state.label('state'), + sqla.literal(True).label('is_dag_running'), ).join( running_dag_run_query_result, and_( @@ -838,7 +865,11 @@ def task_stats(self, session=None): # Select all task_instances from active dag_runs. # If no dag_run is active, return task instances from most recent dag_run. 
last_task_instance_query_result = ( - session.query(TaskInstance.dag_id.label('dag_id'), TaskInstance.state.label('state')) + session.query( + TaskInstance.dag_id.label('dag_id'), + TaskInstance.state.label('state'), + sqla.literal(False).label('is_dag_running'), + ) .join(TaskInstance.dag_run) .join( last_dag_run, @@ -855,18 +886,25 @@ def task_stats(self, session=None): else: final_task_instance_query_result = running_task_instance_query_result.subquery('final_ti') - qry = session.query( - final_task_instance_query_result.c.dag_id, - final_task_instance_query_result.c.state, - sqla.func.count(), - ).group_by(final_task_instance_query_result.c.dag_id, final_task_instance_query_result.c.state) - - data = {} - for dag_id, state, count in qry: - if dag_id not in data: - data[dag_id] = {} - data[dag_id][state] = count + qry = ( + session.query( + final_task_instance_query_result.c.dag_id, + final_task_instance_query_result.c.state, + final_task_instance_query_result.c.is_dag_running, + sqla.func.count(), + ) + .group_by( + final_task_instance_query_result.c.dag_id, + final_task_instance_query_result.c.state, + final_task_instance_query_result.c.is_dag_running, + ) + .order_by( + final_task_instance_query_result.c.dag_id, + final_task_instance_query_result.c.is_dag_running.desc(), + ) + ) + data = get_task_stats_from_query(qry) payload = {} for dag_id in filter_dag_ids: payload[dag_id] = [] diff --git a/tests/www/views/test_views.py b/tests/www/views/test_views.py index b98c1bc71253f..672d4a157281b 100644 --- a/tests/www/views/test_views.py +++ b/tests/www/views/test_views.py @@ -24,7 +24,13 @@ from airflow.configuration import initialize_config from airflow.plugins_manager import AirflowPlugin, EntryPointSource from airflow.www import views -from airflow.www.views import get_key_paths, get_safe_url, get_value_from_path, truncate_task_duration +from airflow.www.views import ( + get_key_paths, + get_safe_url, + get_task_stats_from_query, + get_value_from_path, + truncate_task_duration, +) from tests.test_utils.config import conf_vars from tests.test_utils.mock_plugins import mock_plugin_manager from tests.test_utils.www import check_content_in_response, check_content_not_in_response @@ -333,3 +339,30 @@ def test_dag_edit_privileged_requires_view_has_action_decorators(cls: type): action_funcs = action_funcs - {"action_post"} for action_function in action_funcs: assert_decorator_used(cls, action_function, views.action_has_dag_edit_access) + + +def test_get_task_stats_from_query(): + query_data = [ + ['dag1', 'queued', True, 1], + ['dag1', 'running', True, 2], + ['dag1', 'success', False, 3], + ['dag2', 'running', True, 4], + ['dag2', 'success', True, 5], + ['dag3', 'success', False, 6], + ] + expected_data = { + 'dag1': { + 'queued': 1, + 'running': 2, + }, + 'dag2': { + 'running': 4, + 'success': 5, + }, + 'dag3': { + 'success': 6, + }, + } + + data = get_task_stats_from_query(query_data) + assert data == expected_data From 1c2909f8d69ade70803f10653e4845319ae99c0e Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 15 Feb 2022 10:57:46 -0700 Subject: [PATCH 239/250] Add a session backend to store session data in the database (#21478) Co-authored-by: Jed Cunningham (cherry picked from commit da9d0863c7ff121c111a455708163b026943bdf1) --- airflow/config_templates/config.yml | 7 ++ airflow/config_templates/default_airflow.cfg | 4 ++ .../c381b21cb7e4_add_session_table_to_db.py | 54 +++++++++++++++ airflow/utils/db.py | 3 + airflow/www/app.py | 3 
+- airflow/www/extensions/init_session.py | 63 +++++++++++------- airflow/www/session.py | 40 ++++++++++++ docs/apache-airflow/migrations-ref.rst | 4 +- docs/spelling_wordlist.txt | 1 + setup.cfg | 3 + tests/api_connexion/conftest.py | 7 +- tests/api_connexion/test_security.py | 4 ++ tests/test_utils/decorators.py | 2 +- tests/utils/test_db.py | 3 + tests/www/views/conftest.py | 1 + tests/www/views/test_session.py | 65 +++++++++++++++++++ 16 files changed, 234 insertions(+), 30 deletions(-) create mode 100644 airflow/migrations/versions/c381b21cb7e4_add_session_table_to_db.py create mode 100644 airflow/www/session.py create mode 100644 tests/www/views/test_session.py diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 6941f03e53550..1e7704108773f 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -999,6 +999,13 @@ type: string example: ~ default: "" + - name: session_backend + description: | + The type of backend used to store web session data, can be 'database' or 'securecookie' + version_added: 2.2.4 + type: string + example: securecookie + default: database - name: web_server_master_timeout description: | Number of seconds the webserver waits before killing gunicorn master that doesn't respond diff --git a/airflow/config_templates/default_airflow.cfg b/airflow/config_templates/default_airflow.cfg index 6a5449b76d29a..826eaf42dc435 100644 --- a/airflow/config_templates/default_airflow.cfg +++ b/airflow/config_templates/default_airflow.cfg @@ -516,6 +516,10 @@ web_server_ssl_cert = # provided SSL will be enabled. This does not change the web server port. web_server_ssl_key = +# The type of backend used to store web session data, can be 'database' or 'securecookie' +# Example: session_backend = securecookie +session_backend = database + # Number of seconds the webserver waits before killing gunicorn master that doesn't respond web_server_master_timeout = 120 diff --git a/airflow/migrations/versions/c381b21cb7e4_add_session_table_to_db.py b/airflow/migrations/versions/c381b21cb7e4_add_session_table_to_db.py new file mode 100644 index 0000000000000..cc6b9ab35f0b2 --- /dev/null +++ b/airflow/migrations/versions/c381b21cb7e4_add_session_table_to_db.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""add session table to db + +Revision ID: c381b21cb7e4 +Revises: be2bfac3da23 +Create Date: 2022-01-25 13:56:35.069429 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = 'c381b21cb7e4' +down_revision = 'be2bfac3da23' +branch_labels = None +depends_on = None + +TABLE_NAME = 'session' + + +def upgrade(): + """Apply add session table to db""" + op.create_table( + TABLE_NAME, + sa.Column('id', sa.Integer()), + sa.Column('session_id', sa.String(255)), + sa.Column('data', sa.LargeBinary()), + sa.Column('expiry', sa.DateTime()), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('session_id'), + ) + + +def downgrade(): + """Unapply add session table to db""" + op.drop_table(TABLE_NAME) diff --git a/airflow/utils/db.py b/airflow/utils/db.py index 023f482790d00..c038d661286cb 100644 --- a/airflow/utils/db.py +++ b/airflow/utils/db.py @@ -954,9 +954,12 @@ def drop_airflow_models(connection): users.drop(settings.engine, checkfirst=True) dag_stats = Table('dag_stats', Base.metadata) dag_stats.drop(settings.engine, checkfirst=True) + session = Table('session', Base.metadata) + session.drop(settings.engine, checkfirst=True) Base.metadata.drop_all(connection) # we remove the Tables here so that if resetdb is run metadata does not keep the old tables. + Base.metadata.remove(session) Base.metadata.remove(dag_stats) Base.metadata.remove(users) Base.metadata.remove(user) diff --git a/airflow/www/app.py b/airflow/www/app.py index 2de041ba25966..16780cb7090cf 100644 --- a/airflow/www/app.py +++ b/airflow/www/app.py @@ -36,7 +36,7 @@ from airflow.www.extensions.init_manifest_files import configure_manifest_files from airflow.www.extensions.init_robots import init_robots from airflow.www.extensions.init_security import init_api_experimental_auth, init_xframe_protection -from airflow.www.extensions.init_session import init_airflow_session_interface, init_permanent_session +from airflow.www.extensions.init_session import init_airflow_session_interface from airflow.www.extensions.init_views import ( init_api_connexion, init_api_experimental, @@ -135,7 +135,6 @@ def create_app(config=None, testing=False): init_jinja_globals(flask_app) init_xframe_protection(flask_app) - init_permanent_session(flask_app) init_airflow_session_interface(flask_app) return flask_app diff --git a/airflow/www/extensions/init_session.py b/airflow/www/extensions/init_session.py index 06e0ba5396339..7a09de7de6436 100644 --- a/airflow/www/extensions/init_session.py +++ b/airflow/www/extensions/init_session.py @@ -15,33 +15,46 @@ # specific language governing permissions and limitations # under the License. -from flask import request, session as flask_session -from flask.sessions import SecureCookieSessionInterface +from flask import session as builtin_flask_session - -class AirflowSessionInterface(SecureCookieSessionInterface): - """ - Airflow cookie session interface. - Modifications of sessions should be done here because - the change here is global. 
- """ - - def save_session(self, *args, **kwargs): - """Prevent creating session from REST API requests.""" - if request.blueprint == '/api/v1': - return None - return super().save_session(*args, **kwargs) - - -def init_permanent_session(app): - """Make session permanent to allows us to store data""" - - def make_session_permanent(): - flask_session.permanent = True - - app.before_request(make_session_permanent) +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.www.session import AirflowDatabaseSessionInterface, AirflowSecureCookieSessionInterface def init_airflow_session_interface(app): """Set airflow session interface""" - app.session_interface = AirflowSessionInterface() + config = app.config.copy() + selected_backend = conf.get('webserver', 'SESSION_BACKEND') + # A bit of a misnomer - normally cookies expire whenever the browser is closed + # or when they hit their expiry datetime, whichever comes first. "Permanent" + # cookies only expire when they hit their expiry datetime, and can outlive + # the browser being closed. + permanent_cookie = config.get('SESSION_PERMANENT', True) + + if selected_backend == 'securecookie': + app.session_interface = AirflowSecureCookieSessionInterface() + if permanent_cookie: + + def make_session_permanent(): + builtin_flask_session.permanent = True + + app.before_request(make_session_permanent) + elif selected_backend == 'database': + app.session_interface = AirflowDatabaseSessionInterface( + app=app, + db=None, + permanent=permanent_cookie, + # Typically these would be configurable with Flask-Session, + # but we will set them explicitly instead as they don't make + # sense to have configurable in Airflow's use case + table='session', + key_prefix='', + use_signer=True, + ) + else: + raise AirflowConfigException( + "Unrecognized session backend specified in " + f"web_server_session_backend: '{selected_backend}'. Please set " + "this to either 'database' or 'securecookie'." + ) diff --git a/airflow/www/session.py b/airflow/www/session.py new file mode 100644 index 0000000000000..4092565b385a2 --- /dev/null +++ b/airflow/www/session.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from flask import request +from flask.sessions import SecureCookieSessionInterface +from flask_session.sessions import SqlAlchemySessionInterface + + +class SesssionExemptMixin: + """Exempt certain blueprints/paths from autogenerated sessions""" + + def save_session(self, *args, **kwargs): + """Prevent creating session from REST API and health requests.""" + if request.blueprint == '/api/v1': + return None + if request.path == '/health': + return None + return super().save_session(*args, **kwargs) + + +class AirflowDatabaseSessionInterface(SesssionExemptMixin, SqlAlchemySessionInterface): + """Session interface that exempts some routes and stores session data in the database""" + + +class AirflowSecureCookieSessionInterface(SesssionExemptMixin, SecureCookieSessionInterface): + """Session interface that exempts some routes and stores session data in a signed cookie""" diff --git a/docs/apache-airflow/migrations-ref.rst b/docs/apache-airflow/migrations-ref.rst index 016c6243df6a2..8dc1a55922ef4 100644 --- a/docs/apache-airflow/migrations-ref.rst +++ b/docs/apache-airflow/migrations-ref.rst @@ -23,7 +23,9 @@ Here's the list of all the Database Migrations that are executed via when you ru +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ | Revision ID | Revises ID | Airflow Version | Description | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ -| ``be2bfac3da23`` (head) | ``7b2661a43ba3`` | ``2.2.3`` | Add has_import_errors column to DagModel | +| ``c381b21cb7e4`` (head) | ``be2bfac3da23`` | ``2.2.4`` | Create a ``session`` table to store web session data | ++--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ +| ``be2bfac3da23`` | ``7b2661a43ba3`` | ``2.2.3`` | Add has_import_errors column to DagModel | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ | ``7b2661a43ba3`` | ``142555e44c17`` | ``2.2.0`` | Change ``TaskInstance`` and ``TaskReschedule`` tables from execution_date to run_id. | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 5d77e299558b9..ed114b67bae7d 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -1222,6 +1222,7 @@ sdk secretRef secretRefs securable +securecookie securityManager seealso seedlist diff --git a/setup.cfg b/setup.cfg index 7ab5c77c0688d..8e36d06f43a09 100644 --- a/setup.cfg +++ b/setup.cfg @@ -107,6 +107,9 @@ install_requires = flask-appbuilder>=3.3.4, <4.0.0 flask-caching>=1.5.0, <2.0.0 flask-login>=0.3, <0.5 + # Strict upper-bound on the latest release of flask-session, + # as any schema changes will require a migration. 
+ flask-session>=0.3.1, <=0.4.0 flask-wtf>=0.14.3, <0.15 graphviz>=0.12 gunicorn>=20.1.0 diff --git a/tests/api_connexion/conftest.py b/tests/api_connexion/conftest.py index cc92733642d46..9b37b52e1da1e 100644 --- a/tests/api_connexion/conftest.py +++ b/tests/api_connexion/conftest.py @@ -25,7 +25,12 @@ @pytest.fixture(scope="session") def minimal_app_for_api(): @dont_initialize_flask_app_submodules( - skip_all_except=["init_appbuilder", "init_api_experimental_auth", "init_api_connexion"] + skip_all_except=[ + "init_appbuilder", + "init_api_experimental_auth", + "init_api_connexion", + "init_airflow_session_interface", + ] ) def factory(): with conf_vars({("api", "auth_backend"): "tests.test_utils.remote_user_api_auth_backend"}): diff --git a/tests/api_connexion/test_security.py b/tests/api_connexion/test_security.py index 244a8a2c356e8..68f6d31d99b9f 100644 --- a/tests/api_connexion/test_security.py +++ b/tests/api_connexion/test_security.py @@ -45,3 +45,7 @@ def setup_attrs(self, configured_app) -> None: def test_session_not_created_on_api_request(self): self.client.get("api/v1/dags", environ_overrides={'REMOTE_USER': "test"}) assert all(cookie.name != "session" for cookie in self.client.cookie_jar) + + def test_session_not_created_on_health_endpoint_request(self): + self.client.get("health") + assert all(cookie.name != "session" for cookie in self.client.cookie_jar) diff --git a/tests/test_utils/decorators.py b/tests/test_utils/decorators.py index d08d159485780..949df63683a37 100644 --- a/tests/test_utils/decorators.py +++ b/tests/test_utils/decorators.py @@ -42,7 +42,7 @@ def no_op(*args, **kwargs): "sync_appbuilder_roles", "init_jinja_globals", "init_xframe_protection", - "init_permanent_session", + "init_airflow_session_interface", "init_appbuilder", ] diff --git a/tests/utils/test_db.py b/tests/utils/test_db.py index 601dc6f9fe9da..27fa67b0b19de 100644 --- a/tests/utils/test_db.py +++ b/tests/utils/test_db.py @@ -74,6 +74,9 @@ def test_database_schema_and_sqlalchemy_model_are_in_sync(self): lambda t: (t[0] == 'remove_table' and t[1].name == 'spt_fallback_usg'), lambda t: (t[0] == 'remove_table' and t[1].name == 'MSreplication_options'), lambda t: (t[0] == 'remove_table' and t[1].name == 'spt_fallback_dev'), + # Ignore flask-session table/index + lambda t: (t[0] == 'remove_table' and t[1].name == 'session'), + lambda t: (t[0] == 'remove_index' and t[1].name == 'session_id'), ] for ignore in ignores: diff = [d for d in diff if not ignore(d)] diff --git a/tests/www/views/conftest.py b/tests/www/views/conftest.py index 05fe1e425a60b..f95a81474a73d 100644 --- a/tests/www/views/conftest.py +++ b/tests/www/views/conftest.py @@ -55,6 +55,7 @@ def app(examples_dag_bag): "init_flash_views", "init_jinja_globals", "init_plugins", + "init_airflow_session_interface", ] ) def factory(): diff --git a/tests/www/views/test_session.py b/tests/www/views/test_session.py new file mode 100644 index 0000000000000..9fb6f364695f7 --- /dev/null +++ b/tests/www/views/test_session.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +from airflow.exceptions import AirflowConfigException +from airflow.www import app +from tests.test_utils.config import conf_vars +from tests.test_utils.decorators import dont_initialize_flask_app_submodules + + +def test_session_cookie_created_on_login(user_client): + assert any(cookie.name == 'session' for cookie in user_client.cookie_jar) + + +def test_session_inaccessible_after_logout(user_client): + session_cookie = next((cookie for cookie in user_client.cookie_jar if cookie.name == 'session'), None) + assert session_cookie is not None + + resp = user_client.get('/logout/') + assert resp.status_code == 302 + + # Try to access /home with the session cookie from earlier + user_client.set_cookie('session', session_cookie.value) + user_client.get('/home/') + assert resp.status_code == 302 + + +def test_invalid_session_backend_option(): + @dont_initialize_flask_app_submodules( + skip_all_except=[ + "init_api_connexion", + "init_appbuilder", + "init_appbuilder_links", + "init_appbuilder_views", + "init_flash_views", + "init_jinja_globals", + "init_plugins", + "init_airflow_session_interface", + ] + ) + def poorly_configured_app_factory(): + with conf_vars({("webserver", "session_backend"): "invalid_value_for_session_backend"}): + return app.create_app(testing=True) + + expected_exc_regex = ( + "^Unrecognized session backend specified in web_server_session_backend: " + r"'invalid_value_for_session_backend'\. Please set this to .+\.$" + ) + with pytest.raises(AirflowConfigException, match=expected_exc_regex): + poorly_configured_app_factory() From 628aa1f99c865d97d0b1c7c76e630e43a7b8d319 Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 15 Feb 2022 11:00:26 -0700 Subject: [PATCH 240/250] Simplify trigger cancel button (#21591) Co-authored-by: Jed Cunningham (cherry picked from commit 65297673a318660fba76797e50d0c06804dfcafc) --- airflow/www/templates/airflow/trigger.html | 2 +- tests/www/views/test_views_trigger_dag.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/airflow/www/templates/airflow/trigger.html b/airflow/www/templates/airflow/trigger.html index efc1650d3533d..2388d4e319056 100644 --- a/airflow/www/templates/airflow/trigger.html +++ b/airflow/www/templates/airflow/trigger.html @@ -63,7 +63,7 @@

Trigger DAG: {{ dag_id }}

- + Cancel {% endblock %} diff --git a/tests/www/views/test_views_trigger_dag.py b/tests/www/views/test_views_trigger_dag.py index f261438595383..2b4346836d767 100644 --- a/tests/www/views/test_views_trigger_dag.py +++ b/tests/www/views/test_views_trigger_dag.py @@ -133,6 +133,10 @@ def test_trigger_dag_form(admin_client): ("javascript:alert(1)", "/home"), ("http://google.com", "/home"), ("36539'%3balert(1)%2f%2f166", "/home"), + ( + '">'.format( - expected_origin - ), - resp, - ) + check_content_in_response(f'Cancel', resp) @pytest.mark.parametrize( From dd0a3a3d768b8cb2118c0b8d89ed0af0b393d865 Mon Sep 17 00:00:00 2001 From: eladkal <45845474+eladkal@users.noreply.github.com> Date: Fri, 11 Feb 2022 10:17:18 +0200 Subject: [PATCH 241/250] update tutorial_etl_dag notes (#21503) * update tutorial_etl_dag notes (cherry picked from commit a42607a4b75586a396d6a56145ed048d127dd344) --- airflow/example_dags/tutorial_etl_dag.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/airflow/example_dags/tutorial_etl_dag.py b/airflow/example_dags/tutorial_etl_dag.py index 8dd0ea426fc42..dd18449786973 100644 --- a/airflow/example_dags/tutorial_etl_dag.py +++ b/airflow/example_dags/tutorial_etl_dag.py @@ -19,9 +19,7 @@ """ ### ETL DAG Tutorial Documentation -This ETL DAG is compatible with Airflow 1.10.x (specifically tested with 1.10.12) and is referenced -as part of the documentation that goes along with the Airflow Functional DAG tutorial located -[here](https://airflow.apache.org/tutorial_decorated_flows.html) +This ETL DAG is demonstrating an Extract -> Transform -> Load pipeline """ # [START tutorial] # [START import_module] From 436f452ab8e32bfd5997e9650d1cfc490a41b0e4 Mon Sep 17 00:00:00 2001 From: Kush <3647166+kushsharma@users.noreply.github.com> Date: Thu, 30 Dec 2021 15:56:24 +0530 Subject: [PATCH 242/250] Fix slow DAG deletion due to missing ``dag_id`` index for job table (#20282) Fixes #20249 (cherry picked from commit ac9f29da200c208bb52d412186c5a1b936eb0b5a) --- airflow/jobs/base_job.py | 1 + ...df053233_adding_index_for_dag_id_in_job.py | 43 +++++++++++++++++++ docs/apache-airflow/migrations-ref.rst | 4 +- 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py diff --git a/airflow/jobs/base_job.py b/airflow/jobs/base_job.py index 745f248fc4da0..174e4d59f372e 100644 --- a/airflow/jobs/base_job.py +++ b/airflow/jobs/base_job.py @@ -71,6 +71,7 @@ class BaseJob(Base, LoggingMixin): __table_args__ = ( Index('job_type_heart', job_type, latest_heartbeat), Index('idx_job_state_heartbeat', state, latest_heartbeat), + Index('idx_job_dag_id', dag_id), ) task_instances_enqueued = relationship( diff --git a/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py b/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py new file mode 100644 index 0000000000000..3532fe9e8df14 --- /dev/null +++ b/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""adding index for dag_id in job + +Revision ID: 587bdf053233 +Revises: f9da662e7089 +Create Date: 2021-12-14 10:20:12.482940 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = '587bdf053233' +down_revision = 'f9da662e7089' +branch_labels = None +depends_on = None + + +def upgrade(): + """Apply adding index for dag_id in job""" + op.create_index('idx_job_dag_id', 'job', ['dag_id'], unique=False) + + +def downgrade(): + """Unapply adding index for dag_id in job""" + op.drop_index('idx_job_dag_id', table_name='job') diff --git a/docs/apache-airflow/migrations-ref.rst b/docs/apache-airflow/migrations-ref.rst index 8dc1a55922ef4..0eac32999f067 100644 --- a/docs/apache-airflow/migrations-ref.rst +++ b/docs/apache-airflow/migrations-ref.rst @@ -23,7 +23,9 @@ Here's the list of all the Database Migrations that are executed via when you ru +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ | Revision ID | Revises ID | Airflow Version | Description | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ -| ``c381b21cb7e4`` (head) | ``be2bfac3da23`` | ``2.2.4`` | Create a ``session`` table to store web session data | +| ``587bdf053233`` (head) | ``f9da662e7089`` | ``2.3.0`` | Add index for ``dag_id`` column in ``job`` table. 
| ++--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ +| ``c381b21cb7e4`` | ``be2bfac3da23`` | ``2.2.4`` | Create a ``session`` table to store web session data | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ | ``be2bfac3da23`` | ``7b2661a43ba3`` | ``2.2.3`` | Add has_import_errors column to DagModel | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ From 1cbad378cb778fca879a522916c11d32d80ac84e Mon Sep 17 00:00:00 2001 From: Jed Cunningham <66968678+jedcunningham@users.noreply.github.com> Date: Tue, 15 Feb 2022 16:38:56 -0700 Subject: [PATCH 243/250] Reorder migrations to include bugfix in 2.2.4 (#21598) (cherry picked from commit 005cef042bc4184c24ad03c1b4ee40cdbaf96cb5) --- .../versions/587bdf053233_adding_index_for_dag_id_in_job.py | 4 ++-- docs/apache-airflow/migrations-ref.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py b/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py index 3532fe9e8df14..c643a6298442a 100644 --- a/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py +++ b/airflow/migrations/versions/587bdf053233_adding_index_for_dag_id_in_job.py @@ -19,7 +19,7 @@ """adding index for dag_id in job Revision ID: 587bdf053233 -Revises: f9da662e7089 +Revises: c381b21cb7e4 Create Date: 2021-12-14 10:20:12.482940 """ @@ -28,7 +28,7 @@ # revision identifiers, used by Alembic. revision = '587bdf053233' -down_revision = 'f9da662e7089' +down_revision = 'c381b21cb7e4' branch_labels = None depends_on = None diff --git a/docs/apache-airflow/migrations-ref.rst b/docs/apache-airflow/migrations-ref.rst index 0eac32999f067..cdaa447dd077a 100644 --- a/docs/apache-airflow/migrations-ref.rst +++ b/docs/apache-airflow/migrations-ref.rst @@ -23,7 +23,7 @@ Here's the list of all the Database Migrations that are executed via when you ru +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ | Revision ID | Revises ID | Airflow Version | Description | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ -| ``587bdf053233`` (head) | ``f9da662e7089`` | ``2.3.0`` | Add index for ``dag_id`` column in ``job`` table. | +| ``587bdf053233`` (head) | ``c381b21cb7e4`` | ``2.2.4`` | Add index for ``dag_id`` column in ``job`` table. 
| +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ | ``c381b21cb7e4`` | ``be2bfac3da23`` | ``2.2.4`` | Create a ``session`` table to store web session data | +--------------------------------+------------------+-----------------+---------------------------------------------------------------------------------------+ From 7e8012703cb7d79386b5c59e076a81dad60eabf3 Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Tue, 15 Feb 2022 13:56:00 -0800 Subject: [PATCH 244/250] Add note about Variable precedence with env vars (#21568) This PR updates some documentation regarding setting Airflow Variables using environment variables. Environment variables take precedence over variables defined in the UI/metastore based on this default search path list: https://github.dev/apache/airflow/blob/7864693e43c40fd8f0914c05f7e196a007d16d50/airflow/secrets/__init__.py#L29-L30 (cherry picked from commit 7a268cb3c9fc6bc03f2400c6632ff8dccf4e451e) --- docs/apache-airflow/howto/variable.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/apache-airflow/howto/variable.rst b/docs/apache-airflow/howto/variable.rst index 7cb9377bdcf15..401dcb146b4aa 100644 --- a/docs/apache-airflow/howto/variable.rst +++ b/docs/apache-airflow/howto/variable.rst @@ -62,7 +62,8 @@ You can use them in your DAGs as: Single underscores surround ``VAR``. This is in contrast with the way ``airflow.cfg`` parameters are stored, where double underscores surround the config section name. Variables set using Environment Variables would not appear in the Airflow UI but you will - be able to use them in your DAG file. + be able to use them in your DAG file. Variables set using Environment Variables will also + take precedence over variables defined in the Airflow UI. 
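A minimal sketch of that precedence (illustrative only; the variable name is hypothetical): with ``AIRFLOW_VAR_DATA_BUCKET`` exported in the environment of every Airflow component, the environment value is returned even if a ``data_bucket`` Variable also exists in the metastore:

.. code-block:: python

    # Assumes the deployment exports: AIRFLOW_VAR_DATA_BUCKET=s3://env-bucket
    from airflow.models import Variable

    # Environment variables are checked before the metastore/UI value.
    bucket = Variable.get("data_bucket")  # -> "s3://env-bucket"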
Securing Variables ------------------ From 8cbf9340ec020810b505d0ccf197435eb0e8a704 Mon Sep 17 00:00:00 2001 From: Pankaj Singh Date: Fri, 18 Feb 2022 02:25:22 +0530 Subject: [PATCH 245/250] Adding missing login provider related methods from Flask-Appbuilder (#21294) (cherry picked from commit 38894e8013b5c38468e912164f80282e3b579993) --- airflow/www/fab_security/manager.py | 15 +++++++++++++++ setup.cfg | 7 ++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/airflow/www/fab_security/manager.py b/airflow/www/fab_security/manager.py index e340c179c72f6..f5385a64771a8 100644 --- a/airflow/www/fab_security/manager.py +++ b/airflow/www/fab_security/manager.py @@ -187,6 +187,7 @@ def __init__(self, appbuilder): # Role Mapping app.config.setdefault("AUTH_ROLES_MAPPING", {}) app.config.setdefault("AUTH_ROLES_SYNC_AT_LOGIN", False) + app.config.setdefault("AUTH_API_LOGIN_ALLOW_MULTIPLE_PROVIDERS", False) # LDAP Config if self.auth_type == AUTH_LDAP: @@ -292,11 +293,21 @@ def get_roles_from_keys(self, role_keys: List[str]) -> Set[role_model]: log.warning(f"Can't find role specified in AUTH_ROLES_MAPPING: {fab_role_name}") return _roles + @property + def auth_type_provider_name(self): + provider_to_auth_type = {AUTH_DB: "db", AUTH_LDAP: "ldap"} + return provider_to_auth_type.get(self.auth_type) + @property def get_url_for_registeruser(self): """Gets the URL for Register User""" return url_for(f"{self.registeruser_view.endpoint}.{self.registeruser_view.default_view}") + @property + def get_user_datamodel(self): + """Gets the User data model""" + return self.user_view.datamodel + @property def get_register_user_datamodel(self): """Gets the Register User data model""" @@ -307,6 +318,10 @@ def builtin_roles(self): """Get the builtin roles""" return self._builtin_roles + @property + def api_login_allow_multiple_providers(self): + return self.appbuilder.get_app.config["AUTH_API_LOGIN_ALLOW_MULTIPLE_PROVIDERS"] + @property def auth_type(self): """Get the auth type""" diff --git a/setup.cfg b/setup.cfg index 8e36d06f43a09..12bdeaeb3c51e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -104,7 +104,12 @@ install_requires = # https://github.com/readthedocs/sphinx_rtd_theme/issues/1115 docutils<0.17 flask>=1.1.0, <2.0 - flask-appbuilder>=3.3.4, <4.0.0 + # We are tightly coupled with FAB version because we vendored in part of FAB code related to security manager + # This is done as part of preparation to removing FAB as dependency, but we are not ready for it yet + # Every time we update FAB version here, please make sure that you review the classes and models in + # `airflow/www/fab_security` with their upstream counterparts. In particular, make sure any breaking changes, + # for example any new methods, are accounted for. 
+ flask-appbuilder==3.4.4 flask-caching>=1.5.0, <2.0.0 flask-login>=0.3, <0.5 # Strict upper-bound on the latest release of flask-session, From 56d82fc3483f500d7a1da36019888849e8784c12 Mon Sep 17 00:00:00 2001 From: Howard Yoo <32691630+howardyoo@users.noreply.github.com> Date: Thu, 17 Feb 2022 14:01:58 -0600 Subject: [PATCH 246/250] added explaining concept of logical date in DAG run docs (#21433) (cherry picked from commit 752d53860e636ead2be7c3f2044b9b312ba86b95) --- docs/apache-airflow/concepts/dags.rst | 16 ++++++++++++++++ docs/apache-airflow/dag-run.rst | 2 ++ docs/apache-airflow/faq.rst | 7 +++++++ 3 files changed, 25 insertions(+) diff --git a/docs/apache-airflow/concepts/dags.rst b/docs/apache-airflow/concepts/dags.rst index 3edaf35d5759b..e339abeda65bc 100644 --- a/docs/apache-airflow/concepts/dags.rst +++ b/docs/apache-airflow/concepts/dags.rst @@ -157,6 +157,8 @@ The ``schedule_interval`` argument takes any value that is a valid `Crontab `. If ``schedule_interval`` is not enough to express the DAG's schedule, see :doc:`Timetables `. + For more information on ``logical date``, see :ref:`data-interval` and + :ref:`faq:what-does-execution-date-mean`. Every time you run a DAG, you are creating a new instance of that DAG which Airflow calls a :doc:`DAG Run `. DAG Runs can run in parallel for the @@ -177,6 +179,20 @@ In much the same way a DAG instantiates into a DAG Run every time it's run, Tasks specified inside a DAG are also instantiated into :ref:`Task Instances ` along with it. +A DAG run will have a start date when it starts, and end date when it ends. +This period describes the time when the DAG actually 'ran.' Aside from the DAG +run's start and end date, there is another date called *logical date* +(formally known as execution date), which describes the intended time a +DAG run is scheduled or triggered. The reason why this is called +*logical* is because of the abstract nature of it having multiple meanings, +depending on the context of the DAG run itself. + +For example, if a DAG run is manually triggered by the user, its logical date would be the +date and time of which the DAG run was triggered, and the value should be equal +to DAG run's start date. However, when the DAG is being automatically scheduled, with certain +schedule interval put in place, the logical date is going to indicate the time +at which it marks the start of the data interval, where the DAG run's start +date would then be the logical date + scheduled interval. DAG Assignment -------------- diff --git a/docs/apache-airflow/dag-run.rst b/docs/apache-airflow/dag-run.rst index 90bb404696106..62555b10ed3ab 100644 --- a/docs/apache-airflow/dag-run.rst +++ b/docs/apache-airflow/dag-run.rst @@ -84,6 +84,8 @@ scheduled one interval after ``start_date``. If ``schedule_interval`` is not enough to express your DAG's schedule, logical date, or data interval, see :doc:`/concepts/timetable`. + For more information on ``logical date``, see :ref:`concepts:dag-run` and + :ref:`faq:what-does-execution-date-mean` Re-run DAG '''''''''' diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 857e685abbda2..7f72a0d162f5f 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -214,6 +214,8 @@ This allows for a backfill on tasks that have ``depends_on_past=True`` to actually start. If this were not the case, the backfill just would not start. +.. _faq:what-does-execution-date-mean: + What does ``execution_date`` mean? 
---------------------------------- @@ -248,6 +250,11 @@ misunderstandings. Note that ``ds`` (the YYYY-MM-DD form of ``data_interval_start``) refers to *date* ***string***, not *date* ***start*** as may be confusing to some. +.. tip:: + + For more information on ``logical date``, see :ref:`data-interval` and + :ref:`concepts:dag-run`. + How to create DAGs dynamically? ------------------------------- From 969a275df02a81d1f3176ca010e565fb950e6d35 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 18 Feb 2022 01:08:42 +0100 Subject: [PATCH 247/250] Clarify pendulum use in timezone cases (#21646) It is important to use Pendulum in case timezone is used - because there are a number of limitations coming from using stdlib timezone implementation. However our documentation was not very clear about it, especially some examples shown using standard datetime in DAGs which could mislead our users to continue using datetime if they use timezone. This PR clarifies and stresses the use of pendulum is necessary when timezone is used. Also it points to the documentation in case serialization throws error about not using Pendulum so that the users can learn about the reasoning. This is the first part of the change - the follow up will be changing all provider examples to also use timezone and pendulum explicitly. See also #20070 (cherry picked from commit f011da235f705411239d992bc3c92f1c072f89a9) --- CONTRIBUTING.rst | 2 +- UPDATING.md | 6 ++--- airflow/example_dags/example_bash_operator.py | 8 +++--- .../example_branch_datetime_operator.py | 14 +++++----- .../example_branch_day_of_week_operator.py | 4 +-- airflow/example_dags/example_branch_labels.py | 7 +++-- .../example_dags/example_branch_operator.py | 5 ++-- .../example_branch_python_dop_operator_3.py | 4 +-- airflow/example_dags/example_complex.py | 4 +-- airflow/example_dags/example_dag_decorator.py | 9 +++++-- .../example_external_task_marker_dag.py | 4 +-- .../example_kubernetes_executor.py | 5 ++-- .../example_latest_only_with_trigger.py | 8 +++--- .../example_dags/example_nested_branch_dag.py | 4 +-- ...example_passing_params_via_test_command.py | 8 +++--- .../example_dags/example_python_operator.py | 5 ++-- .../example_short_circuit_operator.py | 4 +-- airflow/example_dags/example_skip_dag.py | 9 +++++-- airflow/example_dags/example_sla_dag.py | 8 +++--- airflow/example_dags/example_task_group.py | 7 +++-- .../example_task_group_decorator.py | 7 +++-- .../example_time_delta_sensor_async.py | 8 +++--- .../example_trigger_controller_dag.py | 4 +-- .../example_trigger_target_dag.py | 4 +-- airflow/example_dags/example_xcom.py | 4 +-- airflow/example_dags/example_xcomargs.py | 7 ++--- airflow/example_dags/subdags/subdag.py | 4 +-- airflow/example_dags/tutorial_etl_dag.py | 5 ++-- .../example_dags/tutorial_taskflow_api_etl.py | 10 +++++-- airflow/models/dag.py | 3 +++ airflow/serialization/serialized_objects.py | 6 ++++- docs/apache-airflow/best-practices.rst | 14 +++++----- docs/apache-airflow/concepts/dags.rst | 26 +++++++++++++------ docs/apache-airflow/concepts/operators.rst | 2 +- docs/apache-airflow/dag-run.rst | 11 ++++---- docs/apache-airflow/executor/kubernetes.rst | 5 ++-- docs/apache-airflow/faq.rst | 24 +++++++++++++---- docs/apache-airflow/howto/timetable.rst | 8 +++--- docs/apache-airflow/lineage.rst | 7 ++--- .../logging-monitoring/callbacks.rst | 8 +++--- docs/apache-airflow/timezone.rst | 19 +++++++++----- docs/apache-airflow/tutorial.rst | 15 ++++++++--- .../extending/embedding-dags/test_dag.py | 10 ++++--- 43 files changed, 216 
insertions(+), 120 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index d598742bbef46..838f658caafef 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -1376,7 +1376,7 @@ We are using certain prefixes for email subjects for different purposes. Start y Voting is governed by the rules described in `Voting `_ We are all devoting our time for community as individuals who except for being active in Apache Airflow have -families, daily jobs, right for vacation. Sometimes we are in different time zones or simply are +families, daily jobs, right for vacation. Sometimes we are in different timezones or simply are busy with day-to-day duties that our response time might be delayed. For us it's crucial to remember to respect each other in the project with no formal structure. There are no managers, departments, most of us is autonomous in our opinions, decisions. diff --git a/UPDATING.md b/UPDATING.md index 2ed4aace4f6eb..0273d5a26669e 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -184,7 +184,7 @@ Similarly, `DAG.concurrency` has been renamed to `DAG.max_active_tasks`. ```python dag = DAG( dag_id="example_dag", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, concurrency=3, ) @@ -195,7 +195,7 @@ dag = DAG( ```python dag = DAG( dag_id="example_dag", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, max_active_tasks=3, ) @@ -3216,7 +3216,7 @@ Type "help", "copyright", "credits" or "license" for more information. >>> from airflow.models.dag import DAG >>> from airflow.operators.dummy import DummyOperator >>> ->>> dag = DAG('simple_dag', start_date=datetime(2017, 9, 1)) +>>> dag = DAG('simple_dag', start_date=pendulum.datetime(2017, 9, 1, tz="UTC")) >>> >>> task = DummyOperator(task_id='task_1', dag=dag) >>> diff --git a/airflow/example_dags/example_bash_operator.py b/airflow/example_dags/example_bash_operator.py index f679f8d87532f..8204592220350 100644 --- a/airflow/example_dags/example_bash_operator.py +++ b/airflow/example_dags/example_bash_operator.py @@ -18,7 +18,9 @@ """Example DAG demonstrating the usage of the BashOperator.""" -from datetime import datetime, timedelta +import datetime + +import pendulum from airflow import DAG from airflow.operators.bash import BashOperator @@ -27,9 +29,9 @@ with DAG( dag_id='example_bash_operator', schedule_interval='0 0 * * *', - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, - dagrun_timeout=timedelta(minutes=60), + dagrun_timeout=datetime.timedelta(minutes=60), tags=['example', 'example2'], params={"example_key": "example_value"}, ) as dag: diff --git a/airflow/example_dags/example_branch_datetime_operator.py b/airflow/example_dags/example_branch_datetime_operator.py index bdc50ca43686b..76b109fb688e7 100644 --- a/airflow/example_dags/example_branch_datetime_operator.py +++ b/airflow/example_dags/example_branch_datetime_operator.py @@ -20,7 +20,7 @@ Example DAG demonstrating the usage of DateTimeBranchOperator with datetime as well as time objects as targets. 
""" -import datetime +import pendulum from airflow import DAG from airflow.operators.datetime import BranchDateTimeOperator @@ -28,7 +28,7 @@ dag = DAG( dag_id="example_branch_datetime_operator", - start_date=datetime.datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example"], schedule_interval="@daily", @@ -42,8 +42,8 @@ task_id='datetime_branch', follow_task_ids_if_true=['date_in_range'], follow_task_ids_if_false=['date_outside_range'], - target_upper=datetime.datetime(2020, 10, 10, 15, 0, 0), - target_lower=datetime.datetime(2020, 10, 10, 14, 0, 0), + target_upper=pendulum.datetime(2020, 10, 10, 15, 0, 0), + target_lower=pendulum.datetime(2020, 10, 10, 14, 0, 0), dag=dag, ) @@ -54,7 +54,7 @@ dag = DAG( dag_id="example_branch_datetime_operator_2", - start_date=datetime.datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example"], schedule_interval="@daily", @@ -67,8 +67,8 @@ task_id='datetime_branch', follow_task_ids_if_true=['date_in_range'], follow_task_ids_if_false=['date_outside_range'], - target_upper=datetime.time(0, 0, 0), - target_lower=datetime.time(15, 0, 0), + target_upper=pendulum.time(0, 0, 0), + target_lower=pendulum.time(15, 0, 0), dag=dag, ) diff --git a/airflow/example_dags/example_branch_day_of_week_operator.py b/airflow/example_dags/example_branch_day_of_week_operator.py index 6d1a33117cfb5..dae303a9035fb 100644 --- a/airflow/example_dags/example_branch_day_of_week_operator.py +++ b/airflow/example_dags/example_branch_day_of_week_operator.py @@ -19,7 +19,7 @@ """ Example DAG demonstrating the usage of BranchDayOfWeekOperator. """ -from datetime import datetime +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -27,7 +27,7 @@ with DAG( dag_id="example_weekday_branch_operator", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example"], schedule_interval="@daily", diff --git a/airflow/example_dags/example_branch_labels.py b/airflow/example_dags/example_branch_labels.py index bd6ce09819885..2215bcfe19c41 100644 --- a/airflow/example_dags/example_branch_labels.py +++ b/airflow/example_dags/example_branch_labels.py @@ -19,14 +19,17 @@ """ Example DAG demonstrating the usage of labels with different branches. 
""" -from datetime import datetime +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator from airflow.utils.edgemodifier import Label with DAG( - "example_branch_labels", schedule_interval="@daily", start_date=datetime(2021, 1, 1), catchup=False + "example_branch_labels", + schedule_interval="@daily", + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, ) as dag: ingest = DummyOperator(task_id="ingest") analyse = DummyOperator(task_id="analyze") diff --git a/airflow/example_dags/example_branch_operator.py b/airflow/example_dags/example_branch_operator.py index 69f939e9df20e..eaa1532eeef81 100644 --- a/airflow/example_dags/example_branch_operator.py +++ b/airflow/example_dags/example_branch_operator.py @@ -19,7 +19,8 @@ """Example DAG demonstrating the usage of the BranchPythonOperator.""" import random -from datetime import datetime + +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -29,7 +30,7 @@ with DAG( dag_id='example_branch_operator', - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval="@daily", tags=['example', 'example2'], diff --git a/airflow/example_dags/example_branch_python_dop_operator_3.py b/airflow/example_dags/example_branch_python_dop_operator_3.py index 09d96bea7edb0..d85eda140aedc 100644 --- a/airflow/example_dags/example_branch_python_dop_operator_3.py +++ b/airflow/example_dags/example_branch_python_dop_operator_3.py @@ -20,7 +20,7 @@ Example DAG demonstrating the usage of BranchPythonOperator with depends_on_past=True, where tasks may be run or skipped on alternating runs. """ -from datetime import datetime +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -49,7 +49,7 @@ def should_run(**kwargs): with DAG( dag_id='example_branch_dop_operator_v3', schedule_interval='*/1 * * * *', - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, default_args={'depends_on_past': True}, tags=['example'], diff --git a/airflow/example_dags/example_complex.py b/airflow/example_dags/example_complex.py index a141236cd20b6..22e1906c042dd 100644 --- a/airflow/example_dags/example_complex.py +++ b/airflow/example_dags/example_complex.py @@ -19,7 +19,7 @@ """ Example Airflow DAG that shows the complex DAG structure. """ -from datetime import datetime +import pendulum from airflow import models from airflow.models.baseoperator import chain @@ -28,7 +28,7 @@ with models.DAG( dag_id="example_complex", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=['example', 'example2', 'example3'], ) as dag: diff --git a/airflow/example_dags/example_dag_decorator.py b/airflow/example_dags/example_dag_decorator.py index 66b0fa4ab0bd5..af1438cddbbee 100644 --- a/airflow/example_dags/example_dag_decorator.py +++ b/airflow/example_dags/example_dag_decorator.py @@ -15,10 +15,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from datetime import datetime from typing import Any, Dict import httpx +import pendulum from airflow.decorators import dag, task from airflow.models.baseoperator import BaseOperator @@ -37,7 +37,12 @@ def execute(self, context): # [START dag_decorator_usage] -@dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example']) +@dag( + schedule_interval=None, + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, + tags=['example'], +) def example_dag_decorator(email: str = 'example@example.com'): """ DAG to send server IP to email. diff --git a/airflow/example_dags/example_external_task_marker_dag.py b/airflow/example_dags/example_external_task_marker_dag.py index 851a7ad71ca54..eed2f727fc317 100644 --- a/airflow/example_dags/example_external_task_marker_dag.py +++ b/airflow/example_dags/example_external_task_marker_dag.py @@ -37,13 +37,13 @@ exception """ -import datetime +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator from airflow.sensors.external_task import ExternalTaskMarker, ExternalTaskSensor -start_date = datetime.datetime(2015, 1, 1) +start_date = pendulum.datetime(2021, 1, 1, tz="UTC") with DAG( dag_id="example_external_task_marker_parent", diff --git a/airflow/example_dags/example_kubernetes_executor.py b/airflow/example_dags/example_kubernetes_executor.py index f984909cfccb1..6318d51af1fe9 100644 --- a/airflow/example_dags/example_kubernetes_executor.py +++ b/airflow/example_dags/example_kubernetes_executor.py @@ -20,7 +20,8 @@ """ import logging import os -from datetime import datetime + +import pendulum from airflow import DAG from airflow.configuration import conf @@ -45,7 +46,7 @@ with DAG( dag_id='example_kubernetes_executor', schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=['example3'], ) as dag: diff --git a/airflow/example_dags/example_latest_only_with_trigger.py b/airflow/example_dags/example_latest_only_with_trigger.py index 76b5f630c7d9d..67f004aef38f7 100644 --- a/airflow/example_dags/example_latest_only_with_trigger.py +++ b/airflow/example_dags/example_latest_only_with_trigger.py @@ -20,7 +20,9 @@ """ # [START example] -import datetime as dt +import datetime + +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -29,8 +31,8 @@ with DAG( dag_id='latest_only_with_trigger', - schedule_interval=dt.timedelta(hours=4), - start_date=dt.datetime(2021, 1, 1), + schedule_interval=datetime.timedelta(hours=4), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=['example3'], ) as dag: diff --git a/airflow/example_dags/example_nested_branch_dag.py b/airflow/example_dags/example_nested_branch_dag.py index add81a9fd692d..27e71054d1176 100644 --- a/airflow/example_dags/example_nested_branch_dag.py +++ b/airflow/example_dags/example_nested_branch_dag.py @@ -21,7 +21,7 @@ ``none_failed_min_one_success`` trigger rule such that they are skipped whenever their corresponding ``BranchPythonOperator`` are skipped. 
""" -from datetime import datetime +import pendulum from airflow.models import DAG from airflow.operators.dummy import DummyOperator @@ -30,7 +30,7 @@ with DAG( dag_id="example_nested_branch_dag", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval="@daily", tags=["example"], diff --git a/airflow/example_dags/example_passing_params_via_test_command.py b/airflow/example_dags/example_passing_params_via_test_command.py index d4781afab8d98..f97f941db0d3b 100644 --- a/airflow/example_dags/example_passing_params_via_test_command.py +++ b/airflow/example_dags/example_passing_params_via_test_command.py @@ -18,10 +18,12 @@ """Example DAG demonstrating the usage of the params arguments in templated arguments.""" +import datetime import os -from datetime import datetime, timedelta from textwrap import dedent +import pendulum + from airflow import DAG from airflow.decorators import task from airflow.operators.bash import BashOperator @@ -61,9 +63,9 @@ def print_env_vars(test_mode=None): with DAG( "example_passing_params_via_test_command", schedule_interval='*/1 * * * *', - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, - dagrun_timeout=timedelta(minutes=4), + dagrun_timeout=datetime.timedelta(minutes=4), tags=['example'], ) as dag: run_this = my_py_command(params={"miff": "agg"}) diff --git a/airflow/example_dags/example_python_operator.py b/airflow/example_dags/example_python_operator.py index d533d84506af1..0f9a7fc476acb 100644 --- a/airflow/example_dags/example_python_operator.py +++ b/airflow/example_dags/example_python_operator.py @@ -23,9 +23,10 @@ import logging import shutil import time -from datetime import datetime from pprint import pprint +import pendulum + from airflow import DAG from airflow.decorators import task @@ -34,7 +35,7 @@ with DAG( dag_id='example_python_operator', schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=['example'], ) as dag: diff --git a/airflow/example_dags/example_short_circuit_operator.py b/airflow/example_dags/example_short_circuit_operator.py index d349685eaea99..4c1187aee3465 100644 --- a/airflow/example_dags/example_short_circuit_operator.py +++ b/airflow/example_dags/example_short_circuit_operator.py @@ -17,7 +17,7 @@ # under the License. 
"""Example DAG demonstrating the usage of the ShortCircuitOperator.""" -from datetime import datetime +import pendulum from airflow import DAG from airflow.models.baseoperator import chain @@ -26,7 +26,7 @@ with DAG( dag_id='example_short_circuit_operator', - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=['example'], ) as dag: diff --git a/airflow/example_dags/example_skip_dag.py b/airflow/example_dags/example_skip_dag.py index cb664e7e1f195..0e67ed1dc91f7 100644 --- a/airflow/example_dags/example_skip_dag.py +++ b/airflow/example_dags/example_skip_dag.py @@ -18,7 +18,7 @@ """Example DAG demonstrating the DummyOperator and a custom DummySkipOperator which skips by default.""" -from datetime import datetime +import pendulum from airflow import DAG from airflow.exceptions import AirflowSkipException @@ -54,6 +54,11 @@ def create_test_pipeline(suffix, trigger_rule): join >> final -with DAG(dag_id='example_skip_dag', start_date=datetime(2021, 1, 1), catchup=False, tags=['example']) as dag: +with DAG( + dag_id='example_skip_dag', + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, + tags=['example'], +) as dag: create_test_pipeline('1', TriggerRule.ALL_SUCCESS) create_test_pipeline('2', TriggerRule.ONE_SUCCESS) diff --git a/airflow/example_dags/example_sla_dag.py b/airflow/example_dags/example_sla_dag.py index 7a46bc4ec118e..0db6bc1ba7fcc 100644 --- a/airflow/example_dags/example_sla_dag.py +++ b/airflow/example_dags/example_sla_dag.py @@ -15,8 +15,10 @@ # specific language governing permissions and limitations # under the License. +import datetime import time -from datetime import datetime, timedelta + +import pendulum from airflow.decorators import dag, task @@ -39,13 +41,13 @@ def sla_callback(dag, task_list, blocking_task_list, slas, blocking_tis): @dag( schedule_interval="*/2 * * * *", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, sla_miss_callback=sla_callback, default_args={'email': "email@example.com"}, ) def example_sla_dag(): - @task(sla=timedelta(seconds=10)) + @task(sla=datetime.timedelta(seconds=10)) def sleep_20(): """Sleep for 20 seconds""" time.sleep(20) diff --git a/airflow/example_dags/example_task_group.py b/airflow/example_dags/example_task_group.py index d81bf007bab58..46f709eaf873b 100644 --- a/airflow/example_dags/example_task_group.py +++ b/airflow/example_dags/example_task_group.py @@ -17,7 +17,7 @@ # under the License. 
"""Example DAG demonstrating the usage of the TaskGroup.""" -from datetime import datetime +import pendulum from airflow.models.dag import DAG from airflow.operators.bash import BashOperator @@ -26,7 +26,10 @@ # [START howto_task_group] with DAG( - dag_id="example_task_group", start_date=datetime(2021, 1, 1), catchup=False, tags=["example"] + dag_id="example_task_group", + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, + tags=["example"], ) as dag: start = DummyOperator(task_id="start") diff --git a/airflow/example_dags/example_task_group_decorator.py b/airflow/example_dags/example_task_group_decorator.py index 0e53a98ea4376..30f9d6f1ab2ca 100644 --- a/airflow/example_dags/example_task_group_decorator.py +++ b/airflow/example_dags/example_task_group_decorator.py @@ -18,7 +18,7 @@ """Example DAG demonstrating the usage of the @taskgroup decorator.""" -from datetime import datetime +import pendulum from airflow.decorators import task, task_group from airflow.models.dag import DAG @@ -65,7 +65,10 @@ def task_group_function(value): # Executing Tasks and TaskGroups with DAG( - dag_id="example_task_group_decorator", start_date=datetime(2021, 1, 1), catchup=False, tags=["example"] + dag_id="example_task_group_decorator", + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, + tags=["example"], ) as dag: start_task = task_start() end_task = task_end() diff --git a/airflow/example_dags/example_time_delta_sensor_async.py b/airflow/example_dags/example_time_delta_sensor_async.py index ce8cab005e64a..1a7126a22627a 100644 --- a/airflow/example_dags/example_time_delta_sensor_async.py +++ b/airflow/example_dags/example_time_delta_sensor_async.py @@ -21,7 +21,9 @@ defers and doesn't occupy a worker slot while it waits """ -from datetime import datetime, timedelta +import datetime + +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -30,10 +32,10 @@ with DAG( dag_id="example_time_delta_sensor_async", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example"], ) as dag: - wait = TimeDeltaSensorAsync(task_id="wait", delta=timedelta(seconds=10)) + wait = TimeDeltaSensorAsync(task_id="wait", delta=datetime.timedelta(seconds=10)) finish = DummyOperator(task_id="finish") wait >> finish diff --git a/airflow/example_dags/example_trigger_controller_dag.py b/airflow/example_dags/example_trigger_controller_dag.py index 27df3d2651007..a017c9a5b4176 100644 --- a/airflow/example_dags/example_trigger_controller_dag.py +++ b/airflow/example_dags/example_trigger_controller_dag.py @@ -21,14 +21,14 @@ 1. 1st DAG (example_trigger_controller_dag) holds a TriggerDagRunOperator, which will trigger the 2nd DAG 2. 2nd DAG (example_trigger_target_dag) which will be triggered by the TriggerDagRunOperator in the 1st DAG """ -from datetime import datetime +import pendulum from airflow import DAG from airflow.operators.trigger_dagrun import TriggerDagRunOperator with DAG( dag_id="example_trigger_controller_dag", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval="@once", tags=['example'], diff --git a/airflow/example_dags/example_trigger_target_dag.py b/airflow/example_dags/example_trigger_target_dag.py index 41aecf1a1b613..64ccb59e0348d 100644 --- a/airflow/example_dags/example_trigger_target_dag.py +++ b/airflow/example_dags/example_trigger_target_dag.py @@ -21,7 +21,7 @@ 1. 
1st DAG (example_trigger_controller_dag) holds a TriggerDagRunOperator, which will trigger the 2nd DAG 2. 2nd DAG (example_trigger_target_dag) which will be triggered by the TriggerDagRunOperator in the 1st DAG """ -from datetime import datetime +import pendulum from airflow import DAG from airflow.decorators import task @@ -41,7 +41,7 @@ def run_this_func(dag_run=None): with DAG( dag_id="example_trigger_target_dag", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval=None, tags=['example'], diff --git a/airflow/example_dags/example_xcom.py b/airflow/example_dags/example_xcom.py index 405d5c527d1e4..b55d4e5d667cd 100644 --- a/airflow/example_dags/example_xcom.py +++ b/airflow/example_dags/example_xcom.py @@ -17,7 +17,7 @@ # under the License. """Example DAG demonstrating the usage of XComs.""" -from datetime import datetime +import pendulum from airflow import DAG from airflow.decorators import task @@ -64,7 +64,7 @@ def pull_value_from_bash_push(ti=None): with DAG( 'example_xcom', schedule_interval="@once", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=['example'], ) as dag: diff --git a/airflow/example_dags/example_xcomargs.py b/airflow/example_dags/example_xcomargs.py index 7e0cdd901cedb..00af4725c240f 100644 --- a/airflow/example_dags/example_xcomargs.py +++ b/airflow/example_dags/example_xcomargs.py @@ -18,7 +18,8 @@ """Example DAG demonstrating the usage of the XComArgs.""" import logging -from datetime import datetime + +import pendulum from airflow import DAG from airflow.decorators import task @@ -41,7 +42,7 @@ def print_value(value, ts=None): with DAG( dag_id='example_xcom_args', - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval=None, tags=['example'], @@ -50,7 +51,7 @@ def print_value(value, ts=None): with DAG( "example_xcom_args_with_operators", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval=None, tags=['example'], diff --git a/airflow/example_dags/subdags/subdag.py b/airflow/example_dags/subdags/subdag.py index d337a03679c48..7c913099b3107 100644 --- a/airflow/example_dags/subdags/subdag.py +++ b/airflow/example_dags/subdags/subdag.py @@ -19,7 +19,7 @@ """Helper function to generate a DAG and operators given some arguments.""" # [START subdag] -from datetime import datetime +import pendulum from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -38,7 +38,7 @@ def subdag(parent_dag_name, child_dag_name, args): dag_subdag = DAG( dag_id=f'{parent_dag_name}.{child_dag_name}', default_args=args, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, schedule_interval="@daily", ) diff --git a/airflow/example_dags/tutorial_etl_dag.py b/airflow/example_dags/tutorial_etl_dag.py index dd18449786973..d039a73488c18 100644 --- a/airflow/example_dags/tutorial_etl_dag.py +++ b/airflow/example_dags/tutorial_etl_dag.py @@ -24,9 +24,10 @@ # [START tutorial] # [START import_module] import json -from datetime import datetime from textwrap import dedent +import pendulum + # The DAG object; we'll need this to instantiate a DAG from airflow import DAG @@ -45,7 +46,7 @@ # [END default_args] description='ETL DAG tutorial', schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, 
tags=['example'], ) as dag: diff --git a/airflow/example_dags/tutorial_taskflow_api_etl.py b/airflow/example_dags/tutorial_taskflow_api_etl.py index 3b0ba51a28c87..f6af78f0a5a2c 100644 --- a/airflow/example_dags/tutorial_taskflow_api_etl.py +++ b/airflow/example_dags/tutorial_taskflow_api_etl.py @@ -20,7 +20,8 @@ # [START tutorial] # [START import_module] import json -from datetime import datetime + +import pendulum from airflow.decorators import dag, task @@ -28,7 +29,12 @@ # [START instantiate_dag] -@dag(schedule_interval=None, start_date=datetime(2021, 1, 1), catchup=False, tags=['example']) +@dag( + schedule_interval=None, + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + catchup=False, + tags=['example'], +) def tutorial_taskflow_api_etl(): """ ### TaskFlow API Tutorial Documentation diff --git a/airflow/models/dag.py b/airflow/models/dag.py index 477e597b49129..150220cbc4142 100644 --- a/airflow/models/dag.py +++ b/airflow/models/dag.py @@ -187,6 +187,9 @@ class DAG(LoggingMixin): DAGs essentially act as namespaces for tasks. A task_id can only be added once to a DAG. + Note that if you plan to use time zones all the dates provided should be pendulum + dates. See :ref:`timezone_aware_dags`. + :param dag_id: The id of the DAG; must consist exclusively of alphanumeric characters, dashes, dots and underscores (all ASCII) :type dag_id: str diff --git a/airflow/serialization/serialized_objects.py b/airflow/serialization/serialized_objects.py index bc8361dcb7ca8..ebe20b03b03be 100644 --- a/airflow/serialization/serialized_objects.py +++ b/airflow/serialization/serialized_objects.py @@ -42,6 +42,7 @@ from airflow.settings import json from airflow.timetables.base import Timetable from airflow.utils.code_utils import get_python_source +from airflow.utils.docs import get_docs_url from airflow.utils.module_loading import as_importable_string, import_string from airflow.utils.task_group import TaskGroup @@ -113,7 +114,10 @@ def encode_timezone(var: Timezone) -> Union[str, int]: return var.offset if isinstance(var, Timezone): return var.name - raise ValueError(f"DAG timezone should be a pendulum.tz.Timezone, not {var!r}") + raise ValueError( + f"DAG timezone should be a pendulum.tz.Timezone, not {var!r}. " + f"See {get_docs_url('timezone.html#time-zone-aware-dags')}" + ) def decode_timezone(var: Union[str, int]) -> Timezone: diff --git a/docs/apache-airflow/best-practices.rst b/docs/apache-airflow/best-practices.rst index 951e6b4290779..3b01b3ece98a0 100644 --- a/docs/apache-airflow/best-practices.rst +++ b/docs/apache-airflow/best-practices.rst @@ -121,7 +121,7 @@ Bad example: .. code-block:: python - from datetime import datetime + import pendulum from airflow import DAG from airflow.operators.python import PythonOperator @@ -131,7 +131,7 @@ Bad example: with DAG( dag_id="example_python_operator", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example"], ) as dag: @@ -151,7 +151,7 @@ Good example: .. code-block:: python - from datetime import datetime + import pendulum from airflow import DAG from airflow.operators.python import PythonOperator @@ -159,7 +159,7 @@ Good example: with DAG( dag_id="example_python_operator", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example"], ) as dag: @@ -237,12 +237,13 @@ Then you can import and use the ``ALL_TASKS`` constant in all your DAGs like tha .. 
code-block:: python + import pendulum from my_company_utils.common import ALL_TASKS with DAG( dag_id="my_dag", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, ) as dag: for task in ALL_TASKS: @@ -486,13 +487,14 @@ This is an example test want to verify the structure of a code-generated DAG aga .. code-block:: python import datetime + import pendulum import pytest from airflow.utils.state import DagRunState from airflow.utils.types import DagRunType - DATA_INTERVAL_START = datetime.datetime(2021, 9, 13) + DATA_INTERVAL_START = pendulum.datetime(2021, 9, 13, tz="UTC") DATA_INTERVAL_END = DATA_INTERVAL_START + datetime.timedelta(days=1) TEST_DAG_ID = "my_custom_operator_dag" diff --git a/docs/apache-airflow/concepts/dags.rst b/docs/apache-airflow/concepts/dags.rst index e339abeda65bc..32d21cea19788 100644 --- a/docs/apache-airflow/concepts/dags.rst +++ b/docs/apache-airflow/concepts/dags.rst @@ -38,19 +38,22 @@ There are three ways to declare a DAG - either you can use a context manager, which will add the DAG to anything inside it implicitly:: with DAG( - "my_dag_name", start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False + "my_dag_name", start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + schedule_interval="@daily", catchup=False ) as dag: op = DummyOperator(task_id="task") Or, you can use a standard constructor, passing the dag into any operators you use:: - my_dag = DAG("my_dag_name", start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False) + my_dag = DAG("my_dag_name", start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + schedule_interval="@daily", catchup=False) op = DummyOperator(task_id="task", dag=my_dag) Or, you can use the ``@dag`` decorator to :ref:`turn a function into a DAG generator `:: - @dag(start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False) + @dag(start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + schedule_interval="@daily", catchup=False) def generate_dag(): op = DummyOperator(task_id="task") @@ -214,10 +217,11 @@ Default Arguments Often, many Operators inside a DAG need the same set of default arguments (such as their ``retries``). Rather than having to specify this individually for every Operator, you can instead pass ``default_args`` to the DAG when you create it, and it will auto-apply them to any operator tied to it:: + import pendulum with DAG( dag_id='my_dag', - start_date=datetime(2016, 1, 1), + start_date=pendulum.datetime(2016, 1, 1, tz="UTC"), schedule_interval='@daily', catchup=False, default_args={'retries': 2}, @@ -390,7 +394,7 @@ You can also combine this with the :ref:`concepts:depends-on-past` functionality .. 
code-block:: python # dags/branch_without_trigger.py - import datetime as dt + import pendulum from airflow.models import DAG from airflow.operators.dummy import DummyOperator @@ -399,7 +403,7 @@ You can also combine this with the :ref:`concepts:depends-on-past` functionality dag = DAG( dag_id="branch_without_trigger", schedule_interval="@once", - start_date=dt.datetime(2019, 2, 28), + start_date=pendulum.datetime(2019, 2, 28, tz="UTC"), ) run_this_first = DummyOperator(task_id="run_this_first", dag=dag) @@ -483,9 +487,11 @@ Dependency relationships can be applied across all tasks in a TaskGroup with the TaskGroup also supports ``default_args`` like DAG, it will overwrite the ``default_args`` in DAG level:: + import pendulum + with DAG( dag_id='dag1', - start_date=datetime(2016, 1, 1), + start_date=pendulum.datetime(2016, 1, 1, tz="UTC"), schedule_interval="@daily", catchup=False, default_args={'retries': 1}, @@ -563,9 +569,13 @@ This is especially useful if your tasks are built dynamically from configuration """ ### My great DAG """ + import pendulum dag = DAG( - "my_dag", start_date=datetime(2021, 1, 1), schedule_interval="@daily", catchup=False + "my_dag", + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + schedule_interval="@daily", + catchup=False, ) dag.doc_md = __doc__ diff --git a/docs/apache-airflow/concepts/operators.rst b/docs/apache-airflow/concepts/operators.rst index 13020f1be4ca1..30ec5ceff8b3f 100644 --- a/docs/apache-airflow/concepts/operators.rst +++ b/docs/apache-airflow/concepts/operators.rst @@ -175,7 +175,7 @@ you can pass ``render_template_as_native_obj=True`` to the DAG as follows: dag = DAG( dag_id="example_template_as_python_object", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, render_template_as_native_obj=True, ) diff --git a/docs/apache-airflow/dag-run.rst b/docs/apache-airflow/dag-run.rst index 62555b10ed3ab..1a2bbe3d6ecc4 100644 --- a/docs/apache-airflow/dag-run.rst +++ b/docs/apache-airflow/dag-run.rst @@ -113,17 +113,18 @@ in the configuration file. When turned off, the scheduler creates a DAG run only """ from airflow.models.dag import DAG from airflow.operators.bash import BashOperator - from datetime import datetime, timedelta + import datetime + import pendulum dag = DAG( "tutorial", default_args={ "depends_on_past": True, "retries": 1, - "retry_delay": timedelta(minutes=3), + "retry_delay": datetime.timedelta(minutes=3), }, - start_date=datetime(2015, 12, 1), + start_date=pendulum.datetime(2015, 12, 1, tz="UTC"), description="A simple tutorial DAG", schedule_interval="@daily", catchup=False, @@ -225,7 +226,7 @@ Example of a parameterized DAG: .. code-block:: python - from datetime import datetime + import pendulum from airflow import DAG from airflow.operators.bash import BashOperator @@ -233,7 +234,7 @@ Example of a parameterized DAG: dag = DAG( "example_parameterized_dag", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, ) diff --git a/docs/apache-airflow/executor/kubernetes.rst b/docs/apache-airflow/executor/kubernetes.rst index 341f9fea0c03c..496e3e7eadb2d 100644 --- a/docs/apache-airflow/executor/kubernetes.rst +++ b/docs/apache-airflow/executor/kubernetes.rst @@ -154,7 +154,8 @@ Here is an example of a task with both features: .. 
code-block:: python import os - from datetime import datetime + + import pendulum from airflow import DAG from airflow.decorators import task @@ -166,7 +167,7 @@ Here is an example of a task with both features: with DAG( dag_id="example_pod_template_file", schedule_interval=None, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, tags=["example3"], ) as dag: diff --git a/docs/apache-airflow/faq.rst b/docs/apache-airflow/faq.rst index 7f72a0d162f5f..6f2e778f2bfb4 100644 --- a/docs/apache-airflow/faq.rst +++ b/docs/apache-airflow/faq.rst @@ -149,7 +149,8 @@ until ``min_file_process_interval`` is reached since DAG Parser will look for mo from airflow import DAG from airflow.operators.python_operator import PythonOperator - from datetime import datetime + + import pendulum def create_dag(dag_id, schedule, dag_number, default_args): @@ -157,7 +158,12 @@ until ``min_file_process_interval`` is reached since DAG Parser will look for mo print("Hello World") print("This is DAG: {}".format(str(dag_number))) - dag = DAG(dag_id, schedule_interval=schedule, default_args=default_args) + dag = DAG( + dag_id, + schedule_interval=schedule, + default_args=default_args, + start_date=pendulum.datetime(2021, 9, 13, tz="UTC"), + ) with dag: t1 = PythonOperator(task_id="hello_world", python_callable=hello_world_py) @@ -213,6 +219,14 @@ backfill CLI command, gets overridden by the backfill's ``start_date`` commands. This allows for a backfill on tasks that have ``depends_on_past=True`` to actually start. If this were not the case, the backfill just would not start. +Using time zones +---------------- + +Creating a time zone aware datetime (e.g. DAG's ``start_date``) is quite simple. Just make sure to supply +time zone aware dates using ``pendulum``. Don't try to use the standard library +`timezone <https://docs.python.org/3/library/datetime.html#timezone-objects>`_ as it is known to +have limitations and we deliberately disallow using it in DAGs. + .. _faq:what-does-execution-date-mean: @@ -360,12 +374,12 @@ upstream task. .. code-block:: python + import pendulum + from airflow.decorators import dag, task from airflow.exceptions import AirflowException from airflow.utils.trigger_rule import TriggerRule - from datetime import datetime - @task def a_func(): @@ -379,7 +393,7 @@ upstream task. pass - @dag(schedule_interval="@once", start_date=datetime(2021, 1, 1)) + @dag(schedule_interval="@once", start_date=pendulum.datetime(2021, 1, 1, tz="UTC")) def my_dag(): a = a_func() b = b_func() diff --git a/docs/apache-airflow/howto/timetable.rst b/docs/apache-airflow/howto/timetable.rst index 1fd71028071cf..ed902fcffed19 100644 --- a/docs/apache-airflow/howto/timetable.rst +++ b/docs/apache-airflow/howto/timetable.rst @@ -69,7 +69,7 @@ file: .. code-block:: python - import datetime + import pendulum from airflow import DAG from airflow.example_dags.plugins.workday import AfterWorkdayTimetable @@ -77,7 +77,7 @@ file: with DAG( dag_id="example_after_workday_timetable_dag", - start_date=datetime.datetime(2021, 3, 10), + start_date=pendulum.datetime(2021, 3, 10, tz="UTC"), timetable=AfterWorkdayTimetable(), tags=["example", "timetable"], ) as dag: @@ -190,7 +190,7 @@ For reference, here's our plugin and DAG files in their entirety: .. 
code-block:: python - import datetime + import pendulum from airflow import DAG from airflow.example_dags.plugins.workday import AfterWorkdayTimetable @@ -199,7 +199,7 @@ For reference, here's our plugin and DAG files in their entirety: with DAG( dag_id="example_workday_timetable", - start_date=datetime.datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), timetable=AfterWorkdayTimetable(), tags=["example", "timetable"], ) as dag: diff --git a/docs/apache-airflow/lineage.rst b/docs/apache-airflow/lineage.rst index 9b8bb71a16385..3a4db94dc5615 100644 --- a/docs/apache-airflow/lineage.rst +++ b/docs/apache-airflow/lineage.rst @@ -30,7 +30,8 @@ works. .. code-block:: python - from datetime import datetime, timedelta + import datetime + import pendulum from airflow.lineage import AUTO from airflow.lineage.entities import File @@ -42,10 +43,10 @@ works. dag = DAG( dag_id="example_lineage", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), schedule_interval="0 0 * * *", catchup=False, - dagrun_timeout=timedelta(minutes=60), + dagrun_timeout=datetime.timedelta(minutes=60), ) f_final = File(url="/tmp/final") diff --git a/docs/apache-airflow/logging-monitoring/callbacks.rst b/docs/apache-airflow/logging-monitoring/callbacks.rst index 77ac594aa37b4..15bbacbabe849 100644 --- a/docs/apache-airflow/logging-monitoring/callbacks.rst +++ b/docs/apache-airflow/logging-monitoring/callbacks.rst @@ -51,7 +51,9 @@ In the following example, failures in any task call the ``task_failure_alert`` f .. code-block:: python - from datetime import datetime, timedelta + import datetime + import pendulum + from airflow import DAG from airflow.operators.dummy import DummyOperator @@ -67,8 +69,8 @@ In the following example, failures in any task call the ``task_failure_alert`` f with DAG( dag_id="example_callback", schedule_interval=None, - start_date=datetime(2021, 1, 1), - dagrun_timeout=timedelta(minutes=60), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + dagrun_timeout=datetime.timedelta(minutes=60), catchup=False, on_success_callback=None, on_failure_callback=task_failure_alert, diff --git a/docs/apache-airflow/timezone.rst b/docs/apache-airflow/timezone.rst index 32e52239f453b..bcbbd11293000 100644 --- a/docs/apache-airflow/timezone.rst +++ b/docs/apache-airflow/timezone.rst @@ -40,6 +40,7 @@ The time zone is set in ``airflow.cfg``. By default it is set to utc, but you ch an arbitrary IANA time zone, e.g. ``Europe/Amsterdam``. It is dependent on ``pendulum``, which is more accurate than ``pytz``. Pendulum is installed when you install Airflow. + Web UI ------ @@ -90,7 +91,11 @@ words if you have a default time zone setting of ``Europe/Amsterdam`` and create .. code-block:: python - dag = DAG("my_dag", start_date=datetime(2017, 1, 1), default_args={"retries": 3}) + dag = DAG( + "my_dag", + start_date=pendulum.datetime(2017, 1, 1, tz="UTC"), + default_args={"retries": 3}, + ) op = BashOperator(task_id="dummy", bash_command="Hello World!", dag=dag) print(op.retries) # 3 @@ -120,19 +125,21 @@ it is therefore important to make sure this setting is equal on all Airflow node .. note:: For more information on setting the configuration, see :doc:`howto/set-config` +.. _timezone_aware_dags: + Time zone aware DAGs -------------------- Creating a time zone aware DAG is quite simple. Just make sure to supply a time zone aware ``start_date`` -using ``pendulum``. +using ``pendulum``. 
Don't try to use the standard library +`timezone <https://docs.python.org/3/library/datetime.html#timezone-objects>`_ as it is known to +have limitations and we deliberately disallow using it in DAGs. .. code-block:: python import pendulum -    local_tz = pendulum.timezone("Europe/Amsterdam") - -    dag = DAG("my_tz_dag", start_date=datetime(2016, 1, 1, tzinfo=local_tz)) +    dag = DAG("my_tz_dag", start_date=pendulum.datetime(2016, 1, 1, tz="Europe/Amsterdam")) op = DummyOperator(task_id="dummy", dag=dag) print(dag.timezone) # @@ -170,6 +177,6 @@ Time deltas Time zone aware DAGs that use ``timedelta`` or ``relativedelta`` schedules respect daylight savings time for the start date but do not adjust for daylight savings time when scheduling subsequent runs. For example, a DAG with a start date of ``pendulum.datetime(2020, 1, 1, tz="US/Eastern")`` and a schedule interval of ``timedelta(days=1)`` will run daily at 05:00 UTC regardless of daylight savings time. diff --git a/docs/apache-airflow/tutorial.rst b/docs/apache-airflow/tutorial.rst index a8f76bc8689dd..085be42b7bd01 100644 --- a/docs/apache-airflow/tutorial.rst +++ b/docs/apache-airflow/tutorial.rst @@ -230,6 +230,14 @@ Note that when executing your script, Airflow will raise exceptions when it finds cycles in your DAG or when a dependency is referenced more than once. +Using time zones +---------------- + +Creating a time zone aware DAG is quite simple. Just make sure to supply time zone aware dates +using ``pendulum``. Don't try to use the standard library +`timezone <https://docs.python.org/3/library/datetime.html#timezone-objects>`_ as it is known to +have limitations and we deliberately disallow using it in DAGs. + Recap ----- Alright, so we have a pretty basic DAG. At this point your code should look @@ -474,7 +482,8 @@ Lets look at our DAG: .. code-block:: python - from datetime import datetime, timedelta + import datetime + import pendulum import requests from airflow.decorators import dag, task @@ -483,9 +492,9 @@ Lets look at our DAG: @dag( schedule_interval="0 0 * * *", - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, - dagrun_timeout=timedelta(minutes=60), + dagrun_timeout=datetime.timedelta(minutes=60), ) def Etl(): @task diff --git a/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py b/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py index 25ceeba6d3e8e..a12f2f65d34ff 100644 --- a/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py +++ b/docs/docker-stack/docker-examples/extending/embedding-dags/test_dag.py @@ -17,13 +17,15 @@ # under the License. 
# [START dag] """This dag only runs some simple tasks to test Airflow's task execution.""" -from datetime import datetime, timedelta +import datetime + +import pendulum from airflow.models.dag import DAG from airflow.operators.dummy import DummyOperator -now = datetime.now() -now_to_the_hour = (now - timedelta(0, 0, 0, 0, 0, 3)).replace(minute=0, second=0, microsecond=0) +now = pendulum.now(tz="UTC") +now_to_the_hour = (now - datetime.timedelta(0, 0, 0, 0, 0, 3)).replace(minute=0, second=0, microsecond=0) START_DATE = now_to_the_hour DAG_NAME = 'test_dag_v1' @@ -31,7 +33,7 @@ DAG_NAME, schedule_interval='*/10 * * * *', default_args={'depends_on_past': True}, - start_date=datetime(2021, 1, 1), + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), catchup=False, ) From eb87aeb1604a6ed1a51895f04fca2c5c7e39c223 Mon Sep 17 00:00:00 2001 From: Jed Cunningham Date: Thu, 17 Feb 2022 17:11:00 -0700 Subject: [PATCH 248/250] Add changelog for 2.2.4rc1 --- CHANGELOG.txt | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 4c9a2da4252fb..4e7e548d3a856 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,77 @@ +Airflow 2.2.4, 2022-02-22 +------------------------- + +Bug Fixes +""""""""" + +- Adding missing login provider related methods from Flask-Appbuilder (#21294) +- Fix slow DAG deletion due to missing ``dag_id`` index for job table (#20282) +- Add a session backend to store session data in the database (#21478) +- Show task status only for running dags or only for the last finished dag (#21352) +- Use compat data interval shim in log handlers (#21289) +- Fix mismatch in generated run_id and logical date of DAG run (#18707) +- Fix TriggerDagRunOperator extra link (#19410) +- Add possibility to create user in the Remote User mode (#19963) +- Avoid deadlock when rescheduling task (#21362) +- Fix the incorrect scheduling time for the first run of dag (#21011) +- Fix Scheduler crash when executing task instances of missing DAG (#20349) +- Deferred tasks does not cancel when DAG is marked fail (#20649) +- Removed duplicated dag_run join in ``Dag.get_task_instances()`` (#20591) +- Avoid unintentional data loss when deleting DAGs (#20758) +- Fix session usage in ``/rendered-k8s`` view (#21006) +- Fix ``airflow dags backfill --reset-dagruns`` errors when run twice (#21062) +- Do not set ``TaskInstance.max_tries`` in ``refresh_from_task`` (#21018) +- Don't require dag_id in body in dagrun REST API endpoint (#21024) +- Add Roles from Azure OAUTH Response in internal Security Manager (#20707) +- Allow Viewing DagRuns and TIs if a user has DAG "read" perms (#20663) +- Fix running ``airflow dags test `` results in error when run twice (#21031) +- Switch to non-vendored latest connexion library (#20910) +- Bump flask-appbuilder to ``>=3.3.4`` (#20628) +- upgrade celery to ``5.2.3`` (#19703) +- Bump croniter from ``<1.1`` to ``<1.2`` (#20489) +- Lift off upper bound for MarkupSafe (#20113) +- Avoid calling ``DAG.following_schedule()`` for ``TaskInstance.get_template_context()`` (#20486) +- Fix(standalone): Remove hardcoded Webserver port (#20429) +- Remove unnecessary logging in experimental API (#20356) +- Un-ignore DeprecationWarning (#20322) +- Deepcopying Kubernetes Secrets attributes causing issues (#20318) +- Fix(dag-dependencies): fix arrow styling (#20303) +- Adds retry on taskinstance retrieval lock (#20030) +- Correctly send timing metrics when using dogstatsd (fix schedule_delay metric) (#19973) +- Enhance 
``multiple_outputs`` inference of dict typing (#19608) +- Fixing ses email backend (#18042) + +Doc only changes +"""""""""""""""" + +- Added explaining concept of logical date in DAG run docs (#21433) +- Add note about Variable precedence with env vars (#21568) +- Update error docs to include before_send option (#21275) +- Augment xcom docs (#20755) +- Add documentation and release policy on "latest" constraints (#21093) +- Add a link to the DAG model in the Python API reference (#21060) +- Added an enum param example (#20841) +- Compare taskgroup and subdag (#20700) +- Add note about reserved ``params`` keyword (#20640) +- Improve documentation on ``Params`` (#20567) +- Fix typo in MySQL Database creation code (Set up DB docs) (#20102) +- Add requirements.txt description (#20048) +- Clean up ``default_args`` usage in docs (#19803) +- Add docker-compose explanation to conn localhost (#19076) +- Update CSV ingest code for tutorial (#18960) +- Adds Pendulum 1.x -> 2.x upgrade documentation (#18955) +- Updating explicit arg example in TaskFlow API tutorial doc (#18907) +- Adds back documentation about context usage in Python/@task (#18868) +- Clean up dynamic `start_date` values from docs (#19607) +- Docs for multiple pool slots (#20257) +- Update upgrading.rst with detailed code example of how to resolve post-upgrade warning (#19993) + +Misc +"""" + +- Deprecate some functions in the experimental API (#19931) +- Deprecate smart sensors (#20151) + Airflow 2.2.3, 2021-12-20 ------------------------- From 01b909b5f71ec1d4563be23b3c590bd4240d513d Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 18 Feb 2022 11:12:37 +0100 Subject: [PATCH 249/250] Pin Markupsafe until we are able to upgrade Flask/Jinja (#21664) Markupsafe 2.1.0 breaks with error: import name 'soft_unicode' from 'markupsafe'. This should be removed when either this issue is closed: https://github.com/pallets/markupsafe/issues/284 or when we will be able to upgrade JINJA to newer version (currently limited due to Flask and Flask Application Builder) (cherry picked from commit 366c66b8f6eddc0d22028ef494c62bb757bd8b8b) --- setup.cfg | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 12bdeaeb3c51e..8352f94cea6ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -132,7 +132,12 @@ install_requires = lazy-object-proxy lockfile>=0.12.2 markdown>=2.5.2, <4.0 - markupsafe>=1.1.1 + # Markupsafe 2.1.0 breaks with error: import name 'soft_unicode' from 'markupsafe'. + # This should be removed when either this issue is closed: + # https://github.com/pallets/markupsafe/issues/284 + # or when we will be able to upgrade JINJA to newer version (currently limited due to Flask and + # Flask Application Builder) + markupsafe>=1.1.1,<2.1.0 marshmallow-oneofschema>=2.0.1 packaging>=14.0 pendulum~=2.0 From ee9049c0566b2539a247687de05f9cffa008f871 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 18 Feb 2022 13:02:28 +0100 Subject: [PATCH 250/250] fixup! Add changelog for 2.2.4rc1 --- CHANGELOG.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 4e7e548d3a856..cb50c54639083 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -40,6 +40,7 @@ Bug Fixes - Correctly send timing metrics when using dogstatsd (fix schedule_delay metric) (#19973) - Enhance ``multiple_outputs`` inference of dict typing (#19608) - Fixing ses email backend (#18042) +- Pin Markupsafe until we are able to upgrade Flask/Jinja (#21664) Doc only changes """"""""""""""""
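The documentation changes above all converge on the same pattern: build ``start_date`` from a timezone-aware ``pendulum.datetime`` and keep plain ``datetime.timedelta`` for durations such as ``retry_delay`` and ``dagrun_timeout``. A minimal, self-contained sketch of that pattern follows; the ``dag_id`` and task names are illustrative placeholders, not taken from any file touched in this series.

.. code-block:: python

    import datetime

    import pendulum

    from airflow import DAG
    from airflow.operators.dummy import DummyOperator

    with DAG(
        dag_id="tz_aware_pattern_example",  # illustrative name, not from the patches above
        # Timezone-aware start date via pendulum; durations stay as datetime.timedelta.
        start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
        schedule_interval="@daily",
        catchup=False,
        dagrun_timeout=datetime.timedelta(minutes=60),
        default_args={"retries": 2, "retry_delay": datetime.timedelta(minutes=3)},
    ) as dag:
        DummyOperator(task_id="noop")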