From 19595fcdb9b24c9e8c3b13187f9473c27b9e6cc6 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Tue, 24 Sep 2024 16:24:54 +0800 Subject: [PATCH 1/2] Change default .airflowignore syntax to glob This matches the ignore file behavior of many popular tools (e.g. Git, Docker) and is likely what most users expect without reading the documentation. The documentation is also tweaked to emphasize the new default. I also decided to get rid of some regexp examples entirely; I figure most of the users are better off either using glob, or just customize the might_contain_dag_callable hook instead. The regexp syntax parser uses re2, which is quite limiting for security reasons, and does not offer significant advantages over glob for almost all use cases. --- airflow/config_templates/config.yml | 2 +- airflow/utils/file.py | 6 ++-- .../modules_management.rst | 9 +----- docs/apache-airflow/core-concepts/dags.rst | 32 ++++++++----------- .../howto/dynamic-dag-generation.rst | 2 +- newsfragments/42436.significant.rst | 7 ++++ tests/dags/.airflowignore | 5 ++- tests/dags/subdir1/.airflowignore | 2 +- tests/plugins/test_plugin_ignore.py | 2 +- 9 files changed, 30 insertions(+), 37 deletions(-) create mode 100644 newsfragments/42436.significant.rst diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 3bef18058dfbb..d91415886a874 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -310,7 +310,7 @@ core: version_added: 2.3.0 type: string example: ~ - default: "regexp" + default: "glob" default_task_retries: description: | The number of retries each task is going to have by default. Can be overridden at dag or task level. 
diff --git a/airflow/utils/file.py b/airflow/utils/file.py index 7081113d5bd46..2e39eb7dd7b52 100644 --- a/airflow/utils/file.py +++ b/airflow/utils/file.py @@ -221,7 +221,7 @@ def _find_path_from_directory( def find_path_from_directory( base_dir_path: str | os.PathLike[str], ignore_file_name: str, - ignore_file_syntax: str = conf.get_mandatory_value("core", "DAG_IGNORE_FILE_SYNTAX", fallback="regexp"), + ignore_file_syntax: str = conf.get_mandatory_value("core", "DAG_IGNORE_FILE_SYNTAX", fallback="glob"), ) -> Generator[str, None, None]: """ Recursively search the base path for a list of file paths that should not be ignored. @@ -232,9 +232,9 @@ def find_path_from_directory( :return: a generator of file paths. """ - if ignore_file_syntax == "glob": + if ignore_file_syntax == "glob" or not ignore_file_syntax: return _find_path_from_directory(base_dir_path, ignore_file_name, _GlobIgnoreRule) - elif ignore_file_syntax == "regexp" or not ignore_file_syntax: + elif ignore_file_syntax == "regexp": return _find_path_from_directory(base_dir_path, ignore_file_name, _RegexpIgnoreRule) else: raise ValueError(f"Unsupported ignore_file_syntax: {ignore_file_syntax}") diff --git a/docs/apache-airflow/administration-and-deployment/modules_management.rst b/docs/apache-airflow/administration-and-deployment/modules_management.rst index dc6be49b1d43d..25adb5f333c91 100644 --- a/docs/apache-airflow/administration-and-deployment/modules_management.rst +++ b/docs/apache-airflow/administration-and-deployment/modules_management.rst @@ -125,14 +125,7 @@ for the paths that should be ignored. You do not need to have that file in any o In the example above the DAGs are only in ``my_custom_dags`` folder, the ``common_package`` should not be scanned by scheduler when searching for DAGS, so we should ignore ``common_package`` folder. You also want to ignore the ``base_dag.py`` if you keep a base DAG there that ``my_dag1.py`` and ``my_dag2.py`` derives -from. 
Your ``.airflowignore`` should look then like this: - -.. code-block:: none - - my_company/common_package/.* - my_company/my_custom_dags/base_dag\.py - -If ``DAG_IGNORE_FILE_SYNTAX`` is set to ``glob``, the equivalent ``.airflowignore`` file would be: +from. Your ``.airflowignore`` should look then like this (using the default ``glob`` syntax): .. code-block:: none diff --git a/docs/apache-airflow/core-concepts/dags.rst b/docs/apache-airflow/core-concepts/dags.rst index fbef745e46d34..306e5fde3bc4b 100644 --- a/docs/apache-airflow/core-concepts/dags.rst +++ b/docs/apache-airflow/core-concepts/dags.rst @@ -712,19 +712,10 @@ configuration parameter (*added in Airflow 2.3*): ``regexp`` and ``glob``. .. note:: - The default ``DAG_IGNORE_FILE_SYNTAX`` is ``regexp`` to ensure backwards compatibility. + The default ``DAG_IGNORE_FILE_SYNTAX`` is ``glob`` in Airflow 3 or later. + The default was ``regexp`` in previous Airflow versions. -For the ``regexp`` pattern syntax (the default), each line in ``.airflowignore`` -specifies a regular expression pattern, and directories or files whose names (not DAG id) -match any of the patterns would be ignored (under the hood, ``Pattern.search()`` is used -to match the pattern). Use the ``#`` character to indicate a comment; all characters -on lines starting with ``#`` will be ignored. - -As with most regexp matching in Airflow, the regexp engine is ``re2``, which explicitly -doesn't support many advanced features, please check its -`documentation <https://github.com/google/re2/wiki/Syntax>`_ for more information. 
- -With the ``glob`` syntax, the patterns work just like those in a ``.gitignore`` file: +With the ``glob`` syntax (the default), the patterns work just like those in a ``.gitignore`` file: * The ``*`` character will match any number of characters, except ``/`` * The ``?`` character will match any single character, except ``/`` @@ -738,15 +729,18 @@ With the ``glob`` syntax, the patterns work just like those in a ``.gitignore`` is relative to the directory level of the particular .airflowignore file itself. Otherwise the pattern may also match at any level below the .airflowignore level. -The ``.airflowignore`` file should be put in your ``DAG_FOLDER``. For example, you can prepare -a ``.airflowignore`` file using the ``regexp`` syntax with content - -.. code-block:: - - project_a - tenant_[\d] +For the ``regexp`` pattern syntax, each line in ``.airflowignore`` +specifies a regular expression pattern, and directories or files whose names (not DAG id) +match any of the patterns would be ignored (under the hood, ``Pattern.search()`` is used +to match the pattern). Use the ``#`` character to indicate a comment; all characters +on lines starting with ``#`` will be ignored. - project_a - tenant_[\d] +As with most regexp matching in Airflow, the regexp engine is ``re2``, which explicitly +doesn't support many advanced features, please check its +`documentation <https://github.com/google/re2/wiki/Syntax>`_ for more information. -Or, equivalently, in the ``glob`` syntax +The ``.airflowignore`` file should be put in your ``DAG_FOLDER``. For example, you can prepare +a ``.airflowignore`` file with the ``glob`` syntax .. code-block:: diff --git a/docs/apache-airflow/howto/dynamic-dag-generation.rst b/docs/apache-airflow/howto/dynamic-dag-generation.rst index 5d542a29320b7..9aa988f28bdb1 100644 --- a/docs/apache-airflow/howto/dynamic-dag-generation.rst +++ b/docs/apache-airflow/howto/dynamic-dag-generation.rst @@ -91,7 +91,7 @@ Then you can import and use the ``ALL_TASKS`` constant in all your DAGs like tha ... 
Don't forget that in this case you need to add empty ``__init__.py`` file in the ``my_company_utils`` folder -and you should add the ``my_company_utils/.*`` line to ``.airflowignore`` file (if using the regexp ignore +and you should add the ``my_company_utils/*`` line to ``.airflowignore`` file (using the default glob syntax), so that the whole folder is ignored by the scheduler when it looks for DAGs. diff --git a/newsfragments/42436.significant.rst b/newsfragments/42436.significant.rst new file mode 100644 index 0000000000000..d9dbcfc4c9f5d --- /dev/null +++ b/newsfragments/42436.significant.rst @@ -0,0 +1,7 @@ +Default ``.airflowignore`` syntax changed to ``glob`` + +The default value of the configuration ``[core] dag_ignore_file_syntax`` has +been changed to ``glob``, which better matches the ignore file behavior of many +popular tools. + +To revert to the previous behavior, set the configuration to ``regexp``. diff --git a/tests/dags/.airflowignore b/tests/dags/.airflowignore index 313b04ef81cd4..7daaf22e65efc 100644 --- a/tests/dags/.airflowignore +++ b/tests/dags/.airflowignore @@ -1,3 +1,2 @@ -.*_invalid.* # Skip invalid files -subdir3 # Skip the nested subdir3 directory -# *badrule # This rule is an invalid regex. It would be warned about and skipped. 
+*_invalid_* # Skip invalid files +subdir3 # Skip the nested subdir3 directory diff --git a/tests/dags/subdir1/.airflowignore b/tests/dags/subdir1/.airflowignore index 8b69a752e69fb..0bfa43be300a1 100644 --- a/tests/dags/subdir1/.airflowignore +++ b/tests/dags/subdir1/.airflowignore @@ -1 +1 @@ -.*_ignore_this.py # Ignore files ending with "_ignore_this.py" +*_ignore_this.py # Ignore files ending with "_ignore_this.py" diff --git a/tests/plugins/test_plugin_ignore.py b/tests/plugins/test_plugin_ignore.py index d995fabd080f8..92951304d2b9f 100644 --- a/tests/plugins/test_plugin_ignore.py +++ b/tests/plugins/test_plugin_ignore.py @@ -77,7 +77,7 @@ def test_find_not_should_ignore_path_regexp(self, tmp_path): "test_load_sub1.py", } ignore_list_file = ".airflowignore" - for file_path in find_path_from_directory(plugin_folder_path, ignore_list_file): + for file_path in find_path_from_directory(plugin_folder_path, ignore_list_file, "regexp"): file_path = Path(file_path) if file_path.is_file() and file_path.suffix == ".py": detected_files.add(file_path.name) From f08d5f74bcd16a4e5b9d3310614ae8ba556b9742 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Mon, 30 Sep 2024 12:43:35 +0800 Subject: [PATCH 2/2] Shorten sentence Co-authored-by: Shahar Epstein <60007259+shahar1@users.noreply.github.com> --- docs/apache-airflow/core-concepts/dags.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/apache-airflow/core-concepts/dags.rst b/docs/apache-airflow/core-concepts/dags.rst index 306e5fde3bc4b..f9dc7d64c72e0 100644 --- a/docs/apache-airflow/core-concepts/dags.rst +++ b/docs/apache-airflow/core-concepts/dags.rst @@ -712,8 +712,7 @@ configuration parameter (*added in Airflow 2.3*): ``regexp`` and ``glob``. .. note:: - The default ``DAG_IGNORE_FILE_SYNTAX`` is ``glob`` in Airflow 3 or later. - The default was ``regexp`` in previous Airflow versions. 
+ The default ``DAG_IGNORE_FILE_SYNTAX`` is ``glob`` in Airflow 3 or later (in previous versions it was ``regexp``). With the ``glob`` syntax (the default), the patterns work just like those in a ``.gitignore`` file: