diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index 3bef18058dfbb..d91415886a874 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -310,7 +310,7 @@ core: version_added: 2.3.0 type: string example: ~ - default: "regexp" + default: "glob" default_task_retries: description: | The number of retries each task is going to have by default. Can be overridden at dag or task level. diff --git a/airflow/utils/file.py b/airflow/utils/file.py index 7081113d5bd46..2e39eb7dd7b52 100644 --- a/airflow/utils/file.py +++ b/airflow/utils/file.py @@ -221,7 +221,7 @@ def _find_path_from_directory( def find_path_from_directory( base_dir_path: str | os.PathLike[str], ignore_file_name: str, - ignore_file_syntax: str = conf.get_mandatory_value("core", "DAG_IGNORE_FILE_SYNTAX", fallback="regexp"), + ignore_file_syntax: str = conf.get_mandatory_value("core", "DAG_IGNORE_FILE_SYNTAX", fallback="glob"), ) -> Generator[str, None, None]: """ Recursively search the base path for a list of file paths that should not be ignored. @@ -232,9 +232,9 @@ def find_path_from_directory( :return: a generator of file paths. 
""" - if ignore_file_syntax == "glob": + if ignore_file_syntax == "glob" or not ignore_file_syntax: return _find_path_from_directory(base_dir_path, ignore_file_name, _GlobIgnoreRule) - elif ignore_file_syntax == "regexp" or not ignore_file_syntax: + elif ignore_file_syntax == "regexp": return _find_path_from_directory(base_dir_path, ignore_file_name, _RegexpIgnoreRule) else: raise ValueError(f"Unsupported ignore_file_syntax: {ignore_file_syntax}") diff --git a/docs/apache-airflow/administration-and-deployment/modules_management.rst b/docs/apache-airflow/administration-and-deployment/modules_management.rst index dc6be49b1d43d..25adb5f333c91 100644 --- a/docs/apache-airflow/administration-and-deployment/modules_management.rst +++ b/docs/apache-airflow/administration-and-deployment/modules_management.rst @@ -125,14 +125,7 @@ for the paths that should be ignored. You do not need to have that file in any o In the example above the DAGs are only in ``my_custom_dags`` folder, the ``common_package`` should not be scanned by scheduler when searching for DAGS, so we should ignore ``common_package`` folder. You also want to ignore the ``base_dag.py`` if you keep a base DAG there that ``my_dag1.py`` and ``my_dag2.py`` derives -from. Your ``.airflowignore`` should look then like this: - -.. code-block:: none - - my_company/common_package/.* - my_company/my_custom_dags/base_dag\.py - -If ``DAG_IGNORE_FILE_SYNTAX`` is set to ``glob``, the equivalent ``.airflowignore`` file would be: +from. Your ``.airflowignore`` should look then like this (using the default ``glob`` syntax): .. code-block:: none diff --git a/docs/apache-airflow/core-concepts/dags.rst b/docs/apache-airflow/core-concepts/dags.rst index fbef745e46d34..f9dc7d64c72e0 100644 --- a/docs/apache-airflow/core-concepts/dags.rst +++ b/docs/apache-airflow/core-concepts/dags.rst @@ -712,19 +712,9 @@ configuration parameter (*added in Airflow 2.3*): ``regexp`` and ``glob``. .. 
note:: - The default ``DAG_IGNORE_FILE_SYNTAX`` is ``regexp`` to ensure backwards compatibility. + The default ``DAG_IGNORE_FILE_SYNTAX`` is ``glob`` in Airflow 3 or later (in previous versions it was ``regexp``). -For the ``regexp`` pattern syntax (the default), each line in ``.airflowignore`` -specifies a regular expression pattern, and directories or files whose names (not DAG id) -match any of the patterns would be ignored (under the hood, ``Pattern.search()`` is used -to match the pattern). Use the ``#`` character to indicate a comment; all characters -on lines starting with ``#`` will be ignored. - -As with most regexp matching in Airflow, the regexp engine is ``re2``, which explicitly -doesn't support many advanced features, please check its -`documentation <https://github.com/google/re2/wiki/Syntax>`_ for more information. - -With the ``glob`` syntax, the patterns work just like those in a ``.gitignore`` file: +With the ``glob`` syntax (the default), the patterns work just like those in a ``.gitignore`` file: * The ``*`` character will match any number of characters, except ``/`` * The ``?`` character will match any single character, except ``/`` @@ -738,15 +728,18 @@ With the ``glob`` syntax, the patterns work just like those in a ``.gitignore`` is relative to the directory level of the particular .airflowignore file itself. Otherwise the pattern may also match at any level below the .airflowignore level. -The ``.airflowignore`` file should be put in your ``DAG_FOLDER``. For example, you can prepare -a ``.airflowignore`` file using the ``regexp`` syntax with content - -.. code-block:: +For the ``regexp`` pattern syntax, each line in ``.airflowignore`` +specifies a regular expression pattern, and directories or files whose names (not DAG id) +match any of the patterns would be ignored (under the hood, ``Pattern.search()`` is used +to match the pattern). Use the ``#`` character to indicate a comment; all characters +on lines starting with ``#`` will be ignored. 
- project_a - tenant_[\d] +As with most regexp matching in Airflow, the regexp engine is ``re2``, which explicitly +doesn't support many advanced features, please check its +`documentation <https://github.com/google/re2/wiki/Syntax>`_ for more information. -Or, equivalently, in the ``glob`` syntax +The ``.airflowignore`` file should be put in your ``DAG_FOLDER``. For example, you can prepare +a ``.airflowignore`` file with the ``glob`` syntax .. code-block:: diff --git a/docs/apache-airflow/howto/dynamic-dag-generation.rst b/docs/apache-airflow/howto/dynamic-dag-generation.rst index 5d542a29320b7..9aa988f28bdb1 100644 --- a/docs/apache-airflow/howto/dynamic-dag-generation.rst +++ b/docs/apache-airflow/howto/dynamic-dag-generation.rst @@ -91,7 +91,7 @@ Then you can import and use the ``ALL_TASKS`` constant in all your DAGs like tha ... Don't forget that in this case you need to add empty ``__init__.py`` file in the ``my_company_utils`` folder -and you should add the ``my_company_utils/.*`` line to ``.airflowignore`` file (if using the regexp ignore +and you should add the ``my_company_utils/*`` line to ``.airflowignore`` file (using the default glob syntax), so that the whole folder is ignored by the scheduler when it looks for DAGs. diff --git a/newsfragments/42436.significant.rst b/newsfragments/42436.significant.rst new file mode 100644 index 0000000000000..d9dbcfc4c9f5d --- /dev/null +++ b/newsfragments/42436.significant.rst @@ -0,0 +1,7 @@ +Default ``.airflowignore`` syntax changed to ``glob`` + +The default value of the configuration option ``[core] dag_ignore_file_syntax`` has +been changed to ``glob``, which better matches the ignore file behavior of many +popular tools. + +To revert to the previous behavior, set the configuration to ``regexp``. 
diff --git a/tests/dags/.airflowignore b/tests/dags/.airflowignore index 313b04ef81cd4..7daaf22e65efc 100644 --- a/tests/dags/.airflowignore +++ b/tests/dags/.airflowignore @@ -1,3 +1,2 @@ -.*_invalid.* # Skip invalid files -subdir3 # Skip the nested subdir3 directory -# *badrule # This rule is an invalid regex. It would be warned about and skipped. +*_invalid_* # Skip invalid files +subdir3 # Skip the nested subdir3 directory diff --git a/tests/dags/subdir1/.airflowignore b/tests/dags/subdir1/.airflowignore index 8b69a752e69fb..0bfa43be300a1 100644 --- a/tests/dags/subdir1/.airflowignore +++ b/tests/dags/subdir1/.airflowignore @@ -1 +1 @@ -.*_ignore_this.py # Ignore files ending with "_ignore_this.py" +*_ignore_this.py # Ignore files ending with "_ignore_this.py" diff --git a/tests/plugins/test_plugin_ignore.py b/tests/plugins/test_plugin_ignore.py index d995fabd080f8..92951304d2b9f 100644 --- a/tests/plugins/test_plugin_ignore.py +++ b/tests/plugins/test_plugin_ignore.py @@ -77,7 +77,7 @@ def test_find_not_should_ignore_path_regexp(self, tmp_path): "test_load_sub1.py", } ignore_list_file = ".airflowignore" - for file_path in find_path_from_directory(plugin_folder_path, ignore_list_file): + for file_path in find_path_from_directory(plugin_folder_path, ignore_list_file, "regexp"): file_path = Path(file_path) if file_path.is_file() and file_path.suffix == ".py": detected_files.add(file_path.name)