diff --git a/airflow/api/common/trigger_dag.py b/airflow/api/common/trigger_dag.py index b18957261f3a0..44beae3f1f78c 100644 --- a/airflow/api/common/trigger_dag.py +++ b/airflow/api/common/trigger_dag.py @@ -25,6 +25,7 @@ from airflow.api_internal.internal_api_call import internal_api_call from airflow.exceptions import DagNotFound, DagRunAlreadyExists from airflow.models import DagBag, DagModel, DagRun +from airflow.models.dag_version import DagVersion from airflow.utils import timezone from airflow.utils.session import NEW_SESSION, provide_session from airflow.utils.state import DagRunState @@ -92,14 +93,14 @@ def _trigger_dag( run_conf = None if conf: run_conf = conf if isinstance(conf, dict) else json.loads(conf) - + dag_version = DagVersion.get_latest_version(dag.dag_id) dag_run = dag.create_dagrun( run_id=run_id, execution_date=execution_date, state=DagRunState.QUEUED, conf=run_conf, external_trigger=True, - dag_hash=dag_bag.dags_hash.get(dag_id), + dag_version=dag_version, data_interval=data_interval, triggered_by=triggered_by, ) diff --git a/airflow/api_connexion/endpoints/dag_run_endpoint.py b/airflow/api_connexion/endpoints/dag_run_endpoint.py index 8ebb2b44e2bb3..6a38eb27ff45c 100644 --- a/airflow/api_connexion/endpoints/dag_run_endpoint.py +++ b/airflow/api_connexion/endpoints/dag_run_endpoint.py @@ -61,6 +61,7 @@ from airflow.auth.managers.models.resource_details import DagAccessEntity from airflow.exceptions import ParamValidationError from airflow.models import DagModel, DagRun +from airflow.models.dag_version import DagVersion from airflow.timetables.base import DataInterval from airflow.utils.airflow_flask_app import get_airflow_app from airflow.utils.api_migration import mark_fastapi_migration_done @@ -341,7 +342,7 @@ def post_dag_run(*, dag_id: str, session: Session = NEW_SESSION) -> APIResponse: ) else: data_interval = dag.timetable.infer_manual_data_interval(run_after=logical_date) - + dag_version = DagVersion.get_latest_version(dag.dag_id) dag_run = dag.create_dagrun( run_type=DagRunType.MANUAL, run_id=run_id, @@ -350,7 +351,7 @@ def post_dag_run(*, dag_id: str, session: Session = NEW_SESSION) -> APIResponse: state=DagRunState.QUEUED, conf=post_body.get("conf"), external_trigger=True, - dag_hash=get_airflow_app().dag_bag.dags_hash.get(dag_id), + dag_version=dag_version, session=session, triggered_by=DagRunTriggeredByType.REST_API, ) diff --git a/airflow/dag_processing/manager.py b/airflow/dag_processing/manager.py index 0f3441a5d4d13..3bc467e2f7063 100644 --- a/airflow/dag_processing/manager.py +++ b/airflow/dag_processing/manager.py @@ -50,7 +50,6 @@ from airflow.models.dagwarning import DagWarning from airflow.models.db_callback_request import DbCallbackRequest from airflow.models.errors import ParseImportError -from airflow.models.serialized_dag import SerializedDagModel from airflow.secrets.cache import SecretCache from airflow.stats import Stats from airflow.traces.tracer import Trace, add_span @@ -539,10 +538,6 @@ def deactivate_stale_dags( if deactivated: cls.logger().info("Deactivated %i DAGs which are no longer present in file.", deactivated) - for dag_id in to_deactivate: - SerializedDagModel.remove_dag(dag_id) - cls.logger().info("Deleted DAG %s in serialized_dag table", dag_id) - def _run_parsing_loop(self): # In sync mode we want timeout=None -- wait forever until a message is received if self._async_mode: @@ -819,20 +814,10 @@ def _iter_dag_filelocs(fileloc: str) -> Iterator[str]: dag_filelocs = {full_loc for path in self._file_paths for full_loc in 
_iter_dag_filelocs(path)} - from airflow.models.dagcode import DagCode - - SerializedDagModel.remove_deleted_dags( - alive_dag_filelocs=dag_filelocs, - processor_subdir=self.get_dag_directory(), - ) DagModel.deactivate_deleted_dags( dag_filelocs, processor_subdir=self.get_dag_directory(), ) - DagCode.remove_deleted_code( - dag_filelocs, - processor_subdir=self.get_dag_directory(), - ) return True return False diff --git a/airflow/example_dags/plugins/event_listener.py b/airflow/example_dags/plugins/event_listener.py index 4b9be307c4e7e..6d9fe2ff11735 100644 --- a/airflow/example_dags/plugins/event_listener.py +++ b/airflow/example_dags/plugins/event_listener.py @@ -164,9 +164,10 @@ def on_dag_run_running(dag_run: DagRun, msg: str): """ print("Dag run in running state") queued_at = dag_run.queued_at - dag_hash_info = dag_run.dag_hash - print(f"Dag information Queued at: {queued_at} hash info: {dag_hash_info}") + version = dag_run.dag_version.version + + print(f"Dag information Queued at: {queued_at} version: {version}") # [END howto_listen_dagrun_running_task] diff --git a/airflow/jobs/scheduler_job_runner.py b/airflow/jobs/scheduler_job_runner.py index fb85a4a73cc32..ffa250fc81441 100644 --- a/airflow/jobs/scheduler_job_runner.py +++ b/airflow/jobs/scheduler_job_runner.py @@ -54,10 +54,10 @@ ) from airflow.models.backfill import Backfill from airflow.models.dag import DAG, DagModel +from airflow.models.dag_version import DagVersion from airflow.models.dagbag import DagBag from airflow.models.dagrun import DagRun from airflow.models.dagwarning import DagWarning, DagWarningType -from airflow.models.serialized_dag import SerializedDagModel from airflow.models.taskinstance import SimpleTaskInstance, TaskInstance from airflow.stats import Stats from airflow.ti_deps.dependencies_states import EXECUTION_STATES @@ -1338,7 +1338,7 @@ def _create_dag_runs(self, dag_models: Collection[DagModel], session: Session) - self.log.error("DAG '%s' not found in serialized_dag table", dag_model.dag_id) continue - dag_hash = self.dagbag.dags_hash.get(dag.dag_id) + latest_dag_version = DagVersion.get_latest_version(dag.dag_id, session=session) data_interval = dag.get_next_data_interval(dag_model) # Explicitly check if the DagRun already exists. This is an edge case @@ -1358,7 +1358,7 @@ def _create_dag_runs(self, dag_models: Collection[DagModel], session: Session) - data_interval=data_interval, external_trigger=False, session=session, - dag_hash=dag_hash, + dag_version=latest_dag_version, creating_job_id=self.job.id, triggered_by=DagRunTriggeredByType.TIMETABLE, ) @@ -1417,7 +1417,7 @@ def _create_dag_runs_asset_triggered( ) continue - dag_hash = self.dagbag.dags_hash.get(dag.dag_id) + latest_dag_version = DagVersion.get_latest_version(dag.dag_id, session=session) # Explicitly check if the DagRun already exists. This is an edge case # where a Dag Run is created but `DagModel.next_dagrun` and `DagModel.next_dagrun_create_after` @@ -1472,7 +1472,7 @@ def _create_dag_runs_asset_triggered( state=DagRunState.QUEUED, external_trigger=False, session=session, - dag_hash=dag_hash, + dag_version=latest_dag_version, creating_job_id=self.job.id, triggered_by=DagRunTriggeredByType.ASSET, ) @@ -1750,18 +1750,20 @@ def _verify_integrity_if_dag_changed(self, dag_run: DagRun, session: Session) -> Return True if we determine that DAG still exists. 
""" - latest_version = SerializedDagModel.get_latest_version_hash(dag_run.dag_id, session=session) - if dag_run.dag_hash == latest_version: + latest_dag_version = DagVersion.get_latest_version(dag_run.dag_id, session=session) + if TYPE_CHECKING: + assert latest_dag_version + if dag_run.dag_version_id == latest_dag_version.id: self.log.debug("DAG %s not changed structure, skipping dagrun.verify_integrity", dag_run.dag_id) return True - dag_run.dag_hash = latest_version - # Refresh the DAG dag_run.dag = self.dagbag.get_dag(dag_id=dag_run.dag_id, session=session) if not dag_run.dag: return False + dag_run.dag_version = latest_dag_version + # Verify integrity also takes care of session.flush dag_run.verify_integrity(session=session) return True @@ -2041,7 +2043,6 @@ def _cleanup_stale_dags(self, session: Session = NEW_SESSION) -> None: In case one of DagProcessors is stopped (in case there are multiple of them for different dag folders), its dags are never marked as inactive. - Also remove dags from SerializedDag table. Executed on schedule only if [scheduler]standalone_dag_processor is True. """ self.log.debug("Checking dags not parsed within last %s seconds.", self._dag_stale_not_seen_duration) @@ -2056,7 +2057,6 @@ def _cleanup_stale_dags(self, session: Session = NEW_SESSION) -> None: self.log.info("Found (%d) stales dags not parsed after %s.", len(stale_dags), limit_lpt) for dag in stale_dags: dag.is_active = False - SerializedDagModel.remove_dag(dag_id=dag.dag_id, session=session) session.flush() @provide_session diff --git a/airflow/migrations/versions/0047_3_0_0_add_dag_versioning.py b/airflow/migrations/versions/0047_3_0_0_add_dag_versioning.py new file mode 100644 index 0000000000000..12ffc174c750a --- /dev/null +++ b/airflow/migrations/versions/0047_3_0_0_add_dag_versioning.py @@ -0,0 +1,151 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +add dag versioning. + +Revision ID: 2b47dc6bc8df +Revises: d03e4a635aa3 +Create Date: 2024-10-09 05:44:04.670984 + +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import op +from sqlalchemy_utils import UUIDType + +from airflow.migrations.db_types import StringID +from airflow.models.base import naming_convention +from airflow.utils import timezone +from airflow.utils.sqlalchemy import UtcDateTime + +# revision identifiers, used by Alembic. 
+revision = "2b47dc6bc8df" +down_revision = "d03e4a635aa3" +branch_labels = None +depends_on = None +airflow_version = "3.0.0" + + +def upgrade(): + """Apply add dag versioning.""" + op.create_table( + "dag_version", + sa.Column("id", UUIDType(binary=False), nullable=False), + sa.Column("version_number", sa.Integer(), nullable=False), + sa.Column("version_name", StringID()), + sa.Column("dag_id", StringID(), nullable=False), + sa.Column("created_at", UtcDateTime(), nullable=False, default=timezone.utcnow), + sa.ForeignKeyConstraint( + ("dag_id",), ["dag.dag_id"], name=op.f("dag_version_dag_id_fkey"), ondelete="CASCADE" + ), + sa.PrimaryKeyConstraint("id", name=op.f("dag_version_pkey")), + sa.UniqueConstraint("dag_id", "version_number", name="dag_id_v_name_v_number_unique_constraint"), + ) + with op.batch_alter_table("dag_code", recreate="always", naming_convention=naming_convention) as batch_op: + batch_op.drop_constraint("dag_code_pkey", type_="primary") + batch_op.add_column( + sa.Column("id", UUIDType(binary=False), primary_key=True), insert_before="fileloc_hash" + ) + batch_op.create_primary_key("dag_code_pkey", ["id"]) + batch_op.add_column(sa.Column("dag_version_id", UUIDType(binary=False), nullable=False)) + batch_op.create_foreign_key( + batch_op.f("dag_code_dag_version_id_fkey"), + "dag_version", + ["dag_version_id"], + ["id"], + ondelete="CASCADE", + ) + batch_op.create_unique_constraint("dag_code_dag_version_id_uq", ["dag_version_id"]) + + with op.batch_alter_table( + "serialized_dag", recreate="always", naming_convention=naming_convention + ) as batch_op: + batch_op.drop_constraint("serialized_dag_pkey", type_="primary") + batch_op.add_column(sa.Column("id", UUIDType(binary=False), primary_key=True)) + batch_op.drop_index("idx_fileloc_hash") + batch_op.drop_column("fileloc_hash") + batch_op.drop_column("fileloc") + batch_op.create_primary_key("serialized_dag_pkey", ["id"]) + batch_op.add_column(sa.Column("dag_version_id", UUIDType(binary=False), nullable=False)) + batch_op.create_foreign_key( + batch_op.f("serialized_dag_dag_version_id_fkey"), + "dag_version", + ["dag_version_id"], + ["id"], + ondelete="CASCADE", + ) + batch_op.create_unique_constraint("serialized_dag_dag_version_id_uq", ["dag_version_id"]) + + with op.batch_alter_table("task_instance", schema=None) as batch_op: + batch_op.add_column(sa.Column("dag_version_id", UUIDType(binary=False))) + batch_op.create_foreign_key( + batch_op.f("task_instance_dag_version_id_fkey"), + "dag_version", + ["dag_version_id"], + ["id"], + ondelete="CASCADE", + ) + + with op.batch_alter_table("task_instance_history", schema=None) as batch_op: + batch_op.add_column(sa.Column("dag_version_id", UUIDType(binary=False))) + + with op.batch_alter_table("dag_run", schema=None) as batch_op: + batch_op.add_column(sa.Column("dag_version_id", UUIDType(binary=False))) + batch_op.create_foreign_key( + batch_op.f("dag_run_dag_version_id_fkey"), + "dag_version", + ["dag_version_id"], + ["id"], + ondelete="CASCADE", + ) + batch_op.drop_column("dag_hash") + + +def downgrade(): + """Unapply add dag versioning.""" + with op.batch_alter_table("task_instance_history", schema=None) as batch_op: + batch_op.drop_column("dag_version_id") + + with op.batch_alter_table("task_instance", schema=None) as batch_op: + batch_op.drop_constraint(batch_op.f("task_instance_dag_version_id_fkey"), type_="foreignkey") + batch_op.drop_column("dag_version_id") + + with op.batch_alter_table("dag_code", schema=None) as batch_op: + batch_op.drop_column("id") + 
batch_op.drop_constraint(batch_op.f("dag_code_dag_version_id_fkey"), type_="foreignkey") + batch_op.drop_column("dag_version_id") + batch_op.create_primary_key("dag_code_pkey", ["fileloc_hash"]) + + with op.batch_alter_table("serialized_dag", schema=None, naming_convention=naming_convention) as batch_op: + batch_op.drop_column("id") + batch_op.add_column(sa.Column("fileloc", sa.String(length=2000), autoincrement=False, nullable=False)) + batch_op.add_column(sa.Column("fileloc_hash", sa.BIGINT(), autoincrement=False, nullable=False)) + batch_op.create_index("idx_fileloc_hash", ["fileloc_hash"], unique=False) + batch_op.create_primary_key("serialized_dag_pkey", ["dag_id"]) + batch_op.drop_constraint(batch_op.f("serialized_dag_dag_version_id_fkey"), type_="foreignkey") + batch_op.drop_column("dag_version_id") + + with op.batch_alter_table("dag_run", schema=None) as batch_op: + batch_op.add_column(sa.Column("dag_hash", sa.String(length=32), autoincrement=False, nullable=True)) + batch_op.drop_constraint(batch_op.f("dag_run_dag_version_id_fkey"), type_="foreignkey") + batch_op.drop_column("dag_version_id") + + op.drop_table("dag_version") diff --git a/airflow/models/__init__.py b/airflow/models/__init__.py index 1ab4e5584c976..6d8803410532a 100644 --- a/airflow/models/__init__.py +++ b/airflow/models/__init__.py @@ -57,6 +57,7 @@ def import_all_models(): import airflow.models.asset import airflow.models.backfill + import airflow.models.dag_version import airflow.models.dagwarning import airflow.models.errors import airflow.models.serialized_dag diff --git a/airflow/models/backfill.py b/airflow/models/backfill.py index 648b35c5bdebe..11d677542fc0e 100644 --- a/airflow/models/backfill.py +++ b/airflow/models/backfill.py @@ -43,6 +43,7 @@ from airflow.api_connexion.exceptions import NotFound from airflow.exceptions import AirflowException from airflow.models.base import Base, StringID +from airflow.models.dag_version import DagVersion from airflow.settings import json from airflow.utils import timezone from airflow.utils.session import create_session @@ -200,7 +201,7 @@ def _create_backfill_dag_run( ) ) return - + dag_version = DagVersion.get_latest_version(dag.dag_id, session=session) dr = dag.create_dagrun( triggered_by=DagRunTriggeredByType.BACKFILL, execution_date=info.logical_date, @@ -213,6 +214,7 @@ def _create_backfill_dag_run( creating_job_id=None, session=session, backfill_id=backfill_id, + dag_version=dag_version, ) session.add( BackfillDagRun( @@ -253,7 +255,7 @@ def _create_backfill( from airflow.models.serialized_dag import SerializedDagModel with create_session() as session: - serdag = session.get(SerializedDagModel, dag_id) + serdag = session.scalar(SerializedDagModel.latest_item_select_object(dag_id)) if not serdag: raise NotFound(f"Could not find dag {dag_id}") # todo: if dag has no schedule, raise diff --git a/airflow/models/dag.py b/airflow/models/dag.py index 337fc5c8163e1..e6a67c6ad7e5e 100644 --- a/airflow/models/dag.py +++ b/airflow/models/dag.py @@ -85,7 +85,7 @@ ) from airflow.models.base import Base, StringID from airflow.models.baseoperator import BaseOperator -from airflow.models.dagcode import DagCode +from airflow.models.dag_version import DagVersion from airflow.models.dagrun import RUN_ID_REGEX, DagRun from airflow.models.taskinstance import ( Context, @@ -257,7 +257,7 @@ def _create_orm_dagrun( conf, state, run_type, - dag_hash, + dag_version, creating_job_id, data_interval, backfill_id, @@ -273,7 +273,7 @@ def _create_orm_dagrun( conf=conf, state=state, 
run_type=run_type, - dag_hash=dag_hash, + dag_version=dag_version, creating_job_id=creating_job_id, data_interval=data_interval, triggered_by=triggered_by, @@ -424,6 +424,7 @@ class DAG(TaskSDKDag, LoggingMixin): **Warning**: A fail stop dag can only have tasks with the default trigger rule ("all_success"). An exception will be thrown if any task in a fail stop dag has a non default trigger rule. :param dag_display_name: The display name of the DAG which appears on the UI. + :param version_name: The version name to use in storing the dag to the DB. """ partial: bool = False @@ -1708,7 +1709,7 @@ def create_dagrun( conf: dict | None = None, run_type: DagRunType | None = None, session: Session = NEW_SESSION, - dag_hash: str | None = None, + dag_version: DagVersion | None = None, creating_job_id: int | None = None, data_interval: tuple[datetime, datetime] | None = None, backfill_id: int | None = None, @@ -1728,7 +1729,7 @@ def create_dagrun( :param conf: Dict containing configuration/parameters to pass to the DAG :param creating_job_id: id of the job creating this DagRun :param session: database session - :param dag_hash: Hash of Serialized DAG + :param dag_version: The DagVersion object for this run :param data_interval: Data interval of the DagRun :param backfill_id: id of the backfill run if one exists """ @@ -1800,7 +1801,7 @@ def create_dagrun( conf=conf, state=state, run_type=run_type, - dag_hash=dag_hash, + dag_version=dag_version, creating_job_id=creating_job_id, backfill_id=backfill_id, data_interval=data_interval, @@ -1833,7 +1834,6 @@ def bulk_write_to_db( orm_dags = dag_op.add_dags(session=session) dag_op.update_dags(orm_dags, processor_subdir=processor_subdir, session=session) - DagCode.bulk_sync_to_db((dag.fileloc for dag in dags), session=session) asset_op = AssetModelOperation.collect(dag_op.dags) @@ -2069,6 +2069,9 @@ class DagModel(Base): NUM_DAGS_PER_DAGRUN_QUERY = airflow_conf.getint( "scheduler", "max_dagruns_to_create_per_loop", fallback=10 ) + dag_versions = relationship( + "DagVersion", back_populates="dag_model", cascade="all, delete, delete-orphan" + ) def __init__(self, **kwargs): super().__init__(**kwargs) @@ -2275,9 +2278,10 @@ def dag_ready(dag_id: str, cond: BaseAsset, statuses: dict) -> bool | None: dag_statuses = {} for dag_id, records in by_dag.items(): dag_statuses[dag_id] = {x.asset.uri: True for x in records} - ser_dags = session.scalars( - select(SerializedDagModel).where(SerializedDagModel.dag_id.in_(dag_statuses.keys())) - ).all() + ser_dags = SerializedDagModel.get_latest_serialized_dags( + dag_ids=list(dag_statuses.keys()), session=session + ) + for ser_dag in ser_dags: dag_id = ser_dag.dag_id statuses = dag_statuses[dag_id] @@ -2452,6 +2456,7 @@ def _get_or_create_dagrun( if dr: session.delete(dr) session.commit() + dag_version = DagVersion.get_latest_version(dag.dag_id, session=session) dr = dag.create_dagrun( state=DagRunState.RUNNING, execution_date=execution_date, @@ -2461,6 +2466,7 @@ def _get_or_create_dagrun( conf=conf, data_interval=data_interval, triggered_by=triggered_by, + dag_version=dag_version, ) log.info("created dagrun %s", dr) return dr diff --git a/airflow/models/dag_version.py b/airflow/models/dag_version.py new file mode 100644 index 0000000000000..92511f93f5f6f --- /dev/null +++ b/airflow/models/dag_version.py @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import uuid6 +from sqlalchemy import Column, ForeignKey, Integer, UniqueConstraint, select +from sqlalchemy.orm import relationship +from sqlalchemy_utils import UUIDType + +from airflow.models.base import Base, StringID +from airflow.utils import timezone +from airflow.utils.session import NEW_SESSION, provide_session +from airflow.utils.sqlalchemy import UtcDateTime, with_row_locks + +if TYPE_CHECKING: + from sqlalchemy.orm import Session + from sqlalchemy.sql import Select + +log = logging.getLogger(__name__) + + +class DagVersion(Base): + """Model to track the versions of DAGs in the database.""" + + __tablename__ = "dag_version" + id = Column(UUIDType(binary=False), primary_key=True, default=uuid6.uuid7) + version_number = Column(Integer, nullable=False, default=1) + version_name = Column(StringID()) + dag_id = Column(StringID(), ForeignKey("dag.dag_id", ondelete="CASCADE"), nullable=False) + dag_model = relationship("DagModel", back_populates="dag_versions") + dag_code = relationship( + "DagCode", + back_populates="dag_version", + uselist=False, + cascade="all, delete, delete-orphan", + cascade_backrefs=False, + ) + serialized_dag = relationship( + "SerializedDagModel", + back_populates="dag_version", + uselist=False, + cascade="all, delete, delete-orphan", + cascade_backrefs=False, + ) + dag_runs = relationship("DagRun", back_populates="dag_version", cascade="all, delete, delete-orphan") + task_instances = relationship("TaskInstance", back_populates="dag_version") + created_at = Column(UtcDateTime, default=timezone.utcnow) + + __table_args__ = ( + UniqueConstraint("dag_id", "version_number", name="dag_id_v_name_v_number_unique_constraint"), + ) + + def __repr__(self): + """Represent the object as a string.""" + return f"<DagVersion {self.dag_id} {self.version}>" + + @classmethod + @provide_session + def write_dag( + cls, + *, + dag_id: str, + version_name: str | None = None, + version_number: int = 1, + session: Session = NEW_SESSION, + ) -> DagVersion: + """ + Write a new DagVersion into database. + + Checks if a version of the DAG exists and increments the version number if it does. + + :param dag_id: The DAG ID. + :param version_name: The version name. + :param version_number: The version number. + :param session: The database session. + :return: The DagVersion object.
+ """ + existing_dag_version = session.scalar( + with_row_locks(cls._latest_version_select(dag_id), of=DagVersion, session=session, nowait=True) + ) + if existing_dag_version: + version_number = existing_dag_version.version_number + 1 + + dag_version = DagVersion( + dag_id=dag_id, + version_number=version_number, + version_name=version_name, + ) + log.debug("Writing DagVersion %s to the DB", dag_version) + session.add(dag_version) + # Flush is necessary here due to the unique constraint and other linked tables + session.flush() + log.debug("DagVersion %s written to the DB", dag_version) + return dag_version + + @classmethod + def _latest_version_select(cls, dag_id: str) -> Select: + """ + Get the select object to get the latest version of the DAG. + + :param dag_id: The DAG ID. + :return: The select object. + """ + return select(cls).where(cls.dag_id == dag_id).order_by(cls.created_at.desc()).limit(1) + + @classmethod + @provide_session + def get_latest_version(cls, dag_id: str, *, session: Session = NEW_SESSION) -> DagVersion | None: + """ + Get the latest version of the DAG. + + :param dag_id: The DAG ID. + :param session: The database session. + :return: The latest version of the DAG or None if not found. + """ + return session.scalar(cls._latest_version_select(dag_id)) + + @classmethod + @provide_session + def get_version( + cls, + dag_id: str, + version_number: int = 1, + *, + session: Session = NEW_SESSION, + ) -> DagVersion | None: + """ + Get the version of the DAG. + + :param dag_id: The DAG ID. + :param version_number: The version number. + :param session: The database session. + :return: The version of the DAG or None if not found. + """ + version_select_obj = ( + select(cls) + .where(cls.dag_id == dag_id, cls.version_number == version_number) + .order_by(cls.version_number.desc()) + .limit(1) + ) + return session.scalar(version_select_obj) + + @property + def version(self) -> str: + """A human-friendly representation of the version.""" + name = f"{self.version_number}" + if self.version_name: + name = f"{self.version_name}-{self.version_number}" + return name diff --git a/airflow/models/dagbag.py b/airflow/models/dagbag.py index c9ad8edaa4018..5b57c7983ea14 100644 --- a/airflow/models/dagbag.py +++ b/airflow/models/dagbag.py @@ -650,13 +650,12 @@ def _serialize_dag_capturing_errors(dag, session, processor_subdir): ) log.debug("Calling the DAG.bulk_sync_to_db method") try: + DAG.bulk_write_to_db(dags.values(), processor_subdir=processor_subdir, session=session) # Write Serialized DAGs to DB, capturing errors for dag in dags.values(): serialize_errors.extend( _serialize_dag_capturing_errors(dag, session, processor_subdir) ) - - DAG.bulk_write_to_db(dags.values(), processor_subdir=processor_subdir, session=session) except OperationalError: session.rollback() raise diff --git a/airflow/models/dagcode.py b/airflow/models/dagcode.py index 321f819999bf6..c78f6cafaa6fa 100644 --- a/airflow/models/dagcode.py +++ b/airflow/models/dagcode.py @@ -17,26 +17,30 @@ from __future__ import annotations import logging -import os import struct -from datetime import datetime -from typing import TYPE_CHECKING, Collection, Iterable +from typing import TYPE_CHECKING, Collection -from sqlalchemy import BigInteger, Column, String, Text, delete, select +import uuid6 +from sqlalchemy import BigInteger, Column, ForeignKey, String, Text, delete, select from sqlalchemy.dialects.mysql import MEDIUMTEXT +from sqlalchemy.orm import relationship from sqlalchemy.sql.expression import literal +from 
sqlalchemy_utils import UUIDType from airflow.api_internal.internal_api_call import internal_api_call -from airflow.exceptions import AirflowException, DagCodeNotFound +from airflow.configuration import conf +from airflow.exceptions import DagCodeNotFound from airflow.models.base import Base from airflow.utils import timezone -from airflow.utils.file import correct_maybe_zipped, open_maybe_zipped +from airflow.utils.file import open_maybe_zipped from airflow.utils.session import NEW_SESSION, provide_session from airflow.utils.sqlalchemy import UtcDateTime if TYPE_CHECKING: from sqlalchemy.orm import Session + from airflow.models.dag_version import DagVersion + log = logging.getLogger(__name__) @@ -50,84 +54,38 @@ class DagCode(Base): """ __tablename__ = "dag_code" - - fileloc_hash = Column(BigInteger, nullable=False, primary_key=True, autoincrement=False) + id = Column(UUIDType(binary=False), primary_key=True, default=uuid6.uuid7) + fileloc_hash = Column(BigInteger, nullable=False) fileloc = Column(String(2000), nullable=False) # The max length of fileloc exceeds the limit of indexing. last_updated = Column(UtcDateTime, nullable=False) source_code = Column(Text().with_variant(MEDIUMTEXT(), "mysql"), nullable=False) + dag_version_id = Column( + UUIDType(binary=False), ForeignKey("dag_version.id", ondelete="CASCADE"), nullable=False, unique=True + ) + dag_version = relationship("DagVersion", back_populates="dag_code", uselist=False) - def __init__(self, full_filepath: str, source_code: str | None = None): + def __init__(self, dag_version, full_filepath: str, source_code: str | None = None): + self.dag_version = dag_version self.fileloc = full_filepath self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc) self.last_updated = timezone.utcnow() self.source_code = source_code or DagCode.code(self.fileloc) - @provide_session - def sync_to_db(self, session: Session = NEW_SESSION) -> None: - """ - Write code into database. - - :param session: ORM Session - """ - self.bulk_sync_to_db([self.fileloc], session) - @classmethod @provide_session - def bulk_sync_to_db(cls, filelocs: Iterable[str], session: Session = NEW_SESSION) -> None: + def write_dag(cls, dag_version: DagVersion, fileloc: str, session: Session = NEW_SESSION) -> DagCode: """ - Write code in bulk into database. + Write code into database. 
- :param filelocs: file paths of DAGs to sync + :param fileloc: file path of DAG to sync :param session: ORM Session """ - filelocs = set(filelocs) - filelocs_to_hashes = {fileloc: DagCode.dag_fileloc_hash(fileloc) for fileloc in filelocs} - existing_orm_dag_codes = session.scalars( - select(DagCode) - .filter(DagCode.fileloc_hash.in_(filelocs_to_hashes.values())) - .with_for_update(of=DagCode) - ).all() - - if existing_orm_dag_codes: - existing_orm_dag_codes_map = { - orm_dag_code.fileloc: orm_dag_code for orm_dag_code in existing_orm_dag_codes - } - else: - existing_orm_dag_codes_map = {} - - existing_orm_dag_codes_by_fileloc_hashes = {orm.fileloc_hash: orm for orm in existing_orm_dag_codes} - existing_orm_filelocs = {orm.fileloc for orm in existing_orm_dag_codes_by_fileloc_hashes.values()} - if not existing_orm_filelocs.issubset(filelocs): - conflicting_filelocs = existing_orm_filelocs.difference(filelocs) - hashes_to_filelocs = {DagCode.dag_fileloc_hash(fileloc): fileloc for fileloc in filelocs} - message = "" - for fileloc in conflicting_filelocs: - filename = hashes_to_filelocs[DagCode.dag_fileloc_hash(fileloc)] - message += ( - f"Filename '{filename}' causes a hash collision in the " - f"database with '{fileloc}'. Please rename the file." - ) - raise AirflowException(message) - - existing_filelocs = {dag_code.fileloc for dag_code in existing_orm_dag_codes} - missing_filelocs = filelocs.difference(existing_filelocs) - - for fileloc in missing_filelocs: - orm_dag_code = DagCode(fileloc, cls._get_code_from_file(fileloc)) - session.add(orm_dag_code) - - for fileloc in existing_filelocs: - current_version = existing_orm_dag_codes_by_fileloc_hashes[filelocs_to_hashes[fileloc]] - file_mod_time = datetime.fromtimestamp( - os.path.getmtime(correct_maybe_zipped(fileloc)), tz=timezone.utc - ) - - if file_mod_time > current_version.last_updated: - orm_dag_code = existing_orm_dag_codes_map[fileloc] - orm_dag_code.last_updated = file_mod_time - orm_dag_code.source_code = cls._get_code_from_file(orm_dag_code.fileloc) - session.merge(orm_dag_code) + log.debug("Writing DAG file %s into DagCode table", fileloc) + dag_code = DagCode(dag_version, fileloc, cls._get_code_from_file(fileloc)) + session.add(dag_code) + log.debug("DAG file %s written into DagCode table", fileloc) + return dag_code @classmethod @internal_api_call @@ -170,7 +128,9 @@ def has_dag(cls, fileloc: str, session: Session = NEW_SESSION) -> bool: """ fileloc_hash = cls.dag_fileloc_hash(fileloc) return ( - session.scalars(select(literal(True)).where(cls.fileloc_hash == fileloc_hash)).one_or_none() + session.scalars( + select(literal(True)).where(cls.fileloc_hash == fileloc_hash).limit(1) + ).one_or_none() is not None ) @@ -196,14 +156,25 @@ def code(cls, fileloc, session: Session = NEW_SESSION) -> str: @staticmethod def _get_code_from_file(fileloc): - with open_maybe_zipped(fileloc, "r") as f: - code = f.read() - return code + try: + with open_maybe_zipped(fileloc, "r") as f: + code = f.read() + return code + except FileNotFoundError: + test_mode = conf.get("core", "unit_test_mode") + if test_mode: + return "source_code" + raise @classmethod @provide_session def _get_code_from_db(cls, fileloc, session: Session = NEW_SESSION) -> str: - dag_code = session.scalar(select(cls).where(cls.fileloc_hash == cls.dag_fileloc_hash(fileloc))) + dag_code = session.scalar( + select(cls) + .where(cls.fileloc_hash == cls.dag_fileloc_hash(fileloc)) + .order_by(cls.last_updated.desc()) + .limit(1) + ) if not dag_code: raise DagCodeNotFound() else: diff 
--git a/airflow/models/dagrun.py b/airflow/models/dagrun.py index 5de0466a6be0e..635cd73ccd8d7 100644 --- a/airflow/models/dagrun.py +++ b/airflow/models/dagrun.py @@ -48,6 +48,7 @@ from sqlalchemy.orm import declared_attr, joinedload, relationship, synonym, validates from sqlalchemy.sql.expression import case, false, select, true from sqlalchemy.sql.functions import coalesce +from sqlalchemy_utils import UUIDType from airflow import settings from airflow.api_internal.internal_api_call import internal_api_call @@ -59,6 +60,7 @@ from airflow.models.abstractoperator import NotMapped from airflow.models.backfill import Backfill from airflow.models.base import Base, StringID +from airflow.models.dag_version import DagVersion from airflow.models.expandinput import NotFullyPopulated from airflow.models.taskinstance import TaskInstance as TI from airflow.models.tasklog import LogTemplate @@ -144,7 +146,6 @@ class DagRun(Base, LoggingMixin): data_interval_end = Column(UtcDateTime) # When a scheduler last attempted to schedule TIs for this DagRun last_scheduling_decision = Column(UtcDateTime) - dag_hash = Column(String(32)) # Foreign key to LogTemplate. DagRun rows created prior to this column's # existence have this set to NULL. Later rows automatically populate this on # insert to point to the latest LogTemplate entry. @@ -164,6 +165,8 @@ class DagRun(Base, LoggingMixin): It's possible this could change if e.g. the dag run is cleared to be rerun, or perhaps re-backfilled. """ + dag_version_id = Column(UUIDType(binary=False), ForeignKey("dag_version.id", ondelete="CASCADE")) + dag_version = relationship("DagVersion", back_populates="dag_runs") # Remove this `if` after upgrading Sphinx-AutoAPI if not TYPE_CHECKING and "BUILDING_AIRFLOW_DOCS" in os.environ: @@ -231,11 +234,11 @@ def __init__( conf: Any | None = None, state: DagRunState | None = None, run_type: str | None = None, - dag_hash: str | None = None, creating_job_id: int | None = None, data_interval: tuple[datetime, datetime] | None = None, triggered_by: DagRunTriggeredByType | None = None, backfill_id: int | None = None, + dag_version: DagVersion | None = None, ): if data_interval is None: # Legacy: Only happen for runs created prior to Airflow 2.2. 
@@ -256,11 +259,11 @@ def __init__( else: self.queued_at = queued_at self.run_type = run_type - self.dag_hash = dag_hash self.creating_job_id = creating_job_id self.backfill_id = backfill_id self.clear_number = 0 self.triggered_by = triggered_by + self.dag_version = dag_version super().__init__() def __repr__(self): @@ -994,8 +997,9 @@ def recalculate(self) -> _UnfinishedStates: "DagRun Finished: dag_id=%s, execution_date=%s, run_id=%s, " "run_start_date=%s, run_end_date=%s, run_duration=%s, " "state=%s, external_trigger=%s, run_type=%s, " - "data_interval_start=%s, data_interval_end=%s, dag_hash=%s" + "data_interval_start=%s, data_interval_end=%s, dag_version_name=%s" ) + dagv = session.scalar(select(DagVersion).where(DagVersion.id == self.dag_version_id)) self.log.info( msg, self.dag_id, @@ -1013,7 +1017,7 @@ def recalculate(self) -> _UnfinishedStates: self.run_type, self.data_interval_start, self.data_interval_end, - self.dag_hash, + dagv.version if dagv else None, ) with Trace.start_span_from_dagrun(dagrun=self) as span: @@ -1037,7 +1041,7 @@ def recalculate(self) -> _UnfinishedStates: "run_type": str(self.run_type), "data_interval_start": str(self.data_interval_start), "data_interval_end": str(self.data_interval_end), - "dag_hash": str(self.dag_hash), + "dag_version": str(dagv.version if dagv else None), "conf": str(self.conf), } if span.is_recording(): @@ -1454,7 +1458,9 @@ def _get_task_creator( def create_ti_mapping(task: Operator, indexes: Iterable[int]) -> Iterator[dict[str, Any]]: created_counts[task.task_type] += 1 for map_index in indexes: - yield TI.insert_mapping(self.run_id, task, map_index=map_index) + yield TI.insert_mapping( + self.run_id, task, map_index=map_index, dag_version_id=self.dag_version_id + ) creator = create_ti_mapping @@ -1462,7 +1468,7 @@ def create_ti_mapping(task: Operator, indexes: Iterable[int]) -> Iterator[dict[s def create_ti(task: Operator, indexes: Iterable[int]) -> Iterator[TI]: for map_index in indexes: - ti = TI(task, run_id=self.run_id, map_index=map_index) + ti = TI(task, run_id=self.run_id, map_index=map_index, dag_version_id=self.dag_version_id) ti_mutation_hook(ti) created_counts[ti.operator] += 1 yield ti diff --git a/airflow/models/serialized_dag.py b/airflow/models/serialized_dag.py index 32be31d721e34..0d5667cd48fc9 100644 --- a/airflow/models/serialized_dag.py +++ b/airflow/models/serialized_dag.py @@ -25,14 +25,17 @@ from typing import TYPE_CHECKING, Any, Collection import sqlalchemy_jsonfield -from sqlalchemy import BigInteger, Column, Index, LargeBinary, String, and_, exc, or_, select +import uuid6 +from sqlalchemy import Column, ForeignKey, LargeBinary, String, exc, or_, select from sqlalchemy.orm import backref, foreign, relationship from sqlalchemy.sql.expression import func, literal +from sqlalchemy_utils import UUIDType from airflow.api_internal.internal_api_call import internal_api_call from airflow.exceptions import TaskNotFound from airflow.models.base import ID_LEN, Base from airflow.models.dag import DagModel +from airflow.models.dag_version import DagVersion from airflow.models.dagcode import DagCode from airflow.models.dagrun import DagRun from airflow.serialization.dag_dependency import DagDependency @@ -76,19 +79,14 @@ class SerializedDagModel(Base): """ __tablename__ = "serialized_dag" - - dag_id = Column(String(ID_LEN), primary_key=True) - fileloc = Column(String(2000), nullable=False) - # The max length of fileloc exceeds the limit of indexing. 
- fileloc_hash = Column(BigInteger(), nullable=False) + id = Column(UUIDType(binary=False), primary_key=True, default=uuid6.uuid7) + dag_id = Column(String(ID_LEN), nullable=False) _data = Column("data", sqlalchemy_jsonfield.JSONField(json=json), nullable=True) _data_compressed = Column("data_compressed", LargeBinary, nullable=True) last_updated = Column(UtcDateTime, nullable=False) dag_hash = Column(String(32), nullable=False) processor_subdir = Column(String(2000), nullable=True) - __table_args__ = (Index("idx_fileloc_hash", fileloc_hash, unique=False),) - dag_runs = relationship( DagRun, primaryjoin=dag_id == foreign(DagRun.dag_id), # type: ignore @@ -103,13 +101,15 @@ class SerializedDagModel(Base): innerjoin=True, backref=backref("serialized_dag", uselist=False, innerjoin=True), ) + dag_version_id = Column( + UUIDType(binary=False), ForeignKey("dag_version.id", ondelete="CASCADE"), nullable=False, unique=True + ) + dag_version = relationship("DagVersion", back_populates="serialized_dag") load_op_links = True def __init__(self, dag: DAG, processor_subdir: str | None = None) -> None: self.dag_id = dag.dag_id - self.fileloc = dag.fileloc - self.fileloc_hash = DagCode.dag_fileloc_hash(self.fileloc) self.last_updated = timezone.utcnow() self.processor_subdir = processor_subdir @@ -194,7 +194,9 @@ def write_dag( log.debug("Checking if DAG (%s) changed", dag.dag_id) new_serialized_dag = cls(dag, processor_subdir) serialized_dag_db = session.execute( - select(cls.dag_hash, cls.processor_subdir).where(cls.dag_id == dag.dag_id) + select(cls.dag_hash, cls.processor_subdir) + .where(cls.dag_id == dag.dag_id) + .order_by(cls.last_updated.desc()) ).first() if ( @@ -204,12 +206,52 @@ def write_dag( ): log.debug("Serialized DAG (%s) is unchanged. Skipping writing to DB", dag.dag_id) return False - + dagv = DagVersion.write_dag( + version_name=dag.version_name, + dag_id=dag.dag_id, + session=session, + ) log.debug("Writing Serialized DAG: %s to the DB", dag.dag_id) - session.merge(new_serialized_dag) + new_serialized_dag.dag_version = dagv + session.add(new_serialized_dag) log.debug("DAG: %s written to the DB", dag.dag_id) + + DagCode.write_dag(dagv, dag.fileloc, session=session) return True + @classmethod + def latest_item_select_object(cls, dag_id): + return select(cls).where(cls.dag_id == dag_id).order_by(cls.last_updated.desc()).limit(1) + + @classmethod + @provide_session + def get_latest_serialized_dags( + cls, *, dag_ids: list[str], session: Session = NEW_SESSION + ) -> list[SerializedDagModel]: + """ + Get the latest serialized dags of given DAGs. + + :param dag_ids: The list of DAG IDs. + :param session: The database session. + :return: The latest serialized dag of the DAGs. 
+ """ + # Subquery to get the latest serdag per dag_id + latest_serdag_subquery = ( + session.query(cls.dag_id, func.max(cls.last_updated).label("last_updated")) + .filter(cls.dag_id.in_(dag_ids)) + .group_by(cls.dag_id) + .subquery() + ) + latest_serdags = session.scalars( + select(cls) + .join( + latest_serdag_subquery, + cls.last_updated == latest_serdag_subquery.c.last_updated, + ) + .where(cls.dag_id.in_(dag_ids)) + ).all() + return latest_serdags or [] + @classmethod @provide_session def read_all_dags(cls, session: Session = NEW_SESSION) -> dict[str, SerializedDAG]: @@ -219,7 +261,18 @@ def read_all_dags(cls, session: Session = NEW_SESSION) -> dict[str, SerializedDA :param session: ORM Session :returns: a dict of DAGs read from database """ - serialized_dags = session.scalars(select(cls)) + latest_serialized_dag_subquery = ( + session.query(cls.dag_id, func.max(cls.last_updated).label("max_updated")) + .group_by(cls.dag_id) + .subquery() + ) + serialized_dags = session.scalars( + select(cls).join( + latest_serialized_dag_subquery, + (cls.dag_id == latest_serialized_dag_subquery.c.dag_id) + and (cls.last_updated == latest_serialized_dag_subquery.c.max_updated), + ) + ) dags = {} for row in serialized_dags: @@ -287,22 +340,17 @@ def remove_deleted_dags( :param processor_subdir: dag processor subdir :param session: ORM Session """ - alive_fileloc_hashes = [DagCode.dag_fileloc_hash(fileloc) for fileloc in alive_dag_filelocs] - log.debug( "Deleting Serialized DAGs (for which DAG files are deleted) from %s table ", cls.__tablename__ ) - + # Deleting the DagModel cascade deletes the serialized Dag through the dag version relationship session.execute( - cls.__table__.delete().where( - and_( - cls.fileloc_hash.notin_(alive_fileloc_hashes), - cls.fileloc.notin_(alive_dag_filelocs), - or_( - cls.processor_subdir.is_(None), - cls.processor_subdir == processor_subdir, - ), - ) + DagModel.__table__.delete().where( + DagModel.fileloc.notin_(alive_dag_filelocs), + or_( + DagModel.processor_subdir.is_(None), + DagModel.processor_subdir == processor_subdir, + ), ) ) @@ -334,11 +382,7 @@ def get(cls, dag_id: str, session: Session = NEW_SESSION) -> SerializedDagModel :param dag_id: the DAG to fetch :param session: ORM Session """ - row = session.scalar(select(cls).where(cls.dag_id == dag_id)) - if row: - return row - - return session.scalar(select(cls).where(cls.dag_id == dag_id)) + return session.scalar(cls.latest_item_select_object(dag_id)) @staticmethod @provide_session @@ -373,7 +417,9 @@ def get_last_updated_datetime(cls, dag_id: str, session: Session = NEW_SESSION) :param dag_id: DAG ID :param session: ORM Session """ - return session.scalar(select(cls.last_updated).where(cls.dag_id == dag_id)) + return session.scalar( + select(cls.last_updated).where(cls.dag_id == dag_id).order_by(cls.last_updated.desc()).limit(1) + ) @classmethod @provide_session @@ -395,7 +441,9 @@ def get_latest_version_hash(cls, dag_id: str, session: Session = NEW_SESSION) -> :param session: ORM Session :return: DAG Hash, or None if the DAG is not found """ - return session.scalar(select(cls.dag_hash).where(cls.dag_id == dag_id)) + return session.scalar( + select(cls.dag_hash).where(cls.dag_id == dag_id).order_by(cls.last_updated.desc()).limit(1) + ) @classmethod def get_latest_version_hash_and_updated_datetime( @@ -413,7 +461,10 @@ def get_latest_version_hash_and_updated_datetime( :return: A tuple of DAG Hash and last updated datetime, or None if the DAG is not found """ return session.execute( - select(cls.dag_hash, 
cls.last_updated).where(cls.dag_id == dag_id) + select(cls.dag_hash, cls.last_updated) + .where(cls.dag_id == dag_id) + .order_by(cls.last_updated.desc()) + .limit(1) ).one_or_none() @classmethod @provide_session @@ -424,14 +475,27 @@ def get_dag_dependencies(cls, session: Session = NEW_SESSION) -> dict[str, list[ :param session: ORM Session """ + latest_sdag_subquery = ( + session.query(cls.dag_id, func.max(cls.last_updated).label("max_updated")) + .group_by(cls.dag_id) + .subquery() + ) if session.bind.dialect.name in ["sqlite", "mysql"]: query = session.execute( - select(cls.dag_id, func.json_extract(cls._data, "$.dag.dag_dependencies")) + select(cls.dag_id, func.json_extract(cls._data, "$.dag.dag_dependencies")).join( + latest_sdag_subquery, + (cls.dag_id == latest_sdag_subquery.c.dag_id) + & (cls.last_updated == latest_sdag_subquery.c.max_updated), + ) ) iterator = ((dag_id, json.loads(deps_data) if deps_data else []) for dag_id, deps_data in query) else: iterator = session.execute( - select(cls.dag_id, func.json_extract_path(cls._data, "dag", "dag_dependencies")) + select(cls.dag_id, func.json_extract_path(cls._data, "dag", "dag_dependencies")).join( + latest_sdag_subquery, + (cls.dag_id == latest_sdag_subquery.c.dag_id) + & (cls.last_updated == latest_sdag_subquery.c.max_updated), + ) ) return {dag_id: [DagDependency(**d) for d in (deps_data or [])] for dag_id, deps_data in iterator} @@ -439,10 +503,9 @@ def get_dag_dependencies(cls, session: Session = NEW_SESSION) -> dict[str, list[ @internal_api_call @provide_session def get_serialized_dag(dag_id: str, task_id: str, session: Session = NEW_SESSION) -> Operator | None: - from airflow.models.serialized_dag import SerializedDagModel - try: - model = session.get(SerializedDagModel, dag_id) + # get the latest version of the DAG + model = session.scalar(SerializedDagModel.latest_item_select_object(dag_id)) if model: return model.dag.get_task(task_id) except (exc.NoResultFound, TaskNotFound): diff --git a/airflow/models/taskinstance.py b/airflow/models/taskinstance.py index dfd776e685a02..56225586f90fe 100644 --- a/airflow/models/taskinstance.py +++ b/airflow/models/taskinstance.py @@ -45,6 +45,7 @@ Column, DateTime, Float, + ForeignKey, ForeignKeyConstraint, Index, Integer, @@ -68,6 +69,7 @@ from sqlalchemy.orm import lazyload, reconstructor, relationship from sqlalchemy.orm.attributes import NO_VALUE, set_committed_value from sqlalchemy.sql.expression import case, select +from sqlalchemy_utils import UUIDType from airflow import settings from airflow.api_internal.internal_api_call import InternalApiConfig, internal_api_call @@ -818,6 +820,7 @@ def _set_ti_attrs(target, source, include_dag_run=False): target.trigger_id = source.trigger_id target.next_method = source.next_method target.next_kwargs = source.next_kwargs + target.dag_version_id = source.dag_version_id if include_dag_run: target.execution_date = source.execution_date @@ -836,7 +839,7 @@ def _set_ti_attrs(target, source, include_dag_run=False): target.dag_run.data_interval_start = source.dag_run.data_interval_start target.dag_run.data_interval_end = source.dag_run.data_interval_end target.dag_run.last_scheduling_decision = source.dag_run.last_scheduling_decision - target.dag_run.dag_hash = source.dag_run.dag_hash + target.dag_run.dag_version_id = source.dag_run.dag_version_id target.dag_run.updated_at = source.dag_run.updated_at target.dag_run.log_template_id = source.dag_run.log_template_id @@ -1873,8 +1876,10 @@ class TaskInstance(Base, LoggingMixin): next_kwargs =
Column(MutableDict.as_mutable(ExtendedJSON)) _task_display_property_value = Column("task_display_name", String(2000), nullable=True) + dag_version_id = Column(UUIDType(binary=False), ForeignKey("dag_version.id", ondelete="CASCADE")) + dag_version = relationship("DagVersion", back_populates="task_instances") # If adding new fields here then remember to add them to - # refresh_from_db() or they won't display in the UI correctly + # _set_ti_attrs() or they won't display in the UI correctly __table_args__ = ( Index("ti_dag_state", dag_id, state), @@ -1939,11 +1944,13 @@ def __init__( run_id: str | None = None, state: str | None = None, map_index: int = -1, + dag_version_id: UUIDType | None = None, ): super().__init__() self.dag_id = task.dag_id self.task_id = task.task_id self.map_index = map_index + self.dag_version_id = dag_version_id self.refresh_from_task(task) if TYPE_CHECKING: assert self.task @@ -1975,7 +1982,7 @@ def stats_tags(self) -> dict[str, str]: return _stats_tags(task_instance=self) @staticmethod - def insert_mapping(run_id: str, task: Operator, map_index: int) -> dict[str, Any]: + def insert_mapping(run_id: str, task: Operator, map_index: int, dag_version_id: int) -> dict[str, Any]: """ Insert mapping. @@ -2004,6 +2011,7 @@ def insert_mapping(run_id: str, task: Operator, map_index: int) -> dict[str, Any "custom_operator_name": getattr(task, "custom_operator_name", None), "map_index": map_index, "_task_display_property_value": task.task_display_name, + "dag_version_id": dag_version_id, } @reconstructor diff --git a/airflow/models/taskinstancehistory.py b/airflow/models/taskinstancehistory.py index 8c77daf925793..e587cf083e3b5 100644 --- a/airflow/models/taskinstancehistory.py +++ b/airflow/models/taskinstancehistory.py @@ -33,6 +33,7 @@ text, ) from sqlalchemy.ext.mutable import MutableDict +from sqlalchemy_utils import UUIDType from airflow.models.base import Base, StringID from airflow.utils import timezone @@ -91,6 +92,7 @@ class TaskInstanceHistory(Base): next_kwargs = Column(MutableDict.as_mutable(ExtendedJSON)) task_display_name = Column("task_display_name", String(2000), nullable=True) + dag_version_id = Column(UUIDType(binary=False)) def __init__( self, diff --git a/airflow/serialization/pydantic/dag_run.py b/airflow/serialization/pydantic/dag_run.py index fd12ca12c0184..a0175e3749d9c 100644 --- a/airflow/serialization/pydantic/dag_run.py +++ b/airflow/serialization/pydantic/dag_run.py @@ -52,7 +52,7 @@ class DagRunPydantic(BaseModelPydantic): data_interval_start: Optional[datetime] data_interval_end: Optional[datetime] last_scheduling_decision: Optional[datetime] - dag_hash: Optional[str] + dag_version_id: Optional[int] updated_at: Optional[datetime] dag: Optional[PydanticDag] consumed_asset_events: List[AssetEventPydantic] # noqa: UP006 diff --git a/airflow/serialization/schema.json b/airflow/serialization/schema.json index 32ccd3dfff9c1..b26b59339816c 100644 --- a/airflow/serialization/schema.json +++ b/airflow/serialization/schema.json @@ -158,6 +158,7 @@ }, "dag_display_name": { "type" : "string"}, "description": { "type" : "string"}, + "version_name": {"type": "string"}, "_concurrency": { "type" : "number"}, "max_active_tasks": { "type" : "number"}, "max_active_runs": { "type" : "number"}, diff --git a/airflow/utils/db.py b/airflow/utils/db.py index d23f54068b59e..d5218b6050e6d 100644 --- a/airflow/utils/db.py +++ b/airflow/utils/db.py @@ -97,7 +97,7 @@ class MappedClassProtocol(Protocol): "2.9.2": "686269002441", "2.10.0": "22ed7efa9da2", "2.10.3": 
"5f2621c13b39", - "3.0.0": "d03e4a635aa3", + "3.0.0": "2b47dc6bc8df", } diff --git a/airflow/www/views.py b/airflow/www/views.py index e287c027a894e..cce2ec8b88c8b 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -105,6 +105,7 @@ from airflow.models import Connection, DagModel, DagTag, Log, Trigger, XCom from airflow.models.asset import AssetDagRunQueue, AssetEvent, AssetModel, DagScheduleAssetReference from airflow.models.dag import get_asset_triggered_next_run_info +from airflow.models.dag_version import DagVersion from airflow.models.dagrun import RUN_ID_REGEX, DagRun, DagRunType from airflow.models.errors import ParseImportError from airflow.models.serialized_dag import SerializedDagModel @@ -2201,6 +2202,7 @@ def trigger(self, dag_id: str, session: Session = NEW_SESSION): ) try: + dag_version = DagVersion.get_latest_version(dag.dag_id) dag_run = dag.create_dagrun( run_type=DagRunType.MANUAL, execution_date=execution_date, @@ -2208,7 +2210,7 @@ def trigger(self, dag_id: str, session: Session = NEW_SESSION): state=DagRunState.QUEUED, conf=run_conf, external_trigger=True, - dag_hash=get_airflow_app().dag_bag.dags_hash.get(dag_id), + dag_version=dag_version, run_id=run_id, triggered_by=DagRunTriggeredByType.UI, ) diff --git a/docs/apache-airflow/img/airflow_erd.sha256 b/docs/apache-airflow/img/airflow_erd.sha256 index 572ce439c231b..c8963880f842e 100644 --- a/docs/apache-airflow/img/airflow_erd.sha256 +++ b/docs/apache-airflow/img/airflow_erd.sha256 @@ -1 +1 @@ -5ec1019b1b0f43b29fc83638c2a13c0bda90b7e4f0ff542aeab401bbfa9a83e4 \ No newline at end of file +f997746cdee45147831f81bcd2d43ec3ca45d7429afa691e385104987ed51d88 \ No newline at end of file diff --git a/docs/apache-airflow/img/airflow_erd.svg b/docs/apache-airflow/img/airflow_erd.svg index ba935dd6c4be4..b3caf10cba3e1 100644 --- a/docs/apache-airflow/img/airflow_erd.svg +++ b/docs/apache-airflow/img/airflow_erd.svg @@ -4,2271 +4,2353 @@ - - + + %3 - + log - -log - -id - - [INTEGER] - NOT NULL - -dag_id - - [VARCHAR(250)] - -dttm - - [TIMESTAMP] - -event - - [VARCHAR(60)] - -execution_date - - [TIMESTAMP] - -extra - - [TEXT] - -map_index - - [INTEGER] - -owner - - [VARCHAR(500)] - -owner_display_name - - [VARCHAR(500)] - -run_id - - [VARCHAR(250)] - -task_id - - [VARCHAR(250)] - -try_number - - [INTEGER] + +log + +id + + [INTEGER] + NOT NULL + +dag_id + + [VARCHAR(250)] + +dttm + + [TIMESTAMP] + +event + + [VARCHAR(60)] + +execution_date + + [TIMESTAMP] + +extra + + [TEXT] + +map_index + + [INTEGER] + +owner + + [VARCHAR(500)] + +owner_display_name + + [VARCHAR(500)] + +run_id + + [VARCHAR(250)] + +task_id + + [VARCHAR(250)] + +try_number + + [INTEGER] slot_pool - -slot_pool - -id - - [INTEGER] - NOT NULL - -description - - [TEXT] - -include_deferred - - [BOOLEAN] - NOT NULL - -pool - - [VARCHAR(256)] - -slots - - [INTEGER] + +slot_pool + +id + + [INTEGER] + NOT NULL + +description + + [TEXT] + +include_deferred + + [BOOLEAN] + NOT NULL + +pool + + [VARCHAR(256)] + +slots + + [INTEGER] callback_request - -callback_request - -id - - [INTEGER] - NOT NULL - -callback_data - - [JSON] - NOT NULL - -callback_type - - [VARCHAR(20)] - NOT NULL - -created_at - - [TIMESTAMP] - NOT NULL - -priority_weight - - [INTEGER] - NOT NULL - -processor_subdir - - [VARCHAR(2000)] + +callback_request + +id + + [INTEGER] + NOT NULL + +callback_data + + [JSON] + NOT NULL + +callback_type + + [VARCHAR(20)] + NOT NULL + +created_at + + [TIMESTAMP] + NOT NULL + +priority_weight + + [INTEGER] + NOT NULL + +processor_subdir + + [VARCHAR(2000)] 
[Database ERD diagram (SVG) diff: the rendered table/relationship text cannot be reproduced legibly here. The recoverable schema changes are: a new dag_version table (id [UUID] NOT NULL, created_at, dag_id, version_name, version_number) referenced 0..N-to-1 from dag; dag_run drops dag_hash and gains a nullable dag_version_id [UUID]; dag_code and serialized_dag each gain an id [UUID] NOT NULL column and a NOT NULL dag_version_id reference to dag_version (serialized_dag also drops fileloc and fileloc_hash); task_instance and task_instance_history gain a nullable dag_version_id [UUID]. The remaining tables appear unchanged apart from re-rendering.]

diff --git a/docs/apache-airflow/migrations-ref.rst b/docs/apache-airflow/migrations-ref.rst
index 61dde39958e21..bc73f387a2d28 100644
--- a/docs/apache-airflow/migrations-ref.rst
+++ b/docs/apache-airflow/migrations-ref.rst
@@ -39,7 +39,9 @@ Here's the list of all the Database Migrations that are executed via when you ru
 +-------------------------+------------------+-------------------+--------------------------------------------------------------+
 | Revision ID             | Revises ID       | Airflow Version   | Description                                                  |
 +=========================+==================+===================+==============================================================+
-| ``d03e4a635aa3`` (head) | ``d8cd3297971e`` | ``3.0.0``         | Drop DAG pickling.                                           |
+| ``2b47dc6bc8df`` (head) | ``d03e4a635aa3`` | ``3.0.0``         | add dag versioning.                                          |
++-------------------------+------------------+-------------------+--------------------------------------------------------------+
+| ``d03e4a635aa3``        | ``d8cd3297971e`` | ``3.0.0``         | Drop DAG pickling.                                           |
 +-------------------------+------------------+-------------------+--------------------------------------------------------------+
 | ``d8cd3297971e``        | ``5f57a45b8433`` | ``3.0.0``         | Add last_heartbeat_at directly to TI.
| +-------------------------+------------------+-------------------+--------------------------------------------------------------+ diff --git a/hatch_build.py b/hatch_build.py index 91b9256b4d031..5e95775c62f26 100644 --- a/hatch_build.py +++ b/hatch_build.py @@ -431,6 +431,7 @@ # The issue tracking it is https://github.com/apache/airflow/issues/28723 "sqlalchemy>=1.4.36,<2.0", "sqlalchemy-jsonfield>=1.0", + "sqlalchemy-utils>=0.41.2", "tabulate>=0.7.5", "tenacity>=8.0.0,!=8.2.0", "termcolor>=1.1.0", diff --git a/providers/tests/fab/auth_manager/api_endpoints/test_dag_run_endpoint.py b/providers/tests/fab/auth_manager/api_endpoints/test_dag_run_endpoint.py index 05bac3394ef0f..c1f3e71f306bd 100644 --- a/providers/tests/fab/auth_manager/api_endpoints/test_dag_run_endpoint.py +++ b/providers/tests/fab/auth_manager/api_endpoints/test_dag_run_endpoint.py @@ -138,6 +138,7 @@ def _create_dag(self, dag_id): session.add(dag_instance) dag = DAG(dag_id=dag_id, schedule=None, params={"validated_number": Param(1, minimum=1, maximum=10)}) self.app.dag_bag.bag_dag(dag) + self.app.dag_bag.sync_to_db() return dag_instance def _create_test_dag_run(self, state=DagRunState.RUNNING, extra_dag=False, commit=True, idx_start=1): diff --git a/scripts/ci/pre_commit/check_ti_vs_tis_attributes.py b/scripts/ci/pre_commit/check_ti_vs_tis_attributes.py index 16c1df48a9e8b..d1782e6290744 100755 --- a/scripts/ci/pre_commit/check_ti_vs_tis_attributes.py +++ b/scripts/ci/pre_commit/check_ti_vs_tis_attributes.py @@ -54,6 +54,7 @@ def compare_attributes(path1, path2): "rendered_task_instance_fields", # Storing last heartbeat for historic TIs is not interesting/useful "last_heartbeat_at", + "dag_version", } # exclude attrs not necessary to be in TaskInstanceHistory if not diff: return diff --git a/task_sdk/src/airflow/sdk/definitions/dag.py b/task_sdk/src/airflow/sdk/definitions/dag.py index 479c1ea09b80c..9a124d237ed57 100644 --- a/task_sdk/src/airflow/sdk/definitions/dag.py +++ b/task_sdk/src/airflow/sdk/definitions/dag.py @@ -355,6 +355,7 @@ class DAG: **Warning**: A fail stop dag can only have tasks with the default trigger rule ("all_success"). An exception will be thrown if any task in a fail stop dag has a non default trigger rule. :param dag_display_name: The display name of the DAG which appears on the UI. + :param version_name: The version name of the DAG. This is used to identify the version of the DAG. """ __serialized_fields: ClassVar[frozenset[str] | None] = None @@ -437,6 +438,10 @@ class DAG: has_on_success_callback: bool = attrs.field(init=False) has_on_failure_callback: bool = attrs.field(init=False) + version_name: str | None = attrs.field( + default=None, + validator=attrs.validators.optional(attrs.validators.instance_of(str)), + ) def __attrs_post_init__(self): from airflow.utils import timezone @@ -1063,6 +1068,7 @@ def dag( auto_register: bool = True, fail_stop: bool = False, dag_display_name: str | None = None, + version_name: str | None = None, ) -> Callable[[Callable], Callable[..., DAG]]: """ Python dag decorator which wraps a function into an Airflow DAG. 
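For illustration only (not part of the patch): a minimal sketch of how the new version_name argument documented above might be used when defining a DAG. The DAG id and task are hypothetical, and the import paths follow the test files in this diff; the version name is recorded against the new dag_version table once the DAG is synced and serialized to the database.

    # Hypothetical sketch of the version_name argument added in this change.
    from airflow.models.dag import DAG
    from airflow.operators.empty import EmptyOperator

    with DAG(
        dag_id="example_versioned_dag",    # hypothetical DAG id
        schedule=None,
        version_name="my_version",         # new argument; stored on the dag_version row
    ) as dag:
        EmptyOperator(task_id="noop")

    # After the DAG has been synced/serialized to the DB, the version can be read back,
    # e.g. (per the new tests below):
    #   from airflow.models.dag_version import DagVersion
    #   DagVersion.get_latest_version(dag.dag_id).version  # -> "my_version-1"
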
diff --git a/tests/api_connexion/endpoints/test_dag_run_endpoint.py b/tests/api_connexion/endpoints/test_dag_run_endpoint.py index 576b28b153531..6bbd63fa8b49d 100644 --- a/tests/api_connexion/endpoints/test_dag_run_endpoint.py +++ b/tests/api_connexion/endpoints/test_dag_run_endpoint.py @@ -89,6 +89,7 @@ def _create_dag(self, dag_id): session.add(dag_instance) dag = DAG(dag_id=dag_id, schedule=None, params={"validated_number": Param(1, minimum=1, maximum=10)}) self.app.dag_bag.bag_dag(dag) + self.app.dag_bag.sync_to_db() return dag_instance def _create_test_dag_run(self, state=DagRunState.RUNNING, extra_dag=False, commit=True, idx_start=1): @@ -1205,12 +1206,14 @@ def test_raises_validation_error_for_invalid_params(self): assert "Invalid input for param" in response.json["detail"] @mock.patch("airflow.api_connexion.endpoints.dag_run_endpoint.get_airflow_app") - def test_dagrun_creation_exception_is_handled(self, mock_get_app, session): + @mock.patch("airflow.api_connexion.endpoints.dag_run_endpoint.DagVersion") + def test_dagrun_creation_exception_is_handled(self, mock_get_dag_version, mock_get_app, session): self._create_dag("TEST_DAG_ID") error_message = "Encountered Error" mock_get_app.return_value.dag_bag.get_dag.return_value.create_dagrun.side_effect = ValueError( error_message ) + mock_get_dag_version.get_latest_version.return_value = mock.MagicMock() response = self.client.post( "api/v1/dags/TEST_DAG_ID/dagRuns", json={"execution_date": "2020-11-10T08:25:56Z"}, diff --git a/tests/api_connexion/endpoints/test_task_endpoint.py b/tests/api_connexion/endpoints/test_task_endpoint.py index b558f8dbf161e..a6e47f4f6a1bb 100644 --- a/tests/api_connexion/endpoints/test_task_endpoint.py +++ b/tests/api_connexion/endpoints/test_task_endpoint.py @@ -78,7 +78,6 @@ def setup_dag(self, configured_app): with DAG(self.unscheduled_dag_id, start_date=None, schedule=None) as unscheduled_dag: task4 = EmptyOperator(task_id=self.unscheduled_task_id1, params={"is_unscheduled": True}) task5 = EmptyOperator(task_id=self.unscheduled_task_id2, params={"is_unscheduled": True}) - task1 >> task2 task4 >> task5 dag_bag = DagBag(os.devnull, include_examples=False) @@ -87,6 +86,7 @@ def setup_dag(self, configured_app): mapped_dag.dag_id: mapped_dag, unscheduled_dag.dag_id: unscheduled_dag, } + DagBag._sync_to_db(dag_bag.dags) configured_app.dag_bag = dag_bag # type:ignore @staticmethod @@ -246,7 +246,9 @@ def test_unscheduled_task(self): def test_should_respond_200_serialized(self): # Get the dag out of the dagbag before we patch it to an empty one - SerializedDagModel.write_dag(self.app.dag_bag.get_dag(self.dag_id)) + dag = self.app.dag_bag.get_dag(self.dag_id) + dag.sync_to_db() + SerializedDagModel.write_dag(dag) dag_bag = DagBag(os.devnull, include_examples=False, read_dags_from_db=True) patcher = unittest.mock.patch.object(self.app, "dag_bag", dag_bag) diff --git a/tests/cli/commands/test_task_command.py b/tests/cli/commands/test_task_command.py index ed1a2c28754f8..50e3d393f8108 100644 --- a/tests/cli/commands/test_task_command.py +++ b/tests/cli/commands/test_task_command.py @@ -227,7 +227,7 @@ def test_cli_test_different_path(self, session, tmp_path): .one() ) # confirm that the serialized dag location has not been updated - assert ser_dag.fileloc == orig_file_path.as_posix() + assert ser_dag.dag_version.dag_code.fileloc == orig_file_path.as_posix() assert ser_dag.data["dag"]["_processor_dags_folder"] == orig_dags_folder.as_posix() assert ser_dag.data["dag"]["fileloc"] == orig_file_path.as_posix() assert 
ser_dag.dag._processor_dags_folder == orig_dags_folder.as_posix() diff --git a/tests/dag_processing/test_job_runner.py b/tests/dag_processing/test_job_runner.py index 192a12358e8dd..2bf0bcb6dbb0e 100644 --- a/tests/dag_processing/test_job_runner.py +++ b/tests/dag_processing/test_job_runner.py @@ -53,6 +53,7 @@ from airflow.jobs.dag_processor_job_runner import DagProcessorJobRunner from airflow.jobs.job import Job from airflow.models import DagBag, DagModel, DbCallbackRequest +from airflow.models.dag_version import DagVersion from airflow.models.dagcode import DagCode from airflow.models.serialized_dag import SerializedDagModel from airflow.utils import timezone @@ -664,13 +665,6 @@ def test_scan_stale_dags(self): ) assert active_dag_count == 1 - serialized_dag_count = ( - session.query(func.count(SerializedDagModel.dag_id)) - .filter(SerializedDagModel.fileloc == test_dag_path) - .scalar() - ) - assert serialized_dag_count == 1 - manager.processor._scan_stale_dags() active_dag_count = ( @@ -682,10 +676,12 @@ def test_scan_stale_dags(self): serialized_dag_count = ( session.query(func.count(SerializedDagModel.dag_id)) - .filter(SerializedDagModel.fileloc == test_dag_path) + .filter(SerializedDagModel.dag_id == dag.dag_id) .scalar() ) - assert serialized_dag_count == 0 + # Deactivating the DagModel should not delete the SerializedDagModel + # SerializedDagModel gives history about Dags + assert serialized_dag_count == 1 @pytest.mark.skip_if_database_isolation_mode # Test is broken in db isolation mode @conf_vars( @@ -1088,10 +1084,12 @@ def test_refresh_dags_dir_deactivates_deleted_zipped_dags(self, tmp_path): with mock.patch("airflow.dag_processing.manager.might_contain_dag", return_value=False): manager.processor._refresh_dag_dir() - # Assert dag removed from SDM - assert not SerializedDagModel.has_dag("test_zip_dag") - # assert code deleted - assert not DagCode.has_dag(dag.fileloc) + # Deleting the python file should not delete SDM for versioning sake + assert SerializedDagModel.has_dag("test_zip_dag") + # assert code not deleted for versioning sake + assert DagCode.has_dag(dag.fileloc) + # assert dagversion was not deleted + assert DagVersion.get_latest_version(dag.dag_id) # assert dag deactivated assert not dag.get_is_active() diff --git a/tests/dag_processing/test_processor.py b/tests/dag_processing/test_processor.py index f117b3ffe4581..29aa4e15a3888 100644 --- a/tests/dag_processing/test_processor.py +++ b/tests/dag_processing/test_processor.py @@ -161,6 +161,7 @@ def test_execute_on_failure_callbacks_without_dag(self, mock_ti_handle_failure, with create_session() as session: session.query(TaskInstance).delete() dag = dagbag.get_dag("example_branch_operator") + dag.sync_to_db() triggered_by_kwargs = {"triggered_by": DagRunTriggeredByType.TEST} if AIRFLOW_V_3_0_PLUS else {} dagrun = dag.create_dagrun( state=State.RUNNING, diff --git a/tests/jobs/test_scheduler_job.py b/tests/jobs/test_scheduler_job.py index da3ccc201eb4a..d0b147a5c3727 100644 --- a/tests/jobs/test_scheduler_job.py +++ b/tests/jobs/test_scheduler_job.py @@ -54,6 +54,7 @@ from airflow.models.asset import AssetActive, AssetDagRunQueue, AssetEvent, AssetModel from airflow.models.backfill import Backfill, _create_backfill from airflow.models.dag import DAG, DagModel +from airflow.models.dag_version import DagVersion from airflow.models.dagbag import DagBag from airflow.models.dagrun import DagRun from airflow.models.db_callback_request import DbCallbackRequest @@ -141,11 +142,10 @@ def clean_db(): clear_db_runs() 
clear_db_backfills() clear_db_pools() - clear_db_dags() clear_db_import_errors() clear_db_jobs() clear_db_assets() - # DO NOT try to run clear_db_serialized_dags() here - this will break the tests + # DO NOT try to run clear_db_serialized_dags() or clear_db_dags here - this will break the tests # The tests expect DAGs to be fully loaded here via setUpClass method below @pytest.fixture(autouse=True) @@ -167,9 +167,7 @@ def set_instance_attrs(self, dagbag) -> Generator: # enqueue! self.null_exec: MockExecutor | None = MockExecutor() # Since we don't want to store the code for the DAG defined in this file - with patch("airflow.dag_processing.manager.SerializedDagModel.remove_deleted_dags"), patch( - "airflow.models.dag.DagCode.bulk_sync_to_db" - ): + with patch("airflow.models.serialized_dag.SerializedDagModel.remove_deleted_dags"): yield self.null_exec = None @@ -2875,7 +2873,6 @@ def test_dagrun_root_after_dagrun_unfinished(self, mock_executor): Noted: the DagRun state could be still in running state during CI. """ - clear_db_dags() dag_id = "test_dagrun_states_root_future" dag = self.dagbag.get_dag(dag_id) dag.sync_to_db() @@ -3316,7 +3313,7 @@ def test_verify_integrity_if_dag_not_changed(self, dag_maker): assert tis_count == 1 latest_dag_version = SerializedDagModel.get_latest_version_hash(dr.dag_id, session=session) - assert dr.dag_hash == latest_dag_version + assert dr.dag_version.serialized_dag.dag_hash == latest_dag_version session.rollback() session.close() @@ -3350,7 +3347,7 @@ def test_verify_integrity_if_dag_changed(self, dag_maker): dr = drs[0] dag_version_1 = SerializedDagModel.get_latest_version_hash(dr.dag_id, session=session) - assert dr.dag_hash == dag_version_1 + assert dr.dag_version.serialized_dag.dag_hash == dag_version_1 assert self.job_runner.dagbag.dags == {"test_verify_integrity_if_dag_changed": dag} assert len(self.job_runner.dagbag.dags.get("test_verify_integrity_if_dag_changed").tasks) == 1 @@ -3367,7 +3364,7 @@ def test_verify_integrity_if_dag_changed(self, dag_maker): drs = DagRun.find(dag_id=dag.dag_id, session=session) assert len(drs) == 1 dr = drs[0] - assert dr.dag_hash == dag_version_2 + assert dr.dag_version.serialized_dag.dag_hash == dag_version_2 assert self.job_runner.dagbag.dags == {"test_verify_integrity_if_dag_changed": dag} assert len(self.job_runner.dagbag.dags.get("test_verify_integrity_if_dag_changed").tasks) == 2 @@ -3383,54 +3380,7 @@ def test_verify_integrity_if_dag_changed(self, dag_maker): assert tis_count == 2 latest_dag_version = SerializedDagModel.get_latest_version_hash(dr.dag_id, session=session) - assert dr.dag_hash == latest_dag_version - - session.rollback() - session.close() - - def test_verify_integrity_if_dag_disappeared(self, dag_maker, caplog): - # CleanUp - with create_session() as session: - session.query(SerializedDagModel).filter( - SerializedDagModel.dag_id == "test_verify_integrity_if_dag_disappeared" - ).delete(synchronize_session=False) - - with dag_maker(dag_id="test_verify_integrity_if_dag_disappeared") as dag: - BashOperator(task_id="dummy", bash_command="echo hi") - - scheduler_job = Job() - self.job_runner = SchedulerJobRunner(job=scheduler_job, subdir=os.devnull) - - session = settings.Session() - orm_dag = dag_maker.dag_model - assert orm_dag is not None - - scheduler_job = Job() - self.job_runner = SchedulerJobRunner(job=scheduler_job, subdir=os.devnull) - - self.job_runner.processor_agent = mock.MagicMock() - dag = self.job_runner.dagbag.get_dag("test_verify_integrity_if_dag_disappeared", session=session) - 
self.job_runner._create_dag_runs([orm_dag], session) - dag_id = dag.dag_id - drs = DagRun.find(dag_id=dag_id, session=session) - assert len(drs) == 1 - dr = drs[0] - - dag_version_1 = SerializedDagModel.get_latest_version_hash(dag_id, session=session) - assert dr.dag_hash == dag_version_1 - assert self.job_runner.dagbag.dags == {"test_verify_integrity_if_dag_disappeared": dag} - assert len(self.job_runner.dagbag.dags.get("test_verify_integrity_if_dag_disappeared").tasks) == 1 - - SerializedDagModel.remove_dag(dag_id=dag_id) - dag = self.job_runner.dagbag.dags[dag_id] - self.job_runner.dagbag.dags = MagicMock() - self.job_runner.dagbag.dags.get.side_effect = [dag, None] - session.flush() - with caplog.at_level(logging.WARNING): - callback = self.job_runner._schedule_dag_run(dr, session) - assert "The DAG disappeared before verifying integrity" in caplog.text - - assert callback is None + assert dr.dag_version.serialized_dag.dag_hash == latest_dag_version session.rollback() session.close() @@ -4015,6 +3965,7 @@ def test_create_dag_runs_assets(self, session, dag_maker): - That the run created is on QUEUED State - That dag_model has next_dagrun """ + clear_db_dags() asset1 = Asset(uri="ds1") asset2 = Asset(uri="ds2") @@ -4386,6 +4337,7 @@ def test_do_schedule_max_active_runs_dag_timed_out(self, dag_maker): session = settings.Session() data_interval = dag.infer_automated_data_interval(DEFAULT_LOGICAL_DATE) triggered_by_kwargs = {"triggered_by": DagRunTriggeredByType.TEST} if AIRFLOW_V_3_0_PLUS else {} + dag_version = DagVersion.get_latest_version(dag.dag_id) run1 = dag.create_dagrun( run_type=DagRunType.SCHEDULED, execution_date=DEFAULT_DATE, @@ -4393,6 +4345,7 @@ def test_do_schedule_max_active_runs_dag_timed_out(self, dag_maker): start_date=timezone.utcnow() - timedelta(seconds=2), session=session, data_interval=data_interval, + dag_version=dag_version, **triggered_by_kwargs, ) @@ -4405,6 +4358,7 @@ def test_do_schedule_max_active_runs_dag_timed_out(self, dag_maker): state=State.QUEUED, session=session, data_interval=data_interval, + dag_version=dag_version, **triggered_by_kwargs, ) @@ -4602,10 +4556,8 @@ def test_do_schedule_max_active_runs_and_manual_trigger(self, dag_maker, mock_ex BashOperator(task_id="dummy3", bash_command="true") session = settings.Session() - dag_run = dag_maker.create_dagrun( - state=State.QUEUED, - session=session, - ) + dag_version = DagVersion.get_latest_version(dag.dag_id) + dag_run = dag_maker.create_dagrun(state=State.QUEUED, session=session, dag_version=dag_version) dag.sync_to_db(session=session) # Update the date fields @@ -4647,23 +4599,31 @@ def test_max_active_runs_in_a_dag_doesnt_stop_running_dag_runs_in_other_dags(sel start_date=DEFAULT_DATE, schedule=timedelta(hours=1), max_active_runs=1, - ): + ) as dag: EmptyOperator(task_id="mytask") - - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dag_version = DagVersion.get_latest_version(dag.dag_id) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(29): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) with dag_maker( "test_dag2", start_date=timezone.datetime(2020, 1, 1), schedule=timedelta(hours=1), - ): + ) as dag2: EmptyOperator(task_id="mytask") - - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + 
dag_version = DagVersion.get_latest_version(dag2.dag_id) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(9): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) scheduler_job = Job() self.job_runner = SchedulerJobRunner(job=scheduler_job, subdir=os.devnull) @@ -4695,20 +4655,29 @@ def test_max_active_runs_in_a_dag_doesnt_prevent_backfill_from_running(self, dag ) as dag: EmptyOperator(task_id="mytask") dag1_dag_id = dag.dag_id - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dag_version = DagVersion.get_latest_version(dag1_dag_id) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(29): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) with dag_maker( "test_dag2", start_date=timezone.datetime(2020, 1, 1), schedule=timedelta(days=1), - ): + ) as dag: EmptyOperator(task_id="mytask") - - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dag_version = DagVersion.get_latest_version(dag.dag_id) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(9): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) scheduler_job = Job() self.job_runner = SchedulerJobRunner(job=scheduler_job, subdir=os.devnull) @@ -4842,19 +4811,30 @@ def _running_counts(): ) dag1_non_b_running, dag1_b_running, total_running = _running_counts() + dag_version = DagVersion.get_latest_version(dag1_dag_id) # now let's create some "normal" dag runs and verify that they can run - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(29): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) with dag_maker( "test_dag2", start_date=timezone.datetime(2020, 1, 1), schedule=timedelta(days=1), - ): + ) as dag2: EmptyOperator(task_id="mytask") - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + + dag_version = DagVersion.get_latest_version(dag2.dag_id) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(9): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) # initial state -- nothing is running assert dag1_non_b_running == 0 @@ -4982,19 +4962,30 @@ def _running_counts(): assert session.scalar(select(func.count()).select_from(DagRun)) == 6 assert session.scalar(select(func.count()).where(DagRun.dag_id == dag1_dag_id)) == 6 + dag_version = DagVersion.get_latest_version(dag1_dag_id) # now let's 
create some "normal" dag runs and verify that they can run - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(29): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) with dag_maker( "test_dag2", start_date=timezone.datetime(2020, 1, 1), schedule=timedelta(days=1), - ): + ) as dag2: EmptyOperator(task_id="mytask") - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) + + dag_version = DagVersion.get_latest_version(dag2.dag_id) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) for _ in range(9): - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) # ok at this point, there are new dag runs created, but no new running runs dag1_non_b_running, dag1_b_running, total_running = _running_counts() @@ -5130,9 +5121,14 @@ def test_start_queued_dagruns_do_follow_execution_date_order(self, dag_maker): with dag_maker("test_dag1", max_active_runs=1): EmptyOperator(task_id="mytask") date = DEFAULT_DATE + dag_version = DagVersion.get_latest_version("test_dag1") for i in range(30): dr = dag_maker.create_dagrun( - run_id=f"dagrun_{i}", run_type=DagRunType.SCHEDULED, state=State.QUEUED, execution_date=date + run_id=f"dagrun_{i}", + run_type=DagRunType.SCHEDULED, + state=State.QUEUED, + execution_date=date, + dag_version=dag_version, ) date = dr.execution_date + timedelta(hours=1) scheduler_job = Job() @@ -5175,11 +5171,15 @@ def test_no_dagruns_would_stuck_in_running(self, dag_maker): with dag_maker("test_dagrun_states_are_correct_1", max_active_runs=1, start_date=date) as dag: task1 = EmptyOperator(task_id="dummy_task") - dr1_running = dag_maker.create_dagrun(run_id="dr1_run_1", execution_date=date) + dag_version = DagVersion.get_latest_version(dag.dag_id) + dr1_running = dag_maker.create_dagrun( + run_id="dr1_run_1", execution_date=date, dag_version=dag_version + ) data_interval = dag.infer_automated_data_interval(logical_date) dag_maker.create_dagrun( run_id="dr1_run_2", state=State.QUEUED, + dag_version=dag_version, execution_date=dag.next_dagrun_info( last_automated_dagrun=data_interval, restricted=False ).data_interval.start, @@ -5188,26 +5188,48 @@ def test_no_dagruns_would_stuck_in_running(self, dag_maker): date = timezone.datetime(2020, 1, 1) with dag_maker("test_dagrun_states_are_correct_2", start_date=date) as dag: EmptyOperator(task_id="dummy_task") + dag_version = DagVersion.get_latest_version(dag.dag_id) for i in range(16): - dr = dag_maker.create_dagrun(run_id=f"dr2_run_{i+1}", state=State.RUNNING, execution_date=date) + dr = dag_maker.create_dagrun( + run_id=f"dr2_run_{i+1}", + state=State.RUNNING, + execution_date=date, + dag_version=dag_version, + ) date = dr.execution_date + timedelta(hours=1) dr16 = DagRun.find(run_id="dr2_run_16") date = dr16[0].execution_date + timedelta(hours=1) for i in range(16, 32): - dr = dag_maker.create_dagrun(run_id=f"dr2_run_{i+1}", state=State.QUEUED, execution_date=date) + dr = dag_maker.create_dagrun( + run_id=f"dr2_run_{i+1}", + state=State.QUEUED, + execution_date=date, + 
dag_version=dag_version, + ) date = dr.execution_date + timedelta(hours=1) # third dag and dagruns date = timezone.datetime(2021, 1, 1) with dag_maker("test_dagrun_states_are_correct_3", start_date=date) as dag: EmptyOperator(task_id="dummy_task") + dag_version = DagVersion.get_latest_version(dag.dag_id) for i in range(16): - dr = dag_maker.create_dagrun(run_id=f"dr3_run_{i+1}", state=State.RUNNING, execution_date=date) + dr = dag_maker.create_dagrun( + run_id=f"dr3_run_{i+1}", + state=State.RUNNING, + execution_date=date, + dag_version=dag_version, + ) date = dr.execution_date + timedelta(hours=1) dr16 = DagRun.find(run_id="dr3_run_16") date = dr16[0].execution_date + timedelta(hours=1) for i in range(16, 32): - dr = dag_maker.create_dagrun(run_id=f"dr2_run_{i+1}", state=State.QUEUED, execution_date=date) + dr = dag_maker.create_dagrun( + run_id=f"dr2_run_{i+1}", + state=State.QUEUED, + execution_date=date, + dag_version=dag_version, + ) date = dr.execution_date + timedelta(hours=1) scheduler_job = Job() @@ -5486,11 +5508,17 @@ def test_runs_respected_after_clear(self, dag_maker): self.job_runner = SchedulerJobRunner(job=scheduler_job, subdir=os.devnull) self.job_runner.processor_agent = mock.MagicMock() - + dag_version = DagVersion.get_latest_version(dag.dag_id) session = settings.Session() - dr = dag_maker.create_dagrun(run_type=DagRunType.SCHEDULED, state=State.QUEUED) - dr = dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) - dag_maker.create_dagrun_after(dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED) + dr = dag_maker.create_dagrun( + run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) + dr = dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) + dag_maker.create_dagrun_after( + dr, run_type=DagRunType.SCHEDULED, state=State.QUEUED, dag_version=dag_version + ) dag.clear() assert len(DagRun.find(dag_id=dag.dag_id, state=State.QUEUED, session=session)) == 3 @@ -5816,6 +5844,7 @@ def test_find_zombies_handle_failure_callbacks_are_correctly_passed_to_dag_proce assert expected_failure_callback_requests[0] == callback_requests[0] def test_cleanup_stale_dags(self): + clear_db_dags() dagbag = DagBag(TEST_DAG_FOLDER, read_dags_from_db=False) with create_session() as session: dag = dagbag.get_dag("test_example_bash_operator") diff --git a/tests/models/test_dag.py b/tests/models/test_dag.py index e38beb2110ca9..fc7a2b24b836a 100644 --- a/tests/models/test_dag.py +++ b/tests/models/test_dag.py @@ -62,6 +62,7 @@ dag as dag_decorator, get_asset_triggered_next_run_info, ) +from airflow.models.dag_version import DagVersion from airflow.models.dagrun import DagRun from airflow.models.param import DagParam, Param from airflow.models.serialized_dag import SerializedDagModel @@ -141,14 +142,11 @@ def setup_method(self) -> None: clear_db_runs() clear_db_dags() clear_db_assets() - self.patcher_dag_code = mock.patch("airflow.models.dag.DagCode.bulk_sync_to_db") - self.patcher_dag_code.start() def teardown_method(self) -> None: clear_db_runs() clear_db_dags() clear_db_assets() - self.patcher_dag_code.stop() @staticmethod def _clean_up(dag_id: str): @@ -1039,14 +1037,16 @@ def test_existing_dag_is_paused_config(self): assert dag.max_consecutive_failed_dag_runs == 2 def test_existing_dag_is_paused_after_limit(self): - def add_failed_dag_run(id, execution_date): + def add_failed_dag_run(dag, id, execution_date): triggered_by_kwargs = {"triggered_by": DagRunTriggeredByType.TEST} if 
AIRFLOW_V_3_0_PLUS else {} + dag_v = DagVersion.get_latest_version(dag_id=dag.dag_id) dr = dag.create_dagrun( run_type=DagRunType.MANUAL, run_id="run_id_" + id, execution_date=execution_date, state=State.FAILED, data_interval=(execution_date, execution_date), + dag_version=dag_v, **triggered_by_kwargs, ) ti_op1 = dr.get_task_instance(task_id=op1.task_id, session=session) @@ -1059,14 +1059,16 @@ def add_failed_dag_run(id, execution_date): dag.add_task(op1) session = settings.Session() dag.sync_to_db(session=session) + SerializedDagModel.write_dag(dag) assert not dag.get_is_paused() # dag should be paused after 2 failed dag_runs add_failed_dag_run( + dag, "1", TEST_DATE, ) - add_failed_dag_run("2", TEST_DATE + timedelta(days=1)) + add_failed_dag_run(dag, "2", TEST_DATE + timedelta(days=1)) assert dag.get_is_paused() dag.clear() self._clean_up(dag_id) @@ -1085,8 +1087,7 @@ def test_dag_is_deactivated_upon_dagfile_deletion(self): dag = DAG(dag_id, schedule=None, is_paused_upon_creation=True) dag.fileloc = dag_fileloc session = settings.Session() - with mock.patch("airflow.models.dag.DagCode.bulk_sync_to_db"): - dag.sync_to_db(session=session, processor_subdir="/usr/local/airflow/dags/") + dag.sync_to_db(session=session, processor_subdir="/usr/local/airflow/dags/") orm_dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).one() @@ -2370,9 +2371,8 @@ def test_relative_fileloc_serialized( """ dag = DAG(dag_id="test", schedule=None) dag.fileloc = fileloc - sdm = SerializedDagModel(dag) - session.add(sdm) - session.commit() + dag.sync_to_db() + SerializedDagModel.write_dag(dag) session.expunge_all() sdm = SerializedDagModel.get(dag.dag_id, session) dag = sdm.dag @@ -2383,8 +2383,10 @@ def test__processor_dags_folder(self, session): """Only populated after deserializtion""" dag = DAG(dag_id="test", schedule=None) dag.fileloc = "/abc/test.py" + dag.sync_to_db() assert dag._processor_dags_folder is None - sdm = SerializedDagModel(dag) + SerializedDagModel.write_dag(dag) + sdm = SerializedDagModel.get(dag.dag_id, session) assert sdm.dag._processor_dags_folder == settings.DAGS_FOLDER @pytest.mark.need_serialized_dag diff --git a/tests/models/test_dag_version.py b/tests/models/test_dag_version.py new file mode 100644 index 0000000000000..42a33b4b66f13 --- /dev/null +++ b/tests/models/test_dag_version.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+from __future__ import annotations
+
+import pytest
+from sqlalchemy import func, select
+
+from airflow.models.dag_version import DagVersion
+from airflow.models.serialized_dag import SerializedDagModel
+from airflow.operators.empty import EmptyOperator
+
+from tests_common.test_utils.db import clear_db_dags
+
+pytestmark = [pytest.mark.db_test, pytest.mark.skip_if_database_isolation_mode]
+
+
+class TestDagVersion:
+    def setup_method(self):
+        clear_db_dags()
+
+    def teardown_method(self):
+        clear_db_dags()
+
+    @pytest.mark.need_serialized_dag
+    def test_writing_dag_version(self, dag_maker, session):
+        with dag_maker("test_writing_dag_version") as dag:
+            pass
+
+        latest_version = DagVersion.get_latest_version(dag.dag_id)
+        assert latest_version.version_number == 1
+        assert not latest_version.version_name
+        assert latest_version.dag_id == dag.dag_id
+
+    @pytest.mark.need_serialized_dag
+    def test_writing_dag_version_with_version_name(self, dag_maker, session):
+        version_name = "my_version"
+        with dag_maker(version_name=version_name) as dag:
+            pass
+
+        latest_version = DagVersion.get_latest_version(dag.dag_id)
+        assert latest_version.version_number == 1
+        assert latest_version.version_name == version_name
+        assert latest_version.dag_id == dag.dag_id
+
+    def test_writing_dag_version_with_changes(self, dag_maker, session):
+        """This also tests the get_latest_version method"""
+        version_name = "my_version"
+        with dag_maker("test1", version_name=version_name) as dag:
+            EmptyOperator(task_id="task1")
+        dag.sync_to_db()
+        SerializedDagModel.write_dag(dag)
+        # Add extra task to change the dag
+        with dag_maker("test1", version_name=version_name) as dag2:
+            EmptyOperator(task_id="task1")
+            EmptyOperator(task_id="task2")
+        dag2.sync_to_db()
+        SerializedDagModel.write_dag(dag2)
+
+        latest_version = DagVersion.get_latest_version(dag.dag_id)
+        assert latest_version.version_number == 2
+        assert latest_version.version_name == version_name
+        assert 2 == session.scalar(select(func.count()).where(DagVersion.dag_id == dag.dag_id))
+
+    @pytest.mark.need_serialized_dag
+    def test_get_version(self, dag_maker, session):
+        """The two dags have the same version name and number but different dag ids"""
+        version_name = "my_version"
+        dag1_id = "test1"
+        with dag_maker(dag1_id, version_name=version_name):
+            EmptyOperator(task_id="task1")
+
+        with dag_maker("test2", version_name=version_name):
+            EmptyOperator(task_id="task1")
+
+        with dag_maker("test3"):
+            EmptyOperator(task_id="task1")
+
+        version = DagVersion.get_version(dag1_id)
+        assert version.version_number == 1
+        assert version.version_name == version_name
+        assert version.dag_id == dag1_id
+        assert version.version == "my_version-1"
+
+    @pytest.mark.need_serialized_dag
+    def test_version_property(self, dag_maker):
+        version_name = "my_version"
+        with dag_maker("test1", version_name=version_name) as dag:
+            pass
+
+        latest_version = DagVersion.get_latest_version(dag.dag_id)
+        assert latest_version.version == f"{version_name}-1"
+
+    @pytest.mark.need_serialized_dag
+    def test_version_property_with_null_version_name(self, dag_maker):
+        with dag_maker("test1") as dag:
+            pass
+
+        latest_version = DagVersion.get_latest_version(dag.dag_id)
+        assert latest_version.version == "1"
diff --git a/tests/models/test_dagbag.py b/tests/models/test_dagbag.py
index d91f6738822a1..6915e4df0c1fc 100644
--- a/tests/models/test_dagbag.py
+++ b/tests/models/test_dagbag.py
@@ -64,10 +64,10 @@ def db_clean_up():
 
 
 class TestDagBag:
-    def setup_class(self):
+    def setup_class(cls):
db_clean_up() - def teardown_class(self): + def teardown_class(cls): db_clean_up() def test_get_existing_dag(self, tmp_path): @@ -723,6 +723,7 @@ def _sync_to_db(): dagbag.sync_to_db(session=session) dag = dagbag.dags["test_example_bash_operator"] + dag.sync_to_db() _sync_to_db() mock_sync_perm_for_dag.assert_called_once_with(dag, session=session) @@ -820,6 +821,7 @@ def test_get_dag_with_dag_serialization(self): with time_machine.travel((tz.datetime(2020, 1, 5, 0, 0, 0)), tick=False): example_bash_op_dag = DagBag(include_examples=True).dags.get("example_bash_operator") + example_bash_op_dag.sync_to_db() SerializedDagModel.write_dag(dag=example_bash_op_dag) dag_bag = DagBag(read_dags_from_db=True) @@ -837,6 +839,7 @@ def test_get_dag_with_dag_serialization(self): # Make a change in the DAG and write Serialized DAG to the DB with time_machine.travel((tz.datetime(2020, 1, 5, 0, 0, 6)), tick=False): example_bash_op_dag.tags.add("new_tag") + example_bash_op_dag.sync_to_db() SerializedDagModel.write_dag(dag=example_bash_op_dag) # Since min_serialized_dag_fetch_interval is passed verify that calling 'dag_bag.get_dag' @@ -852,15 +855,16 @@ def test_get_dag_with_dag_serialization(self): @pytest.mark.skip_if_database_isolation_mode # Does not work in db isolation mode @patch("airflow.models.dagbag.settings.MIN_SERIALIZED_DAG_UPDATE_INTERVAL", 5) @patch("airflow.models.dagbag.settings.MIN_SERIALIZED_DAG_FETCH_INTERVAL", 5) - def test_get_dag_refresh_race_condition(self): + def test_get_dag_refresh_race_condition(self, session): """ Test that DagBag.get_dag correctly refresh the Serialized DAG even if SerializedDagModel.last_updated is before DagBag.dags_last_fetched. """ - + db_clean_up() # serialize the initial version of the DAG with time_machine.travel((tz.datetime(2020, 1, 5, 0, 0, 0)), tick=False): example_bash_op_dag = DagBag(include_examples=True).dags.get("example_bash_operator") + example_bash_op_dag.sync_to_db() SerializedDagModel.write_dag(dag=example_bash_op_dag) # deserialize the DAG @@ -886,6 +890,7 @@ def test_get_dag_refresh_race_condition(self): # long before the transaction is committed with time_machine.travel((tz.datetime(2020, 1, 5, 1, 0, 0)), tick=False): example_bash_op_dag.tags.add("new_tag") + example_bash_op_dag.sync_to_db() SerializedDagModel.write_dag(dag=example_bash_op_dag) # Since min_serialized_dag_fetch_interval is passed verify that calling 'dag_bag.get_dag' @@ -906,6 +911,7 @@ def test_collect_dags_from_db(self): example_dags = dagbag.dags for dag in example_dags.values(): + dag.sync_to_db() SerializedDagModel.write_dag(dag) new_dagbag = DagBag(read_dags_from_db=True) diff --git a/tests/models/test_dagcode.py b/tests/models/test_dagcode.py index 26b29ea8f9c02..fd7d761f9103f 100644 --- a/tests/models/test_dagcode.py +++ b/tests/models/test_dagcode.py @@ -17,7 +17,6 @@ # under the License. from __future__ import annotations -from datetime import timedelta from unittest.mock import patch import pytest @@ -25,13 +24,16 @@ import airflow.example_dags as example_dags_module from airflow.exceptions import AirflowException from airflow.models import DagBag +from airflow.models.dag import DAG +from airflow.models.dag_version import DagVersion from airflow.models.dagcode import DagCode +from airflow.models.serialized_dag import SerializedDagModel as SDM # To move it to a shared module. 
from airflow.utils.file import open_maybe_zipped from airflow.utils.session import create_session -from tests_common.test_utils.db import clear_db_dag_code +from tests_common.test_utils.db import clear_db_dag_code, clear_db_dags pytestmark = [pytest.mark.db_test, pytest.mark.skip_if_database_isolation_mode] @@ -39,6 +41,7 @@ def make_example_dags(module): """Loads DAGs from a module for test.""" dagbag = DagBag(module.__path__[0]) + DAG.bulk_write_to_db(dagbag.dags.values()) return dagbag.dags @@ -46,55 +49,35 @@ class TestDagCode: """Unit tests for DagCode.""" def setup_method(self): + clear_db_dags() clear_db_dag_code() def teardown_method(self): + clear_db_dags() clear_db_dag_code() def _write_two_example_dags(self): example_dags = make_example_dags(example_dags_module) bash_dag = example_dags["example_bash_operator"] - DagCode(bash_dag.fileloc).sync_to_db() + dag_version = DagVersion.get_latest_version("example_bash_operator") + DagCode(dag_version, bash_dag.fileloc).sync_to_db() xcom_dag = example_dags["example_xcom"] - DagCode(xcom_dag.fileloc).sync_to_db() + dag_version = DagVersion.get_latest_version("example_xcom") + DagCode(dag_version, xcom_dag.fileloc).sync_to_db() return [bash_dag, xcom_dag] def _write_example_dags(self): example_dags = make_example_dags(example_dags_module) for dag in example_dags.values(): - dag.sync_to_db() + SDM.write_dag(dag) return example_dags - def test_sync_to_db(self): + def test_write_to_db(self): """Dg code can be written into database.""" example_dags = self._write_example_dags() self._compare_example_dags(example_dags) - def test_bulk_sync_to_db(self): - """Dg code can be bulk written into database.""" - example_dags = make_example_dags(example_dags_module) - files = [dag.fileloc for dag in example_dags.values()] - with create_session() as session: - DagCode.bulk_sync_to_db(files, session=session) - session.commit() - - self._compare_example_dags(example_dags) - - def test_bulk_sync_to_db_half_files(self): - """Dg code can be bulk written into database.""" - example_dags = make_example_dags(example_dags_module) - files = [dag.fileloc for dag in example_dags.values()] - half_files = files[: len(files) // 2] - with create_session() as session: - DagCode.bulk_sync_to_db(half_files, session=session) - session.commit() - with create_session() as session: - DagCode.bulk_sync_to_db(files, session=session) - session.commit() - - self._compare_example_dags(example_dags) - @patch.object(DagCode, "dag_fileloc_hash") def test_detecting_duplicate_key(self, mock_hash): """Dag code detects duplicate key.""" @@ -112,6 +95,8 @@ def _compare_example_dags(self, example_dags): session.query(DagCode.fileloc, DagCode.fileloc_hash, DagCode.source_code) .filter(DagCode.fileloc == dag.fileloc) .filter(DagCode.fileloc_hash == dag_fileloc_hash) + .order_by(DagCode.last_updated.desc()) + .limit(1) .one() ) @@ -126,7 +111,7 @@ def test_code_can_be_read_when_no_access_to_file(self): Source Code should at least exist in one of DB or File. 
""" example_dag = make_example_dags(example_dags_module).get("example_bash_operator") - example_dag.sync_to_db() + SDM.write_dag(example_dag) # Mock that there is no access to the Dag File with patch("airflow.models.dagcode.open_maybe_zipped") as mock_open: @@ -136,27 +121,50 @@ def test_code_can_be_read_when_no_access_to_file(self): for test_string in ["example_bash_operator", "also_run_this", "run_this_last"]: assert test_string in dag_code - def test_db_code_updated_on_dag_file_change(self): - """Test if DagCode is updated in DB when DAG file is changed""" + def test_db_code_created_on_serdag_change(self, session): + """Test new DagCode is created in DB when DAG file is changed""" example_dag = make_example_dags(example_dags_module).get("example_bash_operator") - example_dag.sync_to_db() - - with create_session() as session: - result = session.query(DagCode).filter(DagCode.fileloc == example_dag.fileloc).one() - - assert result.fileloc == example_dag.fileloc - assert result.source_code is not None + SDM.write_dag(example_dag) - with patch("airflow.models.dagcode.os.path.getmtime") as mock_mtime: - mock_mtime.return_value = (result.last_updated + timedelta(seconds=1)).timestamp() + result = ( + session.query(DagCode) + .filter(DagCode.fileloc == example_dag.fileloc) + .order_by(DagCode.last_updated.desc()) + .limit(1) + .one() + ) - with patch("airflow.models.dagcode.DagCode._get_code_from_file") as mock_code: - mock_code.return_value = "# dummy code" - example_dag.sync_to_db() + assert result.fileloc == example_dag.fileloc + assert result.source_code is not None - with create_session() as session: - new_result = session.query(DagCode).filter(DagCode.fileloc == example_dag.fileloc).one() - - assert new_result.fileloc == example_dag.fileloc - assert new_result.source_code == "# dummy code" - assert new_result.last_updated > result.last_updated + example_dag = make_example_dags(example_dags_module).get("example_bash_operator") + SDM.write_dag(example_dag, processor_subdir="/tmp") + with patch("airflow.models.dagcode.DagCode._get_code_from_file") as mock_code: + mock_code.return_value = "# dummy code" + SDM.write_dag(example_dag) + + new_result = ( + session.query(DagCode) + .filter(DagCode.fileloc == example_dag.fileloc) + .order_by(DagCode.last_updated.desc()) + .limit(1) + .one() + ) + + assert new_result.fileloc == example_dag.fileloc + assert new_result.source_code != result.source_code + assert new_result.last_updated > result.last_updated + + def test_has_dag(self, dag_maker): + """Test has_dag method.""" + with dag_maker("test_has_dag") as dag: + pass + dag.sync_to_db() + SDM.write_dag(dag) + + with dag_maker() as dag2: + pass + dag2.sync_to_db() + SDM.write_dag(dag2) + + assert DagCode.has_dag(dag.fileloc) diff --git a/tests/models/test_dagrun.py b/tests/models/test_dagrun.py index e8889fb102236..dc5a8ab66e566 100644 --- a/tests/models/test_dagrun.py +++ b/tests/models/test_dagrun.py @@ -88,6 +88,7 @@ def _clean_db(): db.clear_db_variables() db.clear_db_assets() db.clear_db_xcom() + db.clear_db_dags() def create_dag_run( self, diff --git a/tests/models/test_serialized_dag.py b/tests/models/test_serialized_dag.py index cdff883760c92..d0bfe37a69cc0 100644 --- a/tests/models/test_serialized_dag.py +++ b/tests/models/test_serialized_dag.py @@ -23,7 +23,7 @@ import pendulum import pytest -from sqlalchemy import select +from sqlalchemy import func, select import airflow.example_dags as example_dags_module from airflow.assets import Asset @@ -31,6 +31,7 @@ from airflow.models.dagbag 
import DagBag from airflow.models.dagcode import DagCode from airflow.models.serialized_dag import SerializedDagModel as SDM +from airflow.operators.empty import EmptyOperator from airflow.providers.standard.operators.bash import BashOperator from airflow.serialization.serialized_objects import SerializedDAG from airflow.settings import json @@ -47,6 +48,7 @@ def make_example_dags(module): """Loads DAGs from a module for test.""" dagbag = DagBag(module.__path__[0]) + DAG.bulk_write_to_db(dagbag.dags.values()) return dagbag.dags @@ -61,6 +63,7 @@ class TestSerializedDagModel: ], ) def setup_test_cases(self, request, monkeypatch): + db.clear_db_dags() db.clear_db_serialized_dags() with mock.patch("airflow.models.serialized_dag.COMPRESS_SERIALIZED_DAGS", request.param): yield @@ -86,7 +89,7 @@ def test_write_dag(self): assert SDM.has_dag(dag.dag_id) result = session.query(SDM).filter(SDM.dag_id == dag.dag_id).one() - assert result.fileloc == dag.fileloc + assert result.dag_version.dag_code.fileloc == dag.fileloc # Verifies JSON schema. SerializedDAG.validate_schema(result.data) @@ -98,29 +101,28 @@ def test_serialized_dag_is_updated_if_dag_is_changed(self): dag_updated = SDM.write_dag(dag=example_bash_op_dag) assert dag_updated is True - with create_session() as session: - s_dag = session.get(SDM, example_bash_op_dag.dag_id) + s_dag = SDM.get(example_bash_op_dag.dag_id) - # Test that if DAG is not changed, Serialized DAG is not re-written and last_updated - # column is not updated - dag_updated = SDM.write_dag(dag=example_bash_op_dag) - s_dag_1 = session.get(SDM, example_bash_op_dag.dag_id) + # Test that if DAG is not changed, Serialized DAG is not re-written and last_updated + # column is not updated + dag_updated = SDM.write_dag(dag=example_bash_op_dag) + s_dag_1 = SDM.get(example_bash_op_dag.dag_id) - assert s_dag_1.dag_hash == s_dag.dag_hash - assert s_dag.last_updated == s_dag_1.last_updated - assert dag_updated is False + assert s_dag_1.dag_hash == s_dag.dag_hash + assert s_dag.last_updated == s_dag_1.last_updated + assert dag_updated is False - # Update DAG - example_bash_op_dag.tags.add("new_tag") - assert example_bash_op_dag.tags == {"example", "example2", "new_tag"} + # Update DAG + example_bash_op_dag.tags.add("new_tag") + assert example_bash_op_dag.tags == {"example", "example2", "new_tag"} - dag_updated = SDM.write_dag(dag=example_bash_op_dag) - s_dag_2 = session.get(SDM, example_bash_op_dag.dag_id) + dag_updated = SDM.write_dag(dag=example_bash_op_dag) + s_dag_2 = SDM.get(example_bash_op_dag.dag_id) - assert s_dag.last_updated != s_dag_2.last_updated - assert s_dag.dag_hash != s_dag_2.dag_hash - assert s_dag_2.data["dag"]["tags"] == ["example", "example2", "new_tag"] - assert dag_updated is True + assert s_dag.last_updated != s_dag_2.last_updated + assert s_dag.dag_hash != s_dag_2.dag_hash + assert s_dag_2.data["dag"]["tags"] == ["example", "example2", "new_tag"] + assert dag_updated is True @pytest.mark.skip_if_database_isolation_mode # Does not work in db isolation mode def test_serialized_dag_is_updated_if_processor_subdir_changed(self): @@ -131,12 +133,12 @@ def test_serialized_dag_is_updated_if_processor_subdir_changed(self): assert dag_updated is True with create_session() as session: - s_dag = session.get(SDM, example_bash_op_dag.dag_id) + s_dag = SDM.get(example_bash_op_dag.dag_id) # Test that if DAG is not changed, Serialized DAG is not re-written and last_updated # column is not updated dag_updated = SDM.write_dag(dag=example_bash_op_dag, processor_subdir="/tmp/test") 
- s_dag_1 = session.get(SDM, example_bash_op_dag.dag_id) + s_dag_1 = SDM.get(example_bash_op_dag.dag_id) assert s_dag_1.dag_hash == s_dag.dag_hash assert s_dag.last_updated == s_dag_1.last_updated @@ -145,7 +147,7 @@ def test_serialized_dag_is_updated_if_processor_subdir_changed(self): # Update DAG dag_updated = SDM.write_dag(dag=example_bash_op_dag, processor_subdir="/tmp/other") - s_dag_2 = session.get(SDM, example_bash_op_dag.dag_id) + s_dag_2 = SDM.get(example_bash_op_dag.dag_id) assert s_dag.processor_subdir != s_dag_2.processor_subdir assert dag_updated is True @@ -162,6 +164,19 @@ def test_read_dags(self): assert serialized_dag.dag_id == dag.dag_id assert set(serialized_dag.task_dict) == set(dag.task_dict) + @pytest.mark.skip_if_database_isolation_mode # Does not work in db isolation mode + def test_read_all_dags_only_picks_the_latest_serdags(self, session): + example_dags = self._write_example_dags() + serialized_dags = SDM.read_all_dags() + assert len(example_dags) == len(serialized_dags) + + ex_dags = make_example_dags(example_dags_module) + SDM.write_dag(ex_dags.get("example_bash_operator"), processor_subdir="/tmp/") + serialized_dags2 = SDM.read_all_dags() + sdags = session.query(SDM).all() + # assert only the latest SDM is returned + assert len(sdags) != len(serialized_dags2) + @pytest.mark.skip_if_database_isolation_mode # Does not work in db isolation mode def test_remove_dags_by_id(self): """DAGs can be removed from database.""" @@ -190,26 +205,12 @@ def test_bulk_sync_to_db(self): DAG("dag_2", schedule=None), DAG("dag_3", schedule=None), ] - with assert_queries_count(10): + DAG.bulk_write_to_db(dags) + # we also write to dag_version and dag_code tables + # in dag_version, we search for unique version_name too + with assert_queries_count(24): SDM.bulk_sync_to_db(dags) - @pytest.mark.skip_if_database_isolation_mode # Does not work in db isolation mode - @pytest.mark.parametrize("dag_dependencies_fields", [{"dag_dependencies": None}, {}]) - def test_get_dag_dependencies_default_to_empty(self, dag_dependencies_fields): - """Test a pre-2.1.0 serialized DAG can deserialize DAG dependencies.""" - example_dags = make_example_dags(example_dags_module) - - with create_session() as session: - sdms = [SDM(dag) for dag in example_dags.values()] - # Simulate pre-2.1.0 format. 
- for sdm in sdms: - del sdm.data["dag"]["dag_dependencies"] - sdm.data["dag"].update(dag_dependencies_fields) - session.bulk_save_objects(sdms) - - expected_dependencies = {dag_id: [] for dag_id in example_dags} - assert SDM.get_dag_dependencies() == expected_dependencies - @pytest.mark.skip_if_database_isolation_mode # Does not work in db isolation mode def test_order_of_dag_params_is_stable(self): """ @@ -284,3 +285,31 @@ def get_hash_set(): first_hashes = get_hash_set() # assert that the hashes are the same assert first_hashes == get_hash_set() + + def test_get_latest_serdag_versions(self, dag_maker, session): + # first dag + with dag_maker("dag1") as dag: + EmptyOperator(task_id="task1") + dag.sync_to_db() + SDM.write_dag(dag) + with dag_maker("dag1") as dag: + EmptyOperator(task_id="task1") + EmptyOperator(task_id="task2") + dag.sync_to_db() + SDM.write_dag(dag) + # second dag + with dag_maker("dag2") as dag: + EmptyOperator(task_id="task1") + dag.sync_to_db() + SDM.write_dag(dag) + with dag_maker("dag2") as dag: + EmptyOperator(task_id="task1") + EmptyOperator(task_id="task2") + dag.sync_to_db() + SDM.write_dag(dag) + + # Total serdags should be 4 + assert session.scalar(select(func.count()).select_from(SDM)) == 4 + + latest_versions = SDM.get_latest_serialized_dags(dag_ids=["dag1", "dag2"], session=session) + assert len(latest_versions) == 2 diff --git a/tests/models/test_taskinstance.py b/tests/models/test_taskinstance.py index 8a1df0594e4e9..36b2d22f60fdc 100644 --- a/tests/models/test_taskinstance.py +++ b/tests/models/test_taskinstance.py @@ -103,7 +103,7 @@ from tests_common.test_utils import db from tests_common.test_utils.compat import AIRFLOW_V_3_0_PLUS from tests_common.test_utils.config import conf_vars -from tests_common.test_utils.db import clear_db_connections, clear_db_runs +from tests_common.test_utils.db import clear_db_connections, clear_db_dags, clear_db_runs from tests_common.test_utils.mock_operators import MockOperator if AIRFLOW_V_3_0_PLUS: @@ -2992,6 +2992,7 @@ def test_changing_of_asset_when_adrq_is_already_populated(self, dag_maker): Test that when a task that produces asset has ran, that changing the consumer dag asset will not cause primary key blank-out """ + clear_db_dags() from airflow.assets import Asset with dag_maker(schedule=None, serialized=True) as dag1: @@ -4013,6 +4014,7 @@ def test_refresh_from_db(self, create_task_instance): "next_method": None, "updated_at": None, "task_display_name": "Test Refresh from DB Task", + "dag_version_id": None, } # Make sure we aren't missing any new value in our expected_values list. 
expected_keys = {f"task_instance.{key}" for key in expected_values} diff --git a/tests/operators/test_trigger_dagrun.py b/tests/operators/test_trigger_dagrun.py index 52a11d10e5e33..0b40154b21b05 100644 --- a/tests/operators/test_trigger_dagrun.py +++ b/tests/operators/test_trigger_dagrun.py @@ -28,8 +28,6 @@ from airflow.models.dag import DagModel from airflow.models.dagbag import DagBag from airflow.models.dagrun import DagRun -from airflow.models.log import Log -from airflow.models.serialized_dag import SerializedDagModel from airflow.models.taskinstance import TaskInstance from airflow.operators.trigger_dagrun import TriggerDagRunOperator from airflow.settings import TracebackSessionForTests @@ -39,6 +37,8 @@ from airflow.utils.state import DagRunState, State, TaskInstanceState from airflow.utils.types import DagRunType +from tests_common.test_utils.db import clear_db_dags, clear_db_logs, clear_db_runs + pytestmark = pytest.mark.db_test DEFAULT_DATE = datetime(2019, 1, 1, tzinfo=timezone.utc) @@ -82,12 +82,9 @@ def re_sync_triggered_dag_to_db(self, dag, dag_maker): def teardown_method(self): """Cleanup state after testing in DB.""" - with create_session() as session: - session.query(Log).filter(Log.dag_id == TEST_DAG_ID).delete(synchronize_session=False) - for dbmodel in [DagModel, DagRun, TaskInstance, SerializedDagModel]: - session.query(dbmodel).filter(dbmodel.dag_id.in_([TRIGGERED_DAG_ID, TEST_DAG_ID])).delete( - synchronize_session=False - ) + clear_db_logs() + clear_db_runs() + clear_db_dags() # pathlib.Path(self._tmpfile).unlink() diff --git a/tests/sensors/test_external_task_sensor.py b/tests/sensors/test_external_task_sensor.py index 43911c1a41d48..e2246a2f75132 100644 --- a/tests/sensors/test_external_task_sensor.py +++ b/tests/sensors/test_external_task_sensor.py @@ -124,6 +124,7 @@ def add_fake_task_group(self, target_states=None): with self.dag as dag: with TaskGroup(group_id=TEST_TASK_GROUP_ID) as task_group: _ = [EmptyOperator(task_id=f"task{i}") for i in range(len(target_states))] + dag.sync_to_db() SerializedDagModel.write_dag(dag) for idx, task in enumerate(task_group): @@ -146,7 +147,7 @@ def fake_mapped_task(x: int): fake_task() fake_mapped_task.expand(x=list(map_indexes)) - + dag.sync_to_db() SerializedDagModel.write_dag(dag) for task in task_group: diff --git a/tests/utils/test_db_cleanup.py b/tests/utils/test_db_cleanup.py index 47e93c1616d63..5df2c37cddb32 100644 --- a/tests/utils/test_db_cleanup.py +++ b/tests/utils/test_db_cleanup.py @@ -352,6 +352,7 @@ def test_no_models_missing(self): "rendered_task_instance_fields", # foreign key with TI "dag_priority_parsing_request", # Records are purged once per DAG Processing loop, not a # significant source of data. 
+ "dag_version", # self-maintaining } from airflow.utils.db_cleanup import config_dict diff --git a/tests/www/views/test_views_tasks.py b/tests/www/views/test_views_tasks.py index 19caafe55bc63..424fb02979cdc 100644 --- a/tests/www/views/test_views_tasks.py +++ b/tests/www/views/test_views_tasks.py @@ -28,9 +28,9 @@ import time_machine from airflow import settings -from airflow.models.dag import DAG, DagModel +from airflow.models.dag import DAG from airflow.models.dagbag import DagBag -from airflow.models.dagcode import DagCode +from airflow.models.serialized_dag import SerializedDagModel from airflow.models.taskinstance import TaskInstance from airflow.models.taskreschedule import TaskReschedule from airflow.models.xcom import XCom @@ -500,7 +500,7 @@ def test_code(admin_client): def test_code_from_db(admin_client): dag = DagBag(include_examples=True).get_dag("example_bash_operator") - DagCode(dag.fileloc, DagCode._get_code_from_file(dag.fileloc)).sync_to_db() + SerializedDagModel.write_dag(dag) url = "code?dag_id=example_bash_operator" resp = admin_client.get(url, follow_redirects=True) check_content_not_in_response("Failed to load DAG file Code", resp) @@ -510,7 +510,7 @@ def test_code_from_db(admin_client): def test_code_from_db_all_example_dags(admin_client): dagbag = DagBag(include_examples=True) for dag in dagbag.dags.values(): - DagCode(dag.fileloc, DagCode._get_code_from_file(dag.fileloc)).sync_to_db() + SerializedDagModel.write_dag(dag) url = "code?dag_id=example_bash_operator" resp = admin_client.get(url, follow_redirects=True) check_content_not_in_response("Failed to load DAG file Code", resp) @@ -614,23 +614,12 @@ def heartbeat(self): return True -@pytest.fixture -def new_id_example_bash_operator(): - dag_id = "example_bash_operator" - test_dag_id = "non_existent_dag" - with create_session() as session: - dag_query = session.query(DagModel).filter(DagModel.dag_id == dag_id) - dag_query.first().tags = [] # To avoid "FOREIGN KEY constraint" error) - with create_session() as session: - dag_query.update({"dag_id": test_dag_id}) - yield test_dag_id - with create_session() as session: - session.query(DagModel).filter(DagModel.dag_id == test_dag_id).update({"dag_id": dag_id}) - - -def test_delete_dag_button_for_dag_on_scheduler_only(admin_client, new_id_example_bash_operator): +def test_delete_dag_button_for_dag_on_scheduler_only(admin_client, dag_maker): + with dag_maker() as dag: + EmptyOperator(task_id="task") + dag.sync_to_db() # The delete-dag URL should be generated correctly - test_dag_id = new_id_example_bash_operator + test_dag_id = dag.dag_id resp = admin_client.get("/", follow_redirects=True) check_content_in_response(f"/delete?dag_id={test_dag_id}", resp) check_content_in_response(f"return confirmDeleteDag(this, '{test_dag_id}')", resp) @@ -1133,6 +1122,7 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, "run_after_loop": { "custom_operator_name": None, @@ -1169,6 +1159,7 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, "run_this_last": { "custom_operator_name": None, @@ -1205,6 +1196,7 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, "runme_0": { "custom_operator_name": None, @@ -1241,6 +1233,7 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": 
getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, "runme_1": { "custom_operator_name": None, @@ -1277,6 +1270,7 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, "runme_2": { "custom_operator_name": None, @@ -1313,6 +1307,7 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, "this_will_skip": { "custom_operator_name": None, @@ -1349,5 +1344,6 @@ def test_task_instances(admin_client): "try_number": 0, "unixname": getuser(), "updated_at": DEFAULT_DATE.isoformat(), + "dag_version_id": None, }, } diff --git a/tests_common/pytest_plugin.py b/tests_common/pytest_plugin.py index 2a35f79de14f5..2a14511ebf239 100644 --- a/tests_common/pytest_plugin.py +++ b/tests_common/pytest_plugin.py @@ -884,11 +884,30 @@ def __exit__(self, type, value, traceback): self.serialized_model = SerializedDagModel( dag, processor_subdir=self.dag_model.processor_subdir ) - self.session.merge(self.serialized_model) + sdm = SerializedDagModel.get(dag.dag_id, session=self.session) + from tests_common.test_utils.compat import AIRFLOW_V_3_0_PLUS + + if AIRFLOW_V_3_0_PLUS and not sdm: + from airflow.models.dag_version import DagVersion + from airflow.models.dagcode import DagCode + + dagv = DagVersion.write_dag( + dag_id=dag.dag_id, + session=self.session, + version_name=dag.version_name, + ) + dag_code = DagCode(dagv, dag.fileloc, "Source") + self.session.merge(dag_code) + self.serialized_model.dag_version = dagv + if self.want_activate_assets: + self._activate_assets() + if sdm: + self.serialized_model = sdm + else: + self.session.merge(self.serialized_model) serialized_dag = self._serialized_dag() self._bag_dag_compat(serialized_dag) - if AIRFLOW_V_3_0_PLUS and self.want_activate_assets: - self._activate_assets() + self.session.flush() else: self._bag_dag_compat(self.dag) @@ -1007,16 +1026,30 @@ def cleanup(self): return # To isolate problems here with problems from elsewhere on the session object self.session.rollback() - - self.session.query(SerializedDagModel).filter( - SerializedDagModel.dag_id.in_(dag_ids) - ).delete(synchronize_session=False) - self.session.query(DagRun).filter(DagRun.dag_id.in_(dag_ids)).delete( - synchronize_session=False, - ) - self.session.query(TaskInstance).filter(TaskInstance.dag_id.in_(dag_ids)).delete( - synchronize_session=False, - ) + from tests_common.test_utils.compat import AIRFLOW_V_3_0_PLUS + + if AIRFLOW_V_3_0_PLUS: + from airflow.models.dag_version import DagVersion + + self.session.query(DagRun).filter(DagRun.dag_id.in_(dag_ids)).delete( + synchronize_session=False, + ) + self.session.query(TaskInstance).filter(TaskInstance.dag_id.in_(dag_ids)).delete( + synchronize_session=False, + ) + self.session.query(DagVersion).filter(DagVersion.dag_id.in_(dag_ids)).delete( + synchronize_session=False + ) + else: + self.session.query(SerializedDagModel).filter( + SerializedDagModel.dag_id.in_(dag_ids) + ).delete(synchronize_session=False) + self.session.query(DagRun).filter(DagRun.dag_id.in_(dag_ids)).delete( + synchronize_session=False, + ) + self.session.query(TaskInstance).filter(TaskInstance.dag_id.in_(dag_ids)).delete( + synchronize_session=False, + ) self.session.query(XCom).filter(XCom.dag_id.in_(dag_ids)).delete( synchronize_session=False, ) diff --git a/tests_common/test_utils/db.py b/tests_common/test_utils/db.py index d37a8e942e111..14f1007af9a8b 100644 
--- a/tests_common/test_utils/db.py
+++ b/tests_common/test_utils/db.py
@@ -114,6 +114,7 @@ def clear_db_dags():
         session.query(DagTag).delete()
         session.query(DagOwnerAttributes).delete()
         session.query(DagModel).delete()
+        session.query(DagCode).delete()
 
 
 def drop_tables_with_prefix(prefix):
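
Reviewer note, not part of the patch: the test updates above all converge on one pattern, so a minimal sketch of it is included here for orientation. It is written against the existing dag_maker pytest fixture and the need_serialized_dag marker used throughout these tests; the test name and the dag_id "example_versioned_dag" are made up for illustration, and the assertions mirror ones already present in tests/models/test_dag_version.py.

    import pytest

    from airflow.models.dag_version import DagVersion
    from airflow.operators.empty import EmptyOperator
    from airflow.utils.state import State


    @pytest.mark.need_serialized_dag
    def test_run_uses_latest_dag_version(dag_maker):
        # Writing the serialized DAG (done by the dag_maker fixture under the
        # need_serialized_dag marker) also records DagVersion row 1 for this dag_id.
        with dag_maker("example_versioned_dag") as dag:
            EmptyOperator(task_id="task1")

        # Call sites now look up the latest DagVersion and attach it to the run.
        dag_version = DagVersion.get_latest_version(dag.dag_id)
        dr = dag_maker.create_dagrun(state=State.QUEUED, dag_version=dag_version)

        assert dag_version.version_number == 1
        assert dr.state == State.QUEUED

Re-serializing a changed DAG adds a new DagVersion (and DagCode) row rather than updating in place, which is why the dag-code assertions in this diff now order by DagCode.last_updated and take only the newest row.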