103 commits
639b754
Added separate force-vectors-to-RAM switch to Qdrant store.
Great-Frosty Mar 16, 2025
6f13a4f
Changelog updated for qdrant store.
Great-Frosty Aug 11, 2025
281bf5b
Phase 1: Add TransformInputOffsetTable infrastructure
halconel Oct 8, 2025
d23151e
Phase 2: Implement offset-based query optimization with runtime switc…
halconel Oct 8, 2025
926c00a
Phase 3: Add automatic offset updates after successful processing
halconel Oct 8, 2025
1ddef28
Add safe error handling for missing offset table
halconel Oct 8, 2025
5c8968e
Phase 4: Add offset initialization and fix delete tracking
halconel Oct 9, 2025
753df1d
Phase 5: Add performance tests with scalability analysis
halconel Oct 10, 2025
f6e5cc0
Fix CI errors: type annotations, SQL compatibility, and test assertions
halconel Oct 10, 2025
1e711ca
Refactor init_offsets docstring
halconel Oct 10, 2025
565f10d
Refactor optimization flag checking into helper methods
halconel Oct 10, 2025
979220b
Fix race condition in offset updates with atomic max operation
halconel Oct 10, 2025
120606e
[Looky-7769] fix: Add keep_existing=True to TransformMetaTable and en…
halconel Oct 13, 2025
6a7d084
[Looky-7769] docs: Update CHANGELOG with fixes from looky-crunch branch
halconel Oct 13, 2025
8a0cb2a
[Looky-7769] fix: Fix mypy type error for password parameter in Redis…
halconel Oct 13, 2025
e29fc35
Add TODO comment for future code removal
elephantum Oct 14, 2025
4fcb686
fix actions rules for from-the-fork PRs
elephantum Oct 14, 2025
e103943
Merge pull request #357 from halconel/Looky-7769/looky-crunch-fixes
elephantum Oct 14, 2025
e31b544
v0.14.5
elephantum Oct 14, 2025
2d959b2
fix version to v0.14.5
elephantum Oct 14, 2025
5f0bc64
[Looky-7769] fix: Add backward compatibility for DataTable API in Bat…
halconel Oct 15, 2025
029971f
[Looky-7769] fix: Fix linter errors and update CHANGELOG
halconel Oct 15, 2025
aede112
Merge branch 'master' into Looky-7769/fix-datatable-dt-attribute-error
halconel Oct 15, 2025
268136b
[Looky-7769] fix: Replace hasattr with isinstance and update type ann…
halconel Oct 15, 2025
dd59490
[Looky-7769] fix: Remove unnecessary cast and type ignore comment
halconel Oct 15, 2025
9553df6
[Looky-7769] fix: Use Sequence type and normalize input_dts for prope…
halconel Oct 15, 2025
30643d1
fix py3.9 compatibility
elephantum Oct 15, 2025
ec94889
mv changelog entry to new version
elephantum Oct 15, 2025
7292c74
increment version
elephantum Oct 15, 2025
03fa8d8
*
elephantum Oct 15, 2025
fbc726e
Merge pull request #358 from halconel/Looky-7769/fix-datatable-dt-att…
elephantum Oct 15, 2025
46285d3
Merge branch 'master' into feature/qdrant-force-to-ram
elephantum Oct 15, 2025
050f141
v0.14.6a2
elephantum Oct 15, 2025
01238b7
Merge pull request #350 from epoch8/feature/qdrant-force-to-ram
elephantum Oct 15, 2025
fae5b72
Merge remote-tracking branch 'origin/master' into pr/halconel/356
elephantum Oct 15, 2025
69b02ec
[Looky-7769] fix: allow DataTable objects directly in ComputeStep inp…
halconel Oct 23, 2025
58f6a54
[Looky-7769] docs: update CHANGELOG for ComputeStep input_dts fix
halconel Oct 23, 2025
1a847d0
Merge pull request #360 from halconel/Looky-7769/fix-compute-step-inp…
elephantum Oct 23, 2025
d796510
v0.14.6-alpha.3
elephantum Oct 23, 2025
93f2685
[Looky-7769] fix: pandas merge performance by filtered join
halconel Oct 27, 2025
ed77858
[Looky-7769] fix: include join_keys columns in idx for filtered join …
halconel Oct 27, 2025
40f2194
Changed calculation checksum and added storing pandas DataFrame to pa…
b0r3y Oct 28, 2025
5827559
Chore
b0r3y Oct 28, 2025
497adfa
[Looky-7769] feat: add comprehensive tests for multi-table filtered j…
halconel Oct 28, 2025
99353dc
[Looky-7769] fix: join with data-table to reach additional_columns
halconel Oct 28, 2025
740a1d6
lint + test fix
b0r3y Oct 28, 2025
ebf1bff
test fix
b0r3y Oct 28, 2025
95a2341
[Looky-7769] fix: implement reverse join for reference tables in filt…
halconel Oct 28, 2025
a45b918
[Looky-7769] fix: add type annotation for error_select_cols in sql_meta
halconel Oct 28, 2025
a6047fe
Added hash function to PILFile and BytesFile
b0r3y Oct 29, 2025
2eb4e1a
Lint fix
b0r3y Oct 29, 2025
e9e4533
Add examples for parquet
b0r3y Oct 29, 2025
183a66a
[Looky-7769] fix: create offsets for JoinSpec tables with join_keys d…
halconel Oct 29, 2025
f794999
[Looky-7769] feat: add test for three-table filtered join with v1 vs …
halconel Oct 29, 2025
3615768
rm use_adapter_hash parameter
elephantum Nov 4, 2025
8083b6b
*
elephantum Nov 4, 2025
04cbf0b
*
elephantum Nov 4, 2025
47f7ec6
Merge pull request #3 from halconel/Looky-7769/offsets-hybrid
halconel Nov 20, 2025
11efa84
Refactor ItemStoreFileAdapter hash function
b0r3y Nov 25, 2025
93369ed
Refactor ItemStoreFileAdapter hash function
b0r3y Nov 25, 2025
26ea05c
fix lint
b0r3y Nov 25, 2025
d5c4615
*
elephantum Nov 25, 2025
fa295e8
changelog
elephantum Nov 25, 2025
f69a1f0
Merge pull request #361 from epoch8:added-pandas-parquet
elephantum Nov 25, 2025
da879f2
source /home/elephantum/Epoch8/Datapipe/datapipe/.venv/bin/activate
elephantum Nov 25, 2025
8b76fd5
Add use_offset_optimization field to BatchTransform and DatatableBatc…
halconel Dec 2, 2025
f258a5b
[Looky-7769] fix: add comprehensive test suite and documentation for …
halconel Dec 11, 2025
3d91c56
[Looky-7769] fix: implement ORDER BY update_ts to prevent data loss i…
halconel Dec 11, 2025
7ccbc38
[Looky-7769] fix: change strict inequality to >= and add process_ts f…
halconel Dec 12, 2025
93eb568
[Looky-7769] fix: add warning when store_chunk is called with past ti…
halconel Dec 12, 2025
ab5db8f
[Looky-7769] docs: add offset optimization documentation and remove t…
halconel Dec 12, 2025
6fb128d
[Looky-7769] fix: correct test expectations in offset edge cases tests
halconel Dec 12, 2025
a83cf78
[Looky-7769] fix: literals are restricted in GROUP BY clause
halconel Dec 12, 2025
5a0eeae
[Looky-7769] fix: ensure atomic offset commit at end of run_full by p…
halconel Dec 17, 2025
d93d3bc
[Looky-7769] fix: rename variables in ChangeList.extend to resolve my…
halconel Dec 17, 2025
c297a00
[Looky-7769] fix: increase timing delays in flaky tests and skip conc…
halconel Dec 17, 2025
21a21e9
[Looky-7769] fix: skip custom ordering test for SQLite due to NULLS L…
halconel Dec 17, 2025
c050248
doc: add comprehensive offset optimization documentation
halconel Dec 25, 2025
e299b4b
doc: split offset optimization documentation into separate feature files
halconel Dec 25, 2025
f1d7e57
feat: add unique CTE naming for reusing same table with different joi…
halconel Dec 27, 2025
f89a094
feat: fix deleted records not being removed from output when using jo…
halconel Dec 27, 2025
69d422f
docs: remove offset optimization explanation files that should only e…
halconel Dec 27, 2025
3e9ece4
refactor: simplify verbose comments in offset optimization code
halconel Dec 27, 2025
c5bbc93
feat: add deduplication and cross join support for offset optimizatio…
halconel Dec 27, 2025
63e0cbd
refactor: reorganize offset optimization functions with symmetric nam…
halconel Dec 28, 2025
8743a43
fix: apply epsilon-adjusted offset in WHERE clause to prevent data lo…
halconel Dec 30, 2025
a4a9877
perf: replace OR with UNION in offset WHERE clauses to enable index u…
halconel Dec 30, 2025
598841b
fix: add table qualification to filters_idx WHERE clause to resolve a…
halconel Jan 27, 2026
58877fa
test: add tests for reverse join with filters_idx to verify ambiguous…
halconel Jan 27, 2026
2d16d61
fix: add type annotation for select_cols to satisfy mypy
halconel Jan 27, 2026
be21ff5
fix: pass tbl=None for JOIN queries in filters_idx to avoid incorrect…
halconel Jan 27, 2026
b3e7cc5
fix: filter out NaN values from join_keys before SQL IN clause to pre…
halconel Jan 27, 2026
554f7bd
fix: add label() to all qualified column names in JOIN SELECT to prev…
halconel Jan 28, 2026
5b1a19a
fix: disable offset optimization for changelist processing to prevent…
halconel Jan 28, 2026
43eaf2c
test: add production case for advertising campaign moderation with co…
halconel Jan 28, 2026
0e5b668
fix: use offset=0 for changelist to support reverse join and fix ambi…
halconel Jan 28, 2026
cb495f2
Revert "fix: disable offset optimization for changelist processing to…
halconel Jan 28, 2026
a1f1963
fix: use input idx for offset calculation when aggregation removes tr…
halconel Jan 29, 2026
cf8b966
perf: replace UNION with UNION ALL and exclude duplicates in WHERE to…
halconel Jan 30, 2026
4a0d33a
fix: filter NaN values in sql_apply_idx_filter_to_table to prevent VA…
halconel Jan 30, 2026
61be4a9
feat: add offset diagnostic logging and max_records_per_run limit to …
halconel Feb 4, 2026
5563813
fix: add missing max_records_per_run parameter to BatchTransformStep.…
halconel Feb 6, 2026
53e3ea3
fix: preserve composite join key pairs in filtered join
halconel Apr 15, 2026
9 changes: 9 additions & 0 deletions .github/workflows/lint.yaml
@@ -2,6 +2,15 @@ name: Run linters

on:
push:
branches:
- "master"
paths:
- ".github/workflows/test.yaml"
- "datapipe/**"
- "tests/**"
- "pyproject.toml"

pull_request:
branches:
- "**"
paths:
11 changes: 10 additions & 1 deletion .github/workflows/test.yaml
@@ -2,6 +2,15 @@ name: Run tests

on:
push:
branches:
- "master"
paths:
- ".github/workflows/test.yaml"
- "datapipe/**"
- "tests/**"
- "pyproject.toml"

pull_request:
branches:
- "**"
paths:
@@ -111,7 +120,7 @@ jobs:

- name: Install dependencies
run: |
pip install ${{ matrix.pip-extra }} ".[sqlite,excel,milvus,gcsfs,s3fs,redis,qdrant,gcp,elastic]" "pytest<8" "pytest_cases" "elasticsearch<9"
pip install ${{ matrix.pip-extra }} ".[sqlite,excel,milvus,gcsfs,s3fs,redis,qdrant,gcp,elastic,pyarrow]" "pytest<8" "pytest_cases" "elasticsearch<9"

- name: Test with pytest
run: |
9 changes: 9 additions & 0 deletions .github/workflows/test_examples.yaml
@@ -2,6 +2,15 @@ name: Test examples

on:
push:
branches:
- "master"
paths:
- ".github/workflows/test.yaml"
- "datapipe/**"
- "tests/**"
- "pyproject.toml"

pull_request:
branches:
- "**"
paths:
19 changes: 18 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,23 @@
# WIP
# WIP: 0.14.6

* Allow `DataTable` objects directly in `ComputeStep.input_dts` parameter
(auto-wraps in `ComputeInput`)
* Add backward compatibility for DataTable API in BatchTransformStep
* Add optional `force_vectors_to_ram` parameter to `QdrantStore`
* Moved checksum calculation from `MetaTable` to `TableStore`
* Added PandasParquetFile adapter for TableStoreFiledir to store pandas
DataFrames in parquet
* Added hash_row for BytesFile and PILFile
* Added use_adapter_hash parameter in TableStoreFiledir for backward
compatibility

# 0.14.5

* Fix for getting existing idx of empty meta-table
* Add `keep_existing=True` to `TransformMetaTable` to prevent table metadata
conflicts
* Enhance `RedisStore` with multi-node cluster support and password
authentication

# 0.14.4

67 changes: 67 additions & 0 deletions datapipe/cli.py
@@ -498,6 +498,73 @@ def migrate_transform_tables(ctx: click.Context, labels: str, name: str) -> None
return migrations_v013.migrate_transform_tables(app, batch_transforms_steps)


@cli.command()
@click.option("--step", type=click.STRING, help="Step name to initialize offsets for (optional)")
@click.pass_context
def init_offsets(ctx: click.Context, step: Optional[str]) -> None:
"""
Initialize the offset table from existing TransformMetaTable data.

The command scans already-processed data and sets initial offset values so
that migration to offset-based query optimization (the v2 method) is smooth.

If --step is given, only that step is initialized. Otherwise all
BatchTransformStep instances in the pipeline are initialized.
"""
from datapipe.meta.sql_meta import initialize_offsets_from_transform_meta

app: DatapipeApp = ctx.obj["app"]

# Collect all BatchTransformStep instances
transform_steps = []
for compute_step in app.steps:
if isinstance(compute_step, BaseBatchTransformStep):
if step is None or compute_step.get_name() == step:
transform_steps.append(compute_step)

if not transform_steps:
if step:
rprint(f"[red]Step '{step}' not found or is not a BatchTransformStep[/red]")
else:
rprint("[yellow]No BatchTransformStep instances found in pipeline[/yellow]")
return

rprint(f"[cyan]Found {len(transform_steps)} transform step(s) to initialize[/cyan]")

# Initialize offsets for each step
results = {}
for transform_step in transform_steps:
step_name = transform_step.get_name()
rprint(f"\n[cyan]Initializing offsets for: {step_name}[/cyan]")

try:
offsets = initialize_offsets_from_transform_meta(app.ds, transform_step)

if offsets:
rprint(f"[green]✓ Initialized {len(offsets)} offset(s):[/green]")
for input_name, offset_value in offsets.items():
rprint(f" - {input_name}: {offset_value}")
results[step_name] = offsets
else:
rprint("[yellow]No offsets initialized (no processed data found)[/yellow]")
results[step_name] = {}

except Exception as e:
rprint(f"[red]✗ Failed to initialize: {e}[/red]")
results[step_name] = None  # None marks failure; {} is reserved for "no data"

# Summary
rprint("\n[cyan]═══ Summary ═══[/cyan]")
success_count = sum(1 for v in results.values() if v is not None and len(v) > 0)
empty_count = sum(1 for v in results.values() if v is not None and len(v) == 0)
failed_count = sum(1 for v in results.values() if v is None)

rprint(f"[green]Successful: {success_count}[/green]")
rprint(f"[yellow]Empty (no data): {empty_count}[/yellow]")
if failed_count > 0:
rprint(f"[red]Failed: {failed_count}[/red]")


try:
entry_points = metadata.entry_points(group="datapipe.cli") # type: ignore
except TypeError:
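The summary above distinguishes three outcomes per step by the value recorded in `results`: a non-empty dict (offsets initialized), an empty dict (no processed data), and `None` (failed). A standalone sketch of that counting convention (names are illustrative, not the datapipe API); note the failed bucket only ever fires if the `except` branch records `None` rather than an empty dict:

```python
from typing import Dict, Optional

# Convention: None = failed, {} = no data found, non-empty dict = offsets set.
def summarize(results: Dict[str, Optional[Dict[str, float]]]) -> Dict[str, int]:
    return {
        "successful": sum(1 for v in results.values() if v is not None and len(v) > 0),
        "empty": sum(1 for v in results.values() if v is not None and len(v) == 0),
        "failed": sum(1 for v in results.values() if v is None),
    }

counts = summarize(
    {
        "step_a": {"input_1": 1700000000.0},  # offsets initialized
        "step_b": {},  # no processed data found
        "step_c": None,  # initialization raised
    }
)
print(counts)  # {'successful': 1, 'empty': 1, 'failed': 1}
```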
13 changes: 10 additions & 3 deletions datapipe/compute.py
@@ -2,7 +2,7 @@
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Iterable, List, Literal, Optional, Sequence, Tuple
from typing import Dict, Iterable, List, Literal, Optional, Sequence, Tuple, Union

from opentelemetry import trace

@@ -85,6 +85,9 @@ class StepStatus:
class ComputeInput:
dt: DataTable
join_type: Literal["inner", "full"] = "full"
# Filtered join optimization: mapping from idx columns to dt columns
# Example: {"user_id": "id"} means filter dt by dt.id IN (idx.user_id)
join_keys: Optional[Dict[str, str]] = None
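The `join_keys` mapping above drives the filtered-join optimization referenced throughout the commit history. A hedged pandas sketch of the semantics (the real implementation emits SQL; the helper name here is illustrative):

```python
import pandas as pd

def apply_join_keys(idx: pd.DataFrame, dt: pd.DataFrame, join_keys: dict) -> pd.DataFrame:
    """Keep only dt rows whose dt_col value appears in idx's idx_col column."""
    out = dt
    for idx_col, dt_col in join_keys.items():
        # Drop NaN before the membership test, mirroring the NaN filtering
        # added for SQL IN clauses in the commit history.
        values = idx[idx_col].dropna().unique()
        out = out[out[dt_col].isin(values)]
    return out

idx = pd.DataFrame({"user_id": [1, 2]})
dt = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
filtered = apply_join_keys(idx, dt, {"user_id": "id"})
print(filtered["name"].tolist())  # ['a', 'b']
```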


class ComputeStep:
@@ -106,13 +109,17 @@ class ComputeStep:
def __init__(
self,
name: str,
input_dts: List[ComputeInput],
input_dts: Sequence[Union[ComputeInput, DataTable]],
output_dts: List[DataTable],
labels: Optional[Labels] = None,
executor_config: Optional[ExecutorConfig] = None,
) -> None:
self._name = name
self.input_dts = input_dts
# Normalize input_dts: automatically wrap bare DataTable objects in ComputeInput
self.input_dts = [
inp if isinstance(inp, ComputeInput) else ComputeInput(dt=inp, join_type="full")
for inp in input_dts
]
self.output_dts = output_dts
self._labels = labels
self.executor_config = executor_config
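The normalization in `__init__` above lets callers pass either bare `DataTable` objects or explicit `ComputeInput` wrappers. The idiom in isolation, with stand-in types (not the real datapipe classes):

```python
from dataclasses import dataclass
from typing import List, Sequence, Union

@dataclass
class Table:  # stand-in for DataTable
    name: str

@dataclass
class Input:  # stand-in for ComputeInput
    dt: Table
    join_type: str = "full"

def normalize(inputs: Sequence[Union[Input, Table]]) -> List[Input]:
    # Canonicalize: wrap bare tables, pass explicit wrappers through unchanged.
    return [inp if isinstance(inp, Input) else Input(dt=inp) for inp in inputs]

normalized = normalize([Table("users"), Input(Table("orders"), join_type="inner")])
print([(i.dt.name, i.join_type) for i in normalized])
# [('users', 'full'), ('orders', 'inner')]
```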
18 changes: 13 additions & 5 deletions datapipe/datatable.py
@@ -5,11 +5,11 @@
from opentelemetry import trace

from datapipe.event_logger import EventLogger
from datapipe.meta.sql_meta import MetaTable
from datapipe.meta.sql_meta import MetaTable, TransformInputOffsetTable
from datapipe.run_config import RunConfig
from datapipe.store.database import DBConn
from datapipe.store.table_store import TableStore
from datapipe.types import DataDF, IndexDF, MetadataDF, data_to_index, index_difference
from datapipe.types import DataDF, IndexDF, MetadataDF, data_to_index, index_difference, index_to_data

if TYPE_CHECKING:
try:
@@ -87,13 +87,18 @@ def store_chunk(
if not data_df.empty:
logger.debug(f"Inserting chunk {len(data_df.index)} rows into {self.name}")

hash_df = self.table_store.hash_rows(data_df)

with tracer.start_as_current_span("get_changes_for_store_chunk"):
(
new_df,
changed_df,
new_index_df,
changed_index_df,
new_meta_df,
changed_meta_df,
) = self.meta_table.get_changes_for_store_chunk(data_df, now)
) = self.meta_table.get_changes_for_store_chunk(hash_df, now)

new_df = index_to_data(data_df, new_index_df)
changed_df = index_to_data(data_df, changed_index_df)

# TODO implement transaction mechanism
with tracer.start_as_current_span("store data"):
@@ -165,6 +170,9 @@ def __init__(

self.create_meta_table = create_meta_table

# Create the offset table (reuse the same create_meta_table flag)
self.offset_table = TransformInputOffsetTable(meta_dbconn, create_table=create_meta_table)

def create_table(self, name: str, table_store: TableStore) -> DataTable:
assert name not in self.tables

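The reworked `store_chunk` above hashes incoming rows in the table store, asks the meta table only for the *indices* of new and changed rows, and then slices the original data by those indices. A standalone pandas sketch of that flow (`hash_rows` and `index_to_data` appear in the diff, but the bodies here are simplified illustrations):

```python
import hashlib

import pandas as pd

def hash_rows(df: pd.DataFrame, keys: list) -> pd.DataFrame:
    """Return key columns plus a content hash of the non-key columns."""
    payload = df.drop(columns=keys)
    hashes = payload.apply(lambda r: hashlib.md5(repr(tuple(r)).encode()).hexdigest(), axis=1)
    return df[keys].assign(hash=hashes)

def index_to_data(df: pd.DataFrame, idx: pd.DataFrame) -> pd.DataFrame:
    """Slice df down to the rows identified by idx (an inner join on key columns)."""
    return df.merge(idx, on=list(idx.columns))

data = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "B", "c"]})
hash_df = hash_rows(data, ["id"])

# Pretend the meta table already knows id=1 (same content) and id=2 (stale content).
existing = hash_rows(pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}), ["id"])

merged = hash_df.merge(existing, on="id", how="left", suffixes=("", "_old"))
new_index = merged[merged["hash_old"].isna()][["id"]]
changed_index = merged[merged["hash_old"].notna() & (merged["hash"] != merged["hash_old"])][["id"]]

new_df = index_to_data(data, new_index)          # only id=3 is unseen
changed_df = index_to_data(data, changed_index)  # id=2 changed ("b" -> "B")
print(new_df["id"].tolist(), changed_df["id"].tolist())  # [3] [2]
```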