Skip to content

Compact Operation Fails with Schema Incompatibility: Blob Field Incorrectly Interpreted as Struct Type #5115

@zszheng

Description

@zszheng

Step 1: Import MP4 files into Lance, storing the video bytes in a blob-typed column

import lance
import pyarrow as pa
import pandas as pd
from pathlib import Path

# Reproduction script: load every MP4 under ``mp4_files_dir`` into memory and
# append the files as rows to a Lance dataset.

# Table layout. The ``lance-encoding:blob`` metadata on ``video_data`` marks
# that column as a Lance blob column.
schema = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("video_name", pa.string()),
    pa.field(
        "video_data",
        pa.large_binary(),
        metadata={"lance-encoding:blob": "true"},
    ),
])

mp4_files_dir = "./HDFS"
mp4_dir = Path(mp4_files_dir)
# Sort so ids are assigned in a reproducible order; glob() yields entries in
# a filesystem-dependent order.
mp4_files = sorted(mp4_dir.glob("*.mp4"))

if not mp4_files:
    # Stop here: continuing with no input would only fail later when the
    # first element of an empty list is inspected.
    raise SystemExit(f"在目录 {mp4_files_dir} 中未找到 MP4 文件")

print(f"找到 {len(mp4_files)} 个 MP4 文件")

ids = []
video_names = []
file_sizes = []  # collected for inspection only; not part of ``schema``
video_data_list = []

# Ids are 1-based positions in ``mp4_files``; a file that cannot be read is
# skipped, which leaves a gap in the id sequence.
for file_id, mp4_file in enumerate(mp4_files, start=1):
    try:
        video_bytes = mp4_file.read_bytes()
        size_on_disk = mp4_file.stat().st_size
    except OSError as e:
        # Best effort: report the unreadable file and move on to the next.
        print(f"读取文件 {mp4_file} 时出错: {e}")
        continue

    ids.append(file_id)
    video_names.append(mp4_file.name)
    file_sizes.append(size_on_disk)
    video_data_list.append(video_bytes)

    print(f"已加载: {mp4_file.name} ({len(video_bytes)} 字节)")

if not ids:
    # Every read failed, so there is nothing to write.
    raise SystemExit("没有成功读取任何 MP4 文件")

table = pa.table({
    "id": ids,
    "video_name": video_names,
    "video_data": video_data_list
}, schema=schema)
print(f"type {type(table)}{type(video_data_list[0])}, schema {schema}")
dataset_uri = "./example_video_dataset1.lance"
lance.write_dataset(table, dataset_uri, mode="append")

Step 2: Compact files (fails with the following error)

OSError: LanceError(Schema): Attempt to project incompatible fields: Field(id=2, name=video_data, type=large_binary) and 
Field(id=2, name=video_data, type=struct, children=[Field(id=3, name=position, type=uint64), Field(id=4, name=size, 
type=uint64), ]), [/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs:687:31]
(http://proxy.ml.bilibili.co/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs#line=686)

Step 3: The exception is thrown (full traceback)

OSError                                   Traceback (most recent call last)
Cell In[7], line 6
      4 dataset = lance.dataset(dataset_uri, )
      5 print(dataset.schema)
----> 6 dataset.optimize.compact_files(target_rows_per_fragment=1024 * 1024)
      7 dataset.to_table().to_pandas()

File [/usr/local/lib/python3.10/dist-packages/lance/dataset.py:4497](http://proxy.ml.bilibili.co/usr/local/lib/python3.10/dist-packages/lance/dataset.py#line=4496), in DatasetOptimizer.compact_files(self, target_rows_per_fragment, max_rows_per_group, max_bytes_per_file, materialize_deletions, materialize_deletions_threshold, num_threads, batch_size)
   4432 """Compacts small files in the dataset, reducing total number of files.
   4433 
   4434 This does a few things:
   (...)
   4486 lance.optimize.Compaction
   4487 """
   4488 opts = dict(
   4489     target_rows_per_fragment=target_rows_per_fragment,
   4490     max_rows_per_group=max_rows_per_group,
   (...)
   4495     batch_size=batch_size,
   4496 )
-> 4497 return Compaction.execute(self._dataset, opts)

OSError: LanceError(Schema): Attempt to project incompatible fields: Field(id=2, name=video_data, type=large_binary) and Field(id=2, name=video_data, type=struct, children=[Field(id=3, name=position, type=uint64), Field(id=4, name=size, type=uint64), ]), [/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs:687:31](http://proxy.ml.bilibili.co/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs#line=686)

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions