Step 1: Import MP4 files into Lance, storing the video bytes in a blob-encoded column
import lance
import pyarrow as pa
import pandas as pd
from pathlib import Path
# Reproduction script: load every *.mp4 file from a directory and append the
# raw bytes to a Lance dataset.
#
# Arrow schema for the dataset. The "lance-encoding:blob" field metadata on
# video_data asks Lance to store that column with its blob encoding.
schema = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("video_name", pa.string()),
    pa.field(
        "video_data",
        pa.large_binary(),
        metadata={"lance-encoding:blob": "true"},
    ),
])

mp4_files_dir = "./HDFS"
# Defined up front so it is always available, even when nothing gets written.
dataset_uri = "./example_video_dataset1.lance"

mp4_dir = Path(mp4_files_dir)
# glob() yields entries in arbitrary order; sort so the ids assigned below are
# reproducible from run to run.
mp4_files = sorted(mp4_dir.glob("*.mp4"))
if not mp4_files:
    print(f"在目录 {mp4_files_dir} 中未找到 MP4 文件")
else:
    print(f"找到 {len(mp4_files)} 个 MP4 文件")

# Column buffers, filled in lockstep: one entry per successfully read file.
ids = []
video_names = []
file_sizes = []  # collected for inspection only; not part of the schema
video_data_list = []

for i, mp4_file in enumerate(mp4_files):
    try:
        video_bytes = mp4_file.read_bytes()
        file_size = mp4_file.stat().st_size
    except (OSError, MemoryError) as e:
        # Best effort: report the unreadable file and move on to the next one.
        print(f"读取文件 {mp4_file} 时出错: {e}")
        continue
    # ids are 1-based positions in the sorted file list, so a skipped file
    # leaves a gap instead of shifting the ids of the files after it.
    ids.append(i + 1)
    video_names.append(mp4_file.name)
    file_sizes.append(file_size)
    video_data_list.append(video_bytes)
    print(f"已加载: {mp4_file.name} ({len(video_bytes)} 字节)")

if not ids:
    # Nothing to write. Stop here rather than indexing into an empty list
    # below and appending an empty table to the dataset.
    print("没有成功读取任何 MP4 文件")
else:
    table = pa.table({
        "id": ids,
        "video_name": video_names,
        "video_data": video_data_list,
    }, schema=schema)
    print(f"type {type(table)}, {type(video_data_list[0])}, schema {schema}")
    lance.write_dataset(table, dataset_uri, mode="append")
Step 2: Compact files
OSError: LanceError(Schema): Attempt to project incompatible fields: Field(id=2, name=video_data, type=large_binary) and
Field(id=2, name=video_data, type=struct, children=[Field(id=3, name=position, type=uint64), Field(id=4, name=size,
type=uint64), ]), [/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs:687:31]
(http://proxy.ml.bilibili.co/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs#line=686)
Step 3: An exception is thrown
OSError Traceback (most recent call last)
Cell In[7], line 6
4 dataset = lance.dataset(dataset_uri, )
5 print(dataset.schema)
----> 6 dataset.optimize.compact_files(target_rows_per_fragment=1024 * 1024)
7 dataset.to_table().to_pandas()
File [/usr/local/lib/python3.10/dist-packages/lance/dataset.py:4497](http://proxy.ml.bilibili.co/usr/local/lib/python3.10/dist-packages/lance/dataset.py#line=4496), in DatasetOptimizer.compact_files(self, target_rows_per_fragment, max_rows_per_group, max_bytes_per_file, materialize_deletions, materialize_deletions_threshold, num_threads, batch_size)
4432 """Compacts small files in the dataset, reducing total number of files.
4433
4434 This does a few things:
(...)
4486 lance.optimize.Compaction
4487 """
4488 opts = dict(
4489 target_rows_per_fragment=target_rows_per_fragment,
4490 max_rows_per_group=max_rows_per_group,
(...)
4495 batch_size=batch_size,
4496 )
-> 4497 return Compaction.execute(self._dataset, opts)
OSError: LanceError(Schema): Attempt to project incompatible fields: Field(id=2, name=video_data, type=large_binary) and Field(id=2, name=video_data, type=struct, children=[Field(id=3, name=position, type=uint64), Field(id=4, name=size, type=uint64), ]), [/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs:687:31](http://proxy.ml.bilibili.co/home/runner/work/lance/lance/rust/lance-core/src/datatypes/field.rs#line=686)
Step 1: Import MP4 files into Lance, storing the video bytes in a blob-encoded column
Step 2: Compact files
Step 3: An exception is thrown