From cb9b5e98efab781b9a0df79a6dd2c9b800f3892b Mon Sep 17 00:00:00 2001
From: m-laprise <52421731+m-laprise@users.noreply.github.com>
Date: Wed, 14 Jan 2026 11:51:02 -0500
Subject: [PATCH] fix(data): preserve dataset path for resharding

data.py reused 'dataset' for both the dataset path string and the loaded
IterableDataset. When resharding, load_dataset(path=dataset) received the
dataset object instead of the path.

Store the original path separately and use it for resharding.
---
 flame/data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flame/data.py b/flame/data.py
index c9a5bc6..8feb866 100644
--- a/flame/data.py
+++ b/flame/data.py
@@ -557,8 +557,9 @@ def build_dataset(
     color = utils.Color
     min_num_shards = dp_degree * num_workers if dp_degree else None
     if len(dataset.split(',')) == 1:
+        dataset_path = dataset
         dataset = load_dataset(
-            path=dataset,
+            path=dataset_path,
             name=dataset_name,
             split=dataset_split,
             data_dir=data_dir,
@@ -585,7 +586,7 @@ def build_dataset(
                 f"{color.reset}"
             )
             dataset = load_dataset(
-                path=dataset,
+                path=dataset_path,
                 name=dataset_name,
                 split=dataset_split,
                 data_dir=data_dir,