From 9b39abcb8cf1e3ec97d28da20b9c8247c25dd050 Mon Sep 17 00:00:00 2001
From: hlky <hlky@hlky.ac>
Date: Mon, 27 Jan 2025 09:42:08 +0000
Subject: [PATCH 1/2] [training] Convert to ImageFolder script

---
 examples/dreambooth/README.md                 | 26 +++++++++++++
 examples/dreambooth/convert_to_imagefolder.py | 37 +++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 examples/dreambooth/convert_to_imagefolder.py

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index f97a4d0cd0f4..eed0575c322d 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -742,3 +742,29 @@ accelerate launch train_dreambooth.py \
 ## Stable Diffusion XL
 
 We support fine-tuning of the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
+
+## Dataset
+
+We support 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can find a dataset on the [Hugging Face Hub](https://huggingface.co/datasets) or use your own.
+
+The quickest way to get started with your custom dataset is 🤗 Datasets' [`ImageFolder`](https://huggingface.co/docs/datasets/image_dataset#imagefolder).
+
+We need to create a `metadata.jsonl` file in the directory that contains our images, with one JSON object per line:
+
+```
+{"file_name": "01.jpg", "prompt": "prompt 01"}
+{"file_name": "02.jpg", "prompt": "prompt 02"}
+```
+
+If we have a directory of image-text pairs that share a file name (e.g. `01.jpg` and `01.txt`), `convert_to_imagefolder.py` can create `metadata.jsonl` for us.
+
+```sh
+python convert_to_imagefolder.py --path my_dataset/
+```
+
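+We then pass `--dataset_name` and `--caption_column` to the training scripts; `--caption_column` must match the column name written to `metadata.jsonl` (`prompt` by default).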
+
+```
+--dataset_name=my_dataset/
+--caption_column=prompt
+```
diff --git a/examples/dreambooth/convert_to_imagefolder.py b/examples/dreambooth/convert_to_imagefolder.py
new file mode 100644
index 000000000000..d6c62bf1c554
--- /dev/null
+++ b/examples/dreambooth/convert_to_imagefolder.py
@@ -0,0 +1,37 @@
+import argparse
+import json
+import pathlib
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--path",
+    type=str,
+    required=True,
+    help="Path to folder with image-text pairs.",
+)
+parser.add_argument(
+    "--caption_column", type=str, default="prompt", help="Name of caption column."
+)
+args = parser.parse_args()
+
+path = pathlib.Path(args.path)
+if not path.exists():
+    raise RuntimeError(f"`--path` '{args.path}' does not exist.")
+
+all_files = list(path.glob("*"))
+captions = list(path.glob("*.txt"))
+images = set(all_files) - set(captions)
+images = {image.stem: image for image in images}
+caption_image = {
+    caption: images.get(caption.stem)
+    for caption in captions
+    if images.get(caption.stem)
+}
+
+metadata = path.joinpath("metadata.jsonl")
+
+with metadata.open("w", encoding="utf-8") as f:
+    for caption, image in caption_image.items():
+        caption_text = caption.read_text(encoding="utf-8")
+        json.dump({"file_name": image.name, args.caption_column: caption_text}, f)
+        f.write("\n")

From 93fba26fafa31cb9cea6be28cdb23407d1b5e24f Mon Sep 17 00:00:00 2001
From: hlky <hlky@hlky.ac>
Date: Mon, 27 Jan 2025 10:02:58 +0000
Subject: [PATCH 2/2] make

---
 examples/dreambooth/convert_to_imagefolder.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/examples/dreambooth/convert_to_imagefolder.py b/examples/dreambooth/convert_to_imagefolder.py
index d6c62bf1c554..333080077428 100644
--- a/examples/dreambooth/convert_to_imagefolder.py
+++ b/examples/dreambooth/convert_to_imagefolder.py
@@ -2,6 +2,7 @@
 import json
 import pathlib
 
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "--path",
@@ -9,9 +10,7 @@
     required=True,
     help="Path to folder with image-text pairs.",
 )
-parser.add_argument(
-    "--caption_column", type=str, default="prompt", help="Name of caption column."
-)
+parser.add_argument("--caption_column", type=str, default="prompt", help="Name of caption column.")
 args = parser.parse_args()
 
 path = pathlib.Path(args.path)
@@ -22,11 +21,7 @@
 captions = list(path.glob("*.txt"))
 images = set(all_files) - set(captions)
 images = {image.stem: image for image in images}
-caption_image = {
-    caption: images.get(caption.stem)
-    for caption in captions
-    if images.get(caption.stem)
-}
+caption_image = {caption: images.get(caption.stem) for caption in captions if images.get(caption.stem)}
 
 metadata = path.joinpath("metadata.jsonl")