From bad8588d8cbd21952aeb131371fd21e7e0c1179f Mon Sep 17 00:00:00 2001 From: Iuri de Silvio Date: Sun, 5 Oct 2025 17:57:25 +0200 Subject: [PATCH 1/9] Be more flexible with filename matches --- roboflow/util/folderparser.py | 22 ++++++++++++-- tests/util/test_folderparser.py | 51 +++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py index 7cc336bf..812e856b 100644 --- a/roboflow/util/folderparser.py +++ b/roboflow/util/folderparser.py @@ -140,7 +140,17 @@ def _build_image_and_annotation_maps(annotationFiles): ) if parsedType == "coco": for imageRef in parsed["images"]: - imgRefMap[f"{filename}/{imageRef['file_name']}"] = imageRef + # Normalize and index by multiple forms to improve matching robustness + file_name = _patch_sep(imageRef["file_name"]).lstrip("/") + basename = os.path.basename(file_name) + stem = os.path.splitext(basename)[0] + + # Prefer full relative path, but also allow basename and stem + imgRefMap.update({ + f"{filename}/{file_name}": imageRef, + f"{filename}/{basename}": imageRef, + f"{filename}/{stem}": imageRef, + }) for annotation in parsed["annotations"]: annotationMap[f"{dirname}/{annotation['image_id']}"].append(annotation) return imgRefMap, annotationMap @@ -149,7 +159,15 @@ def _build_image_and_annotation_maps(annotationFiles): def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotationMap): parsed = annotation["parsed"] if format == "coco": - imgReference = imgRefMap.get(f"{annotation['file']}/{image['name']}") + rel_path = image["file"].lstrip("/") + imgReference = ( + # Try matching by full relative path first + imgRefMap.get(f"{annotation['file']}/{rel_path}") + # Fallback: basename with extension + or imgRefMap.get(f"{annotation['file']}/{image['name']}") + # Fallback: stem (no extension) + or imgRefMap.get(f"{annotation['file']}/{image['key']}") + ) if imgReference: # workaround to make Annotations.js correctly identify this as coco in the backend fake_annotation = { diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py index aec5ea44..8e8b041b 100644 --- a/tests/util/test_folderparser.py +++ b/tests/util/test_folderparser.py @@ -1,4 +1,6 @@ import json +import os +import tempfile import unittest from os.path import abspath, dirname @@ -95,6 +97,55 @@ def test_parse_multilabel_classification_csv(self): self.assertEqual(img1["annotationfile"]["type"], "classification_multilabel") self.assertEqual(set(img1["annotationfile"]["labels"]), {"Blackheads"}) + def test_coco_with_subdir_file_name_should_match_annotations(self): + # COCO file_name includes a subdirectory, but the actual image is at dataset root. + with tempfile.TemporaryDirectory() as tmpdir: + # Create nested image path: /2/100002/img.jpeg + image_name = "metaclip_2_100002_02f2f7c6e15f09b401575ae6.jpeg" + image_relpath = os.path.join("2", "100002", image_name) + image_path = os.path.join(tmpdir, image_name) + # Create an empty image file (content not used by parser) + open(image_path, "wb").close() + + # Create COCO annotation JSON at dataset root, referencing the image with subdir in file_name + coco = { + "info": {}, + "licenses": [], + "categories": [{"id": 1, "name": "thing"}], + "images": [ + { + "id": 10000000, + "file_name": image_relpath.replace(os.sep, "/"), + "width": 800, + "height": 533, + } + ], + "annotations": [ + { + "id": 1, + "image_id": 10000000, + "category_id": 1, + "bbox": [10, 10, 100, 50], + "area": 5000, + "segmentation": [], + "iscrowd": 0, + } + ], + } + coco_path = os.path.join(tmpdir, "_annotations.coco.json") + with open(coco_path, "w") as f: + json.dump(coco, f) + + parsed = folderparser.parsefolder(tmpdir) + # Image entries store file with a leading slash relative to root + expected_file_key = f"/{image_name}" + img_entries = [i for i in parsed["images"] if i["file"] == expected_file_key] + self.assertTrue(len(img_entries) == 1) + img_entry = img_entries[0] + + # Expect annotationfile to be populated, but this currently fails due to basename-only matching + self.assertIsNotNone(img_entry.get("annotationfile")) + def _assertJsonMatchesFile(actual, filename): with open(filename) as file: From dcd5191d45fd9ecd6b316b60ab70aab65ff375c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Oct 2025 16:11:36 +0000 Subject: [PATCH 2/9] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboflow/util/folderparser.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py index 812e856b..dc6cae54 100644 --- a/roboflow/util/folderparser.py +++ b/roboflow/util/folderparser.py @@ -146,11 +146,13 @@ def _build_image_and_annotation_maps(annotationFiles): stem = os.path.splitext(basename)[0] # Prefer full relative path, but also allow basename and stem - imgRefMap.update({ - f"{filename}/{file_name}": imageRef, - f"{filename}/{basename}": imageRef, - f"{filename}/{stem}": imageRef, - }) + imgRefMap.update( + { + f"{filename}/{file_name}": imageRef, + f"{filename}/{basename}": imageRef, + f"{filename}/{stem}": imageRef, + } + ) for annotation in parsed["annotations"]: annotationMap[f"{dirname}/{annotation['image_id']}"].append(annotation) return imgRefMap, annotationMap From c5260bec59a619e4ec7d498fcec4cfb872a00ed4 Mon Sep 17 00:00:00 2001 From: Iuri de Silvio Date: Wed, 8 Oct 2025 07:58:33 -0500 Subject: [PATCH 3/9] =?UTF-8?q?fixup!=20fix(pre=5Fcommit):=20=F0=9F=8E=A8?= =?UTF-8?q?=20auto=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboflow/util/folderparser.py | 101 ++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 22 deletions(-) diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py index dc6cae54..9509e6c0 100644 --- a/roboflow/util/folderparser.py +++ b/roboflow/util/folderparser.py @@ -111,21 +111,85 @@ def _map_annotations_to_images_1to1(images, annotations): def _map_annotations_to_images_1tomany(images, annotationFiles): - annotationsByDirname = _list_map(annotationFiles, "dirname") imgRefMap, annotationMap = _build_image_and_annotation_maps(annotationFiles) - + + # Build a map from image file paths to annotation files that reference them + # This avoids checking every annotation file for every image (O(n*m) -> O(n+m)) + image_path_to_annotation_files = _build_image_to_annotationfile_index(annotationFiles) + for image in tqdm(images): - dirname = image["dirname"] - annotationsInSameDir = annotationsByDirname.get(dirname, []) - if annotationsInSameDir: - for annotationFile in annotationsInSameDir: - format = annotationFile["parsedType"] - filtered_annotations = _filterIndividualAnnotations( - image, annotationFile, format, imgRefMap, annotationMap - ) - if filtered_annotations: - image["annotationfile"] = filtered_annotations - break + # Get candidate annotation files for this image + rel_path = image["file"].lstrip("/") + candidate_annotations = ( + image_path_to_annotation_files.get(rel_path, []) + or image_path_to_annotation_files.get(image["name"], []) + or image_path_to_annotation_files.get(image["key"], []) + or annotationFiles # Fallback to all files for non-COCO formats + ) + + for annotationFile in candidate_annotations: + format = annotationFile["parsedType"] + filtered_annotations = _filterIndividualAnnotations( + image, annotationFile, format, imgRefMap, annotationMap + ) + if filtered_annotations: + image["annotationfile"] = filtered_annotations + break + + +def _build_image_to_annotationfile_index(annotationFiles): + """Create an index mapping possible image path keys to annotation files that reference them. + + Keys include full relative path, basename, and stem to improve robustness across + different dataset layouts. Supports coco, createml, csv, multilabel_csv, jsonl. + """ + index = defaultdict(list) + for annotationFile in annotationFiles: + parsedType = annotationFile.get("parsedType") + parsed = annotationFile.get("parsed") + if not parsedType or parsed is None: + continue + + if parsedType == "coco": + for imageRef in parsed.get("images", []): + file_name = _patch_sep(imageRef.get("file_name", "")).lstrip("/") + if not file_name: + continue + basename = os.path.basename(file_name) + stem = os.path.splitext(basename)[0] + index[file_name].append(annotationFile) + index[basename].append(annotationFile) + index[stem].append(annotationFile) + + elif parsedType == "createml": + for entry in parsed: + image_name = entry.get("image") + if not image_name: + continue + index[image_name].append(annotationFile) + + elif parsedType == "csv": + for ld in parsed.get("lines", []): + image_name = ld.get("file_name") + if not image_name: + continue + index[image_name].append(annotationFile) + + elif parsedType == "multilabel_csv": + for row in parsed.get("rows", []): + image_name = row.get("file_name") + if not image_name: + continue + index[image_name].append(annotationFile) + + elif parsedType == "jsonl": + for entry in parsed: + image_name = entry.get("image") + if not image_name: + continue + index[image_name].append(annotationFile) + + return index def _build_image_and_annotation_maps(annotationFiles): @@ -154,7 +218,7 @@ def _build_image_and_annotation_maps(annotationFiles): } ) for annotation in parsed["annotations"]: - annotationMap[f"{dirname}/{annotation['image_id']}"].append(annotation) + annotationMap[f"{filename}/{annotation['image_id']}"].append(annotation) return imgRefMap, annotationMap @@ -181,7 +245,7 @@ def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotatio "iscrowd": 0, } _annotation = {"name": "annotation.coco.json"} - annotations_for_image = annotationMap.get(f"{image['dirname']}/{imgReference['id']}", []) + annotations_for_image = annotationMap.get(f"{annotation['file']}/{imgReference['id']}", []) _annotation["rawText"] = json.dumps( { "info": parsed["info"], @@ -334,13 +398,6 @@ def _decide_split(images): i["split"] = "train" -def _list_map(my_list, key): - d = {} - for i in my_list: - d.setdefault(i[key], []).append(i) - return d - - def _infer_classification_labels_from_folders(images): for image in images: if image.get("annotationfile"): From d576129035830f34cddc2a35869e1eb0dcfe95a0 Mon Sep 17 00:00:00 2001 From: Iuri de Silvio Date: Wed, 8 Oct 2025 07:58:45 -0500 Subject: [PATCH 4/9] =?UTF-8?q?fixup!=20fixup!=20fix(pre=5Fcommit):=20?= =?UTF-8?q?=F0=9F=8E=A8=20auto=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/util/test_folderparser.py | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py index 8e8b041b..467a55db 100644 --- a/tests/util/test_folderparser.py +++ b/tests/util/test_folderparser.py @@ -146,6 +146,65 @@ def test_coco_with_subdir_file_name_should_match_annotations(self): # Expect annotationfile to be populated, but this currently fails due to basename-only matching self.assertIsNotNone(img_entry.get("annotationfile")) + def test_coco_root_annotation_matches_images_in_subdirs(self): + """Test that COCO annotation at root can match images in subdirectories. + + This tests the fix for the bug where annotation file dirname (/) didn't match + image dirname (/1/100001), causing annotations to not be found. + """ + with tempfile.TemporaryDirectory() as tmpdir: + # Create image in subdirectory + subdir = os.path.join(tmpdir, "1", "100001") + os.makedirs(subdir, exist_ok=True) + image_name = "image.jpeg" + image_path = os.path.join(subdir, image_name) + open(image_path, "wb").close() + + # Create COCO annotation at root referencing image with subdirectory path + coco = { + "info": {}, + "licenses": [], + "categories": [{"id": 1, "name": "object"}], + "images": [ + { + "id": 10000000, + "file_name": "1/100001/image.jpeg", + "width": 800, + "height": 600, + } + ], + "annotations": [ + { + "id": 1, + "image_id": 10000000, + "category_id": 1, + "bbox": [10, 20, 100, 200], + "area": 20000, + "segmentation": [[10, 20, 110, 20, 110, 220, 10, 220]], + "iscrowd": 0, + } + ], + } + coco_path = os.path.join(tmpdir, "_annotations.coco.json") + with open(coco_path, "w") as f: + json.dump(coco, f) + + parsed = folderparser.parsefolder(tmpdir) + + # Find the image + img_entries = [i for i in parsed["images"] if image_name in i["file"]] + self.assertEqual(len(img_entries), 1, "Should find exactly one image") + img_entry = img_entries[0] + + # Verify annotation was matched + self.assertIsNotNone(img_entry.get("annotationfile"), "Image should have annotation") + + # Verify annotation content + ann_data = json.loads(img_entry["annotationfile"]["rawText"]) + self.assertEqual(len(ann_data["images"]), 1, "Should have one image reference") + self.assertEqual(len(ann_data["annotations"]), 1, "Should have one annotation") + self.assertEqual(ann_data["annotations"][0]["bbox"], [10, 20, 100, 200]) + def _assertJsonMatchesFile(actual, filename): with open(filename) as file: From 80fe28a1dbe85bce69816dee095896c6c2b92f7c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:58:48 +0000 Subject: [PATCH 5/9] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboflow/util/folderparser.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py index 9509e6c0..b46b1372 100644 --- a/roboflow/util/folderparser.py +++ b/roboflow/util/folderparser.py @@ -112,11 +112,11 @@ def _map_annotations_to_images_1to1(images, annotations): def _map_annotations_to_images_1tomany(images, annotationFiles): imgRefMap, annotationMap = _build_image_and_annotation_maps(annotationFiles) - + # Build a map from image file paths to annotation files that reference them # This avoids checking every annotation file for every image (O(n*m) -> O(n+m)) image_path_to_annotation_files = _build_image_to_annotationfile_index(annotationFiles) - + for image in tqdm(images): # Get candidate annotation files for this image rel_path = image["file"].lstrip("/") @@ -126,12 +126,10 @@ def _map_annotations_to_images_1tomany(images, annotationFiles): or image_path_to_annotation_files.get(image["key"], []) or annotationFiles # Fallback to all files for non-COCO formats ) - + for annotationFile in candidate_annotations: format = annotationFile["parsedType"] - filtered_annotations = _filterIndividualAnnotations( - image, annotationFile, format, imgRefMap, annotationMap - ) + filtered_annotations = _filterIndividualAnnotations(image, annotationFile, format, imgRefMap, annotationMap) if filtered_annotations: image["annotationfile"] = filtered_annotations break From fc5758e1e0cde3db2b7f4db9859b77b98779e43c Mon Sep 17 00:00:00 2001 From: Iuri de Silvio Date: Wed, 8 Oct 2025 08:05:58 -0500 Subject: [PATCH 6/9] =?UTF-8?q?fixup!=20fixup!=20fixup!=20fix(pre=5Fcommit?= =?UTF-8?q?):=20=F0=9F=8E=A8=20auto=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboflow/util/folderparser.py | 5 +---- tests/util/test_folderparser.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py index b46b1372..4d12418f 100644 --- a/roboflow/util/folderparser.py +++ b/roboflow/util/folderparser.py @@ -111,11 +111,8 @@ def _map_annotations_to_images_1to1(images, annotations): def _map_annotations_to_images_1tomany(images, annotationFiles): - imgRefMap, annotationMap = _build_image_and_annotation_maps(annotationFiles) - - # Build a map from image file paths to annotation files that reference them - # This avoids checking every annotation file for every image (O(n*m) -> O(n+m)) image_path_to_annotation_files = _build_image_to_annotationfile_index(annotationFiles) + imgRefMap, annotationMap = _build_image_and_annotation_maps(annotationFiles) for image in tqdm(images): # Get candidate annotation files for this image diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py index 467a55db..675e27a6 100644 --- a/tests/util/test_folderparser.py +++ b/tests/util/test_folderparser.py @@ -101,7 +101,7 @@ def test_coco_with_subdir_file_name_should_match_annotations(self): # COCO file_name includes a subdirectory, but the actual image is at dataset root. with tempfile.TemporaryDirectory() as tmpdir: # Create nested image path: /2/100002/img.jpeg - image_name = "metaclip_2_100002_02f2f7c6e15f09b401575ae6.jpeg" + image_name = "example_2_100002_02f2f7c6e15f09b401575ae6.jpeg" image_relpath = os.path.join("2", "100002", image_name) image_path = os.path.join(tmpdir, image_name) # Create an empty image file (content not used by parser) From f3b552ff3eb01feb4759aa645e45c56d4d876b44 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:08:01 +0000 Subject: [PATCH 7/9] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/util/test_folderparser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py index 675e27a6..4f9ddb5b 100644 --- a/tests/util/test_folderparser.py +++ b/tests/util/test_folderparser.py @@ -148,7 +148,7 @@ def test_coco_with_subdir_file_name_should_match_annotations(self): def test_coco_root_annotation_matches_images_in_subdirs(self): """Test that COCO annotation at root can match images in subdirectories. - + This tests the fix for the bug where annotation file dirname (/) didn't match image dirname (/1/100001), causing annotations to not be found. """ @@ -190,15 +190,15 @@ def test_coco_root_annotation_matches_images_in_subdirs(self): json.dump(coco, f) parsed = folderparser.parsefolder(tmpdir) - + # Find the image img_entries = [i for i in parsed["images"] if image_name in i["file"]] self.assertEqual(len(img_entries), 1, "Should find exactly one image") img_entry = img_entries[0] - + # Verify annotation was matched self.assertIsNotNone(img_entry.get("annotationfile"), "Image should have annotation") - + # Verify annotation content ann_data = json.loads(img_entry["annotationfile"]["rawText"]) self.assertEqual(len(ann_data["images"]), 1, "Should have one image reference") From 9eefc11fd49a120d6c079ff980f14383cd089ec8 Mon Sep 17 00:00:00 2001 From: Iuri de Silvio Date: Wed, 8 Oct 2025 08:09:24 -0500 Subject: [PATCH 8/9] =?UTF-8?q?fixup!=20fix(pre=5Fcommit):=20=F0=9F=8E=A8?= =?UTF-8?q?=20auto=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboflow/util/folderparser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py index 4d12418f..047cdaee 100644 --- a/roboflow/util/folderparser.py +++ b/roboflow/util/folderparser.py @@ -191,9 +191,8 @@ def _build_image_and_annotation_maps(annotationFiles): imgRefMap = {} annotationMap = defaultdict(list) for annFile in annotationFiles: - filename, dirname, parsed, parsedType = ( + filename, parsed, parsedType = ( annFile["file"], - annFile["dirname"], annFile["parsed"], annFile["parsedType"], ) From 76ae33fa4478facbd9ace792739da2061226b2a6 Mon Sep 17 00:00:00 2001 From: Iuri de Silvio Date: Wed, 8 Oct 2025 08:10:06 -0500 Subject: [PATCH 9/9] =?UTF-8?q?fixup!=20fixup!=20fix(pre=5Fcommit):=20?= =?UTF-8?q?=F0=9F=8E=A8=20auto=20format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboflow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roboflow/__init__.py b/roboflow/__init__.py index a74398da..bde69972 100644 --- a/roboflow/__init__.py +++ b/roboflow/__init__.py @@ -15,7 +15,7 @@ from roboflow.models import CLIPModel, GazeModel # noqa: F401 from roboflow.util.general import write_line -__version__ = "1.2.10" +__version__ = "1.2.11" def check_key(api_key, model, notebook, num_retries=0):