From 5e293077aec997957da2b59f5f60bc00b4f74983 Mon Sep 17 00:00:00 2001 From: Ben Greiner Date: Tue, 12 Aug 2025 16:02:16 +0200 Subject: [PATCH 1/3] Add metadata test with zip inside upload package --- tests/integration/test_native_upload.py | 70 +++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/integration/test_native_upload.py b/tests/integration/test_native_upload.py index 479d025..0c23260 100644 --- a/tests/integration/test_native_upload.py +++ b/tests/integration/test_native_upload.py @@ -460,6 +460,76 @@ def test_zipzip_file_upload( assert sorted([file["label"] for file in files]) == sorted(expected_files) + def test_metadata_with_zip_files_in_package(self, credentials): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # Arrange + files = [ + File(filepath="tests/fixtures/archive.zip", + dv_dir="subdir2", + description="This file should not be unzipped", + categories=["Test file"] + ), + File(filepath="tests/fixtures/add_dir_files/somefile.txt", + dv_dir="subdir", + description="A simple text file", + categories=["Test file"] + ), + ] + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=10, + ) + + # Assert + files = retrieve_dataset_files( + dataverse_url=BASE_URL, + persistent_id=pid, + api_token=API_TOKEN, + ) + + assert len(files) == 2, f"Expected 2 files, got {len(files)}" + + expected_files = [ + { + "label": "archive.zip", + "description": "This file should not be unzipped", + "categories": ["Test file"] + }, + { + "label": "somefile.txt", + "description": "A simple text file", + "categories": ["Test file"] + }, + ] + + files_as_expected = sorted( + [ + { + k: (f[k] if k in f else None) + for k in expected_files[0].keys() + } + for f in files + ], + key=lambda x: x["label"] + ) + assert files_as_expected == expected_files, 
( + f"File metadata not as expected: {json.dumps(files, indent=2)}" + ) + + def test_too_many_zip_files( self, credentials, From 226459cfd08d575e740665657107127420f380f0 Mon Sep 17 00:00:00 2001 From: Ben Greiner Date: Tue, 12 Aug 2025 17:33:22 +0200 Subject: [PATCH 2/3] Keep track of files put into zip package and don't skip their metadata even when they are zips --- dvuploader/file.py | 2 ++ dvuploader/nativeupload.py | 4 ++-- dvuploader/packaging.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/dvuploader/file.py b/dvuploader/file.py index 6107ced..2f9e9c2 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -30,6 +30,7 @@ class File(BaseModel): Private Attributes: _size (int): Size of the file in bytes. _unchanged_data (bool): Indicates if the file data has not changed since last upload. + _is_inside_zip (bool): Indicates if the file is packaged inside a zip archive. Methods: extract_file_name(): Extracts filename from filepath and initializes file handler. @@ -59,6 +60,7 @@ class File(BaseModel): _size: int = PrivateAttr(default=0) _unchanged_data: bool = PrivateAttr(default=False) + _is_inside_zip: bool = PrivateAttr(default=False) def extract_file_name(self): """ diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index a085919..40d8c80 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -369,8 +369,8 @@ async def _update_metadata( try: if _tab_extension(dv_path) in file_mapping: file_id = file_mapping[_tab_extension(dv_path)] - elif file.file_name and _is_zip(file.file_name): - # When the file is a zip it will be unpacked and thus + elif file.file_name and _is_zip(file.file_name) and not file._is_inside_zip: + # When the file is a zip package it will be unpacked and thus # the expected file name of the zip will not be in the # dataset, since it has been unpacked. 
continue diff --git a/dvuploader/packaging.py b/dvuploader/packaging.py index c99d4d1..9567fd0 100644 --- a/dvuploader/packaging.py +++ b/dvuploader/packaging.py @@ -98,6 +98,7 @@ def zip_files( data=file.handler.read(), # type: ignore zinfo_or_arcname=_create_arcname(file), ) + file._is_inside_zip = True return path From 7ab1e16b8398352a3f2d49a61f4881184407a9fd Mon Sep 17 00:00:00 2001 From: Ben Greiner Date: Tue, 12 Aug 2025 18:01:19 +0200 Subject: [PATCH 3/3] Enforce metadata update --- dvuploader/file.py | 2 ++ dvuploader/nativeupload.py | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dvuploader/file.py b/dvuploader/file.py index 2f9e9c2..0ed9741 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -30,6 +30,7 @@ class File(BaseModel): Private Attributes: _size (int): Size of the file in bytes. _unchanged_data (bool): Indicates if the file data has not changed since last upload. + _enforce_metadata_update (bool): Indicates if metadata update is enforced. _is_inside_zip (bool): Indicates if the file is packaged inside a zip archive. 
Methods: @@ -60,6 +61,7 @@ class File(BaseModel): _size: int = PrivateAttr(default=0) _unchanged_data: bool = PrivateAttr(default=False) + _enforce_metadata_update: bool = PrivateAttr(default=False) _is_inside_zip: bool = PrivateAttr(default=False) def extract_file_name(self): diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index 40d8c80..4fe9412 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -95,6 +95,10 @@ async def native_upload( files_new_metadata = [file for file in files if file.to_replace and file._unchanged_data] files_replace = [file for file in files if file.to_replace and not file._unchanged_data] + # These are not in a package but need a metadata update, ensure even for zips + for file in files_new_metadata: + file._enforce_metadata_update = True + async with httpx.AsyncClient(**session_params) as session: with tempfile.TemporaryDirectory() as tmp_dir: packages = distribute_files(files_new) @@ -369,7 +373,10 @@ async def _update_metadata( try: if _tab_extension(dv_path) in file_mapping: file_id = file_mapping[_tab_extension(dv_path)] - elif file.file_name and _is_zip(file.file_name) and not file._is_inside_zip: + elif ( + file.file_name and _is_zip(file.file_name) + and not file._is_inside_zip and not file._enforce_metadata_update + ): # When the file is a zip package it will be unpacked and thus # the expected file name of the zip will not be in the # dataset, since it has been unpacked.