From 8d102d4bb140f8f1b37e7b74cc1309c1daccc561 Mon Sep 17 00:00:00 2001 From: Jan Range <30547301+JR-1991@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:39:25 +0200 Subject: [PATCH 1/5] Add field descriptions and aliases to File model Enhanced the File model by adding detailed descriptions and aliases to its fields using Pydantic's Field parameters. This improves code readability and provides better documentation for each attribute. --- dvuploader/file.py | 82 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 16 deletions(-) diff --git a/dvuploader/file.py b/dvuploader/file.py index 0ed9741..3d759e8 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -44,20 +44,72 @@ class File(BaseModel): arbitrary_types_allowed=True, ) - filepath: str = Field(..., exclude=True) - handler: Union[BytesIO, StringIO, IO, None] = Field(default=None, exclude=True) - description: str = "" - directory_label: str = Field(default="", alias="directoryLabel") - mimeType: str = "application/octet-stream" - categories: Optional[List[str]] = ["DATA"] - restrict: bool = False - checksum_type: ChecksumTypes = Field(default=ChecksumTypes.MD5, exclude=True) - storageIdentifier: Optional[str] = None - file_name: Optional[str] = Field(default=None, alias="fileName") - checksum: Optional[Checksum] = None - to_replace: bool = False - file_id: Optional[Union[str, int]] = Field(default=None, alias="fileToReplaceId") - tab_ingest: bool = Field(default=True, alias="tabIngest") + filepath: str = Field( + ..., + exclude=True, + description="The path to the file", + ) + handler: Union[BytesIO, StringIO, IO, None] = Field( + default=None, + exclude=True, + description="File handler for reading the file contents", + ) + description: Optional[str] = Field( + default=None, + alias="description", + description="The description of the file", + ) + directory_label: Optional[str] = Field( + default=None, + alias="directoryLabel", + description="The label of the directory where the file is stored", + ) + mimeType: str = Field( + default="application/octet-stream", + description="The MIME type of the file", + ) + categories: Optional[List[str]] = Field( + default=["DATA"], + alias="categories", + description="The categories associated with the file", + ) + restrict: bool = Field( + default=False, + alias="restrict", + description="Indicates if the file is restricted", + ) + checksum_type: ChecksumTypes = Field( + default=ChecksumTypes.MD5, + exclude=True, + description="The type of checksum used for the file", + ) + storageIdentifier: Optional[str] = Field( + default=None, + description="The identifier of the storage where the file is stored", + ) + file_name: Optional[str] = Field( + default=None, + alias="fileName", + description="The name of the file", + ) + checksum: Optional[Checksum] = Field( + default=None, + description="The checksum of the file", + ) + file_id: Optional[Union[str, int]] = Field( + default=None, + alias="fileToReplaceId", + description="The ID of the file to replace", + ) + tab_ingest: bool = Field( + default=True, + alias="tabIngest", + description="Indicates if tabular ingest should be performed", + ) + to_replace: bool = Field( + default=False, + description="Indicates if the file should be replaced", + ) _size: int = PrivateAttr(default=0) _unchanged_data: bool = PrivateAttr(default=False) @@ -126,7 +178,6 @@ def apply_checksum(self): self.checksum.apply_checksum() - def update_checksum_chunked(self, blocksize=2**20): """Updates the checksum with data read from a file-like object in chunks. @@ -155,7 +206,6 @@ def update_checksum_chunked(self, blocksize=2**20): self.handler.seek(0) - def __del__(self): if self.handler is not None: self.handler.close() From 3ed43b27409e8460073f1590021709e8f4ed8a32 Mon Sep 17 00:00:00 2001 From: Jan Range <30547301+JR-1991@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:39:30 +0200 Subject: [PATCH 2/5] Refactor file metadata handling in native upload Simplifies metadata construction by using model_dump with explicit field inclusion and removes manual forceReplace handling. Improves code readability and consistency in file categorization and metadata update logic. --- dvuploader/nativeupload.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index 4fe9412..8956bd7 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -92,8 +92,12 @@ async def native_upload( } files_new = [file for file in files if not file.to_replace] - files_new_metadata = [file for file in files if file.to_replace and file._unchanged_data] - files_replace = [file for file in files if file.to_replace and not file._unchanged_data] + files_new_metadata = [ + file for file in files if file.to_replace and file._unchanged_data + ] + files_replace = [ + file for file in files if file.to_replace and not file._unchanged_data + ] # These are not in a package but need a metadtata update, ensure even for zips for file in files_new_metadata: @@ -114,7 +118,7 @@ async def native_upload( file.file_name, # type: ignore total=file._size, ), - file + file, ) for file in files_replace ] @@ -325,17 +329,13 @@ def _get_json_data(file: File) -> Dict: Dict: Dictionary containing file metadata for the upload request. """ - metadata = { - "description": file.description, - "categories": file.categories, - "restrict": file.restrict, - "forceReplace": True, + include = { + "description", + "categories", + "restrict", + "tabIngest", } - - if file.directory_label: - metadata["directoryLabel"] = file.directory_label - - return metadata + return file.model_dump(by_alias=True, exclude_none=True, include=include) async def _update_metadata( @@ -374,8 +374,10 @@ async def _update_metadata( if _tab_extension(dv_path) in file_mapping: file_id = file_mapping[_tab_extension(dv_path)] elif ( - file.file_name and _is_zip(file.file_name) - and not file._is_inside_zip and not file._enforce_metadata_update + file.file_name + and _is_zip(file.file_name) + and not file._is_inside_zip + and not file._enforce_metadata_update ): # When the file is a zip package it will be unpacked and thus # the expected file name of the zip will not be in the @@ -426,8 +428,6 @@ async def _update_single_metadata( json_data = _get_json_data(file) - del json_data["forceReplace"] - # Send metadata as a readable byte stream # This is a workaround since "data" and "json" # does not work From 3b3efed8bb893293eb1698ff2ca666bb1b8853d3 Mon Sep 17 00:00:00 2001 From: Jan Range <30547301+JR-1991@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:39:34 +0200 Subject: [PATCH 3/5] Add tests for _prepare_registration function Introduces TestPrepareRegistration to verify correct handling of tab_ingest, restrict, and categories attributes in the _prepare_registration function. Ensures registration output matches expected values for various File configurations. --- tests/unit/test_directupload.py | 48 +++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/unit/test_directupload.py b/tests/unit/test_directupload.py index 0371a1c..2136832 100644 --- a/tests/unit/test_directupload.py +++ b/tests/unit/test_directupload.py @@ -4,6 +4,7 @@ from dvuploader.directupload import ( _add_files_to_ds, _validate_ticket_response, + _prepare_registration, ) from dvuploader.file import File @@ -128,3 +129,50 @@ def test_raises_assertion_error_when_abort_field_missing(self): } with pytest.raises(AssertionError): _validate_ticket_response(response) + + +class TestPrepareRegistration: + def test_tab_ingest_is_set_correctly(self): + files = [ + File(filepath="tests/fixtures/add_dir_files/somefile.txt"), + File( + filepath="tests/fixtures/add_dir_files/somefile.txt", + tab_ingest=False, # type: ignore + ), + File( + filepath="tests/fixtures/add_dir_files/somefile.txt", + restrict=True, + ), + File( + filepath="tests/fixtures/add_dir_files/somefile.txt", + categories=["Test file"], + ), + ] + registration = _prepare_registration(files, use_replace=False) + expected_registration = [ + { + "categories": ["DATA"], + "mimeType": "application/octet-stream", + "restrict": False, + "tabIngest": True, + }, + { + "categories": ["DATA"], + "mimeType": "application/octet-stream", + "restrict": False, + "tabIngest": False, + }, + { + "categories": ["DATA"], + "mimeType": "application/octet-stream", + "restrict": True, + "tabIngest": True, + }, + { + "categories": ["Test file"], + "mimeType": "application/octet-stream", + "restrict": False, + "tabIngest": True, + }, + ] + assert registration == expected_registration From f4d16cb21bb4841260d054a8bd7692f6fd80afbe Mon Sep 17 00:00:00 2001 From: Jan Range <30547301+JR-1991@users.noreply.github.com> Date: Wed, 24 Sep 2025 12:20:31 +0200 Subject: [PATCH 4/5] Handle missing directory_label in file path construction Updated dvuploader.py and nativeupload.py to properly construct file paths when directory_label is missing, falling back to file_name or raising a ValueError if neither is present. This improves robustness when processing files with incomplete metadata. --- dvuploader/dvuploader.py | 18 +++++++++++++++--- dvuploader/nativeupload.py | 9 ++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/dvuploader/dvuploader.py b/dvuploader/dvuploader.py index 20bd022..5d71e93 100644 --- a/dvuploader/dvuploader.py +++ b/dvuploader/dvuploader.py @@ -249,11 +249,15 @@ def _check_duplicates( file._unchanged_data = self._check_hashes(file, ds_file) if file._unchanged_data: table.add_row( - file.file_name, "[bright_cyan]Exists", "[bright_black]Replace Meta" + file.file_name, + "[bright_cyan]Exists", + "[bright_black]Replace Meta", ) else: table.add_row( - file.file_name, "[bright_cyan]Exists", "[bright_black]Replace" + file.file_name, + "[bright_cyan]Exists", + "[bright_black]Replace", ) else: table.add_row( @@ -302,7 +306,15 @@ def _get_file_id( # Find the file that matches label and directory_label for ds_file in ds_files: dspath = os.path.join(ds_file.get("directoryLabel", ""), ds_file["label"]) - fpath = os.path.join(file.directory_label, file.file_name) # type: ignore + + if file.directory_label: + fpath = os.path.join(file.directory_label, file.file_name) # type: ignore + elif file.file_name: + fpath = file.file_name + else: + raise ValueError( + f"File {file.file_name} has no directory label or file name." + ) if dspath == fpath: return ds_file["dataFile"]["id"] diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index 8956bd7..ea4389d 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -368,7 +368,14 @@ async def _update_metadata( tasks = [] for file in files: - dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore + if file.directory_label: + dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore + elif file.file_name: + dv_path = file.file_name + else: + raise ValueError( + f"File {file.file_name} has no directory label or file name." + ) try: if _tab_extension(dv_path) in file_mapping: From e41df5fdf747a70ea5de4f84f53f41d7331e0487 Mon Sep 17 00:00:00 2001 From: Jan Range <30547301+JR-1991@users.noreply.github.com> Date: Wed, 24 Sep 2025 12:20:36 +0200 Subject: [PATCH 5/5] Refactor file comparison in YAML config test Simplifies the file comparison logic in TestParseYAMLConfig by building the actual_files list with explicit handling of directory_label, improving readability and maintainability. --- tests/unit/test_cli.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 1dee82b..6cdaf8c 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -26,10 +26,15 @@ def test_full_input(self): assert cli_input.dataverse_url == "https://demo.dataverse.org/" assert cli_input.persistent_id == "doi:10.70122/XXX/XXXXX" + actual_files = [] + for file in cli_input.files: + if file.directory_label: + actual_files.append((file.directory_label, file.file_name)) + else: + actual_files.append(("", file.file_name)) + assert len(cli_input.files) == 2 - assert sorted( - [(file.directory_label, file.file_name) for file in cli_input.files] - ) == sorted(expected_files) + assert sorted(actual_files) == sorted(expected_files) class TestCLIMain: