diff --git a/dvuploader/dvuploader.py b/dvuploader/dvuploader.py index 20bd022..5d71e93 100644 --- a/dvuploader/dvuploader.py +++ b/dvuploader/dvuploader.py @@ -249,11 +249,15 @@ def _check_duplicates( file._unchanged_data = self._check_hashes(file, ds_file) if file._unchanged_data: table.add_row( - file.file_name, "[bright_cyan]Exists", "[bright_black]Replace Meta" + file.file_name, + "[bright_cyan]Exists", + "[bright_black]Replace Meta", ) else: table.add_row( - file.file_name, "[bright_cyan]Exists", "[bright_black]Replace" + file.file_name, + "[bright_cyan]Exists", + "[bright_black]Replace", ) else: table.add_row( @@ -302,7 +306,15 @@ def _get_file_id( # Find the file that matches label and directory_label for ds_file in ds_files: dspath = os.path.join(ds_file.get("directoryLabel", ""), ds_file["label"]) - fpath = os.path.join(file.directory_label, file.file_name) # type: ignore + + if file.directory_label: + fpath = os.path.join(file.directory_label, file.file_name) # type: ignore + elif file.file_name: + fpath = file.file_name + else: + raise ValueError( + f"File {file.file_name} has no directory label or file name." + ) if dspath == fpath: return ds_file["dataFile"]["id"] diff --git a/dvuploader/file.py b/dvuploader/file.py index 0ed9741..3d759e8 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -44,20 +44,72 @@ class File(BaseModel): arbitrary_types_allowed=True, ) - filepath: str = Field(..., exclude=True) - handler: Union[BytesIO, StringIO, IO, None] = Field(default=None, exclude=True) - description: str = "" - directory_label: str = Field(default="", alias="directoryLabel") - mimeType: str = "application/octet-stream" - categories: Optional[List[str]] = ["DATA"] - restrict: bool = False - checksum_type: ChecksumTypes = Field(default=ChecksumTypes.MD5, exclude=True) - storageIdentifier: Optional[str] = None - file_name: Optional[str] = Field(default=None, alias="fileName") - checksum: Optional[Checksum] = None - to_replace: bool = False - file_id: Optional[Union[str, int]] = Field(default=None, alias="fileToReplaceId") - tab_ingest: bool = Field(default=True, alias="tabIngest") + filepath: str = Field( + ..., + exclude=True, + description="The path to the file", + ) + handler: Union[BytesIO, StringIO, IO, None] = Field( + default=None, + exclude=True, + description="File handler for reading the file contents", + ) + description: Optional[str] = Field( + default=None, + alias="description", + description="The description of the file", + ) + directory_label: Optional[str] = Field( + default=None, + alias="directoryLabel", + description="The label of the directory where the file is stored", + ) + mimeType: str = Field( + default="application/octet-stream", + description="The MIME type of the file", + ) + categories: Optional[List[str]] = Field( + default=["DATA"], + alias="categories", + description="The categories associated with the file", + ) + restrict: bool = Field( + default=False, + alias="restrict", + description="Indicates if the file is restricted", + ) + checksum_type: ChecksumTypes = Field( + default=ChecksumTypes.MD5, + exclude=True, + description="The type of checksum used for the file", + ) + storageIdentifier: Optional[str] = Field( + default=None, + description="The identifier of the storage where the file is stored", + ) + file_name: Optional[str] = Field( + default=None, + alias="fileName", + description="The name of the file", + ) + checksum: Optional[Checksum] = Field( + default=None, + description="The checksum of the file", + ) + file_id: Optional[Union[str, int]] = Field( + default=None, + alias="fileToReplaceId", + description="The ID of the file to replace", + ) + tab_ingest: bool = Field( + default=True, + alias="tabIngest", + description="Indicates if tabular ingest should be performed", + ) + to_replace: bool = Field( + default=False, + description="Indicates if the file should be replaced", + ) _size: int = PrivateAttr(default=0) _unchanged_data: bool = PrivateAttr(default=False) @@ -126,7 +178,6 @@ def apply_checksum(self): self.checksum.apply_checksum() - def update_checksum_chunked(self, blocksize=2**20): """Updates the checksum with data read from a file-like object in chunks. @@ -155,7 +206,6 @@ def update_checksum_chunked(self, blocksize=2**20): self.handler.seek(0) - def __del__(self): if self.handler is not None: self.handler.close() diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index 4fe9412..ea4389d 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -92,8 +92,12 @@ async def native_upload( } files_new = [file for file in files if not file.to_replace] - files_new_metadata = [file for file in files if file.to_replace and file._unchanged_data] - files_replace = [file for file in files if file.to_replace and not file._unchanged_data] + files_new_metadata = [ + file for file in files if file.to_replace and file._unchanged_data + ] + files_replace = [ + file for file in files if file.to_replace and not file._unchanged_data + ] # These are not in a package but need a metadtata update, ensure even for zips for file in files_new_metadata: @@ -114,7 +118,7 @@ async def native_upload( file.file_name, # type: ignore total=file._size, ), - file + file, ) for file in files_replace ] @@ -325,17 +329,13 @@ def _get_json_data(file: File) -> Dict: Dict: Dictionary containing file metadata for the upload request. """ - metadata = { - "description": file.description, - "categories": file.categories, - "restrict": file.restrict, - "forceReplace": True, + include = { + "description", + "categories", + "restrict", + "tabIngest", } - - if file.directory_label: - metadata["directoryLabel"] = file.directory_label - - return metadata + return file.model_dump(by_alias=True, exclude_none=True, include=include) async def _update_metadata( @@ -368,14 +368,23 @@ async def _update_metadata( tasks = [] for file in files: - dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore + if file.directory_label: + dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore + elif file.file_name: + dv_path = file.file_name + else: + raise ValueError( + f"File {file.file_name} has no directory label or file name." + ) try: if _tab_extension(dv_path) in file_mapping: file_id = file_mapping[_tab_extension(dv_path)] elif ( - file.file_name and _is_zip(file.file_name) - and not file._is_inside_zip and not file._enforce_metadata_update + file.file_name + and _is_zip(file.file_name) + and not file._is_inside_zip + and not file._enforce_metadata_update ): # When the file is a zip package it will be unpacked and thus # the expected file name of the zip will not be in the @@ -426,8 +435,6 @@ async def _update_single_metadata( json_data = _get_json_data(file) - del json_data["forceReplace"] - # Send metadata as a readable byte stream # This is a workaround since "data" and "json" # does not work diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 1dee82b..6cdaf8c 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -26,10 +26,15 @@ def test_full_input(self): assert cli_input.dataverse_url == "https://demo.dataverse.org/" assert cli_input.persistent_id == "doi:10.70122/XXX/XXXXX" + actual_files = [] + for file in cli_input.files: + if file.directory_label: + actual_files.append((file.directory_label, file.file_name)) + else: + actual_files.append(("", file.file_name)) + assert len(cli_input.files) == 2 - assert sorted( - [(file.directory_label, file.file_name) for file in cli_input.files] - ) == sorted(expected_files) + assert sorted(actual_files) == sorted(expected_files) class TestCLIMain: diff --git a/tests/unit/test_directupload.py b/tests/unit/test_directupload.py index 0371a1c..2136832 100644 --- a/tests/unit/test_directupload.py +++ b/tests/unit/test_directupload.py @@ -4,6 +4,7 @@ from dvuploader.directupload import ( _add_files_to_ds, _validate_ticket_response, + _prepare_registration, ) from dvuploader.file import File @@ -128,3 +129,50 @@ def test_raises_assertion_error_when_abort_field_missing(self): } with pytest.raises(AssertionError): _validate_ticket_response(response) + + +class TestPrepareRegistration: + def test_tab_ingest_is_set_correctly(self): + files = [ + File(filepath="tests/fixtures/add_dir_files/somefile.txt"), + File( + filepath="tests/fixtures/add_dir_files/somefile.txt", + tab_ingest=False, # type: ignore + ), + File( + filepath="tests/fixtures/add_dir_files/somefile.txt", + restrict=True, + ), + File( + filepath="tests/fixtures/add_dir_files/somefile.txt", + categories=["Test file"], + ), + ] + registration = _prepare_registration(files, use_replace=False) + expected_registration = [ + { + "categories": ["DATA"], + "mimeType": "application/octet-stream", + "restrict": False, + "tabIngest": True, + }, + { + "categories": ["DATA"], + "mimeType": "application/octet-stream", + "restrict": False, + "tabIngest": False, + }, + { + "categories": ["DATA"], + "mimeType": "application/octet-stream", + "restrict": True, + "tabIngest": True, + }, + { + "categories": ["Test file"], + "mimeType": "application/octet-stream", + "restrict": False, + "tabIngest": True, + }, + ] + assert registration == expected_registration