Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions dvuploader/dvuploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,15 @@ def _check_duplicates(
file._unchanged_data = self._check_hashes(file, ds_file)
if file._unchanged_data:
table.add_row(
file.file_name, "[bright_cyan]Exists", "[bright_black]Replace Meta"
file.file_name,
"[bright_cyan]Exists",
"[bright_black]Replace Meta",
)
else:
table.add_row(
file.file_name, "[bright_cyan]Exists", "[bright_black]Replace"
file.file_name,
"[bright_cyan]Exists",
"[bright_black]Replace",
)
else:
table.add_row(
Expand Down Expand Up @@ -302,7 +306,15 @@ def _get_file_id(
# Find the file that matches label and directory_label
for ds_file in ds_files:
dspath = os.path.join(ds_file.get("directoryLabel", ""), ds_file["label"])
fpath = os.path.join(file.directory_label, file.file_name) # type: ignore

if file.directory_label:
fpath = os.path.join(file.directory_label, file.file_name) # type: ignore
elif file.file_name:
fpath = file.file_name
else:
raise ValueError(
f"File {file.file_name} has no directory label or file name."
)

if dspath == fpath:
return ds_file["dataFile"]["id"]
Expand Down
82 changes: 66 additions & 16 deletions dvuploader/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,72 @@ class File(BaseModel):
arbitrary_types_allowed=True,
)

filepath: str = Field(..., exclude=True)
handler: Union[BytesIO, StringIO, IO, None] = Field(default=None, exclude=True)
description: str = ""
directory_label: str = Field(default="", alias="directoryLabel")
mimeType: str = "application/octet-stream"
categories: Optional[List[str]] = ["DATA"]
restrict: bool = False
checksum_type: ChecksumTypes = Field(default=ChecksumTypes.MD5, exclude=True)
storageIdentifier: Optional[str] = None
file_name: Optional[str] = Field(default=None, alias="fileName")
checksum: Optional[Checksum] = None
to_replace: bool = False
file_id: Optional[Union[str, int]] = Field(default=None, alias="fileToReplaceId")
tab_ingest: bool = Field(default=True, alias="tabIngest")
filepath: str = Field(
...,
exclude=True,
description="The path to the file",
)
handler: Union[BytesIO, StringIO, IO, None] = Field(
default=None,
exclude=True,
description="File handler for reading the file contents",
)
description: Optional[str] = Field(
default=None,
alias="description",
description="The description of the file",
)
directory_label: Optional[str] = Field(
default=None,
alias="directoryLabel",
description="The label of the directory where the file is stored",
)
mimeType: str = Field(
default="application/octet-stream",
description="The MIME type of the file",
)
categories: Optional[List[str]] = Field(
default=["DATA"],
alias="categories",
description="The categories associated with the file",
)
restrict: bool = Field(
default=False,
alias="restrict",
description="Indicates if the file is restricted",
)
checksum_type: ChecksumTypes = Field(
default=ChecksumTypes.MD5,
exclude=True,
description="The type of checksum used for the file",
)
storageIdentifier: Optional[str] = Field(
default=None,
description="The identifier of the storage where the file is stored",
)
file_name: Optional[str] = Field(
default=None,
alias="fileName",
description="The name of the file",
)
checksum: Optional[Checksum] = Field(
default=None,
description="The checksum of the file",
)
file_id: Optional[Union[str, int]] = Field(
default=None,
alias="fileToReplaceId",
description="The ID of the file to replace",
)
tab_ingest: bool = Field(
default=True,
alias="tabIngest",
description="Indicates if tabular ingest should be performed",
)
to_replace: bool = Field(
default=False,
description="Indicates if the file should be replaced",
)

_size: int = PrivateAttr(default=0)
_unchanged_data: bool = PrivateAttr(default=False)
Expand Down Expand Up @@ -126,7 +178,6 @@ def apply_checksum(self):

self.checksum.apply_checksum()


def update_checksum_chunked(self, blocksize=2**20):
"""Updates the checksum with data read from a file-like object in chunks.

Expand Down Expand Up @@ -155,7 +206,6 @@ def update_checksum_chunked(self, blocksize=2**20):

self.handler.seek(0)


def __del__(self):
    """Best-effort finalizer: close the attached file handler, if any.

    Runs when the File object is garbage-collected so an open handler
    (e.g. a file object opened for checksumming/upload) is not leaked.
    """
    handler = self.handler
    if handler is not None:
        handler.close()
43 changes: 25 additions & 18 deletions dvuploader/nativeupload.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,12 @@ async def native_upload(
}

files_new = [file for file in files if not file.to_replace]
files_new_metadata = [file for file in files if file.to_replace and file._unchanged_data]
files_replace = [file for file in files if file.to_replace and not file._unchanged_data]
files_new_metadata = [
file for file in files if file.to_replace and file._unchanged_data
]
files_replace = [
file for file in files if file.to_replace and not file._unchanged_data
]

# These are not in a package but need a metadtata update, ensure even for zips
for file in files_new_metadata:
Expand All @@ -114,7 +118,7 @@ async def native_upload(
file.file_name, # type: ignore
total=file._size,
),
file
file,
)
for file in files_replace
]
Expand Down Expand Up @@ -325,17 +329,13 @@ def _get_json_data(file: File) -> Dict:
Dict: Dictionary containing file metadata for the upload request.
"""

metadata = {
"description": file.description,
"categories": file.categories,
"restrict": file.restrict,
"forceReplace": True,
include = {
"description",
"categories",
"restrict",
"tabIngest",
}

if file.directory_label:
metadata["directoryLabel"] = file.directory_label

return metadata
return file.model_dump(by_alias=True, exclude_none=True, include=include)


async def _update_metadata(
Expand Down Expand Up @@ -368,14 +368,23 @@ async def _update_metadata(
tasks = []

for file in files:
dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore
if file.directory_label:
dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore
elif file.file_name:
dv_path = file.file_name
else:
raise ValueError(
f"File {file.file_name} has no directory label or file name."
)

try:
if _tab_extension(dv_path) in file_mapping:
file_id = file_mapping[_tab_extension(dv_path)]
elif (
file.file_name and _is_zip(file.file_name)
and not file._is_inside_zip and not file._enforce_metadata_update
file.file_name
and _is_zip(file.file_name)
and not file._is_inside_zip
and not file._enforce_metadata_update
):
# When the file is a zip package it will be unpacked and thus
# the expected file name of the zip will not be in the
Expand Down Expand Up @@ -426,8 +435,6 @@ async def _update_single_metadata(

json_data = _get_json_data(file)

del json_data["forceReplace"]

# Send metadata as a readable byte stream
# This is a workaround since "data" and "json"
# does not work
Expand Down
11 changes: 8 additions & 3 deletions tests/unit/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,15 @@ def test_full_input(self):
assert cli_input.dataverse_url == "https://demo.dataverse.org/"
assert cli_input.persistent_id == "doi:10.70122/XXX/XXXXX"

actual_files = []
for file in cli_input.files:
if file.directory_label:
actual_files.append((file.directory_label, file.file_name))
else:
actual_files.append(("", file.file_name))

assert len(cli_input.files) == 2
assert sorted(
[(file.directory_label, file.file_name) for file in cli_input.files]
) == sorted(expected_files)
assert sorted(actual_files) == sorted(expected_files)


class TestCLIMain:
Expand Down
48 changes: 48 additions & 0 deletions tests/unit/test_directupload.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dvuploader.directupload import (
_add_files_to_ds,
_validate_ticket_response,
_prepare_registration,
)

from dvuploader.file import File
Expand Down Expand Up @@ -128,3 +129,50 @@ def test_raises_assertion_error_when_abort_field_missing(self):
}
with pytest.raises(AssertionError):
_validate_ticket_response(response)


class TestPrepareRegistration:
    def test_tab_ingest_is_set_correctly(self):
        """_prepare_registration must reflect each File's per-file flags.

        Four files are built from the same fixture path, each overriding a
        different field (tabIngest, restrict, categories); the registration
        payload for each file must carry exactly that override on top of the
        shared defaults.
        """
        fixture = "tests/fixtures/add_dir_files/somefile.txt"
        overrides = [
            {},
            {"tab_ingest": False},
            {"restrict": True},
            {"categories": ["Test file"]},
        ]
        files = [File(filepath=fixture, **kwargs) for kwargs in overrides]  # type: ignore

        registration = _prepare_registration(files, use_replace=False)

        # Shared defaults for every registered file; each entry below
        # replaces exactly one key to mirror the corresponding override.
        base_entry = {
            "categories": ["DATA"],
            "mimeType": "application/octet-stream",
            "restrict": False,
            "tabIngest": True,
        }
        expected_registration = [
            {**base_entry},
            {**base_entry, "tabIngest": False},
            {**base_entry, "restrict": True},
            {**base_entry, "categories": ["Test file"]},
        ]
        assert registration == expected_registration