diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 769019d..2ee6a2d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,8 +20,8 @@ jobs:
     env:
       PORT: 8080
     steps:
-      - name: "Checkout"
-        uses: "actions/checkout@v4"
+      - name: 'Checkout'
+        uses: 'actions/checkout@v4'
       - name: Run Dataverse Action
         id: dataverse
         uses: gdcc/dataverse-action@main
@@ -38,6 +38,7 @@ jobs:
         env:
           API_TOKEN: ${{ steps.dataverse.outputs.api_token }}
           BASE_URL: ${{ steps.dataverse.outputs.base_url }}
-          DVUPLOADER_TESTING: "true"
+          DVUPLOADER_TESTING: 'true'
+          TEST_ROWS: 100000
         run: |
           python3 -m poetry run pytest
diff --git a/README.md b/README.md
index 049d8c5..3bc8ec5 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,7 @@ Alternatively, you can also supply a `config` file that contains all necessary i
 * `mimetype`: Mimetype of the file.
 * `categories`: Optional list of categories to assign to the file.
 * `restrict`: Boolean to indicate that this is a restricted file. Defaults to False.
+* `tabIngest`: Boolean to indicate that the file should be ingested as a tabular data file. Defaults to True.
 
 In the following example, we upload three files to a Dataverse instance. The first file is uploaded to the root directory of the dataset, while the other two files are uploaded to the directory `some/dir`.
 
@@ -115,6 +116,61 @@ The `config` file can then be used as follows:
 dvuploader --config-path config.yml
 ```
 
+### Environment variables
+
+DVUploader provides several environment variables that control the retry logic and the upload size limit. These can be set either directly in the environment or programmatically via the `config` function.
+
+**Available Environment Variables:**
+- `DVUPLOADER_MAX_RETRIES`: Maximum number of retry attempts (default: 15)
+- `DVUPLOADER_MAX_RETRY_TIME`: Maximum wait time between retries in seconds (default: 240)
+- `DVUPLOADER_MIN_RETRY_TIME`: Minimum wait time between retries in seconds (default: 1)
+- `DVUPLOADER_RETRY_MULTIPLIER`: Multiplier for exponential backoff (default: 0.1)
+- `DVUPLOADER_MAX_PKG_SIZE`: Maximum package size in bytes (default: 2GB)
+
+**Setting via environment:**
+```bash
+export DVUPLOADER_MAX_RETRIES=20
+export DVUPLOADER_MAX_RETRY_TIME=300
+export DVUPLOADER_MIN_RETRY_TIME=2
+export DVUPLOADER_RETRY_MULTIPLIER=0.2
+export DVUPLOADER_MAX_PKG_SIZE=3221225472 # 3GB
+```
+
+**Setting programmatically:**
+```python
+import dvuploader as dv
+
+# Configure the uploader settings
+dv.config(
+    max_retries=20,
+    max_retry_time=300,
+    min_retry_time=2,
+    retry_multiplier=0.2,
+    max_package_size=3 * 1024**3 # 3GB
+)
+
+# Continue with your upload as normal
+files = [dv.File(filepath="./data.csv")]
+dvuploader = dv.DVUploader(files=files)
+# ... rest of your upload code
+```
+
+The retry logic uses exponential backoff, so each subsequent retry waits longer but never longer than `max_retry_time`. This is particularly useful for native uploads, which may be subject to intermediate locks on the Dataverse side.
+
+## Troubleshooting
+
+#### `500` error and `OptimisticLockException`
+
+When uploading multiple tabular files, you might encounter a `500` error and an `OptimisticLockException` during the file registration step. This is discussed in https://github.com/IQSS/dataverse/issues/11265 and happens because intermediate locks prevent the file registration step from completing.
+
+A workaround is to set the `tabIngest` flag to `False` for all files to be uploaded. This prevents the files from being ingested, but avoids the intermediate locks.
+
+```python
+dv.File(filepath="tab_file.csv", tab_ingest=False)
+```
+
+Please be aware that your tabular files will not be ingested as such but will be uploaded in their native format. You can utilize [pyDataverse](https://github.com/gdcc/pyDataverse/blob/693d0ff8d2849eccc32f9e66228ee8976109881a/pyDataverse/api.py#L2475) to ingest the files after they have been uploaded.
+
 ## Development
 
 To install the development dependencies, run the following command:
diff --git a/dvuploader/__init__.py b/dvuploader/__init__.py
index d357e67..fb8970f 100644
--- a/dvuploader/__init__.py
+++ b/dvuploader/__init__.py
@@ -1,6 +1,7 @@
 from .dvuploader import DVUploader # noqa: F401
 from .file import File # noqa: F401
 from .utils import add_directory # noqa: F401
+from .config import config # noqa: F401
 
 import nest_asyncio
 
diff --git a/dvuploader/config.py b/dvuploader/config.py
new file mode 100644
index 0000000..4203782
--- /dev/null
+++ b/dvuploader/config.py
@@ -0,0 +1,56 @@
+import os
+
+
+def config(
+    max_retries: int = 15,
+    max_retry_time: int = 240,
+    min_retry_time: int = 1,
+    retry_multiplier: float = 0.1,
+    max_package_size: int = 2 * 1024**3,
+):
+    """This function sets the environment variables for the dvuploader package.
+
+    Use this function to set the environment variables for the dvuploader package,
+    which control the behavior of the package. This is particularly useful when
+    you want to relax the retry logic or the upload size limit.
+
+    Retry logic:
+        Native uploads in particular may be subject to intermediate locks
+        on the Dataverse side, which may cause the upload to fail. We provide
+        an exponential backoff mechanism to deal with this.
+
+        The exponential backoff is controlled by the following environment variables:
+        - DVUPLOADER_MAX_RETRIES: The maximum number of retries.
+        - DVUPLOADER_MAX_RETRY_TIME: The maximum retry time.
+        - DVUPLOADER_MIN_RETRY_TIME: The minimum retry time.
+        - DVUPLOADER_RETRY_MULTIPLIER: The retry multiplier.
+
+        The wait time grows exponentially with the number of retries n:
+        wait_time = retry_multiplier * 2**n,
+        floored at min_retry_time.
+
+        The wait time will not exceed max_retry_time.
+
+    Upload size:
+        The maximum upload size is controlled by the following environment variable:
+        - DVUPLOADER_MAX_PKG_SIZE: The maximum package size.
+
+        The default maximum package size is 2GB, but this can be changed by
+        setting the DVUPLOADER_MAX_PKG_SIZE environment variable.
+
+        We recommend not exceeding 2GB, as this is the maximum size supported
+        by Dataverse and beyond that the risk of failure increases.
+
+    Args:
+        max_retries (int): The maximum number of retries.
+        max_retry_time (int): The maximum retry time.
+        min_retry_time (int): The minimum retry time.
+        retry_multiplier (float): The retry multiplier.
+        max_package_size (int): The maximum package size.
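+
+    Example:
+        A minimal, illustrative call (values are examples only; any argument
+        can be omitted to keep its default):
+
+            import dvuploader as dv
+
+            dv.config(max_retries=20, max_package_size=3 * 1024**3)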
+ """ + + os.environ["DVUPLOADER_MAX_RETRIES"] = str(max_retries) + os.environ["DVUPLOADER_MAX_RETRY_TIME"] = str(max_retry_time) + os.environ["DVUPLOADER_MIN_RETRY_TIME"] = str(min_retry_time) + os.environ["DVUPLOADER_RETRY_MULTIPLIER"] = str(retry_multiplier) + os.environ["DVUPLOADER_MAX_PKG_SIZE"] = str(max_package_size) diff --git a/dvuploader/directupload.py b/dvuploader/directupload.py index cc1ade4..a8298e9 100644 --- a/dvuploader/directupload.py +++ b/dvuploader/directupload.py @@ -337,7 +337,6 @@ async def _upload_multipart( ) file.apply_checksum() - print(file.checksum) return True, storage_identifier @@ -556,17 +555,21 @@ async def _add_files_to_ds( novel_json_data = _prepare_registration(files, use_replace=False) replace_json_data = _prepare_registration(files, use_replace=True) - await _multipart_json_data_request( - session=session, - json_data=novel_json_data, - url=novel_url, - ) + if novel_json_data: + # Register new files, if any + await _multipart_json_data_request( + session=session, + json_data=novel_json_data, + url=novel_url, + ) - await _multipart_json_data_request( - session=session, - json_data=replace_json_data, - url=replace_url, - ) + if replace_json_data: + # Register replacement files, if any + await _multipart_json_data_request( + session=session, + json_data=replace_json_data, + url=replace_url, + ) progress.update(pbar, advance=1) diff --git a/dvuploader/file.py b/dvuploader/file.py index 8a9bb44..dbd7c7a 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -45,7 +45,7 @@ class File(BaseModel): handler: Union[BytesIO, StringIO, IO, None] = Field(default=None, exclude=True) description: str = "" directory_label: str = Field(default="", alias="directoryLabel") - mimeType: str = "text/plain" + mimeType: str = "application/octet-stream" categories: List[str] = ["DATA"] restrict: bool = False checksum_type: ChecksumTypes = Field(default=ChecksumTypes.MD5, exclude=True) @@ -54,6 +54,7 @@ class File(BaseModel): checksum: Optional[Checksum] = None to_replace: bool = False file_id: Optional[Union[str, int]] = Field(default=None, alias="fileToReplaceId") + tab_ingest: bool = Field(default=True, alias="tabIngest") _size: int = PrivateAttr(default=0) diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index 55a4f07..7296c86 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -1,9 +1,11 @@ import asyncio from io import BytesIO +from pathlib import Path import httpx import json import os import tempfile +import rich import tenacity from typing import List, Optional, Tuple, Dict @@ -13,12 +15,44 @@ from dvuploader.packaging import distribute_files, zip_files from dvuploader.utils import build_url, retrieve_dataset_files +##### CONFIGURATION ##### + +# Based on MAX_RETRIES, we will wait between 0.3 and 120 seconds between retries: +# Exponential recursion: 0.1 * 2^n +# +# This will exponentially increase the wait time between retries. +# The max wait time is 240 seconds per retry though. 
MAX_RETRIES = int(os.environ.get("DVUPLOADER_MAX_RETRIES", 15)) +MAX_RETRY_TIME = int(os.environ.get("DVUPLOADER_MAX_RETRY_TIME", 60)) +MIN_RETRY_TIME = int(os.environ.get("DVUPLOADER_MIN_RETRY_TIME", 1)) +RETRY_MULTIPLIER = float(os.environ.get("DVUPLOADER_RETRY_MULTIPLIER", 0.1)) +RETRY_STRAT = tenacity.wait_exponential( + multiplier=RETRY_MULTIPLIER, + min=MIN_RETRY_TIME, + max=MAX_RETRY_TIME, +) + +assert isinstance(MAX_RETRIES, int), "DVUPLOADER_MAX_RETRIES must be an integer" +assert isinstance(MAX_RETRY_TIME, int), "DVUPLOADER_MAX_RETRY_TIME must be an integer" +assert isinstance(MIN_RETRY_TIME, int), "DVUPLOADER_MIN_RETRY_TIME must be an integer" +assert isinstance(RETRY_MULTIPLIER, float), ( + "DVUPLOADER_RETRY_MULTIPLIER must be a float" +) + +##### END CONFIGURATION ##### + NATIVE_UPLOAD_ENDPOINT = "/api/datasets/:persistentId/add" NATIVE_REPLACE_ENDPOINT = "/api/files/{FILE_ID}/replace" NATIVE_METADATA_ENDPOINT = "/api/files/{FILE_ID}/metadata" -assert isinstance(MAX_RETRIES, int), "DVUPLOADER_MAX_RETRIES must be an integer" +TABULAR_EXTENSIONS = [ + "csv", + "tsv", +] + +##### ERROR MESSAGES ##### + +ZIP_LIMIT_MESSAGE = "The number of files in the zip archive is over the limit" async def native_upload( @@ -174,8 +208,9 @@ def _reset_progress( @tenacity.retry( - wait=tenacity.wait_fixed(0.5), + wait=RETRY_STRAT, stop=tenacity.stop_after_attempt(MAX_RETRIES), + retry=tenacity.retry_if_exception_type((httpx.HTTPStatusError,)), ) async def _single_native_upload( session: httpx.AsyncClient, @@ -213,7 +248,11 @@ async def _single_native_upload( json_data = _get_json_data(file) files = { - "file": (file.file_name, file.handler, file.mimeType), + "file": ( + file.file_name, + file.handler, + file.mimeType, + ), "jsonData": ( None, BytesIO(json.dumps(json_data).encode()), @@ -226,9 +265,20 @@ async def _single_native_upload( files=files, # type: ignore ) + if response.status_code == 400 and response.json()["message"].startswith( + ZIP_LIMIT_MESSAGE + ): + # Explicitly handle the zip limit error, because otherwise we will run into + # unnecessary retries. + raise ValueError( + f"Could not upload file '{file.file_name}' due to zip limit:\n{response.json()['message']}" + ) + + # Any other error is re-raised and the error will be handled by the retry logic. response.raise_for_status() if response.status_code == 200: + # If we did well, update the progress bar. progress.update(pbar, advance=file._size, complete=file._size) # Wait to avoid rate limiting @@ -238,7 +288,6 @@ async def _single_native_upload( # Wait to avoid rate limiting await asyncio.sleep(1.0) - return False, {"message": "Failed to upload file"} @@ -294,14 +343,23 @@ async def _update_metadata( dv_path = os.path.join(file.directory_label, file.file_name) # type: ignore try: - file_id = file_mapping[dv_path] + if _tab_extension(dv_path) in file_mapping: + file_id = file_mapping[_tab_extension(dv_path)] + elif file.file_name and _is_zip(file.file_name): + # When the file is a zip it will be unpacked and thus + # the expected file name of the zip will not be in the + # dataset, since it has been unpacked. 
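+                # The unpacked members appear in the dataset under their own
+                # names, so there is no entry for the archive itself to update.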
+ continue + else: + file_id = file_mapping[dv_path] except KeyError: - raise ValueError( + rich.print( ( f"File {dv_path} not found in Dataverse repository.", - "This may be due to the file not being uploaded to the repository.", + "This may be due to the file not being uploaded to the repository:", ) ) + continue task = _update_single_metadata( session=session, @@ -315,7 +373,7 @@ async def _update_metadata( @tenacity.retry( - wait=tenacity.wait_fixed(0.3), + wait=RETRY_STRAT, stop=tenacity.stop_after_attempt(MAX_RETRIES), ) async def _update_single_metadata( @@ -356,6 +414,7 @@ async def _update_single_metadata( if response.status_code == 200: return else: + print(response.json()) await asyncio.sleep(1.0) raise ValueError(f"Failed to update metadata for file {file.file_name}.") @@ -407,3 +466,17 @@ def _create_file_id_path_mapping(files): mapping[path] = file["id"] return mapping + + +def _tab_extension(path: str) -> str: + """ + Adds a tabular extension to the path if it is not already present. + """ + return str(Path(path).with_suffix(".tab")) + + +def _is_zip(file_name: str) -> bool: + """ + Checks if a file name ends with a zip extension. + """ + return file_name.endswith(".zip") diff --git a/pyproject.toml b/pyproject.toml index d36c8d1..004eebd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.9" pydantic = "^2.5.3" -httpx = "^0.27.0" +httpx = "^0.28" typer = ">=0.9, <0.16" pyyaml = "^6.0.1" nest-asyncio = "^1.5.8" @@ -25,7 +25,7 @@ codespell = "^2.2.6" coverage = "^7.4.0" pytest-cov = "^4.1.0" pytest-asyncio = "^0.23.3" -pytest-httpx = "^0.30.0" +pytest-httpx = "^0.35.0" [tool.poetry.group.jupyter.dependencies] ipywidgets = "^8.1.1" @@ -33,7 +33,7 @@ ipywidgets = "^8.1.1" [tool.poetry.group.test.dependencies] pytest-cov = "^4.1.0" pytest-asyncio = "^0.23.3" -pytest-httpx = "^0.30.0" +pytest-httpx = "^0.35.0" [tool.poetry.group.linting.dependencies] codespell = "^2.2.6" diff --git a/tests/conftest.py b/tests/conftest.py index 2590812..84e5cb2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ import os import pytest import httpx +import random @pytest.fixture @@ -67,3 +68,34 @@ def create_mock_file( f.write(b"\0") return path + + +def create_mock_tabular_file( + directory: str, + name: str, + rows: int = 1000000, + cols: int = 10, +): + """Create a tabular file with the specified number of rows and columns. + + Args: + directory (str): The directory where the file will be created. + name (str): The name of the file. + rows (int, optional): The number of rows in the file. Defaults to 1000000. + cols (int, optional): The number of columns in the file. Defaults to 10. 
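+
+    Returns:
+        str: The path of the created CSV file.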
+ """ + path = os.path.join(directory, name) + with open(path, "w") as f: + # Create header + f.write(",".join([f"col_{i}" for i in range(cols)]) + "\n") + + # Create rows + for i in range(rows): + f.write( + f"{i}" + + "," + + ",".join([f"{random.randint(0, 100)}" for j in range(cols - 1)]) + + "\n" + ) + + return path diff --git a/tests/fixtures/archive.zip b/tests/fixtures/archive.zip new file mode 100644 index 0000000..dfe0f43 Binary files /dev/null and b/tests/fixtures/archive.zip differ diff --git a/tests/fixtures/archive.zip.zip b/tests/fixtures/archive.zip.zip new file mode 100644 index 0000000..e98d470 Binary files /dev/null and b/tests/fixtures/archive.zip.zip differ diff --git a/tests/fixtures/many_files.zip b/tests/fixtures/many_files.zip new file mode 100644 index 0000000..87cc6ef Binary files /dev/null and b/tests/fixtures/many_files.zip differ diff --git a/tests/integration/test_native_upload.py b/tests/integration/test_native_upload.py index 8c17a36..ae80990 100644 --- a/tests/integration/test_native_upload.py +++ b/tests/integration/test_native_upload.py @@ -1,12 +1,15 @@ from io import BytesIO import json +import os import tempfile +import pytest + from dvuploader.dvuploader import DVUploader from dvuploader.file import File from dvuploader.utils import add_directory, retrieve_dataset_files -from tests.conftest import create_dataset, create_mock_file +from tests.conftest import create_dataset, create_mock_file, create_mock_tabular_file class TestNativeUpload: @@ -219,3 +222,273 @@ def test_native_upload_by_handler( assert file["description"] == "This is a test", ( f"Description does not match for file {json.dumps(file)}" ) + + def test_native_upload_with_large_tabular_files_loop( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # We are uploading large tabular files in a loop to test the uploader's + # ability to wait for locks to be released. + # + # The uploader should wait for the lock to be released and then upload + # the file. + # + rows = os.environ.get("TEST_ROWS", 10000) + + try: + rows = int(rows) + except ValueError: + raise ValueError(f"TEST_ROWS must be an integer, got {rows}") + + # We first try the sequential case by uploading 10 files in a loop. + with tempfile.TemporaryDirectory() as directory: + for i in range(10): + # Arrange + path = create_mock_tabular_file( + directory, + f"large_tabular_file_{i}.csv", + rows=rows, + cols=20, + ) + + # Add all files in the directory + files = [File(filepath=path)] + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=1, + ) + + def test_native_upload_with_large_tabular_files( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # We are uploading large tabular files in a loop to test the uploader's + # ability to wait for locks to be released. + # + # The uploader should wait for the lock to be released and then upload + # the file. + # + rows = os.environ.get("TEST_ROWS", 10000) + + try: + rows = int(rows) + except ValueError: + raise ValueError(f"TEST_ROWS must be an integer, got {rows}") + + # We first try the sequential case by uploading 10 files in a loop. 
+ with tempfile.TemporaryDirectory() as directory: + files = [] + for i in range(10): + # Arrange + path = create_mock_tabular_file( + directory, + f"large_tabular_file_{i}.csv", + rows=rows, + cols=20, + ) + + # Add all files in the directory + files.append(File(filepath=path)) + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=1, + ) + + def test_native_upload_with_large_tabular_files_parallel( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # We are uploading large tabular files in a loop to test the uploader's + # ability to wait for locks to be released. + # + # The uploader should wait for the lock to be released and then upload + # the file. + # + rows = os.environ.get("TEST_ROWS", 10000) + + try: + rows = int(rows) + except ValueError: + raise ValueError(f"TEST_ROWS must be an integer, got {rows}") + + # We first try the sequential case by uploading 10 files in a loop. + with tempfile.TemporaryDirectory() as directory: + files = [] + for i in range(10): + # Arrange + path = create_mock_tabular_file( + directory, + f"large_tabular_file_{i}.csv", + rows=rows, + cols=20, + ) + + # Add all files in the directory + files.append(File(filepath=path)) + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=10, + ) + + def test_zip_file_upload( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # Arrange + files = [ + File(filepath="tests/fixtures/archive.zip"), + ] + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=10, + ) + + # Assert + files = retrieve_dataset_files( + dataverse_url=BASE_URL, + persistent_id=pid, + api_token=API_TOKEN, + ) + + assert len(files) == 5, f"Expected 5 files, got {len(files)}" + + expected_files = [ + "hallo.tab", + "hallo2.tab", + "hallo3.tab", + "hallo4.tab", + "hallo5.tab", + ] + + assert sorted([file["label"] for file in files]) == sorted(expected_files) + + def test_zipzip_file_upload( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # Arrange + files = [ + File(filepath="tests/fixtures/archive.zip"), + ] + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=10, + ) + + # Assert + files = retrieve_dataset_files( + dataverse_url=BASE_URL, + persistent_id=pid, + api_token=API_TOKEN, + ) + + assert len(files) == 5, f"Expected 5 files, got {len(files)}" + + expected_files = [ + "hallo.tab", + "hallo2.tab", + "hallo3.tab", + "hallo4.tab", + "hallo5.tab", + ] + + assert sorted([file["label"] for file in files]) == sorted(expected_files) + + def test_too_many_zip_files( + self, + credentials, + ): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # Arrange + files = [ + File(filepath="tests/fixtures/many_files.zip"), + ] + + # Act + uploader = DVUploader(files=files) + 
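+        # many_files.zip is expected to exceed Dataverse's zip file-count limit,
+        # which the uploader surfaces as a ValueError instead of retrying
+        # (see the ZIP_LIMIT_MESSAGE handling in nativeupload.py).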
+ with pytest.raises(ValueError): + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=10, + ) diff --git a/tests/unit/test_directupload.py b/tests/unit/test_directupload.py index a2c2d93..0371a1c 100644 --- a/tests/unit/test_directupload.py +++ b/tests/unit/test_directupload.py @@ -12,13 +12,35 @@ class Test_AddFileToDs: # Should successfully add files to a Dataverse dataset with a valid file path @pytest.mark.asyncio - async def test_successfully_add_replace_file_with_valid_filepath(self, httpx_mock): + async def test_successfully_add_file_with_valid_filepath(self, httpx_mock): # Mock the session.post method to return a response with status code 200 httpx_mock.add_response( method="post", url="https://example.com/api/datasets/:persistentId/addFiles?persistentId=pid", ) + # Initialize the necessary variables + session = httpx.AsyncClient() + dataverse_url = "https://example.com" + pid = "pid" + fpath = "tests/fixtures/add_dir_files/somefile.txt" + files = [File(filepath=fpath)] + progress = Progress() + pbar = progress.add_task("Uploading", total=1) + + # Invoke the function + await _add_files_to_ds( + session=session, + dataverse_url=dataverse_url, + pid=pid, + files=files, + progress=progress, + pbar=pbar, + ) + + @pytest.mark.asyncio + async def test_successfully_replace_file_with_valid_filepath(self, httpx_mock): + # Mock the session.post method to return a response with status code 200 httpx_mock.add_response( method="post", url="https://example.com/api/datasets/:persistentId/replaceFiles?persistentId=pid", @@ -29,7 +51,44 @@ async def test_successfully_add_replace_file_with_valid_filepath(self, httpx_moc dataverse_url = "https://example.com" pid = "pid" fpath = "tests/fixtures/add_dir_files/somefile.txt" - files = [File(filepath=fpath)] + files = [File(filepath=fpath, to_replace=True)] + progress = Progress() + pbar = progress.add_task("Uploading", total=1) + + # Invoke the function + await _add_files_to_ds( + session=session, + dataverse_url=dataverse_url, + pid=pid, + files=files, + progress=progress, + pbar=pbar, + ) + + @pytest.mark.asyncio + async def test_successfully_add_and_replace_file_with_valid_filepath( + self, httpx_mock + ): + # Mock the session.post method to return a response with status code 200 + httpx_mock.add_response( + method="post", + url="https://example.com/api/datasets/:persistentId/replaceFiles?persistentId=pid", + ) + + httpx_mock.add_response( + method="post", + url="https://example.com/api/datasets/:persistentId/addFiles?persistentId=pid", + ) + + # Initialize the necessary variables + session = httpx.AsyncClient() + dataverse_url = "https://example.com" + pid = "pid" + fpath = "tests/fixtures/add_dir_files/somefile.txt" + files = [ + File(filepath=fpath, to_replace=True), + File(filepath=fpath), + ] progress = Progress() pbar = progress.add_task("Uploading", total=1)