30 changes: 30 additions & 0 deletions .github/workflows/build_cli.yml
@@ -0,0 +1,30 @@
name: Build CLI

on: [push]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
max-parallel: 1
matrix:
os: ['windows-latest', 'ubuntu-latest', 'macos-latest']

steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.10"
- run: pip install .
- run: pip install pyinstaller
- run: pyinstaller -F ./dvuploader/cli.py -n dvuploader-${{ matrix.os }} --distpath ./bin
- name: Push
run: |
git config --global user.name "Build Bot"
git config --global user.email "build.bot@bot.com"

git pull
git add ./bin/\*
git commit -a -m "🤖 Built DVUploader for ${{ matrix.os }}"
git push
54 changes: 54 additions & 0 deletions README.md
@@ -36,6 +36,8 @@ python3 -m pip install .

## Quickstart

### Programmatic usage

To perform a direct upload, you need a running Dataverse instance and a configured cloud storage provider. The following example shows how to upload files to a Dataverse instance: simply provide the files of interest and call the `upload` method of a `DVUploader` instance.

```python
@@ -59,3 +61,55 @@ dvuploader.upload(
)
```

### Command Line Interface

DVUploader ships with a CLI that can be used outside of scripts. To upload files to a Dataverse instance, simply provide the files of interest, the persistent identifier of the dataset, and your API credentials.

#### Using arguments

```bash
dvuploader my_file.txt my_other_file.txt \
--pid doi:10.70122/XXX/XXXXX \
--api-token XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX \
  --dataverse-url https://demo.dataverse.org/
```

#### Using a config file

Alternatively, you can supply a `config` file that contains all the necessary information for the uploader. The `config` file is a JSON/YAML file with the following keys:

* `persistent_id`: Persistent identifier of the dataset to upload to.
* `dataverse_url`: URL of the Dataverse instance.
* `api_token`: API token used to authenticate against the Dataverse instance.
* `files`: List of files to upload. Each file is a dictionary with the following keys:
* `filepath`: Path to the file to upload.
* `directoryLabel`: Optional directory label to upload the file to.
* `description`: Optional description of the file.
* `mimetype`: Mimetype of the file.
* `categories`: Optional list of categories to assign to the file.
* `restrict`: Boolean to indicate that this is a restricted file. Defaults to False.

In the following example, we upload three files to a Dataverse instance. The first file is uploaded to the root directory of the dataset, while the other two files are uploaded to the directory `some/dir`.

```yaml
# config.yml
persistent_id: doi:10.70122/XXX/XXXXX
dataverse_url: https://demo.dataverse.org/
api_token: XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
files:
- filepath: ./small.txt
- filepath: ./medium.txt
directoryLabel: some/dir
- filepath: ./big.txt
directoryLabel: some/dir
```
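Because YAML is a superset of JSON, the same settings can also be written as a JSON config file, which the uploader parses with the same loader. A minimal sketch of an equivalent `config.json` (all values are placeholders):

```json
{
  "persistent_id": "doi:10.70122/XXX/XXXXX",
  "dataverse_url": "https://demo.dataverse.org/",
  "api_token": "XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX",
  "files": [
    { "filepath": "./small.txt" },
    { "filepath": "./medium.txt", "directoryLabel": "some/dir" },
    { "filepath": "./big.txt", "directoryLabel": "some/dir" }
  ]
}
```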

The `config` file can then be used as follows:

```bash
dvuploader --config-path config.yml
```
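The CLI also lets you control the number of parallel upload jobs (default `-1`, i.e. use all available workers). With Typer's default option naming this parameter should surface as `--n-jobs`; treat the exact flag name as an assumption based on the CLI code. A call might then look like:

```bash
dvuploader --config-path config.yml --n-jobs 4
```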

#### CLI Binaries

DVUploader also ships with pre-built binaries for Linux, macOS, and Windows. You can download them from the [`bin`](./bin) directory and use them in the same way as described above, for example as shown below.
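As a sketch, the Linux binary could be invoked like this (assuming it has been downloaded and marked as executable; identifiers and tokens are placeholders):

```bash
chmod +x ./dvuploader-ubuntu-latest
./dvuploader-ubuntu-latest my_file.txt \
  --pid doi:10.70122/XXX/XXXXX \
  --api-token XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX \
  --dataverse-url https://demo.dataverse.org/
```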
Binary file added bin/dvuploader-macos-latest
Binary file not shown.
Binary file added bin/dvuploader-ubuntu-latest
Binary file not shown.
Binary file added bin/dvuploader-windows-latest.exe
Binary file not shown.
121 changes: 121 additions & 0 deletions dvuploader/cli.py
@@ -0,0 +1,121 @@
import yaml
import typer

from pydantic import BaseModel
from typing import List, Tuple
from dvuploader import DVUploader, File


class CliInput(BaseModel):
api_token: str
dataverse_url: str
persistent_id: str
files: List[File]


app = typer.Typer()


def _parse_yaml_config(path: str) -> CliInput:
"""
    Parses a JSON/YAML configuration file and returns a CliInput instance
    containing a list of File objects, a persistent ID, a Dataverse URL,
    and an API token.

Args:
path (str): Path to a JSON/YAML file containing specifications for the files to upload.

Returns:
CliInput: Class instance containing a list of File objects, a persistent ID,
a Dataverse URL, and an API token.

Raises:
ValueError: If the configuration file is invalid.
"""
    with open(path) as handle:
        return CliInput(**yaml.safe_load(handle))


def _validate_inputs(
filepaths: List[str],
pid: str,
dataverse_url: str,
api_token: str,
config_path: str,
) -> None:
    if config_path and filepaths:
raise typer.BadParameter(
"Cannot specify both a JSON/YAML file and a list of filepaths."
)

_has_meta_params = all(arg is not None for arg in [pid, dataverse_url, api_token])
_has_config_file = config_path is not None

if _has_meta_params and _has_config_file:
print(
"\n⚠️ Warning\n"
"├── You have specified both a configuration file and metadata parameters via the command line.\n"
"╰── Will use metadata parameters specified in the config file."
)
elif not _has_meta_params and not _has_config_file:
raise typer.BadParameter(
"You must specify either a JSON/YAML file or metadata parameters (dv_url, api_token, pid, files) via the command line."
)


@app.command()
def main(
filepaths: List[str] = typer.Argument(
default=None,
help="A list of filepaths to upload.",
),
pid: str = typer.Option(
default=None,
help="The persistent identifier of the Dataverse dataset.",
),
api_token: str = typer.Option(
default=None,
help="The API token for the Dataverse repository.",
),
dataverse_url: str = typer.Option(
default=None,
help="The URL of the Dataverse repository.",
),
config_path: str = typer.Option(
default=None,
help="Path to a JSON/YAML file containing specifications for the files to upload. Defaults to None.",
),
n_jobs: int = typer.Option(
default=-1,
help="The number of parallel jobs to run. Defaults to -1.",
),
):
_validate_inputs(
filepaths=filepaths,
pid=pid,
dataverse_url=dataverse_url,
api_token=api_token,
config_path=config_path,
)

if config_path:
        # YAML is a superset of JSON, so yaml.safe_load can parse both formats
cli_input = _parse_yaml_config(config_path)
else:
cli_input = CliInput(
api_token=api_token,
dataverse_url=dataverse_url,
persistent_id=pid,
files=[File(filepath=filepath) for filepath in filepaths],
)

uploader = DVUploader(files=cli_input.files)
uploader.upload(
persistent_id=cli_input.persistent_id,
dataverse_url=cli_input.dataverse_url,
api_token=cli_input.api_token,
n_jobs=n_jobs,
)


if __name__ == "__main__":
typer.run(main)
18 changes: 9 additions & 9 deletions dvuploader/dvuploader.py
Expand Up @@ -5,7 +5,7 @@
from typing import Dict, List
from urllib.parse import urljoin

from pydantic import BaseModel
from pydantic import BaseModel, validator
from joblib import Parallel, delayed
from dotted_dict import DottedDict

@@ -43,7 +43,7 @@ def upload(
dataverse_url (str): The URL of the Dataverse repository.
api_token (str): The API token for the Dataverse repository.
n_jobs (int): The number of parallel jobs to run. Defaults to -1.

Returns:
None
"""
@@ -61,7 +61,7 @@
)

if not self.files:
print("\n❌ No files to upload")
print("\n❌ No files to upload\n")
return

# Upload files in parallel
@@ -78,7 +78,7 @@
for position, file in enumerate(files)
)

print("🎉 Done!")
print("🎉 Done!\n")

def _check_duplicates(
self,
Expand All @@ -104,9 +104,9 @@ def _check_duplicates(
)

print("\n🔎 Checking dataset files")

to_remove = []

for file in self.files:
if any(map(lambda dsFile: self._check_hashes(file, dsFile), ds_files)):
print(
@@ -115,12 +115,12 @@
to_remove.append(file)
else:
print(f"├── File '{file.fileName}' is new - Uploading.")

for file in to_remove:
self.files.remove(file)

print("🎉 Done")

@staticmethod
def _check_hashes(file: File, dsFile: Dict):
"""
@@ -133,7 +133,7 @@ def _check_hashes(file: File, dsFile: Dict):
Returns:
bool: True if the files have the same checksum, False otherwise.
"""

hash_algo, hash_value = tuple(dsFile.dataFile.checksum.values())

return file.checksum.value == hash_value and file.checksum.type == hash_algo
15 changes: 14 additions & 1 deletion dvuploader/file.py
@@ -1,7 +1,7 @@
import os
from typing import List, Optional

from pydantic import BaseModel, Field, validator
from pydantic import BaseModel, Field, validator, ValidationError

from dvuploader.checksum import Checksum, ChecksumTypes

@@ -21,12 +21,25 @@ class File(BaseModel):
fileName: Optional[str] = None
checksum: Optional[Checksum] = None

@staticmethod
def _validate_filepath(path):
if not os.path.exists(path):
raise FileNotFoundError(f"Filepath {path} does not exist.")
elif not os.path.isfile(path):
raise TypeError(f"Filepath {path} is not a file.")
elif not os.access(path, os.R_OK):
raise TypeError(f"Filepath {path} is not readable.")
elif os.path.getsize(path) == 0:
raise ValueError(f"Filepath {path} is empty.")
return path

@validator("fileName", always=True)
def _extract_filename(cls, v, values):
return os.path.basename(values["filepath"])

@validator("checksum", always=True)
def _calculate_hash(cls, v, values):
cls._validate_filepath(values["filepath"])
fpath = values["filepath"]
hash_algo, hash_fun = values["checksum_type"].value
