30 changes: 30 additions & 0 deletions .github/workflows/build_cli.yml
@@ -0,0 +1,30 @@
name: Build CLI

on: [push]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
max-parallel: 1
matrix:
os: ['windows-latest', 'ubuntu-latest', 'macos-latest']

steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.10"
- run: pip install .
- run: pip install pyinstaller
- run: pyinstaller -F ./dvuploader/cli.py -n dvuploader-${{ matrix.os }} --distpath ./bin
- name: Push
run: |
git config --global user.name "Build Bot"
git config --global user.email "build.bot@bot.com"

git pull
git add ./bin/\*
git commit -a -m "🤖 Built DVUploader for ${{ matrix.os }}"
git push
54 changes: 54 additions & 0 deletions README.md
@@ -36,6 +36,8 @@ python3 -m pip install .

## Quickstart

### Programmatic usage

To perform a direct upload, you need a running Dataverse instance and a configured cloud storage provider. The following example shows how to upload files to a Dataverse instance: simply provide the files of interest and call the `upload` method of a `DVUploader` instance.

```python
@@ -59,3 +61,55 @@ dvuploader.upload(
)
```

### Command Line Interface

DVUploader ships with a CLI that can be used outside of scripts. To upload files to a Dataverse instance, simply provide the files of interest, the persistent identifier of the dataset, and your API credentials.

#### Using arguments

```bash
dvuploader my_file.txt my_other_file.txt \
--pid doi:10.70122/XXX/XXXXX \
--api-token XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX \
  --dataverse-url https://demo.dataverse.org/
```

#### Using a config file

Alternatively, you can supply a `config` file that contains all the necessary information for the uploader. The `config` file is a JSON/YAML file with the following keys:

* `persistent_id`: Persistent identifier of the dataset to upload to.
* `dataverse_url`: URL of the Dataverse instance.
* `api_token`: API token used to authenticate against the Dataverse instance.
* `files`: List of files to upload. Each file is a dictionary with the following keys:
* `filepath`: Path to the file to upload.
* `directoryLabel`: Optional directory label to upload the file to.
* `description`: Optional description of the file.
* `mimetype`: Mimetype of the file.
* `categories`: Optional list of categories to assign to the file.
* `restrict`: Boolean to indicate that this is a restricted file. Defaults to False.

In the following example, we upload three files to a Dataverse instance. The first file is uploaded to the root directory of the dataset, while the other two files are uploaded to the directory `some/dir`.

```yaml
# config.yml
persistent_id: doi:10.70122/XXX/XXXXX
dataverse_url: https://demo.dataverse.org/
api_token: XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
files:
- filepath: ./small.txt
- filepath: ./medium.txt
directoryLabel: some/dir
- filepath: ./big.txt
directoryLabel: some/dir
```
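Because YAML is a superset of JSON, the same settings can also be written as a JSON config file, which the uploader parses with the same loader. A minimal sketch of an equivalent `config.json` (all values are placeholders):

```json
{
  "persistent_id": "doi:10.70122/XXX/XXXXX",
  "dataverse_url": "https://demo.dataverse.org/",
  "api_token": "XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX",
  "files": [
    { "filepath": "./small.txt" },
    { "filepath": "./medium.txt", "directoryLabel": "some/dir" },
    { "filepath": "./big.txt", "directoryLabel": "some/dir" }
  ]
}
```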

The `config` file can then be used as follows:

```bash
dvuploader --config-path config.yml
```
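The CLI also lets you control the number of parallel upload jobs (default `-1`, i.e. use all available workers). With Typer's default option naming this parameter should surface as `--n-jobs`; treat the exact flag name as an assumption based on the CLI code. A call might then look like:

```bash
dvuploader --config-path config.yml --n-jobs 4
```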

#### CLI Binaries

DVUploader also ships with pre-built binaries for Linux, macOS, and Windows. You can download them from the [`bin`](./bin) directory and use them in the same way as described above, for example as shown below.
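As a sketch, the Linux binary could be invoked like this (assuming it has been downloaded and marked as executable; identifiers and tokens are placeholders):

```bash
chmod +x ./dvuploader-ubuntu-latest
./dvuploader-ubuntu-latest my_file.txt \
  --pid doi:10.70122/XXX/XXXXX \
  --api-token XXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX \
  --dataverse-url https://demo.dataverse.org/
```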
Binary file added bin/dvuploader-macos-latest
Binary file not shown.
Binary file added bin/dvuploader-ubuntu-latest
Binary file not shown.
Binary file added bin/dvuploader-windows-latest.exe
Binary file not shown.
121 changes: 121 additions & 0 deletions dvuploader/cli.py
@@ -0,0 +1,121 @@
import yaml
import typer

from pydantic import BaseModel
from typing import List, Tuple
from dvuploader import DVUploader, File


class CliInput(BaseModel):
api_token: str
dataverse_url: str
persistent_id: str
files: List[File]


app = typer.Typer()


def _parse_yaml_config(path: str) -> CliInput:
"""
    Parses a JSON/YAML configuration file and returns a CliInput instance
    containing a list of File objects, a persistent ID, a Dataverse URL,
    and an API token.

Args:
path (str): Path to a JSON/YAML file containing specifications for the files to upload.

Returns:
CliInput: Class instance containing a list of File objects, a persistent ID,
a Dataverse URL, and an API token.

Raises:
ValueError: If the configuration file is invalid.
"""
    with open(path) as handle:
        return CliInput(**yaml.safe_load(handle))


def _validate_inputs(
filepaths: List[str],
pid: str,
dataverse_url: str,
api_token: str,
config_path: str,
) -> None:
    if config_path and filepaths:
raise typer.BadParameter(
"Cannot specify both a JSON/YAML file and a list of filepaths."
)

_has_meta_params = all(arg is not None for arg in [pid, dataverse_url, api_token])
_has_config_file = config_path is not None

if _has_meta_params and _has_config_file:
print(
"\n⚠️ Warning\n"
"├── You have specified both a configuration file and metadata parameters via the command line.\n"
"╰── Will use metadata parameters specified in the config file."
)
elif not _has_meta_params and not _has_config_file:
raise typer.BadParameter(
"You must specify either a JSON/YAML file or metadata parameters (dv_url, api_token, pid, files) via the command line."
)


@app.command()
def main(
filepaths: List[str] = typer.Argument(
default=None,
help="A list of filepaths to upload.",
),
pid: str = typer.Option(
default=None,
help="The persistent identifier of the Dataverse dataset.",
),
api_token: str = typer.Option(
default=None,
help="The API token for the Dataverse repository.",
),
dataverse_url: str = typer.Option(
default=None,
help="The URL of the Dataverse repository.",
),
config_path: str = typer.Option(
default=None,
help="Path to a JSON/YAML file containing specifications for the files to upload. Defaults to None.",
),
n_jobs: int = typer.Option(
default=-1,
help="The number of parallel jobs to run. Defaults to -1.",
),
):
_validate_inputs(
filepaths=filepaths,
pid=pid,
dataverse_url=dataverse_url,
api_token=api_token,
config_path=config_path,
)

if config_path:
        # YAML is a superset of JSON, so yaml.safe_load can parse both formats
cli_input = _parse_yaml_config(config_path)
else:
cli_input = CliInput(
api_token=api_token,
dataverse_url=dataverse_url,
persistent_id=pid,
files=[File(filepath=filepath) for filepath in filepaths],
)

uploader = DVUploader(files=cli_input.files)
uploader.upload(
persistent_id=cli_input.persistent_id,
dataverse_url=cli_input.dataverse_url,
api_token=cli_input.api_token,
n_jobs=n_jobs,
)


if __name__ == "__main__":
typer.run(main)
18 changes: 9 additions & 9 deletions dvuploader/dvuploader.py
Expand Up @@ -5,7 +5,7 @@
from typing import Dict, List
from urllib.parse import urljoin

from pydantic import BaseModel
from pydantic import BaseModel, validator
from joblib import Parallel, delayed
from dotted_dict import DottedDict

@@ -43,7 +43,7 @@ def upload(
dataverse_url (str): The URL of the Dataverse repository.
api_token (str): The API token for the Dataverse repository.
n_jobs (int): The number of parallel jobs to run. Defaults to -1.

Returns:
None
"""
@@ -61,7 +61,7 @@
)

if not self.files:
print("\n❌ No files to upload")
print("\n❌ No files to upload\n")
return

# Upload files in parallel
@@ -78,7 +78,7 @@
for position, file in enumerate(files)
)

print("🎉 Done!")
print("🎉 Done!\n")

def _check_duplicates(
self,
Expand All @@ -104,9 +104,9 @@ def _check_duplicates(
)

print("\n🔎 Checking dataset files")

to_remove = []

for file in self.files:
if any(map(lambda dsFile: self._check_hashes(file, dsFile), ds_files)):
print(
@@ -115,12 +115,12 @@
to_remove.append(file)
else:
print(f"├── File '{file.fileName}' is new - Uploading.")

for file in to_remove:
self.files.remove(file)

print("🎉 Done")

@staticmethod
def _check_hashes(file: File, dsFile: Dict):
"""
@@ -133,7 +133,7 @@ def _check_hashes(file: File, dsFile: Dict):
Returns:
bool: True if the files have the same checksum, False otherwise.
"""

hash_algo, hash_value = tuple(dsFile.dataFile.checksum.values())

return file.checksum.value == hash_value and file.checksum.type == hash_algo
15 changes: 14 additions & 1 deletion dvuploader/file.py
@@ -1,7 +1,7 @@
import os
from typing import List, Optional

from pydantic import BaseModel, Field, validator
from pydantic import BaseModel, Field, validator, ValidationError

from dvuploader.checksum import Checksum, ChecksumTypes

@@ -21,12 +21,25 @@ class File(BaseModel):
fileName: Optional[str] = None
checksum: Optional[Checksum] = None

@staticmethod
def _validate_filepath(path):
if not os.path.exists(path):
raise FileNotFoundError(f"Filepath {path} does not exist.")
elif not os.path.isfile(path):
raise TypeError(f"Filepath {path} is not a file.")
elif not os.access(path, os.R_OK):
raise TypeError(f"Filepath {path} is not readable.")
elif os.path.getsize(path) == 0:
raise ValueError(f"Filepath {path} is empty.")
return path

@validator("fileName", always=True)
def _extract_filename(cls, v, values):
return os.path.basename(values["filepath"])

@validator("checksum", always=True)
def _calculate_hash(cls, v, values):
cls._validate_filepath(values["filepath"])
fpath = values["filepath"]
hash_algo, hash_fun = values["checksum_type"].value
