68 changes: 43 additions & 25 deletions dvuploader/checksum.py
@@ -1,12 +1,12 @@
import hashlib
from enum import Enum
from typing import IO, Callable
from pydantic.fields import PrivateAttr
from typing_extensions import Optional

from pydantic import BaseModel, ConfigDict, Field




class ChecksumTypes(Enum):
"""Enum class representing different types of checksums.

@@ -24,11 +24,15 @@ class ChecksumTypes(Enum):


class Checksum(BaseModel):
"""Checksum class represents a checksum object with type and value fields.
"""Class for calculating and storing file checksums.

This class handles checksum calculation and storage for files being uploaded to Dataverse.
It supports multiple hash algorithms through the ChecksumTypes enum.

Attributes:
type (str): The type of the checksum.
value (str): The value of the checksum.
type (str): The type of checksum algorithm being used (e.g. "SHA-1", "MD5")
value (Optional[str]): The calculated checksum value, or None if not yet calculated
_hash_fun (PrivateAttr): Internal hash function instance used for calculation
"""

model_config = ConfigDict(
@@ -37,44 +41,58 @@ class Checksum(BaseModel):
)

type: str = Field(..., alias="@type")
value: str = Field(..., alias="@value")
value: Optional[str] = Field(None, alias="@value")
_hash_fun = PrivateAttr(default=None)

@classmethod
def from_file(
def from_algo(
cls,
handler: IO,
hash_fun: Callable,
hash_algo: str,
) -> "Checksum":
"""Takes a file path and returns a checksum object.
"""Creates a new Checksum instance configured for a specific hash algorithm.

Args:
handler (IO): The file handler to generate the checksum for.
hash_fun (Callable): The hash function to use for generating the checksum.
hash_algo (str): The hash algorithm to use for generating the checksum.
hash_fun (Callable): Hash function constructor (e.g. hashlib.sha1)
hash_algo (str): Name of the hash algorithm (e.g. "SHA-1")

Returns:
Checksum: A Checksum object with type and value fields.
Checksum: A new Checksum instance ready for calculating checksums
"""

value = cls._chunk_checksum(handler=handler, hash_fun=hash_fun)
return cls(type=hash_algo, value=value) # type: ignore
cls = cls(type=hash_algo, value=None) # type: ignore
cls._hash_fun = hash_fun()

return cls

def apply_checksum(self):
"""Finalizes and stores the calculated checksum value.

This should be called after all data has been processed through the hash function.
The resulting checksum is stored in the value attribute.

Raises:
AssertionError: If the hash function has not been initialized
"""

assert self._hash_fun is not None, "Checksum hash function is not set."

self.value = self._hash_fun.hexdigest()

@staticmethod
def _chunk_checksum(
handler: IO,
hash_fun: Callable,
blocksize=2**20
) -> str:
"""Chunks a file and returns a checksum.
def _chunk_checksum(handler: IO, hash_fun: Callable, blocksize=2**20) -> str:
"""Calculates a file's checksum by processing it in chunks.

Args:
fpath (str): The file path to generate the checksum for.
hash_fun (Callable): The hash function to use for generating the checksum.
blocksize (int): The block size to use for reading the file.
handler (IO): File-like object to read data from
hash_fun (Callable): Hash function constructor to use
blocksize (int, optional): Size of chunks to read. Defaults to 1MB (2**20)

Returns:
str: A string representing the checksum of the file.
str: Hexadecimal string representation of the calculated checksum

Note:
This method resets the file position to the start after reading
"""
m = hash_fun()
while True:
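Taken together, the refactor turns checksum calculation into a two-step streaming flow: `from_algo` prepares a hash object up front, and `apply_checksum` finalizes it after all bytes have passed through. A minimal usage sketch, assuming the caller feeds chunks to the hash object while reading the file (that call site is not shown in this diff, and `data.bin` is a stand-in path):

```python
import hashlib

from dvuploader.checksum import Checksum

# Prepare a checksum configured for SHA-1, per the new from_algo API.
checksum = Checksum.from_algo(hash_fun=hashlib.sha1, hash_algo="SHA-1")

# Assumed chunk-feeding step: the uploader presumably updates the private
# hash object as it streams the file; this call site is not in the diff.
with open("data.bin", "rb") as handle:
    while chunk := handle.read(2**20):
        checksum._hash_fun.update(chunk)

# Finalize: stores the hex digest in checksum.value.
checksum.apply_checksum()
print(checksum.value)
```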
79 changes: 52 additions & 27 deletions dvuploader/cli.py
@@ -7,6 +7,17 @@


class CliInput(BaseModel):
"""
Model for CLI input parameters.

Attributes:
api_token (str): API token for authentication with Dataverse
dataverse_url (str): URL of the Dataverse instance
persistent_id (str): Persistent identifier of the dataset
files (List[File]): List of files to upload
n_jobs (int): Number of parallel upload jobs to run (default: 1)
"""

api_token: str
dataverse_url: str
persistent_id: str
@@ -19,19 +30,17 @@ class CliInput(BaseModel):

def _parse_yaml_config(path: str) -> CliInput:
"""
Parses a configuration file and returns a Class instance
containing a list of File objects, a persistent ID, a Dataverse URL,
and an API token.
Parse a YAML/JSON configuration file into a CliInput object.

Args:
path (str): Path to a JSON/YAML file containing specifications for the files to upload.
path (str): Path to a YAML/JSON configuration file containing upload specifications

Returns:
CliInput: Class instance containing a list of File objects, a persistent ID,
a Dataverse URL, and an API token.
CliInput: Object containing upload configuration parameters

Raises:
ValueError: If the configuration file is invalid.
yaml.YAMLError: If the YAML/JSON file is malformed
ValidationError: If the configuration data does not match the CliInput model
"""
return CliInput(**yaml.safe_load(open(path))) # type: ignore
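For reference, a config file accepted by `_parse_yaml_config` might look like the sketch below. The top-level keys follow the `CliInput` model documented above; the per-file keys are an assumption, since the `File` model is not part of this diff.

```yaml
# Hypothetical upload_config.yaml. Top-level keys match CliInput;
# per-file keys are assumed from the File model (not shown in this diff).
api_token: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
dataverse_url: https://demo.dataverse.org
persistent_id: doi:10.5072/FK2/123456
n_jobs: 2
files:
  - filepath: ./data/file1.txt
  - filepath: ./data/file2.txt
```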

@@ -44,18 +53,20 @@ def _validate_inputs(
config_path: Optional[str],
) -> None:
"""
Validates the inputs for the dvuploader command.
Validate CLI input parameters.

Checks for valid combinations of configuration file and command line parameters.

Args:
filepaths (List[str]): List of filepaths to be uploaded.
pid (str): Persistent identifier of the dataset.
dataverse_url (str): URL of the Dataverse instance.
api_token (str): API token for authentication.
config_path (Optional[str]): Path to the configuration file.
filepaths (List[str]): List of files to upload
pid (str): Persistent identifier of the dataset
dataverse_url (str): URL of the Dataverse instance
api_token (str): API token for authentication
config_path (Optional[str]): Path to configuration file

Raises:
typer.BadParameter: If both a configuration file and a list of filepaths are specified.
typer.BadParameter: If neither a configuration file nor metadata parameters are specified.
typer.BadParameter: If both config file and filepaths are specified
typer.BadParameter: If neither config file nor required parameters are provided
"""
if config_path is not None and len(filepaths) > 0:
raise typer.BadParameter(
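(The rest of this hunk is collapsed in the view above.) Based on the docstring's two `typer.BadParameter` conditions, the full validation plausibly looks like this sketch; the exact error messages are assumptions:

```python
# Sketch of the complete check, assuming the collapsed branch mirrors the
# docstring; error messages are placeholders, not the PR's actual text.
from typing import List, Optional

import typer


def _validate_inputs(
    filepaths: List[str],
    pid: str,
    dataverse_url: str,
    api_token: str,
    config_path: Optional[str],
) -> None:
    # Reject the ambiguous case: both a config file and explicit filepaths.
    if config_path is not None and len(filepaths) > 0:
        raise typer.BadParameter(
            "Specify either a config file or filepaths, not both."
        )
    # Reject the empty case: no config file and missing required parameters.
    if config_path is None and not all([filepaths, pid, dataverse_url, api_token]):
        raise typer.BadParameter(
            "Specify either a config file or all required parameters."
        )
```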
@@ -97,25 +108,39 @@ def main(
),
config_path: Optional[str] = typer.Option(
default=None,
help="Path to a JSON/YAML file containing specifications for the files to upload. Defaults to None.",
help="Path to a JSON/YAML file containing specifications for the files to upload.",
),
n_jobs: int = typer.Option(
default=1,
help="The number of parallel jobs to run. Defaults to -1.",
help="Number of parallel upload jobs to run.",
),
):
"""
Uploads files to a Dataverse repository.

Args:
filepaths (List[str]): A list of filepaths to upload.
pid (str): The persistent identifier of the Dataverse dataset.
api_token (str): The API token for the Dataverse repository.
dataverse_url (str): The URL of the Dataverse repository.
config_path (Optional[str]): Path to a JSON/YAML file containing specifications for the files to upload. Defaults to None.
n_jobs (int): The number of parallel jobs to run. Defaults to -1.
Upload files to a Dataverse repository.

Files can be specified either directly via command line arguments or through a
configuration file. The configuration file can be either YAML or JSON format.

If using command line arguments, you must specify:
- One or more filepaths to upload
- The dataset's persistent identifier
- A valid API token
- The Dataverse repository URL

If using a configuration file, it should contain:
- api_token: API token for authentication
- dataverse_url: URL of the Dataverse instance
- persistent_id: Dataset persistent identifier
- files: List of file specifications
- n_jobs: (optional) Number of parallel upload jobs

Examples:
Upload files via command line:
$ dvuploader file1.txt file2.txt --pid doi:10.5072/FK2/123456 --api-token abc123 --dataverse-url https://demo.dataverse.org

Upload files via config file:
$ dvuploader --config-path upload_config.yaml
"""

_validate_inputs(
filepaths=filepaths,
pid=pid,