diff --git a/dandi/cli/cmd_ls.py b/dandi/cli/cmd_ls.py
index 7e45f0fe5..6d185d965 100644
--- a/dandi/cli/cmd_ls.py
+++ b/dandi/cli/cmd_ls.py
@@ -5,6 +5,7 @@
 import click
 
 from .base import devel_option, lgr, map_to_click_exceptions
+from ..consts import ZARR_EXTENSIONS, metadata_all_fields
 from ..dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
 from ..misctypes import Digest
 from ..utils import is_url
@@ -92,7 +93,6 @@ def ls(
         PYOUTFormatter,
         YAMLFormatter,
     )
 
-    from ..consts import metadata_all_fields  # TODO: avoid
     from ..support.pyout import PYOUT_SHORT_NAMES_rev
 
@@ -358,7 +358,20 @@ def fn():
                     digest=Digest.dandi_etag(digest),
                 ).json_dict()
             else:
-                rec = get_metadata(path)
+                if path.endswith(tuple(ZARR_EXTENSIONS)):
+                    if use_fake_digest:
+                        digest = "0" * 32 + "-0--0"
+                    else:
+                        lgr.info("Calculating digest for %s", path)
+                        digest = get_digest(path, digest="zarr-checksum")
+                    rec = get_metadata(path, Digest.dandi_zarr(digest))
+                else:
+                    if use_fake_digest:
+                        digest = "0" * 32 + "-1"
+                    else:
+                        lgr.info("Calculating digest for %s", path)
+                        digest = get_digest(path, digest="dandi-etag")
+                    rec = get_metadata(path, Digest.dandi_etag(digest))
         except Exception as exc:
             _add_exc_error(path, rec, errors, exc)
         if flatten:
diff --git a/dandi/cli/tests/test_ls.py b/dandi/cli/tests/test_cmd_ls.py
similarity index 80%
rename from dandi/cli/tests/test_ls.py
rename to dandi/cli/tests/test_cmd_ls.py
index da08bac92..13dbde12d 100644
--- a/dandi/cli/tests/test_ls.py
+++ b/dandi/cli/tests/test_cmd_ls.py
@@ -49,8 +49,16 @@ def load(s):
         assert metadata[f] == simple1_nwb_metadata[f]
 
 
+def test_ls_nwb_file(simple2_nwb):
+    bids_file_path = "simple2.nwb"
+    bids_file_path = os.path.join(simple2_nwb, bids_file_path)
+    r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
+    assert r.exit_code == 0, r.output
+    data = yaml_load(r.stdout, "safe")
+    assert len(data) == 1
+
+
 @mark.skipif_no_network
-@pytest.mark.xfail(reason="https://github.com/dandi/dandi-cli/issues/1097")
 def test_ls_bids_file(bids_examples):
     bids_file_path = "asl003/sub-Sub1/anat/sub-Sub1_T1w.nii.gz"
     bids_file_path = os.path.join(bids_examples, bids_file_path)
@@ -58,7 +66,20 @@ def test_ls_bids_file(bids_examples):
     assert r.exit_code == 0, r.output
     data = yaml_load(r.stdout, "safe")
     assert len(data) == 1
-    assert data[0]["subject_id"] == "Sub1"
+    assert data[0]["identifier"] == "Sub1"
+
+
+@mark.skipif_no_network
+def test_ls_zarrbids_file(bids_examples):
+    bids_file_path = (
+        "micr_SEMzarr/sub-01/ses-01/micr/sub-01_ses-01_sample-A_SPIM.ome.zarr"
+    )
+    bids_file_path = os.path.join(bids_examples, bids_file_path)
+    r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
+    assert r.exit_code == 0, r.output
+    data = yaml_load(r.stdout, "safe")
+    assert len(data) == 1
+    assert data[0]["identifier"] == "01"
 
 
 @mark.skipif_no_network
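
The cmd_ls.py hunk above makes `ls` pick a digest flavor per local path before requesting metadata: zarr directories get a zarr-checksum, everything else a dandi-etag. A minimal standalone sketch of that branching, assuming ZARR_EXTENSIONS is (".ngff", ".zarr") as in dandi.consts and reusing the hunk's fake-digest shapes:

    # Standalone sketch, not the dandi-cli implementation: the real fn()
    # feeds the chosen digest into get_digest()/get_metadata().
    ZARR_EXTENSIONS = (".ngff", ".zarr")  # assumed value, mirroring dandi.consts


    def pick_fake_digest(path: str) -> tuple[str, str]:
        """Choose the digest flavor (and a fake value) for an asset path."""
        if path.endswith(ZARR_EXTENSIONS):
            # zarr checksums have the shape "<md5>-<file count>--<total size>"
            return "zarr-checksum", "0" * 32 + "-0--0"
        # anything else gets a dandi-etag of the shape "<md5>-<part count>"
        return "dandi-etag", "0" * 32 + "-1"


    assert pick_fake_digest("sub-01_sample-A_SPIM.ome.zarr")[0] == "zarr-checksum"
    assert pick_fake_digest("simple2.nwb")[0] == "dandi-etag"
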
diff --git a/dandi/files/bids.py b/dandi/files/bids.py
index 7806f80a2..d2238c332 100644
--- a/dandi/files/bids.py
+++ b/dandi/files/bids.py
@@ -12,6 +12,7 @@
 
 from .bases import GenericAsset, LocalFileAsset, NWBAsset
 from .zarr import ZarrAsset
+from ..consts import ZARR_MIME_TYPE
 from ..metadata import add_common_metadata, prepare_metadata
 from ..misctypes import Digest
 from ..validate_types import ValidationResult
@@ -45,6 +46,12 @@ class BIDSDatasetDescriptionAsset(LocalFileAsset):
     #: populated by `_validate()`
     _asset_metadata: Optional[dict[str, dict[str, Any]]] = None
 
+    #: Version of BIDS used for the validation;
+    #: populated by `_validate()`.
+    #: In the future this might be removed and the information included in
+    #: the BareAsset via dandischema.
+    _bids_version: Optional[str] = None
+
     #: Threading lock needed in case multiple assets are validated in parallel
     #: during upload
     _lock: Lock = field(init=False, default_factory=Lock, repr=False, compare=False)
@@ -65,6 +72,22 @@ def _validate(self) -> None:
             bids_paths = [str(self.filepath)] + [
                 str(asset.filepath) for asset in self.dataset_files
             ]
+            # This is an ad-hoc fix which should be removed once a
+            # bidsschematools release newer than 0.6.0 is out. It will do
+            # no harm afterwards, but it will no longer serve any purpose.
+            # The issue is that README* is still required, and if we do not
+            # include it explicitly in the listing, validation will
+            # implicitly fail even if the file is present.
+            readme_extensions = ["", ".md", ".rst", ".txt"]
+            for ext in readme_extensions:
+                readme_candidate = self.bids_root / Path("README" + ext)
+                if (
+                    readme_candidate.exists()
+                    and str(readme_candidate) not in bids_paths
+                ):
+                    bids_paths += [str(readme_candidate)]
+            # end of ad-hoc fix.
+
             results = validate_bids(*bids_paths)
             self._dataset_errors: list[ValidationResult] = []
             self._asset_errors: dict[str, list[ValidationResult]] = defaultdict(
@@ -74,7 +97,8 @@
             for result in results:
                 if result.id in BIDS_ASSET_ERRORS:
                     assert result.path
-                    self._asset_errors[str(result.path)].append(result)
+                    bids_path = result.path.relative_to(self.bids_root).as_posix()
+                    self._asset_errors[bids_path].append(result)
                 elif result.id in BIDS_DATASET_ERRORS:
                     self._dataset_errors.append(result)
                 elif result.id == "BIDS.MATCH":
@@ -84,6 +108,7 @@
                     self._asset_metadata[bids_path] = prepare_metadata(
                         result.metadata
                     )
+                    self._bids_version = result.origin.bids_version
 
     def get_asset_errors(self, asset: BIDSAsset) -> list[ValidationResult]:
         """:meta private:"""
@@ -174,6 +199,11 @@ def get_metadata(
         metadata["path"] = self.path
         return BareAsset(**metadata)
 
+    def get_validation_bids_version(self) -> str:
+        self.bids_dataset_description._validate()
+        assert self.bids_dataset_description._bids_version is not None
+        return self.bids_dataset_description._bids_version
+
 
 class NWBBIDSAsset(BIDSAsset, NWBAsset):
     """
@@ -219,6 +249,18 @@ def get_validation_errors(
             self, schema_version, devel_debug
         ) + BIDSAsset.get_validation_errors(self)
 
+    def get_metadata(
+        self,
+        digest: Optional[Digest] = None,
+        ignore_errors: bool = True,
+    ) -> BareAsset:
+        metadata = self.bids_dataset_description.get_asset_metadata(self)
+        start_time = end_time = datetime.now().astimezone()
+        add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
+        metadata["path"] = self.path
+        metadata["encodingFormat"] = ZARR_MIME_TYPE
+        return BareAsset(**metadata)
+
 
 class GenericBIDSAsset(BIDSAsset, GenericAsset):
     """
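
The README workaround in `_validate()` is self-contained enough to show on its own. A sketch under the same assumptions, with `bids_root` and `bids_paths` standing in for the asset attributes used above:

    from pathlib import Path


    def list_readme_explicitly(bids_root: Path, bids_paths: list[str]) -> list[str]:
        """Append an existing README so validate_bids() sees it explicitly."""
        for ext in ("", ".md", ".rst", ".txt"):
            candidate = bids_root / ("README" + ext)
            if candidate.exists() and str(candidate) not in bids_paths:
                bids_paths.append(str(candidate))
        return bids_paths

Once a bidsschematools release newer than 0.6.0 handles README discovery itself, the call site can drop this step without further changes.
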
diff --git a/dandi/metadata.py b/dandi/metadata.py
index cfc93b037..ab4c40e08 100644
--- a/dandi/metadata.py
+++ b/dandi/metadata.py
@@ -28,6 +28,7 @@
 import tenacity
 
 from . import __version__, get_logger
+from .consts import ZARR_EXTENSIONS, metadata_all_fields
 from .dandiset import Dandiset
 from .misctypes import Digest
 from .pynwb_utils import (
@@ -38,14 +39,22 @@
     metadata_cache,
     nwb_has_external_links,
 )
-from .utils import ensure_datetime, get_mime_type, get_utcnow_datetime
+from .utils import (
+    ensure_datetime,
+    find_parent_directory_containing,
+    get_mime_type,
+    get_utcnow_datetime,
+)
 
 lgr = get_logger()
 
 
 # Disable this for clean hacking
 @metadata_cache.memoize_path
-def get_metadata(path: Union[str, Path]) -> Optional[dict]:
+def get_metadata(
+    path: Union[str, Path],
+    digest: Optional[Digest] = None,
+) -> Optional[dict]:
     """
     Get "flatdata" from a .nwb file or a Dandiset directory
 
@@ -59,12 +68,15 @@
     -------
     dict
     """
+
+    from .files import bids, dandi_file, find_bids_dataset_description
+
     # when we run in parallel, these annoying warnings appear
     ignore_benign_pynwb_warnings()
-    path = str(path)  # for Path
+    path = os.path.abspath(str(path))  # for Path
     meta = dict()
 
-    if op.isdir(path):
+    if op.isdir(path) and not path.endswith(tuple(ZARR_EXTENSIONS)):
        try:
             dandiset = Dandiset(path)
             return cast(dict, dandiset.metadata)
@@ -72,51 +84,73 @@
             lgr.debug("Failed to get metadata for %s: %s", path, exc)
             return None
 
-    if nwb_has_external_links(path):
-        raise NotImplementedError(
-            f"NWB files with external links are not supported: {path}"
+    # Is the data BIDS (as defined by the presence of a BIDS dataset descriptor)?
+    bids_dataset_description = find_bids_dataset_description(path)
+    if bids_dataset_description:
+        dandiset_path = find_parent_directory_containing("dandiset.yaml", path)
+        df = dandi_file(
+            Path(path),
+            dandiset_path,
+            bids_dataset_description=bids_dataset_description,
         )
+        path_metadata = df.get_metadata(digest=digest)
+        assert isinstance(df, bids.BIDSAsset)
+        meta["bids_version"] = df.get_validation_bids_version()
+        # there might be a more elegant way to do this:
+        for key in metadata_all_fields:
+            try:
+                value = getattr(path_metadata.wasAttributedTo[0], key)
+            except AttributeError:
+                pass
+            else:
+                meta[key] = value
+    elif path.endswith((".NWB", ".nwb")):
+        if nwb_has_external_links(path):
+            raise NotImplementedError(
+                f"NWB files with external links are not supported: {path}"
+            )
 
-    # First read out possibly available versions of specifications for NWB(:N)
-    meta["nwb_version"] = get_nwb_version(path)
-
-    # PyNWB might fail to load because of missing extensions.
-    # There is a new initiative of establishing registry of such extensions.
-    # Not yet sure if PyNWB is going to provide "native" support for needed
-    # functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
-    # So meanwhile, hard-coded workaround for data types we care about
-    ndtypes_registry = {
-        "AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
-        "ndx-labmetadata-abf": "ndx_dandi_icephys",
-    }
-    tried_imports = set()
-    while True:
-        try:
-            meta.update(_get_pynwb_metadata(path))
-            break
-        except KeyError as exc:  # ATM there is
-            lgr.debug("Failed to read %s: %s", path, exc)
-            res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
-            if not res:
-                raise
-            ndtype = res.groups()[0]
-            if ndtype not in ndtypes_registry:
-                raise ValueError(
-                    "We do not know which extension provides %s. "
-                    "Original exception was: %s. " % (ndtype, exc)
-                )
-            import_mod = ndtypes_registry[ndtype]
-            lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
-            if import_mod in tried_imports:
-                raise RuntimeError(
-                    "We already tried importing %s to provide %s, but it seems it didn't help"
-                    % (import_mod, ndtype)
-                )
-            tried_imports.add(import_mod)
-            __import__(import_mod)
-
-    meta["nd_types"] = get_neurodata_types(path)
-
+        # First read out possibly available versions of specifications for NWB(:N)
+        meta["nwb_version"] = get_nwb_version(path)
+
+        # PyNWB might fail to load because of missing extensions.
+        # There is a new initiative of establishing registry of such extensions.
+        # Not yet sure if PyNWB is going to provide "native" support for needed
+        # functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
+        # So meanwhile, hard-coded workaround for data types we care about
+        ndtypes_registry = {
+            "AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
+            "ndx-labmetadata-abf": "ndx_dandi_icephys",
+        }
+        tried_imports = set()
+        while True:
+            try:
+                meta.update(_get_pynwb_metadata(path))
+                break
+            except KeyError as exc:  # ATM there is
+                lgr.debug("Failed to read %s: %s", path, exc)
+                res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
+                if not res:
+                    raise
+                ndtype = res.groups()[0]
+                if ndtype not in ndtypes_registry:
+                    raise ValueError(
+                        "We do not know which extension provides %s. "
+                        "Original exception was: %s. " % (ndtype, exc)
+                    )
+                import_mod = ndtypes_registry[ndtype]
+                lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
+                if import_mod in tried_imports:
+                    raise RuntimeError(
+                        "We already tried importing %s to provide %s, but it seems it didn't help"
+                        % (import_mod, ndtype)
+                    )
+                tried_imports.add(import_mod)
+                __import__(import_mod)
+
+        meta["nd_types"] = get_neurodata_types(path)
+    else:
+        raise RuntimeError("Unable to get metadata from non-BIDS, non-NWB asset.")
     return meta
 
 
@@ -949,6 +983,12 @@
             "mtime %s of %s is in the future", metadata["blobDateModified"], path
         )
     metadata["contentSize"] = os.path.getsize(path)
+    if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum:
+        m = re.fullmatch(
+            r"(?P<md5>[0-9a-f]{32})-(?P<count>[0-9]+)--(?P<size>[0-9]+)", digest.value
+        )
+        if m:
+            metadata["contentSize"] = int(m["size"])
     metadata.setdefault("wasGeneratedBy", []).append(
         get_generator(start_time, end_time)
     )
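
The `add_common_metadata()` hunk relies on a dandi-zarr-checksum value encoding its own byte count, so `contentSize` for a zarr directory can be read out of the digest rather than from `os.path.getsize()`, which would only return the size of the directory entry. A worked example with a hypothetical digest value:

    import re

    # hypothetical digest for a zarr holding 3 files totalling 12345 bytes
    digest_value = "0" * 32 + "-3--12345"
    m = re.fullmatch(
        r"(?P<md5>[0-9a-f]{32})-(?P<count>[0-9]+)--(?P<size>[0-9]+)", digest_value
    )
    assert m is not None
    assert int(m["size"]) == 12345  # what lands in metadata["contentSize"]
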
" % (ndtype, exc) - ) - import_mod = ndtypes_registry[ndtype] - lgr.debug("Importing %r which should provide %r", import_mod, ndtype) - if import_mod in tried_imports: - raise RuntimeError( - "We already tried importing %s to provide %s, but it seems it didn't help" - % (import_mod, ndtype) - ) - tried_imports.add(import_mod) - __import__(import_mod) - - meta["nd_types"] = get_neurodata_types(path) - + # First read out possibly available versions of specifications for NWB(:N) + meta["nwb_version"] = get_nwb_version(path) + + # PyNWB might fail to load because of missing extensions. + # There is a new initiative of establishing registry of such extensions. + # Not yet sure if PyNWB is going to provide "native" support for needed + # functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143 + # So meanwhile, hard-coded workaround for data types we care about + ndtypes_registry = { + "AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb", + "ndx-labmetadata-abf": "ndx_dandi_icephys", + } + tried_imports = set() + while True: + try: + meta.update(_get_pynwb_metadata(path)) + break + except KeyError as exc: # ATM there is + lgr.debug("Failed to read %s: %s", path, exc) + res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc)) + if not res: + raise + ndtype = res.groups()[0] + if ndtype not in ndtypes_registry: + raise ValueError( + "We do not know which extension provides %s. " + "Original exception was: %s. " % (ndtype, exc) + ) + import_mod = ndtypes_registry[ndtype] + lgr.debug("Importing %r which should provide %r", import_mod, ndtype) + if import_mod in tried_imports: + raise RuntimeError( + "We already tried importing %s to provide %s, but it seems it didn't help" + % (import_mod, ndtype) + ) + tried_imports.add(import_mod) + __import__(import_mod) + + meta["nd_types"] = get_neurodata_types(path) + else: + raise RuntimeError("Unable to get metadata from non-BIDS, non-NWB asset.") return meta @@ -949,6 +984,12 @@ def add_common_metadata( "mtime %s of %s is in the future", metadata["blobDateModified"], path ) metadata["contentSize"] = os.path.getsize(path) + if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum: + m = re.fullmatch( + r"(?P[0-9a-f]{32})-(?P[0-9]+)--(?P[0-9]+)", digest.value + ) + if m: + metadata["contentSize"] = int(m["size"]) metadata.setdefault("wasGeneratedBy", []).append( get_generator(start_time, end_time) ) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index d8af7871d..c5d5de590 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -220,7 +220,7 @@ def test_upload_bids(mocker: MockerFixture, bids_dandiset: SampleDandiset) -> No # Check existence of assets: dandiset = bids_dandiset.dandiset # file we created? - dandiset.get_asset_by_path("CHANGES") + dandiset.get_asset_by_path("README") # BIDS descriptor file? dandiset.get_asset_by_path("dataset_description.json") # actual data file? diff --git a/dandi/validate.py b/dandi/validate.py index 85b155a1c..53628d1a2 100644 --- a/dandi/validate.py +++ b/dandi/validate.py @@ -68,6 +68,7 @@ def validate_bids( origin = ValidationOrigin( name="bidsschematools", version=bidsschematools.__version__, + bids_version=validation_result["bids_version"], ) # Storing variable to not re-compute set paths for each individual file. 
diff --git a/dandi/validate_types.py b/dandi/validate_types.py
index dfd920e11..ade146f38 100644
--- a/dandi/validate_types.py
+++ b/dandi/validate_types.py
@@ -8,6 +8,7 @@
 class ValidationOrigin:
     name: str
     version: str
+    bids_version: Optional[str] = None
 
 
 class Severity(Enum):
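
Taken together, these changes let `dandi ls` emit BIDS-derived metadata for zarr assets end to end. A usage sketch mirroring `test_ls_zarrbids_file`, assuming this branch is installed and the path points into a local checkout of bids-examples (note that a real zarr checksum is computed for the directory):

    from click.testing import CliRunner

    from dandi.cli.cmd_ls import ls

    r = CliRunner().invoke(
        ls,
        [
            "-f",
            "yaml",
            "micr_SEMzarr/sub-01/ses-01/micr/sub-01_ses-01_sample-A_SPIM.ome.zarr",
        ],
    )
    print(r.output)  # one YAML record whose identifier is "01"
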