From 418df85c0c2dfbae27d77a4a2f1f1e9fe7bc81f0 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 28 Jan 2025 12:45:41 -0600 Subject: [PATCH] First pass at a lighter weight bids validator --- hed/errors/exceptions.py | 2 + hed/tools/bids/bids_dataset.py | 4 +- hed/tools/validation/bids_file.py | 103 +++++++++++++ hed/tools/validation/bids_validator.py | 200 +++++++++++++++++++++++++ hed/validator/data/reservedTags.json | 21 --- hed/validator/reserved_checker.py | 7 - pyproject.toml | 2 +- tests/tools/bids/test_bids_dataset.py | 11 +- 8 files changed, 318 insertions(+), 32 deletions(-) create mode 100644 hed/tools/validation/bids_file.py create mode 100644 hed/tools/validation/bids_validator.py diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index 000e910d0..110fc4c2f 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -15,6 +15,7 @@ class HedExceptions: INVALID_DATAFRAME = 'INVALID_DATAFRAME' INVALID_FILE_FORMAT = 'INVALID_FILE_FORMAT' + # These are actual schema issues, not that the file cannot be found or parsed SCHEMA_HEADER_MISSING = 'SCHEMA_HEADER_INVALID' SCHEMA_HEADER_INVALID = 'SCHEMA_HEADER_INVALID' @@ -32,6 +33,7 @@ class HedExceptions: SCHEMA_VERSION_INVALID = 'SCHEMA_VERSION_INVALID' SCHEMA_SECTION_MISSING = 'SCHEMA_SECTION_MISSING' + SCHEMA_INVALID = 'SCHEMA_INVALID' WIKI_SEPARATOR_INVALID = 'invalidSectionSeparator' diff --git a/hed/tools/bids/bids_dataset.py b/hed/tools/bids/bids_dataset.py index 01ec6297f..dd8af0038 100644 --- a/hed/tools/bids/bids_dataset.py +++ b/hed/tools/bids/bids_dataset.py @@ -18,7 +18,7 @@ class BidsDataset: """ - def __init__(self, root_path, schema=None, tabular_types=['events'], + def __init__(self, root_path, schema=None, tabular_types=['events', 'participants'], exclude_dirs=['sourcedata', 'derivatives', 'code', 'stimuli', 'phenotype']): """ Constructor for a BIDS dataset. 
import os
from functools import lru_cache

import pandas as pd


class BidsFile:
    """ A single BIDS file path parsed into basename, suffix, extension and entities. """

    def __init__(self, file_path):
        """ Constructor for a file path.

        Parameters:
            file_path (str): Full path of the file.
        """
        self.file_path = os.path.realpath(file_path)
        self.basename, self.suffix, self.extension, self.entities = self.get_entities(self.file_path)
        self._contents = None   # lazily filled by subclasses
        self.has_hed = False    # set True when HED annotations are detected

    @property
    def contents(self):
        """ Return the current contents of this object. """
        return self._contents

    @staticmethod
    def get_entities(file_path):
        """ Parse a BIDS filename into its constituent parts.

        Parameters:
            file_path (str): Full path of the file.

        Returns:
            tuple: (basename, suffix, extension, entities dict).

        Note: the extension now INCLUDES the leading dot (e.g. '.tsv').  The
        original stored it without the dot, while callers compared it against
        '.json', so no sidecar could ever match.
        """
        basename = os.path.basename(file_path)
        # Split on the first dot so compound extensions such as 'nii.gz' stay intact.
        stem, _, extension = basename.partition('.')
        if extension:
            extension = '.' + extension
        parts = stem.split('_')
        suffix = ''
        # The final underscore-separated piece is the suffix when there are
        # entity pieces before it, or when it carries no entity dash itself.
        if len(parts) > 1 or parts[-1].isalnum():
            suffix = parts.pop()
        entities = {}
        for part in parts:
            entity, _, label = part.partition('-')
            # 'NO_ENTITY' marks a malformed piece with no dash-separated label.
            entities[entity] = label if label else 'NO_ENTITY'
        return basename, suffix, extension, entities


class JsonFile(BidsFile):
    """ A BIDS JSON sidecar whose contents are kept as the raw JSON string. """

    def __init__(self, file_path):
        """ Constructor for a JSON sidecar path.

        Parameters:
            file_path (str): Full path of the file.
        """
        super().__init__(file_path)
        self._initialize_contents()

    def _initialize_contents(self):
        # Keep the raw string so it can later be fed to Sidecar via io.StringIO.
        with open(self.file_path, 'r', encoding='utf-8') as fp:
            json_string = fp.read()
        # Bug fix: the original probed for '"HED":' which misses legal JSON with
        # whitespace before the colon ('"HED" :').  The quoted key alone is safer.
        if '"HED"' in json_string:
            self.has_hed = True
        self._contents = json_string


class TabularFile(BidsFile):
    """ A BIDS tabular (.tsv) file; contents are the list of its column names. """

    def __init__(self, file_path):
        """ Constructor for a tabular file path.

        Parameters:
            file_path (str): Full path of the file.
        """
        super().__init__(file_path)
        self._initialize_contents()

    def _initialize_contents(self):
        # Only the header row is needed to decide whether HED annotations exist.
        try:
            self._contents = list(pd.read_csv(self.file_path, sep='\t', nrows=0).columns)
            # Bug fix: _contents is a LIST of column names, so the original check
            # for the JSON token '"HED":' could never match.  Look for the column.
            if 'HED' in self._contents:
                self.has_hed = True
        except Exception:
            # Empty or unparseable files are treated as having no readable header.
            self._contents = None


@lru_cache(maxsize=None)
def get_bids_file(filename):
    """ Return a cached JsonFile or TabularFile for filename, or None otherwise.

    Parameters:
        filename (str): Full path of the file.

    Returns:
        BidsFile or None: A JsonFile for '.json', a TabularFile for '.tsv',
        otherwise None.

    Bug fix: the original compared the text AFTER the first dot (e.g. 'json')
    with '.json', which never matched, so this function always returned None.
    os.path.splitext also copes with dots elsewhere in the path.
    """
    extension = os.path.splitext(filename)[1].lower()
    if extension == '.json':
        return JsonFile(filename)
    if extension == '.tsv':
        return TabularFile(filename)
    return None
import io
import json
import os

from hed import get_printable_issue_string
from hed.errors import HedFileError, HedExceptions, ErrorHandler
from hed.models.sidecar import Sidecar
from hed.models.tabular_input import TabularInput
from hed.schema.hed_schema import HedSchema
from hed.schema.hed_schema_group import HedSchemaGroup
from hed.schema.hed_schema_io import load_schema_version
from hed.tools.util import io_util
from hed.tools.validation.bids_file import JsonFile, TabularFile, get_bids_file


class BidsValidator:
    """ A lightweight HED validator for a BIDS dataset.

    Attributes:
        root_path (str): Real root path of the BIDS dataset.
        schema (HedSchema or HedSchemaGroup): The schema used for evaluation.
        issues (list): Validation issues accumulated across process_* calls.
    """

    def __init__(self, root_path, schema=None, suffix_types=['events', 'participants'],
                 exclude_dirs=['sourcedata', 'derivatives', 'code', 'stimuli'],
                 check_for_warnings=False, verbose=False):
        """ Constructor for a BIDS dataset validator.

        Parameters:
            root_path (str): Root path of the BIDS dataset.
            schema (HedSchema or HedSchemaGroup): A schema that overrides the one specified in the dataset.
            suffix_types (list or None): Suffixes (no under_bar) of files to include; None means all files.
            exclude_dirs (list or None): Directory names skipped during traversal.
            check_for_warnings (bool): If True, also check for warnings.
            verbose (bool): If True, give progress output.

        Raises:
            HedFileError: If schema is invalid or no schema can be located for the dataset.

        NOTE(review): the mutable list defaults are never mutated here, so they
        are harmless, but tuples would be the safer convention.
        """
        self.root_path = os.path.realpath(root_path)
        self.exclude_dirs = exclude_dirs
        self.suffix_types = suffix_types
        self.schema = self._get_schema(schema)
        self.check_for_warnings = check_for_warnings
        self.verbose = verbose
        self.error_handler = ErrorHandler(check_for_warnings=self.check_for_warnings)
        self.issues = []

    def process_dataset(self):
        """ Validate both sidecars and tabular files, accumulating results in self.issues. """
        self.process_sidecars()
        # Bug fix: the original only processed sidecars despite the method name.
        self.process_tabular()

    def process_sidecars(self):
        """ Validate every matching JSON sidecar that contains HED annotations. """
        name_suffix = self.suffix_types if self.suffix_types else None
        json_paths = io_util.get_file_list(self.root_path, name_suffix=name_suffix, extensions=['.json'],
                                           exclude_dirs=self.exclude_dirs)
        if self.verbose:
            print(f"Validating {len(json_paths)} JSON files:")
        for json_path in json_paths:
            json_file = JsonFile(json_path)
            if not json_file.has_hed:
                continue  # nothing HED-related to validate
            # JsonFile keeps the raw string, so hand it to Sidecar via StringIO.
            sidecar = Sidecar(files=io.StringIO(json_file.contents), name=json_file.basename)
            issues = sidecar.validate(self.schema, name=sidecar.name, error_handler=self.error_handler)
            if self.verbose:
                print(f"\tValidating {json_file.basename}: found {len(issues)} issues")
            self.issues += issues

    def process_tabular(self):
        """ Validate every matching tsv file against its merged (inherited) sidecar. """
        name_suffix = self.suffix_types if self.suffix_types else None
        tabular_paths = io_util.get_file_list(self.root_path, name_suffix=name_suffix, extensions=['.tsv'],
                                              exclude_dirs=self.exclude_dirs)
        if self.verbose:
            print(f"Validating {len(tabular_paths)} tsv files:")
        for tabular_path in tabular_paths:
            tabular_file = TabularFile(tabular_path)
            sidecar_dict = self.get_merged_sidecar(tabular_file)
            sidecar = None
            if sidecar_dict:
                sidecar_name = os.path.splitext(os.path.basename(tabular_path))[0] + '.json'
                sidecar = Sidecar(files=io.StringIO(json.dumps(sidecar_dict)), name=sidecar_name)
            # (Removed a stray debug print of the basename from the original.)
            tabular = TabularInput(file=tabular_file.file_path, sidecar=sidecar, name=tabular_file.basename)
            issues = tabular.validate(self.schema, error_handler=self.error_handler)
            if self.verbose:
                print(f"\tValidating {tabular_file.basename}: found {len(issues)} issues")
            self.issues += issues

    def _get_schema(self, schema):
        """ Resolve the schema: use a valid override, else read dataset_description.json.

        Raises:
            HedFileError: If the override is not a HedSchema/HedSchemaGroup, or the
                dataset description cannot be read.
        """
        if schema and isinstance(schema, (HedSchema, HedSchemaGroup)):
            return schema
        elif schema:
            raise HedFileError(HedExceptions.SCHEMA_INVALID,
                               "The schema passed was not a valid HedSchema or HedSchemaGroup", "")
        description_path = os.path.join(self.root_path, "dataset_description.json")
        # Bug fix: a missing or malformed description previously escaped as a raw
        # OSError/JSONDecodeError instead of a HedFileError.
        try:
            with open(description_path, "r", encoding="utf-8") as fp:
                dataset_description = json.load(fp)
        except (OSError, json.JSONDecodeError) as err:
            raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
                               f"A schema could not be found for dataset {self.root_path}", "") from err
        if not dataset_description:
            raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
                               f"A schema could not be found for dataset {self.root_path}", "")
        return load_schema_version(dataset_description.get("HEDVersion", None))

    def get_merged_sidecar(self, tsv_file):
        """ Merge all inherited sidecars applicable to tsv_file into one dict.

        walk_back yields nearest-directory first, so apply farthest-first and
        let closer sidecars override shared keys.
        """
        sidecar_files = list(self.walk_back(tsv_file, inherit=True))
        merged_sidecar = {}
        while sidecar_files:
            this_sidecar = sidecar_files.pop()
            # Bug fix: JsonFile exposes its data via the 'contents' property (a raw
            # JSON string), not 'get_contents', and dict.update needs a parsed mapping.
            merged_sidecar.update(json.loads(this_sidecar.contents))
        return merged_sidecar

    def walk_back(self, tsv_file, inherit=True):
        """ Yield applicable sidecars from tsv_file's directory upward (nearest first).

        Raises:
            Exception: MULTIPLE_INHERITABLE_FILES when a directory contains several
                applicable sidecars with no exact entity match.
        """
        source_dir = os.path.dirname(tsv_file.file_path)
        while source_dir:
            candidates = self.get_candidates(source_dir, tsv_file)
            if len(candidates) == 1:
                # Bug fix: the original fell through to the exact-match test and
                # could yield the same sidecar twice at this directory level.
                yield candidates[0]
            elif len(candidates) > 1:
                exact_match = self.find_exact_match(candidates, tsv_file.entities)
                if exact_match:
                    yield exact_match
                else:
                    paths = sorted(file.file_path for file in candidates)
                    raise Exception({
                        "code": "MULTIPLE_INHERITABLE_FILES",
                        "location": paths[0],
                        "affects": tsv_file.file_path,
                        "issueMessage": f"Candidate files: {paths}",
                    })
            if not inherit:
                break
            parent = os.path.dirname(source_dir)
            # Stop once dirname is a fixed point (filesystem root reached).
            source_dir = None if parent == source_dir else parent

    @staticmethod
    def get_candidates(source_dir, tsv_file):
        """ Return the JSON sidecars in source_dir that could apply to tsv_file. """
        candidates = []
        for file in os.listdir(source_dir):
            this_path = os.path.realpath(os.path.join(source_dir, file))
            if not os.path.isfile(this_path):
                continue
            bids_file = get_bids_file(this_path)
            if bids_file and BidsValidator.matches_criteria(bids_file, tsv_file):
                candidates.append(bids_file)
        return candidates

    @staticmethod
    def matches_criteria(bids_file, tsv_file):
        """ Return True if bids_file is a JSON sidecar applicable to tsv_file.

        A sidecar applies when its suffix matches and every entity it declares
        matches the tsv file's entities; the sidecar may declare FEWER entities,
        per the BIDS inheritance principle.
        """
        # Tolerate extensions stored with or without a leading dot.
        extension_is_valid = bids_file.extension.lower().lstrip('.') == 'json'
        suffix_is_valid = (bids_file.suffix == tsv_file.suffix) or not tsv_file.suffix
        # Bug fixes: the original referenced a misspelled attribute ('enties') and
        # iterated the tsv's entities, which wrongly rejected higher-level sidecars
        # that legitimately omit entities (e.g. a task-level sidecar without 'sub').
        entities_match = all(tsv_file.entities.get(entity) == label
                             for entity, label in bids_file.entities.items())
        return extension_is_valid and suffix_is_valid and entities_match

    @staticmethod
    def find_exact_match(candidates, source_entities):
        """ Return the candidate whose entities exactly match source_entities, or None. """
        for bids_file in candidates:
            if all(bids_file.entities.get(entity) == source_entities.get(entity)
                   for entity in source_entities):
                return bids_file
        return None


if __name__ == '__main__':
    dataset_dir = os.path.realpath('d:/eeg_ds003645s_hed_demo')
    validator = BidsValidator(dataset_dir, suffix_types=None, check_for_warnings=False, verbose=True)
    validator.process_sidecars()
    if validator.issues:
        print(get_printable_issue_string(validator.issues, "HED validation errors: ", skip_filename=False))
    else:
        print("No HED validation errors in JSON files")
    validator.process_tabular()
    # NOTE(review): issues accumulates across calls, so this second report also
    # repeats any JSON issues — intentional in the original; kept as-is.
    if validator.issues:
        print(get_printable_issue_string(validator.issues, "HED validation errors: ", skip_filename=False))
    else:
        print("No HED validation errors in tsv files")
"otherAllowedNonDefTags": ["Duration", "Onset", "Offset", "Inset"] @@ -39,16 +33,13 @@ "name": "Duration", "noExtension": true, "allowValue": true, - "allowTwoLevelValue": false, "requireValue": true, - "exclusive": false, "tagGroup": true, "topLevelTagGroup": true, "maxNonDefSubgroups": 1, "minNonDefSubgroups": 1, "ERROR_CODE": "TEMPORAL_TAG_ERROR", "noSpliceInGroup": false, - "forbiddenSubgroupTags": [], "requiresTimeline": false, "requiresDef": false, "otherAllowedNonDefTags": ["Delay"] @@ -57,16 +48,13 @@ "name": "Event-context", "noExtension": true, "allowValue": false, - "allowTwoLevelValue": false, "requireValue": false, - "exclusive": false, "tagGroup": true, "topLevelTagGroup": true, "maxNonDefSubgroups": null, "minNonDefSubgroups": 0, "ERROR_CODE": "TAG_GROUP_ERROR", "noSpliceInGroup": true, - "forbiddenSubgroupTags": [], "requiresTimeline": false, "requiresDef": false, "otherAllowedNonDefTags": [] @@ -75,16 +63,13 @@ "name": "Inset", "noExtension": true, "allowValue": false, - "allowTwoLevelValue": false, "requireValue": false, - "exclusive": false, "tagGroup": true, "topLevelTagGroup": true, "maxNonDefSubgroups": 1, "minNonDefSubgroups": 0, "ERROR_CODE": "TEMPORAL_TAG_ERROR", "noSpliceInGroup": false, - "forbiddenSubgroupTags": [], "requiresTimeline": true, "requiresDef": true, "otherAllowedNonDefTags": ["Delay"] @@ -93,16 +78,13 @@ "name": "Offset", "noExtension": true, "allowValue": false, - "allowTwoLevelValue": false, "requireValue": false, - "exclusive": false, "tagGroup": true, "topLevelTagGroup": true, "maxNonDefSubgroups": 0, "minNonDefSubgroups": 0, "ERROR_CODE": "TEMPORAL_TAG_ERROR", "noSpliceInGroup": false, - "forbiddenSubgroupTags": [], "requiresTimeline": true, "requiresDef": true, "otherAllowedNonDefTags": ["Delay"] @@ -111,16 +93,13 @@ "name": "Onset", "noExtension": true, "allowValue": false, - "allowTwoLevelValue": false, "requireValue": false, - "exclusive": false, "tagGroup": true, "topLevelTagGroup": true, "maxNonDefSubgroups": 1, 
"minNonDefSubgroups": 0, "ERROR_CODE": "TEMPORAL_TAG_ERROR", "noSpliceInGroup": false, - "forbiddenSubgroupTags": [], "requiresTimeline": true, "requiresDef": true, "otherAllowedNonDefTags": ["Delay"] diff --git a/hed/validator/reserved_checker.py b/hed/validator/reserved_checker.py index 539f92d7f..0cf226a60 100644 --- a/hed/validator/reserved_checker.py +++ b/hed/validator/reserved_checker.py @@ -37,18 +37,11 @@ def _initialize_special_tags(self): self.special_names = set(self.reserved_map.keys()) self.require_value_tags = self._get_special_tags_by_property("requireValue") self.no_extension_tags = self._get_special_tags_by_property("noExtension") - self.allow_two_level_value_tags = self._get_special_tags_by_property("allowTwoLevelValue") self.top_group_tags = self._get_special_tags_by_property("topLevelTagGroup") self.requires_def_tags = self._get_special_tags_by_property("requiresDef") self.group_tags = self._get_special_tags_by_property("tagGroup") - self.exclusive_tags = self._get_special_tags_by_property("exclusive") self.timelineTags = self._get_special_tags_by_property("requiresTimeline") self.no_splice_in_group = self._get_special_tags_by_property("noSpliceInGroup") - self.has_forbidden_subgroup_tags = { - value["name"] - for value in self.reserved_map.values() - if len(value.get("forbiddenSubgroupTags", [])) > 0 - } def _get_special_tags_by_property(self, property_name): return { diff --git a/pyproject.toml b/pyproject.toml index 264e3e7f1..844d9e9e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ include = ["hed*"] namespaces = false [tool.setuptools.package-data] -hed = ["schema/schema_data/*.xml", "resources/*.png", "validator/util/class_regex.json"] +hed = ["schema/schema_data/*.xml", "resources/*.png", "validator/data/*"] [tool.codespell] skip = '*.git,*.pdf,*.svg,versioneer.py,venv*,*.tsv,*.yaml,*.yml,*.json,*.rdf,*.jsonld,spec_tests,,*.xml,*.mediawiki,*.omn,*.toml' diff --git a/tests/tools/bids/test_bids_dataset.py 
b/tests/tools/bids/test_bids_dataset.py index 02b167dea..f511109ca 100644 --- a/tests/tools/bids/test_bids_dataset.py +++ b/tests/tools/bids/test_bids_dataset.py @@ -19,7 +19,7 @@ def setUpClass(cls): '../../data/bids_tests/eeg_ds003645s_empty') def test_constructor(self): - bids = BidsDataset(self.root_path) + bids = BidsDataset(self.root_path, tabular_types=['events']) self.assertIsInstance(bids, BidsDataset, "BidsDataset should create a valid object from valid dataset") parts = bids.get_tabular_group("participants") self.assertFalse(parts) @@ -34,6 +34,15 @@ def test_constructor(self): self.assertTrue(bids.schema, "BidsDataset constructor extracts a schema from the dataset.") self.assertIsInstance(bids.schema, HedSchema, "BidsDataset schema should be HedSchema") + def test_constructor_all_tsv(self): + bids = BidsDataset(self.root_path) + self.assertIsInstance(bids, BidsDataset, "BidsDataset should create a valid object from valid dataset") + parts = bids.get_tabular_group("participants") + self.assertIsInstance(parts, BidsFileGroup) + events = bids.get_tabular_group("events") + self.assertIsInstance(events, BidsFileGroup) + self.assertEqual(len(events.datafile_dict), 6) + def test_constructor_libraries(self): bids = BidsDataset(self.library_path, tabular_types=['participants', 'events']) self.assertIsInstance(bids, BidsDataset,