diff --git a/hed/errors/__init__.py b/hed/errors/__init__.py index a094256df..8bbe1f662 100644 --- a/hed/errors/__init__.py +++ b/hed/errors/__init__.py @@ -1,4 +1,5 @@ from .error_reporter import ErrorHandler, get_printable_issue_string, sort_issues -from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, ValidationErrors +from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, \ + ValidationErrors, ColumnErrors from .error_types import ErrorContext, ErrorSeverity from .exceptions import HedExceptions, HedFileError diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 5df62f03f..4c333ff22 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -401,23 +401,23 @@ def onset_wrong_placeholder(tag, has_placeholder): return f"Onset/offset def tag {tag} should not have a placeholder, but has one." -@hed_error(ColumnErrors.INVALID_COLUMN_REF) -def invalid_column_ref(bad_refs): - return f"Bad column references found(columns do not exist): {bad_refs}" +@hed_error(ColumnErrors.INVALID_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID) +def invalid_column_ref(bad_ref): + return f"The column '{bad_ref}' is unknown." -@hed_error(ColumnErrors.SELF_COLUMN_REF) +@hed_error(ColumnErrors.SELF_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID) def self_column_ref(self_ref): return f"Column references itself: {self_ref}" -@hed_error(ColumnErrors.NESTED_COLUMN_REF) +@hed_error(ColumnErrors.NESTED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID) def nested_column_ref(column_name, ref_column): return f"Column {column_name} has a nested reference to {ref_column}. " \ f"Column reference columns cannot contain other column references."
-@hed_error(ColumnErrors.MALFORMED_COLUMN_REF) -def nested_column_ref(column_name, index, symbol): +@hed_error(ColumnErrors.MALFORMED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID) +def malformed_column_ref(column_name, index, symbol): return f"Column {column_name} has a malformed column reference. Improper symbol {symbol} found at index {index}." diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py index 8b4808c16..4f8ba17f1 100644 --- a/hed/errors/error_reporter.py +++ b/hed/errors/error_reporter.py @@ -396,10 +396,9 @@ def val_error_unknown(*args, **kwargs): Returns: str: The error message. - dict: The extra args. """ - return f"Unknown error. Args: {str(args)}", kwargs + return f"Unknown error. Args: {str(args), str(kwargs)}" @staticmethod def filter_issues_by_severity(issues_list, severity): diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index 15b77e6a3..0e7d42aef 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -99,6 +99,7 @@ class SidecarErrors: SIDECAR_HED_USED_COLUMN = 'SIDECAR_HED_USED_COLUMN' SIDECAR_NA_USED = 'SIDECAR_NA_USED' SIDECAR_HED_USED = 'SIDECAR_HED_USED' + SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID" class SchemaErrors: diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 09e0875c5..4e335a72c 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -251,9 +251,9 @@ def columns(self): Empty if no column names. Returns: - columns(dict): The column number:name pairs + columns(list): the column names """ - columns = {} + columns = [] if self._dataframe is not None and self._has_column_names: columns = list(self._dataframe.columns) return columns @@ -354,12 +354,12 @@ def _dataframe_has_names(dataframe): return True return False - def assemble(self, mapper=None, skip_square_brackets=False): + def assemble(self, mapper=None, skip_curly_braces=False): """ Assembles the hed strings Parameters: mapper(ColumnMapper or None): Generally pass none here unless you want special behavior.
- skip_square_brackets (bool): If True, don't plug in square bracket values into columns. + skip_curly_braces (bool): If True, don't plug in curly brace values into columns. Returns: Dataframe: the assembled dataframe """ @@ -367,11 +367,12 @@ def assemble(self, mapper=None, skip_square_brackets=False): mapper = self._mapper all_columns = self._handle_transforms(mapper) - if skip_square_brackets: + if skip_curly_braces: return all_columns transformers, _ = mapper.get_transformers() - - return self._handle_square_brackets(all_columns, list(transformers)) + refs = self.get_column_refs() + column_names = list(transformers) + return self._handle_curly_braces_refs(all_columns, refs, column_names) def _handle_transforms(self, mapper): transformers, need_categorical = mapper.get_transformers() @@ -390,45 +391,67 @@ def _handle_transforms(self, mapper): return all_columns @staticmethod - def _find_column_refs(df, column_names): - found_column_references = [] - for column_name in column_names: - df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) - u_vals = pd.Series([j for i in df_temp if isinstance(i, list) for j in i], dtype=str) - u_vals = u_vals.unique() - for val in u_vals: - if val not in found_column_references: - found_column_references.append(val) - - return found_column_references + def _replace_ref(text, newvalue, column_ref): + """ Replace column ref in x with y. If it's n/a, delete extra commas/parentheses. - @staticmethod - def _handle_square_brackets(df, known_columns=None): + Note: This function could easily be updated to handle non-curly brace values, but it's faster this way. + Parameters: + text (str): The input string containing the ref enclosed in curly braces. + newvalue (str): The replacement value for the ref. + column_ref (str): The ref to be replaced, without curly braces + + Returns: + str: The modified string with the ref replaced or removed. 
""" - Plug in square brackets with other columns + # If it's not n/a, we can just replace directly. + if newvalue != "n/a": + return text.replace(f"{{{column_ref}}}", newvalue) + + def _remover(match): + p1 = match.group("p1").count("(") + p2 = match.group("p2").count(")") + if p1 > p2: # We have more starting parens than ending. Make sure we don't remove comma before + output = match.group("c1") + "(" * (p1 - p2) + elif p2 > p1: # We have more ending parens. Make sure we don't remove comma after + output = ")" * (p2 - p1) + match.group("c2") + else: + c1 = match.group("c1") + c2 = match.group("c2") + if c1: + c1 = "" + elif c2: + c2 = "" + output = c1 + c2 + + return output + + # this finds all surrounding commas and parentheses to a reference. + # c1/c2 contain the comma(and possibly spaces) separating this ref from other tags + # p1/p2 contain the parentheses directly surrounding the tag + # All four groups can have spaces. + pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + column_ref + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)' + return re.sub(pattern, _remover, text) - If known columns is passed, only use those columns to find or replace references.
+ @staticmethod + def _handle_curly_braces_refs(df, refs, column_names): """ - if known_columns is not None: - column_names = list(known_columns) - else: - column_names = list(df.columns) - possible_column_references = [f"{column_name}" for column_name in column_names if - isinstance(column_name, str) and column_name.lower() != "hed"] - found_column_references = BaseInput._find_column_refs(df, column_names) - - valid_replacements = [col for col in found_column_references if col in possible_column_references] - - # todo: break this into a sub function(probably) - for column_name in valid_replacements: - column_names.remove(column_name) - saved_columns = df[valid_replacements] - for column_name in column_names: - for replacing_name in valid_replacements: - column_name_brackets = f"[{replacing_name}]" - df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y + Plug in curly braces with other columns + """ + # Filter out columns and refs that don't exist. + refs = [ref for ref in refs if ref in column_names] + remaining_columns = [column for column in column_names if column not in refs] + + # Replace references in the columns we are saving out. + saved_columns = df[refs] + for column_name in remaining_columns: + for replacing_name in refs: + # If the data has no n/a values, this version is MUCH faster. 
+ # column_name_brackets = f"{{{replacing_name}}}" + # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y + # in zip(df[column_name], saved_columns[replacing_name])) + df[column_name] = pd.Series(BaseInput._replace_ref(x, y, replacing_name) for x, y in zip(df[column_name], saved_columns[replacing_name])) - df = df[column_names] + df = df[remaining_columns] return df @@ -462,4 +485,14 @@ def get_def_dict(self, hed_schema=None, extra_def_dicts=None): DefinitionDict: A single definition dict representing all the data(and extra def dicts) """ from hed.models.definition_dict import DefinitionDict - return DefinitionDict(extra_def_dicts, hed_schema) \ No newline at end of file + return DefinitionDict(extra_def_dicts, hed_schema) + + def get_column_refs(self): + """ Returns a list of column refs for this file. + + Default implementation returns none. + + Returns: + column_refs(list): A list of unique column refs found + """ + return [] diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index 3c4c87a63..ad68114a8 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -85,12 +85,12 @@ def get_transformers(self): if column.column_type == ColumnType.Ignore: continue elif column.column_type == ColumnType.Value: - value_str = column._hed_dict + value_str = column.hed_dict from functools import partial final_transformers[assign_to_column] = partial(self._value_handler, value_str) elif column.column_type == ColumnType.Categorical: need_categorical.append(column.column_name) - category_values = column._hed_dict + category_values = column.hed_dict from functools import partial final_transformers[assign_to_column] = partial(self._category_handler, category_values) else: @@ -243,7 +243,7 @@ def _add_value_columns(self, column_prefix_dictionary): prefix = prefix + "#" else: prefix = prefix + "/#" - new_def = ColumnMetadata(ColumnType.Value, col, hed_dict=prefix) + new_def = ColumnMetadata(ColumnType.Value, col, 
source=prefix) self._add_column_data(new_def) def _add_column_data(self, new_column_entry): diff --git a/hed/models/column_metadata.py b/hed/models/column_metadata.py index ecdc76f08..33bb3f356 100644 --- a/hed/models/column_metadata.py +++ b/hed/models/column_metadata.py @@ -1,5 +1,6 @@ from enum import Enum from hed.errors.error_types import SidecarErrors +import pandas as pd class ColumnType(Enum): @@ -21,30 +22,20 @@ class ColumnType(Enum): class ColumnMetadata: """ Column in a ColumnMapper. """ - def __init__(self, column_type=None, name=None, hed_dict=None, column_prefix=None): + def __init__(self, column_type=None, name=None, source=None): """ A single column entry in the column mapper. Parameters: column_type (ColumnType or None): How to treat this column when reading data. name (str, int, or None): The column_name or column number identifying this column. If name is a string, you'll need to use a column map to set the number later. - hed_dict (dict or str or None): The loaded data (usually from json) for the given def - For category columns, this is a dict. - For value columns, it's a string. - column_prefix (str or None): If present, prepend the given column_prefix to all hed tags in the columns. - Only works on ColumnType HedTags. - - Notes: - - Each column from which data is retrieved must have a ColumnMetadata representing its contents. - - The column_prefix dictionaries are used when the column is processed. 
+ source (dict or str or None): Either the entire loaded json sidecar or a single HED string """ - if hed_dict is None: - hed_dict = {} - - self.column_type = column_type self.column_name = name - self.column_prefix = column_prefix - self._hed_dict = hed_dict + self._source = source + if column_type is None: + column_type = self._detect_column_type(self.source_dict) + self.column_type = column_type @property def hed_dict(self): @@ -54,7 +45,78 @@ def hed_dict(self): dict or str: A string or dict of strings for this column """ - return self._hed_dict + if self._source is None or isinstance(self._source, str): + return self._source + return self._source[self.column_name].get("HED", {}) + + @property + def source_dict(self): + """ The raw dict for this entry(if it exists) + + Returns: + dict or str: A string or dict of strings for this column + """ + if self._source is None or isinstance(self._source, str): + return {"HED": self._source} + return self._source[self.column_name] + + def get_hed_strings(self): + if not self.column_type: + return pd.Series(dtype=str) + + series = pd.Series(self.hed_dict, dtype=str) + + return series + + def set_hed_strings(self, new_strings): + if new_strings is None: + return False + + if not self.column_type: + return False + + if isinstance(new_strings, pd.Series): + if self.column_type == ColumnType.Categorical: + new_strings = new_strings.to_dict() + else: + new_strings = new_strings.iloc[0] + + self._source[self.column_name]["HED"] = new_strings + + return True + + @staticmethod + def _detect_column_type(dict_for_entry): + """ Determine the ColumnType of a given json entry. + + Parameters: + dict_for_entry (dict): The loaded json entry a specific column. + Generally has a "HED" entry among other optional ones. + + Returns: + ColumnType: The determined type of given column. Returns None if unknown. 
+ + """ + if not dict_for_entry or not isinstance(dict_for_entry, dict): + return ColumnType.Ignore + + minimum_required_keys = ("HED",) + if not set(minimum_required_keys).issubset(dict_for_entry.keys()): + return ColumnType.Ignore + + hed_entry = dict_for_entry["HED"] + if isinstance(hed_entry, dict): + if not all(isinstance(entry, str) for entry in hed_entry.values()): + return None + return ColumnType.Categorical + + if not isinstance(hed_entry, str): + return None + + if "#" not in dict_for_entry["HED"]: + return None + + return ColumnType.Value @staticmethod def expected_pound_sign_count(column_type): diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 32311abf6..6cd4943df 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -126,17 +126,14 @@ def expand_defs(df, hed_schema, def_dict, columns=None): def _convert_to_form(hed_string, hed_schema, tag_form): - from hed import HedString return str(HedString(hed_string, hed_schema).get_as_form(tag_form)) def _shrink_defs(hed_string, hed_schema): - from hed import HedString return str(HedString(hed_string, hed_schema).shrink_defs()) def _expand_defs(hed_string, hed_schema, def_dict): - from hed import HedString return str(HedString(hed_string, hed_schema, def_dict).expand_defs()) diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index 7be20fb5d..7f17df234 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -73,17 +73,11 @@ def remove_definitions(self): """ Remove definition tags and groups from this string. This does not validate definitions and will blindly removing invalid ones as well. - - Returns: - list: An empty list as there are no possible issues, this list is always blank. 
- """ definition_groups = self.find_top_level_tags({DefTagNames.DEFINITION_KEY}, include_groups=1) if definition_groups: self.remove(definition_groups) - return [] - def shrink_defs(self): """ Replace def-expand tags with def tags @@ -114,7 +108,7 @@ def expand_defs(self): replacements = [] for tag in def_tags: if tag.expandable and not tag.expanded: - replacements.append((tag, tag._expandable)) + replacements.append((tag, tag.expandable)) for tag, group in replacements: tag_parent = tag._parent @@ -333,7 +327,7 @@ def validate(self, hed_schema, allow_placeholders=True, error_handler=None): from hed.validator import HedValidator validator = HedValidator(hed_schema) - return validator.validate(self, allow_placeholders=allow_placeholders) + return validator.validate(self, allow_placeholders=allow_placeholders, error_handler=error_handler) def find_top_level_tags(self, anchor_tags, include_groups=2): """ Find top level groups with an anchor tag. @@ -363,3 +357,13 @@ def find_top_level_tags(self, anchor_tags, include_groups=2): if include_groups == 0 or include_groups == 1: return [tag[include_groups] for tag in top_level_tags] return top_level_tags + + def remove_refs(self): + """ This removes any refs(tags contained entirely inside curly braces) from the string. + + This does NOT validate the contents of the curly braces. This is only relevant when directly + editing sidecar strings. Tools will naturally ignore these. + """ + ref_tags = [tag for tag in self.get_all_tags() if tag.is_column_ref()] + if ref_tags: + self.remove(ref_tags) diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 57fa7e3ad..d124c338e 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -237,7 +237,6 @@ def extension(self): def extension(self, x): self._extension_value = f"/{x}" - @property def long_tag(self): """ Long form including value or extension. 
@@ -298,6 +297,16 @@ def expandable(self): """ return self._expandable + def is_column_ref(self): + """ Returns if this tag is a column reference from a sidecar. + + You should only see these if you are directly accessing sidecar strings, tools should remove them otherwise. + + Returns: + bool: Returns True if this is a column ref + """ + return self.org_tag.startswith('{') and self.org_tag.endswith('}') + def __str__(self): """ Convert this HedTag to a string. diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 958cadfba..e67670a6e 100644 --- a/hed/models/sidecar.py +++ b/hed/models/sidecar.py @@ -1,4 +1,6 @@ import json +import re + from hed.models.column_metadata import ColumnMetadata from hed.errors.error_types import ErrorContext from hed.errors import ErrorHandler @@ -8,7 +10,6 @@ from hed.models.definition_dict import DefinitionDict -# todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't) class Sidecar: """ Contents of a JSON file or merged file. @@ -35,6 +36,24 @@ def __iter__(self): """ return iter(self.column_data) + def __getitem__(self, column_name): + if column_name not in self.loaded_dict: + return None + return ColumnMetadata(name=column_name) + + @property + def all_hed_columns(self): + """ Returns all columns that are HED compatible + + returns: + column_refs(list): A list of all valid hed columns by name + """ + possible_column_references = [column.column_name for column in self if column.column_type != ColumnType.Ignore] + if "HED" not in possible_column_references: + possible_column_references.append("HED") + + return possible_column_references + @property def def_dict(self): """This is the definitions from this sidecar. 
@@ -53,20 +72,7 @@ def column_data(self): Returns: list(ColumnMetadata): the list of column metadata defined by this sidecar """ - for col_name, col_dict in self.loaded_dict.items(): - yield self._generate_single_column(col_name, col_dict) - - def set_hed_string(self, new_hed_string, position): - """ Set a provided column/category key/etc. - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The (HedString, str, list) tuple returned from hed_string_iter. - - """ - column_name, position = position - hed_dict = self.loaded_dict[column_name] - hed_dict["HED"] = self._set_hed_string_low(new_hed_string, hed_dict["HED"], position) + return [ColumnMetadata(name=col_name, source=self.loaded_dict) for col_name in self.loaded_dict] def get_def_dict(self, hed_schema=None, extra_def_dicts=None): """ Returns the definition dict for this sidecar. @@ -186,56 +192,6 @@ def _load_json_file(self, fp): except json.decoder.JSONDecodeError as e: raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name) - def _generate_single_column(self, column_name, dict_for_entry, column_type=None): - """ Create a single column metadata entry and add to this sidecar. - - Parameters: - column_name (str or int): The column name or number - dict_for_entry (dict): The loaded dictionary for a given column entry (needs the "HED" key if nothing else). - column_type (ColumnType): Optional indicator of how to treat the column. - This overrides auto-detection from the dict_for_entry. - - """ - if column_type is None: - column_type = self._detect_column_type(dict_for_entry) - if dict_for_entry: - hed_dict = dict_for_entry.get("HED") - else: - hed_dict = None - column_entry = ColumnMetadata(column_type, column_name, hed_dict) - return column_entry - - @staticmethod - def _detect_column_type(dict_for_entry): - """ Determine the ColumnType of a given json entry. 
- - Parameters: - dict_for_entry (dict): The loaded json entry a specific column. - Generally has a "HED" entry among other optional ones. - - Returns: - ColumnType: The determined type of given column. Returns None if unknown. - - """ - if not dict_for_entry or not isinstance(dict_for_entry, dict): - return ColumnType.Ignore - - minimum_required_keys = ("HED",) - if not set(minimum_required_keys).issubset(dict_for_entry.keys()): - return ColumnType.Ignore - - hed_entry = dict_for_entry["HED"] - if isinstance(hed_entry, dict): - return ColumnType.Categorical - - if not isinstance(hed_entry, str): - return None - - if "#" not in dict_for_entry["HED"]: - return None - - return ColumnType.Value - def extract_definitions(self, hed_schema=None, error_handler=None): """ Gather and validate definitions in metadata. @@ -253,106 +209,40 @@ def extract_definitions(self, hed_schema=None, error_handler=None): self._extract_definition_issues = [] if hed_schema: - for hed_string, column_data, _ in self.hed_string_iter(error_handler): - hed_string_obj = HedString(hed_string, hed_schema) - error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) - self._extract_definition_issues += def_dict.check_for_definitions(hed_string_obj, error_handler) + for column_data in self: + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_data.column_name) + hed_strings = column_data.get_hed_strings() + for key_name, hed_string in hed_strings.items(): + hed_string_obj = HedString(hed_string, hed_schema) + if len(hed_strings) > 1: + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) + self._extract_definition_issues += def_dict.check_for_definitions(hed_string_obj, error_handler) + error_handler.pop_error_context() + if len(hed_strings) > 1: + error_handler.pop_error_context() + error_handler.pop_error_context() return def_dict - def hed_string_iter(self, 
error_handler=None): - """ Gather and validate definitions in metadata. - - Parameters: - error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. + def get_column_refs(self): + """ Returns a list of column refs found in this sidecar. - Yields: - str: The hed string at a given column and key position. - column_data: the column data for the given string. - position: blackbox(pass back to set this string to a new value) + This does not validate + Returns: + column_refs(list): A list of unique column refs found """ - if error_handler is None: - error_handler = ErrorHandler() + found_vals = set() for column_data in self.column_data: - error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_data.column_name) - hed_dict = column_data.hed_dict - for (hed_string, position) in self._hed_string_iter(hed_dict, error_handler): - yield hed_string, column_data, position - error_handler.pop_error_context() - - @staticmethod - def _hed_string_iter(hed_strings, error_handler): - """ Iterate over the given dict of strings - - Parameters: - hed_strings(dict or str): A hed_string or dict of hed strings - error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. - - Yields: - tuple: - - str: The hed string at a given column and key position. - - str: Indication of the where hed string was loaded from, so it can be later set by the user. 
+ if column_data.column_type == ColumnType.Ignore: + continue + hed_strings = column_data.get_hed_strings() + matches = hed_strings.str.findall(r"\{([a-z_\-0-9]+)\}", re.IGNORECASE) + u_vals = [match for sublist in matches for match in sublist] - """ - for hed_string, key_name in Sidecar._hed_iter_low(hed_strings): - if key_name: - error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) - yield hed_string, key_name - if key_name: - error_handler.pop_error_context() - - @staticmethod - def _hed_iter_low(hed_strings): - """ Iterate over the hed string entries. - - Used by hed_string_iter - - Parameters: - hed_strings(dict or str): A hed_string or dict of hed strings - - Yields: - tuple: - - str: Individual hed strings for different entries. - - str: The position to pass back to set this string. - - """ - if isinstance(hed_strings, dict): - for key, hed_string in hed_strings.items(): - if not isinstance(hed_string, str): - continue - yield hed_string, key - elif isinstance(hed_strings, str): - yield hed_strings, None - - @staticmethod - def _set_hed_string_low(new_hed_string, hed_strings, position=None): - """ Set a hed string for a category key/etc. - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - hed_strings(dict or str or HedString): The hed strings we want to update - position (str, optional): This should only be a value returned from hed_string_iter. - - Returns: - updated_string (str or dict): The newly updated string/dict. - Raises: - TypeError: If the mapping cannot occur. 
- - """ - if isinstance(hed_strings, dict): - if position is None: - raise TypeError("Error: Trying to set a category HED string with no category") - if position not in hed_strings: - raise TypeError("Error: Not allowed to add new categories to a column") - hed_strings[position] = str(new_hed_string) - elif isinstance(hed_strings, (str, HedString)): - if position is not None: - raise TypeError("Error: Trying to set a value HED string with a category") - hed_strings = str(new_hed_string) - else: - raise TypeError("Error: Trying to set a HED string on a column_type that doesn't support it.") + found_vals.update(u_vals) - return hed_strings + return list(found_vals) diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index d504cb8a4..b32b22032 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -56,4 +56,16 @@ def get_def_dict(self, hed_schema=None, extra_def_dicts=None): if self._sidecar: return self._sidecar.get_def_dict(hed_schema, extra_def_dicts) else: - super().get_def_dict(hed_schema, extra_def_dicts) \ No newline at end of file + return super().get_def_dict(hed_schema, extra_def_dicts) + + def get_column_refs(self): + """ Returns a list of column refs for this file. + + Default implementation returns none.
+ + Returns: + column_refs(list): A list of unique column refs found + """ + if self._sidecar: + return self._sidecar.get_column_refs() + return [] diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py index f3f083933..c8b0c23ad 100644 --- a/hed/validator/def_validator.py +++ b/hed/validator/def_validator.py @@ -80,11 +80,11 @@ def _validate_def_contents(self, def_tag, def_expand_group, tag_validator): error_code = ValidationErrors.DEF_EXPAND_INVALID if placeholder_tag.is_unit_class_tag(): def_issues += tag_validator.check_tag_unit_class_units_are_valid(placeholder_tag, - report_tag_as=def_tag, + report_as=def_tag, error_code=error_code) elif placeholder_tag.is_value_class_tag(): def_issues += tag_validator.check_tag_value_class_valid(placeholder_tag, - report_tag_as=def_tag, + report_as=def_tag, error_code=error_code) elif def_entry.takes_value: diff --git a/hed/validator/hed_validator.py b/hed/validator/hed_validator.py index 9f692bdb3..ae2d791d9 100644 --- a/hed/validator/hed_validator.py +++ b/hed/validator/hed_validator.py @@ -61,7 +61,7 @@ def validate(self, hed_string, allow_placeholders, error_handler=None): def run_basic_checks(self, hed_string, allow_placeholders): issues = [] - issues += self._tag_validator.run_hed_string_validators(hed_string) + issues += self._tag_validator.run_hed_string_validators(hed_string, allow_placeholders) if check_for_any_errors(issues): return issues if hed_string == "n/a" or not self._hed_schema: @@ -161,7 +161,8 @@ def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placehol """ from hed.models.definition_dict import DefTagNames validation_issues = [] - definition_groups = hed_string_obj.find_top_level_tags(anchor_tags={DefTagNames.DEFINITION_KEY}, include_groups=1) + definition_groups = hed_string_obj.find_top_level_tags(anchor_tags={DefTagNames.DEFINITION_KEY}, + include_groups=1) all_definition_groups = [group for sub_group in definition_groups for group in 
sub_group.get_all_groups()] for group in hed_string_obj.get_all_groups(): is_definition = group in all_definition_groups diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py index 450446371..a455e6806 100644 --- a/hed/validator/sidecar_validator.py +++ b/hed/validator/sidecar_validator.py @@ -1,13 +1,16 @@ import copy -from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors +import re +from hed.errors import ErrorHandler, ErrorContext, SidecarErrors, DefinitionErrors, ColumnErrors from hed.models import ColumnType from hed import HedString from hed import Sidecar from hed.models.column_metadata import ColumnMetadata from hed.errors.error_reporter import sort_issues from hed.models.model_constants import DefTagNames +from hed.errors.error_reporter import check_for_any_errors +# todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't) class SidecarValidator: reserved_column_names = ["HED"] reserved_category_values = ["n/a"] @@ -38,44 +41,54 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None) error_handler = ErrorHandler() error_handler.push_error_context(ErrorContext.FILE_NAME, name) + issues += self.validate_structure(sidecar, error_handler=error_handler) + issues += self._validate_refs(sidecar, error_handler) + + # only allowed early out, something is very wrong with structure or refs + if check_for_any_errors(issues): + error_handler.pop_error_context() + return issues sidecar_def_dict = sidecar.get_def_dict(hed_schema=self._schema, extra_def_dicts=extra_def_dicts) hed_validator = HedValidator(self._schema, def_dicts=sidecar_def_dict, run_full_onset_checks=False, definitions_allowed=True) - issues += self.validate_structure(sidecar, error_handler=error_handler) issues += sidecar._extract_definition_issues issues += sidecar_def_dict.issues definition_checks = {} - for hed_string, column_data, position in 
sidecar.hed_string_iter(error_handler): - hed_string_obj = HedString(hed_string, hed_schema=self._schema, def_dict=sidecar_def_dict) + for column_data in sidecar.column_data: + column_name = column_data.column_name + hed_strings = column_data.get_hed_strings() + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) + for key_name, hed_string in hed_strings.items(): + new_issues = [] + if len(hed_strings) > 1: + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + hed_string_obj = HedString(hed_string, hed_schema=self._schema, def_dict=sidecar_def_dict) + hed_string_obj.remove_refs() - error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) - new_issues = hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True) - new_issues += hed_validator.run_full_string_checks(hed_string_obj) + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) + new_issues += hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True) + new_issues += hed_validator.run_full_string_checks(hed_string_obj) - def_check_list = definition_checks.setdefault(column_data.column_name, []) - def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True, include_groups=0)) - # Might refine this later - for now just skip checking placeholder counts in definition columns. - if not def_check_list[-1]: - new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type) + def_check_list = definition_checks.setdefault(column_name, []) + def_check_list.append(hed_string_obj.find_tags({DefTagNames.DEFINITION_KEY}, recursive=True, + include_groups=0)) + # Might refine this later - for now just skip checking placeholder counts in definition columns. 
+ if not def_check_list[-1]: + new_issues += self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type) - error_handler.add_context_and_filter(new_issues) - issues += new_issues - error_handler.pop_error_context() - - for col_name, has_def in definition_checks.items(): - error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, col_name) - def_check = set(bool(d) for d in has_def) - if len(def_check) != 1: - flat_def_list = [d for defs in has_def for d in defs] - for d in flat_def_list: - issues += error_handler.format_error_with_context(DefinitionErrors.BAD_DEFINITION_LOCATION, d) + if len(hed_strings) > 1: + error_handler.pop_error_context() + error_handler.add_context_and_filter(new_issues) + issues += new_issues error_handler.pop_error_context() error_handler.pop_error_context() + issues += self._check_definitions_bad_spot(definition_checks, error_handler) issues = sort_issues(issues) + return issues def validate_structure(self, sidecar, error_handler): @@ -95,6 +108,74 @@ def validate_structure(self, sidecar, error_handler): error_handler.pop_error_context() return all_validation_issues + def _validate_refs(self, sidecar, error_handler): + possible_column_refs = sidecar.all_hed_columns + + issues = [] + found_column_references = {} + for column_data in sidecar.column_data: + column_name = column_data.column_name + hed_strings = column_data.get_hed_strings() + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) + matches = [] + for key_name, hed_string in hed_strings.items(): + new_issues = [] + if len(hed_strings) > 1: + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + + error_handler.push_error_context(ErrorContext.HED_STRING, HedString(hed_string)) + invalid_locations = self._find_non_matching_braces(hed_string) + for loc in invalid_locations: + bad_symbol = hed_string[loc] + new_issues += error_handler.format_error_with_context(ColumnErrors.MALFORMED_COLUMN_REF, + 
column_name, loc, bad_symbol) + + sub_matches = re.findall(r"\{([a-z_\-0-9]+)\}", hed_string, re.IGNORECASE) + matches.append(sub_matches) + for match in sub_matches: + if match not in possible_column_refs: + new_issues += error_handler.format_error_with_context(ColumnErrors.INVALID_COLUMN_REF, match) + + error_handler.pop_error_context() + if len(hed_strings) > 1: + error_handler.pop_error_context() + error_handler.add_context_and_filter(new_issues) + issues += new_issues + error_handler.pop_error_context() + references = [match for sublist in matches for match in sublist] + if references: + found_column_references[column_name] = references + if column_name in references: + issues += error_handler.format_error_with_context(ColumnErrors.SELF_COLUMN_REF, column_name) + + for column_name, refs in found_column_references.items(): + for ref in refs: + if ref in found_column_references and ref != column_name: + issues += error_handler.format_error_with_context(ColumnErrors.NESTED_COLUMN_REF, column_name, ref) + + return issues + + @staticmethod + def _find_non_matching_braces(hed_string): + issues = [] + open_brace_index = -1 + + for i, char in enumerate(hed_string): + if char == '{': + if open_brace_index >= 0: # Nested brace detected + issues.append(open_brace_index) + open_brace_index = i + elif char == '}': + if open_brace_index >= 0: + open_brace_index = -1 + else: + issues.append(i) + + if open_brace_index >= 0: + issues.append(open_brace_index) + + return issues + @staticmethod def _check_for_key(key, data): if isinstance(data, dict): @@ -127,7 +208,7 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler) val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN) return val_issues - column_type = Sidecar._detect_column_type(dict_for_entry=dict_for_entry) + column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry) if column_type is None: val_issues += 
error_handler.format_error_with_context(SidecarErrors.UNKNOWN_COLUMN_TYPE, column_name=column_name) @@ -181,3 +262,17 @@ def _validate_pound_sign_count(self, hed_string, column_type): return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#")) return [] + + def _check_definitions_bad_spot(self, definition_checks, error_handler): + issues = [] + # This could be simplified now + for col_name, has_def in definition_checks.items(): + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, col_name) + def_check = set(bool(d) for d in has_def) + if len(def_check) != 1: + flat_def_list = [d for defs in has_def for d in defs] + for d in flat_def_list: + issues += error_handler.format_error_with_context(DefinitionErrors.BAD_DEFINITION_LOCATION, d) + error_handler.pop_error_context() + + return issues diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index afa041327..36a94f032 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -1,12 +1,11 @@ import pandas as pd -import re from hed import BaseInput from hed.errors import ErrorHandler, ValidationErrors, ErrorContext from hed.errors.error_types import ColumnErrors from hed.models import ColumnType from hed import HedString from hed.models.hed_string_group import HedStringGroup -from hed.errors.error_reporter import sort_issues +from hed.errors.error_reporter import sort_issues, check_for_any_errors PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " @@ -45,7 +44,6 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): # Check the structure of the input data, if it's a BaseInput if isinstance(data, BaseInput): issues += self._validate_column_structure(data, error_handler) - issues += self._validate_square_brackets(data.assemble(skip_square_brackets=True), error_handler) data = data.dataframe_a # Check the rows of the input data @@ -78,7 +76,7 @@ def _run_checks(self, data, 
error_handler): error_handler.pop_error_context() issues += new_column_issues - if new_column_issues: + if check_for_any_errors(new_column_issues): continue else: row_string = HedStringGroup(row_strings) @@ -97,6 +95,7 @@ def _validate_column_structure(self, base_input, error_handler): Parameters: base_input (BaseInput): The input data to be validated. + error_handler (ErrorHandler): Holds context Returns: List of issues associated with each invalid value. Each issue is a dictionary. """ @@ -117,86 +116,11 @@ def _validate_column_structure(self, base_input, error_handler): error_handler.pop_error_context() error_handler.pop_error_context() - return issues - - @staticmethod - def _validate_column_refs(df, error_handler): - possible_column_references = [f"{column_name}" for column_name in df.columns if - isinstance(column_name, str) and column_name.lower() != "hed"] - - issues = [] - found_column_references = {} - for column_name in df: - matches = df[column_name].str.findall("\[([a-z_\-\s0-9]+)(? 127: + if character in invalid_dict or ord(character) > 127: validation_issues += self._report_invalid_character_error(hed_string, index) return validation_issues @@ -283,12 +291,12 @@ def check_tag_exists_in_schema(self, original_tag): index_in_tag_end=None) return validation_issues - def check_tag_unit_class_units_are_valid(self, original_tag, report_tag_as=None, error_code=None): + def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, error_code=None): """ Report incorrect unit class or units. Parameters: original_tag (HedTag): The original tag that is used to report the error. - report_tag_as (HedTag): Report errors as coming from this tag, rather than original_tag. + report_as (HedTag): Report errors as coming from this tag, rather than original_tag. error_code (str): Override error codes to this Returns: list: Validation issues. Each issue is a dictionary. 
@@ -306,24 +314,23 @@ def check_tag_unit_class_units_are_valid(self, original_tag, report_tag_as=None, if original_tag.is_takes_value_tag() and\ not self._validate_value_class_portion(original_tag, stripped_value): validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID, - report_tag_as if report_tag_as else original_tag) + report_as if report_as else original_tag) if error_code: had_error = True validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID, - report_tag_as if report_tag_as else original_tag, + report_as if report_as else original_tag, actual_error=error_code) - if bad_units: tag_unit_class_units = original_tag.get_tag_unit_class_units() if tag_unit_class_units: validation_issues += ErrorHandler.format_error(ValidationErrors.UNITS_INVALID, - tag=report_tag_as if report_tag_as else original_tag, + tag=report_as if report_as else original_tag, units=tag_unit_class_units) else: default_unit = original_tag.get_unit_class_default_unit() validation_issues += ErrorHandler.format_error(ValidationErrors.UNITS_MISSING, - tag=report_tag_as if report_tag_as else original_tag, + tag=report_as if report_as else original_tag, default_unit=default_unit) # We don't want to give this overall error twice @@ -334,12 +341,12 @@ def check_tag_unit_class_units_are_valid(self, original_tag, report_tag_as=None, return validation_issues - def check_tag_value_class_valid(self, original_tag, report_tag_as=None, error_code=None): + def check_tag_value_class_valid(self, original_tag, report_as=None, error_code=None): """ Report an invalid value portion. Parameters: original_tag (HedTag): The original tag that is used to report the error. - report_tag_as (HedTag): Report errors as coming from this tag, rather than original_tag. + report_as (HedTag): Report errors as coming from this tag, rather than original_tag. error_code (str): Override error codes to this Returns: list: Validation issues. 
@@ -347,7 +354,7 @@ def check_tag_value_class_valid(self, original_tag, report_tag_as=None, error_co validation_issues = [] if not self._validate_value_class_portion(original_tag, original_tag.extension): validation_issues += ErrorHandler.format_error(ValidationErrors.VALUE_INVALID, - report_tag_as if report_tag_as else original_tag, + report_as if report_as else original_tag, actual_error=error_code) return validation_issues diff --git a/spec_tests/hed-specification b/spec_tests/hed-specification index 86b9c6eb8..cc219769e 160000 --- a/spec_tests/hed-specification +++ b/spec_tests/hed-specification @@ -1 +1 @@ -Subproject commit 86b9c6eb842de9dc8c9e0c63586104d8da9dffab +Subproject commit cc219769e43b1882a31473cb3d96ea2054a5a60b diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index 891e69af3..80e3c0651 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -45,15 +45,16 @@ "UNITS_MISSING", "VALUE_INVALID", - - "SCHEMA_LIBRARY_INVALID" + "SIDECAR_BRACES_INVALID", + "SCHEMA_LIBRARY_INVALID", ] skip_tests = { "VERSION_DEPRECATED": "Not applicable", "onset-offset-error-duplicated-onset-or-offset": "TBD how we implement this", "tag-extension-invalid-bad-node-name": "Part of character invalid checking/didn't get to it yet", - "SIDECAR_BRACES_INVALID": "Not in yet as curly braces" + "inset-group-has-extras": "Inset tags not in yet", + "inset-outside-its-event": "Inset tags not in yet" } @@ -220,7 +221,6 @@ def _run_single_combo_test(self, info, schema, def_dict, error_code, description def _run_single_schema_test(self, info, error_code, description,name, error_handler): for result, tests in info.items(): for test in tests: - issues = [] schema_string = "\n".join(test) try: loaded_schema = from_string(schema_string, file_type=".mediawiki") diff --git a/tests/data/sidecar_tests/bad_refs_test2.json b/tests/data/sidecar_tests/bad_refs_test2.json new file mode 100644 index 000000000..4a847963f --- /dev/null +++ 
b/tests/data/sidecar_tests/bad_refs_test2.json @@ -0,0 +1,5 @@ +{ + "column3": { + "HED": "{column1}, {column2}, Time-interval/# s" + } +} \ No newline at end of file diff --git a/tests/data/sidecar_tests/basic_refs_test.json b/tests/data/sidecar_tests/basic_refs_test.json new file mode 100644 index 000000000..cd3011ac1 --- /dev/null +++ b/tests/data/sidecar_tests/basic_refs_test.json @@ -0,0 +1,28 @@ +{ + "trial_type": { + "LongName": "Event category", + "Description": "Indicator of type of action that is expected", + "Levels": { + "go": "A red square is displayed to indicate starting", + "stop": "A blue square is displayed to indicate stopping" + }, + "HED": { + "go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See", + "stop": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure" + } + }, + "response_time": { + "LongName": "Response time after stimulus", + "Description": "Time from stimulus presentation until subject presses button", + "Units": "ms", + "HED": "({stim_file}, Event), Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See, Time-value/# s, {trial_type}" + }, + "stim_file": { + "LongName": "Stimulus file name", + "Description": "Relative path of the stimulus image file", + "HED": "Time-value/# s" + }, + "other_file": { + "HED": "{stim_file}, Keyboard-key/#" + } +} \ No newline at end of file diff --git a/tests/data/sidecar_tests/long_tag_test.json b/tests/data/sidecar_tests/long_tag_test.json new file mode 100644 index 000000000..9fe583a21 --- /dev/null +++ b/tests/data/sidecar_tests/long_tag_test.json @@ -0,0 +1,25 @@ +{ + "trial_type": { + "LongName": "Event category", + "Description": "Indicator of type of action that is expected", + "Levels": { + "go": "A red square is displayed to indicate starting", + "stop": "A blue square is displayed to indicate stopping" + }, + "HED": { + "go": 
"Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See,{response_time}", + "stop": "{response_time},Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure" + } + }, + "response_time": { + "LongName": "Response time after stimulus", + "Description": "Time from stimulus presentation until subject presses button", + "Units": "ms", + "HED": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See,Property/Data-property/Data-value/Spatiotemporal-value/Temporal-value/Time-value/#" + }, + "stim_file": { + "LongName": "Stimulus file name", + "Description": "Relative path of the stimulus image file", + "HED": "Property/Data-property/Data-value/Spatiotemporal-value/Temporal-value/Time-value/#" + } +} \ No newline at end of file diff --git a/tests/data/sidecar_tests/malformed_refs_test.json b/tests/data/sidecar_tests/malformed_refs_test.json new file mode 100644 index 000000000..ccf9aae3e --- /dev/null +++ b/tests/data/sidecar_tests/malformed_refs_test.json @@ -0,0 +1,20 @@ +{ + "column1": { + "HED": { + "go": "Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See", + "stop": "{Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure}" + } + }, + "column2": { + "HED": "{column1}}, Time-interval/# s" + }, + "column3": { + "HED": "{column1, Time-interval/# s" + }, + "column4": { + "HED": "{{column1}, Time-interval/# s" + }, + "column5": { + "HED": "column1}, Time-interval/# s" + } +} \ No newline at end of file diff --git a/tests/data/sidecar_tests/short_tag_test.json b/tests/data/sidecar_tests/short_tag_test.json new file mode 100644 index 000000000..ab02702bc --- /dev/null +++ b/tests/data/sidecar_tests/short_tag_test.json @@ -0,0 +1,25 @@ +{ + "trial_type": { + "LongName": "Event category", + "Description": "Indicator of 
type of action that is expected", + "Levels": { + "go": "A red square is displayed to indicate starting", + "stop": "A blue square is displayed to indicate stopping" + }, + "HED": { + "go": "Azure,See,{response_time}", + "stop": "{response_time},Azure" + } + }, + "response_time": { + "LongName": "Response time after stimulus", + "Description": "Time from stimulus presentation until subject presses button", + "Units": "ms", + "HED": "Azure,See,Time-value/#" + }, + "stim_file": { + "LongName": "Stimulus file name", + "Description": "Relative path of the stimulus image file", + "HED": "Time-value/#" + } +} \ No newline at end of file diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index d8e0b4a62..bda6e1259 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -60,86 +60,46 @@ def test_gathered_defs(self): } self.assertEqual(defs, expected_defs) - # def test_missing_column_name_issue(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_column_name.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - # - # def test_expand_column_issues(self): - # schema_path = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_category_key.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - class TestInsertColumns(unittest.TestCase): def test_insert_columns_simple(self): df = pd.DataFrame({ - "column1": ["[column2], Event, Action"], + "column1": ["{column2}, Event, Action"], "column2": ["Item"] }) expected_df = pd.DataFrame({ "column1": ["Item, Event, Action"] }) - result = BaseInput._handle_square_brackets(df) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns) pd.testing.assert_frame_equal(result, expected_df) def test_insert_columns_multiple_rows(self): df = pd.DataFrame({ - "column1": ["[column2], Event, Action", "Event, Action"], + "column1": ["{column2}, Event, Action", "Event, Action"], "column2": ["Item", "Subject"] }) expected_df = pd.DataFrame({ "column1": ["Item, Event, Action", "Event, Action"] }) - result = BaseInput._handle_square_brackets(df) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns) pd.testing.assert_frame_equal(result, expected_df) def test_insert_columns_multiple_columns(self): df = pd.DataFrame({ - "column1": ["[column2], Event, [column3], Action"], + 
"column1": ["{column2}, Event, {column3}, Action"], "column2": ["Item"], "column3": ["Subject"] }) expected_df = pd.DataFrame({ "column1": ["Item, Event, Subject, Action"] }) - result = BaseInput._handle_square_brackets(df) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3"], column_names=df.columns) pd.testing.assert_frame_equal(result, expected_df) def test_insert_columns_four_columns(self): df = pd.DataFrame({ - "column1": ["[column2], Event, [column3], Action"], + "column1": ["{column2}, Event, {column3}, Action"], "column2": ["Item"], "column3": ["Subject"], "column4": ["Data"] @@ -148,7 +108,96 @@ def test_insert_columns_four_columns(self): "column1": ["Item, Event, Subject, Action"], "column4": ["Data"] }) - result = BaseInput._handle_square_brackets(df) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_nested_parentheses(self): + df = pd.DataFrame({ + "column1": ["({column2}, ({column3}, {column4})), Event, Action"], + "column2": ["Item"], + "column3": ["Subject"], + "column4": ["Data"] + }) + expected_df = pd.DataFrame({ + "column1": ["(Item, (Subject, Data)), Event, Action"] + }) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_nested_parentheses_na_values(self): + df = pd.DataFrame({ + "column1": ["({column2}, ({column3}, {column4})), Event, Action"], + "column2": ["Data"], + "column3": ["n/a"], + "column4": ["n/a"] + }) + expected_df = pd.DataFrame({ + "column1": ["(Data), Event, Action"] + }) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_nested_parentheses_na_values2(self): + df = 
pd.DataFrame({ + "column1": ["({column2}, ({column3}, {column4})), Event, Action"], + "column2": ["n/a"], + "column3": ["n/a"], + "column4": ["Data"] + }) + expected_df = pd.DataFrame({ + "column1": ["((Data)), Event, Action"] + }) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_nested_parentheses_mixed_na_values(self): + df = pd.DataFrame({ + "column1": ["({column2}, ({column3}, {column4})), Event, Action"], + "column2": ["n/a"], + "column3": ["Subject"], + "column4": ["n/a"] + }) + expected_df = pd.DataFrame({ + "column1": ["((Subject)), Event, Action"] + }) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_nested_parentheses_all_na_values(self): + df = pd.DataFrame({ + "column1": ["({column2}, ({column3}, {column4})), Event, Action"], + "column2": ["n/a"], + "column3": ["n/a"], + "column4": ["n/a"] + }) + expected_df = pd.DataFrame({ + "column1": ["Event, Action"] + }) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2", "column3", "column4"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_parentheses(self): + df = pd.DataFrame({ + "column1": ["({column2}), Event, Action"], + "column2": ["Item"] + }) + expected_df = pd.DataFrame({ + "column1": ["(Item), Event, Action"] + }) + result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_with_parentheses_na_values(self): + df = pd.DataFrame({ + "column1": ["({column2}), Event, Action"], + "column2": ["n/a"], + "column3": ["n/a"] + }) + expected_df = pd.DataFrame({ + "column1": ["Event, Action"], + "column3": ["n/a"] + }) + 
result = BaseInput._handle_curly_braces_refs(df, refs=["column2"], column_names=df.columns) pd.testing.assert_frame_equal(result, expected_df) @@ -209,43 +258,3 @@ def test_combine_dataframe_with_mixed_values(self): expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox', '', '']) self.assertTrue(result.equals(expected)) - -class TestColumnRefs(unittest.TestCase): - def test_simple_column_refs(self): - data1 = { - 'A': ['[col1], [col2]', 'tag1, tag2'], - 'B': ['tag3, tag4', '[col3]'], - } - df1 = pd.DataFrame(data1) - result1 = BaseInput._find_column_refs(df1, df1.columns) - expected1 = ['col1', 'col2', 'col3'] - self.assertEqual(result1, expected1) - - def test_mixed_cases_and_patterns(self): - data2 = { - 'A': ['[Col1], [col2]', 'tag1, [Col3]', 'tag3, [COL4]', '[col5], [col6]'], - } - df2 = pd.DataFrame(data2) - result2 = BaseInput._find_column_refs(df2, df2.columns) - expected2 = ['Col1', 'col2', 'Col3', 'COL4', 'col5', 'col6'] - self.assertEqual(result2, expected2) - - def test_no_column_references(self): - data3 = { - 'A': ['tag1, tag2', 'tag3, tag4'], - 'B': ['tag5, tag6', 'tag7, tag8'], - } - df3 = pd.DataFrame(data3) - result3 = BaseInput._find_column_refs(df3, df3.columns) - expected3 = [] - self.assertEqual(result3, expected3) - - def test_incomplete_square_brackets(self): - data4 = { - 'A': ['[col1, [col2]', 'tag1, [Col3'], - 'B': ['tag3, [COL4', '[col5, col6]'], - } - df4 = pd.DataFrame(data4) - result4 = BaseInput._find_column_refs(df4, df4.columns) - expected4 = ['col2'] - self.assertEqual(result4, expected4) \ No newline at end of file diff --git a/tests/models/test_hed_string.py b/tests/models/test_hed_string.py index af17878bb..46f7c750c 100644 --- a/tests/models/test_hed_string.py +++ b/tests/models/test_hed_string.py @@ -69,13 +69,20 @@ def test_group_tags(self): hed_string = '/Action/Reach/To touch,(/Attribute/Object side/Left,/Participant/Effect/Body part/Arm),' \ '/Attribute/Location/Screen/Top/70 
px,/Attribute/Location/Screen/Left/23 px ' string_obj = HedString(hed_string) - # result = HedString.split_into_groups(hed_string) tags_as_strings = [str(tag) for tag in string_obj.children] self.assertCountEqual(tags_as_strings, ['/Action/Reach/To touch', '(/Attribute/Object side/Left,/Participant/Effect/Body part/Arm)', '/Attribute/Location/Screen/Top/70 px', '/Attribute/Location/Screen/Left/23 px']) + def test_square_brackets_in_string(self): + # just verifying this parses, square brackets do not validate + hed_string = '[test_ref], Event/Sensory-event, Participant, ([test_ref2], Event)' + string_obj = HedString(hed_string) + tags_as_strings = [str(tag) for tag in string_obj.children] + self.assertCountEqual(tags_as_strings, + ['[test_ref]', 'Event/Sensory-event', 'Participant', '([test_ref2],Event)']) + # Potentially restore some similar behavior later if desired. # We no longer automatically remove things like quotes. # def test_double_quotes(self): diff --git a/tests/models/test_sidecar.py b/tests/models/test_sidecar.py index caec94043..4fdacb31f 100644 --- a/tests/models/test_sidecar.py +++ b/tests/models/test_sidecar.py @@ -82,10 +82,10 @@ def test__iter__(self): def test_validate_column_group(self): validation_issues = self.errors_sidecar.validate(self.hed_schema) - self.assertEqual(len(validation_issues), 23) + self.assertEqual(len(validation_issues), 5) validation_issues2 = self.errors_sidecar_minor.validate(self.hed_schema) - self.assertEqual(len(validation_issues2), 19) + self.assertEqual(len(validation_issues2), 1) validation_issues = self.json_without_definitions_sidecar.validate(self.hed_schema) self.assertEqual(len(validation_issues), 7) @@ -113,8 +113,8 @@ def test_save_load(self): reloaded_sidecar = Sidecar(save_filename) - for str1, str2 in zip(sidecar.hed_string_iter(), reloaded_sidecar.hed_string_iter()): - self.assertEqual(str1[0], str2[0]) + for data1, data2 in zip(sidecar, reloaded_sidecar): + self.assertEqual(data1.source_dict, 
data2.source_dict) def test_save_load2(self): sidecar = Sidecar(self.json_def_filename) @@ -122,8 +122,8 @@ def test_save_load2(self): reloaded_sidecar = Sidecar(io.StringIO(json_string)) - for str1, str2 in zip(sidecar.hed_string_iter(), reloaded_sidecar.hed_string_iter()): - self.assertEqual(str1[0], str2[0]) + for data1, data2 in zip(sidecar, reloaded_sidecar): + self.assertEqual(data1.source_dict, data2.source_dict) def test_merged_sidecar(self): base_folder = self.base_data_dir + "sidecar_tests/" @@ -136,6 +136,26 @@ def test_merged_sidecar(self): self.assertEqual(sidecar.loaded_dict, sidecar2.loaded_dict) + def test_set_hed_strings(self): + from hed.models import df_util + sidecar = Sidecar(os.path.join(self.base_data_dir, "sidecar_tests/short_tag_test.json")) + + for column_data in sidecar: + hed_strings = column_data.get_hed_strings() + hed_strings = df_util.convert_to_form(hed_strings, self.hed_schema, "long_tag") + column_data.set_hed_strings(hed_strings) + sidecar_long = Sidecar(os.path.join(self.base_data_dir, "sidecar_tests/long_tag_test.json")) + self.assertEqual(sidecar.loaded_dict, sidecar_long.loaded_dict) + + sidecar = Sidecar(os.path.join(self.base_data_dir, "sidecar_tests/long_tag_test.json")) + + for column_data in sidecar: + hed_strings = column_data.get_hed_strings() + hed_strings = df_util.convert_to_form(hed_strings, self.hed_schema, "short_tag") + column_data.set_hed_strings(hed_strings) + sidecar_short = Sidecar(os.path.join(self.base_data_dir, "sidecar_tests/short_tag_test.json")) + self.assertEqual(sidecar.loaded_dict, sidecar_short.loaded_dict) + if __name__ == '__main__': unittest.main() diff --git a/tests/validator/test_sidecar_validator.py b/tests/validator/test_sidecar_validator.py new file mode 100644 index 000000000..84ae8a2f0 --- /dev/null +++ b/tests/validator/test_sidecar_validator.py @@ -0,0 +1,66 @@ +import unittest +import os +import io +import shutil + +from hed.errors import HedFileError, ValidationErrors +from hed.models 
import unittest
import os

from hed.models import Sidecar
from hed import schema
from hed.validator.sidecar_validator import SidecarValidator


class Test(unittest.TestCase):
    """Tests for SidecarValidator column-reference validation and brace matching."""

    @classmethod
    def setUpClass(cls):
        # Shared test-data directory, resolved relative to this file.
        base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/')
        cls.base_data_dir = base_data_dir
        hed_xml_file = os.path.join(base_data_dir, "schema_tests/HED8.0.0t.xml")
        cls.hed_schema = schema.load_schema(hed_xml_file)
        cls._refs_json_filename = os.path.join(base_data_dir, "sidecar_tests/basic_refs_test.json")
        cls._bad_refs_json_filename = os.path.join(base_data_dir, "sidecar_tests/bad_refs_test2.json")
        cls._malformed_refs_json_filename = os.path.join(base_data_dir, "sidecar_tests/malformed_refs_test.json")

    def test_basic_refs(self):
        # A sidecar with well-formed column references validates cleanly.
        sidecar = Sidecar(self._refs_json_filename)
        issues = sidecar.validate(self.hed_schema)
        self.assertEqual(len(issues), 0)
        # Both referenced columns should be reported by get_column_refs().
        refs = sidecar.get_column_refs()
        self.assertEqual(len(refs), 2)

    def test_bad_refs(self):
        # References to unknown columns are flagged: one issue per bad reference.
        issues = Sidecar(self._bad_refs_json_filename).validate(self.hed_schema)
        self.assertEqual(len(issues), 2)

    def test_malformed_refs(self):
        # Syntactically broken column references each produce an issue.
        issues = Sidecar(self._malformed_refs_json_filename).validate(self.hed_schema)
        self.assertEqual(len(issues), 4)

    def test_malformed_braces(self):
        # Each case pairs a HED string with the expected number of unmatched/misnested braces.
        cases = [
            ("column2}, Event, Action", 1),
            ("{column, Event, Action", 1),
            ("This is a {malformed {input string}} with extra {opening brackets", 3),
            ("{Event{Action}}", 2),
            ("Event, Action}", 1),
        ]
        for hed_string, expected_count in cases:
            issues = SidecarValidator._find_non_matching_braces(hed_string)
            self.assertEqual(len(issues), expected_count)
ef43f9bf3..e32696a30 100644 --- a/tests/validator/test_spreadsheet_validator.py +++ b/tests/validator/test_spreadsheet_validator.py @@ -13,45 +13,3 @@ def setUpClass(cls): cls.schema = load_schema_version("8.1.0") cls.validator = SpreadsheetValidator(cls.schema) - def test_insert_columns_no_nested_or_circular_reference(self): - df = pd.DataFrame({ - "column1": ["[column2], Event, Action"], - "column2": ["[column1], Item"] - }) - issues = self.validator._validate_square_brackets(df, error_handler=ErrorHandler(True)) - self.assertEqual(issues[0]['code'], ColumnErrors.NESTED_COLUMN_REF) - - def test_insert_columns_invalid_column_name(self): - df = pd.DataFrame({ - "column1": ["[invalid_column], Event, Action"], - "column2": ["Item"] - }) - issues = self.validator._validate_square_brackets(df, error_handler=ErrorHandler(True)) - self.assertEqual(issues[0]['code'], ColumnErrors.INVALID_COLUMN_REF) - - def test_insert_columns_invalid_syntax(self): - df = pd.DataFrame({ - "column1": ["column2], Event, Action"], - "column2": ["Item"] - }) - issues = self.validator._validate_square_brackets(df, error_handler=ErrorHandler(True)) - self.assertEqual(issues[0]['code'], ColumnErrors.MALFORMED_COLUMN_REF) - - def test_insert_columns_invalid_syntax2(self): - df = pd.DataFrame({ - "column1": ["column2], Event, Action", "[column, Event, Action"], - "column2": ["Item", "Action"], - "column3": ["This is a [malformed [input string]] with extra [opening brackets", "[Event[Action]]"], - }) - issues = self.validator._validate_square_brackets(df, error_handler=ErrorHandler(True)) - issues = sort_issues(issues) - self.assertEqual(issues[0]['code'], ColumnErrors.MALFORMED_COLUMN_REF) - self.assertEqual(len(issues), 6) - - def test_insert_columns_no_self_reference(self): - df = pd.DataFrame({ - "column1": ["[column1], Event, Action"], - "column2": ["Item"] - }) - issues = self.validator._validate_square_brackets(df, error_handler=ErrorHandler(True)) - self.assertEqual(issues[0]['code'], 
ColumnErrors.SELF_COLUMN_REF)