From fbb8fd81d1332d16f5da7bed61d89318b9667f6c Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 15 Mar 2023 17:46:41 -0500 Subject: [PATCH 1/2] First pass refactor of models --- hed/__init__.py | 3 +- hed/errors/error_messages.py | 142 ++--- hed/errors/error_reporter.py | 49 +- hed/errors/error_types.py | 5 +- hed/errors/exceptions.py | 2 + hed/models/__init__.py | 3 - hed/models/base_input.py | 509 ++++++------------ hed/models/column_mapper.py | 221 +++----- hed/models/column_metadata.py | 107 +--- hed/models/def_mapper.py | 255 --------- hed/models/definition_dict.py | 154 +++++- hed/models/df_util.py | 125 +++++ hed/models/expression_parser.py | 4 +- hed/models/hed_group.py | 18 +- hed/models/hed_ops.py | 262 --------- hed/models/hed_string.py | 110 ++-- hed/models/hed_tag.py | 143 +++-- hed/models/sidecar.py | 254 ++++++--- hed/models/sidecar_base.py | 269 --------- hed/models/spreadsheet_input.py | 12 +- hed/models/tabular_input.py | 62 +-- hed/models/timeseries_input.py | 2 +- hed/schema/schema_compliance.py | 2 +- hed/validator/__init__.py | 4 + hed/validator/def_validator.py | 78 +++ hed/validator/hed_validator.py | 119 ++-- .../onset_validator.py} | 46 +- hed/validator/sidecar_validator.py | 147 +++++ hed/validator/spreadsheet_validator.py | 114 ++++ hed/validator/tag_validator.py | 100 ++-- spec_tests/test_errors.py | 182 ++++--- tests/data/model_tests/na_tag_column.tsv | 2 + tests/data/model_tests/na_value_column.json | 5 + tests/data/model_tests/na_value_column.tsv | 3 + .../no_column_header_definition.tsv | 4 +- .../no_column_header_definition_long.tsv | 4 +- .../data/validator_tests/bids_events_HED.json | 3 +- tests/models/test_base_file_input.py | 19 +- tests/models/test_column_mapper.py | 90 +--- tests/models/test_def_mapper.py | 292 ---------- tests/models/test_definition_dict.py | 36 +- tests/models/test_expression_parser.py | 11 + tests/models/test_hed_string.py | 27 + tests/models/test_hed_tag.py | 28 +- tests/models/test_sidecar.py | 38 +- 
tests/models/test_spreadsheet_input.py | 92 +--- tests/models/test_tabular_input.py | 55 +- tests/schema/test_convert_tags.py | 2 +- tests/validator/test_def_validator.py | 119 ++++ tests/validator/test_hed_validator.py | 92 +--- .../test_onset_validator.py} | 227 +++----- tests/validator/test_tag_validator.py | 48 +- tests/validator/test_tag_validator_base.py | 29 +- tests/validator/test_tag_validator_library.py | 33 +- 54 files changed, 1920 insertions(+), 2842 deletions(-) delete mode 100644 hed/models/def_mapper.py create mode 100644 hed/models/df_util.py delete mode 100644 hed/models/hed_ops.py delete mode 100644 hed/models/sidecar_base.py create mode 100644 hed/validator/def_validator.py rename hed/{models/onset_mapper.py => validator/onset_validator.py} (76%) create mode 100644 hed/validator/sidecar_validator.py create mode 100644 hed/validator/spreadsheet_validator.py create mode 100644 tests/data/model_tests/na_tag_column.tsv create mode 100644 tests/data/model_tests/na_value_column.json create mode 100644 tests/data/model_tests/na_value_column.tsv delete mode 100644 tests/models/test_def_mapper.py create mode 100644 tests/validator/test_def_validator.py rename tests/{models/test_onset_mapper.py => validator/test_onset_validator.py} (57%) diff --git a/hed/__init__.py b/hed/__init__.py index 40faff8ab..e2bdcd053 100644 --- a/hed/__init__.py +++ b/hed/__init__.py @@ -7,12 +7,13 @@ from hed.models.spreadsheet_input import SpreadsheetInput from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar +from hed.models.definition_dict import DefinitionDict + from hed.schema.hed_schema import HedSchema from hed.schema.hed_schema_group import HedSchemaGroup from hed.schema.hed_schema_io import get_schema, get_schema_versions, load_schema, load_schema_version -from hed.validator.hed_validator import HedValidator # from hed import errors, models, schema, tools, validator diff --git a/hed/errors/error_messages.py 
b/hed/errors/error_messages.py index 2d3647d9a..9ae9557f3 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -12,327 +12,333 @@ @hed_tag_error(ValidationErrors.HED_UNITS_INVALID) def val_error_invalid_unit(tag, units): units_string = ','.join(sorted(units)) - return f'Invalid unit - "{tag}" valid units are "{units_string}"', { - "units": sorted(units) - } + return f'Invalid unit - "{tag}" valid units are "{units_string}"' @hed_error(ValidationErrors.HED_TAG_EMPTY) def val_error_extra_comma(source_string, char_index): character = source_string[char_index] - return f"HED tags cannot be empty. Extra delimiter found: '{character}' at index {char_index}'", { - 'char_index': char_index - } + return f"HED tags cannot be empty. Extra delimiter found: '{character}' at index {char_index}'" @hed_tag_error(ValidationErrors.HED_GROUP_EMPTY, actual_code=ValidationErrors.HED_TAG_EMPTY) def val_error_empty_group(tag): - return f"HED tags cannot be empty. Extra delimiters found: '{tag}'", {} + return f"HED tags cannot be empty. Extra delimiters found: '{tag}'" @hed_tag_error(ValidationErrors.HED_TAG_EXTENDED, has_sub_tag=True, default_severity=ErrorSeverity.WARNING) def val_error_tag_extended(tag, problem_tag): - return f"Hed tag is extended. '{problem_tag}' in {tag}", {} + return f"Hed tag is extended. 
'{problem_tag}' in {tag}" @hed_error(ValidationErrors.HED_CHARACTER_INVALID) def val_error_invalid_char(source_string, char_index): character = source_string[char_index] - return f'Invalid character "{character}" at index {char_index}"', { - 'char_index': char_index - } + return f'Invalid character "{character}" at index {char_index}"' @hed_tag_error(ValidationErrors.INVALID_TAG_CHARACTER, has_sub_tag=True, actual_code=ValidationErrors.HED_CHARACTER_INVALID) def val_error_invalid_tag_character(tag, problem_tag): - return f"Invalid character '{problem_tag}' in {tag}", {} + return f"Invalid character '{problem_tag}' in {tag}" @hed_error(ValidationErrors.HED_TILDES_UNSUPPORTED) def val_error_tildes_not_supported(source_string, char_index): character = source_string[char_index] - return f"Tildes not supported. Replace (a ~ b ~ c) with (a, (b, c)). '{character}' at index {char_index}'", { - 'char_index': char_index - } + return f"Tildes not supported. Replace (a ~ b ~ c) with (a, (b, c)). '{character}' at index {char_index}'" @hed_error(ValidationErrors.HED_COMMA_MISSING) def val_error_comma_missing(tag): - return f"Comma missing after - '{tag}'", {} + return f"Comma missing after - '{tag}'" @hed_tag_error(ValidationErrors.HED_TAG_REPEATED) def val_error_duplicate_tag(tag): - return f'Repeated tag - "{tag}"', {} + return f'Repeated tag - "{tag}"' @hed_error(ValidationErrors.HED_TAG_REPEATED_GROUP) def val_error_duplicate_group(group): - return f'Repeated group - "{group}"', {} + return f'Repeated group - "{group}"' @hed_error(ValidationErrors.HED_PARENTHESES_MISMATCH) def val_error_parentheses(opening_parentheses_count, closing_parentheses_count): return f'Number of opening and closing parentheses are unequal. '\ f'{opening_parentheses_count} opening parentheses. 
{closing_parentheses_count} '\ - 'closing parentheses', {} + 'closing parentheses' @hed_tag_error(ValidationErrors.HED_TAG_REQUIRES_CHILD) def val_error_require_child(tag): - return f"Descendant tag required - '{tag}'", {} + return f"Descendant tag required - '{tag}'" @hed_error(ValidationErrors.HED_TAG_NOT_UNIQUE) def val_error_multiple_unique(tag_prefix): - return f"Multiple unique tags with prefix - '{tag_prefix}'", {} + return f"Multiple unique tags with prefix - '{tag_prefix}'" + + +@hed_tag_error(ValidationErrors.TAG_PREFIX_INVALID) +def val_error_prefix_invalid(tag, tag_prefix): + return f"Prefixes can only contain alpha characters. - '{tag_prefix}'" @hed_tag_error(ValidationErrors.INVALID_EXTENSION, actual_code=ValidationErrors.HED_TAG_INVALID) def val_error_invalid_extension(tag): - return f'Invalid extension on tag - "{tag}"', {} + return f'Invalid extension on tag - "{tag}"' @hed_tag_error(ValidationErrors.INVALID_PARENT_NODE, has_sub_tag=True, actual_code=ValidationErrors.HED_TAG_INVALID) def val_error_invalid_parent(tag, problem_tag, expected_parent_tag): return f"In '{tag}', '{problem_tag}' appears as '{str(expected_parent_tag)}' and cannot be used " \ - f"as an extension.", {"expected_parent_tag": expected_parent_tag} + f"as an extension." @hed_tag_error(ValidationErrors.NO_VALID_TAG_FOUND, has_sub_tag=True, actual_code=ValidationErrors.HED_TAG_INVALID) def val_error_no_valid_tag(tag, problem_tag): - return f"'{problem_tag}' in {tag} is not a valid base hed tag.", {} + return f"'{problem_tag}' in {tag} is not a valid base hed tag." @hed_tag_error(ValidationErrors.HED_VALUE_INVALID) def val_error_no_value(tag): - return f"''{tag}' has an invalid value portion.", {} + return f"''{tag}' has an invalid value portion." 
@hed_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_missing_column(column_name): - return f"Required column '{column_name}' not specified or found in file.", {} + return f"Required column '{column_name}' not specified or found in file." @hed_error(ValidationErrors.HED_UNKNOWN_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_extra_column(column_name): return f"Column named '{column_name}' found in file, but not specified as a tag column " + \ - "or identified in sidecars.", {} + "or identified in sidecars." @hed_error(ValidationErrors.HED_BLANK_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_hed_blank_column(column_number): - return f"Column number {column_number} has no column name", {} + return f"Column number {column_number} has no column name" @hed_error(ValidationErrors.HED_DUPLICATE_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_hed_duplicate_column(column_name): - return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged.", {} + return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged." @hed_tag_error(ValidationErrors.HED_LIBRARY_UNMATCHED) def val_error_unknown_prefix(tag, unknown_prefix, known_prefixes): - return f"Tag '{tag} has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}", {} + return f"Tag '{tag} has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}" @hed_tag_error(ValidationErrors.HED_NODE_NAME_EMPTY, has_sub_tag=True) def val_error_extra_slashes_spaces(tag, problem_tag): - return f"Extra slashes or spaces '{problem_tag}' in tag '{tag}'", {} + return f"Extra slashes or spaces '{problem_tag}' in tag '{tag}'" @hed_error(ValidationErrors.HED_SIDECAR_KEY_MISSING, default_severity=ErrorSeverity.WARNING) def val_error_sidecar_key_missing(invalid_key, category_keys): - return f"Category key '{invalid_key}' does not exist in column. 
Valid keys are: {category_keys}", {} + return f"Category key '{invalid_key}' does not exist in column. Valid keys are: {category_keys}" @hed_tag_error(ValidationErrors.HED_DEF_UNMATCHED) def val_error_def_unmatched(tag): - return f"A data-recording’s Def tag cannot be matched to definition. Tag: '{tag}'", {} + return f"A data-recording’s Def tag cannot be matched to definition. Tag: '{tag}'" @hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID) def val_error_bad_def_expand(tag, actual_def, found_def): return f"A data-recording’s Def-expand tag does not match the given definition." + \ - f"Tag: '{tag}'. Actual Def: {actual_def}. Found Def: {found_def}", {} + f"Tag: '{tag}'. Actual Def: {actual_def}. Found Def: {found_def}" @hed_tag_error(ValidationErrors.HED_DEF_VALUE_MISSING, actual_code=ValidationErrors.HED_DEF_VALUE_INVALID) def val_error_def_value_missing(tag): - return f"A def tag requires a placeholder value, but was not given one. Definition: '{tag}'", {} + return f"A def tag requires a placeholder value, but was not given one. Definition: '{tag}'" @hed_tag_error(ValidationErrors.HED_DEF_VALUE_EXTRA, actual_code=ValidationErrors.HED_DEF_VALUE_INVALID) def val_error_def_value_extra(tag): - return f"A def tag does not take a placeholder value, but was given one. Definition: '{tag}", {} + return f"A def tag does not take a placeholder value, but was given one. Definition: '{tag}" @hed_tag_error(ValidationErrors.HED_TOP_LEVEL_TAG, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_top_level_tag(tag): - return f"A tag that must be in a top level group was found in another location. {str(tag)}", {} + return f"A tag that must be in a top level group was found in another location. {str(tag)}" @hed_tag_error(ValidationErrors.HED_TAG_GROUP_TAG, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_tag_group_tag(tag): - return f"A tag that must be in a group was found in another location. 
{str(tag)}", {} + return f"A tag that must be in a group was found in another location. {str(tag)}" @hed_tag_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_top_level_tags(tag, multiple_tags): tags_as_string = [str(tag) for tag in multiple_tags] return f"Multiple top level tags found in a single group. First one found: {str(tag)}. " + \ - f"Remainder:{str(tags_as_string)}", {} + f"Remainder:{str(tags_as_string)}" @hed_error(ValidationErrors.HED_REQUIRED_TAG_MISSING) def val_warning_required_prefix_missing(tag_prefix): - return f"Tag with prefix '{tag_prefix}' is required", {} + return f"Tag with prefix '{tag_prefix}' is required" @hed_tag_error(ValidationErrors.HED_STYLE_WARNING, default_severity=ErrorSeverity.WARNING) def val_warning_capitalization(tag): - return f"First word not capitalized or camel case - '{tag}'", {} + return f"First word not capitalized or camel case - '{tag}'" @hed_tag_error(ValidationErrors.HED_UNITS_DEFAULT_USED, default_severity=ErrorSeverity.WARNING) def val_warning_default_units_used(tag, default_unit): - return f"No unit specified. Using '{default_unit}' as the default - '{tag}'", {} + return f"No unit specified. Using '{default_unit}' as the default - '{tag}'" @hed_error(SchemaErrors.HED_SCHEMA_DUPLICATE_NODE) def schema_error_hed_duplicate_node(tag, duplicate_tag_list, section): tag_join_delimiter = "\n\t" return f"Duplicate term '{str(tag)}' used {len(duplicate_tag_list)} places in '{section}' section schema as:" + \ - f"{tag_join_delimiter}{tag_join_delimiter.join(duplicate_tag_list)}", {} + f"{tag_join_delimiter}{tag_join_delimiter.join(duplicate_tag_list)}" @hed_error(SchemaErrors.HED_SCHEMA_ATTRIBUTE_INVALID) def schema_error_unknown_attribute(attribute_name, source_tag): return f"Attribute '{attribute_name}' used by '{source_tag}' was not defined in the schema, " \ - f"or was used outside of it's defined class.", {} + f"or was used outside of it's defined class." 
@hed_error(SchemaWarnings.INVALID_CHARACTERS_IN_DESC, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.HED_SCHEMA_CHARACTER_INVALID) def schema_warning_invalid_chars_desc(desc_string, tag_name, problem_char, char_index): - return f"Invalid character '{problem_char}' in desc for '{tag_name}' at position {char_index}. '{desc_string}", {} + return f"Invalid character '{problem_char}' in desc for '{tag_name}' at position {char_index}. '{desc_string}" @hed_error(SchemaWarnings.INVALID_CHARACTERS_IN_TAG, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.HED_SCHEMA_CHARACTER_INVALID) def schema_warning_invalid_chars_tag(tag_name, problem_char, char_index): - return f"Invalid character '{problem_char}' in tag '{tag_name}' at position {char_index}.", {} + return f"Invalid character '{problem_char}' in tag '{tag_name}' at position {char_index}." @hed_error(SchemaWarnings.INVALID_CAPITALIZATION, default_severity=ErrorSeverity.WARNING) def schema_warning_invalid_capitalization(tag_name, problem_char, char_index): return "First character must be a capital letter or number. " + \ - f"Found character '{problem_char}' in tag '{tag_name}' at position {char_index}.", \ - {'problem_char': problem_char} + f"Found character '{problem_char}' in tag '{tag_name}' at position {char_index}." @hed_error(SchemaWarnings.NON_PLACEHOLDER_HAS_CLASS, default_severity=ErrorSeverity.WARNING) def schema_warning_non_placeholder_class(tag_name, invalid_attribute_name): return "Only placeholder nodes('#') can have a unit or value class." + \ - f"Found {invalid_attribute_name} on {tag_name}", {} + f"Found {invalid_attribute_name} on {tag_name}" @hed_error(SidecarErrors.BLANK_HED_STRING) def sidecar_error_blank_hed_string(): - return "No HED string found for Value or Category column.", {} + return "No HED string found for Value or Category column." 
@hed_error(SidecarErrors.WRONG_HED_DATA_TYPE) def sidecar_error_hed_data_type(expected_type, given_type): - return f"Invalid HED string datatype sidecar. Should be '{expected_type}', but got '{given_type}'", {} + return f"Invalid HED string datatype sidecar. Should be '{expected_type}', but got '{given_type}'" @hed_error(SidecarErrors.INVALID_POUND_SIGNS_VALUE, actual_code=ValidationErrors.HED_PLACEHOLDER_INVALID) def sidecar_error_invalid_pound_sign_count(pound_sign_count): - return f"There should be exactly one # character in a sidecar string. Found {pound_sign_count}", {} + return f"There should be exactly one # character in a sidecar string. Found {pound_sign_count}" @hed_error(SidecarErrors.INVALID_POUND_SIGNS_CATEGORY, actual_code=ValidationErrors.HED_PLACEHOLDER_INVALID) def sidecar_error_too_many_pound_signs(pound_sign_count): - return f"There should be no # characters in a category sidecar string. Found {pound_sign_count}", {} + return f"There should be no # characters in a category sidecar string. Found {pound_sign_count}" @hed_error(SidecarErrors.UNKNOWN_COLUMN_TYPE) def sidecar_error_unknown_column(column_name): return f"Could not automatically identify column '{column_name}' type from file. "\ - "Most likely the column definition in question needs a # sign to replace a number somewhere.", {} + "Most likely the column definition in question needs a # sign to replace a number somewhere." + + +@hed_error(SidecarErrors.SIDECAR_HED_USED, actual_code=SidecarErrors.SIDECAR_INVALID) +def sidecar_hed_used(): + return "'HED' is a reserved name and cannot be used as a sidecar column name" + + +@hed_error(SidecarErrors.SIDECAR_NA_USED, actual_code=SidecarErrors.SIDECAR_INVALID) +def sidecar_na_used(column_name): + return f"Invalid category key 'n/a' found in column {column_name}." 
@hed_tag_error(DefinitionErrors.DEF_TAG_IN_DEFINITION, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_def_tag_in_definition(tag, def_name): return f"Invalid tag {tag} found in definition for {def_name}. " +\ - f"Def and Def-expand tags cannot be in definitions.", {} + f"Def and Def-expand tags cannot be in definitions." @hed_error(DefinitionErrors.WRONG_NUMBER_GROUP_TAGS, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_wrong_group_tags(def_name, tag_list): tag_list_strings = [str(tag) for tag in tag_list] - return f"Too many group tags found in definition for {def_name}. Expected 1, found: {tag_list_strings}", {} + return f"Too many group tags found in definition for {def_name}. Expected 1, found: {tag_list_strings}" @hed_error(DefinitionErrors.WRONG_NUMBER_PLACEHOLDER_TAGS, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_wrong_placeholder_count(def_name, expected_count, tag_list): tag_list_strings = [str(tag) for tag in tag_list] return f"Incorrect number placeholder tags found in definition for {def_name}. " + \ - f"Expected {expected_count}, found: {tag_list_strings}", {} + f"Expected {expected_count}, found: {tag_list_strings}" @hed_error(DefinitionErrors.DUPLICATE_DEFINITION, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_duplicate_definition(def_name): - return f"Duplicate definition found for '{def_name}'.", {} + return f"Duplicate definition found for '{def_name}'." @hed_error(DefinitionErrors.TAG_IN_SCHEMA, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_tag_already_in_schema(def_name): - return f"Term '{def_name}' already used as term in schema and cannot be re-used as a definition.", {} + return f"Term '{def_name}' already used as term in schema and cannot be re-used as a definition." 
@hed_error(DefinitionErrors.INVALID_DEFINITION_EXTENSION, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_invalid_def_extension(def_name): - return f"Term '{def_name}' has an invalid extension. Definitions can only have one term.", {} + return f"Term '{def_name}' has an invalid extension. Definitions can only have one term." @hed_tag_error(OnsetErrors.ONSET_DEF_UNMATCHED, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_error_def_unmatched(tag): - return f"The def tag in an onset/offset tag is unmatched. Def tag: '{tag}'", {} + return f"The def tag in an onset/offset tag is unmatched. Def tag: '{tag}'" @hed_tag_error(OnsetErrors.OFFSET_BEFORE_ONSET, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_error_offset_before_onset(tag): - return f"Offset tag '{tag}' does not have a matching onset.", {} + return f"Offset tag '{tag}' does not have a matching onset." @hed_tag_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_no_def_found(tag): - return f"'{tag}' tag has no def or def-expand tag in string.", {} + return f"'{tag}' tag has no def or def-expand tag in string." @hed_tag_error(OnsetErrors.ONSET_TOO_MANY_DEFS, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_too_many_defs(tag, tag_list): tag_list_strings = [str(tag) for tag in tag_list] - return f"Too many def tags found in onset for {tag}. Expected 1, also found: {tag_list_strings}", {} + return f"Too many def tags found in onset for {tag}. Expected 1, also found: {tag_list_strings}" @hed_tag_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_too_many_groups(tag, tag_list): tag_list_strings = [str(a_tag) for a_tag in tag_list] return f"An onset tag should have at most 2 sibling nodes, an offset tag should have 1. 
" +\ - f"Found {len(tag_list_strings)}: {tag_list_strings}", {} + f"Found {len(tag_list_strings)}: {tag_list_strings}" @hed_tag_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_wrong_type_tag(tag, def_tag): return f"Onset def tag '{def_tag}' has an improper sibling tag '{tag}'. All onset context tags must be " + \ - f"in a single group together.", {} + f"in a single group together." @hed_tag_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_wrong_placeholder(tag, has_placeholder): if has_placeholder: - return f"Onset/offset def tag {tag} expects a placeholder value, but does not have one.", {} - return f"Onset/offset def tag {tag} should not have a placeholder, but has one.", {} + return f"Onset/offset def tag {tag} expects a placeholder value, but does not have one." + return f"Onset/offset def tag {tag} should not have a placeholder, but has one." diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py index 8f8b1e368..4a7fd91a9 100644 --- a/hed/errors/error_reporter.py +++ b/hed/errors/error_reporter.py @@ -43,8 +43,8 @@ def wrapper(*args, severity=default_severity, **kwargs): Returns: list: A list of dict with the errors.= """ - base_message, error_vars = func(*args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, **error_vars) + base_message = func(*args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity) return error_object _register_error_function(error_type, wrapper_func=wrapper) @@ -97,8 +97,8 @@ def wrapper(tag, index_in_tag, index_in_tag_end, *args, severity=default_severit except AttributeError: org_tag_text = str(tag) - base_message, error_vars = func(org_tag_text, problem_sub_tag, *args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, **error_vars, + base_message = 
func(org_tag_text, problem_sub_tag, *args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, index_in_tag=index_in_tag, index_in_tag_end=index_in_tag_end, source_tag=tag) @@ -129,8 +129,8 @@ def wrapper(tag, *args, severity=default_severity, **kwargs): org_tag_text = tag.get_original_hed_string() else: org_tag_text = str(tag) - base_message, error_vars = func(org_tag_text, *args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, **error_vars, + base_message = func(org_tag_text, *args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, source_tag=tag) return error_object @@ -148,9 +148,10 @@ def wrapper(tag, *args, severity=default_severity, **kwargs): class ErrorHandler: - def __init__(self): + def __init__(self, check_for_warnings=True): # The current (ordered) dictionary of contexts. self.error_context = [] + self._check_for_warnings = check_for_warnings def push_error_context(self, context_type, context, increment_depth_after=True): """ Push a new error context to narrow down error scope. 
@@ -191,8 +192,12 @@ def get_error_context_copy(self): def format_error_with_context(self, *args, **kwargs): error_object = ErrorHandler.format_error(*args, **kwargs) if self is not None: - self._add_context_to_errors(error_object[0], self.error_context) - self._update_error_with_char_pos(error_object[0]) + actual_error = error_object[0] + # # Filter out warning errors + if not self._check_for_warnings and actual_error['severity'] >= ErrorSeverity.WARNING: + return [] + self._add_context_to_errors(actual_error, self.error_context) + self._update_error_with_char_pos(actual_error) return error_object @@ -225,26 +230,19 @@ def format_error(error_type, *args, actual_error=None, **kwargs): return [error_object] - def add_context_to_issues(self, issues): + def add_context_and_filter(self, issues): + """ Filter out warnings if requested, while adding context to issues. + + issues(list): + list: A list containing a single dictionary representing a single error. + """ + if not self._check_for_warnings: + issues[:] = self.filter_issues_by_severity(issues, ErrorSeverity.ERROR) + for error_object in issues: self._add_context_to_errors(error_object, self.error_context) self._update_error_with_char_pos(error_object) - def format_error_list(self, issue_params): - """ Convert an issue params list to an issues list. This means adding the error context primarily. - - Parameters: - issue_params (list): A list of dict containing the unformatted issues list. - - Returns: - list: A list of dict containing unformatted errors. - - """ - formatted_issues = [] - for issue in issue_params: - formatted_issues += self.format_error(**issue) - return formatted_issues - @staticmethod def format_error_from_context(error_type, error_context, *args, actual_error=None, **kwargs): """ Format an error based on the error type. @@ -262,6 +260,7 @@ def format_error_from_context(error_type, error_context, *args, actual_error=Non Notes: - Generally the error_context is returned from _add_context_to_errors. 
- The actual_error is useful for errors that are shared like invalid character. + - This can't filter out warnings like the other ones. """ error_func = error_functions.get(error_type) diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index feb21bef6..ac76f6992 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -47,6 +47,7 @@ class ValidationErrors: HED_UNITS_DEFAULT_USED = 'HED_UNITS_DEFAULT_USED' HED_VALUE_INVALID = 'HED_VALUE_INVALID' HED_LIBRARY_UNMATCHED = "HED_LIBRARY_UNMATCHED" + TAG_PREFIX_INVALID = "TAG_PREFIX_INVALID" # HED_VERSION_WARNING HED_MISSING_REQUIRED_COLUMN = "HED_MISSING_REQUIRED_COLUMN" @@ -75,12 +76,14 @@ class ValidationErrors: class SidecarErrors: # These are for json sidecar validation errors(sidecars can also produce most normal validation errors) + SIDECAR_INVALID = "SIDECAR_INVALID" # this is the generic error reported for several later ones BLANK_HED_STRING = 'blankValueString' WRONG_HED_DATA_TYPE = 'wrongHedDataType' INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns' INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns' UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn' - + SIDECAR_HED_USED = 'SIDECAR_HED_USED' + SIDECAR_NA_USED = 'SIDECAR_NA_USED' class SchemaErrors: HED_SCHEMA_DUPLICATE_NODE = 'HED_SCHEMA_DUPLICATE_NODE' diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index 4b90f9b66..72ab0eead 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -8,6 +8,8 @@ class HedExceptions: CANNOT_PARSE_JSON = 'cannotParseJson' INVALID_EXTENSION = 'invalidExtension' + INVALID_DATAFRAME = 'INVALID_DATAFRAME' + # These are actual schema issues, not that the file cannot be found or parsed SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID' HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID' diff --git a/hed/models/__init__.py b/hed/models/__init__.py index 07c044319..3f6d50d56 100644 --- a/hed/models/__init__.py +++ b/hed/models/__init__.py @@ -5,15 +5,12 @@ from 
.column_metadata import ColumnMetadata, ColumnType from .definition_dict import DefinitionDict from .definition_entry import DefinitionEntry -from .def_mapper import DefMapper from .expression_parser import QueryParser from .hed_group import HedGroup from .spreadsheet_input import SpreadsheetInput -from .hed_ops import HedOps from .hed_string import HedString from .hed_string_group import HedStringGroup from .hed_tag import HedTag -from .onset_mapper import OnsetMapper from .sidecar import Sidecar from .tabular_input import TabularInput from .timeseries_input import TimeseriesInput diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 33a35a96a..869bc4ea6 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -1,19 +1,12 @@ +import re import os + import openpyxl import pandas -import copy -from hed.models.definition_dict import DefinitionDict from hed.models.column_mapper import ColumnMapper from hed.errors.exceptions import HedFileError, HedExceptions -from hed.errors.error_types import ErrorContext, ErrorSeverity from hed.errors.error_reporter import ErrorHandler -from hed.models import model_constants -from hed.models.hed_ops import translate_ops -from hed.models.onset_mapper import OnsetMapper -from hed.models.hed_string import HedString -from hed.models.hed_string_group import HedStringGroup -from hed.models.def_mapper import DefMapper class BaseInput: @@ -27,8 +20,8 @@ class BaseInput: TAB_DELIMITER = '\t' COMMA_DELIMITER = ',' - def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, def_mapper=None, - definition_columns=None, name=None, allow_blank_names=True, hed_schema=None): + def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, name=None, + allow_blank_names=True): """ Constructor for the BaseInput class. 
Parameters: @@ -40,10 +33,8 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T has_column_names (bool): True if file has column names. This value is ignored if you pass in a pandas dataframe. mapper (ColumnMapper or None): Indicates which columns have HED tags. - definition_columns(list or None): A list of columns to check for definitions. Explicit 'None' means all. name (str or None): Optional field for how this file will report errors. allow_blank_names(bool): If True, column names can be blank - hed_schema(HedSchema or None): The schema to use by default in identifying tags Notes: - See SpreadsheetInput or TabularInput for examples of how to use built-in a ColumnMapper. @@ -51,17 +42,11 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T if mapper is None: mapper = ColumnMapper() self._mapper = mapper - if def_mapper is None: - def_mapper = DefMapper(mapper.get_def_dicts()) - self._def_mapper = def_mapper self._has_column_names = has_column_names self._name = name - # This is the loaded workbook if we loaded originally from an excel file. + # This is the loaded workbook if we loaded originally from an Excel file. 
self._loaded_workbook = None self._worksheet_name = worksheet_name - self._def_columns = definition_columns - self._schema = hed_schema - self.file_def_dict = None pandas_header = 0 if not self._has_column_names: pandas_header = None @@ -82,7 +67,9 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, - dtype=str, keep_default_na=False, na_values=None) + dtype=str, keep_default_na=True, na_values=None) + # Convert nan values to a known value + self._dataframe = self._dataframe.fillna("n/a") elif input_type in self.EXCEL_EXTENSION: self._loaded_workbook = openpyxl.load_workbook(file) loaded_worksheet = self.get_worksheet(self._worksheet_name) @@ -90,8 +77,11 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T else: raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file) - column_issues = ColumnMapper.validate_column_map(self.columns, - allow_blank_names=allow_blank_names) + if self._dataframe.size == 0: + raise HedFileError(HedExceptions.INVALID_DATAFRAME, "Invalid dataframe(malformed datafile, etc)", file) + + # todo: Can we get rid of this behavior now that we're using pandas? + column_issues = ColumnMapper.validate_column_map(self.columns, allow_blank_names=allow_blank_names) if column_issues: raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.", self.name, issues=column_issues) @@ -113,15 +103,29 @@ def reset_mapper(self, new_mapper): columns = self._dataframe.columns self._mapper.set_column_map(columns) - self.file_def_dict = self.extract_definitions() - - self.update_definition_mapper(self.file_def_dict) - @property def dataframe(self): """ The underlying dataframe. 
""" return self._dataframe + @property + def dataframe_a(self): + """Return the assembled dataframe + Probably a placeholder name. + + Returns: + Dataframe: the assembled dataframe""" + return self.assemble() + + @property + def series_a(self): + """Return the assembled dataframe as a series + Probably a placeholder name. + + Returns: + Series: the assembled dataframe with columns merged""" + return self.combine_dataframe(self.assemble()) + @property def name(self): """ Name of the data. """ @@ -142,125 +146,101 @@ def worksheet_name(self): """ The worksheet name. """ return self._worksheet_name - def get_definitions(self, as_strings=False): - if as_strings: - return DefinitionDict.get_as_strings(self._def_mapper.gathered_defs) - else: - return self._def_mapper - - def _convert_to_form(self, hed_schema, tag_form, error_handler): - """ Convert all tags to the specified form. + def convert_to_form(self, hed_schema, tag_form): + """ Convert all tags in underlying dataframe to the specified form. Parameters: - hed_schema (HedSchema or None): The schema to use to convert tags. - If None, uses the one used to open the file. - tag_form (str): The form to convert the tags to (short_tag, long_tag, base_tag, etc). - error_handler (ErrorHandler or None): The error handler to use for context or default if none. + hed_schema (HedSchema): The schema to use to convert tags. + tag_form(str): HedTag property to convert tags to. + Most cases should use convert_to_short or convert_to_long below. + """ + from hed.models.df_util import convert_to_form + convert_to_form(self._dataframe, hed_schema, tag_form, self._mapper.get_tag_columns()) - Returns: - dict: A list of issue dictionaries corresponding to issues found during conversion. + def convert_to_short(self, hed_schema): + """ Convert all tags in underlying dataframe to short form. + Parameters: + hed_schema (HedSchema): The schema to use to convert tags. 
""" - error_list = [] - if hed_schema is None: - hed_schema = self._schema - if hed_schema is None: - raise ValueError("Cannot convert between tag forms without a schema.") - for row_number, row_dict in enumerate(self.iter_dataframe(hed_ops=hed_schema, - return_string_only=False, - remove_definitions=False, - requested_columns=self._mapper.get_tag_columns(), - error_handler=error_handler)): - column_to_hed_tags_dictionary = row_dict[model_constants.COLUMN_TO_HED_TAGS] - error_list += row_dict[model_constants.ROW_ISSUES] - for column_number in column_to_hed_tags_dictionary: - column_hed_string = column_to_hed_tags_dictionary[column_number] - self.set_cell(row_number, column_number, column_hed_string, - include_column_prefix_if_exist=False, tag_form=tag_form) - - return error_list - - def convert_to_short(self, hed_schema=None, error_handler=None): - """ Convert all tags to short form. + return self.convert_to_form(hed_schema, "short_tag") + + def convert_to_long(self, hed_schema): + """ Convert all tags in underlying dataframe to long form. Parameters: hed_schema (HedSchema or None): The schema to use to convert tags. - If None, uses the one used to open the file. - error_handler (ErrorHandler): The error handler to use for context, uses a default if none. - - Returns: - dict: A list of issue dictionaries corresponding to issues found during conversion. - """ - return self._convert_to_form(hed_schema, "short_tag", error_handler) + return self.convert_to_form(hed_schema, "long_tag") - def convert_to_long(self, hed_schema=None, error_handler=None): - """ Convert all tags to long form. + def shrink_defs(self, hed_schema): + """ Shrinks any def-expand found in the underlying dataframe. Parameters: - hed_schema (HedSchema or None): The schema to use to convert tags. - If None, uses the one used to open the file. - error_handler (ErrorHandler): The error handler to use for context, uses a default if none. 
+            hed_schema (HedSchema or None): The schema to use to identify defs
+        """
+        from hed.models.df_util import shrink_defs
+        shrink_defs(self._dataframe, hed_schema=hed_schema, columns=self._mapper.get_tag_columns())
 
-        Returns:
-        dict: A list of issue dictionaries corresponding to issues found during conversion.
+    def expand_defs(self, hed_schema, def_dict):
+        """ Expands any def tags found in the underlying dataframe.
+        Parameters:
+            hed_schema (HedSchema or None): The schema to use to identify defs
+            def_dict (DefinitionDict): The definitions to expand
         """
-        return self._convert_to_form(hed_schema, "long_tag", error_handler)
+        from hed.models.df_util import expand_defs
+        expand_defs(self._dataframe, hed_schema=hed_schema, def_dict=def_dict, columns=self._mapper.get_tag_columns())
 
-    def to_excel(self, file, output_processed_file=False):
+    def to_excel(self, file, output_assembled=False):
         """ Output to an Excel file.
 
         Parameters:
             file (str or file-like): Location to save this base input.
-            output_processed_file (bool): If True, replace definitions and labels in HED columns.
-                Also fills in things like categories.
+            output_assembled (bool): Plug in categories and values from the sidecar directly.
 
         Raises:
-            HedFileError if empty file object or file cannot be opened.
+            ValueError: if empty file object or file cannot be opened.
 
         """
         if not file:
             raise ValueError("Empty file name or object passed in to BaseInput.save.")
-        # For now just make a copy if we want to save a formatted copy.  Could optimize this further.
- if output_processed_file: - output_file = self._get_processed_copy() - else: - output_file = self + dataframe = self._dataframe + + if output_assembled: + dataframe = self.dataframe_a if self._loaded_workbook: old_worksheet = self.get_worksheet(self._worksheet_name) - # excel spreadsheets are 1 based, then add another 1 for column names if present + # Excel spreadsheets are 1 based, then add another 1 for column names if present adj_row_for_col_names = 1 if self._has_column_names: adj_row_for_col_names += 1 adj_for_one_based_cols = 1 - for row_number, text_file_row in output_file._dataframe.iterrows(): + for row_number, text_file_row in dataframe.iterrows(): for column_number, column_text in enumerate(text_file_row): old_worksheet.cell(row_number + adj_row_for_col_names, column_number + adj_for_one_based_cols).value = \ - output_file._dataframe.iloc[row_number, column_number] + dataframe.iloc[row_number, column_number] self._loaded_workbook.save(file) else: - output_file._dataframe.to_excel(file, header=self._has_column_names) + dataframe.to_excel(file, header=self._has_column_names) - def to_csv(self, file=None, output_processed_file=False): + def to_csv(self, file=None, output_assembled=False): """ Write to file or return as a string. Parameters: file (str, file-like, or None): Location to save this file. If None, return as string. - output_processed_file (bool): Replace all definitions and labels in HED columns as appropriate. - Also fills in things like categories. + output_assembled (bool): Plug in categories and values from the sidecar directly. Returns: None or str: None if file is given or the contents as a str if file is None. """ - # For now just make a copy if we want to save a formatted copy. Could optimize this further. 
- if output_processed_file: - output_file = self._get_processed_copy() - else: - output_file = self - csv_string_if_filename_none = output_file._dataframe.to_csv(file, '\t', index=False, - header=output_file._has_column_names) + dataframe = self._dataframe + + if output_assembled: + dataframe = self.dataframe_a + + csv_string_if_filename_none = dataframe.to_csv(file, '\t', index=False, header=self._has_column_names) return csv_string_if_filename_none @property @@ -277,118 +257,32 @@ def columns(self): columns = list(self._dataframe.columns) return columns - @property - def def_dict(self): - """ Returns a dict of all the definitions found in this and sidecars + def column_metadata(self): + """Get the metadata for each column Returns: - def_dict(dict): {str: DefinitionEntry} pairs for each found definition + dict: number/ColumnMeta pairs """ - if self._def_mapper: - return self._def_mapper.gathered_defs + if self._mapper: + return self._mapper._final_column_map return {} - def __iter__(self): - """ Iterate over the underlying dataframe. """ - return self.iter_dataframe() - - def iter_dataframe(self, hed_ops=None, mapper=None, requested_columns=None, return_string_only=True, - run_string_ops_on_columns=False, error_handler=None, expand_defs=False, remove_definitions=True, - **kwargs): - """ Iterate rows based on the given column mapper. - - Parameters: - hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the - hed strings before returning. - mapper (ColumnMapper or None): The column name to column number mapper (or internal mapper if None). - requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed. - return_string_only (bool): If True, do not return issues list, individual columns, attribute columns, etc. - run_string_ops_on_columns (bool): If true, run all tag and string ops on columns, - rather than columns then rows. 
- error_handler (ErrorHandler or None): The error handler to use for context or a default if None. - expand_defs (bool): If True, expand def tags into def-expand groups. - remove_definitions (bool): If true, remove all definition tags found. - kwargs (kwargs): See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Yields: - dict: A dict with parsed row, including keys: "HED", "column_to_hed_tags", and possibly "column_issues". - - """ - if error_handler is None: - error_handler = ErrorHandler() - - if mapper is None: - mapper = self._mapper - - if requested_columns: - # Make a copy to ensure we don't alter the actual mapper - mapper = copy.deepcopy(mapper) - mapper.set_requested_columns(requested_columns) - - tag_funcs, string_funcs = self._translate_ops(hed_ops, run_string_ops_on_columns=run_string_ops_on_columns, - expand_defs=expand_defs, remove_definitions=remove_definitions, - error_handler=error_handler, **kwargs) - - # Iter tuples is ~ 25% faster compared to iterrows in our use case - for row_number, text_file_row in enumerate(self._dataframe.itertuples(index=False)): - error_handler.push_error_context(ErrorContext.ROW, row_number) - yield self._expand_row_internal(text_file_row, tag_funcs, string_funcs, - error_handler=error_handler, - mapper=mapper, return_string_only=return_string_only) - error_handler.pop_error_context() - - def _expand_row_internal(self, text_file_row, tag_funcs, string_funcs, error_handler, - mapper=None, return_string_only=False): - row_dict = mapper.expand_row_tags(text_file_row) - column_to_hed_tags = row_dict[model_constants.COLUMN_TO_HED_TAGS] - expansion_column_issues = row_dict.get(model_constants.COLUMN_ISSUES, {}) - - row_issues = [] - if tag_funcs: - row_issues += self._run_column_ops(column_to_hed_tags, tag_funcs, - expansion_column_issues, - error_handler) - - # Return a combined string if we're also returning columns. 
- if not return_string_only: - final_hed_string = HedStringGroup(column_to_hed_tags.values()) - else: - final_hed_string = HedString.from_hed_strings(contents=column_to_hed_tags.values()) - - if string_funcs: - row_issues += self._run_row_ops(final_hed_string, string_funcs, error_handler) - - if not return_string_only: - row_dict[model_constants.ROW_ISSUES] = row_issues - row_dict[model_constants.ROW_HED_STRING] = final_hed_string - return row_dict - # Return a HedString rather than a HedStringGroup - return final_hed_string - - def set_cell(self, row_number, column_number, new_string_obj, include_column_prefix_if_exist=False, - tag_form="short_tag"): + def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_tag"): """ Replace the specified cell with transformed text. Parameters: row_number (int): The row number of the spreadsheet to set. column_number (int): The column number of the spreadsheet to set. new_string_obj (HedString): Object with text to put in the given cell. - include_column_prefix_if_exist (bool): If True and the column matches one from mapper - _column_prefix_dictionary, remove the prefix. tag_form (str): Version of the tags (short_tag, long_tag, base_tag, etc) Notes: Any attribute of a HedTag that returns a string is a valid value of tag_form. - """ if self._dataframe is None: raise ValueError("No data frame loaded") - transform_func = None - if not include_column_prefix_if_exist: - transform_func = self._mapper.get_prefix_remove_func(column_number) - - new_text = new_string_obj.get_as_form(tag_form, transform_func) + new_text = new_string_obj.get_as_form(tag_form) self._dataframe.iloc[row_number, column_number] = new_text def get_worksheet(self, worksheet_name=None): @@ -412,47 +306,6 @@ def get_worksheet(self, worksheet_name=None): else: return None - def get_def_and_mapper_issues(self, error_handler, check_for_warnings=False): - """ Return definition and column issues. 
- - Parameters: - error_handler (ErrorHandler): The error handler to use. - check_for_warnings (bool): If True check for and return warnings as well as errors. - - Returns: - dict: A list of definition and mapping issues. Each issue is a dictionary. - - """ - issues = [] - issues += self.file_def_dict.get_definition_issues() - - # Gather any issues from the mapper for things like missing columns. - mapper_issues = self._mapper.get_column_mapping_issues() - error_handler.add_context_to_issues(mapper_issues) - issues += mapper_issues - if not check_for_warnings: - issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR) - return issues - - def _get_processed_copy(self): - """ Return a processed copy of this file. - - Returns: - BaseInput: The copy. - - Notes: - Processing includes definitions replaced, columns expanded, etc. - - """ - output_file = copy.deepcopy(self) - for row_number, row_dict in enumerate(self.iter_dataframe(return_string_only=False)): - column_to_hed_tags_dictionary = row_dict[model_constants.COLUMN_TO_HED_TAGS] - for column_number in column_to_hed_tags_dictionary: - new_text = column_to_hed_tags_dictionary[column_number] - output_file.set_cell(row_number, column_number, new_text, tag_form="short_tag") - - return output_file - @staticmethod def _get_dataframe_from_worksheet(worksheet, has_headers): """ Create a dataframe from the worksheet. 
@@ -474,139 +327,91 @@ def _get_dataframe_from_worksheet(worksheet, has_headers): else: return pandas.DataFrame(worksheet.values, dtype=str) - def _run_validators(self, hed_ops, error_handler, expand_defs=False, **kwargs): - validation_issues = [] - for row_dict in self.iter_dataframe(hed_ops=hed_ops, - return_string_only=False, - error_handler=error_handler, expand_defs=expand_defs, - **kwargs): - validation_issues += row_dict[model_constants.ROW_ISSUES] - - return validation_issues - - def _run_column_ops(self, column_to_hed_tags_dictionary, column_ops, expansion_column_issues, error_handler): - validation_issues = [] - if column_to_hed_tags_dictionary: - for column_number, column_hed_string in column_to_hed_tags_dictionary.items(): - new_column_issues = [] - error_handler.push_error_context(ErrorContext.COLUMN, column_number) - if column_hed_string is not None: - error_handler.push_error_context(ErrorContext.HED_STRING, column_hed_string, - increment_depth_after=False) - if column_number in expansion_column_issues: - new_column_issues += expansion_column_issues[column_number] - - if column_hed_string is not None: - new_column_issues += column_hed_string.apply_funcs(column_ops) - error_handler.add_context_to_issues(new_column_issues) - if column_hed_string is not None: - error_handler.pop_error_context() - error_handler.pop_error_context() - validation_issues += new_column_issues - - return validation_issues - - def _run_row_ops(self, row_hed_string, row_ops, error_handler): - error_handler.push_error_context(ErrorContext.HED_STRING, row_hed_string, increment_depth_after=False) - row_issues = row_hed_string.apply_funcs(row_ops) - error_handler.add_context_to_issues(row_issues) - error_handler.pop_error_context() - return row_issues - - def validate_file(self, hed_ops, name=None, error_handler=None, check_for_warnings=True, **kwargs): - """ Run the hed_ops on columns and rows. 
+    def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None):
+        """Creates a SpreadsheetValidator and returns all issues with this file
 
         Parameters:
-            hed_ops (func, HedOps, or list of func and/or HedOps): The HedOps of funcs to apply.
-            name (str): If present, use this as the filename for context, rather than using the actual filename
-                Useful for temp filenames.
-            error_handler (ErrorHandler or None): Used to report errors a default one if None.
-            check_for_warnings (bool): If True check for and return warnings as well as errors.
-            kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options.
-
+            hed_schema(HedSchema): The schema to use for validation
+            extra_def_dicts(list of DefinitionDict or DefinitionDict): all definitions to use for validation
+            name(str): The name to report errors from this file as
+            error_handler (ErrorHandler): Error context to use. Creates a new one if None
         Returns:
-            list: The list of validation issues found. The list elements are dictionaries.
-
+            issues (list of dict): A list of issues found validating this file
         """
+        from hed.validator.spreadsheet_validator import SpreadsheetValidator
         if not name:
             name = self.name
-        if not isinstance(hed_ops, list):
-            hed_ops = [hed_ops]
-
-        if error_handler is None:
-            error_handler = ErrorHandler()
-
-        error_handler.push_error_context(ErrorContext.FILE_NAME, name)
-        validation_issues = self.get_def_and_mapper_issues(error_handler, check_for_warnings=check_for_warnings)
-        validation_issues += self._run_validators(hed_ops, error_handler=error_handler,
-                                                  check_for_warnings=check_for_warnings, **kwargs)
-        error_handler.pop_error_context()
-
+        tab_validator = SpreadsheetValidator(hed_schema)
+        validation_issues = tab_validator.validate(self, self._mapper.get_def_dict(hed_schema, extra_def_dicts), name,
+                                                   error_handler=error_handler)
         return validation_issues
 
-    def extract_definitions(self, error_handler=None):
-        """ Gather and validate all definitions.
+ @staticmethod + def _dataframe_has_names(dataframe): + for column in dataframe.columns: + if isinstance(column, str): + return True + return False + + def assemble(self, mapper=None): + """ Assembles the hed strings Parameters: - error_handler (ErrorHandler): The error handler to use for context or a default if None. + mapper(ColumnMapper or None): Generally pass none here unless you want special behavior. Returns: - DefinitionDict: Contains all the definitions located in the file. - + Dataframe: the assembled dataframe """ - if error_handler is None: - error_handler = ErrorHandler() - new_def_dict = DefinitionDict() - hed_ops = [self._schema, new_def_dict] - for _ in self.iter_dataframe(hed_ops=hed_ops, - return_string_only=False, - requested_columns=self._def_columns, - run_string_ops_on_columns=True, - remove_definitions=False, - error_handler=error_handler): - pass - - return new_def_dict - - def update_definition_mapper(self, def_dict): - """ Add definitions from dict(s) if mapper exists. 
+        if mapper is None:
+            mapper = self._mapper
+        import pandas as pd
+        transformers, need_categorical = mapper.get_transformers()
+        if not transformers:
+            return None
+        all_columns = self._dataframe
+        if need_categorical:
+            all_columns[need_categorical] = all_columns[need_categorical].astype('category')
+
+        all_columns = all_columns.transform(transformers)
+
+        possible_column_references = [f"{column_name}" for column_name in self.columns if
+                                      column_name.lower() != "hed"]
+        found_column_references = []
+        for column_name in all_columns:
+            df = all_columns[column_name].str.findall(r"\[([a-z_\-0-9]+)\]", re.IGNORECASE)
+            u_vals = pd.Series([j for i in df for j in i], dtype=str)
+            u_vals = u_vals.unique()
+            for val in u_vals:
+                if val not in found_column_references:
+                    found_column_references.append(val)
+
+        valid_replacements = [col for col in found_column_references if col in possible_column_references]
+
+        column_names = list(transformers.keys())
+        for column_name in valid_replacements:
+            column_names.remove(column_name)
+        saved_columns = all_columns[valid_replacements]
+        for column_name in column_names:
+            for replacing_name in valid_replacements:
+                column_name_brackets = f"[{replacing_name}]"
+                all_columns[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
+                                                     in zip(all_columns[column_name], saved_columns[replacing_name]))
+        all_columns = all_columns[column_names]
+
+        return all_columns
+
+    @staticmethod
+    def combine_dataframe(dataframe):
+        """ Combines all columns in the given dataframe into a single hed string series.
 
         Parameters:
-            def_dict (list or DefinitionDict): Add the DefDict or list of DefDict to the internal definition mapper.
+ dataframe(Dataframe): The dataframe to combine + Returns: + Series: the assembled series """ - if self._def_mapper is not None: - self._def_mapper.add_definitions(def_dict) - - def _translate_ops(self, hed_ops, run_string_ops_on_columns, expand_defs, remove_definitions, **kwargs): - - tag_funcs = [] - string_funcs = [] - if hed_ops or expand_defs or remove_definitions: - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - hed_ops = hed_ops.copy() - if not run_string_ops_on_columns: - self._add_def_onset_mapper(hed_ops) - tag_funcs, string_funcs = translate_ops(hed_ops, split_ops=True, hed_schema=self._schema, - expand_defs=expand_defs, - remove_definitions=remove_definitions, - **kwargs) - else: - tag_funcs = translate_ops(hed_ops, hed_schema=self._schema, expand_defs=expand_defs, **kwargs) - - return tag_funcs, string_funcs - - def _add_def_onset_mapper(self, hed_ops): - if not any(isinstance(hed_op, DefMapper) for hed_op in hed_ops): - if self._def_mapper: - hed_ops.append(self._def_mapper) - hed_ops.append(OnsetMapper(self._def_mapper)) - return hed_ops + dataframe = dataframe.agg(', '.join, axis=1) - @staticmethod - def _dataframe_has_names(dataframe): - for column in dataframe.columns: - if isinstance(column, str): - return True - return False + # Potentially better ways to handle removing n/a by never inserting them to begin with. 
+ dataframe = dataframe.replace("(, n/a|n/a,)", "", regex=True) + return dataframe diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index f6fd12edb..3c4c87a63 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -1,13 +1,10 @@ from hed.models.column_metadata import ColumnMetadata, ColumnType from hed.models.sidecar import Sidecar -from hed.models.hed_string import HedString -from hed.models import model_constants from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import ValidationErrors import copy - PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " @@ -27,6 +24,9 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None Sidecar column definitions will take precedent if there is a conflict with tag_columns. column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag prefixes to prepend to the tags in that column before processing. + May be deprecated. These are no longer prefixes, but rather converted to value columns. + eg. {"key": "Description"} will turn into a value column as {"key": "Description/#"} + This means it no longer accepts anything but the value portion only in the columns. optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. requested_columns (list or None): A list of columns you wish to retrieve. 
@@ -64,11 +64,41 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None self.set_requested_columns(requested_columns, False) self.set_tag_columns(tag_columns, optional_tag_columns, False) - self.set_column_prefix_dict(column_prefix_dictionary, False) + self._add_value_columns(column_prefix_dictionary) # finalize the column map based on initial settings with no header self._finalize_mapping() + def get_transformers(self): + """ Return the transformers to use on a dataframe + + """ + final_transformers = {} + need_categorical = [] + for column in self._final_column_map.values(): + assign_to_column = column.column_name + if isinstance(assign_to_column, int): + if self._column_map: + assign_to_column = self._column_map[assign_to_column - 1] + else: + assign_to_column = assign_to_column - 1 + if column.column_type == ColumnType.Ignore: + continue + elif column.column_type == ColumnType.Value: + value_str = column._hed_dict + from functools import partial + final_transformers[assign_to_column] = partial(self._value_handler, value_str) + elif column.column_type == ColumnType.Categorical: + need_categorical.append(column.column_name) + category_values = column._hed_dict + from functools import partial + final_transformers[assign_to_column] = partial(self._category_handler, category_values) + else: + final_transformers[assign_to_column] = lambda x: x + # print(column.column_type) + + return final_transformers, need_categorical + @staticmethod def validate_column_map(column_map, allow_blank_names): """ Validate there are no issues with column names. 
@@ -89,10 +119,10 @@ def validate_column_map(column_map, allow_blank_names): if name is None or name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): issues += ErrorHandler.format_error(ValidationErrors.HED_BLANK_COLUMN, column_number) continue - if name in used_names: - # todo: Add this check once it's more fleshed out - # issues += ErrorHandler.format_error(ValidationErrors.HED_DUPLICATE_COLUMN, name) - continue + # if name in used_names: + # # todo: Add this check once it's more fleshed out + # issues += ErrorHandler.format_error(ValidationErrors.HED_DUPLICATE_COLUMN, name) + # continue used_names.add(name) return issues @@ -116,34 +146,18 @@ def _set_sidecar(self, sidecar): self._sidecar = sidecar def get_tag_columns(self): - """ Returns the column numbers that are mapped to be HedTags + """ Returns the column numbers or names that are mapped to be HedTags Note: This is NOT the tag_columns or optional_tag_columns parameter, though they set it. Returns: - column_numbers(list): A list of column numbers that are ColumnType.HedTags + column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. + 0-based if integer-based, otherwise column name. """ - return [number for number, column_entry in self._final_column_map.items() + return [column_entry.column_name - 1 if isinstance(column_entry.column_name, int) else column_entry.column_name + for number, column_entry in self._final_column_map.items() if column_entry.column_type == ColumnType.HEDTags] - def set_column_prefix_dict(self, column_prefix_dictionary, finalize_mapping=True): - """ Replace the column prefix dictionary - - Parameters: - column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag - prefixes to prepend to the tags in that column before processing. - finalize_mapping (bool): Re-generate the internal mapping if True, otherwise no effect until finalize. - - Returns: - list: List of issues that occurred during this process. 
Each issue is a dictionary. - - """ - if column_prefix_dictionary: - self._column_prefix_dictionary = column_prefix_dictionary - if finalize_mapping: - return self._finalize_mapping() - return [] - def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_mapping=True): """ Set tag columns and optional tag columns @@ -222,88 +236,15 @@ def add_columns(self, column_names_or_numbers, column_type=ColumnType.HEDTags): new_def = ColumnMetadata(column_type, column_name) self._add_column_data(new_def) - def _expand_column(self, column_number, input_text): - """ Expand the specified text based on the rules for expanding the specified column. - - Parameters: - column_number (int): The column number this text should be treated as from. - input_text (str): The text to expand, generally from a single cell of a spreadsheet. - - Returns: - str or None: The text after expansion or None if this column is undefined or the given text is null. - False or str: Depends on the value of first return value. If None, this is an error message. - If string, this is an attribute name that should be stored separately. - - """ - - # Default 1-1 mapping if we don't have specific behavior. - if self._no_mapping_info: - return HedString(input_text), False - - # If no entry, ignore this column. - if column_number not in self._final_column_map: - return None, False - - if not input_text or input_text in self._na_patterns: - return None, False - - column_entry = self._final_column_map[column_number] - return column_entry.expand(input_text) - - def expand_row_tags(self, row_text): - """ Expand all mapped columns for row. - - Parameters: - row_text (list): The text for the given row, one list entry per column number. - - Returns: - dict: A dictionary containing the keys COLUMN_TO_HED_TAGS, COLUMN_ISSUES. - - Notes: - - The "column_to_hed_tags" is each expanded column given separately as a list of HedStrings. - - Attributes are any column identified as an attribute. 
- They will appear in the return value as {attribute_name: value_of_column} - - """ - result_dict = {} - column_to_hed_tags_dictionary = {} - column_issues_dict = {} - for column_number, cell_text in enumerate(row_text): - translated_column, translation_errors = self._expand_column(column_number, str(cell_text)) - if translated_column is None: - if translation_errors: - if column_number not in column_issues_dict: - column_issues_dict[column_number] = [] - column_issues_dict[column_number] += translation_errors - column_to_hed_tags_dictionary[column_number] = translated_column - continue - - column_to_hed_tags_dictionary[column_number] = translated_column - - result_dict[model_constants.COLUMN_TO_HED_TAGS] = column_to_hed_tags_dictionary - if column_issues_dict: - result_dict[model_constants.COLUMN_ISSUES] = column_issues_dict - - return result_dict - - def get_prefix_remove_func(self, column_number): - """ Return a function to removes name prefixes for column - - Parameters: - column_number (int): Column number to look up in the prefix dictionary. - - Returns: - func: A function taking a tag and string, returning a string. - - """ - if column_number not in self._final_column_map: - return None - - entry = self._final_column_map[column_number] - if not entry.column_prefix: - return None - - return entry.remove_prefix + def _add_value_columns(self, column_prefix_dictionary): + if column_prefix_dictionary: + for col, prefix in column_prefix_dictionary.items(): + if prefix.endswith("/"): + prefix = prefix + "#" + else: + prefix = prefix + "/#" + new_def = ColumnMetadata(ColumnType.Value, col, hed_dict=prefix) + self._add_column_data(new_def) def _add_column_data(self, new_column_entry): """ Add the metadata of a column to this column mapper. 
@@ -318,34 +259,6 @@ def _add_column_data(self, new_column_entry): column_name = new_column_entry.column_name self.column_data[column_name] = copy.deepcopy(new_column_entry) - @staticmethod - def _set_column_prefix(final_map, column_number, new_required_prefix): - """ Internal function to add this as a required name_prefix to a column - - Parameters: - final_map (dict): {column_number:prefix} Dict of column numbers with prefixes - column_number (int): The column number with this name_prefix. - new_required_prefix (str): The name_prefix to add to the column when loading from a spreadsheet. - - Raises: - TypeError if column number is passed as a str rather an int. - - Notes: - If the column is not known to the mapper, it will be added as a HEDTags column. - - """ - if isinstance(column_number, str): - raise TypeError("Must pass in a column number not column_name to _set_column_prefix") - if column_number not in final_map: - column_entry = ColumnMetadata(ColumnType.HEDTags) - final_map[column_number] = column_entry - else: - column_entry = final_map[column_number] - - column_entry.column_prefix = new_required_prefix - if column_entry.column_type is None or column_entry.column_type == ColumnType.Ignore: - column_entry.column_type = ColumnType.HEDTags - @staticmethod def _get_basic_final_map(column_map, column_data): basic_final_map = {} @@ -456,15 +369,14 @@ def _finalize_mapping(self): issues += self._add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns, self._warn_on_missing_column) - # Add prefixes - for column_number, prefix in self._column_prefix_dictionary.items(): - self._set_column_prefix(final_map, column_number, prefix) - issues += ColumnMapper.validate_column_map(self._column_map.values(), allow_blank_names=False) self._final_column_map = self._filter_by_requested(final_map, self._requested_columns) + # Make sure this new dict is sorted + self._final_column_map = dict(sorted(final_map.items())) self._no_mapping_info = not 
self._check_if_mapping_info() + self._finalize_mapping_issues = issues return issues @@ -479,15 +391,19 @@ def _column_name_requested(self, column_name): return True return column_name in self._requested_columns - def get_def_dicts(self): + def get_def_dict(self, hed_schema=None, extra_def_dicts=None): """ Return def dicts from every column description. - Returns: - list: A list of DefinitionDict objects corresponding to each column entry. + Parameters: + hed_schema (Schema or None): A HED schema object to use for extracting definitions. + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. + Returns: + DefinitionDict: A single definition dict representing all the data(and extra def dicts) """ if self._sidecar: - return self._sidecar.get_def_dicts() + return self._sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) + return [] def get_column_mapping_issues(self): @@ -498,3 +414,14 @@ def get_column_mapping_issues(self): """ return self._finalize_mapping_issues + + @staticmethod + def _category_handler(category_values, x): + return category_values.get(x, "") + + @staticmethod + def _value_handler(value_str, x): + if x == "n/a": + return "n/a" + + return value_str.replace("#", str(x)) diff --git a/hed/models/column_metadata.py b/hed/models/column_metadata.py index 3921b5b82..ecdc76f08 100644 --- a/hed/models/column_metadata.py +++ b/hed/models/column_metadata.py @@ -1,11 +1,9 @@ from enum import Enum -from hed.models.hed_string import HedString -from hed.errors.error_types import SidecarErrors, ValidationErrors -from hed.errors.error_reporter import ErrorHandler +from hed.errors.error_types import SidecarErrors class ColumnType(Enum): - """ The overall column_type of a column in column mapper, eg treat it as HED tags. + """ The overall column_type of a column in column mapper, e.g. treat it as HED tags. 
Mostly internal to column mapper related code """ @@ -14,7 +12,7 @@ class ColumnType(Enum): Ignore = "ignore" # This column is a category with a list of possible values to replace with hed strings. Categorical = "categorical" - # This column has a value(eg filename) that is added to a hed tag in place of a # sign. + # This column has a value(e.g. filename) that is added to a hed tag in place of a # sign. Value = "value" # Return this column exactly as given, it is HED tags. HEDTags = "hed_tags" @@ -58,105 +56,6 @@ def hed_dict(self): """ return self._hed_dict - def _get_category_hed_string(self, category): - """ Fetch the hed string for a category key. - - Parameters: - category (str): The category key to retrieve the string from. - - Returns: - str: The hed string for a given category entry in a category column. - - """ - if self.column_type != ColumnType.Categorical: - return None - - return self._hed_dict.get(category, None) - - def _get_value_hed_string(self): - """ Fetch the hed string in a value column. - - Returns: - str: The hed string for a given value column. - - """ - if self.column_type != ColumnType.Value: - return None - - return self._hed_dict - - def expand(self, input_text): - """ Expand text using the rules for this column. - - Parameters: - input_text (str): Text to expand (generally from a single cell in a spreadsheet). - - Returns: - str or None: The expanded column as a hed_string. - str or dict: If this is a string, contains the name of this column - as an attribute. If the first return value is None, this is an error message dictionary. - - Notes: - - Examples are adding name_prefix, inserting a column hed_string from a category key, etc. 
- - """ - column_type = self.column_type - - if column_type == ColumnType.Categorical: - final_text = self._get_category_hed_string(input_text) - if final_text: - return HedString(final_text), False - else: - return None, ErrorHandler.format_error(ValidationErrors.HED_SIDECAR_KEY_MISSING, invalid_key=input_text, - category_keys=list(self._hed_dict.keys())) - elif column_type == ColumnType.Value: - prelim_text = self._get_value_hed_string() - final_text = prelim_text.replace("#", input_text) - return HedString(final_text), False - elif column_type == ColumnType.HEDTags: - hed_string_obj = HedString(input_text) - self._prepend_required_prefix(hed_string_obj, self.column_prefix) - return hed_string_obj, False - elif column_type == ColumnType.Ignore: - return None, False - - return None, {"error_type": "INTERNAL_ERROR"} - - @staticmethod - def _prepend_required_prefix(required_tag_column_tags, required_tag_prefix): - """ Prepend the tag paths to the required tag column tags that need them. - - Parameters: - required_tag_column_tags (HedString): A string containing HED tags associated with a - required tag column that may need a tag name_prefix prepended to its tags. - required_tag_prefix (str): A string that will be added if missing to any given tag. - """ - if not required_tag_prefix: - return required_tag_column_tags - - for tag in required_tag_column_tags.get_all_tags(): - tag.add_prefix_if_needed(required_tag_prefix) - - return required_tag_column_tags - - def remove_prefix(self, original_tag, current_tag_text): - """ Remove column_prefix if present from tag. - - Parameters: - original_tag (HedTag): The original hed tag being written. - current_tag_text (str): A single tag as a string, in any form. 
- - Returns: - str: current_tag_text with required prefixes removed - """ - prefix_to_remove = self.column_prefix - if not prefix_to_remove: - return current_tag_text - - if current_tag_text.lower().startswith(prefix_to_remove.lower()): - current_tag_text = current_tag_text[len(prefix_to_remove):] - return current_tag_text - @staticmethod def expected_pound_sign_count(column_type): """ Return how many pound signs a column string should have. diff --git a/hed/models/def_mapper.py b/hed/models/def_mapper.py deleted file mode 100644 index 98b8bbb43..000000000 --- a/hed/models/def_mapper.py +++ /dev/null @@ -1,255 +0,0 @@ -from hed.models.hed_string import HedString -from hed.models.hed_tag import HedTag -from hed.models.definition_dict import DefinitionDict -from hed.models.model_constants import DefTagNames -from hed.errors.error_types import ValidationErrors, DefinitionErrors -from hed.errors.error_reporter import ErrorHandler -from hed.models.hed_ops import HedOps - -# TODO: should not have print statement when error - - -class DefMapper(HedOps): - """ Handles converting Def/ and Def-expand/. - - Notes: - - The class provides string funcs but no tag funcs when extending HedOps. - - The class can expand or shrink definitions in hed strings via - Def/XXX and (Def-expand/XXX ...). - - """ - - def __init__(self, def_dicts=None): - """ Initialize mapper for definitions in hed strings. - - Parameters: - def_dicts (list or DefinitionDict): DefinitionDicts containing the definitions this mapper - should initialize with. - - Notes: - - More definitions can be added later. - - """ - super().__init__() - self._gathered_defs = {} - # List of def names we want to be able to quickly purge. 
- self._temporary_def_names = set() - self._def_tag_name = DefTagNames.DEFINITION_KEY - self._label_tag_name = DefTagNames.DEF_KEY - # this only gathers issues with duplicate definitions - self._issues = [] - if def_dicts: - self.add_definitions(def_dicts) - - @property - def issues(self): - return self._issues - - @property - def gathered_defs(self): - return self._gathered_defs - - def get_def_entry(self, def_name): - """ Get the definition entry for the definition name. - - Parameters: - def_name (str): Name of the definition to retrieve. - - Returns: - DefinitionEntry: Definition entry for the requested definition. - - """ - - return self._gathered_defs.get(def_name.lower()) - - def clear_temporary_definitions(self): - """ Remove any previously added temporary definitions. """ - for def_name in self._temporary_def_names: - del self._gathered_defs[def_name] - self._temporary_def_names = set() - - def add_definitions_from_string_as_temp(self, hed_string_obj): - """ Add definitions from hed string as temporary. - - Parameters: - hed_string_obj (HedString): Hed string object to search for definitions - - Returns: - list: List of issues due to invalid definitions found in this string. Each issue is a dictionary. - - """ - this_string_def_dict = DefinitionDict() - validation_issues = this_string_def_dict.check_for_definitions(hed_string_obj) - self.add_definitions(this_string_def_dict, add_as_temp=True) - return validation_issues - - def add_definitions(self, def_dicts, add_as_temp=False): - """ Add definitions from dict(s) to mapper - - Parameters: - def_dicts (list or DefinitionDict): DefDict or list of DefDicts whose definitions should be added. - add_as_temp (bool): If true, mark these new definitions as temporary (easily purged). 
- - """ - if not isinstance(def_dicts, list): - def_dicts = [def_dicts] - for def_dict in def_dicts: - if isinstance(def_dict, DefinitionDict): - self._add_definitions_from_dict(def_dict, add_as_temp) - else: - print(f"Invalid input type '{type(def_dict)} passed to DefMapper. Skipping.") - - def _add_definitions_from_dict(self, def_dict, add_as_temp=False): - """ Add the definitions found in the given definition dictionary to this mapper. - - Parameters: - def_dict (DefinitionDict): DefDict whose definitions should be added. - add_as_temp (bool): If true, mark these new definitions as temporary (easily purged). - - """ - for def_tag, def_value in def_dict: - if def_tag in self._gathered_defs: - error_context = self._gathered_defs[def_tag].source_context - self._issues += ErrorHandler.format_error_from_context(DefinitionErrors.DUPLICATE_DEFINITION, - error_context=error_context, - def_name=def_tag) - continue - self._gathered_defs[def_tag] = def_value - if add_as_temp: - self._temporary_def_names.add(def_tag) - - def expand_def_tags(self, hed_string_obj, expand_defs=True, shrink_defs=False): - """ Validate and expand Def/Def-Expand tags. - - Parameters: - hed_string_obj (HedString): The hed string to process. - expand_defs (bool): If true, convert def tags to def-expand tag groups that include definition content. - shrink_defs (bool): If True, replace all def-expand groups with corresponding def tags. - - Returns: - list: Issues found related to validating defs. Each issue is a dictionary. - - Notes: - - This function can optionally expand or shrink Def/ and Def-expand, respectively. - - Usually issues are mismatched placeholders or a missing definition. - - The expand_defs and shrink_defs cannot both be True. - - """ - # First see if the "def" is found at all. This covers def and def-expand. 
- hed_string_lower = hed_string_obj.lower() - if self._label_tag_name not in hed_string_lower: - return [] - - def_issues = [] - # We need to check for labels to expand in ALL groups - for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): - def_contents = self._get_definition_contents(def_tag, def_expand_group, def_issues) - if def_expand_group is def_tag: - if def_contents is not None and expand_defs: - def_tag.short_base_tag = DefTagNames.DEF_EXPAND_ORG_KEY - def_group.replace(def_tag, def_contents) - else: - if def_contents is not None and shrink_defs: - def_tag.short_base_tag = DefTagNames.DEF_ORG_KEY - def_group.replace(def_expand_group, def_tag) - - return def_issues - - def expand_and_remove_definitions(self, hed_string_obj, check_for_definitions=False, expand_defs=True, - shrink_defs=False, remove_definitions=True): - """ Validate and expand Def/Def-Expand tags. - - Also removes definitions - - Parameters: - hed_string_obj (HedString): The string to search for definitions. - check_for_definitions (bool): If True, this will first check the hed string for any definitions. - expand_defs (bool): If True, replace Def tags to Def-expand tag groups. - shrink_defs (bool): If True, replace Def-expand groups with Def tags. - remove_definitions (bool): If true, this will remove all Definition tag groups. - - Returns: - def_issues (list): A list of issues for definition-related tags in this string. Each issue is a dictionary. - - Notes: - - The check_for_definitions is mainly used for individual HedStrings in isolation. - - The defs can be expanded or shrunk, while definitions can be removed. - - This does not validate definitions, it will blindly remove invalid definitions as well. 
- - """ - def_issues = [] - if check_for_definitions: - def_issues += self.add_definitions_from_string_as_temp(hed_string_obj) - def_issues += self.expand_def_tags(hed_string_obj, expand_defs=expand_defs, shrink_defs=shrink_defs) - if remove_definitions: - def_issues += hed_string_obj.remove_definitions() - if check_for_definitions: - self.clear_temporary_definitions() - - return def_issues - - def _get_definition_contents(self, def_tag, def_expand_group, def_issues): - """ Check for issues with expanding a tag from Def to a Def-expand tag group and return the expanded tag group. - - Parameters: - def_tag (HedTag): Source hed tag that may be a Def or Def-expand tag. - def_expand_group (HedGroup or HedTag): - Source group for this def-expand tag. Same as def_tag if this is not a def-expand tag. - def_issues : [{}] - List of issues to append any new issues to - - Returns: - def_contents: [HedTag or HedGroup] - The contents to replace the previous def-tag with. - """ - # todo: This check could be removed for optimizing - if def_tag.short_base_tag.lower() != DefTagNames.DEF_EXPAND_KEY and \ - def_tag.short_base_tag.lower() != DefTagNames.DEF_KEY: - raise ValueError("Internal error in DefMapper") - - is_label_tag = def_tag.extension_or_value_portion - placeholder = None - found_slash = is_label_tag.find("/") - if found_slash != -1: - placeholder = is_label_tag[found_slash + 1:] - is_label_tag = is_label_tag[:found_slash] - - label_tag_lower = is_label_tag.lower() - def_entry = self._gathered_defs.get(label_tag_lower) - if def_entry is None: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=def_tag) - else: - def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) - if def_tag_name: - if def_expand_group is not def_tag and def_expand_group != def_contents: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID, - tag=def_tag, actual_def=def_contents, - 
found_def=def_expand_group) - return None - return def_contents - elif def_entry.takes_value: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=def_tag) - else: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_EXTRA, tag=def_tag) - - return None - - def __get_string_funcs__(self, **kwargs): - """ String funcs for processing definitions. """ - string_funcs = [] - expand_defs = kwargs.get("expand_defs") - shrink_defs = kwargs.get("shrink_defs") - remove_definitions = kwargs.get("remove_definitions") - check_for_definitions = kwargs.get("check_for_definitions") - if shrink_defs and expand_defs: - raise ValueError("Cannot pass both shrink_defs and expand_defs to DefMapper") - from functools import partial - string_funcs.append(partial(self.expand_and_remove_definitions, - check_for_definitions=check_for_definitions, - expand_defs=expand_defs, - shrink_defs=shrink_defs, - remove_definitions=remove_definitions)) - return string_funcs - - def __get_tag_funcs__(self, **kwargs): - return [] diff --git a/hed/models/definition_dict.py b/hed/models/definition_dict.py index 13d0f083b..ca3b06b34 100644 --- a/hed/models/definition_dict.py +++ b/hed/models/definition_dict.py @@ -2,36 +2,60 @@ from hed.models.hed_string import HedString from hed.errors.error_types import DefinitionErrors from hed.errors.error_reporter import ErrorHandler -from functools import partial - from hed.models.model_constants import DefTagNames -from hed.models.hed_ops import HedOps -class DefinitionDict(HedOps): +class DefinitionDict: """ Gathers definitions from a single source. - This class extends HedOps because it has string_funcs to check for definitions. It has no tag_funcs. - """ - def __init__(self): + def __init__(self, def_dicts=None, hed_schema=None): """ Definitions to be considered a single source. 
""" - super().__init__() self.defs = {} + self._label_tag_name = DefTagNames.DEF_KEY + self._issues = [] + if def_dicts: + self.add_definitions(def_dicts, hed_schema) + + def add_definitions(self, def_dicts, hed_schema=None): + """ Add definitions from dict(s) to this dict. + + Parameters: + def_dicts (list or DefinitionDict): DefDict or list of DefDicts/strings whose definitions should be added. + hed_schema(HedSchema or None): Required if passing strings or lists of strings, unused otherwise. + """ + if not isinstance(def_dicts, list): + def_dicts = [def_dicts] + for def_dict in def_dicts: + if isinstance(def_dict, DefinitionDict): + self._add_definitions_from_dict(def_dict) + elif isinstance(def_dict, str) and hed_schema: + self.check_for_definitions(HedString(def_dict, hed_schema)) + elif isinstance(def_dict, list) and hed_schema: + for definition in def_dict: + self.check_for_definitions(HedString(definition, hed_schema)) + else: + print(f"Invalid input type '{type(def_dict)} passed to DefDict. Skipping.") - # Definition related issues - self._extract_def_issues = [] + def _add_definition(self, def_tag, def_value): + if def_tag in self.defs: + error_context = self.defs[def_tag].source_context + self._issues += ErrorHandler.format_error_from_context(DefinitionErrors.DUPLICATE_DEFINITION, + error_context=error_context, def_name=def_tag) + else: + self.defs[def_tag] = def_value - def get_definition_issues(self): - """ Return definition errors found during extraction. + def _add_definitions_from_dict(self, def_dict): + """ Add the definitions found in the given definition dictionary to this mapper. - Returns: - list: List of DefinitionErrors issues found. Each issue is a dictionary. + Parameters: + def_dict (DefinitionDict): DefDict whose definitions should be added. 
""" - return self._extract_def_issues + for def_tag, def_value in def_dict: + self._add_definition(def_tag, def_value) def get(self, def_name): return self.defs.get(def_name.lower()) @@ -39,12 +63,23 @@ def get(self, def_name): def __iter__(self): return iter(self.defs.items()) - def __get_string_funcs__(self, **kwargs): - error_handler = kwargs.get("error_handler") - return [partial(self.check_for_definitions, error_handler=error_handler)] + @property + def issues(self): + """Returns issues about duplicate definitions.""" + return self._issues + + def get_def_entry(self, def_name): + """ Get the definition entry for the definition name. + + Parameters: + def_name (str): Name of the definition to retrieve. + + Returns: + DefinitionEntry: Definition entry for the requested definition. + + """ - def __get_tag_funcs__(self, **kwargs): - return [] + return self.defs.get(def_name.lower()) def check_for_definitions(self, hed_string_obj, error_handler=None): """ Check string for definition tags, adding them to self. @@ -128,9 +163,84 @@ def check_for_definitions(self, hed_string_obj, error_handler=None): takes_value=def_takes_value, source_context=context) - self._extract_def_issues += new_def_issues return new_def_issues + def construct_def_tags(self, hed_string_obj): + """ Identify def/def-expand tag contents in the given string. + + Parameters: + hed_string_obj(HedString): The hed string to identify definition contents in + """ + for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): + def_contents = self._get_definition_contents(def_tag) + if def_contents is not None: + def_tag._expandable = def_contents + def_tag._expanded = def_tag != def_expand_group + + def construct_def_tag(self, hed_tag): + """ Identify def/def-expand tag contents in the given HedTag. 
+ + Parameters: + hed_tag(HedTag): The hed tag to identify definition contents in + """ + if hed_tag.short_base_tag in {DefTagNames.DEF_ORG_KEY, DefTagNames.DEF_EXPAND_ORG_KEY}: + def_contents = self._get_definition_contents(hed_tag) + if def_contents is not None: + hed_tag._expandable = def_contents + hed_tag._expanded = hed_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY + + def expand_def_tags(self, hed_string_obj): + """ Expands def tags to def-expand tags. + + Parameters: + hed_string_obj (HedString): The hed string to process. + """ + # First see if the "def" is found at all. This covers def and def-expand. + hed_string_lower = hed_string_obj.lower() + if self._label_tag_name not in hed_string_lower: + return [] + + def_issues = [] + # We need to check for labels to expand in ALL groups + for def_tag, def_group in hed_string_obj.find_tags(DefTagNames.DEF_KEY, recursive=True): + def_contents = self._get_definition_contents(def_tag) + if def_contents is not None: + def_tag.short_base_tag = DefTagNames.DEF_EXPAND_ORG_KEY + def_group.replace(def_tag, def_contents) + + return def_issues + + def _get_definition_contents(self, def_tag): + """ Get the contents for a given def tag. + + Does not validate at all. + + Parameters: + def_tag (HedTag): Source hed tag that may be a Def or Def-expand tag. + + Returns: + def_contents: HedGroup + The contents to replace the previous def-tag with. + """ + is_label_tag = def_tag.extension_or_value_portion + placeholder = None + found_slash = is_label_tag.find("/") + if found_slash != -1: + placeholder = is_label_tag[found_slash + 1:] + is_label_tag = is_label_tag[:found_slash] + + label_tag_lower = is_label_tag.lower() + def_entry = self.defs.get(label_tag_lower) + if def_entry is None: + # Could raise an error here? 
+ return None + else: + def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) + if def_tag_name: + return def_contents + + return None + @staticmethod def get_as_strings(def_dict): """ Convert the entries to strings of the contents @@ -145,5 +255,3 @@ def get_as_strings(def_dict): def_dict = def_dict.defs return {key: str(value.contents) for key, value in def_dict.items()} - - diff --git a/hed/models/df_util.py b/hed/models/df_util.py new file mode 100644 index 000000000..b7e73a282 --- /dev/null +++ b/hed/models/df_util.py @@ -0,0 +1,125 @@ +from functools import partial + +from hed.models.sidecar import Sidecar +from hed.models.tabular_input import TabularInput +from hed import HedString + + +def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True): + """Load a tabular file and its associated HED sidecar file. + + Args: + tabular_file: str or TabularInput + The path to the tabular file, or a TabularInput object representing it. + sidecar: str or Sidecar + The path to the sidecar file, or a Sidecar object representing it. + hed_schema: str or HedSchema + If str, will attempt to load as a version if it doesn't have a valid extension. + extra_def_dicts: list of DefinitionDict, optional + Any extra DefinitionDict objects to use when parsing the HED tags. + join_columns: bool + If true, join all hed columns into one. 
+ shrink_defs: bool + Shrink any def-expand tags found + expand_defs: bool + Expand any def tags found + Returns: + A list of HedStrings, or a list of lists of HedStrings + """ + if isinstance(sidecar, str): + sidecar = Sidecar(sidecar) + + if isinstance(tabular_file, str): + tabular_file = TabularInput(tabular_file, sidecar) + + def_dict = None + if sidecar: + def_dict = sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) + + if join_columns: + if expand_defs: + return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict + elif shrink_defs: + return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict + else: + return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict + else: + return [[HedString(x, hed_schema, def_dict).expand_defs() if expand_defs + else HedString(x, hed_schema, def_dict).shrink_defs() if shrink_defs + else HedString(x, hed_schema, def_dict) + for x in text_file_row] for text_file_row in tabular_file.dataframe_a.itertuples(index=False)], def_dict + + +def convert_to_form(df, hed_schema, tag_form, columns): + """ Convert all tags in underlying dataframe to the specified form. + + Converts in place + Parameters: + df (pd.Dataframe): The dataframe to modify + hed_schema (HedSchema): The schema to use to convert tags. + tag_form(str): HedTag property to convert tags to. + columns (list): The columns to modify on the dataframe + """ + if columns is None: + columns = df.columns + + for column in columns: + df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + + return df + + +def shrink_defs(df, hed_schema, columns): + """ Shrinks any def-expand tags found in the dataframe. + + Converts in place + Parameters: + df (pd.Dataframe): The dataframe to modify + hed_schema (HedSchema or None): The schema to use to identify defs. 
+ columns (list): The columns to modify on the dataframe + """ + if columns is None: + columns = df.columns + + for column in columns: + mask = df[column].str.contains('Def-expand/', case=False) + df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) + + return df + + +def expand_defs(df, hed_schema, def_dict, columns): + """ Expands any def tags found in the dataframe. + + Converts in place + + Parameters: + df (pd.Dataframe): The dataframe to modify + hed_schema (HedSchema or None): The schema to use to identify defs + def_dict (DefinitionDict): The definitions to expand + columns (list): The columns to modify on the dataframe + """ + if columns is None: + columns = df.columns + + for column in columns: + mask = df[column].str.contains('Def/', case=False) + df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) + + return df + + +def _convert_to_form(hed_string, hed_schema, tag_form): + from hed import HedString + return str(HedString(hed_string, hed_schema).get_as_form(tag_form)) + + +def _shrink_defs(hed_string, hed_schema): + from hed import HedString + return str(HedString(hed_string, hed_schema).shrink_defs()) + + +def _expand_defs(hed_string, hed_schema, def_dict): + from hed import HedString + return str(HedString(hed_string, hed_schema, def_dict).expand_defs()) diff --git a/hed/models/expression_parser.py b/hed/models/expression_parser.py index 68c4e7f59..8a9806d42 100644 --- a/hed/models/expression_parser.py +++ b/hed/models/expression_parser.py @@ -1,7 +1,6 @@ import re -# todo: Add support for early outs with and(only try to match groups we already matched instead of all groups) class search_result: def __init__(self, group, tag): self.group = group @@ -179,8 +178,6 @@ def handle_expr(self, hed_group, exact=False): continue return_list.append(merged_result) - # finally simplify the list and remove duplicates. 
- return return_list def __str__(self): @@ -193,6 +190,7 @@ def __str__(self): output_str += ")" return output_str + class ExpressionWildcardNew(Expression): def handle_expr(self, hed_group, exact=False): groups_found = [] diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py index e61a3d3b3..6df911801 100644 --- a/hed/models/hed_group.py +++ b/hed/models/hed_group.py @@ -312,12 +312,11 @@ def get_as_long(self): """ return self.get_as_form("long_tag") - def get_as_form(self, tag_attribute, tag_transformer=None): + def get_as_form(self, tag_attribute): """ Get the string corresponding to the specified form. Parameters: tag_attribute (str): The hed_tag property to use to construct the string (usually short_tag or long_tag). - tag_transformer (func or None): A function that is applied to each tag string before returning. Returns: str: The constructed string after transformation @@ -326,13 +325,8 @@ def get_as_form(self, tag_attribute, tag_transformer=None): - The signature of a tag_transformer is str def(HedTag, str). 
""" - if tag_transformer: - result = ",".join([tag_transformer(child, child.__getattribute__(tag_attribute)) - if isinstance(child, HedTag) else child.get_as_form(tag_attribute, tag_transformer) - for child in self.children]) - else: - result = ",".join([child.__getattribute__(tag_attribute) if isinstance(child, HedTag) else - child.get_as_form(tag_attribute) for child in self.children]) + result = ",".join([child.__getattribute__(tag_attribute) if isinstance(child, HedTag) else + child.get_as_form(tag_attribute) for child in self.children]) if self.is_group: return f"({result})" return result @@ -365,6 +359,8 @@ def __eq__(self, other): if self is other: return True + if isinstance(other, str): + return str(self) == other if not isinstance(other, HedGroup) or self.children != other.children or self.is_group != other.is_group: return False return True @@ -484,9 +480,9 @@ def find_def_tags(self, recursive=False, include_groups=3): """ Find def and def-expand tags Parameters: recursive (bool): If true, also check subgroups. - include_groups (int, 0, 1, 2, 3): options for how to expand or include groups + include_groups (int, 0, 1, 2, 3): options for return values Returns: - list: A list of tuples. The contents depends on the values of the include group. + list: A list of tuples. The contents depend on the values of the include_group. Notes: - The include_groups option controls the tag expansion as follows: - If 0: Return only def and def expand tags/. diff --git a/hed/models/hed_ops.py b/hed/models/hed_ops.py deleted file mode 100644 index c56c93c78..000000000 --- a/hed/models/hed_ops.py +++ /dev/null @@ -1,262 +0,0 @@ -""" Infrastructure for processing HED operations. """ - -from functools import partial -from hed.schema import HedSchema, HedSchemaGroup -from hed.errors.error_types import ErrorContext, SidecarErrors -from hed.errors import ErrorHandler - - -# These are the defaults if you pass in nothing. Most built in routes will have other default values. 
-default_arguments = { - 'allow_placeholders': False, - 'check_for_definitions': False, - 'expand_defs': False, - 'shrink_defs': False, - 'error_handler': None, - 'check_for_warnings': False, - 'remove_definitions': True -} - - -def translate_ops(hed_ops, split_ops=False, hed_schema=None, **kwargs): - """ Return functions to apply to a hed string object. - - Parameters: - hed_ops (list): A list of func or HedOps or HedSchema to apply to hed strings. - split_ops (bool): If true, will split the operations into separate lists of tag and string operations. - hed_schema(HedSchema or None): The schema to use by default in identifying tags - kwargs (kwargs): An optional dictionary of name-value pairs representing parameters passed to each HedOps - - Returns: - list or tuple: A list of functions to apply or a tuple containing separate lists of tag and string ops. - - Notes: - - The distinction between tag and string ops primarily applies to spreadsheets. - - Splitting the ops into two lists is mainly used for parsing spreadsheets where any given - column isn't an entire hed string, but additional detail is needed on which column an - issue original came from. - - The currently accepted values of kwargs are: - - allow_placeholders - - check_for_definitions - - expand_defs - - shrink_defs - - error_handler - - check_for_warnings - - remove_definitions - - """ - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - - from hed.models.hed_string import HedString - - settings = default_arguments.copy() - settings.update(kwargs) - - tag_funcs = [] - string_funcs = [] - for hed_op in hed_ops: - if hed_op: - # Handle the special case of a hed schema. 
- if isinstance(hed_op, (HedSchema, HedSchemaGroup)): - tag_funcs.append(partial(HedString.convert_to_canonical_forms, hed_schema=hed_op)) - else: - try: - tag_funcs += hed_op.__get_tag_funcs__(**settings) - string_funcs += hed_op.__get_string_funcs__(**settings) - except AttributeError: - string_funcs.append(hed_op) - - # Make sure the first column operation is a convert to forms, if we don't have one. - if not _func_in_list(HedString.convert_to_canonical_forms, tag_funcs): - tag_funcs.insert(0, partial(HedString.convert_to_canonical_forms, hed_schema=hed_schema)) - - if split_ops: - return tag_funcs, string_funcs - return tag_funcs + string_funcs - - -def apply_ops(hed_strings, hed_ops, **kwargs): - """ Convenience function to update a list/dict of hed strings - - Parameters: - hed_strings(str, dict, list): A list/dict/str to update - hed_ops (list or HedOps or func): A list of func or HedOps or HedSchema to apply to hed strings. - kwargs (kwargs): An optional dictionary of name-value pairs representing parameters passed to each HedOps - - Returns: - tuple: - hed_strings(str, dict, list): Same type as input - issues(list): A list of issues found applying the hed_ops - """ - from hed.models.hed_string import HedString - - if not hed_strings: - return hed_strings, [] - issues = [] - tag_funcs = translate_ops(hed_ops, **kwargs) - if isinstance(hed_strings, str): - hed_string_obj = HedString(hed_strings) - issues += hed_string_obj.apply_funcs(tag_funcs) - return str(hed_string_obj), issues - elif isinstance(hed_strings, dict): - return_dict = {} - for key, hed_string in hed_strings.items(): - hed_string_obj = HedString(hed_string) - issues += hed_string_obj.apply_funcs(tag_funcs) - return_dict[key] = str(hed_string_obj) - return return_dict, issues - elif isinstance(hed_strings, list): - return_list = [] - for hed_string in hed_strings: - hed_string_obj = HedString(hed_string) - issues += hed_string_obj.apply_funcs(tag_funcs) - return_list.append(str(hed_string_obj)) 
- return return_list, issues - - raise ValueError("Unaccounted for type in apply_ops") - - -def hed_string_iter(hed_strings, tag_funcs, error_handler): - """ Iterate over the given dict of strings, returning HedStrings - - Also gives issues for blank strings - - Parameters: - hed_strings(dict or str): A hed_string or dict of hed strings - tag_funcs (list of funcs): The functions to apply before returning - error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. - - Yields: - tuple: - - HedString: The hed string at a given column and key position. - - str: Indication of the where hed string was loaded from so it can be later set by the user. - - list: Issues found applying hed_ops. Each issue is a dictionary. - - """ - for hed_string_obj, key_name in _hed_iter_low(hed_strings): - new_col_issues = [] - error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) - if not hed_string_obj: - new_col_issues += ErrorHandler.format_error(SidecarErrors.BLANK_HED_STRING) - error_handler.add_context_to_issues(new_col_issues) - yield hed_string_obj, key_name, new_col_issues - else: - error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, - increment_depth_after=False) - if tag_funcs: - new_col_issues += hed_string_obj.apply_funcs(tag_funcs) - - error_handler.add_context_to_issues(new_col_issues) - yield hed_string_obj, key_name, new_col_issues - error_handler.pop_error_context() - error_handler.pop_error_context() - - -def _hed_iter_low(hed_strings): - """ Iterate over the hed string entries. - - Used by hed_string_iter - - Parameters: - hed_strings(dict or str): A hed_string or dict of hed strings - - Yields: - tuple: - - HedString: Individual hed strings for different entries. - - str: The position to pass back to set this string. 
- - """ - from hed.models.hed_string import HedString - - if isinstance(hed_strings, dict): - for key, hed_string in hed_strings.items(): - if isinstance(hed_string, str): - hed_string = HedString(hed_string) - else: - continue - yield hed_string, key - elif isinstance(hed_strings, str): - hed_string = HedString(hed_strings) - yield hed_string, None - - -def set_hed_string(new_hed_string, hed_strings, position=None): - """ Set a hed string for a category key/etc. - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - hed_strings(dict or str or HedString): The hed strings we want to update - position (str, optional): This should only be a value returned from hed_string_iter. - - Returns: - updated_string (str or dict): The newly updated string/dict. - Raises: - TypeError: If the mapping cannot occur. - - """ - from hed.models.hed_string import HedString - - if isinstance(hed_strings, dict): - if position is None: - raise TypeError("Error: Trying to set a category HED string with no category") - if position not in hed_strings: - raise TypeError("Error: Not allowed to add new categories to a column") - hed_strings[position] = str(new_hed_string) - elif isinstance(hed_strings, (str, HedString)): - if position is not None: - raise TypeError("Error: Trying to set a value HED string with a category") - hed_strings = str(new_hed_string) - else: - raise TypeError("Error: Trying to set a HED string on a column_type that doesn't support it.") - - return hed_strings - - -class HedOps: - """ Base class to support HedOps. - - Notes: - - HED ops are operations that apply to HedStrings in a sequence. - - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __get_string_funcs__(self, **kwargs): - """ Return the operations that should be done on the full string at once. - - Parameters: - kwargs See above. 
- - Returns: - list: A list of functions that take a single hed string as a parameter, and return a list of issues. - - """ - return [] - - def __get_tag_funcs__(self, **kwargs): - """ Return the operations that should be done on the individual tags in the string. - - Parameters: - kwargs: See above. - - Returns: - list: A list of functions that take a single hed string as a parameter, and return a list of issues. - - """ - return [] - - # Todo: possibly add parameter validation - # def __get_valid_parameters__(self): - # return [] - - -def _func_in_list(find_func, func_list): - for func in func_list: - if func == find_func: - return True - if isinstance(func, partial) and getattr(func, 'func') == find_func: - return True - return False diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index fee47ea12..fe864b28e 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -3,9 +3,6 @@ """ from hed.models.hed_group import HedGroup from hed.models.hed_tag import HedTag -from hed.errors.error_reporter import ErrorHandler, check_for_any_errors -from hed.errors.error_types import ErrorContext -from hed.models.hed_ops import translate_ops from hed.models.model_constants import DefTagNames @@ -15,7 +12,7 @@ class HedString(HedGroup): OPENING_GROUP_CHARACTER = '(' CLOSING_GROUP_CHARACTER = ')' - def __init__(self, hed_string, hed_schema=None, _contents=None): + def __init__(self, hed_string, hed_schema=None, def_dict=None, _contents=None): """ Constructor for the HedString class. 
Parameters: @@ -32,7 +29,7 @@ def __init__(self, hed_string, hed_schema=None, _contents=None): contents = _contents else: try: - contents = self.split_into_groups(hed_string, hed_schema) + contents = self.split_into_groups(hed_string, hed_schema, def_dict) except ValueError: contents = [] super().__init__(hed_string, contents=contents, startpos=0, endpos=len(hed_string)) @@ -59,10 +56,8 @@ def is_group(self): def convert_to_canonical_forms(self, hed_schema): """ Identify all tags using the given schema. - If schema is None, still identify "key" tags such as definitions. - Parameters: - hed_schema (HedSchema, HedSchemaGroup, None): The schema to use to validate/convert tags. + hed_schema (HedSchema, HedSchemaGroup): The schema to use to validate/convert tags. Returns: list: A list of issues found while converting the string. Each issue is a dictionary. @@ -89,6 +84,43 @@ def remove_definitions(self): return [] + def shrink_defs(self): + """ Replace def-expand tags with def tags + + This does not validate them and will blindly shrink invalid ones as well. + + Returns: + self + """ + for def_expand_tag, def_expand_group in self.find_tags({DefTagNames.DEF_EXPAND_KEY}, recursive=True): + expanded_parent = def_expand_group._parent + if expanded_parent: + def_expand_tag.short_base_tag = DefTagNames.DEF_ORG_KEY + expanded_parent.replace(def_expand_group, def_expand_tag) + + return self + + def expand_defs(self): + """ Replace def tags with def-expand tags + + This does very minimal validation + + Returns: + self + """ + def_tags = self.find_def_tags(recursive=True, include_groups=0) + + replacements = [] + for tag in def_tags: + if not tag._expanded: + replacements.append((tag, tag._expandable)) + + for tag, group in replacements: + self.replace(tag, group) + tag.short_base_tag = DefTagNames.DEF_EXPAND_KEY + + return self + def convert_to_short(self, hed_schema): """ Compute canonical forms and return the short form. 
@@ -140,13 +172,13 @@ def convert_to_original(self): return self.get_as_form("org_tag") @staticmethod - def split_into_groups(hed_string, hed_schema=None): + def split_into_groups(hed_string, hed_schema=None, def_dict=None): """ Split the HED string into a parse tree. Parameters: hed_string (str): A hed string consisting of tags and tag groups to be processed. - hed_schema (HedSchema or None): Hed schema to use to identify tags. - + hed_schema (HedSchema or None): HED schema to use to identify tags. + def_dict(DefinitionDict): The definitions to identify Returns: list: A list of HedTag and/or HedGroup. @@ -162,7 +194,7 @@ def split_into_groups(hed_string, hed_schema=None): input_tags = HedString.split_hed_string(hed_string) for is_hed_tag, (startpos, endpos) in input_tags: if is_hed_tag: - new_tag = HedTag(hed_string, (startpos, endpos), hed_schema) + new_tag = HedTag(hed_string, (startpos, endpos), hed_schema, def_dict) current_tag_group[-1].append(new_tag) else: string_portion = hed_string[startpos:endpos] @@ -178,6 +210,8 @@ def split_into_groups(hed_string, hed_schema=None): current_tag_group.append(HedGroup(hed_string, startpos + delimiter_index)) if delimiter_char is HedString.CLOSING_GROUP_CHARACTER: + # if prev_delimiter == ",": + # raise ValueError(f"Closing parentheses in hed string {hed_string}") # Terminate existing group, and save it off. paren_end = startpos + delimiter_index + 1 @@ -282,54 +316,21 @@ def split_hed_string(hed_string): return result_positions - def apply_funcs(self, string_funcs): - """ Run functions on this string. - - Parameters: - string_funcs (list): A list of functions that take a hed string object and return a list of issues. - - Returns: - list: A list of issues found by these operations. Each issue is a dictionary. - - Notes: - - This method potentially modifies the hed string object. 
- + def validate(self, hed_schema, allow_placeholders=True, error_handler=None): """ - string_issues = [] - for string_func in string_funcs: - string_issues += string_func(self) - if string_issues: - if check_for_any_errors(string_issues): - break - - return string_issues - - def validate(self, hed_ops=None, error_handler=None, **kwargs): - """ Run the given hed_ops on this string. + Validate the string using the schema Parameters: - hed_ops: (func, HedOps, or list): Operations to apply to this object. - error_handler (ErrorHandler or None): Used to report errors in context. Uses a default if None. - kwargs: - See models.hed_ops.translate_ops or the specific hed_ops for additional options - + hed_schema(HedSchema): The schema to use to validate + allow_placeholders(bool): allow placeholders in the string + error_handler(ErrorHandler or None): the error handler to use, creates a default one if none passed Returns: - list: A list of issues encountered in applying these operations. Each issue is a dictionary. - - Notes: - - Although this function is called validation, the HedOps can represent other transformations. - + issues (list of dict): A list of issues for hed string """ - if error_handler is None: - error_handler = ErrorHandler() - tag_funcs = translate_ops(hed_ops, **kwargs) + from hed.validator import HedValidator - error_handler.push_error_context(ErrorContext.HED_STRING, self, increment_depth_after=False) - issues = self.apply_funcs(tag_funcs) - error_handler.add_context_to_issues(issues) - error_handler.pop_error_context() - - return issues + validator = HedValidator(hed_schema) + return validator.validate(self, allow_placeholders=allow_placeholders) def find_top_level_tags(self, anchor_tags, include_groups=2): """ Find top level groups with an anchor tag. 
@@ -359,4 +360,3 @@ def find_top_level_tags(self, anchor_tags, include_groups=2): if include_groups == 0 or include_groups == 1: return [tag[include_groups] for tag in top_level_tags] return top_level_tags - diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index c059d8850..29bcf8cf6 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -1,5 +1,5 @@ from hed.schema.hed_schema_constants import HedKey -from hed.schema.hed_schema_entry import HedTagEntry +import copy class HedTag: @@ -11,7 +11,7 @@ class HedTag: """ - def __init__(self, hed_string, span=None, hed_schema=None): + def __init__(self, hed_string, span=None, hed_schema=None, def_dict=None): """ Creates a HedTag. Parameters: @@ -23,14 +23,16 @@ def __init__(self, hed_string, span=None, hed_schema=None): - This does not produce issues and is used primarily for testing. """ + if def_dict and not hed_schema: + raise ValueError("Passing a def_dict without also passing a schema is invalid.") self._hed_string = hed_string if span is None: span = (0, len(hed_string)) # This is the span into the original hed string for this tag self.span = span - # If this is present, use this as the org tag for most purposes. This is generally only filled out - # if the tag has a name_prefix added, or is an expanded def. + # If this is present, use this as the org tag for most purposes. + # This is not generally used anymore, but you can use it to replace a tag in place. self._tag = None self._schema_prefix = self._get_schema_prefix(self.org_tag) @@ -42,8 +44,15 @@ def __init__(self, hed_string, span=None, hed_schema=None): self._extension_value = "" self._parent = None + # Downsides: two new parameters + # Have to check for this value, slowing everything down potentially. 
+ self._expandable = None + self._expanded = False + if hed_schema: self.convert_to_canonical_forms(hed_schema) + if def_dict: + def_dict.construct_def_tag(self) @property def schema_prefix(self): @@ -115,10 +124,11 @@ def short_base_tag(self, new_tag_val): - Generally this is used to swap def to def-expand. """ if self._schema_entry: + tag_entry = None if self._schema: + if self.is_takes_value_tag(): + new_tag_val = new_tag_val + "/#" tag_entry = self._schema.get_tag_entry(new_tag_val, schema_prefix=self.schema_prefix) - else: - tag_entry, remainder = HedTagEntry.get_fake_tag_entry(new_tag_val, [new_tag_val.lower()]) self._schema_entry = tag_entry else: @@ -185,15 +195,11 @@ def tag(self, new_tag_val): new_tag_val (str): New (implicitly long form) of tag to set. Notes: - - Primarily used to add prefixes from column metadata to tags. - - Only valid before calling convert_to_canonical_forms. - + - You probably don't actually want to call this. """ - - if self._schema_entry: - raise ValueError("Can only edit tags before calculating canonical forms. " + - "This could be updated to instead remove computed forms.") self._tag = new_tag_val + self._schema_entry = None + self.convert_to_canonical_forms(self._schema) @property def extension_or_value_portion(self): @@ -250,9 +256,29 @@ def tag_terms(self): if self._schema_entry: return self._schema_entry.tag_terms - # TODO: Potentially remove this. It's just a quick hack for testing - return tuple(str(self).lower()) - #return tuple() + return tuple() + + @property + def expanded(self): + """Returns if this is currently expanded or not. + + Will always be false unless expandable is set. This is primarily used for Def/Def-expand tags at present. + + Returns: + bool: Returns true if this is currently expanded + """ + return self._expanded + + @property + def expandable(self): + """Returns if this is expandable + + This is primarily used for Def/Def-expand tags at present. 
+ + Returns: + HedGroup or HedTag or None: Returns the expanded form of this tag + """ + return self._expandable def __str__(self): """ Convert this HedTag to a string. @@ -269,39 +295,6 @@ def __str__(self): return self._hed_string[self.span[0]:self.span[1]] - def add_prefix_if_needed(self, required_prefix): - """ Add a prefix to this tag *unless* already formatted. - - Parameters: - required_prefix (str): The full name_prefix to add if not present. - - Notes: - - This means we verify the tag does not have the required name_prefix, or any partial name_prefix. - - Examples: - Required: KnownTag1/KnownTag2 - - Case 1: KnownTag1/KnownTag2/ColumnValue - Will not be changed, has name_prefix already. - - Case 2: KnownTag2/ColumnValue - Will not be changed, has partial name_prefix already. - - Case 3: ColumnValue - Prefix will be added. - - """ - - checking_prefix = required_prefix - while checking_prefix: - if self.lower().startswith(checking_prefix.lower()): - return - slash_index = checking_prefix.find("/") + 1 - if slash_index == 0: - break - checking_prefix = checking_prefix[slash_index:] - self.tag = required_prefix + self.org_tag - def lower(self): """ Convenience function, equivalent to str(self).lower(). """ return str(self).lower() @@ -316,9 +309,6 @@ def convert_to_canonical_forms(self, hed_schema): list: A list of issues found during conversion. Each element is a dictionary. """ - if not hed_schema: - return self._convert_key_tags_to_canonical_form() - tag_entry, remainder, tag_issues = hed_schema.find_tag_entry(self, self.schema_prefix) self._schema_entry = tag_entry self._schema = hed_schema @@ -433,7 +423,7 @@ def is_value_class_tag(self): """ Return true if this is a value class tag. Returns: - bool: True if this is a a tag with a value class. + bool: True if this is a tag with a value class. 
""" if self._schema_entry: @@ -536,26 +526,8 @@ def any_parent_has_attribute(self, attribute): if self._schema_entry: return self._schema_entry.any_parent_has_attribute(attribute=attribute) - def _convert_key_tags_to_canonical_form(self): - """ Find the canonical form for basic known tags. - - Returns: - list: Always return an empty list. - - Notes: - - This is used for such as definition and def when no schema present - - """ - tags_to_identify = ["onset", "definition", "offset", "def-expand", "def"] - tag_entry, remainder = HedTagEntry.get_fake_tag_entry(str(self), tags_to_identify) - if tag_entry: - self._schema_entry = tag_entry - self._schema = None - self._extension_value = remainder - - return [] - - def _get_schema_prefix(self, org_tag): + @staticmethod + def _get_schema_prefix(org_tag): """ Finds the library prefix for the tag. Parameters: @@ -649,3 +621,28 @@ def __eq__(self, other): if self.org_tag.lower() == other.org_tag.lower(): return True return False + + def __deepcopy__(self, memo): + # check if the object has already been copied + if id(self) in memo: + return memo[id(self)] + + # create a new instance of HedTag class + new_tag = HedTag(self._hed_string, self.span) + + # add the new object to the memo dictionary + memo[id(self)] = new_tag + + # copy all other attributes except schema and schema_entry + new_tag._tag = copy.deepcopy(self._tag, memo) + new_tag._schema_prefix = copy.deepcopy(self._schema_prefix, memo) + new_tag._extension_value = copy.deepcopy(self._extension_value, memo) + new_tag._parent = copy.deepcopy(self._parent, memo) + new_tag._expandable = copy.deepcopy(self._expandable, memo) + new_tag._expanded = copy.deepcopy(self._expanded, memo) + + # reference the schema and schema_entry from the original object + new_tag._schema = self._schema + new_tag._schema_entry = self._schema_entry + + return new_tag diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 59052b0b1..8b808c6d1 100644 --- a/hed/models/sidecar.py +++ 
b/hed/models/sidecar.py @@ -1,30 +1,50 @@ import json from hed.models.column_metadata import ColumnMetadata -from hed.errors.error_types import ErrorContext, SidecarErrors +from hed.errors.error_types import ErrorContext from hed.errors import ErrorHandler from hed.errors.exceptions import HedFileError, HedExceptions from hed.models.hed_string import HedString from hed.models.column_metadata import ColumnType -from hed.models.hed_ops import apply_ops, hed_string_iter, set_hed_string -from hed.models.sidecar_base import SidecarBase +from hed.models.definition_dict import DefinitionDict -class Sidecar(SidecarBase): +# todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't) +class Sidecar: """ Contents of a JSON file or merged file. """ - def __init__(self, files, name=None, hed_schema=None): + def __init__(self, files, name=None): """ Construct a Sidecar object representing a JSON file. Parameters: files (str or FileLike or list): A string or file-like object representing a JSON file, or a list of such. name (str or None): Optional name identifying this sidecar, generally a filename. - hed_schema(HedSchema or None): The schema to use by default in identifying tags """ - super().__init__(name, hed_schema=hed_schema) + self.name = name self.loaded_dict = self.load_sidecar_files(files) - self.def_dict = self.extract_definitions(hed_schema) + self._def_dict = None + self._extract_definition_issues = [] + + def __iter__(self): + """ An iterator to go over the individual column metadata. + + Returns: + iterator: An iterator over the column metadata values. + + """ + return iter(self.column_data) + + @property + def def_dict(self): + """This is the definitions from this sidecar. 
+ + Generally you should instead call get_def_dict to get the relevant definitions + + Returns: + DefinitionDict: The definitions for this sidecar + """ + return self._def_dict @property def column_data(self): @@ -36,53 +56,38 @@ def column_data(self): for col_name, col_dict in self.loaded_dict.items(): yield self._generate_single_column(col_name, col_dict) - def _hed_string_iter(self, tag_funcs, error_handler): - """ Low level function to retrieve hed string in sidecar - - Parameters: - tag_funcs(list): A list of functions to apply to returned strings - error_handler(ErrorHandler): Error handler to use for context - - Yields: - tuple: - string(HedString): The retrieved and modified string - position(tuple): The location of this hed string. Black box. - issues(list): A list of issues running the tag_funcs. - """ - for column_name, dict_for_entry in self.loaded_dict.items(): - error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) - hed_dict = dict_for_entry.get("HED", {}) - for (hed_string_obj, position, issues) in hed_string_iter(hed_dict, tag_funcs, error_handler): - yield hed_string_obj, (column_name, position), issues - - error_handler.pop_error_context() - - def _set_hed_string(self, new_hed_string, position): - """ Low level function to update hed string in sidecar + def set_hed_string(self, new_hed_string, position): + """ Set a provided column/category key/etc. Parameters: new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The value returned from hed_string_iter. + position (tuple): The (HedString, str, list) tuple returned from hed_string_iter. 
+ """ column_name, position = position hed_dict = self.loaded_dict[column_name] - hed_dict["HED"] = set_hed_string(new_hed_string, hed_dict["HED"], position) + hed_dict["HED"] = self._set_hed_string_low(new_hed_string, hed_dict["HED"], position) - def validate_structure(self, error_handler): - """ Validate the raw structure of this sidecar. + def get_def_dict(self, hed_schema=None, extra_def_dicts=None): + """ Returns the definition dict for this sidecar. Parameters: - error_handler(ErrorHandler): The error handler to use for error context + hed_schema(HedSchema): used to identify tags to find definitions + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. Returns: - issues(list): A list of issues found with the structure + DefinitionDict: A single definition dict representing all the data(and extra def dicts) """ - all_validation_issues = [] - for column_name, dict_for_entry in self.loaded_dict.items(): - error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) - all_validation_issues += self._validate_column_structure(column_name, dict_for_entry, error_handler) - error_handler.pop_error_context() - return all_validation_issues + if self._def_dict is None and hed_schema: + self._def_dict = self.extract_definitions(hed_schema) + def_dicts = [] + if self.def_dict: + def_dicts.append(self.def_dict) + if extra_def_dicts: + if not isinstance(extra_def_dicts, list): + extra_def_dicts = [extra_def_dicts] + def_dicts += extra_def_dicts + return DefinitionDict(def_dicts) def save_as_json(self, save_filename): """ Save column metadata to a JSON file. @@ -146,6 +151,26 @@ def load_sidecar_files(self, files): merged_dict.update(loaded_json) return merged_dict + def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None): + """Create a SidecarValidator and validate this sidecar with the schema. + + Parameters: + hed_schema (HedSchema): Input data to be validated. 
+ extra_def_dicts(list or DefinitionDict): extra def dicts in addition to sidecar + name(str): The name to report this sidecar as + error_handler (ErrorHandler): Error context to use. Creates a new one if None + Returns: + issues (list of dict): A list of issues associated with each level in the HED string. + """ + from hed.validator.sidecar_validator import SidecarValidator + + if error_handler is None: + error_handler = ErrorHandler() + + validator = SidecarValidator(hed_schema) + issues = validator.validate(self, extra_def_dicts, name, error_handler=error_handler) + return issues + def _load_json_file(self, fp): """ Load the raw json of a given file @@ -176,8 +201,7 @@ def _generate_single_column(self, column_name, dict_for_entry, column_type=None) hed_dict = dict_for_entry.get("HED") else: hed_dict = None - def_removed_dict, _ = apply_ops(hed_dict, HedString.remove_definitions) - column_entry = ColumnMetadata(column_type, column_name, def_removed_dict) + column_entry = ColumnMetadata(column_type, column_name, hed_dict) return column_entry @staticmethod @@ -211,36 +235,124 @@ def _detect_column_type(dict_for_entry): return ColumnType.Value - def _validate_column_structure(self, column_name, dict_for_entry, error_handler): - """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar. + def extract_definitions(self, hed_schema=None, error_handler=None): + """ Gather and validate definitions in metadata. Parameters: - error_handler (ErrorHandler) Sets the context for the error reporting. Cannot be None. + error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. + hed_schema (HedSchema or None): The schema used to identify tags. Returns: - list: Issues in performing the operations. Each issue is a dictionary. + DefinitionDict: Contains all the definitions located in the sidecar. 
""" - val_issues = [] - column_type = self._detect_column_type(dict_for_entry=dict_for_entry) - if column_type is None: - val_issues += ErrorHandler.format_error(SidecarErrors.UNKNOWN_COLUMN_TYPE, - column_name=column_name) - elif column_type == ColumnType.Categorical: - raw_hed_dict = dict_for_entry["HED"] - if not raw_hed_dict: - val_issues += ErrorHandler.format_error(SidecarErrors.BLANK_HED_STRING) - if not isinstance(raw_hed_dict, dict): - val_issues += ErrorHandler.format_error(SidecarErrors.WRONG_HED_DATA_TYPE, - given_type=type(raw_hed_dict), - expected_type="dict") - for key_name, hed_string in raw_hed_dict.items(): + if error_handler is None: + error_handler = ErrorHandler() + def_dict = DefinitionDict() + + self._extract_definition_issues = [] + if hed_schema: + for hed_string, column_data, _ in self.hed_string_iter(error_handler): + hed_string_obj = HedString(hed_string, hed_schema) + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, + increment_depth_after=False) + self._extract_definition_issues += def_dict.check_for_definitions(hed_string_obj, error_handler) + error_handler.pop_error_context() + + return def_dict + + def hed_string_iter(self, error_handler=None): + """ Gather and validate definitions in metadata. + + Parameters: + error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. + + Yields: + str: The hed string at a given column and key position. + column_data: the column data for the given string. 
+ position: blackbox(pass back to set this string to a new value) + + """ + if error_handler is None: + error_handler = ErrorHandler() + + for column_data in self.column_data: + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_data.column_name) + hed_dict = column_data.hed_dict + for (hed_string, position) in self._hed_string_iter(hed_dict, error_handler): + yield hed_string, column_data, position + error_handler.pop_error_context() + + @staticmethod + def _hed_string_iter(hed_strings, error_handler): + """ Iterate over the given dict of strings + + Parameters: + hed_strings(dict or str): A hed_string or dict of hed strings + error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. + + Yields: + tuple: + - str: The hed string at a given column and key position. + - str: Indication of the where hed string was loaded from, so it can be later set by the user. + + """ + for hed_string, key_name in Sidecar._hed_iter_low(hed_strings): + if key_name: + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + yield hed_string, key_name + if key_name: + error_handler.pop_error_context() + + @staticmethod + def _hed_iter_low(hed_strings): + """ Iterate over the hed string entries. + + Used by hed_string_iter + + Parameters: + hed_strings(dict or str): A hed_string or dict of hed strings + + Yields: + tuple: + - str: Individual hed strings for different entries. + - str: The position to pass back to set this string. 
+ + """ + if isinstance(hed_strings, dict): + for key, hed_string in hed_strings.items(): if not isinstance(hed_string, str): - error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) - val_issues += ErrorHandler.format_error(SidecarErrors.WRONG_HED_DATA_TYPE, - given_type=type(hed_string), - expected_type="str") - error_handler.pop_error_context() - error_handler.add_context_to_issues(val_issues) - - return val_issues + continue + yield hed_string, key + elif isinstance(hed_strings, str): + yield hed_strings, None + + @staticmethod + def _set_hed_string_low(new_hed_string, hed_strings, position=None): + """ Set a hed string for a category key/etc. + + Parameters: + new_hed_string (str or HedString): The new hed_string to replace the value at position. + hed_strings(dict or str or HedString): The hed strings we want to update + position (str, optional): This should only be a value returned from hed_string_iter. + + Returns: + updated_string (str or dict): The newly updated string/dict. + Raises: + TypeError: If the mapping cannot occur. 
+ + """ + if isinstance(hed_strings, dict): + if position is None: + raise TypeError("Error: Trying to set a category HED string with no category") + if position not in hed_strings: + raise TypeError("Error: Not allowed to add new categories to a column") + hed_strings[position] = str(new_hed_string) + elif isinstance(hed_strings, (str, HedString)): + if position is not None: + raise TypeError("Error: Trying to set a value HED string with a category") + hed_strings = str(new_hed_string) + else: + raise TypeError("Error: Trying to set a HED string on a column_type that doesn't support it.") + + return hed_strings diff --git a/hed/models/sidecar_base.py b/hed/models/sidecar_base.py deleted file mode 100644 index 8b82d3ea3..000000000 --- a/hed/models/sidecar_base.py +++ /dev/null @@ -1,269 +0,0 @@ -import copy -from hed.models.column_metadata import ColumnMetadata -from hed.errors.error_types import ErrorContext -from hed.errors import error_reporter -from hed.errors import ErrorHandler -from hed.models.hed_string import HedString -from hed.models.def_mapper import DefMapper -from hed.models.hed_ops import translate_ops, apply_ops -from hed.models.definition_dict import DefinitionDict -from functools import partial - - -class SidecarBase: - """ Baseclass for specialized spreadsheet sidecars - - To subclass this class, you'll want to override at the minimum: - _hed_string_iter - _set_hed_string - validate_structure - column_data property <- This is the only truly mandatory one - - """ - def __init__(self, name=None, hed_schema=None): - """ Initialize a sidecar baseclass - - Parameters: - name (str or None): Optional name identifying this sidecar, generally a filename. 
- hed_schema(HedSchema or None): The schema to use by default in identifying tags - """ - self.name = name - self._schema = hed_schema - # Expected to be called in subclass after data is loaded - # self.def_dict = self.extract_definitions() - - @property - def column_data(self): - """ Generates the list of ColumnMetadata for this sidecar - - Returns: - list(ColumnMetadata): the list of column metadata defined by this sidecar - """ - return [] - - def _hed_string_iter(self, tag_funcs, error_handler): - """ Low level function to retrieve hed string in sidecar - - Parameters: - tag_funcs(list): A list of functions to apply to returned strings - error_handler(ErrorHandler): Error handler to use for context - - Yields: - tuple: - string(HedString): The retrieved and modified string - position(tuple): The location of this hed string. Black box. - issues(list): A list of issues running the tag_funcs. - """ - yield - - def _set_hed_string(self, new_hed_string, position): - """ Low level function to update hed string in sidecar - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The value returned from hed_string_iter. - """ - return - - def validate_structure(self, error_handler): - """ Validate the raw structure of this sidecar. - - Parameters: - error_handler(ErrorHandler): The error handler to use for error context - - Returns: - issues(list): A list of issues found with the structure - """ - return [] - - def __iter__(self): - """ An iterator to go over the individual column metadata. - - Returns: - iterator: An iterator over the column metadata values. - - """ - return iter(self.column_data) - - def hed_string_iter(self, hed_ops=None, error_handler=None, expand_defs=False, remove_definitions=False, - allow_placeholders=True, extra_def_dicts=None, **kwargs): - """ Iterator over hed strings in columns. 
- - Parameters: - hed_ops (func, HedOps, list): A HedOps, funcs or list of these to apply to the hed strings - before returning - error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. - expand_defs (bool): If True, expand all def tags located in the strings. - remove_definitions (bool): If True, remove all definitions found in the string. - allow_placeholders (bool): If False, placeholders will be marked as validation warnings. - extra_def_dicts (DefinitionDict, list, None): Extra dicts to add to the list. - kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Yields: - tuple: - - HedString: A HedString at a given column and key position. - - tuple: Indicates where hed_string was loaded from so it can be later set by the user - - list: A list of issues found performing ops. Each issue is a dictionary. - - """ - if error_handler is None: - error_handler = ErrorHandler() - hed_ops = self._standardize_ops(hed_ops) - if expand_defs or remove_definitions: - self._add_definition_mapper(hed_ops, extra_def_dicts) - tag_funcs = translate_ops(hed_ops, hed_schema=self._schema, error_handler=error_handler, - expand_defs=expand_defs, allow_placeholders=allow_placeholders, - remove_definitions=remove_definitions, **kwargs) - - return self._hed_string_iter(tag_funcs, error_handler) - - def set_hed_string(self, new_hed_string, position): - """ Set a provided column/category key/etc. - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The (HedString, str, list) tuple returned from hed_string_iter. - - """ - return self._set_hed_string(new_hed_string, position) - - def _add_definition_mapper(self, hed_ops, extra_def_dicts=None): - """ Add a DefMapper if the hed_ops list doesn't have one. - - Parameters: - hed_ops (list): A list of HedOps - extra_def_dicts (list): DefDicts from outside. 
- - Returns: - DefMapper: A shallow copy of the hed_ops list with a DefMapper added if there wasn't one. - - """ - def_mapper_list = [hed_op for hed_op in hed_ops if isinstance(hed_op, DefMapper)] - - if not def_mapper_list: - def_dicts = self.get_def_dicts(extra_def_dicts) - def_mapper = DefMapper(def_dicts) - hed_ops.append(def_mapper) - return def_mapper - return def_mapper_list[0] - - @staticmethod - def _standardize_ops(hed_ops): - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - return hed_ops.copy() - - def get_def_dicts(self, extra_def_dicts=None): - """ Returns the definition dict for this sidecar. - - Parameters: - extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. - - Returns: - list: A list with the sidecar def_dict plus any found in extra_def_dicts. - - """ - def_dicts = [self.def_dict] - if extra_def_dicts: - if not isinstance(extra_def_dicts, list): - extra_def_dicts = [extra_def_dicts] - def_dicts += extra_def_dicts - return def_dicts - - def validate_entries(self, hed_ops=None, name=None, extra_def_dicts=None, - error_handler=None, **kwargs): - """ Run the given hed_ops on all columns in this sidecar. - - Parameters: - hed_ops (list, func, or HedOps): A HedOps, func or list of these to apply to hed strings in this sidecar. - name (str): If present, will use this as the filename for context, rather than using the actual filename - Useful for temp filenames. - extra_def_dicts (DefinitionDict, list, or None): If present use these in addition to sidecar's def dicts. - error_handler (ErrorHandler or None): Used to report errors. Uses a default one if none passed in. - kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Returns: - list: The list of validation issues found. Individual issues are in the form of a dict. 
- - """ - if error_handler is None: - error_handler = error_reporter.ErrorHandler() - if not name: - name = self.name - if name: - error_handler.push_error_context(ErrorContext.FILE_NAME, name, False) - - all_validation_issues = self.validate_structure(error_handler) - - # Early out major errors so the rest of our code can assume they won't happen. - if all_validation_issues: - return all_validation_issues - - hed_ops = self._standardize_ops(hed_ops) - def_mapper = self._add_definition_mapper(hed_ops, extra_def_dicts) - all_validation_issues += def_mapper.issues - - for hed_string, key_name, issues in self.hed_string_iter(hed_ops=hed_ops, allow_placeholders=True, - error_handler=error_handler, **kwargs): - self.set_hed_string(hed_string, key_name) - all_validation_issues += issues - - # Finally check what requires the final mapped data to check - for column_data in self.column_data: - validate_pound_func = partial(self._validate_pound_sign_count, column_type=column_data.column_type) - _, issues = apply_ops(column_data.hed_dict, validate_pound_func) - all_validation_issues += issues - all_validation_issues += self.def_dict.get_definition_issues() - if name: - error_handler.pop_error_context() - return all_validation_issues - - def extract_definitions(self, hed_schema=None, error_handler=None): - """ Gather and validate definitions in metadata. - - Parameters: - error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. - hed_schema (HedSchema or None): The schema to used to identify tags. - - Returns: - DefinitionDict: Contains all the definitions located in the column. - issues: List of issues encountered in extracting the definitions. Each issue is a dictionary. 
- - """ - if error_handler is None: - error_handler = ErrorHandler() - new_def_dict = DefinitionDict() - hed_ops = [] - hed_ops.append(hed_schema) - hed_ops.append(new_def_dict) - - all_issues = [] - for hed_string, key_name, issues in self.hed_string_iter(hed_ops=hed_ops, allow_placeholders=True, - error_handler=error_handler): - all_issues += issues - - return new_def_dict - - def _validate_pound_sign_count(self, hed_string, column_type): - """ Check if a given hed string in the column has the correct number of pound signs. - - Parameters: - hed_string (str or HedString): HED string to be checked. - - Returns: - list: Issues due to pound sign errors. Each issue is a dictionary. - - Notes: - Normally the number of # should be either 0 or 1, but sometimes will be higher due to the - presence of definition tags. - - """ - # Make a copy without definitions to check placeholder count. - expected_count, error_type = ColumnMetadata.expected_pound_sign_count(column_type) - hed_string_copy = copy.deepcopy(hed_string) - hed_string_copy.remove_definitions() - - if hed_string_copy.lower().count("#") != expected_count: - return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#")) - - return [] diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py index 77a497449..b48f6985f 100644 --- a/hed/models/spreadsheet_input.py +++ b/hed/models/spreadsheet_input.py @@ -1,6 +1,5 @@ from hed.models.column_mapper import ColumnMapper from hed.models.base_input import BaseInput -from hed.models.def_mapper import DefMapper class SpreadsheetInput(BaseInput): @@ -8,7 +7,7 @@ class SpreadsheetInput(BaseInput): def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None, has_column_names=True, column_prefix_dictionary=None, - def_dicts=None, name=None, hed_schema=None): + name=None): """Constructor for the SpreadsheetInput class. 
Parameters: @@ -21,9 +20,7 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N has_column_names (bool): True if file has column names. Validation will skip over the first line of the file if the spreadsheet as column names. column_prefix_dictionary (dict): A dictionary with column number keys and prefix values. - def_dicts (DefinitionDict or list): A DefinitionDict or list of DefDicts containing definitions for this - object other than the ones extracted from the SpreadsheetInput object itself. - hed_schema(HedSchema or None): The schema to use by default in identifying tags + This is partially deprecated - what this now turns the given columns into Value columns. Examples: A prefix dictionary {3: 'Label/', 5: 'Description/'} indicates that column 3 and 5 have HED tags that need to be prefixed by Label/ and Description/ respectively. @@ -38,7 +35,4 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary, warn_on_missing_column=False) - def_mapper = DefMapper(def_dicts) - - super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, def_mapper=def_mapper, - name=name, hed_schema=hed_schema) + super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name) diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index 2b9c2089a..388718fb9 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -1,7 +1,6 @@ from hed.models.column_mapper import ColumnMapper from hed.models.base_input import BaseInput from hed.models.sidecar import Sidecar -from hed.models.def_mapper import DefMapper class TabularInput(BaseInput): @@ -9,64 +8,30 @@ class TabularInput(BaseInput): HED_COLUMN_NAME = "HED" - def __init__(self, file=None, sidecar=None, extra_def_dicts=None, also_gather_defs=True, name=None, - hed_schema=None): + def __init__(self, 
file=None, sidecar=None, name=None): """ Constructor for the TabularInput class. Parameters: file (str or file like): A tsv file to open. sidecar (str or Sidecar): A Sidecar filename or Sidecar - extra_def_dicts ([DefinitionDict], DefinitionDict, or None): DefinitionDict objects containing all - the definitions this file should use other than the ones coming from the file - itself and from the sidecar. These are added as the last entries, so names will override - earlier ones. + Note: If this is a string you MUST also pass hed_schema. name (str): The name to display for this file for error purposes. - hed_schema(HedSchema or None): The schema to use by default in identifying tags """ if sidecar and not isinstance(sidecar, Sidecar): sidecar = Sidecar(sidecar) new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME], warn_on_missing_column=True) - definition_columns = [self.HED_COLUMN_NAME] self._sidecar = sidecar - self._also_gather_defs = also_gather_defs - if extra_def_dicts and not isinstance(extra_def_dicts, list): - extra_def_dicts = [extra_def_dicts] - self._extra_def_dicts = extra_def_dicts - def_mapper = self.create_def_mapper(new_mapper) super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper, - def_mapper=def_mapper, name=name, definition_columns=definition_columns, - allow_blank_names=False, hed_schema=hed_schema) + name=name, allow_blank_names=False, ) if not self._has_column_names: raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n" "This is probably not intended.") - def create_def_mapper(self, column_mapper): - """ Create the definition mapper for this file. - - Parameters: - column_mapper (ColumnMapper): The column mapper to gather definitions from. - - - Returns: - def mapper (DefMapper): A class to validate or expand definitions with the given def dicts. 
- - Notes: - - The extra_def_dicts are definitions not included in the column mapper. - - """ - - def_dicts = column_mapper.get_def_dicts() - if self._extra_def_dicts: - def_dicts += self._extra_def_dicts - def_mapper = DefMapper(def_dicts) - - return def_mapper - def reset_column_mapper(self, sidecar=None): """ Change the sidecars and settings. @@ -76,25 +41,4 @@ def reset_column_mapper(self, sidecar=None): """ new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME]) - self._def_mapper = self.create_def_mapper(new_mapper) self.reset_mapper(new_mapper) - - def validate_sidecar(self, hed_ops=None, error_handler=None, **kwargs): - """ Validate column definitions and hed strings. - - Parameters: - hed_ops (list or HedOps): A list of HedOps of funcs to apply to the hed strings in the sidecars. - error_handler (ErrorHandler or None): Used to report errors. Uses a default one if none passed in. - kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Returns: - list: A list of syntax and semantic issues found in the definitions. Each issue is a dictionary. - - Notes: - - For full validation you should validate the sidecar separately. 
- - """ - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - hed_ops.append(self._def_mapper) - return self._sidecar.validate_entries(hed_ops, error_handler=error_handler, **kwargs) diff --git a/hed/models/timeseries_input.py b/hed/models/timeseries_input.py index c7ca5c215..0b9cbee18 100644 --- a/hed/models/timeseries_input.py +++ b/hed/models/timeseries_input.py @@ -22,4 +22,4 @@ def __init__(self, file=None, sidecar=None, extra_def_dicts=None, name=None): """ super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=False, mapper=None, - def_mapper=None, name=name) + name=name) diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 10b9aa6cc..84c2accbf 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -62,7 +62,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl if validator: error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name, False) new_issues = validator(hed_schema, tag_entry, tag_entry.attributes[attribute_name]) - error_handler.add_context_to_issues(new_issues) + error_handler.add_context_and_filter(new_issues) issues_list += new_issues error_handler.pop_error_context() error_handler.pop_error_context() diff --git a/hed/validator/__init__.py b/hed/validator/__init__.py index 88b772ca8..4a8b94209 100644 --- a/hed/validator/__init__.py +++ b/hed/validator/__init__.py @@ -2,3 +2,7 @@ from .hed_validator import HedValidator from .tag_validator import TagValidator +from .sidecar_validator import SidecarValidator +from .def_validator import DefValidator +from .onset_validator import OnsetValidator +from .spreadsheet_validator import SpreadsheetValidator \ No newline at end of file diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py new file mode 100644 index 000000000..24a3d8e5b --- /dev/null +++ b/hed/validator/def_validator.py @@ -0,0 +1,78 @@ +from hed.models.hed_string import 
HedString +from hed.models.hed_tag import HedTag +from hed.models.definition_dict import DefinitionDict +from hed.errors.error_types import ValidationErrors +from hed.errors.error_reporter import ErrorHandler + + +class DefValidator(DefinitionDict): + """ Handles validating Def/ and Def-expand/. + + """ + + def __init__(self, def_dicts=None, hed_schema=None): + """ Initialize for definitions in hed strings. + + Parameters: + def_dicts (list or DefinitionDict or str): DefinitionDicts containing the definitions to pass to baseclass + + """ + super().__init__(def_dicts, hed_schema=hed_schema) + + def validate_def_tags(self, hed_string_obj): + """ Validate Def/Def-Expand tags. + + Parameters: + hed_string_obj (HedString): The hed string to process. + + Returns: + list: Issues found related to validating defs. Each issue is a dictionary. + """ + hed_string_lower = hed_string_obj.lower() + if self._label_tag_name not in hed_string_lower: + return [] + + def_issues = [] + # We need to check for labels to expand in ALL groups + for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): + def_issues += self._validate_def_contents(def_tag, def_expand_group) + + return def_issues + + def _validate_def_contents(self, def_tag, def_expand_group): + """ Check for issues with expanding a tag from Def to a Def-expand tag group + + Parameters: + def_tag (HedTag): Source hed tag that may be a Def or Def-expand tag. + def_expand_group (HedGroup or HedTag): + Source group for this def-expand tag. Same as def_tag if this is not a def-expand tag. 
+ + Returns: + issues + """ + def_issues = [] + + is_label_tag = def_tag.extension_or_value_portion + placeholder = None + found_slash = is_label_tag.find("/") + if found_slash != -1: + placeholder = is_label_tag[found_slash + 1:] + is_label_tag = is_label_tag[:found_slash] + + label_tag_lower = is_label_tag.lower() + def_entry = self.defs.get(label_tag_lower) + if def_entry is None: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=def_tag) + else: + def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) + if def_tag_name: + if def_expand_group is not def_tag and def_expand_group != def_contents: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID, + tag=def_tag, actual_def=def_contents, + found_def=def_expand_group) + elif def_entry.takes_value: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=def_tag) + else: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_EXTRA, tag=def_tag) + + return def_issues diff --git a/hed/validator/hed_validator.py b/hed/validator/hed_validator.py index 600d5bb87..c7ce76adf 100644 --- a/hed/validator/hed_validator.py +++ b/hed/validator/hed_validator.py @@ -6,50 +6,86 @@ """ from hed.errors.error_types import ValidationErrors -from hed.errors.error_reporter import ErrorHandler +from hed.errors.error_reporter import ErrorHandler, check_for_any_errors from hed.models.hed_string import HedString from hed.models import HedTag from hed.validator.tag_validator import TagValidator -from functools import partial -from hed.models.hed_ops import HedOps +from hed.validator.def_validator import DefValidator +from hed.validator.onset_validator import OnsetValidator -class HedValidator(HedOps): +class HedValidator: """ Top level validation of HED strings. 
""" - def __init__(self, hed_schema=None, run_semantic_validation=True): + def __init__(self, hed_schema=None, def_dicts=None, run_full_onset_checks=True): """ Constructor for the HedValidator class. Parameters: hed_schema (HedSchema or HedSchemaGroup): HedSchema object to use for validation. - run_semantic_validation (bool): True if the validator should check the HED data against a schema. """ super().__init__() self._tag_validator = None self._hed_schema = hed_schema - self._tag_validator = TagValidator(hed_schema=self._hed_schema, - run_semantic_validation=run_semantic_validation) - self._run_semantic_validation = run_semantic_validation - - def __get_tag_funcs__(self, **kwargs): - string_funcs = [] - allow_placeholders = kwargs.get("allow_placeholders") - check_for_warnings = kwargs.get("check_for_warnings") - string_funcs.append(self._tag_validator.run_hed_string_validators) - string_funcs.append( - partial(HedString.convert_to_canonical_forms, hed_schema=self._hed_schema)) - string_funcs.append(partial(self._validate_individual_tags_in_hed_string, - allow_placeholders=allow_placeholders, - check_for_warnings=check_for_warnings)) - return string_funcs - - def __get_string_funcs__(self, **kwargs): - check_for_warnings = kwargs.get("check_for_warnings") - string_funcs = [partial(self._validate_tags_in_hed_string, check_for_warnings=check_for_warnings), - self._validate_groups_in_hed_string] - return string_funcs + self._tag_validator = TagValidator(hed_schema=self._hed_schema) + self._def_validator = DefValidator(def_dicts, hed_schema) + self._onset_validator = OnsetValidator(def_dict=self._def_validator, + run_full_onset_checks=run_full_onset_checks) + + def validate(self, hed_string, allow_placeholders, error_handler=None): + """ + Validate the string using the schema + + Parameters: + hed_string(HedString): the string to validate + allow_placeholders(bool): allow placeholders in the string + error_handler(ErrorHandler or None): the error handler to use, 
creates a default one if none passed + Returns: + issues (list of dict): A list of issues for hed string + """ + if not error_handler: + error_handler = ErrorHandler() + issues = [] + issues += self.run_basic_checks(hed_string, allow_placeholders=allow_placeholders) + error_handler.add_context_and_filter(issues) + if check_for_any_errors(issues): + return issues + issues += self.run_full_string_checks(hed_string) + error_handler.add_context_and_filter(issues) + return issues + + def run_basic_checks(self, hed_string, allow_placeholders): + issues = [] + issues += self._tag_validator.run_hed_string_validators(hed_string) + if check_for_any_errors(issues): + return issues + if hed_string == "n/a" or not self._hed_schema: + return issues + issues += hed_string.convert_to_canonical_forms(self._hed_schema) + if check_for_any_errors(issues): + return issues + # This is required so it can validate the tag a tag expands into + # e.g. checking units when a definition placeholder has units + self._def_validator.construct_def_tags(hed_string) + issues += self._validate_individual_tags_in_hed_string(hed_string, allow_placeholders=allow_placeholders) + if check_for_any_errors(issues): + return issues + issues += self._def_validator.validate_def_tags(hed_string) + if check_for_any_errors(issues): + return issues + issues += self._onset_validator.validate_onset_offset(hed_string) + if check_for_any_errors(issues): + return issues + return issues + + def run_full_string_checks(self, hed_string): + issues = [] + issues += self._validate_tags_in_hed_string(hed_string) + if check_for_any_errors(issues): + return issues + issues += self._validate_groups_in_hed_string(hed_string) + return issues def _validate_groups_in_hed_string(self, hed_string_obj): """ Report invalid groups at each level. 
@@ -103,26 +139,21 @@ def _check_for_duplicate_groups(self, original_group): self._check_for_duplicate_groups_recursive(sorted_group, validation_issues) return validation_issues - def _validate_tags_in_hed_string(self, hed_string_obj, check_for_warnings=False): - """ Report invalid the multi-tag properties. + def _validate_tags_in_hed_string(self, hed_string_obj): + """ Report invalid the multi-tag properties in a hed string, e.g. required tags.. Parameters: hed_string_obj (HedString): A HedString object. Returns: list: The issues associated with the tags in the HED string. Each issue is a dictionary. - - Notes: - - in a hed string, eg required tags. - - """ + """ validation_issues = [] tags = hed_string_obj.get_all_tags() - validation_issues += self._tag_validator.run_all_tags_validators(tags, check_for_warnings=check_for_warnings) + validation_issues += self._tag_validator.run_all_tags_validators(tags) return validation_issues - def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placeholders=False, - check_for_warnings=False): + def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placeholders=False): """ Validate individual tags in a HED string. Parameters: @@ -139,9 +170,15 @@ def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placehol for group in hed_string_obj.get_all_groups(): is_definition = group in all_def_groups for hed_tag in group.tags(): - validation_issues += \ - self._tag_validator.run_individual_tag_validators(hed_tag, allow_placeholders=allow_placeholders, - check_for_warnings=check_for_warnings, - is_definition=is_definition) + if hed_tag.expandable and not hed_tag.expanded: + for tag in hed_tag.expandable.get_all_tags(): + validation_issues += self._tag_validator. \ + run_individual_tag_validators(tag, allow_placeholders=allow_placeholders, + is_definition=is_definition) + else: + validation_issues += self._tag_validator. 
\ + run_individual_tag_validators(hed_tag, + allow_placeholders=allow_placeholders, + is_definition=is_definition) return validation_issues diff --git a/hed/models/onset_mapper.py b/hed/validator/onset_validator.py similarity index 76% rename from hed/models/onset_mapper.py rename to hed/validator/onset_validator.py index 842ff25a6..942f58efb 100644 --- a/hed/models/onset_mapper.py +++ b/hed/validator/onset_validator.py @@ -2,29 +2,24 @@ from hed.models.hed_group import HedGroup from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import OnsetErrors -from hed.models.hed_ops import HedOps -class OnsetMapper(HedOps): - """ HedOps responsible for matching onset/offset pairs. """ +class OnsetValidator: + """ Validates onset/offset pairs. """ - def __init__(self, def_mapper): - super().__init__() - self._def_mapper = def_mapper + def __init__(self, def_dict, run_full_onset_checks=True): + self._defs = def_dict self._onsets = {} + self._run_full_onset_checks = run_full_onset_checks - def check_for_onset_offset(self, hed_string_obj): - """ Check for onset or offset and track context. + def validate_onset_offset(self, hed_string_obj): + """ Validate onset/offset Parameters: - hed_string_obj (HedString): The hed string to check. Finds a maximum of one onset tag. + hed_string_obj (HedString): The hed string to check. Returns: list: A list of issues found in validating onsets (i.e., out of order onsets, unknown def names). - - Notes: - - Each issue in the return list is a dictionary. 
- """ onset_issues = [] for found_onset, found_group in self._find_onset_tags(hed_string_obj): @@ -82,28 +77,21 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag): placeholder = def_name[found_slash + 1:] def_name = def_name[:found_slash] - def_entry = self._def_mapper.get_def_entry(def_name) + def_entry = self._defs.get_def_entry(def_name) if def_entry is None: return ErrorHandler.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=def_tag) if bool(def_entry.takes_value) != bool(placeholder): return ErrorHandler.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=def_tag, has_placeholder=bool(def_entry.takes_value)) - if is_onset: - # onset can never fail as it implies an offset - self._onsets[full_def_name.lower()] = full_def_name - else: - if full_def_name.lower() not in self._onsets: - return ErrorHandler.format_error(OnsetErrors.OFFSET_BEFORE_ONSET, tag=def_tag) + if self._run_full_onset_checks: + if is_onset: + # onset can never fail as it implies an offset + self._onsets[full_def_name.lower()] = full_def_name else: - del self._onsets[full_def_name.lower()] - - return [] - - def __get_string_funcs__(self, **kwargs): - string_funcs = [] - string_funcs.append(self.check_for_onset_offset) - return string_funcs + if full_def_name.lower() not in self._onsets: + return ErrorHandler.format_error(OnsetErrors.OFFSET_BEFORE_ONSET, tag=def_tag) + else: + del self._onsets[full_def_name.lower()] - def __get_tag_funcs__(self, **kwargs): return [] diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py new file mode 100644 index 000000000..af12005b1 --- /dev/null +++ b/hed/validator/sidecar_validator.py @@ -0,0 +1,147 @@ +import copy +from hed.errors import ErrorHandler, ErrorContext, SidecarErrors +from hed.models import ColumnType +from hed import HedString +from hed import Sidecar +from hed.models.column_metadata import ColumnMetadata + + +class SidecarValidator: + reserved_column_names = ["HED"] + reserved_category_values = 
["n/a"] + + def __init__(self, hed_schema): + """ + Constructor for the HedValidator class. + + Parameters: + hed_schema (HedSchema): HED schema object to use for validation. + """ + self._schema = hed_schema + + def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None): + """Validate the input data using the schema + + Parameters: + sidecar (Sidecar): Input data to be validated. + extra_def_dicts(list or DefinitionDict): extra def dicts in addition to sidecar + name(str): The name to report this sidecar as + error_handler (ErrorHandler): Error context to use. Creates a new one if None + Returns: + issues (list of dict): A list of issues associated with each level in the HED string. + """ + from hed.validator import HedValidator + issues = [] + if error_handler is None: + error_handler = ErrorHandler() + + error_handler.push_error_context(ErrorContext.FILE_NAME, name) + sidecar_def_dict = sidecar.get_def_dict(hed_schema=self._schema, extra_def_dicts=extra_def_dicts) + hed_validator = HedValidator(self._schema, + def_dicts=sidecar_def_dict, + run_full_onset_checks=False) + + issues += self.validate_structure(sidecar, error_handler=error_handler) + issues += sidecar._extract_definition_issues + issues += sidecar_def_dict.issues + # todo: Add the definition validation. 
+ + for hed_string, column_data, position in sidecar.hed_string_iter(error_handler): + hed_string_obj = HedString(hed_string, hed_schema=self._schema, def_dict=sidecar_def_dict) + + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, + increment_depth_after=False) + new_issues = hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True) + if not new_issues: + new_issues = hed_validator.run_full_string_checks(hed_string_obj) + if not new_issues: + new_issues = self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type) + error_handler.add_context_and_filter(new_issues) + issues += new_issues + error_handler.pop_error_context() + + error_handler.pop_error_context() + return issues + + def validate_structure(self, sidecar, error_handler): + """ Validate the raw structure of this sidecar. + + Parameters: + sidecar(Sidecar): the sidecar to validate + error_handler(ErrorHandler): The error handler to use for error context + + Returns: + issues(list): A list of issues found with the structure + """ + all_validation_issues = [] + for column_name, dict_for_entry in sidecar.loaded_dict.items(): + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) + all_validation_issues += self._validate_column_structure(column_name, dict_for_entry, error_handler) + error_handler.pop_error_context() + return all_validation_issues + + def _validate_column_structure(self, column_name, dict_for_entry, error_handler): + """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar. + + Parameters: + error_handler (ErrorHandler) Sets the context for the error reporting. Cannot be None. + + Returns: + list: Issues in performing the operations. Each issue is a dictionary. 
+ + """ + val_issues = [] + if column_name in self.reserved_column_names: + val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED) + return val_issues + + column_type = Sidecar._detect_column_type(dict_for_entry=dict_for_entry) + if column_type is None: + val_issues += error_handler.format_error_with_context(SidecarErrors.UNKNOWN_COLUMN_TYPE, + column_name=column_name) + elif column_type == ColumnType.Categorical: + raw_hed_dict = dict_for_entry["HED"] + if not raw_hed_dict: + val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING) + if not isinstance(raw_hed_dict, dict): + val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE, + given_type=type(raw_hed_dict), + expected_type="dict") + for key_name, hed_string in raw_hed_dict.items(): + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + if not isinstance(hed_string, str): + val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE, + given_type=type(hed_string), + expected_type="str") + if not hed_string: + val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING) + if key_name in self.reserved_category_values: + val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_NA_USED, column_name) + error_handler.pop_error_context() + + return val_issues + + def _validate_pound_sign_count(self, hed_string, column_type): + """ Check if a given hed string in the column has the correct number of pound signs. + + Parameters: + hed_string (str or HedString): HED string to be checked. + + Returns: + list: Issues due to pound sign errors. Each issue is a dictionary. + + Notes: + Normally the number of # should be either 0 or 1, but sometimes will be higher due to the + presence of definition tags. + + """ + # Make a copy without definitions to check placeholder count. 
+ expected_count, error_type = ColumnMetadata.expected_pound_sign_count(column_type) + hed_string_copy = copy.deepcopy(hed_string) + hed_string_copy.remove_definitions() + hed_string_copy.shrink_defs() + + if hed_string_copy.lower().count("#") != expected_count: + return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#")) + + return [] diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py new file mode 100644 index 000000000..136b5aa73 --- /dev/null +++ b/hed/validator/spreadsheet_validator.py @@ -0,0 +1,114 @@ +import pandas as pd +from hed import BaseInput +from hed.errors import ErrorHandler, ValidationErrors, ErrorContext +from hed.models import ColumnType +from hed import HedString +from hed.models.hed_string_group import HedStringGroup + +PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " + + +class SpreadsheetValidator: + def __init__(self, hed_schema): + """ + Constructor for the HedValidator class. + + Parameters: + hed_schema (HedSchema): HED schema object to use for validation. + """ + self._schema = hed_schema + self._hed_validator = None + + def validate(self, data, def_dicts=None, name=None, error_handler=None): + """ + Validate the input data using the schema + + Parameters: + data (BaseInput or pd.DataFrame): Input data to be validated. + def_dicts(list of DefDict or DefDict): all definitions to use for validation + name(str): The name to report errors from this file as + error_handler (ErrorHandler): Error context to use. 
Creates a new one if None + Returns: + issues (list of dict): A list of issues for hed string + """ + from hed.validator import HedValidator + issues = [] + if error_handler is None: + error_handler = ErrorHandler() + + error_handler.push_error_context(ErrorContext.FILE_NAME, name) + self._hed_validator = HedValidator(self._schema, def_dicts=def_dicts) + # Check the structure of the input data, if it's a BaseInput + if isinstance(data, BaseInput): + issues += self._validate_column_structure(data, error_handler) + data = data.dataframe_a + + # Check the rows of the input data + issues += self._run_checks(data, error_handler) + error_handler.pop_error_context() + return issues + + def _run_checks(self, data, error_handler): + issues = [] + for row_number, text_file_row in enumerate(data.itertuples(index=False)): + error_handler.push_error_context(ErrorContext.ROW, row_number) + row_strings = [] + new_column_issues = [] + # todo: make this report the correct column numbers(somehow - it almost surely doesn't right now) + for column_number, cell in enumerate(text_file_row): + if not cell or cell == "n/a": + continue + + error_handler.push_error_context(ErrorContext.COLUMN, column_number) + + column_hed_string = HedString(cell) + row_strings.append(column_hed_string) + error_handler.push_error_context(ErrorContext.HED_STRING, column_hed_string, + increment_depth_after=False) + new_column_issues = self._hed_validator.run_basic_checks(column_hed_string, allow_placeholders=False) + + error_handler.add_context_and_filter(new_column_issues) + error_handler.pop_error_context() + error_handler.pop_error_context() + + issues += new_column_issues + if new_column_issues: + continue + else: + row_string = HedStringGroup(row_strings) + error_handler.push_error_context(ErrorContext.HED_STRING, row_string, increment_depth_after=False) + new_column_issues = self._hed_validator.run_full_string_checks(row_string) + + error_handler.add_context_and_filter(new_column_issues) + 
error_handler.pop_error_context() + issues += new_column_issues + error_handler.pop_error_context() + return issues + + def _validate_column_structure(self, base_input, error_handler): + """ + Validate that each column in the input data has valid values. + + Parameters: + base_input (BaseInput): The input data to be validated. + Returns: + List of issues associated with each invalid value. Each issue is a dictionary. + """ + issues = [] + col_issues = base_input._mapper.get_column_mapping_issues() + error_handler.add_context_and_filter(col_issues) + issues += col_issues + for column in base_input.column_metadata().values(): + if column.column_type == ColumnType.Categorical: + error_handler.push_error_context(ErrorContext.COLUMN, column.column_name) + valid_keys = column.hed_dict.keys() + for row_number, value in enumerate(base_input.dataframe[column.column_name]): + if value != "n/a" and value not in valid_keys: + error_handler.push_error_context(ErrorContext.ROW, row_number) + issues += error_handler.format_error_with_context(ValidationErrors.HED_SIDECAR_KEY_MISSING, + invalid_key=value, + category_keys=list(valid_keys)) + error_handler.pop_error_context() + error_handler.pop_error_context() + + return issues diff --git a/hed/validator/tag_validator.py b/hed/validator/tag_validator.py index 29b5c9f1b..2d08eae62 100644 --- a/hed/validator/tag_validator.py +++ b/hed/validator/tag_validator.py @@ -13,7 +13,7 @@ class TagValidator: """ Validation for individual HED tags. """ - CAMEL_CASE_EXPRESSION = r'([A-Z-]+\s*[a-z-]*)+' + CAMEL_CASE_EXPRESSION = r'([A-Z]+\s*[a-z-]*)+' INVALID_STRING_CHARS = '[]{}~' OPENING_GROUP_CHARACTER = '(' CLOSING_GROUP_CHARACTER = ')' @@ -24,21 +24,17 @@ class TagValidator: # Placeholder characters are checked elsewhere, but by default allowed TAG_ALLOWED_CHARS = "-_/" - def __init__(self, hed_schema=None, run_semantic_validation=True): + def __init__(self, hed_schema=None): """Constructor for the Tag_Validator class. 
Parameters: hed_schema (HedSchema): A HedSchema object. - run_semantic_validation (bool): True if the validator should check the HED data against a schema. Returns: TagValidator: A Tag_Validator object. """ self._hed_schema = hed_schema - self._run_semantic_validation = run_semantic_validation - if not self._hed_schema: - self._run_semantic_validation = False # Dict contains all the value portion validators for value class. e.g. "is this a number?" self._value_unit_validators = self._register_default_value_validators() @@ -67,13 +63,12 @@ def run_hed_string_validators(self, hed_string_obj): validation_issues += self.check_tag_formatting(tag) return validation_issues - def run_individual_tag_validators(self, original_tag, check_for_warnings, allow_placeholders=False, + def run_individual_tag_validators(self, original_tag, allow_placeholders=False, is_definition=False): """ Runs the hed_ops on the individual tags. Parameters: original_tag (HedTag): A original tag. - check_for_warnings (bool): If True, also check for warnings. allow_placeholders (bool): Allow value class or extensions to be placeholders rather than a specific value. is_definition (bool): This tag is part of a Definition, not a normal line. 
@@ -83,10 +78,10 @@ def run_individual_tag_validators(self, original_tag, check_for_warnings, allow_ """ validation_issues = [] validation_issues += self.check_tag_invalid_chars(original_tag, allow_placeholders) - if self._run_semantic_validation: - validation_issues += self.check_tag_exists_in_schema(original_tag, check_for_warnings) + if self._hed_schema: + validation_issues += self.check_tag_exists_in_schema(original_tag) if original_tag.is_unit_class_tag(): - validation_issues += self.check_tag_unit_class_units_are_valid(original_tag, check_for_warnings) + validation_issues += self.check_tag_unit_class_units_are_valid(original_tag) elif original_tag.is_value_class_tag(): validation_issues += self.check_tag_value_class_valid(original_tag) elif original_tag.extension_or_value_portion: @@ -95,8 +90,7 @@ def run_individual_tag_validators(self, original_tag, check_for_warnings, allow_ if not allow_placeholders: validation_issues += self.check_for_placeholder(original_tag, is_definition) validation_issues += self.check_tag_requires_child(original_tag) - if check_for_warnings: - validation_issues += self.check_capitalization(original_tag) + validation_issues += self.check_capitalization(original_tag) return validation_issues def run_tag_level_validators(self, original_tag_list, is_top_level, is_group): @@ -119,12 +113,11 @@ def run_tag_level_validators(self, original_tag_list, is_top_level, is_group): validation_issues += self.check_tag_level_issue(original_tag_list, is_top_level, is_group) return validation_issues - def run_all_tags_validators(self, tags, check_for_warnings): + def run_all_tags_validators(self, tags): """ Validate the multi-tag properties in a hed string. Parameters: tags (list): A list containing the HedTags in a HED string. - check_for_warnings (bool): If True, also check for warnings. Returns: list: The validation issues associated with the tags in a HED string. Each issue is a dictionary. 
@@ -134,9 +127,8 @@ def run_all_tags_validators(self, tags, check_for_warnings): """ validation_issues = [] - if self._run_semantic_validation: - if check_for_warnings: - validation_issues += self.check_for_required_tags(tags) + if self._hed_schema: + validation_issues += self.check_for_required_tags(tags) validation_issues += self.check_multiple_unique_tags_exist(tags) return validation_issues @@ -210,6 +202,9 @@ def check_delimiter_issues_in_hed_string(self, hed_string): current_tag = '' else: issues += ErrorHandler.format_error(ValidationErrors.HED_COMMA_MISSING, tag=current_tag) + elif last_non_empty_valid_character == "," and current_character == self.CLOSING_GROUP_CHARACTER: + issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_EMPTY, source_string=hed_string, + char_index=i) elif TagValidator._comma_is_missing_after_closing_parentheses(last_non_empty_valid_character, current_character): issues += ErrorHandler.format_error(ValidationErrors.HED_COMMA_MISSING, tag=current_tag[:-1]) @@ -252,19 +247,20 @@ def check_tag_invalid_chars(self, original_tag, allow_placeholders): Returns: list: Validation issues. Each issue is a dictionary. """ + validation_issues = self._check_invalid_prefix_issues(original_tag) allowed_chars = self.TAG_ALLOWED_CHARS if not self._hed_schema or not self._hed_schema.is_hed3_schema: allowed_chars += " " if allow_placeholders: allowed_chars += "#" - return self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag) + validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag) + return validation_issues - def check_tag_exists_in_schema(self, original_tag, check_for_warnings=False): + def check_tag_exists_in_schema(self, original_tag): """ Report invalid tag or doesn't take a value. Parameters: original_tag (HedTag): The original tag that is used to report the error. - check_for_warnings (bool): If True, also check for warnings. Returns: list: Validation issues. 
Each issue is a dictionary. @@ -276,18 +272,17 @@ def check_tag_exists_in_schema(self, original_tag, check_for_warnings=False): is_extension_tag = original_tag.is_extension_allowed_tag() if not is_extension_tag: validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_EXTENSION, tag=original_tag) - elif check_for_warnings: + else: validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_EXTENDED, tag=original_tag, index_in_tag=len(original_tag.org_base_tag), index_in_tag_end=None) return validation_issues - def check_tag_unit_class_units_are_valid(self, original_tag, check_for_warnings): + def check_tag_unit_class_units_are_valid(self, original_tag): """ Report incorrect unit class or units. Parameters: original_tag (HedTag): The original tag that is used to report the error. - check_for_warnings (bool): Indicates whether to check for warnings. Returns: list: Validation issues. Each issue is a dictionary. @@ -297,13 +292,12 @@ def check_tag_unit_class_units_are_valid(self, original_tag, check_for_warnings) stripped_value, unit = original_tag.get_stripped_unit_value() if not unit: if self._validate_value_class_portion(original_tag, stripped_value): - if check_for_warnings: - # only suggest a unit is missing if this is a valid number - if tag_validator_util.validate_numeric_value_class(stripped_value): - default_unit = original_tag.get_unit_class_default_unit() - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_UNITS_DEFAULT_USED, - tag=original_tag, - default_unit=default_unit) + # only suggest a unit is missing if this is a valid number + if tag_validator_util.validate_numeric_value_class(stripped_value): + default_unit = original_tag.get_unit_class_default_unit() + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_UNITS_DEFAULT_USED, + tag=original_tag, + default_unit=default_unit) else: tag_unit_class_units = original_tag.get_tag_unit_class_units() if tag_unit_class_units: @@ -412,24 +406,23 @@ 
def check_tag_level_issue(self, original_tag_list, is_top_level, is_group): - Top-level groups can contain definitions, Onset, etc tags. """ validation_issues = [] - if self._run_semantic_validation: - top_level_tags = [tag for tag in original_tag_list if - tag.base_tag_has_attribute(HedKey.TopLevelTagGroup)] - tag_group_tags = [tag for tag in original_tag_list if - tag.base_tag_has_attribute(HedKey.TagGroup)] - for tag_group_tag in tag_group_tags: - if not is_group: - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_GROUP_TAG, - tag=tag_group_tag) - for top_level_tag in top_level_tags: - if not is_top_level: - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, - tag=top_level_tag) - - if is_top_level and len(top_level_tags) > 1: - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, - tag=top_level_tags[0], - multiple_tags=top_level_tags[1:]) + top_level_tags = [tag for tag in original_tag_list if + tag.base_tag_has_attribute(HedKey.TopLevelTagGroup)] + tag_group_tags = [tag for tag in original_tag_list if + tag.base_tag_has_attribute(HedKey.TagGroup)] + for tag_group_tag in tag_group_tags: + if not is_group: + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_GROUP_TAG, + tag=tag_group_tag) + for top_level_tag in top_level_tags: + if not is_top_level: + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, + tag=top_level_tag) + + if is_top_level and len(top_level_tags) > 1: + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, + tag=top_level_tags[0], + multiple_tags=top_level_tags[1:]) return validation_issues @@ -475,6 +468,15 @@ def check_multiple_unique_tags_exist(self, tags): # ========================================================================== # Private utility functions # =========================================================================+ + def 
_check_invalid_prefix_issues(self, original_tag): + """Check for invalid schema prefix.""" + issues = [] + schema_prefix = original_tag.schema_prefix + if schema_prefix and not schema_prefix[:-1].isalpha(): + issues += ErrorHandler.format_error(ValidationErrors.TAG_PREFIX_INVALID, + tag=original_tag, tag_prefix=schema_prefix) + return issues + def _validate_value_class_portion(self, original_tag, portion_to_validate): if portion_to_validate is None: return False diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index f43bc9c86..9c80d4d98 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -1,13 +1,18 @@ import os -import json import unittest -from hed.models import DefinitionDict, DefMapper, OnsetMapper -from hed.models.hed_ops import apply_ops -from hed import load_schema_version -from hed import HedValidator +from hed.models import DefinitionDict + +from hed import load_schema_version, HedString +from hed.validator import HedValidator from hed import Sidecar import io import json +from hed import HedFileError +from hed.errors import ErrorHandler, get_printable_issue_string + + + +skip_tests = ["VERSION_DEPRECATED", "CHARACTER_INVALID", "STYLE_WARNING"] class MyTestCase(unittest.TestCase): @@ -17,94 +22,79 @@ def setUpClass(cls): 'hed-specification/docs/source/_static/data/error_tests')) cls.test_files = [os.path.join(test_dir, f) for f in os.listdir(test_dir) if os.path.isfile(os.path.join(test_dir, f))] - cls.fail_count = 0 + cls.fail_count = [] cls.default_sidecar = Sidecar(os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_sidecar.json'))) - def run_single_test(self, test_file): with open(test_file, "r") as fp: test_info = json.load(fp) for info in test_info: error_code = info['error_code'] - if error_code == "VERSION_DEPRECATED": - print("Skipping VERSION_DEPRECATED test") + if error_code in skip_tests: + print(f"Skipping {error_code} test") continue name = info.get('name', '') description = 
info['description'] schema = info['schema'] + check_for_warnings = info.get("warning", False) + error_handler = ErrorHandler(check_for_warnings) if schema: schema = load_schema_version(schema) else: - schema = None + raise ValueError("Tests always require a schema now") definitions = info['definitions'] - def_dict = DefinitionDict() - _, issues = apply_ops(definitions, [schema, def_dict]) - self.assertFalse(issues) - validator = HedValidator(schema) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) + def_dict = DefinitionDict(definitions, schema) + self.assertFalse(def_dict.issues) for section_name, section in info["tests"].items(): if section_name == "string_tests": - self._run_single_string_test(section, validator, def_mapper, - onset_mapper, error_code, description, name) - elif section_name == "sidecar_tests": - self._run_single_sidecar_test(section, validator, def_mapper, onset_mapper, error_code, description, - name) - elif section_name == "event_tests": - self._run_single_events_test(section, validator, def_mapper, onset_mapper, error_code, description, - name) - - def _run_single_string_test(self, info, validator, def_mapper, onset_mapper, error_code, description, - name): + self._run_single_string_test(section, schema, def_dict, error_code, description, name, error_handler) + if section_name == "sidecar_tests": + self._run_single_sidecar_test(section, schema, def_dict, error_code, description, name, error_handler) + if section_name == "event_tests": + self._run_single_events_test(section, schema, def_dict, error_code, description, name, error_handler) + if section_name == "combo_tests": + self._run_single_combo_test(section, schema, def_dict, error_code, description, name, error_handler) + + def report_result(self, expected_result, issues, error_code, description, name, test, test_type): + if expected_result == "fails": + if not issues: + print(f"{error_code}: {description}") + print(f"Passed '{test_type}' (which should fail) 
'{name}': {test}") + print(get_printable_issue_string(issues)) + self.fail_count.append(name) + else: + if issues: + print(f"{error_code}: {description}") + print(f"Failed '{test_type}' test '{name}': {test}") + print(get_printable_issue_string(issues)) + self.fail_count.append(name) + + def _run_single_string_test(self, info, schema, def_dict, error_code, description, name, error_handler): + string_validator = HedValidator(hed_schema=schema, def_dicts=def_dict, run_full_onset_checks=False) for result, tests in info.items(): for test in tests: - modified_test, issues = apply_ops(test, [validator, def_mapper, onset_mapper], check_for_warnings=True, - expand_defs=True) - if modified_test and modified_test != test: - _, def_expand_issues = apply_ops(modified_test, validator, check_for_warnings=True) - issues += def_expand_issues - if result == "fails": - if not issues: - print(f"{error_code}: {description}") - print(f"Passed this test(that should fail) '{name}': {test}") - print(issues) - self.fail_count += 1 - else: - if issues: - print(f"{error_code}: {description}") - print(f"Failed this test {name}: {test}") - print(issues) - - self.fail_count += 1 - - def _run_single_sidecar_test(self, info, validator, def_mapper, onset_mapper, error_code, description, - name): - for result, tests in info.items(): + test_string = HedString(test, schema) + + # This expand should not be required here. 
+ def_dict.expand_def_tags(test_string) + + issues = string_validator.run_basic_checks(test_string, False) + issues += string_validator.run_full_string_checks(test_string) + error_handler.add_context_and_filter(issues) + self.report_result(result, issues, error_code, description, name, test, "string_test") + def _run_single_sidecar_test(self, info, schema, def_dict, error_code, description, name, error_handler): + for result, tests in info.items(): for test in tests: # Well this is a disaster buffer = io.BytesIO(json.dumps(test).encode("utf-8")) sidecar = Sidecar(buffer) - issues = sidecar.validate_entries([validator, def_mapper, onset_mapper], check_for_warnings=True) - if result == "fails": - if not issues: - print(f"{error_code}: {description}") - print(f"Passed this test(that should fail) '{name}': {test}") - print(issues) - self.fail_count += 1 - else: - if issues: - print(f"{error_code}: {description}") - print(f"Failed this test {name}: {test}") - print(issues) - - self.fail_count += 1 - - def _run_single_events_test(self, info, validator, def_mapper, onset_mapper, error_code, description, - name): + issues = sidecar.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + self.report_result(result, issues, error_code, description, name, test, "sidecar_test") + + def _run_single_events_test(self, info, schema, def_dict, error_code, description,name, error_handler): from hed import TabularInput for result, tests in info.items(): - for test in tests: string = "" for row in test: @@ -120,26 +110,48 @@ def _run_single_events_test(self, info, validator, def_mapper, onset_mapper, err file_obj = io.BytesIO(string.encode("utf-8")) file = TabularInput(file_obj, sidecar=self.default_sidecar) - issues = file.validate_file([validator, def_mapper, onset_mapper], check_for_warnings=True) - if result == "fails": - if not issues: - print(f"{error_code}: {description}") - print(f"Passed this test(that should fail) '{name}': {test}") - 
print(issues) - self.fail_count += 1 - else: - if issues: - print(f"{error_code}: {description}") - print(f"Failed this test {name}: {test}") - print(issues) - - self.fail_count += 1 - - def test_summary(self): + issues = file.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + self.report_result(result, issues, error_code, description, name, test, "events_test") + + def _run_single_combo_test(self, info, schema, def_dict, error_code, description,name, error_handler): + from hed import TabularInput + for result, tests in info.items(): + for test in tests: + buffer = io.BytesIO(json.dumps(test['sidecar']).encode("utf-8")) + sidecar = Sidecar(buffer) + sidecar.loaded_dict.update(self.default_sidecar.loaded_dict) + issues = sidecar.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + string = "" + try: + for row in test['events']: + if not isinstance(row, list): + print(f"Improper grouping in test: {error_code}:{name}") + print(f"Improper data for test {name}: {test}") + print(f"This is probably a missing set of square brackets.") + break + string += "\t".join(str(x) for x in row) + "\n" + + if not string: + print(F"Invalid blank events found in test: {error_code}:{name}") + continue + file_obj = io.BytesIO(string.encode("utf-8")) + + file = TabularInput(file_obj, sidecar=sidecar) + except HedFileError: + print(f"{error_code}: {description}") + print(f"Improper data for test {name}: {test}") + print(f"This is probably a missing set of square brackets.") + continue + issues += file.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + self.report_result(result, issues, error_code, description, name, test, "combo_tests") + + def test_errors(self): for test_file in self.test_files: self.run_single_test(test_file) - print(f"{self.fail_count} tests got an unexpected result") - self.assertEqual(self.fail_count, 0) + print(f"{len(self.fail_count)} tests got an unexpected result") 
+ print("\n".join(self.fail_count)) + self.assertEqual(len(self.fail_count), 0) if __name__ == '__main__': unittest.main() + diff --git a/tests/data/model_tests/na_tag_column.tsv b/tests/data/model_tests/na_tag_column.tsv new file mode 100644 index 000000000..d42bbb34b --- /dev/null +++ b/tests/data/model_tests/na_tag_column.tsv @@ -0,0 +1,2 @@ +Geometric-object Event +Square diff --git a/tests/data/model_tests/na_value_column.json b/tests/data/model_tests/na_value_column.json new file mode 100644 index 000000000..72a1d0af7 --- /dev/null +++ b/tests/data/model_tests/na_value_column.json @@ -0,0 +1,5 @@ +{ + "Value": { + "HED": "Description/#" + } +} \ No newline at end of file diff --git a/tests/data/model_tests/na_value_column.tsv b/tests/data/model_tests/na_value_column.tsv new file mode 100644 index 000000000..91d00351e --- /dev/null +++ b/tests/data/model_tests/na_value_column.tsv @@ -0,0 +1,3 @@ +HED Value +Geometric-object 1 +Square n/a diff --git a/tests/data/model_tests/no_column_header_definition.tsv b/tests/data/model_tests/no_column_header_definition.tsv index 27c89d11c..418391ef9 100644 --- a/tests/data/model_tests/no_column_header_definition.tsv +++ b/tests/data/model_tests/no_column_header_definition.tsv @@ -1,2 +1,2 @@ -Geometric-object Event, (Definition/DefTest1, (Circle)) -Square Item, Def/DefTest1 +Geometric-object Event +Circle Item,Def/DefTest1 diff --git a/tests/data/model_tests/no_column_header_definition_long.tsv b/tests/data/model_tests/no_column_header_definition_long.tsv index c58990c03..835457f00 100644 --- a/tests/data/model_tests/no_column_header_definition_long.tsv +++ b/tests/data/model_tests/no_column_header_definition_long.tsv @@ -1,2 +1,2 @@ -Item/Object/Geometric-object Event,(Property/Organizational-property/Definition/DefTest1,(InvalidDefTag)) -Item/Object/Geometric-object/2D-shape/Circle Item,Property/Organizational-property/Def/DefTest1 +Item/Object/Geometric-object Event +Item/Object/Geometric-object/2D-shape/Ellipse/Circle 
Item,Property/Organizational-property/Def/DefTest1 diff --git a/tests/data/validator_tests/bids_events_HED.json b/tests/data/validator_tests/bids_events_HED.json index 8cb2d6ba4..4158d47ec 100644 --- a/tests/data/validator_tests/bids_events_HED.json +++ b/tests/data/validator_tests/bids_events_HED.json @@ -8,8 +8,7 @@ "Units": "s" }, "HED": { - "Description": "This is a column to verity the often reserved HED name causes no issues.", + "Description": "This is a column to verity the often reserved HED name does cause issues.", "Units": "s" } - } \ No newline at end of file diff --git a/tests/models/test_base_file_input.py b/tests/models/test_base_file_input.py index 97efc8316..8314072bd 100644 --- a/tests/models/test_base_file_input.py +++ b/tests/models/test_base_file_input.py @@ -3,7 +3,6 @@ import shutil from hed import Sidecar from hed import BaseInput, TabularInput -from hed.models.def_mapper import DefMapper from hed.models.column_mapper import ColumnMapper from hed.models import DefinitionDict from hed import schema @@ -40,32 +39,20 @@ def setUpClass(cls): sidecar1 = Sidecar(json_path, name='face_sub1_json') mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, - name="face_sub1_events", mapper=mapper1, - definition_columns=['HED'], allow_blank_names=False) + name="face_sub1_events", mapper=mapper1, allow_blank_names=False) cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") @classmethod def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) - def test_get_definitions(self): - defs1 = self.input_data1.get_definitions(as_strings=True) - self.assertIsInstance(defs1, dict, "get_definitions returns dictionary when as strings") - self.assertEqual(len(defs1), 17, "get_definitions should have the right number of definitions") - - defs2 = self.input_data1.get_definitions() - 
self.assertIsInstance(defs2, DefMapper, "get_definitions returns a DefMapper by default") - - defs3 = self.input_data2.get_definitions(as_strings=False) - self.assertIsInstance(defs3, DefMapper, "get_definitions returns a DefMapper when not as strings") - def test_gathered_defs(self): # todo: add unit tests for definitions in tsv file - defs = DefinitionDict.get_as_strings(self.tabular_file.def_dict) + defs = DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) expected_defs = { 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', - 'jsonfiledef3': '(Item/JsonDef3/#,InvalidTag)', + 'jsonfiledef3': '(Item/JsonDef3/#)', 'takesvaluedef': '(Age/#)', 'valueclassdef': '(Acceleration/#)' } diff --git a/tests/models/test_column_mapper.py b/tests/models/test_column_mapper.py index c2eeea109..78a6b99a9 100644 --- a/tests/models/test_column_mapper.py +++ b/tests/models/test_column_mapper.py @@ -1,8 +1,7 @@ import unittest import os -from hed.models import ColumnMapper, ColumnType, ColumnMetadata, HedString, model_constants -from hed.schema import load_schema +from hed.models import ColumnMapper, ColumnType, HedString from hed.models.sidecar import Sidecar @@ -44,11 +43,6 @@ def setUpClass(cls): cls.short_tag_partial_prefix = 'Language-item/Character/' cls.short_tag_partial_prefix2 = 'Character/' - def test_set_column_prefix_dict(self): - mapper = ColumnMapper() - mapper.set_column_prefix_dict(self.column_prefix_dictionary, True) - self.assertTrue(len(mapper._final_column_map) == 3) - def test_set_tag_columns(self): mapper = ColumnMapper() mapper.set_tag_columns(self.zero_based_tag_columns, finalize_mapping=True) @@ -112,94 +106,12 @@ def test_set_column_map(self): mapper.set_column_map(self.test_column_map) self.assertTrue(len(mapper._final_column_map) >= 1) - def test__set_column_prefix(self): - mapper = ColumnMapper() - mapper._set_column_prefix(mapper._final_column_map, 
self.add_column_number, self.required_prefix) - self.assertTrue(len(mapper._final_column_map) >= 1) - - mapper = ColumnMapper() - with self.assertRaises(TypeError): - mapper._set_column_prefix(mapper._final_column_map, self.add_column_name, self.required_prefix) - def test__finalize_mapping(self): mapper = ColumnMapper() mapper.add_columns([self.add_column_number], ColumnType.Value) mapper._finalize_mapping() self.assertTrue(len(mapper._final_column_map) >= 1) - def test_expand_column(self): - mapper = ColumnMapper() - mapper._set_sidecar(Sidecar(self.basic_events_json)) - mapper.set_column_map(self.basic_column_map) - expanded_column = mapper._expand_column(2, "go") - self.assertTrue(isinstance(expanded_column[0], HedString)) - - def test_expand_row_tags(self): - mapper = ColumnMapper() - mapper._set_sidecar(Sidecar(self.basic_events_json)) - mapper.add_columns(self.basic_hed_tags_column) - mapper.set_column_map(self.basic_column_map) - expanded_row = mapper.expand_row_tags(self.basic_event_row) - self.assertTrue(isinstance(expanded_row, dict)) - self.assertTrue(0 in expanded_row[model_constants.COLUMN_TO_HED_TAGS]) - - def test_expansion_issues(self): - mapper = ColumnMapper() - mapper._set_sidecar(Sidecar(self.basic_events_json)) - mapper.add_columns(self.basic_hed_tags_column) - mapper.set_column_map(self.basic_column_map) - expanded_row = mapper.expand_row_tags(self.basic_event_row_invalid) - column_issues = expanded_row[model_constants.COLUMN_ISSUES][2] - self.assertEqual(len(column_issues), 1) - self.assertTrue(0 in expanded_row[model_constants.COLUMN_TO_HED_TAGS]) - - def test_remove_prefix_if_needed(self): - mapper = ColumnMapper() - mapper.set_column_prefix_dict({self.add_column_number: self.required_prefix}) - remove_prefix_func = mapper.get_prefix_remove_func(self.add_column_number) - test_string_obj = HedString(self.complex_hed_tag_required_prefix) - no_prefix_string = test_string_obj.get_as_form("org_tag", remove_prefix_func) - 
self.assertEqual(str(no_prefix_string), str(self.complex_hed_tag_no_prefix)) - - def test__prepend_prefix_to_required_tag_column_if_needed(self): - category_tags = HedString('Participant response, Stimulus') - ColumnMetadata._prepend_required_prefix(category_tags, self.category_key) - self.assertIsInstance(category_tags, HedString) - self.assertEqual(str(category_tags), str(self.category_participant_and_stimulus_tags)) - - # Verify reading/writing a short tag to a file column with a name_prefix works - def test_add_prefix_verify_short_tag_conversion(self): - schema_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), self.schema_file) - hed_schema = load_schema(schema_file) - hed_string_obj = HedString(self.short_tag_with_missing_prefix) - ColumnMetadata._prepend_required_prefix(hed_string_obj, self.short_tag_key) - issues = hed_string_obj.convert_to_canonical_forms(hed_schema) - self.assertFalse(issues) - for tag in hed_string_obj.get_all_tags(): - self.assertEqual("Character/D", tag.short_tag) - - def test_add_prefix_verify_short_tag_read(self): - column_mapper = ColumnMapper(column_prefix_dictionary={0: self.short_tag_key}) - test_strings = { - 'test_no_prefix': self.short_tag_with_missing_prefix, - 'test_full_prefix': self.short_tag_key + self.short_tag_with_missing_prefix, - 'test_partial_prefix1': self.short_tag_partial_prefix + self.short_tag_with_missing_prefix, - 'test_partial_prefix2': self.short_tag_partial_prefix2 + self.short_tag_with_missing_prefix, - } - expected_results = { - 'test_no_prefix': self.short_tag_key + self.short_tag_with_missing_prefix, - 'test_full_prefix': self.short_tag_key + self.short_tag_with_missing_prefix, - 'test_partial_prefix1': self.short_tag_partial_prefix + self.short_tag_with_missing_prefix, - 'test_partial_prefix2': self.short_tag_partial_prefix2 + self.short_tag_with_missing_prefix, - } - - for test_key in test_strings: - test_string = test_strings[test_key] - expected_result = expected_results[test_key] - - 
expanded_row = column_mapper.expand_row_tags([test_string]) - prepended_hed_string = expanded_row[model_constants.COLUMN_TO_HED_TAGS][0] - self.assertEqual(expected_result, str(prepended_hed_string)) if __name__ == '__main__': diff --git a/tests/models/test_def_mapper.py b/tests/models/test_def_mapper.py deleted file mode 100644 index 4f38c88da..000000000 --- a/tests/models/test_def_mapper.py +++ /dev/null @@ -1,292 +0,0 @@ -import unittest -import os - -from hed import schema -from hed.models import DefinitionDict, DefMapper, HedString -from hed.validator import HedValidator -from hed.errors import ErrorHandler, ErrorContext - - -class Test(unittest.TestCase): - basic_hed_string_with_def_first_paren = None - - @classmethod - def setUpClass(cls): - cls.base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') - hed_xml_file = os.path.realpath(os.path.join(cls.base_data_dir, "schema_tests/HED8.0.0t.xml")) - cls.hed_schema = schema.load_schema(hed_xml_file) - cls.def_contents_string = "(Item/TestDef1,Item/TestDef2)" - cls.basic_definition_string = f"(Definition/TestDef,{cls.def_contents_string})" - cls.basic_definition_string_no_paren = f"Definition/TestDef,{cls.def_contents_string}" - cls.label_def_string = "Def/TestDef" - cls.expanded_def_string = f"(Def-expand/TestDef,{cls.def_contents_string})" - cls.basic_hed_string = "Item/BasicTestTag1,Item/BasicTestTag2" - cls.basic_hed_string_with_def = f"{cls.basic_hed_string},{cls.label_def_string}" - cls.basic_hed_string_with_def_first = f"{cls.label_def_string},{cls.basic_hed_string}" - cls.basic_hed_string_with_def_first_paren = f"({cls.label_def_string},{cls.basic_hed_string})" - cls.placeholder_label_def_string = "Def/TestDefPlaceholder/2471" - cls.placeholder_definition_contents = "(Item/TestDef1/#,Item/TestDef2)" - cls.placeholder_definition_string = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents})" - cls.placeholder_definition_string_no_paren = \ - 
f"Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents}" - cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2))" - - cls.placeholder_hed_string_with_def = f"{cls.basic_hed_string},{cls.placeholder_label_def_string}" - cls.placeholder_hed_string_with_def_first = f"{cls.placeholder_label_def_string},{cls.basic_hed_string}" - cls.placeholder_hed_string_with_def_first_paren = f"({cls.placeholder_label_def_string},{cls.basic_hed_string})" - - cls.valid_definition_strings = { - 'str_no_defs': False, - 'str2': True, - 'str3': False, - 'str4': False, - 'str5': False, - 'str6': False, - 'str7': False, - } - cls.mark_all_as_valid_strings = { - 'str_no_defs': False, - 'str2': False, - 'str3': False, - 'str4': False, - 'str5': False, - 'str6': False, - 'str7': False, - } - - def base_def_validator(self, test_strings, result_strings, valid_strings, expand_defs, shrink_defs, - remove_definitions, extra_ops=None, - basic_definition_string=None): - if not basic_definition_string: - basic_definition_string = self.basic_definition_string - def_dict = DefinitionDict() - def_string = HedString(basic_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - - def_mapper = DefMapper(def_dict) - hed_ops = [] - if extra_ops: - hed_ops += extra_ops - hed_ops.append(def_mapper) - - for key in test_strings: - string, expected_result, invalid = test_strings[key], result_strings[key], valid_strings[key] - test_string = HedString(string) - def_issues = test_string.validate(hed_ops, expand_defs=expand_defs, shrink_defs=shrink_defs, - remove_definitions=remove_definitions) - self.assertEqual(invalid, bool(def_issues)) - self.assertEqual(test_string.get_as_short(), expected_result) - - def test_expand_def_tags(self): - basic_def_strings = { - 'str_no_defs': self.basic_definition_string, - 'str2': self.basic_definition_string_no_paren, - 'str3': self.basic_hed_string + "," 
+ self.basic_definition_string, - 'str4': self.basic_definition_string + "," + self.basic_hed_string, - 'str5': self.basic_hed_string_with_def, - 'str6': self.basic_hed_string_with_def_first, - 'str7': self.basic_hed_string_with_def_first_paren, - } - expanded_def_strings = { - 'str_no_defs': "", - 'str2': self.basic_definition_string_no_paren, - 'str3': self.basic_hed_string, - 'str4': self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.expanded_def_string, - 'str6': self.expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.expanded_def_string + "," + self.basic_hed_string + ")" - } - expanded_def_strings_with_definition = { - 'str_no_defs': self.basic_definition_string, - 'str2': self.basic_definition_string_no_paren, - 'str3': self.basic_hed_string + "," + self.basic_definition_string, - 'str4': self.basic_definition_string + "," + self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.expanded_def_string, - 'str6': self.expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.expanded_def_string + "," + self.basic_hed_string + ")" - } - - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, - self.mark_all_as_valid_strings, expand_defs=True, - shrink_defs=False, remove_definitions=False) - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=False, remove_definitions=False) - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=True, remove_definitions=False) - self.base_def_validator(expanded_def_strings_with_definition, basic_def_strings, - self.mark_all_as_valid_strings, expand_defs=False, shrink_defs=True, - remove_definitions=False) - self.base_def_validator(expanded_def_strings_with_definition, expanded_def_strings_with_definition, - self.mark_all_as_valid_strings, expand_defs=True, shrink_defs=False, - 
remove_definitions=False) - self.base_def_validator(basic_def_strings, expanded_def_strings, self.mark_all_as_valid_strings, - expand_defs=True, shrink_defs=False, remove_definitions=True) - - validator = HedValidator(self.hed_schema) - extra_ops = [validator] - - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, - self.valid_definition_strings, expand_defs=True, shrink_defs=False, - extra_ops=extra_ops, remove_definitions=False) - - # special case test - def test_changing_tag_then_def_mapping(self): - def_dict = DefinitionDict() - def_string = HedString(self.basic_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - validator = HedValidator(self.hed_schema) - hed_ops = [validator, def_mapper] - - test_string = HedString(self.label_def_string) - tag = test_string.children[0] - tag.tag = "Organizational-property/" + str(tag) - def_issues = test_string.validate(hed_ops, expand_defs=True) - self.assertFalse(def_issues) - self.assertEqual(test_string.get_as_short(), f"{self.expanded_def_string}") - - test_string = HedString(self.label_def_string) - tag = test_string.children[0] - tag.tag = "Organizational-property22/" + str(tag) - def_issues = test_string.validate(hed_ops, expand_defs=True) - self.assertTrue(def_issues) - - def test_expand_def_tags_placeholder(self): - basic_def_strings = { - 'str_no_defs': self.placeholder_definition_string, - 'str2': self.placeholder_definition_string_no_paren, - 'str3': self.basic_hed_string + "," + self.placeholder_definition_string, - 'str4': self.placeholder_definition_string + "," + self.basic_hed_string, - 'str5': self.placeholder_hed_string_with_def, - 'str6': self.placeholder_hed_string_with_def_first, - 'str7': self.placeholder_hed_string_with_def_first_paren, - } - expanded_def_strings = { - 'str_no_defs': "", - 'str2': self.placeholder_definition_string_no_paren, - 'str3': self.basic_hed_string, - 
'str4': self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.placeholder_expanded_def_string, - 'str6': self.placeholder_expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.placeholder_expanded_def_string + "," + self.basic_hed_string + ")", - } - expanded_def_strings_with_definition = { - 'str_no_defs': self.placeholder_definition_string, - 'str2': self.placeholder_definition_string_no_paren, - 'str3': self.basic_hed_string + "," + self.placeholder_definition_string, - 'str4': self.placeholder_definition_string + "," + self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.placeholder_expanded_def_string, - 'str6': self.placeholder_expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.placeholder_expanded_def_string + "," + self.basic_hed_string + ")", - } - - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, self.mark_all_as_valid_strings, - expand_defs=True, shrink_defs=False, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=False, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=True, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(expanded_def_strings_with_definition, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=True, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(basic_def_strings, expanded_def_strings, self.mark_all_as_valid_strings, - expand_defs=True, shrink_defs=False, - remove_definitions=True, basic_definition_string=self.placeholder_definition_string) 
- - validator = HedValidator(self.hed_schema) - extra_ops = [validator] - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, self.valid_definition_strings, - expand_defs=True, shrink_defs=False, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string, - extra_ops=extra_ops) - - def test_expand_def_tags_placeholder_invalid(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - placeholder_label_def_string_no_placeholder = "def/TestDefPlaceholder" - - test_string = HedString(placeholder_label_def_string_no_placeholder) - test_string.convert_to_canonical_forms(None) - def_issues = def_mapper.expand_def_tags(test_string) - self.assertEqual(str(test_string), placeholder_label_def_string_no_placeholder) - self.assertTrue(def_issues) - - def_dict = DefinitionDict() - def_string = HedString(self.basic_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - label_def_string_has_invalid_placeholder = "def/TestDef/54687" - - test_string = HedString(label_def_string_has_invalid_placeholder) - test_string.convert_to_canonical_forms(None) - def_issues = def_mapper.expand_def_tags(test_string) - self.assertEqual(str(test_string), label_def_string_has_invalid_placeholder) - self.assertTrue(def_issues) - - def test_bad_def_expand(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - valid_placeholder = HedString(self.placeholder_expanded_def_string) - def_issues = valid_placeholder.validate(def_mapper) - self.assertFalse(def_issues) - - invalid_placeholder = 
HedString("(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/21,Item/TestDef2))") - def_issues = invalid_placeholder.validate(def_mapper) - self.assertTrue(bool(def_issues)) - - def test_def_no_content(self): - def_dict = DefinitionDict() - def_string = HedString("(Definition/EmptyDef)") - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - valid_empty = HedString("Def/EmptyDef") - def_issues = valid_empty.validate(def_mapper, expand_defs=True) - self.assertEqual(str(valid_empty), "(Def-expand/EmptyDef)") - self.assertFalse(def_issues) - - valid_empty = HedString("Def/EmptyDef") - def_issues = valid_empty.validate(def_mapper, expand_defs=False) - self.assertFalse(def_issues) - - def test_duplicate_def(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.convert_to_canonical_forms(None) - error_handler = ErrorHandler() - error_handler.push_error_context(ErrorContext.ROW, 5) - def_dict.check_for_definitions(def_string, error_handler=error_handler) - def_mapper = DefMapper([]) - self.assertEqual(len(def_mapper.issues), 0) - - def_mapper = DefMapper([def_dict, def_dict]) - self.assertEqual(len(def_mapper.issues), 1) - self.assertTrue('ec_row' in def_mapper.issues[0]) - - def_mapper = DefMapper([def_dict, def_dict, def_dict]) - self.assertEqual(len(def_mapper.issues), 2) - self.assertTrue('ec_row' in def_mapper.issues[0]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/models/test_definition_dict.py b/tests/models/test_definition_dict.py index a463e60a0..ee03122aa 100644 --- a/tests/models/test_definition_dict.py +++ b/tests/models/test_definition_dict.py @@ -3,14 +3,18 @@ from hed.errors import ErrorHandler, DefinitionErrors from hed.models.hed_string import HedString from hed import HedTag +from hed import load_schema_version class TestDefBase(unittest.TestCase): + @classmethod + def setUpClass(cls): + 
cls.hed_schema = load_schema_version("8.0.0") + def check_def_base(self, test_strings, expected_issues): for test_key in test_strings: def_dict = DefinitionDict() - hed_string_obj = HedString(test_strings[test_key]) - hed_string_obj.convert_to_canonical_forms(None) + hed_string_obj = HedString(test_strings[test_key], self.hed_schema) test_issues = def_dict.check_for_definitions(hed_string_obj) expected_issue = expected_issues[test_key] # print(test_issues) @@ -33,16 +37,16 @@ class TestDefinitionDict(TestDefBase): def test_check_for_definitions(self): def_dict = DefinitionDict() original_def_count = len(def_dict.defs) - hed_string_obj = HedString(self.basic_definition_string) - hed_string_obj.validate(def_dict) + hed_string_obj = HedString(self.placeholder_def_string, hed_schema=self.hed_schema) + def_dict.check_for_definitions(hed_string_obj) new_def_count = len(def_dict.defs) self.assertGreater(new_def_count, original_def_count) def test_check_for_definitions_placeholder(self): def_dict = DefinitionDict() original_def_count = len(def_dict.defs) - hed_string_obj = HedString(self.placeholder_def_string) - hed_string_obj.validate(def_dict) + hed_string_obj = HedString(self.placeholder_def_string, hed_schema=self.hed_schema) + def_dict.check_for_definitions(hed_string_obj) new_def_count = len(def_dict.defs) self.assertGreater(new_def_count, original_def_count) @@ -99,6 +103,26 @@ def test_definitions(self): self.check_def_base(test_strings, expected_results) + def test_expand_defs(self): + test_strings = { + 1: "Def/TestDefPlaceholder/2471,Event", + 2: "Event,(Def/TestDefPlaceholder/2471,Event)", + 3: "Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + } + + expected_results = { + 1: "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event", + 2: "Event,((Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event)", + # this one shouldn't change as it doesn't have a parent + 3: 
"Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + } + def_dict = DefinitionDict() + definition_string = "(Definition/TestDefPlaceholder/#,(Item/TestDef1/#,Item/TestDef2))" + def_dict.check_for_definitions(HedString(definition_string, hed_schema=self.hed_schema)) + for key, test_string in test_strings.items(): + hed_string = HedString(test_string, hed_schema=self.hed_schema) + def_dict.expand_def_tags(hed_string) + self.assertEqual(str(hed_string), expected_results[key]) if __name__ == '__main__': unittest.main() diff --git a/tests/models/test_expression_parser.py b/tests/models/test_expression_parser.py index 7a7ee020d..2066e4e2a 100644 --- a/tests/models/test_expression_parser.py +++ b/tests/models/test_expression_parser.py @@ -4,6 +4,14 @@ from hed.models.expression_parser import QueryParser import os from hed import schema +from hed import HedTag + + +def tag_terms(self): + if isinstance(self, HedTag): + if self._schema_entry: + return self._tag_terms + return (str(self).lower(),) class TestParser(unittest.TestCase): @@ -14,6 +22,9 @@ def setUpClass(cls): hed_xml_file = os.path.join(base_data_dir, "schema_tests/HED8.0.0t.xml") cls.hed_schema = schema.load_schema(hed_xml_file) + HedTag._tag_terms = HedTag.tag_terms + HedTag.tag_terms = property(tag_terms) + def base_test(self, parse_expr, search_strings): expression = QueryParser(parse_expr) diff --git a/tests/models/test_hed_string.py b/tests/models/test_hed_string.py index 894668d5e..af17878bb 100644 --- a/tests/models/test_hed_string.py +++ b/tests/models/test_hed_string.py @@ -1,5 +1,6 @@ from hed.models import HedString import unittest +from hed import load_schema_version class TestHedStrings(unittest.TestCase): @@ -170,3 +171,29 @@ def test_split_hed_string(self): } self.compare_split_results(test_strings, expected_results) + +class TestHedStringShrinkDefs(unittest.TestCase): + hed_schema = load_schema_version("8.0.0") + + def test_shrink_defs(self): + test_strings = { + 1: 
"(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event", + 2: "Event, ((Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event)", + # this one shouldn't change as it doesn't have a parent + 3: "Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + # This one is an obviously invalid def, but still shrinks + 4: "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2), ThisDefIsInvalid),Event", + } + + expected_results = { + 1: "Def/TestDefPlaceholder/2471,Event", + 2: "Event,(Def/TestDefPlaceholder/2471,Event)", + 3: "Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + 4: "Def/TestDefPlaceholder/2471,Event", + } + + for key, test_string in test_strings.items(): + hed_string = HedString(test_string, hed_schema=self.hed_schema) + hed_string.shrink_defs() + self.assertEqual(str(hed_string), expected_results[key]) + diff --git a/tests/models/test_hed_tag.py b/tests/models/test_hed_tag.py index 39daeec83..9eba272eb 100644 --- a/tests/models/test_hed_tag.py +++ b/tests/models/test_hed_tag.py @@ -153,30 +153,4 @@ def test_determine_allows_extensions(self): self.assertEqual(extension_tag1_result, True) self.assertEqual(no_extension_tag1_result, False) self.assertEqual(no_extension_tag2_result, False) - self.assertEqual(no_extension_tag3_result, False) - - def test_finding_tags_no_schema(self): - # Verify basic tag identification works. 
- tag = HedTag("Onset") - tag.convert_to_canonical_forms(hed_schema=None) - self.assertTrue(tag._schema_entry) - - tag2 = HedTag("OtherFolders/Onset") - tag2.convert_to_canonical_forms(hed_schema=None) - self.assertTrue(tag2._schema_entry) - - tag4 = HedTag("OtherFolders/Onset/Extension") - tag4.convert_to_canonical_forms(hed_schema=None) - self.assertTrue(tag4._schema_entry) - - tag3 = HedTag("OtherFolders/Onset-NotOnset") - tag3.convert_to_canonical_forms(hed_schema=None) - self.assertFalse(tag3._schema_entry) - - tag = HedTag("Onset") - tag.convert_to_canonical_forms(hed_schema=self.hed_schema) - self.assertTrue(tag._schema_entry) - - tag2 = HedTag("Property/Data-property/Data-marker/Temporal-marker/Onset") - tag2.convert_to_canonical_forms(hed_schema=self.hed_schema) - self.assertTrue(tag._schema_entry) + self.assertEqual(no_extension_tag3_result, False) \ No newline at end of file diff --git a/tests/models/test_sidecar.py b/tests/models/test_sidecar.py index 14f5ff68a..1925745ae 100644 --- a/tests/models/test_sidecar.py +++ b/tests/models/test_sidecar.py @@ -8,6 +8,7 @@ from hed.validator import HedValidator from hed import schema from hed.models import DefinitionDict +from hed.errors import ErrorHandler class Test(unittest.TestCase): @@ -80,35 +81,28 @@ def test__iter__(self): self.assertEqual(columns_target, columns_count) def test_validate_column_group(self): - validator = HedValidator(hed_schema=None) - # validation_issues = self.json_def_sidecar.validate_entries(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 0) - # - # validation_issues = self.default_sidecar.validate_entries(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 0) + validation_issues = self.errors_sidecar.validate(self.hed_schema) + self.assertEqual(len(validation_issues), 22) - validation_issues = self.errors_sidecar.validate_entries(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 4) + 
validation_issues2 = self.errors_sidecar_minor.validate(self.hed_schema) + self.assertEqual(len(validation_issues2), 18) - validation_issues2 = self.errors_sidecar_minor.validate_entries(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues2), 10) + validation_issues = self.json_without_definitions_sidecar.validate(self.hed_schema) + self.assertEqual(len(validation_issues), 8) - validation_issues = self.json_without_definitions_sidecar.validate_entries(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 1) - - hed_string = HedString("(Definition/JsonFileDef/#, (Item/JsonDef1/#,Item/JsonDef1))") + hed_string = HedString("(Definition/JsonFileDef/#, (Item/JsonDef1/#,Item/JsonDef1))", self.hed_schema) extra_def_dict = DefinitionDict() - hed_string.validate(extra_def_dict) + extra_def_dict.check_for_definitions(hed_string) - validation_issues = self.json_without_definitions_sidecar.validate_entries(validator, check_for_warnings=True, - extra_def_dicts=extra_def_dict) - self.assertEqual(len(validation_issues), 0) + validation_issues2 = self.json_without_definitions_sidecar.validate(self.hed_schema, extra_def_dicts=extra_def_dict) + # this removes one undef matched error and adds two extended tag warnings + self.assertEqual(len(validation_issues2), 9) def test_duplicate_def(self): sidecar = self.json_def_sidecar - def_dicts = sidecar.get_def_dicts() - issues = sidecar.validate_entries(extra_def_dicts=def_dicts) + duplicate_dict = sidecar.extract_definitions(hed_schema=self.hed_schema) + issues = sidecar.validate(self.hed_schema, extra_def_dicts=duplicate_dict, error_handler=ErrorHandler(False)) self.assertEqual(len(issues), 5) self.assertTrue(issues[0]['code'], ValidationErrors.HED_DEFINITION_INVALID) @@ -120,7 +114,7 @@ def test_save_load(self): reloaded_sidecar = Sidecar(save_filename) for str1, str2 in zip(sidecar.hed_string_iter(), reloaded_sidecar.hed_string_iter()): - self.assertEqual(str1, str2) + 
self.assertEqual(str1[0], str2[0]) def test_save_load2(self): sidecar = Sidecar(self.json_def_filename) @@ -129,7 +123,7 @@ def test_save_load2(self): reloaded_sidecar = Sidecar(io.StringIO(json_string)) for str1, str2 in zip(sidecar.hed_string_iter(), reloaded_sidecar.hed_string_iter()): - self.assertEqual(str1, str2) + self.assertEqual(str1[0], str2[0]) def test_merged_sidecar(self): base_folder = self.base_data_dir + "sidecar_tests/" diff --git a/tests/models/test_spreadsheet_input.py b/tests/models/test_spreadsheet_input.py index feac77f35..9fc8f5827 100644 --- a/tests/models/test_spreadsheet_input.py +++ b/tests/models/test_spreadsheet_input.py @@ -51,20 +51,13 @@ def test_all(self): file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name, tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary) - for column_to_hed_tags in file_input: - break_here = 3 + self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame)) + self.assertTrue(isinstance(file_input.series_a, pd.Series)) + self.assertTrue(file_input.dataframe_a.size) # Just make sure this didn't crash for now self.assertTrue(True) - def test_get_row_hed_tags(self): - row_dict = self.generic_file_input._mapper.expand_row_tags(self.row_with_hed_tags) - column_to_hed_tags_dictionary = row_dict[model_constants.COLUMN_TO_HED_TAGS] - # self.assertIsInstance(hed_string, HedString) - # self.assertTrue(hed_string) - self.assertIsInstance(column_to_hed_tags_dictionary, dict) - self.assertTrue(column_to_hed_tags_dictionary) - def test_file_as_string(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/bids_events_no_index.tsv') @@ -72,15 +65,14 @@ def test_file_as_string(self): json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/validator_tests/bids_events.json") sidecar = Sidecar(json_path) - self.assertEqual(len(sidecar.validate_entries(expand_defs=True)), 0) + 
self.assertEqual(len(sidecar.validate(self.hed_schema)), 0) input_file = TabularInput(events_path, sidecar=sidecar) with open(events_path) as file: events_file_as_string = io.StringIO(file.read()) input_file_from_string = TabularInput(file=events_file_as_string, sidecar=sidecar) - for column_dict, column_dict in zip(input_file, input_file_from_string): - self.assertEqual(column_dict, column_dict) + self.assertTrue(input_file._dataframe.equals(input_file_from_string._dataframe)) def test_bad_file_inputs(self): self.assertRaises(HedFileError, TabularInput, None) @@ -115,7 +107,7 @@ def test_to_excel_should_work(self): column_prefix_dictionary={1: 'Label/', 3: 'Description/'}, name='ExcelOneSheet.xlsx') buffer = io.BytesIO() - spreadsheet.to_excel(buffer, output_processed_file=True) + spreadsheet.to_excel(buffer, output_assembled=True) buffer.seek(0) v = buffer.getvalue() self.assertGreater(len(v), 0, "It should have a length greater than 0") @@ -145,23 +137,13 @@ def test_loading_and_reset_mapper(self): json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/validator_tests/bids_events.json") sidecar = Sidecar(json_path) - self.assertEqual(len(sidecar.validate_entries()), 0) + self.assertEqual(len(sidecar.validate(self.hed_schema)), 0) input_file_1 = TabularInput(events_path, sidecar=sidecar) input_file_2 = TabularInput(events_path, sidecar=sidecar) input_file_2.reset_column_mapper() - for (row_number, row_dict), (row_number2, row_dict2) in \ - zip(enumerate(input_file_1.iter_dataframe(return_string_only=False)), - enumerate(input_file_2.iter_dataframe(return_string_only=False))): - self.assertEqual(row_number, row_number2, - f"TabularInput should have row {row_number} equal to {row_number2} after reset") - column_dict = row_dict["column_to_hed_tags"] - self.assertTrue(len(column_dict) == 5, - f"The column dictionary for row {row_number} should have the right length") - column_dict2 = row_dict2["column_to_hed_tags"] - 
self.assertTrue(len(column_dict2) == 0, - f"The reset column dictionary for row {row_number2} should have the right length") + self.assertTrue(input_file_1.dataframe.equals(input_file_2.dataframe)) def test_no_column_header_and_convert(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), @@ -172,18 +154,7 @@ def test_no_column_header_and_convert(self): events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_long.tsv') hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) - - events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) - events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header_long.tsv') - hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - hed_input_long.convert_to_short(self.hed_schema) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) + self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe)) def test_convert_short_long_with_definitions(self): # Verify behavior works as expected even if definitions are present @@ -195,37 +166,17 @@ def test_convert_short_long_with_definitions(self): events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition_long.tsv') hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) - - def test_convert_short_long_with_definitions_new_style(self): - # Verify behavior works as expected even if 
definitions are present - events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2], - hed_schema=self.hed_schema) - hed_input.convert_to_long() - - events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header_definition_long.tsv') - hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) + self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe)) def test_definitions_identified(self): + # Todo ian: this test is no longer relevant events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2], - hed_schema=self.hed_schema) - def_entry = hed_input.def_dict['deftest1'] - tag = def_entry.contents.tags()[0] - self.assertTrue(tag._schema_entry) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) - def_entry = hed_input.def_dict['deftest1'] - tag = def_entry.contents.tags()[0] - self.assertFalse(tag._schema_entry) + def test_loading_dataframe_directly(self): ds_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), @@ -236,9 +187,22 @@ def test_loading_dataframe_directly(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in 
zip(hed_input, hed_input2): - self.assertEqual(column1, column2) + self.assertTrue(hed_input._dataframe.equals(hed_input2._dataframe)) + def test_ignoring_na_column(self): + events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/model_tests/na_tag_column.tsv') + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + self.assertTrue(hed_input.dataframe_a.loc[1, 1] == 'n/a') + + def test_ignoring_na_value_column(self): + from hed import TabularInput + events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/model_tests/na_value_column.tsv') + sidecar_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/model_tests/na_value_column.json') + hed_input = TabularInput(events_path, sidecar=sidecar_path) + self.assertTrue(hed_input.dataframe_a.loc[1, 'Value'] == 'n/a') if __name__ == '__main__': unittest.main() diff --git a/tests/models/test_tabular_input.py b/tests/models/test_tabular_input.py index f514ef5ff..d306582fb 100644 --- a/tests/models/test_tabular_input.py +++ b/tests/models/test_tabular_input.py @@ -4,8 +4,8 @@ from hed.models import DefinitionEntry, Sidecar, TabularInput from hed import schema -from hed.validator import HedValidator from hed.errors import HedFileError +from hed.errors import ErrorHandler class Test(unittest.TestCase): @@ -32,38 +32,17 @@ def setUpClass(cls): def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) - def test_get_definitions(self): - input_data = TabularInput(self.events_path, sidecar=self.sidecar1, name="face_sub1_events") - defs1 = input_data.get_definitions().gathered_defs - self.assertIsInstance(defs1, dict, "get_definitions returns dictionary by default") - self.assertEqual(len(defs1), 17, "get_definitions should have the right number of definitions") - for key, value in defs1.items(): - self.assertIsInstance(key, str, "get_definitions dictionary keys should be strings") - self.assertIsInstance(value, 
DefinitionEntry, - "get_definitions dict values should be strings when as strings") - defs2 = input_data.get_definitions(as_strings=False).gathered_defs - self.assertIsInstance(defs2, dict, "get_definitions returns dictionary by when not as strings") - self.assertEqual(len(defs2), 17, "get_definitions should have the right number of definitions when not strings") - for key, value in defs2.items(): - self.assertIsInstance(key, str, "get_definitions dictionary keys should be strings") - self.assertIsInstance(value, DefinitionEntry, - "get_definitions dictionary values should be strings when as strings") - self.assertIsInstance(defs2, dict, "get_definitions returns DefinitionDict when not as strings") - def test_missing_column_name_issue(self): events_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/bids_events_bad_column_name.tsv')) json_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/validator_tests/bids_events.json")) - validator = HedValidator(hed_schema=self.hed_schema) - sidecar = Sidecar(json_path, hed_schema=self.hed_schema) - issues = sidecar.validate_entries(validator) + sidecar = Sidecar(json_path) + issues = sidecar.validate(self.hed_schema) self.assertEqual(len(issues), 0) - input_file = TabularInput(events_path, sidecar=sidecar, hed_schema=self.hed_schema) + input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator, check_for_warnings=True) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(len(validation_issues), 1) def test_expand_column_issues(self): @@ -71,16 +50,12 @@ def test_expand_column_issues(self): '../data/validator_tests/bids_events_bad_category_key.tsv') json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 
"../data/validator_tests/bids_events.json") - validator = HedValidator(hed_schema=self.hed_schema) - sidecar = Sidecar(json_path, hed_schema=self.hed_schema) - issues = sidecar.validate_entries(validator) + sidecar = Sidecar(json_path) + issues = sidecar.validate(hed_schema=self.hed_schema) self.assertEqual(len(issues), 0) - input_file = TabularInput(events_path, sidecar=sidecar, hed_schema=self.hed_schema) + input_file = TabularInput(events_path, sidecar=sidecar) - # Fix whatever is wrong with onset tag here. It's thinking Description/Onset continues is an invalid tag???' - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator, check_for_warnings=True) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(len(validation_issues), 1) def test_blank_and_duplicate_columns(self): @@ -98,16 +73,14 @@ def test_blank_and_duplicate_columns(self): # _ = TabularInput(filepath) def test_validate_file_warnings(self): - validator = HedValidator(hed_schema=self.hed_schema) - issues1 = self.sidecar1.validate_entries(validator, check_for_warnings=True) + issues1 = self.sidecar1.validate(hed_schema=self.hed_schema) input_file1 = TabularInput(self.events_path, sidecar=self.sidecar1) - issues1a = input_file1.validate_file(validator, check_for_warnings=True) + issues1a = input_file1.validate(hed_schema=self.hed_schema) - issues2 = self.sidecar2.validate_entries(validator, check_for_warnings=False) + issues2 = self.sidecar1.validate(hed_schema=self.hed_schema, error_handler=ErrorHandler(False)) input_file2 = TabularInput(self.events_path, sidecar=self.sidecar2) - issues2a = input_file2.validate_file(validator, check_for_warnings=False) - # TODO: Currently does not correctly check for warnings. 
- + issues2a = input_file2.validate(hed_schema=self.hed_schema, error_handler=ErrorHandler(False)) + breakHere = 3 if __name__ == '__main__': unittest.main() diff --git a/tests/schema/test_convert_tags.py b/tests/schema/test_convert_tags.py index 50e30af45..ebfa134a1 100644 --- a/tests/schema/test_convert_tags.py +++ b/tests/schema/test_convert_tags.py @@ -25,7 +25,7 @@ def converter_base(self, test_strings, expected_results, expected_errors, conver expected_issue = self.format_errors_fully(error_handler, hed_string=test_string_obj, params=expected_params) - error_handler.add_context_to_issues(test_issues) + error_handler.add_context_and_filter(test_issues) # print(test_key) # print(expected_issue) diff --git a/tests/validator/test_def_validator.py b/tests/validator/test_def_validator.py new file mode 100644 index 000000000..f889b36f1 --- /dev/null +++ b/tests/validator/test_def_validator.py @@ -0,0 +1,119 @@ +import unittest +import os + +from hed import schema +from hed.models import DefinitionDict, HedString +from hed.validator import DefValidator +from hed.errors import ErrorHandler, ErrorContext + + +class Test(unittest.TestCase): + basic_hed_string_with_def_first_paren = None + + @classmethod + def setUpClass(cls): + cls.base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') + hed_xml_file = os.path.realpath(os.path.join(cls.base_data_dir, "schema_tests/HED8.0.0t.xml")) + cls.hed_schema = schema.load_schema(hed_xml_file) + cls.def_contents_string = "(Item/TestDef1,Item/TestDef2)" + cls.basic_definition_string = f"(Definition/TestDef,{cls.def_contents_string})" + cls.basic_definition_string_no_paren = f"Definition/TestDef,{cls.def_contents_string}" + + cls.placeholder_definition_contents = "(Item/TestDef1/#,Item/TestDef2)" + cls.placeholder_definition_string = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents})" + cls.placeholder_definition_string_no_paren = \ + 
f"Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents}" + + + + cls.label_def_string = "Def/TestDef" + cls.expanded_def_string = f"(Def-expand/TestDef,{cls.def_contents_string})" + cls.basic_hed_string = "Item/BasicTestTag1,Item/BasicTestTag2" + cls.basic_hed_string_with_def = f"{cls.basic_hed_string},{cls.label_def_string}" + cls.basic_hed_string_with_def_first = f"{cls.label_def_string},{cls.basic_hed_string}" + cls.basic_hed_string_with_def_first_paren = f"({cls.label_def_string},{cls.basic_hed_string})" + cls.placeholder_label_def_string = "Def/TestDefPlaceholder/2471" + + cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2))" + + cls.placeholder_hed_string_with_def = f"{cls.basic_hed_string},{cls.placeholder_label_def_string}" + cls.placeholder_hed_string_with_def_first = f"{cls.placeholder_label_def_string},{cls.basic_hed_string}" + cls.placeholder_hed_string_with_def_first_paren = f"({cls.placeholder_label_def_string},{cls.basic_hed_string})" + + + def test_expand_def_tags_placeholder_invalid(self): + def_validator = DefValidator() + def_string = HedString(self.placeholder_definition_string, self.hed_schema) + def_validator.check_for_definitions(def_string) + + placeholder_label_def_string_no_placeholder = "Def/TestDefPlaceholder" + + test_string = HedString(placeholder_label_def_string_no_placeholder, self.hed_schema) + def_issues = def_validator.validate_def_tags(test_string) + def_issues += def_validator.expand_def_tags(test_string) + self.assertEqual(str(test_string), placeholder_label_def_string_no_placeholder) + self.assertTrue(def_issues) + + def_validator = DefValidator() + def_string = HedString(self.basic_definition_string, self.hed_schema) + def_validator.check_for_definitions(def_string) + + label_def_string_has_invalid_placeholder = "Def/TestDef/54687" + + def_validator = DefValidator() + def_string = HedString(self.basic_definition_string, self.hed_schema) + 
def_validator.check_for_definitions(def_string) + + test_string = HedString(label_def_string_has_invalid_placeholder, self.hed_schema) + def_issues = def_validator.validate_def_tags(test_string) + def_issues += def_validator.expand_def_tags(test_string) + self.assertEqual(str(test_string), label_def_string_has_invalid_placeholder) + self.assertTrue(def_issues) + + + def test_bad_def_expand(self): + def_validator = DefValidator() + def_string = HedString(self.placeholder_definition_string, self.hed_schema) + def_validator.check_for_definitions(def_string) + + valid_placeholder = HedString(self.placeholder_expanded_def_string, self.hed_schema) + def_issues = def_validator.validate_def_tags(valid_placeholder) + self.assertFalse(def_issues) + + invalid_placeholder = HedString("(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/21,Item/TestDef2))", self.hed_schema) + def_issues = def_validator.validate_def_tags(invalid_placeholder) + self.assertTrue(bool(def_issues)) + + + def test_def_no_content(self): + + def_validator = DefValidator() + def_string = HedString("(Definition/EmptyDef)", self.hed_schema) + def_validator.check_for_definitions(def_string) + + valid_empty = HedString("Def/EmptyDef", self.hed_schema) + def_issues = def_validator.validate_def_tags(valid_empty) + def_issues += def_validator.expand_def_tags(valid_empty) + self.assertEqual(str(valid_empty), "(Def-expand/EmptyDef)") + self.assertFalse(def_issues) + + valid_empty = HedString("Def/EmptyDef", self.hed_schema) + def_issues = def_validator.validate_def_tags(valid_empty) + self.assertFalse(def_issues) + + def test_duplicate_def(self): + def_dict = DefinitionDict() + def_string = HedString(self.placeholder_definition_string, self.hed_schema) + error_handler = ErrorHandler() + error_handler.push_error_context(ErrorContext.ROW, 5) + def_dict.check_for_definitions(def_string, error_handler=error_handler) + self.assertEqual(len(def_dict.issues), 0) + + def_validator = DefValidator([def_dict, def_dict]) + 
self.assertEqual(len(def_validator.issues), 1) + self.assertTrue('ec_row' in def_validator.issues[0]) + + def_dict = DefinitionDict([def_dict, def_dict, def_dict]) + self.assertEqual(len(def_dict.issues), 2) + self.assertTrue('ec_row' in def_dict.issues[0]) + diff --git a/tests/validator/test_hed_validator.py b/tests/validator/test_hed_validator.py index 6c9cb74e4..a523e33c3 100644 --- a/tests/validator/test_hed_validator.py +++ b/tests/validator/test_hed_validator.py @@ -4,10 +4,10 @@ # from hed import from hed.errors import ErrorContext from hed import schema -from hed.models import DefMapper, HedString, SpreadsheetInput, TabularInput, Sidecar -from hed.validator import HedValidator - +from hed.models import HedString, SpreadsheetInput, TabularInput, Sidecar +from hed.validator import HedValidator, DefValidator +# todo: redo all this so we class Test(unittest.TestCase): @classmethod def setUpClass(cls): @@ -33,31 +33,29 @@ def setUpClass(cls): def test__validate_input(self): test_string_obj = HedString(self.base_hed_input) - validation_issues = test_string_obj.validate(self.hed_validator) + validation_issues = test_string_obj.validate(self.hed_schema) self.assertIsInstance(validation_issues, list) name = "DummyDisplayFilename.txt" - validation_issues = self.hed_file_with_errors.validate_file(self.hed_validator, name=name) + validation_issues = self.hed_file_with_errors.validate(self.hed_schema, name=name) self.assertIsInstance(validation_issues, list) self.assertTrue(name in validation_issues[0][ErrorContext.FILE_NAME]) def test__validate_input_major_errors(self): name = "DummyDisplayFilename.txt" - validation_issues = self.hed_file_with_major_errors.validate_file(self.hed_validator, name=name) + validation_issues = self.hed_file_with_major_errors.validate(self.hed_schema, name=name) self.assertIsInstance(validation_issues, list) self.assertTrue(name in validation_issues[0][ErrorContext.FILE_NAME]) def test__validate_input_major_errors_columns(self): name = 
"DummyDisplayFilename.txt" - validation_issues = self.hed_file_with_major_errors.validate_file(self.hed_validator, - check_for_warnings=True, name=name) + validation_issues = self.hed_file_with_major_errors.validate(self.hed_schema, name=name) self.assertIsInstance(validation_issues, list) self.assertTrue(name in validation_issues[0][ErrorContext.FILE_NAME]) def test__validate_input_major_errors_multi_column(self): - validation_issues = self.hed_file_with_major_errors_multi_column.validate_file(self.hed_validator, - check_for_warnings=True) + validation_issues = self.hed_file_with_major_errors_multi_column.validate(self.hed_schema) self.assertIsInstance(validation_issues, list) self.assertEqual(len(validation_issues), 2) @@ -66,15 +64,12 @@ def test_complex_file_validation_no_index(self): '../data/validator_tests/bids_events_no_index.tsv')) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events.json')) - validator = HedValidator(hed_schema=self.hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(validator) + issues = sidecar.validate(self.hed_schema) self.assertEqual(len(issues), 0) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(self.hed_schema) self.assertEqual(len(validation_issues), 0) def test_complex_file_validation_with_index(self): @@ -84,15 +79,12 @@ def test_complex_file_validation_with_index(self): # hed_schema = schema.load_schema(schema_path) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events.json')) - validator = HedValidator(hed_schema=self.hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(validator) + issues = sidecar.validate(hed_schema=self.hed_schema) self.assertEqual(len(issues), 
0) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(len(validation_issues), 0) def test_complex_file_validation_invalid(self): @@ -104,17 +96,13 @@ def test_complex_file_validation_invalid(self): hed_schema = schema.load_schema(schema_path) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events_bad_defs.json')) - validator = HedValidator(hed_schema=hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(hed_ops=validator, check_for_warnings=True) + issues = sidecar.validate(hed_schema) self.assertEqual(len(issues), 4) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 4) - - validation_issues = input_file.validate_file(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 42) + validation_issues = input_file.validate(hed_schema) + self.assertEqual(len(validation_issues), 63) def test_complex_file_validation_invalid_definitions_removed(self): # This verifies definitions are being removed from sidecar strings before being added, or it will produce @@ -128,14 +116,12 @@ def test_complex_file_validation_invalid_definitions_removed(self): json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events_bad_defs2.json')) sidecar = Sidecar(json_path) + issues = sidecar.validate(hed_schema) + self.assertEqual(len(issues), 4) input_file = TabularInput(events_path, sidecar=sidecar) - validator = HedValidator(hed_schema=hed_schema) - validation_issues1 = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues1), 4) - - 
validation_issues = input_file.validate_file(validator) - self.assertEqual(len(validation_issues), 21) + validation_issues = input_file.validate(hed_schema) + self.assertEqual(len(validation_issues), 42) def test_file_bad_defs_in_spreadsheet(self): schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), @@ -150,9 +136,8 @@ def test_file_bad_defs_in_spreadsheet(self): column_prefix_dictionary=prefixed_needed_tag_columns, worksheet_name='LKT Events') - validator = HedValidator(hed_schema=hed_schema) - validation_issues = loaded_file.validate_file(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 4) + validation_issues = loaded_file.validate(hed_schema=hed_schema) + self.assertEqual(len(validation_issues), 2) def test_tabular_input_with_HED_col_in_json(self): schema_path = os.path.realpath(os.path.join(os.path.dirname(__file__), @@ -163,28 +148,20 @@ def test_tabular_input_with_HED_col_in_json(self): hed_schema = schema.load_schema(schema_path) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events_HED.json')) - validator = HedValidator(hed_schema=hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(validator) - self.assertEqual(len(issues), 0) + issues = sidecar.validate(hed_schema) + self.assertEqual(len(issues), 1) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(hed_schema) self.assertEqual(len(validation_issues), 1) def test_error_spans_from_file_and_missing_required_column(self): - schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.mediawiki') events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/tag_error_span_test.tsv') - hed_schema = 
schema.load_schema(schema_path) - input_file = SpreadsheetInput(events_path, tag_columns=[0, 1, "error"]) - validator = HedValidator(hed_schema=hed_schema) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(validation_issues[1]['char_index'], 6) self.assertEqual(validation_issues[2]['char_index'], 6) self.assertEqual(len(validation_issues), 3) @@ -201,28 +178,15 @@ def test_org_tag_missing(self): source_span = test_string_obj._get_org_span(HedTag("Event")) self.assertEqual(source_span, (None, None)) - def test_def_mapping_single_line(self): - schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.mediawiki') - hed_schema = schema.load_schema(schema_path) - validator = HedValidator(hed_schema=hed_schema) - def_mapper = DefMapper() - string_with_def = \ - '(Definition/TestDefPlaceholder/#,(Item/TestDef1/#,Item/TestDef2)), def/TestDefPlaceholder/2471' - test_string = HedString(string_with_def) - issues = test_string.validate([validator, def_mapper], check_for_definitions=True) - self.assertEqual(len(issues), 0) def test_duplicate_group_in_definition(self): schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/schema_tests/HED8.0.0.mediawiki') hed_schema = schema.load_schema(schema_path) - validator = HedValidator(hed_schema=hed_schema) - def_mapper = DefMapper() string_with_def = \ - '(Definition/TestDef,(Item/TestDef1,Item/TestDef1))' - test_string = HedString(string_with_def) - issues = test_string.validate([validator, def_mapper], check_for_definitions=False) + '(Definition/TestDef,(Item,Item))' + test_string = HedString(string_with_def, hed_schema) + issues = test_string.validate(hed_schema) self.assertEqual(len(issues), 1) diff --git a/tests/models/test_onset_mapper.py b/tests/validator/test_onset_validator.py similarity index 57% rename from tests/models/test_onset_mapper.py rename to 
tests/validator/test_onset_validator.py index a88a45f8f..1bc814f33 100644 --- a/tests/models/test_onset_mapper.py +++ b/tests/validator/test_onset_validator.py @@ -1,10 +1,11 @@ +import copy import unittest import os from hed.errors import ErrorHandler, OnsetErrors, ErrorContext, ValidationErrors -from hed.models import DefMapper, HedString, OnsetMapper, DefinitionDict +from hed.models import HedString, DefinitionDict from hed import schema -from hed.validator import HedValidator +from hed.validator import HedValidator, OnsetValidator from tests.validator.test_tag_validator_base import TestHedBase @@ -16,53 +17,66 @@ def setUpClass(cls): cls.base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') hed_xml_file = os.path.join(cls.base_data_dir, "schema_tests/HED8.0.0.mediawiki") cls.hed_schema = schema.load_schema(hed_xml_file) - cls.placeholder_label_def_string = "def/TestDefPlaceholder/2471" - cls.placeholder_def_contents = "(Item/TestDef1/#,Item/TestDef2)" + cls.placeholder_label_def_string = "Def/TestDefPlaceholder/2471" + cls.placeholder_def_contents = "(Action/TestDef1/#,Action/TestDef2)" cls.placeholder_definition_string = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_def_contents})" - cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2))" + cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Action/TestDef1/2471,Action/TestDef2))" - cls.label_def_string = "def/TestDefNormal" - cls.def_contents = "(Item/TestDef1,Item/TestDef2)" + cls.label_def_string = "Def/TestDefNormal" + cls.def_contents = "(Action/TestDef1,Action/TestDef2)" cls.definition_string = f"(Definition/TestDefNormal,{cls.def_contents})" - cls.expanded_def_string = "(Def-expand/TestDefNormal,(Item/TestDef1/2471,Item/TestDef2))" + cls.expanded_def_string = "(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2))" - cls.placeholder_label_def_string2 = 
"def/TestDefPlaceholder/123" - cls.placeholder_def_contents2 = "(Item/TestDef1/#,Item/TestDef2)" + cls.placeholder_label_def_string2 = "Def/TestDefPlaceholder/123" + cls.placeholder_def_contents2 = "(Action/TestDef1/#,Action/TestDef2)" cls.placeholder_definition_string2 = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_def_contents2})" - cls.placeholder_expanded_def_string2 = "(Def-expand/TestDefPlaceholder/123,(Item/TestDef1/123,Item/TestDef2))" + cls.placeholder_expanded_def_string2 = "(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2))" - def _test_issues_base(self, test_strings, test_issues, test_context, hed_ops, expand_defs=True): + cls.def_dict_placeholder = DefinitionDict() + def_string = HedString(cls.placeholder_definition_string, hed_schema=cls.hed_schema) + cls.def_dict_placeholder.check_for_definitions(def_string) + cls.def_dict_both = copy.deepcopy(cls.def_dict_placeholder) + def_string = HedString(cls.definition_string, hed_schema=cls.hed_schema) + cls.def_dict_both.check_for_definitions(def_string) + + + def _test_issues_base(self, test_strings, test_issues, test_context, placeholder_def_only): + if placeholder_def_only: + validator = OnsetValidator(self.def_dict_placeholder) + else: + validator = OnsetValidator(self.def_dict_both) for string, expected_params, context in zip(test_strings, test_issues, test_context): - test_string = HedString(string) + test_string = HedString(string, self.hed_schema) error_handler = ErrorHandler() error_handler.push_error_context(ErrorContext.HED_STRING, test_string, increment_depth_after=False) - onset_issues = test_string.validate(hed_ops, expand_defs=expand_defs) + + onset_issues = [] + onset_issues += validator.validate_onset_offset(test_string) + + error_handler.add_context_and_filter(onset_issues) + test_string.shrink_defs() issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - # print(str(onset_issues)) - # print(str(issues)) + 
print(str(onset_issues)) + print(str(issues)) error_handler.pop_error_context() - self.assertEqual(len(hed_ops[-1]._onsets), context) + self.assertEqual(len(validator._onsets), context) self.assertCountEqual(onset_issues, issues) - def _test_issues_no_context(self, test_strings, test_issues, hed_ops): + def _test_issues_no_context(self, test_strings, test_issues): + hed_validator = HedValidator(self.hed_schema, self.def_dict_both) for string, expected_params in zip(test_strings, test_issues): test_string = HedString(string) - error_handler = ErrorHandler() + error_handler = ErrorHandler(check_for_warnings=False) error_handler.push_error_context(ErrorContext.HED_STRING, test_string, increment_depth_after=False) - onset_issues = test_string.validate(hed_ops, expand_defs=True) + onset_issues = hed_validator.validate(test_string, False) + error_handler.add_context_and_filter(onset_issues) issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - # print(str(onset_issues)) - # print(str(issues)) + print(str(onset_issues)) + print(str(issues)) error_handler.pop_error_context() self.assertCountEqual(onset_issues, issues) def test_basic_onset_errors(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset)", f"({self.placeholder_label_def_string},Offset)", @@ -70,9 +84,9 @@ def test_basic_onset_errors(self): f"({self.placeholder_label_def_string}, Onset, (Event), (Event))", f"({self.placeholder_label_def_string}, Onset, (Event))", "(Onset)", - f"({self.placeholder_label_def_string}, def/InvalidDef, Onset, (Event))", - "(def/TestDefInvalid, Onset)", - "(def/TestDefPlaceholder, Onset)", + f"({self.placeholder_label_def_string}, Def/InvalidDef, Onset, (Event))", + "(Def/TestDefInvalid, Onset)", + "(Def/TestDefPlaceholder, 
Onset)", f"({self.placeholder_label_def_string}, Offset, (Event))" ] # count of how many onset names are in the mapper after the line is run @@ -94,26 +108,19 @@ def test_basic_onset_errors(self): [], self.format_error(OnsetErrors.OFFSET_BEFORE_ONSET, tag=0), self.format_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, tag=0, - tag_list=['def/TestDefPlaceholder/2471', 'Onset', '(Event)', '(Event)']), + tag_list=['Def/TestDefPlaceholder/2471', 'Onset', '(Event)', '(Event)']), [], self.format_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, tag=0), - self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['def/InvalidDef']), + self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['Def/InvalidDef']), self.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=0), self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=True), self.format_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, tag=0, tag_list=[self.placeholder_label_def_string, 'Offset', '(Event)']), ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=True) def test_basic_onset_errors_with_def_mapper(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - hed_ops = [def_mapper, onset_mapper] - test_strings = [ f"({self.placeholder_label_def_string},Onset)", f"({self.placeholder_label_def_string},Offset)", @@ -121,9 +128,9 @@ def test_basic_onset_errors_with_def_mapper(self): f"({self.placeholder_label_def_string}, Onset, (Event), (Event))", f"({self.placeholder_label_def_string}, Onset, (Event))", "(Onset)", - f"({self.placeholder_label_def_string}, def/TestDefPlaceholder/2, Onset, (Event))", - "(def/TestDefInvalid, Onset)", - "(def/TestDefPlaceholder, Onset)", + f"({self.placeholder_label_def_string}, 
Def/TestDefPlaceholder/2, Onset, (Event))", + "(Def/TestDefInvalid, Onset)", + "(Def/TestDefPlaceholder, Onset)", f"({self.placeholder_label_def_string}, Offset, (Event))" ] # count of how many onset names are in the mapper after the line is run @@ -149,24 +156,16 @@ def test_basic_onset_errors_with_def_mapper(self): [], self.format_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, tag=0), self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, - tag_list=['def/TestDefPlaceholder/2']), - self.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=0), - self.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=0), + tag_list=['Def/TestDefPlaceholder/2']), + self.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=0), + self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=True), self.format_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, tag=0, tag_list=[self.placeholder_label_def_string, 'Offset', '(Event)']), ] - self._test_issues_base(test_strings, test_issues, expected_context, hed_ops, expand_defs=False) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=True) def test_basic_onset_errors_expanded(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_expanded_def_string},Onset)", f"({self.placeholder_expanded_def_string},Offset)", @@ -174,10 +173,10 @@ def test_basic_onset_errors_expanded(self): f"({self.placeholder_expanded_def_string}, Onset, (Event), (Event))", f"({self.placeholder_expanded_def_string}, Onset, (Event))", "(Onset)", - f"({self.placeholder_expanded_def_string}, def/InvalidDef, Onset, (Event))", - "(def/TestDefInvalid, Onset)", - "(def/TestDefPlaceholder, Onset)", - "(def/TestDefNormal/InvalidPlaceholder, Onset)" + 
f"({self.placeholder_expanded_def_string}, Def/InvalidDef, Onset, (Event))", + "(Def/TestDefInvalid, Onset)", + "(Def/TestDefPlaceholder, Onset)", + "(Def/TestDefNormal/InvalidPlaceholder, Onset)" ] # count of how many onset names are in the mapper after the line is run expected_context = [ @@ -201,23 +200,15 @@ def test_basic_onset_errors_expanded(self): tag_list=[self.placeholder_expanded_def_string, 'Onset', '(Event)', '(Event)']), [], self.format_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, tag=0), - self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['def/InvalidDef']), + self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['Def/InvalidDef']), self.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=0), self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=True), self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=False) ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) def test_test_interleaving_onset_offset(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset)", f"({self.placeholder_label_def_string2},Onset)", @@ -248,15 +239,9 @@ def test_test_interleaving_onset_offset(self): [], ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) def test_onset_with_defs_in_them(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) 
- onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset, ({self.label_def_string}))", ] @@ -269,101 +254,23 @@ def test_onset_with_defs_in_them(self): [] ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=True) def test_onset_multiple_or_misplaced_errors(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - hed_validator = HedValidator(hed_schema=self.hed_schema) - hed_ops = [hed_validator, def_mapper, onset_mapper] - test_strings = [ f"{self.placeholder_label_def_string},Onset", f"({self.placeholder_label_def_string},Onset, Onset)", f"({self.placeholder_label_def_string},Onset, Offset)", ] - # count of issues the line generates - onset_list = ['Onset'] - offset_list = ['Offset'] - test_issues = [ - self.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, tag=1), - self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=2) - + self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=onset_list), - self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=offset_list), - ] - - self._test_issues_no_context(test_strings, test_issues, hed_ops) - test_issues = [ self.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, tag=1), - self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=2) - + self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=onset_list), - self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=offset_list), + self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, def_tag="Def/TestDefPlaceholder/2471"), + 
self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, def_tag="Def/TestDefPlaceholder/2471"), ] - # Repeat with just hed validator - self._test_issues_no_context(test_strings, test_issues, hed_validator) - - def test_onset_multiple_or_misplaced_errors_no_validator(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - hed_ops = [def_mapper, onset_mapper] - - test_strings = [ - f"{self.placeholder_label_def_string},Onset", - f"({self.placeholder_label_def_string},Onset, Onset)", - f"({self.placeholder_label_def_string},Onset, Offset)", - f"({self.placeholder_label_def_string},Onset, Event)", - ] - # count of issues the line generates - test_issues = [ - [], - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=4, - def_tag="Def-expand/TestDefPlaceholder/2471"), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=4, - def_tag="Def-expand/TestDefPlaceholder/2471"), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=4, - def_tag="Def-expand/TestDefPlaceholder/2471"), - ] - - self._test_issues_no_context(test_strings, test_issues, hed_ops) - - # Verify it also works without def mapping - test_issues = [ - [], - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, - def_tag=self.placeholder_label_def_string), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, - def_tag=self.placeholder_label_def_string), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, - def_tag=self.placeholder_label_def_string), - ] - - self._test_issues_no_context(test_strings, test_issues, [hed_ops[1]]) + self._test_issues_no_context(test_strings, test_issues) def test_onset_two_in_one_line(self): - def_dict = DefinitionDict() - def_string = 
HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset), ({self.placeholder_label_def_string2},Onset)", f"({self.placeholder_label_def_string2},Offset)", @@ -391,7 +298,7 @@ def test_onset_two_in_one_line(self): [] ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) if __name__ == '__main__': diff --git a/tests/validator/test_tag_validator.py b/tests/validator/test_tag_validator.py index ea13e410a..dc0fb910a 100644 --- a/tests/validator/test_tag_validator.py +++ b/tests/validator/test_tag_validator.py @@ -11,8 +11,8 @@ class TestHed(TestValidatorBase): class IndividualHedTagsShort(TestHed): @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_individual_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_individual_tags_in_hed_string) def test_exist_in_schema(self): test_strings = { @@ -66,10 +66,10 @@ def test_exist_in_schema(self): def test_proper_capitalization(self): test_strings = { 'proper': 'Event/Sensory-event', - 'camelCase': 'EvEnt/Something', + 'camelCase': 'EvEnt/Sensory-event', 'takesValue': 'Sampling-rate/20 Hz', 'numeric': 'Statistical-uncertainty/20', - 'lowercase': 'Event/something' + 'lowercase': 'Event/sensory-event' } expected_results = { 'proper': True, @@ -85,7 +85,7 @@ def test_proper_capitalization(self): 'numeric': [], 'lowercase': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0) } - self.validator_syntactic(test_strings, expected_results, expected_issues, True) + self.validator_semantic(test_strings, expected_results, 
expected_issues, True) # def test_proper_capitalization(self): # test_strings = { @@ -112,7 +112,7 @@ def test_proper_capitalization(self): # 'lowercase': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0), # 'multipleUpper': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0) # } - # self.validator_syntactic(test_strings, expected_results, expected_issues, True) + # self.validator_semantic(test_strings, expected_results, expected_issues, True) # # def test_proper_capitalization_semantic(self): # test_strings = { @@ -352,7 +352,7 @@ def test_span_reporting(self): class TestTagLevels(TestHed): @staticmethod - def string_obj_func(validator, check_for_warnings): + def string_obj_func(validator): return validator._validate_groups_in_hed_string def test_no_duplicates(self): @@ -394,7 +394,7 @@ def test_no_duplicates(self): 'duplicateSubGroupF': self.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP, group=HedString("((Sensory-event,Man-made-object/VehicleTrain),Event)")), } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_no_duplicates_semantic(self): test_strings = { @@ -489,14 +489,14 @@ def test_empty_groups(self): expected_issues = { 'emptyGroup': self.format_error(ValidationErrors.HED_GROUP_EMPTY, tag=1000 + 1) } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) class FullHedString(TestHed): compute_forms = False @staticmethod - def string_obj_func(validator, check_for_warnings): + def string_obj_func(validator): return validator._tag_validator.run_hed_string_validators def test_invalid_placeholders(self): @@ -538,11 +538,13 @@ def test_mismatched_parentheses(self): closing_parentheses_count=1), 'extraClosing': self.format_error(ValidationErrors.HED_PARENTHESES_MISMATCH, opening_parentheses_count=1, - 
closing_parentheses_count=2), + closing_parentheses_count=2) + + self.format_error(ValidationErrors.HED_TAG_EMPTY, source_string=test_strings['extraClosing'], + char_index=84), 'valid': [] } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_malformed_delimiters(self): test_strings = { @@ -676,7 +678,7 @@ def test_malformed_delimiters(self): tag="Thing)) "), # 'emptyGroup': [] } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_invalid_characters(self): test_strings = { @@ -705,7 +707,7 @@ def test_invalid_characters(self): 'closingBracket': self.format_error(ValidationErrors.HED_CHARACTER_INVALID, char_index=45, source_string=test_strings['closingBracket']) } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_string_extra_slash_space(self): test_strings = { @@ -778,7 +780,7 @@ def test_string_extra_slash_space(self): index_in_tag=15, index_in_tag_end=18, tag=0), } - self.validator_syntactic(test_strings, expected_results, expected_errors, False) + self.validator_semantic(test_strings, expected_results, expected_errors, False) def test_no_more_than_two_tildes(self): test_strings = { @@ -817,15 +819,15 @@ def test_no_more_than_two_tildes(self): + self.format_error(ValidationErrors.HED_TILDES_UNSUPPORTED, source_string=test_strings['invalidTildeGroup'], char_index=147) } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) class RequiredTags(TestHed): schema_file = '../data/validator_tests/HED8.0.0_added_tests.mediawiki' @staticmethod - def string_obj_func(validator, check_for_warnings): - 
return partial(validator._validate_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_tags_in_hed_string) def test_includes_all_required_tags(self): test_strings = { @@ -857,13 +859,13 @@ def test_includes_all_required_tags(self): def test_multiple_copies_unique_tags(self): test_strings = { 'legal': 'Event-context,' - '(Vehicle,Event)', + '(Vehicle,Event), Animal-agent, Action', 'multipleDesc': 'Event-context,' 'Event-context,' - 'Vehicle,(Vehicle,Event-context)', + 'Vehicle,(Vehicle,Event-context), Animal-agent, Action', # I think this is illegal in hed2 style schema now. 'multipleDescIncShort': 'Event-context,' - 'Organizational-property/Event-context' + 'Organizational-property/Event-context, Animal-agent, Action' } expected_results = { 'legal': True, @@ -885,8 +887,8 @@ class TestHedSpecialUnits(TestHed): schema_file = '../data/validator_tests/HED8.0.0_added_tests.mediawiki' @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_individual_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_individual_tags_in_hed_string) def test_special_units(self): test_strings = { diff --git a/tests/validator/test_tag_validator_base.py b/tests/validator/test_tag_validator_base.py index df8812479..75f2b10e7 100644 --- a/tests/validator/test_tag_validator_base.py +++ b/tests/validator/test_tag_validator_base.py @@ -66,45 +66,38 @@ class TestValidatorBase(TestHedBase): def setUpClass(cls): super().setUpClass() cls.error_handler = error_reporter.ErrorHandler() - cls.syntactic_hed_input_reader = HedValidator(hed_schema=None, - run_semantic_validation=False) - cls.syntactic_tag_validator = cls.syntactic_hed_input_reader._tag_validator - cls.semantic_hed_input_reader = HedValidator(hed_schema=cls.hed_schema, - run_semantic_validation=True) + # cls.syntactic_hed_input_reader = 
HedValidator(hed_schema=None) + # cls.syntactic_tag_validator = cls.syntactic_hed_input_reader._tag_validator + cls.semantic_hed_input_reader = HedValidator(hed_schema=cls.hed_schema) cls.semantic_tag_validator = cls.semantic_hed_input_reader._tag_validator def validator_base(self, test_strings, expected_results, expected_issues, test_function, - hed_schema=None): + hed_schema=None, check_for_warnings=False): for test_key in test_strings: hed_string_obj = HedString(test_strings[test_key]) - error_handler = ErrorHandler() + error_handler = ErrorHandler(check_for_warnings=check_for_warnings) error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, increment_depth_after=False) test_issues = [] if self.compute_forms: test_issues += hed_string_obj.convert_to_canonical_forms(hed_schema) if not test_issues: test_issues += test_function(hed_string_obj) - test_result = not test_issues expected_params = expected_issues[test_key] expected_result = expected_results[test_key] expected_issue = self.format_errors_fully(error_handler, hed_string=hed_string_obj, params=expected_params) - error_handler.add_context_to_issues(test_issues) + error_handler.add_context_and_filter(test_issues) + test_result = not test_issues - # print(test_key) - # print(str(expected_issue)) - # print(str(test_issues)) + print(test_key) + print(str(expected_issue)) + print(str(test_issues)) error_handler.pop_error_context() self.assertEqual(test_result, expected_result, test_strings[test_key]) self.assertCountEqual(test_issues, expected_issue, test_strings[test_key]) - def validator_syntactic(self, test_strings, expected_results, expected_issues, check_for_warnings): - validator = self.syntactic_hed_input_reader - self.validator_base(test_strings, expected_results, expected_issues, - self.string_obj_func(validator, check_for_warnings=check_for_warnings)) - def validator_semantic(self, test_strings, expected_results, expected_issues, check_for_warnings): validator = 
self.semantic_hed_input_reader self.validator_base(test_strings, expected_results, expected_issues, - self.string_obj_func(validator, check_for_warnings=check_for_warnings), + self.string_obj_func(validator), check_for_warnings=check_for_warnings, hed_schema=validator._hed_schema) diff --git a/tests/validator/test_tag_validator_library.py b/tests/validator/test_tag_validator_library.py index 15c86545e..c4552f689 100644 --- a/tests/validator/test_tag_validator_library.py +++ b/tests/validator/test_tag_validator_library.py @@ -43,8 +43,8 @@ def test_invalid_load_prefix(self): class IndividualHedTagsShort(TestHed3): @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_individual_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_individual_tags_in_hed_string) def test_exist_in_schema(self): test_strings = { @@ -102,10 +102,10 @@ def test_exist_in_schema(self): def test_proper_capitalization(self): test_strings = { 'proper': 'tl:Event/Sensory-event', - 'camelCase': 'tl:EvEnt/Something', - 'takesValue': 'tl:Attribute/Temporal rate/20 Hz', - 'numeric': 'tl:Repetition-number/20', - 'lowercase': 'tl:Event/something' + 'camelCase': 'tl:EvEnt/Sensory-event', + 'takesValue': 'tl:Sampling-rate/20 Hz', + 'numeric': 'tl:Statistical-uncertainty/20', + 'lowercase': 'tl:Event/sensory-event' } expected_results = { 'proper': True, @@ -121,7 +121,7 @@ def test_proper_capitalization(self): 'numeric': [], 'lowercase': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0) } - self.validator_syntactic(test_strings, expected_results, expected_issues, True) + self.validator_semantic(test_strings, expected_results, expected_issues, True) def test_child_required(self): test_strings = { @@ -302,17 +302,17 @@ def test_span_reporting(self): class TestTagLevels3(TestHed3): @staticmethod - def string_obj_func(validator, check_for_warnings): + def 
string_obj_func(validator): return validator._validate_groups_in_hed_string def test_no_duplicates(self): test_strings = { 'topLevelDuplicate': 'tl:Event/Sensory-event,tl:Event/Sensory-event', 'groupDuplicate': 'tl:Item/Object/Man-made-object/VehicleTrain,(tl:Event/Sensory-event,' - 'tl:Attribute/Sensory/Visual/Color/CSS-color/Purple-color/Purple,tl:Event/Sensory-event)', + 'tl:Purple-color/Purple,tl:Event/Sensory-event)', 'noDuplicate': 'tl:Event/Sensory-event,' 'tl:Item/Object/Man-made-object/VehicleTrain,' - 'tl:Attribute/Sensory/Visual/Color/CSS-color/Purple-color/Purple', + 'tl:Purple-color/Purple', 'legalDuplicate': 'tl:Item/Object/Man-made-object/VehicleTrain,\ (tl:Item/Object/Man-made-object/VehicleTrain,' 'tl:Event/Sensory-event)', @@ -329,7 +329,7 @@ def test_no_duplicates(self): 'legalDuplicate': [], 'noDuplicate': [] } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_no_duplicates_semantic(self): test_strings = { @@ -417,8 +417,8 @@ def test_taggroup_validation(self): class RequiredTags(TestHed3): @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_tags_in_hed_string) def test_includes_all_required_tags(self): test_strings = { @@ -452,12 +452,13 @@ def test_includes_all_required_tags(self): def test_multiple_copies_unique_tags(self): test_strings = { 'legal': 'tl:Event-context,' - '(Vehicle,Event)', + '(Vehicle,Event), Animal-agent, Action, tl:Animal-agent, tl:Action', 'multipleDesc': 'tl:Event-context,' 'tl:Event-context,' - 'Vehicle,(Vehicle,tl:Event-context)', + 'Vehicle,(Vehicle,tl:Event-context), Animal-agent, Action, tl:Animal-agent, tl:Action', 'multipleDescIncShort': 'tl:Event-context,' - 'tl:Organizational-property/Event-context' + 
'tl:Organizational-property/Event-context,' + ' Animal-agent, Action, tl:Animal-agent, tl:Action' } expected_results = { 'legal': True, From 28ef39e4c106e05596ca21001aa01261366ac9f2 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 16 Mar 2023 11:21:22 -0500 Subject: [PATCH 2/2] Add missing data file. Disable prints --- tests/data/sidecar_tests/both_types_events_with_defs.json | 6 +++--- tests/validator/test_onset_validator.py | 8 ++++---- tests/validator/test_tag_validator_base.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/data/sidecar_tests/both_types_events_with_defs.json b/tests/data/sidecar_tests/both_types_events_with_defs.json index 29b133724..7047a1fdd 100644 --- a/tests/data/sidecar_tests/both_types_events_with_defs.json +++ b/tests/data/sidecar_tests/both_types_events_with_defs.json @@ -20,16 +20,16 @@ "stim_file": { "LongName": "Stimulus file name", "Description": "Relative path of the stimulus image file", - "HED": "Attribute/File/#, (Definition/JsonFileDef2/#, (Item/JsonDef2/#,Item/JsonDef2)), (Definition/JsonFileDef3/#, (Item/JsonDef3/#,InvalidTag))" + "HED": "Age/#, (Definition/JsonFileDef2/#, (Item/JsonDef2/#,Item/JsonDef2)), (Definition/JsonFileDef3/#, (Item/JsonDef3/#))" }, "takes_value_def": { "LongName": "Def with a takes value tag", "Description": "Relative path of the stimulus image file", - "HED": "Attribute/File/#, (Definition/TakesValueDef/#, (Age/#))" + "HED": "Age/#, (Definition/TakesValueDef/#, (Age/#))" }, "unit_class_def": { "LongName": "Def with a value class", "Description": "Relative path of the stimulus image file", - "HED": "Attribute/File/#, (Definition/ValueClassDef/#, (Acceleration/#))" + "HED": "Age/#, (Definition/ValueClassDef/#, (Acceleration/#))" } } \ No newline at end of file diff --git a/tests/validator/test_onset_validator.py b/tests/validator/test_onset_validator.py index 1bc814f33..de46d116b 100644 --- a/tests/validator/test_onset_validator.py +++ 
b/tests/validator/test_onset_validator.py @@ -56,8 +56,8 @@ def _test_issues_base(self, test_strings, test_issues, test_context, placeholder error_handler.add_context_and_filter(onset_issues) test_string.shrink_defs() issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - print(str(onset_issues)) - print(str(issues)) + # print(str(onset_issues)) + # print(str(issues)) error_handler.pop_error_context() self.assertEqual(len(validator._onsets), context) self.assertCountEqual(onset_issues, issues) @@ -71,8 +71,8 @@ def _test_issues_no_context(self, test_strings, test_issues): onset_issues = hed_validator.validate(test_string, False) error_handler.add_context_and_filter(onset_issues) issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - print(str(onset_issues)) - print(str(issues)) + # print(str(onset_issues)) + # print(str(issues)) error_handler.pop_error_context() self.assertCountEqual(onset_issues, issues) diff --git a/tests/validator/test_tag_validator_base.py b/tests/validator/test_tag_validator_base.py index 75f2b10e7..37d78668c 100644 --- a/tests/validator/test_tag_validator_base.py +++ b/tests/validator/test_tag_validator_base.py @@ -89,9 +89,9 @@ def validator_base(self, test_strings, expected_results, expected_issues, test_f error_handler.add_context_and_filter(test_issues) test_result = not test_issues - print(test_key) - print(str(expected_issue)) - print(str(test_issues)) + # print(test_key) + # print(str(expected_issue)) + # print(str(test_issues)) error_handler.pop_error_context() self.assertEqual(test_result, expected_result, test_strings[test_key]) self.assertCountEqual(test_issues, expected_issue, test_strings[test_key])