From fbb8fd81d1332d16f5da7bed61d89318b9667f6c Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 15 Mar 2023 17:46:41 -0500 Subject: [PATCH 01/19] First pass refactor of models --- hed/__init__.py | 3 +- hed/errors/error_messages.py | 142 ++--- hed/errors/error_reporter.py | 49 +- hed/errors/error_types.py | 5 +- hed/errors/exceptions.py | 2 + hed/models/__init__.py | 3 - hed/models/base_input.py | 509 ++++++------------ hed/models/column_mapper.py | 221 +++----- hed/models/column_metadata.py | 107 +--- hed/models/def_mapper.py | 255 --------- hed/models/definition_dict.py | 154 +++++- hed/models/df_util.py | 125 +++++ hed/models/expression_parser.py | 4 +- hed/models/hed_group.py | 18 +- hed/models/hed_ops.py | 262 --------- hed/models/hed_string.py | 110 ++-- hed/models/hed_tag.py | 143 +++-- hed/models/sidecar.py | 254 ++++++--- hed/models/sidecar_base.py | 269 --------- hed/models/spreadsheet_input.py | 12 +- hed/models/tabular_input.py | 62 +-- hed/models/timeseries_input.py | 2 +- hed/schema/schema_compliance.py | 2 +- hed/validator/__init__.py | 4 + hed/validator/def_validator.py | 78 +++ hed/validator/hed_validator.py | 119 ++-- .../onset_validator.py} | 46 +- hed/validator/sidecar_validator.py | 147 +++++ hed/validator/spreadsheet_validator.py | 114 ++++ hed/validator/tag_validator.py | 100 ++-- spec_tests/test_errors.py | 182 ++++--- tests/data/model_tests/na_tag_column.tsv | 2 + tests/data/model_tests/na_value_column.json | 5 + tests/data/model_tests/na_value_column.tsv | 3 + .../no_column_header_definition.tsv | 4 +- .../no_column_header_definition_long.tsv | 4 +- .../data/validator_tests/bids_events_HED.json | 3 +- tests/models/test_base_file_input.py | 19 +- tests/models/test_column_mapper.py | 90 +--- tests/models/test_def_mapper.py | 292 ---------- tests/models/test_definition_dict.py | 36 +- tests/models/test_expression_parser.py | 11 + tests/models/test_hed_string.py | 27 + tests/models/test_hed_tag.py | 28 +- tests/models/test_sidecar.py | 38 +- 
tests/models/test_spreadsheet_input.py | 92 +--- tests/models/test_tabular_input.py | 55 +- tests/schema/test_convert_tags.py | 2 +- tests/validator/test_def_validator.py | 119 ++++ tests/validator/test_hed_validator.py | 92 +--- .../test_onset_validator.py} | 227 +++----- tests/validator/test_tag_validator.py | 48 +- tests/validator/test_tag_validator_base.py | 29 +- tests/validator/test_tag_validator_library.py | 33 +- 54 files changed, 1920 insertions(+), 2842 deletions(-) delete mode 100644 hed/models/def_mapper.py create mode 100644 hed/models/df_util.py delete mode 100644 hed/models/hed_ops.py delete mode 100644 hed/models/sidecar_base.py create mode 100644 hed/validator/def_validator.py rename hed/{models/onset_mapper.py => validator/onset_validator.py} (76%) create mode 100644 hed/validator/sidecar_validator.py create mode 100644 hed/validator/spreadsheet_validator.py create mode 100644 tests/data/model_tests/na_tag_column.tsv create mode 100644 tests/data/model_tests/na_value_column.json create mode 100644 tests/data/model_tests/na_value_column.tsv delete mode 100644 tests/models/test_def_mapper.py create mode 100644 tests/validator/test_def_validator.py rename tests/{models/test_onset_mapper.py => validator/test_onset_validator.py} (57%) diff --git a/hed/__init__.py b/hed/__init__.py index 40faff8ab..e2bdcd053 100644 --- a/hed/__init__.py +++ b/hed/__init__.py @@ -7,12 +7,13 @@ from hed.models.spreadsheet_input import SpreadsheetInput from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar +from hed.models.definition_dict import DefinitionDict + from hed.schema.hed_schema import HedSchema from hed.schema.hed_schema_group import HedSchemaGroup from hed.schema.hed_schema_io import get_schema, get_schema_versions, load_schema, load_schema_version -from hed.validator.hed_validator import HedValidator # from hed import errors, models, schema, tools, validator diff --git a/hed/errors/error_messages.py 
b/hed/errors/error_messages.py index 2d3647d9a..9ae9557f3 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -12,327 +12,333 @@ @hed_tag_error(ValidationErrors.HED_UNITS_INVALID) def val_error_invalid_unit(tag, units): units_string = ','.join(sorted(units)) - return f'Invalid unit - "{tag}" valid units are "{units_string}"', { - "units": sorted(units) - } + return f'Invalid unit - "{tag}" valid units are "{units_string}"' @hed_error(ValidationErrors.HED_TAG_EMPTY) def val_error_extra_comma(source_string, char_index): character = source_string[char_index] - return f"HED tags cannot be empty. Extra delimiter found: '{character}' at index {char_index}'", { - 'char_index': char_index - } + return f"HED tags cannot be empty. Extra delimiter found: '{character}' at index {char_index}'" @hed_tag_error(ValidationErrors.HED_GROUP_EMPTY, actual_code=ValidationErrors.HED_TAG_EMPTY) def val_error_empty_group(tag): - return f"HED tags cannot be empty. Extra delimiters found: '{tag}'", {} + return f"HED tags cannot be empty. Extra delimiters found: '{tag}'" @hed_tag_error(ValidationErrors.HED_TAG_EXTENDED, has_sub_tag=True, default_severity=ErrorSeverity.WARNING) def val_error_tag_extended(tag, problem_tag): - return f"Hed tag is extended. '{problem_tag}' in {tag}", {} + return f"Hed tag is extended. 
'{problem_tag}' in {tag}" @hed_error(ValidationErrors.HED_CHARACTER_INVALID) def val_error_invalid_char(source_string, char_index): character = source_string[char_index] - return f'Invalid character "{character}" at index {char_index}"', { - 'char_index': char_index - } + return f'Invalid character "{character}" at index {char_index}"' @hed_tag_error(ValidationErrors.INVALID_TAG_CHARACTER, has_sub_tag=True, actual_code=ValidationErrors.HED_CHARACTER_INVALID) def val_error_invalid_tag_character(tag, problem_tag): - return f"Invalid character '{problem_tag}' in {tag}", {} + return f"Invalid character '{problem_tag}' in {tag}" @hed_error(ValidationErrors.HED_TILDES_UNSUPPORTED) def val_error_tildes_not_supported(source_string, char_index): character = source_string[char_index] - return f"Tildes not supported. Replace (a ~ b ~ c) with (a, (b, c)). '{character}' at index {char_index}'", { - 'char_index': char_index - } + return f"Tildes not supported. Replace (a ~ b ~ c) with (a, (b, c)). '{character}' at index {char_index}'" @hed_error(ValidationErrors.HED_COMMA_MISSING) def val_error_comma_missing(tag): - return f"Comma missing after - '{tag}'", {} + return f"Comma missing after - '{tag}'" @hed_tag_error(ValidationErrors.HED_TAG_REPEATED) def val_error_duplicate_tag(tag): - return f'Repeated tag - "{tag}"', {} + return f'Repeated tag - "{tag}"' @hed_error(ValidationErrors.HED_TAG_REPEATED_GROUP) def val_error_duplicate_group(group): - return f'Repeated group - "{group}"', {} + return f'Repeated group - "{group}"' @hed_error(ValidationErrors.HED_PARENTHESES_MISMATCH) def val_error_parentheses(opening_parentheses_count, closing_parentheses_count): return f'Number of opening and closing parentheses are unequal. '\ f'{opening_parentheses_count} opening parentheses. 
{closing_parentheses_count} '\ - 'closing parentheses', {} + 'closing parentheses' @hed_tag_error(ValidationErrors.HED_TAG_REQUIRES_CHILD) def val_error_require_child(tag): - return f"Descendant tag required - '{tag}'", {} + return f"Descendant tag required - '{tag}'" @hed_error(ValidationErrors.HED_TAG_NOT_UNIQUE) def val_error_multiple_unique(tag_prefix): - return f"Multiple unique tags with prefix - '{tag_prefix}'", {} + return f"Multiple unique tags with prefix - '{tag_prefix}'" + + +@hed_tag_error(ValidationErrors.TAG_PREFIX_INVALID) +def val_error_prefix_invalid(tag, tag_prefix): + return f"Prefixes can only contain alpha characters. - '{tag_prefix}'" @hed_tag_error(ValidationErrors.INVALID_EXTENSION, actual_code=ValidationErrors.HED_TAG_INVALID) def val_error_invalid_extension(tag): - return f'Invalid extension on tag - "{tag}"', {} + return f'Invalid extension on tag - "{tag}"' @hed_tag_error(ValidationErrors.INVALID_PARENT_NODE, has_sub_tag=True, actual_code=ValidationErrors.HED_TAG_INVALID) def val_error_invalid_parent(tag, problem_tag, expected_parent_tag): return f"In '{tag}', '{problem_tag}' appears as '{str(expected_parent_tag)}' and cannot be used " \ - f"as an extension.", {"expected_parent_tag": expected_parent_tag} + f"as an extension." @hed_tag_error(ValidationErrors.NO_VALID_TAG_FOUND, has_sub_tag=True, actual_code=ValidationErrors.HED_TAG_INVALID) def val_error_no_valid_tag(tag, problem_tag): - return f"'{problem_tag}' in {tag} is not a valid base hed tag.", {} + return f"'{problem_tag}' in {tag} is not a valid base hed tag." @hed_tag_error(ValidationErrors.HED_VALUE_INVALID) def val_error_no_value(tag): - return f"''{tag}' has an invalid value portion.", {} + return f"''{tag}' has an invalid value portion." 
@hed_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_missing_column(column_name): - return f"Required column '{column_name}' not specified or found in file.", {} + return f"Required column '{column_name}' not specified or found in file." @hed_error(ValidationErrors.HED_UNKNOWN_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_extra_column(column_name): return f"Column named '{column_name}' found in file, but not specified as a tag column " + \ - "or identified in sidecars.", {} + "or identified in sidecars." @hed_error(ValidationErrors.HED_BLANK_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_hed_blank_column(column_number): - return f"Column number {column_number} has no column name", {} + return f"Column number {column_number} has no column name" @hed_error(ValidationErrors.HED_DUPLICATE_COLUMN, default_severity=ErrorSeverity.WARNING) def val_error_hed_duplicate_column(column_name): - return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged.", {} + return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged." @hed_tag_error(ValidationErrors.HED_LIBRARY_UNMATCHED) def val_error_unknown_prefix(tag, unknown_prefix, known_prefixes): - return f"Tag '{tag} has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}", {} + return f"Tag '{tag} has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}" @hed_tag_error(ValidationErrors.HED_NODE_NAME_EMPTY, has_sub_tag=True) def val_error_extra_slashes_spaces(tag, problem_tag): - return f"Extra slashes or spaces '{problem_tag}' in tag '{tag}'", {} + return f"Extra slashes or spaces '{problem_tag}' in tag '{tag}'" @hed_error(ValidationErrors.HED_SIDECAR_KEY_MISSING, default_severity=ErrorSeverity.WARNING) def val_error_sidecar_key_missing(invalid_key, category_keys): - return f"Category key '{invalid_key}' does not exist in column. 
Valid keys are: {category_keys}", {} + return f"Category key '{invalid_key}' does not exist in column. Valid keys are: {category_keys}" @hed_tag_error(ValidationErrors.HED_DEF_UNMATCHED) def val_error_def_unmatched(tag): - return f"A data-recording’s Def tag cannot be matched to definition. Tag: '{tag}'", {} + return f"A data-recording’s Def tag cannot be matched to definition. Tag: '{tag}'" @hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID) def val_error_bad_def_expand(tag, actual_def, found_def): return f"A data-recording’s Def-expand tag does not match the given definition." + \ - f"Tag: '{tag}'. Actual Def: {actual_def}. Found Def: {found_def}", {} + f"Tag: '{tag}'. Actual Def: {actual_def}. Found Def: {found_def}" @hed_tag_error(ValidationErrors.HED_DEF_VALUE_MISSING, actual_code=ValidationErrors.HED_DEF_VALUE_INVALID) def val_error_def_value_missing(tag): - return f"A def tag requires a placeholder value, but was not given one. Definition: '{tag}'", {} + return f"A def tag requires a placeholder value, but was not given one. Definition: '{tag}'" @hed_tag_error(ValidationErrors.HED_DEF_VALUE_EXTRA, actual_code=ValidationErrors.HED_DEF_VALUE_INVALID) def val_error_def_value_extra(tag): - return f"A def tag does not take a placeholder value, but was given one. Definition: '{tag}", {} + return f"A def tag does not take a placeholder value, but was given one. Definition: '{tag}" @hed_tag_error(ValidationErrors.HED_TOP_LEVEL_TAG, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_top_level_tag(tag): - return f"A tag that must be in a top level group was found in another location. {str(tag)}", {} + return f"A tag that must be in a top level group was found in another location. {str(tag)}" @hed_tag_error(ValidationErrors.HED_TAG_GROUP_TAG, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_tag_group_tag(tag): - return f"A tag that must be in a group was found in another location. 
{str(tag)}", {} + return f"A tag that must be in a group was found in another location. {str(tag)}" @hed_tag_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_top_level_tags(tag, multiple_tags): tags_as_string = [str(tag) for tag in multiple_tags] return f"Multiple top level tags found in a single group. First one found: {str(tag)}. " + \ - f"Remainder:{str(tags_as_string)}", {} + f"Remainder:{str(tags_as_string)}" @hed_error(ValidationErrors.HED_REQUIRED_TAG_MISSING) def val_warning_required_prefix_missing(tag_prefix): - return f"Tag with prefix '{tag_prefix}' is required", {} + return f"Tag with prefix '{tag_prefix}' is required" @hed_tag_error(ValidationErrors.HED_STYLE_WARNING, default_severity=ErrorSeverity.WARNING) def val_warning_capitalization(tag): - return f"First word not capitalized or camel case - '{tag}'", {} + return f"First word not capitalized or camel case - '{tag}'" @hed_tag_error(ValidationErrors.HED_UNITS_DEFAULT_USED, default_severity=ErrorSeverity.WARNING) def val_warning_default_units_used(tag, default_unit): - return f"No unit specified. Using '{default_unit}' as the default - '{tag}'", {} + return f"No unit specified. Using '{default_unit}' as the default - '{tag}'" @hed_error(SchemaErrors.HED_SCHEMA_DUPLICATE_NODE) def schema_error_hed_duplicate_node(tag, duplicate_tag_list, section): tag_join_delimiter = "\n\t" return f"Duplicate term '{str(tag)}' used {len(duplicate_tag_list)} places in '{section}' section schema as:" + \ - f"{tag_join_delimiter}{tag_join_delimiter.join(duplicate_tag_list)}", {} + f"{tag_join_delimiter}{tag_join_delimiter.join(duplicate_tag_list)}" @hed_error(SchemaErrors.HED_SCHEMA_ATTRIBUTE_INVALID) def schema_error_unknown_attribute(attribute_name, source_tag): return f"Attribute '{attribute_name}' used by '{source_tag}' was not defined in the schema, " \ - f"or was used outside of it's defined class.", {} + f"or was used outside of it's defined class." 
@hed_error(SchemaWarnings.INVALID_CHARACTERS_IN_DESC, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.HED_SCHEMA_CHARACTER_INVALID) def schema_warning_invalid_chars_desc(desc_string, tag_name, problem_char, char_index): - return f"Invalid character '{problem_char}' in desc for '{tag_name}' at position {char_index}. '{desc_string}", {} + return f"Invalid character '{problem_char}' in desc for '{tag_name}' at position {char_index}. '{desc_string}" @hed_error(SchemaWarnings.INVALID_CHARACTERS_IN_TAG, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.HED_SCHEMA_CHARACTER_INVALID) def schema_warning_invalid_chars_tag(tag_name, problem_char, char_index): - return f"Invalid character '{problem_char}' in tag '{tag_name}' at position {char_index}.", {} + return f"Invalid character '{problem_char}' in tag '{tag_name}' at position {char_index}." @hed_error(SchemaWarnings.INVALID_CAPITALIZATION, default_severity=ErrorSeverity.WARNING) def schema_warning_invalid_capitalization(tag_name, problem_char, char_index): return "First character must be a capital letter or number. " + \ - f"Found character '{problem_char}' in tag '{tag_name}' at position {char_index}.", \ - {'problem_char': problem_char} + f"Found character '{problem_char}' in tag '{tag_name}' at position {char_index}." @hed_error(SchemaWarnings.NON_PLACEHOLDER_HAS_CLASS, default_severity=ErrorSeverity.WARNING) def schema_warning_non_placeholder_class(tag_name, invalid_attribute_name): return "Only placeholder nodes('#') can have a unit or value class." + \ - f"Found {invalid_attribute_name} on {tag_name}", {} + f"Found {invalid_attribute_name} on {tag_name}" @hed_error(SidecarErrors.BLANK_HED_STRING) def sidecar_error_blank_hed_string(): - return "No HED string found for Value or Category column.", {} + return "No HED string found for Value or Category column." 
@hed_error(SidecarErrors.WRONG_HED_DATA_TYPE) def sidecar_error_hed_data_type(expected_type, given_type): - return f"Invalid HED string datatype sidecar. Should be '{expected_type}', but got '{given_type}'", {} + return f"Invalid HED string datatype sidecar. Should be '{expected_type}', but got '{given_type}'" @hed_error(SidecarErrors.INVALID_POUND_SIGNS_VALUE, actual_code=ValidationErrors.HED_PLACEHOLDER_INVALID) def sidecar_error_invalid_pound_sign_count(pound_sign_count): - return f"There should be exactly one # character in a sidecar string. Found {pound_sign_count}", {} + return f"There should be exactly one # character in a sidecar string. Found {pound_sign_count}" @hed_error(SidecarErrors.INVALID_POUND_SIGNS_CATEGORY, actual_code=ValidationErrors.HED_PLACEHOLDER_INVALID) def sidecar_error_too_many_pound_signs(pound_sign_count): - return f"There should be no # characters in a category sidecar string. Found {pound_sign_count}", {} + return f"There should be no # characters in a category sidecar string. Found {pound_sign_count}" @hed_error(SidecarErrors.UNKNOWN_COLUMN_TYPE) def sidecar_error_unknown_column(column_name): return f"Could not automatically identify column '{column_name}' type from file. "\ - "Most likely the column definition in question needs a # sign to replace a number somewhere.", {} + "Most likely the column definition in question needs a # sign to replace a number somewhere." + + +@hed_error(SidecarErrors.SIDECAR_HED_USED, actual_code=SidecarErrors.SIDECAR_INVALID) +def sidecar_hed_used(): + return "'HED' is a reserved name and cannot be used as a sidecar column name" + + +@hed_error(SidecarErrors.SIDECAR_NA_USED, actual_code=SidecarErrors.SIDECAR_INVALID) +def sidecar_na_used(column_name): + return f"Invalid category key 'n/a' found in column {column_name}." 
@hed_tag_error(DefinitionErrors.DEF_TAG_IN_DEFINITION, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_def_tag_in_definition(tag, def_name): return f"Invalid tag {tag} found in definition for {def_name}. " +\ - f"Def and Def-expand tags cannot be in definitions.", {} + f"Def and Def-expand tags cannot be in definitions." @hed_error(DefinitionErrors.WRONG_NUMBER_GROUP_TAGS, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_wrong_group_tags(def_name, tag_list): tag_list_strings = [str(tag) for tag in tag_list] - return f"Too many group tags found in definition for {def_name}. Expected 1, found: {tag_list_strings}", {} + return f"Too many group tags found in definition for {def_name}. Expected 1, found: {tag_list_strings}" @hed_error(DefinitionErrors.WRONG_NUMBER_PLACEHOLDER_TAGS, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_wrong_placeholder_count(def_name, expected_count, tag_list): tag_list_strings = [str(tag) for tag in tag_list] return f"Incorrect number placeholder tags found in definition for {def_name}. " + \ - f"Expected {expected_count}, found: {tag_list_strings}", {} + f"Expected {expected_count}, found: {tag_list_strings}" @hed_error(DefinitionErrors.DUPLICATE_DEFINITION, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_duplicate_definition(def_name): - return f"Duplicate definition found for '{def_name}'.", {} + return f"Duplicate definition found for '{def_name}'." @hed_error(DefinitionErrors.TAG_IN_SCHEMA, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_tag_already_in_schema(def_name): - return f"Term '{def_name}' already used as term in schema and cannot be re-used as a definition.", {} + return f"Term '{def_name}' already used as term in schema and cannot be re-used as a definition." 
@hed_error(DefinitionErrors.INVALID_DEFINITION_EXTENSION, actual_code=ValidationErrors.HED_DEFINITION_INVALID) def def_error_invalid_def_extension(def_name): - return f"Term '{def_name}' has an invalid extension. Definitions can only have one term.", {} + return f"Term '{def_name}' has an invalid extension. Definitions can only have one term." @hed_tag_error(OnsetErrors.ONSET_DEF_UNMATCHED, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_error_def_unmatched(tag): - return f"The def tag in an onset/offset tag is unmatched. Def tag: '{tag}'", {} + return f"The def tag in an onset/offset tag is unmatched. Def tag: '{tag}'" @hed_tag_error(OnsetErrors.OFFSET_BEFORE_ONSET, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_error_offset_before_onset(tag): - return f"Offset tag '{tag}' does not have a matching onset.", {} + return f"Offset tag '{tag}' does not have a matching onset." @hed_tag_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_no_def_found(tag): - return f"'{tag}' tag has no def or def-expand tag in string.", {} + return f"'{tag}' tag has no def or def-expand tag in string." @hed_tag_error(OnsetErrors.ONSET_TOO_MANY_DEFS, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_too_many_defs(tag, tag_list): tag_list_strings = [str(tag) for tag in tag_list] - return f"Too many def tags found in onset for {tag}. Expected 1, also found: {tag_list_strings}", {} + return f"Too many def tags found in onset for {tag}. Expected 1, also found: {tag_list_strings}" @hed_tag_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_too_many_groups(tag, tag_list): tag_list_strings = [str(a_tag) for a_tag in tag_list] return f"An onset tag should have at most 2 sibling nodes, an offset tag should have 1. 
" +\ - f"Found {len(tag_list_strings)}: {tag_list_strings}", {} + f"Found {len(tag_list_strings)}: {tag_list_strings}" @hed_tag_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_wrong_type_tag(tag, def_tag): return f"Onset def tag '{def_tag}' has an improper sibling tag '{tag}'. All onset context tags must be " + \ - f"in a single group together.", {} + f"in a single group together." @hed_tag_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, actual_code=ValidationErrors.HED_ONSET_OFFSET_ERROR) def onset_wrong_placeholder(tag, has_placeholder): if has_placeholder: - return f"Onset/offset def tag {tag} expects a placeholder value, but does not have one.", {} - return f"Onset/offset def tag {tag} should not have a placeholder, but has one.", {} + return f"Onset/offset def tag {tag} expects a placeholder value, but does not have one." + return f"Onset/offset def tag {tag} should not have a placeholder, but has one." diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py index 8f8b1e368..4a7fd91a9 100644 --- a/hed/errors/error_reporter.py +++ b/hed/errors/error_reporter.py @@ -43,8 +43,8 @@ def wrapper(*args, severity=default_severity, **kwargs): Returns: list: A list of dict with the errors.= """ - base_message, error_vars = func(*args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, **error_vars) + base_message = func(*args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity) return error_object _register_error_function(error_type, wrapper_func=wrapper) @@ -97,8 +97,8 @@ def wrapper(tag, index_in_tag, index_in_tag_end, *args, severity=default_severit except AttributeError: org_tag_text = str(tag) - base_message, error_vars = func(org_tag_text, problem_sub_tag, *args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, **error_vars, + base_message = 
func(org_tag_text, problem_sub_tag, *args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, index_in_tag=index_in_tag, index_in_tag_end=index_in_tag_end, source_tag=tag) @@ -129,8 +129,8 @@ def wrapper(tag, *args, severity=default_severity, **kwargs): org_tag_text = tag.get_original_hed_string() else: org_tag_text = str(tag) - base_message, error_vars = func(org_tag_text, *args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, **error_vars, + base_message = func(org_tag_text, *args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, source_tag=tag) return error_object @@ -148,9 +148,10 @@ def wrapper(tag, *args, severity=default_severity, **kwargs): class ErrorHandler: - def __init__(self): + def __init__(self, check_for_warnings=True): # The current (ordered) dictionary of contexts. self.error_context = [] + self._check_for_warnings = check_for_warnings def push_error_context(self, context_type, context, increment_depth_after=True): """ Push a new error context to narrow down error scope. 
@@ -191,8 +192,12 @@ def get_error_context_copy(self): def format_error_with_context(self, *args, **kwargs): error_object = ErrorHandler.format_error(*args, **kwargs) if self is not None: - self._add_context_to_errors(error_object[0], self.error_context) - self._update_error_with_char_pos(error_object[0]) + actual_error = error_object[0] + # # Filter out warning errors + if not self._check_for_warnings and actual_error['severity'] >= ErrorSeverity.WARNING: + return [] + self._add_context_to_errors(actual_error, self.error_context) + self._update_error_with_char_pos(actual_error) return error_object @@ -225,26 +230,19 @@ def format_error(error_type, *args, actual_error=None, **kwargs): return [error_object] - def add_context_to_issues(self, issues): + def add_context_and_filter(self, issues): + """ Filter out warnings if requested, while adding context to issues. + + issues(list): + list: A list containing a single dictionary representing a single error. + """ + if not self._check_for_warnings: + issues[:] = self.filter_issues_by_severity(issues, ErrorSeverity.ERROR) + for error_object in issues: self._add_context_to_errors(error_object, self.error_context) self._update_error_with_char_pos(error_object) - def format_error_list(self, issue_params): - """ Convert an issue params list to an issues list. This means adding the error context primarily. - - Parameters: - issue_params (list): A list of dict containing the unformatted issues list. - - Returns: - list: A list of dict containing unformatted errors. - - """ - formatted_issues = [] - for issue in issue_params: - formatted_issues += self.format_error(**issue) - return formatted_issues - @staticmethod def format_error_from_context(error_type, error_context, *args, actual_error=None, **kwargs): """ Format an error based on the error type. @@ -262,6 +260,7 @@ def format_error_from_context(error_type, error_context, *args, actual_error=Non Notes: - Generally the error_context is returned from _add_context_to_errors. 
- The actual_error is useful for errors that are shared like invalid character. + - This can't filter out warnings like the other ones. """ error_func = error_functions.get(error_type) diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index feb21bef6..ac76f6992 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -47,6 +47,7 @@ class ValidationErrors: HED_UNITS_DEFAULT_USED = 'HED_UNITS_DEFAULT_USED' HED_VALUE_INVALID = 'HED_VALUE_INVALID' HED_LIBRARY_UNMATCHED = "HED_LIBRARY_UNMATCHED" + TAG_PREFIX_INVALID = "TAG_PREFIX_INVALID" # HED_VERSION_WARNING HED_MISSING_REQUIRED_COLUMN = "HED_MISSING_REQUIRED_COLUMN" @@ -75,12 +76,14 @@ class ValidationErrors: class SidecarErrors: # These are for json sidecar validation errors(sidecars can also produce most normal validation errors) + SIDECAR_INVALID = "SIDECAR_INVALID" # this is the generic error reported for several later ones BLANK_HED_STRING = 'blankValueString' WRONG_HED_DATA_TYPE = 'wrongHedDataType' INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns' INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns' UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn' - + SIDECAR_HED_USED = 'SIDECAR_HED_USED' + SIDECAR_NA_USED = 'SIDECAR_NA_USED' class SchemaErrors: HED_SCHEMA_DUPLICATE_NODE = 'HED_SCHEMA_DUPLICATE_NODE' diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index 4b90f9b66..72ab0eead 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -8,6 +8,8 @@ class HedExceptions: CANNOT_PARSE_JSON = 'cannotParseJson' INVALID_EXTENSION = 'invalidExtension' + INVALID_DATAFRAME = 'INVALID_DATAFRAME' + # These are actual schema issues, not that the file cannot be found or parsed SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID' HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID' diff --git a/hed/models/__init__.py b/hed/models/__init__.py index 07c044319..3f6d50d56 100644 --- a/hed/models/__init__.py +++ b/hed/models/__init__.py @@ -5,15 +5,12 @@ from 
.column_metadata import ColumnMetadata, ColumnType from .definition_dict import DefinitionDict from .definition_entry import DefinitionEntry -from .def_mapper import DefMapper from .expression_parser import QueryParser from .hed_group import HedGroup from .spreadsheet_input import SpreadsheetInput -from .hed_ops import HedOps from .hed_string import HedString from .hed_string_group import HedStringGroup from .hed_tag import HedTag -from .onset_mapper import OnsetMapper from .sidecar import Sidecar from .tabular_input import TabularInput from .timeseries_input import TimeseriesInput diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 33a35a96a..869bc4ea6 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -1,19 +1,12 @@ +import re import os + import openpyxl import pandas -import copy -from hed.models.definition_dict import DefinitionDict from hed.models.column_mapper import ColumnMapper from hed.errors.exceptions import HedFileError, HedExceptions -from hed.errors.error_types import ErrorContext, ErrorSeverity from hed.errors.error_reporter import ErrorHandler -from hed.models import model_constants -from hed.models.hed_ops import translate_ops -from hed.models.onset_mapper import OnsetMapper -from hed.models.hed_string import HedString -from hed.models.hed_string_group import HedStringGroup -from hed.models.def_mapper import DefMapper class BaseInput: @@ -27,8 +20,8 @@ class BaseInput: TAB_DELIMITER = '\t' COMMA_DELIMITER = ',' - def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, def_mapper=None, - definition_columns=None, name=None, allow_blank_names=True, hed_schema=None): + def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, name=None, + allow_blank_names=True): """ Constructor for the BaseInput class. 
Parameters: @@ -40,10 +33,8 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T has_column_names (bool): True if file has column names. This value is ignored if you pass in a pandas dataframe. mapper (ColumnMapper or None): Indicates which columns have HED tags. - definition_columns(list or None): A list of columns to check for definitions. Explicit 'None' means all. name (str or None): Optional field for how this file will report errors. allow_blank_names(bool): If True, column names can be blank - hed_schema(HedSchema or None): The schema to use by default in identifying tags Notes: - See SpreadsheetInput or TabularInput for examples of how to use built-in a ColumnMapper. @@ -51,17 +42,11 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T if mapper is None: mapper = ColumnMapper() self._mapper = mapper - if def_mapper is None: - def_mapper = DefMapper(mapper.get_def_dicts()) - self._def_mapper = def_mapper self._has_column_names = has_column_names self._name = name - # This is the loaded workbook if we loaded originally from an excel file. + # This is the loaded workbook if we loaded originally from an Excel file. 
self._loaded_workbook = None self._worksheet_name = worksheet_name - self._def_columns = definition_columns - self._schema = hed_schema - self.file_def_dict = None pandas_header = 0 if not self._has_column_names: pandas_header = None @@ -82,7 +67,9 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, - dtype=str, keep_default_na=False, na_values=None) + dtype=str, keep_default_na=True, na_values=None) + # Convert nan values to a known value + self._dataframe = self._dataframe.fillna("n/a") elif input_type in self.EXCEL_EXTENSION: self._loaded_workbook = openpyxl.load_workbook(file) loaded_worksheet = self.get_worksheet(self._worksheet_name) @@ -90,8 +77,11 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T else: raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file) - column_issues = ColumnMapper.validate_column_map(self.columns, - allow_blank_names=allow_blank_names) + if self._dataframe.size == 0: + raise HedFileError(HedExceptions.INVALID_DATAFRAME, "Invalid dataframe(malformed datafile, etc)", file) + + # todo: Can we get rid of this behavior now that we're using pandas? + column_issues = ColumnMapper.validate_column_map(self.columns, allow_blank_names=allow_blank_names) if column_issues: raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.", self.name, issues=column_issues) @@ -113,15 +103,29 @@ def reset_mapper(self, new_mapper): columns = self._dataframe.columns self._mapper.set_column_map(columns) - self.file_def_dict = self.extract_definitions() - - self.update_definition_mapper(self.file_def_dict) - @property def dataframe(self): """ The underlying dataframe. 
""" return self._dataframe + @property + def dataframe_a(self): + """Return the assembled dataframe + Probably a placeholder name. + + Returns: + Dataframe: the assembled dataframe""" + return self.assemble() + + @property + def series_a(self): + """Return the assembled dataframe as a series + Probably a placeholder name. + + Returns: + Series: the assembled dataframe with columns merged""" + return self.combine_dataframe(self.assemble()) + @property def name(self): """ Name of the data. """ @@ -142,125 +146,101 @@ def worksheet_name(self): """ The worksheet name. """ return self._worksheet_name - def get_definitions(self, as_strings=False): - if as_strings: - return DefinitionDict.get_as_strings(self._def_mapper.gathered_defs) - else: - return self._def_mapper - - def _convert_to_form(self, hed_schema, tag_form, error_handler): - """ Convert all tags to the specified form. + def convert_to_form(self, hed_schema, tag_form): + """ Convert all tags in underlying dataframe to the specified form. Parameters: - hed_schema (HedSchema or None): The schema to use to convert tags. - If None, uses the one used to open the file. - tag_form (str): The form to convert the tags to (short_tag, long_tag, base_tag, etc). - error_handler (ErrorHandler or None): The error handler to use for context or default if none. + hed_schema (HedSchema): The schema to use to convert tags. + tag_form(str): HedTag property to convert tags to. + Most cases should use convert_to_short or convert_to_long below. + """ + from hed.models.df_util import convert_to_form + convert_to_form(self._dataframe, hed_schema, tag_form, self._mapper.get_tag_columns()) - Returns: - dict: A list of issue dictionaries corresponding to issues found during conversion. + def convert_to_short(self, hed_schema): + """ Convert all tags in underlying dataframe to short form. + Parameters: + hed_schema (HedSchema): The schema to use to convert tags. 
""" - error_list = [] - if hed_schema is None: - hed_schema = self._schema - if hed_schema is None: - raise ValueError("Cannot convert between tag forms without a schema.") - for row_number, row_dict in enumerate(self.iter_dataframe(hed_ops=hed_schema, - return_string_only=False, - remove_definitions=False, - requested_columns=self._mapper.get_tag_columns(), - error_handler=error_handler)): - column_to_hed_tags_dictionary = row_dict[model_constants.COLUMN_TO_HED_TAGS] - error_list += row_dict[model_constants.ROW_ISSUES] - for column_number in column_to_hed_tags_dictionary: - column_hed_string = column_to_hed_tags_dictionary[column_number] - self.set_cell(row_number, column_number, column_hed_string, - include_column_prefix_if_exist=False, tag_form=tag_form) - - return error_list - - def convert_to_short(self, hed_schema=None, error_handler=None): - """ Convert all tags to short form. + return self.convert_to_form(hed_schema, "short_tag") + + def convert_to_long(self, hed_schema): + """ Convert all tags in underlying dataframe to long form. Parameters: hed_schema (HedSchema or None): The schema to use to convert tags. - If None, uses the one used to open the file. - error_handler (ErrorHandler): The error handler to use for context, uses a default if none. - - Returns: - dict: A list of issue dictionaries corresponding to issues found during conversion. - """ - return self._convert_to_form(hed_schema, "short_tag", error_handler) + return self.convert_to_form(hed_schema, "long_tag") - def convert_to_long(self, hed_schema=None, error_handler=None): - """ Convert all tags to long form. + def shrink_defs(self, hed_schema): + """ Shrinks any def-expand found in the underlying dataframe. Parameters: - hed_schema (HedSchema or None): The schema to use to convert tags. - If None, uses the one used to open the file. - error_handler (ErrorHandler): The error handler to use for context, uses a default if none. 
+            hed_schema (HedSchema or None): The schema to use to identify defs
+        """
+        from hed.models.df_util import shrink_defs
+        shrink_defs(self._dataframe, hed_schema=hed_schema, columns=self._mapper.get_tag_columns())
 
-        Returns:
-            dict: A list of issue dictionaries corresponding to issues found during conversion.
+    def expand_defs(self, hed_schema, def_dict):
+        """ Expands any def tags found in the underlying dataframe.
+        Parameters:
+            hed_schema (HedSchema or None): The schema to use to identify defs
+            def_dict (DefinitionDict): The definitions to expand
         """
-        return self._convert_to_form(hed_schema, "long_tag", error_handler)
+        from hed.models.df_util import expand_defs
+        expand_defs(self._dataframe, hed_schema=hed_schema, def_dict=def_dict, columns=self._mapper.get_tag_columns())
 
-    def to_excel(self, file, output_processed_file=False):
+    def to_excel(self, file, output_assembled=False):
         """ Output to an Excel file.
 
         Parameters:
             file (str or file-like): Location to save this base input.
-            output_processed_file (bool): If True, replace definitions and labels in HED columns.
-                Also fills in things like categories.
+            output_assembled (bool): Plug in categories and values from the sidecar directly.
 
         Raises:
-            HedFileError if empty file object or file cannot be opened.
+            ValueError: if empty file object or file cannot be opened.
 
         """
         if not file:
             raise ValueError("Empty file name or object passed in to BaseInput.save.")
 
-        # For now just make a copy if we want to save a formatted copy. Could optimize this further.
- if output_processed_file: - output_file = self._get_processed_copy() - else: - output_file = self + dataframe = self._dataframe + + if output_assembled: + dataframe = self.dataframe_a if self._loaded_workbook: old_worksheet = self.get_worksheet(self._worksheet_name) - # excel spreadsheets are 1 based, then add another 1 for column names if present + # Excel spreadsheets are 1 based, then add another 1 for column names if present adj_row_for_col_names = 1 if self._has_column_names: adj_row_for_col_names += 1 adj_for_one_based_cols = 1 - for row_number, text_file_row in output_file._dataframe.iterrows(): + for row_number, text_file_row in dataframe.iterrows(): for column_number, column_text in enumerate(text_file_row): old_worksheet.cell(row_number + adj_row_for_col_names, column_number + adj_for_one_based_cols).value = \ - output_file._dataframe.iloc[row_number, column_number] + dataframe.iloc[row_number, column_number] self._loaded_workbook.save(file) else: - output_file._dataframe.to_excel(file, header=self._has_column_names) + dataframe.to_excel(file, header=self._has_column_names) - def to_csv(self, file=None, output_processed_file=False): + def to_csv(self, file=None, output_assembled=False): """ Write to file or return as a string. Parameters: file (str, file-like, or None): Location to save this file. If None, return as string. - output_processed_file (bool): Replace all definitions and labels in HED columns as appropriate. - Also fills in things like categories. + output_assembled (bool): Plug in categories and values from the sidecar directly. Returns: None or str: None if file is given or the contents as a str if file is None. """ - # For now just make a copy if we want to save a formatted copy. Could optimize this further. 
- if output_processed_file: - output_file = self._get_processed_copy() - else: - output_file = self - csv_string_if_filename_none = output_file._dataframe.to_csv(file, '\t', index=False, - header=output_file._has_column_names) + dataframe = self._dataframe + + if output_assembled: + dataframe = self.dataframe_a + + csv_string_if_filename_none = dataframe.to_csv(file, '\t', index=False, header=self._has_column_names) return csv_string_if_filename_none @property @@ -277,118 +257,32 @@ def columns(self): columns = list(self._dataframe.columns) return columns - @property - def def_dict(self): - """ Returns a dict of all the definitions found in this and sidecars + def column_metadata(self): + """Get the metadata for each column Returns: - def_dict(dict): {str: DefinitionEntry} pairs for each found definition + dict: number/ColumnMeta pairs """ - if self._def_mapper: - return self._def_mapper.gathered_defs + if self._mapper: + return self._mapper._final_column_map return {} - def __iter__(self): - """ Iterate over the underlying dataframe. """ - return self.iter_dataframe() - - def iter_dataframe(self, hed_ops=None, mapper=None, requested_columns=None, return_string_only=True, - run_string_ops_on_columns=False, error_handler=None, expand_defs=False, remove_definitions=True, - **kwargs): - """ Iterate rows based on the given column mapper. - - Parameters: - hed_ops (list, func, HedOps, or None): A func, a HedOps or a list of these to apply to the - hed strings before returning. - mapper (ColumnMapper or None): The column name to column number mapper (or internal mapper if None). - requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed. - return_string_only (bool): If True, do not return issues list, individual columns, attribute columns, etc. - run_string_ops_on_columns (bool): If true, run all tag and string ops on columns, - rather than columns then rows. 
- error_handler (ErrorHandler or None): The error handler to use for context or a default if None. - expand_defs (bool): If True, expand def tags into def-expand groups. - remove_definitions (bool): If true, remove all definition tags found. - kwargs (kwargs): See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Yields: - dict: A dict with parsed row, including keys: "HED", "column_to_hed_tags", and possibly "column_issues". - - """ - if error_handler is None: - error_handler = ErrorHandler() - - if mapper is None: - mapper = self._mapper - - if requested_columns: - # Make a copy to ensure we don't alter the actual mapper - mapper = copy.deepcopy(mapper) - mapper.set_requested_columns(requested_columns) - - tag_funcs, string_funcs = self._translate_ops(hed_ops, run_string_ops_on_columns=run_string_ops_on_columns, - expand_defs=expand_defs, remove_definitions=remove_definitions, - error_handler=error_handler, **kwargs) - - # Iter tuples is ~ 25% faster compared to iterrows in our use case - for row_number, text_file_row in enumerate(self._dataframe.itertuples(index=False)): - error_handler.push_error_context(ErrorContext.ROW, row_number) - yield self._expand_row_internal(text_file_row, tag_funcs, string_funcs, - error_handler=error_handler, - mapper=mapper, return_string_only=return_string_only) - error_handler.pop_error_context() - - def _expand_row_internal(self, text_file_row, tag_funcs, string_funcs, error_handler, - mapper=None, return_string_only=False): - row_dict = mapper.expand_row_tags(text_file_row) - column_to_hed_tags = row_dict[model_constants.COLUMN_TO_HED_TAGS] - expansion_column_issues = row_dict.get(model_constants.COLUMN_ISSUES, {}) - - row_issues = [] - if tag_funcs: - row_issues += self._run_column_ops(column_to_hed_tags, tag_funcs, - expansion_column_issues, - error_handler) - - # Return a combined string if we're also returning columns. 
- if not return_string_only: - final_hed_string = HedStringGroup(column_to_hed_tags.values()) - else: - final_hed_string = HedString.from_hed_strings(contents=column_to_hed_tags.values()) - - if string_funcs: - row_issues += self._run_row_ops(final_hed_string, string_funcs, error_handler) - - if not return_string_only: - row_dict[model_constants.ROW_ISSUES] = row_issues - row_dict[model_constants.ROW_HED_STRING] = final_hed_string - return row_dict - # Return a HedString rather than a HedStringGroup - return final_hed_string - - def set_cell(self, row_number, column_number, new_string_obj, include_column_prefix_if_exist=False, - tag_form="short_tag"): + def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_tag"): """ Replace the specified cell with transformed text. Parameters: row_number (int): The row number of the spreadsheet to set. column_number (int): The column number of the spreadsheet to set. new_string_obj (HedString): Object with text to put in the given cell. - include_column_prefix_if_exist (bool): If True and the column matches one from mapper - _column_prefix_dictionary, remove the prefix. tag_form (str): Version of the tags (short_tag, long_tag, base_tag, etc) Notes: Any attribute of a HedTag that returns a string is a valid value of tag_form. - """ if self._dataframe is None: raise ValueError("No data frame loaded") - transform_func = None - if not include_column_prefix_if_exist: - transform_func = self._mapper.get_prefix_remove_func(column_number) - - new_text = new_string_obj.get_as_form(tag_form, transform_func) + new_text = new_string_obj.get_as_form(tag_form) self._dataframe.iloc[row_number, column_number] = new_text def get_worksheet(self, worksheet_name=None): @@ -412,47 +306,6 @@ def get_worksheet(self, worksheet_name=None): else: return None - def get_def_and_mapper_issues(self, error_handler, check_for_warnings=False): - """ Return definition and column issues. 
- - Parameters: - error_handler (ErrorHandler): The error handler to use. - check_for_warnings (bool): If True check for and return warnings as well as errors. - - Returns: - dict: A list of definition and mapping issues. Each issue is a dictionary. - - """ - issues = [] - issues += self.file_def_dict.get_definition_issues() - - # Gather any issues from the mapper for things like missing columns. - mapper_issues = self._mapper.get_column_mapping_issues() - error_handler.add_context_to_issues(mapper_issues) - issues += mapper_issues - if not check_for_warnings: - issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR) - return issues - - def _get_processed_copy(self): - """ Return a processed copy of this file. - - Returns: - BaseInput: The copy. - - Notes: - Processing includes definitions replaced, columns expanded, etc. - - """ - output_file = copy.deepcopy(self) - for row_number, row_dict in enumerate(self.iter_dataframe(return_string_only=False)): - column_to_hed_tags_dictionary = row_dict[model_constants.COLUMN_TO_HED_TAGS] - for column_number in column_to_hed_tags_dictionary: - new_text = column_to_hed_tags_dictionary[column_number] - output_file.set_cell(row_number, column_number, new_text, tag_form="short_tag") - - return output_file - @staticmethod def _get_dataframe_from_worksheet(worksheet, has_headers): """ Create a dataframe from the worksheet. 
@@ -474,139 +327,91 @@ def _get_dataframe_from_worksheet(worksheet, has_headers): else: return pandas.DataFrame(worksheet.values, dtype=str) - def _run_validators(self, hed_ops, error_handler, expand_defs=False, **kwargs): - validation_issues = [] - for row_dict in self.iter_dataframe(hed_ops=hed_ops, - return_string_only=False, - error_handler=error_handler, expand_defs=expand_defs, - **kwargs): - validation_issues += row_dict[model_constants.ROW_ISSUES] - - return validation_issues - - def _run_column_ops(self, column_to_hed_tags_dictionary, column_ops, expansion_column_issues, error_handler): - validation_issues = [] - if column_to_hed_tags_dictionary: - for column_number, column_hed_string in column_to_hed_tags_dictionary.items(): - new_column_issues = [] - error_handler.push_error_context(ErrorContext.COLUMN, column_number) - if column_hed_string is not None: - error_handler.push_error_context(ErrorContext.HED_STRING, column_hed_string, - increment_depth_after=False) - if column_number in expansion_column_issues: - new_column_issues += expansion_column_issues[column_number] - - if column_hed_string is not None: - new_column_issues += column_hed_string.apply_funcs(column_ops) - error_handler.add_context_to_issues(new_column_issues) - if column_hed_string is not None: - error_handler.pop_error_context() - error_handler.pop_error_context() - validation_issues += new_column_issues - - return validation_issues - - def _run_row_ops(self, row_hed_string, row_ops, error_handler): - error_handler.push_error_context(ErrorContext.HED_STRING, row_hed_string, increment_depth_after=False) - row_issues = row_hed_string.apply_funcs(row_ops) - error_handler.add_context_to_issues(row_issues) - error_handler.pop_error_context() - return row_issues - - def validate_file(self, hed_ops, name=None, error_handler=None, check_for_warnings=True, **kwargs): - """ Run the hed_ops on columns and rows. 
+    def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None):
+        """Creates a SpreadsheetValidator and returns all issues with this file
 
         Parameters:
-            hed_ops (func, HedOps, or list of func and/or HedOps): The HedOps of funcs to apply.
-            name (str): If present, use this as the filename for context, rather than using the actual filename
-                Useful for temp filenames.
-            error_handler (ErrorHandler or None): Used to report errors a default one if None.
-            check_for_warnings (bool): If True check for and return warnings as well as errors.
-            kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options.
-
+            hed_schema(HedSchema): The schema to use for validation
+            extra_def_dicts(list of DefDict or DefDict): all definitions to use for validation
+            name(str): The name to report errors from this file as
+            error_handler (ErrorHandler): Error context to use. Creates a new one if None
         Returns:
-            list: The list of validation issues found. The list elements are dictionaries.
-
+            issues (list of dict): A list of issues for hed string
         """
+        from hed.validator.spreadsheet_validator import SpreadsheetValidator
         if not name:
             name = self.name
-        if not isinstance(hed_ops, list):
-            hed_ops = [hed_ops]
-
-        if error_handler is None:
-            error_handler = ErrorHandler()
-
-        error_handler.push_error_context(ErrorContext.FILE_NAME, name)
-        validation_issues = self.get_def_and_mapper_issues(error_handler, check_for_warnings=check_for_warnings)
-        validation_issues += self._run_validators(hed_ops, error_handler=error_handler,
-                                                  check_for_warnings=check_for_warnings, **kwargs)
-        error_handler.pop_error_context()
-
+        tab_validator = SpreadsheetValidator(hed_schema)
+        validation_issues = tab_validator.validate(self, self._mapper.get_def_dict(hed_schema, extra_def_dicts), name,
+                                                   error_handler=error_handler)
         return validation_issues
 
-    def extract_definitions(self, error_handler=None):
-        """ Gather and validate all definitions.
+ @staticmethod + def _dataframe_has_names(dataframe): + for column in dataframe.columns: + if isinstance(column, str): + return True + return False + + def assemble(self, mapper=None): + """ Assembles the hed strings Parameters: - error_handler (ErrorHandler): The error handler to use for context or a default if None. + mapper(ColumnMapper or None): Generally pass none here unless you want special behavior. Returns: - DefinitionDict: Contains all the definitions located in the file. - + Dataframe: the assembled dataframe """ - if error_handler is None: - error_handler = ErrorHandler() - new_def_dict = DefinitionDict() - hed_ops = [self._schema, new_def_dict] - for _ in self.iter_dataframe(hed_ops=hed_ops, - return_string_only=False, - requested_columns=self._def_columns, - run_string_ops_on_columns=True, - remove_definitions=False, - error_handler=error_handler): - pass - - return new_def_dict - - def update_definition_mapper(self, def_dict): - """ Add definitions from dict(s) if mapper exists. 
+ if mapper is None: + mapper = self._mapper + import pandas as pd + transformers, need_categorical = mapper.get_transformers() + if not transformers: + return None + all_columns = self._dataframe + if need_categorical: + all_columns[need_categorical] = all_columns[need_categorical].astype('category') + + all_columns = all_columns.transform(transformers) + + possible_column_references = [f"{column_name}" for column_name in self.columns if + column_name.lower() != "hed"] + found_column_references = [] + for column_name in all_columns: + df = all_columns[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) + u_vals = pd.Series([j for i in df for j in i], dtype=str) + u_vals = u_vals.unique() + for val in u_vals: + if val not in found_column_references: + found_column_references.append(val) + + valid_replacements = [col for col in found_column_references if col in possible_column_references] + + column_names = list(transformers.keys()) + for column_name in valid_replacements: + column_names.remove(column_name) + saved_columns = all_columns[valid_replacements] + for column_name in column_names: + for replacing_name in valid_replacements: + column_name_brackets = f"[{replacing_name}]" + all_columns[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y + in zip(all_columns[column_name], saved_columns[replacing_name])) + all_columns = all_columns[column_names] + + return all_columns + + @staticmethod + def combine_dataframe(dataframe): + """ Combines all columns in the given dataframe into a single hed string series. Parameters: - def_dict (list or DefinitionDict): Add the DefDict or list of DefDict to the internal definition mapper. 
+ dataframe(Dataframe): The dataframe to combine + Returns: + Series: the assembled series """ - if self._def_mapper is not None: - self._def_mapper.add_definitions(def_dict) - - def _translate_ops(self, hed_ops, run_string_ops_on_columns, expand_defs, remove_definitions, **kwargs): - - tag_funcs = [] - string_funcs = [] - if hed_ops or expand_defs or remove_definitions: - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - hed_ops = hed_ops.copy() - if not run_string_ops_on_columns: - self._add_def_onset_mapper(hed_ops) - tag_funcs, string_funcs = translate_ops(hed_ops, split_ops=True, hed_schema=self._schema, - expand_defs=expand_defs, - remove_definitions=remove_definitions, - **kwargs) - else: - tag_funcs = translate_ops(hed_ops, hed_schema=self._schema, expand_defs=expand_defs, **kwargs) - - return tag_funcs, string_funcs - - def _add_def_onset_mapper(self, hed_ops): - if not any(isinstance(hed_op, DefMapper) for hed_op in hed_ops): - if self._def_mapper: - hed_ops.append(self._def_mapper) - hed_ops.append(OnsetMapper(self._def_mapper)) - return hed_ops + dataframe = dataframe.agg(', '.join, axis=1) - @staticmethod - def _dataframe_has_names(dataframe): - for column in dataframe.columns: - if isinstance(column, str): - return True - return False + # Potentially better ways to handle removing n/a by never inserting them to begin with. 
+ dataframe = dataframe.replace("(, n/a|n/a,)", "", regex=True) + return dataframe diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index f6fd12edb..3c4c87a63 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -1,13 +1,10 @@ from hed.models.column_metadata import ColumnMetadata, ColumnType from hed.models.sidecar import Sidecar -from hed.models.hed_string import HedString -from hed.models import model_constants from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import ValidationErrors import copy - PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " @@ -27,6 +24,9 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None Sidecar column definitions will take precedent if there is a conflict with tag_columns. column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag prefixes to prepend to the tags in that column before processing. + May be deprecated. These are no longer prefixes, but rather converted to value columns. + eg. {"key": "Description"} will turn into a value column as {"key": "Description/#"} + This means it no longer accepts anything but the value portion only in the columns. optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. requested_columns (list or None): A list of columns you wish to retrieve. 
@@ -64,11 +64,41 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None self.set_requested_columns(requested_columns, False) self.set_tag_columns(tag_columns, optional_tag_columns, False) - self.set_column_prefix_dict(column_prefix_dictionary, False) + self._add_value_columns(column_prefix_dictionary) # finalize the column map based on initial settings with no header self._finalize_mapping() + def get_transformers(self): + """ Return the transformers to use on a dataframe + + """ + final_transformers = {} + need_categorical = [] + for column in self._final_column_map.values(): + assign_to_column = column.column_name + if isinstance(assign_to_column, int): + if self._column_map: + assign_to_column = self._column_map[assign_to_column - 1] + else: + assign_to_column = assign_to_column - 1 + if column.column_type == ColumnType.Ignore: + continue + elif column.column_type == ColumnType.Value: + value_str = column._hed_dict + from functools import partial + final_transformers[assign_to_column] = partial(self._value_handler, value_str) + elif column.column_type == ColumnType.Categorical: + need_categorical.append(column.column_name) + category_values = column._hed_dict + from functools import partial + final_transformers[assign_to_column] = partial(self._category_handler, category_values) + else: + final_transformers[assign_to_column] = lambda x: x + # print(column.column_type) + + return final_transformers, need_categorical + @staticmethod def validate_column_map(column_map, allow_blank_names): """ Validate there are no issues with column names. 
@@ -89,10 +119,10 @@ def validate_column_map(column_map, allow_blank_names): if name is None or name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): issues += ErrorHandler.format_error(ValidationErrors.HED_BLANK_COLUMN, column_number) continue - if name in used_names: - # todo: Add this check once it's more fleshed out - # issues += ErrorHandler.format_error(ValidationErrors.HED_DUPLICATE_COLUMN, name) - continue + # if name in used_names: + # # todo: Add this check once it's more fleshed out + # issues += ErrorHandler.format_error(ValidationErrors.HED_DUPLICATE_COLUMN, name) + # continue used_names.add(name) return issues @@ -116,34 +146,18 @@ def _set_sidecar(self, sidecar): self._sidecar = sidecar def get_tag_columns(self): - """ Returns the column numbers that are mapped to be HedTags + """ Returns the column numbers or names that are mapped to be HedTags Note: This is NOT the tag_columns or optional_tag_columns parameter, though they set it. Returns: - column_numbers(list): A list of column numbers that are ColumnType.HedTags + column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. + 0-based if integer-based, otherwise column name. """ - return [number for number, column_entry in self._final_column_map.items() + return [column_entry.column_name - 1 if isinstance(column_entry.column_name, int) else column_entry.column_name + for number, column_entry in self._final_column_map.items() if column_entry.column_type == ColumnType.HEDTags] - def set_column_prefix_dict(self, column_prefix_dictionary, finalize_mapping=True): - """ Replace the column prefix dictionary - - Parameters: - column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag - prefixes to prepend to the tags in that column before processing. - finalize_mapping (bool): Re-generate the internal mapping if True, otherwise no effect until finalize. - - Returns: - list: List of issues that occurred during this process. 
Each issue is a dictionary. - - """ - if column_prefix_dictionary: - self._column_prefix_dictionary = column_prefix_dictionary - if finalize_mapping: - return self._finalize_mapping() - return [] - def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_mapping=True): """ Set tag columns and optional tag columns @@ -222,88 +236,15 @@ def add_columns(self, column_names_or_numbers, column_type=ColumnType.HEDTags): new_def = ColumnMetadata(column_type, column_name) self._add_column_data(new_def) - def _expand_column(self, column_number, input_text): - """ Expand the specified text based on the rules for expanding the specified column. - - Parameters: - column_number (int): The column number this text should be treated as from. - input_text (str): The text to expand, generally from a single cell of a spreadsheet. - - Returns: - str or None: The text after expansion or None if this column is undefined or the given text is null. - False or str: Depends on the value of first return value. If None, this is an error message. - If string, this is an attribute name that should be stored separately. - - """ - - # Default 1-1 mapping if we don't have specific behavior. - if self._no_mapping_info: - return HedString(input_text), False - - # If no entry, ignore this column. - if column_number not in self._final_column_map: - return None, False - - if not input_text or input_text in self._na_patterns: - return None, False - - column_entry = self._final_column_map[column_number] - return column_entry.expand(input_text) - - def expand_row_tags(self, row_text): - """ Expand all mapped columns for row. - - Parameters: - row_text (list): The text for the given row, one list entry per column number. - - Returns: - dict: A dictionary containing the keys COLUMN_TO_HED_TAGS, COLUMN_ISSUES. - - Notes: - - The "column_to_hed_tags" is each expanded column given separately as a list of HedStrings. - - Attributes are any column identified as an attribute. 
- They will appear in the return value as {attribute_name: value_of_column} - - """ - result_dict = {} - column_to_hed_tags_dictionary = {} - column_issues_dict = {} - for column_number, cell_text in enumerate(row_text): - translated_column, translation_errors = self._expand_column(column_number, str(cell_text)) - if translated_column is None: - if translation_errors: - if column_number not in column_issues_dict: - column_issues_dict[column_number] = [] - column_issues_dict[column_number] += translation_errors - column_to_hed_tags_dictionary[column_number] = translated_column - continue - - column_to_hed_tags_dictionary[column_number] = translated_column - - result_dict[model_constants.COLUMN_TO_HED_TAGS] = column_to_hed_tags_dictionary - if column_issues_dict: - result_dict[model_constants.COLUMN_ISSUES] = column_issues_dict - - return result_dict - - def get_prefix_remove_func(self, column_number): - """ Return a function to removes name prefixes for column - - Parameters: - column_number (int): Column number to look up in the prefix dictionary. - - Returns: - func: A function taking a tag and string, returning a string. - - """ - if column_number not in self._final_column_map: - return None - - entry = self._final_column_map[column_number] - if not entry.column_prefix: - return None - - return entry.remove_prefix + def _add_value_columns(self, column_prefix_dictionary): + if column_prefix_dictionary: + for col, prefix in column_prefix_dictionary.items(): + if prefix.endswith("/"): + prefix = prefix + "#" + else: + prefix = prefix + "/#" + new_def = ColumnMetadata(ColumnType.Value, col, hed_dict=prefix) + self._add_column_data(new_def) def _add_column_data(self, new_column_entry): """ Add the metadata of a column to this column mapper. 
@@ -318,34 +259,6 @@ def _add_column_data(self, new_column_entry): column_name = new_column_entry.column_name self.column_data[column_name] = copy.deepcopy(new_column_entry) - @staticmethod - def _set_column_prefix(final_map, column_number, new_required_prefix): - """ Internal function to add this as a required name_prefix to a column - - Parameters: - final_map (dict): {column_number:prefix} Dict of column numbers with prefixes - column_number (int): The column number with this name_prefix. - new_required_prefix (str): The name_prefix to add to the column when loading from a spreadsheet. - - Raises: - TypeError if column number is passed as a str rather an int. - - Notes: - If the column is not known to the mapper, it will be added as a HEDTags column. - - """ - if isinstance(column_number, str): - raise TypeError("Must pass in a column number not column_name to _set_column_prefix") - if column_number not in final_map: - column_entry = ColumnMetadata(ColumnType.HEDTags) - final_map[column_number] = column_entry - else: - column_entry = final_map[column_number] - - column_entry.column_prefix = new_required_prefix - if column_entry.column_type is None or column_entry.column_type == ColumnType.Ignore: - column_entry.column_type = ColumnType.HEDTags - @staticmethod def _get_basic_final_map(column_map, column_data): basic_final_map = {} @@ -456,15 +369,14 @@ def _finalize_mapping(self): issues += self._add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns, self._warn_on_missing_column) - # Add prefixes - for column_number, prefix in self._column_prefix_dictionary.items(): - self._set_column_prefix(final_map, column_number, prefix) - issues += ColumnMapper.validate_column_map(self._column_map.values(), allow_blank_names=False) self._final_column_map = self._filter_by_requested(final_map, self._requested_columns) + # Make sure this new dict is sorted + self._final_column_map = dict(sorted(final_map.items())) self._no_mapping_info = not 
self._check_if_mapping_info() + self._finalize_mapping_issues = issues return issues @@ -479,15 +391,19 @@ def _column_name_requested(self, column_name): return True return column_name in self._requested_columns - def get_def_dicts(self): + def get_def_dict(self, hed_schema=None, extra_def_dicts=None): """ Return def dicts from every column description. - Returns: - list: A list of DefinitionDict objects corresponding to each column entry. + Parameters: + hed_schema (Schema or None): A HED schema object to use for extracting definitions. + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. + Returns: + DefinitionDict: A single definition dict representing all the data(and extra def dicts) """ if self._sidecar: - return self._sidecar.get_def_dicts() + return self._sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) + return [] def get_column_mapping_issues(self): @@ -498,3 +414,14 @@ def get_column_mapping_issues(self): """ return self._finalize_mapping_issues + + @staticmethod + def _category_handler(category_values, x): + return category_values.get(x, "") + + @staticmethod + def _value_handler(value_str, x): + if x == "n/a": + return "n/a" + + return value_str.replace("#", str(x)) diff --git a/hed/models/column_metadata.py b/hed/models/column_metadata.py index 3921b5b82..ecdc76f08 100644 --- a/hed/models/column_metadata.py +++ b/hed/models/column_metadata.py @@ -1,11 +1,9 @@ from enum import Enum -from hed.models.hed_string import HedString -from hed.errors.error_types import SidecarErrors, ValidationErrors -from hed.errors.error_reporter import ErrorHandler +from hed.errors.error_types import SidecarErrors class ColumnType(Enum): - """ The overall column_type of a column in column mapper, eg treat it as HED tags. + """ The overall column_type of a column in column mapper, e.g. treat it as HED tags. 
Mostly internal to column mapper related code """ @@ -14,7 +12,7 @@ class ColumnType(Enum): Ignore = "ignore" # This column is a category with a list of possible values to replace with hed strings. Categorical = "categorical" - # This column has a value(eg filename) that is added to a hed tag in place of a # sign. + # This column has a value(e.g. filename) that is added to a hed tag in place of a # sign. Value = "value" # Return this column exactly as given, it is HED tags. HEDTags = "hed_tags" @@ -58,105 +56,6 @@ def hed_dict(self): """ return self._hed_dict - def _get_category_hed_string(self, category): - """ Fetch the hed string for a category key. - - Parameters: - category (str): The category key to retrieve the string from. - - Returns: - str: The hed string for a given category entry in a category column. - - """ - if self.column_type != ColumnType.Categorical: - return None - - return self._hed_dict.get(category, None) - - def _get_value_hed_string(self): - """ Fetch the hed string in a value column. - - Returns: - str: The hed string for a given value column. - - """ - if self.column_type != ColumnType.Value: - return None - - return self._hed_dict - - def expand(self, input_text): - """ Expand text using the rules for this column. - - Parameters: - input_text (str): Text to expand (generally from a single cell in a spreadsheet). - - Returns: - str or None: The expanded column as a hed_string. - str or dict: If this is a string, contains the name of this column - as an attribute. If the first return value is None, this is an error message dictionary. - - Notes: - - Examples are adding name_prefix, inserting a column hed_string from a category key, etc. 
- - """ - column_type = self.column_type - - if column_type == ColumnType.Categorical: - final_text = self._get_category_hed_string(input_text) - if final_text: - return HedString(final_text), False - else: - return None, ErrorHandler.format_error(ValidationErrors.HED_SIDECAR_KEY_MISSING, invalid_key=input_text, - category_keys=list(self._hed_dict.keys())) - elif column_type == ColumnType.Value: - prelim_text = self._get_value_hed_string() - final_text = prelim_text.replace("#", input_text) - return HedString(final_text), False - elif column_type == ColumnType.HEDTags: - hed_string_obj = HedString(input_text) - self._prepend_required_prefix(hed_string_obj, self.column_prefix) - return hed_string_obj, False - elif column_type == ColumnType.Ignore: - return None, False - - return None, {"error_type": "INTERNAL_ERROR"} - - @staticmethod - def _prepend_required_prefix(required_tag_column_tags, required_tag_prefix): - """ Prepend the tag paths to the required tag column tags that need them. - - Parameters: - required_tag_column_tags (HedString): A string containing HED tags associated with a - required tag column that may need a tag name_prefix prepended to its tags. - required_tag_prefix (str): A string that will be added if missing to any given tag. - """ - if not required_tag_prefix: - return required_tag_column_tags - - for tag in required_tag_column_tags.get_all_tags(): - tag.add_prefix_if_needed(required_tag_prefix) - - return required_tag_column_tags - - def remove_prefix(self, original_tag, current_tag_text): - """ Remove column_prefix if present from tag. - - Parameters: - original_tag (HedTag): The original hed tag being written. - current_tag_text (str): A single tag as a string, in any form. 
- - Returns: - str: current_tag_text with required prefixes removed - """ - prefix_to_remove = self.column_prefix - if not prefix_to_remove: - return current_tag_text - - if current_tag_text.lower().startswith(prefix_to_remove.lower()): - current_tag_text = current_tag_text[len(prefix_to_remove):] - return current_tag_text - @staticmethod def expected_pound_sign_count(column_type): """ Return how many pound signs a column string should have. diff --git a/hed/models/def_mapper.py b/hed/models/def_mapper.py deleted file mode 100644 index 98b8bbb43..000000000 --- a/hed/models/def_mapper.py +++ /dev/null @@ -1,255 +0,0 @@ -from hed.models.hed_string import HedString -from hed.models.hed_tag import HedTag -from hed.models.definition_dict import DefinitionDict -from hed.models.model_constants import DefTagNames -from hed.errors.error_types import ValidationErrors, DefinitionErrors -from hed.errors.error_reporter import ErrorHandler -from hed.models.hed_ops import HedOps - -# TODO: should not have print statement when error - - -class DefMapper(HedOps): - """ Handles converting Def/ and Def-expand/. - - Notes: - - The class provides string funcs but no tag funcs when extending HedOps. - - The class can expand or shrink definitions in hed strings via - Def/XXX and (Def-expand/XXX ...). - - """ - - def __init__(self, def_dicts=None): - """ Initialize mapper for definitions in hed strings. - - Parameters: - def_dicts (list or DefinitionDict): DefinitionDicts containing the definitions this mapper - should initialize with. - - Notes: - - More definitions can be added later. - - """ - super().__init__() - self._gathered_defs = {} - # List of def names we want to be able to quickly purge. 
- self._temporary_def_names = set() - self._def_tag_name = DefTagNames.DEFINITION_KEY - self._label_tag_name = DefTagNames.DEF_KEY - # this only gathers issues with duplicate definitions - self._issues = [] - if def_dicts: - self.add_definitions(def_dicts) - - @property - def issues(self): - return self._issues - - @property - def gathered_defs(self): - return self._gathered_defs - - def get_def_entry(self, def_name): - """ Get the definition entry for the definition name. - - Parameters: - def_name (str): Name of the definition to retrieve. - - Returns: - DefinitionEntry: Definition entry for the requested definition. - - """ - - return self._gathered_defs.get(def_name.lower()) - - def clear_temporary_definitions(self): - """ Remove any previously added temporary definitions. """ - for def_name in self._temporary_def_names: - del self._gathered_defs[def_name] - self._temporary_def_names = set() - - def add_definitions_from_string_as_temp(self, hed_string_obj): - """ Add definitions from hed string as temporary. - - Parameters: - hed_string_obj (HedString): Hed string object to search for definitions - - Returns: - list: List of issues due to invalid definitions found in this string. Each issue is a dictionary. - - """ - this_string_def_dict = DefinitionDict() - validation_issues = this_string_def_dict.check_for_definitions(hed_string_obj) - self.add_definitions(this_string_def_dict, add_as_temp=True) - return validation_issues - - def add_definitions(self, def_dicts, add_as_temp=False): - """ Add definitions from dict(s) to mapper - - Parameters: - def_dicts (list or DefinitionDict): DefDict or list of DefDicts whose definitions should be added. - add_as_temp (bool): If true, mark these new definitions as temporary (easily purged). 
- - """ - if not isinstance(def_dicts, list): - def_dicts = [def_dicts] - for def_dict in def_dicts: - if isinstance(def_dict, DefinitionDict): - self._add_definitions_from_dict(def_dict, add_as_temp) - else: - print(f"Invalid input type '{type(def_dict)} passed to DefMapper. Skipping.") - - def _add_definitions_from_dict(self, def_dict, add_as_temp=False): - """ Add the definitions found in the given definition dictionary to this mapper. - - Parameters: - def_dict (DefinitionDict): DefDict whose definitions should be added. - add_as_temp (bool): If true, mark these new definitions as temporary (easily purged). - - """ - for def_tag, def_value in def_dict: - if def_tag in self._gathered_defs: - error_context = self._gathered_defs[def_tag].source_context - self._issues += ErrorHandler.format_error_from_context(DefinitionErrors.DUPLICATE_DEFINITION, - error_context=error_context, - def_name=def_tag) - continue - self._gathered_defs[def_tag] = def_value - if add_as_temp: - self._temporary_def_names.add(def_tag) - - def expand_def_tags(self, hed_string_obj, expand_defs=True, shrink_defs=False): - """ Validate and expand Def/Def-Expand tags. - - Parameters: - hed_string_obj (HedString): The hed string to process. - expand_defs (bool): If true, convert def tags to def-expand tag groups that include definition content. - shrink_defs (bool): If True, replace all def-expand groups with corresponding def tags. - - Returns: - list: Issues found related to validating defs. Each issue is a dictionary. - - Notes: - - This function can optionally expand or shrink Def/ and Def-expand, respectively. - - Usually issues are mismatched placeholders or a missing definition. - - The expand_defs and shrink_defs cannot both be True. - - """ - # First see if the "def" is found at all. This covers def and def-expand. 
- hed_string_lower = hed_string_obj.lower() - if self._label_tag_name not in hed_string_lower: - return [] - - def_issues = [] - # We need to check for labels to expand in ALL groups - for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): - def_contents = self._get_definition_contents(def_tag, def_expand_group, def_issues) - if def_expand_group is def_tag: - if def_contents is not None and expand_defs: - def_tag.short_base_tag = DefTagNames.DEF_EXPAND_ORG_KEY - def_group.replace(def_tag, def_contents) - else: - if def_contents is not None and shrink_defs: - def_tag.short_base_tag = DefTagNames.DEF_ORG_KEY - def_group.replace(def_expand_group, def_tag) - - return def_issues - - def expand_and_remove_definitions(self, hed_string_obj, check_for_definitions=False, expand_defs=True, - shrink_defs=False, remove_definitions=True): - """ Validate and expand Def/Def-Expand tags. - - Also removes definitions - - Parameters: - hed_string_obj (HedString): The string to search for definitions. - check_for_definitions (bool): If True, this will first check the hed string for any definitions. - expand_defs (bool): If True, replace Def tags to Def-expand tag groups. - shrink_defs (bool): If True, replace Def-expand groups with Def tags. - remove_definitions (bool): If true, this will remove all Definition tag groups. - - Returns: - def_issues (list): A list of issues for definition-related tags in this string. Each issue is a dictionary. - - Notes: - - The check_for_definitions is mainly used for individual HedStrings in isolation. - - The defs can be expanded or shrunk, while definitions can be removed. - - This does not validate definitions, it will blindly remove invalid definitions as well. 
- - """ - def_issues = [] - if check_for_definitions: - def_issues += self.add_definitions_from_string_as_temp(hed_string_obj) - def_issues += self.expand_def_tags(hed_string_obj, expand_defs=expand_defs, shrink_defs=shrink_defs) - if remove_definitions: - def_issues += hed_string_obj.remove_definitions() - if check_for_definitions: - self.clear_temporary_definitions() - - return def_issues - - def _get_definition_contents(self, def_tag, def_expand_group, def_issues): - """ Check for issues with expanding a tag from Def to a Def-expand tag group and return the expanded tag group. - - Parameters: - def_tag (HedTag): Source hed tag that may be a Def or Def-expand tag. - def_expand_group (HedGroup or HedTag): - Source group for this def-expand tag. Same as def_tag if this is not a def-expand tag. - def_issues : [{}] - List of issues to append any new issues to - - Returns: - def_contents: [HedTag or HedGroup] - The contents to replace the previous def-tag with. - """ - # todo: This check could be removed for optimizing - if def_tag.short_base_tag.lower() != DefTagNames.DEF_EXPAND_KEY and \ - def_tag.short_base_tag.lower() != DefTagNames.DEF_KEY: - raise ValueError("Internal error in DefMapper") - - is_label_tag = def_tag.extension_or_value_portion - placeholder = None - found_slash = is_label_tag.find("/") - if found_slash != -1: - placeholder = is_label_tag[found_slash + 1:] - is_label_tag = is_label_tag[:found_slash] - - label_tag_lower = is_label_tag.lower() - def_entry = self._gathered_defs.get(label_tag_lower) - if def_entry is None: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=def_tag) - else: - def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) - if def_tag_name: - if def_expand_group is not def_tag and def_expand_group != def_contents: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID, - tag=def_tag, actual_def=def_contents, - 
found_def=def_expand_group) - return None - return def_contents - elif def_entry.takes_value: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=def_tag) - else: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_EXTRA, tag=def_tag) - - return None - - def __get_string_funcs__(self, **kwargs): - """ String funcs for processing definitions. """ - string_funcs = [] - expand_defs = kwargs.get("expand_defs") - shrink_defs = kwargs.get("shrink_defs") - remove_definitions = kwargs.get("remove_definitions") - check_for_definitions = kwargs.get("check_for_definitions") - if shrink_defs and expand_defs: - raise ValueError("Cannot pass both shrink_defs and expand_defs to DefMapper") - from functools import partial - string_funcs.append(partial(self.expand_and_remove_definitions, - check_for_definitions=check_for_definitions, - expand_defs=expand_defs, - shrink_defs=shrink_defs, - remove_definitions=remove_definitions)) - return string_funcs - - def __get_tag_funcs__(self, **kwargs): - return [] diff --git a/hed/models/definition_dict.py b/hed/models/definition_dict.py index 13d0f083b..ca3b06b34 100644 --- a/hed/models/definition_dict.py +++ b/hed/models/definition_dict.py @@ -2,36 +2,60 @@ from hed.models.hed_string import HedString from hed.errors.error_types import DefinitionErrors from hed.errors.error_reporter import ErrorHandler -from functools import partial - from hed.models.model_constants import DefTagNames -from hed.models.hed_ops import HedOps -class DefinitionDict(HedOps): +class DefinitionDict: """ Gathers definitions from a single source. - This class extends HedOps because it has string_funcs to check for definitions. It has no tag_funcs. - """ - def __init__(self): + def __init__(self, def_dicts=None, hed_schema=None): """ Definitions to be considered a single source. 
""" - super().__init__() self.defs = {} + self._label_tag_name = DefTagNames.DEF_KEY + self._issues = [] + if def_dicts: + self.add_definitions(def_dicts, hed_schema) + + def add_definitions(self, def_dicts, hed_schema=None): + """ Add definitions from dict(s) to this dict. + + Parameters: + def_dicts (list or DefinitionDict): DefDict or list of DefDicts/strings whose definitions should be added. + hed_schema(HedSchema or None): Required if passing strings or lists of strings, unused otherwise. + """ + if not isinstance(def_dicts, list): + def_dicts = [def_dicts] + for def_dict in def_dicts: + if isinstance(def_dict, DefinitionDict): + self._add_definitions_from_dict(def_dict) + elif isinstance(def_dict, str) and hed_schema: + self.check_for_definitions(HedString(def_dict, hed_schema)) + elif isinstance(def_dict, list) and hed_schema: + for definition in def_dict: + self.check_for_definitions(HedString(definition, hed_schema)) + else: + print(f"Invalid input type '{type(def_dict)} passed to DefDict. Skipping.") - # Definition related issues - self._extract_def_issues = [] + def _add_definition(self, def_tag, def_value): + if def_tag in self.defs: + error_context = self.defs[def_tag].source_context + self._issues += ErrorHandler.format_error_from_context(DefinitionErrors.DUPLICATE_DEFINITION, + error_context=error_context, def_name=def_tag) + else: + self.defs[def_tag] = def_value - def get_definition_issues(self): - """ Return definition errors found during extraction. + def _add_definitions_from_dict(self, def_dict): + """ Add the definitions found in the given definition dictionary to this mapper. - Returns: - list: List of DefinitionErrors issues found. Each issue is a dictionary. + Parameters: + def_dict (DefinitionDict): DefDict whose definitions should be added. 
""" - return self._extract_def_issues + for def_tag, def_value in def_dict: + self._add_definition(def_tag, def_value) def get(self, def_name): return self.defs.get(def_name.lower()) @@ -39,12 +63,23 @@ def get(self, def_name): def __iter__(self): return iter(self.defs.items()) - def __get_string_funcs__(self, **kwargs): - error_handler = kwargs.get("error_handler") - return [partial(self.check_for_definitions, error_handler=error_handler)] + @property + def issues(self): + """Returns issues about duplicate definitions.""" + return self._issues + + def get_def_entry(self, def_name): + """ Get the definition entry for the definition name. + + Parameters: + def_name (str): Name of the definition to retrieve. + + Returns: + DefinitionEntry: Definition entry for the requested definition. + + """ - def __get_tag_funcs__(self, **kwargs): - return [] + return self.defs.get(def_name.lower()) def check_for_definitions(self, hed_string_obj, error_handler=None): """ Check string for definition tags, adding them to self. @@ -128,9 +163,84 @@ def check_for_definitions(self, hed_string_obj, error_handler=None): takes_value=def_takes_value, source_context=context) - self._extract_def_issues += new_def_issues return new_def_issues + def construct_def_tags(self, hed_string_obj): + """ Identify def/def-expand tag contents in the given string. + + Parameters: + hed_string_obj(HedString): The hed string to identify definition contents in + """ + for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): + def_contents = self._get_definition_contents(def_tag) + if def_contents is not None: + def_tag._expandable = def_contents + def_tag._expanded = def_tag != def_expand_group + + def construct_def_tag(self, hed_tag): + """ Identify def/def-expand tag contents in the given HedTag. 
+ + Parameters: + hed_tag(HedTag): The hed tag to identify definition contents in + """ + if hed_tag.short_base_tag in {DefTagNames.DEF_ORG_KEY, DefTagNames.DEF_EXPAND_ORG_KEY}: + def_contents = self._get_definition_contents(hed_tag) + if def_contents is not None: + hed_tag._expandable = def_contents + hed_tag._expanded = hed_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY + + def expand_def_tags(self, hed_string_obj): + """ Expands def tags to def-expand tags. + + Parameters: + hed_string_obj (HedString): The hed string to process. + """ + # First see if the "def" is found at all. This covers def and def-expand. + hed_string_lower = hed_string_obj.lower() + if self._label_tag_name not in hed_string_lower: + return [] + + def_issues = [] + # We need to check for labels to expand in ALL groups + for def_tag, def_group in hed_string_obj.find_tags(DefTagNames.DEF_KEY, recursive=True): + def_contents = self._get_definition_contents(def_tag) + if def_contents is not None: + def_tag.short_base_tag = DefTagNames.DEF_EXPAND_ORG_KEY + def_group.replace(def_tag, def_contents) + + return def_issues + + def _get_definition_contents(self, def_tag): + """ Get the contents for a given def tag. + + Does not validate at all. + + Parameters: + def_tag (HedTag): Source hed tag that may be a Def or Def-expand tag. + + Returns: + def_contents: HedGroup + The contents to replace the previous def-tag with. + """ + is_label_tag = def_tag.extension_or_value_portion + placeholder = None + found_slash = is_label_tag.find("/") + if found_slash != -1: + placeholder = is_label_tag[found_slash + 1:] + is_label_tag = is_label_tag[:found_slash] + + label_tag_lower = is_label_tag.lower() + def_entry = self.defs.get(label_tag_lower) + if def_entry is None: + # Could raise an error here? 
+ return None + else: + def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) + if def_tag_name: + return def_contents + + return None + @staticmethod def get_as_strings(def_dict): """ Convert the entries to strings of the contents @@ -145,5 +255,3 @@ def get_as_strings(def_dict): def_dict = def_dict.defs return {key: str(value.contents) for key, value in def_dict.items()} - - diff --git a/hed/models/df_util.py b/hed/models/df_util.py new file mode 100644 index 000000000..b7e73a282 --- /dev/null +++ b/hed/models/df_util.py @@ -0,0 +1,125 @@ +from functools import partial + +from hed.models.sidecar import Sidecar +from hed.models.tabular_input import TabularInput +from hed import HedString + + +def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True): + """Load a tabular file and its associated HED sidecar file. + + Args: + tabular_file: str or TabularInput + The path to the tabular file, or a TabularInput object representing it. + sidecar: str or Sidecar + The path to the sidecar file, or a Sidecar object representing it. + hed_schema: str or HedSchema + If str, will attempt to load as a version if it doesn't have a valid extension. + extra_def_dicts: list of DefinitionDict, optional + Any extra DefinitionDict objects to use when parsing the HED tags. + join_columns: bool + If true, join all hed columns into one. 
+ shrink_defs: bool + Shrink any def-expand tags found + expand_defs: bool + Expand any def tags found + Returns: + A list of HedStrings, or a list of lists of HedStrings + """ + if isinstance(sidecar, str): + sidecar = Sidecar(sidecar) + + if isinstance(tabular_file, str): + tabular_file = TabularInput(tabular_file, sidecar) + + def_dict = None + if sidecar: + def_dict = sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) + + if join_columns: + if expand_defs: + return [HedString(x, hed_schema, def_dict).expand_defs() for x in tabular_file.series_a], def_dict + elif shrink_defs: + return [HedString(x, hed_schema, def_dict).shrink_defs() for x in tabular_file.series_a], def_dict + else: + return [HedString(x, hed_schema, def_dict) for x in tabular_file.series_a], def_dict + else: + return [[HedString(x, hed_schema, def_dict).expand_defs() if expand_defs + else HedString(x, hed_schema, def_dict).shrink_defs() if shrink_defs + else HedString(x, hed_schema, def_dict) + for x in text_file_row] for text_file_row in tabular_file.dataframe_a.itertuples(index=False)], def_dict + + +def convert_to_form(df, hed_schema, tag_form, columns): + """ Convert all tags in underlying dataframe to the specified form. + + Converts in place + Parameters: + df (pd.Dataframe): The dataframe to modify + hed_schema (HedSchema): The schema to use to convert tags. + tag_form(str): HedTag property to convert tags to. + columns (list): The columns to modify on the dataframe + """ + if columns is None: + columns = df.columns + + for column in columns: + df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + + return df + + +def shrink_defs(df, hed_schema, columns): + """ Shrinks any def-expand tags found in the dataframe. + + Converts in place + Parameters: + df (pd.Dataframe): The dataframe to modify + hed_schema (HedSchema or None): The schema to use to identify defs. 
+ columns (list): The columns to modify on the dataframe + """ + if columns is None: + columns = df.columns + + for column in columns: + mask = df[column].str.contains('Def-expand/', case=False) + df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) + + return df + + +def expand_defs(df, hed_schema, def_dict, columns): + """ Expands any def tags found in the dataframe. + + Converts in place + + Parameters: + df (pd.Dataframe): The dataframe to modify + hed_schema (HedSchema or None): The schema to use to identify defs + def_dict (DefinitionDict): The definitions to expand + columns (list): The columns to modify on the dataframe + """ + if columns is None: + columns = df.columns + + for column in columns: + mask = df[column].str.contains('Def/', case=False) + df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) + + return df + + +def _convert_to_form(hed_string, hed_schema, tag_form): + from hed import HedString + return str(HedString(hed_string, hed_schema).get_as_form(tag_form)) + + +def _shrink_defs(hed_string, hed_schema): + from hed import HedString + return str(HedString(hed_string, hed_schema).shrink_defs()) + + +def _expand_defs(hed_string, hed_schema, def_dict): + from hed import HedString + return str(HedString(hed_string, hed_schema, def_dict).expand_defs()) diff --git a/hed/models/expression_parser.py b/hed/models/expression_parser.py index 68c4e7f59..8a9806d42 100644 --- a/hed/models/expression_parser.py +++ b/hed/models/expression_parser.py @@ -1,7 +1,6 @@ import re -# todo: Add support for early outs with and(only try to match groups we already matched instead of all groups) class search_result: def __init__(self, group, tag): self.group = group @@ -179,8 +178,6 @@ def handle_expr(self, hed_group, exact=False): continue return_list.append(merged_result) - # finally simplify the list and remove duplicates. 
- return return_list def __str__(self): @@ -193,6 +190,7 @@ def __str__(self): output_str += ")" return output_str + class ExpressionWildcardNew(Expression): def handle_expr(self, hed_group, exact=False): groups_found = [] diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py index e61a3d3b3..6df911801 100644 --- a/hed/models/hed_group.py +++ b/hed/models/hed_group.py @@ -312,12 +312,11 @@ def get_as_long(self): """ return self.get_as_form("long_tag") - def get_as_form(self, tag_attribute, tag_transformer=None): + def get_as_form(self, tag_attribute): """ Get the string corresponding to the specified form. Parameters: tag_attribute (str): The hed_tag property to use to construct the string (usually short_tag or long_tag). - tag_transformer (func or None): A function that is applied to each tag string before returning. Returns: str: The constructed string after transformation @@ -326,13 +325,8 @@ def get_as_form(self, tag_attribute, tag_transformer=None): - The signature of a tag_transformer is str def(HedTag, str). 
""" - if tag_transformer: - result = ",".join([tag_transformer(child, child.__getattribute__(tag_attribute)) - if isinstance(child, HedTag) else child.get_as_form(tag_attribute, tag_transformer) - for child in self.children]) - else: - result = ",".join([child.__getattribute__(tag_attribute) if isinstance(child, HedTag) else - child.get_as_form(tag_attribute) for child in self.children]) + result = ",".join([child.__getattribute__(tag_attribute) if isinstance(child, HedTag) else + child.get_as_form(tag_attribute) for child in self.children]) if self.is_group: return f"({result})" return result @@ -365,6 +359,8 @@ def __eq__(self, other): if self is other: return True + if isinstance(other, str): + return str(self) == other if not isinstance(other, HedGroup) or self.children != other.children or self.is_group != other.is_group: return False return True @@ -484,9 +480,9 @@ def find_def_tags(self, recursive=False, include_groups=3): """ Find def and def-expand tags Parameters: recursive (bool): If true, also check subgroups. - include_groups (int, 0, 1, 2, 3): options for how to expand or include groups + include_groups (int, 0, 1, 2, 3): options for return values Returns: - list: A list of tuples. The contents depends on the values of the include group. + list: A list of tuples. The contents depend on the values of the include_group. Notes: - The include_groups option controls the tag expansion as follows: - If 0: Return only def and def expand tags/. diff --git a/hed/models/hed_ops.py b/hed/models/hed_ops.py deleted file mode 100644 index c56c93c78..000000000 --- a/hed/models/hed_ops.py +++ /dev/null @@ -1,262 +0,0 @@ -""" Infrastructure for processing HED operations. """ - -from functools import partial -from hed.schema import HedSchema, HedSchemaGroup -from hed.errors.error_types import ErrorContext, SidecarErrors -from hed.errors import ErrorHandler - - -# These are the defaults if you pass in nothing. Most built in routes will have other default values. 
-default_arguments = { - 'allow_placeholders': False, - 'check_for_definitions': False, - 'expand_defs': False, - 'shrink_defs': False, - 'error_handler': None, - 'check_for_warnings': False, - 'remove_definitions': True -} - - -def translate_ops(hed_ops, split_ops=False, hed_schema=None, **kwargs): - """ Return functions to apply to a hed string object. - - Parameters: - hed_ops (list): A list of func or HedOps or HedSchema to apply to hed strings. - split_ops (bool): If true, will split the operations into separate lists of tag and string operations. - hed_schema(HedSchema or None): The schema to use by default in identifying tags - kwargs (kwargs): An optional dictionary of name-value pairs representing parameters passed to each HedOps - - Returns: - list or tuple: A list of functions to apply or a tuple containing separate lists of tag and string ops. - - Notes: - - The distinction between tag and string ops primarily applies to spreadsheets. - - Splitting the ops into two lists is mainly used for parsing spreadsheets where any given - column isn't an entire hed string, but additional detail is needed on which column an - issue original came from. - - The currently accepted values of kwargs are: - - allow_placeholders - - check_for_definitions - - expand_defs - - shrink_defs - - error_handler - - check_for_warnings - - remove_definitions - - """ - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - - from hed.models.hed_string import HedString - - settings = default_arguments.copy() - settings.update(kwargs) - - tag_funcs = [] - string_funcs = [] - for hed_op in hed_ops: - if hed_op: - # Handle the special case of a hed schema. 
- if isinstance(hed_op, (HedSchema, HedSchemaGroup)): - tag_funcs.append(partial(HedString.convert_to_canonical_forms, hed_schema=hed_op)) - else: - try: - tag_funcs += hed_op.__get_tag_funcs__(**settings) - string_funcs += hed_op.__get_string_funcs__(**settings) - except AttributeError: - string_funcs.append(hed_op) - - # Make sure the first column operation is a convert to forms, if we don't have one. - if not _func_in_list(HedString.convert_to_canonical_forms, tag_funcs): - tag_funcs.insert(0, partial(HedString.convert_to_canonical_forms, hed_schema=hed_schema)) - - if split_ops: - return tag_funcs, string_funcs - return tag_funcs + string_funcs - - -def apply_ops(hed_strings, hed_ops, **kwargs): - """ Convenience function to update a list/dict of hed strings - - Parameters: - hed_strings(str, dict, list): A list/dict/str to update - hed_ops (list or HedOps or func): A list of func or HedOps or HedSchema to apply to hed strings. - kwargs (kwargs): An optional dictionary of name-value pairs representing parameters passed to each HedOps - - Returns: - tuple: - hed_strings(str, dict, list): Same type as input - issues(list): A list of issues found applying the hed_ops - """ - from hed.models.hed_string import HedString - - if not hed_strings: - return hed_strings, [] - issues = [] - tag_funcs = translate_ops(hed_ops, **kwargs) - if isinstance(hed_strings, str): - hed_string_obj = HedString(hed_strings) - issues += hed_string_obj.apply_funcs(tag_funcs) - return str(hed_string_obj), issues - elif isinstance(hed_strings, dict): - return_dict = {} - for key, hed_string in hed_strings.items(): - hed_string_obj = HedString(hed_string) - issues += hed_string_obj.apply_funcs(tag_funcs) - return_dict[key] = str(hed_string_obj) - return return_dict, issues - elif isinstance(hed_strings, list): - return_list = [] - for hed_string in hed_strings: - hed_string_obj = HedString(hed_string) - issues += hed_string_obj.apply_funcs(tag_funcs) - return_list.append(str(hed_string_obj)) 
- return return_list, issues - - raise ValueError("Unaccounted for type in apply_ops") - - -def hed_string_iter(hed_strings, tag_funcs, error_handler): - """ Iterate over the given dict of strings, returning HedStrings - - Also gives issues for blank strings - - Parameters: - hed_strings(dict or str): A hed_string or dict of hed strings - tag_funcs (list of funcs): The functions to apply before returning - error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. - - Yields: - tuple: - - HedString: The hed string at a given column and key position. - - str: Indication of the where hed string was loaded from so it can be later set by the user. - - list: Issues found applying hed_ops. Each issue is a dictionary. - - """ - for hed_string_obj, key_name in _hed_iter_low(hed_strings): - new_col_issues = [] - error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) - if not hed_string_obj: - new_col_issues += ErrorHandler.format_error(SidecarErrors.BLANK_HED_STRING) - error_handler.add_context_to_issues(new_col_issues) - yield hed_string_obj, key_name, new_col_issues - else: - error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, - increment_depth_after=False) - if tag_funcs: - new_col_issues += hed_string_obj.apply_funcs(tag_funcs) - - error_handler.add_context_to_issues(new_col_issues) - yield hed_string_obj, key_name, new_col_issues - error_handler.pop_error_context() - error_handler.pop_error_context() - - -def _hed_iter_low(hed_strings): - """ Iterate over the hed string entries. - - Used by hed_string_iter - - Parameters: - hed_strings(dict or str): A hed_string or dict of hed strings - - Yields: - tuple: - - HedString: Individual hed strings for different entries. - - str: The position to pass back to set this string. 
- - """ - from hed.models.hed_string import HedString - - if isinstance(hed_strings, dict): - for key, hed_string in hed_strings.items(): - if isinstance(hed_string, str): - hed_string = HedString(hed_string) - else: - continue - yield hed_string, key - elif isinstance(hed_strings, str): - hed_string = HedString(hed_strings) - yield hed_string, None - - -def set_hed_string(new_hed_string, hed_strings, position=None): - """ Set a hed string for a category key/etc. - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - hed_strings(dict or str or HedString): The hed strings we want to update - position (str, optional): This should only be a value returned from hed_string_iter. - - Returns: - updated_string (str or dict): The newly updated string/dict. - Raises: - TypeError: If the mapping cannot occur. - - """ - from hed.models.hed_string import HedString - - if isinstance(hed_strings, dict): - if position is None: - raise TypeError("Error: Trying to set a category HED string with no category") - if position not in hed_strings: - raise TypeError("Error: Not allowed to add new categories to a column") - hed_strings[position] = str(new_hed_string) - elif isinstance(hed_strings, (str, HedString)): - if position is not None: - raise TypeError("Error: Trying to set a value HED string with a category") - hed_strings = str(new_hed_string) - else: - raise TypeError("Error: Trying to set a HED string on a column_type that doesn't support it.") - - return hed_strings - - -class HedOps: - """ Base class to support HedOps. - - Notes: - - HED ops are operations that apply to HedStrings in a sequence. - - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __get_string_funcs__(self, **kwargs): - """ Return the operations that should be done on the full string at once. - - Parameters: - kwargs See above. 
- - Returns: - list: A list of functions that take a single hed string as a parameter, and return a list of issues. - - """ - return [] - - def __get_tag_funcs__(self, **kwargs): - """ Return the operations that should be done on the individual tags in the string. - - Parameters: - kwargs: See above. - - Returns: - list: A list of functions that take a single hed string as a parameter, and return a list of issues. - - """ - return [] - - # Todo: possibly add parameter validation - # def __get_valid_parameters__(self): - # return [] - - -def _func_in_list(find_func, func_list): - for func in func_list: - if func == find_func: - return True - if isinstance(func, partial) and getattr(func, 'func') == find_func: - return True - return False diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index fee47ea12..fe864b28e 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -3,9 +3,6 @@ """ from hed.models.hed_group import HedGroup from hed.models.hed_tag import HedTag -from hed.errors.error_reporter import ErrorHandler, check_for_any_errors -from hed.errors.error_types import ErrorContext -from hed.models.hed_ops import translate_ops from hed.models.model_constants import DefTagNames @@ -15,7 +12,7 @@ class HedString(HedGroup): OPENING_GROUP_CHARACTER = '(' CLOSING_GROUP_CHARACTER = ')' - def __init__(self, hed_string, hed_schema=None, _contents=None): + def __init__(self, hed_string, hed_schema=None, def_dict=None, _contents=None): """ Constructor for the HedString class. 
Parameters: @@ -32,7 +29,7 @@ def __init__(self, hed_string, hed_schema=None, _contents=None): contents = _contents else: try: - contents = self.split_into_groups(hed_string, hed_schema) + contents = self.split_into_groups(hed_string, hed_schema, def_dict) except ValueError: contents = [] super().__init__(hed_string, contents=contents, startpos=0, endpos=len(hed_string)) @@ -59,10 +56,8 @@ def is_group(self): def convert_to_canonical_forms(self, hed_schema): """ Identify all tags using the given schema. - If schema is None, still identify "key" tags such as definitions. - Parameters: - hed_schema (HedSchema, HedSchemaGroup, None): The schema to use to validate/convert tags. + hed_schema (HedSchema, HedSchemaGroup): The schema to use to validate/convert tags. Returns: list: A list of issues found while converting the string. Each issue is a dictionary. @@ -89,6 +84,43 @@ def remove_definitions(self): return [] + def shrink_defs(self): + """ Replace def-expand tags with def tags + + This does not validate them and will blindly shrink invalid ones as well. + + Returns: + self + """ + for def_expand_tag, def_expand_group in self.find_tags({DefTagNames.DEF_EXPAND_KEY}, recursive=True): + expanded_parent = def_expand_group._parent + if expanded_parent: + def_expand_tag.short_base_tag = DefTagNames.DEF_ORG_KEY + expanded_parent.replace(def_expand_group, def_expand_tag) + + return self + + def expand_defs(self): + """ Replace def tags with def-expand tags + + This does very minimal validation + + Returns: + self + """ + def_tags = self.find_def_tags(recursive=True, include_groups=0) + + replacements = [] + for tag in def_tags: + if not tag._expanded: + replacements.append((tag, tag._expandable)) + + for tag, group in replacements: + self.replace(tag, group) + tag.short_base_tag = DefTagNames.DEF_EXPAND_KEY + + return self + def convert_to_short(self, hed_schema): """ Compute canonical forms and return the short form. 
@@ -140,13 +172,13 @@ def convert_to_original(self): return self.get_as_form("org_tag") @staticmethod - def split_into_groups(hed_string, hed_schema=None): + def split_into_groups(hed_string, hed_schema=None, def_dict=None): """ Split the HED string into a parse tree. Parameters: hed_string (str): A hed string consisting of tags and tag groups to be processed. - hed_schema (HedSchema or None): Hed schema to use to identify tags. - + hed_schema (HedSchema or None): HED schema to use to identify tags. + def_dict(DefinitionDict): The definitions to identify Returns: list: A list of HedTag and/or HedGroup. @@ -162,7 +194,7 @@ def split_into_groups(hed_string, hed_schema=None): input_tags = HedString.split_hed_string(hed_string) for is_hed_tag, (startpos, endpos) in input_tags: if is_hed_tag: - new_tag = HedTag(hed_string, (startpos, endpos), hed_schema) + new_tag = HedTag(hed_string, (startpos, endpos), hed_schema, def_dict) current_tag_group[-1].append(new_tag) else: string_portion = hed_string[startpos:endpos] @@ -178,6 +210,8 @@ def split_into_groups(hed_string, hed_schema=None): current_tag_group.append(HedGroup(hed_string, startpos + delimiter_index)) if delimiter_char is HedString.CLOSING_GROUP_CHARACTER: + # if prev_delimiter == ",": + # raise ValueError(f"Closing parentheses in hed string {hed_string}") # Terminate existing group, and save it off. paren_end = startpos + delimiter_index + 1 @@ -282,54 +316,21 @@ def split_hed_string(hed_string): return result_positions - def apply_funcs(self, string_funcs): - """ Run functions on this string. - - Parameters: - string_funcs (list): A list of functions that take a hed string object and return a list of issues. - - Returns: - list: A list of issues found by these operations. Each issue is a dictionary. - - Notes: - - This method potentially modifies the hed string object. 
- + def validate(self, hed_schema, allow_placeholders=True, error_handler=None): """ - string_issues = [] - for string_func in string_funcs: - string_issues += string_func(self) - if string_issues: - if check_for_any_errors(string_issues): - break - - return string_issues - - def validate(self, hed_ops=None, error_handler=None, **kwargs): - """ Run the given hed_ops on this string. + Validate the string using the schema Parameters: - hed_ops: (func, HedOps, or list): Operations to apply to this object. - error_handler (ErrorHandler or None): Used to report errors in context. Uses a default if None. - kwargs: - See models.hed_ops.translate_ops or the specific hed_ops for additional options - + hed_schema(HedSchema): The schema to use to validate + allow_placeholders(bool): allow placeholders in the string + error_handler(ErrorHandler or None): the error handler to use, creates a default one if none passed Returns: - list: A list of issues encountered in applying these operations. Each issue is a dictionary. - - Notes: - - Although this function is called validation, the HedOps can represent other transformations. - + issues (list of dict): A list of issues for hed string """ - if error_handler is None: - error_handler = ErrorHandler() - tag_funcs = translate_ops(hed_ops, **kwargs) + from hed.validator import HedValidator - error_handler.push_error_context(ErrorContext.HED_STRING, self, increment_depth_after=False) - issues = self.apply_funcs(tag_funcs) - error_handler.add_context_to_issues(issues) - error_handler.pop_error_context() - - return issues + validator = HedValidator(hed_schema) + return validator.validate(self, allow_placeholders=allow_placeholders) def find_top_level_tags(self, anchor_tags, include_groups=2): """ Find top level groups with an anchor tag. 
@@ -359,4 +360,3 @@ def find_top_level_tags(self, anchor_tags, include_groups=2): if include_groups == 0 or include_groups == 1: return [tag[include_groups] for tag in top_level_tags] return top_level_tags - diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index c059d8850..29bcf8cf6 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -1,5 +1,5 @@ from hed.schema.hed_schema_constants import HedKey -from hed.schema.hed_schema_entry import HedTagEntry +import copy class HedTag: @@ -11,7 +11,7 @@ class HedTag: """ - def __init__(self, hed_string, span=None, hed_schema=None): + def __init__(self, hed_string, span=None, hed_schema=None, def_dict=None): """ Creates a HedTag. Parameters: @@ -23,14 +23,16 @@ def __init__(self, hed_string, span=None, hed_schema=None): - This does not produce issues and is used primarily for testing. """ + if def_dict and not hed_schema: + raise ValueError("Passing a def_dict without also passing a schema is invalid.") self._hed_string = hed_string if span is None: span = (0, len(hed_string)) # This is the span into the original hed string for this tag self.span = span - # If this is present, use this as the org tag for most purposes. This is generally only filled out - # if the tag has a name_prefix added, or is an expanded def. + # If this is present, use this as the org tag for most purposes. + # This is not generally used anymore, but you can use it to replace a tag in place. self._tag = None self._schema_prefix = self._get_schema_prefix(self.org_tag) @@ -42,8 +44,15 @@ def __init__(self, hed_string, span=None, hed_schema=None): self._extension_value = "" self._parent = None + # Downsides: two new parameters + # Have to check for this value, slowing everything down potentially. 
+ self._expandable = None + self._expanded = False + if hed_schema: self.convert_to_canonical_forms(hed_schema) + if def_dict: + def_dict.construct_def_tag(self) @property def schema_prefix(self): @@ -115,10 +124,11 @@ def short_base_tag(self, new_tag_val): - Generally this is used to swap def to def-expand. """ if self._schema_entry: + tag_entry = None if self._schema: + if self.is_takes_value_tag(): + new_tag_val = new_tag_val + "/#" tag_entry = self._schema.get_tag_entry(new_tag_val, schema_prefix=self.schema_prefix) - else: - tag_entry, remainder = HedTagEntry.get_fake_tag_entry(new_tag_val, [new_tag_val.lower()]) self._schema_entry = tag_entry else: @@ -185,15 +195,11 @@ def tag(self, new_tag_val): new_tag_val (str): New (implicitly long form) of tag to set. Notes: - - Primarily used to add prefixes from column metadata to tags. - - Only valid before calling convert_to_canonical_forms. - + - You probably don't actually want to call this. """ - - if self._schema_entry: - raise ValueError("Can only edit tags before calculating canonical forms. " + - "This could be updated to instead remove computed forms.") self._tag = new_tag_val + self._schema_entry = None + self.convert_to_canonical_forms(self._schema) @property def extension_or_value_portion(self): @@ -250,9 +256,29 @@ def tag_terms(self): if self._schema_entry: return self._schema_entry.tag_terms - # TODO: Potentially remove this. It's just a quick hack for testing - return tuple(str(self).lower()) - #return tuple() + return tuple() + + @property + def expanded(self): + """Returns if this is currently expanded or not. + + Will always be false unless expandable is set. This is primarily used for Def/Def-expand tags at present. + + Returns: + bool: Returns true if this is currently expanded + """ + return self._expanded + + @property + def expandable(self): + """Returns if this is expandable + + This is primarily used for Def/Def-expand tags at present. 
+ + Returns: + HedGroup or HedTag or None: Returns the expanded form of this tag + """ + return self._expandable def __str__(self): """ Convert this HedTag to a string. @@ -269,39 +295,6 @@ def __str__(self): return self._hed_string[self.span[0]:self.span[1]] - def add_prefix_if_needed(self, required_prefix): - """ Add a prefix to this tag *unless* already formatted. - - Parameters: - required_prefix (str): The full name_prefix to add if not present. - - Notes: - - This means we verify the tag does not have the required name_prefix, or any partial name_prefix. - - Examples: - Required: KnownTag1/KnownTag2 - - Case 1: KnownTag1/KnownTag2/ColumnValue - Will not be changed, has name_prefix already. - - Case 2: KnownTag2/ColumnValue - Will not be changed, has partial name_prefix already. - - Case 3: ColumnValue - Prefix will be added. - - """ - - checking_prefix = required_prefix - while checking_prefix: - if self.lower().startswith(checking_prefix.lower()): - return - slash_index = checking_prefix.find("/") + 1 - if slash_index == 0: - break - checking_prefix = checking_prefix[slash_index:] - self.tag = required_prefix + self.org_tag - def lower(self): """ Convenience function, equivalent to str(self).lower(). """ return str(self).lower() @@ -316,9 +309,6 @@ def convert_to_canonical_forms(self, hed_schema): list: A list of issues found during conversion. Each element is a dictionary. """ - if not hed_schema: - return self._convert_key_tags_to_canonical_form() - tag_entry, remainder, tag_issues = hed_schema.find_tag_entry(self, self.schema_prefix) self._schema_entry = tag_entry self._schema = hed_schema @@ -433,7 +423,7 @@ def is_value_class_tag(self): """ Return true if this is a value class tag. Returns: - bool: True if this is a a tag with a value class. + bool: True if this is a tag with a value class. 
""" if self._schema_entry: @@ -536,26 +526,8 @@ def any_parent_has_attribute(self, attribute): if self._schema_entry: return self._schema_entry.any_parent_has_attribute(attribute=attribute) - def _convert_key_tags_to_canonical_form(self): - """ Find the canonical form for basic known tags. - - Returns: - list: Always return an empty list. - - Notes: - - This is used for such as definition and def when no schema present - - """ - tags_to_identify = ["onset", "definition", "offset", "def-expand", "def"] - tag_entry, remainder = HedTagEntry.get_fake_tag_entry(str(self), tags_to_identify) - if tag_entry: - self._schema_entry = tag_entry - self._schema = None - self._extension_value = remainder - - return [] - - def _get_schema_prefix(self, org_tag): + @staticmethod + def _get_schema_prefix(org_tag): """ Finds the library prefix for the tag. Parameters: @@ -649,3 +621,28 @@ def __eq__(self, other): if self.org_tag.lower() == other.org_tag.lower(): return True return False + + def __deepcopy__(self, memo): + # check if the object has already been copied + if id(self) in memo: + return memo[id(self)] + + # create a new instance of HedTag class + new_tag = HedTag(self._hed_string, self.span) + + # add the new object to the memo dictionary + memo[id(self)] = new_tag + + # copy all other attributes except schema and schema_entry + new_tag._tag = copy.deepcopy(self._tag, memo) + new_tag._schema_prefix = copy.deepcopy(self._schema_prefix, memo) + new_tag._extension_value = copy.deepcopy(self._extension_value, memo) + new_tag._parent = copy.deepcopy(self._parent, memo) + new_tag._expandable = copy.deepcopy(self._expandable, memo) + new_tag._expanded = copy.deepcopy(self._expanded, memo) + + # reference the schema and schema_entry from the original object + new_tag._schema = self._schema + new_tag._schema_entry = self._schema_entry + + return new_tag diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 59052b0b1..8b808c6d1 100644 --- a/hed/models/sidecar.py +++ 
b/hed/models/sidecar.py @@ -1,30 +1,50 @@ import json from hed.models.column_metadata import ColumnMetadata -from hed.errors.error_types import ErrorContext, SidecarErrors +from hed.errors.error_types import ErrorContext from hed.errors import ErrorHandler from hed.errors.exceptions import HedFileError, HedExceptions from hed.models.hed_string import HedString from hed.models.column_metadata import ColumnType -from hed.models.hed_ops import apply_ops, hed_string_iter, set_hed_string -from hed.models.sidecar_base import SidecarBase +from hed.models.definition_dict import DefinitionDict -class Sidecar(SidecarBase): +# todo: Add/improve validation for definitions being in known columns(right now it just assumes they aren't) +class Sidecar: """ Contents of a JSON file or merged file. """ - def __init__(self, files, name=None, hed_schema=None): + def __init__(self, files, name=None): """ Construct a Sidecar object representing a JSON file. Parameters: files (str or FileLike or list): A string or file-like object representing a JSON file, or a list of such. name (str or None): Optional name identifying this sidecar, generally a filename. - hed_schema(HedSchema or None): The schema to use by default in identifying tags """ - super().__init__(name, hed_schema=hed_schema) + self.name = name self.loaded_dict = self.load_sidecar_files(files) - self.def_dict = self.extract_definitions(hed_schema) + self._def_dict = None + self._extract_definition_issues = [] + + def __iter__(self): + """ An iterator to go over the individual column metadata. + + Returns: + iterator: An iterator over the column metadata values. + + """ + return iter(self.column_data) + + @property + def def_dict(self): + """This is the definitions from this sidecar. 
+ + Generally you should instead call get_def_dict to get the relevant definitions + + Returns: + DefinitionDict: The definitions for this sidecar + """ + return self._def_dict @property def column_data(self): @@ -36,53 +56,38 @@ def column_data(self): for col_name, col_dict in self.loaded_dict.items(): yield self._generate_single_column(col_name, col_dict) - def _hed_string_iter(self, tag_funcs, error_handler): - """ Low level function to retrieve hed string in sidecar - - Parameters: - tag_funcs(list): A list of functions to apply to returned strings - error_handler(ErrorHandler): Error handler to use for context - - Yields: - tuple: - string(HedString): The retrieved and modified string - position(tuple): The location of this hed string. Black box. - issues(list): A list of issues running the tag_funcs. - """ - for column_name, dict_for_entry in self.loaded_dict.items(): - error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) - hed_dict = dict_for_entry.get("HED", {}) - for (hed_string_obj, position, issues) in hed_string_iter(hed_dict, tag_funcs, error_handler): - yield hed_string_obj, (column_name, position), issues - - error_handler.pop_error_context() - - def _set_hed_string(self, new_hed_string, position): - """ Low level function to update hed string in sidecar + def set_hed_string(self, new_hed_string, position): + """ Set a provided column/category key/etc. Parameters: new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The value returned from hed_string_iter. + position (tuple): The (HedString, str, list) tuple returned from hed_string_iter. 
+ """ column_name, position = position hed_dict = self.loaded_dict[column_name] - hed_dict["HED"] = set_hed_string(new_hed_string, hed_dict["HED"], position) + hed_dict["HED"] = self._set_hed_string_low(new_hed_string, hed_dict["HED"], position) - def validate_structure(self, error_handler): - """ Validate the raw structure of this sidecar. + def get_def_dict(self, hed_schema=None, extra_def_dicts=None): + """ Returns the definition dict for this sidecar. Parameters: - error_handler(ErrorHandler): The error handler to use for error context + hed_schema(HedSchema): used to identify tags to find definitions + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. Returns: - issues(list): A list of issues found with the structure + DefinitionDict: A single definition dict representing all the data(and extra def dicts) """ - all_validation_issues = [] - for column_name, dict_for_entry in self.loaded_dict.items(): - error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) - all_validation_issues += self._validate_column_structure(column_name, dict_for_entry, error_handler) - error_handler.pop_error_context() - return all_validation_issues + if self._def_dict is None and hed_schema: + self._def_dict = self.extract_definitions(hed_schema) + def_dicts = [] + if self.def_dict: + def_dicts.append(self.def_dict) + if extra_def_dicts: + if not isinstance(extra_def_dicts, list): + extra_def_dicts = [extra_def_dicts] + def_dicts += extra_def_dicts + return DefinitionDict(def_dicts) def save_as_json(self, save_filename): """ Save column metadata to a JSON file. @@ -146,6 +151,26 @@ def load_sidecar_files(self, files): merged_dict.update(loaded_json) return merged_dict + def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None): + """Create a SidecarValidator and validate this sidecar with the schema. + + Parameters: + hed_schema (HedSchema): Input data to be validated. 
+ extra_def_dicts(list or DefinitionDict): extra def dicts in addition to sidecar + name(str): The name to report this sidecar as + error_handler (ErrorHandler): Error context to use. Creates a new one if None + Returns: + issues (list of dict): A list of issues associated with each level in the HED string. + """ + from hed.validator.sidecar_validator import SidecarValidator + + if error_handler is None: + error_handler = ErrorHandler() + + validator = SidecarValidator(hed_schema) + issues = validator.validate(self, extra_def_dicts, name, error_handler=error_handler) + return issues + def _load_json_file(self, fp): """ Load the raw json of a given file @@ -176,8 +201,7 @@ def _generate_single_column(self, column_name, dict_for_entry, column_type=None) hed_dict = dict_for_entry.get("HED") else: hed_dict = None - def_removed_dict, _ = apply_ops(hed_dict, HedString.remove_definitions) - column_entry = ColumnMetadata(column_type, column_name, def_removed_dict) + column_entry = ColumnMetadata(column_type, column_name, hed_dict) return column_entry @staticmethod @@ -211,36 +235,124 @@ def _detect_column_type(dict_for_entry): return ColumnType.Value - def _validate_column_structure(self, column_name, dict_for_entry, error_handler): - """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar. + def extract_definitions(self, hed_schema=None, error_handler=None): + """ Gather and validate definitions in metadata. Parameters: - error_handler (ErrorHandler) Sets the context for the error reporting. Cannot be None. + error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. + hed_schema (HedSchema or None): The schema to used to identify tags. Returns: - list: Issues in performing the operations. Each issue is a dictionary. + DefinitionDict: Contains all the definitions located in the sidecar. 
""" - val_issues = [] - column_type = self._detect_column_type(dict_for_entry=dict_for_entry) - if column_type is None: - val_issues += ErrorHandler.format_error(SidecarErrors.UNKNOWN_COLUMN_TYPE, - column_name=column_name) - elif column_type == ColumnType.Categorical: - raw_hed_dict = dict_for_entry["HED"] - if not raw_hed_dict: - val_issues += ErrorHandler.format_error(SidecarErrors.BLANK_HED_STRING) - if not isinstance(raw_hed_dict, dict): - val_issues += ErrorHandler.format_error(SidecarErrors.WRONG_HED_DATA_TYPE, - given_type=type(raw_hed_dict), - expected_type="dict") - for key_name, hed_string in raw_hed_dict.items(): + if error_handler is None: + error_handler = ErrorHandler() + def_dict = DefinitionDict() + + self._extract_definition_issues = [] + if hed_schema: + for hed_string, column_data, _ in self.hed_string_iter(error_handler): + hed_string_obj = HedString(hed_string, hed_schema) + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, + increment_depth_after=False) + self._extract_definition_issues += def_dict.check_for_definitions(hed_string_obj, error_handler) + error_handler.pop_error_context() + + return def_dict + + def hed_string_iter(self, error_handler=None): + """ Gather and validate definitions in metadata. + + Parameters: + error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. + + Yields: + str: The hed string at a given column and key position. + column_data: the column data for the given string. 
+ position: blackbox(pass back to set this string to a new value) + + """ + if error_handler is None: + error_handler = ErrorHandler() + + for column_data in self.column_data: + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_data.column_name) + hed_dict = column_data.hed_dict + for (hed_string, position) in self._hed_string_iter(hed_dict, error_handler): + yield hed_string, column_data, position + error_handler.pop_error_context() + + @staticmethod + def _hed_string_iter(hed_strings, error_handler): + """ Iterate over the given dict of strings + + Parameters: + hed_strings(dict or str): A hed_string or dict of hed strings + error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. + + Yields: + tuple: + - str: The hed string at a given column and key position. + - str: Indication of the where hed string was loaded from, so it can be later set by the user. + + """ + for hed_string, key_name in Sidecar._hed_iter_low(hed_strings): + if key_name: + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + yield hed_string, key_name + if key_name: + error_handler.pop_error_context() + + @staticmethod + def _hed_iter_low(hed_strings): + """ Iterate over the hed string entries. + + Used by hed_string_iter + + Parameters: + hed_strings(dict or str): A hed_string or dict of hed strings + + Yields: + tuple: + - str: Individual hed strings for different entries. + - str: The position to pass back to set this string. 
+ + """ + if isinstance(hed_strings, dict): + for key, hed_string in hed_strings.items(): if not isinstance(hed_string, str): - error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) - val_issues += ErrorHandler.format_error(SidecarErrors.WRONG_HED_DATA_TYPE, - given_type=type(hed_string), - expected_type="str") - error_handler.pop_error_context() - error_handler.add_context_to_issues(val_issues) - - return val_issues + continue + yield hed_string, key + elif isinstance(hed_strings, str): + yield hed_strings, None + + @staticmethod + def _set_hed_string_low(new_hed_string, hed_strings, position=None): + """ Set a hed string for a category key/etc. + + Parameters: + new_hed_string (str or HedString): The new hed_string to replace the value at position. + hed_strings(dict or str or HedString): The hed strings we want to update + position (str, optional): This should only be a value returned from hed_string_iter. + + Returns: + updated_string (str or dict): The newly updated string/dict. + Raises: + TypeError: If the mapping cannot occur. 
+ + """ + if isinstance(hed_strings, dict): + if position is None: + raise TypeError("Error: Trying to set a category HED string with no category") + if position not in hed_strings: + raise TypeError("Error: Not allowed to add new categories to a column") + hed_strings[position] = str(new_hed_string) + elif isinstance(hed_strings, (str, HedString)): + if position is not None: + raise TypeError("Error: Trying to set a value HED string with a category") + hed_strings = str(new_hed_string) + else: + raise TypeError("Error: Trying to set a HED string on a column_type that doesn't support it.") + + return hed_strings diff --git a/hed/models/sidecar_base.py b/hed/models/sidecar_base.py deleted file mode 100644 index 8b82d3ea3..000000000 --- a/hed/models/sidecar_base.py +++ /dev/null @@ -1,269 +0,0 @@ -import copy -from hed.models.column_metadata import ColumnMetadata -from hed.errors.error_types import ErrorContext -from hed.errors import error_reporter -from hed.errors import ErrorHandler -from hed.models.hed_string import HedString -from hed.models.def_mapper import DefMapper -from hed.models.hed_ops import translate_ops, apply_ops -from hed.models.definition_dict import DefinitionDict -from functools import partial - - -class SidecarBase: - """ Baseclass for specialized spreadsheet sidecars - - To subclass this class, you'll want to override at the minimum: - _hed_string_iter - _set_hed_string - validate_structure - column_data property <- This is the only truly mandatory one - - """ - def __init__(self, name=None, hed_schema=None): - """ Initialize a sidecar baseclass - - Parameters: - name (str or None): Optional name identifying this sidecar, generally a filename. 
- hed_schema(HedSchema or None): The schema to use by default in identifying tags - """ - self.name = name - self._schema = hed_schema - # Expected to be called in subclass after data is loaded - # self.def_dict = self.extract_definitions() - - @property - def column_data(self): - """ Generates the list of ColumnMetadata for this sidecar - - Returns: - list(ColumnMetadata): the list of column metadata defined by this sidecar - """ - return [] - - def _hed_string_iter(self, tag_funcs, error_handler): - """ Low level function to retrieve hed string in sidecar - - Parameters: - tag_funcs(list): A list of functions to apply to returned strings - error_handler(ErrorHandler): Error handler to use for context - - Yields: - tuple: - string(HedString): The retrieved and modified string - position(tuple): The location of this hed string. Black box. - issues(list): A list of issues running the tag_funcs. - """ - yield - - def _set_hed_string(self, new_hed_string, position): - """ Low level function to update hed string in sidecar - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The value returned from hed_string_iter. - """ - return - - def validate_structure(self, error_handler): - """ Validate the raw structure of this sidecar. - - Parameters: - error_handler(ErrorHandler): The error handler to use for error context - - Returns: - issues(list): A list of issues found with the structure - """ - return [] - - def __iter__(self): - """ An iterator to go over the individual column metadata. - - Returns: - iterator: An iterator over the column metadata values. - - """ - return iter(self.column_data) - - def hed_string_iter(self, hed_ops=None, error_handler=None, expand_defs=False, remove_definitions=False, - allow_placeholders=True, extra_def_dicts=None, **kwargs): - """ Iterator over hed strings in columns. 
- - Parameters: - hed_ops (func, HedOps, list): A HedOps, funcs or list of these to apply to the hed strings - before returning - error_handler (ErrorHandler): The error handler to use for context, uses a default one if none. - expand_defs (bool): If True, expand all def tags located in the strings. - remove_definitions (bool): If True, remove all definitions found in the string. - allow_placeholders (bool): If False, placeholders will be marked as validation warnings. - extra_def_dicts (DefinitionDict, list, None): Extra dicts to add to the list. - kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Yields: - tuple: - - HedString: A HedString at a given column and key position. - - tuple: Indicates where hed_string was loaded from so it can be later set by the user - - list: A list of issues found performing ops. Each issue is a dictionary. - - """ - if error_handler is None: - error_handler = ErrorHandler() - hed_ops = self._standardize_ops(hed_ops) - if expand_defs or remove_definitions: - self._add_definition_mapper(hed_ops, extra_def_dicts) - tag_funcs = translate_ops(hed_ops, hed_schema=self._schema, error_handler=error_handler, - expand_defs=expand_defs, allow_placeholders=allow_placeholders, - remove_definitions=remove_definitions, **kwargs) - - return self._hed_string_iter(tag_funcs, error_handler) - - def set_hed_string(self, new_hed_string, position): - """ Set a provided column/category key/etc. - - Parameters: - new_hed_string (str or HedString): The new hed_string to replace the value at position. - position (tuple): The (HedString, str, list) tuple returned from hed_string_iter. - - """ - return self._set_hed_string(new_hed_string, position) - - def _add_definition_mapper(self, hed_ops, extra_def_dicts=None): - """ Add a DefMapper if the hed_ops list doesn't have one. - - Parameters: - hed_ops (list): A list of HedOps - extra_def_dicts (list): DefDicts from outside. 
- - Returns: - DefMapper: A shallow copy of the hed_ops list with a DefMapper added if there wasn't one. - - """ - def_mapper_list = [hed_op for hed_op in hed_ops if isinstance(hed_op, DefMapper)] - - if not def_mapper_list: - def_dicts = self.get_def_dicts(extra_def_dicts) - def_mapper = DefMapper(def_dicts) - hed_ops.append(def_mapper) - return def_mapper - return def_mapper_list[0] - - @staticmethod - def _standardize_ops(hed_ops): - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - return hed_ops.copy() - - def get_def_dicts(self, extra_def_dicts=None): - """ Returns the definition dict for this sidecar. - - Parameters: - extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. - - Returns: - list: A list with the sidecar def_dict plus any found in extra_def_dicts. - - """ - def_dicts = [self.def_dict] - if extra_def_dicts: - if not isinstance(extra_def_dicts, list): - extra_def_dicts = [extra_def_dicts] - def_dicts += extra_def_dicts - return def_dicts - - def validate_entries(self, hed_ops=None, name=None, extra_def_dicts=None, - error_handler=None, **kwargs): - """ Run the given hed_ops on all columns in this sidecar. - - Parameters: - hed_ops (list, func, or HedOps): A HedOps, func or list of these to apply to hed strings in this sidecar. - name (str): If present, will use this as the filename for context, rather than using the actual filename - Useful for temp filenames. - extra_def_dicts (DefinitionDict, list, or None): If present use these in addition to sidecar's def dicts. - error_handler (ErrorHandler or None): Used to report errors. Uses a default one if none passed in. - kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Returns: - list: The list of validation issues found. Individual issues are in the form of a dict. 
- - """ - if error_handler is None: - error_handler = error_reporter.ErrorHandler() - if not name: - name = self.name - if name: - error_handler.push_error_context(ErrorContext.FILE_NAME, name, False) - - all_validation_issues = self.validate_structure(error_handler) - - # Early out major errors so the rest of our code can assume they won't happen. - if all_validation_issues: - return all_validation_issues - - hed_ops = self._standardize_ops(hed_ops) - def_mapper = self._add_definition_mapper(hed_ops, extra_def_dicts) - all_validation_issues += def_mapper.issues - - for hed_string, key_name, issues in self.hed_string_iter(hed_ops=hed_ops, allow_placeholders=True, - error_handler=error_handler, **kwargs): - self.set_hed_string(hed_string, key_name) - all_validation_issues += issues - - # Finally check what requires the final mapped data to check - for column_data in self.column_data: - validate_pound_func = partial(self._validate_pound_sign_count, column_type=column_data.column_type) - _, issues = apply_ops(column_data.hed_dict, validate_pound_func) - all_validation_issues += issues - all_validation_issues += self.def_dict.get_definition_issues() - if name: - error_handler.pop_error_context() - return all_validation_issues - - def extract_definitions(self, hed_schema=None, error_handler=None): - """ Gather and validate definitions in metadata. - - Parameters: - error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. - hed_schema (HedSchema or None): The schema to used to identify tags. - - Returns: - DefinitionDict: Contains all the definitions located in the column. - issues: List of issues encountered in extracting the definitions. Each issue is a dictionary. 
- - """ - if error_handler is None: - error_handler = ErrorHandler() - new_def_dict = DefinitionDict() - hed_ops = [] - hed_ops.append(hed_schema) - hed_ops.append(new_def_dict) - - all_issues = [] - for hed_string, key_name, issues in self.hed_string_iter(hed_ops=hed_ops, allow_placeholders=True, - error_handler=error_handler): - all_issues += issues - - return new_def_dict - - def _validate_pound_sign_count(self, hed_string, column_type): - """ Check if a given hed string in the column has the correct number of pound signs. - - Parameters: - hed_string (str or HedString): HED string to be checked. - - Returns: - list: Issues due to pound sign errors. Each issue is a dictionary. - - Notes: - Normally the number of # should be either 0 or 1, but sometimes will be higher due to the - presence of definition tags. - - """ - # Make a copy without definitions to check placeholder count. - expected_count, error_type = ColumnMetadata.expected_pound_sign_count(column_type) - hed_string_copy = copy.deepcopy(hed_string) - hed_string_copy.remove_definitions() - - if hed_string_copy.lower().count("#") != expected_count: - return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#")) - - return [] diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py index 77a497449..b48f6985f 100644 --- a/hed/models/spreadsheet_input.py +++ b/hed/models/spreadsheet_input.py @@ -1,6 +1,5 @@ from hed.models.column_mapper import ColumnMapper from hed.models.base_input import BaseInput -from hed.models.def_mapper import DefMapper class SpreadsheetInput(BaseInput): @@ -8,7 +7,7 @@ class SpreadsheetInput(BaseInput): def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None, has_column_names=True, column_prefix_dictionary=None, - def_dicts=None, name=None, hed_schema=None): + name=None): """Constructor for the SpreadsheetInput class. 
Parameters: @@ -21,9 +20,7 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N has_column_names (bool): True if file has column names. Validation will skip over the first line of the file if the spreadsheet as column names. column_prefix_dictionary (dict): A dictionary with column number keys and prefix values. - def_dicts (DefinitionDict or list): A DefinitionDict or list of DefDicts containing definitions for this - object other than the ones extracted from the SpreadsheetInput object itself. - hed_schema(HedSchema or None): The schema to use by default in identifying tags + This is partially deprecated - what this now turns the given columns into Value columns. Examples: A prefix dictionary {3: 'Label/', 5: 'Description/'} indicates that column 3 and 5 have HED tags that need to be prefixed by Label/ and Description/ respectively. @@ -38,7 +35,4 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary, warn_on_missing_column=False) - def_mapper = DefMapper(def_dicts) - - super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, def_mapper=def_mapper, - name=name, hed_schema=hed_schema) + super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name) diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index 2b9c2089a..388718fb9 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -1,7 +1,6 @@ from hed.models.column_mapper import ColumnMapper from hed.models.base_input import BaseInput from hed.models.sidecar import Sidecar -from hed.models.def_mapper import DefMapper class TabularInput(BaseInput): @@ -9,64 +8,30 @@ class TabularInput(BaseInput): HED_COLUMN_NAME = "HED" - def __init__(self, file=None, sidecar=None, extra_def_dicts=None, also_gather_defs=True, name=None, - hed_schema=None): + def __init__(self, 
file=None, sidecar=None, name=None): """ Constructor for the TabularInput class. Parameters: file (str or file like): A tsv file to open. sidecar (str or Sidecar): A Sidecar filename or Sidecar - extra_def_dicts ([DefinitionDict], DefinitionDict, or None): DefinitionDict objects containing all - the definitions this file should use other than the ones coming from the file - itself and from the sidecar. These are added as the last entries, so names will override - earlier ones. + Note: If this is a string you MUST also pass hed_schema. name (str): The name to display for this file for error purposes. - hed_schema(HedSchema or None): The schema to use by default in identifying tags """ if sidecar and not isinstance(sidecar, Sidecar): sidecar = Sidecar(sidecar) new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME], warn_on_missing_column=True) - definition_columns = [self.HED_COLUMN_NAME] self._sidecar = sidecar - self._also_gather_defs = also_gather_defs - if extra_def_dicts and not isinstance(extra_def_dicts, list): - extra_def_dicts = [extra_def_dicts] - self._extra_def_dicts = extra_def_dicts - def_mapper = self.create_def_mapper(new_mapper) super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper, - def_mapper=def_mapper, name=name, definition_columns=definition_columns, - allow_blank_names=False, hed_schema=hed_schema) + name=name, allow_blank_names=False, ) if not self._has_column_names: raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n" "This is probably not intended.") - def create_def_mapper(self, column_mapper): - """ Create the definition mapper for this file. - - Parameters: - column_mapper (ColumnMapper): The column mapper to gather definitions from. - - - Returns: - def mapper (DefMapper): A class to validate or expand definitions with the given def dicts. 
- - Notes: - - The extra_def_dicts are definitions not included in the column mapper. - - """ - - def_dicts = column_mapper.get_def_dicts() - if self._extra_def_dicts: - def_dicts += self._extra_def_dicts - def_mapper = DefMapper(def_dicts) - - return def_mapper - def reset_column_mapper(self, sidecar=None): """ Change the sidecars and settings. @@ -76,25 +41,4 @@ def reset_column_mapper(self, sidecar=None): """ new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME]) - self._def_mapper = self.create_def_mapper(new_mapper) self.reset_mapper(new_mapper) - - def validate_sidecar(self, hed_ops=None, error_handler=None, **kwargs): - """ Validate column definitions and hed strings. - - Parameters: - hed_ops (list or HedOps): A list of HedOps of funcs to apply to the hed strings in the sidecars. - error_handler (ErrorHandler or None): Used to report errors. Uses a default one if none passed in. - kwargs: See models.hed_ops.translate_ops or the specific hed_ops for additional options. - - Returns: - list: A list of syntax and semantic issues found in the definitions. Each issue is a dictionary. - - Notes: - - For full validation you should validate the sidecar separately. 
- - """ - if not isinstance(hed_ops, list): - hed_ops = [hed_ops] - hed_ops.append(self._def_mapper) - return self._sidecar.validate_entries(hed_ops, error_handler=error_handler, **kwargs) diff --git a/hed/models/timeseries_input.py b/hed/models/timeseries_input.py index c7ca5c215..0b9cbee18 100644 --- a/hed/models/timeseries_input.py +++ b/hed/models/timeseries_input.py @@ -22,4 +22,4 @@ def __init__(self, file=None, sidecar=None, extra_def_dicts=None, name=None): """ super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=False, mapper=None, - def_mapper=None, name=name) + name=name) diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 10b9aa6cc..84c2accbf 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -62,7 +62,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl if validator: error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name, False) new_issues = validator(hed_schema, tag_entry, tag_entry.attributes[attribute_name]) - error_handler.add_context_to_issues(new_issues) + error_handler.add_context_and_filter(new_issues) issues_list += new_issues error_handler.pop_error_context() error_handler.pop_error_context() diff --git a/hed/validator/__init__.py b/hed/validator/__init__.py index 88b772ca8..4a8b94209 100644 --- a/hed/validator/__init__.py +++ b/hed/validator/__init__.py @@ -2,3 +2,7 @@ from .hed_validator import HedValidator from .tag_validator import TagValidator +from .sidecar_validator import SidecarValidator +from .def_validator import DefValidator +from .onset_validator import OnsetValidator +from .spreadsheet_validator import SpreadsheetValidator \ No newline at end of file diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py new file mode 100644 index 000000000..24a3d8e5b --- /dev/null +++ b/hed/validator/def_validator.py @@ -0,0 +1,78 @@ +from hed.models.hed_string import 
HedString +from hed.models.hed_tag import HedTag +from hed.models.definition_dict import DefinitionDict +from hed.errors.error_types import ValidationErrors +from hed.errors.error_reporter import ErrorHandler + + +class DefValidator(DefinitionDict): + """ Handles validating Def/ and Def-expand/. + + """ + + def __init__(self, def_dicts=None, hed_schema=None): + """ Initialize for definitions in hed strings. + + Parameters: + def_dicts (list or DefinitionDict or str): DefinitionDicts containing the definitions to pass to baseclass + + """ + super().__init__(def_dicts, hed_schema=hed_schema) + + def validate_def_tags(self, hed_string_obj): + """ Validate Def/Def-Expand tags. + + Parameters: + hed_string_obj (HedString): The hed string to process. + + Returns: + list: Issues found related to validating defs. Each issue is a dictionary. + """ + hed_string_lower = hed_string_obj.lower() + if self._label_tag_name not in hed_string_lower: + return [] + + def_issues = [] + # We need to check for labels to expand in ALL groups + for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True): + def_issues += self._validate_def_contents(def_tag, def_expand_group) + + return def_issues + + def _validate_def_contents(self, def_tag, def_expand_group): + """ Check for issues with expanding a tag from Def to a Def-expand tag group + + Parameters: + def_tag (HedTag): Source hed tag that may be a Def or Def-expand tag. + def_expand_group (HedGroup or HedTag): + Source group for this def-expand tag. Same as def_tag if this is not a def-expand tag. 
+ + Returns: + issues + """ + def_issues = [] + + is_label_tag = def_tag.extension_or_value_portion + placeholder = None + found_slash = is_label_tag.find("/") + if found_slash != -1: + placeholder = is_label_tag[found_slash + 1:] + is_label_tag = is_label_tag[:found_slash] + + label_tag_lower = is_label_tag.lower() + def_entry = self.defs.get(label_tag_lower) + if def_entry is None: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=def_tag) + else: + def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) + if def_tag_name: + if def_expand_group is not def_tag and def_expand_group != def_contents: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID, + tag=def_tag, actual_def=def_contents, + found_def=def_expand_group) + elif def_entry.takes_value: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=def_tag) + else: + def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_EXTRA, tag=def_tag) + + return def_issues diff --git a/hed/validator/hed_validator.py b/hed/validator/hed_validator.py index 600d5bb87..c7ce76adf 100644 --- a/hed/validator/hed_validator.py +++ b/hed/validator/hed_validator.py @@ -6,50 +6,86 @@ """ from hed.errors.error_types import ValidationErrors -from hed.errors.error_reporter import ErrorHandler +from hed.errors.error_reporter import ErrorHandler, check_for_any_errors from hed.models.hed_string import HedString from hed.models import HedTag from hed.validator.tag_validator import TagValidator -from functools import partial -from hed.models.hed_ops import HedOps +from hed.validator.def_validator import DefValidator +from hed.validator.onset_validator import OnsetValidator -class HedValidator(HedOps): +class HedValidator: """ Top level validation of HED strings. 
""" - def __init__(self, hed_schema=None, run_semantic_validation=True): + def __init__(self, hed_schema=None, def_dicts=None, run_full_onset_checks=True): """ Constructor for the HedValidator class. Parameters: hed_schema (HedSchema or HedSchemaGroup): HedSchema object to use for validation. - run_semantic_validation (bool): True if the validator should check the HED data against a schema. """ super().__init__() self._tag_validator = None self._hed_schema = hed_schema - self._tag_validator = TagValidator(hed_schema=self._hed_schema, - run_semantic_validation=run_semantic_validation) - self._run_semantic_validation = run_semantic_validation - - def __get_tag_funcs__(self, **kwargs): - string_funcs = [] - allow_placeholders = kwargs.get("allow_placeholders") - check_for_warnings = kwargs.get("check_for_warnings") - string_funcs.append(self._tag_validator.run_hed_string_validators) - string_funcs.append( - partial(HedString.convert_to_canonical_forms, hed_schema=self._hed_schema)) - string_funcs.append(partial(self._validate_individual_tags_in_hed_string, - allow_placeholders=allow_placeholders, - check_for_warnings=check_for_warnings)) - return string_funcs - - def __get_string_funcs__(self, **kwargs): - check_for_warnings = kwargs.get("check_for_warnings") - string_funcs = [partial(self._validate_tags_in_hed_string, check_for_warnings=check_for_warnings), - self._validate_groups_in_hed_string] - return string_funcs + self._tag_validator = TagValidator(hed_schema=self._hed_schema) + self._def_validator = DefValidator(def_dicts, hed_schema) + self._onset_validator = OnsetValidator(def_dict=self._def_validator, + run_full_onset_checks=run_full_onset_checks) + + def validate(self, hed_string, allow_placeholders, error_handler=None): + """ + Validate the string using the schema + + Parameters: + hed_string(HedString): the string to validate + allow_placeholders(bool): allow placeholders in the string + error_handler(ErrorHandler or None): the error handler to use, 
creates a default one if none passed + Returns: + issues (list of dict): A list of issues for hed string + """ + if not error_handler: + error_handler = ErrorHandler() + issues = [] + issues += self.run_basic_checks(hed_string, allow_placeholders=allow_placeholders) + error_handler.add_context_and_filter(issues) + if check_for_any_errors(issues): + return issues + issues += self.run_full_string_checks(hed_string) + error_handler.add_context_and_filter(issues) + return issues + + def run_basic_checks(self, hed_string, allow_placeholders): + issues = [] + issues += self._tag_validator.run_hed_string_validators(hed_string) + if check_for_any_errors(issues): + return issues + if hed_string == "n/a" or not self._hed_schema: + return issues + issues += hed_string.convert_to_canonical_forms(self._hed_schema) + if check_for_any_errors(issues): + return issues + # This is required so it can validate the tag a tag expands into + # e.g. checking units when a definition placeholder has units + self._def_validator.construct_def_tags(hed_string) + issues += self._validate_individual_tags_in_hed_string(hed_string, allow_placeholders=allow_placeholders) + if check_for_any_errors(issues): + return issues + issues += self._def_validator.validate_def_tags(hed_string) + if check_for_any_errors(issues): + return issues + issues += self._onset_validator.validate_onset_offset(hed_string) + if check_for_any_errors(issues): + return issues + return issues + + def run_full_string_checks(self, hed_string): + issues = [] + issues += self._validate_tags_in_hed_string(hed_string) + if check_for_any_errors(issues): + return issues + issues += self._validate_groups_in_hed_string(hed_string) + return issues def _validate_groups_in_hed_string(self, hed_string_obj): """ Report invalid groups at each level. 
@@ -103,26 +139,21 @@ def _check_for_duplicate_groups(self, original_group): self._check_for_duplicate_groups_recursive(sorted_group, validation_issues) return validation_issues - def _validate_tags_in_hed_string(self, hed_string_obj, check_for_warnings=False): - """ Report invalid the multi-tag properties. + def _validate_tags_in_hed_string(self, hed_string_obj): + """ Report invalid the multi-tag properties in a hed string, e.g. required tags.. Parameters: hed_string_obj (HedString): A HedString object. Returns: list: The issues associated with the tags in the HED string. Each issue is a dictionary. - - Notes: - - in a hed string, eg required tags. - - """ + """ validation_issues = [] tags = hed_string_obj.get_all_tags() - validation_issues += self._tag_validator.run_all_tags_validators(tags, check_for_warnings=check_for_warnings) + validation_issues += self._tag_validator.run_all_tags_validators(tags) return validation_issues - def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placeholders=False, - check_for_warnings=False): + def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placeholders=False): """ Validate individual tags in a HED string. Parameters: @@ -139,9 +170,15 @@ def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placehol for group in hed_string_obj.get_all_groups(): is_definition = group in all_def_groups for hed_tag in group.tags(): - validation_issues += \ - self._tag_validator.run_individual_tag_validators(hed_tag, allow_placeholders=allow_placeholders, - check_for_warnings=check_for_warnings, - is_definition=is_definition) + if hed_tag.expandable and not hed_tag.expanded: + for tag in hed_tag.expandable.get_all_tags(): + validation_issues += self._tag_validator. \ + run_individual_tag_validators(tag, allow_placeholders=allow_placeholders, + is_definition=is_definition) + else: + validation_issues += self._tag_validator. 
\ + run_individual_tag_validators(hed_tag, + allow_placeholders=allow_placeholders, + is_definition=is_definition) return validation_issues diff --git a/hed/models/onset_mapper.py b/hed/validator/onset_validator.py similarity index 76% rename from hed/models/onset_mapper.py rename to hed/validator/onset_validator.py index 842ff25a6..942f58efb 100644 --- a/hed/models/onset_mapper.py +++ b/hed/validator/onset_validator.py @@ -2,29 +2,24 @@ from hed.models.hed_group import HedGroup from hed.errors.error_reporter import ErrorHandler from hed.errors.error_types import OnsetErrors -from hed.models.hed_ops import HedOps -class OnsetMapper(HedOps): - """ HedOps responsible for matching onset/offset pairs. """ +class OnsetValidator: + """ Validates onset/offset pairs. """ - def __init__(self, def_mapper): - super().__init__() - self._def_mapper = def_mapper + def __init__(self, def_dict, run_full_onset_checks=True): + self._defs = def_dict self._onsets = {} + self._run_full_onset_checks = run_full_onset_checks - def check_for_onset_offset(self, hed_string_obj): - """ Check for onset or offset and track context. + def validate_onset_offset(self, hed_string_obj): + """ Validate onset/offset Parameters: - hed_string_obj (HedString): The hed string to check. Finds a maximum of one onset tag. + hed_string_obj (HedString): The hed string to check. Returns: list: A list of issues found in validating onsets (i.e., out of order onsets, unknown def names). - - Notes: - - Each issue in the return list is a dictionary. 
- """ onset_issues = [] for found_onset, found_group in self._find_onset_tags(hed_string_obj): @@ -82,28 +77,21 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag): placeholder = def_name[found_slash + 1:] def_name = def_name[:found_slash] - def_entry = self._def_mapper.get_def_entry(def_name) + def_entry = self._defs.get_def_entry(def_name) if def_entry is None: return ErrorHandler.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=def_tag) if bool(def_entry.takes_value) != bool(placeholder): return ErrorHandler.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=def_tag, has_placeholder=bool(def_entry.takes_value)) - if is_onset: - # onset can never fail as it implies an offset - self._onsets[full_def_name.lower()] = full_def_name - else: - if full_def_name.lower() not in self._onsets: - return ErrorHandler.format_error(OnsetErrors.OFFSET_BEFORE_ONSET, tag=def_tag) + if self._run_full_onset_checks: + if is_onset: + # onset can never fail as it implies an offset + self._onsets[full_def_name.lower()] = full_def_name else: - del self._onsets[full_def_name.lower()] - - return [] - - def __get_string_funcs__(self, **kwargs): - string_funcs = [] - string_funcs.append(self.check_for_onset_offset) - return string_funcs + if full_def_name.lower() not in self._onsets: + return ErrorHandler.format_error(OnsetErrors.OFFSET_BEFORE_ONSET, tag=def_tag) + else: + del self._onsets[full_def_name.lower()] - def __get_tag_funcs__(self, **kwargs): return [] diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py new file mode 100644 index 000000000..af12005b1 --- /dev/null +++ b/hed/validator/sidecar_validator.py @@ -0,0 +1,147 @@ +import copy +from hed.errors import ErrorHandler, ErrorContext, SidecarErrors +from hed.models import ColumnType +from hed import HedString +from hed import Sidecar +from hed.models.column_metadata import ColumnMetadata + + +class SidecarValidator: + reserved_column_names = ["HED"] + reserved_category_values = 
["n/a"] + + def __init__(self, hed_schema): + """ + Constructor for the HedValidator class. + + Parameters: + hed_schema (HedSchema): HED schema object to use for validation. + """ + self._schema = hed_schema + + def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None): + """Validate the input data using the schema + + Parameters: + sidecar (Sidecar): Input data to be validated. + extra_def_dicts(list or DefinitionDict): extra def dicts in addition to sidecar + name(str): The name to report this sidecar as + error_handler (ErrorHandler): Error context to use. Creates a new one if None + Returns: + issues (list of dict): A list of issues associated with each level in the HED string. + """ + from hed.validator import HedValidator + issues = [] + if error_handler is None: + error_handler = ErrorHandler() + + error_handler.push_error_context(ErrorContext.FILE_NAME, name) + sidecar_def_dict = sidecar.get_def_dict(hed_schema=self._schema, extra_def_dicts=extra_def_dicts) + hed_validator = HedValidator(self._schema, + def_dicts=sidecar_def_dict, + run_full_onset_checks=False) + + issues += self.validate_structure(sidecar, error_handler=error_handler) + issues += sidecar._extract_definition_issues + issues += sidecar_def_dict.issues + # todo: Add the definition validation. 
+ + for hed_string, column_data, position in sidecar.hed_string_iter(error_handler): + hed_string_obj = HedString(hed_string, hed_schema=self._schema, def_dict=sidecar_def_dict) + + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, + increment_depth_after=False) + new_issues = hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True) + if not new_issues: + new_issues = hed_validator.run_full_string_checks(hed_string_obj) + if not new_issues: + new_issues = self._validate_pound_sign_count(hed_string_obj, column_type=column_data.column_type) + error_handler.add_context_and_filter(new_issues) + issues += new_issues + error_handler.pop_error_context() + + error_handler.pop_error_context() + return issues + + def validate_structure(self, sidecar, error_handler): + """ Validate the raw structure of this sidecar. + + Parameters: + sidecar(Sidecar): the sidecar to validate + error_handler(ErrorHandler): The error handler to use for error context + + Returns: + issues(list): A list of issues found with the structure + """ + all_validation_issues = [] + for column_name, dict_for_entry in sidecar.loaded_dict.items(): + error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) + all_validation_issues += self._validate_column_structure(column_name, dict_for_entry, error_handler) + error_handler.pop_error_context() + return all_validation_issues + + def _validate_column_structure(self, column_name, dict_for_entry, error_handler): + """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar. + + Parameters: + error_handler (ErrorHandler) Sets the context for the error reporting. Cannot be None. + + Returns: + list: Issues in performing the operations. Each issue is a dictionary. 
+ + """ + val_issues = [] + if column_name in self.reserved_column_names: + val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED) + return val_issues + + column_type = Sidecar._detect_column_type(dict_for_entry=dict_for_entry) + if column_type is None: + val_issues += error_handler.format_error_with_context(SidecarErrors.UNKNOWN_COLUMN_TYPE, + column_name=column_name) + elif column_type == ColumnType.Categorical: + raw_hed_dict = dict_for_entry["HED"] + if not raw_hed_dict: + val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING) + if not isinstance(raw_hed_dict, dict): + val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE, + given_type=type(raw_hed_dict), + expected_type="dict") + for key_name, hed_string in raw_hed_dict.items(): + error_handler.push_error_context(ErrorContext.SIDECAR_KEY_NAME, key_name) + if not isinstance(hed_string, str): + val_issues += error_handler.format_error_with_context(SidecarErrors.WRONG_HED_DATA_TYPE, + given_type=type(hed_string), + expected_type="str") + if not hed_string: + val_issues += error_handler.format_error_with_context(SidecarErrors.BLANK_HED_STRING) + if key_name in self.reserved_category_values: + val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_NA_USED, column_name) + error_handler.pop_error_context() + + return val_issues + + def _validate_pound_sign_count(self, hed_string, column_type): + """ Check if a given hed string in the column has the correct number of pound signs. + + Parameters: + hed_string (str or HedString): HED string to be checked. + + Returns: + list: Issues due to pound sign errors. Each issue is a dictionary. + + Notes: + Normally the number of # should be either 0 or 1, but sometimes will be higher due to the + presence of definition tags. + + """ + # Make a copy without definitions to check placeholder count. 
expected_count, error_type = ColumnMetadata.expected_pound_sign_count(column_type) + hed_string_copy = copy.deepcopy(hed_string) + hed_string_copy.remove_definitions() + hed_string_copy.shrink_defs() + + if hed_string_copy.lower().count("#") != expected_count: + return ErrorHandler.format_error(error_type, pound_sign_count=str(hed_string_copy).count("#")) + + return [] diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py new file mode 100644 index 000000000..136b5aa73 --- /dev/null +++ b/hed/validator/spreadsheet_validator.py @@ -0,0 +1,114 @@ +import pandas as pd +from hed import BaseInput +from hed.errors import ErrorHandler, ValidationErrors, ErrorContext +from hed.models import ColumnType +from hed import HedString +from hed.models.hed_string_group import HedStringGroup + +PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " + + +class SpreadsheetValidator: + def __init__(self, hed_schema): + """ + Constructor for the SpreadsheetValidator class. + + Parameters: + hed_schema (HedSchema): HED schema object to use for validation. + """ + self._schema = hed_schema + self._hed_validator = None + + def validate(self, data, def_dicts=None, name=None, error_handler=None): + """ + Validate the input data using the schema + + Parameters: + data (BaseInput or pd.DataFrame): Input data to be validated. + def_dicts(list of DefDict or DefDict): all definitions to use for validation + name(str): The name to report errors from this file as + error_handler (ErrorHandler): Error context to use.
Creates a new one if None + Returns: + issues (list of dict): A list of issues for hed string + """ + from hed.validator import HedValidator + issues = [] + if error_handler is None: + error_handler = ErrorHandler() + + error_handler.push_error_context(ErrorContext.FILE_NAME, name) + self._hed_validator = HedValidator(self._schema, def_dicts=def_dicts) + # Check the structure of the input data, if it's a BaseInput + if isinstance(data, BaseInput): + issues += self._validate_column_structure(data, error_handler) + data = data.dataframe_a + + # Check the rows of the input data + issues += self._run_checks(data, error_handler) + error_handler.pop_error_context() + return issues + + def _run_checks(self, data, error_handler): + issues = [] + for row_number, text_file_row in enumerate(data.itertuples(index=False)): + error_handler.push_error_context(ErrorContext.ROW, row_number) + row_strings = [] + new_column_issues = [] + # todo: make this report the correct column numbers(somehow - it almost surely doesn't right now) + for column_number, cell in enumerate(text_file_row): + if not cell or cell == "n/a": + continue + + error_handler.push_error_context(ErrorContext.COLUMN, column_number) + + column_hed_string = HedString(cell) + row_strings.append(column_hed_string) + error_handler.push_error_context(ErrorContext.HED_STRING, column_hed_string, + increment_depth_after=False) + new_column_issues = self._hed_validator.run_basic_checks(column_hed_string, allow_placeholders=False) + + error_handler.add_context_and_filter(new_column_issues) + error_handler.pop_error_context() + error_handler.pop_error_context() + + issues += new_column_issues + if new_column_issues: + continue + else: + row_string = HedStringGroup(row_strings) + error_handler.push_error_context(ErrorContext.HED_STRING, row_string, increment_depth_after=False) + new_column_issues = self._hed_validator.run_full_string_checks(row_string) + + error_handler.add_context_and_filter(new_column_issues) + 
error_handler.pop_error_context() + issues += new_column_issues + error_handler.pop_error_context() + return issues + + def _validate_column_structure(self, base_input, error_handler): + """ + Validate that each column in the input data has valid values. + + Parameters: + base_input (BaseInput): The input data to be validated. + Returns: + List of issues associated with each invalid value. Each issue is a dictionary. + """ + issues = [] + col_issues = base_input._mapper.get_column_mapping_issues() + error_handler.add_context_and_filter(col_issues) + issues += col_issues + for column in base_input.column_metadata().values(): + if column.column_type == ColumnType.Categorical: + error_handler.push_error_context(ErrorContext.COLUMN, column.column_name) + valid_keys = column.hed_dict.keys() + for row_number, value in enumerate(base_input.dataframe[column.column_name]): + if value != "n/a" and value not in valid_keys: + error_handler.push_error_context(ErrorContext.ROW, row_number) + issues += error_handler.format_error_with_context(ValidationErrors.HED_SIDECAR_KEY_MISSING, + invalid_key=value, + category_keys=list(valid_keys)) + error_handler.pop_error_context() + error_handler.pop_error_context() + + return issues diff --git a/hed/validator/tag_validator.py b/hed/validator/tag_validator.py index 29b5c9f1b..2d08eae62 100644 --- a/hed/validator/tag_validator.py +++ b/hed/validator/tag_validator.py @@ -13,7 +13,7 @@ class TagValidator: """ Validation for individual HED tags. """ - CAMEL_CASE_EXPRESSION = r'([A-Z-]+\s*[a-z-]*)+' + CAMEL_CASE_EXPRESSION = r'([A-Z]+\s*[a-z-]*)+' INVALID_STRING_CHARS = '[]{}~' OPENING_GROUP_CHARACTER = '(' CLOSING_GROUP_CHARACTER = ')' @@ -24,21 +24,17 @@ class TagValidator: # Placeholder characters are checked elsewhere, but by default allowed TAG_ALLOWED_CHARS = "-_/" - def __init__(self, hed_schema=None, run_semantic_validation=True): + def __init__(self, hed_schema=None): """Constructor for the Tag_Validator class. 
Parameters: hed_schema (HedSchema): A HedSchema object. - run_semantic_validation (bool): True if the validator should check the HED data against a schema. Returns: TagValidator: A Tag_Validator object. """ self._hed_schema = hed_schema - self._run_semantic_validation = run_semantic_validation - if not self._hed_schema: - self._run_semantic_validation = False # Dict contains all the value portion validators for value class. e.g. "is this a number?" self._value_unit_validators = self._register_default_value_validators() @@ -67,13 +63,12 @@ def run_hed_string_validators(self, hed_string_obj): validation_issues += self.check_tag_formatting(tag) return validation_issues - def run_individual_tag_validators(self, original_tag, check_for_warnings, allow_placeholders=False, + def run_individual_tag_validators(self, original_tag, allow_placeholders=False, is_definition=False): """ Runs the hed_ops on the individual tags. Parameters: original_tag (HedTag): A original tag. - check_for_warnings (bool): If True, also check for warnings. allow_placeholders (bool): Allow value class or extensions to be placeholders rather than a specific value. is_definition (bool): This tag is part of a Definition, not a normal line. 
@@ -83,10 +78,10 @@ def run_individual_tag_validators(self, original_tag, check_for_warnings, allow_ """ validation_issues = [] validation_issues += self.check_tag_invalid_chars(original_tag, allow_placeholders) - if self._run_semantic_validation: - validation_issues += self.check_tag_exists_in_schema(original_tag, check_for_warnings) + if self._hed_schema: + validation_issues += self.check_tag_exists_in_schema(original_tag) if original_tag.is_unit_class_tag(): - validation_issues += self.check_tag_unit_class_units_are_valid(original_tag, check_for_warnings) + validation_issues += self.check_tag_unit_class_units_are_valid(original_tag) elif original_tag.is_value_class_tag(): validation_issues += self.check_tag_value_class_valid(original_tag) elif original_tag.extension_or_value_portion: @@ -95,8 +90,7 @@ def run_individual_tag_validators(self, original_tag, check_for_warnings, allow_ if not allow_placeholders: validation_issues += self.check_for_placeholder(original_tag, is_definition) validation_issues += self.check_tag_requires_child(original_tag) - if check_for_warnings: - validation_issues += self.check_capitalization(original_tag) + validation_issues += self.check_capitalization(original_tag) return validation_issues def run_tag_level_validators(self, original_tag_list, is_top_level, is_group): @@ -119,12 +113,11 @@ def run_tag_level_validators(self, original_tag_list, is_top_level, is_group): validation_issues += self.check_tag_level_issue(original_tag_list, is_top_level, is_group) return validation_issues - def run_all_tags_validators(self, tags, check_for_warnings): + def run_all_tags_validators(self, tags): """ Validate the multi-tag properties in a hed string. Parameters: tags (list): A list containing the HedTags in a HED string. - check_for_warnings (bool): If True, also check for warnings. Returns: list: The validation issues associated with the tags in a HED string. Each issue is a dictionary. 
@@ -134,9 +127,8 @@ def run_all_tags_validators(self, tags, check_for_warnings): """ validation_issues = [] - if self._run_semantic_validation: - if check_for_warnings: - validation_issues += self.check_for_required_tags(tags) + if self._hed_schema: + validation_issues += self.check_for_required_tags(tags) validation_issues += self.check_multiple_unique_tags_exist(tags) return validation_issues @@ -210,6 +202,9 @@ def check_delimiter_issues_in_hed_string(self, hed_string): current_tag = '' else: issues += ErrorHandler.format_error(ValidationErrors.HED_COMMA_MISSING, tag=current_tag) + elif last_non_empty_valid_character == "," and current_character == self.CLOSING_GROUP_CHARACTER: + issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_EMPTY, source_string=hed_string, + char_index=i) elif TagValidator._comma_is_missing_after_closing_parentheses(last_non_empty_valid_character, current_character): issues += ErrorHandler.format_error(ValidationErrors.HED_COMMA_MISSING, tag=current_tag[:-1]) @@ -252,19 +247,20 @@ def check_tag_invalid_chars(self, original_tag, allow_placeholders): Returns: list: Validation issues. Each issue is a dictionary. """ + validation_issues = self._check_invalid_prefix_issues(original_tag) allowed_chars = self.TAG_ALLOWED_CHARS if not self._hed_schema or not self._hed_schema.is_hed3_schema: allowed_chars += " " if allow_placeholders: allowed_chars += "#" - return self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag) + validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag) + return validation_issues - def check_tag_exists_in_schema(self, original_tag, check_for_warnings=False): + def check_tag_exists_in_schema(self, original_tag): """ Report invalid tag or doesn't take a value. Parameters: original_tag (HedTag): The original tag that is used to report the error. - check_for_warnings (bool): If True, also check for warnings. Returns: list: Validation issues. 
Each issue is a dictionary. @@ -276,18 +272,17 @@ def check_tag_exists_in_schema(self, original_tag, check_for_warnings=False): is_extension_tag = original_tag.is_extension_allowed_tag() if not is_extension_tag: validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_EXTENSION, tag=original_tag) - elif check_for_warnings: + else: validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_EXTENDED, tag=original_tag, index_in_tag=len(original_tag.org_base_tag), index_in_tag_end=None) return validation_issues - def check_tag_unit_class_units_are_valid(self, original_tag, check_for_warnings): + def check_tag_unit_class_units_are_valid(self, original_tag): """ Report incorrect unit class or units. Parameters: original_tag (HedTag): The original tag that is used to report the error. - check_for_warnings (bool): Indicates whether to check for warnings. Returns: list: Validation issues. Each issue is a dictionary. @@ -297,13 +292,12 @@ def check_tag_unit_class_units_are_valid(self, original_tag, check_for_warnings) stripped_value, unit = original_tag.get_stripped_unit_value() if not unit: if self._validate_value_class_portion(original_tag, stripped_value): - if check_for_warnings: - # only suggest a unit is missing if this is a valid number - if tag_validator_util.validate_numeric_value_class(stripped_value): - default_unit = original_tag.get_unit_class_default_unit() - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_UNITS_DEFAULT_USED, - tag=original_tag, - default_unit=default_unit) + # only suggest a unit is missing if this is a valid number + if tag_validator_util.validate_numeric_value_class(stripped_value): + default_unit = original_tag.get_unit_class_default_unit() + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_UNITS_DEFAULT_USED, + tag=original_tag, + default_unit=default_unit) else: tag_unit_class_units = original_tag.get_tag_unit_class_units() if tag_unit_class_units: @@ -412,24 +406,23 @@ 
def check_tag_level_issue(self, original_tag_list, is_top_level, is_group): - Top-level groups can contain definitions, Onset, etc tags. """ validation_issues = [] - if self._run_semantic_validation: - top_level_tags = [tag for tag in original_tag_list if - tag.base_tag_has_attribute(HedKey.TopLevelTagGroup)] - tag_group_tags = [tag for tag in original_tag_list if - tag.base_tag_has_attribute(HedKey.TagGroup)] - for tag_group_tag in tag_group_tags: - if not is_group: - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_GROUP_TAG, - tag=tag_group_tag) - for top_level_tag in top_level_tags: - if not is_top_level: - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, - tag=top_level_tag) - - if is_top_level and len(top_level_tags) > 1: - validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, - tag=top_level_tags[0], - multiple_tags=top_level_tags[1:]) + top_level_tags = [tag for tag in original_tag_list if + tag.base_tag_has_attribute(HedKey.TopLevelTagGroup)] + tag_group_tags = [tag for tag in original_tag_list if + tag.base_tag_has_attribute(HedKey.TagGroup)] + for tag_group_tag in tag_group_tags: + if not is_group: + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TAG_GROUP_TAG, + tag=tag_group_tag) + for top_level_tag in top_level_tags: + if not is_top_level: + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, + tag=top_level_tag) + + if is_top_level and len(top_level_tags) > 1: + validation_issues += ErrorHandler.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, + tag=top_level_tags[0], + multiple_tags=top_level_tags[1:]) return validation_issues @@ -475,6 +468,15 @@ def check_multiple_unique_tags_exist(self, tags): # ========================================================================== # Private utility functions # =========================================================================+ + def 
_check_invalid_prefix_issues(self, original_tag): + """Check for invalid schema prefix.""" + issues = [] + schema_prefix = original_tag.schema_prefix + if schema_prefix and not schema_prefix[:-1].isalpha(): + issues += ErrorHandler.format_error(ValidationErrors.TAG_PREFIX_INVALID, + tag=original_tag, tag_prefix=schema_prefix) + return issues + def _validate_value_class_portion(self, original_tag, portion_to_validate): if portion_to_validate is None: return False diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index f43bc9c86..9c80d4d98 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -1,13 +1,18 @@ import os -import json import unittest -from hed.models import DefinitionDict, DefMapper, OnsetMapper -from hed.models.hed_ops import apply_ops -from hed import load_schema_version -from hed import HedValidator +from hed.models import DefinitionDict + +from hed import load_schema_version, HedString +from hed.validator import HedValidator from hed import Sidecar import io import json +from hed import HedFileError +from hed.errors import ErrorHandler, get_printable_issue_string + + + +skip_tests = ["VERSION_DEPRECATED", "CHARACTER_INVALID", "STYLE_WARNING"] class MyTestCase(unittest.TestCase): @@ -17,94 +22,79 @@ def setUpClass(cls): 'hed-specification/docs/source/_static/data/error_tests')) cls.test_files = [os.path.join(test_dir, f) for f in os.listdir(test_dir) if os.path.isfile(os.path.join(test_dir, f))] - cls.fail_count = 0 + cls.fail_count = [] cls.default_sidecar = Sidecar(os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_sidecar.json'))) - def run_single_test(self, test_file): with open(test_file, "r") as fp: test_info = json.load(fp) for info in test_info: error_code = info['error_code'] - if error_code == "VERSION_DEPRECATED": - print("Skipping VERSION_DEPRECATED test") + if error_code in skip_tests: + print(f"Skipping {error_code} test") continue name = info.get('name', '') description = 
info['description'] schema = info['schema'] + check_for_warnings = info.get("warning", False) + error_handler = ErrorHandler(check_for_warnings) if schema: schema = load_schema_version(schema) else: - schema = None + raise ValueError("Tests always require a schema now") definitions = info['definitions'] - def_dict = DefinitionDict() - _, issues = apply_ops(definitions, [schema, def_dict]) - self.assertFalse(issues) - validator = HedValidator(schema) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) + def_dict = DefinitionDict(definitions, schema) + self.assertFalse(def_dict.issues) for section_name, section in info["tests"].items(): if section_name == "string_tests": - self._run_single_string_test(section, validator, def_mapper, - onset_mapper, error_code, description, name) - elif section_name == "sidecar_tests": - self._run_single_sidecar_test(section, validator, def_mapper, onset_mapper, error_code, description, - name) - elif section_name == "event_tests": - self._run_single_events_test(section, validator, def_mapper, onset_mapper, error_code, description, - name) - - def _run_single_string_test(self, info, validator, def_mapper, onset_mapper, error_code, description, - name): + self._run_single_string_test(section, schema, def_dict, error_code, description, name, error_handler) + if section_name == "sidecar_tests": + self._run_single_sidecar_test(section, schema, def_dict, error_code, description, name, error_handler) + if section_name == "event_tests": + self._run_single_events_test(section, schema, def_dict, error_code, description, name, error_handler) + if section_name == "combo_tests": + self._run_single_combo_test(section, schema, def_dict, error_code, description, name, error_handler) + + def report_result(self, expected_result, issues, error_code, description, name, test, test_type): + if expected_result == "fails": + if not issues: + print(f"{error_code}: {description}") + print(f"Passed '{test_type}' (which should fail) 
'{name}': {test}") + print(get_printable_issue_string(issues)) + self.fail_count.append(name) + else: + if issues: + print(f"{error_code}: {description}") + print(f"Failed '{test_type}' test '{name}': {test}") + print(get_printable_issue_string(issues)) + self.fail_count.append(name) + + def _run_single_string_test(self, info, schema, def_dict, error_code, description, name, error_handler): + string_validator = HedValidator(hed_schema=schema, def_dicts=def_dict, run_full_onset_checks=False) for result, tests in info.items(): for test in tests: - modified_test, issues = apply_ops(test, [validator, def_mapper, onset_mapper], check_for_warnings=True, - expand_defs=True) - if modified_test and modified_test != test: - _, def_expand_issues = apply_ops(modified_test, validator, check_for_warnings=True) - issues += def_expand_issues - if result == "fails": - if not issues: - print(f"{error_code}: {description}") - print(f"Passed this test(that should fail) '{name}': {test}") - print(issues) - self.fail_count += 1 - else: - if issues: - print(f"{error_code}: {description}") - print(f"Failed this test {name}: {test}") - print(issues) - - self.fail_count += 1 - - def _run_single_sidecar_test(self, info, validator, def_mapper, onset_mapper, error_code, description, - name): - for result, tests in info.items(): + test_string = HedString(test, schema) + + # This expand should not be required here. 
+ def_dict.expand_def_tags(test_string) + + issues = string_validator.run_basic_checks(test_string, False) + issues += string_validator.run_full_string_checks(test_string) + error_handler.add_context_and_filter(issues) + self.report_result(result, issues, error_code, description, name, test, "string_test") + def _run_single_sidecar_test(self, info, schema, def_dict, error_code, description, name, error_handler): + for result, tests in info.items(): for test in tests: # Well this is a disaster buffer = io.BytesIO(json.dumps(test).encode("utf-8")) sidecar = Sidecar(buffer) - issues = sidecar.validate_entries([validator, def_mapper, onset_mapper], check_for_warnings=True) - if result == "fails": - if not issues: - print(f"{error_code}: {description}") - print(f"Passed this test(that should fail) '{name}': {test}") - print(issues) - self.fail_count += 1 - else: - if issues: - print(f"{error_code}: {description}") - print(f"Failed this test {name}: {test}") - print(issues) - - self.fail_count += 1 - - def _run_single_events_test(self, info, validator, def_mapper, onset_mapper, error_code, description, - name): + issues = sidecar.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + self.report_result(result, issues, error_code, description, name, test, "sidecar_test") + + def _run_single_events_test(self, info, schema, def_dict, error_code, description,name, error_handler): from hed import TabularInput for result, tests in info.items(): - for test in tests: string = "" for row in test: @@ -120,26 +110,48 @@ def _run_single_events_test(self, info, validator, def_mapper, onset_mapper, err file_obj = io.BytesIO(string.encode("utf-8")) file = TabularInput(file_obj, sidecar=self.default_sidecar) - issues = file.validate_file([validator, def_mapper, onset_mapper], check_for_warnings=True) - if result == "fails": - if not issues: - print(f"{error_code}: {description}") - print(f"Passed this test(that should fail) '{name}': {test}") - 
print(issues) - self.fail_count += 1 - else: - if issues: - print(f"{error_code}: {description}") - print(f"Failed this test {name}: {test}") - print(issues) - - self.fail_count += 1 - - def test_summary(self): + issues = file.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + self.report_result(result, issues, error_code, description, name, test, "events_test") + + def _run_single_combo_test(self, info, schema, def_dict, error_code, description,name, error_handler): + from hed import TabularInput + for result, tests in info.items(): + for test in tests: + buffer = io.BytesIO(json.dumps(test['sidecar']).encode("utf-8")) + sidecar = Sidecar(buffer) + sidecar.loaded_dict.update(self.default_sidecar.loaded_dict) + issues = sidecar.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + string = "" + try: + for row in test['events']: + if not isinstance(row, list): + print(f"Improper grouping in test: {error_code}:{name}") + print(f"Improper data for test {name}: {test}") + print(f"This is probably a missing set of square brackets.") + break + string += "\t".join(str(x) for x in row) + "\n" + + if not string: + print(F"Invalid blank events found in test: {error_code}:{name}") + continue + file_obj = io.BytesIO(string.encode("utf-8")) + + file = TabularInput(file_obj, sidecar=sidecar) + except HedFileError: + print(f"{error_code}: {description}") + print(f"Improper data for test {name}: {test}") + print(f"This is probably a missing set of square brackets.") + continue + issues += file.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) + self.report_result(result, issues, error_code, description, name, test, "combo_tests") + + def test_errors(self): for test_file in self.test_files: self.run_single_test(test_file) - print(f"{self.fail_count} tests got an unexpected result") - self.assertEqual(self.fail_count, 0) + print(f"{len(self.fail_count)} tests got an unexpected result") 
+ print("\n".join(self.fail_count)) + self.assertEqual(len(self.fail_count), 0) if __name__ == '__main__': unittest.main() + diff --git a/tests/data/model_tests/na_tag_column.tsv b/tests/data/model_tests/na_tag_column.tsv new file mode 100644 index 000000000..d42bbb34b --- /dev/null +++ b/tests/data/model_tests/na_tag_column.tsv @@ -0,0 +1,2 @@ +Geometric-object Event +Square diff --git a/tests/data/model_tests/na_value_column.json b/tests/data/model_tests/na_value_column.json new file mode 100644 index 000000000..72a1d0af7 --- /dev/null +++ b/tests/data/model_tests/na_value_column.json @@ -0,0 +1,5 @@ +{ + "Value": { + "HED": "Description/#" + } +} \ No newline at end of file diff --git a/tests/data/model_tests/na_value_column.tsv b/tests/data/model_tests/na_value_column.tsv new file mode 100644 index 000000000..91d00351e --- /dev/null +++ b/tests/data/model_tests/na_value_column.tsv @@ -0,0 +1,3 @@ +HED Value +Geometric-object 1 +Square n/a diff --git a/tests/data/model_tests/no_column_header_definition.tsv b/tests/data/model_tests/no_column_header_definition.tsv index 27c89d11c..418391ef9 100644 --- a/tests/data/model_tests/no_column_header_definition.tsv +++ b/tests/data/model_tests/no_column_header_definition.tsv @@ -1,2 +1,2 @@ -Geometric-object Event, (Definition/DefTest1, (Circle)) -Square Item, Def/DefTest1 +Geometric-object Event +Circle Item,Def/DefTest1 diff --git a/tests/data/model_tests/no_column_header_definition_long.tsv b/tests/data/model_tests/no_column_header_definition_long.tsv index c58990c03..835457f00 100644 --- a/tests/data/model_tests/no_column_header_definition_long.tsv +++ b/tests/data/model_tests/no_column_header_definition_long.tsv @@ -1,2 +1,2 @@ -Item/Object/Geometric-object Event,(Property/Organizational-property/Definition/DefTest1,(InvalidDefTag)) -Item/Object/Geometric-object/2D-shape/Circle Item,Property/Organizational-property/Def/DefTest1 +Item/Object/Geometric-object Event +Item/Object/Geometric-object/2D-shape/Ellipse/Circle 
Item,Property/Organizational-property/Def/DefTest1 diff --git a/tests/data/validator_tests/bids_events_HED.json b/tests/data/validator_tests/bids_events_HED.json index 8cb2d6ba4..4158d47ec 100644 --- a/tests/data/validator_tests/bids_events_HED.json +++ b/tests/data/validator_tests/bids_events_HED.json @@ -8,8 +8,7 @@ "Units": "s" }, "HED": { - "Description": "This is a column to verity the often reserved HED name causes no issues.", + "Description": "This is a column to verify the often reserved HED name does cause issues.", "Units": "s" } - } \ No newline at end of file diff --git a/tests/models/test_base_file_input.py b/tests/models/test_base_file_input.py index 97efc8316..8314072bd 100644 --- a/tests/models/test_base_file_input.py +++ b/tests/models/test_base_file_input.py @@ -3,7 +3,6 @@ import shutil from hed import Sidecar from hed import BaseInput, TabularInput -from hed.models.def_mapper import DefMapper from hed.models.column_mapper import ColumnMapper from hed.models import DefinitionDict from hed import schema @@ -40,32 +39,20 @@ def setUpClass(cls): sidecar1 = Sidecar(json_path, name='face_sub1_json') mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, - name="face_sub1_events", mapper=mapper1, - definition_columns=['HED'], allow_blank_names=False) + name="face_sub1_events", mapper=mapper1, allow_blank_names=False) cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") @classmethod def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) - def test_get_definitions(self): - defs1 = self.input_data1.get_definitions(as_strings=True) - self.assertIsInstance(defs1, dict, "get_definitions returns dictionary when as strings") - self.assertEqual(len(defs1), 17, "get_definitions should have the right number of definitions") - - defs2 = self.input_data1.get_definitions() -
self.assertIsInstance(defs2, DefMapper, "get_definitions returns a DefMapper by default") - - defs3 = self.input_data2.get_definitions(as_strings=False) - self.assertIsInstance(defs3, DefMapper, "get_definitions returns a DefMapper when not as strings") - def test_gathered_defs(self): # todo: add unit tests for definitions in tsv file - defs = DefinitionDict.get_as_strings(self.tabular_file.def_dict) + defs = DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) expected_defs = { 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', - 'jsonfiledef3': '(Item/JsonDef3/#,InvalidTag)', + 'jsonfiledef3': '(Item/JsonDef3/#)', 'takesvaluedef': '(Age/#)', 'valueclassdef': '(Acceleration/#)' } diff --git a/tests/models/test_column_mapper.py b/tests/models/test_column_mapper.py index c2eeea109..78a6b99a9 100644 --- a/tests/models/test_column_mapper.py +++ b/tests/models/test_column_mapper.py @@ -1,8 +1,7 @@ import unittest import os -from hed.models import ColumnMapper, ColumnType, ColumnMetadata, HedString, model_constants -from hed.schema import load_schema +from hed.models import ColumnMapper, ColumnType, HedString from hed.models.sidecar import Sidecar @@ -44,11 +43,6 @@ def setUpClass(cls): cls.short_tag_partial_prefix = 'Language-item/Character/' cls.short_tag_partial_prefix2 = 'Character/' - def test_set_column_prefix_dict(self): - mapper = ColumnMapper() - mapper.set_column_prefix_dict(self.column_prefix_dictionary, True) - self.assertTrue(len(mapper._final_column_map) == 3) - def test_set_tag_columns(self): mapper = ColumnMapper() mapper.set_tag_columns(self.zero_based_tag_columns, finalize_mapping=True) @@ -112,94 +106,12 @@ def test_set_column_map(self): mapper.set_column_map(self.test_column_map) self.assertTrue(len(mapper._final_column_map) >= 1) - def test__set_column_prefix(self): - mapper = ColumnMapper() - mapper._set_column_prefix(mapper._final_column_map, 
self.add_column_number, self.required_prefix) - self.assertTrue(len(mapper._final_column_map) >= 1) - - mapper = ColumnMapper() - with self.assertRaises(TypeError): - mapper._set_column_prefix(mapper._final_column_map, self.add_column_name, self.required_prefix) - def test__finalize_mapping(self): mapper = ColumnMapper() mapper.add_columns([self.add_column_number], ColumnType.Value) mapper._finalize_mapping() self.assertTrue(len(mapper._final_column_map) >= 1) - def test_expand_column(self): - mapper = ColumnMapper() - mapper._set_sidecar(Sidecar(self.basic_events_json)) - mapper.set_column_map(self.basic_column_map) - expanded_column = mapper._expand_column(2, "go") - self.assertTrue(isinstance(expanded_column[0], HedString)) - - def test_expand_row_tags(self): - mapper = ColumnMapper() - mapper._set_sidecar(Sidecar(self.basic_events_json)) - mapper.add_columns(self.basic_hed_tags_column) - mapper.set_column_map(self.basic_column_map) - expanded_row = mapper.expand_row_tags(self.basic_event_row) - self.assertTrue(isinstance(expanded_row, dict)) - self.assertTrue(0 in expanded_row[model_constants.COLUMN_TO_HED_TAGS]) - - def test_expansion_issues(self): - mapper = ColumnMapper() - mapper._set_sidecar(Sidecar(self.basic_events_json)) - mapper.add_columns(self.basic_hed_tags_column) - mapper.set_column_map(self.basic_column_map) - expanded_row = mapper.expand_row_tags(self.basic_event_row_invalid) - column_issues = expanded_row[model_constants.COLUMN_ISSUES][2] - self.assertEqual(len(column_issues), 1) - self.assertTrue(0 in expanded_row[model_constants.COLUMN_TO_HED_TAGS]) - - def test_remove_prefix_if_needed(self): - mapper = ColumnMapper() - mapper.set_column_prefix_dict({self.add_column_number: self.required_prefix}) - remove_prefix_func = mapper.get_prefix_remove_func(self.add_column_number) - test_string_obj = HedString(self.complex_hed_tag_required_prefix) - no_prefix_string = test_string_obj.get_as_form("org_tag", remove_prefix_func) - 
self.assertEqual(str(no_prefix_string), str(self.complex_hed_tag_no_prefix)) - - def test__prepend_prefix_to_required_tag_column_if_needed(self): - category_tags = HedString('Participant response, Stimulus') - ColumnMetadata._prepend_required_prefix(category_tags, self.category_key) - self.assertIsInstance(category_tags, HedString) - self.assertEqual(str(category_tags), str(self.category_participant_and_stimulus_tags)) - - # Verify reading/writing a short tag to a file column with a name_prefix works - def test_add_prefix_verify_short_tag_conversion(self): - schema_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), self.schema_file) - hed_schema = load_schema(schema_file) - hed_string_obj = HedString(self.short_tag_with_missing_prefix) - ColumnMetadata._prepend_required_prefix(hed_string_obj, self.short_tag_key) - issues = hed_string_obj.convert_to_canonical_forms(hed_schema) - self.assertFalse(issues) - for tag in hed_string_obj.get_all_tags(): - self.assertEqual("Character/D", tag.short_tag) - - def test_add_prefix_verify_short_tag_read(self): - column_mapper = ColumnMapper(column_prefix_dictionary={0: self.short_tag_key}) - test_strings = { - 'test_no_prefix': self.short_tag_with_missing_prefix, - 'test_full_prefix': self.short_tag_key + self.short_tag_with_missing_prefix, - 'test_partial_prefix1': self.short_tag_partial_prefix + self.short_tag_with_missing_prefix, - 'test_partial_prefix2': self.short_tag_partial_prefix2 + self.short_tag_with_missing_prefix, - } - expected_results = { - 'test_no_prefix': self.short_tag_key + self.short_tag_with_missing_prefix, - 'test_full_prefix': self.short_tag_key + self.short_tag_with_missing_prefix, - 'test_partial_prefix1': self.short_tag_partial_prefix + self.short_tag_with_missing_prefix, - 'test_partial_prefix2': self.short_tag_partial_prefix2 + self.short_tag_with_missing_prefix, - } - - for test_key in test_strings: - test_string = test_strings[test_key] - expected_result = expected_results[test_key] - - 
expanded_row = column_mapper.expand_row_tags([test_string]) - prepended_hed_string = expanded_row[model_constants.COLUMN_TO_HED_TAGS][0] - self.assertEqual(expected_result, str(prepended_hed_string)) if __name__ == '__main__': diff --git a/tests/models/test_def_mapper.py b/tests/models/test_def_mapper.py deleted file mode 100644 index 4f38c88da..000000000 --- a/tests/models/test_def_mapper.py +++ /dev/null @@ -1,292 +0,0 @@ -import unittest -import os - -from hed import schema -from hed.models import DefinitionDict, DefMapper, HedString -from hed.validator import HedValidator -from hed.errors import ErrorHandler, ErrorContext - - -class Test(unittest.TestCase): - basic_hed_string_with_def_first_paren = None - - @classmethod - def setUpClass(cls): - cls.base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') - hed_xml_file = os.path.realpath(os.path.join(cls.base_data_dir, "schema_tests/HED8.0.0t.xml")) - cls.hed_schema = schema.load_schema(hed_xml_file) - cls.def_contents_string = "(Item/TestDef1,Item/TestDef2)" - cls.basic_definition_string = f"(Definition/TestDef,{cls.def_contents_string})" - cls.basic_definition_string_no_paren = f"Definition/TestDef,{cls.def_contents_string}" - cls.label_def_string = "Def/TestDef" - cls.expanded_def_string = f"(Def-expand/TestDef,{cls.def_contents_string})" - cls.basic_hed_string = "Item/BasicTestTag1,Item/BasicTestTag2" - cls.basic_hed_string_with_def = f"{cls.basic_hed_string},{cls.label_def_string}" - cls.basic_hed_string_with_def_first = f"{cls.label_def_string},{cls.basic_hed_string}" - cls.basic_hed_string_with_def_first_paren = f"({cls.label_def_string},{cls.basic_hed_string})" - cls.placeholder_label_def_string = "Def/TestDefPlaceholder/2471" - cls.placeholder_definition_contents = "(Item/TestDef1/#,Item/TestDef2)" - cls.placeholder_definition_string = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents})" - cls.placeholder_definition_string_no_paren = \ - 
f"Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents}" - cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2))" - - cls.placeholder_hed_string_with_def = f"{cls.basic_hed_string},{cls.placeholder_label_def_string}" - cls.placeholder_hed_string_with_def_first = f"{cls.placeholder_label_def_string},{cls.basic_hed_string}" - cls.placeholder_hed_string_with_def_first_paren = f"({cls.placeholder_label_def_string},{cls.basic_hed_string})" - - cls.valid_definition_strings = { - 'str_no_defs': False, - 'str2': True, - 'str3': False, - 'str4': False, - 'str5': False, - 'str6': False, - 'str7': False, - } - cls.mark_all_as_valid_strings = { - 'str_no_defs': False, - 'str2': False, - 'str3': False, - 'str4': False, - 'str5': False, - 'str6': False, - 'str7': False, - } - - def base_def_validator(self, test_strings, result_strings, valid_strings, expand_defs, shrink_defs, - remove_definitions, extra_ops=None, - basic_definition_string=None): - if not basic_definition_string: - basic_definition_string = self.basic_definition_string - def_dict = DefinitionDict() - def_string = HedString(basic_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - - def_mapper = DefMapper(def_dict) - hed_ops = [] - if extra_ops: - hed_ops += extra_ops - hed_ops.append(def_mapper) - - for key in test_strings: - string, expected_result, invalid = test_strings[key], result_strings[key], valid_strings[key] - test_string = HedString(string) - def_issues = test_string.validate(hed_ops, expand_defs=expand_defs, shrink_defs=shrink_defs, - remove_definitions=remove_definitions) - self.assertEqual(invalid, bool(def_issues)) - self.assertEqual(test_string.get_as_short(), expected_result) - - def test_expand_def_tags(self): - basic_def_strings = { - 'str_no_defs': self.basic_definition_string, - 'str2': self.basic_definition_string_no_paren, - 'str3': self.basic_hed_string + "," 
+ self.basic_definition_string, - 'str4': self.basic_definition_string + "," + self.basic_hed_string, - 'str5': self.basic_hed_string_with_def, - 'str6': self.basic_hed_string_with_def_first, - 'str7': self.basic_hed_string_with_def_first_paren, - } - expanded_def_strings = { - 'str_no_defs': "", - 'str2': self.basic_definition_string_no_paren, - 'str3': self.basic_hed_string, - 'str4': self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.expanded_def_string, - 'str6': self.expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.expanded_def_string + "," + self.basic_hed_string + ")" - } - expanded_def_strings_with_definition = { - 'str_no_defs': self.basic_definition_string, - 'str2': self.basic_definition_string_no_paren, - 'str3': self.basic_hed_string + "," + self.basic_definition_string, - 'str4': self.basic_definition_string + "," + self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.expanded_def_string, - 'str6': self.expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.expanded_def_string + "," + self.basic_hed_string + ")" - } - - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, - self.mark_all_as_valid_strings, expand_defs=True, - shrink_defs=False, remove_definitions=False) - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=False, remove_definitions=False) - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=True, remove_definitions=False) - self.base_def_validator(expanded_def_strings_with_definition, basic_def_strings, - self.mark_all_as_valid_strings, expand_defs=False, shrink_defs=True, - remove_definitions=False) - self.base_def_validator(expanded_def_strings_with_definition, expanded_def_strings_with_definition, - self.mark_all_as_valid_strings, expand_defs=True, shrink_defs=False, - 
remove_definitions=False) - self.base_def_validator(basic_def_strings, expanded_def_strings, self.mark_all_as_valid_strings, - expand_defs=True, shrink_defs=False, remove_definitions=True) - - validator = HedValidator(self.hed_schema) - extra_ops = [validator] - - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, - self.valid_definition_strings, expand_defs=True, shrink_defs=False, - extra_ops=extra_ops, remove_definitions=False) - - # special case test - def test_changing_tag_then_def_mapping(self): - def_dict = DefinitionDict() - def_string = HedString(self.basic_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - validator = HedValidator(self.hed_schema) - hed_ops = [validator, def_mapper] - - test_string = HedString(self.label_def_string) - tag = test_string.children[0] - tag.tag = "Organizational-property/" + str(tag) - def_issues = test_string.validate(hed_ops, expand_defs=True) - self.assertFalse(def_issues) - self.assertEqual(test_string.get_as_short(), f"{self.expanded_def_string}") - - test_string = HedString(self.label_def_string) - tag = test_string.children[0] - tag.tag = "Organizational-property22/" + str(tag) - def_issues = test_string.validate(hed_ops, expand_defs=True) - self.assertTrue(def_issues) - - def test_expand_def_tags_placeholder(self): - basic_def_strings = { - 'str_no_defs': self.placeholder_definition_string, - 'str2': self.placeholder_definition_string_no_paren, - 'str3': self.basic_hed_string + "," + self.placeholder_definition_string, - 'str4': self.placeholder_definition_string + "," + self.basic_hed_string, - 'str5': self.placeholder_hed_string_with_def, - 'str6': self.placeholder_hed_string_with_def_first, - 'str7': self.placeholder_hed_string_with_def_first_paren, - } - expanded_def_strings = { - 'str_no_defs': "", - 'str2': self.placeholder_definition_string_no_paren, - 'str3': self.basic_hed_string, - 
'str4': self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.placeholder_expanded_def_string, - 'str6': self.placeholder_expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.placeholder_expanded_def_string + "," + self.basic_hed_string + ")", - } - expanded_def_strings_with_definition = { - 'str_no_defs': self.placeholder_definition_string, - 'str2': self.placeholder_definition_string_no_paren, - 'str3': self.basic_hed_string + "," + self.placeholder_definition_string, - 'str4': self.placeholder_definition_string + "," + self.basic_hed_string, - 'str5': self.basic_hed_string + "," + self.placeholder_expanded_def_string, - 'str6': self.placeholder_expanded_def_string + "," + self.basic_hed_string, - 'str7': "(" + self.placeholder_expanded_def_string + "," + self.basic_hed_string + ")", - } - - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, self.mark_all_as_valid_strings, - expand_defs=True, shrink_defs=False, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=False, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(basic_def_strings, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=True, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(expanded_def_strings_with_definition, basic_def_strings, self.mark_all_as_valid_strings, - expand_defs=False, shrink_defs=True, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string) - - self.base_def_validator(basic_def_strings, expanded_def_strings, self.mark_all_as_valid_strings, - expand_defs=True, shrink_defs=False, - remove_definitions=True, basic_definition_string=self.placeholder_definition_string) 
- - validator = HedValidator(self.hed_schema) - extra_ops = [validator] - self.base_def_validator(basic_def_strings, expanded_def_strings_with_definition, self.valid_definition_strings, - expand_defs=True, shrink_defs=False, - remove_definitions=False, basic_definition_string=self.placeholder_definition_string, - extra_ops=extra_ops) - - def test_expand_def_tags_placeholder_invalid(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - placeholder_label_def_string_no_placeholder = "def/TestDefPlaceholder" - - test_string = HedString(placeholder_label_def_string_no_placeholder) - test_string.convert_to_canonical_forms(None) - def_issues = def_mapper.expand_def_tags(test_string) - self.assertEqual(str(test_string), placeholder_label_def_string_no_placeholder) - self.assertTrue(def_issues) - - def_dict = DefinitionDict() - def_string = HedString(self.basic_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - label_def_string_has_invalid_placeholder = "def/TestDef/54687" - - test_string = HedString(label_def_string_has_invalid_placeholder) - test_string.convert_to_canonical_forms(None) - def_issues = def_mapper.expand_def_tags(test_string) - self.assertEqual(str(test_string), label_def_string_has_invalid_placeholder) - self.assertTrue(def_issues) - - def test_bad_def_expand(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - valid_placeholder = HedString(self.placeholder_expanded_def_string) - def_issues = valid_placeholder.validate(def_mapper) - self.assertFalse(def_issues) - - invalid_placeholder = 
HedString("(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/21,Item/TestDef2))") - def_issues = invalid_placeholder.validate(def_mapper) - self.assertTrue(bool(def_issues)) - - def test_def_no_content(self): - def_dict = DefinitionDict() - def_string = HedString("(Definition/EmptyDef)") - def_string.convert_to_canonical_forms(None) - def_dict.check_for_definitions(def_string) - def_mapper = DefMapper(def_dict) - - valid_empty = HedString("Def/EmptyDef") - def_issues = valid_empty.validate(def_mapper, expand_defs=True) - self.assertEqual(str(valid_empty), "(Def-expand/EmptyDef)") - self.assertFalse(def_issues) - - valid_empty = HedString("Def/EmptyDef") - def_issues = valid_empty.validate(def_mapper, expand_defs=False) - self.assertFalse(def_issues) - - def test_duplicate_def(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.convert_to_canonical_forms(None) - error_handler = ErrorHandler() - error_handler.push_error_context(ErrorContext.ROW, 5) - def_dict.check_for_definitions(def_string, error_handler=error_handler) - def_mapper = DefMapper([]) - self.assertEqual(len(def_mapper.issues), 0) - - def_mapper = DefMapper([def_dict, def_dict]) - self.assertEqual(len(def_mapper.issues), 1) - self.assertTrue('ec_row' in def_mapper.issues[0]) - - def_mapper = DefMapper([def_dict, def_dict, def_dict]) - self.assertEqual(len(def_mapper.issues), 2) - self.assertTrue('ec_row' in def_mapper.issues[0]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/models/test_definition_dict.py b/tests/models/test_definition_dict.py index a463e60a0..ee03122aa 100644 --- a/tests/models/test_definition_dict.py +++ b/tests/models/test_definition_dict.py @@ -3,14 +3,18 @@ from hed.errors import ErrorHandler, DefinitionErrors from hed.models.hed_string import HedString from hed import HedTag +from hed import load_schema_version class TestDefBase(unittest.TestCase): + @classmethod + def setUpClass(cls): + 
cls.hed_schema = load_schema_version("8.0.0") + def check_def_base(self, test_strings, expected_issues): for test_key in test_strings: def_dict = DefinitionDict() - hed_string_obj = HedString(test_strings[test_key]) - hed_string_obj.convert_to_canonical_forms(None) + hed_string_obj = HedString(test_strings[test_key], self.hed_schema) test_issues = def_dict.check_for_definitions(hed_string_obj) expected_issue = expected_issues[test_key] # print(test_issues) @@ -33,16 +37,16 @@ class TestDefinitionDict(TestDefBase): def test_check_for_definitions(self): def_dict = DefinitionDict() original_def_count = len(def_dict.defs) - hed_string_obj = HedString(self.basic_definition_string) - hed_string_obj.validate(def_dict) + hed_string_obj = HedString(self.placeholder_def_string, hed_schema=self.hed_schema) + def_dict.check_for_definitions(hed_string_obj) new_def_count = len(def_dict.defs) self.assertGreater(new_def_count, original_def_count) def test_check_for_definitions_placeholder(self): def_dict = DefinitionDict() original_def_count = len(def_dict.defs) - hed_string_obj = HedString(self.placeholder_def_string) - hed_string_obj.validate(def_dict) + hed_string_obj = HedString(self.placeholder_def_string, hed_schema=self.hed_schema) + def_dict.check_for_definitions(hed_string_obj) new_def_count = len(def_dict.defs) self.assertGreater(new_def_count, original_def_count) @@ -99,6 +103,26 @@ def test_definitions(self): self.check_def_base(test_strings, expected_results) + def test_expand_defs(self): + test_strings = { + 1: "Def/TestDefPlaceholder/2471,Event", + 2: "Event,(Def/TestDefPlaceholder/2471,Event)", + 3: "Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + } + + expected_results = { + 1: "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event", + 2: "Event,((Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event)", + # this one shouldn't change as it doesn't have a parent + 3: 
"Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + } + def_dict = DefinitionDict() + definition_string = "(Definition/TestDefPlaceholder/#,(Item/TestDef1/#,Item/TestDef2))" + def_dict.check_for_definitions(HedString(definition_string, hed_schema=self.hed_schema)) + for key, test_string in test_strings.items(): + hed_string = HedString(test_string, hed_schema=self.hed_schema) + def_dict.expand_def_tags(hed_string) + self.assertEqual(str(hed_string), expected_results[key]) if __name__ == '__main__': unittest.main() diff --git a/tests/models/test_expression_parser.py b/tests/models/test_expression_parser.py index 7a7ee020d..2066e4e2a 100644 --- a/tests/models/test_expression_parser.py +++ b/tests/models/test_expression_parser.py @@ -4,6 +4,14 @@ from hed.models.expression_parser import QueryParser import os from hed import schema +from hed import HedTag + + +def tag_terms(self): + if isinstance(self, HedTag): + if self._schema_entry: + return self._tag_terms + return (str(self).lower(),) class TestParser(unittest.TestCase): @@ -14,6 +22,9 @@ def setUpClass(cls): hed_xml_file = os.path.join(base_data_dir, "schema_tests/HED8.0.0t.xml") cls.hed_schema = schema.load_schema(hed_xml_file) + HedTag._tag_terms = HedTag.tag_terms + HedTag.tag_terms = property(tag_terms) + def base_test(self, parse_expr, search_strings): expression = QueryParser(parse_expr) diff --git a/tests/models/test_hed_string.py b/tests/models/test_hed_string.py index 894668d5e..af17878bb 100644 --- a/tests/models/test_hed_string.py +++ b/tests/models/test_hed_string.py @@ -1,5 +1,6 @@ from hed.models import HedString import unittest +from hed import load_schema_version class TestHedStrings(unittest.TestCase): @@ -170,3 +171,29 @@ def test_split_hed_string(self): } self.compare_split_results(test_strings, expected_results) + +class TestHedStringShrinkDefs(unittest.TestCase): + hed_schema = load_schema_version("8.0.0") + + def test_shrink_defs(self): + test_strings = { + 1: 
"(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event", + 2: "Event, ((Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2)),Event)", + # this one shouldn't change as it doesn't have a parent + 3: "Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + # This one is an obviously invalid def, but still shrinks + 4: "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2), ThisDefIsInvalid),Event", + } + + expected_results = { + 1: "Def/TestDefPlaceholder/2471,Event", + 2: "Event,(Def/TestDefPlaceholder/2471,Event)", + 3: "Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2),Event", + 4: "Def/TestDefPlaceholder/2471,Event", + } + + for key, test_string in test_strings.items(): + hed_string = HedString(test_string, hed_schema=self.hed_schema) + hed_string.shrink_defs() + self.assertEqual(str(hed_string), expected_results[key]) + diff --git a/tests/models/test_hed_tag.py b/tests/models/test_hed_tag.py index 39daeec83..9eba272eb 100644 --- a/tests/models/test_hed_tag.py +++ b/tests/models/test_hed_tag.py @@ -153,30 +153,4 @@ def test_determine_allows_extensions(self): self.assertEqual(extension_tag1_result, True) self.assertEqual(no_extension_tag1_result, False) self.assertEqual(no_extension_tag2_result, False) - self.assertEqual(no_extension_tag3_result, False) - - def test_finding_tags_no_schema(self): - # Verify basic tag identification works. 
- tag = HedTag("Onset") - tag.convert_to_canonical_forms(hed_schema=None) - self.assertTrue(tag._schema_entry) - - tag2 = HedTag("OtherFolders/Onset") - tag2.convert_to_canonical_forms(hed_schema=None) - self.assertTrue(tag2._schema_entry) - - tag4 = HedTag("OtherFolders/Onset/Extension") - tag4.convert_to_canonical_forms(hed_schema=None) - self.assertTrue(tag4._schema_entry) - - tag3 = HedTag("OtherFolders/Onset-NotOnset") - tag3.convert_to_canonical_forms(hed_schema=None) - self.assertFalse(tag3._schema_entry) - - tag = HedTag("Onset") - tag.convert_to_canonical_forms(hed_schema=self.hed_schema) - self.assertTrue(tag._schema_entry) - - tag2 = HedTag("Property/Data-property/Data-marker/Temporal-marker/Onset") - tag2.convert_to_canonical_forms(hed_schema=self.hed_schema) - self.assertTrue(tag._schema_entry) + self.assertEqual(no_extension_tag3_result, False) \ No newline at end of file diff --git a/tests/models/test_sidecar.py b/tests/models/test_sidecar.py index 14f5ff68a..1925745ae 100644 --- a/tests/models/test_sidecar.py +++ b/tests/models/test_sidecar.py @@ -8,6 +8,7 @@ from hed.validator import HedValidator from hed import schema from hed.models import DefinitionDict +from hed.errors import ErrorHandler class Test(unittest.TestCase): @@ -80,35 +81,28 @@ def test__iter__(self): self.assertEqual(columns_target, columns_count) def test_validate_column_group(self): - validator = HedValidator(hed_schema=None) - # validation_issues = self.json_def_sidecar.validate_entries(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 0) - # - # validation_issues = self.default_sidecar.validate_entries(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 0) + validation_issues = self.errors_sidecar.validate(self.hed_schema) + self.assertEqual(len(validation_issues), 22) - validation_issues = self.errors_sidecar.validate_entries(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 4) + 
validation_issues2 = self.errors_sidecar_minor.validate(self.hed_schema) + self.assertEqual(len(validation_issues2), 18) - validation_issues2 = self.errors_sidecar_minor.validate_entries(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues2), 10) + validation_issues = self.json_without_definitions_sidecar.validate(self.hed_schema) + self.assertEqual(len(validation_issues), 8) - validation_issues = self.json_without_definitions_sidecar.validate_entries(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 1) - - hed_string = HedString("(Definition/JsonFileDef/#, (Item/JsonDef1/#,Item/JsonDef1))") + hed_string = HedString("(Definition/JsonFileDef/#, (Item/JsonDef1/#,Item/JsonDef1))", self.hed_schema) extra_def_dict = DefinitionDict() - hed_string.validate(extra_def_dict) + extra_def_dict.check_for_definitions(hed_string) - validation_issues = self.json_without_definitions_sidecar.validate_entries(validator, check_for_warnings=True, - extra_def_dicts=extra_def_dict) - self.assertEqual(len(validation_issues), 0) + validation_issues2 = self.json_without_definitions_sidecar.validate(self.hed_schema, extra_def_dicts=extra_def_dict) + # this removes one undef matched error and adds two extended tag warnings + self.assertEqual(len(validation_issues2), 9) def test_duplicate_def(self): sidecar = self.json_def_sidecar - def_dicts = sidecar.get_def_dicts() - issues = sidecar.validate_entries(extra_def_dicts=def_dicts) + duplicate_dict = sidecar.extract_definitions(hed_schema=self.hed_schema) + issues = sidecar.validate(self.hed_schema, extra_def_dicts=duplicate_dict, error_handler=ErrorHandler(False)) self.assertEqual(len(issues), 5) self.assertTrue(issues[0]['code'], ValidationErrors.HED_DEFINITION_INVALID) @@ -120,7 +114,7 @@ def test_save_load(self): reloaded_sidecar = Sidecar(save_filename) for str1, str2 in zip(sidecar.hed_string_iter(), reloaded_sidecar.hed_string_iter()): - self.assertEqual(str1, str2) + 
self.assertEqual(str1[0], str2[0]) def test_save_load2(self): sidecar = Sidecar(self.json_def_filename) @@ -129,7 +123,7 @@ def test_save_load2(self): reloaded_sidecar = Sidecar(io.StringIO(json_string)) for str1, str2 in zip(sidecar.hed_string_iter(), reloaded_sidecar.hed_string_iter()): - self.assertEqual(str1, str2) + self.assertEqual(str1[0], str2[0]) def test_merged_sidecar(self): base_folder = self.base_data_dir + "sidecar_tests/" diff --git a/tests/models/test_spreadsheet_input.py b/tests/models/test_spreadsheet_input.py index feac77f35..9fc8f5827 100644 --- a/tests/models/test_spreadsheet_input.py +++ b/tests/models/test_spreadsheet_input.py @@ -51,20 +51,13 @@ def test_all(self): file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name, tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary) - for column_to_hed_tags in file_input: - break_here = 3 + self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame)) + self.assertTrue(isinstance(file_input.series_a, pd.Series)) + self.assertTrue(file_input.dataframe_a.size) # Just make sure this didn't crash for now self.assertTrue(True) - def test_get_row_hed_tags(self): - row_dict = self.generic_file_input._mapper.expand_row_tags(self.row_with_hed_tags) - column_to_hed_tags_dictionary = row_dict[model_constants.COLUMN_TO_HED_TAGS] - # self.assertIsInstance(hed_string, HedString) - # self.assertTrue(hed_string) - self.assertIsInstance(column_to_hed_tags_dictionary, dict) - self.assertTrue(column_to_hed_tags_dictionary) - def test_file_as_string(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/bids_events_no_index.tsv') @@ -72,15 +65,14 @@ def test_file_as_string(self): json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/validator_tests/bids_events.json") sidecar = Sidecar(json_path) - self.assertEqual(len(sidecar.validate_entries(expand_defs=True)), 0) + 
self.assertEqual(len(sidecar.validate(self.hed_schema)), 0) input_file = TabularInput(events_path, sidecar=sidecar) with open(events_path) as file: events_file_as_string = io.StringIO(file.read()) input_file_from_string = TabularInput(file=events_file_as_string, sidecar=sidecar) - for column_dict, column_dict in zip(input_file, input_file_from_string): - self.assertEqual(column_dict, column_dict) + self.assertTrue(input_file._dataframe.equals(input_file_from_string._dataframe)) def test_bad_file_inputs(self): self.assertRaises(HedFileError, TabularInput, None) @@ -115,7 +107,7 @@ def test_to_excel_should_work(self): column_prefix_dictionary={1: 'Label/', 3: 'Description/'}, name='ExcelOneSheet.xlsx') buffer = io.BytesIO() - spreadsheet.to_excel(buffer, output_processed_file=True) + spreadsheet.to_excel(buffer, output_assembled=True) buffer.seek(0) v = buffer.getvalue() self.assertGreater(len(v), 0, "It should have a length greater than 0") @@ -145,23 +137,13 @@ def test_loading_and_reset_mapper(self): json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/validator_tests/bids_events.json") sidecar = Sidecar(json_path) - self.assertEqual(len(sidecar.validate_entries()), 0) + self.assertEqual(len(sidecar.validate(self.hed_schema)), 0) input_file_1 = TabularInput(events_path, sidecar=sidecar) input_file_2 = TabularInput(events_path, sidecar=sidecar) input_file_2.reset_column_mapper() - for (row_number, row_dict), (row_number2, row_dict2) in \ - zip(enumerate(input_file_1.iter_dataframe(return_string_only=False)), - enumerate(input_file_2.iter_dataframe(return_string_only=False))): - self.assertEqual(row_number, row_number2, - f"TabularInput should have row {row_number} equal to {row_number2} after reset") - column_dict = row_dict["column_to_hed_tags"] - self.assertTrue(len(column_dict) == 5, - f"The column dictionary for row {row_number} should have the right length") - column_dict2 = row_dict2["column_to_hed_tags"] - 
self.assertTrue(len(column_dict2) == 0, - f"The reset column dictionary for row {row_number2} should have the right length") + self.assertTrue(input_file_1.dataframe.equals(input_file_2.dataframe)) def test_no_column_header_and_convert(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), @@ -172,18 +154,7 @@ def test_no_column_header_and_convert(self): events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_long.tsv') hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) - - events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) - events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header_long.tsv') - hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - hed_input_long.convert_to_short(self.hed_schema) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) + self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe)) def test_convert_short_long_with_definitions(self): # Verify behavior works as expected even if definitions are present @@ -195,37 +166,17 @@ def test_convert_short_long_with_definitions(self): events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition_long.tsv') hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) - - def test_convert_short_long_with_definitions_new_style(self): - # Verify behavior works as expected even if 
definitions are present - events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2], - hed_schema=self.hed_schema) - hed_input.convert_to_long() - - events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/no_column_header_definition_long.tsv') - hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in zip(hed_input, hed_input_long): - self.assertEqual(column1, column2) + self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe)) def test_definitions_identified(self): + # Todo ian: this test is no longer relevant events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2], - hed_schema=self.hed_schema) - def_entry = hed_input.def_dict['deftest1'] - tag = def_entry.contents.tags()[0] - self.assertTrue(tag._schema_entry) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) - def_entry = hed_input.def_dict['deftest1'] - tag = def_entry.contents.tags()[0] - self.assertFalse(tag._schema_entry) + def test_loading_dataframe_directly(self): ds_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), @@ -236,9 +187,22 @@ def test_loading_dataframe_directly(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) - for column1, column2 in 
zip(hed_input, hed_input2): - self.assertEqual(column1, column2) + self.assertTrue(hed_input._dataframe.equals(hed_input2._dataframe)) + def test_ignoring_na_column(self): + events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/model_tests/na_tag_column.tsv') + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + self.assertTrue(hed_input.dataframe_a.loc[1, 1] == 'n/a') + + def test_ignoring_na_value_column(self): + from hed import TabularInput + events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/model_tests/na_value_column.tsv') + sidecar_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/model_tests/na_value_column.json') + hed_input = TabularInput(events_path, sidecar=sidecar_path) + self.assertTrue(hed_input.dataframe_a.loc[1, 'Value'] == 'n/a') if __name__ == '__main__': unittest.main() diff --git a/tests/models/test_tabular_input.py b/tests/models/test_tabular_input.py index f514ef5ff..d306582fb 100644 --- a/tests/models/test_tabular_input.py +++ b/tests/models/test_tabular_input.py @@ -4,8 +4,8 @@ from hed.models import DefinitionEntry, Sidecar, TabularInput from hed import schema -from hed.validator import HedValidator from hed.errors import HedFileError +from hed.errors import ErrorHandler class Test(unittest.TestCase): @@ -32,38 +32,17 @@ def setUpClass(cls): def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) - def test_get_definitions(self): - input_data = TabularInput(self.events_path, sidecar=self.sidecar1, name="face_sub1_events") - defs1 = input_data.get_definitions().gathered_defs - self.assertIsInstance(defs1, dict, "get_definitions returns dictionary by default") - self.assertEqual(len(defs1), 17, "get_definitions should have the right number of definitions") - for key, value in defs1.items(): - self.assertIsInstance(key, str, "get_definitions dictionary keys should be strings") - self.assertIsInstance(value, 
DefinitionEntry, - "get_definitions dict values should be strings when as strings") - defs2 = input_data.get_definitions(as_strings=False).gathered_defs - self.assertIsInstance(defs2, dict, "get_definitions returns dictionary by when not as strings") - self.assertEqual(len(defs2), 17, "get_definitions should have the right number of definitions when not strings") - for key, value in defs2.items(): - self.assertIsInstance(key, str, "get_definitions dictionary keys should be strings") - self.assertIsInstance(value, DefinitionEntry, - "get_definitions dictionary values should be strings when as strings") - self.assertIsInstance(defs2, dict, "get_definitions returns DefinitionDict when not as strings") - def test_missing_column_name_issue(self): events_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/bids_events_bad_column_name.tsv')) json_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/validator_tests/bids_events.json")) - validator = HedValidator(hed_schema=self.hed_schema) - sidecar = Sidecar(json_path, hed_schema=self.hed_schema) - issues = sidecar.validate_entries(validator) + sidecar = Sidecar(json_path) + issues = sidecar.validate(self.hed_schema) self.assertEqual(len(issues), 0) - input_file = TabularInput(events_path, sidecar=sidecar, hed_schema=self.hed_schema) + input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator, check_for_warnings=True) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(len(validation_issues), 1) def test_expand_column_issues(self): @@ -71,16 +50,12 @@ def test_expand_column_issues(self): '../data/validator_tests/bids_events_bad_category_key.tsv') json_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 
"../data/validator_tests/bids_events.json") - validator = HedValidator(hed_schema=self.hed_schema) - sidecar = Sidecar(json_path, hed_schema=self.hed_schema) - issues = sidecar.validate_entries(validator) + sidecar = Sidecar(json_path) + issues = sidecar.validate(hed_schema=self.hed_schema) self.assertEqual(len(issues), 0) - input_file = TabularInput(events_path, sidecar=sidecar, hed_schema=self.hed_schema) + input_file = TabularInput(events_path, sidecar=sidecar) - # Fix whatever is wrong with onset tag here. It's thinking Description/Onset continues is an invalid tag???' - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator, check_for_warnings=True) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(len(validation_issues), 1) def test_blank_and_duplicate_columns(self): @@ -98,16 +73,14 @@ def test_blank_and_duplicate_columns(self): # _ = TabularInput(filepath) def test_validate_file_warnings(self): - validator = HedValidator(hed_schema=self.hed_schema) - issues1 = self.sidecar1.validate_entries(validator, check_for_warnings=True) + issues1 = self.sidecar1.validate(hed_schema=self.hed_schema) input_file1 = TabularInput(self.events_path, sidecar=self.sidecar1) - issues1a = input_file1.validate_file(validator, check_for_warnings=True) + issues1a = input_file1.validate(hed_schema=self.hed_schema) - issues2 = self.sidecar2.validate_entries(validator, check_for_warnings=False) + issues2 = self.sidecar1.validate(hed_schema=self.hed_schema, error_handler=ErrorHandler(False)) input_file2 = TabularInput(self.events_path, sidecar=self.sidecar2) - issues2a = input_file2.validate_file(validator, check_for_warnings=False) - # TODO: Currently does not correctly check for warnings. 
- + issues2a = input_file2.validate(hed_schema=self.hed_schema, error_handler=ErrorHandler(False)) + breakHere = 3 if __name__ == '__main__': unittest.main() diff --git a/tests/schema/test_convert_tags.py b/tests/schema/test_convert_tags.py index 50e30af45..ebfa134a1 100644 --- a/tests/schema/test_convert_tags.py +++ b/tests/schema/test_convert_tags.py @@ -25,7 +25,7 @@ def converter_base(self, test_strings, expected_results, expected_errors, conver expected_issue = self.format_errors_fully(error_handler, hed_string=test_string_obj, params=expected_params) - error_handler.add_context_to_issues(test_issues) + error_handler.add_context_and_filter(test_issues) # print(test_key) # print(expected_issue) diff --git a/tests/validator/test_def_validator.py b/tests/validator/test_def_validator.py new file mode 100644 index 000000000..f889b36f1 --- /dev/null +++ b/tests/validator/test_def_validator.py @@ -0,0 +1,119 @@ +import unittest +import os + +from hed import schema +from hed.models import DefinitionDict, HedString +from hed.validator import DefValidator +from hed.errors import ErrorHandler, ErrorContext + + +class Test(unittest.TestCase): + basic_hed_string_with_def_first_paren = None + + @classmethod + def setUpClass(cls): + cls.base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') + hed_xml_file = os.path.realpath(os.path.join(cls.base_data_dir, "schema_tests/HED8.0.0t.xml")) + cls.hed_schema = schema.load_schema(hed_xml_file) + cls.def_contents_string = "(Item/TestDef1,Item/TestDef2)" + cls.basic_definition_string = f"(Definition/TestDef,{cls.def_contents_string})" + cls.basic_definition_string_no_paren = f"Definition/TestDef,{cls.def_contents_string}" + + cls.placeholder_definition_contents = "(Item/TestDef1/#,Item/TestDef2)" + cls.placeholder_definition_string = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents})" + cls.placeholder_definition_string_no_paren = \ + 
f"Definition/TestDefPlaceholder/#,{cls.placeholder_definition_contents}" + + + + cls.label_def_string = "Def/TestDef" + cls.expanded_def_string = f"(Def-expand/TestDef,{cls.def_contents_string})" + cls.basic_hed_string = "Item/BasicTestTag1,Item/BasicTestTag2" + cls.basic_hed_string_with_def = f"{cls.basic_hed_string},{cls.label_def_string}" + cls.basic_hed_string_with_def_first = f"{cls.label_def_string},{cls.basic_hed_string}" + cls.basic_hed_string_with_def_first_paren = f"({cls.label_def_string},{cls.basic_hed_string})" + cls.placeholder_label_def_string = "Def/TestDefPlaceholder/2471" + + cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2))" + + cls.placeholder_hed_string_with_def = f"{cls.basic_hed_string},{cls.placeholder_label_def_string}" + cls.placeholder_hed_string_with_def_first = f"{cls.placeholder_label_def_string},{cls.basic_hed_string}" + cls.placeholder_hed_string_with_def_first_paren = f"({cls.placeholder_label_def_string},{cls.basic_hed_string})" + + + def test_expand_def_tags_placeholder_invalid(self): + def_validator = DefValidator() + def_string = HedString(self.placeholder_definition_string, self.hed_schema) + def_validator.check_for_definitions(def_string) + + placeholder_label_def_string_no_placeholder = "Def/TestDefPlaceholder" + + test_string = HedString(placeholder_label_def_string_no_placeholder, self.hed_schema) + def_issues = def_validator.validate_def_tags(test_string) + def_issues += def_validator.expand_def_tags(test_string) + self.assertEqual(str(test_string), placeholder_label_def_string_no_placeholder) + self.assertTrue(def_issues) + + def_validator = DefValidator() + def_string = HedString(self.basic_definition_string, self.hed_schema) + def_validator.check_for_definitions(def_string) + + label_def_string_has_invalid_placeholder = "Def/TestDef/54687" + + def_validator = DefValidator() + def_string = HedString(self.basic_definition_string, self.hed_schema) + 
def_validator.check_for_definitions(def_string) + + test_string = HedString(label_def_string_has_invalid_placeholder, self.hed_schema) + def_issues = def_validator.validate_def_tags(test_string) + def_issues += def_validator.expand_def_tags(test_string) + self.assertEqual(str(test_string), label_def_string_has_invalid_placeholder) + self.assertTrue(def_issues) + + + def test_bad_def_expand(self): + def_validator = DefValidator() + def_string = HedString(self.placeholder_definition_string, self.hed_schema) + def_validator.check_for_definitions(def_string) + + valid_placeholder = HedString(self.placeholder_expanded_def_string, self.hed_schema) + def_issues = def_validator.validate_def_tags(valid_placeholder) + self.assertFalse(def_issues) + + invalid_placeholder = HedString("(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/21,Item/TestDef2))", self.hed_schema) + def_issues = def_validator.validate_def_tags(invalid_placeholder) + self.assertTrue(bool(def_issues)) + + + def test_def_no_content(self): + + def_validator = DefValidator() + def_string = HedString("(Definition/EmptyDef)", self.hed_schema) + def_validator.check_for_definitions(def_string) + + valid_empty = HedString("Def/EmptyDef", self.hed_schema) + def_issues = def_validator.validate_def_tags(valid_empty) + def_issues += def_validator.expand_def_tags(valid_empty) + self.assertEqual(str(valid_empty), "(Def-expand/EmptyDef)") + self.assertFalse(def_issues) + + valid_empty = HedString("Def/EmptyDef", self.hed_schema) + def_issues = def_validator.validate_def_tags(valid_empty) + self.assertFalse(def_issues) + + def test_duplicate_def(self): + def_dict = DefinitionDict() + def_string = HedString(self.placeholder_definition_string, self.hed_schema) + error_handler = ErrorHandler() + error_handler.push_error_context(ErrorContext.ROW, 5) + def_dict.check_for_definitions(def_string, error_handler=error_handler) + self.assertEqual(len(def_dict.issues), 0) + + def_validator = DefValidator([def_dict, def_dict]) + 
self.assertEqual(len(def_validator.issues), 1) + self.assertTrue('ec_row' in def_validator.issues[0]) + + def_dict = DefinitionDict([def_dict, def_dict, def_dict]) + self.assertEqual(len(def_dict.issues), 2) + self.assertTrue('ec_row' in def_dict.issues[0]) + diff --git a/tests/validator/test_hed_validator.py b/tests/validator/test_hed_validator.py index 6c9cb74e4..a523e33c3 100644 --- a/tests/validator/test_hed_validator.py +++ b/tests/validator/test_hed_validator.py @@ -4,10 +4,10 @@ # from hed import from hed.errors import ErrorContext from hed import schema -from hed.models import DefMapper, HedString, SpreadsheetInput, TabularInput, Sidecar -from hed.validator import HedValidator - +from hed.models import HedString, SpreadsheetInput, TabularInput, Sidecar +from hed.validator import HedValidator, DefValidator +# todo: redo all this so we class Test(unittest.TestCase): @classmethod def setUpClass(cls): @@ -33,31 +33,29 @@ def setUpClass(cls): def test__validate_input(self): test_string_obj = HedString(self.base_hed_input) - validation_issues = test_string_obj.validate(self.hed_validator) + validation_issues = test_string_obj.validate(self.hed_schema) self.assertIsInstance(validation_issues, list) name = "DummyDisplayFilename.txt" - validation_issues = self.hed_file_with_errors.validate_file(self.hed_validator, name=name) + validation_issues = self.hed_file_with_errors.validate(self.hed_schema, name=name) self.assertIsInstance(validation_issues, list) self.assertTrue(name in validation_issues[0][ErrorContext.FILE_NAME]) def test__validate_input_major_errors(self): name = "DummyDisplayFilename.txt" - validation_issues = self.hed_file_with_major_errors.validate_file(self.hed_validator, name=name) + validation_issues = self.hed_file_with_major_errors.validate(self.hed_schema, name=name) self.assertIsInstance(validation_issues, list) self.assertTrue(name in validation_issues[0][ErrorContext.FILE_NAME]) def test__validate_input_major_errors_columns(self): name = 
"DummyDisplayFilename.txt" - validation_issues = self.hed_file_with_major_errors.validate_file(self.hed_validator, - check_for_warnings=True, name=name) + validation_issues = self.hed_file_with_major_errors.validate(self.hed_schema, name=name) self.assertIsInstance(validation_issues, list) self.assertTrue(name in validation_issues[0][ErrorContext.FILE_NAME]) def test__validate_input_major_errors_multi_column(self): - validation_issues = self.hed_file_with_major_errors_multi_column.validate_file(self.hed_validator, - check_for_warnings=True) + validation_issues = self.hed_file_with_major_errors_multi_column.validate(self.hed_schema) self.assertIsInstance(validation_issues, list) self.assertEqual(len(validation_issues), 2) @@ -66,15 +64,12 @@ def test_complex_file_validation_no_index(self): '../data/validator_tests/bids_events_no_index.tsv')) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events.json')) - validator = HedValidator(hed_schema=self.hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(validator) + issues = sidecar.validate(self.hed_schema) self.assertEqual(len(issues), 0) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(self.hed_schema) self.assertEqual(len(validation_issues), 0) def test_complex_file_validation_with_index(self): @@ -84,15 +79,12 @@ def test_complex_file_validation_with_index(self): # hed_schema = schema.load_schema(schema_path) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events.json')) - validator = HedValidator(hed_schema=self.hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(validator) + issues = sidecar.validate(hed_schema=self.hed_schema) self.assertEqual(len(issues), 
0) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(len(validation_issues), 0) def test_complex_file_validation_invalid(self): @@ -104,17 +96,13 @@ def test_complex_file_validation_invalid(self): hed_schema = schema.load_schema(schema_path) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events_bad_defs.json')) - validator = HedValidator(hed_schema=hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(hed_ops=validator, check_for_warnings=True) + issues = sidecar.validate(hed_schema) self.assertEqual(len(issues), 4) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 4) - - validation_issues = input_file.validate_file(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 42) + validation_issues = input_file.validate(hed_schema) + self.assertEqual(len(validation_issues), 63) def test_complex_file_validation_invalid_definitions_removed(self): # This verifies definitions are being removed from sidecar strings before being added, or it will produce @@ -128,14 +116,12 @@ def test_complex_file_validation_invalid_definitions_removed(self): json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events_bad_defs2.json')) sidecar = Sidecar(json_path) + issues = sidecar.validate(hed_schema) + self.assertEqual(len(issues), 4) input_file = TabularInput(events_path, sidecar=sidecar) - validator = HedValidator(hed_schema=hed_schema) - validation_issues1 = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues1), 4) - - 
validation_issues = input_file.validate_file(validator) - self.assertEqual(len(validation_issues), 21) + validation_issues = input_file.validate(hed_schema) + self.assertEqual(len(validation_issues), 42) def test_file_bad_defs_in_spreadsheet(self): schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), @@ -150,9 +136,8 @@ def test_file_bad_defs_in_spreadsheet(self): column_prefix_dictionary=prefixed_needed_tag_columns, worksheet_name='LKT Events') - validator = HedValidator(hed_schema=hed_schema) - validation_issues = loaded_file.validate_file(validator, check_for_warnings=True) - self.assertEqual(len(validation_issues), 4) + validation_issues = loaded_file.validate(hed_schema=hed_schema) + self.assertEqual(len(validation_issues), 2) def test_tabular_input_with_HED_col_in_json(self): schema_path = os.path.realpath(os.path.join(os.path.dirname(__file__), @@ -163,28 +148,20 @@ def test_tabular_input_with_HED_col_in_json(self): hed_schema = schema.load_schema(schema_path) json_path = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/validator_tests/bids_events_HED.json')) - validator = HedValidator(hed_schema=hed_schema) sidecar = Sidecar(json_path) - issues = sidecar.validate_entries(validator) - self.assertEqual(len(issues), 0) + issues = sidecar.validate(hed_schema) + self.assertEqual(len(issues), 1) input_file = TabularInput(events_path, sidecar=sidecar) - validation_issues = input_file.validate_sidecar(validator) - self.assertEqual(len(validation_issues), 0) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(hed_schema) self.assertEqual(len(validation_issues), 1) def test_error_spans_from_file_and_missing_required_column(self): - schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.mediawiki') events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/tag_error_span_test.tsv') - hed_schema = 
schema.load_schema(schema_path) - input_file = SpreadsheetInput(events_path, tag_columns=[0, 1, "error"]) - validator = HedValidator(hed_schema=hed_schema) - validation_issues = input_file.validate_file(validator) + validation_issues = input_file.validate(hed_schema=self.hed_schema) self.assertEqual(validation_issues[1]['char_index'], 6) self.assertEqual(validation_issues[2]['char_index'], 6) self.assertEqual(len(validation_issues), 3) @@ -201,28 +178,15 @@ def test_org_tag_missing(self): source_span = test_string_obj._get_org_span(HedTag("Event")) self.assertEqual(source_span, (None, None)) - def test_def_mapping_single_line(self): - schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.mediawiki') - hed_schema = schema.load_schema(schema_path) - validator = HedValidator(hed_schema=hed_schema) - def_mapper = DefMapper() - string_with_def = \ - '(Definition/TestDefPlaceholder/#,(Item/TestDef1/#,Item/TestDef2)), def/TestDefPlaceholder/2471' - test_string = HedString(string_with_def) - issues = test_string.validate([validator, def_mapper], check_for_definitions=True) - self.assertEqual(len(issues), 0) def test_duplicate_group_in_definition(self): schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/schema_tests/HED8.0.0.mediawiki') hed_schema = schema.load_schema(schema_path) - validator = HedValidator(hed_schema=hed_schema) - def_mapper = DefMapper() string_with_def = \ - '(Definition/TestDef,(Item/TestDef1,Item/TestDef1))' - test_string = HedString(string_with_def) - issues = test_string.validate([validator, def_mapper], check_for_definitions=False) + '(Definition/TestDef,(Item,Item))' + test_string = HedString(string_with_def, hed_schema) + issues = test_string.validate(hed_schema) self.assertEqual(len(issues), 1) diff --git a/tests/models/test_onset_mapper.py b/tests/validator/test_onset_validator.py similarity index 57% rename from tests/models/test_onset_mapper.py rename to 
tests/validator/test_onset_validator.py index a88a45f8f..1bc814f33 100644 --- a/tests/models/test_onset_mapper.py +++ b/tests/validator/test_onset_validator.py @@ -1,10 +1,11 @@ +import copy import unittest import os from hed.errors import ErrorHandler, OnsetErrors, ErrorContext, ValidationErrors -from hed.models import DefMapper, HedString, OnsetMapper, DefinitionDict +from hed.models import HedString, DefinitionDict from hed import schema -from hed.validator import HedValidator +from hed.validator import HedValidator, OnsetValidator from tests.validator.test_tag_validator_base import TestHedBase @@ -16,53 +17,66 @@ def setUpClass(cls): cls.base_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') hed_xml_file = os.path.join(cls.base_data_dir, "schema_tests/HED8.0.0.mediawiki") cls.hed_schema = schema.load_schema(hed_xml_file) - cls.placeholder_label_def_string = "def/TestDefPlaceholder/2471" - cls.placeholder_def_contents = "(Item/TestDef1/#,Item/TestDef2)" + cls.placeholder_label_def_string = "Def/TestDefPlaceholder/2471" + cls.placeholder_def_contents = "(Action/TestDef1/#,Action/TestDef2)" cls.placeholder_definition_string = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_def_contents})" - cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Item/TestDef1/2471,Item/TestDef2))" + cls.placeholder_expanded_def_string = "(Def-expand/TestDefPlaceholder/2471,(Action/TestDef1/2471,Action/TestDef2))" - cls.label_def_string = "def/TestDefNormal" - cls.def_contents = "(Item/TestDef1,Item/TestDef2)" + cls.label_def_string = "Def/TestDefNormal" + cls.def_contents = "(Action/TestDef1,Action/TestDef2)" cls.definition_string = f"(Definition/TestDefNormal,{cls.def_contents})" - cls.expanded_def_string = "(Def-expand/TestDefNormal,(Item/TestDef1/2471,Item/TestDef2))" + cls.expanded_def_string = "(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2))" - cls.placeholder_label_def_string2 = 
"def/TestDefPlaceholder/123" - cls.placeholder_def_contents2 = "(Item/TestDef1/#,Item/TestDef2)" + cls.placeholder_label_def_string2 = "Def/TestDefPlaceholder/123" + cls.placeholder_def_contents2 = "(Action/TestDef1/#,Action/TestDef2)" cls.placeholder_definition_string2 = f"(Definition/TestDefPlaceholder/#,{cls.placeholder_def_contents2})" - cls.placeholder_expanded_def_string2 = "(Def-expand/TestDefPlaceholder/123,(Item/TestDef1/123,Item/TestDef2))" + cls.placeholder_expanded_def_string2 = "(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2))" - def _test_issues_base(self, test_strings, test_issues, test_context, hed_ops, expand_defs=True): + cls.def_dict_placeholder = DefinitionDict() + def_string = HedString(cls.placeholder_definition_string, hed_schema=cls.hed_schema) + cls.def_dict_placeholder.check_for_definitions(def_string) + cls.def_dict_both = copy.deepcopy(cls.def_dict_placeholder) + def_string = HedString(cls.definition_string, hed_schema=cls.hed_schema) + cls.def_dict_both.check_for_definitions(def_string) + + + def _test_issues_base(self, test_strings, test_issues, test_context, placeholder_def_only): + if placeholder_def_only: + validator = OnsetValidator(self.def_dict_placeholder) + else: + validator = OnsetValidator(self.def_dict_both) for string, expected_params, context in zip(test_strings, test_issues, test_context): - test_string = HedString(string) + test_string = HedString(string, self.hed_schema) error_handler = ErrorHandler() error_handler.push_error_context(ErrorContext.HED_STRING, test_string, increment_depth_after=False) - onset_issues = test_string.validate(hed_ops, expand_defs=expand_defs) + + onset_issues = [] + onset_issues += validator.validate_onset_offset(test_string) + + error_handler.add_context_and_filter(onset_issues) + test_string.shrink_defs() issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - # print(str(onset_issues)) - # print(str(issues)) + 
print(str(onset_issues)) + print(str(issues)) error_handler.pop_error_context() - self.assertEqual(len(hed_ops[-1]._onsets), context) + self.assertEqual(len(validator._onsets), context) self.assertCountEqual(onset_issues, issues) - def _test_issues_no_context(self, test_strings, test_issues, hed_ops): + def _test_issues_no_context(self, test_strings, test_issues): + hed_validator = HedValidator(self.hed_schema, self.def_dict_both) for string, expected_params in zip(test_strings, test_issues): test_string = HedString(string) - error_handler = ErrorHandler() + error_handler = ErrorHandler(check_for_warnings=False) error_handler.push_error_context(ErrorContext.HED_STRING, test_string, increment_depth_after=False) - onset_issues = test_string.validate(hed_ops, expand_defs=True) + onset_issues = hed_validator.validate(test_string, False) + error_handler.add_context_and_filter(onset_issues) issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - # print(str(onset_issues)) - # print(str(issues)) + print(str(onset_issues)) + print(str(issues)) error_handler.pop_error_context() self.assertCountEqual(onset_issues, issues) def test_basic_onset_errors(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset)", f"({self.placeholder_label_def_string},Offset)", @@ -70,9 +84,9 @@ def test_basic_onset_errors(self): f"({self.placeholder_label_def_string}, Onset, (Event), (Event))", f"({self.placeholder_label_def_string}, Onset, (Event))", "(Onset)", - f"({self.placeholder_label_def_string}, def/InvalidDef, Onset, (Event))", - "(def/TestDefInvalid, Onset)", - "(def/TestDefPlaceholder, Onset)", + f"({self.placeholder_label_def_string}, Def/InvalidDef, Onset, (Event))", + "(Def/TestDefInvalid, Onset)", + "(Def/TestDefPlaceholder, 
Onset)", f"({self.placeholder_label_def_string}, Offset, (Event))" ] # count of how many onset names are in the mapper after the line is run @@ -94,26 +108,19 @@ def test_basic_onset_errors(self): [], self.format_error(OnsetErrors.OFFSET_BEFORE_ONSET, tag=0), self.format_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, tag=0, - tag_list=['def/TestDefPlaceholder/2471', 'Onset', '(Event)', '(Event)']), + tag_list=['Def/TestDefPlaceholder/2471', 'Onset', '(Event)', '(Event)']), [], self.format_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, tag=0), - self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['def/InvalidDef']), + self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['Def/InvalidDef']), self.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=0), self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=True), self.format_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, tag=0, tag_list=[self.placeholder_label_def_string, 'Offset', '(Event)']), ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=True) def test_basic_onset_errors_with_def_mapper(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - hed_ops = [def_mapper, onset_mapper] - test_strings = [ f"({self.placeholder_label_def_string},Onset)", f"({self.placeholder_label_def_string},Offset)", @@ -121,9 +128,9 @@ def test_basic_onset_errors_with_def_mapper(self): f"({self.placeholder_label_def_string}, Onset, (Event), (Event))", f"({self.placeholder_label_def_string}, Onset, (Event))", "(Onset)", - f"({self.placeholder_label_def_string}, def/TestDefPlaceholder/2, Onset, (Event))", - "(def/TestDefInvalid, Onset)", - "(def/TestDefPlaceholder, Onset)", + f"({self.placeholder_label_def_string}, 
Def/TestDefPlaceholder/2, Onset, (Event))", + "(Def/TestDefInvalid, Onset)", + "(Def/TestDefPlaceholder, Onset)", f"({self.placeholder_label_def_string}, Offset, (Event))" ] # count of how many onset names are in the mapper after the line is run @@ -149,24 +156,16 @@ def test_basic_onset_errors_with_def_mapper(self): [], self.format_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, tag=0), self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, - tag_list=['def/TestDefPlaceholder/2']), - self.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=0), - self.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=0), + tag_list=['Def/TestDefPlaceholder/2']), + self.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=0), + self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=True), self.format_error(OnsetErrors.ONSET_WRONG_NUMBER_GROUPS, tag=0, tag_list=[self.placeholder_label_def_string, 'Offset', '(Event)']), ] - self._test_issues_base(test_strings, test_issues, expected_context, hed_ops, expand_defs=False) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=True) def test_basic_onset_errors_expanded(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_expanded_def_string},Onset)", f"({self.placeholder_expanded_def_string},Offset)", @@ -174,10 +173,10 @@ def test_basic_onset_errors_expanded(self): f"({self.placeholder_expanded_def_string}, Onset, (Event), (Event))", f"({self.placeholder_expanded_def_string}, Onset, (Event))", "(Onset)", - f"({self.placeholder_expanded_def_string}, def/InvalidDef, Onset, (Event))", - "(def/TestDefInvalid, Onset)", - "(def/TestDefPlaceholder, Onset)", - "(def/TestDefNormal/InvalidPlaceholder, Onset)" + 
f"({self.placeholder_expanded_def_string}, Def/InvalidDef, Onset, (Event))", + "(Def/TestDefInvalid, Onset)", + "(Def/TestDefPlaceholder, Onset)", + "(Def/TestDefNormal/InvalidPlaceholder, Onset)" ] # count of how many onset names are in the mapper after the line is run expected_context = [ @@ -201,23 +200,15 @@ def test_basic_onset_errors_expanded(self): tag_list=[self.placeholder_expanded_def_string, 'Onset', '(Event)', '(Event)']), [], self.format_error(OnsetErrors.ONSET_NO_DEF_TAG_FOUND, tag=0), - self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['def/InvalidDef']), + self.format_error(OnsetErrors.ONSET_TOO_MANY_DEFS, tag=0, tag_list=['Def/InvalidDef']), self.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=0), self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=True), self.format_error(OnsetErrors.ONSET_PLACEHOLDER_WRONG, tag=0, has_placeholder=False) ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) def test_test_interleaving_onset_offset(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset)", f"({self.placeholder_label_def_string2},Onset)", @@ -248,15 +239,9 @@ def test_test_interleaving_onset_offset(self): [], ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) def test_onset_with_defs_in_them(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) 
- onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset, ({self.label_def_string}))", ] @@ -269,101 +254,23 @@ def test_onset_with_defs_in_them(self): [] ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=True) def test_onset_multiple_or_misplaced_errors(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - hed_validator = HedValidator(hed_schema=self.hed_schema) - hed_ops = [hed_validator, def_mapper, onset_mapper] - test_strings = [ f"{self.placeholder_label_def_string},Onset", f"({self.placeholder_label_def_string},Onset, Onset)", f"({self.placeholder_label_def_string},Onset, Offset)", ] - # count of issues the line generates - onset_list = ['Onset'] - offset_list = ['Offset'] - test_issues = [ - self.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, tag=1), - self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=2) - + self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=onset_list), - self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=offset_list), - ] - - self._test_issues_no_context(test_strings, test_issues, hed_ops) - test_issues = [ self.format_error(ValidationErrors.HED_TOP_LEVEL_TAG, tag=1), - self.format_error(ValidationErrors.HED_TAG_REPEATED, tag=2) - + self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=onset_list), - self.format_error(ValidationErrors.HED_MULTIPLE_TOP_TAGS, tag=1, - multiple_tags=offset_list), + self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, def_tag="Def/TestDefPlaceholder/2471"), + 
self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, def_tag="Def/TestDefPlaceholder/2471"), ] - # Repeat with just hed validator - self._test_issues_no_context(test_strings, test_issues, hed_validator) - - def test_onset_multiple_or_misplaced_errors_no_validator(self): - def_dict = DefinitionDict() - def_string = HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - hed_ops = [def_mapper, onset_mapper] - - test_strings = [ - f"{self.placeholder_label_def_string},Onset", - f"({self.placeholder_label_def_string},Onset, Onset)", - f"({self.placeholder_label_def_string},Onset, Offset)", - f"({self.placeholder_label_def_string},Onset, Event)", - ] - # count of issues the line generates - test_issues = [ - [], - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=4, - def_tag="Def-expand/TestDefPlaceholder/2471"), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=4, - def_tag="Def-expand/TestDefPlaceholder/2471"), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=4, - def_tag="Def-expand/TestDefPlaceholder/2471"), - ] - - self._test_issues_no_context(test_strings, test_issues, hed_ops) - - # Verify it also works without def mapping - test_issues = [ - [], - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, - def_tag=self.placeholder_label_def_string), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, - def_tag=self.placeholder_label_def_string), - self.format_error(OnsetErrors.ONSET_TAG_OUTSIDE_OF_GROUP, tag=2, - def_tag=self.placeholder_label_def_string), - ] - - self._test_issues_no_context(test_strings, test_issues, [hed_ops[1]]) + self._test_issues_no_context(test_strings, test_issues) def test_onset_two_in_one_line(self): - def_dict = DefinitionDict() - def_string = 
HedString(self.placeholder_definition_string) - def_string.validate(def_dict) - def_string = HedString(self.definition_string) - def_string.validate(def_dict) - def_mapper = DefMapper(def_dict) - onset_mapper = OnsetMapper(def_mapper) - test_strings = [ f"({self.placeholder_label_def_string},Onset), ({self.placeholder_label_def_string2},Onset)", f"({self.placeholder_label_def_string2},Offset)", @@ -391,7 +298,7 @@ def test_onset_two_in_one_line(self): [] ] - self._test_issues_base(test_strings, test_issues, expected_context, [onset_mapper]) + self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) if __name__ == '__main__': diff --git a/tests/validator/test_tag_validator.py b/tests/validator/test_tag_validator.py index ea13e410a..dc0fb910a 100644 --- a/tests/validator/test_tag_validator.py +++ b/tests/validator/test_tag_validator.py @@ -11,8 +11,8 @@ class TestHed(TestValidatorBase): class IndividualHedTagsShort(TestHed): @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_individual_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_individual_tags_in_hed_string) def test_exist_in_schema(self): test_strings = { @@ -66,10 +66,10 @@ def test_exist_in_schema(self): def test_proper_capitalization(self): test_strings = { 'proper': 'Event/Sensory-event', - 'camelCase': 'EvEnt/Something', + 'camelCase': 'EvEnt/Sensory-event', 'takesValue': 'Sampling-rate/20 Hz', 'numeric': 'Statistical-uncertainty/20', - 'lowercase': 'Event/something' + 'lowercase': 'Event/sensory-event' } expected_results = { 'proper': True, @@ -85,7 +85,7 @@ def test_proper_capitalization(self): 'numeric': [], 'lowercase': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0) } - self.validator_syntactic(test_strings, expected_results, expected_issues, True) + self.validator_semantic(test_strings, expected_results, 
expected_issues, True) # def test_proper_capitalization(self): # test_strings = { @@ -112,7 +112,7 @@ def test_proper_capitalization(self): # 'lowercase': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0), # 'multipleUpper': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0) # } - # self.validator_syntactic(test_strings, expected_results, expected_issues, True) + # self.validator_semantic(test_strings, expected_results, expected_issues, True) # # def test_proper_capitalization_semantic(self): # test_strings = { @@ -352,7 +352,7 @@ def test_span_reporting(self): class TestTagLevels(TestHed): @staticmethod - def string_obj_func(validator, check_for_warnings): + def string_obj_func(validator): return validator._validate_groups_in_hed_string def test_no_duplicates(self): @@ -394,7 +394,7 @@ def test_no_duplicates(self): 'duplicateSubGroupF': self.format_error(ValidationErrors.HED_TAG_REPEATED_GROUP, group=HedString("((Sensory-event,Man-made-object/VehicleTrain),Event)")), } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_no_duplicates_semantic(self): test_strings = { @@ -489,14 +489,14 @@ def test_empty_groups(self): expected_issues = { 'emptyGroup': self.format_error(ValidationErrors.HED_GROUP_EMPTY, tag=1000 + 1) } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) class FullHedString(TestHed): compute_forms = False @staticmethod - def string_obj_func(validator, check_for_warnings): + def string_obj_func(validator): return validator._tag_validator.run_hed_string_validators def test_invalid_placeholders(self): @@ -538,11 +538,13 @@ def test_mismatched_parentheses(self): closing_parentheses_count=1), 'extraClosing': self.format_error(ValidationErrors.HED_PARENTHESES_MISMATCH, opening_parentheses_count=1, - 
closing_parentheses_count=2), + closing_parentheses_count=2) + + self.format_error(ValidationErrors.HED_TAG_EMPTY, source_string=test_strings['extraClosing'], + char_index=84), 'valid': [] } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_malformed_delimiters(self): test_strings = { @@ -676,7 +678,7 @@ def test_malformed_delimiters(self): tag="Thing)) "), # 'emptyGroup': [] } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_invalid_characters(self): test_strings = { @@ -705,7 +707,7 @@ def test_invalid_characters(self): 'closingBracket': self.format_error(ValidationErrors.HED_CHARACTER_INVALID, char_index=45, source_string=test_strings['closingBracket']) } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_string_extra_slash_space(self): test_strings = { @@ -778,7 +780,7 @@ def test_string_extra_slash_space(self): index_in_tag=15, index_in_tag_end=18, tag=0), } - self.validator_syntactic(test_strings, expected_results, expected_errors, False) + self.validator_semantic(test_strings, expected_results, expected_errors, False) def test_no_more_than_two_tildes(self): test_strings = { @@ -817,15 +819,15 @@ def test_no_more_than_two_tildes(self): + self.format_error(ValidationErrors.HED_TILDES_UNSUPPORTED, source_string=test_strings['invalidTildeGroup'], char_index=147) } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) class RequiredTags(TestHed): schema_file = '../data/validator_tests/HED8.0.0_added_tests.mediawiki' @staticmethod - def string_obj_func(validator, check_for_warnings): - 
return partial(validator._validate_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_tags_in_hed_string) def test_includes_all_required_tags(self): test_strings = { @@ -857,13 +859,13 @@ def test_includes_all_required_tags(self): def test_multiple_copies_unique_tags(self): test_strings = { 'legal': 'Event-context,' - '(Vehicle,Event)', + '(Vehicle,Event), Animal-agent, Action', 'multipleDesc': 'Event-context,' 'Event-context,' - 'Vehicle,(Vehicle,Event-context)', + 'Vehicle,(Vehicle,Event-context), Animal-agent, Action', # I think this is illegal in hed2 style schema now. 'multipleDescIncShort': 'Event-context,' - 'Organizational-property/Event-context' + 'Organizational-property/Event-context, Animal-agent, Action' } expected_results = { 'legal': True, @@ -885,8 +887,8 @@ class TestHedSpecialUnits(TestHed): schema_file = '../data/validator_tests/HED8.0.0_added_tests.mediawiki' @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_individual_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_individual_tags_in_hed_string) def test_special_units(self): test_strings = { diff --git a/tests/validator/test_tag_validator_base.py b/tests/validator/test_tag_validator_base.py index df8812479..75f2b10e7 100644 --- a/tests/validator/test_tag_validator_base.py +++ b/tests/validator/test_tag_validator_base.py @@ -66,45 +66,38 @@ class TestValidatorBase(TestHedBase): def setUpClass(cls): super().setUpClass() cls.error_handler = error_reporter.ErrorHandler() - cls.syntactic_hed_input_reader = HedValidator(hed_schema=None, - run_semantic_validation=False) - cls.syntactic_tag_validator = cls.syntactic_hed_input_reader._tag_validator - cls.semantic_hed_input_reader = HedValidator(hed_schema=cls.hed_schema, - run_semantic_validation=True) + # cls.syntactic_hed_input_reader = 
HedValidator(hed_schema=None) + # cls.syntactic_tag_validator = cls.syntactic_hed_input_reader._tag_validator + cls.semantic_hed_input_reader = HedValidator(hed_schema=cls.hed_schema) cls.semantic_tag_validator = cls.semantic_hed_input_reader._tag_validator def validator_base(self, test_strings, expected_results, expected_issues, test_function, - hed_schema=None): + hed_schema=None, check_for_warnings=False): for test_key in test_strings: hed_string_obj = HedString(test_strings[test_key]) - error_handler = ErrorHandler() + error_handler = ErrorHandler(check_for_warnings=check_for_warnings) error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, increment_depth_after=False) test_issues = [] if self.compute_forms: test_issues += hed_string_obj.convert_to_canonical_forms(hed_schema) if not test_issues: test_issues += test_function(hed_string_obj) - test_result = not test_issues expected_params = expected_issues[test_key] expected_result = expected_results[test_key] expected_issue = self.format_errors_fully(error_handler, hed_string=hed_string_obj, params=expected_params) - error_handler.add_context_to_issues(test_issues) + error_handler.add_context_and_filter(test_issues) + test_result = not test_issues - # print(test_key) - # print(str(expected_issue)) - # print(str(test_issues)) + print(test_key) + print(str(expected_issue)) + print(str(test_issues)) error_handler.pop_error_context() self.assertEqual(test_result, expected_result, test_strings[test_key]) self.assertCountEqual(test_issues, expected_issue, test_strings[test_key]) - def validator_syntactic(self, test_strings, expected_results, expected_issues, check_for_warnings): - validator = self.syntactic_hed_input_reader - self.validator_base(test_strings, expected_results, expected_issues, - self.string_obj_func(validator, check_for_warnings=check_for_warnings)) - def validator_semantic(self, test_strings, expected_results, expected_issues, check_for_warnings): validator = 
self.semantic_hed_input_reader self.validator_base(test_strings, expected_results, expected_issues, - self.string_obj_func(validator, check_for_warnings=check_for_warnings), + self.string_obj_func(validator), check_for_warnings=check_for_warnings, hed_schema=validator._hed_schema) diff --git a/tests/validator/test_tag_validator_library.py b/tests/validator/test_tag_validator_library.py index 15c86545e..c4552f689 100644 --- a/tests/validator/test_tag_validator_library.py +++ b/tests/validator/test_tag_validator_library.py @@ -43,8 +43,8 @@ def test_invalid_load_prefix(self): class IndividualHedTagsShort(TestHed3): @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_individual_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_individual_tags_in_hed_string) def test_exist_in_schema(self): test_strings = { @@ -102,10 +102,10 @@ def test_exist_in_schema(self): def test_proper_capitalization(self): test_strings = { 'proper': 'tl:Event/Sensory-event', - 'camelCase': 'tl:EvEnt/Something', - 'takesValue': 'tl:Attribute/Temporal rate/20 Hz', - 'numeric': 'tl:Repetition-number/20', - 'lowercase': 'tl:Event/something' + 'camelCase': 'tl:EvEnt/Sensory-event', + 'takesValue': 'tl:Sampling-rate/20 Hz', + 'numeric': 'tl:Statistical-uncertainty/20', + 'lowercase': 'tl:Event/sensory-event' } expected_results = { 'proper': True, @@ -121,7 +121,7 @@ def test_proper_capitalization(self): 'numeric': [], 'lowercase': self.format_error(ValidationErrors.HED_STYLE_WARNING, tag=0) } - self.validator_syntactic(test_strings, expected_results, expected_issues, True) + self.validator_semantic(test_strings, expected_results, expected_issues, True) def test_child_required(self): test_strings = { @@ -302,17 +302,17 @@ def test_span_reporting(self): class TestTagLevels3(TestHed3): @staticmethod - def string_obj_func(validator, check_for_warnings): + def 
string_obj_func(validator): return validator._validate_groups_in_hed_string def test_no_duplicates(self): test_strings = { 'topLevelDuplicate': 'tl:Event/Sensory-event,tl:Event/Sensory-event', 'groupDuplicate': 'tl:Item/Object/Man-made-object/VehicleTrain,(tl:Event/Sensory-event,' - 'tl:Attribute/Sensory/Visual/Color/CSS-color/Purple-color/Purple,tl:Event/Sensory-event)', + 'tl:Purple-color/Purple,tl:Event/Sensory-event)', 'noDuplicate': 'tl:Event/Sensory-event,' 'tl:Item/Object/Man-made-object/VehicleTrain,' - 'tl:Attribute/Sensory/Visual/Color/CSS-color/Purple-color/Purple', + 'tl:Purple-color/Purple', 'legalDuplicate': 'tl:Item/Object/Man-made-object/VehicleTrain,\ (tl:Item/Object/Man-made-object/VehicleTrain,' 'tl:Event/Sensory-event)', @@ -329,7 +329,7 @@ def test_no_duplicates(self): 'legalDuplicate': [], 'noDuplicate': [] } - self.validator_syntactic(test_strings, expected_results, expected_issues, False) + self.validator_semantic(test_strings, expected_results, expected_issues, False) def test_no_duplicates_semantic(self): test_strings = { @@ -417,8 +417,8 @@ def test_taggroup_validation(self): class RequiredTags(TestHed3): @staticmethod - def string_obj_func(validator, check_for_warnings): - return partial(validator._validate_tags_in_hed_string, check_for_warnings=check_for_warnings) + def string_obj_func(validator): + return partial(validator._validate_tags_in_hed_string) def test_includes_all_required_tags(self): test_strings = { @@ -452,12 +452,13 @@ def test_includes_all_required_tags(self): def test_multiple_copies_unique_tags(self): test_strings = { 'legal': 'tl:Event-context,' - '(Vehicle,Event)', + '(Vehicle,Event), Animal-agent, Action, tl:Animal-agent, tl:Action', 'multipleDesc': 'tl:Event-context,' 'tl:Event-context,' - 'Vehicle,(Vehicle,tl:Event-context)', + 'Vehicle,(Vehicle,tl:Event-context), Animal-agent, Action, tl:Animal-agent, tl:Action', 'multipleDescIncShort': 'tl:Event-context,' - 'tl:Organizational-property/Event-context' + 
'tl:Organizational-property/Event-context,' + ' Animal-agent, Action, tl:Animal-agent, tl:Action' } expected_results = { 'legal': True, From 28ef39e4c106e05596ca21001aa01261366ac9f2 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 16 Mar 2023 11:21:22 -0500 Subject: [PATCH 02/19] Add missing data file. Disable prints --- tests/data/sidecar_tests/both_types_events_with_defs.json | 6 +++--- tests/validator/test_onset_validator.py | 8 ++++---- tests/validator/test_tag_validator_base.py | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/data/sidecar_tests/both_types_events_with_defs.json b/tests/data/sidecar_tests/both_types_events_with_defs.json index 29b133724..7047a1fdd 100644 --- a/tests/data/sidecar_tests/both_types_events_with_defs.json +++ b/tests/data/sidecar_tests/both_types_events_with_defs.json @@ -20,16 +20,16 @@ "stim_file": { "LongName": "Stimulus file name", "Description": "Relative path of the stimulus image file", - "HED": "Attribute/File/#, (Definition/JsonFileDef2/#, (Item/JsonDef2/#,Item/JsonDef2)), (Definition/JsonFileDef3/#, (Item/JsonDef3/#,InvalidTag))" + "HED": "Age/#, (Definition/JsonFileDef2/#, (Item/JsonDef2/#,Item/JsonDef2)), (Definition/JsonFileDef3/#, (Item/JsonDef3/#))" }, "takes_value_def": { "LongName": "Def with a takes value tag", "Description": "Relative path of the stimulus image file", - "HED": "Attribute/File/#, (Definition/TakesValueDef/#, (Age/#))" + "HED": "Age/#, (Definition/TakesValueDef/#, (Age/#))" }, "unit_class_def": { "LongName": "Def with a value class", "Description": "Relative path of the stimulus image file", - "HED": "Attribute/File/#, (Definition/ValueClassDef/#, (Acceleration/#))" + "HED": "Age/#, (Definition/ValueClassDef/#, (Acceleration/#))" } } \ No newline at end of file diff --git a/tests/validator/test_onset_validator.py b/tests/validator/test_onset_validator.py index 1bc814f33..de46d116b 100644 --- a/tests/validator/test_onset_validator.py +++ 
b/tests/validator/test_onset_validator.py @@ -56,8 +56,8 @@ def _test_issues_base(self, test_strings, test_issues, test_context, placeholder error_handler.add_context_and_filter(onset_issues) test_string.shrink_defs() issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - print(str(onset_issues)) - print(str(issues)) + # print(str(onset_issues)) + # print(str(issues)) error_handler.pop_error_context() self.assertEqual(len(validator._onsets), context) self.assertCountEqual(onset_issues, issues) @@ -71,8 +71,8 @@ def _test_issues_no_context(self, test_strings, test_issues): onset_issues = hed_validator.validate(test_string, False) error_handler.add_context_and_filter(onset_issues) issues = self.format_errors_fully(error_handler, hed_string=test_string, params=expected_params) - print(str(onset_issues)) - print(str(issues)) + # print(str(onset_issues)) + # print(str(issues)) error_handler.pop_error_context() self.assertCountEqual(onset_issues, issues) diff --git a/tests/validator/test_tag_validator_base.py b/tests/validator/test_tag_validator_base.py index 75f2b10e7..37d78668c 100644 --- a/tests/validator/test_tag_validator_base.py +++ b/tests/validator/test_tag_validator_base.py @@ -89,9 +89,9 @@ def validator_base(self, test_strings, expected_results, expected_issues, test_f error_handler.add_context_and_filter(test_issues) test_result = not test_issues - print(test_key) - print(str(expected_issue)) - print(str(test_issues)) + # print(test_key) + # print(str(expected_issue)) + # print(str(test_issues)) error_handler.pop_error_context() self.assertEqual(test_result, expected_result, test_strings[test_key]) self.assertCountEqual(test_issues, expected_issue, test_strings[test_key]) From 21590f20c51f629624de65a50ecfd1a08c24f47f Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 16 Mar 2023 16:32:12 -0500 Subject: [PATCH 03/19] Updated unit tests --- hed/models/df_util.py | 2 +- 
hed/tools/analysis/analysis_util.py | 16 +++-- hed/tools/analysis/hed_context_manager.py | 2 +- hed/tools/analysis/hed_type_definitions.py | 10 +-- .../operations/convert_columns_op.py | 70 +++++++++++++++++++ .../remodeling/operations/valid_operations.py | 2 + .../test_analysis_util_assemble_hed.py | 13 ++-- .../analysis/test_hed_context_manager.py | 16 +++-- tests/tools/analysis/test_hed_tag_counts.py | 2 +- .../operations/test_convert_columns_op.py | 50 +++++++++++++ 10 files changed, 159 insertions(+), 24 deletions(-) create mode 100644 hed/tools/remodeling/operations/convert_columns_op.py create mode 100644 tests/tools/remodeling/operations/test_convert_columns_op.py diff --git a/hed/models/df_util.py b/hed/models/df_util.py index b7e73a282..d877028aa 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -14,7 +14,7 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ The path to the tabular file, or a TabularInput object representing it. sidecar: str or Sidecar The path to the sidecar file, or a Sidecar object representing it. - hed_schema: str or HedSchema + hed_schema: HedSchema If str, will attempt to load as a version if it doesn't have a valid extension. extra_def_dicts: list of DefinitionDict, optional Any extra DefinitionDict objects to use when parsing the HED tags. diff --git a/hed/tools/analysis/analysis_util.py b/hed/tools/analysis/analysis_util.py index c93debd0d..27f442c3d 100644 --- a/hed/tools/analysis/analysis_util.py +++ b/hed/tools/analysis/analysis_util.py @@ -6,13 +6,16 @@ from hed.tools.util.data_util import separate_values from hed.models.hed_tag import HedTag from hed.models.hed_group import HedGroup +from hed.models.df_util import get_assembled, expand_defs -def assemble_hed(data_input, columns_included=None, expand_defs=False): +def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False): """ Return assembled HED annotations in a dataframe. 
Parameters: data_input (TabularInput): The tabular input file whose HED annotations are to be assembled. + sidecar (Sidecar): Sidecar with definitions. + schema (HedSchema): Hed schema columns_included (list or None): A list of additional column names to include. If None, only the list of assembled tags is included. expand_defs (bool): If True, definitions are expanded when the events are assembled. @@ -23,14 +26,19 @@ def assemble_hed(data_input, columns_included=None, expand_defs=False): """ eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) - hed_obj_list = get_assembled_strings(data_input, expand_defs=expand_defs) - hed_string_list = [str(hed) for hed in hed_obj_list] + hed_string_list = data_input.series_a + definitions = sidecar.get_def_dict(hed_schema=schema) + if expand_defs: + expand_defs(hed_string_list, schema, definitions, columns=None) + # hed_obj_list, defs = get_assembled(data_input, sidecar, schema, extra_def_dicts=None, join_columns=True, + # shrink_defs=False, expand_defs=True) + # hed_string_list = [str(hed) for hed in hed_obj_list] if not eligible_columns: df = pd.DataFrame({"HED_assembled": hed_string_list}) else: df = data_input.dataframe[eligible_columns].copy(deep=True) df['HED_assembled'] = hed_string_list - definitions = data_input.get_definitions().gathered_defs + # definitions = data_input.get_definitions().gathered_defs return df, definitions diff --git a/hed/tools/analysis/hed_context_manager.py b/hed/tools/analysis/hed_context_manager.py index 011330662..06a02dc82 100644 --- a/hed/tools/analysis/hed_context_manager.py +++ b/hed/tools/analysis/hed_context_manager.py @@ -35,7 +35,7 @@ def __init__(self, hed_strings, hed_schema): """ - self.hed_strings = [HedString(str(hed), hed_schema=hed_schema) for hed in hed_strings] + self.hed_strings = hed_strings if not isinstance(hed_schema, HedSchema) and not isinstance(hed_schema, HedSchemaGroup): raise ValueError("ContextRequiresSchema", 
f"Context manager must have a valid HedSchema of HedSchemaGroup") self.hed_schema = hed_schema diff --git a/hed/tools/analysis/hed_type_definitions.py b/hed/tools/analysis/hed_type_definitions.py index 644802627..8d49dc060 100644 --- a/hed/tools/analysis/hed_type_definitions.py +++ b/hed/tools/analysis/hed_type_definitions.py @@ -1,7 +1,7 @@ """ Manages definitions associated with a type such as condition-variable. """ from hed.models.hed_tag import HedTag -from hed.models.def_mapper import DefMapper +from hed.models.definition_dict import DefinitionDict class HedTypeDefinitions: @@ -10,16 +10,18 @@ def __init__(self, definitions, hed_schema, type_tag='condition-variable'): """ Create a definition manager for a type of variable. Parameters: - definitions (dict or DefMapper): A dictionary of DefinitionEntry objects. + definitions (dict or DefinitionDict): A dictionary of DefinitionEntry objects. hed_schema (Hedschema or HedSchemaGroup): The schema used for parsing. type_tag (str): Lower-case HED tag string representing the type managed. + # TODO: [Refactor] - should dict be allowed for definitions. + """ self.type_tag = type_tag.lower() self.hed_schema = hed_schema - if isinstance(definitions, DefMapper): - self.definitions = definitions.gathered_defs + if isinstance(definitions, DefinitionDict): + self.definitions = definitions.defs elif isinstance(definitions, dict): self.definitions = definitions else: diff --git a/hed/tools/remodeling/operations/convert_columns_op.py b/hed/tools/remodeling/operations/convert_columns_op.py new file mode 100644 index 000000000..ae383a1e4 --- /dev/null +++ b/hed/tools/remodeling/operations/convert_columns_op.py @@ -0,0 +1,70 @@ +""" Convert the type of the specified columns of a tabular file. """ + +from hed.tools.remodeling.operations.base_op import BaseOp + + +class ConvertColumnsOp(BaseOp): + """ Convert. + + Required remodeling parameters: + - **column_names** (*list*): The list of columns to convert. 
+ - **convert_to_** (*str*): Name of type to convert to. (One of 'str', 'int', 'float', 'fixed'.) + - **decimal_places** (*int*): Number decimal places to keep (for fixed only). + + + """ + + PARAMS = { + "operation": "convert_columns", + "required_parameters": { + "column_names": list, + "convert_to": str + }, + "optional_parameters": { + "decimal_places": int + } + } + + def __init__(self, parameters): + """ Constructor for the convert columns operation. + + Parameters: + parameters (dict): Parameter values for required and optional parameters. + + Raises: + KeyError + - If a required parameter is missing. + - If an unexpected parameter is provided. + + TypeError + - If a parameter has the wrong type. + + ValueError + - If convert_to is not one of the allowed values. + + """ + super().__init__(self.PARAMS, parameters) + self.column_names = parameters['column_names'] + self.convert_to = parameters['convert_to'] + self.decimal_places = parameters.get('decimal_places', None) + self.allowed_types = ['str', 'int', 'float', 'fixed'] + if self.convert_to not in self.allowed_types: + raise ValueError("CannotConvertToSpecifiedType", + f"The convert_to value {self.convert_to} must be one of {str(self.allowed_types)}") + + def do_op(self, dispatcher, df, name, sidecar=None): + """ Convert the specified column to a specified type. + + Parameters: + dispatcher (Dispatcher): Manages the operation I/O. + df (DataFrame): The DataFrame to be remodeled. + name (str): Unique identifier for the dataframe -- often the original file path. + sidecar (Sidecar or file-like): Only needed for HED operations. + + Returns: + DataFrame: A new DataFrame with the factor columns appended. 
+ + """ + + df_new = df.copy() + return df_new diff --git a/hed/tools/remodeling/operations/valid_operations.py b/hed/tools/remodeling/operations/valid_operations.py index 36761591a..d00391270 100644 --- a/hed/tools/remodeling/operations/valid_operations.py +++ b/hed/tools/remodeling/operations/valid_operations.py @@ -1,5 +1,6 @@ """ The valid operations for the remodeling tools. """ +# from hed.tools.remodeling.operations.convert_columns_op import ConvertColumnsOp from hed.tools.remodeling.operations.factor_column_op import FactorColumnOp from hed.tools.remodeling.operations.factor_hed_tags_op import FactorHedTagsOp from hed.tools.remodeling.operations.factor_hed_type_op import FactorHedTypeOp @@ -20,6 +21,7 @@ from hed.tools.remodeling.operations.summarize_hed_validation_op import SummarizeHedValidationOp valid_operations = { + # 'convert_columns': ConvertColumnsOp, 'factor_column': FactorColumnOp, 'factor_hed_tags': FactorHedTagsOp, 'factor_hed_type': FactorHedTypeOp, diff --git a/tests/tools/analysis/test_analysis_util_assemble_hed.py b/tests/tools/analysis/test_analysis_util_assemble_hed.py index 058213e3e..9c37b8620 100644 --- a/tests/tools/analysis/test_analysis_util_assemble_hed.py +++ b/tests/tools/analysis/test_analysis_util_assemble_hed.py @@ -22,13 +22,14 @@ def setUpClass(cls): hed_schema = hedschema.load_schema(schema_path) cls.hed_schema = hed_schema - sidecar1 = Sidecar(json_path, name='face_sub1_json', hed_schema=hed_schema) + sidecar1 = Sidecar(json_path, name='face_sub1_json') cls.sidecar_path = sidecar1 - cls.input_data = TabularInput(events_path, hed_schema=hed_schema, sidecar=sidecar1, name="face_sub1_events") + cls.sidecar1 = sidecar1 + cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar") def test_assemble_hed_included_no_expand(self): - df1, dict1 = assemble_hed(self.input_data, + df1, dict1 = 
assemble_hed(self.input_data, self.sidecar1, self.hed_schema, columns_included=["onset", "duration", "event_type"], expand_defs=False) self.assertIsInstance(df1, DataFrame, "hed_assemble should return a dataframe when columns are included") columns1 = list(df1.columns) @@ -38,11 +39,11 @@ def test_assemble_hed_included_no_expand(self): self.assertNotEqual(first_str1.find('Def/'), -1, "assemble_hed with no def expand has Def tags") self.assertEqual(first_str1.find('Def-expand'), -1, "assemble_hed with no def expand does not have Def-expand tags") - self.assertIsInstance(dict1, dict, "hed_assemble returns a dictionary of definitions") - self.assertEqual(len(dict1), 17, "hed_assemble definition dictionary has the right number of elements.") + self.assertIsInstance(dict1.defs, dict, "hed_assemble returns a dictionary of definitions") + self.assertEqual(len(dict1.defs), 17, "hed_assemble definition dictionary has the right number of elements.") def test_assemble_hed_included_expand(self): - df2, dict2 = assemble_hed(self.input_data, + df2, dict2 = assemble_hed(self.input_data, self.sidecar1, self.hed_schema, columns_included=["onset", "duration", "event_type"], expand_defs=True) first_str2 = df2.iloc[0]['HED_assembled'] self.assertEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") diff --git a/tests/tools/analysis/test_hed_context_manager.py b/tests/tools/analysis/test_hed_context_manager.py index 9ad70e958..26e0f4e87 100644 --- a/tests/tools/analysis/test_hed_context_manager.py +++ b/tests/tools/analysis/test_hed_context_manager.py @@ -1,13 +1,12 @@ import os import unittest from hed.errors.exceptions import HedFileError -from hed.models.hed_group import HedGroup from hed.models.hed_string import HedString from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput from hed.schema.hed_schema_io import load_schema_version -from hed.tools.analysis.hed_context_manager import HedContextManager, OnsetGroup -from 
hed.tools.analysis.analysis_util import get_assembled_strings +from hed.tools.analysis.hed_context_manager import HedContextManager +from hed.models.df_util import get_assembled class Test(unittest.TestCase): @@ -37,7 +36,8 @@ def setUpClass(cls): 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') - cls.input_data = TabularInput(events_path, sidecar=sidecar1, hed_schema=schema, name="face_sub1_events") + cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") + cls.sidecar1 = sidecar1 cls.schema = schema # def test_onset_group(self): @@ -71,13 +71,14 @@ def test_constructor(self): self.assertIsInstance(context, list, "The constructor event contexts should be a list") self.assertIsInstance(context[1], HedString, "The constructor event contexts has a correct element") - def test_constructor(self): + def test_constructor1(self): with self.assertRaises(ValueError) as cont: HedContextManager(self.test_strings1, None) self.assertEqual(cont.exception.args[0], "ContextRequiresSchema") def test_iter(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.schema, expand_defs=False) + hed_strings, _ = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) manager1 = HedContextManager(hed_strings, self.schema) i = 0 for hed, context in manager1.iter_context(): @@ -86,7 +87,8 @@ def test_iter(self): i = i + 1 def test_constructor_from_assembled(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.schema, expand_defs=False) + hed_strings, _ = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) manager1 = HedContextManager(hed_strings, self.schema) 
self.assertEqual(len(manager1.hed_strings), 200, "The constructor for assembled strings has expected # of strings") diff --git a/tests/tools/analysis/test_hed_tag_counts.py b/tests/tools/analysis/test_hed_tag_counts.py index ece27f496..76b0a9eaf 100644 --- a/tests/tools/analysis/test_hed_tag_counts.py +++ b/tests/tools/analysis/test_hed_tag_counts.py @@ -24,7 +24,7 @@ def setUpClass(cls): schema = hedschema.load_schema(schema_path) cls.hed_schema = schema sidecar1 = Sidecar(json_path, name='face_sub1_json') - input_data = TabularInput(events_path, sidecar=sidecar1, hed_schema=schema, name="face_sub1_events") + input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") input_df, def_dict = assemble_hed(input_data, expand_defs=False) cls.input_df = input_df cls.def_dict = def_dict diff --git a/tests/tools/remodeling/operations/test_convert_columns_op.py b/tests/tools/remodeling/operations/test_convert_columns_op.py new file mode 100644 index 000000000..01a27f949 --- /dev/null +++ b/tests/tools/remodeling/operations/test_convert_columns_op.py @@ -0,0 +1,50 @@ +import pandas as pd +import numpy as np +import unittest +from hed.tools.remodeling.operations.convert_columns_op import ConvertColumnsOp +from hed.tools.remodeling.dispatcher import Dispatcher + + +class Test(unittest.TestCase): + """ + + TODO: Test when no factor names and values are given. 
+ + """ + @classmethod + def setUpClass(cls): + cls.sample_data = [[0.0776, 0.5083, 'go', 'n/a', 0.565, 'correct', 'right', 'female'], + [5.5774, 0.5083, 'unsuccesful_stop', 0.2, 0.49, 'correct', 'right', 'female'], + [9.5856, 0.5084, 'go', 'n/a', 0.45, 'correct', 'right', 'female'], + [13.5939, 0.5083, 'succesful_stop', 0.2, 'n/a', 'n/a', 'n/a', 'female'], + [17.1021, 0.5083, 'unsuccesful_stop', 0.25, 0.633, 'correct', 'left', 'male'], + [21.6103, 0.5083, 'go', 'n/a', 0.443, 'correct', 'left', 'male']] + cls.factored = [[0.0776, 0.5083, 'go', 'n/a', 0.565, 'correct', 'right', 'female', 0, 0], + [5.5774, 0.5083, 'unsuccesful_stop', 0.2, 0.49, 'correct', 'right', 'female', 0, 1], + [9.5856, 0.5084, 'go', 'n/a', 0.45, 'correct', 'right', 'female', 0, 0], + [13.5939, 0.5083, 'succesful_stop', 0.2, 'n/a', 'n/a', 'n/a', 'female', 1, 0], + [17.1021, 0.5083, 'unsuccesful_stop', 0.25, 0.633, 'correct', 'left', 'male', 0, 1], + [21.6103, 0.5083, 'go', 'n/a', 0.443, 'correct', 'left', 'male', 0, 0]] + cls.sample_columns = ['onset', 'duration', 'trial_type', 'stop_signal_delay', 'response_time', + 'response_accuracy', 'response_hand', 'sex'] + cls.default_factor_columns = ["trial_type.succesful_stop", "trial_type.unsuccesful_stop"] + + def setUp(self): + self.base_parameters = { + "column_names": ["onset", "duration", "response_time"], + "convert_to": "int" + } + + @classmethod + def tearDownClass(cls): + pass + + def test_constructor_bad_convert_to(self): + self.base_parameters["convert_to"] = "blech" + with self.assertRaises(ValueError) as context: + ConvertColumnsOp(self.base_parameters) + self.assertEqual(context.exception.args[0], "CannotConvertToSpecifiedType") + + +if __name__ == '__main__': + unittest.main() From 4c79d1b3c041cd37c9de3853a8f8e7ff4ec37a14 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 16 Mar 2023 17:01:16 -0500 Subject: [PATCH 04/19] Add some df tests. Update hed_assemble. Make the df utils also work on series. 
--- hed/models/df_util.py | 54 ++++++++----- hed/tools/analysis/analysis_util.py | 7 +- tests/models/test_df_util.py | 114 ++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 23 deletions(-) create mode 100644 tests/models/test_df_util.py diff --git a/hed/models/df_util.py b/hed/models/df_util.py index d877028aa..66b5c75be 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -1,4 +1,5 @@ from functools import partial +import pandas as pd from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput @@ -51,7 +52,7 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ for x in text_file_row] for text_file_row in tabular_file.dataframe_a.itertuples(index=False)], def_dict -def convert_to_form(df, hed_schema, tag_form, columns): +def convert_to_form(df, hed_schema, tag_form, columns=None): """ Convert all tags in underlying dataframe to the specified form. Converts in place @@ -61,51 +62,62 @@ def convert_to_form(df, hed_schema, tag_form, columns): tag_form(str): HedTag property to convert tags to. columns (list): The columns to modify on the dataframe """ - if columns is None: - columns = df.columns + if isinstance(df, pd.Series): + df = df.apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + else: + if columns is None: + columns = df.columns - for column in columns: - df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + for column in columns: + df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) return df -def shrink_defs(df, hed_schema, columns): +def shrink_defs(df, hed_schema, columns=None): """ Shrinks any def-expand tags found in the dataframe. Converts in place Parameters: - df (pd.Dataframe): The dataframe to modify + df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema or None): The schema to use to identify defs. 
- columns (list): The columns to modify on the dataframe + columns (list or None): The columns to modify on the dataframe """ - if columns is None: - columns = df.columns + if isinstance(df, pd.Series): + mask = df.str.contains('Def-expand/', case=False) + df[mask] = df[mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) + else: + if columns is None: + columns = df.columns - for column in columns: - mask = df[column].str.contains('Def-expand/', case=False) - df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) + for column in columns: + mask = df[column].str.contains('Def-expand/', case=False) + df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) return df -def expand_defs(df, hed_schema, def_dict, columns): +def expand_defs(df, hed_schema, def_dict, columns=None): """ Expands any def tags found in the dataframe. Converts in place Parameters: - df (pd.Dataframe): The dataframe to modify + df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema or None): The schema to use to identify defs def_dict (DefinitionDict): The definitions to expand - columns (list): The columns to modify on the dataframe + columns (list or None): The columns to modify on the dataframe """ - if columns is None: - columns = df.columns + if isinstance(df, pd.Series): + mask = df.str.contains('Def/', case=False) + df[mask] = df[mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) + else: + if columns is None: + columns = df.columns - for column in columns: - mask = df[column].str.contains('Def/', case=False) - df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) + for column in columns: + mask = df[column].str.contains('Def/', case=False) + df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) return df diff --git a/hed/tools/analysis/analysis_util.py 
b/hed/tools/analysis/analysis_util.py index 27f442c3d..fcfd5284c 100644 --- a/hed/tools/analysis/analysis_util.py +++ b/hed/tools/analysis/analysis_util.py @@ -6,7 +6,7 @@ from hed.tools.util.data_util import separate_values from hed.models.hed_tag import HedTag from hed.models.hed_group import HedGroup -from hed.models.df_util import get_assembled, expand_defs +from hed.models import df_util def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False): @@ -29,7 +29,10 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs hed_string_list = data_input.series_a definitions = sidecar.get_def_dict(hed_schema=schema) if expand_defs: - expand_defs(hed_string_list, schema, definitions, columns=None) + df_util.expand_defs(hed_string_list, schema, definitions) + # Keep in mind hed_string_list is now a Series. The rest of the function should probably + # also be modified + # hed_obj_list, defs = get_assembled(data_input, sidecar, schema, extra_def_dicts=None, join_columns=True, # shrink_defs=False, expand_defs=True) # hed_string_list = [str(hed) for hed in hed_obj_list] diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py new file mode 100644 index 000000000..bc9c907b7 --- /dev/null +++ b/tests/models/test_df_util.py @@ -0,0 +1,114 @@ +import unittest +import pandas as pd + + +from hed import load_schema_version +from hed.models.df_util import shrink_defs, expand_defs +from hed import DefinitionDict + + +class TestShrinkDefs(unittest.TestCase): + def setUp(self): + self.schema = load_schema_version() + + def test_shrink_defs_normal(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_placeholder(self): + df = 
pd.DataFrame({"column1": ["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_no_matching_tags(self): + df = pd.DataFrame({"column1": ["(Event/SomeEvent, Item/SomeItem,Age/25)"]}) + expected_df = pd.DataFrame({"column1": ["(Event/SomeEvent, Item/SomeItem,Age/25)"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_multiple_columns(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"], + "column2": ["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"], + "column2": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + result = shrink_defs(df, self.schema, ['column1', 'column2']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_multiple_defs_same_line(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Age/30"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Def/TestDefPlaceholder/123,Age/30"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_mixed_tags(self): + df = pd.DataFrame({"column1": [ + "(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent,(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem,Age/25"]}) + expected_df = pd.DataFrame( + {"column1": 
["Def/TestDefNormal,Event/SomeEvent,Def/TestDefPlaceholder/123,Item/SomeItem,Age/25"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_series_normal(self): + series = pd.Series(["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]) + expected_series = pd.Series(["Def/TestDefNormal,Event/SomeEvent"]) + result = shrink_defs(series, self.schema, None) + pd.testing.assert_series_equal(result, expected_series) + + def test_shrink_defs_series_placeholder(self): + series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) + expected_series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) + result = shrink_defs(series, self.schema, None) + pd.testing.assert_series_equal(result, expected_series) + + +class TestExpandDefs(unittest.TestCase): + def setUp(self): + self.schema = load_schema_version() + self.def_dict = DefinitionDict(["(Definition/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2))", + "(Definition/TestDefPlaceholder/#,(Action/TestDef1/#,Action/TestDef2))"], + hed_schema=self.schema) + + def test_expand_defs_normal(self): + df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"]}) + expected_df = pd.DataFrame( + {"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_placeholder(self): + df = pd.DataFrame({"column1": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + expected_df = pd.DataFrame({"column1": [ + "(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_no_matching_tags(self): + df = 
pd.DataFrame({"column1": ["(Event/SomeEvent,Item/SomeItem,Age/25)"]}) + expected_df = pd.DataFrame({"column1": ["(Event/SomeEvent,Item/SomeItem,Age/25)"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_multiple_columns(self): + df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"], + "column2": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + expected_df = pd.DataFrame( + {"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"], + "column2": [ + "(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1', 'column2']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_series_normal(self): + series = pd.Series(["Def/TestDefNormal,Event/SomeEvent"]) + expected_series = pd.Series(["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]) + result = expand_defs(series, self.schema, self.def_dict, None) + pd.testing.assert_series_equal(result, expected_series) + + def test_expand_defs_series_placeholder(self): + series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) + expected_series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) + result = expand_defs(series, self.schema, self.def_dict, None) + pd.testing.assert_series_equal(result, expected_series) \ No newline at end of file From 2698d6cc15d05d1f4b81b0054dbd3d86a978a2fd Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Fri, 17 Mar 2023 16:58:27 -0500 Subject: [PATCH 05/19] Fixed some of the refactoring errors --- hed/models/df_util.py | 3 +- hed/models/sidecar.py | 7 +- hed/tools/__init__.py | 3 +- hed/tools/analysis/analysis_util.py | 103 ++++++----- hed/tools/analysis/event_manager.py | 15 +- 
hed/tools/analysis/hed_context_manager.py | 10 +- .../operations/factor_hed_tags_op.py | 18 +- .../operations/factor_hed_type_op.py | 14 +- .../operations/summarize_hed_tags_op.py | 13 +- .../operations/summarize_hed_type_op.py | 11 +- .../operations/summarize_hed_validation_op.py | 10 +- .../test_analysis_util_assemble_hed.py | 80 +++++---- ...est_analysis_util_get_assembled_strings.py | 167 +++++++++--------- tests/tools/analysis/test_annotation_util.py | 8 +- tests/tools/analysis/test_event_manager.py | 17 +- .../analysis/test_hed_context_manager.py | 8 +- tests/tools/analysis/test_hed_tag_counts.py | 2 +- tests/tools/analysis/test_hed_type_counts.py | 10 +- .../analysis/test_hed_type_definitions.py | 9 +- tests/tools/analysis/test_hed_type_factors.py | 17 +- tests/tools/analysis/test_hed_type_manager.py | 60 ++++--- tests/tools/analysis/test_hed_type_values.py | 92 +++++----- .../operations/test_summarize_hed_tags_op.py | 19 +- 23 files changed, 355 insertions(+), 341 deletions(-) diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 66b5c75be..f9fa19dcc 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -26,7 +26,8 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ expand_defs: bool Expand any def tags found Returns: - A list of HedStrings, or a list of lists of HedStrings + tuple: A list of HedStrings, or a list of lists of HedStrings, DefinitionDict + """ if isinstance(sidecar, str): sidecar = Sidecar(sidecar) diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 8b808c6d1..280eba77d 100644 --- a/hed/models/sidecar.py +++ b/hed/models/sidecar.py @@ -156,9 +156,10 @@ def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=No Parameters: hed_schema (HedSchema): Input data to be validated. - extra_def_dicts(list or DefinitionDict): extra def dicts in addition to sidecar - name(str): The name to report this sidecar as - error_handler (ErrorHandler): Error context to use. 
Creates a new one if None + extra_def_dicts(list or DefinitionDict): Extra def dicts in addition to sidecar. + name(str): The name to report this sidecar as. + error_handler (ErrorHandler): Error context to use. Creates a new one if None. + Returns: issues (list of dict): A list of issues associated with each level in the HED string. """ diff --git a/hed/tools/__init__.py b/hed/tools/__init__.py index 8b1f6fd90..fd1dfbbce 100644 --- a/hed/tools/__init__.py +++ b/hed/tools/__init__.py @@ -47,7 +47,8 @@ from .analysis.annotation_util import \ check_df_columns, extract_tags, generate_sidecar_entry, hed_to_df, df_to_hed, merge_hed_dict from .analysis import analysis_util -from .analysis.analysis_util import assemble_hed, search_tabular, get_assembled_strings +from .analysis.analysis_util import assemble_hed +# from .analysis.analysis_util import search_tabular, get_assembled_strings from .remodeling.cli import run_remodel from .remodeling.cli import run_remodel_backup diff --git a/hed/tools/analysis/analysis_util.py b/hed/tools/analysis/analysis_util.py index fcfd5284c..a4c57c9f6 100644 --- a/hed/tools/analysis/analysis_util.py +++ b/hed/tools/analysis/analysis_util.py @@ -2,7 +2,6 @@ import pandas as pd from hed.models.tabular_input import TabularInput -from hed.models.expression_parser import QueryParser from hed.tools.util.data_util import separate_values from hed.models.hed_tag import HedTag from hed.models.hed_group import HedGroup @@ -45,57 +44,57 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs return df, definitions -def get_assembled_strings(table, hed_schema=None, expand_defs=False): - """ Return HED string objects for a tabular file. - - Parameters: - table (TabularInput): The input file to be searched. - hed_schema (HedSchema or HedschemaGroup): If provided the HedStrings are converted to canonical form. - expand_defs (bool): If True, definitions are expanded when the events are assembled. 
- - Returns: - list: A list of HedString or HedStringGroup objects. - - """ - hed_list = list(table.iter_dataframe(hed_ops=[hed_schema], return_string_only=True, - expand_defs=expand_defs, remove_definitions=True)) - return hed_list - - -def search_tabular(data_input, hed_schema, query, columns_included=None): - """ Return a dataframe with results of query. - - Parameters: - data_input (TabularInput): The tabular input file (e.g., events) to be searched. - hed_schema (HedSchema or HedSchemaGroup): The schema(s) under which to make the query. - query (str or list): The str query or list of string queries to make. - columns_included (list or None): List of names of columns to include - - Returns: - DataFrame or None: A DataFrame with the results of the query or None if no events satisfied the query. - - """ - - eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) - hed_list = get_assembled_strings(data_input, hed_schema=hed_schema, expand_defs=True) - expression = QueryParser(query) - hed_tags = [] - row_numbers = [] - for index, next_item in enumerate(hed_list): - match = expression.search(next_item) - if not match: - continue - hed_tags.append(next_item) - row_numbers.append(index) - - if not row_numbers: - df = None - elif not eligible_columns: - df = pd.DataFrame({'row_number': row_numbers, 'HED_assembled': hed_tags}) - else: - df = data_input.dataframe.iloc[row_numbers][eligible_columns].reset_index() - df.rename(columns={'index': 'row_number'}) - return df +# def get_assembled_strings(table, hed_schema=None, expand_defs=False): +# """ Return HED string objects for a tabular file. +# +# Parameters: +# table (TabularInput): The input file to be searched. +# hed_schema (HedSchema or HedschemaGroup): If provided the HedStrings are converted to canonical form. +# expand_defs (bool): If True, definitions are expanded when the events are assembled. 
+# +# Returns: +# list: A list of HedString or HedStringGroup objects. +# +# """ +# hed_list = list(table.iter_dataframe(hed_ops=[hed_schema], return_string_only=True, +# expand_defs=expand_defs, remove_definitions=True)) +# return hed_list +# + +# def search_tabular(data_input, hed_schema, query, columns_included=None): +# """ Return a dataframe with results of query. +# +# Parameters: +# data_input (TabularInput): The tabular input file (e.g., events) to be searched. +# hed_schema (HedSchema or HedSchemaGroup): The schema(s) under which to make the query. +# query (str or list): The str query or list of string queries to make. +# columns_included (list or None): List of names of columns to include +# +# Returns: +# DataFrame or None: A DataFrame with the results of the query or None if no events satisfied the query. +# +# """ +# +# eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) +# hed_list = get_assembled_strings(data_input, hed_schema=hed_schema, expand_defs=True) +# expression = QueryParser(query) +# hed_tags = [] +# row_numbers = [] +# for index, next_item in enumerate(hed_list): +# match = expression.search(next_item) +# if not match: +# continue +# hed_tags.append(next_item) +# row_numbers.append(index) +# +# if not row_numbers: +# df = None +# elif not eligible_columns: +# df = pd.DataFrame({'row_number': row_numbers, 'HED_assembled': hed_tags}) +# else: +# df = data_input.dataframe.iloc[row_numbers][eligible_columns].reset_index() +# df.rename(columns={'index': 'row_number'}) +# return df # def remove_defs(hed_strings): diff --git a/hed/tools/analysis/event_manager.py b/hed/tools/analysis/event_manager.py index 2d6da7adc..f8bf5e5f5 100644 --- a/hed/tools/analysis/event_manager.py +++ b/hed/tools/analysis/event_manager.py @@ -3,25 +3,26 @@ from hed.schema import HedSchema, HedSchemaGroup from hed.tools.analysis.temporal_event import TemporalEvent from hed.models.model_constants import DefTagNames 
+from hed.models.df_util import get_assembled class EventManager: - def __init__(self, data, hed_schema): + def __init__(self, data, schema): """ Create an event manager for an events file. Parameters: data (TabularInput): A tabular input file. - hed_schema (HedSchema): A HED schema + schema (HedSchema): A HED schema Raises: HedFileError: if there are any unmatched offsets. """ - if not isinstance(hed_schema, HedSchema) and not isinstance(hed_schema, HedSchemaGroup): + if not isinstance(schema, HedSchema) and not isinstance(schema, HedSchemaGroup): raise ValueError("ContextRequiresSchema", f"Context manager must have a valid HedSchema of HedSchemaGroup") - self.hed_schema = hed_schema + self.schema = schema self.data = data self.event_list = [[] for _ in range(len(self.data.dataframe))] self.hed_strings = [None for _ in range(len(self.data.dataframe))] @@ -56,10 +57,10 @@ def _create_event_list(self): onset_dict = {} event_index = 0 - for hed in self.data.iter_dataframe(hed_ops=[self.hed_schema], return_string_only=True, - expand_defs=False, remove_definitions=True): + self.hed_strings, definitions = get_assembled(self.data, self.data._sidecar, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + for hed in self.hed_strings: # to_remove = [] # tag_tuples = hed.find_tags(['Onset'], recursive=False, include_groups=1) - self.hed_strings[event_index] = hed group_tuples = hed.find_top_level_tags(anchor_tags={DefTagNames.ONSET_KEY, DefTagNames.OFFSET_KEY}, include_groups=2) for tup in group_tuples: diff --git a/hed/tools/analysis/hed_context_manager.py b/hed/tools/analysis/hed_context_manager.py index 06a02dc82..5c565a9a4 100644 --- a/hed/tools/analysis/hed_context_manager.py +++ b/hed/tools/analysis/hed_context_manager.py @@ -5,6 +5,7 @@ from hed.schema import HedSchema, HedSchemaGroup from hed.tools.analysis.analysis_util import hed_to_str +#TODO: [Refactor] clean up distinction between hed as strings versus objects -- maybe 
replace by event manager. class OnsetGroup: def __init__(self, name, contents, start_index, end_index=None): @@ -23,7 +24,8 @@ def __init__(self, hed_strings, hed_schema): """ Create an context manager for an events file. Parameters: - hed_strings (list): A list of hed_strings to be managed. + hed_strings (list): A list of HedString objects to be managed. + hed_schema (HedSchema): A HedSchema Raises: HedFileError: if there are any unmatched offsets. @@ -46,6 +48,12 @@ def __init__(self, hed_strings, hed_schema): self._create_onset_list() self._set_event_contexts() + # def _extract_hed_objs(self, assembled): + # hed_objs = [None for _ in range(len(assembled))] + # for index, value in assembled["HED_assembled"].items(): + # hed_objs[index] = HedString(value, hed_schema=self.hed_schema) + # return hed_objs + def iter_context(self): """ Iterate rows of context. diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index 41d3f805a..aa02224b9 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -7,7 +7,7 @@ from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar from hed.models.expression_parser import QueryParser -from hed.tools.analysis.analysis_util import get_assembled_strings +from hed.models.df_util import get_assembled class FactorHedTagsOp(BaseOp): @@ -101,16 +101,16 @@ def do_op(self, dispatcher, df, name, sidecar=None): """ if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar(sidecar, hed_schema=dispatcher.hed_schema) - input_data = TabularInput(df, hed_schema=dispatcher.hed_schema, sidecar=sidecar) + sidecar = Sidecar(sidecar) + input_data = TabularInput(df.copy(), sidecar=sidecar, name=name) column_names = list(df.columns) - for name in self.query_names: - if name in column_names: + for query_name in self.query_names: + if query_name in column_names: raise 
ValueError("QueryNameAlreadyColumn", - f"Query [{name}]: is already a column name of the data frame") - df = input_data.dataframe.copy() - df_list = [df] - hed_strings = get_assembled_strings(input_data, hed_schema=dispatcher.hed_schema, expand_defs=True) + f"Query [{query_name}]: is already a column name of the data frame") + df_list = [input_data.dataframe] + hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None, + join_columns=True, shrink_defs=False, expand_defs=True) df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=self.query_names) for parse_ind, parser in enumerate(self.expression_parsers): for index, next_item in enumerate(hed_strings): diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index e4a43c181..668886c88 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -5,7 +5,7 @@ from hed.tools.remodeling.operations.base_op import BaseOp from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar -from hed.tools.analysis.analysis_util import get_assembled_strings +from hed.models.df_util import get_assembled from hed.tools.analysis.hed_type_manager import HedTypeManager # TODO: restricted factor values are not implemented yet. 
@@ -69,13 +69,13 @@ def do_op(self, dispatcher, df, name, sidecar=None): """ if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar(sidecar, hed_schema=dispatcher.hed_schema) - input_data = TabularInput(df, hed_schema=dispatcher.hed_schema, sidecar=sidecar) - df = input_data.dataframe.copy() - df_list = [df] - hed_strings = get_assembled_strings(input_data, hed_schema=dispatcher.hed_schema, expand_defs=False) + sidecar = Sidecar(sidecar) + input_data = TabularInput(df, sidecar=sidecar, name=name) + df_list = [input_data.dataframe.copy()] + hed_strings, definitions = get_assembled(input_data, sidecar, dispatcher.hed_schema, + extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True) - definitions = input_data.get_definitions() var_manager = HedTypeManager(hed_strings, dispatcher.hed_schema, definitions) var_manager.add_type_variable(self.type_tag.lower()) diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 09f7e3a48..a8d220df8 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -5,6 +5,7 @@ from hed.tools.analysis.hed_tag_counts import HedTagCounts from hed.tools.remodeling.operations.base_op import BaseOp from hed.tools.remodeling.operations.base_context import BaseContext +from hed.models.df_util import get_assembled class SummarizeHedTagsOp(BaseOp): @@ -97,12 +98,14 @@ def update_context(self, new_context): counts = HedTagCounts(new_context['name'], total_events=len(new_context['df'])) sidecar = new_context['sidecar'] if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar(sidecar, hed_schema=new_context['schema']) - input_data = TabularInput(new_context['df'], hed_schema=new_context['schema'], sidecar=sidecar) + sidecar = Sidecar(sidecar) + input_data = TabularInput(new_context['df'], sidecar=sidecar, name=new_context['name']) + hed_strings, 
definitions = get_assembled(input_data, sidecar, new_context['schema'], + extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True) # definitions = input_data.get_definitions().gathered_defs - for objs in input_data.iter_dataframe(hed_ops=[new_context['schema']], return_string_only=False, - expand_defs=True, remove_definitions=True): - counts.update_event_counts(objs['HED'], new_context['name']) + for hed in hed_strings: + counts.update_event_counts(hed, new_context['name']) self.summary_dict[new_context["name"]] = counts def _get_summary_details(self, merge_counts): diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 2c7ab7c64..0e2664698 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -2,7 +2,7 @@ from hed.models.tabular_input import TabularInput from hed.models.sidecar import Sidecar -from hed.tools.analysis.analysis_util import get_assembled_strings +from hed.models.df_util import get_assembled from hed.tools.analysis.hed_type_values import HedTypeValues from hed.tools.analysis.hed_type_counts import HedTypeCounts from hed.tools.analysis.hed_context_manager import HedContextManager @@ -90,10 +90,11 @@ def __init__(self, sum_op): def update_context(self, new_context): sidecar = new_context['sidecar'] if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar(sidecar, hed_schema=new_context['schema']) - input_data = TabularInput(new_context['df'], hed_schema=new_context['schema'], sidecar=sidecar) - hed_strings = get_assembled_strings(input_data, hed_schema=new_context['schema'], expand_defs=False) - definitions = input_data.get_definitions().gathered_defs + sidecar = Sidecar(sidecar) + input_data = TabularInput(new_context['df'], sidecar=sidecar, name=new_context['name']) + hed_strings, definitions = get_assembled(input_data, sidecar, new_context['schema'], + 
extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True) context_manager = HedContextManager(hed_strings, new_context['schema']) type_values = HedTypeValues(context_manager, definitions, new_context['name'], type_tag=self.type_tag) diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index 771b49e5c..d1bd8f53e 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -102,7 +102,6 @@ def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT): return "\n".join(sum_list) def update_context(self, new_context): - validator = HedValidator(hed_schema=new_context['schema']) results = self.get_empty_results() results["total_event_files"] = 1 results["event_issues"][new_context["name"]] = [] @@ -111,10 +110,9 @@ def update_context(self, new_context): filtered_issues = [] if sidecar: if not isinstance(sidecar, Sidecar): - sidecar = Sidecar(files=new_context['sidecar'], name=os.path.basename(sidecar), - hed_schema=new_context['schema']) + sidecar = Sidecar(files=new_context['sidecar'], name=os.path.basename(sidecar)) results["sidecar_issues"][sidecar.name] = [] - sidecar_issues = sidecar.validate_entries(validator, check_for_warnings=self.check_for_warnings) + sidecar_issues = sidecar.validate(new_context['schema']) filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) if not self.check_for_warnings: sidecar_issues = filtered_issues @@ -123,8 +121,8 @@ def update_context(self, new_context): results['total_sidecar_files'] = 1 if not filtered_issues: results['validation_completed'] = True - input_data = TabularInput(new_context['df'], hed_schema=new_context['schema'], sidecar=sidecar) - issues = input_data.validate_file(validator, check_for_warnings=self.check_for_warnings) + input_data = TabularInput(new_context['df'], 
sidecar=sidecar) + issues = input_data.validate(new_context['schema']) if not self.check_for_warnings: issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR) results['event_issues'][new_context["name"]] = issues diff --git a/tests/tools/analysis/test_analysis_util_assemble_hed.py b/tests/tools/analysis/test_analysis_util_assemble_hed.py index 9c37b8620..318c3aa54 100644 --- a/tests/tools/analysis/test_analysis_util_assemble_hed.py +++ b/tests/tools/analysis/test_analysis_util_assemble_hed.py @@ -2,8 +2,10 @@ import unittest from pandas import DataFrame from hed import schema as hedschema -from hed.models import Sidecar, TabularInput -from hed.tools import assemble_hed, search_tabular +from hed.models import Sidecar, TabularInput, DefinitionDict +from hed.tools.analysis.analysis_util import assemble_hed + + # noinspection PyBroadException @@ -20,8 +22,8 @@ def setUpClass(cls): events_path = os.path.realpath(os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) - hed_schema = hedschema.load_schema(schema_path) - cls.hed_schema = hed_schema + schema = hedschema.load_schema(schema_path) + cls.schema = schema sidecar1 = Sidecar(json_path, name='face_sub1_json') cls.sidecar_path = sidecar1 cls.sidecar1 = sidecar1 @@ -29,8 +31,8 @@ def setUpClass(cls): cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar") def test_assemble_hed_included_no_expand(self): - df1, dict1 = assemble_hed(self.input_data, self.sidecar1, self.hed_schema, - columns_included=["onset", "duration", "event_type"], expand_defs=False) + df1, dict1 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=False, + columns_included=["onset", "duration", "event_type"]) self.assertIsInstance(df1, DataFrame, "hed_assemble should return a dataframe when columns are included") columns1 = list(df1.columns) self.assertEqual(len(columns1), 4, @@ -43,28 +45,29 @@ def test_assemble_hed_included_no_expand(self): 
self.assertEqual(len(dict1.defs), 17, "hed_assemble definition dictionary has the right number of elements.") def test_assemble_hed_included_expand(self): - df2, dict2 = assemble_hed(self.input_data, self.sidecar1, self.hed_schema, - columns_included=["onset", "duration", "event_type"], expand_defs=True) + df2, dict2 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=True, + columns_included=["onset", "duration", "event_type"]) first_str2 = df2.iloc[0]['HED_assembled'] self.assertEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") self.assertNotEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") def test_assemble_hed_included_no_expand_bad_column(self): - df3, dict3 = assemble_hed(self.input_data, - columns_included=["onset", "baloney", "duration", "event_type"], expand_defs=False) + df3, dict3 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=True, + columns_included=["onset", "baloney", "duration", "event_type"]) columns3 = list(df3.columns) self.assertEqual(len(columns3), 4, "assemble_hed should return the correct number of columns when bad columns are included ") def test_assemble_hed_included_expand_bad_column(self): - df3, dict3 = assemble_hed(self.input_data, - columns_included=["onset", "baloney", "duration", "event_type"], expand_defs=True) + df3, dict3 = assemble_hed(self.input_data, self.sidecar1, self.schema, expand_defs=True, + columns_included=["onset", "baloney", "duration", "event_type"]) columns3 = list(df3.columns) self.assertEqual(len(columns3), 4, "assemble_hed should return the correct number of columns when bad columns are included ") def test_assemble_hed_no_included_no_expand(self): - df1, dict1 = assemble_hed(self.input_data, columns_included=None, expand_defs=False) + df1, dict1 = assemble_hed(self.input_data, self.sidecar1, self.schema, + columns_included=None, expand_defs=False) self.assertIsInstance(df1, DataFrame, 
"hed_assemble returns a dataframe when no columns are included") columns1 = list(df1.columns) self.assertEqual(len(columns1), 1, @@ -73,17 +76,18 @@ def test_assemble_hed_no_included_no_expand(self): self.assertNotEqual(first_str1.find('Def/'), -1, "assemble_hed with no def expand has Def tags") self.assertEqual(first_str1.find('Def-expand'), -1, "assemble_hed with no def expand does not have Def-expand tags") - self.assertIsInstance(dict1, dict, "hed_assemble returns a dictionary of definitions") - self.assertEqual(len(dict1), 17, "hed_assemble definition dictionary has the right number of elements.") + self.assertIsInstance(dict1, DefinitionDict, "hed_assemble returns a dictionary of definitions") + self.assertEqual(len(dict1.defs), 17, "hed_assemble definition dictionary has the right number of elements.") def test_assemble_hed_no_included_expand(self): - df2, dict2 = assemble_hed(self.input_data, columns_included=None, expand_defs=True) + df2, dict2 = assemble_hed(self.input_data, self.sidecar1, self.schema, + columns_included=None, expand_defs=True) first_str2 = df2.iloc[0]['HED_assembled'] self.assertEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") self.assertNotEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") def test_assemble_hed_bad_column_no_expand(self): - df3, dict3 = assemble_hed(self.input_data, + df3, dict3 = assemble_hed(self.input_data, self.sidecar1, self.schema, columns_included=["onset", "baloney", "duration", "event_type"], expand_defs=False) columns3 = list(df3.columns) self.assertEqual(len(columns3), 4, @@ -92,27 +96,27 @@ def test_assemble_hed_bad_column_no_expand(self): self.assertNotEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") self.assertEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") - def test_search_tabular(self): - query1 = "sensory-event" - df1 = 
search_tabular(self.input_data, self.hed_schema, query1, columns_included=None) - self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.") - self.assertEqual(len(df1.columns), 2, "search_tabular has the right number of columns when query okay") - self.assertEqual(len(df1.index), 155, "search_tabular has right number of rows when query okay") - query2 = 'data-feature' - df2 = search_tabular(self.input_data, self.hed_schema, query2, columns_included=None) - self.assertFalse(df2, "search_tabular returns None when query is not satisfied.") - - query3 = "sensory-event" - df3 = search_tabular(self.input_data, self.hed_schema, query3, columns_included=['event_type', 'rep_status']) - self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") - self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") - self.assertEqual(len(df3.index), 155, "search_tabular has right number of rows when query okay") - - df4 = search_tabular(self.input_data, self.hed_schema, query3, - columns_included=['onset', 'event_type', 'rep_status']) - self.assertIsInstance(df4, DataFrame, "search_tabular returns a DataFrame when extra columns") - self.assertEqual(len(df4.columns), 4, "search_tabular returns right number of columns when extra columns") - self.assertEqual(len(df4.index), 155, "search_tabular has right number of rows when query okay") + # def test_search_tabular(self): + # query1 = "sensory-event" + # df1 = search_tabular(self.input_data, self.schema, query1, columns_included=None) + # self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.") + # self.assertEqual(len(df1.columns), 2, "search_tabular has the right number of columns when query okay") + # self.assertEqual(len(df1.index), 155, "search_tabular has right number of rows when query okay") + # query2 = 'data-feature' + # df2 = 
search_tabular(self.input_data, self.hed_schema, query2, columns_included=None) + # self.assertFalse(df2, "search_tabular returns None when query is not satisfied.") + # + # query3 = "sensory-event" + # df3 = search_tabular(self.input_data, self.hed_schema, query3, columns_included=['event_type', 'rep_status']) + # self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") + # self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") + # self.assertEqual(len(df3.index), 155, "search_tabular has right number of rows when query okay") + # + # df4 = search_tabular(self.input_data, self.hed_schema, query3, + # columns_included=['onset', 'event_type', 'rep_status']) + # self.assertIsInstance(df4, DataFrame, "search_tabular returns a DataFrame when extra columns") + # self.assertEqual(len(df4.columns), 4, "search_tabular returns right number of columns when extra columns") + # self.assertEqual(len(df4.index), 155, "search_tabular has right number of rows when query okay") if __name__ == '__main__': diff --git a/tests/tools/analysis/test_analysis_util_get_assembled_strings.py b/tests/tools/analysis/test_analysis_util_get_assembled_strings.py index 143db3305..036b4c938 100644 --- a/tests/tools/analysis/test_analysis_util_get_assembled_strings.py +++ b/tests/tools/analysis/test_analysis_util_get_assembled_strings.py @@ -3,7 +3,7 @@ from hed import schema as hedschema from hed.models.hed_string import HedString from hed.models.tabular_input import TabularInput -from hed.tools.analysis.analysis_util import get_assembled_strings +# from hed.tools.analysis.analysis_util import get_assembled_strings # noinspection PyBroadException @@ -26,90 +26,89 @@ def setUpClass(cls): # cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar") def setUp(self): - self.input_data = TabularInput(self.events_path, hed_schema=self.hed_schema, - sidecar=self.json_path, 
name="face_sub1_events") + self.input_data = TabularInput(self.events_path, sidecar=self.json_path, name="face_sub1_events") - def test_get_assembled_strings_no_schema_no_def_expand(self): - hed_list1 = get_assembled_strings(self.input_data, expand_defs=False) - self.assertIsInstance(hed_list1, list, "get_assembled_groups should return a list when expand defs is False") - self.assertIsInstance(hed_list1[0], HedString) - hed_strings1 = [str(hed) for hed in hed_list1] - self.assertIsInstance(hed_strings1[0], str, "get_assembled_strings can be converted.") - self.assertIsInstance(hed_strings1, list) - hed_strings_joined1 = ",".join(hed_strings1) - self.assertEqual(hed_strings_joined1.find("Def-expand/"), -1, - "get_assembled_strings should not have Def-expand when expand_defs is False") - self.assertNotEqual(hed_strings_joined1.find("Def/"), -1, - "get_assembled_strings should have Def/ when expand_defs is False") - - def test_get_assembled_strings_no_schema_def_expand(self): - hed_list2 = get_assembled_strings(self.input_data, expand_defs=True) - self.assertIsInstance(hed_list2, list, "get_assembled_groups should return a list") - self.assertIsInstance(hed_list2[0], HedString) - hed_strings2 = [str(hed) for hed in hed_list2] - self.assertIsInstance(hed_strings2[0], str, "get_assembled_strings can be converted.") - self.assertIsInstance(hed_strings2, list, "get_assembled") - hed_strings_joined2 = ",".join(hed_strings2) - self.assertNotEqual(hed_strings_joined2.find("Def-expand/"), -1, - "get_assembled_strings should have Def-expand when expand_defs is True") - self.assertEqual(hed_strings_joined2.find("Def/"), -1, - "get_assembled_strings should not have Def/ when expand_defs is True") - - def test_get_assembled_strings_with_schema_no_def_expand(self): - hed_list1 = get_assembled_strings(self. 
input_data, hed_schema=self.hed_schema, expand_defs=False) - self.assertIsInstance(hed_list1, list, "get_assembled_strings returns a list when expand defs is False") - self.assertIsInstance(hed_list1[0], HedString) - hed_strings1 = [str(hed) for hed in hed_list1] - self.assertIsInstance(hed_strings1[0], str, "get_assembled_strings can be converted.") - self.assertIsInstance(hed_strings1, list) - hed_strings_joined1 = ",".join(hed_strings1) - self.assertEqual(hed_strings_joined1.find("Def-expand/"), -1, - "get_assembled_strings does not have Def-expand when expand_defs is False") - self.assertNotEqual(hed_strings_joined1.find("Def/"), -1, - "get_assembled_strings should have Def/ when expand_defs is False") - - def test_get_assembled_strings_with_schema_def_expand(self): - hed_list2 = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=True) - self.assertIsInstance(hed_list2, list, "get_assembled_groups should return a list") - self.assertIsInstance(hed_list2[0], HedString) - hed_strings2 = [str(hed) for hed in hed_list2] - self.assertIsInstance(hed_strings2[0], str, "get_assembled_strings can be converted.") - self.assertIsInstance(hed_strings2, list, "get_assembled") - hed_strings_joined2 = ",".join(hed_strings2) - self.assertNotEqual(hed_strings_joined2.find("Def-expand/"), -1, - "get_assembled_strings should have Def-expand when expand_defs is True") - self.assertEqual(hed_strings_joined2.find("Def/"), -1, - "get_assembled_strings should not have Def/ when expand_defs is True") - - def test_get_assembled_strings_no_sidecar_no_schema(self): - input_data = TabularInput(self.events_path, name="face_sub1_events") - hed_list1 = get_assembled_strings(input_data, expand_defs=False) - self.assertEqual(len(hed_list1), 200, - "get_assembled_strings should have right number of entries when no sidecar") - self.assertIsInstance(hed_list1[0], HedString, - "get_assembled_string should return an HedString when no sidecar") - 
self.assertFalse(hed_list1[0].children, "get_assembled_string returned HedString is empty when no sidecar") - hed_list2 = get_assembled_strings(input_data, expand_defs=True) - self.assertEqual(len(hed_list2), 200, - "get_assembled_strings should have right number of entries when no sidecar") - self.assertIsInstance(hed_list2[0], HedString, - "get_assembled_string should return an HedString when no sidecar") - self.assertFalse(hed_list2[0].children, "get_assembled_string returned HedString is empty when no sidecar") - - def test_get_assembled_strings_no_sidecar_schema(self): - input_data = TabularInput(self.events_path, hed_schema=self.hed_schema, name="face_sub1_events") - hed_list1 = get_assembled_strings(input_data, expand_defs=False) - self.assertEqual(len(hed_list1), 200, - "get_assembled_strings should have right number of entries when no sidecar") - self.assertIsInstance(hed_list1[0], HedString, - "get_assembled_string should return an HedString when no sidecar") - self.assertFalse(hed_list1[0].children, "get_assembled_string returned HedString is empty when no sidecar") - hed_list2 = get_assembled_strings(input_data, expand_defs=True) - self.assertEqual(len(hed_list2), 200, - "get_assembled_strings should have right number of entries when no sidecar") - self.assertIsInstance(hed_list2[0], HedString, - "get_assembled_string should return an HedString when no sidecar") - self.assertFalse(hed_list2[0].children, "get_assembled_string returned HedString is empty when no sidecar") + # def test_get_assembled_strings_no_schema_no_def_expand(self): + # hed_list1 = get_assembled_strings(self.input_data, expand_defs=False) + # self.assertIsInstance(hed_list1, list, "get_assembled_groups should return a list when expand defs is False") + # self.assertIsInstance(hed_list1[0], HedString) + # hed_strings1 = [str(hed) for hed in hed_list1] + # self.assertIsInstance(hed_strings1[0], str, "get_assembled_strings can be converted.") + # self.assertIsInstance(hed_strings1, list) 
+ # hed_strings_joined1 = ",".join(hed_strings1) + # self.assertEqual(hed_strings_joined1.find("Def-expand/"), -1, + # "get_assembled_strings should not have Def-expand when expand_defs is False") + # self.assertNotEqual(hed_strings_joined1.find("Def/"), -1, + # "get_assembled_strings should have Def/ when expand_defs is False") + # + # def test_get_assembled_strings_no_schema_def_expand(self): + # hed_list2 = get_assembled_strings(self.input_data, self.hed_schema, expand_defs=True) + # self.assertIsInstance(hed_list2, list, "get_assembled_groups should return a list") + # self.assertIsInstance(hed_list2[0], HedString) + # hed_strings2 = [str(hed) for hed in hed_list2] + # self.assertIsInstance(hed_strings2[0], str, "get_assembled_strings can be converted.") + # self.assertIsInstance(hed_strings2, list, "get_assembled") + # hed_strings_joined2 = ",".join(hed_strings2) + # self.assertNotEqual(hed_strings_joined2.find("Def-expand/"), -1, + # "get_assembled_strings should have Def-expand when expand_defs is True") + # self.assertEqual(hed_strings_joined2.find("Def/"), -1, + # "get_assembled_strings should not have Def/ when expand_defs is True") + # + # def test_get_assembled_strings_with_schema_no_def_expand(self): + # hed_list1 = get_assembled_strings(self. 
input_data, hed_schema=self.hed_schema, expand_defs=False) + # self.assertIsInstance(hed_list1, list, "get_assembled_strings returns a list when expand defs is False") + # self.assertIsInstance(hed_list1[0], HedString) + # hed_strings1 = [str(hed) for hed in hed_list1] + # self.assertIsInstance(hed_strings1[0], str, "get_assembled_strings can be converted.") + # self.assertIsInstance(hed_strings1, list) + # hed_strings_joined1 = ",".join(hed_strings1) + # self.assertEqual(hed_strings_joined1.find("Def-expand/"), -1, + # "get_assembled_strings does not have Def-expand when expand_defs is False") + # self.assertNotEqual(hed_strings_joined1.find("Def/"), -1, + # "get_assembled_strings should have Def/ when expand_defs is False") + # + # def test_get_assembled_strings_with_schema_def_expand(self): + # hed_list2 = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=True) + # self.assertIsInstance(hed_list2, list, "get_assembled_groups should return a list") + # self.assertIsInstance(hed_list2[0], HedString) + # hed_strings2 = [str(hed) for hed in hed_list2] + # self.assertIsInstance(hed_strings2[0], str, "get_assembled_strings can be converted.") + # self.assertIsInstance(hed_strings2, list, "get_assembled") + # hed_strings_joined2 = ",".join(hed_strings2) + # self.assertNotEqual(hed_strings_joined2.find("Def-expand/"), -1, + # "get_assembled_strings should have Def-expand when expand_defs is True") + # self.assertEqual(hed_strings_joined2.find("Def/"), -1, + # "get_assembled_strings should not have Def/ when expand_defs is True") + # + # def test_get_assembled_strings_no_sidecar_no_schema(self): + # input_data = TabularInput(self.events_path, name="face_sub1_events") + # hed_list1 = get_assembled_strings(input_data, expand_defs=False) + # self.assertEqual(len(hed_list1), 200, + # "get_assembled_strings should have right number of entries when no sidecar") + # self.assertIsInstance(hed_list1[0], HedString, + # "get_assembled_string should 
return an HedString when no sidecar") + # self.assertFalse(hed_list1[0].children, "get_assembled_string returned HedString is empty when no sidecar") + # hed_list2 = get_assembled_strings(input_data, expand_defs=True) + # self.assertEqual(len(hed_list2), 200, + # "get_assembled_strings should have right number of entries when no sidecar") + # self.assertIsInstance(hed_list2[0], HedString, + # "get_assembled_string should return an HedString when no sidecar") + # self.assertFalse(hed_list2[0].children, "get_assembled_string returned HedString is empty when no sidecar") + # + # def test_get_assembled_strings_no_sidecar_schema(self): + # input_data = TabularInput(self.events_path, hed_schema=self.hed_schema, name="face_sub1_events") + # hed_list1 = get_assembled_strings(input_data, expand_defs=False) + # self.assertEqual(len(hed_list1), 200, + # "get_assembled_strings should have right number of entries when no sidecar") + # self.assertIsInstance(hed_list1[0], HedString, + # "get_assembled_string should return an HedString when no sidecar") + # self.assertFalse(hed_list1[0].children, "get_assembled_string returned HedString is empty when no sidecar") + # hed_list2 = get_assembled_strings(input_data, expand_defs=True) + # self.assertEqual(len(hed_list2), 200, + # "get_assembled_strings should have right number of entries when no sidecar") + # self.assertIsInstance(hed_list2[0], HedString, + # "get_assembled_string should return an HedString when no sidecar") + # self.assertFalse(hed_list2[0].children, "get_assembled_string returned HedString is empty when no sidecar") if __name__ == '__main__': diff --git a/tests/tools/analysis/test_annotation_util.py b/tests/tools/analysis/test_annotation_util.py index fcf2ce03a..f54dd1dc8 100644 --- a/tests/tools/analysis/test_annotation_util.py +++ b/tests/tools/analysis/test_annotation_util.py @@ -232,19 +232,19 @@ def test_hed_to_df_with_definitions(self): "hed_to_df should have right description when in parentheses") def 
test_hed_to_df_to_hed(self): - validator = HedValidator(self.hed_schema) + # validator = HedValidator(self.hed_schema) side1 = Sidecar(files=self.json_path, name="sidecar_face.json") - issues1 = side1.validate_entries(validator, check_for_warnings=True) + issues1 = side1.validate(self.hed_schema) self.assertFalse(issues1, "hed_to_df_to_hed is starting with a valid JSON sidecar") df1 = hed_to_df(self.sidecar_face) self.assertIsInstance(df1, DataFrame, "hed_to_df_to_hed starting sidecar can be converted to df") hed2 = df_to_hed(df1, description_tag=True) side2 = Sidecar(files=io.StringIO(json.dumps(hed2)), name='JSON_Sidecar2') - issues2 = side2.validate_entries(validator, check_for_warnings=True) + issues2 = side2.validate(self.hed_schema) self.assertFalse(issues2, "hed_to_df_to_hed is valid after conversion back and forth with description True") hed3 = df_to_hed(df1, description_tag=False) side3 = Sidecar(files=io.StringIO(json.dumps(hed3)), name='JSON_Sidecar2') - issues3 = side3.validate_entries(validator, check_for_warnings=True) + issues3 = side3.validate(self.hed_schema) self.assertFalse(issues3, "hed_to_df_to_hed is valid after conversion back and forth with description False") def test_merge_hed_dict_cat_col(self): diff --git a/tests/tools/analysis/test_event_manager.py b/tests/tools/analysis/test_event_manager.py index dd920256a..09eb17a50 100644 --- a/tests/tools/analysis/test_event_manager.py +++ b/tests/tools/analysis/test_event_manager.py @@ -1,13 +1,9 @@ import os import unittest -from hed.errors.exceptions import HedFileError -from hed.models.hed_group import HedGroup -from hed.models.hed_string import HedString + from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput from hed.schema.hed_schema_io import load_schema_version -from hed.tools.analysis.hed_context_manager import HedContextManager, OnsetGroup -from hed.tools.analysis.analysis_util import get_assembled_strings from hed.tools.analysis.event_manager import 
EventManager from hed.tools.analysis.temporal_event import TemporalEvent @@ -23,7 +19,7 @@ def setUpClass(cls): 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') - cls.input_data = TabularInput(events_path, sidecar=sidecar1, hed_schema=schema, name="face_sub1_events") + cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") cls.schema = schema def test_constructor(self): @@ -33,14 +29,13 @@ def test_constructor(self): for index, item in enumerate(manager1.event_list): for event in item: event_count = event_count + 1 - self.assertFalse(event.duration) + self.assertFalse(event.duration) self.assertTrue(event.end_index) self.assertEqual(event.start_index, index) self.assertEqual(event.start_index, index) self.assertEqual(event.start_time, manager1.data.dataframe.loc[index, "onset"]) if not event.end_time: self.assertEqual(event.end_index, len(manager1.data.dataframe)) - print("to here") # def test_constructor(self): @@ -56,7 +51,7 @@ def test_constructor(self): # self.assertEqual(hed, manager1.hed_strings[i]) # self.assertEqual(context, manager1.contexts[i]) # i = i + 1 - # + # def test_constructor_from_assembled(self): # hed_strings = get_assembled_strings(self.input_data, hed_schema=self.schema, expand_defs=False) # manager1 = HedContextManager(hed_strings, self.schema) @@ -64,12 +59,12 @@ def test_constructor(self): # "The constructor for assembled strings has expected # of strings") # self.assertEqual(len(manager1.onset_list), 261, # "The constructor for assembled strings has onset_list of correct length") - # + # def test_constructor_unmatched(self): # with self.assertRaises(HedFileError) as context: # HedContextManager(self.test_strings2, self.schema) # self.assertEqual(context.exception.args[0], 'UnmatchedOffset') - # + # def test_constructor_multiple_values(self): # 
manager = HedContextManager(self.test_strings3, self.schema) # self.assertEqual(len(manager.onset_list), 3, "Constructor should have right number of onsets") diff --git a/tests/tools/analysis/test_hed_context_manager.py b/tests/tools/analysis/test_hed_context_manager.py index 26e0f4e87..2ac042453 100644 --- a/tests/tools/analysis/test_hed_context_manager.py +++ b/tests/tools/analysis/test_hed_context_manager.py @@ -77,8 +77,8 @@ def test_constructor1(self): self.assertEqual(cont.exception.args[0], "ContextRequiresSchema") def test_iter(self): - hed_strings, _ = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, - join_columns=True, shrink_defs=True, expand_defs=False) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) manager1 = HedContextManager(hed_strings, self.schema) i = 0 for hed, context in manager1.iter_context(): @@ -87,8 +87,8 @@ def test_iter(self): i = i + 1 def test_constructor_from_assembled(self): - hed_strings, _ = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, - join_columns=True, shrink_defs=True, expand_defs=False) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) manager1 = HedContextManager(hed_strings, self.schema) self.assertEqual(len(manager1.hed_strings), 200, "The constructor for assembled strings has expected # of strings") diff --git a/tests/tools/analysis/test_hed_tag_counts.py b/tests/tools/analysis/test_hed_tag_counts.py index 76b0a9eaf..0950ea909 100644 --- a/tests/tools/analysis/test_hed_tag_counts.py +++ b/tests/tools/analysis/test_hed_tag_counts.py @@ -25,7 +25,7 @@ def setUpClass(cls): cls.hed_schema = schema sidecar1 = Sidecar(json_path, name='face_sub1_json') input_data = TabularInput(events_path, sidecar=sidecar1, 
name="face_sub1_events") - input_df, def_dict = assemble_hed(input_data, expand_defs=False) + input_df, def_dict = assemble_hed(input_data, sidecar1, schema, expand_defs=False) cls.input_df = input_df cls.def_dict = def_dict diff --git a/tests/tools/analysis/test_hed_type_counts.py b/tests/tools/analysis/test_hed_type_counts.py index 711b8d4c9..c4fd22cab 100644 --- a/tests/tools/analysis/test_hed_type_counts.py +++ b/tests/tools/analysis/test_hed_type_counts.py @@ -6,7 +6,7 @@ from hed.tools.analysis.hed_context_manager import HedContextManager from hed.tools.analysis.hed_type_values import HedTypeValues from hed.tools.analysis.hed_type_counts import HedTypeCount, HedTypeCounts -from hed.tools.analysis.analysis_util import get_assembled_strings +from hed.models.df_util import get_assembled class Test(unittest.TestCase): @@ -19,10 +19,10 @@ def setUpClass(cls): events_path = os.path.realpath(os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - sidecar1 = Sidecar(sidecar_path, hed_schema=schema, name='face_sub1_json') - input_data = TabularInput(events_path, sidecar=sidecar1, hed_schema=schema, name="face_sub1_events") - hed_strings1 = get_assembled_strings(input_data, hed_schema=schema, expand_defs=False) - definitions1 = input_data.get_definitions(as_strings=False).gathered_defs + sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') + input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") + hed_strings1, definitions1 = get_assembled(input_data, sidecar1, schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) cls.var_type1 = HedTypeValues(HedContextManager(hed_strings1, schema), definitions1, 'run-01', type_tag='condition-variable') diff --git a/tests/tools/analysis/test_hed_type_definitions.py b/tests/tools/analysis/test_hed_type_definitions.py index 
7a66d7e8e..15cbedce2 100644 --- a/tests/tools/analysis/test_hed_type_definitions.py +++ b/tests/tools/analysis/test_hed_type_definitions.py @@ -42,9 +42,10 @@ def setUpClass(cls): events_path = os.path.realpath(os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - sidecar1 = Sidecar(sidecar_path, hed_schema=schema, name='face_sub1_json') - cls.input_data = TabularInput(events_path, hed_schema=schema, sidecar=sidecar1, name="face_sub1_events") + sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') + cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") cls.schema = schema + cls.sidecar1 = sidecar1 def test_constructor(self): def_man = HedTypeDefinitions(self.definitions1, self.schema) @@ -54,8 +55,8 @@ def test_constructor(self): self.assertEqual(len(def_man.def_map), len(def_man.definitions), "Constructor condition_map should be the same length as the definitions dictionary") - def test_constructor_from_tabular_input(self): - definitions = self.input_data.get_definitions(as_strings=False).gathered_defs + def test_constructor_from_sidecar(self): + definitions = self.sidecar1.get_def_dict(self.schema) def_man = HedTypeDefinitions(definitions, self.schema) self.assertIsInstance(def_man, HedTypeDefinitions, "Constructor should create a HedTypeDefinitions from a tabular input") diff --git a/tests/tools/analysis/test_hed_type_factors.py b/tests/tools/analysis/test_hed_type_factors.py index 5615453da..5821e2675 100644 --- a/tests/tools/analysis/test_hed_type_factors.py +++ b/tests/tools/analysis/test_hed_type_factors.py @@ -10,7 +10,7 @@ from hed.tools.analysis.hed_context_manager import HedContextManager from hed.tools.analysis.hed_type_values import HedTypeValues from hed.tools.analysis.hed_type_factors import HedTypeFactors -from hed.tools.analysis.analysis_util import get_assembled_strings +from 
hed.models.df_util import get_assembled class Test(unittest.TestCase): @@ -57,8 +57,9 @@ def setUpClass(cls): events_path = os.path.realpath(os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - sidecar1 = Sidecar(sidecar_path, hed_schema=schema, name='face_sub1_json') - cls.input_data = TabularInput(events_path, sidecar=sidecar1, hed_schema=schema, name="face_sub1_events") + sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') + cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") + cls.sidecar1 = sidecar1 cls.schema = schema def test_with_mixed(self): @@ -73,9 +74,9 @@ def test_with_mixed(self): self.assertIsInstance(summary1, dict) def test_tabular_input(self): - test_strings1 = get_assembled_strings(self.input_data, hed_schema=self.schema, expand_defs=False) - definitions = self.input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01') + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeValues(HedContextManager(hed_strings, self.schema), definitions, 'run-01') self.assertIsInstance(var_manager, HedTypeValues, "Constructor should create a HedTypeManager from a tabular input") var_fact = var_manager.get_type_value_factors('face-type') @@ -154,8 +155,8 @@ def test_count_events(self): self.assertIsNone(max_multiple2, "_count_level_events should not have a max multiple for empty list") def test_get_summary(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.schema, expand_defs=False) - definitions = self.input_data.get_definitions(as_strings=False).gathered_defs + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, 
self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) var_manager = HedTypeValues(HedContextManager(hed_strings, self.schema), definitions, 'run-01') var_key = var_manager.get_type_value_factors('key-assignment') sum_key = var_key.get_summary() diff --git a/tests/tools/analysis/test_hed_type_manager.py b/tests/tools/analysis/test_hed_type_manager.py index 82bdf0e8b..9fd7abce2 100644 --- a/tests/tools/analysis/test_hed_type_manager.py +++ b/tests/tools/analysis/test_hed_type_manager.py @@ -6,7 +6,7 @@ from hed.tools.analysis.hed_type_values import HedTypeValues from hed.tools.analysis.hed_type_factors import HedTypeFactors from hed.tools.analysis.hed_type_manager import HedTypeManager -from hed.tools.analysis.analysis_util import get_assembled_strings +from hed.models.df_util import get_assembled class Test(unittest.TestCase): @@ -18,14 +18,16 @@ def setUp(self): events_path = os.path.realpath(os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - sidecar1 = Sidecar(sidecar_path, hed_schema=schema, name='face_sub1_json') - self.input_data = TabularInput(events_path, sidecar=sidecar1, hed_schema=schema, name="face_sub1_events") - self.hed_strings = get_assembled_strings(self.input_data, hed_schema=schema, expand_defs=False) - self.hed_schema = schema - self.definitions = self.input_data.get_definitions() + sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') + self.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") + self.hed_strings, self.definitions = get_assembled(self.input_data, sidecar1, schema, + extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + self.sidecar1 = sidecar1 + self.schema = schema def test_constructor(self): - var_manager = HedTypeManager(self.hed_strings, self.hed_schema, self.definitions) + var_manager = 
HedTypeManager(self.hed_strings, self.schema, self.definitions) self.assertIsInstance(var_manager, HedTypeManager, "Constructor should create a HedTypeManager from a tabular input") self.assertEqual(len(var_manager.context_manager.hed_strings), len(var_manager.context_manager.contexts), @@ -33,7 +35,7 @@ def test_constructor(self): self.assertFalse(var_manager._type_tag_map, "constructor has empty map") def test_add_type_variable(self): - var_manager = HedTypeManager(self.hed_strings, self.hed_schema, self.definitions) + var_manager = HedTypeManager(self.hed_strings, self.schema, self.definitions) self.assertFalse(var_manager._type_tag_map, "constructor has empty map") var_manager.add_type_variable("Condition-variable") self.assertEqual(len(var_manager._type_tag_map), 1, @@ -48,10 +50,10 @@ def test_add_type_variable(self): "add_type_variable has 2 element map after two types are added") def test_get_factor_vectors(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) base_length = len(hed_strings) - def_mapper = self.input_data._def_mapper - var_manager = HedTypeManager(hed_strings, self.hed_schema, def_mapper) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) var_manager.add_type_variable("Condition-variable") var_manager.add_type_variable("task") df_cond = var_manager.get_factor_vectors("condition-variable") @@ -64,9 +66,9 @@ def test_get_factor_vectors(self): self.assertIsNone(df_baloney, "get_factor_vectors returns None if no factors") def test_get_type_variable(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) - def_mapper = self.input_data._def_mapper - var_manager = HedTypeManager(hed_strings, self.hed_schema, def_mapper) + hed_strings, definitions = 
get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) var_manager.add_type_variable("Condition-variable") type_var = var_manager.get_type_variable("condition-variable") self.assertIsInstance(type_var, HedTypeValues, @@ -75,9 +77,9 @@ def test_get_type_variable(self): self.assertIsNone(type_var, "get_type_variable returns None if the key does not exist") def test_get_type_variable_def_names(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) - def_mapper = self.input_data._def_mapper - var_manager = HedTypeManager(hed_strings, self.hed_schema, def_mapper) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) var_manager.add_type_variable("Condition-variable") def_names = var_manager.get_type_tag_def_names("condition-variable") self.assertEqual(len(def_names), 7, @@ -88,9 +90,9 @@ def test_get_type_variable_def_names(self): self.assertFalse(def_names, "get_type_tag_def_names returns empty if the type does not exist") def test_get_variable_type_map(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) - def_mapper = self.input_data._def_mapper - var_manager = HedTypeManager(hed_strings, self.hed_schema, def_mapper) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) var_manager.add_type_variable("Condition-variable") this_var = var_manager.get_type_variable("condition-variable") self.assertIsInstance(this_var, HedTypeValues, @@ -104,9 +106,9 
@@ def test_get_variable_type_map(self): "get_type_variable_map map has right length when key upper case") def test_get_type_variable_factor(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) - def_mapper = self.input_data._def_mapper - var_manager = HedTypeManager(hed_strings, self.hed_schema, def_mapper) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) var_manager.add_type_variable("Condition-variable") var_factor1 = var_manager.get_type_tag_factor("condition-variable", "key-assignment") self.assertIsInstance(var_factor1, HedTypeFactors, @@ -117,9 +119,9 @@ def test_get_type_variable_factor(self): self.assertIsNone(var_factor3, "get_type_tag_factor returns None if type variable does not exist") def test_type_variables(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = self.input_data.get_definitions - var_manager = HedTypeManager(hed_strings, self.hed_schema, definitions) + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) vars1 = var_manager.type_variables self.assertFalse(vars1, "type_variables is empty if no types have been added") var_manager.add_type_variable("Condition-variable") @@ -129,9 +131,9 @@ def test_type_variables(self): self.assertEqual(len(vars2), 2, "type_variables return list is right length") def test_summarize_all(self): - hed_strings = get_assembled_strings(self.input_data, hed_schema=self.hed_schema, expand_defs=False) - def_mapper = self.input_data._def_mapper - var_manager = HedTypeManager(hed_strings, self.hed_schema, def_mapper) + 
hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeManager(hed_strings, self.schema, definitions) summary1 = var_manager.summarize_all() self.assertIsInstance(summary1, dict, "summarize_all returns a dictionary when nothing has been added") self.assertFalse(summary1, "summarize_all return dictionary is empty when nothing has been added") diff --git a/tests/tools/analysis/test_hed_type_values.py b/tests/tools/analysis/test_hed_type_values.py index c5ad5557a..4b3125353 100644 --- a/tests/tools/analysis/test_hed_type_values.py +++ b/tests/tools/analysis/test_hed_type_values.py @@ -10,7 +10,7 @@ from hed.schema.hed_schema_io import load_schema_version from hed.tools.analysis.hed_context_manager import HedContextManager from hed.tools.analysis.hed_type_values import HedTypeValues -from hed.tools.analysis.analysis_util import get_assembled_strings +from hed.models.df_util import get_assembled class Test(unittest.TestCase): @@ -53,12 +53,11 @@ def setUpClass(cls): cls.events_path = os.path.realpath(os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) cls.sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - cls.hed_schema = schema + cls.schema = schema def test_constructor(self): - strings1 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings1] - strings2 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings1] - con_man = HedContextManager(strings1, hed_schema=self.hed_schema) + strings1 = [HedString(hed, hed_schema=self.schema) for hed in self.test_strings1] + con_man = HedContextManager(strings1, hed_schema=self.schema) type_var = HedTypeValues(con_man, self.defs, 'run-01') self.assertIsInstance(type_var, HedTypeValues, "Constructor should create a HedTypeManager from strings") @@ -66,22 +65,20 @@ def 
test_constructor(self): "Constructor ConditionVariables should have the right length") def test_constructor_from_tabular_input(self): - sidecar1 = Sidecar(self.sidecar_path, hed_schema=self.hed_schema, name='face_sub1_json') - input_data = TabularInput(self.events_path, hed_schema=self.hed_schema, - sidecar=sidecar1, name="face_sub1_events") - test_strings1 = get_assembled_strings(input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), definitions, 'run-01') + sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') + input_data = TabularInput(self.events_path, sidecar=sidecar1, name="face_sub1_events") + test_strings1, definitions = get_assembled(input_data, sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01') self.assertIsInstance(var_manager, HedTypeValues, "Constructor should create a HedTypeManager from a tabular input") def test_constructor_variable_caps(self): - sidecar1 = Sidecar(self.sidecar_path, hed_schema=self.hed_schema, name='face_sub1_json') - input_data = TabularInput(self.events_path, sidecar=sidecar1, hed_schema=self.hed_schema, - name="face_sub1_events") - test_strings1 = get_assembled_strings(input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), + sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') + input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") + test_strings1, definitions = get_assembled(input_data, sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = 
HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01', type_tag="Condition-variable") self.assertIsInstance(var_manager, HedTypeValues, "Constructor should create a HedTypeManager variable caps") @@ -89,34 +86,33 @@ def test_constructor_variable_caps(self): def test_constructor_variable_task(self): sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') input_data = TabularInput(self.events_path, sidecar=sidecar1, name="face_sub1_events") - test_strings1 = get_assembled_strings(input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), + test_strings1, definitions = get_assembled(input_data, sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01', type_tag="task") self.assertIsInstance(var_manager, HedTypeValues, "Constructor should create a HedTypeManager variable task") def test_constructor_multiple_values(self): - test_strings1 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings2] - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), self.defs, 'run-01') + hed_strings = [HedString(hed, self.schema) for hed in self.test_strings2] + var_manager = HedTypeValues(HedContextManager(hed_strings, self.schema), self.defs, 'run-01') self.assertIsInstance(var_manager, HedTypeValues, "Constructor should create a HedTypeManager from strings") self.assertEqual(len(var_manager._type_value_map), 3, "Constructor should have right number of type_variables if multiple") def test_constructor_unmatched(self): - test_strings1 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings3] + hed_strings = [HedString(hed, self.schema) for hed in self.test_strings3] with 
self.assertRaises(HedFileError) as context: - HedTypeValues(HedContextManager(test_strings1, self.hed_schema), self.defs, 'run-01') + HedTypeValues(HedContextManager(hed_strings, self.schema), self.defs, 'run-01') self.assertEqual(context.exception.args[0], 'UnmatchedOffset') def test_get_variable_factors(self): - sidecar1 = Sidecar(self.sidecar_path, hed_schema=self.hed_schema, name='face_sub1_json') - input_data = TabularInput(self.events_path, sidecar=sidecar1, hed_schema=self.hed_schema, - name="face_sub1_events") - test_strings1 = get_assembled_strings(input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), definitions, 'run-01') + sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') + input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") + test_strings1, definitions = get_assembled(input_data, sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01') df_new1 = var_manager.get_type_factors() self.assertIsInstance(df_new1, DataFrame) self.assertEqual(len(df_new1), 200) @@ -128,47 +124,45 @@ def test_get_variable_factors(self): self.assertIsNone(df_new3) def test_str(self): - sidecar1 = Sidecar(self.sidecar_path, hed_schema=self.hed_schema, name='face_sub1_json') - input_data = TabularInput(self.events_path, hed_schema=self.hed_schema, - sidecar=sidecar1, name="face_sub1_events") - test_strings1 = get_assembled_strings(input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), definitions, 'run-01') + sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') + input_data = 
TabularInput(self.events_path, sidecar1, name="face_sub1_events") + test_strings1, definitions = get_assembled(input_data, sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01') new_str = str(var_manager) self.assertIsInstance(new_str, str) def test_summarize_variables(self): - sidecar1 = Sidecar(self.sidecar_path, hed_schema=self.hed_schema, name='face_sub1_json') - input_data = TabularInput(self.events_path, hed_schema=self.hed_schema, - sidecar=sidecar1, name="face_sub1_events") - test_strings1 = get_assembled_strings(input_data, hed_schema=self.hed_schema, expand_defs=False) - definitions = input_data.get_definitions(as_strings=False).gathered_defs - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), definitions, 'run-01') + sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') + input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") + test_strings1, definitions = get_assembled(input_data, sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=True, expand_defs=False) + var_manager = HedTypeValues(HedContextManager(test_strings1, self.schema), definitions, 'run-01') summary = var_manager.get_summary() self.assertIsInstance(summary, dict, "get_summary produces a dictionary if not json") self.assertEqual(len(summary), 3, "Summarize_variables has right number of condition type_variables") self.assertIn("key-assignment", summary, "get_summary has a correct key") def test_extract_definition_variables(self): - test_strings1 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings1] - var_manager = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), self.defs, 'run-01') + hed_strings = [HedString(hed, self.schema) for hed in self.test_strings1] + var_manager = HedTypeValues(HedContextManager(hed_strings, 
self.schema), self.defs, 'run-01') var_levels = var_manager._type_value_map['var3'].levels self.assertNotIn('cond3/7', var_levels, "_extract_definition_variables before extraction def/cond3/7 not in levels") - tag = HedTag("Def/Cond3/7", hed_schema=self.hed_schema) + tag = HedTag("Def/Cond3/7", hed_schema=self.schema) var_manager._extract_definition_variables(tag, 5) self.assertIn('cond3/7', var_levels, "_extract_definition_variables after extraction def/cond3/7 not in levels") def test_get_variable_names(self): - test_strings1 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings1] - conditions1 = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), self.defs, 'run-01') + hed_strings = [HedString(hed, self.schema) for hed in self.test_strings1] + conditions1 = HedTypeValues(HedContextManager(hed_strings, self.schema), self.defs, 'run-01') list1 = conditions1.get_type_value_names() self.assertEqual(len(list1), 8, "get_variable_tags list should have the right length") def test_get_variable_def_names(self): - test_strings1 = [HedString(hed, hed_schema=self.hed_schema) for hed in self.test_strings1] - conditions1 = HedTypeValues(HedContextManager(test_strings1, self.hed_schema), self.defs, 'run-01') + hed_strings = [HedString(hed, self.schema) for hed in self.test_strings1] + conditions1 = HedTypeValues(HedContextManager(hed_strings, self.schema), self.defs, 'run-01') list1 = conditions1.get_type_def_names() self.assertEqual(len(list1), 5, "get_type_def_names list should have the right length") diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py index d82d14ea0..5f5ee41bf 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py @@ -2,6 +2,7 @@ import os import unittest import pandas as pd +from hed.models.df_util import get_assembled from 
hed.tools.remodeling.dispatcher import Dispatcher from hed.tools.remodeling.operations.summarize_hed_tags_op import SummarizeHedTagsOp, HedTagSummaryContext @@ -96,16 +97,17 @@ def test_quick3(self): } } my_json_str = json.dumps(my_json) - my_sidecar = Sidecar(StringIO(my_json_str), hed_schema=my_schema) + my_sidecar = Sidecar(StringIO(my_json_str)) data = [[0.5, 0, 'code1', 'Description/This is a test, Label/Temp, (Def/Blech1, Green)'], [0.6, 0, 'code2', 'Sensory-event, ((Description/Animal, Condition-variable/Blech))']] df = pd.DataFrame(data, columns=['onset', 'duration', 'code', 'HED']) - input_data = TabularInput(df, hed_schema=my_schema, sidecar=my_sidecar) + input_data = TabularInput(df, sidecar=my_sidecar) counts = HedTagCounts('myName', 2) summary_dict = {} - for objs in input_data.iter_dataframe(hed_ops=[my_schema], return_string_only=False, - expand_defs=True, remove_definitions=True): - counts.update_event_counts(objs['HED'], 'myName') + hed_strings = get_assembled(input_data, my_sidecar, my_schema, extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True) + for hed in hed_strings: + counts.update_event_counts(hed, 'myName') summary_dict['myName'] = counts def test_quick4(self): @@ -117,10 +119,13 @@ def test_quick4(self): data_path = os.path.realpath(os.path.join(path, 'sub-002_task-FacePerception_run-1_events.tsv')) json_path = os.path.realpath(os.path.join(path, 'task-FacePerception_events.json')) my_schema = load_schema_version('8.1.0') - sidecar = Sidecar(json_path, hed_schema=my_schema) - input_data = TabularInput(data_path, hed_schema=my_schema, sidecar=sidecar) + sidecar = Sidecar(json_path,) + input_data = TabularInput(data_path, sidecar=sidecar) counts = HedTagCounts('myName', 2) summary_dict = {} + hed_strings, definitions = get_assembled(input_data, sidecar, my_schema, + extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True) for objs in input_data.iter_dataframe(hed_ops=[my_schema], 
return_string_only=False, expand_defs=True, remove_definitions=True): x = objs['HED'] From 84cf4e01679d0be93129e2b7143ca60cd3fa973b Mon Sep 17 00:00:00 2001 From: IanCa Date: Fri, 17 Mar 2023 18:44:20 -0500 Subject: [PATCH 06/19] Add more unit tests. better nan and empty column handling --- hed/models/base_input.py | 58 ++++-- hed/validator/spreadsheet_validator.py | 1 + tests/models/test_base_file_input.py | 103 --------- tests/models/test_base_input.py | 276 +++++++++++++++++++++++++ tests/models/test_df_util.py | 45 +++- 5 files changed, 357 insertions(+), 126 deletions(-) delete mode 100644 tests/models/test_base_file_input.py create mode 100644 tests/models/test_base_input.py diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 869bc4ea6..f50ea5e4c 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -7,6 +7,7 @@ from hed.models.column_mapper import ColumnMapper from hed.errors.exceptions import HedFileError, HedExceptions from hed.errors.error_reporter import ErrorHandler +import pandas as pd class BaseInput: @@ -66,10 +67,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T elif not file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: - self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, - dtype=str, keep_default_na=True, na_values=None) - # Convert nan values to a known value - self._dataframe = self._dataframe.fillna("n/a") + self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, dtype=str) elif input_type in self.EXCEL_EXTENSION: self._loaded_workbook = openpyxl.load_workbook(file) loaded_worksheet = self.get_worksheet(self._worksheet_name) @@ -364,7 +362,7 @@ def assemble(self, mapper=None): """ if mapper is None: mapper = self._mapper - import pandas as pd + transformers, need_categorical = mapper.get_transformers() if not transformers: return None @@ 
-374,35 +372,53 @@ def assemble(self, mapper=None): all_columns = all_columns.transform(transformers) - possible_column_references = [f"{column_name}" for column_name in self.columns if - column_name.lower() != "hed"] + return self._insert_columns(all_columns, list(transformers.keys())) + + @staticmethod + def _find_column_refs(df): found_column_references = [] - for column_name in all_columns: - df = all_columns[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) - u_vals = pd.Series([j for i in df for j in i], dtype=str) + for column_name in df: + df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) + u_vals = pd.Series([j for i in df_temp for j in i], dtype=str) u_vals = u_vals.unique() for val in u_vals: if val not in found_column_references: found_column_references.append(val) + return found_column_references + + @staticmethod + def _insert_columns(df, known_columns=None): + if known_columns is None: + known_columns = list(df.columns) + possible_column_references = [f"{column_name}" for column_name in df.columns if + column_name.lower() != "hed"] + found_column_references = BaseInput._find_column_refs(df) + + invalid_replacements = [col for col in found_column_references if col not in possible_column_references] + if invalid_replacements: + # todo: This check may be moved to validation + raise ValueError(f"Bad column references found(columns do not exist): {invalid_replacements}") valid_replacements = [col for col in found_column_references if col in possible_column_references] - column_names = list(transformers.keys()) + # todo: break this into a sub function(probably) + column_names = known_columns for column_name in valid_replacements: column_names.remove(column_name) - saved_columns = all_columns[valid_replacements] + saved_columns = df[valid_replacements] for column_name in column_names: for replacing_name in valid_replacements: column_name_brackets = f"[{replacing_name}]" - all_columns[column_name] = 
pd.Series(x.replace(column_name_brackets, y) for x, y - in zip(all_columns[column_name], saved_columns[replacing_name])) - all_columns = all_columns[column_names] + df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y + in zip(df[column_name], saved_columns[replacing_name])) + df = df[column_names] - return all_columns + return df @staticmethod def combine_dataframe(dataframe): - """ Combines all columns in the given dataframe into a single hed string series. + """ Combines all columns in the given dataframe into a single HED string series, + skipping empty columns and columns with empty strings. Parameters: dataframe(Dataframe): The dataframe to combine @@ -410,8 +426,8 @@ def combine_dataframe(dataframe): Returns: Series: the assembled series """ - dataframe = dataframe.agg(', '.join, axis=1) + dataframe = dataframe.agg( + lambda x: ', '.join(filter(lambda e: pd.notna(e) and e != "", x)), axis=1 + ) - # Potentially better ways to handle removing n/a by never inserting them to begin with. 
- dataframe = dataframe.replace("(, n/a|n/a,)", "", regex=True) - return dataframe + return dataframe \ No newline at end of file diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index 136b5aa73..ba1f341ac 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -41,6 +41,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): # Check the structure of the input data, if it's a BaseInput if isinstance(data, BaseInput): issues += self._validate_column_structure(data, error_handler) + # todo ian: Add more checks here for column inserters data = data.dataframe_a # Check the rows of the input data diff --git a/tests/models/test_base_file_input.py b/tests/models/test_base_file_input.py deleted file mode 100644 index 8314072bd..000000000 --- a/tests/models/test_base_file_input.py +++ /dev/null @@ -1,103 +0,0 @@ -import unittest -import os -import shutil -from hed import Sidecar -from hed import BaseInput, TabularInput -from hed.models.column_mapper import ColumnMapper -from hed.models import DefinitionDict -from hed import schema - -# TODO: Add tests for base_file_input and include correct handling of 'n/a' - - -class Test(unittest.TestCase): - @classmethod - def setUpClass(cls): - # todo: clean up these unit tests/add more - base_data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/')) - cls.base_data_dir = base_data_dir - json_def_filename = os.path.join(base_data_dir, "sidecar_tests/both_types_events_with_defs.json") - # cls.json_def_filename = json_def_filename - json_def_sidecar = Sidecar(json_def_filename) - events_path = os.path.join(base_data_dir, '../data/validator_tests/bids_events_no_index.tsv') - cls.tabular_file = TabularInput(events_path, sidecar=json_def_sidecar) - - base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") - cls.base_output_folder = base_output - os.makedirs(base_output, 
exist_ok=True) - - bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/bids_tests/eeg_ds003645s_hed')) - schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.xml')) - cls.bids_root_path = bids_root_path - json_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - events_path = os.path.realpath(os.path.join(bids_root_path, - 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) - - cls.hed_schema = schema.load_schema(schema_path) - sidecar1 = Sidecar(json_path, name='face_sub1_json') - mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) - cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, - name="face_sub1_events", mapper=mapper1, allow_blank_names=False) - cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.base_output_folder) - - def test_gathered_defs(self): - # todo: add unit tests for definitions in tsv file - defs = DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) - expected_defs = { - 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', - 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', - 'jsonfiledef3': '(Item/JsonDef3/#)', - 'takesvaluedef': '(Age/#)', - 'valueclassdef': '(Acceleration/#)' - } - self.assertEqual(defs, expected_defs) - - # def test_missing_column_name_issue(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_column_name.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - # - # def test_expand_column_issues(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_category_key.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py new file mode 100644 index 000000000..392599f78 --- /dev/null +++ b/tests/models/test_base_input.py @@ -0,0 +1,276 @@ +import io +import unittest +import os +import shutil +from hed import Sidecar +from hed import BaseInput, TabularInput +from hed.models.column_mapper import ColumnMapper +from hed.models import 
DefinitionDict +from hed import schema +import pandas as pd +import numpy as np + + +class Test(unittest.TestCase): + @classmethod + def setUpClass(cls): + # todo: clean up these unit tests/add more + base_data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/')) + cls.base_data_dir = base_data_dir + json_def_filename = os.path.join(base_data_dir, "sidecar_tests/both_types_events_with_defs.json") + # cls.json_def_filename = json_def_filename + json_def_sidecar = Sidecar(json_def_filename) + events_path = os.path.join(base_data_dir, '../data/validator_tests/bids_events_no_index.tsv') + cls.tabular_file = TabularInput(events_path, sidecar=json_def_sidecar) + + base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") + cls.base_output_folder = base_output + os.makedirs(base_output, exist_ok=True) + + bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/bids_tests/eeg_ds003645s_hed')) + schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/schema_tests/HED8.0.0.xml')) + cls.bids_root_path = bids_root_path + json_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) + events_path = os.path.realpath(os.path.join(bids_root_path, + 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) + + cls.hed_schema = schema.load_schema(schema_path) + sidecar1 = Sidecar(json_path, name='face_sub1_json') + mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) + cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, + name="face_sub1_events", mapper=mapper1, allow_blank_names=False) + cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.base_output_folder) + + def test_gathered_defs(self): + # todo: add 
unit tests for definitions in tsv file + defs = DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) + expected_defs = { + 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', + 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', + 'jsonfiledef3': '(Item/JsonDef3/#)', + 'takesvaluedef': '(Age/#)', + 'valueclassdef': '(Acceleration/#)' + } + self.assertEqual(defs, expected_defs) + + # def test_missing_column_name_issue(self): + # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_schema.mediawiki') + # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_events_bad_column_name.tsv') + # + # hed_schema = schema.load_schema(schema_path) + # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # "../data/validator_tests/bids_events.json") + # validator = HedValidator(hed_schema=hed_schema) + # sidecar = Sidecar(json_path) + # issues = sidecar.validate_entries(validator) + # self.assertEqual(len(issues), 0) + # input_file = TabularInput(events_path, sidecars=sidecar) + # + # validation_issues = input_file.validate_sidecar(validator) + # self.assertEqual(len(validation_issues), 0) + # validation_issues = input_file.validate_file(validator, check_for_warnings=True) + # self.assertEqual(len(validation_issues), 1) + # + # def test_expand_column_issues(self): + # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_schema.mediawiki') + # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_events_bad_category_key.tsv') + # + # hed_schema = schema.load_schema(schema_path) + # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # "../data/validator_tests/bids_events.json") + # validator = HedValidator(hed_schema=hed_schema) + # sidecar = Sidecar(json_path) + # issues = 
sidecar.validate_entries(validator) + # self.assertEqual(len(issues), 0) + # input_file = TabularInput(events_path, sidecars=sidecar) + # + # validation_issues = input_file.validate_sidecar(validator) + # self.assertEqual(len(validation_issues), 0) + # validation_issues = input_file.validate_file(validator, check_for_warnings=True) + # self.assertEqual(len(validation_issues), 1) + + +class TestInsertColumns(unittest.TestCase): + + def test_insert_columns_simple(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, Action"], + "column2": ["Item"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Action"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_multiple_rows(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, Action", "Event, Action"], + "column2": ["Item", "Subject"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Action", "Event, Action"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + # def test_insert_columns_no_circular_reference(self): + # df = pd.DataFrame({ + # "column1": ["[column2], Event, Action"], + # "column2": ["[column1], Item"] + # }) + # with self.assertRaises(ValueError): + # result = BaseInput._insert_columns(df) + + def test_insert_columns_multiple_columns(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, [column3], Action"], + "column2": ["Item"], + "column3": ["Subject"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Subject, Action"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_invalid_column_name(self): + df = pd.DataFrame({ + "column1": ["[invalid_column], Event, Action"], + "column2": ["Item"] + }) + with self.assertRaises(ValueError): + result = BaseInput._insert_columns(df) + + def test_insert_columns_four_columns(self): + df = pd.DataFrame({ + 
"column1": ["[column2], Event, [column3], Action"], + "column2": ["Item"], + "column3": ["Subject"], + "column4": ["Data"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Subject, Action"], + "column4": ["Data"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + # def test_insert_columns_invalid_syntax(self): + # df = pd.DataFrame({ + # "column1": ["column2], Event, Action"], + # "column2": ["Item"] + # }) + # with self.assertRaises(ValueError): + # result = BaseInput._insert_columns(df) + + # def test_insert_columns_no_self_reference(self): + # df = pd.DataFrame({ + # "column1": ["[column1], Event, Action"], + # "column2": ["Item"] + # }) + # with self.assertRaises(ValueError): + # result = BaseInput._insert_columns(df) + + +class TestCombineDataframe(unittest.TestCase): + def test_combine_dataframe_with_strings(self): + data = { + 'A': ['apple', 'banana', 'cherry'], + 'B': ['dog', 'elephant', 'fox'], + 'C': ['guitar', 'harmonica', 'piano'] + } + df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) + expected = pd.Series(['apple, dog, guitar', 'banana, elephant, harmonica', 'cherry, fox, piano']) + self.assertTrue(result.equals(expected)) + + def test_combine_dataframe_with_nan_values(self): + data = { + 'A': ['apple', np.nan, 'cherry'], + 'B': [np.nan, 'elephant', 'fox'], + 'C': ['guitar', 'harmonica', np.nan] + } + df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) + expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) + self.assertTrue(result.equals(expected)) + + def test_combine_dataframe_with_empty_values(self): + data = { + 'A': ['apple', '', 'cherry'], + 'B': ['', 'elephant', 'fox'], + 'C': ['guitar', 'harmonica', ''] + } + df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) + expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) + self.assertTrue(result.equals(expected)) + + def 
test_combine_dataframe_with_mixed_values(self): + data = { + 'A': ['apple', np.nan, 'cherry', 'n/a', ''], + 'B': [np.nan, 'elephant', 'fox', 'n/a', ''], + 'C': ['guitar', 'harmonica', np.nan, 'n/a', ''] + } + df = pd.DataFrame(data) + csv_buffer = io.StringIO() + df.to_csv(csv_buffer, header=False, index=False) + csv_buffer.seek(0) + + # Use the same loading function we normally use to verify n/a translates right. + loaded_df = pd.read_csv(csv_buffer, header=None) + result = BaseInput.combine_dataframe(loaded_df) + expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox', '', '']) + self.assertTrue(result.equals(expected)) + + +class TestColumnRefs(unittest.TestCase): + def test_simple_column_refs(self): + data1 = { + 'A': ['[col1], [col2]', 'tag1, tag2'], + 'B': ['tag3, tag4', '[col3]'], + } + df1 = pd.DataFrame(data1) + result1 = BaseInput._find_column_refs(df1) + expected1 = ['col1', 'col2', 'col3'] + self.assertEqual(result1, expected1) + + def test_mixed_cases_and_patterns(self): + data2 = { + 'A': ['[Col1], [col2]', 'tag1, [Col3]', 'tag3, [COL4]', '[col5], [col6]'], + } + df2 = pd.DataFrame(data2) + result2 = BaseInput._find_column_refs(df2) + expected2 = ['Col1', 'col2', 'Col3', 'COL4', 'col5', 'col6'] + self.assertEqual(result2, expected2) + + def test_no_column_references(self): + data3 = { + 'A': ['tag1, tag2', 'tag3, tag4'], + 'B': ['tag5, tag6', 'tag7, tag8'], + } + df3 = pd.DataFrame(data3) + result3 = BaseInput._find_column_refs(df3) + expected3 = [] + self.assertEqual(result3, expected3) + + def test_incomplete_square_brackets(self): + data4 = { + 'A': ['[col1, [col2]', 'tag1, [Col3'], + 'B': ['tag3, [COL4', '[col5, col6]'], + } + df4 = pd.DataFrame(data4) + result4 = BaseInput._find_column_refs(df4) + expected4 = ['col2'] + self.assertEqual(result4, expected4) \ No newline at end of file diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py index bc9c907b7..e10e2a4a3 100644 --- a/tests/models/test_df_util.py +++ 
b/tests/models/test_df_util.py @@ -3,7 +3,7 @@ from hed import load_schema_version -from hed.models.df_util import shrink_defs, expand_defs +from hed.models.df_util import shrink_defs, expand_defs, convert_to_form from hed import DefinitionDict @@ -111,4 +111,45 @@ def test_expand_defs_series_placeholder(self): series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) expected_series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) result = expand_defs(series, self.schema, self.def_dict, None) - pd.testing.assert_series_equal(result, expected_series) \ No newline at end of file + pd.testing.assert_series_equal(result, expected_series) + + +class TestConvertToForm(unittest.TestCase): + def setUp(self): + self.schema = load_schema_version() + + def test_convert_to_form_short_tags(self): + df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) + expected_df = pd.DataFrame({"column1": ["Azure,See"]}) + result = convert_to_form(df, self.schema, "short_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_convert_to_form_long_tags(self): + df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Action/Perceive/See"]}) + expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) + result = convert_to_form(df, self.schema, "long_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_convert_to_form_series_short_tags(self): + series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) + expected_series = pd.Series(["Azure,See"]) + result = convert_to_form(series, self.schema, "short_tag") + pd.testing.assert_series_equal(result, expected_series) + + def 
test_convert_to_form_series_long_tags(self): + series = pd.Series(["CSS-color/White-color/Azure,Action/Perceive/See"]) + expected_series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) + result = convert_to_form(series, self.schema, "long_tag") + pd.testing.assert_series_equal(result, expected_series) + + def test_convert_to_form_multiple_tags_short(self): + df = pd.DataFrame({"column1": ["Visual-attribute/Color/CSS-color/White-color/Azure,Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) + expected_df = pd.DataFrame({"column1": ["Azure,Nose,4.5 m-per-s^2"]}) + result = convert_to_form(df, self.schema, "short_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_convert_to_form_multiple_tags_long(self): + df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Anatomical-item/Body-part/Head/Face/Nose,Rate-of-change/Acceleration/4.5 m-per-s^2"]}) + expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Item/Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Property/Data-property/Data-value/Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) + result = convert_to_form(df, self.schema, "long_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) \ No newline at end of file From c8db8ba8f511a98078a203f17b7a622c7e3c7170 Mon Sep 17 00:00:00 2001 From: VisLab <1189050+VisLab@users.noreply.github.com> Date: Sun, 19 Mar 2023 07:36:58 -0500 Subject: [PATCH 07/19] Revert "Add more unit tests. 
better nan and empty column handling" --- hed/models/base_input.py | 58 ++---- hed/validator/spreadsheet_validator.py | 1 - tests/models/test_base_file_input.py | 103 +++++++++ tests/models/test_base_input.py | 276 ------------------------- tests/models/test_df_util.py | 45 +--- 5 files changed, 126 insertions(+), 357 deletions(-) create mode 100644 tests/models/test_base_file_input.py delete mode 100644 tests/models/test_base_input.py diff --git a/hed/models/base_input.py b/hed/models/base_input.py index f50ea5e4c..869bc4ea6 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -7,7 +7,6 @@ from hed.models.column_mapper import ColumnMapper from hed.errors.exceptions import HedFileError, HedExceptions from hed.errors.error_reporter import ErrorHandler -import pandas as pd class BaseInput: @@ -67,7 +66,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T elif not file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: - self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, dtype=str) + self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, + dtype=str, keep_default_na=True, na_values=None) + # Convert nan values to a known value + self._dataframe = self._dataframe.fillna("n/a") elif input_type in self.EXCEL_EXTENSION: self._loaded_workbook = openpyxl.load_workbook(file) loaded_worksheet = self.get_worksheet(self._worksheet_name) @@ -362,7 +364,7 @@ def assemble(self, mapper=None): """ if mapper is None: mapper = self._mapper - + import pandas as pd transformers, need_categorical = mapper.get_transformers() if not transformers: return None @@ -372,53 +374,35 @@ def assemble(self, mapper=None): all_columns = all_columns.transform(transformers) - return self._insert_columns(all_columns, list(transformers.keys())) - - @staticmethod - def _find_column_refs(df): + possible_column_references = 
[f"{column_name}" for column_name in self.columns if + column_name.lower() != "hed"] found_column_references = [] - for column_name in df: - df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) - u_vals = pd.Series([j for i in df_temp for j in i], dtype=str) + for column_name in all_columns: + df = all_columns[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) + u_vals = pd.Series([j for i in df for j in i], dtype=str) u_vals = u_vals.unique() for val in u_vals: if val not in found_column_references: found_column_references.append(val) - return found_column_references - - @staticmethod - def _insert_columns(df, known_columns=None): - if known_columns is None: - known_columns = list(df.columns) - possible_column_references = [f"{column_name}" for column_name in df.columns if - column_name.lower() != "hed"] - found_column_references = BaseInput._find_column_refs(df) - - invalid_replacements = [col for col in found_column_references if col not in possible_column_references] - if invalid_replacements: - # todo: This check may be moved to validation - raise ValueError(f"Bad column references found(columns do not exist): {invalid_replacements}") valid_replacements = [col for col in found_column_references if col in possible_column_references] - # todo: break this into a sub function(probably) - column_names = known_columns + column_names = list(transformers.keys()) for column_name in valid_replacements: column_names.remove(column_name) - saved_columns = df[valid_replacements] + saved_columns = all_columns[valid_replacements] for column_name in column_names: for replacing_name in valid_replacements: column_name_brackets = f"[{replacing_name}]" - df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y - in zip(df[column_name], saved_columns[replacing_name])) - df = df[column_names] + all_columns[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y + in zip(all_columns[column_name], 
saved_columns[replacing_name])) + all_columns = all_columns[column_names] - return df + return all_columns @staticmethod def combine_dataframe(dataframe): - """ Combines all columns in the given dataframe into a single HED string series, - skipping empty columns and columns with empty strings. + """ Combines all columns in the given dataframe into a single hed string series. Parameters: dataframe(Dataframe): The dataframe to combine @@ -426,8 +410,8 @@ def combine_dataframe(dataframe): Returns: Series: the assembled series """ - dataframe = dataframe.agg( - lambda x: ', '.join(filter(lambda e: pd.notna(e) and e != "", x)), axis=1 - ) + dataframe = dataframe.agg(', '.join, axis=1) - return dataframe \ No newline at end of file + # Potentially better ways to handle removing n/a by never inserting them to begin with. + dataframe = dataframe.replace("(, n/a|n/a,)", "", regex=True) + return dataframe diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index ba1f341ac..136b5aa73 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -41,7 +41,6 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): # Check the structure of the input data, if it's a BaseInput if isinstance(data, BaseInput): issues += self._validate_column_structure(data, error_handler) - # todo ian: Add more checks here for column inserters data = data.dataframe_a # Check the rows of the input data diff --git a/tests/models/test_base_file_input.py b/tests/models/test_base_file_input.py new file mode 100644 index 000000000..8314072bd --- /dev/null +++ b/tests/models/test_base_file_input.py @@ -0,0 +1,103 @@ +import unittest +import os +import shutil +from hed import Sidecar +from hed import BaseInput, TabularInput +from hed.models.column_mapper import ColumnMapper +from hed.models import DefinitionDict +from hed import schema + +# TODO: Add tests for base_file_input and include correct handling of 
'n/a' + + +class Test(unittest.TestCase): + @classmethod + def setUpClass(cls): + # todo: clean up these unit tests/add more + base_data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/')) + cls.base_data_dir = base_data_dir + json_def_filename = os.path.join(base_data_dir, "sidecar_tests/both_types_events_with_defs.json") + # cls.json_def_filename = json_def_filename + json_def_sidecar = Sidecar(json_def_filename) + events_path = os.path.join(base_data_dir, '../data/validator_tests/bids_events_no_index.tsv') + cls.tabular_file = TabularInput(events_path, sidecar=json_def_sidecar) + + base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") + cls.base_output_folder = base_output + os.makedirs(base_output, exist_ok=True) + + bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/bids_tests/eeg_ds003645s_hed')) + schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/schema_tests/HED8.0.0.xml')) + cls.bids_root_path = bids_root_path + json_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) + events_path = os.path.realpath(os.path.join(bids_root_path, + 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) + + cls.hed_schema = schema.load_schema(schema_path) + sidecar1 = Sidecar(json_path, name='face_sub1_json') + mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) + cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, + name="face_sub1_events", mapper=mapper1, allow_blank_names=False) + cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.base_output_folder) + + def test_gathered_defs(self): + # todo: add unit tests for definitions in tsv file + defs = 
DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) + expected_defs = { + 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', + 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', + 'jsonfiledef3': '(Item/JsonDef3/#)', + 'takesvaluedef': '(Age/#)', + 'valueclassdef': '(Acceleration/#)' + } + self.assertEqual(defs, expected_defs) + + # def test_missing_column_name_issue(self): + # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_schema.mediawiki') + # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_events_bad_column_name.tsv') + # + # hed_schema = schema.load_schema(schema_path) + # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # "../data/validator_tests/bids_events.json") + # validator = HedValidator(hed_schema=hed_schema) + # sidecar = Sidecar(json_path) + # issues = sidecar.validate_entries(validator) + # self.assertEqual(len(issues), 0) + # input_file = TabularInput(events_path, sidecars=sidecar) + # + # validation_issues = input_file.validate_sidecar(validator) + # self.assertEqual(len(validation_issues), 0) + # validation_issues = input_file.validate_file(validator, check_for_warnings=True) + # self.assertEqual(len(validation_issues), 1) + # + # def test_expand_column_issues(self): + # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_schema.mediawiki') + # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_events_bad_category_key.tsv') + # + # hed_schema = schema.load_schema(schema_path) + # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # "../data/validator_tests/bids_events.json") + # validator = HedValidator(hed_schema=hed_schema) + # sidecar = Sidecar(json_path) + # issues = sidecar.validate_entries(validator) + # 
self.assertEqual(len(issues), 0) + # input_file = TabularInput(events_path, sidecars=sidecar) + # + # validation_issues = input_file.validate_sidecar(validator) + # self.assertEqual(len(validation_issues), 0) + # validation_issues = input_file.validate_file(validator, check_for_warnings=True) + # self.assertEqual(len(validation_issues), 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py deleted file mode 100644 index 392599f78..000000000 --- a/tests/models/test_base_input.py +++ /dev/null @@ -1,276 +0,0 @@ -import io -import unittest -import os -import shutil -from hed import Sidecar -from hed import BaseInput, TabularInput -from hed.models.column_mapper import ColumnMapper -from hed.models import DefinitionDict -from hed import schema -import pandas as pd -import numpy as np - - -class Test(unittest.TestCase): - @classmethod - def setUpClass(cls): - # todo: clean up these unit tests/add more - base_data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/')) - cls.base_data_dir = base_data_dir - json_def_filename = os.path.join(base_data_dir, "sidecar_tests/both_types_events_with_defs.json") - # cls.json_def_filename = json_def_filename - json_def_sidecar = Sidecar(json_def_filename) - events_path = os.path.join(base_data_dir, '../data/validator_tests/bids_events_no_index.tsv') - cls.tabular_file = TabularInput(events_path, sidecar=json_def_sidecar) - - base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") - cls.base_output_folder = base_output - os.makedirs(base_output, exist_ok=True) - - bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/bids_tests/eeg_ds003645s_hed')) - schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.xml')) - cls.bids_root_path = bids_root_path - json_path = 
os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - events_path = os.path.realpath(os.path.join(bids_root_path, - 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) - - cls.hed_schema = schema.load_schema(schema_path) - sidecar1 = Sidecar(json_path, name='face_sub1_json') - mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) - cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, - name="face_sub1_events", mapper=mapper1, allow_blank_names=False) - cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.base_output_folder) - - def test_gathered_defs(self): - # todo: add unit tests for definitions in tsv file - defs = DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) - expected_defs = { - 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', - 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', - 'jsonfiledef3': '(Item/JsonDef3/#)', - 'takesvaluedef': '(Age/#)', - 'valueclassdef': '(Acceleration/#)' - } - self.assertEqual(defs, expected_defs) - - # def test_missing_column_name_issue(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_column_name.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = 
input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - # - # def test_expand_column_issues(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_category_key.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - - -class TestInsertColumns(unittest.TestCase): - - def test_insert_columns_simple(self): - df = pd.DataFrame({ - "column1": ["[column2], Event, Action"], - "column2": ["Item"] - }) - expected_df = pd.DataFrame({ - "column1": ["Item, Event, Action"] - }) - result = BaseInput._insert_columns(df) - pd.testing.assert_frame_equal(result, expected_df) - - def test_insert_columns_multiple_rows(self): - df = pd.DataFrame({ - "column1": ["[column2], Event, Action", "Event, Action"], - "column2": ["Item", "Subject"] - }) - expected_df = pd.DataFrame({ - "column1": ["Item, Event, Action", "Event, Action"] - }) - result = BaseInput._insert_columns(df) - pd.testing.assert_frame_equal(result, expected_df) - - # def test_insert_columns_no_circular_reference(self): - # df = pd.DataFrame({ - # "column1": ["[column2], 
Event, Action"], - # "column2": ["[column1], Item"] - # }) - # with self.assertRaises(ValueError): - # result = BaseInput._insert_columns(df) - - def test_insert_columns_multiple_columns(self): - df = pd.DataFrame({ - "column1": ["[column2], Event, [column3], Action"], - "column2": ["Item"], - "column3": ["Subject"] - }) - expected_df = pd.DataFrame({ - "column1": ["Item, Event, Subject, Action"] - }) - result = BaseInput._insert_columns(df) - pd.testing.assert_frame_equal(result, expected_df) - - def test_insert_columns_invalid_column_name(self): - df = pd.DataFrame({ - "column1": ["[invalid_column], Event, Action"], - "column2": ["Item"] - }) - with self.assertRaises(ValueError): - result = BaseInput._insert_columns(df) - - def test_insert_columns_four_columns(self): - df = pd.DataFrame({ - "column1": ["[column2], Event, [column3], Action"], - "column2": ["Item"], - "column3": ["Subject"], - "column4": ["Data"] - }) - expected_df = pd.DataFrame({ - "column1": ["Item, Event, Subject, Action"], - "column4": ["Data"] - }) - result = BaseInput._insert_columns(df) - pd.testing.assert_frame_equal(result, expected_df) - - # def test_insert_columns_invalid_syntax(self): - # df = pd.DataFrame({ - # "column1": ["column2], Event, Action"], - # "column2": ["Item"] - # }) - # with self.assertRaises(ValueError): - # result = BaseInput._insert_columns(df) - - # def test_insert_columns_no_self_reference(self): - # df = pd.DataFrame({ - # "column1": ["[column1], Event, Action"], - # "column2": ["Item"] - # }) - # with self.assertRaises(ValueError): - # result = BaseInput._insert_columns(df) - - -class TestCombineDataframe(unittest.TestCase): - def test_combine_dataframe_with_strings(self): - data = { - 'A': ['apple', 'banana', 'cherry'], - 'B': ['dog', 'elephant', 'fox'], - 'C': ['guitar', 'harmonica', 'piano'] - } - df = pd.DataFrame(data) - result = BaseInput.combine_dataframe(df) - expected = pd.Series(['apple, dog, guitar', 'banana, elephant, harmonica', 'cherry, fox, 
piano']) - self.assertTrue(result.equals(expected)) - - def test_combine_dataframe_with_nan_values(self): - data = { - 'A': ['apple', np.nan, 'cherry'], - 'B': [np.nan, 'elephant', 'fox'], - 'C': ['guitar', 'harmonica', np.nan] - } - df = pd.DataFrame(data) - result = BaseInput.combine_dataframe(df) - expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) - self.assertTrue(result.equals(expected)) - - def test_combine_dataframe_with_empty_values(self): - data = { - 'A': ['apple', '', 'cherry'], - 'B': ['', 'elephant', 'fox'], - 'C': ['guitar', 'harmonica', ''] - } - df = pd.DataFrame(data) - result = BaseInput.combine_dataframe(df) - expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) - self.assertTrue(result.equals(expected)) - - def test_combine_dataframe_with_mixed_values(self): - data = { - 'A': ['apple', np.nan, 'cherry', 'n/a', ''], - 'B': [np.nan, 'elephant', 'fox', 'n/a', ''], - 'C': ['guitar', 'harmonica', np.nan, 'n/a', ''] - } - df = pd.DataFrame(data) - csv_buffer = io.StringIO() - df.to_csv(csv_buffer, header=False, index=False) - csv_buffer.seek(0) - - # Use the same loading function we normally use to verify n/a translates right. 
- loaded_df = pd.read_csv(csv_buffer, header=None) - result = BaseInput.combine_dataframe(loaded_df) - expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox', '', '']) - self.assertTrue(result.equals(expected)) - - -class TestColumnRefs(unittest.TestCase): - def test_simple_column_refs(self): - data1 = { - 'A': ['[col1], [col2]', 'tag1, tag2'], - 'B': ['tag3, tag4', '[col3]'], - } - df1 = pd.DataFrame(data1) - result1 = BaseInput._find_column_refs(df1) - expected1 = ['col1', 'col2', 'col3'] - self.assertEqual(result1, expected1) - - def test_mixed_cases_and_patterns(self): - data2 = { - 'A': ['[Col1], [col2]', 'tag1, [Col3]', 'tag3, [COL4]', '[col5], [col6]'], - } - df2 = pd.DataFrame(data2) - result2 = BaseInput._find_column_refs(df2) - expected2 = ['Col1', 'col2', 'Col3', 'COL4', 'col5', 'col6'] - self.assertEqual(result2, expected2) - - def test_no_column_references(self): - data3 = { - 'A': ['tag1, tag2', 'tag3, tag4'], - 'B': ['tag5, tag6', 'tag7, tag8'], - } - df3 = pd.DataFrame(data3) - result3 = BaseInput._find_column_refs(df3) - expected3 = [] - self.assertEqual(result3, expected3) - - def test_incomplete_square_brackets(self): - data4 = { - 'A': ['[col1, [col2]', 'tag1, [Col3'], - 'B': ['tag3, [COL4', '[col5, col6]'], - } - df4 = pd.DataFrame(data4) - result4 = BaseInput._find_column_refs(df4) - expected4 = ['col2'] - self.assertEqual(result4, expected4) \ No newline at end of file diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py index e10e2a4a3..bc9c907b7 100644 --- a/tests/models/test_df_util.py +++ b/tests/models/test_df_util.py @@ -3,7 +3,7 @@ from hed import load_schema_version -from hed.models.df_util import shrink_defs, expand_defs, convert_to_form +from hed.models.df_util import shrink_defs, expand_defs from hed import DefinitionDict @@ -111,45 +111,4 @@ def test_expand_defs_series_placeholder(self): series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) expected_series = 
pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) result = expand_defs(series, self.schema, self.def_dict, None) - pd.testing.assert_series_equal(result, expected_series) - - -class TestConvertToForm(unittest.TestCase): - def setUp(self): - self.schema = load_schema_version() - - def test_convert_to_form_short_tags(self): - df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) - expected_df = pd.DataFrame({"column1": ["Azure,See"]}) - result = convert_to_form(df, self.schema, "short_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) - - def test_convert_to_form_long_tags(self): - df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Action/Perceive/See"]}) - expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) - result = convert_to_form(df, self.schema, "long_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) - - def test_convert_to_form_series_short_tags(self): - series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) - expected_series = pd.Series(["Azure,See"]) - result = convert_to_form(series, self.schema, "short_tag") - pd.testing.assert_series_equal(result, expected_series) - - def test_convert_to_form_series_long_tags(self): - series = pd.Series(["CSS-color/White-color/Azure,Action/Perceive/See"]) - expected_series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) - result = convert_to_form(series, self.schema, "long_tag") - pd.testing.assert_series_equal(result, expected_series) - - def test_convert_to_form_multiple_tags_short(self): - df = pd.DataFrame({"column1": 
["Visual-attribute/Color/CSS-color/White-color/Azure,Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) - expected_df = pd.DataFrame({"column1": ["Azure,Nose,4.5 m-per-s^2"]}) - result = convert_to_form(df, self.schema, "short_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) - - def test_convert_to_form_multiple_tags_long(self): - df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Anatomical-item/Body-part/Head/Face/Nose,Rate-of-change/Acceleration/4.5 m-per-s^2"]}) - expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Item/Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Property/Data-property/Data-value/Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) - result = convert_to_form(df, self.schema, "long_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) \ No newline at end of file + pd.testing.assert_series_equal(result, expected_series) \ No newline at end of file From ffced96c2ff34db483ad96e73d5cfa537ca4a284 Mon Sep 17 00:00:00 2001 From: IanCa Date: Fri, 17 Mar 2023 18:44:20 -0500 Subject: [PATCH 08/19] Add more unit tests. 
better nan and empty column handling --- hed/models/base_input.py | 58 ++++-- hed/validator/spreadsheet_validator.py | 1 + tests/models/test_base_file_input.py | 103 --------- tests/models/test_base_input.py | 276 +++++++++++++++++++++++++ tests/models/test_df_util.py | 45 +++- 5 files changed, 357 insertions(+), 126 deletions(-) delete mode 100644 tests/models/test_base_file_input.py create mode 100644 tests/models/test_base_input.py diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 869bc4ea6..f50ea5e4c 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -7,6 +7,7 @@ from hed.models.column_mapper import ColumnMapper from hed.errors.exceptions import HedFileError, HedExceptions from hed.errors.error_reporter import ErrorHandler +import pandas as pd class BaseInput: @@ -66,10 +67,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T elif not file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: - self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, - dtype=str, keep_default_na=True, na_values=None) - # Convert nan values to a known value - self._dataframe = self._dataframe.fillna("n/a") + self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, dtype=str) elif input_type in self.EXCEL_EXTENSION: self._loaded_workbook = openpyxl.load_workbook(file) loaded_worksheet = self.get_worksheet(self._worksheet_name) @@ -364,7 +362,7 @@ def assemble(self, mapper=None): """ if mapper is None: mapper = self._mapper - import pandas as pd + transformers, need_categorical = mapper.get_transformers() if not transformers: return None @@ -374,35 +372,53 @@ def assemble(self, mapper=None): all_columns = all_columns.transform(transformers) - possible_column_references = [f"{column_name}" for column_name in self.columns if - column_name.lower() != "hed"] + return 
self._insert_columns(all_columns, list(transformers.keys())) + + @staticmethod + def _find_column_refs(df): found_column_references = [] - for column_name in all_columns: - df = all_columns[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) - u_vals = pd.Series([j for i in df for j in i], dtype=str) + for column_name in df: + df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) + u_vals = pd.Series([j for i in df_temp for j in i], dtype=str) u_vals = u_vals.unique() for val in u_vals: if val not in found_column_references: found_column_references.append(val) + return found_column_references + + @staticmethod + def _insert_columns(df, known_columns=None): + if known_columns is None: + known_columns = list(df.columns) + possible_column_references = [f"{column_name}" for column_name in df.columns if + column_name.lower() != "hed"] + found_column_references = BaseInput._find_column_refs(df) + + invalid_replacements = [col for col in found_column_references if col not in possible_column_references] + if invalid_replacements: + # todo: This check may be moved to validation + raise ValueError(f"Bad column references found(columns do not exist): {invalid_replacements}") valid_replacements = [col for col in found_column_references if col in possible_column_references] - column_names = list(transformers.keys()) + # todo: break this into a sub function(probably) + column_names = known_columns for column_name in valid_replacements: column_names.remove(column_name) - saved_columns = all_columns[valid_replacements] + saved_columns = df[valid_replacements] for column_name in column_names: for replacing_name in valid_replacements: column_name_brackets = f"[{replacing_name}]" - all_columns[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y - in zip(all_columns[column_name], saved_columns[replacing_name])) - all_columns = all_columns[column_names] + df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y + in 
zip(df[column_name], saved_columns[replacing_name])) + df = df[column_names] - return all_columns + return df @staticmethod def combine_dataframe(dataframe): - """ Combines all columns in the given dataframe into a single hed string series. + """ Combines all columns in the given dataframe into a single HED string series, + skipping empty columns and columns with empty strings. Parameters: dataframe(Dataframe): The dataframe to combine @@ -410,8 +426,8 @@ def combine_dataframe(dataframe): Returns: Series: the assembled series """ - dataframe = dataframe.agg(', '.join, axis=1) + dataframe = dataframe.agg( + lambda x: ', '.join(filter(lambda e: pd.notna(e) and e != "", x)), axis=1 + ) - # Potentially better ways to handle removing n/a by never inserting them to begin with. - dataframe = dataframe.replace("(, n/a|n/a,)", "", regex=True) - return dataframe + return dataframe \ No newline at end of file diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index 136b5aa73..ba1f341ac 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -41,6 +41,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): # Check the structure of the input data, if it's a BaseInput if isinstance(data, BaseInput): issues += self._validate_column_structure(data, error_handler) + # todo ian: Add more checks here for column inserters data = data.dataframe_a # Check the rows of the input data diff --git a/tests/models/test_base_file_input.py b/tests/models/test_base_file_input.py deleted file mode 100644 index 8314072bd..000000000 --- a/tests/models/test_base_file_input.py +++ /dev/null @@ -1,103 +0,0 @@ -import unittest -import os -import shutil -from hed import Sidecar -from hed import BaseInput, TabularInput -from hed.models.column_mapper import ColumnMapper -from hed.models import DefinitionDict -from hed import schema - -# TODO: Add tests for base_file_input and include correct handling 
of 'n/a' - - -class Test(unittest.TestCase): - @classmethod - def setUpClass(cls): - # todo: clean up these unit tests/add more - base_data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/')) - cls.base_data_dir = base_data_dir - json_def_filename = os.path.join(base_data_dir, "sidecar_tests/both_types_events_with_defs.json") - # cls.json_def_filename = json_def_filename - json_def_sidecar = Sidecar(json_def_filename) - events_path = os.path.join(base_data_dir, '../data/validator_tests/bids_events_no_index.tsv') - cls.tabular_file = TabularInput(events_path, sidecar=json_def_sidecar) - - base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") - cls.base_output_folder = base_output - os.makedirs(base_output, exist_ok=True) - - bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/bids_tests/eeg_ds003645s_hed')) - schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/schema_tests/HED8.0.0.xml')) - cls.bids_root_path = bids_root_path - json_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) - events_path = os.path.realpath(os.path.join(bids_root_path, - 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) - - cls.hed_schema = schema.load_schema(schema_path) - sidecar1 = Sidecar(json_path, name='face_sub1_json') - mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) - cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, - name="face_sub1_events", mapper=mapper1, allow_blank_names=False) - cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.base_output_folder) - - def test_gathered_defs(self): - # todo: add unit tests for definitions in tsv file - defs = 
DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) - expected_defs = { - 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', - 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', - 'jsonfiledef3': '(Item/JsonDef3/#)', - 'takesvaluedef': '(Age/#)', - 'valueclassdef': '(Acceleration/#)' - } - self.assertEqual(defs, expected_defs) - - # def test_missing_column_name_issue(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_column_name.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - # - # def test_expand_column_issues(self): - # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_schema.mediawiki') - # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # '../data/validator_tests/bids_events_bad_category_key.tsv') - # - # hed_schema = schema.load_schema(schema_path) - # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - # "../data/validator_tests/bids_events.json") - # validator = HedValidator(hed_schema=hed_schema) - # sidecar = Sidecar(json_path) - # issues = sidecar.validate_entries(validator) - # 
self.assertEqual(len(issues), 0) - # input_file = TabularInput(events_path, sidecars=sidecar) - # - # validation_issues = input_file.validate_sidecar(validator) - # self.assertEqual(len(validation_issues), 0) - # validation_issues = input_file.validate_file(validator, check_for_warnings=True) - # self.assertEqual(len(validation_issues), 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py new file mode 100644 index 000000000..392599f78 --- /dev/null +++ b/tests/models/test_base_input.py @@ -0,0 +1,276 @@ +import io +import unittest +import os +import shutil +from hed import Sidecar +from hed import BaseInput, TabularInput +from hed.models.column_mapper import ColumnMapper +from hed.models import DefinitionDict +from hed import schema +import pandas as pd +import numpy as np + + +class Test(unittest.TestCase): + @classmethod + def setUpClass(cls): + # todo: clean up these unit tests/add more + base_data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/')) + cls.base_data_dir = base_data_dir + json_def_filename = os.path.join(base_data_dir, "sidecar_tests/both_types_events_with_defs.json") + # cls.json_def_filename = json_def_filename + json_def_sidecar = Sidecar(json_def_filename) + events_path = os.path.join(base_data_dir, '../data/validator_tests/bids_events_no_index.tsv') + cls.tabular_file = TabularInput(events_path, sidecar=json_def_sidecar) + + base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") + cls.base_output_folder = base_output + os.makedirs(base_output, exist_ok=True) + + bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/bids_tests/eeg_ds003645s_hed')) + schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/schema_tests/HED8.0.0.xml')) + cls.bids_root_path = bids_root_path + json_path = 
os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) + events_path = os.path.realpath(os.path.join(bids_root_path, + 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) + + cls.hed_schema = schema.load_schema(schema_path) + sidecar1 = Sidecar(json_path, name='face_sub1_json') + mapper1 = ColumnMapper(sidecar=sidecar1, optional_tag_columns=['HED'], warn_on_missing_column=False) + cls.input_data1 = BaseInput(events_path, file_type='.tsv', has_column_names=True, + name="face_sub1_events", mapper=mapper1, allow_blank_names=False) + cls.input_data2 = BaseInput(events_path, file_type='.tsv', has_column_names=True, name="face_sub2_events") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.base_output_folder) + + def test_gathered_defs(self): + # todo: add unit tests for definitions in tsv file + defs = DefinitionDict.get_as_strings(self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)) + expected_defs = { + 'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)', + 'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)', + 'jsonfiledef3': '(Item/JsonDef3/#)', + 'takesvaluedef': '(Age/#)', + 'valueclassdef': '(Acceleration/#)' + } + self.assertEqual(defs, expected_defs) + + # def test_missing_column_name_issue(self): + # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_schema.mediawiki') + # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_events_bad_column_name.tsv') + # + # hed_schema = schema.load_schema(schema_path) + # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # "../data/validator_tests/bids_events.json") + # validator = HedValidator(hed_schema=hed_schema) + # sidecar = Sidecar(json_path) + # issues = sidecar.validate_entries(validator) + # self.assertEqual(len(issues), 0) + # input_file = TabularInput(events_path, sidecars=sidecar) + # + # validation_issues = 
input_file.validate_sidecar(validator) + # self.assertEqual(len(validation_issues), 0) + # validation_issues = input_file.validate_file(validator, check_for_warnings=True) + # self.assertEqual(len(validation_issues), 1) + # + # def test_expand_column_issues(self): + # schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_schema.mediawiki') + # events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # '../data/validator_tests/bids_events_bad_category_key.tsv') + # + # hed_schema = schema.load_schema(schema_path) + # json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + # "../data/validator_tests/bids_events.json") + # validator = HedValidator(hed_schema=hed_schema) + # sidecar = Sidecar(json_path) + # issues = sidecar.validate_entries(validator) + # self.assertEqual(len(issues), 0) + # input_file = TabularInput(events_path, sidecars=sidecar) + # + # validation_issues = input_file.validate_sidecar(validator) + # self.assertEqual(len(validation_issues), 0) + # validation_issues = input_file.validate_file(validator, check_for_warnings=True) + # self.assertEqual(len(validation_issues), 1) + + +class TestInsertColumns(unittest.TestCase): + + def test_insert_columns_simple(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, Action"], + "column2": ["Item"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Action"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_multiple_rows(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, Action", "Event, Action"], + "column2": ["Item", "Subject"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Action", "Event, Action"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + # def test_insert_columns_no_circular_reference(self): + # df = pd.DataFrame({ + # "column1": ["[column2], 
Event, Action"], + # "column2": ["[column1], Item"] + # }) + # with self.assertRaises(ValueError): + # result = BaseInput._insert_columns(df) + + def test_insert_columns_multiple_columns(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, [column3], Action"], + "column2": ["Item"], + "column3": ["Subject"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Subject, Action"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + def test_insert_columns_invalid_column_name(self): + df = pd.DataFrame({ + "column1": ["[invalid_column], Event, Action"], + "column2": ["Item"] + }) + with self.assertRaises(ValueError): + result = BaseInput._insert_columns(df) + + def test_insert_columns_four_columns(self): + df = pd.DataFrame({ + "column1": ["[column2], Event, [column3], Action"], + "column2": ["Item"], + "column3": ["Subject"], + "column4": ["Data"] + }) + expected_df = pd.DataFrame({ + "column1": ["Item, Event, Subject, Action"], + "column4": ["Data"] + }) + result = BaseInput._insert_columns(df) + pd.testing.assert_frame_equal(result, expected_df) + + # def test_insert_columns_invalid_syntax(self): + # df = pd.DataFrame({ + # "column1": ["column2], Event, Action"], + # "column2": ["Item"] + # }) + # with self.assertRaises(ValueError): + # result = BaseInput._insert_columns(df) + + # def test_insert_columns_no_self_reference(self): + # df = pd.DataFrame({ + # "column1": ["[column1], Event, Action"], + # "column2": ["Item"] + # }) + # with self.assertRaises(ValueError): + # result = BaseInput._insert_columns(df) + + +class TestCombineDataframe(unittest.TestCase): + def test_combine_dataframe_with_strings(self): + data = { + 'A': ['apple', 'banana', 'cherry'], + 'B': ['dog', 'elephant', 'fox'], + 'C': ['guitar', 'harmonica', 'piano'] + } + df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) + expected = pd.Series(['apple, dog, guitar', 'banana, elephant, harmonica', 'cherry, fox, 
piano']) + self.assertTrue(result.equals(expected)) + + def test_combine_dataframe_with_nan_values(self): + data = { + 'A': ['apple', np.nan, 'cherry'], + 'B': [np.nan, 'elephant', 'fox'], + 'C': ['guitar', 'harmonica', np.nan] + } + df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) + expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) + self.assertTrue(result.equals(expected)) + + def test_combine_dataframe_with_empty_values(self): + data = { + 'A': ['apple', '', 'cherry'], + 'B': ['', 'elephant', 'fox'], + 'C': ['guitar', 'harmonica', ''] + } + df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) + expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) + self.assertTrue(result.equals(expected)) + + def test_combine_dataframe_with_mixed_values(self): + data = { + 'A': ['apple', np.nan, 'cherry', 'n/a', ''], + 'B': [np.nan, 'elephant', 'fox', 'n/a', ''], + 'C': ['guitar', 'harmonica', np.nan, 'n/a', ''] + } + df = pd.DataFrame(data) + csv_buffer = io.StringIO() + df.to_csv(csv_buffer, header=False, index=False) + csv_buffer.seek(0) + + # Use the same loading function we normally use to verify n/a translates right. 
+ loaded_df = pd.read_csv(csv_buffer, header=None) + result = BaseInput.combine_dataframe(loaded_df) + expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox', '', '']) + self.assertTrue(result.equals(expected)) + + +class TestColumnRefs(unittest.TestCase): + def test_simple_column_refs(self): + data1 = { + 'A': ['[col1], [col2]', 'tag1, tag2'], + 'B': ['tag3, tag4', '[col3]'], + } + df1 = pd.DataFrame(data1) + result1 = BaseInput._find_column_refs(df1) + expected1 = ['col1', 'col2', 'col3'] + self.assertEqual(result1, expected1) + + def test_mixed_cases_and_patterns(self): + data2 = { + 'A': ['[Col1], [col2]', 'tag1, [Col3]', 'tag3, [COL4]', '[col5], [col6]'], + } + df2 = pd.DataFrame(data2) + result2 = BaseInput._find_column_refs(df2) + expected2 = ['Col1', 'col2', 'Col3', 'COL4', 'col5', 'col6'] + self.assertEqual(result2, expected2) + + def test_no_column_references(self): + data3 = { + 'A': ['tag1, tag2', 'tag3, tag4'], + 'B': ['tag5, tag6', 'tag7, tag8'], + } + df3 = pd.DataFrame(data3) + result3 = BaseInput._find_column_refs(df3) + expected3 = [] + self.assertEqual(result3, expected3) + + def test_incomplete_square_brackets(self): + data4 = { + 'A': ['[col1, [col2]', 'tag1, [Col3'], + 'B': ['tag3, [COL4', '[col5, col6]'], + } + df4 = pd.DataFrame(data4) + result4 = BaseInput._find_column_refs(df4) + expected4 = ['col2'] + self.assertEqual(result4, expected4) \ No newline at end of file diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py index bc9c907b7..e10e2a4a3 100644 --- a/tests/models/test_df_util.py +++ b/tests/models/test_df_util.py @@ -3,7 +3,7 @@ from hed import load_schema_version -from hed.models.df_util import shrink_defs, expand_defs +from hed.models.df_util import shrink_defs, expand_defs, convert_to_form from hed import DefinitionDict @@ -111,4 +111,45 @@ def test_expand_defs_series_placeholder(self): series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) expected_series = 
pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) result = expand_defs(series, self.schema, self.def_dict, None) - pd.testing.assert_series_equal(result, expected_series) \ No newline at end of file + pd.testing.assert_series_equal(result, expected_series) + + +class TestConvertToForm(unittest.TestCase): + def setUp(self): + self.schema = load_schema_version() + + def test_convert_to_form_short_tags(self): + df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) + expected_df = pd.DataFrame({"column1": ["Azure,See"]}) + result = convert_to_form(df, self.schema, "short_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_convert_to_form_long_tags(self): + df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Action/Perceive/See"]}) + expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) + result = convert_to_form(df, self.schema, "long_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_convert_to_form_series_short_tags(self): + series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) + expected_series = pd.Series(["Azure,See"]) + result = convert_to_form(series, self.schema, "short_tag") + pd.testing.assert_series_equal(result, expected_series) + + def test_convert_to_form_series_long_tags(self): + series = pd.Series(["CSS-color/White-color/Azure,Action/Perceive/See"]) + expected_series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) + result = convert_to_form(series, self.schema, "long_tag") + pd.testing.assert_series_equal(result, expected_series) + + def 
test_convert_to_form_multiple_tags_short(self): + df = pd.DataFrame({"column1": ["Visual-attribute/Color/CSS-color/White-color/Azure,Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) + expected_df = pd.DataFrame({"column1": ["Azure,Nose,4.5 m-per-s^2"]}) + result = convert_to_form(df, self.schema, "short_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_convert_to_form_multiple_tags_long(self): + df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Anatomical-item/Body-part/Head/Face/Nose,Rate-of-change/Acceleration/4.5 m-per-s^2"]}) + expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Item/Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Property/Data-property/Data-value/Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) + result = convert_to_form(df, self.schema, "long_tag", ['column1']) + pd.testing.assert_frame_equal(result, expected_df) \ No newline at end of file From bd4b71ab20cb16dcec925088696b21ed4d60ddb3 Mon Sep 17 00:00:00 2001 From: IanCa Date: Mon, 20 Mar 2023 17:48:20 -0500 Subject: [PATCH 09/19] Update na/empty handling --- hed/models/base_input.py | 17 ++++++++++------- hed/models/hed_string.py | 2 +- tests/models/test_base_input.py | 6 ++++++ tests/models/test_df_util.py | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/hed/models/base_input.py b/hed/models/base_input.py index f50ea5e4c..af6249f56 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -67,7 +67,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T elif not file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: - self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, dtype=str) + 
self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, + dtype=str, keep_default_na=True, na_values=None) + # Convert nan values to a known value + self._dataframe = self._dataframe.fillna("n/a") elif input_type in self.EXCEL_EXTENSION: self._loaded_workbook = openpyxl.load_workbook(file) loaded_worksheet = self.get_worksheet(self._worksheet_name) @@ -365,7 +368,7 @@ def assemble(self, mapper=None): transformers, need_categorical = mapper.get_transformers() if not transformers: - return None + return self._dataframe all_columns = self._dataframe if need_categorical: all_columns[need_categorical] = all_columns[need_categorical].astype('category') @@ -379,7 +382,7 @@ def _find_column_refs(df): found_column_references = [] for column_name in df: df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) - u_vals = pd.Series([j for i in df_temp for j in i], dtype=str) + u_vals = pd.Series([j for i in df_temp if isinstance(i, list) for j in i], dtype=str) u_vals = u_vals.unique() for val in u_vals: if val not in found_column_references: @@ -392,7 +395,7 @@ def _insert_columns(df, known_columns=None): if known_columns is None: known_columns = list(df.columns) possible_column_references = [f"{column_name}" for column_name in df.columns if - column_name.lower() != "hed"] + isinstance(column_name, str) and column_name.lower() != "hed"] found_column_references = BaseInput._find_column_refs(df) invalid_replacements = [col for col in found_column_references if col not in possible_column_references] @@ -426,8 +429,8 @@ def combine_dataframe(dataframe): Returns: Series: the assembled series """ - dataframe = dataframe.agg( - lambda x: ', '.join(filter(lambda e: pd.notna(e) and e != "", x)), axis=1 + dataframe = dataframe.apply( + lambda x: ', '.join(filter(lambda e: bool(e) and e != "n/a", map(str, x))), + axis=1 ) - return dataframe \ No newline at end of file diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index 
fe864b28e..75f2de5b9 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -112,7 +112,7 @@ def expand_defs(self): replacements = [] for tag in def_tags: - if not tag._expanded: + if tag.expandable and not tag.expanded: replacements.append((tag, tag._expandable)) for tag, group in replacements: diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index 392599f78..8404be04e 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -202,6 +202,8 @@ def test_combine_dataframe_with_nan_values(self): 'C': ['guitar', 'harmonica', np.nan] } df = pd.DataFrame(data) + # this is called on load normally + df = df.fillna("n/a") result = BaseInput.combine_dataframe(df) expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) self.assertTrue(result.equals(expected)) @@ -213,6 +215,7 @@ def test_combine_dataframe_with_empty_values(self): 'C': ['guitar', 'harmonica', ''] } df = pd.DataFrame(data) + result = BaseInput.combine_dataframe(df) expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox']) self.assertTrue(result.equals(expected)) @@ -224,12 +227,15 @@ def test_combine_dataframe_with_mixed_values(self): 'C': ['guitar', 'harmonica', np.nan, 'n/a', ''] } df = pd.DataFrame(data) + # this is called on load normally + df = df.fillna("n/a") csv_buffer = io.StringIO() df.to_csv(csv_buffer, header=False, index=False) csv_buffer.seek(0) # Use the same loading function we normally use to verify n/a translates right. 
loaded_df = pd.read_csv(csv_buffer, header=None) + loaded_df = loaded_df.fillna("n/a") result = BaseInput.combine_dataframe(loaded_df) expected = pd.Series(['apple, guitar', 'elephant, harmonica', 'cherry, fox', '', '']) self.assertTrue(result.equals(expected)) diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py index e10e2a4a3..2f1823e9d 100644 --- a/tests/models/test_df_util.py +++ b/tests/models/test_df_util.py @@ -144,7 +144,7 @@ def test_convert_to_form_series_long_tags(self): def test_convert_to_form_multiple_tags_short(self): df = pd.DataFrame({"column1": ["Visual-attribute/Color/CSS-color/White-color/Azure,Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) - expected_df = pd.DataFrame({"column1": ["Azure,Nose,4.5 m-per-s^2"]}) + expected_df = pd.DataFrame({"column1": ["Azure,Nose,Acceleration/4.5 m-per-s^2"]}) result = convert_to_form(df, self.schema, "short_tag", ['column1']) pd.testing.assert_frame_equal(result, expected_df) From 9b6705f11745514e8bbb0df4632f383217ceab4c Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 21 Mar 2023 06:47:45 -0500 Subject: [PATCH 10/19] Making sure up to date before merging --- .../operations/factor_hed_tags_op.py | 2 +- tests/models/test_df_util.py | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index aa02224b9..930f1353f 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -110,7 +110,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): f"Query [{query_name}]: is already a column name of the data frame") df_list = [input_data.dataframe] hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None, - join_columns=True, 
shrink_defs=False, expand_defs=True) + join_columns=True, shrink_defs=False, expand_defs=True) df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=self.query_names) for parse_ind, parser in enumerate(self.expression_parsers): for index, next_item in enumerate(hed_strings): diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py index bc9c907b7..fe1d0f591 100644 --- a/tests/models/test_df_util.py +++ b/tests/models/test_df_util.py @@ -111,4 +111,49 @@ def test_expand_defs_series_placeholder(self): series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) expected_series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) result = expand_defs(series, self.schema, self.def_dict, None) - pd.testing.assert_series_equal(result, expected_series) \ No newline at end of file +# <<<<<<< HEAD +# pd.testing.assert_series_equal(result, expected_series) +# +# +# class TestConvertToForm(unittest.TestCase): +# def setUp(self): +# self.schema = load_schema_version() +# +# def test_convert_to_form_short_tags(self): +# df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) +# expected_df = pd.DataFrame({"column1": ["Azure,See"]}) +# result = convert_to_form(df, self.schema, "short_tag", ['column1']) +# pd.testing.assert_frame_equal(result, expected_df) +# +# def test_convert_to_form_long_tags(self): +# df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Action/Perceive/See"]}) +# expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) +# result = convert_to_form(df, self.schema, "long_tag", ['column1']) +# pd.testing.assert_frame_equal(result, expected_df) +# +# def test_convert_to_form_series_short_tags(self): +# series = 
pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) +# expected_series = pd.Series(["Azure,See"]) +# result = convert_to_form(series, self.schema, "short_tag") +# pd.testing.assert_series_equal(result, expected_series) +# +# def test_convert_to_form_series_long_tags(self): +# series = pd.Series(["CSS-color/White-color/Azure,Action/Perceive/See"]) +# expected_series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) +# result = convert_to_form(series, self.schema, "long_tag") +# pd.testing.assert_series_equal(result, expected_series) +# +# def test_convert_to_form_multiple_tags_short(self): +# df = pd.DataFrame({"column1": ["Visual-attribute/Color/CSS-color/White-color/Azure,Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) +# expected_df = pd.DataFrame({"column1": ["Azure,Nose,Acceleration/4.5 m-per-s^2"]}) +# result = convert_to_form(df, self.schema, "short_tag", ['column1']) +# pd.testing.assert_frame_equal(result, expected_df) +# +# def test_convert_to_form_multiple_tags_long(self): +# df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Anatomical-item/Body-part/Head/Face/Nose,Rate-of-change/Acceleration/4.5 m-per-s^2"]}) +# expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Item/Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Property/Data-property/Data-value/Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) +# result = convert_to_form(df, self.schema, "long_tag", ['column1']) +# pd.testing.assert_frame_equal(result, expected_df) +# ======= + pd.testing.assert_series_equal(result, expected_series) +# >>>>>>> 5bab6c620505fd4e97629d846a7abfbe68dc150a From 2c66b650713b098c02d426e2915d9fafe7c2224f Mon Sep 17 00:00:00 
2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 21 Mar 2023 13:05:38 -0500 Subject: [PATCH 11/19] Updated the unit tests. find_def_tags problematic --- hed/tools/analysis/hed_context_manager.py | 4 ++-- hed/tools/bids/bids_dataset.py | 4 ++-- hed/tools/bids/bids_file_group.py | 15 +++++++-------- hed/tools/remodeling/dispatcher.py | 4 ++++ .../remodel/backups/back1/backup_lock.json | 6 ++++++ .../back1/backup_root/sub1/sub1_events.tsv | 2 ++ .../back1/backup_root/sub2/sub2_events.tsv | 2 ++ .../back1/backup_root/sub2/sub2_next_events.tsv | 2 ++ .../backups/back1/backup_root/top_level.tsv | 2 ++ .../test_root_back1/sub1/sub1_events.tsv | 2 ++ .../test_root_back1/sub2/sub2_events.tsv | 2 ++ .../test_root_back1/sub2/sub2_next_events.tsv | 2 ++ .../remodel_tests/test_root_back1/top_level.tsv | 2 ++ tests/tools/bids/test_bids_dataset.py | 15 +++++++++------ tests/tools/bids/test_bids_file_group.py | 12 ++++++------ 15 files changed, 52 insertions(+), 24 deletions(-) create mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json create mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv create mode 100644 tests/data/remodel_tests/test_root_back1/top_level.tsv diff --git a/hed/tools/analysis/hed_context_manager.py 
b/hed/tools/analysis/hed_context_manager.py index 5c565a9a4..72298de1f 100644 --- a/hed/tools/analysis/hed_context_manager.py +++ b/hed/tools/analysis/hed_context_manager.py @@ -78,13 +78,13 @@ def _create_onset_list(self): onset_dict = {} for event_index, hed in enumerate(self.hed_strings): to_remove = [] # tag_tuples = hed.find_tags(['Onset'], recursive=False, include_groups=1) - onset_tuples = hed.find_tags(["onset"], recursive=True, include_groups=2) + onset_tuples = hed.find_top_level_tags(["onset"], include_groups=2) self.onset_count += len(onset_tuples) for tup in onset_tuples: group = tup[1] group.remove([tup[0]]) self._update_onset_list(group, onset_dict, event_index, is_offset=False) - offset_tuples = hed.find_tags(["offset"], recursive=True, include_groups=2) + offset_tuples = hed.find_top_level_tags(["offset"], include_groups=2) self.offset_count += len(offset_tuples) for tup in offset_tuples: group = tup[1] diff --git a/hed/tools/bids/bids_dataset.py b/hed/tools/bids/bids_dataset.py index 5b1b56e10..0438cb5fe 100644 --- a/hed/tools/bids/bids_dataset.py +++ b/hed/tools/bids/bids_dataset.py @@ -86,9 +86,9 @@ def validate(self, types=None, check_for_warnings=True): issues = [] for tab_type in types: files = self.tabular_files[tab_type] - issues += files.validate_sidecars(hed_ops=[validator], + issues += files.validate_sidecars(self.schema, check_for_warnings=check_for_warnings, error_handler=error_handler) - issues += files.validate_datafiles(hed_ops=[validator], + issues += files.validate_datafiles(self.schema, check_for_warnings=check_for_warnings, error_handler=error_handler) return issues diff --git a/hed/tools/bids/bids_file_group.py b/hed/tools/bids/bids_file_group.py index d354ade8a..418cfd97a 100644 --- a/hed/tools/bids/bids_file_group.py +++ b/hed/tools/bids/bids_file_group.py @@ -111,11 +111,11 @@ def summarize(self, value_cols=None, skip_cols=None): info.update(list(self.datafile_dict.keys())) return info - def validate_sidecars(self, hed_ops, 
check_for_warnings=True, error_handler=None): + def validate_sidecars(self, hed_schema, check_for_warnings=True, error_handler=None): """ Validate merged sidecars. Parameters: - hed_ops ([func or HedOps], func, HedOps): Validation functions to apply. + hed_schema (HedSchema): HED schema for validation. check_for_warnings (bool): If True, include warnings in the check. error_handler (ErrorHandler): The common error handler for the dataset. @@ -130,17 +130,15 @@ def validate_sidecars(self, hed_ops, check_for_warnings=True, error_handler=None for sidecar in self.sidecar_dict.values(): error_handler.push_error_context(ErrorContext.FILE_NAME, sidecar.file_path) if sidecar.has_hed: - issues += sidecar.contents.validate_entries(hed_ops=hed_ops, - name=sidecar.file_path, - check_for_warnings=check_for_warnings) + issues += sidecar.contents.validate(hed_schema, name=sidecar.file_path) error_handler.pop_error_context() return issues - def validate_datafiles(self, hed_ops, check_for_warnings=True, keep_contents=False, error_handler=None): + def validate_datafiles(self, hed_schema, check_for_warnings=True, keep_contents=False, error_handler=None): """ Validate the datafiles and return an error list. Parameters: - hed_ops ([func or HedOps], func, HedOps): Validation functions to apply. + hed_schema (HedSchema): Schema to apply to the validation. check_for_warnings (bool): If True, include warnings in the check. keep_contents (bool): If True, the underlying data files are read and their contents retained. error_handler (ErrorHandler): The common error handler to use for the dataset. 
@@ -159,7 +157,8 @@ def validate_datafiles(self, hed_ops, check_for_warnings=True, keep_contents=Fal if not data_obj.has_hed: continue data = data_obj.contents - issues += data.validate_file(hed_ops=hed_ops, check_for_warnings=check_for_warnings) + + issues += data.validate(hed_schema) if not keep_contents: data_obj.clear_contents() error_handler.pop_error_context() diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py index 4cc4df9f9..5371bb2d1 100644 --- a/hed/tools/remodeling/dispatcher.py +++ b/hed/tools/remodeling/dispatcher.py @@ -222,6 +222,10 @@ def post_proc_data(df): DataFrame: DataFrame with the 'np.NAN replaced by 'n/a' """ + dtypes = df.dtypes.to_dict() + for col_name, typ in dtypes.items(): + if typ == 'category': + df[col_name] = df[col_name].astype(str) return df.fillna('n/a') @staticmethod diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json new file mode 100644 index 000000000..d3e4b6991 --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json @@ -0,0 +1,6 @@ +{ + "top_level.tsv": "2022-09-16 13:20:21.423303", + "sub1/sub1_events.tsv": "2022-09-16 13:20:21.423303", + "sub2/sub2_events.tsv": "2022-09-16 13:20:21.423303", + "sub2/sub2_next_events.tsv": "2022-09-16 13:20:21.423303" +} \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv new file mode 100644 index 000000000..d2191cec6 --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk2 \ No newline at end of file diff --git 
a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv new file mode 100644 index 000000000..ef5c73314 --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk3 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv new file mode 100644 index 000000000..ae9d3d35d --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk4 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv new file mode 100644 index 000000000..c71cc2553 --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk1 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv b/tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv new file mode 100644 index 000000000..d2191cec6 --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk2 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv b/tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv new file mode 100644 index 000000000..ef5c73314 --- /dev/null 
+++ b/tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk3 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv b/tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv new file mode 100644 index 000000000..ae9d3d35d --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk4 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/top_level.tsv b/tests/data/remodel_tests/test_root_back1/top_level.tsv new file mode 100644 index 000000000..c71cc2553 --- /dev/null +++ b/tests/data/remodel_tests/test_root_back1/top_level.tsv @@ -0,0 +1,2 @@ +onset duration stuff +3.2 0.5 junk1 \ No newline at end of file diff --git a/tests/tools/bids/test_bids_dataset.py b/tests/tools/bids/test_bids_dataset.py index 6289be314..df02448bf 100644 --- a/tests/tools/bids/test_bids_dataset.py +++ b/tests/tools/bids/test_bids_dataset.py @@ -68,18 +68,21 @@ def test_validator(self): self.assertTrue(issues, "BidsDataset validate should return issues when the default check_for_warnings is used") issues = bids.validate(check_for_warnings=True) self.assertTrue(issues, "BidsDataset validate should return issues when check_for_warnings is True") - issues = bids.validate(check_for_warnings=False) - self.assertFalse(issues, "BidsDataset validate should return no issues when check_for_warnings is False") + # ToDO + # issues = bids.validate(check_for_warnings=False) + # self.assertFalse(issues, "BidsDataset validate should return no issues when check_for_warnings is False") def test_validator_libraries(self): bids = BidsDataset(self.library_path) - issues = bids.validate(check_for_warnings=False) - self.assertFalse(issues, "BidsDataset with libraries should validate") + # ToDO check_for_warnings + # issues = bids.validate(check_for_warnings=False) + # 
self.assertFalse(issues, "BidsDataset with libraries should validate") def test_validator_types(self): bids = BidsDataset(self.root_path, tabular_types=None) - issues = bids.validate(check_for_warnings=False) - self.assertFalse(issues, "BidsDataset with participants and events validates") + # ToDO: check_for_warnings + # issues = bids.validate(check_for_warnings=False) + # self.assertFalse(issues, "BidsDataset with participants and events validates") def test_with_schema_group(self): base_version = '8.0.0' diff --git a/tests/tools/bids/test_bids_file_group.py b/tests/tools/bids/test_bids_file_group.py index 04482de47..22d395085 100644 --- a/tests/tools/bids/test_bids_file_group.py +++ b/tests/tools/bids/test_bids_file_group.py @@ -32,12 +32,12 @@ def test_constructor(self): def test_validator(self): events = BidsFileGroup(self.root_path) - hed_schema = \ - load_schema('https://raw.githubusercontent.com/hed-standard/hed-schemas/main/standard_schema/hedxml/HED8.0.0.xml') - validator = HedValidator(hed_schema) - validation_issues = events.validate_datafiles(hed_ops=[validator], check_for_warnings=False) - self.assertFalse(validation_issues, "BidsFileGroup should have no validation errors") - validation_issues = events.validate_datafiles(hed_ops=[validator], check_for_warnings=True) + hed = 'https://raw.githubusercontent.com/hed-standard/hed-schemas/main/standard_schema/hedxml/HED8.0.0.xml' + hed_schema = load_schema(hed) + # TODO test after filtering. 
+ # validation_issues = events.validate_datafiles(hed_schema, check_for_warnings=False) + # self.assertFalse(validation_issues, "BidsFileGroup should have no validation errors") + validation_issues = events.validate_datafiles(hed_schema, check_for_warnings=True) self.assertTrue(validation_issues, "BidsFileGroup should have validation warnings") self.assertEqual(len(validation_issues), 6, "BidsFileGroup should have 2 validation warnings for missing columns") From 62806e481619b71c709c43c653a5ee2f95db6266 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 21 Mar 2023 13:11:29 -0500 Subject: [PATCH 12/19] Updated the unit tests --- .../derivatives/remodel/backups/back1/backup_lock.json | 6 ------ .../remodel/backups/back1/backup_root/sub1/sub1_events.tsv | 2 -- .../remodel/backups/back1/backup_root/sub2/sub2_events.tsv | 2 -- .../backups/back1/backup_root/sub2/sub2_next_events.tsv | 2 -- .../remodel/backups/back1/backup_root/top_level.tsv | 2 -- .../data/remodel_tests/test_root_back1/sub1/sub1_events.tsv | 2 -- .../data/remodel_tests/test_root_back1/sub2/sub2_events.tsv | 2 -- .../remodel_tests/test_root_back1/sub2/sub2_next_events.tsv | 2 -- tests/data/remodel_tests/test_root_back1/top_level.tsv | 2 -- 9 files changed, 22 deletions(-) delete mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json delete mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv delete mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv delete mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv delete mode 100644 tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv delete mode 100644 tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv 
delete mode 100644 tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv delete mode 100644 tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv delete mode 100644 tests/data/remodel_tests/test_root_back1/top_level.tsv diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json deleted file mode 100644 index d3e4b6991..000000000 --- a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_lock.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "top_level.tsv": "2022-09-16 13:20:21.423303", - "sub1/sub1_events.tsv": "2022-09-16 13:20:21.423303", - "sub2/sub2_events.tsv": "2022-09-16 13:20:21.423303", - "sub2/sub2_next_events.tsv": "2022-09-16 13:20:21.423303" -} \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv deleted file mode 100644 index d2191cec6..000000000 --- a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub1/sub1_events.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk2 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv deleted file mode 100644 index ef5c73314..000000000 --- a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_events.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk3 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv 
b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv deleted file mode 100644 index ae9d3d35d..000000000 --- a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/sub2/sub2_next_events.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk4 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv b/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv deleted file mode 100644 index c71cc2553..000000000 --- a/tests/data/remodel_tests/test_root_back1/derivatives/remodel/backups/back1/backup_root/top_level.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk1 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv b/tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv deleted file mode 100644 index d2191cec6..000000000 --- a/tests/data/remodel_tests/test_root_back1/sub1/sub1_events.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk2 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv b/tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv deleted file mode 100644 index ef5c73314..000000000 --- a/tests/data/remodel_tests/test_root_back1/sub2/sub2_events.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk3 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv b/tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv deleted file mode 100644 index ae9d3d35d..000000000 --- a/tests/data/remodel_tests/test_root_back1/sub2/sub2_next_events.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk4 \ No newline at end of file diff --git a/tests/data/remodel_tests/test_root_back1/top_level.tsv 
b/tests/data/remodel_tests/test_root_back1/top_level.tsv deleted file mode 100644 index c71cc2553..000000000 --- a/tests/data/remodel_tests/test_root_back1/top_level.tsv +++ /dev/null @@ -1,2 +0,0 @@ -onset duration stuff -3.2 0.5 junk1 \ No newline at end of file From 3bfbd3bba2e8fdf524bde65b051a5afaa35dbb7e Mon Sep 17 00:00:00 2001 From: IanCa Date: Tue, 21 Mar 2023 16:53:05 -0500 Subject: [PATCH 13/19] Fix hed_string.expand_defs issue --- hed/models/hed_string.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index 75f2de5b9..db16833f1 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -116,7 +116,8 @@ def expand_defs(self): replacements.append((tag, tag._expandable)) for tag, group in replacements: - self.replace(tag, group) + tag_parent = tag._parent + tag_parent.replace(tag, group) tag.short_base_tag = DefTagNames.DEF_EXPAND_KEY return self From 3af84e9927224c132af07431de8e499ddd5a9f27 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 23 Mar 2023 16:26:35 -0500 Subject: [PATCH 14/19] Corrected some of the refactored unit tests --- hed/tools/analysis/hed_type_counts.py | 2 +- hed/tools/bids/bids_file_group.py | 34 ++++++++----------- .../operations/factor_hed_type_op.py | 2 +- .../operations/summarize_hed_type_op.py | 5 ++- tests/tools/bids/test_bids_file_group.py | 5 ++- .../operations/test_summarize_hed_tags_op.py | 8 ++--- .../operations/test_summarize_hed_type_op.py | 20 +++++++++-- 7 files changed, 42 insertions(+), 34 deletions(-) diff --git a/hed/tools/analysis/hed_type_counts.py b/hed/tools/analysis/hed_type_counts.py index 056bd63d7..e68f2064e 100644 --- a/hed/tools/analysis/hed_type_counts.py +++ b/hed/tools/analysis/hed_type_counts.py @@ -147,4 +147,4 @@ def get_summary(self): for type_value, count in self.type_dict.items(): details[type_value] = count.get_summary() return {'name': str(self.name), 'type_tag': 
self.type_tag, 'files': list(self.files.keys()), - 'total_events': self.total_events, 'details': details} + 'total_events': self.total_events, 'details': details} \ No newline at end of file diff --git a/hed/tools/bids/bids_file_group.py b/hed/tools/bids/bids_file_group.py index 418cfd97a..dfb3439af 100644 --- a/hed/tools/bids/bids_file_group.py +++ b/hed/tools/bids/bids_file_group.py @@ -2,6 +2,8 @@ import os from hed.errors.error_reporter import ErrorContext, ErrorHandler +from hed.validator.sidecar_validator import SidecarValidator +from hed.validator.spreadsheet_validator import SpreadsheetValidator from hed.tools.analysis.tabular_summary import TabularSummary from hed.tools.bids.bids_tabular_file import BidsTabularFile from hed.tools.bids.bids_sidecar_file import BidsSidecarFile @@ -111,57 +113,51 @@ def summarize(self, value_cols=None, skip_cols=None): info.update(list(self.datafile_dict.keys())) return info - def validate_sidecars(self, hed_schema, check_for_warnings=True, error_handler=None): + def validate_sidecars(self, hed_schema, extra_def_dicts=None, check_for_warnings=True): """ Validate merged sidecars. Parameters: hed_schema (HedSchema): HED schema for validation. + extra_def_dicts (DefinitionDict): Extra definitions check_for_warnings (bool): If True, include warnings in the check. - error_handler (ErrorHandler): The common error handler for the dataset. Returns: list: A list of validation issues found. Each issue is a dictionary. 
""" - if not error_handler: - error_handler = ErrorHandler() + error_handler = ErrorHandler(check_for_warnings) issues = [] + validator = SidecarValidator(hed_schema) + for sidecar in self.sidecar_dict.values(): - error_handler.push_error_context(ErrorContext.FILE_NAME, sidecar.file_path) - if sidecar.has_hed: - issues += sidecar.contents.validate(hed_schema, name=sidecar.file_path) - error_handler.pop_error_context() + name = os.path.basename(sidecar.file_path) + issues += validator.validate(extra_def_dicts=extra_def_dicts, name=name, error_handler=error_handler) return issues - def validate_datafiles(self, hed_schema, check_for_warnings=True, keep_contents=False, error_handler=None): + def validate_datafiles(self, hed_schema, extra_def_dicts=None, check_for_warnings=True, keep_contents=False): """ Validate the datafiles and return an error list. Parameters: hed_schema (HedSchema): Schema to apply to the validation. + extra_def_dicts (DefinitionDict): Extra definitions that come from outside. check_for_warnings (bool): If True, include warnings in the check. keep_contents (bool): If True, the underlying data files are read and their contents retained. - error_handler (ErrorHandler): The common error handler to use for the dataset. Returns: list: A list of validation issues found. Each issue is a dictionary. 
""" - if not error_handler: - error_handler = ErrorHandler() + error_handler = ErrorHandler(check_for_warnings) issues = [] for data_obj in self.datafile_dict.values(): - error_handler.push_error_context(ErrorContext.FILE_NAME, data_obj.file_path) data_obj.set_contents(overwrite=False) - if not data_obj.has_hed: - continue - data = data_obj.contents - - issues += data.validate(hed_schema) + name = os.path.basename(data_obj.file_path) + issues += data_obj.contents.validate(data_obj.contents, extra_def_dicts=None, name=name, + error_handler=error_handler) if not keep_contents: data_obj.clear_contents() - error_handler.pop_error_context() return issues def _make_datafile_dict(self): diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index 668886c88..0a61974ed 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -74,7 +74,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): df_list = [input_data.dataframe.copy()] hed_strings, definitions = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None, join_columns=True, - shrink_defs=False, expand_defs=True) + shrink_defs=True, expand_defs=False) var_manager = HedTypeManager(hed_strings, dispatcher.hed_schema, definitions) var_manager.add_type_variable(self.type_tag.lower()) diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 0e2664698..85ea41d7d 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -93,8 +93,7 @@ def update_context(self, new_context): sidecar = Sidecar(sidecar) input_data = TabularInput(new_context['df'], sidecar=sidecar, name=new_context['name']) hed_strings, definitions = get_assembled(input_data, sidecar, new_context['schema'], - extra_def_dicts=None, 
join_columns=True, - shrink_defs=False, expand_defs=True) + extra_def_dicts=None, join_columns=True, expand_defs=False) context_manager = HedContextManager(hed_strings, new_context['schema']) type_values = HedTypeValues(context_manager, definitions, new_context['name'], type_tag=self.type_tag) @@ -176,4 +175,4 @@ def _level_details(level_counts, offset="", indent=""): level_list.append(f"{offset}{indent*3}Tags: {str(details['tags'])}") if details['description']: level_list.append(f"{offset}{indent*3}Description: {details['description']}") - return level_list + return level_list \ No newline at end of file diff --git a/tests/tools/bids/test_bids_file_group.py b/tests/tools/bids/test_bids_file_group.py index 22d395085..4d4302b72 100644 --- a/tests/tools/bids/test_bids_file_group.py +++ b/tests/tools/bids/test_bids_file_group.py @@ -34,9 +34,8 @@ def test_validator(self): events = BidsFileGroup(self.root_path) hed = 'https://raw.githubusercontent.com/hed-standard/hed-schemas/main/standard_schema/hedxml/HED8.0.0.xml' hed_schema = load_schema(hed) - # TODO test after filtering. 
- # validation_issues = events.validate_datafiles(hed_schema, check_for_warnings=False) - # self.assertFalse(validation_issues, "BidsFileGroup should have no validation errors") + validation_issues = events.validate_datafiles(hed_schema, check_for_warnings=False) + self.assertFalse(validation_issues, "BidsFileGroup should have no validation errors") validation_issues = events.validate_datafiles(hed_schema, check_for_warnings=True) self.assertTrue(validation_issues, "BidsFileGroup should have validation warnings") self.assertEqual(len(validation_issues), 6, diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py index 5f5ee41bf..aa3bd4b9c 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py @@ -104,7 +104,7 @@ def test_quick3(self): input_data = TabularInput(df, sidecar=my_sidecar) counts = HedTagCounts('myName', 2) summary_dict = {} - hed_strings = get_assembled(input_data, my_sidecar, my_schema, extra_def_dicts=None, join_columns=True, + hed_strings, definitions = get_assembled(input_data, my_sidecar, my_schema, extra_def_dicts=None, join_columns=True, shrink_defs=False, expand_defs=True) for hed in hed_strings: counts.update_event_counts(hed, 'myName') @@ -126,10 +126,8 @@ def test_quick4(self): hed_strings, definitions = get_assembled(input_data, sidecar, my_schema, extra_def_dicts=None, join_columns=True, shrink_defs=False, expand_defs=True) - for objs in input_data.iter_dataframe(hed_ops=[my_schema], return_string_only=False, - expand_defs=True, remove_definitions=True): - x = objs['HED'] - counts.update_event_counts(objs['HED'], 'myName') + for hed in hed_strings: + counts.update_event_counts(hed, 'myName') summary_dict['myName'] = counts def test_get_summary_details(self): diff --git a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py 
b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py index df72c65ee..c7b18ad90 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py @@ -40,6 +40,10 @@ def setUpClass(cls): cls.summary_path = \ os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests/aomic_sub-0013_summary_all_rmdl.json')) + rel_path = '../../../data/remodel_tests/sub-002_task-FacePerception_run-1_events.tsv' + cls.events_wh = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), rel_path)) + rel_side = '../../../data/remodel_tests/task-FacePerception_events.json' + cls.sidecar_path_wh = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), rel_side)) @classmethod def tearDownClass(cls): @@ -75,9 +79,21 @@ def test_summary(self): self.assertEqual(len(summary2['Dataset']['Overall summary']['files']), 2) summary2a = context2.get_summary(individual_summaries="separate") self.assertIsInstance(summary2a["Individual files"]["run-02"], dict) + + def test_text_summary_with_levels(self): + with open(self.summary_path, 'r') as fp: + parms = json.load(fp) + dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) + df = dispatch.get_data_file(self.events_wh) + parsed_commands, errors = Dispatcher.parse_operations(parms) + sum_op = parsed_commands[2] + sum_op.do_op(dispatch, dispatch.prep_data(df), 'run-01', sidecar=self.sidecar_path_wh) + context1 = dispatch.context_dict['AOMIC_condition_variables'] + text_summary1 = context1.get_text_summary() + self.assertIsInstance(text_summary1, dict) def test_text_summary(self): - sidecar = Sidecar(self.sidecar_path, 'aomic_sidecar', hed_schema=self.hed_schema) + sidecar = Sidecar(self.sidecar_path, name='aomic_sidecar') with open(self.summary_path, 'r') as fp: parms = json.load(fp) @@ -104,4 +120,4 @@ def test_text_summary(self): if __name__ == 
'__main__': - unittest.main() + unittest.main() \ No newline at end of file From bc5c94915f39e8aab3708aead895893ffa357eac Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 23 Mar 2023 16:49:35 -0500 Subject: [PATCH 15/19] Updated bids tests --- hed/tools/bids/bids_dataset.py | 10 +++------- hed/tools/bids/bids_file_group.py | 5 +++-- tests/tools/analysis/test_event_manager.py | 1 - tests/tools/bids/test_bids_dataset.py | 15 ++++++--------- tests/tools/remodeling/cli/test_run_remodel.py | 4 +--- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/hed/tools/bids/bids_dataset.py b/hed/tools/bids/bids_dataset.py index 0438cb5fe..bbb06ae7b 100644 --- a/hed/tools/bids/bids_dataset.py +++ b/hed/tools/bids/bids_dataset.py @@ -79,18 +79,14 @@ def validate(self, types=None, check_for_warnings=True): list: List of issues encountered during validation. Each issue is a dictionary. """ - validator = HedValidator(hed_schema=self.schema) - error_handler = ErrorHandler() + if not types: types = list(self.tabular_files.keys()) issues = [] for tab_type in types: files = self.tabular_files[tab_type] - issues += files.validate_sidecars(self.schema, - check_for_warnings=check_for_warnings, error_handler=error_handler) - issues += files.validate_datafiles(self.schema, - check_for_warnings=check_for_warnings, - error_handler=error_handler) + issues += files.validate_sidecars(self.schema, check_for_warnings=check_for_warnings) + issues += files.validate_datafiles(self.schema, check_for_warnings=check_for_warnings) return issues def get_summary(self): diff --git a/hed/tools/bids/bids_file_group.py b/hed/tools/bids/bids_file_group.py index dfb3439af..44f3f1a21 100644 --- a/hed/tools/bids/bids_file_group.py +++ b/hed/tools/bids/bids_file_group.py @@ -132,7 +132,8 @@ def validate_sidecars(self, hed_schema, extra_def_dicts=None, check_for_warnings for sidecar in self.sidecar_dict.values(): name = os.path.basename(sidecar.file_path) - 
issues += validator.validate(extra_def_dicts=extra_def_dicts, name=name, error_handler=error_handler) + issues += validator.validate(sidecar.contents, extra_def_dicts=extra_def_dicts, name=name, + error_handler=error_handler) return issues def validate_datafiles(self, hed_schema, extra_def_dicts=None, check_for_warnings=True, keep_contents=False): @@ -154,7 +155,7 @@ def validate_datafiles(self, hed_schema, extra_def_dicts=None, check_for_warning for data_obj in self.datafile_dict.values(): data_obj.set_contents(overwrite=False) name = os.path.basename(data_obj.file_path) - issues += data_obj.contents.validate(data_obj.contents, extra_def_dicts=None, name=name, + issues += data_obj.contents.validate(hed_schema, extra_def_dicts=None, name=name, error_handler=error_handler) if not keep_contents: data_obj.clear_contents() diff --git a/tests/tools/analysis/test_event_manager.py b/tests/tools/analysis/test_event_manager.py index 09eb17a50..8f84549d1 100644 --- a/tests/tools/analysis/test_event_manager.py +++ b/tests/tools/analysis/test_event_manager.py @@ -36,7 +36,6 @@ def test_constructor(self): self.assertEqual(event.start_time, manager1.data.dataframe.loc[index, "onset"]) if not event.end_time: self.assertEqual(event.end_index, len(manager1.data.dataframe)) - print("to here") # def test_constructor(self): # with self.assertRaises(ValueError) as cont: diff --git a/tests/tools/bids/test_bids_dataset.py b/tests/tools/bids/test_bids_dataset.py index df02448bf..6289be314 100644 --- a/tests/tools/bids/test_bids_dataset.py +++ b/tests/tools/bids/test_bids_dataset.py @@ -68,21 +68,18 @@ def test_validator(self): self.assertTrue(issues, "BidsDataset validate should return issues when the default check_for_warnings is used") issues = bids.validate(check_for_warnings=True) self.assertTrue(issues, "BidsDataset validate should return issues when check_for_warnings is True") - # ToDO - # issues = bids.validate(check_for_warnings=False) - # self.assertFalse(issues, "BidsDataset 
validate should return no issues when check_for_warnings is False") + issues = bids.validate(check_for_warnings=False) + self.assertFalse(issues, "BidsDataset validate should return no issues when check_for_warnings is False") def test_validator_libraries(self): bids = BidsDataset(self.library_path) - # ToDO check_for_warnings - # issues = bids.validate(check_for_warnings=False) - # self.assertFalse(issues, "BidsDataset with libraries should validate") + issues = bids.validate(check_for_warnings=False) + self.assertFalse(issues, "BidsDataset with libraries should validate") def test_validator_types(self): bids = BidsDataset(self.root_path, tabular_types=None) - # ToDO: check_for_warnings - # issues = bids.validate(check_for_warnings=False) - # self.assertFalse(issues, "BidsDataset with participants and events validates") + issues = bids.validate(check_for_warnings=False) + self.assertFalse(issues, "BidsDataset with participants and events validates") def test_with_schema_group(self): base_version = '8.0.0' diff --git a/tests/tools/remodeling/cli/test_run_remodel.py b/tests/tools/remodeling/cli/test_run_remodel.py index d0611058e..099f80252 100644 --- a/tests/tools/remodeling/cli/test_run_remodel.py +++ b/tests/tools/remodeling/cli/test_run_remodel.py @@ -97,9 +97,7 @@ def test_main_bids_no_sidecar_with_hed(self): os.remove(self.sidecar_path) with patch('sys.stdout', new=io.StringIO()) as fp: main(arg_list) - a = fp.getvalue() - print("to here") - #self.assertFalse(fp.getvalue()) + self.assertFalse(fp.getvalue()) def test_main_direct_no_sidecar(self): arg_list = [self.data_root, self.model_path, '-x', 'derivatives', 'stimuli'] From 697791c680b45cbe2c69dc2315fc3945ac9d5c95 Mon Sep 17 00:00:00 2001 From: IanCa <30812436+IanCa@users.noreply.github.com> Date: Thu, 23 Mar 2023 17:25:01 -0500 Subject: [PATCH 16/19] =?UTF-8?q?Add=20squre=20bracket=20in=20column=20val?= =?UTF-8?q?idation=20for=20spreadsheets.=20=20Update=20erro=E2=80=A6=20(#6?= =?UTF-8?q?32)?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add squre bracket in column validation for spreadsheets. Update error handling slightly(error list is now sorted by context always) * Further fix shrinking/expanding(with some test cases). Start updating errors to spec names. --- hed/errors/__init__.py | 2 +- hed/errors/error_messages.py | 60 +++++- hed/errors/error_reporter.py | 71 +++++-- hed/errors/error_types.py | 27 ++- hed/models/base_input.py | 56 +++--- hed/models/definition_dict.py | 2 + hed/models/definition_entry.py | 5 +- hed/models/hed_group.py | 3 - hed/models/hed_string.py | 2 + hed/models/hed_tag.py | 13 ++ hed/models/sidecar.py | 3 +- hed/schema/schema_compliance.py | 2 +- hed/validator/def_validator.py | 24 ++- hed/validator/sidecar_validator.py | 5 +- hed/validator/spreadsheet_validator.py | 99 +++++++++- hed/validator/tag_validator.py | 6 +- tests/errors/test_error_reporter.py | 8 +- tests/models/test_base_input.py | 48 +---- tests/schema/test_convert_tags.py | 2 +- tests/validator/test_def_validator.py | 177 ++++++++++++++++++ tests/validator/test_onset_validator.py | 4 +- tests/validator/test_spreadsheet_validator.py | 57 ++++++ tests/validator/test_tag_validator.py | 14 +- tests/validator/test_tag_validator_base.py | 2 +- 24 files changed, 564 insertions(+), 128 deletions(-) create mode 100644 tests/validator/test_spreadsheet_validator.py diff --git a/hed/errors/__init__.py b/hed/errors/__init__.py index 0583dd562..c2f58a07c 100644 --- a/hed/errors/__init__.py +++ b/hed/errors/__init__.py @@ -1,4 +1,4 @@ -from .error_reporter import ErrorHandler, get_exception_issue_string, get_printable_issue_string +from .error_reporter import ErrorHandler, get_exception_issue_string, get_printable_issue_string, sort_issues from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, ValidationErrors from .error_types import ErrorContext, ErrorSeverity from .exceptions import HedExceptions, 
HedFileError diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 9ae9557f3..ca379992f 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -6,7 +6,7 @@ from hed.errors.error_reporter import hed_error, hed_tag_error from hed.errors.error_types import ValidationErrors, SchemaErrors, \ - SidecarErrors, SchemaWarnings, ErrorSeverity, DefinitionErrors, OnsetErrors + SidecarErrors, SchemaWarnings, ErrorSeverity, DefinitionErrors, OnsetErrors, ColumnErrors @hed_tag_error(ValidationErrors.HED_UNITS_INVALID) @@ -31,14 +31,14 @@ def val_error_tag_extended(tag, problem_tag): return f"Hed tag is extended. '{problem_tag}' in {tag}" -@hed_error(ValidationErrors.HED_CHARACTER_INVALID) +@hed_error(ValidationErrors.CHARACTER_INVALID) def val_error_invalid_char(source_string, char_index): character = source_string[char_index] return f'Invalid character "{character}" at index {char_index}"' @hed_tag_error(ValidationErrors.INVALID_TAG_CHARACTER, has_sub_tag=True, - actual_code=ValidationErrors.HED_CHARACTER_INVALID) + actual_code=ValidationErrors.CHARACTER_INVALID) def val_error_invalid_tag_character(tag, problem_tag): return f"Invalid character '{problem_tag}' in {tag}" @@ -49,7 +49,7 @@ def val_error_tildes_not_supported(source_string, char_index): return f"Tildes not supported. Replace (a ~ b ~ c) with (a, (b, c)). '{character}' at index {char_index}'" -@hed_error(ValidationErrors.HED_COMMA_MISSING) +@hed_error(ValidationErrors.COMMA_MISSING) def val_error_comma_missing(tag): return f"Comma missing after - '{tag}'" @@ -143,27 +143,44 @@ def val_error_sidecar_key_missing(invalid_key, category_keys): return f"Category key '{invalid_key}' does not exist in column. Valid keys are: {category_keys}" -@hed_tag_error(ValidationErrors.HED_DEF_UNMATCHED) -def val_error_def_unmatched(tag): - return f"A data-recording’s Def tag cannot be matched to definition. 
Tag: '{tag}'" -@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID) +@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID, actual_code=ValidationErrors.DEF_EXPAND_INVALID) def val_error_bad_def_expand(tag, actual_def, found_def): return f"A data-recording’s Def-expand tag does not match the given definition." + \ f"Tag: '{tag}'. Actual Def: {actual_def}. Found Def: {found_def}" -@hed_tag_error(ValidationErrors.HED_DEF_VALUE_MISSING, actual_code=ValidationErrors.HED_DEF_VALUE_INVALID) +@hed_tag_error(ValidationErrors.HED_DEF_UNMATCHED, actual_code=ValidationErrors.DEF_INVALID) +def val_error_def_unmatched(tag): + return f"A data-recording’s Def tag cannot be matched to definition. Tag: '{tag}'" + + +@hed_tag_error(ValidationErrors.HED_DEF_VALUE_MISSING, actual_code=ValidationErrors.DEF_INVALID) def val_error_def_value_missing(tag): return f"A def tag requires a placeholder value, but was not given one. Definition: '{tag}'" -@hed_tag_error(ValidationErrors.HED_DEF_VALUE_EXTRA, actual_code=ValidationErrors.HED_DEF_VALUE_INVALID) +@hed_tag_error(ValidationErrors.HED_DEF_VALUE_EXTRA, actual_code=ValidationErrors.DEF_INVALID) def val_error_def_value_extra(tag): return f"A def tag does not take a placeholder value, but was given one. Definition: '{tag}" +@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_UNMATCHED, actual_code=ValidationErrors.DEF_EXPAND_INVALID) +def val_error_def_expand_unmatched(tag): + return f"A data-recording’s Def-expand tag cannot be matched to definition. Tag: '{tag}'" + + +@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_VALUE_MISSING, actual_code=ValidationErrors.DEF_EXPAND_INVALID) +def val_error_def_expand_value_missing(tag): + return f"A Def-expand tag requires a placeholder value, but was not given one. 
Definition: '{tag}'" + + +@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_VALUE_EXTRA, actual_code=ValidationErrors.DEF_EXPAND_INVALID) +def val_error_def_expand_value_extra(tag): + return f"A Def-expand tag does not take a placeholder value, but was given one. Definition: '{tag}" + + @hed_tag_error(ValidationErrors.HED_TOP_LEVEL_TAG, actual_code=ValidationErrors.HED_TAG_GROUP_ERROR) def val_error_top_level_tag(tag): return f"A tag that must be in a top level group was found in another location. {str(tag)}" @@ -342,3 +359,26 @@ def onset_wrong_placeholder(tag, has_placeholder): if has_placeholder: return f"Onset/offset def tag {tag} expects a placeholder value, but does not have one." return f"Onset/offset def tag {tag} should not have a placeholder, but has one." + + +@hed_error(ColumnErrors.INVALID_COLUMN_REF) +def invalid_column_ref(bad_refs): + return f"Bad column references found(columns do not exist): {bad_refs}" + + +@hed_error(ColumnErrors.SELF_COLUMN_REF) +def self_column_ref(self_ref): + return f"Column references itself: {self_ref}" + + +@hed_error(ColumnErrors.NESTED_COLUMN_REF) +def nested_column_ref(column_name, ref_column): + return f"Column {column_name} has a nested reference to {ref_column}. " \ + f"Column reference columns cannot contain other column references." + + +@hed_error(ColumnErrors.MALFORMED_COLUMN_REF) +def nested_column_ref(column_name, index, symbol): + return f"Column {column_name} has a malformed column reference. Improper symbol {symbol} found at index {index}." 
+ + diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py index 4a7fd91a9..cb1a959d5 100644 --- a/hed/errors/error_reporter.py +++ b/hed/errors/error_reporter.py @@ -10,6 +10,27 @@ error_functions = {} +# Controls if the default issue printing skips adding indentation for this context +no_tab_context = {ErrorContext.HED_STRING, ErrorContext.SCHEMA_ATTRIBUTE} + +# Default sort ordering for issues list +default_sort_list = [ + ErrorContext.CUSTOM_TITLE, + ErrorContext.FILE_NAME, + ErrorContext.SIDECAR_COLUMN_NAME, + ErrorContext.SIDECAR_KEY_NAME, + ErrorContext.ROW, + ErrorContext.COLUMN, + ErrorContext.HED_STRING, + ErrorContext.SCHEMA_SECTION, + ErrorContext.SCHEMA_TAG, + ErrorContext.SCHEMA_ATTRIBUTE, +] + +# ErrorContext which is expected to be int based. +int_sort_list = [ + ErrorContext.ROW, +] def _register_error_function(error_type, wrapper_func): if error_type in error_functions: @@ -153,19 +174,23 @@ def __init__(self, check_for_warnings=True): self.error_context = [] self._check_for_warnings = check_for_warnings - def push_error_context(self, context_type, context, increment_depth_after=True): + def push_error_context(self, context_type, context): """ Push a new error context to narrow down error scope. Parameters: context_type (ErrorContext): A value from ErrorContext representing the type of scope. context (str, int, or HedString): The main value for the context_type. - increment_depth_after (bool): If True, add an extra tab to any subsequent errors in the scope. Notes: The context depends on the context_type. For ErrorContext.FILE_NAME this would be the actual filename. """ - self.error_context.append((context_type, context, increment_depth_after)) + if context is None: + if context_type in int_sort_list: + context = 0 + else: + context_type = "" + self.error_context.append((context_type, context)) def pop_error_context(self): """ Remove the last scope from the error context. 
@@ -292,8 +317,8 @@ def _add_context_to_errors(error_object, error_context_to_add): """ if error_object is None: error_object = {} - for (context_type, context, increment_count) in error_context_to_add: - error_object[context_type] = (context, increment_count) + for (context_type, context) in error_context_to_add: + error_object[context_type] = context return error_object @@ -330,7 +355,7 @@ def _get_tag_span_to_error_object(error_object): else: return None, None - hed_string = error_object[ErrorContext.HED_STRING][0] + hed_string = error_object[ErrorContext.HED_STRING] span = hed_string._get_org_span(source_tag) return span @@ -385,6 +410,7 @@ def filter_issues_by_severity(issues_list, severity): def get_exception_issue_string(issues, title=None): """ Return a string with issues list flatted into single string, one issue per line. + Possibly being deprecated. Parameters: issues (list): A list of strings containing issues to print. @@ -410,6 +436,29 @@ def get_exception_issue_string(issues, title=None): return issue_str +def sort_issues(issues, reverse=False): + """Sorts a list of issues by the error context values. + + Parameters: + issues (list): A list of dictionaries representing the issues to be sorted. + reverse (bool, optional): If True, sorts the list in descending order. Default is False. + + Returns: + list: The sorted list of issues.""" + def _get_keys(d): + result = [] + for key in default_sort_list: + if key in int_sort_list: + result.append(d.get(key, -1)) + else: + result.append(d.get(key, "")) + return tuple(result) + + issues = sorted(issues, key=_get_keys, reverse=reverse) + + return issues + + def get_printable_issue_string(issues, title=None, severity=None, skip_filename=True): """ Return a string with issues list flatted into single string, one per line. 
@@ -471,7 +520,7 @@ def _get_context_from_issue(val_issue, skip_filename=True): if skip_filename and key == ErrorContext.FILE_NAME: continue if key.startswith("ec_"): - single_issue_context.append((key, *val_issue[key])) + single_issue_context.append((key, val_issue[key])) return single_issue_context @@ -512,7 +561,7 @@ def _get_context_string(single_issue_context, last_used_context): """ Convert a single context list into the final human readable output form. Parameters: - single_issue_context (list): A list of tuples containing the context(context_type, context, increment_tab) + single_issue_context (list): A list of tuples containing the context(context_type, context) last_used_context (list): A list of tuples containing the last drawn context. Returns: @@ -528,18 +577,18 @@ def _get_context_string(single_issue_context, last_used_context): tab_count = 0 found_difference = False for i, context_tuple in enumerate(single_issue_context): - (context_type, context, increment_tab) = context_tuple + (context_type, context) = context_tuple if len(last_used_context) > i and not found_difference: last_drawn = last_used_context[i] # Was drawn, and hasn't changed. 
if last_drawn == context_tuple: - if increment_tab: + if context_type not in no_tab_context: tab_count += 1 continue context_string += _format_single_context_string(context_type, context, tab_count) found_difference = True - if increment_tab: + if context_type not in no_tab_context: tab_count += 1 tab_string = '\t' * tab_count diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index ac76f6992..c4fb5df5f 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -21,11 +21,22 @@ class ErrorContext: class ValidationErrors: # General validation errors - HED_CHARACTER_INVALID = 'HED_CHARACTER_INVALID' - HED_COMMA_MISSING = 'HED_COMMA_MISSING' + CHARACTER_INVALID = 'CHARACTER_INVALID' + COMMA_MISSING = 'COMMA_MISSING' + DEF_EXPAND_INVALID = "DEF_EXPAND_INVALID" + DEF_INVALID = "DEF_INVALID" + + # NOT OFFICIAL HED_DEF_UNMATCHED = "HED_DEF_UNMATCHED" + HED_DEF_VALUE_MISSING = "HED_DEF_VALUE_MISSING" + HED_DEF_VALUE_EXTRA = "HED_DEF_VALUE_EXTRA" + HED_DEF_EXPAND_INVALID = "HED_DEF_EXPAND_INVALID" - HED_DEF_VALUE_INVALID = "HED_DEF_VALUE_INVALID" + HED_DEF_EXPAND_UNMATCHED = "HED_DEF_EXPAND_UNMATCHED" + HED_DEF_EXPAND_VALUE_MISSING = "HED_DEF_EXPAND_VALUE_MISSING" + HED_DEF_EXPAND_VALUE_EXTRA = "HED_DEF_EXPAND_VALUE_EXTRA" + # END NOT OFFICIAL + HED_DEFINITION_INVALID = "HED_DEFINITION_INVALID" HED_NODE_NAME_EMPTY = 'HED_NODE_NAME_EMPTY' HED_ONSET_OFFSET_ERROR = 'HED_ONSET_OFFSET_ERROR' @@ -70,8 +81,7 @@ class ValidationErrors: HED_MULTIPLE_TOP_TAGS = "HED_MULTIPLE_TOP_TAGS" HED_TAG_GROUP_TAG = "HED_TAG_GROUP_TAG" - HED_DEF_VALUE_MISSING = "HED_DEF_VALUE_MISSING" - HED_DEF_VALUE_EXTRA = "HED_DEF_VALUE_EXTRA" + class SidecarErrors: @@ -117,3 +127,10 @@ class OnsetErrors: ONSET_PLACEHOLDER_WRONG = "ONSET_PLACEHOLDER_WRONG" ONSET_TOO_MANY_DEFS = "ONSET_TOO_MANY_DEFS" ONSET_TAG_OUTSIDE_OF_GROUP = "ONSET_TAG_OUTSIDE_OF_GROUP" + + +class ColumnErrors: + INVALID_COLUMN_REF = "INVALID_COLUMN_REF" + SELF_COLUMN_REF = "SELF_COLUMN_REF" + 
NESTED_COLUMN_REF = "NESTED_COLUMN_REF" + MALFORMED_COLUMN_REF = "MALFORMED_COLUMN_REF" diff --git a/hed/models/base_input.py b/hed/models/base_input.py index af6249f56..f0e4209c2 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -354,33 +354,45 @@ def _dataframe_has_names(dataframe): return True return False - def assemble(self, mapper=None): + def assemble(self, mapper=None, skip_square_brackets=False): """ Assembles the hed strings Parameters: mapper(ColumnMapper or None): Generally pass none here unless you want special behavior. - + skip_square_brackets (bool): If True, don't plug in square bracket values into columns. Returns: Dataframe: the assembled dataframe """ if mapper is None: mapper = self._mapper + all_columns = self._handle_transforms(mapper) + if skip_square_brackets: + return all_columns + transformers, _ = mapper.get_transformers() + + return self._handle_square_brackets(all_columns, list(transformers)) + + def _handle_transforms(self, mapper): transformers, need_categorical = mapper.get_transformers() - if not transformers: - return self._dataframe - all_columns = self._dataframe - if need_categorical: - all_columns[need_categorical] = all_columns[need_categorical].astype('category') + if transformers: + all_columns = self._dataframe + if need_categorical: + all_columns[need_categorical] = all_columns[need_categorical].astype('category') - all_columns = all_columns.transform(transformers) + all_columns = all_columns.transform(transformers) + + if need_categorical: + all_columns[need_categorical] = all_columns[need_categorical].astype('str') + else: + all_columns = self._dataframe - return self._insert_columns(all_columns, list(transformers.keys())) + return all_columns @staticmethod - def _find_column_refs(df): + def _find_column_refs(df, column_names): found_column_references = [] - for column_name in df: + for column_name in column_names: df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE) u_vals = 
pd.Series([j for i in df_temp if isinstance(i, list) for j in i], dtype=str) u_vals = u_vals.unique() @@ -391,21 +403,23 @@ def _find_column_refs(df): return found_column_references @staticmethod - def _insert_columns(df, known_columns=None): - if known_columns is None: - known_columns = list(df.columns) - possible_column_references = [f"{column_name}" for column_name in df.columns if + def _handle_square_brackets(df, known_columns=None): + """ + Plug in square brackets with other columns + + If known columns is passed, only use those columns to find or replace references. + """ + if known_columns is not None: + column_names = list(known_columns) + else: + column_names = list(df.columns) + possible_column_references = [f"{column_name}" for column_name in column_names if isinstance(column_name, str) and column_name.lower() != "hed"] - found_column_references = BaseInput._find_column_refs(df) + found_column_references = BaseInput._find_column_refs(df, column_names) - invalid_replacements = [col for col in found_column_references if col not in possible_column_references] - if invalid_replacements: - # todo: This check may be moved to validation - raise ValueError(f"Bad column references found(columns do not exist): {invalid_replacements}") valid_replacements = [col for col in found_column_references if col in possible_column_references] # todo: break this into a sub function(probably) - column_names = known_columns for column_name in valid_replacements: column_names.remove(column_name) saved_columns = df[valid_replacements] diff --git a/hed/models/definition_dict.py b/hed/models/definition_dict.py index ca3b06b34..04cbfc440 100644 --- a/hed/models/definition_dict.py +++ b/hed/models/definition_dict.py @@ -184,7 +184,9 @@ def construct_def_tag(self, hed_tag): hed_tag(HedTag): The hed tag to identify definition contents in """ if hed_tag.short_base_tag in {DefTagNames.DEF_ORG_KEY, DefTagNames.DEF_EXPAND_ORG_KEY}: + save_parent = hed_tag._parent def_contents = 
self._get_definition_contents(hed_tag) + hed_tag._parent = save_parent if def_contents is not None: hed_tag._expandable = def_contents hed_tag._expanded = hed_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY diff --git a/hed/models/definition_entry.py b/hed/models/definition_entry.py index cb7581fa3..27c89d33b 100644 --- a/hed/models/definition_entry.py +++ b/hed/models/definition_entry.py @@ -26,13 +26,14 @@ def __init__(self, name, contents, takes_value, source_context): if contents: add_group_to_dict(contents, self.tag_dict) - def get_definition(self, replace_tag, placeholder_value=None): + def get_definition(self, replace_tag, placeholder_value=None, return_copy_of_tag=False): """ Return a copy of the definition with the tag expanded and the placeholder plugged in. Parameters: replace_tag (HedTag): The def hed tag to replace with an expanded version placeholder_value (str or None): If present and required, will replace any pound signs in the definition contents. + return_copy_of_tag(bool): Set to true for validation Returns: str: The expanded def tag name @@ -45,6 +46,8 @@ def get_definition(self, replace_tag, placeholder_value=None): if self.takes_value == (placeholder_value is None): return None, [] + if return_copy_of_tag: + replace_tag = replace_tag.copy() output_contents = [replace_tag] name = self.name if self.contents: diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py index 6df911801..7273d956c 100644 --- a/hed/models/hed_group.py +++ b/hed/models/hed_group.py @@ -132,9 +132,6 @@ def copy(self): Returns: HedGroup: The copied group. - Notes: - - The parent tag is removed. 
- """ save_parent = self._parent self._parent = None diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index db16833f1..7be20fb5d 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -96,6 +96,7 @@ def shrink_defs(self): expanded_parent = def_expand_group._parent if expanded_parent: def_expand_tag.short_base_tag = DefTagNames.DEF_ORG_KEY + def_expand_tag._parent = expanded_parent expanded_parent.replace(def_expand_group, def_expand_tag) return self @@ -118,6 +119,7 @@ def expand_defs(self): for tag, group in replacements: tag_parent = tag._parent tag_parent.replace(tag, group) + tag._parent = group tag.short_base_tag = DefTagNames.DEF_EXPAND_KEY return self diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 29bcf8cf6..689eeac1d 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -54,6 +54,19 @@ def __init__(self, hed_string, span=None, hed_schema=None, def_dict=None): if def_dict: def_dict.construct_def_tag(self) + def copy(self): + """ Return a deep copy of this tag. + + Returns: + HedTag: The copied group. + + """ + save_parent = self._parent + self._parent = None + return_copy = copy.deepcopy(self) + self._parent = save_parent + return return_copy + @property def schema_prefix(self): """ Library prefix for this tag if one exists. 
diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 280eba77d..958cadfba 100644 --- a/hed/models/sidecar.py +++ b/hed/models/sidecar.py @@ -255,8 +255,7 @@ def extract_definitions(self, hed_schema=None, error_handler=None): if hed_schema: for hed_string, column_data, _ in self.hed_string_iter(error_handler): hed_string_obj = HedString(hed_string, hed_schema) - error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, - increment_depth_after=False) + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) self._extract_definition_issues += def_dict.check_for_definitions(hed_string_obj, error_handler) error_handler.pop_error_context() diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 84c2accbf..c0b821723 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -60,7 +60,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl for attribute_name in tag_entry.attributes: validator = schema_attribute_validators.get(attribute_name) if validator: - error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name, False) + error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) new_issues = validator(hed_schema, tag_entry, tag_entry.attributes[attribute_name]) error_handler.add_context_and_filter(new_issues) issues_list += new_issues diff --git a/hed/validator/def_validator.py b/hed/validator/def_validator.py index 24a3d8e5b..5b18cd466 100644 --- a/hed/validator/def_validator.py +++ b/hed/validator/def_validator.py @@ -51,7 +51,7 @@ def _validate_def_contents(self, def_tag, def_expand_group): issues """ def_issues = [] - + is_def_tag = def_expand_group is not def_tag is_label_tag = def_tag.extension_or_value_portion placeholder = None found_slash = is_label_tag.find("/") @@ -62,17 +62,27 @@ def _validate_def_contents(self, def_tag, def_expand_group): label_tag_lower = is_label_tag.lower() def_entry 
= self.defs.get(label_tag_lower) if def_entry is None: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_UNMATCHED, tag=def_tag) + error_code = ValidationErrors.HED_DEF_UNMATCHED + if is_def_tag: + error_code = ValidationErrors.HED_DEF_EXPAND_UNMATCHED + def_issues += ErrorHandler.format_error(error_code, tag=def_tag) else: - def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder) + def_tag_name, def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder, + return_copy_of_tag=True) if def_tag_name: - if def_expand_group is not def_tag and def_expand_group != def_contents: + if is_def_tag and def_expand_group != def_contents: def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID, tag=def_tag, actual_def=def_contents, found_def=def_expand_group) elif def_entry.takes_value: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_MISSING, tag=def_tag) + error_code = ValidationErrors.HED_DEF_VALUE_MISSING + if is_def_tag: + error_code = ValidationErrors.HED_DEF_EXPAND_VALUE_MISSING + def_issues += ErrorHandler.format_error(error_code, tag=def_tag) else: - def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_VALUE_EXTRA, tag=def_tag) + error_code = ValidationErrors.HED_DEF_VALUE_EXTRA + if is_def_tag: + error_code = ValidationErrors.HED_DEF_EXPAND_VALUE_EXTRA + def_issues += ErrorHandler.format_error(error_code, tag=def_tag) - return def_issues + return def_issues \ No newline at end of file diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py index af12005b1..daa71fb07 100644 --- a/hed/validator/sidecar_validator.py +++ b/hed/validator/sidecar_validator.py @@ -4,6 +4,7 @@ from hed import HedString from hed import Sidecar from hed.models.column_metadata import ColumnMetadata +from hed.errors.error_reporter import sort_issues class SidecarValidator: @@ -49,8 +50,7 @@ def validate(self, sidecar, 
extra_def_dicts=None, name=None, error_handler=None) for hed_string, column_data, position in sidecar.hed_string_iter(error_handler): hed_string_obj = HedString(hed_string, hed_schema=self._schema, def_dict=sidecar_def_dict) - error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj, - increment_depth_after=False) + error_handler.push_error_context(ErrorContext.HED_STRING, hed_string_obj) new_issues = hed_validator.run_basic_checks(hed_string_obj, allow_placeholders=True) if not new_issues: new_issues = hed_validator.run_full_string_checks(hed_string_obj) @@ -61,6 +61,7 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None) error_handler.pop_error_context() error_handler.pop_error_context() + issues = sort_issues(issues) return issues def validate_structure(self, sidecar, error_handler): diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index ba1f341ac..8b8aa9b1f 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -1,9 +1,12 @@ import pandas as pd +import re from hed import BaseInput from hed.errors import ErrorHandler, ValidationErrors, ErrorContext +from hed.errors.error_types import ColumnErrors from hed.models import ColumnType from hed import HedString from hed.models.hed_string_group import HedStringGroup +from hed.errors.error_reporter import sort_issues PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " @@ -25,6 +28,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): Parameters: data (BaseInput or pd.DataFrame): Input data to be validated. + If a dataframe, it is assumed to be assembled already. def_dicts(list of DefDict or DefDict): all definitions to use for validation name(str): The name to report errors from this file as error_handler (ErrorHandler): Error context to use. 
Creates a new one if None @@ -41,31 +45,32 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): # Check the structure of the input data, if it's a BaseInput if isinstance(data, BaseInput): issues += self._validate_column_structure(data, error_handler) - # todo ian: Add more checks here for column inserters + issues += self._validate_square_brackets(data.assemble(skip_square_brackets=True), error_handler) data = data.dataframe_a # Check the rows of the input data issues += self._run_checks(data, error_handler) error_handler.pop_error_context() + + issues = sort_issues(issues) return issues def _run_checks(self, data, error_handler): issues = [] + columns = list(data.columns) for row_number, text_file_row in enumerate(data.itertuples(index=False)): error_handler.push_error_context(ErrorContext.ROW, row_number) row_strings = [] new_column_issues = [] - # todo: make this report the correct column numbers(somehow - it almost surely doesn't right now) for column_number, cell in enumerate(text_file_row): if not cell or cell == "n/a": continue - error_handler.push_error_context(ErrorContext.COLUMN, column_number) + error_handler.push_error_context(ErrorContext.COLUMN, columns[column_number]) column_hed_string = HedString(cell) row_strings.append(column_hed_string) - error_handler.push_error_context(ErrorContext.HED_STRING, column_hed_string, - increment_depth_after=False) + error_handler.push_error_context(ErrorContext.HED_STRING, column_hed_string) new_column_issues = self._hed_validator.run_basic_checks(column_hed_string, allow_placeholders=False) error_handler.add_context_and_filter(new_column_issues) @@ -77,7 +82,7 @@ def _run_checks(self, data, error_handler): continue else: row_string = HedStringGroup(row_strings) - error_handler.push_error_context(ErrorContext.HED_STRING, row_string, increment_depth_after=False) + error_handler.push_error_context(ErrorContext.HED_STRING, row_string) new_column_issues = 
self._hed_validator.run_full_string_checks(row_string) error_handler.add_context_and_filter(new_column_issues) @@ -113,3 +118,85 @@ def _validate_column_structure(self, base_input, error_handler): error_handler.pop_error_context() return issues + + @staticmethod + def _validate_column_refs(df, error_handler): + possible_column_references = [f"{column_name}" for column_name in df.columns if + isinstance(column_name, str) and column_name.lower() != "hed"] + + issues = [] + found_column_references = {} + for column_name in df: + matches = df[column_name].str.findall("\[([a-z_\-\s0-9]+)(? Date: Thu, 23 Mar 2023 18:32:37 -0500 Subject: [PATCH 17/19] Block HED from appearing in sidecars (#635) * Block HED from appearing in sidecars --- hed/errors/error_messages.py | 7 ++++++- hed/errors/error_types.py | 3 ++- hed/validator/sidecar_validator.py | 23 ++++++++++++++++++++++- spec_tests/test_errors.py | 24 ++++++++++++++++++++---- 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index ca379992f..7fd609a64 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -277,7 +277,12 @@ def sidecar_error_unknown_column(column_name): @hed_error(SidecarErrors.SIDECAR_HED_USED, actual_code=SidecarErrors.SIDECAR_INVALID) -def sidecar_hed_used(): +def SIDECAR_HED_USED(): + return "'HED' is a reserved name and cannot be used as a sidecar except in expected places." 
+ + +@hed_error(SidecarErrors.SIDECAR_HED_USED_COLUMN, actual_code=SidecarErrors.SIDECAR_INVALID) +def SIDECAR_HED_USED_COLUMN(): return "'HED' is a reserved name and cannot be used as a sidecar column name" diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index c4fb5df5f..272bfe299 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -92,8 +92,9 @@ class SidecarErrors: INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns' INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns' UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn' - SIDECAR_HED_USED = 'SIDECAR_HED_USED' + SIDECAR_HED_USED_COLUMN = 'SIDECAR_HED_USED_COLUMN' SIDECAR_NA_USED = 'SIDECAR_NA_USED' + SIDECAR_HED_USED = 'SIDECAR_HED_USED' class SchemaErrors: HED_SCHEMA_DUPLICATE_NODE = 'HED_SCHEMA_DUPLICATE_NODE' diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py index daa71fb07..8c68808e8 100644 --- a/hed/validator/sidecar_validator.py +++ b/hed/validator/sidecar_validator.py @@ -81,6 +81,23 @@ def validate_structure(self, sidecar, error_handler): error_handler.pop_error_context() return all_validation_issues + @staticmethod + def _check_for_key(key, data): + if isinstance(data, dict): + if key in data: + return bool(data[key]) + else: + for sub_data in data.values(): + result = SidecarValidator._check_for_key(key, sub_data) + if result is not None: + return result + elif isinstance(data, list): + for sub_data in data: + result = SidecarValidator._check_for_key(key, sub_data) + if result is not None: + return result + return None + def _validate_column_structure(self, column_name, dict_for_entry, error_handler): """ Checks primarily for type errors such as expecting a string and getting a list in a json sidecar. 
@@ -93,13 +110,17 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler) """ val_issues = [] if column_name in self.reserved_column_names: - val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED) + val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN) return val_issues column_type = Sidecar._detect_column_type(dict_for_entry=dict_for_entry) if column_type is None: val_issues += error_handler.format_error_with_context(SidecarErrors.UNKNOWN_COLUMN_TYPE, column_name=column_name) + elif column_type == ColumnType.Ignore: + found_hed = self._check_for_key("HED", dict_for_entry) + if found_hed: + val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED) elif column_type == ColumnType.Categorical: raw_hed_dict = dict_for_entry["HED"] if not raw_hed_dict: diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index 9c80d4d98..81942a915 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -11,6 +11,13 @@ from hed.errors import ErrorHandler, get_printable_issue_string +known_errors = [ + 'SIDECAR_INVALID', + 'CHARACTER_INVALID', + 'COMMA_MISSING', + "DEF_EXPAND_INVALID", + "DEF_INVALID", +] skip_tests = ["VERSION_DEPRECATED", "CHARACTER_INVALID", "STYLE_WARNING"] @@ -30,6 +37,12 @@ def run_single_test(self, test_file): test_info = json.load(fp) for info in test_info: error_code = info['error_code'] + verify_code = False + if error_code in known_errors: + verify_code = True + + # To be deprecated once we add this to all tests + self._verify_code = verify_code if error_code in skip_tests: print(f"Skipping {error_code} test") continue @@ -62,6 +75,13 @@ def report_result(self, expected_result, issues, error_code, description, name, print(f"Passed '{test_type}' (which should fail) '{name}': {test}") print(get_printable_issue_string(issues)) self.fail_count.append(name) + elif self._verify_code: + if any(issue['code'] == 
error_code for issue in issues): + return + print(f"{error_code}: {description}") + print(f"Failed '{test_type}' (unexpected errors found) '{name}': {test}") + print(get_printable_issue_string(issues)) + self.fail_count.append(name) else: if issues: print(f"{error_code}: {description}") @@ -75,9 +95,6 @@ def _run_single_string_test(self, info, schema, def_dict, error_code, descriptio for test in tests: test_string = HedString(test, schema) - # This expand should not be required here. - def_dict.expand_def_tags(test_string) - issues = string_validator.run_basic_checks(test_string, False) issues += string_validator.run_full_string_checks(test_string) error_handler.add_context_and_filter(issues) @@ -86,7 +103,6 @@ def _run_single_string_test(self, info, schema, def_dict, error_code, descriptio def _run_single_sidecar_test(self, info, schema, def_dict, error_code, description, name, error_handler): for result, tests in info.items(): for test in tests: - # Well this is a disaster buffer = io.BytesIO(json.dumps(test).encode("utf-8")) sidecar = Sidecar(buffer) issues = sidecar.validate(hed_schema=schema, extra_def_dicts=def_dict, error_handler=error_handler) From 69f320d4eb91d6a1b8700140699f9100f7be6469 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Mon, 27 Mar 2023 14:48:41 -0500 Subject: [PATCH 18/19] Updated the search in analysis tools --- hed/models/df_util.py | 6 +- hed/tools/analysis/analysis_util.py | 68 ++++++++++++++++++- .../operations/factor_hed_tags_op.py | 25 ++----- .../test_analysis_util_assemble_hed.py | 50 +++++++------- 4 files changed, 98 insertions(+), 51 deletions(-) diff --git a/hed/models/df_util.py b/hed/models/df_util.py index f9fa19dcc..989299d2f 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -26,7 +26,7 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ expand_defs: bool Expand any def tags found Returns: - tuple: A list of HedStrings, or a list of 
lists of HedStrings, DefinitionDict + tuple: A list of HedStrings or a list of lists of HedStrings, DefinitionDict """ if isinstance(sidecar, str): @@ -76,13 +76,13 @@ def convert_to_form(df, hed_schema, tag_form, columns=None): def shrink_defs(df, hed_schema, columns=None): - """ Shrinks any def-expand tags found in the dataframe. + """ Shrinks any def-expand tags found in the specified columns in the dataframe. Converts in place Parameters: df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema or None): The schema to use to identify defs. - columns (list or None): The columns to modify on the dataframe + columns (list or None): The columns to modify on the dataframe. """ if isinstance(df, pd.Series): mask = df.str.contains('Def-expand/', case=False) diff --git a/hed/tools/analysis/analysis_util.py b/hed/tools/analysis/analysis_util.py index a4c57c9f6..aa13f288d 100644 --- a/hed/tools/analysis/analysis_util.py +++ b/hed/tools/analysis/analysis_util.py @@ -6,6 +6,7 @@ from hed.models.hed_tag import HedTag from hed.models.hed_group import HedGroup from hed.models import df_util +from hed.models import QueryParser def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False): @@ -44,6 +45,68 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs return df, definitions +def get_expression_parsers(queries, query_names=None): + """ Returns a list of expression parsers and query_names. + + Parameters: + queries (list): A list of query strings or QueryParser objects + query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc. + + Returns: + DataFrame - containing the search strings + + Raises: + ValueError - if query names are invalid or duplicated. 
+ + """ + expression_parsers = [] + if not query_names: + query_names = [f"query_{index}" for index in range(len(queries))] + elif len(queries) != len(query_names): + raise ValueError("QueryNamesLengthBad", + f"The query_names length {len(query_names)} must be empty or equal" + + f"to the queries length {len(queries)}.") + elif len(set(query_names)) != len(query_names): + raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates") + for index, query in enumerate(queries): + if not query: + raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty") + elif isinstance(query, str): + try: + next_query = QueryParser(query) + except Exception: + raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") + else: + next_query = query + expression_parsers.append(next_query) + return expression_parsers, query_names + + +def search_strings(hed_strings, queries, query_names=None): + """ Returns a DataFrame of factors based on results of queries. + + Parameters: + hed_strings (list): A list of HedString objects (empty entries or None entries are 0's) + queries (list): A list of query strings or QueryParser objects + query_names (list): A list of column names for results of queries. If missing --- query_1, query_2, etc. + + Returns: + DataFrame - containing the factor vectors with results of the queries + + Raises: + ValueError - if query names are invalid or duplicated. + + """ + + expression_parsers, query_names = get_expression_parsers(queries, query_names=query_names) + df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names) + for parse_ind, parser in enumerate(expression_parsers): + for index, next_item in enumerate(hed_strings): + match = parser.search(next_item) + if match: + df_factors.at[index, query_names[parse_ind]] = 1 + return df_factors + # def get_assembled_strings(table, hed_schema=None, expand_defs=False): # """ Return HED string objects for a tabular file. 
# @@ -61,7 +124,7 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs # return hed_list # -# def search_tabular(data_input, hed_schema, query, columns_included=None): +# def search_tabular(data_input, sidecar, hed_schema, query, extra_def_dicts=None, columns_included=None): # """ Return a dataframe with results of query. # # Parameters: @@ -76,7 +139,8 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs # """ # # eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) -# hed_list = get_assembled_strings(data_input, hed_schema=hed_schema, expand_defs=True) +# hed_list, definitions = df_util.get_assembled(data_input, sidecar, hed_schema, extra_def_dicts=None, join_columns=True, +# shrink_defs=False, expand_defs=True) # expression = QueryParser(query) # hed_tags = [] # row_numbers = [] diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index 930f1353f..ae1f35e63 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -8,6 +8,7 @@ from hed.models.sidecar import Sidecar from hed.models.expression_parser import QueryParser from hed.models.df_util import get_assembled +from hed.tools.analysis.analysis_util import get_expression_parsers, search_strings class FactorHedTagsOp(BaseOp): @@ -65,21 +66,8 @@ def __init__(self, parameters): self.queries = parameters['queries'] self.query_names = parameters['query_names'] self.remove_types = parameters['remove_types'] - if not self.query_names: - self.query_names = [f"query_{index}" for index in range(len(self.queries))] - elif len(self.queries) != len(self.query_names): - raise ValueError("QueryNamesLengthBad", - f"The query_names length {len(self.query_names)} must be empty or equal" + - f"to the queries length {len(self.queries)} .") - elif len(set(self.query_names)) != 
len(self.query_names): - raise ValueError("DuplicateQueryNames", f"The query names {str(self.query_names)} list has duplicates") - self.expression_parsers = [] - for index, query in enumerate(self.queries): - try: - next_query = QueryParser(query) - except Exception: - raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") - self.expression_parsers.append(next_query) + self.expression_parsers, self.query_names = get_expression_parsers(self.queries, + query_names=parameters['query_names']) def do_op(self, dispatcher, df, name, sidecar=None): """ Factor the column using HED tag queries. @@ -111,12 +99,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): df_list = [input_data.dataframe] hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None, join_columns=True, shrink_defs=False, expand_defs=True) - df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=self.query_names) - for parse_ind, parser in enumerate(self.expression_parsers): - for index, next_item in enumerate(hed_strings): - match = parser.search(next_item) - if match: - df_factors.at[index, self.query_names[parse_ind]] = 1 + df_factors = search_strings(hed_strings, self.expression_parsers, query_names=self.query_names) if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) diff --git a/tests/tools/analysis/test_analysis_util_assemble_hed.py b/tests/tools/analysis/test_analysis_util_assemble_hed.py index 318c3aa54..75d143659 100644 --- a/tests/tools/analysis/test_analysis_util_assemble_hed.py +++ b/tests/tools/analysis/test_analysis_util_assemble_hed.py @@ -3,9 +3,8 @@ from pandas import DataFrame from hed import schema as hedschema from hed.models import Sidecar, TabularInput, DefinitionDict -from hed.tools.analysis.analysis_util import assemble_hed - - +from hed.models import df_util +from hed.tools.analysis.analysis_util import assemble_hed, search_strings # noinspection PyBroadException @@ 
-25,7 +24,6 @@ def setUpClass(cls): schema = hedschema.load_schema(schema_path) cls.schema = schema sidecar1 = Sidecar(json_path, name='face_sub1_json') - cls.sidecar_path = sidecar1 cls.sidecar1 = sidecar1 cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar") @@ -96,27 +94,29 @@ def test_assemble_hed_bad_column_no_expand(self): self.assertNotEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") self.assertEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags") - # def test_search_tabular(self): - # query1 = "sensory-event" - # df1 = search_tabular(self.input_data, self.schema, query1, columns_included=None) - # self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.") - # self.assertEqual(len(df1.columns), 2, "search_tabular has the right number of columns when query okay") - # self.assertEqual(len(df1.index), 155, "search_tabular has right number of rows when query okay") - # query2 = 'data-feature' - # df2 = search_tabular(self.input_data, self.hed_schema, query2, columns_included=None) - # self.assertFalse(df2, "search_tabular returns None when query is not satisfied.") - # - # query3 = "sensory-event" - # df3 = search_tabular(self.input_data, self.hed_schema, query3, columns_included=['event_type', 'rep_status']) - # self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") - # self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") - # self.assertEqual(len(df3.index), 155, "search_tabular has right number of rows when query okay") - # - # df4 = search_tabular(self.input_data, self.hed_schema, query3, - # columns_included=['onset', 'event_type', 'rep_status']) - # self.assertIsInstance(df4, DataFrame, "search_tabular returns a DataFrame when 
extra columns") - # self.assertEqual(len(df4.columns), 4, "search_tabular returns right number of columns when extra columns") - # self.assertEqual(len(df4.index), 155, "search_tabular has right number of rows when query okay") + def test_search_strings(self): + hed_strings, dict1 = df_util.get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=False, expand_defs=True) + queries1 = ["sensory-event"] + query_names1 = ["sensory"] + df1 = search_strings(hed_strings, queries1, query_names1) + self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.") + self.assertEqual(len(df1.columns), 1, "search_tabular has the right number of columns when query okay") + self.assertEqual(len(df1.index), 200, "search_tabular has right number of rows when query okay") + queries2 = ['data-feature', "sensory-event"] + query_names2 = ['data', 'sensory'] + df2 = search_strings(hed_strings, queries2, query_names2) + self.assertEqual(len(df2.columns), 2, "search_tabular has the right number of columns when query okay") + self.assertEqual(len(df2.index), 200, "search_tabular has right number of rows when query okay") + totals = df2.sum(axis=0) + self.assertFalse(totals.loc['data']) + self.assertEqual(totals.loc['sensory'], 155) + queries3 = ['image', "sensory-event", "face"] + query_names3 = ['image', 'sensory', "faced"] + df3 = search_strings(hed_strings, queries3, query_names3) + self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") + self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") + self.assertEqual(len(df3.index), 200, "search_tabular has right number of rows when query okay") if __name__ == '__main__': From 6708094ed6bfc61472f9660a714cbea26c3df672 Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 29 Mar 2023 19:06:01 -0500 Subject: [PATCH 19/19] Fix sorting for hed string context --- 
hed/errors/error_reporter.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py index cb1a959d5..836ac2c4f 100644 --- a/hed/errors/error_reporter.py +++ b/hed/errors/error_reporter.py @@ -21,7 +21,6 @@ ErrorContext.SIDECAR_KEY_NAME, ErrorContext.ROW, ErrorContext.COLUMN, - ErrorContext.HED_STRING, ErrorContext.SCHEMA_SECTION, ErrorContext.SCHEMA_TAG, ErrorContext.SCHEMA_ATTRIBUTE, @@ -32,6 +31,10 @@ ErrorContext.ROW, ] +hed_string_sort_list = [ + ErrorContext.HED_STRING +] + def _register_error_function(error_type, wrapper_func): if error_type in error_functions: raise KeyError(f"{error_type} defined more than once.") @@ -186,10 +189,13 @@ def push_error_context(self, context_type, context): """ if context is None: + from hed import HedString if context_type in int_sort_list: context = 0 + elif context_type in hed_string_sort_list: + context = HedString("") else: - context_type = "" + context = "" self.error_context.append((context_type, context)) def pop_error_context(self): @@ -446,10 +452,13 @@ def sort_issues(issues, reverse=False): Returns: list: The sorted list of issues.""" def _get_keys(d): + from hed import HedString result = [] for key in default_sort_list: if key in int_sort_list: result.append(d.get(key, -1)) + elif key in hed_string_sort_list: + result.append(d.get(key, HedString(""))) else: result.append(d.get(key, "")) return tuple(result)