diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index c0091809a..c6f4250e4 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -107,8 +107,8 @@ def val_error_no_value(tag): @hed_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, default_severity=ErrorSeverity.WARNING) -def val_error_missing_column(column_name): - return f"Required column '{column_name}' not specified or found in file." +def val_error_missing_column(column_name, column_type): + return f"Required {column_type} column '{column_name}' not specified or found in file." @hed_error(ValidationErrors.HED_UNKNOWN_COLUMN, default_severity=ErrorSeverity.WARNING) @@ -117,19 +117,33 @@ def val_error_extra_column(column_name): "or identified in sidecars." -@hed_error(ValidationErrors.HED_BLANK_COLUMN, default_severity=ErrorSeverity.WARNING) -def val_error_hed_blank_column(column_number): - return f"Column number {column_number} has no column name" +@hed_error(ValidationErrors.SIDECAR_AND_OTHER_COLUMNS) +def val_error_sidecar_with_column(column_names): + return f"You cannot use a sidecar and tag or prefix columns at the same time. " \ + f"Found {column_names}." + +@hed_error(ValidationErrors.DUPLICATE_COLUMN_IN_LIST) +def val_error_duplicate_clumn(column_number, column_name, list_name): + if column_name: + return f"Found column '{column_name}' at index {column_number} twice in {list_name}." + else: + return f"Found column number {column_number} twice in {list_name}. This isn't a major concern, but does indicate a mistake." -@hed_error(ValidationErrors.HED_DUPLICATE_COLUMN, default_severity=ErrorSeverity.WARNING) -def val_error_hed_duplicate_column(column_name): - return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged." +@hed_error(ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) +def val_error_duplicate_clumn(column_number, column_name, list_names): + if column_name: + return f"Found column '{column_name}' at index {column_number} in the following inputs: {list_names}. " \ + f"Each entry must be unique." + else: + return f"Found column number {column_number} in the following inputs: {list_names}. " \ + f"Each entry must be unique." -@hed_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN, default_severity=ErrorSeverity.WARNING) -def val_error_hed_duplicate_column_number(column_name, column_number): - return f"Column '{column_name}' added as a named column, then also as numbered column {column_number}" + +@hed_error(ValidationErrors.HED_BLANK_COLUMN, default_severity=ErrorSeverity.WARNING) +def val_error_hed_blank_column(column_number): + return f"Column number {column_number} has no column name" @hed_tag_error(ValidationErrors.HED_LIBRARY_UNMATCHED, actual_code=ValidationErrors.TAG_PREFIX_INVALID) diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index cb37be803..36ef907af 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -80,8 +80,10 @@ class ValidationErrors: HED_MISSING_REQUIRED_COLUMN = "HED_MISSING_REQUIRED_COLUMN" HED_UNKNOWN_COLUMN = "HED_UNKNOWN_COLUMN" - HED_DUPLICATE_COLUMN = "HED_DUPLICATE_COLUMN" - DUPLICATE_NAME_NUMBER_COLUMN = "DUPLICATE_NAME_NUMBER_COLUMN" + SIDECAR_AND_OTHER_COLUMNS = "SIDECAR_AND_OTHER_COLUMNS" + + DUPLICATE_COLUMN_IN_LIST = "DUPLICATE_COLUMN_IN_LIST" + DUPLICATE_COLUMN_BETWEEN_SOURCES = "DUPLICATE_COLUMN_BETWEEN_SOURCES" HED_BLANK_COLUMN = "HED_BLANK_COLUMN" # Below here shows what the given error maps to diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 41c935687..65ac75101 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -82,7 +82,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T raise HedFileError(HedExceptions.INVALID_DATAFRAME, "Invalid dataframe(malformed datafile, etc)", file) # todo: Can we get rid of this behavior now that we're using pandas? - column_issues = ColumnMapper.validate_column_map(self.columns, allow_blank_names=allow_blank_names) + column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names) if column_issues: raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.", self.name, issues=column_issues) diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index 37280683f..3e82d8c5a 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -4,6 +4,7 @@ from hed.errors.error_types import ValidationErrors import copy +from collections import Counter PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " @@ -14,8 +15,9 @@ class ColumnMapper: Notes: - All column numbers are 0 based. """ + def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None, - optional_tag_columns=None, requested_columns=None, warn_on_missing_column=False): + optional_tag_columns=None, warn_on_missing_column=False): """ Constructor for ColumnMapper. Parameters: @@ -31,40 +33,55 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None This means it no longer accepts anything but the value portion only in the columns. optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. - requested_columns (list or None): A list of columns you wish to retrieve. - If None, retrieve all columns. warn_on_missing_column (bool): If True, issue mapping warnings on column names that are missing from the sidecar. Notes: - All column numbers are 0 based. """ - # This points to column_type entries based on column names or indexes if columns have no column_name. - self.column_data = {} # Maps column number to column_entry. This is what's actually used by most code. self._final_column_map = {} self._no_mapping_info = True self._column_map = {} self._reverse_column_map = {} - self._requested_columns = [] self._warn_on_missing_column = warn_on_missing_column - self._tag_columns = [] - self._optional_tag_columns = [] - self._column_prefix_dictionary = {} + if tag_columns is None: + tag_columns = [] + self._tag_columns = tag_columns + if optional_tag_columns is None: + optional_tag_columns = [] + self._optional_tag_columns = optional_tag_columns + if column_prefix_dictionary is None: + column_prefix_dictionary = {} + self._column_prefix_dictionary = column_prefix_dictionary self._na_patterns = ["n/a", "nan"] - self._finalize_mapping_issues = [] self._sidecar = None self._set_sidecar(sidecar) - self.set_requested_columns(requested_columns, False) - self.set_tag_columns(tag_columns, optional_tag_columns, False) - self._add_value_columns(column_prefix_dictionary) - # finalize the column map based on initial settings with no header self._finalize_mapping() + @property + def tag_columns(self): + """ Returns the known tag and optional tag columns with numbers as names when possible + + Returns: + tag_columns(list of str or int): A list of all tag and optional tag columns as labels + """ + joined_list = self._tag_columns + self._optional_tag_columns + return list(set(self._convert_to_names(self._column_map, joined_list))) + + @property + def column_prefix_dictionary(self): + """ Returns the column_prefix_dictionary with numbers turned into names where possible + + Returns: + column_prefix_dictionary(list of str or int): A column_prefix_dictionary with column labels as keys + """ + return self._convert_to_names_dict(self._column_map, self._column_prefix_dictionary) + def get_transformers(self): """ Return the transformers to use on a dataframe @@ -91,13 +108,12 @@ def get_transformers(self): final_transformers[assign_to_column] = partial(self._category_handler, category_values) else: final_transformers[assign_to_column] = lambda x: x - # print(column.column_type) return final_transformers, need_categorical @staticmethod - def validate_column_map(column_map, allow_blank_names): - """ Validate there are no issues with column names. + def check_for_blank_names(column_map, allow_blank_names): + """ Validate there are no blank column names Parameters: column_map(iterable): A list of column names @@ -109,17 +125,12 @@ def validate_column_map(column_map, allow_blank_names): # We don't have any checks right now if blank/duplicate is allowed if allow_blank_names: return [] + issues = [] - used_names = set() for column_number, name in enumerate(column_map): - if name is None or name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): + if name is None or not name or name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): issues += ErrorHandler.format_error(ValidationErrors.HED_BLANK_COLUMN, column_number) continue - # if name in used_names: - # # todo: Add this check once it's more fleshed out - # issues += ErrorHandler.format_error(ValidationErrors.HED_DUPLICATE_COLUMN, name) - # continue - used_names.add(name) return issues @@ -136,11 +147,16 @@ def _set_sidecar(self, sidecar): raise ValueError("Trying to set a second sidecar on a column mapper.") if not sidecar: return None - for column_data in sidecar.column_data: - self._add_column_data(column_data) self._sidecar = sidecar + @property + def sidecar_column_data(self): + if self._sidecar: + return self._sidecar.column_data + + return {} + def get_tag_columns(self): """ Returns the column numbers or names that are mapped to be HedTags @@ -150,8 +166,7 @@ def get_tag_columns(self): column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. 0-based if integer-based, otherwise column name. """ - return [column_entry.column_name if isinstance(column_entry.column_name, int) else column_entry.column_name - for number, column_entry in self._final_column_map.items() + return [column_entry.column_name for number, column_entry in self._final_column_map.items() if column_entry.column_type == ColumnType.HEDTags] def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_mapping=True): @@ -164,10 +179,6 @@ def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_ but not an error if missing. If None, clears existing tag_columns finalize_mapping (bool): Re-generate the internal mapping if True, otherwise no effect until finalize. - - Returns: - list: List of issues that occurred during this process. Each issue is a dictionary. - """ if tag_columns is None: tag_columns = [] @@ -176,24 +187,7 @@ def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_ self._tag_columns = tag_columns self._optional_tag_columns = optional_tag_columns if finalize_mapping: - issues = self._finalize_mapping() - return issues - return [] - - def set_requested_columns(self, requested_columns, finalize_mapping=True): - """ Set to return only the columns listed in requested_columns - - Parameters: - requested_columns(list or None): If this is not None, return ONLY these columns. Names or numbers allowed. - finalize_mapping(bool): Finalize the mapping right now if True - - Returns: - issues(list): An empty list of mapping issues - """ - self._requested_columns = requested_columns - if finalize_mapping: - return self._finalize_mapping() - return [] + self._finalize_mapping() def set_column_map(self, new_column_map=None): """ Set the column number to name mapping. @@ -215,51 +209,18 @@ def set_column_map(self, new_column_map=None): column_map = {column_number: column_name for column_number, column_name in enumerate(new_column_map)} self._column_map = column_map self._reverse_column_map = {column_name: column_number for column_number, column_name in column_map.items()} - return self._finalize_mapping() - - def add_columns(self, column_names_or_numbers, column_type=ColumnType.HEDTags): - """ Add blank columns in the given column category. - - Parameters: - column_names_or_numbers (list): A list of column names or numbers to add as the specified type. - column_type (ColumnType property): The category of column these should be. - - """ - if column_names_or_numbers: - if not isinstance(column_names_or_numbers, list): - column_names_or_numbers = [column_names_or_numbers] - for column_name in column_names_or_numbers: - new_def = ColumnMetadata(column_type, column_name) - self._add_column_data(new_def) - - def _add_value_columns(self, column_prefix_dictionary): - if column_prefix_dictionary: - for col, prefix in column_prefix_dictionary.items(): - if prefix.endswith("/"): - prefix = prefix + "#" - else: - prefix = prefix + "/#" - new_def = ColumnMetadata(ColumnType.Value, col, source=prefix) - self._add_column_data(new_def) - - def _add_column_data(self, new_column_entry): - """ Add the metadata of a column to this column mapper. - - Parameters: - new_column_entry (ColumnMetadata): The column definition to add. - - Notes: - If an entry with the same name exists, the new entry will replace it. + self._finalize_mapping() - """ - column_name = new_column_entry.column_name - self.column_data[column_name] = copy.deepcopy(new_column_entry) + def set_column_prefix_dictionary(self, column_prefix_dictionary, finalize_mapping=True): + """Sets the column prefix dictionary""" + self._column_prefix_dictionary = column_prefix_dictionary + if finalize_mapping: + self._finalize_mapping() @staticmethod - def _get_basic_final_map(column_map, column_data): + def _get_sidecar_basic_map(column_map, column_data): basic_final_map = {} - unhandled_names = {} - issues = [] + unhandled_cols = [] if column_map: for column_number, column_name in column_map.items(): if column_name is None: @@ -267,132 +228,143 @@ def _get_basic_final_map(column_map, column_data): if column_name in column_data: column_entry = copy.deepcopy(column_data[column_name]) column_entry.column_name = column_name - basic_final_map[column_number] = column_entry + basic_final_map[column_name] = column_entry continue - elif column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): + elif isinstance(column_name, str) and column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): continue - unhandled_names[column_name] = column_number - for column_number in column_data: - if isinstance(column_number, int): - if column_number in basic_final_map: - issues += ErrorHandler.format_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN, - column_name=basic_final_map[column_number].column_name, - column_number=column_number) - continue - column_entry = copy.deepcopy(column_data[column_number]) - column_entry.column_name = column_number - basic_final_map[column_number] = column_entry + unhandled_cols.append(column_name) - return basic_final_map, unhandled_names, issues + return basic_final_map, unhandled_cols @staticmethod - def _convert_to_indexes(name_to_column_map, column_list): - converted_indexes = [] - unknown_names = [] - for name in column_list: - if isinstance(name, str): - if name in name_to_column_map: - converted_indexes.append(name_to_column_map[name]) - continue - else: - unknown_names.append(name) + def _convert_to_names(column_to_name_map, column_list): + converted_names = [] + for index in column_list: + if isinstance(index, int): + if not column_to_name_map: + converted_names.append(index) + elif index in column_to_name_map: + converted_names.append(column_to_name_map[index]) else: - # Name is in int here - converted_indexes.append(name) - return converted_indexes, unknown_names + if index in column_to_name_map.values(): + converted_names.append(index) + return converted_names @staticmethod - def _add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns, warn_on_missing_columns): - issues = [] - - # Add numbered tag columns - for column_name, column_number in unhandled_names.items(): - if column_number in all_tag_columns: - final_map[column_number] = ColumnMetadata(ColumnType.HEDTags, column_name) + def _convert_to_names_dict(column_to_name_map, column_dict): + converted_dict = {} + for index, column_data in column_dict.items(): + if isinstance(index, int): + if not column_to_name_map: + converted_dict[index] = column_data + elif index in column_to_name_map: + converted_dict[column_to_name_map[index]] = column_data else: - if warn_on_missing_columns and column_number not in required_tag_columns: - issues += ErrorHandler.format_error(ValidationErrors.HED_UNKNOWN_COLUMN, - column_name=column_name) - - # Add numbered tag columns - for column_name_or_number in all_tag_columns: - if isinstance(column_name_or_number, int): - if column_name_or_number not in final_map: - final_map[column_name_or_number] = ColumnMetadata(ColumnType.HEDTags, - column_name_or_number) - - # Switch any tag/requested columns to be HedTags if they were being ignored - for column_number, entry in final_map.items(): - if column_number in all_tag_columns and entry.column_type == ColumnType.Ignore: - entry.column_type = ColumnType.HEDTags - - return issues + if index in column_to_name_map.values(): + converted_dict[index] = column_data + return converted_dict @staticmethod - def _filter_by_requested(final_map, requested_columns): - if requested_columns is not None: - return {key: value for key, value in final_map.items() - if key in requested_columns or value.column_name in requested_columns} - return final_map + def _add_value_columns(final_map, column_prefix_dictionary): + for col, prefix in column_prefix_dictionary.items(): + if prefix.endswith("/"): + prefix = prefix + "#" + else: + prefix = prefix + "/#" + new_def = ColumnMetadata(ColumnType.Value, col, source=prefix) + final_map[col] = new_def @staticmethod - def _convert_tag_columns(tag_columns, optional_tag_columns, requested_columns, reverse_column_map): - all_tag_columns = tag_columns + optional_tag_columns - required_tag_columns = tag_columns.copy() - if requested_columns: - all_tag_columns += requested_columns - required_tag_columns += requested_columns + def _add_tag_columns(final_map, tag_columns): + for col in tag_columns: + new_def = ColumnMetadata(ColumnType.HEDTags, col) + final_map[col] = new_def + + def _get_column_lists(self): + column_lists = self._tag_columns, self._optional_tag_columns, self._column_prefix_dictionary + list_names = ["tag_columns", "optional_tag_columns", "column_prefix_dictionary"] - all_tag_columns, _ = ColumnMapper._convert_to_indexes(reverse_column_map, all_tag_columns) - required_tag_columns, missing_tag_column_names = ColumnMapper._convert_to_indexes(reverse_column_map, - required_tag_columns) + if not any(column for column in column_lists): + return column_lists, list_names + # Filter out empty lists from the above + column_lists, list_names = zip(*[(col_list, list_name) for col_list, list_name in zip(column_lists, list_names) + if col_list]) + return column_lists, list_names + + def _check_for_duplicates_and_required(self, list_names, column_lists): issues = [] - for column_name in missing_tag_column_names: - issues += ErrorHandler.format_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, - column_name=column_name) + for list_name, col_list in zip(list_names, column_lists): + # Convert all known strings to ints, then check for duplicates + converted_list = [item if isinstance(item, int) else self._reverse_column_map.get(item, item) + for item in col_list] - return all_tag_columns, required_tag_columns, issues + if col_list != self._optional_tag_columns: + for test_col in converted_list: + if isinstance(test_col, str) and test_col not in self._reverse_column_map: + issues += ErrorHandler.format_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, + test_col, list_name) - def _finalize_mapping(self): - # 1. All named and numbered columns are located from sidecars and put in final mapping - # 2. Add any tag columns and note issues about missing columns - # 3. Add any numbered columns that have required prefixes - # 4. Filter to just requested columns, if any - final_map, unhandled_names, issues = self._get_basic_final_map(self._column_map, self.column_data) - - # convert all tag lists to indexes -> Issuing warnings at this time potentially for unknown ones - all_tag_columns, required_tag_columns, tag_issues = self._convert_tag_columns(self._tag_columns, - self._optional_tag_columns, - self._requested_columns, - self._reverse_column_map) - - issues += tag_issues - # Notes any missing required columns - issues += self._add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns, - self._warn_on_missing_column) - - issues += ColumnMapper.validate_column_map(self._column_map.values(), allow_blank_names=False) - - self._final_column_map = self._filter_by_requested(final_map, self._requested_columns) - # Make sure this new dict is sorted - self._final_column_map = dict(sorted(final_map.items())) + issues += self._check_for_duplicates_between_lists(converted_list, list_name, + ValidationErrors.DUPLICATE_COLUMN_IN_LIST) + + return issues + + def _check_for_duplicates_between_lists(self, checking_list, list_names, error_type): + issues = [] + duplicates = [item for item, count in Counter(checking_list).items() if count > 1] + for duplicate in duplicates: + issues += ErrorHandler.format_error(error_type, duplicate, + self._column_map.get(duplicate), list_names) + return issues + + def check_for_mapping_issues(self, allow_blank_names=False): + """ Find all issues given the current column_map, tag_columns, etc. - self._no_mapping_info = not self._check_if_mapping_info() + Parameters: + allow_blank_names(bool): Only flag blank names if False - self._finalize_mapping_issues = issues + Returns: + issue_list(list of dict): Returns all issues found as a list of dicts + """ + # 1. Get the lists with entries + column_lists, list_names = self._get_column_lists() + # 2. Verify column_prefix columns and tag columns are present, and check for duplicates + issues = self._check_for_duplicates_and_required(list_names, column_lists) + + combined_list = self.tag_columns + list(self.column_prefix_dictionary) + # 3. Verify prefix and tag columns do not conflict. + issues += self._check_for_duplicates_between_lists(combined_list, list_names, + ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) + + # 4. Verify we didn't get both a sidecar and a tag column list + if self._sidecar and combined_list and combined_list != ["HED"]: + issues += ErrorHandler.format_error(ValidationErrors.SIDECAR_AND_OTHER_COLUMNS, column_names=combined_list) + + # 5. Verify we handled all columns + if self._warn_on_missing_column: + fully_combined_list = list(self.sidecar_column_data) + combined_list + for column in self._column_map.values(): + if column not in fully_combined_list: + issues += ErrorHandler.format_error(ValidationErrors.HED_UNKNOWN_COLUMN, column) + + issues += self.check_for_blank_names(self._column_map.values(), allow_blank_names=allow_blank_names) return issues - def _check_if_mapping_info(self): - # If any of these have any data, don't do default behavior. - return bool(self.column_data or self._final_column_map - or self._requested_columns is not None or self._tag_columns - or self._optional_tag_columns or self._column_prefix_dictionary) + def _finalize_mapping(self): + final_map, unhandled_cols = self._get_sidecar_basic_map(self._column_map, self.sidecar_column_data) + + self._add_tag_columns(final_map, self.tag_columns) + self._remove_from_list(unhandled_cols, self.tag_columns) - def _column_name_requested(self, column_name): - if self._requested_columns is None: - return True - return column_name in self._requested_columns + self._add_value_columns(final_map, self.column_prefix_dictionary) + self._remove_from_list(unhandled_cols, self.column_prefix_dictionary) + + self._final_column_map = dict(sorted(final_map.items())) + + @staticmethod + def _remove_from_list(list_to_alter, to_remove): + return [item for item in list_to_alter if item not in to_remove] def get_def_dict(self, hed_schema=None, extra_def_dicts=None): """ Return def dicts from every column description. @@ -410,13 +382,14 @@ def get_def_dict(self, hed_schema=None, extra_def_dicts=None): return [] def get_column_mapping_issues(self): - """ Get all the issues with finalizing column mapping. Primarily a missing required column. + """ Get all the issues with finalizing column mapping(duplicate columns, missing required, etc) + Note: this is deprecated and now a wrapper for "check_for_mapping_issues()" Returns: list: A list dictionaries of all issues found from mapping column names to numbers. """ - return self._finalize_mapping_issues + return self.check_for_mapping_issues() @staticmethod def _category_handler(category_values, x): diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index e67670a6e..d48165e59 100644 --- a/hed/models/sidecar.py +++ b/hed/models/sidecar.py @@ -34,7 +34,7 @@ def __iter__(self): iterator: An iterator over the column metadata values. """ - return iter(self.column_data) + return iter(self.column_data.values()) def __getitem__(self, column_name): if column_name not in self.loaded_dict: @@ -72,7 +72,7 @@ def column_data(self): Returns: list(ColumnMetadata): the list of column metadata defined by this sidecar """ - return [ColumnMetadata(name=col_name, source=self.loaded_dict) for col_name in self.loaded_dict] + return {col_name: ColumnMetadata(name=col_name, source=self.loaded_dict) for col_name in self.loaded_dict} def get_def_dict(self, hed_schema=None, extra_def_dicts=None): """ Returns the definition dict for this sidecar. @@ -236,7 +236,7 @@ def get_column_refs(self): """ found_vals = set() - for column_data in self.column_data: + for column_data in self: if column_data.column_type == ColumnType.Ignore: continue hed_strings = column_data.get_hed_strings() diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py index a455e6806..cd1500d30 100644 --- a/hed/validator/sidecar_validator.py +++ b/hed/validator/sidecar_validator.py @@ -58,7 +58,7 @@ def validate(self, sidecar, extra_def_dicts=None, name=None, error_handler=None) issues += sidecar_def_dict.issues definition_checks = {} - for column_data in sidecar.column_data: + for column_data in sidecar: column_name = column_data.column_name hed_strings = column_data.get_hed_strings() error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) @@ -113,7 +113,7 @@ def _validate_refs(self, sidecar, error_handler): issues = [] found_column_references = {} - for column_data in sidecar.column_data: + for column_data in sidecar: column_name = column_data.column_name hed_strings = column_data.get_hed_strings() error_handler.push_error_context(ErrorContext.SIDECAR_COLUMN_NAME, column_name) diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index 36a94f032..66cf58f0f 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -100,7 +100,7 @@ def _validate_column_structure(self, base_input, error_handler): List of issues associated with each invalid value. Each issue is a dictionary. """ issues = [] - col_issues = base_input._mapper.get_column_mapping_issues() + col_issues = base_input._mapper.check_for_mapping_issues(base_input) error_handler.add_context_and_filter(col_issues) issues += col_issues for column in base_input.column_metadata().values(): diff --git a/tests/models/test_column_mapper.py b/tests/models/test_column_mapper.py index 78a6b99a9..7f399f0dd 100644 --- a/tests/models/test_column_mapper.py +++ b/tests/models/test_column_mapper.py @@ -3,6 +3,7 @@ from hed.models import ColumnMapper, ColumnType, HedString from hed.models.sidecar import Sidecar +from hed.errors import ValidationErrors class Test(unittest.TestCase): @@ -11,7 +12,6 @@ class Test(unittest.TestCase): @classmethod def setUpClass(cls): cls.integer_key_dictionary = {0: 'one', 1: 'two', 2: 'three'} - cls.zero_based_tag_columns = [0, 1, 2] cls.zero_based_row_column_count = 3 cls.column_prefix_dictionary = {2: 'Event/Description/', 3: 'Event/Label/', 4: 'Event/Category/'} cls.category_key = 'Event/Category/' @@ -45,20 +45,133 @@ def setUpClass(cls): def test_set_tag_columns(self): mapper = ColumnMapper() - mapper.set_tag_columns(self.zero_based_tag_columns, finalize_mapping=True) - self.assertTrue(len(mapper._final_column_map) >= 2) + zero_based_tag_columns = [0, 1, 2] + mapper.set_tag_columns(zero_based_tag_columns, finalize_mapping=True) + self.assertTrue(len(mapper._final_column_map) == 3) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 0) + + def test_set_tag_columns_named(self): + mapper = ColumnMapper(warn_on_missing_column=True) + named_columns = ["Col1", "Col2", "Col3"] + mapper.set_tag_columns(named_columns) + mapper.set_column_map(named_columns) + self.assertTrue(len(mapper._final_column_map) == 3) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 0) + + def test_set_tag_columns_named_unknown(self): + mapper = ColumnMapper(warn_on_missing_column=True) + two_columns = ["Col1", "Col2"] + named_columns = ["Col1", "Col2", "Col3"] + mapper.set_tag_columns(two_columns) + mapper.set_column_map(named_columns) + self.assertTrue(len(mapper._final_column_map) == 2) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[0]['code'] == ValidationErrors.HED_UNKNOWN_COLUMN) + + def test_set_tag_columns_mixed(self): + mapper = ColumnMapper() + mixed_columns = ["Col1", "Col2", 2] + column_map = ["Col1", "Col2", "Col3"] + mapper.set_tag_columns(mixed_columns) + mapper.set_column_map(column_map) + self.assertTrue(len(mapper._final_column_map) == 3) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 0) + + def test_set_tag_column_missing(self): + mapper = ColumnMapper() + column_map = ["Col1", "Col2", "Col3"] + mapper.set_tag_columns(["Col1", "Col4"]) + mapper.set_column_map(column_map) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[0]['code'] == ValidationErrors.HED_MISSING_REQUIRED_COLUMN) - def test_optional_column(self): + column_map = ["Col1", "Col2", "Col3"] + mapper.set_tag_columns(optional_tag_columns=["Col1", "Col4"]) + mapper.set_column_map(column_map) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 0) + + + def test_sidecar_and_columns(self): + mapper = ColumnMapper(Sidecar(self.basic_events_json)) + mapper.set_tag_columns(["Invalid", "Invalid2"]) + mapper.set_column_map(["Invalid", "Invalid2"]) + self.assertTrue(len(mapper._final_column_map) == 2) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[0]['code'] == ValidationErrors.SIDECAR_AND_OTHER_COLUMNS) + + def test_duplicate_list(self): mapper = ColumnMapper() - mapper.set_tag_columns(tag_columns=["HED"]) - mapper.set_column_map({1: "HED"}) + mapper.set_tag_columns(["Invalid", "Invalid"]) + self.assertTrue(len(mapper._final_column_map) == 0) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 3) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_IN_LIST) + + mapper.set_tag_columns([0, 0]) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_IN_LIST) + + mapper.set_tag_columns([0, "Column1"]) + mapper.set_column_map(["Column1"]) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_IN_LIST) + + def test_duplicate_prefix(self): + mapper = ColumnMapper() + prefix_dict = { + 0: "Label/", + "Column1": "Description" + } + mapper.set_column_prefix_dictionary(prefix_dict) + mapper.set_column_map(["Column1"]) self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_IN_LIST) + def test_duplicate_cross_lists(self): mapper = ColumnMapper() - mapper.set_requested_columns(requested_columns=["HED"]) + prefix_dict = { + 0: "Label/" + } + mapper.set_tag_columns([0]) + mapper.set_column_prefix_dictionary(prefix_dict) + mapper.set_column_map(["Column1"]) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) + + mapper = ColumnMapper() + prefix_dict = { + "Column1": "Label/" + } + mapper.set_tag_columns([0]) + mapper.set_column_prefix_dictionary(prefix_dict) + mapper.set_column_map(["Column1"]) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) + + + mapper.set_tag_columns(["Column1"]) + self.assertTrue(len(mapper._final_column_map) == 1) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 1) + self.assertTrue(mapper.check_for_mapping_issues()[-1]['code'] == ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) + + def test_blank_column(self): + mapper = ColumnMapper() + mapper.set_column_map(["", None]) + self.assertTrue(len(mapper.check_for_mapping_issues()) == 2) + self.assertTrue(mapper.check_for_mapping_issues(allow_blank_names=False)[1]['code'] == ValidationErrors.HED_BLANK_COLUMN) + self.assertTrue(mapper.check_for_mapping_issues(allow_blank_names=False)[1]['code'] == ValidationErrors.HED_BLANK_COLUMN) + + def test_optional_column(self): + mapper = ColumnMapper() + mapper.set_tag_columns(tag_columns=["HED"]) mapper.set_column_map({1: "HED"}) self.assertTrue(len(mapper._final_column_map) == 1) - self.assertTrue(len(mapper._finalize_mapping_issues) == 0) mapper = ColumnMapper() mapper.set_tag_columns(optional_tag_columns=["HED"]) @@ -68,49 +181,33 @@ def test_optional_column(self): mapper = ColumnMapper() mapper.set_tag_columns(tag_columns=["HED"]) self.assertTrue(len(mapper._final_column_map) == 0) - self.assertTrue(len(mapper._finalize_mapping_issues) == 1) - - mapper = ColumnMapper() - mapper.set_requested_columns(requested_columns=["HED"]) - self.assertTrue(len(mapper._final_column_map) == 0) - self.assertTrue(len(mapper._finalize_mapping_issues) == 1) + self.assertTrue(len(mapper.get_column_mapping_issues()) == 1) mapper = ColumnMapper() mapper.set_tag_columns(optional_tag_columns=["HED"]) self.assertTrue(len(mapper._final_column_map) == 0) - self.assertTrue(len(mapper._finalize_mapping_issues) == 0) + self.assertTrue(len(mapper.get_column_mapping_issues()) == 0) def test_add_json_file_events(self): mapper = ColumnMapper() mapper._set_sidecar(Sidecar(self.basic_events_json)) - self.assertTrue(len(mapper.column_data) >= 2) + self.assertTrue(len(mapper.sidecar_column_data) >= 2) def test__detect_event_type(self): mapper = ColumnMapper() mapper._set_sidecar(Sidecar(self.basic_events_json)) - self.assertTrue(mapper.column_data[self.basic_event_name].column_type == self.basic_event_type) - - def test_add_hed_tags_columns(self): - mapper = ColumnMapper() - mapper.add_columns([self.add_column_name], ColumnType.HEDTags) - self.assertTrue(len(mapper.column_data) >= 1) - - def test__add_single_event_type(self): - mapper = ColumnMapper() - mapper.add_columns([self.add_column_name], ColumnType.Value) - self.assertTrue(len(mapper.column_data) >= 1) - - def test_set_column_map(self): - mapper = ColumnMapper() - mapper.add_columns([self.add_column_name], ColumnType.Value) - mapper.set_column_map(self.test_column_map) - self.assertTrue(len(mapper._final_column_map) >= 1) - - def test__finalize_mapping(self): - mapper = ColumnMapper() - mapper.add_columns([self.add_column_number], ColumnType.Value) - mapper._finalize_mapping() - self.assertTrue(len(mapper._final_column_map) >= 1) + self.assertTrue(mapper.sidecar_column_data[self.basic_event_name].column_type == self.basic_event_type) + + def test_tag_mapping_complex(self): + tag_columns = [0] + column_prefix_dictionary = {1: "Label/"} + optional_tag_columns = [2] + mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary, optional_tag_columns=optional_tag_columns) + self.assertEqual(list(mapper._final_column_map), [0, 1, 2]) + self.assertEqual(mapper._final_column_map[0].column_type, ColumnType.HEDTags) + self.assertEqual(mapper._final_column_map[1].column_type, ColumnType.Value) + self.assertEqual(mapper._final_column_map[1].hed_dict, "Label/#") + self.assertEqual(mapper._final_column_map[2].column_type, ColumnType.HEDTags) diff --git a/tests/models/test_spreadsheet_input.py b/tests/models/test_spreadsheet_input.py index b009ef419..d93949eea 100644 --- a/tests/models/test_spreadsheet_input.py +++ b/tests/models/test_spreadsheet_input.py @@ -41,9 +41,7 @@ def test_all(self): self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame)) self.assertTrue(isinstance(file_input.series_a, pd.Series)) self.assertTrue(file_input.dataframe_a.size) - - # Just make sure this didn't crash for now - self.assertTrue(True) + self.assertEqual(len(file_input._mapper.get_column_mapping_issues()), 0) def test_all2(self): # This should work, but raise an issue as Short label and column 1 overlap. @@ -59,10 +57,7 @@ def test_all2(self): self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame)) self.assertTrue(isinstance(file_input.series_a, pd.Series)) self.assertTrue(file_input.dataframe_a.size) - self.assertTrue(len(file_input._mapper.get_column_mapping_issues()), 1) - - # Just make sure this didn't crash for now - self.assertTrue(True) + self.assertEqual(len(file_input._mapper.get_column_mapping_issues()), 1) def test_file_as_string(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), diff --git a/tests/validator/test_hed_validator.py b/tests/validator/test_hed_validator.py index 848beda34..451241377 100644 --- a/tests/validator/test_hed_validator.py +++ b/tests/validator/test_hed_validator.py @@ -139,7 +139,7 @@ def test_file_bad_defs_in_spreadsheet(self): worksheet_name='LKT Events') validation_issues = loaded_file.validate(hed_schema=hed_schema) - self.assertEqual(len(validation_issues), 4) + self.assertEqual(len(validation_issues), 5) def test_tabular_input_with_HED_col_in_json(self): schema_path = os.path.realpath(os.path.join(os.path.dirname(__file__), diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py index 60b2ccf65..00528a7eb 100644 --- a/tests/validator/test_spreadsheet_validator.py +++ b/tests/validator/test_spreadsheet_validator.py @@ -6,9 +6,6 @@ from hed import load_schema_version, load_schema from hed.validator import SpreadsheetValidator from hed import SpreadsheetInput -from hed.errors import ErrorHandler, sort_issues -from hed.errors.error_types import ColumnErrors - class TestInsertColumns(unittest.TestCase): @@ -48,3 +45,4 @@ def test_basic_validate(self): issues = file_input.validate(self.schema) self.assertTrue(len(issues), 1) +