diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 95a4f438b..c0091809a 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -127,6 +127,11 @@ def val_error_hed_duplicate_column(column_name): return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged." +@hed_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN, default_severity=ErrorSeverity.WARNING) +def val_error_hed_duplicate_column_number(column_name, column_number): + return f"Column '{column_name}' added as a named column, then also as numbered column {column_number}" + + @hed_tag_error(ValidationErrors.HED_LIBRARY_UNMATCHED, actual_code=ValidationErrors.TAG_PREFIX_INVALID) def val_error_unknown_prefix(tag, unknown_prefix, known_prefixes): return f"Tag '{tag} has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}" diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index a7be7d2b4..cb37be803 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -81,6 +81,7 @@ class ValidationErrors: HED_MISSING_REQUIRED_COLUMN = "HED_MISSING_REQUIRED_COLUMN" HED_UNKNOWN_COLUMN = "HED_UNKNOWN_COLUMN" HED_DUPLICATE_COLUMN = "HED_DUPLICATE_COLUMN" + DUPLICATE_NAME_NUMBER_COLUMN = "DUPLICATE_NAME_NUMBER_COLUMN" HED_BLANK_COLUMN = "HED_BLANK_COLUMN" # Below here shows what the given error maps to diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index ad68114a8..37280683f 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -12,7 +12,7 @@ class ColumnMapper: """ Mapping of a base input file columns into HED tags. Notes: - - Functions and type_variables column and row indexing starts at 0. + - All column numbers are 0 based. """ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None, optional_tag_columns=None, requested_columns=None, warn_on_missing_column=False): @@ -22,10 +22,12 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None sidecar (Sidecar): A sidecar to gather column data from. tag_columns: (list): A list of ints or strings containing the columns that contain the HED tags. Sidecar column definitions will take precedent if there is a conflict with tag_columns. - column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag + column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag prefixes to prepend to the tags in that column before processing. - May be deprecated. These are no longer prefixes, but rather converted to value columns. - eg. {"key": "Description"} will turn into a value column as {"key": "Description/#"} + May be deprecated/renamed. These are no longer prefixes, but rather converted to value columns. + eg. {"key": "Description", 1: "Label/"} will turn into value columns as + {"key": "Description/#", 1: "Label/#"} + Note: It will be a validation issue if column 1 is called "key" in the above example. This means it no longer accepts anything but the value portion only in the columns. optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. @@ -36,12 +38,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None Notes: - All column numbers are 0 based. - - Examples: - column_prefix_dictionary = {3: 'Description/', 4: 'Label/'} - - The third column contains tags that need Description/ tag prepended, while the fourth column - contains tag that needs Label/ prepended. """ # This points to column_type entries based on column names or indexes if columns have no column_name. self.column_data = {} @@ -79,9 +75,9 @@ def get_transformers(self): assign_to_column = column.column_name if isinstance(assign_to_column, int): if self._column_map: - assign_to_column = self._column_map[assign_to_column - 1] + assign_to_column = self._column_map[assign_to_column] else: - assign_to_column = assign_to_column - 1 + assign_to_column = assign_to_column if column.column_type == ColumnType.Ignore: continue elif column.column_type == ColumnType.Value: @@ -154,7 +150,7 @@ def get_tag_columns(self): column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. 0-based if integer-based, otherwise column name. """ - return [column_entry.column_name - 1 if isinstance(column_entry.column_name, int) else column_entry.column_name + return [column_entry.column_name if isinstance(column_entry.column_name, int) else column_entry.column_name for number, column_entry in self._final_column_map.items() if column_entry.column_type == ColumnType.HEDTags] @@ -263,6 +259,7 @@ def _add_column_data(self, new_column_entry): def _get_basic_final_map(column_map, column_data): basic_final_map = {} unhandled_names = {} + issues = [] if column_map: for column_number, column_name in column_map.items(): if column_name is None: @@ -277,11 +274,16 @@ def _get_basic_final_map(column_map, column_data): unhandled_names[column_name] = column_number for column_number in column_data: if isinstance(column_number, int): + if column_number in basic_final_map: + issues += ErrorHandler.format_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN, + column_name=basic_final_map[column_number].column_name, + column_number=column_number) + continue column_entry = copy.deepcopy(column_data[column_number]) column_entry.column_name = column_number basic_final_map[column_number] = column_entry - return basic_final_map, unhandled_names + return basic_final_map, unhandled_names, issues @staticmethod def _convert_to_indexes(name_to_column_map, column_list): @@ -357,14 +359,15 @@ def _finalize_mapping(self): # 2. Add any tag columns and note issues about missing columns # 3. Add any numbered columns that have required prefixes # 4. Filter to just requested columns, if any - final_map, unhandled_names = self._get_basic_final_map(self._column_map, self.column_data) + final_map, unhandled_names, issues = self._get_basic_final_map(self._column_map, self.column_data) # convert all tag lists to indexes -> Issuing warnings at this time potentially for unknown ones - all_tag_columns, required_tag_columns, issues = self._convert_tag_columns(self._tag_columns, + all_tag_columns, required_tag_columns, tag_issues = self._convert_tag_columns(self._tag_columns, self._optional_tag_columns, self._requested_columns, self._reverse_column_map) + issues += tag_issues # Notes any missing required columns issues += self._add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns, self._warn_on_missing_column) diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py index b48f6985f..1c9b98520 100644 --- a/hed/models/spreadsheet_input.py +++ b/hed/models/spreadsheet_input.py @@ -16,16 +16,16 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags. Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed. tag_columns (list): A list of ints containing the columns that contain the HED tags. - The default value is [2] indicating only the second column has tags. + The default value is [1] indicating only the second column has tags. has_column_names (bool): True if file has column names. Validation will skip over the first line of the file if the spreadsheet as column names. - column_prefix_dictionary (dict): A dictionary with column number keys and prefix values. - This is partially deprecated - what this now turns the given columns into Value columns. - Examples: - A prefix dictionary {3: 'Label/', 5: 'Description/'} indicates that column 3 and 5 have HED tags - that need to be prefixed by Label/ and Description/ respectively. - Column numbers 3 and 5 should also be included in the tag_columns list. - + column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag + prefixes to prepend to the tags in that column before processing. + May be deprecated/renamed. These are no longer prefixes, but rather converted to value columns. + eg. {"key": "Description", 1: "Label/"} will turn into value columns as + {"key": "Description/#", 1: "Label/#"} + Note: It will be a validation issue if column 1 is called "key" in the above example. + This means it no longer accepts anything but the value portion only in the columns. """ if tag_columns is None: tag_columns = [1] diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index b32b22032..8a6d5c5f8 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -15,7 +15,6 @@ def __init__(self, file=None, sidecar=None, name=None): Parameters: file (str or file like): A tsv file to open. sidecar (str or Sidecar): A Sidecar filename or Sidecar - Note: If this is a string you MUST also pass hed_schema. name (str): The name to display for this file for error purposes. """ if sidecar and not isinstance(sidecar, Sidecar): diff --git a/tests/data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx b/tests/data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx new file mode 100644 index 000000000..668af693b Binary files /dev/null and b/tests/data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx differ diff --git a/tests/models/test_spreadsheet_input.py b/tests/models/test_spreadsheet_input.py index 4a507fa18..0c31f35d8 100644 --- a/tests/models/test_spreadsheet_input.py +++ b/tests/models/test_spreadsheet_input.py @@ -23,16 +23,6 @@ def setUpClass(cls): "../data/validator_tests/ExcelMultipleSheets.xlsx") cls.default_test_file_name = default cls.generic_file_input = SpreadsheetInput(default) - cls.integer_key_dictionary = {1: 'one', 2: 'two', 3: 'three'} - cls.one_based_tag_columns = [1, 2, 3] - cls.zero_based_tag_columns = [0, 1, 2, 3, 4] - cls.zero_based_row_column_count = 3 - cls.zero_based_tag_columns_less_than_row_column_count = [0, 1, 2] - cls.column_prefix_dictionary = {3: 'Event/Description/', 4: 'Event/Label/', 5: 'Event/Category/'} - cls.category_key = 'Event/Category/' - cls.category_participant_and_stimulus_tags = 'Event/Category/Participant response,Event/Category/Stimulus' - cls.category_tags = 'Participant response, Stimulus' - cls.row_with_hed_tags = ['event1', 'tag1', 'tag2'] base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") cls.base_output_folder = base_output os.makedirs(base_output, exist_ok=True) @@ -44,8 +34,8 @@ def tearDownClass(cls): def test_all(self): hed_input = self.default_test_file_name has_column_names = True - column_prefix_dictionary = {2: 'Label', 3: 'Description'} - tag_columns = [4] + column_prefix_dictionary = {1: 'Label/', 2: 'Description'} + tag_columns = [3] worksheet_name = 'LKT Events' file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name, @@ -58,6 +48,25 @@ def test_all(self): # Just make sure this didn't crash for now self.assertTrue(True) + def test_all2(self): + # This should work, but raise an issue as Short label and column 1 overlap. + hed_input = self.default_test_file_name + has_column_names = True + column_prefix_dictionary = {1: 'Label/', "Short label": 'Description'} + tag_columns = [3] + worksheet_name = 'LKT Events' + + file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name, + tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary) + + self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame)) + self.assertTrue(isinstance(file_input.series_a, pd.Series)) + self.assertTrue(file_input.dataframe_a.size) + self.assertTrue(len(file_input._mapper.get_column_mapping_issues()), 1) + + # Just make sure this didn't crash for now + self.assertTrue(True) + def test_file_as_string(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/validator_tests/bids_events_no_index.tsv') @@ -103,8 +112,8 @@ def test_to_excel(self): def test_to_excel_should_work(self): spreadsheet = SpreadsheetInput(file=self.default_test_file_name, file_type='.xlsx', - tag_columns=[4], has_column_names=True, - column_prefix_dictionary={1: 'Label/', 3: 'Description/'}, + tag_columns=[3], has_column_names=True, + column_prefix_dictionary={1: 'Label/', 2: 'Description/'}, name='ExcelOneSheet.xlsx') buffer = io.BytesIO() spreadsheet.to_excel(buffer, output_assembled=True) @@ -148,51 +157,51 @@ def test_loading_and_reset_mapper(self): def test_no_column_header_and_convert(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) hed_input.convert_to_long(self.hed_schema) events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_long.tsv') - hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) + hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[0, 1]) self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe)) def test_convert_short_long_with_definitions(self): # Verify behavior works as expected even if definitions are present events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) hed_input.convert_to_long(self.hed_schema) events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition_long.tsv') - hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2]) + hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[0, 1]) self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe)) def test_definitions_identified(self): # Todo: this test is no longer relevant events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) def test_loading_dataframe_directly(self): ds_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') ds = pd.read_csv(ds_path, delimiter="\t", header=None) - hed_input = SpreadsheetInput(ds, has_column_names=False, tag_columns=[1, 2]) + hed_input = SpreadsheetInput(ds, has_column_names=False, tag_columns=[0, 1]) events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') - hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) self.assertTrue(hed_input._dataframe.equals(hed_input2._dataframe)) def test_ignoring_na_column(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/na_tag_column.tsv') - hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2]) + hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) self.assertTrue(hed_input.dataframe_a.loc[1, 1] == 'n/a') def test_ignoring_na_value_column(self): diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py index e32696a30..60b2ccf65 100644 --- a/tests/validator/test_spreadsheet_validator.py +++ b/tests/validator/test_spreadsheet_validator.py @@ -1,7 +1,11 @@ import pandas as pd +import os +import shutil + import unittest -from hed import BaseInput, load_schema_version +from hed import load_schema_version, load_schema from hed.validator import SpreadsheetValidator +from hed import SpreadsheetInput from hed.errors import ErrorHandler, sort_issues from hed.errors.error_types import ColumnErrors @@ -12,4 +16,35 @@ class TestInsertColumns(unittest.TestCase): def setUpClass(cls): cls.schema = load_schema_version("8.1.0") cls.validator = SpreadsheetValidator(cls.schema) + base = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') + cls.base_data_dir = base + hed_xml_file = os.path.join(base, "schema_tests/HED8.0.0t.xml") + cls.hed_schema = load_schema(hed_xml_file) + default = os.path.join(os.path.dirname(os.path.realpath(__file__)), + "../data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx") + cls.default_test_file_name = default + cls.generic_file_input = SpreadsheetInput(default) + base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") + cls.base_output_folder = base_output + os.makedirs(base_output, exist_ok=True) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.base_output_folder) + + def test_basic_validate(self): + hed_input = self.default_test_file_name + has_column_names = True + column_prefix_dictionary = {1: 'Label/', 3: 'Description'} + tag_columns = [4] + worksheet_name = 'LKT 8HED3' + + file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name, + tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary) + + self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame)) + self.assertTrue(isinstance(file_input.series_a, pd.Series)) + self.assertTrue(file_input.dataframe_a.size) + issues = file_input.validate(self.schema) + self.assertTrue(len(issues), 1)