Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hed/errors/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ def val_error_hed_duplicate_column(column_name):
return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged."


@hed_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN, default_severity=ErrorSeverity.WARNING)
def val_error_hed_duplicate_column_number(column_name, column_number):
    """Warning text for a column registered both by name and by 0-based number."""
    message = (f"Column '{column_name}' added as a named column, "
               f"then also as numbered column {column_number}")
    return message


@hed_tag_error(ValidationErrors.HED_LIBRARY_UNMATCHED, actual_code=ValidationErrors.TAG_PREFIX_INVALID)
def val_error_unknown_prefix(tag, unknown_prefix, known_prefixes):
    """Error text for a tag whose schema prefix is not among the known prefixes.

    Fix: the original message had an unbalanced quote ("Tag '{tag} has ...") —
    the closing quote after the tag name was missing.
    """
    return f"Tag '{tag}' has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}"
Expand Down
1 change: 1 addition & 0 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class ValidationErrors:
HED_MISSING_REQUIRED_COLUMN = "HED_MISSING_REQUIRED_COLUMN"
HED_UNKNOWN_COLUMN = "HED_UNKNOWN_COLUMN"
HED_DUPLICATE_COLUMN = "HED_DUPLICATE_COLUMN"
DUPLICATE_NAME_NUMBER_COLUMN = "DUPLICATE_NAME_NUMBER_COLUMN"
HED_BLANK_COLUMN = "HED_BLANK_COLUMN"

# Below here shows what the given error maps to
Expand Down
35 changes: 19 additions & 16 deletions hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class ColumnMapper:
""" Mapping of a base input file columns into HED tags.

Notes:
- Functions and type_variables column and row indexing starts at 0.
- All column numbers are 0 based.
"""
def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None,
optional_tag_columns=None, requested_columns=None, warn_on_missing_column=False):
Expand All @@ -22,10 +22,12 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
sidecar (Sidecar): A sidecar to gather column data from.
tag_columns: (list): A list of ints or strings containing the columns that contain the HED tags.
Sidecar column definitions will take precedent if there is a conflict with tag_columns.
column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag
column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag
prefixes to prepend to the tags in that column before processing.
May be deprecated. These are no longer prefixes, but rather converted to value columns.
eg. {"key": "Description"} will turn into a value column as {"key": "Description/#"}
May be deprecated/renamed. These are no longer prefixes, but rather converted to value columns.
eg. {"key": "Description", 1: "Label/"} will turn into value columns as
{"key": "Description/#", 1: "Label/#"}
Note: It will be a validation issue if column 1 is called "key" in the above example.
This means it no longer accepts anything but the value portion only in the columns.
optional_tag_columns (list): A list of ints or strings containing the columns that contain
the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags.
Expand All @@ -36,12 +38,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None

Notes:
- All column numbers are 0 based.

Examples:
column_prefix_dictionary = {3: 'Description/', 4: 'Label/'}

The third column contains tags that need Description/ tag prepended, while the fourth column
contains tag that needs Label/ prepended.
"""
# This points to column_type entries based on column names or indexes if columns have no column_name.
self.column_data = {}
Expand Down Expand Up @@ -79,9 +75,9 @@ def get_transformers(self):
assign_to_column = column.column_name
if isinstance(assign_to_column, int):
if self._column_map:
assign_to_column = self._column_map[assign_to_column - 1]
assign_to_column = self._column_map[assign_to_column]
else:
assign_to_column = assign_to_column - 1
assign_to_column = assign_to_column
if column.column_type == ColumnType.Ignore:
continue
elif column.column_type == ColumnType.Value:
Expand Down Expand Up @@ -154,7 +150,7 @@ def get_tag_columns(self):
column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags.
0-based if integer-based, otherwise column name.
"""
return [column_entry.column_name - 1 if isinstance(column_entry.column_name, int) else column_entry.column_name
return [column_entry.column_name if isinstance(column_entry.column_name, int) else column_entry.column_name
for number, column_entry in self._final_column_map.items()
if column_entry.column_type == ColumnType.HEDTags]

Expand Down Expand Up @@ -263,6 +259,7 @@ def _add_column_data(self, new_column_entry):
def _get_basic_final_map(column_map, column_data):
basic_final_map = {}
unhandled_names = {}
issues = []
if column_map:
for column_number, column_name in column_map.items():
if column_name is None:
Expand All @@ -277,11 +274,16 @@ def _get_basic_final_map(column_map, column_data):
unhandled_names[column_name] = column_number
for column_number in column_data:
if isinstance(column_number, int):
if column_number in basic_final_map:
issues += ErrorHandler.format_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN,
column_name=basic_final_map[column_number].column_name,
column_number=column_number)
continue
column_entry = copy.deepcopy(column_data[column_number])
column_entry.column_name = column_number
basic_final_map[column_number] = column_entry

return basic_final_map, unhandled_names
return basic_final_map, unhandled_names, issues

@staticmethod
def _convert_to_indexes(name_to_column_map, column_list):
Expand Down Expand Up @@ -357,14 +359,15 @@ def _finalize_mapping(self):
# 2. Add any tag columns and note issues about missing columns
# 3. Add any numbered columns that have required prefixes
# 4. Filter to just requested columns, if any
final_map, unhandled_names = self._get_basic_final_map(self._column_map, self.column_data)
final_map, unhandled_names, issues = self._get_basic_final_map(self._column_map, self.column_data)

# convert all tag lists to indexes -> Issuing warnings at this time potentially for unknown ones
all_tag_columns, required_tag_columns, issues = self._convert_tag_columns(self._tag_columns,
all_tag_columns, required_tag_columns, tag_issues = self._convert_tag_columns(self._tag_columns,
self._optional_tag_columns,
self._requested_columns,
self._reverse_column_map)

issues += tag_issues
# Notes any missing required columns
issues += self._add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns,
self._warn_on_missing_column)
Expand Down
16 changes: 8 additions & 8 deletions hed/models/spreadsheet_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N
worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
tag_columns (list): A list of ints containing the columns that contain the HED tags.
The default value is [2] indicating only the second column has tags.
The default value is [1] indicating only the second column has tags.
has_column_names (bool): True if file has column names. Validation will skip over the
first line of the file if the spreadsheet as column names.
column_prefix_dictionary (dict): A dictionary with column number keys and prefix values.
This is partially deprecated - what this now turns the given columns into Value columns.
Examples:
A prefix dictionary {3: 'Label/', 5: 'Description/'} indicates that column 3 and 5 have HED tags
that need to be prefixed by Label/ and Description/ respectively.
Column numbers 3 and 5 should also be included in the tag_columns list.

column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag
prefixes to prepend to the tags in that column before processing.
May be deprecated/renamed. These are no longer prefixes, but rather converted to value columns.
eg. {"key": "Description", 1: "Label/"} will turn into value columns as
{"key": "Description/#", 1: "Label/#"}
Note: It will be a validation issue if column 1 is called "key" in the above example.
This means it no longer accepts anything but the value portion only in the columns.
"""
if tag_columns is None:
tag_columns = [1]
Expand Down
1 change: 0 additions & 1 deletion hed/models/tabular_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def __init__(self, file=None, sidecar=None, name=None):
Parameters:
file (str or file like): A tsv file to open.
sidecar (str or Sidecar): A Sidecar filename or Sidecar
Note: If this is a string you MUST also pass hed_schema.
name (str): The name to display for this file for error purposes.
"""
if sidecar and not isinstance(sidecar, Sidecar):
Expand Down
Binary file not shown.
55 changes: 32 additions & 23 deletions tests/models/test_spreadsheet_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,6 @@ def setUpClass(cls):
"../data/validator_tests/ExcelMultipleSheets.xlsx")
cls.default_test_file_name = default
cls.generic_file_input = SpreadsheetInput(default)
cls.integer_key_dictionary = {1: 'one', 2: 'two', 3: 'three'}
cls.one_based_tag_columns = [1, 2, 3]
cls.zero_based_tag_columns = [0, 1, 2, 3, 4]
cls.zero_based_row_column_count = 3
cls.zero_based_tag_columns_less_than_row_column_count = [0, 1, 2]
cls.column_prefix_dictionary = {3: 'Event/Description/', 4: 'Event/Label/', 5: 'Event/Category/'}
cls.category_key = 'Event/Category/'
cls.category_participant_and_stimulus_tags = 'Event/Category/Participant response,Event/Category/Stimulus'
cls.category_tags = 'Participant response, Stimulus'
cls.row_with_hed_tags = ['event1', 'tag1', 'tag2']
base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/")
cls.base_output_folder = base_output
os.makedirs(base_output, exist_ok=True)
Expand All @@ -44,8 +34,8 @@ def tearDownClass(cls):
def test_all(self):
hed_input = self.default_test_file_name
has_column_names = True
column_prefix_dictionary = {2: 'Label', 3: 'Description'}
tag_columns = [4]
column_prefix_dictionary = {1: 'Label/', 2: 'Description'}
tag_columns = [3]
worksheet_name = 'LKT Events'

file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name,
Expand All @@ -58,6 +48,25 @@ def test_all(self):
# Just make sure this didn't crash for now
self.assertTrue(True)

def test_all2(self):
    # This should work, but raise one mapping issue because "Short label"
    # (a named column) and column 1 (a numbered column) overlap.
    hed_input = self.default_test_file_name
    has_column_names = True
    column_prefix_dictionary = {1: 'Label/', "Short label": 'Description'}
    tag_columns = [3]
    worksheet_name = 'LKT Events'

    file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name,
                                  tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary)

    self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame))
    self.assertTrue(isinstance(file_input.series_a, pd.Series))
    self.assertTrue(file_input.dataframe_a.size)
    # Bug fix: assertTrue(len(...), 1) treated the 1 as a failure *message*
    # and passed for any truthy length; assertEqual actually verifies that
    # exactly one column-mapping issue was reported.
    self.assertEqual(len(file_input._mapper.get_column_mapping_issues()), 1)

def test_file_as_string(self):
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/validator_tests/bids_events_no_index.tsv')
Expand Down Expand Up @@ -103,8 +112,8 @@ def test_to_excel(self):

def test_to_excel_should_work(self):
spreadsheet = SpreadsheetInput(file=self.default_test_file_name, file_type='.xlsx',
tag_columns=[4], has_column_names=True,
column_prefix_dictionary={1: 'Label/', 3: 'Description/'},
tag_columns=[3], has_column_names=True,
column_prefix_dictionary={1: 'Label/', 2: 'Description/'},
name='ExcelOneSheet.xlsx')
buffer = io.BytesIO()
spreadsheet.to_excel(buffer, output_assembled=True)
Expand Down Expand Up @@ -148,51 +157,51 @@ def test_loading_and_reset_mapper(self):
def test_no_column_header_and_convert(self):
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
hed_input.convert_to_long(self.hed_schema)

events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_long.tsv')
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2])
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe))

def test_convert_short_long_with_definitions(self):
# Verify behavior works as expected even if definitions are present
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
hed_input.convert_to_long(self.hed_schema)

events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition_long.tsv')
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2])
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe))

def test_definitions_identified(self):
# Todo: this test is no longer relevant
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])


def test_loading_dataframe_directly(self):
ds_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
ds = pd.read_csv(ds_path, delimiter="\t", header=None)
hed_input = SpreadsheetInput(ds, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(ds, has_column_names=False, tag_columns=[0, 1])

events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input._dataframe.equals(hed_input2._dataframe))

def test_ignoring_na_column(self):
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/na_tag_column.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input.dataframe_a.loc[1, 1] == 'n/a')

def test_ignoring_na_value_column(self):
Expand Down
37 changes: 36 additions & 1 deletion tests/validator/test_spreadsheet_validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import pandas as pd
import os
import shutil

import unittest
from hed import BaseInput, load_schema_version
from hed import load_schema_version, load_schema
from hed.validator import SpreadsheetValidator
from hed import SpreadsheetInput
from hed.errors import ErrorHandler, sort_issues
from hed.errors.error_types import ColumnErrors

Expand All @@ -12,4 +16,35 @@ class TestInsertColumns(unittest.TestCase):
def setUpClass(cls):
cls.schema = load_schema_version("8.1.0")
cls.validator = SpreadsheetValidator(cls.schema)
base = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/')
cls.base_data_dir = base
hed_xml_file = os.path.join(base, "schema_tests/HED8.0.0t.xml")
cls.hed_schema = load_schema(hed_xml_file)
default = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"../data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx")
cls.default_test_file_name = default
cls.generic_file_input = SpreadsheetInput(default)
base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/")
cls.base_output_folder = base_output
os.makedirs(base_output, exist_ok=True)

@classmethod
def tearDownClass(cls):
    # Remove the per-class output directory created in setUpClass so test
    # artifacts don't accumulate between runs.
    shutil.rmtree(cls.base_output_folder)

def test_basic_validate(self):
hed_input = self.default_test_file_name
has_column_names = True
column_prefix_dictionary = {1: 'Label/', 3: 'Description'}
tag_columns = [4]
worksheet_name = 'LKT 8HED3'

file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name,
tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary)

self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame))
self.assertTrue(isinstance(file_input.series_a, pd.Series))
self.assertTrue(file_input.dataframe_a.size)

issues = file_input.validate(self.schema)
self.assertTrue(len(issues), 1)