diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py
index c663ff001..25aaee9a4 100644
--- a/hed/errors/error_messages.py
+++ b/hed/errors/error_messages.py
@@ -215,6 +215,12 @@ def val_error_sidecar_key_missing(invalid_key, category_keys):
     return f"Category key '{invalid_key}' does not exist in column. Valid keys are: {category_keys}"
 
 
+@hed_error(ValidationErrors.TSV_COLUMN_MISSING, actual_code=ValidationErrors.SIDECAR_KEY_MISSING,
+           default_severity=ErrorSeverity.WARNING)
+def val_error_tsv_column_missing(invalid_key):
+    return f"{{{invalid_key}}} is used as a key in a sidecar but does not appear as a column in the tabular file"
+
+
 @hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID, actual_code=ValidationErrors.DEF_EXPAND_INVALID)
 def val_error_bad_def_expand(tag, actual_def, found_def):
     return f"A data-recording's Def-expand tag does not match the given definition." \
@@ -314,11 +320,6 @@ def sidecar_hed_used():
     return "'HED' is a reserved name and cannot be used as a sidecar except in expected places."
 
 
-@hed_error(SidecarErrors.SIDECAR_HED_USED_COLUMN, actual_code=ValidationErrors.SIDECAR_INVALID)
-def sidecar_hed_used_column():
-    return "'HED' is a reserved name and cannot be used as a sidecar column name"
-
-
 @hed_error(SidecarErrors.SIDECAR_NA_USED, actual_code=ValidationErrors.SIDECAR_INVALID)
 def sidecar_na_used(column_name):
     return f"Invalid category key 'n/a' found in column {column_name}."
diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py
index ebe58acbc..d743606b6 100644
--- a/hed/errors/error_types.py
+++ b/hed/errors/error_types.py
@@ -37,6 +37,7 @@ class ValidationErrors:
     REQUIRED_TAG_MISSING = 'REQUIRED_TAG_MISSING'
     SIDECAR_INVALID = 'SIDECAR_INVALID'
     SIDECAR_KEY_MISSING = 'SIDECAR_KEY_MISSING'
+    HED_COLUMN_MISSING = 'HED_COLUMN_MISSING'
     STYLE_WARNING = "STYLE_WARNING"
     TAG_EMPTY = 'TAG_EMPTY'
     TAG_EXPRESSION_REPEATED = 'TAG_EXPRESSION_REPEATED'
@@ -96,6 +97,7 @@ class ValidationErrors:
     HED_PLACEHOLDER_OUT_OF_CONTEXT = 'HED_PLACEHOLDER_OUT_OF_CONTEXT'
     CURLY_BRACE_UNSUPPORTED_HERE = 'CURLY_BRACE_UNSUPPORTED_HERE'
     ONSETS_UNORDERED = "ONSETS_UNORDERED"
+    TSV_COLUMN_MISSING = "TSV_COLUMN_MISSING"
 
 
 class SidecarErrors:
@@ -105,9 +107,8 @@ class SidecarErrors:
     INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns'
     INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns'
     UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn'
-    SIDECAR_HED_USED_COLUMN = 'sidecar_hed_used_column'
+    SIDECAR_HED_USED = 'SIDECAR_HED_USED'
     SIDECAR_NA_USED = 'SIDECAR_NA_USED'
-    SIDECAR_HED_USED = 'sidecar_hed_used'
     SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"
 
 
diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py
index 203ee05f6..7f5e04312 100644
--- a/hed/models/column_mapper.py
+++ b/hed/models/column_mapper.py
@@ -49,7 +49,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
         # Maps column number to column_entry. This is what's actually used by most code.
         self._final_column_map = {}
         self._no_mapping_info = True
-        self._column_map = {}
         self._reverse_column_map = {}
         self._warn_on_missing_column = warn_on_missing_column
 
diff --git a/hed/models/df_util.py b/hed/models/df_util.py
index 609ab5c84..12e57a926 100644
--- a/hed/models/df_util.py
+++ b/hed/models/df_util.py
@@ -180,20 +180,25 @@ def _handle_curly_braces_refs(df, refs, column_names):
         modified_df(pd.DataFrame): The modified dataframe with refs replaced
     """
     # Filter out columns and refs that don't exist.
-    refs = [ref for ref in refs if ref in column_names]
-    remaining_columns = [column for column in column_names if column not in refs]
+    refs_new = [ref for ref in refs if ref in column_names]
+    remaining_columns = [column for column in column_names if column not in refs_new]
 
     new_df = df.copy()
     # Replace references in the columns we are saving out.
-    saved_columns = new_df[refs]
+    saved_columns = new_df[refs_new]
     for column_name in remaining_columns:
-        for replacing_name in refs:
+        for replacing_name in refs_new:
             # If the data has no n/a values, this version is MUCH faster.
             # column_name_brackets = f"{{{replacing_name}}}"
             # df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
             #                             in zip(df[column_name], saved_columns[replacing_name]))
             new_df[column_name] = pd.Series(replace_ref(x, f"{{{replacing_name}}}", y) for x, y
                                             in zip(new_df[column_name], saved_columns[replacing_name]))
+    # Handle the special case of {HED} when the tsv file has no {HED} column
+    if 'HED' in refs and 'HED' not in column_names:
+        for column_name in remaining_columns:
+            new_df[column_name] = \
+                pd.Series(replace_ref(x, "{HED}", "n/a") for x in new_df[column_name])
     new_df = new_df[remaining_columns]
 
     return new_df
diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py
index 669c8a878..ca0ccf53a 100644
--- a/hed/models/spreadsheet_input.py
+++ b/hed/models/spreadsheet_input.py
@@ -1,47 +1,48 @@
-""" A spreadsheet of HED tags. """
-from hed.models.column_mapper import ColumnMapper
-from hed.models.base_input import BaseInput
-
-
-class SpreadsheetInput(BaseInput):
-    """ A spreadsheet of HED tags. """
-
-    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
-                 has_column_names=True, column_prefix_dictionary=None,
-                 name=None):
-        """Constructor for the SpreadsheetInput class.
-
-        Parameters:
-            file (str or file like): An xlsx/tsv file to open or a File object.
-            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv. data.
-            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
-                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
-            tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
-                If ints then column numbers with [1] indicating only the second column has tags.
-            has_column_names (bool): True if file has column names. Validation will skip over the first row.
-                first line of the file if the spreadsheet as column names.
-            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
-                values are HED tag prefixes to prepend to the tags in that column before processing.
-
-        Notes:
-            - If file is a string, file_type is derived from file and this parameter is ignored.
-            - column_prefix_dictionary may be deprecated/renamed. These are no longer prefixes,
-              but rather converted to value columns.
-              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
-                   {"key": "Description/#", 1: "Label/#"}
-                It will be a validation issue if column 1 is called "key" in the above example.
-                This means it no longer accepts anything but the value portion only in the columns.
-
-        :raises HedFileError:
-            - The file is blank.
-            - An invalid dataframe was passed with size 0.
-            - An invalid extension was provided.
-            - A duplicate or empty column name appears.
-            - Cannot open the indicated file.
-            - The specified worksheet name does not exist.
-        """
-
-        new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary,
-                                  warn_on_missing_column=False)
-
-        super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name)
+""" A spreadsheet of HED tags. """
+from hed.models.column_mapper import ColumnMapper
+from hed.models.base_input import BaseInput
+
+
+class SpreadsheetInput(BaseInput):
+    """ A spreadsheet of HED tags. """
+
+    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
+                 has_column_names=True, column_prefix_dictionary=None,
+                 name=None):
+        """Constructor for the SpreadsheetInput class.
+
+        Parameters:
+            file (str or file like): An xlsx/tsv file to open or a File object.
+            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv data.
+            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
+                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
+            tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
+                If ints then column numbers with [1] indicating only the second column has tags.
+            has_column_names (bool): True if file has column names. Validation will skip over the
+                first line of the file if the spreadsheet has column names.
+            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
+                values are HED tag prefixes to prepend to the tags in that column before processing.
+
+        Notes:
+            - If file is a string, file_type is derived from file and this parameter is ignored.
+            - column_prefix_dictionary may be deprecated/renamed. These are no longer prefixes,
+              but rather converted to value columns.
+              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
+                   {"key": "Description/#", 1: "Label/#"}
+                It will be a validation issue if column 1 is called "key" in the above example.
+                This means it no longer accepts anything but the value portion only in the columns.
+
+        :raises HedFileError:
+            - The file is blank.
+            - An invalid dataframe was passed with size 0.
+            - An invalid extension was provided.
+            - A duplicate or empty column name appears.
+            - Cannot open the indicated file.
+            - The specified worksheet name does not exist.
+        """
+
+        self.tag_columns = tag_columns
+        new_mapper = ColumnMapper(tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary,
+                                  warn_on_missing_column=False)
+
+        super().__init__(file, file_type, worksheet_name, has_column_names, new_mapper, name=name)
diff --git a/hed/validator/sidecar_validator.py b/hed/validator/sidecar_validator.py
index e0c2782ef..0a5fe8e88 100644
--- a/hed/validator/sidecar_validator.py
+++ b/hed/validator/sidecar_validator.py
@@ -244,7 +244,7 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler)
         """
         val_issues = []
         if column_name in self.reserved_column_names:
-            val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN)
+            val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED)
             return val_issues
 
         column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry, basic_validation=False)
diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py
index b219a11b4..882f80678 100644
--- a/hed/validator/spreadsheet_validator.py
+++ b/hed/validator/spreadsheet_validator.py
@@ -236,7 +236,6 @@ def _validate_column_structure(self, base_input, error_handler, row_adj):
             columns = base_input.columns
             for ref in column_refs:
                 if ref not in columns:
-                    issues += error_handler.format_error_with_context(ColumnErrors.INVALID_COLUMN_REF,
-                                                                      bad_ref=ref)
+                    issues += error_handler.format_error_with_context(ValidationErrors.TSV_COLUMN_MISSING, invalid_key=ref)
 
         return issues
diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py
index 0cc871615..d5ba1e3b5 100644
--- a/spec_tests/test_errors.py
+++ b/spec_tests/test_errors.py
@@ -24,7 +24,6 @@
 runAll = True
 runOnly = {}
 
-
 class MyTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py
index 6c9b08aca..2a63ae710 100644
--- a/tests/validator/test_spreadsheet_validator.py
+++ b/tests/validator/test_spreadsheet_validator.py
@@ -9,6 +9,7 @@
 from hed.validator import SpreadsheetValidator
 from hed import TabularInput, SpreadsheetInput, Sidecar
 from hed.errors.error_types import ValidationErrors
+from hed.errors.error_reporter import ErrorHandler
 
 
 class TestSpreadsheetValidation(unittest.TestCase):
@@ -96,6 +97,65 @@ def test_invalid_onset_invalid_column(self):
         self.assertEqual(len(issues), 1)
         self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)
 
+    def test_empty(self):
+        spreadsheet = SpreadsheetInput(file=io.StringIO("BadFile"), worksheet_name=None,
+                                       file_type=".tsv", tag_columns=[3],
+                                       has_column_names=True, column_prefix_dictionary=None,
+                                       name='spreadsheets.tsv')
+        error_handler = ErrorHandler(check_for_warnings=True)
+        issues = self.validator.validate(spreadsheet, error_handler=error_handler)
+        self.assertEqual(len(issues), 0)
+
+    def test_tabular_with_hed(self):
+        sidecar_hed_json = '''
+        {
+            "event_code": {
+                "HED": {
+                    "face": "{HED}",
+                    "ball": "Red"
+                }
+            }
+        }
+        '''
+        sidecar = Sidecar(io.StringIO(sidecar_hed_json))
+        issues = sidecar.validate(self.hed_schema)
+        self.assertEqual(len(issues), 0)
+        data = [
+            ["onset", "duration", "event_code", "HED"],
+            [4.5, 0, "face", "Black"],
+            [5.0, 0, "n/a", ""]
+        ]
+        df = pd.DataFrame(data[1:], columns=data[0])
+        my_tab = TabularInput(df, sidecar=sidecar, name='test_no_hed')
+        error_handler = ErrorHandler(check_for_warnings=False)
+        issues = self.validator.validate(my_tab, error_handler=error_handler)
+        self.assertEqual(len(issues), 0)
+
+    def test_tabular_no_hed(self):
+        sidecar_hed_json = '''
+        {
+            "event_code": {
+                "HED": {
+                    "face": "{HED}",
+                    "ball": "Red"
+                }
+            }
+        }
+        '''
+        sidecar = Sidecar(io.StringIO(sidecar_hed_json))
+        issues = sidecar.validate(self.hed_schema)
+        self.assertEqual(len(issues), 0)
+        data = [
+            ["onset", "duration", "event_code"],
+            [4.5, 0, "face"],
+            [5.0, 0, "ball"]
+        ]
+        df = pd.DataFrame(data[1:], columns=data[0])
+        my_tab = TabularInput(df, sidecar=sidecar, name='test_no_hed')
+        error_handler = ErrorHandler(check_for_warnings=False)
+        issues = self.validator.validate(my_tab, error_handler=error_handler)
+        self.assertEqual(len(issues), 0)
+
     def test_onset_na(self):
         # Test with no sidecar
         def_dict = "(Definition/Def1, (Event))"