Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions hed/errors/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ def val_error_sidecar_key_missing(invalid_key, category_keys):
return f"Category key '{invalid_key}' does not exist in column. Valid keys are: {category_keys}"


@hed_error(ValidationErrors.TSV_COLUMN_MISSING, actual_code=ValidationErrors.SIDECAR_KEY_MISSING,
           default_severity=ErrorSeverity.WARNING)
def val_error_tsv_column_missing(invalid_key):
    """Build the message for a sidecar key that is not a column of the tabular file.

    Parameters:
        invalid_key (str): The key used in the sidecar that has no matching column.

    Returns:
        str: The message, with the offending key shown inside curly braces.
    """
    # Report the key that was actually passed in. The text previously hard-coded
    # {HED}, so every missing reference was reported as HED regardless of its name.
    return f"{{{invalid_key}}} is used as a key in a sidecar but does not appear as a column in the tabular file"


@hed_tag_error(ValidationErrors.HED_DEF_EXPAND_INVALID, actual_code=ValidationErrors.DEF_EXPAND_INVALID)
def val_error_bad_def_expand(tag, actual_def, found_def):
return f"A data-recording's Def-expand tag does not match the given definition." \
Expand Down Expand Up @@ -314,11 +320,6 @@ def sidecar_hed_used():
return "'HED' is a reserved name and cannot be used as a sidecar except in expected places."


@hed_error(SidecarErrors.SIDECAR_HED_USED_COLUMN, actual_code=ValidationErrors.SIDECAR_INVALID)
def sidecar_hed_used_column():
    """Return the message stating that 'HED' cannot be a sidecar column name.

    Returns:
        str: The fixed error text.
    """
    message = "'HED' is a reserved name and cannot be used as a sidecar column name"
    return message


@hed_error(SidecarErrors.SIDECAR_NA_USED, actual_code=ValidationErrors.SIDECAR_INVALID)
def sidecar_na_used(column_name):
    """Return the message for an 'n/a' category key found in a column.

    Parameters:
        column_name (str): Name of the column in which the 'n/a' key was found.

    Returns:
        str: The error text naming the column.
    """
    message = f"Invalid category key 'n/a' found in column {column_name}."
    return message
Expand Down
5 changes: 3 additions & 2 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class ValidationErrors:
REQUIRED_TAG_MISSING = 'REQUIRED_TAG_MISSING'
SIDECAR_INVALID = 'SIDECAR_INVALID'
SIDECAR_KEY_MISSING = 'SIDECAR_KEY_MISSING'
HED_COLUMN_MISSING = 'HED_COLUMN_MISSING'
STYLE_WARNING = "STYLE_WARNING"
TAG_EMPTY = 'TAG_EMPTY'
TAG_EXPRESSION_REPEATED = 'TAG_EXPRESSION_REPEATED'
Expand Down Expand Up @@ -96,6 +97,7 @@ class ValidationErrors:
HED_PLACEHOLDER_OUT_OF_CONTEXT = 'HED_PLACEHOLDER_OUT_OF_CONTEXT'
CURLY_BRACE_UNSUPPORTED_HERE = 'CURLY_BRACE_UNSUPPORTED_HERE'
ONSETS_UNORDERED = "ONSETS_UNORDERED"
TSV_COLUMN_MISSING="TSV_COLUMN_MISSING"


class SidecarErrors:
Expand All @@ -105,9 +107,8 @@ class SidecarErrors:
INVALID_POUND_SIGNS_VALUE = 'invalidNumberPoundSigns'
INVALID_POUND_SIGNS_CATEGORY = 'tooManyPoundSigns'
UNKNOWN_COLUMN_TYPE = 'sidecarUnknownColumn'
SIDECAR_HED_USED_COLUMN = 'sidecar_hed_used_column'
SIDECAR_HED_USED = 'SIDECAR_HED_USED'
SIDECAR_NA_USED = 'SIDECAR_NA_USED'
SIDECAR_HED_USED = 'sidecar_hed_used'
SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"


Expand Down
1 change: 0 additions & 1 deletion hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
# Maps column number to column_entry. This is what's actually used by most code.
self._final_column_map = {}
self._no_mapping_info = True

self._column_map = {}
self._reverse_column_map = {}
self._warn_on_missing_column = warn_on_missing_column
Expand Down
13 changes: 9 additions & 4 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,20 +180,25 @@ def _handle_curly_braces_refs(df, refs, column_names):
modified_df(pd.DataFrame): The modified dataframe with refs replaced
"""
# Filter out columns and refs that don't exist.
refs = [ref for ref in refs if ref in column_names]
remaining_columns = [column for column in column_names if column not in refs]
refs_new = [ref for ref in refs if ref in column_names]
remaining_columns = [column for column in column_names if column not in refs_new]

new_df = df.copy()
# Replace references in the columns we are saving out.
saved_columns = new_df[refs]
saved_columns = new_df[refs_new]
for column_name in remaining_columns:
for replacing_name in refs:
for replacing_name in refs_new:
# If the data has no n/a values, this version is MUCH faster.
# column_name_brackets = f"{{{replacing_name}}}"
# df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
# in zip(df[column_name], saved_columns[replacing_name]))
new_df[column_name] = pd.Series(replace_ref(x, f"{{{replacing_name}}}", y) for x, y
in zip(new_df[column_name], saved_columns[replacing_name]))
# Handle the special case of {HED} when the tsv file has no {HED} column
if 'HED' in refs and 'HED' not in column_names:
for column_name in remaining_columns:
new_df[column_name] =\
pd.Series(replace_ref(x, "{HED}", "n/a") for x in new_df[column_name])
new_df = new_df[remaining_columns]

return new_df
Expand Down
95 changes: 48 additions & 47 deletions hed/models/spreadsheet_input.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,48 @@
""" A spreadsheet of HED tags. """
from hed.models.column_mapper import ColumnMapper
from hed.models.base_input import BaseInput


class SpreadsheetInput(BaseInput):
    """ A spreadsheet of HED tags. """

    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
                 has_column_names=True, column_prefix_dictionary=None,
                 name=None):
        """Constructor for the SpreadsheetInput class.

        Parameters:
            file (str or file like): An xlsx/tsv file to open or a File object.
            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv data.
            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
            tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
                If ints then column numbers with [1] indicating only the second column has tags.
            has_column_names (bool): True if the file has column names.
                Validation will skip over the first line of the file if the spreadsheet has column names.
            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
                values are HED tag prefixes to prepend to the tags in that column before processing.
            name (str or None): Forwarded unchanged to the BaseInput constructor as its name argument.

        Notes:
            - If file is a string, file_type is derived from file and this parameter is ignored.
            - column_prefix_dictionary may be deprecated/renamed. These are no longer prefixes,
              but rather converted to value columns.
              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
              {"key": "Description/#", 1: "Label/#"}
              It will be a validation issue if column 1 is called "key" in the above example.
              This means it no longer accepts anything but the value portion only in the columns.

        :raises HedFileError:
            - The file is blank.
            - An invalid dataframe was passed with size 0.
            - An invalid extension was provided.
            - A duplicate or empty column name appears.
            - Cannot open the indicated file.
            - The specified worksheet name does not exist.
        """
        # The mapper is built with warn_on_missing_column=False and handed to the base class.
        column_mapper = ColumnMapper(
            tag_columns=tag_columns,
            column_prefix_dictionary=column_prefix_dictionary,
            warn_on_missing_column=False,
        )
        super().__init__(file, file_type, worksheet_name, has_column_names, column_mapper, name=name)
""" A spreadsheet of HED tags. """
from hed.models.column_mapper import ColumnMapper
from hed.models.base_input import BaseInput


class SpreadsheetInput(BaseInput):
    """ A spreadsheet of HED tags. """

    def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=None,
                 has_column_names=True, column_prefix_dictionary=None,
                 name=None):
        """Constructor for the SpreadsheetInput class.

        Parameters:
            file (str or file like): An xlsx/tsv file to open or a File object.
            file_type (str or None): ".xlsx" for Excel, ".tsv" or ".txt" for tsv data.
            worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
                Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
            tag_columns (list): A list of ints or strs containing the columns that contain the HED tags.
                If ints then column numbers with [1] indicating only the second column has tags.
                The value is also kept on the instance as ``tag_columns``.
            has_column_names (bool): True if the file has column names.
                Validation will skip over the first line of the file if the spreadsheet has column names.
            column_prefix_dictionary (dict or None): Dictionary with keys that are column numbers/names and
                values are HED tag prefixes to prepend to the tags in that column before processing.
            name (str or None): Forwarded unchanged to the BaseInput constructor as its name argument.

        Notes:
            - If file is a string, file_type is derived from file and this parameter is ignored.
            - column_prefix_dictionary may be deprecated/renamed. These are no longer prefixes,
              but rather converted to value columns.
              e.g. {"key": "Description", 1: "Label/"} will turn into value columns as
              {"key": "Description/#", 1: "Label/#"}
              It will be a validation issue if column 1 is called "key" in the above example.
              This means it no longer accepts anything but the value portion only in the columns.

        :raises HedFileError:
            - The file is blank.
            - An invalid dataframe was passed with size 0.
            - An invalid extension was provided.
            - A duplicate or empty column name appears.
            - Cannot open the indicated file.
            - The specified worksheet name does not exist.
        """
        # Stored before the base constructor runs, matching the original statement order.
        self.tag_columns = tag_columns

        # The mapper is built with warn_on_missing_column=False and handed to the base class.
        column_mapper = ColumnMapper(
            tag_columns=tag_columns,
            column_prefix_dictionary=column_prefix_dictionary,
            warn_on_missing_column=False,
        )
        super().__init__(file, file_type, worksheet_name, has_column_names, column_mapper, name=name)
2 changes: 1 addition & 1 deletion hed/validator/sidecar_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _validate_column_structure(self, column_name, dict_for_entry, error_handler)
"""
val_issues = []
if column_name in self.reserved_column_names:
val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED_COLUMN)
val_issues += error_handler.format_error_with_context(SidecarErrors.SIDECAR_HED_USED)
return val_issues

column_type = ColumnMetadata._detect_column_type(dict_for_entry=dict_for_entry, basic_validation=False)
Expand Down
3 changes: 1 addition & 2 deletions hed/validator/spreadsheet_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,6 @@ def _validate_column_structure(self, base_input, error_handler, row_adj):
columns = base_input.columns
for ref in column_refs:
if ref not in columns:
issues += error_handler.format_error_with_context(ColumnErrors.INVALID_COLUMN_REF,
bad_ref=ref)
issues += error_handler.format_error_with_context(ValidationErrors.TSV_COLUMN_MISSING, invalid_key=ref)

return issues
1 change: 0 additions & 1 deletion spec_tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
runAll = True
runOnly = {}


class MyTestCase(unittest.TestCase):
@classmethod
def setUpClass(cls):
Expand Down
60 changes: 60 additions & 0 deletions tests/validator/test_spreadsheet_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from hed.validator import SpreadsheetValidator
from hed import TabularInput, SpreadsheetInput, Sidecar
from hed.errors.error_types import ValidationErrors
from hed.errors.error_reporter import ErrorHandler


class TestSpreadsheetValidation(unittest.TestCase):
Expand Down Expand Up @@ -96,6 +97,65 @@ def test_invalid_onset_invalid_column(self):
self.assertEqual(len(issues), 1)
self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)

def test_empty(self):
    # A tsv whose entire content is the text "BadFile" should validate with no issues
    # when the error handler is created with check_for_warnings=True.
    tsv_stream = io.StringIO("BadFile")
    spreadsheet = SpreadsheetInput(
        file=tsv_stream,
        worksheet_name=None,
        file_type=".tsv",
        tag_columns=[3],
        has_column_names=True,
        column_prefix_dictionary=None,
        name='spreadsheets.tsv',
    )
    handler = ErrorHandler(check_for_warnings=True)
    issues = self.validator.validate(spreadsheet, error_handler=handler)
    self.assertEqual(len(issues), 0)

def test_tabular_with_hed(self):
    # The sidecar references {HED} and the tabular data does have a HED column,
    # so neither the sidecar nor the tabular input should produce issues.
    sidecar_hed_json = '''
{
"event_code": {
"HED": {
"face": "{HED}",
"ball": "Red"
}
}
}
'''
    sidecar = Sidecar(io.StringIO(sidecar_hed_json))
    sidecar_issues = sidecar.validate(self.hed_schema)
    self.assertEqual(len(sidecar_issues), 0)

    header = ["onset", "duration", "event_code", "HED"]
    rows = [
        [4.5, 0, "face", "Black"],
        [5.0, 0, "n/a", ""],
    ]
    events = pd.DataFrame(rows, columns=header)
    tabular = TabularInput(events, sidecar=sidecar, name='test_no_hed')

    handler = ErrorHandler(check_for_warnings=False)
    tabular_issues = self.validator.validate(tabular, error_handler=handler)
    self.assertEqual(len(tabular_issues), 0)

def test_tabular_no_hed(self):
    # The sidecar references {HED} but the tabular data has no HED column.
    # With check_for_warnings=False the validator is expected to report no issues.
    sidecar_hed_json = '''
{
"event_code": {
"HED": {
"face": "{HED}",
"ball": "Red"
}
}
}
'''
    sidecar = Sidecar(io.StringIO(sidecar_hed_json))
    issues = sidecar.validate(self.hed_schema)
    # Previously computed but never checked; the same sidecar is asserted clean
    # in test_tabular_with_hed, so the precondition is pinned here too.
    self.assertEqual(len(issues), 0)
    data = [
        ["onset", "duration", "event_code"],
        [4.5, 0, "face"],
        [5.0, 0, "ball"]
    ]
    df = pd.DataFrame(data[1:], columns=data[0])
    my_tab = TabularInput(df, sidecar=sidecar, name='test_no_hed')
    error_handler = ErrorHandler(check_for_warnings=False)
    issues = self.validator.validate(my_tab, error_handler=error_handler)
    # The leftover debugging print of the issues list was removed.
    self.assertEqual(len(issues), 0)

def test_onset_na(self):
# Test with no sidecar
def_dict = "(Definition/Def1, (Event))"
Expand Down