Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hed/errors/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@ def val_error_hed_duplicate_column(column_name):
return f"Multiple columns have name {column_name}. This is not a fatal error, but discouraged."


@hed_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN, default_severity=ErrorSeverity.WARNING)
def val_error_hed_duplicate_column_number(column_name, column_number):
    """Warning text for a column registered both by name and by 0-based number."""
    message = (f"Column '{column_name}' added as a named column, "
               f"then also as numbered column {column_number}")
    return message


@hed_tag_error(ValidationErrors.HED_LIBRARY_UNMATCHED, actual_code=ValidationErrors.TAG_PREFIX_INVALID)
def val_error_unknown_prefix(tag, unknown_prefix, known_prefixes):
    """Error text for a tag whose schema prefix is not among the known prefixes.

    Fix: the original message had an unbalanced quote ("Tag '{tag} has ...") —
    the closing quote after the tag name was missing.
    """
    return f"Tag '{tag}' has unknown prefix '{unknown_prefix}'. Valid prefixes: {known_prefixes}"
Expand Down
1 change: 1 addition & 0 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class ValidationErrors:
HED_MISSING_REQUIRED_COLUMN = "HED_MISSING_REQUIRED_COLUMN"
HED_UNKNOWN_COLUMN = "HED_UNKNOWN_COLUMN"
HED_DUPLICATE_COLUMN = "HED_DUPLICATE_COLUMN"
DUPLICATE_NAME_NUMBER_COLUMN = "DUPLICATE_NAME_NUMBER_COLUMN"
HED_BLANK_COLUMN = "HED_BLANK_COLUMN"

# Below here shows what the given error maps to
Expand Down
35 changes: 19 additions & 16 deletions hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class ColumnMapper:
""" Mapping of a base input file columns into HED tags.

Notes:
- Functions and type_variables column and row indexing starts at 0.
- All column numbers are 0 based.
"""
def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None,
optional_tag_columns=None, requested_columns=None, warn_on_missing_column=False):
Expand All @@ -22,10 +22,12 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None
sidecar (Sidecar): A sidecar to gather column data from.
tag_columns: (list): A list of ints or strings containing the columns that contain the HED tags.
Sidecar column definitions will take precedent if there is a conflict with tag_columns.
column_prefix_dictionary (dict): Dictionary with keys that are column numbers and values are HED tag
column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag
prefixes to prepend to the tags in that column before processing.
May be deprecated. These are no longer prefixes, but rather converted to value columns.
eg. {"key": "Description"} will turn into a value column as {"key": "Description/#"}
May be deprecated/renamed. These are no longer prefixes, but rather converted to value columns.
eg. {"key": "Description", 1: "Label/"} will turn into value columns as
{"key": "Description/#", 1: "Label/#"}
Note: It will be a validation issue if column 1 is called "key" in the above example.
This means it no longer accepts anything but the value portion only in the columns.
optional_tag_columns (list): A list of ints or strings containing the columns that contain
the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags.
Expand All @@ -36,12 +38,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None

Notes:
- All column numbers are 0 based.

Examples:
column_prefix_dictionary = {3: 'Description/', 4: 'Label/'}

The third column contains tags that need Description/ tag prepended, while the fourth column
contains tag that needs Label/ prepended.
"""
# This points to column_type entries based on column names or indexes if columns have no column_name.
self.column_data = {}
Expand Down Expand Up @@ -79,9 +75,9 @@ def get_transformers(self):
assign_to_column = column.column_name
if isinstance(assign_to_column, int):
if self._column_map:
assign_to_column = self._column_map[assign_to_column - 1]
assign_to_column = self._column_map[assign_to_column]
else:
assign_to_column = assign_to_column - 1
assign_to_column = assign_to_column
if column.column_type == ColumnType.Ignore:
continue
elif column.column_type == ColumnType.Value:
Expand Down Expand Up @@ -154,7 +150,7 @@ def get_tag_columns(self):
column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags.
0-based if integer-based, otherwise column name.
"""
return [column_entry.column_name - 1 if isinstance(column_entry.column_name, int) else column_entry.column_name
return [column_entry.column_name if isinstance(column_entry.column_name, int) else column_entry.column_name
for number, column_entry in self._final_column_map.items()
if column_entry.column_type == ColumnType.HEDTags]

Expand Down Expand Up @@ -263,6 +259,7 @@ def _add_column_data(self, new_column_entry):
def _get_basic_final_map(column_map, column_data):
basic_final_map = {}
unhandled_names = {}
issues = []
if column_map:
for column_number, column_name in column_map.items():
if column_name is None:
Expand All @@ -277,11 +274,16 @@ def _get_basic_final_map(column_map, column_data):
unhandled_names[column_name] = column_number
for column_number in column_data:
if isinstance(column_number, int):
if column_number in basic_final_map:
issues += ErrorHandler.format_error(ValidationErrors.DUPLICATE_NAME_NUMBER_COLUMN,
column_name=basic_final_map[column_number].column_name,
column_number=column_number)
continue
column_entry = copy.deepcopy(column_data[column_number])
column_entry.column_name = column_number
basic_final_map[column_number] = column_entry

return basic_final_map, unhandled_names
return basic_final_map, unhandled_names, issues

@staticmethod
def _convert_to_indexes(name_to_column_map, column_list):
Expand Down Expand Up @@ -357,14 +359,15 @@ def _finalize_mapping(self):
# 2. Add any tag columns and note issues about missing columns
# 3. Add any numbered columns that have required prefixes
# 4. Filter to just requested columns, if any
final_map, unhandled_names = self._get_basic_final_map(self._column_map, self.column_data)
final_map, unhandled_names, issues = self._get_basic_final_map(self._column_map, self.column_data)

# convert all tag lists to indexes -> Issuing warnings at this time potentially for unknown ones
all_tag_columns, required_tag_columns, issues = self._convert_tag_columns(self._tag_columns,
all_tag_columns, required_tag_columns, tag_issues = self._convert_tag_columns(self._tag_columns,
self._optional_tag_columns,
self._requested_columns,
self._reverse_column_map)

issues += tag_issues
# Notes any missing required columns
issues += self._add_tag_columns(final_map, unhandled_names, all_tag_columns, required_tag_columns,
self._warn_on_missing_column)
Expand Down
16 changes: 8 additions & 8 deletions hed/models/spreadsheet_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N
worksheet_name (str or None): The name of the Excel workbook worksheet that contains the HED tags.
Not applicable to tsv files. If omitted for Excel, the first worksheet is assumed.
tag_columns (list): A list of ints containing the columns that contain the HED tags.
The default value is [2] indicating only the second column has tags.
The default value is [1] indicating only the second column has tags.
has_column_names (bool): True if file has column names. Validation will skip over the
first line of the file if the spreadsheet as column names.
column_prefix_dictionary (dict): A dictionary with column number keys and prefix values.
This is partially deprecated - what this now turns the given columns into Value columns.
Examples:
A prefix dictionary {3: 'Label/', 5: 'Description/'} indicates that column 3 and 5 have HED tags
that need to be prefixed by Label/ and Description/ respectively.
Column numbers 3 and 5 should also be included in the tag_columns list.

column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag
prefixes to prepend to the tags in that column before processing.
May be deprecated/renamed. These are no longer prefixes, but rather converted to value columns.
eg. {"key": "Description", 1: "Label/"} will turn into value columns as
{"key": "Description/#", 1: "Label/#"}
Note: It will be a validation issue if column 1 is called "key" in the above example.
This means it no longer accepts anything but the value portion only in the columns.
"""
if tag_columns is None:
tag_columns = [1]
Expand Down
1 change: 0 additions & 1 deletion hed/models/tabular_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def __init__(self, file=None, sidecar=None, name=None):
Parameters:
file (str or file like): A tsv file to open.
sidecar (str or Sidecar): A Sidecar filename or Sidecar
Note: If this is a string you MUST also pass hed_schema.
name (str): The name to display for this file for error purposes.
"""
if sidecar and not isinstance(sidecar, Sidecar):
Expand Down
Binary file not shown.
55 changes: 32 additions & 23 deletions tests/models/test_spreadsheet_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,6 @@ def setUpClass(cls):
"../data/validator_tests/ExcelMultipleSheets.xlsx")
cls.default_test_file_name = default
cls.generic_file_input = SpreadsheetInput(default)
cls.integer_key_dictionary = {1: 'one', 2: 'two', 3: 'three'}
cls.one_based_tag_columns = [1, 2, 3]
cls.zero_based_tag_columns = [0, 1, 2, 3, 4]
cls.zero_based_row_column_count = 3
cls.zero_based_tag_columns_less_than_row_column_count = [0, 1, 2]
cls.column_prefix_dictionary = {3: 'Event/Description/', 4: 'Event/Label/', 5: 'Event/Category/'}
cls.category_key = 'Event/Category/'
cls.category_participant_and_stimulus_tags = 'Event/Category/Participant response,Event/Category/Stimulus'
cls.category_tags = 'Participant response, Stimulus'
cls.row_with_hed_tags = ['event1', 'tag1', 'tag2']
base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/")
cls.base_output_folder = base_output
os.makedirs(base_output, exist_ok=True)
Expand All @@ -44,8 +34,8 @@ def tearDownClass(cls):
def test_all(self):
hed_input = self.default_test_file_name
has_column_names = True
column_prefix_dictionary = {2: 'Label', 3: 'Description'}
tag_columns = [4]
column_prefix_dictionary = {1: 'Label/', 2: 'Description'}
tag_columns = [3]
worksheet_name = 'LKT Events'

file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name,
Expand All @@ -58,6 +48,25 @@ def test_all(self):
# Just make sure this didn't crash for now
self.assertTrue(True)

def test_all2(self):
    # This should work, but raise one mapping issue because "Short label"
    # (a named column) and column 1 (a numbered column) overlap.
    hed_input = self.default_test_file_name
    has_column_names = True
    column_prefix_dictionary = {1: 'Label/', "Short label": 'Description'}
    tag_columns = [3]
    worksheet_name = 'LKT Events'

    file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name,
                                  tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary)

    self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame))
    self.assertTrue(isinstance(file_input.series_a, pd.Series))
    self.assertTrue(file_input.dataframe_a.size)
    # Bug fix: assertTrue(len(...), 1) treated the 1 as a failure *message*
    # and passed for any truthy length; assertEqual actually verifies that
    # exactly one column-mapping issue was reported.
    self.assertEqual(len(file_input._mapper.get_column_mapping_issues()), 1)

def test_file_as_string(self):
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/validator_tests/bids_events_no_index.tsv')
Expand Down Expand Up @@ -103,8 +112,8 @@ def test_to_excel(self):

def test_to_excel_should_work(self):
spreadsheet = SpreadsheetInput(file=self.default_test_file_name, file_type='.xlsx',
tag_columns=[4], has_column_names=True,
column_prefix_dictionary={1: 'Label/', 3: 'Description/'},
tag_columns=[3], has_column_names=True,
column_prefix_dictionary={1: 'Label/', 2: 'Description/'},
name='ExcelOneSheet.xlsx')
buffer = io.BytesIO()
spreadsheet.to_excel(buffer, output_assembled=True)
Expand Down Expand Up @@ -148,51 +157,51 @@ def test_loading_and_reset_mapper(self):
def test_no_column_header_and_convert(self):
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
hed_input.convert_to_long(self.hed_schema)

events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_long.tsv')
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2])
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe))

def test_convert_short_long_with_definitions(self):
# Verify behavior works as expected even if definitions are present
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
hed_input.convert_to_long(self.hed_schema)

events_path_long = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition_long.tsv')
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[1, 2])
hed_input_long = SpreadsheetInput(events_path_long, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input._dataframe.equals(hed_input_long._dataframe))

def test_definitions_identified(self):
# Todo: this test is no longer relevant
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])


def test_loading_dataframe_directly(self):
ds_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
ds = pd.read_csv(ds_path, delimiter="\t", header=None)
hed_input = SpreadsheetInput(ds, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(ds, has_column_names=False, tag_columns=[0, 1])

events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/no_column_header_definition.tsv')
hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input2 = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input._dataframe.equals(hed_input2._dataframe))

def test_ignoring_na_column(self):
events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../data/model_tests/na_tag_column.tsv')
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[1, 2])
hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1])
self.assertTrue(hed_input.dataframe_a.loc[1, 1] == 'n/a')

def test_ignoring_na_value_column(self):
Expand Down
37 changes: 36 additions & 1 deletion tests/validator/test_spreadsheet_validator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import pandas as pd
import os
import shutil

import unittest
from hed import BaseInput, load_schema_version
from hed import load_schema_version, load_schema
from hed.validator import SpreadsheetValidator
from hed import SpreadsheetInput
from hed.errors import ErrorHandler, sort_issues
from hed.errors.error_types import ColumnErrors

Expand All @@ -12,4 +16,35 @@ class TestInsertColumns(unittest.TestCase):
def setUpClass(cls):
cls.schema = load_schema_version("8.1.0")
cls.validator = SpreadsheetValidator(cls.schema)
base = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/')
cls.base_data_dir = base
hed_xml_file = os.path.join(base, "schema_tests/HED8.0.0t.xml")
cls.hed_schema = load_schema(hed_xml_file)
default = os.path.join(os.path.dirname(os.path.realpath(__file__)),
"../data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx")
cls.default_test_file_name = default
cls.generic_file_input = SpreadsheetInput(default)
base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/")
cls.base_output_folder = base_output
os.makedirs(base_output, exist_ok=True)

@classmethod
def tearDownClass(cls):
    # Remove the per-class output directory created in setUpClass so test
    # artifacts don't accumulate between runs.
    shutil.rmtree(cls.base_output_folder)

def test_basic_validate(self):
hed_input = self.default_test_file_name
has_column_names = True
column_prefix_dictionary = {1: 'Label/', 3: 'Description'}
tag_columns = [4]
worksheet_name = 'LKT 8HED3'

file_input = SpreadsheetInput(hed_input, has_column_names=has_column_names, worksheet_name=worksheet_name,
tag_columns=tag_columns, column_prefix_dictionary=column_prefix_dictionary)

self.assertTrue(isinstance(file_input.dataframe_a, pd.DataFrame))
self.assertTrue(isinstance(file_input.series_a, pd.Series))
self.assertTrue(file_input.dataframe_a.size)

issues = file_input.validate(self.schema)
self.assertTrue(len(issues), 1)