Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 21 additions & 37 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from hed.models.column_mapper import ColumnMapper
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.errors.error_reporter import ErrorHandler
import pandas as pd


class BaseInput:
Expand Down Expand Up @@ -67,7 +66,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T
elif not file:
raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file)
elif input_type in self.TEXT_EXTENSION:
self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, dtype=str)
self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header,
dtype=str, keep_default_na=True, na_values=None)
# Convert nan values to a known value
self._dataframe = self._dataframe.fillna("n/a")
elif input_type in self.EXCEL_EXTENSION:
self._loaded_workbook = openpyxl.load_workbook(file)
loaded_worksheet = self.get_worksheet(self._worksheet_name)
Expand Down Expand Up @@ -362,7 +364,7 @@ def assemble(self, mapper=None):
"""
if mapper is None:
mapper = self._mapper

import pandas as pd
transformers, need_categorical = mapper.get_transformers()
if not transformers:
return None
Expand All @@ -372,62 +374,44 @@ def assemble(self, mapper=None):

all_columns = all_columns.transform(transformers)

return self._insert_columns(all_columns, list(transformers.keys()))

@staticmethod
def _find_column_refs(df):
possible_column_references = [f"{column_name}" for column_name in self.columns if
column_name.lower() != "hed"]
found_column_references = []
for column_name in df:
df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE)
u_vals = pd.Series([j for i in df_temp for j in i], dtype=str)
for column_name in all_columns:
df = all_columns[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE)
u_vals = pd.Series([j for i in df for j in i], dtype=str)
u_vals = u_vals.unique()
for val in u_vals:
if val not in found_column_references:
found_column_references.append(val)

return found_column_references

@staticmethod
def _insert_columns(df, known_columns=None):
if known_columns is None:
known_columns = list(df.columns)
possible_column_references = [f"{column_name}" for column_name in df.columns if
column_name.lower() != "hed"]
found_column_references = BaseInput._find_column_refs(df)

invalid_replacements = [col for col in found_column_references if col not in possible_column_references]
if invalid_replacements:
# todo: This check may be moved to validation
raise ValueError(f"Bad column references found(columns do not exist): {invalid_replacements}")
valid_replacements = [col for col in found_column_references if col in possible_column_references]

# todo: break this into a sub function(probably)
column_names = known_columns
column_names = list(transformers.keys())
for column_name in valid_replacements:
column_names.remove(column_name)
saved_columns = df[valid_replacements]
saved_columns = all_columns[valid_replacements]
for column_name in column_names:
for replacing_name in valid_replacements:
column_name_brackets = f"[{replacing_name}]"
df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
in zip(df[column_name], saved_columns[replacing_name]))
df = df[column_names]
all_columns[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
in zip(all_columns[column_name], saved_columns[replacing_name]))
all_columns = all_columns[column_names]

return df
return all_columns

@staticmethod
def combine_dataframe(dataframe):
""" Combines all columns in the given dataframe into a single HED string series,
skipping empty columns and columns with empty strings.
""" Combines all columns in the given dataframe into a single hed string series.

Parameters:
dataframe(Dataframe): The dataframe to combine

Returns:
Series: the assembled series
"""
dataframe = dataframe.agg(
lambda x: ', '.join(filter(lambda e: pd.notna(e) and e != "", x)), axis=1
)
dataframe = dataframe.agg(', '.join, axis=1)

return dataframe
# Potentially better ways to handle removing n/a by never inserting them to begin with.
dataframe = dataframe.replace("(, n/a|n/a,)", "", regex=True)
return dataframe
1 change: 0 additions & 1 deletion hed/validator/spreadsheet_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None):
# Check the structure of the input data, if it's a BaseInput
if isinstance(data, BaseInput):
issues += self._validate_column_structure(data, error_handler)
# todo ian: Add more checks here for column inserters
data = data.dataframe_a

# Check the rows of the input data
Expand Down
103 changes: 103 additions & 0 deletions tests/models/test_base_file_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import unittest
import os
import shutil
from hed import Sidecar
from hed import BaseInput, TabularInput
from hed.models.column_mapper import ColumnMapper
from hed.models import DefinitionDict
from hed import schema

# TODO: Add tests for base_file_input and include correct handling of 'n/a'


class Test(unittest.TestCase):
    """Smoke tests for loading tabular input files with sidecars and definitions."""

    @classmethod
    def setUpClass(cls):
        # todo: clean up these unit tests/add more
        # All fixture files live under tests/data relative to this module.
        data_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../data/'))
        cls.base_data_dir = data_dir

        # A tabular events file paired with a sidecar that declares definitions.
        defs_sidecar = Sidecar(os.path.join(data_dir, "sidecar_tests/both_types_events_with_defs.json"))
        tabular_events_path = os.path.join(data_dir, '../data/validator_tests/bids_events_no_index.tsv')
        cls.tabular_file = TabularInput(tabular_events_path, sidecar=defs_sidecar)

        # Scratch output folder, removed again in tearDownClass.
        output_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/")
        cls.base_output_folder = output_dir
        os.makedirs(output_dir, exist_ok=True)

        this_dir = os.path.dirname(os.path.realpath(__file__))
        bids_root = os.path.realpath(os.path.join(this_dir,
                                                  '../data/bids_tests/eeg_ds003645s_hed'))
        cls.bids_root_path = bids_root
        hed_schema_path = os.path.realpath(os.path.join(this_dir,
                                                        '../data/schema_tests/HED8.0.0.xml'))
        bids_json_path = os.path.realpath(os.path.join(bids_root, 'task-FacePerception_events.json'))
        bids_events_path = os.path.realpath(os.path.join(bids_root,
                                                         'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv'))

        cls.hed_schema = schema.load_schema(hed_schema_path)

        # One BaseInput with an explicit mapper/sidecar, one with defaults.
        face_sidecar = Sidecar(bids_json_path, name='face_sub1_json')
        face_mapper = ColumnMapper(sidecar=face_sidecar, optional_tag_columns=['HED'], warn_on_missing_column=False)
        cls.input_data1 = BaseInput(bids_events_path, file_type='.tsv', has_column_names=True,
                                    name="face_sub1_events", mapper=face_mapper, allow_blank_names=False)
        cls.input_data2 = BaseInput(bids_events_path, file_type='.tsv', has_column_names=True,
                                    name="face_sub2_events")

    @classmethod
    def tearDownClass(cls):
        # Remove the scratch output created in setUpClass.
        shutil.rmtree(cls.base_output_folder)

    def test_gathered_defs(self):
        # todo: add unit tests for definitions in tsv file
        # NOTE(review): reaches into the private _sidecar attribute — confirm a public accessor exists.
        extracted = self.tabular_file._sidecar.extract_definitions(hed_schema=self.hed_schema)
        defs = DefinitionDict.get_as_strings(extracted)
        expected_defs = {
            'jsonfiledef': '(Item/JsonDef1/#,Item/JsonDef1)',
            'jsonfiledef2': '(Item/JsonDef2/#,Item/JsonDef2)',
            'jsonfiledef3': '(Item/JsonDef3/#)',
            'takesvaluedef': '(Age/#)',
            'valueclassdef': '(Acceleration/#)'
        }
        self.assertEqual(defs, expected_defs)

    # def test_missing_column_name_issue(self):
    #     schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                                '../data/validator_tests/bids_schema.mediawiki')
    #     events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                                '../data/validator_tests/bids_events_bad_column_name.tsv')
    #
    #     hed_schema = schema.load_schema(schema_path)
    #     json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                              "../data/validator_tests/bids_events.json")
    #     validator = HedValidator(hed_schema=hed_schema)
    #     sidecar = Sidecar(json_path)
    #     issues = sidecar.validate_entries(validator)
    #     self.assertEqual(len(issues), 0)
    #     input_file = TabularInput(events_path, sidecars=sidecar)
    #
    #     validation_issues = input_file.validate_sidecar(validator)
    #     self.assertEqual(len(validation_issues), 0)
    #     validation_issues = input_file.validate_file(validator, check_for_warnings=True)
    #     self.assertEqual(len(validation_issues), 1)
    #
    # def test_expand_column_issues(self):
    #     schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                                '../data/validator_tests/bids_schema.mediawiki')
    #     events_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                                '../data/validator_tests/bids_events_bad_category_key.tsv')
    #
    #     hed_schema = schema.load_schema(schema_path)
    #     json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
    #                              "../data/validator_tests/bids_events.json")
    #     validator = HedValidator(hed_schema=hed_schema)
    #     sidecar = Sidecar(json_path)
    #     issues = sidecar.validate_entries(validator)
    #     self.assertEqual(len(issues), 0)
    #     input_file = TabularInput(events_path, sidecars=sidecar)
    #
    #     validation_issues = input_file.validate_sidecar(validator)
    #     self.assertEqual(len(validation_issues), 0)
    #     validation_issues = input_file.validate_file(validator, check_for_warnings=True)
    #     self.assertEqual(len(validation_issues), 1)


# Allow running this test module directly (e.g. `python test_base_file_input.py`).
if __name__ == '__main__':
    unittest.main()
Loading