Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion hed/errors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .error_reporter import ErrorHandler, get_printable_issue_string, sort_issues
from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, ValidationErrors
from .error_types import DefinitionErrors, OnsetErrors, SchemaErrors, SchemaWarnings, SidecarErrors, \
ValidationErrors, ColumnErrors
from .error_types import ErrorContext, ErrorSeverity
from .exceptions import HedExceptions, HedFileError
12 changes: 6 additions & 6 deletions hed/errors/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,23 +401,23 @@ def onset_wrong_placeholder(tag, has_placeholder):
return f"Onset/offset def tag {tag} should not have a placeholder, but has one."


@hed_error(ColumnErrors.INVALID_COLUMN_REF)
def invalid_column_ref(bad_refs):
return f"Bad column references found(columns do not exist): {bad_refs}"
@hed_error(ColumnErrors.INVALID_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def invalid_column_ref(bad_ref):
    """ Build the message for a column reference that names an unknown column.

    Parameters:
        bad_ref (str): The referenced column name that could not be found.

    Returns:
        str: The error message.
    """
    # The previous message ended with a stray single quote after the period.
    return f"The column '{bad_ref}' is unknown."


@hed_error(ColumnErrors.SELF_COLUMN_REF)
@hed_error(ColumnErrors.SELF_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def self_column_ref(self_ref):
return f"Column references itself: {self_ref}"


@hed_error(ColumnErrors.NESTED_COLUMN_REF)
@hed_error(ColumnErrors.NESTED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def nested_column_ref(column_name, ref_column):
return f"Column {column_name} has a nested reference to {ref_column}. " \
f"Column reference columns cannot contain other column references."


@hed_error(ColumnErrors.MALFORMED_COLUMN_REF)
@hed_error(ColumnErrors.MALFORMED_COLUMN_REF, actual_code=SidecarErrors.SIDECAR_BRACES_INVALID)
def malformed_column_ref(column_name, index, symbol):
    """ Build the message for a malformed column reference.

    This handler used to be named nested_column_ref, which silently shadowed the
    NESTED_COLUMN_REF handler of the same name defined just above it.

    Parameters:
        column_name: The column containing the malformed reference.
        index: The index at which the improper symbol was found.
        symbol: The improper symbol.

    Returns:
        str: The error message.
    """
    return f"Column {column_name} has a malformed column reference. " \
           f"Improper symbol {symbol} found at index {index}."

Expand Down
3 changes: 1 addition & 2 deletions hed/errors/error_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,10 +396,9 @@ def val_error_unknown(*args, **kwargs):

Returns:
str: The error message.
dict: The extra args.

"""
return f"Unknown error. Args: {str(args)}", kwargs
return f"Unknown error. Args: {str(args), str(kwargs)}"

@staticmethod
def filter_issues_by_severity(issues_list, severity):
Expand Down
1 change: 1 addition & 0 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class SidecarErrors:
SIDECAR_HED_USED_COLUMN = 'SIDECAR_HED_USED_COLUMN'
SIDECAR_NA_USED = 'SIDECAR_NA_USED'
SIDECAR_HED_USED = 'SIDECAR_HED_USED'
SIDECAR_BRACES_INVALID = "SIDECAR_BRACES_INVALID"


class SchemaErrors:
Expand Down
117 changes: 75 additions & 42 deletions hed/models/base_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,9 @@ def columns(self):
Empty if no column names.

Returns:
columns(dict): The column number:name pairs
columns(list): the column names
"""
columns = {}
columns = []
if self._dataframe is not None and self._has_column_names:
columns = list(self._dataframe.columns)
return columns
Expand Down Expand Up @@ -354,24 +354,25 @@ def _dataframe_has_names(dataframe):
return True
return False

def assemble(self, mapper=None, skip_square_brackets=False):
def assemble(self, mapper=None, skip_curly_braces=False):
""" Assembles the hed strings

Parameters:
mapper(ColumnMapper or None): Generally pass none here unless you want special behavior.
skip_square_brackets (bool): If True, don't plug in square bracket values into columns.
skip_curly_braces (bool): If True, don't plug in curly brace values into columns.
Returns:
Dataframe: the assembled dataframe
"""
if mapper is None:
mapper = self._mapper

all_columns = self._handle_transforms(mapper)
if skip_square_brackets:
if skip_curly_braces:
return all_columns
transformers, _ = mapper.get_transformers()

return self._handle_square_brackets(all_columns, list(transformers))
refs = self.get_column_refs()
column_names = list(transformers)
return self._handle_curly_braces_refs(all_columns, refs, column_names)

def _handle_transforms(self, mapper):
transformers, need_categorical = mapper.get_transformers()
Expand All @@ -390,45 +391,67 @@ def _handle_transforms(self, mapper):
return all_columns

@staticmethod
def _find_column_refs(df, column_names):
found_column_references = []
for column_name in column_names:
df_temp = df[column_name].str.findall("\[([a-z_\-0-9]+)\]", re.IGNORECASE)
u_vals = pd.Series([j for i in df_temp if isinstance(i, list) for j in i], dtype=str)
u_vals = u_vals.unique()
for val in u_vals:
if val not in found_column_references:
found_column_references.append(val)

return found_column_references
def _replace_ref(text, newvalue, column_ref):
    """ Replace the curly-brace reference to column_ref in text with newvalue.

    If newvalue is "n/a" the reference is removed instead, together with the
    comma and/or parentheses that would otherwise be left dangling.

    Note: This function could easily be updated to handle non-curly brace values, but it's faster this way.

    Parameters:
        text (str): The input string containing the ref enclosed in curly braces.
        newvalue (str): The replacement value for the ref.
        column_ref (str): The ref to be replaced, without curly braces.

    Returns:
        str: The modified string with the ref replaced or removed.
    """
    # If it's not n/a, we can just replace directly.
    if newvalue != "n/a":
        return text.replace(f"{{{column_ref}}}", newvalue)

    def _remover(match):
        opening = match.group("p1").count("(")
        closing = match.group("p2").count(")")
        if opening > closing:
            # More starting parens than ending: keep the comma before and the unmatched "(".
            return match.group("c1") + "(" * (opening - closing)
        if closing > opening:
            # More ending parens than starting: keep the unmatched ")" and the comma after.
            return ")" * (closing - opening) + match.group("c2")
        # Balanced parens vanish with the ref.  Of the two surrounding separators only
        # the trailing one survives, and only when there was a leading one as well.
        return match.group("c2") if match.group("c1") else ""

    # This finds all surrounding commas and parentheses to a reference.
    #   c1/c2 contain the comma (and possibly spaces) separating this ref from other tags
    #   p1/p2 contain the parentheses directly surrounding the tag
    # All four groups can have spaces.
    # The column name is escaped so names containing regex metacharacters
    # (e.g. "a+b" or "a.b") are matched literally.
    pattern = r'(?P<c1>[\s,]*)(?P<p1>[(\s]*)\{' + re.escape(column_ref) + r'\}(?P<p2>[\s)]*)(?P<c2>[\s,]*)'
    return re.sub(pattern, _remover, text)

If known columns is passed, only use those columns to find or replace references.
@staticmethod
def _handle_curly_braces_refs(df, refs, column_names):
"""
if known_columns is not None:
column_names = list(known_columns)
else:
column_names = list(df.columns)
possible_column_references = [f"{column_name}" for column_name in column_names if
isinstance(column_name, str) and column_name.lower() != "hed"]
found_column_references = BaseInput._find_column_refs(df, column_names)

valid_replacements = [col for col in found_column_references if col in possible_column_references]

# todo: break this into a sub function(probably)
for column_name in valid_replacements:
column_names.remove(column_name)
saved_columns = df[valid_replacements]
for column_name in column_names:
for replacing_name in valid_replacements:
column_name_brackets = f"[{replacing_name}]"
df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
Plug in curly braces with other columns
"""
# Filter out columns and refs that don't exist.
refs = [ref for ref in refs if ref in column_names]
remaining_columns = [column for column in column_names if column not in refs]

# Replace references in the columns we are saving out.
saved_columns = df[refs]
for column_name in remaining_columns:
for replacing_name in refs:
# If the data has no n/a values, this version is MUCH faster.
# column_name_brackets = f"{{{replacing_name}}}"
# df[column_name] = pd.Series(x.replace(column_name_brackets, y) for x, y
# in zip(df[column_name], saved_columns[replacing_name]))
df[column_name] = pd.Series(BaseInput._replace_ref(x, y, replacing_name) for x, y
in zip(df[column_name], saved_columns[replacing_name]))
df = df[column_names]
df = df[remaining_columns]

return df

Expand Down Expand Up @@ -462,4 +485,14 @@ def get_def_dict(self, hed_schema=None, extra_def_dicts=None):
DefinitionDict: A single definition dict representing all the data(and extra def dicts)
"""
from hed.models.definition_dict import DefinitionDict
return DefinitionDict(extra_def_dicts, hed_schema)
return DefinitionDict(extra_def_dicts, hed_schema)

def get_column_refs(self):
    """ Return the column refs used by this file.

    This default implementation has nothing to report and always gives back
    a new empty list.

    Returns:
        column_refs(list): A list of unique column refs found
    """
    return list()
6 changes: 3 additions & 3 deletions hed/models/column_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ def get_transformers(self):
if column.column_type == ColumnType.Ignore:
continue
elif column.column_type == ColumnType.Value:
value_str = column._hed_dict
value_str = column.hed_dict
from functools import partial
final_transformers[assign_to_column] = partial(self._value_handler, value_str)
elif column.column_type == ColumnType.Categorical:
need_categorical.append(column.column_name)
category_values = column._hed_dict
category_values = column.hed_dict
from functools import partial
final_transformers[assign_to_column] = partial(self._category_handler, category_values)
else:
Expand Down Expand Up @@ -243,7 +243,7 @@ def _add_value_columns(self, column_prefix_dictionary):
prefix = prefix + "#"
else:
prefix = prefix + "/#"
new_def = ColumnMetadata(ColumnType.Value, col, hed_dict=prefix)
new_def = ColumnMetadata(ColumnType.Value, col, source=prefix)
self._add_column_data(new_def)

def _add_column_data(self, new_column_entry):
Expand Down
96 changes: 79 additions & 17 deletions hed/models/column_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from enum import Enum
from hed.errors.error_types import SidecarErrors
import pandas as pd


class ColumnType(Enum):
Expand All @@ -21,30 +22,20 @@ class ColumnType(Enum):
class ColumnMetadata:
""" Column in a ColumnMapper. """

def __init__(self, column_type=None, name=None, hed_dict=None, column_prefix=None):
def __init__(self, column_type=None, name=None, source=None):
""" A single column entry in the column mapper.

Parameters:
column_type (ColumnType or None): How to treat this column when reading data.
name (str, int, or None): The column_name or column number identifying this column.
If name is a string, you'll need to use a column map to set the number later.
hed_dict (dict or str or None): The loaded data (usually from json) for the given def
For category columns, this is a dict.
For value columns, it's a string.
column_prefix (str or None): If present, prepend the given column_prefix to all hed tags in the columns.
Only works on ColumnType HedTags.

Notes:
- Each column from which data is retrieved must have a ColumnMetadata representing its contents.
- The column_prefix dictionaries are used when the column is processed.
source (dict or str or None): Either the entire loaded json sidecar or a single HED string
"""
if hed_dict is None:
hed_dict = {}

self.column_type = column_type
self.column_name = name
self.column_prefix = column_prefix
self._hed_dict = hed_dict
self._source = source
if column_type is None:
column_type = self._detect_column_type(self.source_dict)
self.column_type = column_type

@property
def hed_dict(self):
Expand All @@ -54,7 +45,78 @@ def hed_dict(self):
dict or str: A string or dict of strings for this column

"""
return self._hed_dict
if self._source is None or isinstance(self._source, str):
return self._source
return self._source[self.column_name].get("HED", {})

@property
def source_dict(self):
    """ The raw dict for this entry.

    Returns:
        dict: The source's entry for this column name.  When the source is a
            single string (or None) it is wrapped as {"HED": source} instead.
    """
    source = self._source
    if isinstance(source, str) or source is None:
        return {"HED": source}
    return source[self.column_name]

def get_hed_strings(self):
    """ Return the HED strings for this column as a series.

    Returns:
        pd.Series: A string series built from hed_dict, or an empty string
            series when this column has no column type.
    """
    if self.column_type:
        return pd.Series(self.hed_dict, dtype=str)

    return pd.Series(dtype=str)

def set_hed_strings(self, new_strings):
    """ Store new HED strings for this column in the underlying source.

    Parameters:
        new_strings (pd.Series, dict, str, or None): The new HED string(s).
            A Series is converted to a dict for categorical columns; for any
            other column type only its first element is used.

    Returns:
        bool: True if the source was updated, False if nothing was set
            (no strings given, no column type, or an empty Series for a
            non-categorical column).
    """
    if new_strings is None or not self.column_type:
        return False

    if isinstance(new_strings, pd.Series):
        if self.column_type == ColumnType.Categorical:
            new_strings = new_strings.to_dict()
        elif new_strings.empty:
            # There is no first element to take; iloc[0] would raise IndexError.
            return False
        else:
            new_strings = new_strings.iloc[0]

    # NOTE(review): this indexes the source by column name, so it assumes the source
    # is a dict of column entries - a plain string or None source would raise here.
    self._source[self.column_name]["HED"] = new_strings

    return True

@staticmethod
def _detect_column_type(dict_for_entry):
    """ Work out the ColumnType for one loaded json entry.

    Parameters:
        dict_for_entry (dict): The loaded json entry for a specific column.
            Generally has a "HED" entry among other optional ones.

    Returns:
        ColumnType: Ignore when the entry is empty, not a dict, or has no "HED" key.
            Categorical when "HED" is a dict whose values are all strings.
            Value when "HED" is a string containing "#".
            None for anything else (unknown).

    """
    if not (dict_for_entry and isinstance(dict_for_entry, dict)):
        return ColumnType.Ignore

    if "HED" not in dict_for_entry:
        return ColumnType.Ignore

    hed_entry = dict_for_entry["HED"]
    if isinstance(hed_entry, dict):
        # A single non-string value makes the whole entry unknown.
        for entry in hed_entry.values():
            if not isinstance(entry, str):
                return None
        return ColumnType.Categorical

    if isinstance(hed_entry, str) and "#" in hed_entry:
        return ColumnType.Value

    return None

@staticmethod
def expected_pound_sign_count(column_type):
Expand Down
3 changes: 0 additions & 3 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,14 @@ def expand_defs(df, hed_schema, def_dict, columns=None):


def _convert_to_form(hed_string, hed_schema, tag_form):
from hed import HedString
return str(HedString(hed_string, hed_schema).get_as_form(tag_form))


def _shrink_defs(hed_string, hed_schema):
from hed import HedString
return str(HedString(hed_string, hed_schema).shrink_defs())


def _expand_defs(hed_string, hed_schema, def_dict):
from hed import HedString
return str(HedString(hed_string, hed_schema, def_dict).expand_defs())


Expand Down
Loading