diff --git a/hed/__init__.py b/hed/__init__.py index e2bdcd053..e9026996d 100644 --- a/hed/__init__.py +++ b/hed/__init__.py @@ -12,7 +12,7 @@ from hed.schema.hed_schema import HedSchema from hed.schema.hed_schema_group import HedSchemaGroup -from hed.schema.hed_schema_io import get_schema, get_schema_versions, load_schema, load_schema_version +from hed.schema.hed_schema_io import get_schema, load_schema, load_schema_version # from hed import errors, models, schema, tools, validator diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index a8c5dd170..3591bae83 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -245,7 +245,7 @@ def schema_error_hed_duplicate_node(tag, duplicate_tag_list, section): f"{tag_join_delimiter}{tag_join_delimiter.join(duplicate_tag_list)}" -@hed_error(SchemaErrors.HED_SCHEMA_ATTRIBUTE_INVALID) +@hed_error(SchemaErrors.SCHEMA_ATTRIBUTE_INVALID) def schema_error_unknown_attribute(attribute_name, source_tag): return f"Attribute '{attribute_name}' used by '{source_tag}' was not defined in the schema, " \ f"or was used outside of it's defined class." diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index fc3aa3788..18418a4f2 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -107,7 +107,7 @@ class SidecarErrors: class SchemaErrors: HED_SCHEMA_DUPLICATE_NODE = 'HED_SCHEMA_DUPLICATE_NODE' - HED_SCHEMA_ATTRIBUTE_INVALID = 'HED_SCHEMA_ATTRIBUTE_INVALID' + SCHEMA_ATTRIBUTE_INVALID = 'SCHEMA_ATTRIBUTE_INVALID' HED_SCHEMA_DUPLICATE_FROM_LIBRARY = "SCHEMA_LIBRARY_INVALID" diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 6b2787bd0..f0c96eaaf 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -34,11 +34,21 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T has_column_names (bool): True if file has column names. This value is ignored if you pass in a pandas dataframe. 
mapper (ColumnMapper or None): Indicates which columns have HED tags. + See SpreadsheetInput or TabularInput for examples of how to use built-in a ColumnMapper. name (str or None): Optional field for how this file will report errors. allow_blank_names(bool): If True, column names can be blank - Notes: - - See SpreadsheetInput or TabularInput for examples of how to use built-in a ColumnMapper. + :raises HedFileError: + - file is blank + - An invalid dataframe was passed with size 0 + - An invalid extension was provided + - A duplicate or empty column name appears + + :raises OSError: + - Cannot open the indicated file + + :raises KeyError: + - The specified worksheet name does not exist """ if mapper is None: mapper = ColumnMapper() @@ -94,7 +104,6 @@ def reset_mapper(self, new_mapper): Parameters: new_mapper (ColumnMapper): A column mapper to be associated with this base input. - """ self._mapper = new_mapper if not self._mapper: @@ -200,8 +209,10 @@ def to_excel(self, file): file (str or file-like): Location to save this base input. :raises ValueError: - - if empty file object or file cannot be opened. - + - if empty file object was passed + + :raises OSError: + - Cannot open the indicated file """ if not file: raise ValueError("Empty file name or object passed in to BaseInput.save.") @@ -232,6 +243,8 @@ def to_csv(self, file=None): Returns: None or str: None if file is given or the contents as a str if file is None. + :raises OSError: + - Cannot open the indicated file """ dataframe = self._dataframe csv_string_if_filename_none = dataframe.to_csv(file, '\t', index=False, header=self._has_column_names) @@ -272,6 +285,15 @@ def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_ta Notes: Any attribute of a HedTag that returns a string is a valid value of tag_form. 
+ + :raises ValueError: + - There is not a loaded dataframe + + :raises KeyError: + - the indicated row/column does not exist + + :raises AttributeError: + - The indicated tag_form is not an attribute of HedTag """ if self._dataframe is None: raise ValueError("No data frame loaded") @@ -291,6 +313,8 @@ def get_worksheet(self, worksheet_name=None): Notes: If None, returns the first worksheet. + :raises KeyError: + - The specified worksheet name does not exist """ if worksheet_name and self._loaded_workbook: # return self._loaded_workbook.get_sheet_by_name(worksheet_name) diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index 6b5d651cf..fedac6d8f 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -26,7 +26,6 @@ def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None Sidecar column definitions will take precedent if there is a conflict with tag_columns. column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag prefixes to prepend to the tags in that column before processing. - optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. 
warn_on_missing_column (bool): If True, issue mapping warnings on column names that are missing from @@ -89,6 +88,10 @@ def column_prefix_dictionary(self): def get_transformers(self): """ Return the transformers to use on a dataframe + Returns: + tuple(dict, list): + dict({str or int: func}): the functions to use to transform each column + need_categorical(list of int): a list of columns to treat as categorical """ final_transformers = {} need_categorical = [] @@ -144,8 +147,8 @@ def _set_sidecar(self, sidecar): Parameters: sidecar (Sidecar or None): the sidecar to use - Returns: - + :raises ValueError: + - A sidecar was previously set """ if self._sidecar: raise ValueError("Trying to set a second sidecar on a column mapper.") @@ -156,6 +159,11 @@ @property def sidecar_column_data(self): + """ Pass through to get the sidecar ColumnMetadata + + Returns: + dict({str:ColumnMetadata}): the column metadata defined by this sidecar + """ if self._sidecar: return self._sidecar.column_data @@ -168,7 +176,7 @@ def get_tag_columns(self): Returns: column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. - 0-based if integer-based, otherwise column name. + 0-based if integer-based, otherwise column name. """ return [column_entry.column_name for number, column_entry in self._final_column_map.items() if column_entry.column_type == ColumnType.HEDTags] diff --git a/hed/models/column_metadata.py b/hed/models/column_metadata.py index 4fa43a6a5..bca22c4cd 100644 --- a/hed/models/column_metadata.py +++ b/hed/models/column_metadata.py @@ -61,6 +61,11 @@ def source_dict(self): return self._source[self.column_name] def get_hed_strings(self): + """ Returns the hed strings for this entry as a series. 
+ + Returns: + hed_strings(pd.Series): the hed strings for this series.(potentially empty) + """ if not self.column_type: return pd.Series(dtype=str) @@ -69,6 +74,15 @@ def get_hed_strings(self): return series def set_hed_strings(self, new_strings): + """ Sets the hed strings for this entry. + + Parameters: + new_strings(pd.Series, dict, or str): The hed strings to set. + This should generally be the return value from get_hed_strings + + Returns: + hed_strings(pd.Series): the hed strings for this series.(potentially empty) + """ if new_strings is None: return False diff --git a/hed/models/def_expand_gather.py b/hed/models/def_expand_gather.py index f34461e46..380079a42 100644 --- a/hed/models/def_expand_gather.py +++ b/hed/models/def_expand_gather.py @@ -93,7 +93,6 @@ def __init__(self, hed_schema, known_defs=None, ambiguous_defs=None, errors=None """ self.hed_schema = hed_schema self.ambiguous_defs = ambiguous_defs if ambiguous_defs else {} - self.ambiguous_defs_new = ambiguous_defs if ambiguous_defs else {} self.errors = errors if errors else {} self.def_dict = DefinitionDict(known_defs, self.hed_schema) diff --git a/hed/models/definition_dict.py b/hed/models/definition_dict.py index ebe1af6f8..0fa6aa743 100644 --- a/hed/models/definition_dict.py +++ b/hed/models/definition_dict.py @@ -12,7 +12,16 @@ class DefinitionDict: """ def __init__(self, def_dicts=None, hed_schema=None): - """ Definitions to be considered a single source. """ + """ Definitions to be considered a single source. + + Parameters: + def_dicts (str or list or DefinitionDict): DefDict or list of DefDicts/strings or + a single string whose definitions should be added. + hed_schema(HedSchema or None): Required if passing strings or lists of strings, unused otherwise. 
+ + :raises TypeError: + - Bad type passed as def_dicts + """ self.defs = {} self._label_tag_name = DefTagNames.DEF_KEY @@ -26,6 +35,9 @@ def add_definitions(self, def_dicts, hed_schema=None): Parameters: def_dicts (list or DefinitionDict): DefDict or list of DefDicts/strings whose definitions should be added. hed_schema(HedSchema or None): Required if passing strings or lists of strings, unused otherwise. + + :raises TypeError: + - Bad type passed as def_dicts """ if not isinstance(def_dicts, list): def_dicts = [def_dicts] @@ -38,7 +50,7 @@ for definition in def_dict: self.check_for_definitions(HedString(definition, hed_schema)) else: - print(f"Invalid input type '{type(def_dict)} passed to DefDict. Skipping.") + raise TypeError(f"Invalid type '{type(def_dict)}' passed to DefinitionDict") def _add_definition(self, def_tag, def_value): if def_tag in self.defs: @@ -59,6 +71,16 @@ def _add_definitions_from_dict(self, def_dict): self._add_definition(def_tag, def_value) def get(self, def_name): + """ Get the definition entry for the definition name. + + Not case-sensitive + + Parameters: + def_name (str): Name of the definition to retrieve. + + Returns: + DefinitionEntry: Definition entry for the requested definition. + """ return self.defs.get(def_name.lower()) def __iter__(self): @@ -68,6 +90,13 @@ def __len__(self): return len(self.defs) def items(self): + """ Returns the dictionary of definitions + + Alias for .defs.items() + + Returns: + def_entries({str: DefinitionEntry}): A list of definitions + """ return self.defs.items() @property @@ -75,19 +104,6 @@ def issues(self): """Returns issues about duplicate definitions.""" return self._issues - def get_def_entry(self, def_name): - """ Get the definition entry for the definition name. - - Parameters: - def_name (str): Name of the definition to retrieve. - - Returns: - DefinitionEntry: Definition entry for the requested definition. 
- - """ - - return self.defs.get(def_name.lower()) - def check_for_definitions(self, hed_string_obj, error_handler=None): """ Check string for definition tags, adding them to self. @@ -97,7 +113,6 @@ def check_for_definitions(self, hed_string_obj, error_handler=None): Returns: list: List of issues encountered in checking for definitions. Each issue is a dictionary. - """ def_issues = [] for definition_tag, group in hed_string_obj.find_top_level_tags(anchor_tags={DefTagNames.DEFINITION_KEY}): @@ -208,8 +223,8 @@ def _find_group(self, definition_tag, group, error_handler): def _validate_contents(self, definition_tag, group, error_handler): issues = [] if group: - for def_tag in group.find_tags({DefTagNames.DEF_KEY, DefTagNames.DEF_EXPAND_KEY, DefTagNames.DEFINITION_KEY}, recursive=True, - include_groups=0): + def_keys = {DefTagNames.DEF_KEY, DefTagNames.DEF_EXPAND_KEY, DefTagNames.DEFINITION_KEY} + for def_tag in group.find_tags(def_keys, recursive=True, include_groups=0): issues += ErrorHandler.format_error_with_context(error_handler, DefinitionErrors.DEF_TAG_IN_DEFINITION, tag=def_tag, @@ -250,27 +265,6 @@ def construct_def_tag(self, hed_tag): hed_tag._expandable = def_contents hed_tag._expanded = hed_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY - def expand_def_tags(self, hed_string_obj): - """ Expands def tags to def-expand tags. - - Parameters: - hed_string_obj (HedString): The hed string to process. - """ - # First see if the "def" is found at all. This covers def and def-expand. 
- hed_string_lower = hed_string_obj.lower() - if self._label_tag_name not in hed_string_lower: - return [] - - def_issues = [] - # We need to check for labels to expand in ALL groups - for def_tag, def_group in hed_string_obj.find_tags(DefTagNames.DEF_KEY, recursive=True): - def_contents = self._get_definition_contents(def_tag) - if def_contents is not None: - def_tag.short_base_tag = DefTagNames.DEF_EXPAND_ORG_KEY - def_group.replace(def_tag, def_contents) - - return def_issues - def _get_definition_contents(self, def_tag): """ Get the contents for a given def tag. diff --git a/hed/models/definition_entry.py b/hed/models/definition_entry.py index 7c0aa3662..190d8d3d3 100644 --- a/hed/models/definition_entry.py +++ b/hed/models/definition_entry.py @@ -26,6 +26,8 @@ def __init__(self, name, contents, takes_value, source_context): def get_definition(self, replace_tag, placeholder_value=None, return_copy_of_tag=False): """ Return a copy of the definition with the tag expanded and the placeholder plugged in. + Returns None if placeholder_value passed when it doesn't take value, or vice versa. + Parameters: replace_tag (HedTag): The def hed tag to replace with an expanded version placeholder_value (str or None): If present and required, will replace any pound signs @@ -33,12 +35,12 @@ def get_definition(self, replace_tag, placeholder_value=None, return_copy_of_tag return_copy_of_tag(bool): Set to true for validation Returns: - str: The expanded def tag name - HedGroup: The contents of this definition(including the def tag itself) + tuple: + str: The expanded def tag name + HedGroup: The contents of this definition(including the def tag itself) :raises ValueError: - - If a placeholder_value is passed, but this definition doesn't have a placeholder. - + - Something internally went wrong with finding the placeholder tag. This should not be possible. 
""" if self.takes_value == (placeholder_value is None): return None, [] @@ -49,7 +51,7 @@ def get_definition(self, replace_tag, placeholder_value=None, return_copy_of_tag name = self.name if self.contents: output_group = self.contents - if placeholder_value: + if placeholder_value is not None: output_group = copy.deepcopy(self.contents) placeholder_tag = output_group.find_placeholder_tag() if not placeholder_tag: @@ -64,4 +66,4 @@ def get_definition(self, replace_tag, placeholder_value=None, return_copy_of_tag return f"{DefTagNames.DEF_EXPAND_ORG_KEY}/{name}", output_contents def __str__(self): - return str(self.contents) \ No newline at end of file + return str(self.contents) diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 2509a059a..83184a4e9 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -27,8 +27,9 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ expand_defs: bool Expand any def tags found Returns: - tuple: A list of HedStrings or a list of lists of HedStrings, DefinitionDict - + tuple: + hed_strings(list of HedStrings):A list of HedStrings or a list of lists of HedStrings + def_dict(DefinitionDict): The definitions from this Sidecar """ if isinstance(sidecar, str): sidecar = Sidecar(sidecar) @@ -59,14 +60,14 @@ def convert_to_form(df, hed_schema, tag_form, columns=None): """ Convert all tags in underlying dataframe to the specified form (in place). Parameters: - df (pd.Dataframe): The dataframe to modify + df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema): The schema to use to convert tags. tag_form(str): HedTag property to convert tags to. columns (list): The columns to modify on the dataframe. 
""" if isinstance(df, pd.Series): - df = df.apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + df[:] = df.apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) else: if columns is None: columns = df.columns @@ -74,8 +75,6 @@ def convert_to_form(df, hed_schema, tag_form, columns=None): for column in columns: df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) - return df - def shrink_defs(df, hed_schema, columns=None): """ Shrink (in place) any def-expand tags found in the specified columns in the dataframe. @@ -97,8 +96,6 @@ def shrink_defs(df, hed_schema, columns=None): mask = df[column].str.contains('Def-expand/', case=False) df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) - return df - def expand_defs(df, hed_schema, def_dict, columns=None): """ Expands any def tags found in the dataframe. @@ -120,9 +117,7 @@ def expand_defs(df, hed_schema, def_dict, columns=None): for column in columns: mask = df[column].str.contains('Def/', case=False) - df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) - - return df + df.loc[mask, column] = df.loc[mask, column].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) def _convert_to_form(hed_string, hed_schema, tag_form): @@ -137,23 +132,11 @@ def _expand_defs(hed_string, hed_schema, def_dict): return str(HedString(hed_string, hed_schema, def_dict).expand_defs()) -def _get_matching_value(tags): - # Filter out values equal to "#" and get unique values - unique_values = set(tag.extension for tag in tags if tag.extension != "#") - if len(unique_values) == 0: - return "#" - - if len(unique_values) > 1: - return None - - return next(iter(unique_values)) - - def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs=None): - """ - Processes a list of HED strings according to a given HED schema, using known 
definitions and ambiguous definitions. + """ Processes a list of HED strings according to a given HED schema, + using known definitions and ambiguous definitions. - Args: + Parameters: hed_strings (list or pd.Series): A list of HED strings to process. hed_schema (HedSchema): The schema to use known_defs (DefinitionDict or list or str), optional): diff --git a/hed/models/hed_group.py b/hed/models/hed_group.py index 9a4af46b1..eeacd16db 100644 --- a/hed/models/hed_group.py +++ b/hed/models/hed_group.py @@ -13,10 +13,7 @@ def __init__(self, hed_string="", startpos=None, endpos=None, contents=None): startpos (int or None): Starting index of group(including parentheses) in hed_string. endpos (int or None): Position after the end (including parentheses) in hed_string. contents (list or None): A list of HedTags and/or HedGroups that will be set as the contents of this group. - - Notes: - - contents parameter is mainly used for processing definitions. - + Mostly used during definition expansion. """ self._startpos = startpos self._endpos = endpos @@ -36,10 +33,6 @@ def append(self, tag_or_group): Parameters: tag_or_group (HedTag or HedGroup): The new object to add to this group. - - :raises ValueError: - If a HedGroupFrozen. - """ tag_or_group._parent = self self._children.append(tag_or_group) @@ -72,6 +65,8 @@ def replace(self, item_to_replace, new_contents): item_to_replace (HedTag or HedGroup): The item to replace must exist or this will raise an error. new_contents (HedTag or HedGroup): Replacement contents. + :raises KeyError: + - item_to_replace does not exist """ if self._original_children is self._children: self._original_children = self._children.copy() @@ -88,24 +83,15 @@ def remove(self, items_to_remove): """ Remove any tags/groups in items_to_remove. Parameters: - items_to_remove (list): List of HedGroups and/or HedTags to remove. + items_to_remove (list): List of HedGroups and/or HedTags to remove by identity. 
Notes: - Any groups that become empty will also be pruned. - - Identity, not equivalence is used in determining whether to remove. - """ all_groups = self.get_all_groups() self._remove(items_to_remove, all_groups) def _remove(self, items_to_remove, all_groups): - """ Needs to be documented. - - Parameters: - items_to_remove (list): List of HedGroups and/or HedTags to remove. - all_groups (list): List of HedGroups. - - """ empty_groups = [] for remove_child in items_to_remove: for group in all_groups: @@ -333,10 +319,6 @@ def get_as_form(self, tag_attribute): Returns: str: The constructed string after transformation - - Notes: - - The signature of a tag_transformer is str def(HedTag, str). - """ result = ",".join([child.__getattribute__(tag_attribute) if isinstance(child, HedTag) else child.get_as_form(tag_attribute) for child in self.children]) @@ -356,7 +338,6 @@ def find_placeholder_tag(self): Notes: - Assumes a valid HedString with no erroneous "#" characters. - """ for tag in self.get_all_tags(): if tag.is_placeholder(): @@ -368,7 +349,10 @@ def __bool__(self): return bool(self._children) def __eq__(self, other): - """ Test whether other is equal to this object. """ + """ Test whether other is equal to this object. + + Note: This does not account for sorting. Objects must be in the same order to match. + """ if self is other: return True @@ -423,17 +407,16 @@ def find_wildcard_tags(self, search_tags, recursive=False, include_groups=2): search_tags (container): A container of the starts of short tags to search. recursive (bool): If true, also check subgroups. include_groups (0, 1 or 2): Specify return values. + If 0: return a list of the HedTags. + If 1: return a list of the HedGroups containing the HedTags. + If 2: return a list of tuples (HedTag, HedGroup) for the found tags. Returns: list: The contents of the list depends on the value of include_groups. Notes: - - If include_groups is 0, return a list of the HedTags. 
- - If include_groups is 1, return a list of the HedGroups containing the HedTags. - - If include_groups is 2, return a list of tuples (HedTag, HedGroup) for the found tags. - This can only find identified tags. - By default, definition, def, def-expand, onset, and offset are identified, even without a schema. - """ found_tags = [] if recursive: @@ -469,11 +452,6 @@ def find_exact_tags(self, tags_or_groups, recursive=False, include_groups=1): - This can only find identified tags. - By default, definition, def, def-expand, onset, and offset are identified, even without a schema. - If this is a HedGroup, order matters. (b, a) != (a, b) - - If this is a HedGroupFrozen: - if "(a, b)" in tags_or_groups, then it will match 1 and 2, but not 3. - 1. (a, b) - 2. (b, a) - 3. (a, b, c) """ found_tags = [] @@ -495,17 +473,16 @@ def find_exact_tags(self, tags_or_groups, recursive=False, include_groups=1): def find_def_tags(self, recursive=False, include_groups=3): """ Find def and def-expand tags + Parameters: recursive (bool): If true, also check subgroups. include_groups (int, 0, 1, 2, 3): options for return values + If 0: Return only def and def expand tags/. + If 1: Return only def tags and def-expand groups. + If 2: Return only groups containing defs, or def-expand groups. + If 3 or any other value: Return all 3 as a tuple. Returns: list: A list of tuples. The contents depend on the values of the include_group. - Notes: - - The include_groups option controls the tag expansion as follows: - - If 0: Return only def and def expand tags/. - - If 1: Return only def tags and def-expand groups. - - If 2: Return only groups containing defs, or def-expand groups. - - If 3 or any other value: Return all 3 as a tuple. """ from hed.models.definition_dict import DefTagNames if recursive: @@ -536,16 +513,14 @@ def find_tags_with_term(self, term, recursive=False, include_groups=2): Parameters: term (str): A single term to search for. 
recursive (bool): If true, recursively check subgroups. - include_groups: 0, 1 or 2 + include_groups(0, 1 or 2): Controls return values If 0: Return only tags If 1: Return only groups If 2 or any other value: Return both Returns: list: - """ - found_tags = [] if recursive: groups = self.get_all_groups() diff --git a/hed/models/hed_string.py b/hed/models/hed_string.py index 30b67ee3c..173b67860 100644 --- a/hed/models/hed_string.py +++ b/hed/models/hed_string.py @@ -180,12 +180,11 @@ def split_into_groups(hed_string, hed_schema=None, def_dict=None): list: A list of HedTag and/or HedGroup. :raises ValueError: - - If the string is significantly malformed, such as mismatched parentheses. + - The string is significantly malformed, such as mismatched parentheses. Notes: - The parse tree consists of tag groups, tags, and delimiters. """ - current_tag_group = [[]] input_tags = HedString.split_hed_string(hed_string) @@ -332,19 +331,16 @@ def validate(self, hed_schema, allow_placeholders=True, error_handler=None): def find_top_level_tags(self, anchor_tags, include_groups=2): """ Find top level groups with an anchor tag. + A max of 1 tag located per top level group. + Parameters: anchor_tags (container): A list/set/etc of short_base_tags to find groups by. include_groups (0, 1 or 2): Parameter indicating what return values to include. - + If 0: return only tags. + If 1: return only groups. + If 2 or any other value: return both. Returns: list or tuple: The returned result depends on include_groups: - - If 0: return only tags. - - If 1: return only groups. - - If 2 or any other value: return both. - - Notes: - - A max of 1 tag located per top level group. 
- """ top_level_tags = [] for group in self.groups(): diff --git a/hed/models/hed_string_group.py b/hed/models/hed_string_group.py index 14e639f51..3171823ce 100644 --- a/hed/models/hed_string_group.py +++ b/hed/models/hed_string_group.py @@ -60,15 +60,13 @@ def children(self): return [child for sub_string in self._children for child in sub_string._children] def remove(self, items_to_remove): - """ Remove any tags/groups in items_to_remove. + """ Remove tags/groups by identity. Parameters: items_to_remove (list): A list of HedGroup and HedTag objects to remove. Notes: - Any groups that become empty will also be pruned. - - This goes by identity, not equivalence. - """ all_groups = [group for sub_group in self._children for group in sub_group.get_all_groups()] self._remove(items_to_remove, all_groups) @@ -85,12 +83,9 @@ def replace(self, item_to_replace, new_contents): item_to_replace (HedTag or HedGroup): The tag to replace. new_contents (HedTag or HedGroup or list): The replacements for the tag. - Notes: - - It tag must exist in this an error is raised. - + :raises KeyError: + - item_to_replace does not exist """ - - # this needs to pass the tag off to the appropriate group replace_sub_string = None for sub_string in self._children: for i, child in enumerate(sub_string.children): diff --git a/hed/models/hed_tag.py b/hed/models/hed_tag.py index 2c17d368a..bfc06abd2 100644 --- a/hed/models/hed_tag.py +++ b/hed/models/hed_tag.py @@ -19,9 +19,8 @@ def __init__(self, hed_string, span=None, hed_schema=None, def_dict=None): span (int, int): The start and end indexes of the tag in the hed_string. hed_schema (HedSchema or None): A convenience parameter for calculating canonical forms on creation. - Notes: - - This does not produce issues and is used primarily for testing. - + :raises ValueError: + - You cannot pass a def_dict without also passing a schema. 
""" if def_dict and not hed_schema: raise ValueError("Passing a def_dict without also passing a schema is invalid.") @@ -131,11 +130,10 @@ def short_base_tag(self, new_tag_val): new_tag_val (str): The new short_base_tag for this tag. :raises ValueError: - - If tags cannot unidentified. + - If the tag wasn't already identified Note: - Generally this is used to swap def to def-expand. - """ if self._schema_entry: tag_entry = None @@ -159,7 +157,6 @@ def org_base_tag(self): - Warning: This could be empty if the original tag had a name_prefix prepended. e.g. a column where "Label/" is prepended, thus the column value has zero base portion. - Only valid after calling convert_to_canonical_forms. - """ if self._schema_entry: extension_len = len(self._extension_value) diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 094edfa69..735ad3f8b 100644 --- a/hed/models/sidecar.py +++ b/hed/models/sidecar.py @@ -67,10 +67,10 @@ def def_dict(self): @property def column_data(self): - """ Generates the list of ColumnMetadata for this sidecar + """ Generates the ColumnMetadata for this sidecar Returns: - list(ColumnMetadata): the list of column metadata defined by this sidecar + dict({str:ColumnMetadata}): the column metadata defined by this sidecar """ return {col_name: ColumnMetadata(name=col_name, source=self.loaded_dict) for col_name in self.loaded_dict} @@ -200,8 +200,8 @@ def extract_definitions(self, hed_schema=None, error_handler=None): """ Gather and validate definitions in metadata. Parameters: - error_handler (ErrorHandler): The error handler to use for context, uses a default one if None. hed_schema (HedSchema or None): The schema to used to identify tags. + error_handler (ErrorHandler or None): The error handler to use for context, uses a default one if None. Returns: DefinitionDict: Contains all the definitions located in the sidecar. 
@@ -238,7 +238,6 @@ def get_column_refs(self): Returns: column_refs(list): A list of unique column refs found """ - found_vals = set() for column_data in self: if column_data.column_type == ColumnType.Ignore: diff --git a/hed/models/spreadsheet_input.py b/hed/models/spreadsheet_input.py index c3a059bdc..d2bcbc1bd 100644 --- a/hed/models/spreadsheet_input.py +++ b/hed/models/spreadsheet_input.py @@ -30,6 +30,17 @@ def __init__(self, file=None, file_type=None, worksheet_name=None, tag_columns=N It will be a validation issue if column 1 is called "key" in the above example. This means it no longer accepts anything but the value portion only in the columns. + :raises HedFileError: + - file is blank + - An invalid dataframe was passed with size 0 + - An invalid extension was provided + - A duplicate or empty column name appears + + :raises OSError: + - Cannot open the indicated file + + :raises KeyError: + - The specified worksheet name does not exist """ if tag_columns is None: tag_columns = [1] diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index 8a6d5c5f8..b88ed5581 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -16,6 +16,18 @@ def __init__(self, file=None, sidecar=None, name=None): file (str or file like): A tsv file to open. sidecar (str or Sidecar): A Sidecar filename or Sidecar name (str): The name to display for this file for error purposes. 
+ + :raises HedFileError: + - file is blank + - An invalid dataframe was passed with size 0 + - An invalid extension was provided + - A duplicate or empty column name appears + + :raises OSError: + - Cannot open the indicated file + + :raises ValueError: + - This file has no column names """ if sidecar and not isinstance(sidecar, Sidecar): sidecar = Sidecar(sidecar) diff --git a/hed/schema/__init__.py b/hed/schema/__init__.py index 01c9ef1d5..5db24d5ea 100644 --- a/hed/schema/__init__.py +++ b/hed/schema/__init__.py @@ -3,8 +3,7 @@ from .hed_schema_entry import HedSchemaEntry, UnitClassEntry, UnitEntry, HedTagEntry from .hed_schema_group import HedSchemaGroup from .hed_schema_section import HedSchemaSection -from .hed_schema_io import load_schema, load_schema_version, from_string, get_hed_xml_version, get_schema, \ - get_schema_versions +from .hed_schema_io import load_schema, load_schema_version, from_string, get_hed_xml_version, get_schema from .hed_schema_constants import HedKey, HedSectionKey from .hed_cache import cache_xml_versions, get_hed_versions, \ get_path_from_hed_version, set_cache_directory, get_cache_directory diff --git a/hed/schema/hed_cache.py b/hed/schema/hed_cache.py index 51c0f9ef4..793cd6d85 100644 --- a/hed/schema/hed_cache.py +++ b/hed/schema/hed_cache.py @@ -270,7 +270,7 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP _cache_hed_version(version, library_name, version_info, cache_folder=cache_folder) _write_last_cached_time(current_timestamp, cache_folder) - except portalocker.exceptions.LockException: + except (portalocker.exceptions.LockException, ValueError): return -1 return 0 @@ -303,6 +303,8 @@ def _write_last_cached_time(new_time, cache_folder): new_time (float): The time this was updated. cache_folder (str): The folder used for caching the hed schema. 
+ :raises ValueError: + - something went wrong writing to the file """ timestamp_filename = os.path.join(cache_folder, TIMESTAMP_FILENAME) try: diff --git a/hed/schema/hed_schema_group.py b/hed/schema/hed_schema_group.py index ab2112b7a..00bc2f78b 100644 --- a/hed/schema/hed_schema_group.py +++ b/hed/schema/hed_schema_group.py @@ -27,8 +27,8 @@ def __init__(self, schema_list): HedSchemaGroup: the container created. :raises HedFileError: - - If multiple schemas have the same library prefixes. - + - Multiple schemas have the same library prefixes. + - Empty list passed """ if len(schema_list) == 0: raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty list passed to HedSchemaGroup constructor.", diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 07e59dcf5..fdfdf9775 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -24,6 +24,7 @@ def from_string(schema_string, file_type=".xml", schema_namespace=None): :raises HedFileError: - If empty string or invalid extension is passed. + - Other fatal formatting issues with file Notes: - The loading is determined by file type. @@ -57,17 +58,6 @@ def get_schema(hed_versions): raise ValueError("InvalidHedSchemaOrSchemaVersion", "Expected schema or schema version") -def get_schema_versions(hed_schema, as_string=True): - if not hed_schema and as_string: - return '' - elif not hed_schema: - return None - elif isinstance(hed_schema, HedSchema) or isinstance(hed_schema, HedSchemaGroup): - return hed_schema.get_formatted_version(as_string=as_string) - else: - raise ValueError("InvalidHedSchemaOrHedSchemaGroup", "Expected schema or schema group") - - def load_schema(hed_path=None, schema_namespace=None): """ Load a schema from the given file or URL path. @@ -79,7 +69,9 @@ def load_schema(hed_path=None, schema_namespace=None): HedSchema: The loaded schema. :raises HedFileError: - - If there are any fatal issues when loading the schema. 
+ - Empty path passed + - Unknown extension + - Any fatal issues when loading the schema. """ if not hed_path: @@ -114,6 +106,8 @@ def get_hed_xml_version(xml_file_path): Returns: str: The version number of the HED XML file. + :raises HedFileError: + - There is an issue loading the schema """ root_node = HedSchemaXMLParser._parse_hed_xml(xml_file_path) return root_node.attrib[hed_schema_constants.VERSION_ATTRIBUTE] @@ -130,10 +124,9 @@ def _load_schema_version(xml_version=None, xml_folder=None): HedSchema or HedSchemaGroup: The requested HedSchema object. :raises HedFileError: - - If the xml_version is not valid. - - Notes: - - The library schema files have names of the form HED_(LIBRARY_NAME)_(version).xml. + - The xml_version is not valid. + - The specified version cannot be found or loaded + - Other fatal errors loading the schema (These are unlikely if you are not editing them locally) """ schema_namespace = "" library_name = None @@ -179,10 +172,8 @@ def load_schema_version(xml_version=None, xml_folder=None): HedSchema or HedSchemaGroup: The schema or schema group extracted. :raises HedFileError: - - If the xml_version is not valid. - - Notes: - - Loads the latest schema value if an empty version is given (string or list). + - The xml_version is not valid. + - A fatal error was encountered in parsing """ if xml_version and isinstance(xml_version, list): schemas = [_load_schema_version(xml_version=version, xml_folder=xml_folder) for version in xml_version] diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index ddb222663..9f372cdb5 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -21,9 +21,8 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl Returns: list: A list of all warnings and errors found in the file. Each issue is a dictionary. - Notes: - - Useful for temp filenames in support of web services. 
- + :raises ValueError: + - Trying to validate a HedSchemaGroup directly """ if not isinstance(hed_schema, HedSchema): raise ValueError("To check compliance of a HedGroupSchema, call self.check_compliance on the schema itself.") @@ -40,7 +39,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl if unknown_attributes: for attribute_name, source_tags in unknown_attributes.items(): for tag in source_tags: - issues_list += error_handler.format_error_with_context(SchemaErrors.HED_SCHEMA_ATTRIBUTE_INVALID, + issues_list += error_handler.format_error_with_context(SchemaErrors.SCHEMA_ATTRIBUTE_INVALID, attribute_name, source_tag=tag) @@ -62,7 +61,10 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl validator = schema_attribute_validators.get(attribute_name) if validator: error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) - new_issues = validator(hed_schema, tag_entry, tag_entry.attributes[attribute_name]) + new_issues = validator(hed_schema, tag_entry, attribute_name) + # Attribute validator issues are always downgraded to warnings. + for issue in new_issues: + issue['severity'] = ErrorSeverity.WARNING error_handler.add_context_and_filter(new_issues) issues_list += new_issues error_handler.pop_error_context() @@ -75,8 +77,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl if len(values) == 2: error_code = SchemaErrors.HED_SCHEMA_DUPLICATE_FROM_LIBRARY issues_list += error_handler.format_error_with_context(error_code, name, - duplicate_tag_list=[entry.name for entry in - duplicate_entries], + duplicate_tag_list=[entry.name for entry in duplicate_entries], section=section_key) error_handler.pop_error_context() @@ -92,15 +93,19 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl error_handler.pop_error_context() return issues_list +# attribute_checker_template(hed_schema, tag_entry, attribute_name, possible_values): +# hed_schema (HedSchema): The 
schema to use for validation +# tag_entry (HedSchemaEntry): The schema entry for this tag. +# attribute_name (str): The name of this attribute + -def tag_is_placeholder_check(hed_schema, tag_entry, possible_tags, force_issues_as_warnings=True): +def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name): """ Check if comma separated list has valid HedTags. Parameters: - hed_schema (HedSchema): The schema to check if the tag exists. + hed_schema (HedSchema): The schema to use for validation tag_entry (HedSchemaEntry): The schema entry for this tag. - possible_tags (str): Comma separated list of tags. Short long or mixed form valid. - force_issues_as_warnings (bool): If True sets all the severity levels to warning. + attribute_name (str): The name of this attribute Returns: list: A list of issues. Each issue is a dictionary. @@ -109,91 +114,55 @@ def tag_is_placeholder_check(hed_schema, tag_entry, possible_tags, force_issues_ issues = [] if not tag_entry.name.endswith("/#"): issues += ErrorHandler.format_error(SchemaWarnings.NON_PLACEHOLDER_HAS_CLASS, tag_entry.name, - possible_tags) - - if force_issues_as_warnings: - for issue in issues: - issue['severity'] = ErrorSeverity.WARNING - - return issues - - -def attribute_does_not_exist_check(hed_schema, tag_entry, attribute_name, force_issues_as_warnings=True): - """ Throws an error saying this is a bad attribute if found. - - Parameters: - hed_schema (HedSchema): The schema to check if the tag exists. - tag_entry (HedSchemaEntry): The schema entry for this tag. - attribute_name (str): the attribute name we're looking for - force_issues_as_warnings (bool): If True sets all the severity levels to warning. - - Returns: - list: A list of issues. Each issue is a dictionary. 
- - """ - issues = [] - issues += ErrorHandler.format_error(SchemaWarnings.INVALID_ATTRIBUTE, tag_entry.name, - attribute_name) - - if force_issues_as_warnings: - for issue in issues: - issue['severity'] = ErrorSeverity.WARNING + attribute_name) return issues -def tag_exists_check(hed_schema, tag_entry, possible_tags, force_issues_as_warnings=True): - """ Check if comma separated list are valid HedTags. +def tag_exists_check(hed_schema, tag_entry, attribute_name): + """ Check if the list of possible tags exists in the schema. Parameters: - hed_schema (HedSchema): The schema to check if the tag exists. + hed_schema (HedSchema): The schema to use for validation tag_entry (HedSchemaEntry): The schema entry for this tag. - possible_tags (str): Comma separated list of tags. Short long or mixed form valid. - force_issues_as_warnings (bool): If True, set all the severity levels to warning. + attribute_name (str): The name of this attribute Returns: list: A list of issues. Each issue is a dictionary. """ issues = [] + possible_tags = tag_entry.attributes.get(attribute_name, "") split_tags = possible_tags.split(",") for org_tag in split_tags: - if org_tag not in hed_schema.all_tags: + if org_tag and org_tag not in hed_schema.all_tags: issues += ErrorHandler.format_error(ValidationErrors.NO_VALID_TAG_FOUND, org_tag, index_in_tag=0, index_in_tag_end=len(org_tag)) - if force_issues_as_warnings: - for issue in issues: - issue['severity'] = ErrorSeverity.WARNING return issues -def tag_exists_base_schema_check(hed_schema, tag_entry, tag_name, force_issues_as_warnings=True): +def tag_exists_base_schema_check(hed_schema, tag_entry, attribute_name): """ Check if the single tag is a partnered schema tag Parameters: - hed_schema (HedSchema): The schema to check if the tag exists. + hed_schema (HedSchema): The schema to use for validation tag_entry (HedSchemaEntry): The schema entry for this tag. - tag_name (str): The tag to verify, can be any form. 
- force_issues_as_warnings (bool): If True, set all the severity levels to warning. + attribute_name (str): The name of this attribute Returns: list: A list of issues. Each issue is a dictionary. - """ issues = [] - rooted_tag = tag_name.lower() - if rooted_tag not in hed_schema.all_tags: + rooted_tag = tag_entry.attributes.get(attribute_name, "") + if rooted_tag and rooted_tag not in hed_schema.all_tags: issues += ErrorHandler.format_error(ValidationErrors.NO_VALID_TAG_FOUND, rooted_tag, index_in_tag=0, index_in_tag_end=len(rooted_tag)) - if force_issues_as_warnings: - for issue in issues: - issue['severity'] = ErrorSeverity.WARNING return issues diff --git a/hed/schema/schema_validation_util.py b/hed/schema/schema_validation_util.py index 97376d380..17052a4d1 100644 --- a/hed/schema/schema_validation_util.py +++ b/hed/schema/schema_validation_util.py @@ -64,6 +64,18 @@ def is_hed3_version_number(version_string): def validate_present_attributes(attrib_dict, filename): + """ Validate combinations of attributes + + Parameters: + attrib_dict (dict): Dictionary of attributes to be evaluated. + filename (str): File name to use in reporting errors. + + Returns: + list: List of issues. Each issue is a dictionary. + + :raises HedFileError: + - withStandard is found in the header, but a library attribute is not specified + """ if constants.WITH_STANDARD_ATTRIBUTE in attrib_dict and constants.LIBRARY_ATTRIBUTE not in attrib_dict: raise HedFileError(HedExceptions.BAD_WITH_STANDARD, "withStandard header attribute found, but no library attribute is present", @@ -81,8 +93,9 @@ def validate_attributes(attrib_dict, filename): list: List of issues. Each issue is a dictionary. :raises HedFileError: - - If invalid or version not found in the dictionary. 
- + - Invalid library name + - Version not present + - Invalid combinations of attributes in header """ validate_present_attributes(attrib_dict, filename) @@ -111,9 +124,12 @@ def find_rooted_entry(tag_entry, schema, loading_merged): rooted_tag(HedTagEntry or None): The base tag entry from the standard schema Returns None if this tag isn't rooted - :raises HedValueError: - - If the tag doesn't exist or similar - + :raises HedFileError: + - A rooted attribute is found in a non-paired schema + - A rooted attribute is not a string + - A rooted attribute was found on a non-root node in an unmerged schema. + - A rooted attribute is found on a root node in a merged schema. + - A rooted attribute indicates a tag that doesn't exist in the base schema. """ rooted_tag = tag_entry.has_attribute(constants.HedKey.Rooted, return_value=True) if rooted_tag is not None: diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py index 48f862937..24dcddd08 100644 --- a/hed/tools/remodeling/dispatcher.py +++ b/hed/tools/remodeling/dispatcher.py @@ -203,7 +203,7 @@ def parse_operations(operation_list): @staticmethod def prep_data(df): - """ Replace all n/a entries in the data frame by np.NaN for processing. + """ Make a copy and replace all n/a entries in the data frame by np.NaN for processing. Parameters: df (DataFrame) - The DataFrame to be processed. 
diff --git a/hed/tools/remodeling/operations/base_op.py b/hed/tools/remodeling/operations/base_op.py index a524dca26..15423d64d 100644 --- a/hed/tools/remodeling/operations/base_op.py +++ b/hed/tools/remodeling/operations/base_op.py @@ -77,7 +77,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): """ - return df + return df.copy() @staticmethod def _check_list_type(param_value, param_type): diff --git a/hed/tools/remodeling/operations/factor_column_op.py b/hed/tools/remodeling/operations/factor_column_op.py index 953c327ed..e01a81d8b 100644 --- a/hed/tools/remodeling/operations/factor_column_op.py +++ b/hed/tools/remodeling/operations/factor_column_op.py @@ -60,7 +60,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: DataFrame: A new DataFrame with the factor columns appended. diff --git a/hed/tools/remodeling/operations/merge_consecutive_op.py b/hed/tools/remodeling/operations/merge_consecutive_op.py index 01a526a7a..9ce7a16d7 100644 --- a/hed/tools/remodeling/operations/merge_consecutive_op.py +++ b/hed/tools/remodeling/operations/merge_consecutive_op.py @@ -62,7 +62,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. 
diff --git a/hed/tools/remodeling/operations/remap_columns_op.py b/hed/tools/remodeling/operations/remap_columns_op.py index 480df8220..c83315795 100644 --- a/hed/tools/remodeling/operations/remap_columns_op.py +++ b/hed/tools/remodeling/operations/remap_columns_op.py @@ -100,7 +100,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -109,12 +109,13 @@ def do_op(self, dispatcher, df, name, sidecar=None): - If ignore_missing is false and source values from the data are not in the map. """ - df[self.source_columns] = df[self.source_columns].replace(np.NaN, 'n/a') + df1 = df.copy() + df1[self.source_columns] = df1[self.source_columns].replace(np.NaN, 'n/a') for column in self.integer_sources: - int_mask = df[column] != 'n/a' - df.loc[int_mask, column] = df.loc[int_mask, column].astype(int) - df[self.source_columns] = df[self.source_columns].astype(str) - df_new, missing = self.key_map.remap(df) + int_mask = df1[column] != 'n/a' + df1.loc[int_mask, column] = df1.loc[int_mask, column].astype(int) + df1[self.source_columns] = df1[self.source_columns].astype(str) + df_new, missing = self.key_map.remap(df1) if missing and not self.ignore_missing: raise ValueError("MapSourceValueMissing", f"{name}: Ignore missing is false, but source values [{missing}] are in data but not map") diff --git a/hed/tools/remodeling/operations/remove_columns_op.py b/hed/tools/remodeling/operations/remove_columns_op.py index 0a941ca5d..b0833cd1d 100644 --- a/hed/tools/remodeling/operations/remove_columns_op.py +++ b/hed/tools/remodeling/operations/remove_columns_op.py @@ -49,7 +49,7 @@ def do_op(self, dispatcher, df, name, 
sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -58,10 +58,11 @@ def do_op(self, dispatcher, df, name, sidecar=None): - If ignore_missing is False and a column not in the data is to be removed. """ - + df_new = df.copy() try: - return df.drop(self.column_names, axis=1, errors=self.error_handling) + return df_new.drop(self.column_names, axis=1, errors=self.error_handling) except KeyError: raise KeyError("MissingColumnCannotBeRemoved", f"{name}: Ignore missing is False but a column in {str(self.column_names)} is " - f"not in the data columns [{str(df.columns)}]") + f"not in the data columns [{str(df_new.columns)}]") + # Both try/except branches above exit, so no fall-through return is needed. diff --git a/hed/tools/remodeling/operations/remove_rows_op.py b/hed/tools/remodeling/operations/remove_rows_op.py index 2e684d2dd..217fb7934 100644 --- a/hed/tools/remodeling/operations/remove_rows_op.py +++ b/hed/tools/remodeling/operations/remove_rows_op.py @@ -46,15 +46,15 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. 
""" - - if self.column_name not in df.columns: - return df + df_new = df.copy() + if self.column_name not in df_new.columns: + return df_new for value in self.remove_values: - df = df.loc[df[self.column_name] != value, :] - return df + df_new = df_new.loc[df_new[self.column_name] != value, :] + return df_new diff --git a/hed/tools/remodeling/operations/rename_columns_op.py b/hed/tools/remodeling/operations/rename_columns_op.py index adc283c20..2a2f275a9 100644 --- a/hed/tools/remodeling/operations/rename_columns_op.py +++ b/hed/tools/remodeling/operations/rename_columns_op.py @@ -49,7 +49,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -58,9 +58,9 @@ def do_op(self, dispatcher, df, name, sidecar=None): - When ignore_missing is false and column_mapping has columns not in the data. """ - + df_new = df.copy() try: - return df.rename(columns=self.column_mapping, errors=self.error_handling) + return df_new.rename(columns=self.column_mapping, errors=self.error_handling) except KeyError: raise KeyError("MappedColumnsMissingFromData", f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]" diff --git a/hed/tools/remodeling/operations/reorder_columns_op.py b/hed/tools/remodeling/operations/reorder_columns_op.py index 6ae71b179..9607bb295 100644 --- a/hed/tools/remodeling/operations/reorder_columns_op.py +++ b/hed/tools/remodeling/operations/reorder_columns_op.py @@ -48,7 +48,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. 
name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -57,17 +57,17 @@ def do_op(self, dispatcher, df, name, sidecar=None): - When ignore_missing is false and column_order has columns not in the data. """ - - current_columns = list(df.columns) - missing_columns = set(self.column_order).difference(set(df.columns)) + df_new = df.copy() + current_columns = list(df_new.columns) + missing_columns = set(self.column_order).difference(set(df_new.columns)) ordered = self.column_order if missing_columns and not self.ignore_missing: raise ValueError("MissingReorderedColumns", f"{str(missing_columns)} are not in dataframe columns " - f" [{str(df.columns)}] and not ignored.") + f" [{str(df_new.columns)}] and not ignored.") elif missing_columns: ordered = [elem for elem in self.column_order if elem not in list(missing_columns)] if self.keep_others: ordered += [elem for elem in current_columns if elem not in ordered] - df = df.loc[:, ordered] - return df + df_new = df_new.loc[:, ordered] + return df_new diff --git a/hed/tools/remodeling/operations/split_rows_op.py b/hed/tools/remodeling/operations/split_rows_op.py index e96e8b490..858ce7e28 100644 --- a/hed/tools/remodeling/operations/split_rows_op.py +++ b/hed/tools/remodeling/operations/split_rows_op.py @@ -51,7 +51,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. 
diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index d185d08b2..ed6082a45 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -49,28 +49,28 @@ def __init__(self, parameters): self.append_timecode = parameters.get('append_timecode', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Create a column name summary for df. Parameters: dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the relevant summary. 
""" - + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = ColumnNameSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({"name": name, "column_names": list(df.columns)}) - return df + summary.update_summary({"name": name, "column_names": list(df_new.columns)}) + return df_new class ColumnNameSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index 539bfe2bd..dc13790c7 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -65,28 +65,29 @@ def __init__(self, parameters): self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Create a summary of the column values in df. Parameters: dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the relevant summary. 
""" - + + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = ColumnValueSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name}) - return df + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name}) + return df_new class ColumnValueSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py index bc988c5d3..3169d63d0 100644 --- a/hed/tools/remodeling/operations/summarize_definitions_op.py +++ b/hed/tools/remodeling/operations/summarize_definitions_op.py @@ -1,4 +1,4 @@ -""" Summarize the values in the columns of a tabular file. """ +""" Summarize the definitions in the dataset. """ from hed import TabularInput from hed.tools.remodeling.operations.base_op import BaseOp @@ -7,7 +7,7 @@ class SummarizeDefinitionsOp(BaseOp): - """ Summarize the values in the columns of a tabular file. + """ Summarize the definitions in the dataset. Required remodeling parameters: - **summary_name** (*str*): The name of the summary. @@ -42,16 +42,14 @@ def __init__(self, parameters): :raises TypeError: - If a parameter has the wrong type. - """ - super().__init__(self.PARAMS, parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.append_timecode = parameters.get('append_timecode', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Create summaries of definitions Parameters: dispatcher (Dispatcher): Manages the operation I/O. @@ -60,17 +58,18 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Only needed for HED operations. Returns: - DataFrame: A new DataFrame with the factor columns appended. 
+ DataFrame: a copy of df Side-effect: Updates the relevant summary. """ + df_new = df.copy() summary = dispatcher.summary_dicts.setdefault(self.summary_name, DefinitionSummary(self, dispatcher.hed_schema)) - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, 'sidecar': sidecar, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'sidecar': sidecar, 'schema': dispatcher.hed_schema}) - return df + return df_new class DefinitionSummary(BaseSummary): @@ -92,40 +91,40 @@ def update_summary(self, new_info): series, def_dict = data_input.series_a, data_input.get_def_dict(new_info['schema']) self.def_gatherer.process_def_expands(series, def_dict) + @staticmethod + def _build_summary_dict(items_dict, title, process_func, display_description=False): + summary_dict = {} + items = {} + for key, value in items_dict.items(): + if process_func: + value = process_func(value) + if "#" in str(value): + key = key + "/#" + if display_description: + description, value = DefinitionSummary._remove_description(value) + items[key] = {"description": description, "contents": str(value)} + elif isinstance(value, list): + items[key] = [str(x) for x in value] + else: + items[key] = str(value) + summary_dict[title] = items + return summary_dict + def get_details_dict(self, def_gatherer): """ Return the summary-specific information in a dictionary. Parameters: - summary (?): Contains the resolved dictionaries. + def_gatherer (DefExpandGatherer): Contains the resolved dictionaries. Returns: dict: dictionary with the summary results. 
""" - def build_summary_dict(items_dict, title, process_func, display_description=False): - summary_dict = {} - items = {} - for key, value in items_dict.items(): - if process_func: - value = process_func(value) - if "#" in str(value): - key = key + "/#" - if display_description: - description, value = DefinitionSummary.remove_description(value) - items[key] = {"description": description, "contents": str(value)} - else: - if isinstance(value, list): - items[key] = [str(x) for x in value] - else: - items[key] = str(value) - summary_dict[title] = items - return summary_dict - - known_defs_summary = build_summary_dict(def_gatherer.def_dict, "Known Definitions", None, - display_description=True) - ambiguous_defs_summary = build_summary_dict(def_gatherer.ambiguous_defs, "Ambiguous Definitions", - def_gatherer.get_ambiguous_group) - errors_summary = build_summary_dict(def_gatherer.errors, "Errors", None) + known_defs_summary = self._build_summary_dict(def_gatherer.def_dict, "Known Definitions", None, + display_description=True) + ambiguous_defs_summary = self._build_summary_dict(def_gatherer.ambiguous_defs, "Ambiguous Definitions", + def_gatherer.get_ambiguous_group) + errors_summary = self._build_summary_dict(def_gatherer.errors, "Errors", None) known_defs_summary.update(ambiguous_defs_summary) known_defs_summary.update(errors_summary) @@ -160,26 +159,27 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): return self._get_dataset_string(result, indent=indent) return self._get_individual_string(result, indent=indent) + @staticmethod + def _nested_dict_to_string(data, indent, level=1): + result = [] + for key, value in data.items(): + if isinstance(value, dict): + result.append(f"{indent * level}{key}: {len(value)} items") + result.append(DefinitionSummary._nested_dict_to_string(value, indent, level + 1)) + elif isinstance(value, list): + result.append(f"{indent * level}{key}:") + for item in value: + result.append(f"{indent * (level + 
1)}{item}") + else: + result.append(f"{indent * level}{key}: {value}") + return "\n".join(result) + @staticmethod def _get_dataset_string(summary_dict, indent=BaseSummary.DISPLAY_INDENT): - def nested_dict_to_string(data, level=1): - result = [] - for key, value in data.items(): - if isinstance(value, dict): - result.append(f"{indent * level}{key}: {len(value)} items") - result.append(nested_dict_to_string(value, level + 1)) - elif isinstance(value, list): - result.append(f"{indent * level}{key}:") - for item in value: - result.append(f"{indent * (level + 1)}{item}") - else: - result.append(f"{indent * level}{key}: {value}") - return "\n".join(result) - - return nested_dict_to_string(summary_dict) + return DefinitionSummary._nested_dict_to_string(summary_dict, indent) @staticmethod - def remove_description(def_entry): + def _remove_description(def_entry): def_group = def_entry.contents.copy() description = "" desc_tag = def_group.find_tags({"description"}, include_groups=False) diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 2c24de8ef..d74d87de6 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -63,7 +63,7 @@ def __init__(self, parameters): self.expand_context = parameters.get('expand_context', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Summarize the HED tags present in the dataset. Parameters: dispatcher (Dispatcher): Manages the operation I/O. @@ -72,19 +72,20 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Only needed for HED operations. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the context. 
""" + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = HedTagSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) - return df + return df_new class HedTagSummary(BaseSummary): @@ -100,7 +101,7 @@ def update_summary(self, new_info): Parameters: new_info (dict): A dictionary with the parameters needed to update a summary. - Notes: + Notes: - The summary needs a "name" str, a "schema", a "df, and a "Sidecar". """ diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 6a37b0578..9a27d22d2 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -67,19 +67,20 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Usually required unless event file has a HED column. Returns: - DataFrame: Input DataFrame, unchanged. + DataFrame: A copy of df Side-effect: Updates the relevant summary. 
""" + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = HedTypeSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) - return df + return df_new class HedTypeSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index 29812273d..d643e533d 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -24,7 +24,7 @@ class SummarizeHedValidationOp(BaseOp): "operation": "summarize_hed_validation", "required_parameters": { "summary_name": str, - "summary_filename": str + "summary_filename": str }, "optional_parameters": { "append_timecode": bool, @@ -64,19 +64,20 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Usually needed unless only HED tags in HED column of event file. Returns: - DataFrame: Input DataFrame, unchanged. + DataFrame: A copy of df Side-effect: Updates the relevant summary. 
""" + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = HedValidationSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) - return df + return df_new class HedValidationSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py index f206e2f5f..e0657ffef 100644 --- a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py +++ b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py @@ -57,28 +57,29 @@ def __init__(self, parameters): self.append_timecode = parameters.get('append_timecode', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Extract a sidecar from events file. Parameters: dispatcher (Dispatcher): The dispatcher object for managing the operations. df (DataFrame): The tabular file to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the associated summary if applicable. 
""" + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = EventsToSidecarSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name}) - return df + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name}) + return df_new class EventsToSidecarSummary(BaseSummary): @@ -95,7 +96,8 @@ def update_summary(self, new_info): new_info (dict): A dictionary with the parameters needed to update a summary. Notes: - - The summary needs a "name" str and a "df". + - The summary needs a "name" str and a "df". + """ tab_sum = TabularSummary(value_cols=self.value_cols, skip_cols=self.skip_cols, name=new_info["name"]) @@ -164,7 +166,7 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Total events={result.get('total_events', 0)} " - f"Total files={result.get('total_files', 0)}", + f"Total files={result.get('total_files', 0)}", f"Skip columns: {str(result.get('skip_cols', []))}", f"Value columns: {str(result.get('value_cols', []))}", f"Sidecar:\n{json.dumps(result['sidecar'], indent=indent)}"] diff --git a/hed/validator/onset_validator.py b/hed/validator/onset_validator.py index 8cac74422..f44b291ba 100644 --- a/hed/validator/onset_validator.py +++ b/hed/validator/onset_validator.py @@ -77,7 +77,7 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag): placeholder = def_name[found_slash + 1:] def_name = def_name[:found_slash] - def_entry = self._defs.get_def_entry(def_name) + def_entry = self._defs.get(def_name) if def_entry is None: return ErrorHandler.format_error(OnsetErrors.ONSET_DEF_UNMATCHED, tag=def_tag) if bool(def_entry.takes_value) != bool(placeholder): diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index 3c2793d94..e48333c5d 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -47,6 +47,8 @@ "SIDECAR_BRACES_INVALID", 
"SCHEMA_LIBRARY_INVALID", + + "SCHEMA_ATTRIBUTE_INVALID" ] skip_tests = { diff --git a/tests/models/test_definition_dict.py b/tests/models/test_definition_dict.py index eb5490529..61296e638 100644 --- a/tests/models/test_definition_dict.py +++ b/tests/models/test_definition_dict.py @@ -130,8 +130,8 @@ def test_expand_defs(self): definition_string = "(Definition/TestDefPlaceholder/#,(Age/#,Item/TestDef2))" def_dict.check_for_definitions(HedString(definition_string, hed_schema=self.hed_schema)) for key, test_string in test_strings.items(): - hed_string = HedString(test_string, hed_schema=self.hed_schema) - def_dict.expand_def_tags(hed_string) + hed_string = HedString(test_string, hed_schema=self.hed_schema, def_dict=def_dict) + hed_string.expand_defs() self.assertEqual(str(hed_string), expected_results[key]) if __name__ == '__main__': diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py index c88446956..09f913466 100644 --- a/tests/models/test_df_util.py +++ b/tests/models/test_df_util.py @@ -14,54 +14,54 @@ def setUp(self): def test_shrink_defs_normal(self): df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent"]}) expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"]}) - result = shrink_defs(df, self.schema, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_shrink_defs_placeholder(self): df = pd.DataFrame({"column1": ["(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem"]}) expected_df = pd.DataFrame({"column1": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) - result = shrink_defs(df, self.schema, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_shrink_defs_no_matching_tags(self): df = 
pd.DataFrame({"column1": ["(Event/SomeEvent, Item/SomeItem,Acceleration/25)"]}) expected_df = pd.DataFrame({"column1": ["(Event/SomeEvent, Item/SomeItem,Acceleration/25)"]}) - result = shrink_defs(df, self.schema, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_shrink_defs_multiple_columns(self): df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent"], "column2": ["(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem"]}) expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"], "column2": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) - result = shrink_defs(df, self.schema, ['column1', 'column2']) - pd.testing.assert_frame_equal(result, expected_df) + shrink_defs(df, self.schema, ['column1', 'column2']) + pd.testing.assert_frame_equal(df, expected_df) def test_shrink_defs_multiple_defs_same_line(self): df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Acceleration/30"]}) expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Def/TestDefPlaceholder/123,Acceleration/30"]}) - result = shrink_defs(df, self.schema, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_shrink_defs_mixed_tags(self): df = pd.DataFrame({"column1": [ "(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent,(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem,Acceleration/25"]}) expected_df = pd.DataFrame( {"column1": ["Def/TestDefNormal,Event/SomeEvent,Def/TestDefPlaceholder/123,Item/SomeItem,Acceleration/25"]}) - result = shrink_defs(df, self.schema, ['column1']) - 
pd.testing.assert_frame_equal(result, expected_df) + shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_shrink_defs_series_normal(self): series = pd.Series(["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent"]) expected_series = pd.Series(["Def/TestDefNormal,Event/SomeEvent"]) - result = shrink_defs(series, self.schema, None) - pd.testing.assert_series_equal(result, expected_series) + shrink_defs(series, self.schema, None) + pd.testing.assert_series_equal(series, expected_series) def test_shrink_defs_series_placeholder(self): series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem"]) expected_series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) - result = shrink_defs(series, self.schema, None) - pd.testing.assert_series_equal(result, expected_series) + shrink_defs(series, self.schema, None) + pd.testing.assert_series_equal(series, expected_series) class TestExpandDefs(unittest.TestCase): @@ -75,21 +75,21 @@ def test_expand_defs_normal(self): df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"]}) expected_df = pd.DataFrame( {"column1": ["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent"]}) - result = expand_defs(df, self.schema, self.def_dict, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_expand_defs_placeholder(self): df = pd.DataFrame({"column1": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) expected_df = pd.DataFrame({"column1": [ "(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem"]}) - result = expand_defs(df, self.schema, self.def_dict, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) 
def test_expand_defs_no_matching_tags(self): df = pd.DataFrame({"column1": ["(Event/SomeEvent,Item/SomeItem,Acceleration/25)"]}) expected_df = pd.DataFrame({"column1": ["(Event/SomeEvent,Item/SomeItem,Acceleration/25)"]}) - result = expand_defs(df, self.schema, self.def_dict, ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_expand_defs_multiple_columns(self): df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"], @@ -98,20 +98,20 @@ def test_expand_defs_multiple_columns(self): {"column1": ["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent"], "column2": [ "(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem"]}) - result = expand_defs(df, self.schema, self.def_dict, ['column1', 'column2']) - pd.testing.assert_frame_equal(result, expected_df) + expand_defs(df, self.schema, self.def_dict, ['column1', 'column2']) + pd.testing.assert_frame_equal(df, expected_df) def test_expand_defs_series_normal(self): series = pd.Series(["Def/TestDefNormal,Event/SomeEvent"]) expected_series = pd.Series(["(Def-expand/TestDefNormal,(Acceleration/2471,Action/TestDef2)),Event/SomeEvent"]) - result = expand_defs(series, self.schema, self.def_dict, None) - pd.testing.assert_series_equal(result, expected_series) + expand_defs(series, self.schema, self.def_dict, None) + pd.testing.assert_series_equal(series, expected_series) def test_expand_defs_series_placeholder(self): series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) expected_series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Acceleration/123,Action/TestDef2)),Item/SomeItem"]) - result = expand_defs(series, self.schema, self.def_dict, None) - pd.testing.assert_series_equal(result, expected_series) + expand_defs(series, self.schema, self.def_dict, None) + pd.testing.assert_series_equal(series, expected_series) 
class TestConvertToForm(unittest.TestCase): @@ -121,38 +121,38 @@ def setUp(self): def test_convert_to_form_short_tags(self): df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) expected_df = pd.DataFrame({"column1": ["Azure,See"]}) - result = convert_to_form(df, self.schema, "short_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + convert_to_form(df, self.schema, "short_tag", ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_convert_to_form_long_tags(self): df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Action/Perceive/See"]}) expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]}) - result = convert_to_form(df, self.schema, "long_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + convert_to_form(df, self.schema, "long_tag", ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_convert_to_form_series_short_tags(self): series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) expected_series = pd.Series(["Azure,See"]) - result = convert_to_form(series, self.schema, "short_tag") - pd.testing.assert_series_equal(result, expected_series) + convert_to_form(series, self.schema, "short_tag") + pd.testing.assert_series_equal(series, expected_series) def test_convert_to_form_series_long_tags(self): series = pd.Series(["CSS-color/White-color/Azure,Action/Perceive/See"]) expected_series = pd.Series(["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Action/Perceive/See"]) - result = convert_to_form(series, self.schema, "long_tag") - pd.testing.assert_series_equal(result, expected_series) + convert_to_form(series, self.schema, "long_tag") + 
pd.testing.assert_series_equal(series, expected_series) def test_convert_to_form_multiple_tags_short(self): df = pd.DataFrame({"column1": ["Visual-attribute/Color/CSS-color/White-color/Azure,Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) expected_df = pd.DataFrame({"column1": ["Azure,Nose,Acceleration/4.5 m-per-s^2"]}) - result = convert_to_form(df, self.schema, "short_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + convert_to_form(df, self.schema, "short_tag", ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_convert_to_form_multiple_tags_long(self): df = pd.DataFrame({"column1": ["CSS-color/White-color/Azure,Anatomical-item/Body-part/Head/Face/Nose,Rate-of-change/Acceleration/4.5 m-per-s^2"]}) expected_df = pd.DataFrame({"column1": ["Property/Sensory-property/Sensory-attribute/Visual-attribute/Color/CSS-color/White-color/Azure,Item/Biological-item/Anatomical-item/Body-part/Head/Face/Nose,Property/Data-property/Data-value/Spatiotemporal-value/Rate-of-change/Acceleration/4.5 m-per-s^2"]}) - result = convert_to_form(df, self.schema, "long_tag", ['column1']) - pd.testing.assert_frame_equal(result, expected_df) + convert_to_form(df, self.schema, "long_tag", ['column1']) + pd.testing.assert_frame_equal(df, expected_df) def test_basic_expand_detection(self): # all simple cases with no duplicates diff --git a/tests/models/test_sidecar.py b/tests/models/test_sidecar.py index 4fdacb31f..8383de6f8 100644 --- a/tests/models/test_sidecar.py +++ b/tests/models/test_sidecar.py @@ -142,7 +142,7 @@ def test_set_hed_strings(self): for column_data in sidecar: hed_strings = column_data.get_hed_strings() - hed_strings = df_util.convert_to_form(hed_strings, self.hed_schema, "long_tag") + df_util.convert_to_form(hed_strings, self.hed_schema, "long_tag") column_data.set_hed_strings(hed_strings) sidecar_long = Sidecar(os.path.join(self.base_data_dir, 
"sidecar_tests/long_tag_test.json")) self.assertEqual(sidecar.loaded_dict, sidecar_long.loaded_dict) @@ -151,7 +151,7 @@ def test_set_hed_strings(self): for column_data in sidecar: hed_strings = column_data.get_hed_strings() - hed_strings = df_util.convert_to_form(hed_strings, self.hed_schema, "short_tag") + df_util.convert_to_form(hed_strings, self.hed_schema, "short_tag") column_data.set_hed_strings(hed_strings) sidecar_short = Sidecar(os.path.join(self.base_data_dir, "sidecar_tests/short_tag_test.json")) self.assertEqual(sidecar.loaded_dict, sidecar_short.loaded_dict) diff --git a/tests/schema/test_hed_cache.py b/tests/schema/test_hed_cache.py index bf8091b15..55a343a26 100644 --- a/tests/schema/test_hed_cache.py +++ b/tests/schema/test_hed_cache.py @@ -26,10 +26,9 @@ def setUpClass(cls): cls.semantic_version_two = '1.2.4' cls.semantic_version_three = '1.2.5' cls.semantic_version_list = ['1.2.3', '1.2.4', '1.2.5'] - cls.specific_base_url = "https://api.github.com/repos/hed-standard/hed-specification/contents/hedxml" + cls.specific_base_url = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml" + cls.specific_hed_url = "https://raw.githubusercontent.com/hed-standard/hed-schemas/master/standard_schema/hedxml/HED8.0.0.xml" try: - cls.specific_hed_url = \ - """https://raw.githubusercontent.com/hed-standard/hed-specification/master/hedxml/HED8.0.0.xml""" hed_cache.cache_xml_versions(cache_folder=cls.hed_cache_dir) except urllib.error.HTTPError as e: schema.set_cache_directory(cls.saved_cache_folder) diff --git a/tests/schema/test_schema_compliance.py b/tests/schema/test_schema_compliance.py index a9eb18b9c..1578e57d9 100644 --- a/tests/schema/test_schema_compliance.py +++ b/tests/schema/test_schema_compliance.py @@ -1,18 +1,15 @@ import unittest import os +import copy from hed.schema import schema_compliance from hed import schema from hed.errors import ErrorHandler, SchemaWarnings class Test(unittest.TestCase): - # a known schema 
with some issues - schema_file = '../data/schema_tests/HED8.0.0.mediawiki' - @classmethod def setUpClass(cls): - cls.error_handler = ErrorHandler() - cls.schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.schema_file) + cls.hed_schema = schema.load_schema_version("8.1.0") def validate_term_base(self, input_text, expected_issues): for text, issues in zip(input_text, expected_issues): @@ -25,7 +22,9 @@ def validate_desc_base(self, input_descriptions, expected_issues): self.assertCountEqual(issues, test_issues) def test_validate_schema(self): - hed_schema = schema.load_schema(self.schema_path) + schema_path_with_issues = '../data/schema_tests/HED8.0.0.mediawiki' + schema_path_with_issues = os.path.join(os.path.dirname(os.path.realpath(__file__)), schema_path_with_issues) + hed_schema = schema.load_schema(schema_path_with_issues) issues = hed_schema.check_compliance() self.assertTrue(isinstance(issues, list)) self.assertTrue(len(issues) > 1) @@ -72,3 +71,34 @@ def test_validate_schema_description(self): ] self.validate_desc_base(test_descs, expected_issues) + + def test_util_placeholder(self): + tag_entry = self.hed_schema.all_tags["Event"] + attribute_name = "unitClass" + self.assertTrue(schema_compliance.tag_is_placeholder_check(self.hed_schema, tag_entry, attribute_name)) + attribute_name = "unitClass" + tag_entry = self.hed_schema.all_tags["Age/#"] + self.assertFalse(schema_compliance.tag_is_placeholder_check(self.hed_schema, tag_entry, attribute_name)) + + def test_util_suggested(self): + tag_entry = self.hed_schema.all_tags["Event/Sensory-event"] + attribute_name = "suggestedTag" + self.assertFalse(schema_compliance.tag_exists_check(self.hed_schema, tag_entry, attribute_name)) + tag_entry = self.hed_schema.all_tags["Property"] + self.assertFalse(schema_compliance.tag_exists_check(self.hed_schema, tag_entry, attribute_name)) + tag_entry = copy.deepcopy(tag_entry) + tag_entry.attributes["suggestedTag"] = "InvalidSuggestedTag" + 
self.assertTrue(schema_compliance.tag_exists_check(self.hed_schema, tag_entry, attribute_name)) + + def test_util_rooted(self): + tag_entry = self.hed_schema.all_tags["Event"] + attribute_name = "rooted" + self.assertFalse(schema_compliance.tag_exists_base_schema_check(self.hed_schema, tag_entry, attribute_name)) + tag_entry = self.hed_schema.all_tags["Property"] + self.assertFalse(schema_compliance.tag_exists_base_schema_check(self.hed_schema, tag_entry, attribute_name)) + tag_entry = copy.deepcopy(tag_entry) + tag_entry.attributes["rooted"] = "Event" + self.assertFalse(schema_compliance.tag_exists_base_schema_check(self.hed_schema, tag_entry, attribute_name)) + tag_entry = copy.deepcopy(tag_entry) + tag_entry.attributes["rooted"] = "NotRealTag" + self.assertTrue(schema_compliance.tag_exists_base_schema_check(self.hed_schema, tag_entry, attribute_name)) \ No newline at end of file diff --git a/tests/schema/test_schema_util.py b/tests/schema/test_schema_util.py index ab6099276..0fb72539d 100644 --- a/tests/schema/test_schema_util.py +++ b/tests/schema/test_schema_util.py @@ -8,7 +8,7 @@ class Test(unittest.TestCase): @classmethod def setUpClass(cls): cls.default_test_url = \ - """https://raw.githubusercontent.com/hed-standard/hed-specification/master/hedxml/HED8.0.0.xml""" + """https://raw.githubusercontent.com/hed-standard/hed-schemas/master/standard_schema/hedxml/HED8.0.0.xml""" cls.hed_xml_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../schema_tests/HED8.0.0t.xml') diff --git a/tests/schema/test_schema_wiki_fatal_errors.py b/tests/schema/test_schema_wiki_fatal_errors.py index 50f3cf6ca..583579b17 100644 --- a/tests/schema/test_schema_wiki_fatal_errors.py +++ b/tests/schema/test_schema_wiki_fatal_errors.py @@ -96,7 +96,7 @@ def test_merging_errors_schema(self): error_handler.push_error_context(ErrorContext.ROW, 1) error_handler.push_error_context(ErrorContext.COLUMN, 2) - issues = 
error_handler.format_error_with_context(SchemaErrors.HED_SCHEMA_ATTRIBUTE_INVALID, + issues = error_handler.format_error_with_context(SchemaErrors.SCHEMA_ATTRIBUTE_INVALID, "error_attribute", source_tag="error_tag") error_handler.pop_error_context() error_handler.pop_error_context() diff --git a/tests/tools/remodeling/operations/test_summarize_definitions_op.py b/tests/tools/remodeling/operations/test_summarize_definitions_op.py index c01e949be..4b4784f64 100644 --- a/tests/tools/remodeling/operations/test_summarize_definitions_op.py +++ b/tests/tools/remodeling/operations/test_summarize_definitions_op.py @@ -41,7 +41,7 @@ def test_do_op(self): parms = json.loads(self.json_parms) sum_op = SummarizeDefinitionsOp(parms) df = pd.read_csv(self.data_path, delimiter='\t', header=0, keep_default_na=False, na_values=",null") - df_new = sum_op.do_op(dispatch, dispatch.prep_data(df), 'subj2_run1', sidecar=self.json_path) + df_new = sum_op.do_op(dispatch, df, 'subj2_run1', sidecar=self.json_path) self.assertEqual(200, len(df_new), " dataframe length is correct") self.assertEqual(10, len(df_new.columns), " has correct number of columns") self.assertIn(sum_op.summary_name, dispatch.summary_dicts) @@ -78,7 +78,8 @@ def test_summary_errors(self): df_new = sum_op.do_op(dispatch, dispatch.prep_data(df), 'subj2_run1', sidecar=self.json_path) self.assertIn(sum_op.summary_name, dispatch.summary_dicts) self.assertIsInstance(dispatch.summary_dicts[sum_op.summary_name], DefinitionSummary) - #print(str(dispatch.summary_dicts[sum_op.summary_name].get_text_summary()['Dataset'])) + # print(str(dispatch.summary_dicts[sum_op.summary_name].get_text_summary()['Dataset'])) + if __name__ == '__main__': unittest.main() diff --git a/tests/validator/test_def_validator.py b/tests/validator/test_def_validator.py index 7464e985d..bbaf3eb58 100644 --- a/tests/validator/test_def_validator.py +++ b/tests/validator/test_def_validator.py @@ -50,7 +50,7 @@ def test_expand_def_tags_placeholder_invalid(self): 
test_string = HedString(placeholder_label_def_string_no_placeholder, self.hed_schema) def_issues = def_validator.validate_def_tags(test_string) - def_issues += def_validator.expand_def_tags(test_string) + test_string.expand_defs() self.assertEqual(str(test_string), placeholder_label_def_string_no_placeholder) self.assertTrue(def_issues) @@ -66,7 +66,7 @@ def test_expand_def_tags_placeholder_invalid(self): test_string = HedString(label_def_string_has_invalid_placeholder, self.hed_schema) def_issues = def_validator.validate_def_tags(test_string) - def_issues += def_validator.expand_def_tags(test_string) + test_string.expand_defs() self.assertEqual(str(test_string), label_def_string_has_invalid_placeholder) self.assertTrue(def_issues)