From 08c7d2be477c14d13fe56ca7b8396d5c3c04b336 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Sun, 27 Apr 2025 11:09:49 -0500 Subject: [PATCH] Updated wikimedia to correctly save external annotations --- hed/schema/schema_io/base2schema.py | 8 ++++---- hed/schema/schema_io/df2schema.py | 3 ++- hed/schema/schema_io/df_util.py | 21 +++++++++++++++++++++ hed/schema/schema_io/wiki2schema.py | 6 ++++-- hed/schema/schema_io/wiki_constants.py | 6 ++++++ 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index 5daca8889..45f78f62e 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -210,6 +210,7 @@ def find_rooted_entry(tag_entry, schema, loading_merged): return None return rooted_entry + return None def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed", error_code=HedExceptions.WIKI_DELIMITERS_INVALID): @@ -224,11 +225,10 @@ def fix_extras(self): for key, extra in self._schema.extras.items(): self._schema.extras[key] = extra.rename(columns=df_constants.EXTRAS_CONVERSIONS) if key in df_constants.extras_column_dict: - self._schema.extras[key] = self.fix_extra(self._schema, key) + self._schema.extras[key] = self.fix_extra(key) - @staticmethod - def fix_extra(schema, key): - df = schema.extras[key] + def fix_extra(self, key): + df = self._schema.extras[key] priority_cols = df_constants.extras_column_dict[key] col_to_add = [col for col in priority_cols if col not in df.columns] if col_to_add: diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py index 46a8f1310..60a53144b 100644 --- a/hed/schema/schema_io/df2schema.py +++ b/hed/schema/schema_io/df2schema.py @@ -87,7 +87,8 @@ def _parse_data(self): f"parameter on this exception for more details.", self.name, issues=self.fatal_errors) extras = {key: self.input_data[key] for key 
in constants.DF_EXTRA_SUFFIXES if key in self.input_data} - self._schema.extras = extras + for key, item in extras.items(): + self._schema.extras[key] = df_util.merge_dataframes(extras[key], self._schema.extras.get(key, None), key) def _get_prologue_epilogue(self, file_data): prologue, epilogue = "", "" diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py index 0de0a7114..b03a1a655 100644 --- a/hed/schema/schema_io/df_util.py +++ b/hed/schema/schema_io/df_util.py @@ -11,6 +11,27 @@ UNKNOWN_LIBRARY_VALUE = 0 +def merge_dataframes(df1, df2, key) : + """ Create a new dataframe where df2 is merged into df1 and duplicates are eliminated. + + Parameters: + df1(pd.DataFrame): dataframe to use as the merge destination. + df2(pd.DataFrame): dataframe to merge into df1 (may be None or empty, in which case df1 is returned). + key(str): name identifying the dataframes being merged (used only in the error message). + + Returns: + pd.DataFrame: The merged dataframe. + """ + if df2 is None or df2.empty: + return df1 + if set(df1.columns) != set(df2.columns): + raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, + f"Both dataframes corresponding to {key} to be merged must have the same columns. " + f"df1 columns: {list(df1.columns)} df2 columns: {list(df2.columns)}", "") + combined = pd.concat([df1, df2], ignore_index=True) + combined = combined.sort_values(by=list(combined.columns)) + combined = combined.drop_duplicates() + return combined def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NAME): """ Create a new dictionary of DataFrames where dict2 is merged into dict1. 
diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index 9a19daa67..71cd14a10 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -9,7 +9,7 @@ from hed.errors import error_reporter from hed.schema.schema_io import wiki_constants, df_constants from hed.schema.schema_io.base2schema import SchemaLoader -from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionNames +from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionNames, WIKI_EXTRA_DICT from hed.schema.schema_io import text_util @@ -120,7 +120,9 @@ def _parse_extras(self, wiki_lines_by_section): if not data: continue df = pd.DataFrame(data).fillna('').astype(str) - self._schema.extras[extra_key.strip('"')] = df + stripped_key = extra_key.strip("'") + stripped_key = WIKI_EXTRA_DICT.get(stripped_key, stripped_key) + self._schema.extras[stripped_key] = df @staticmethod def parse_star_string(s): diff --git a/hed/schema/schema_io/wiki_constants.py b/hed/schema/schema_io/wiki_constants.py index 8c6fcd368..abea7d72d 100644 --- a/hed/schema/schema_io/wiki_constants.py +++ b/hed/schema/schema_io/wiki_constants.py @@ -1,4 +1,5 @@ from hed.schema.hed_schema_constants import HedSectionKey +from hed.schema.schema_io import df_constants START_HED_STRING = "!# start schema" END_SCHEMA_STRING = "!# end schema" END_HED_STRING = "!# end hed" @@ -17,6 +18,11 @@ PREFIXES_SECTION_ELEMENT = "'''Prefixes'''" EXTERNAL_ANNOTATION_SECTION_ELEMENT = "'''External annotations'''" +WIKI_EXTRA_DICT = {'Sources': df_constants.SOURCES_KEY, + 'Prefixes': df_constants.PREFIXES_KEY, + 'External annotations': df_constants.EXTERNAL_ANNOTATION_KEY} + + wiki_section_headers = { HedSectionKey.Tags: START_HED_STRING, HedSectionKey.UnitClasses: UNIT_CLASS_STRING,