From 08c7d2be477c14d13fe56ca7b8396d5c3c04b336 Mon Sep 17 00:00:00 2001
From: Kay Robbins <1189050+VisLab@users.noreply.github.com>
Date: Sun, 27 Apr 2025 11:09:49 -0500
Subject: [PATCH] Updated wikimedia to correctly save external annotations

---
 hed/schema/schema_io/base2schema.py    |  8 ++++----
 hed/schema/schema_io/df2schema.py      |  3 ++-
 hed/schema/schema_io/df_util.py        | 21 +++++++++++++++++++++
 hed/schema/schema_io/wiki2schema.py    |  6 ++++--
 hed/schema/schema_io/wiki_constants.py |  6 ++++++
 5 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py
index 5daca8889..45f78f62e 100644
--- a/hed/schema/schema_io/base2schema.py
+++ b/hed/schema/schema_io/base2schema.py
@@ -210,6 +210,7 @@ def find_rooted_entry(tag_entry, schema, loading_merged):
                 return None
 
             return rooted_entry
+        return None
 
     def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
                          error_code=HedExceptions.WIKI_DELIMITERS_INVALID):
@@ -224,11 +225,10 @@ def fix_extras(self):
         for key, extra in self._schema.extras.items():
             self._schema.extras[key] = extra.rename(columns=df_constants.EXTRAS_CONVERSIONS)
             if key in df_constants.extras_column_dict:
-               self._schema.extras[key] = self.fix_extra(self._schema, key)
+               self._schema.extras[key] = self.fix_extra(key)
 
-    @staticmethod
-    def fix_extra(schema, key):
-        df = schema.extras[key]
+    def fix_extra(self, key):
+        df = self._schema.extras[key]
         priority_cols = df_constants.extras_column_dict[key]
         col_to_add = [col for col in priority_cols if col not in df.columns]
         if col_to_add:
diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py
index 46a8f1310..60a53144b 100644
--- a/hed/schema/schema_io/df2schema.py
+++ b/hed/schema/schema_io/df2schema.py
@@ -87,7 +87,8 @@ def _parse_data(self):
                                f"parameter on this exception for more details.", self.name,
                                issues=self.fatal_errors)
         extras =  {key: self.input_data[key] for key in constants.DF_EXTRA_SUFFIXES if key in self.input_data}
-        self._schema.extras = extras
+        for key, item in extras.items():
+            self._schema.extras[key] = df_util.merge_dataframes(extras[key], self._schema.extras.get(key, None), key)
 
     def _get_prologue_epilogue(self, file_data):
         prologue, epilogue = "", ""
diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py
index 0de0a7114..b03a1a655 100644
--- a/hed/schema/schema_io/df_util.py
+++ b/hed/schema/schema_io/df_util.py
@@ -11,6 +11,27 @@
 
 UNKNOWN_LIBRARY_VALUE = 0
 
+def merge_dataframes(df1, df2, key) :
+    """ Return the rows of df1 and df2 combined, sorted by all columns, with duplicate rows removed.
+
+    Parameters:
+        df1(pd.DataFrame): Base dataframe; returned unchanged when df2 is None or empty.
+        df2(pd.DataFrame or None): Dataframe whose rows are added; must have the same column names as df1.
+        key(str): Label for the dataframes being merged; used only in the column-mismatch error message.
+
+    Returns:
+        pd.DataFrame: The merged dataframe (HedFileError is raised if the column names differ).
+    """
+    if df2 is None or df2.empty:
+        return df1
+    if set(df1.columns) != set(df2.columns):
+        raise HedFileError(HedExceptions.BAD_COLUMN_NAMES,
+                           f"Both dataframes corresponding to {key} to be merged must have the same columns.  "
+                           f"df1 columns: {list(df1.columns)} df2 columns: {list(df2.columns)}", "")
+    combined = pd.concat([df1, df2], ignore_index=True)
+    combined = combined.sort_values(by=list(combined.columns))
+    combined = combined.drop_duplicates()
+    return combined
 
 def merge_dataframe_dicts(df_dict1, df_dict2, key_column=constants.KEY_COLUMN_NAME):
     """ Create a new dictionary of DataFrames where dict2 is merged into dict1.
diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py
index 9a19daa67..71cd14a10 100644
--- a/hed/schema/schema_io/wiki2schema.py
+++ b/hed/schema/schema_io/wiki2schema.py
@@ -9,7 +9,7 @@
 from hed.errors import error_reporter
 from hed.schema.schema_io import wiki_constants, df_constants
 from hed.schema.schema_io.base2schema import SchemaLoader
-from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionNames
+from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionNames, WIKI_EXTRA_DICT
 from hed.schema.schema_io import text_util
 
 
@@ -120,7 +120,9 @@ def _parse_extras(self, wiki_lines_by_section):
             if not data:
                 continue
             df = pd.DataFrame(data).fillna('').astype(str)
-            self._schema.extras[extra_key.strip('"')] = df
+            stripped_key = extra_key.strip("'")
+            stripped_key = WIKI_EXTRA_DICT.get(stripped_key, stripped_key)
+            self._schema.extras[stripped_key] = df
 
     @staticmethod
     def parse_star_string(s):
diff --git a/hed/schema/schema_io/wiki_constants.py b/hed/schema/schema_io/wiki_constants.py
index 8c6fcd368..abea7d72d 100644
--- a/hed/schema/schema_io/wiki_constants.py
+++ b/hed/schema/schema_io/wiki_constants.py
@@ -1,4 +1,5 @@
 from hed.schema.hed_schema_constants import HedSectionKey
+from hed.schema.schema_io import df_constants
 START_HED_STRING = "!# start schema"
 END_SCHEMA_STRING = "!# end schema"
 END_HED_STRING = "!# end hed"
@@ -17,6 +18,11 @@
 PREFIXES_SECTION_ELEMENT = "'''Prefixes'''"
 EXTERNAL_ANNOTATION_SECTION_ELEMENT = "'''External annotations'''"
 
+WIKI_EXTRA_DICT = {'Sources': df_constants.SOURCES_KEY,  # wiki extras section name -> df_constants extras key
+                   'Prefixes': df_constants.PREFIXES_KEY,
+                   'External annotations': df_constants.EXTERNAL_ANNOTATION_KEY}
+
+
 wiki_section_headers = {
     HedSectionKey.Tags: START_HED_STRING,
     HedSectionKey.UnitClasses: UNIT_CLASS_STRING,