hed-standard · VisLab · Apr 8, 2025 · Apr 1, 2025 · Apr 3, 2025 · Apr 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -62,6 +62,7 @@ var/
 .installed.cfg
 *.egg
 tests/scratch
+tests/test_output
 
 # Installer logs
 pip-log.txt

diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py
@@ -40,6 +40,7 @@ class HedExceptions:
     # This issue will contain a list of lines with issues.
     WIKI_DELIMITERS_INVALID = 'WIKI_DELIMITERS_INVALID'
     WIKI_LINE_START_INVALID = 'WIKI_LINE_START_INVALID'
+    WIKI_LINE_INVALID = 'WIKI_LINE_INVALID'
     HED_SCHEMA_NODE_NAME_INVALID = 'HED_SCHEMA_NODE_NAME_INVALID'
 
     SCHEMA_DUPLICATE_PREFIX = 'SCHEMA_LOAD_FAILED'

diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py
@@ -29,6 +29,7 @@ def __init__(self):
         self.filename = None
         self.prologue = ""
         self.epilogue = ""
+        self.extras = {} # Used to store any additional data that might be needed for serialization (like OWL or other formats)
 
         # This is the specified library name_prefix - tags will be {schema_namespace}:{tag_name}
         self._namespace = ""
@@ -227,6 +228,22 @@ def valid_prefixes(self):
         """
         return [self._namespace]
 
+    def get_extras(self, extras_key):
+        """ Get the extras dataframe stored under the given key.
+
+        Parameters:
+            extras_key (str): The key to look up in the extras dictionary.
+
+        Returns:
+            DataFrame or None: The stored extras, or None if the key is missing or the stored dataframe is empty.
+        """
+        if not hasattr(self, 'extras') or extras_key not in self.extras:
+            return None
+        externals = self.extras[extras_key]
+        if externals.empty:
+            return None
+        return externals
+
     # ===============================================
     # Creation and saving functions
     # ===============================================
@@ -366,12 +383,16 @@ def __eq__(self, other):
         if other is None:
             return False
         if self.get_save_header_attributes() != other.get_save_header_attributes():
+            # print(f"Header attributes not equal: '{self.get_save_header_attributes()}' vs '{other.get_save_header_attributes()}'")
             return False
         if self.has_duplicates() != other.has_duplicates():
+            # print(f"Duplicates: '{self.has_duplicates()}' vs '{other.has_duplicates()}'")
             return False
         if self.prologue.strip() != other.prologue.strip():
+            # print(f"PROLOGUE NOT EQUAL: '{self.prologue.strip()}' vs '{other.prologue.strip()}'")
             return False
         if self.epilogue.strip() != other.epilogue.strip():
+            # print(f"EPILOGUE NOT EQUAL: '{self.epilogue.strip()}' vs '{other.epilogue.strip()}'")
             return False
         if self._sections != other._sections:
             # This block is useful for debugging when modifying the schema class itself.
@@ -394,6 +415,7 @@ def __eq__(self, other):
             #                     print(s)
             return False
         if self._namespace != other._namespace:
+            # print(f"NAMESPACE NOT EQUAL: '{self._namespace}' vs '{other._namespace}'")
             return False
         return True
 
@@ -473,6 +495,7 @@ def find_tag_entry(self, tag, schema_namespace=""):
             return None, None, validation_issues
         return self._find_tag_entry(tag, schema_namespace)
 
+
     # ===============================================
     # Private utility functions for getting/finding tags
     # ===============================================

diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py
@@ -212,5 +212,4 @@ def find_rooted_entry(tag_entry, schema, loading_merged):
 
     def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
                          error_code=HedExceptions.WIKI_DELIMITERS_INVALID):
-
         self.fatal_errors += schema_util.format_error(line_number, line, warning_message, error_code)
diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py
@@ -8,7 +8,7 @@
 from hed.errors.exceptions import HedFileError, HedExceptions
 from hed.schema.schema_io.base2schema import SchemaLoader
 import pandas as pd
-import hed.schema.hed_schema_df_constants as constants
+import hed.schema.schema_io.df_constants as constants
 from hed.errors import error_reporter
 from hed.schema.schema_io import text_util
 
@@ -44,7 +44,9 @@ def load_spreadsheet(cls, filenames=None, schema_as_strings_or_df=None, name="")
             schema(HedSchema): The new schema
         """
         loader = cls(filenames, schema_as_strings_or_df=schema_as_strings_or_df, name=name)
-        return loader._load()
+        hed_schema = loader._load()
+        cls._fix_extras(hed_schema)
+        return hed_schema
 
     def _open_file(self):
         if self.filenames:
@@ -54,6 +56,20 @@ def _open_file(self):
 
         return dataframes
 
+    @staticmethod
+    def _fix_extras(hed_schema):
+        """ Rename the columns of each extras entry via constants.EXTRAS_CONVERSIONS after loading.
+
+        Parameters:
+            hed_schema (HedSchema): The loaded schema; left untouched if falsy or without extras.
+
+        """
+        extras = getattr(hed_schema, 'extras', None) if hed_schema else None
+        if not extras:
+            return
+        for key in list(extras):
+            extras[key] = extras[key].rename(columns=constants.EXTRAS_CONVERSIONS)
+
     def _get_header_attributes(self, file_data):
         header_attributes = {}
         for row_number, row in file_data[constants.STRUCT_KEY].iterrows():
@@ -90,7 +106,7 @@ def _get_prologue_epilogue(self, file_data):
         prologue, epilogue = "", ""
         for row_number, row in file_data[constants.STRUCT_KEY].iterrows():
             cls = row[constants.subclass_of]
-            description = row[constants.description]
+            description = row[constants.dcdescription]
             if cls == "HedPrologue" and description:
                 prologue = description.replace("\\n", "\n")
                 continue
@@ -232,7 +248,7 @@ def _create_entry(self, row_number, row, key_class, full_tag_name=None):
         if hed_id:
             node_attributes[HedKey.HedID] = hed_id
 
-        description = row[constants.description]
+        description = row[constants.dcdescription]
         tag_entry = self._schema._create_tag_entry(element_name, key_class)
 
         if description:

diff --git a/hed/schema/hed_schema_df_constants.py → hed/schema/schema_io/df_constants.py b/hed/schema/hed_schema_df_constants.py → hed/schema/schema_io/df_constants.py
@@ -19,16 +19,17 @@
 
 PREFIXES_KEY = "Prefixes"
 EXTERNAL_ANNOTATION_KEY = "AnnotationPropertyExternal"
+SOURCES_KEY = "Sources"
 
 PROPERTY_KEYS = [ANNOTATION_KEY, DATA_KEY, OBJECT_KEY]
 DF_SUFFIXES = {TAG_KEY, STRUCT_KEY, VALUE_CLASS_KEY,
                UNIT_CLASS_KEY, UNIT_KEY, UNIT_MODIFIER_KEY,
-               *PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY, PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY}
+               *PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY, PREFIXES_KEY,
+               EXTERNAL_ANNOTATION_KEY, SOURCES_KEY}
 
 
-DF_EXTRA_SUFFIXES = {PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY}
+DF_EXTRA_SUFFIXES = {PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY, SOURCES_KEY}
 #DF_SUFFIXES_OMN = {*DF_SUFFIXES, *DF_EXTRA_SUFFIXES}
-DF_SUFFIXES_OMN = DF_SUFFIXES
 
 section_mapping_hed_id = {
     STRUCT_KEY: None,
@@ -43,33 +44,55 @@
     ATTRIBUTE_PROPERTY_KEY: HedSectionKey.Properties,
 }
 
+section_key_to_suffixes = {
+    HedSectionKey.Tags: [TAG_KEY],
+    HedSectionKey.Units: [UNIT_KEY],
+    HedSectionKey.UnitClasses: [UNIT_CLASS_KEY],
+    HedSectionKey.UnitModifiers: [UNIT_MODIFIER_KEY],
+    HedSectionKey.ValueClasses: [VALUE_CLASS_KEY],
+    HedSectionKey.Attributes: [DATA_KEY, OBJECT_KEY, ANNOTATION_KEY],
+    HedSectionKey.Properties: [ATTRIBUTE_PROPERTY_KEY],
+}
+
 # Spreadsheet column ids
 hed_id = "hedId"
 level = "Level"
 name = "rdfs:label"
 subclass_of = "omn:SubClassOf"
 attributes = "Attributes"
-description = "dc:description"
+dcdescription = "dc:description"
 equivalent_to = "omn:EquivalentTo"
 has_unit_class = "hasUnitClass"
-annotations = "Annotations"
-
-struct_columns = [hed_id, name, attributes, subclass_of, description]
-tag_columns = [hed_id, name, level, subclass_of, attributes, description]
-unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, description]
+prefix = "prefix"  # column listed in prefix_columns and external_annotation_columns
+namespace = "namespace"  # column listed in prefix_columns
+id = "id"  # column listed in external_annotation_columns (NOTE: shadows the builtin id in this module)
+iri = "iri"  # column listed in external_annotation_columns
+source = "source"  # column listed in source_columns
+link = "link"  # column listed in source_columns
+type = "Type"  # column listed in attribute_columns and property_columns (NOTE: shadows the builtin type)
+domain = "omn:Domain"  # column listed in attribute_columns
+range = "omn:Range"  # column listed in attribute_columns (NOTE: shadows the builtin range)
+properties = "Properties"  # column listed in attribute_columns
+description = "description"  # plain description column of the prefix and external annotation lists
+
+struct_columns = [hed_id, name, attributes, subclass_of, dcdescription]
+tag_columns = [hed_id, name, level, subclass_of, attributes, dcdescription]
+unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, dcdescription]
+attribute_columns = [hed_id, name, type, domain, range, properties, dcdescription]  # Schema attribute columns
+property_columns = [hed_id, name, type, dcdescription]  # Reduced column set without domain/range/properties
+prefix_columns = [prefix, namespace, description]  # Columns of the prefixes dataframe
+external_annotation_columns = [prefix, id, iri, description]  # Columns of the external annotations dataframe
+source_columns = [source, link]  # For the sources section
 
 # The columns for unit class, value class, and unit modifier
-other_columns = [hed_id, name, subclass_of, attributes, description]
+other_columns = [hed_id, name, subclass_of, attributes, dcdescription]
 
 # for schema attributes
 property_type = "Type"
 property_domain = "omn:Domain"
 property_range = "omn:Range"
 properties = "Properties"
-property_columns = [hed_id, name, property_type, property_domain, property_range, properties, description]
 
-# For the schema properties
-property_columns_reduced = [hed_id, name, property_type, description]
 
 # HED_00X__YY where X is the library starting index, and Y is the entity number below.
 struct_base_ids = {
@@ -95,7 +118,11 @@
     hed_schema_constants.UNMERGED_ATTRIBUTE: "HED_0000303"
 }
 
-# Extra spreadsheet column ideas
-Prefix = "Prefix"
-ID = "ID"
-NamespaceIRI = "Namespace IRI"
+# Extra spreadsheet columns
+EXTRAS_CONVERSIONS = {"Prefix": "prefix", "namespace IRI": "namespace", "namespace iri": "namespace", "ID": "id",
+                      "definition": "description", "Description": "description", "IRI": "iri"}
+
+
+Prefix = "prefix"
+ID = "id"
+NamespaceIRI = "namespaceIRI"
diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 from hed.errors import HedFileError, HedExceptions
-from hed.schema import hed_schema_df_constants as constants
+from hed.schema.schema_io import df_constants as constants
 from hed.schema.hed_schema_constants import HedKey
 from hed.schema.hed_cache import get_library_data
 from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line
@@ -83,18 +83,17 @@ def save_dataframes(base_filename, dataframe_dict):
                              lineterminator="\n")
 
 
-def convert_filenames_to_dict(filenames, include_prefix_dfs=False):
+def convert_filenames_to_dict(filenames):
     """Infers filename meaning based on suffix, e.g. _Tag for the tags sheet
 
     Parameters:
         filenames(str or None or list or dict): The list to convert to a dict
             If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file
             If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files.
-        include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
     Returns:
         filename_dict(str: str): The required suffix to filename mapping"""
     result_filenames = {}
-    dataframe_names = constants.DF_SUFFIXES_OMN if include_prefix_dfs else constants.DF_SUFFIXES
+    dataframe_names = constants.DF_SUFFIXES
     if isinstance(filenames, str):
         if filenames.endswith(".tsv"):
             base, base_ext = os.path.splitext(filenames)
@@ -126,37 +125,46 @@ def create_empty_dataframes():
                 constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                 constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
                 constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
-                constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
-                constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
-                constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
-                constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), }
+                constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.attribute_columns, dtype=str),
+                constants.DATA_KEY: pd.DataFrame(columns=constants.attribute_columns, dtype=str),
+                constants.OBJECT_KEY: pd.DataFrame(columns=constants.attribute_columns, dtype=str),
+                constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
+                constants.PREFIXES_KEY: pd.DataFrame(columns=constants.prefix_columns, dtype=str),
+                constants.SOURCES_KEY: pd.DataFrame(columns=constants.source_columns, dtype=str),
+                constants.EXTERNAL_ANNOTATION_KEY:
+                    pd.DataFrame(columns=constants.external_annotation_columns, dtype=str)
+                }
     return base_dfs
 
 
-def load_dataframes(filenames, include_prefix_dfs=False):
+def load_dataframes(filenames):
     """Load the dataframes from the source folder or series of files.
 
     Parameters:
         filenames(str or None or list or dict): The input filenames
             If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file
             If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files.
-        include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
     Returns:
         dataframes_dict(str: dataframes): The suffix:dataframe dict
     """
-    dict_filenames = convert_filenames_to_dict(filenames, include_prefix_dfs=include_prefix_dfs)
+    dict_filenames = convert_filenames_to_dict(filenames)
     dataframes = create_empty_dataframes()
     for key, filename in dict_filenames.items():
         try:
-            loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
             if key in dataframes:
+                loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
+                loaded_dataframe = loaded_dataframe.rename(columns=constants.EXTRAS_CONVERSIONS)
+
                 columns_not_in_loaded = dataframes[key].columns[~dataframes[key].columns.isin(loaded_dataframe.columns)]
                 # and not dataframes[key].columns.isin(loaded_dataframe.columns).all():
                 if columns_not_in_loaded.any():
                     raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
-                                       f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}.  "
+                                          f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}.  "
                                        f"The required columns are {list(dataframes[key].columns)}", filename=filename)
-            dataframes[key] = loaded_dataframe
+                dataframes[key] = loaded_dataframe
+            elif os.path.exists(filename):
+                # Handle the extra files if they are present.
+                dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
         except OSError:
             # todo: consider if we want to report this error(we probably do)
             pass  # We will use a blank one for this
Original file line number	Diff line number	Diff line change
Expand Up		@@ -212,5 +212,4 @@ def find_rooted_entry(tag_entry, schema, loading_merged):

		def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
		error_code=HedExceptions.WIKI_DELIMITERS_INVALID):

		self.fatal_errors += schema_util.format_error(line_number, line, warning_message, error_code)