Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ var/
.installed.cfg
*.egg
tests/scratch
tests/test_output

# Installer logs
pip-log.txt
Expand Down
1 change: 1 addition & 0 deletions hed/errors/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class HedExceptions:
# This issue will contain a list of lines with issues.
WIKI_DELIMITERS_INVALID = 'WIKI_DELIMITERS_INVALID'
WIKI_LINE_START_INVALID = 'WIKI_LINE_START_INVALID'
WIKI_LINE_INVALID = 'WIKI_LINE_INVALID'
HED_SCHEMA_NODE_NAME_INVALID = 'HED_SCHEMA_NODE_NAME_INVALID'

SCHEMA_DUPLICATE_PREFIX = 'SCHEMA_LOAD_FAILED'
Expand Down
23 changes: 23 additions & 0 deletions hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(self):
self.filename = None
self.prologue = ""
self.epilogue = ""
self.extras = {} # Used to store any additional data that might be needed for serialization (like OWL or other formats)

# This is the specified library name_prefix - tags will be {schema_namespace}:{tag_name}
self._namespace = ""
Expand Down Expand Up @@ -227,6 +228,22 @@ def valid_prefixes(self):
"""
return [self._namespace]

def get_extras(self, extras_key):
    """ Get the extras dataframe stored under the given key.

    Parameters:
        extras_key (str): The key to look up in the extras dictionary.

    Returns:
        Union[DataFrame, None]: The extras stored under this key, or None if this object has no
            extras, the key is not present, or the stored dataframe is empty.
    """
    # getattr keeps the original tolerance for objects that lack an `extras` attribute.
    extras = getattr(self, 'extras', None)
    if extras is None or extras_key not in extras:
        return None
    externals = extras[extras_key]
    # An empty dataframe carries no information, so report it the same way as a missing key.
    # (The previous code had a bare `None` expression here, which silently did nothing.)
    if externals.empty:
        return None
    return externals

# ===============================================
# Creation and saving functions
# ===============================================
Expand Down Expand Up @@ -366,12 +383,16 @@ def __eq__(self, other):
if other is None:
return False
if self.get_save_header_attributes() != other.get_save_header_attributes():
# print(f"Header attributes not equal: '{self.get_save_header_attributes()}' vs '{other.get_save_header_attributes()}'")
return False
if self.has_duplicates() != other.has_duplicates():
# print(f"Duplicates: '{self.has_duplicates()}' vs '{other.has_duplicates()}'")
return False
if self.prologue.strip() != other.prologue.strip():
# print(f"PROLOGUE NOT EQUAL: '{self.prologue.strip()}' vs '{other.prologue.strip()}'")
return False
if self.epilogue.strip() != other.epilogue.strip():
# print(f"EPILOGUE NOT EQUAL: '{self.epilogue.strip()}' vs '{other.epilogue.strip()}'")
return False
if self._sections != other._sections:
# This block is useful for debugging when modifying the schema class itself.
Expand All @@ -394,6 +415,7 @@ def __eq__(self, other):
# print(s)
return False
if self._namespace != other._namespace:
# print(f"NAMESPACE NOT EQUAL: '{self._namespace}' vs '{other._namespace}'")
return False
return True

Expand Down Expand Up @@ -473,6 +495,7 @@ def find_tag_entry(self, tag, schema_namespace=""):
return None, None, validation_issues
return self._find_tag_entry(tag, schema_namespace)


# ===============================================
# Private utility functions for getting/finding tags
# ===============================================
Expand Down
1 change: 0 additions & 1 deletion hed/schema/schema_io/base2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,5 +212,4 @@ def find_rooted_entry(tag_entry, schema, loading_merged):

def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed",
                     error_code=HedExceptions.WIKI_DELIMITERS_INVALID):
    """ Record a fatal loading error for the given source line.

    Parameters:
        line_number (int): The number of the offending line, passed through to the formatter.
        line (str): The text of the offending line, passed through to the formatter.
        warning_message (str): The message describing the problem.
        error_code (str): The HedExceptions code to attach to the issue.
    """
    # NOTE(review): format_error presumably returns a list of issue dicts - confirm in schema_util.
    new_issues = schema_util.format_error(line_number, line, warning_message, error_code)
    self.fatal_errors += new_issues
24 changes: 20 additions & 4 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.base2schema import SchemaLoader
import pandas as pd
import hed.schema.hed_schema_df_constants as constants
import hed.schema.schema_io.df_constants as constants
from hed.errors import error_reporter
from hed.schema.schema_io import text_util

Expand Down Expand Up @@ -44,7 +44,9 @@ def load_spreadsheet(cls, filenames=None, schema_as_strings_or_df=None, name="")
schema(HedSchema): The new schema
"""
loader = cls(filenames, schema_as_strings_or_df=schema_as_strings_or_df, name=name)
return loader._load()
hed_schema = loader._load()
cls._fix_extras(hed_schema)
return hed_schema

def _open_file(self):
if self.filenames:
Expand All @@ -54,6 +56,20 @@ def _open_file(self):

return dataframes

@staticmethod
def _fix_extras(hed_schema):
    """ Normalize the column names of every extras dataframe on a freshly loaded schema.

    Parameters:
        hed_schema (HedSchema): The loaded schema whose extras should be renamed in place.
            Nothing is done if it is falsy, has no `extras` attribute, or its extras are empty.

    """
    if not hed_schema:
        return
    if not hasattr(hed_schema, 'extras'):
        return
    if not hed_schema.extras:
        return

    # Snapshot the keys first; each value is then replaced by a copy with converted column names.
    for extras_key in list(hed_schema.extras):
        original_df = hed_schema.extras[extras_key]
        hed_schema.extras[extras_key] = original_df.rename(columns=constants.EXTRAS_CONVERSIONS)

def _get_header_attributes(self, file_data):
header_attributes = {}
for row_number, row in file_data[constants.STRUCT_KEY].iterrows():
Expand Down Expand Up @@ -90,7 +106,7 @@ def _get_prologue_epilogue(self, file_data):
prologue, epilogue = "", ""
for row_number, row in file_data[constants.STRUCT_KEY].iterrows():
cls = row[constants.subclass_of]
description = row[constants.description]
description = row[constants.dcdescription]
if cls == "HedPrologue" and description:
prologue = description.replace("\\n", "\n")
continue
Expand Down Expand Up @@ -232,7 +248,7 @@ def _create_entry(self, row_number, row, key_class, full_tag_name=None):
if hed_id:
node_attributes[HedKey.HedID] = hed_id

description = row[constants.description]
description = row[constants.dcdescription]
tag_entry = self._schema._create_tag_entry(element_name, key_class)

if description:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,17 @@

PREFIXES_KEY = "Prefixes"
EXTERNAL_ANNOTATION_KEY = "AnnotationPropertyExternal"
SOURCES_KEY = "Sources"

PROPERTY_KEYS = [ANNOTATION_KEY, DATA_KEY, OBJECT_KEY]
DF_SUFFIXES = {TAG_KEY, STRUCT_KEY, VALUE_CLASS_KEY,
UNIT_CLASS_KEY, UNIT_KEY, UNIT_MODIFIER_KEY,
*PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY, PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY}
*PROPERTY_KEYS, ATTRIBUTE_PROPERTY_KEY, PREFIXES_KEY,
EXTERNAL_ANNOTATION_KEY, SOURCES_KEY}


DF_EXTRA_SUFFIXES = {PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY}
DF_EXTRA_SUFFIXES = {PREFIXES_KEY, EXTERNAL_ANNOTATION_KEY, SOURCES_KEY}
#DF_SUFFIXES_OMN = {*DF_SUFFIXES, *DF_EXTRA_SUFFIXES}
DF_SUFFIXES_OMN = DF_SUFFIXES

section_mapping_hed_id = {
STRUCT_KEY: None,
Expand All @@ -43,33 +44,55 @@
ATTRIBUTE_PROPERTY_KEY: HedSectionKey.Properties,
}

section_key_to_suffixes = {
HedSectionKey.Tags: [TAG_KEY],
HedSectionKey.Units: [UNIT_KEY],
HedSectionKey.UnitClasses: [UNIT_CLASS_KEY],
HedSectionKey.UnitModifiers: [UNIT_MODIFIER_KEY],
HedSectionKey.ValueClasses: [VALUE_CLASS_KEY],
HedSectionKey.Attributes: [DATA_KEY, OBJECT_KEY, ANNOTATION_KEY],
HedSectionKey.Properties: [ATTRIBUTE_PROPERTY_KEY],
}

# Spreadsheet column ids
hed_id = "hedId"
level = "Level"
name = "rdfs:label"
subclass_of = "omn:SubClassOf"
attributes = "Attributes"
description = "dc:description"
dcdescription = "dc:description"
equivalent_to = "omn:EquivalentTo"
has_unit_class = "hasUnitClass"
annotations = "Annotations"

struct_columns = [hed_id, name, attributes, subclass_of, description]
tag_columns = [hed_id, name, level, subclass_of, attributes, description]
unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, description]
prefix = "prefix" # for the prefixes section, this is the column name in the prefixes dataframe
namespace = "namespace" # for the prefixes section, this is the column name in the prefixes dataframe
id = "id" # for the prefixes section, this is the column name in the prefixes dataframe
iri = "iri" # for the prefixes section, this is the column name in the prefixes dataframe
source = "source" # for the sources section, this is the column name in the sources dataframe
link = "link"
type = "Type"
domain = "omn:Domain"
range = "omn:Range"
properties = "Properties" # for the schema properties, this is the column name in the properties dataframe
description = "description"

struct_columns = [hed_id, name, attributes, subclass_of, dcdescription]
tag_columns = [hed_id, name, level, subclass_of, attributes, dcdescription]
unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, dcdescription]
attribute_columns = [hed_id, name, type, domain, range, properties, dcdescription] # For the annotation property
property_columns = [hed_id, name, type, dcdescription]
prefix_columns = [prefix, namespace, description]
external_annotation_columns = [prefix, id, iri, description]
source_columns = [source, link] # For the sources section

# The columns for unit class, value class, and unit modifier
other_columns = [hed_id, name, subclass_of, attributes, description]
other_columns = [hed_id, name, subclass_of, attributes, dcdescription]

# for schema attributes
property_type = "Type"
property_domain = "omn:Domain"
property_range = "omn:Range"
properties = "Properties"
property_columns = [hed_id, name, property_type, property_domain, property_range, properties, description]

# For the schema properties
property_columns_reduced = [hed_id, name, property_type, description]

# HED_00X__YY where X is the library starting index, and Y is the entity number below.
struct_base_ids = {
Expand All @@ -95,7 +118,11 @@
hed_schema_constants.UNMERGED_ATTRIBUTE: "HED_0000303"
}

# Extra spreadsheet column ideas
Prefix = "Prefix"
ID = "ID"
NamespaceIRI = "Namespace IRI"
# Extra spreadsheet columns
EXTRAS_CONVERSIONS = {"Prefix": "prefix", "namespace IRI": "namespace", "namespace iri": "namespace", "ID": "id",
"definition": "description", "Description": "description", "IRI": "iri"}


Prefix = "prefix"
ID = "id"
NamespaceIRI = "namespaceIRI"
36 changes: 22 additions & 14 deletions hed/schema/schema_io/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd

from hed.errors import HedFileError, HedExceptions
from hed.schema import hed_schema_df_constants as constants
from hed.schema.schema_io import df_constants as constants
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_cache import get_library_data
from hed.schema.schema_io.text_util import parse_attribute_string, _parse_header_attributes_line
Expand Down Expand Up @@ -83,18 +83,17 @@ def save_dataframes(base_filename, dataframe_dict):
lineterminator="\n")


def convert_filenames_to_dict(filenames, include_prefix_dfs=False):
def convert_filenames_to_dict(filenames):
"""Infers filename meaning based on suffix, e.g. _Tag for the tags sheet

Parameters:
filenames(str or None or list or dict): The list to convert to a dict
If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file
If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files.
include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
Returns:
filename_dict(str: str): The required suffix to filename mapping"""
result_filenames = {}
dataframe_names = constants.DF_SUFFIXES_OMN if include_prefix_dfs else constants.DF_SUFFIXES
dataframe_names = constants.DF_SUFFIXES
if isinstance(filenames, str):
if filenames.endswith(".tsv"):
base, base_ext = os.path.splitext(filenames)
Expand Down Expand Up @@ -126,37 +125,46 @@ def create_empty_dataframes():
constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str),
constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), }
constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.attribute_columns, dtype=str),
constants.DATA_KEY: pd.DataFrame(columns=constants.attribute_columns, dtype=str),
constants.OBJECT_KEY: pd.DataFrame(columns=constants.attribute_columns, dtype=str),
constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str),
constants.PREFIXES_KEY: pd.DataFrame(columns=constants.prefix_columns, dtype=str),
constants.SOURCES_KEY: pd.DataFrame(columns=constants.source_columns, dtype=str),
constants.EXTERNAL_ANNOTATION_KEY:
pd.DataFrame(columns=constants.external_annotation_columns, dtype=str)
}
return base_dfs


def load_dataframes(filenames, include_prefix_dfs=False):
def load_dataframes(filenames):
"""Load the dataframes from the source folder or series of files.

Parameters:
filenames(str or None or list or dict): The input filenames
If a string with a .tsv suffix: Save to that location, adding the suffix to each .tsv file
If a string with no .tsv suffix: Save to that folder, with the contents being the separate .tsv files.
include_prefix_dfs(bool): If True, include the prefixes and external annotation dataframes.
Returns:
dataframes_dict(str: dataframes): The suffix:dataframe dict
"""
dict_filenames = convert_filenames_to_dict(filenames, include_prefix_dfs=include_prefix_dfs)
dict_filenames = convert_filenames_to_dict(filenames)
dataframes = create_empty_dataframes()
for key, filename in dict_filenames.items():
try:
loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
if key in dataframes:
loaded_dataframe = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
loaded_dataframe = loaded_dataframe.rename(columns=constants.EXTRAS_CONVERSIONS)

columns_not_in_loaded = dataframes[key].columns[~dataframes[key].columns.isin(loaded_dataframe.columns)]
# and not dataframes[key].columns.isin(loaded_dataframe.columns).all():
if columns_not_in_loaded.any():
raise HedFileError(HedExceptions.SCHEMA_LOAD_FAILED,
f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}. "
f"Required column(s) {list(columns_not_in_loaded)} missing from {filename}. "
f"The required columns are {list(dataframes[key].columns)}", filename=filename)
dataframes[key] = loaded_dataframe
dataframes[key] = loaded_dataframe
elif os.path.exists(filename):
# Handle the extra files if they are present.
dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
except OSError:
# todo: consider if we want to report this error(we probably do)
pass # We will use a blank one for this
Expand Down
Loading