diff --git a/docs/requirements.txt b/docs/requirements.txt index db57d26ca..c87a365a9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,4 +10,3 @@ myst-parser>=1.0.0 Sphinx>=5.2.2 sphinx_rtd_theme>=1.0.0 wordcloud==1.9.3 -rdflib>=6 diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 7f6fec499..fa22dc696 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -6,6 +6,8 @@ from hed.schema.schema_io import schema_util from hed.schema.schema_io.schema2xml import Schema2XML from hed.schema.schema_io.schema2wiki import Schema2Wiki +from hed.schema.schema_io.schema2df import Schema2DF + # from hed.schema.schema_io.schema2owl import Schema2Owl # from hed.schema.schema_io.owl_constants import ext_to_format from hed.schema.hed_schema_section import (HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection, @@ -298,6 +300,25 @@ def save_as_mediawiki(self, filename, save_merged=False): opened_file.write(string) opened_file.write('\n') + def save_as_dataframes(self, base_filename, save_merged=False): + """ Save as dataframes to tsv files. + + base_filename: str + save filename. A suffix will be added to most, e.g. _Tag + save_merged: bool + If True, this will save the schema as a merged schema if it is a "withStandard" schema. + If it is not a "withStandard" schema, this setting has no effect. + + :raises OSError: + - File cannot be saved for some reason. + """ + output_dfs = Schema2DF.process_schema(self, save_merged) + base, base_ext = os.path.splitext(base_filename) + for suffix, dataframe in output_dfs.items(): + filename = f"{base}_{suffix}.tsv" + with open(filename, mode='w', encoding='utf-8') as opened_file: + dataframe.to_csv(opened_file, sep='\t', index=False, header=True) + # def save_as_owl(self, filename, save_merged=False, file_format=None): + # """ Save as json to a file. 
# diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py index f151e46a8..2153740f0 100644 --- a/hed/schema/hed_schema_constants.py +++ b/hed/schema/hed_schema_constants.py @@ -59,6 +59,7 @@ class HedKey: # Node attributes InLibrary = "inLibrary" + HedID = 'hedId' # All known properties BoolProperty = 'boolProperty' diff --git a/hed/schema/hed_schema_df_constants.py b/hed/schema/hed_schema_df_constants.py new file mode 100644 index 000000000..6114160d0 --- /dev/null +++ b/hed/schema/hed_schema_df_constants.py @@ -0,0 +1,7 @@ +# Known tsv format suffixes + +STRUCT_KEY = "Structure" +TAG_KEY = "Tag" + +# todo: move more constants up here +hed_id_column = "hedId" diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 23b2d40d4..a0e09df8f 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -5,6 +5,7 @@ from hed.schema.schema_io.xml2schema import SchemaLoaderXML from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki +from hed.schema.schema_io.df2schema import SchemaLoaderDF # from hed.schema.schema_io.owl2schema import SchemaLoaderOWL from hed.schema import hed_cache @@ -23,9 +24,11 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche """ Create a schema from the given string. Parameters: - schema_string (str): An XML, mediawiki or OWL, file as a single long string + schema_string (str or dict): An XML, mediawiki or OWL, file as a single long string + If tsv, Must be a dict of spreadsheets as strings. schema_format (str): The schema format of the source schema string. - Allowed normal values: .mediawiki, .xml + Allowed normal values: .mediawiki, .xml, .tsv + Note: tsv is in progress and has limited features schema_namespace (str, None): The name_prefix all tags in this schema will accept. schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. 
@@ -46,13 +49,18 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string", filename=name) - # Replace carriage returns with new lines since this might not be done by the caller - schema_string = schema_string.replace("\r\n", "\n") + if isinstance(schema_string, str): + # Replace carriage returns with new lines since this might not be done by the caller + schema_string = schema_string.replace("\r\n", "\n") if schema_format.endswith(".xml"): hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema, name=name) elif schema_format.endswith(".mediawiki"): hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema, name=name) + elif schema_format.endswith(".tsv"): + if schema is not None: + raise HedFileError(HedExceptions.INVALID_HED_FORMAT, "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name) + hed_schema = SchemaLoaderDF.load_spreadsheet(schema_as_strings=schema_string, name=name) # elif schema_format: # hed_schema = SchemaLoaderOWL.load(schema_as_string=schema_string, schema=schema, file_format=schema_format, # name=name) @@ -68,7 +76,9 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None): """ Load a schema from the given file or URL path. Parameters: - hed_path (str): A filepath or url to open a schema from. + hed_path (str or dict): A filepath or url to open a schema from. + If loading a TSV file, this can be a single filename template, or a dict of filenames. + Template: basename.tsv, where files are named basename_Struct.tsv and basename_Tag.tsv schema_namespace (str or None): The name_prefix all tags in this schema will accept. schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. 
@@ -87,7 +97,6 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None): raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file", filename=hed_path) - ext = os.path.splitext(hed_path.lower())[1] is_url = hed_cache._check_if_url(hed_path) if is_url: try: @@ -103,6 +112,11 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None): hed_schema = SchemaLoaderXML.load(hed_path, schema=schema, name=name) elif hed_path.lower().endswith(".mediawiki"): hed_schema = SchemaLoaderWiki.load(hed_path, schema=schema, name=name) + elif hed_path.lower().endswith(".tsv"): + if schema is not None: + raise HedFileError(HedExceptions.INVALID_HED_FORMAT, + "Cannot pass a schema to merge into spreadsheet loading currently.", filename=name) + hed_schema = SchemaLoaderDF.load_spreadsheet(filenames=hed_path, name=name) else: raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=hed_path) diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index bf6a5e04b..d84d0ac34 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -1,12 +1,18 @@ import copy +import re from hed.errors.exceptions import HedFileError, HedExceptions +from hed.errors.error_types import ErrorContext from hed.schema import HedSchema, hed_schema_constants as constants from hed.schema.hed_schema_constants import HedKey from abc import abstractmethod, ABC from hed.schema import schema_header_util from hed.schema import hed_schema_constants +# Might need separate version again for wiki +header_attr_expression = "([^ ,]+?)=\"(.*?)\"" +attr_re = re.compile(header_attr_expression) + class SchemaLoader(ABC): """ Baseclass for schema loading, to handle basic errors and partnered schemas @@ -70,6 +76,7 @@ def __init__(self, filename, schema_as_string=None, schema=None, file_format=Non self._schema.filename = filename self._schema.header_attributes = 
hed_attributes self._loading_merged = False + self.fatal_errors = [] @property def schema(self): @@ -203,3 +210,74 @@ def find_rooted_entry(tag_entry, schema, loading_merged): return None return rooted_entry + + def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed", + error_code=HedExceptions.WIKI_DELIMITERS_INVALID): + + self.fatal_errors += self._format_error(line_number, line, warning_message, error_code) + + + @staticmethod + def _format_error(row_number, row, warning_message="Schema term is empty or the line is malformed", + error_code=HedExceptions.GENERIC_ERROR): + error = {'code': error_code, + ErrorContext.ROW: row_number, + ErrorContext.LINE: str(row), + "message": f"{warning_message}" + } + + return [error] + + # Below here are generic string loading functions, used by wiki and spreadsheet formats. + @staticmethod + def _validate_attribute_string(attribute_string): + pattern = r'^[A-Za-z]+(=.+)?$' + match = re.fullmatch(pattern, attribute_string) + if match: + return match.group() + + def _parse_attribute_string(self, row_number, attr_string): + if attr_string: + attributes_split = [x.strip() for x in attr_string.split(',')] + + final_attributes = {} + for attribute in attributes_split: + if self._validate_attribute_string(attribute) is None: + self._add_fatal_error(row_number, attr_string, + f"Malformed attribute found {attribute}. 
" + f"Valid formatting is: attribute, or attribute=\"value\".") + continue + split_attribute = attribute.split("=") + if len(split_attribute) == 1: + final_attributes[split_attribute[0]] = True + else: + if split_attribute[0] in final_attributes: + final_attributes[split_attribute[0]] += "," + split_attribute[1] + else: + final_attributes[split_attribute[0]] = split_attribute[1] + return final_attributes + else: + return {} + + @staticmethod + def _parse_attributes_line(version_line): + matches = {} + unmatched = [] + last_end = 0 + + for match in attr_re.finditer(version_line): + start, end = match.span() + + # If there's unmatched content between the last match and the current one. + if start > last_end: + unmatched.append(version_line[last_end:start]) + + matches[match.group(1)] = match.group(2) + last_end = end + + # If there's unmatched content after the last match + if last_end < len(version_line): + unmatched.append(version_line[last_end:]) + + unmatched = [m.strip() for m in unmatched if m.strip()] + return matches, unmatched diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py new file mode 100644 index 000000000..46f8e66db --- /dev/null +++ b/hed/schema/schema_io/df2schema.py @@ -0,0 +1,443 @@ +""" +This module is used to create a HedSchema object from a set of .tsv files. 
+""" +import io +import os + +from hed.schema.hed_schema_constants import HedSectionKey, HedKey +from hed.errors.exceptions import HedFileError, HedExceptions +from .base2schema import SchemaLoader +import pandas as pd +from hed.schema.schema_io.schema2df import Schema2DF +from hed.schema.hed_schema_df_constants import * +import copy +from hed.errors import error_reporter + + +class SchemaLoaderDF(SchemaLoader): + """ Load dataframe schemas from filenames + + Expected usage is SchemaLoaderDF.load(filenames) + + Note: due to supporting multiple files, this one differs from the other schema loaders + """ + + def __init__(self, filenames, schema_as_strings, name=""): + from hed.schema.hed_schema_io import load_schema_version + + self.filenames = self.convert_filenames_to_dict(filenames) + self.schema_as_strings = schema_as_strings + if self.filenames: + reported_filename = self.filenames.get(STRUCT_KEY) + else: + reported_filename = "from_strings" + super().__init__(reported_filename, None, None, None, name) + # Grab the header attributes we already loaded + save_header = self._schema.header_attributes + # BFK - just load 8.3.0 for the non tag sections + version = save_header.get("withStandard", "8.3.0") + schema = copy.deepcopy(load_schema_version(version)) + + self._schema = schema + self._schema.header_attributes = save_header + + # Blow away tags section if needed. This will eventually be removed once we load all from spreadsheets. 
+ if self._schema.merged or not self._schema.with_standard: + # todo: reset this once we load more from the spreadsheets + clear_sections(schema, [HedSectionKey.Tags]) + # clear_sections(schema, [HedSectionKey.Tags, HedSectionKey.UnitClasses, HedSectionKey.Units, + # HedSectionKey.ValueClasses, HedSectionKey.UnitModifiers, HedSectionKey.Properties, + # HedSectionKey.Attributes]) + + self._schema.source_format = "spreadsheet" + + @classmethod + def load_spreadsheet(cls, filenames=None, schema_as_strings=None, name=""): + """ Loads and returns the schema, including partnered schema if applicable. + + Parameters: + filenames(str or None or dict of str): A valid set of schema spreadsheet filenames + If a single filename string, assumes the standard filename suffixes. + schema_as_strings(None or dict of str): A valid set of schema spreadsheet files(tsv as strings) + name (str): what to identify this schema as + Returns: + schema(HedSchema): The new schema + """ + loader = cls(filenames, schema_as_strings=schema_as_strings, name=name) + return loader._load() + + @staticmethod + def convert_filenames_to_dict(filenames): + """Infers filename meaning based on suffix, e.g. 
_Tag for the tags sheet + + Parameters: + filenames(None or list or dict): The list to convert to a dict + + Returns: + filename_dict(str: str): The required suffix to filename mapping""" + needed_suffixes = {TAG_KEY, STRUCT_KEY} + result_filenames = {} + if isinstance(filenames, str): + base, base_ext = os.path.splitext(filenames) + for suffix in needed_suffixes: + filename = f"{base}_{suffix}.tsv" + result_filenames[suffix] = filename + filenames = result_filenames + elif isinstance(filenames, list): + for filename in filenames: + remainder, suffix = filename.replace("_", "-").rsplit("-") + for needed_suffix in needed_suffixes: + if needed_suffix in suffix: + result_filenames[needed_suffix] = filename + filenames = result_filenames + + return filenames + + def _open_file(self): + if self.filenames: + dataframes = load_dataframes(self.filenames) + else: + dataframes = load_dataframes_from_strings(self.schema_as_strings) + + return dataframes + + def _get_header_attributes(self, file_data): + header_attributes = {} + for row_number, row in file_data[STRUCT_KEY].iterrows(): + cls = row["omn:SubClassOf"] + attributes = row["Attributes"] + if cls == "HedHeader" and attributes: + header_attributes, _ = self._parse_attributes_line(attributes) + continue + + return header_attributes + + def _parse_data(self): + self._schema.prologue, self._schema.epilogue = self._get_prologue_epilogue(self.input_data) + self._read_schema(self.input_data) + if self.fatal_errors: + self.fatal_errors = error_reporter.sort_issues(self.fatal_errors) + raise HedFileError(self.fatal_errors[0]['code'], + f"{len(self.fatal_errors)} issues found when parsing schema. 
See the .issues " + f"parameter on this exception for more details.", self.name, + issues=self.fatal_errors) + + def _get_prologue_epilogue(self, file_data): + prologue, epilogue = "", "" + for row_number, row in file_data[STRUCT_KEY].iterrows(): + cls = row["omn:SubClassOf"] + description = row["dc:description"] + if cls == "HedPrologue" and description: + prologue = description.replace("\\n", "\n") + continue + elif cls == "HedEpilogue" and description: + epilogue = description.replace("\\n", "\n") + + return prologue, epilogue + + def _read_schema(self, dataframe): + """Add the main schema section + + Parameters: + dataframe (pd.DataFrame): The dataframe for the main tags section + """ + # note: this assumes loading is in order line by line. + # If tags are NOT sorted this won't work.(same as mediawiki) + known_tag_levels = {"HedTag": -1} + parent_tags = [] + level_adj = 0 + self._schema._initialize_attributes(HedSectionKey.Tags) + for row_number, row in dataframe[TAG_KEY].iterrows(): + # skip blank rows, though there shouldn't be any + if not any(row): + continue + parent_tag = row["omn:SubClassOf"] + # Return -1 by default for top level rooted tag support(they might not be in the dict) + raw_level = known_tag_levels.get(parent_tag, -1) + 1 + if raw_level == 0: + parent_tags = [] + level_adj = 0 + else: + level = raw_level + level_adj + if level < len(parent_tags): + parent_tags = parent_tags[:level] + elif level > len(parent_tags): + self._add_fatal_error(row_number, row, + "Invalid level reported from Level column", + HedExceptions.GENERIC_ERROR) + continue + # Create the entry + tag_entry = self._add_tag_line(parent_tags, row_number, row) + + if not tag_entry: + # This will have already raised an error + continue + + known_tag_levels[tag_entry.short_tag_name] = raw_level + + try: + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + if rooted_entry: + parent_tags = rooted_entry.long_tag_name.split("/") + level_adj = 
len(parent_tags) + # Create the entry again for rooted tags, to get the full name. + tag_entry = self._add_tag_line(parent_tags, row_number, row) + except HedFileError as e: + self._add_fatal_error(row_number, row, e.message, e.code) + continue + + tag_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags) + + parent_tags.append(tag_entry.short_tag_name) + + def _add_tag_line(self, parent_tags, line_number, row): + """ Add a tag to the dictionaries. + + Parameters: + parent_tags (list): A list of parent tags in order. + line_number (int): The line number to report errors as + row (pd.Series): the pandas row + Returns: + HedSchemaEntry: The entry for the added tag. + + Notes: + Includes attributes and description. + """ + tag_name = self._get_tag_name_from_row(row) + if tag_name: + if parent_tags: + long_tag_name = "/".join(parent_tags) + "/" + tag_name + else: + long_tag_name = tag_name + long_tag_name = long_tag_name + return self._create_entry(line_number, row, HedSectionKey.Tags, long_tag_name) + + self._add_fatal_error(line_number, row, f"No tag name found in row.", + error_code=HedExceptions.GENERIC_ERROR) + + def _get_tag_name_from_row(self, row): + try: + base_tag_name = row["rdfs:label"] + if base_tag_name.endswith("-#"): + return "#" + return base_tag_name + except KeyError: + return None + + def _get_hedid_from_row(self, row): + try: + return row[hed_id_column] + except KeyError: + return None + + def _create_entry(self, line_number, row, key_class, full_tag_name=None): + element_name = self._get_tag_name_from_row(row) + if full_tag_name: + element_name = full_tag_name + + hedID = self._get_hedid_from_row(row) + + node_attributes = self._get_tag_attributes(line_number, row) + + if hedID: + node_attributes[HedKey.HedID] = hedID + + description = row["dc:description"] + tag_entry = self._schema._create_tag_entry(element_name, key_class) + + if description: + tag_entry.description = description.strip() + + for attribute_name, 
attribute_value in node_attributes.items(): + tag_entry._set_attribute_value(attribute_name, attribute_value) + + return tag_entry + + def _get_tag_attributes(self, row_number, row): + """ Get the tag attributes from a line. + + Parameters: + row_number (int): The line number to report errors as. + row (pd.Series): A tag line. + Returns: + dict: Dictionary of attributes. + """ + attr_string = row["Attributes"] + return self._parse_attribute_string(row_number, attr_string) + + def _add_to_dict(self, line_number, line, entry, key_class): + if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: + self._add_fatal_error(line_number, line, + "Library tag in unmerged schema has InLibrary attribute", + HedExceptions.IN_LIBRARY_IN_UNMERGED) + + return self._add_to_dict_base(entry, key_class) + + + + +def load_dataframes(filenames): + dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames) + return {key: pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) for (key, filename) in dict_filenames.items()} + + +def load_dataframes_from_strings(data_contents): + # Assume data_contents is a dict of {key: tsv_string} + return {key: pd.read_csv(io.StringIO(tsv_string), sep="\t", dtype=str, na_filter=False) + for key, tsv_string in data_contents.items()} + + +def get_all_ids(df): + if hed_id_column in df.columns: + modified_df = df[hed_id_column].str.replace("HED_", "") + modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int) + return set(modified_df.unique()) + return None + + +tag_index_ranges = { + "": (10000, 40000), + "score": (40000, 60000), + "lang": (60000, 80000) +} + +def _get_hedid_range(schema_name, section_key): + if section_key != HedSectionKey.Tags: + raise NotImplementedError("Cannot assign hedID's to non tag sections yet") + + starting_id, ending_id = tag_index_ranges[schema_name] + + tag_section_adj = 2000 + initial_tag_adj = 1 + starting_id += tag_section_adj + 
initial_tag_adj + return set(range(starting_id, ending_id)) + + +def update_dataframes_from_schema(dataframes, schema, schema_name=""): + # We're going to potentially alter the schema, so make a copy + schema = copy.deepcopy(schema) + + section_mapping = { + STRUCT_KEY: None, + TAG_KEY: HedSectionKey.Tags + } + + # todo: this needs to handle other sections eventually + for key, df in dataframes.items(): + section_key = section_mapping.get(key) + if not section_key: + continue + section = schema[section_key] + + hedid_errors = _verify_hedid_matches(section, df) + if hedid_errors: + raise HedFileError(hedid_errors[0]['code'], + f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " + f"parameter on this exception for more details.", schema.name, + issues=hedid_errors) + unused_tag_ids = _get_hedid_range(schema_name, section_key) + + # If no errors, assign new hed ID's + assign_hed_ids_section(section, unused_tag_ids, df) + + output_dfs = Schema2DF.process_schema(schema, save_merged=False) + + merge_dfs(output_dfs[TAG_KEY], dataframes[TAG_KEY]) + # Struct is special, just directly merge for now. 
+ output_dfs[STRUCT_KEY] = pd.concat([dataframes[STRUCT_KEY], output_dfs[STRUCT_KEY]]).drop_duplicates('rdfs:label', keep='last').reset_index(drop=True) + + return output_dfs + + +def _verify_hedid_matches(section, df): + """ Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema + + Parameters: + section(HedSchemaSection): The loaded schema section to compare ID's with + df(pd.DataFrame): The loaded spreadsheet dataframe to compare with + + Returns: + error_list(list of str): A list of errors found matching id's + """ + hedid_errors = [] + for row_number, row in df.iterrows(): + if not any(row): + continue + label = row["rdfs:label"] + if label.endswith("-#"): + label = label.replace("-#", "/#") + df_id = row[hed_id_column] + entry = section.get(label) + if not entry: + hedid_errors += SchemaLoaderDF._format_error(row_number, row, + f"'{label}' does not exist in the schema file provided, only the spreadsheet.") + continue + entry_id = entry.attributes.get(HedKey.HedID) + if entry_id and entry_id != df_id: + hedid_errors += SchemaLoaderDF._format_error(row_number, row, + f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema.") + continue + + return hedid_errors + + +def assign_hed_ids_schema(schema): + """Note: only assigns values to TAGS section for now.""" + for section_key in HedSectionKey: + section = schema[section_key] + # Still need to add hed ranges for non tag sections + if section_key != HedSectionKey.Tags: + continue + unused_tag_ids = _get_hedid_range(schema.library, section_key) + assign_hed_ids_section(section, unused_tag_ids, None) + + +def assign_hed_ids_section(section, unused_tag_ids, df=None): + spreadsheet_label_to_hedid = {} + if df is not None: + # Remove hedIds already used in the dataframe + unused_tag_ids -= get_all_ids(df) + spreadsheet_label_to_hedid = df.set_index('rdfs:label')['hedId'].to_dict() + + # Remove hedId's already used in the schema + section_used_ids = 
set( + int(entry.attributes.get(HedKey.HedID, "0").replace("HED_", "")) for entry in section.all_entries) + unused_tag_ids -= section_used_ids + + sorted_unused_ids = sorted(unused_tag_ids, reverse=True) + + # Next assign hed ID to this if needed + for entry in section.all_entries: + if section.section_key == HedSectionKey.Tags: + name = entry.short_tag_name + else: + name = entry.name + current_tag_id = spreadsheet_label_to_hedid.get(name) + if not current_tag_id: + current_tag_id = f"HED_{sorted_unused_ids.pop():07d}" + entry._set_attribute_value(HedKey.HedID, current_tag_id) + + +def merge_dfs(df1, df2): + """Merges df2 into df1, adding the extra columns from the ontology to the schema df.""" + # todo: vectorize this at some point + save_df1_columns = df1.columns.copy() + for index, row in df2.iterrows(): + # Find matching index in df1 based on 'rdfs:label' + match_index = df1[df1['rdfs:label'] == row['rdfs:label']].index + if not match_index.empty: + for col in df2.columns: + if col not in save_df1_columns: + df1.at[match_index[0], col] = row[col] + + return df1 + + +def clear_sections(schema, sections_to_clear): + # Temporary function until these spreadsheet writers are finished + # Also clear prologue and epilogue + schema.prologue = "" + schema.epilogue = "" + empty_sections = schema._create_empty_sections() + for section_key in sections_to_clear: + schema._sections[section_key] = empty_sections[section_key] diff --git a/hed/schema/schema_io/schema2base.py b/hed/schema/schema_io/schema2base.py index bcdcd9926..5c8e1234f 100644 --- a/hed/schema/schema_io/schema2base.py +++ b/hed/schema/schema_io/schema2base.py @@ -147,3 +147,57 @@ def _should_skip(self, entry): def _attribute_disallowed(self, attribute): return self._strip_out_in_library and attribute == HedKey.InLibrary + + def _format_tag_attributes(self, attributes): + """ + Takes a dictionary of tag attributes and returns a string with the .mediawiki representation + + Parameters + ---------- + attributes 
: {str:str} + {attribute_name : attribute_value} + Returns + ------- + str: + The formatted string that should be output to the file. + """ + prop_string = "" + final_props = [] + for prop, value in attributes.items(): + # Never save InLibrary if saving merged. + if self._attribute_disallowed(prop): + continue + if value is True: + final_props.append(prop) + else: + if "," in value: + split_values = value.split(",") + for split_value in split_values: + final_props.append(f"{prop}={split_value}") + else: + final_props.append(f"{prop}={value}") + + if final_props: + interior = ", ".join(final_props) + prop_string = f"{interior}" + + return prop_string + + @staticmethod + def _get_attribs_string_from_schema(header_attributes, sep=" "): + """ + Gets the schema attributes and converts it to a string. + + Parameters + ---------- + header_attributes : dict + Attributes to format attributes from + + Returns + ------- + str: + A string of the attributes that can be written to a .mediawiki formatted file + """ + attrib_values = [f"{attr}=\"{value}\"" for attr, value in header_attributes.items()] + final_attrib_string = sep.join(attrib_values) + return final_attrib_string \ No newline at end of file diff --git a/hed/schema/schema_io/schema2df.py b/hed/schema/schema_io/schema2df.py new file mode 100644 index 000000000..a7389b694 --- /dev/null +++ b/hed/schema/schema_io/schema2df.py @@ -0,0 +1,85 @@ +"""Allows output of HedSchema objects as .mediawiki format""" + +from hed.schema.hed_schema_constants import HedSectionKey, HedKey +from hed.schema.schema_io.schema2base import Schema2Base +import pandas as pd +from hed.schema.hed_schema_df_constants import * + + +class Schema2DF(Schema2Base): + # todo: add omn:EquivalentTo" + struct_columns = ["hedId", "rdfs:label", "Attributes", "omn:SubClassOf", "dc:description"] + tag_columns = ["hedId", "Level", "rdfs:label", "omn:SubClassOf", "Attributes", "dc:description"] + def __init__(self): + super().__init__() + self.current_tag_string 
= "" + self.current_tag_extra = "" + self.output = { + STRUCT_KEY: pd.DataFrame(columns=self.struct_columns, dtype=str), + TAG_KEY: pd.DataFrame(columns=self.tag_columns, dtype=str)} + + # ========================================= + # Required baseclass function + # ========================================= + def _output_header(self, attributes, prologue): + attributes_string = self._get_attribs_string_from_schema(attributes, sep=", ") + new_row = { + "hedId": f"HED_0010010", + "rdfs:label": "StandardHeader", + "Attributes": attributes_string, + "omn:SubClassOf": "HedHeader", + "dc:description": "", + # "omn:EquivalentTo": "", + } + self.output[STRUCT_KEY].loc[len(self.output[STRUCT_KEY])] = new_row + + new_row = { + "hedId": f"HED_0010011", + "rdfs:label": "StandardPrologue", + "Attributes": "", + "omn:SubClassOf": "HedPrologue", + "dc:description": prologue.replace("\n", "\\n"), + # "omn:EquivalentTo": "", + } + self.output[STRUCT_KEY].loc[len(self.output[STRUCT_KEY])] = new_row + + def _output_footer(self, epilogue): + new_row = { + "hedId": f"HED_0010012", + "rdfs:label": "StandardEpilogue", + "Attributes": "", + "omn:SubClassOf": "HedEpilogue", + "dc:description": epilogue.replace("\n", "\\n"), + # "omn:EquivalentTo": "", + } + self.output[STRUCT_KEY].loc[len(self.output[STRUCT_KEY])] = new_row + + def _start_section(self, key_class): + pass + + def _end_tag_section(self): + pass + + def _write_tag_entry(self, tag_entry, parent_node=None, level=0): + # ["hedID", "Level", "rdfs:label", "Parent", "Attributes", "dc:description", "omn:EquivalentTo"] + tag_id = tag_entry.attributes.get(HedKey.HedID, "") + new_row = { + "hedId": tag_id, + "Level": f"{level}", + "rdfs:label": tag_entry.short_tag_name if not tag_entry.has_attribute(HedKey.TakesValue) else tag_entry.short_tag_name + "-#", + "omn:SubClassOf": tag_entry.parent.short_tag_name if tag_entry.parent else "HedTag", + "Attributes": self._format_tag_attributes(tag_entry.attributes), + "dc:description": 
tag_entry.description, + # "omn:EquivalentTo": "", + } + self.output[TAG_KEY].loc[len(self.output[TAG_KEY])] = new_row + + def _write_entry(self, entry, parent_node, include_props=True): + # only tags page implemented so far + pass + + def _attribute_disallowed(self, attribute): + if super()._attribute_disallowed(attribute): + return True + # strip out hedID in dataframe format + return attribute == HedKey.HedID diff --git a/hed/schema/schema_io/schema2wiki.py b/hed/schema/schema_io/schema2wiki.py index 2a8a315b4..617b182a6 100644 --- a/hed/schema/schema_io/schema2wiki.py +++ b/hed/schema/schema_io/schema2wiki.py @@ -95,7 +95,7 @@ def _format_props_and_desc(self, schema_entry): prop_string = "" tag_props = schema_entry.attributes if tag_props: - prop_string += self._format_tag_attributes(tag_props) + prop_string += f"{{{self._format_tag_attributes(tag_props)}}}" desc = schema_entry.description if desc: if tag_props: @@ -104,56 +104,3 @@ def _format_props_and_desc(self, schema_entry): return prop_string - @staticmethod - def _get_attribs_string_from_schema(header_attributes): - """ - Gets the schema attributes and converts it to a string. - - Parameters - ---------- - header_attributes : dict - Attributes to format attributes from - - Returns - ------- - str: - A string of the attributes that can be written to a .mediawiki formatted file - """ - attrib_values = [f"{attr}=\"{value}\"" for attr, value in header_attributes.items()] - final_attrib_string = " ".join(attrib_values) - return final_attrib_string - - def _format_tag_attributes(self, attributes): - """ - Takes a dictionary of tag attributes and returns a string with the .mediawiki representation - - Parameters - ---------- - attributes : {str:str} - {attribute_name : attribute_value} - Returns - ------- - str: - The formatted string that should be output to the file. - """ - prop_string = "" - final_props = [] - for prop, value in attributes.items(): - # Never save InLibrary if saving merged. 
- if self._attribute_disallowed(prop): - continue - if value is True: - final_props.append(prop) - else: - if "," in value: - split_values = value.split(",") - for split_value in split_values: - final_props.append(f"{prop}={split_value}") - else: - final_props.append(f"{prop}={value}") - - if final_props: - interior = ", ".join(final_props) - prop_string = f"{{{interior}}}" - - return prop_string diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index 838572f39..84078bbe2 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -10,8 +10,7 @@ from .base2schema import SchemaLoader from .wiki_constants import HedWikiSection, SectionStarts, SectionNames -header_attr_expression = "([^ ]+?)=\"(.*?)\"" -attr_re = re.compile(header_attr_expression) + extend_here_line = 'extend here' invalid_characters_to_strip = ["​"] tag_name_expression = r'(\*+|\'{3})(.*?)(\'{3})?\s*([\[\{]|$)+' @@ -45,7 +44,6 @@ class SchemaLoaderWiki(SchemaLoader): def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""): super().__init__(filename, schema_as_string, schema, file_format, name) self._schema.source_format = ".mediawiki" - self.fatal_errors = [] def _open_file(self): if self.filename: @@ -259,29 +257,6 @@ def _get_header_attributes_internal(self, version_line): filename=self.name) return attributes - @staticmethod - def _parse_attributes_line(version_line): - matches = {} - unmatched = [] - last_end = 0 - - for match in attr_re.finditer(version_line): - start, end = match.span() - - # If there's unmatched content between the last match and the current one. 
- if start > last_end: - unmatched.append(version_line[last_end:start]) - - matches[match.group(1)] = match.group(2) - last_end = end - - # If there's unmatched content after the last match - if last_end < len(version_line): - unmatched.append(version_line[last_end:]) - - unmatched = [m.strip() for m in unmatched if m.strip()] - return matches, unmatched - def _get_header_attributes_internal_old(self, version_line): """ Extract all valid attributes like version from the HED line in .mediawiki format. @@ -367,13 +342,6 @@ def _get_tag_name(self, tag_line): return None, 0 - @staticmethod - def _validate_attribute_string(attribute_string): - pattern = r'^[A-Za-z]+(=.+)?$' - match = re.fullmatch(pattern, attribute_string) - if match: - return match.group() - def _get_tag_attributes(self, line_number, tag_line, starting_index): """ Get the tag attributes from a line. @@ -390,27 +358,7 @@ def _get_tag_attributes(self, line_number, tag_line, starting_index): attr_string, starting_index = SchemaLoaderWiki._get_line_section(tag_line, starting_index, '{', '}') if attr_string is None: return None, starting_index - if attr_string: - attributes_split = [x.strip() for x in attr_string.split(',')] - - final_attributes = {} - for attribute in attributes_split: - if self._validate_attribute_string(attribute) is None: - self._add_fatal_error(line_number, tag_line, - f"Malformed attribute found {attribute}. 
" - f"Valid formatting is: attribute, or attribute=\"value\".") - continue - split_attribute = attribute.split("=") - if len(split_attribute) == 1: - final_attributes[split_attribute[0]] = True - else: - if split_attribute[0] in final_attributes: - final_attributes[split_attribute[0]] += "," + split_attribute[1] - else: - final_attributes[split_attribute[0]] = split_attribute[1] - return final_attributes, starting_index - else: - return {}, starting_index + return self._parse_attribute_string(line_number, attr_string), starting_index @staticmethod def _get_line_section(tag_line, starting_index, start_delim='[', end_delim=']'): @@ -497,16 +445,6 @@ def _create_entry(self, line_number, tag_line, key_class, element_name=None): return tag_entry - def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed", - error_code=HedExceptions.WIKI_DELIMITERS_INVALID): - self.fatal_errors.append( - {'code': error_code, - ErrorContext.ROW: line_number, - ErrorContext.LINE: line, - "message": f"{warning_message}" - } - ) - def _check_for_new_section(self, line, strings_for_section, current_section): new_section = None for key, section_string in SectionStarts.items(): diff --git a/pyproject.toml b/pyproject.toml index 37f404976..1dde170f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,6 @@ dependencies = [ "portalocker", "python-dateutil", "pytz", - "rdflib", "semantic-version", "six", "wordcloud==1.9.3" diff --git a/requirements.txt b/requirements.txt index dfcc49167..9af8a52d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,5 @@ openpyxl>=3.1.0 pandas>=1.3.5 pillow>=10.2.0 portalocker>=2.7.0 -rdflib>=6 semantic_version>=2.10.0 wordcloud>=1.9.3 diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index bf8db95fc..50d5d09b9 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -1,7 +1,5 @@ import unittest -import rdflib - from 
hed.errors import HedFileError from hed.errors.error_types import SchemaErrors from hed.schema import load_schema, HedSchemaGroup, load_schema_version, HedSchema diff --git a/tests/schema/test_hed_schema_io_df.py b/tests/schema/test_hed_schema_io_df.py new file mode 100644 index 000000000..a22e15481 --- /dev/null +++ b/tests/schema/test_hed_schema_io_df.py @@ -0,0 +1,59 @@ +import unittest +import shutil + +from hed.schema import load_schema, load_schema_version, from_string +from hed.schema.hed_schema_df_constants import * + +import os + + +class TestHedSchemaDF(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.output_folder = "test_output/" + os.makedirs(cls.output_folder, exist_ok=True) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.output_folder) + + def test_saving_default_schemas(self): + schema = load_schema_version("8.3.0") + schema.save_as_dataframes(self.output_folder + "test_8.tsv") + + reloaded_schema = load_schema(self.output_folder + "test_8.tsv") + self.assertEqual(schema, reloaded_schema) + + schema = load_schema_version("score_1.1.1") + schema.save_as_dataframes(self.output_folder + "test_score.tsv", save_merged=True) + + reloaded_schema = load_schema(self.output_folder + "test_score.tsv") + self.assertEqual(schema, reloaded_schema) + + schema = load_schema_version("testlib_3.0.0") + schema.save_as_dataframes(self.output_folder + "test_testlib.tsv", save_merged=True) + + reloaded_schema = load_schema(self.output_folder + "test_testlib.tsv") + self.assertEqual(schema, reloaded_schema) + + schema = load_schema_version("testlib_3.0.0") + schema.save_as_dataframes(self.output_folder + "test_testlib2.tsv", save_merged=False) + + reloaded_schema = load_schema(self.output_folder + "test_testlib2.tsv") + self.assertEqual(schema, reloaded_schema) + + def test_saving_default(self): + schema = load_schema_version("8.3.0") + schema.save_as_dataframes(self.output_folder + "test_8_string.tsv") + + filenames = {STRUCT_KEY: 
self.output_folder + "test_8_string_Structure.tsv",
+                     TAG_KEY: self.output_folder + "test_8_string_Tag.tsv"}
+
+        new_file_strings = {}
+        for key, value in filenames.items():
+            with open(value, "r", encoding="utf-8") as f:
+                all_lines = f.readlines()
+                new_file_strings[key] = "".join(all_lines)
+
+        reloaded_schema = from_string(new_file_strings, ".tsv")
+        self.assertEqual(schema, reloaded_schema)