From 9852e849c3e412166988a03f18734b34ad9e207f Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Sun, 6 Aug 2023 07:25:53 -0500 Subject: [PATCH 1/3] Wrote experimental code to compare schemas --- hed/schema/schema_compare.py | 7 ++++--- .../remodeling/operations/summarize_column_values_op.py | 6 +++--- .../remodeling/operations/summarize_definitions_op.py | 3 +-- hed/tools/remodeling/operations/summarize_hed_tags_op.py | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hed/schema/schema_compare.py b/hed/schema/schema_compare.py index 49d5c72b6..57bdb656e 100644 --- a/hed/schema/schema_compare.py +++ b/hed/schema/schema_compare.py @@ -20,7 +20,8 @@ def find_matching_tags(schema1, schema2, return_string=False): section_dict.update(unequal_entries[section_key]) if return_string: - return "\n".join([pretty_print_diff_all(entries, prompt="Found matching node ") for entries in matches.values()]) + return "\n".join([pretty_print_diff_all(entries, + prompt="Found matching node ") for entries in matches.values()]) return matches @@ -64,7 +65,7 @@ def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, section attribute_filter (str, optional): The attribute to filter entries by. Entries without this attribute are skipped. If it evaluates to False, no filtering is performed. - sections(list): the list of sections to compare. By default, just the tags section. + sections(tuple): the list of sections to compare. By default, just the tags section. 
Returns: tuple: A tuple containing four dictionaries: @@ -199,4 +200,4 @@ def pretty_print_missing_all(entries, schema_name): output.append(f"'{key}' not in {schema_name}':") output += pretty_print_entry(entry) - return "\n".join(output) \ No newline at end of file + return "\n".join(output) diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index 825594aea..b53d8aa38 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -80,7 +80,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): Updates the relevant summary. """ - + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: @@ -133,8 +133,8 @@ def get_details_dict(self, summary): "Files": [name for name in this_summary['Files'].keys()], "Specifics": {"Value columns": this_summary['Value columns'].keys(), "Skip columns": this_summary['Skip columns'], - "Value columns": this_summary['Value columns'], - "Categorical columns": this_summary['Categorical columns'], + "Value column summaries": this_summary['Value columns'], + "Categorical column summaries": this_summary['Categorical columns'], "Categorical counts": this_summary['Categorical counts']}} def merge_all_info(self): diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py index 6be941352..5a1e21804 100644 --- a/hed/tools/remodeling/operations/summarize_definitions_op.py +++ b/hed/tools/remodeling/operations/summarize_definitions_op.py @@ -129,8 +129,7 @@ def get_details_dict(self, def_gatherer): known_defs_summary.update(ambiguous_defs_summary) known_defs_summary.update(errors_summary) return {"Name": "", "Total events": 0, "Total files": 0, "Files": [], "Specifics": known_defs_summary} - - return known_defs_summary + # return known_defs_summary def 
merge_all_info(self): """ Create an Object containing the definition summary. diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index f88650ccb..0fcec5411 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -35,8 +35,8 @@ class SummarizeHedTagsOp(BaseOp): }, "optional_parameters": { "append_timecode": bool, - "expand_definitions": bool, - "expand_context": bool + "expand_context": bool, + "expand_definitions": bool } } From 3c71d262c751cd78eeaae9d38a4bc4098e5d9510 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Sun, 6 Aug 2023 09:39:56 -0500 Subject: [PATCH 2/3] Corrected column value summary output --- hed/schema/schema_compare.py | 18 ++++++++++----- .../operations/summarize_column_values_op.py | 22 +++++++++---------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/hed/schema/schema_compare.py b/hed/schema/schema_compare.py index 5729dd748..fc2caf41e 100644 --- a/hed/schema/schema_compare.py +++ b/hed/schema/schema_compare.py @@ -27,7 +27,8 @@ def find_matching_tags(schema1, schema2, output='default', sections=(HedSectionK section_dict.update(unequal_entries[section_key]) if output == 'string': - return "\n".join([_pretty_print_diff_all(entries, prompt="Found matching node ") for entries in matches.values()]) + return "\n".join([_pretty_print_diff_all(entries, prompt="Found matching node ") + for entries in matches.values()]) elif output == 'dict': output_dict = {} for section_name, section_entries in matches.items(): @@ -38,16 +39,16 @@ def find_matching_tags(schema1, schema2, output='default', sections=(HedSectionK return matches -def compare_differences(schema1, schema2, output='default', attribute_filter=None, sections=(HedSectionKey.Tags,)): +def compare_differences(schema1, schema2, output='raw', attribute_filter=None, 
sections=(HedSectionKey.Tags,)): """ Compare the tags in two schemas, this finds any differences Parameters: schema1 (HedSchema): The first schema to be compared. schema2 (HedSchema): The second schema to be compared. - output (str): Defaults to returning a set of python object dicts. + output (str): 'raw' (default) returns a tuple of python object dicts with raw results. 'string' returns a single string - 'dict' returns a json style dictionary + 'dict' returns a json-style python dictionary that can be converted to JSON attribute_filter (str, optional): The attribute to filter entries by. Entries without this attribute are skipped. The most common use would be HedKey.InLibrary @@ -56,11 +57,16 @@ def compare_differences(schema1, schema2, output='default', attribute_filter=Non If None, checks all sections including header, prologue, and epilogue. Returns: - tuple or str: A tuple containing three dictionaries: + tuple, str or dict: + - Tuple with dict entries (not_in_schema1, not_in_schema2, unequal_entries). + - Formatted string with the output ready for printing. + - A Python dictionary with the output ready to be converted to JSON (for web output). + + Notes: The underlying dictionaries are: - not_in_schema1(dict): Entries present in schema2 but not in schema1. - not_in_schema2(dict): Entries present in schema1 but not in schema2. - unequal_entries(dict): Entries that differ between the two schemas. 
- - or a formatted string of the differences + """ _, not_in_1, not_in_2, unequal_entries = compare_schemas(schema1, schema2, attribute_filter=attribute_filter, sections=sections) diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index b53d8aa38..94573a137 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -130,8 +130,8 @@ def get_details_dict(self, summary): this_summary['Categorical columns'][key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)]) return {"Name": this_summary['Name'], "Total events": this_summary["Total events"], "Total files": this_summary['Total files'], - "Files": [name for name in this_summary['Files'].keys()], - "Specifics": {"Value columns": this_summary['Value columns'].keys(), + "Files": list(this_summary['Files'].keys()), + "Specifics": {"Value columns": list(this_summary['Value columns']), "Skip columns": this_summary['Skip columns'], "Value column summaries": this_summary['Value columns'], "Categorical column summaries": this_summary['Categorical columns'], @@ -209,9 +209,9 @@ def _get_dataset_string(self, result, indent=BaseSummary.DISPLAY_INDENT): cat_string = self._get_categorical_string(specifics, offset="", indent=indent) if cat_string: sum_list.append(cat_string) - val_cols = specifics.get("Value columns", {}) - if val_cols: - sum_list.append(ColumnValueSummary._get_value_string(val_cols, offset="", indent=indent)) + val_dict = specifics.get("Value column summaries", {}) + if val_dict: + sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent)) return "\n".join(sum_list) def _get_individual_string(self, result, indent=BaseSummary.DISPLAY_INDENT): @@ -228,12 +228,12 @@ def _get_individual_string(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Total events={result.get('Total events', 
0)}"] specifics = result.get("Specifics", {}) - cat_cols = result.get("Categorical columns", {}) - if cat_cols: - sum_list.append(self._get_categorical_string(cat_cols, offset=indent, indent=indent)) - val_cols = result.get("Value columns", {}) - if val_cols: - sum_list.append(ColumnValueSummary._get_value_string(val_cols, offset=indent, indent=indent)) + cat_dict = specifics.get("Categorical column summaries", {}) + if cat_dict: + sum_list.append(self._get_categorical_string(cat_dict, offset=indent, indent=indent)) + val_dict = specifics.get("Value column summaries", {}) + if val_dict: + sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset=indent, indent=indent)) return "\n".join(sum_list) def _get_categorical_col(self, entry, count_dict, offset="", indent=" "): From 6ea3d7bb03a5d52c9d5fc4cfae26949f572e12c1 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Mon, 7 Aug 2023 07:47:31 -0500 Subject: [PATCH 3/3] Minor corrections to docs --- hed/schema/hed_schema_io.py | 12 ++++++------ spec_tests/test_errors.py | 2 +- tests/schema/test_schema_converters.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 60a42d154..11980ed47 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -15,12 +15,12 @@ MAX_MEMORY_CACHE = 20 -def from_string(schema_string, file_type=".xml", schema_namespace=None): +def from_string(schema_string, schema_format=".xml", schema_namespace=None): """ Create a schema from the given string. Parameters: schema_string (str): An XML or mediawiki file as a single long string. - file_type (str): The extension(including the .) corresponding to a file source. + schema_format (str): The schema format of the source schema string. schema_namespace (str, None): The name_prefix all tags in this schema will accept. 
Returns: @@ -38,12 +38,12 @@ def from_string(schema_string, file_type=".xml", schema_namespace=None): raise HedFileError(HedExceptions.BAD_PARAMETERS, "Empty string passed to HedSchema.from_string", filename=schema_string) - if file_type.endswith(".xml"): + if schema_format.endswith(".xml"): hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string) - elif file_type.endswith(".mediawiki"): + elif schema_format.endswith(".mediawiki"): hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string) else: - raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=file_type) + raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=schema_format) if schema_namespace: hed_schema.set_schema_prefix(schema_namespace=schema_namespace) @@ -75,7 +75,7 @@ def load_schema(hed_path=None, schema_namespace=None): if is_url: file_as_string = schema_util.url_to_string(hed_path) - hed_schema = from_string(file_as_string, file_type=os.path.splitext(hed_path.lower())[1]) + hed_schema = from_string(file_as_string, schema_format=os.path.splitext(hed_path.lower())[1]) elif hed_path.lower().endswith(".xml"): hed_schema = SchemaLoaderXML.load(hed_path) elif hed_path.lower().endswith(".mediawiki"): diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index e48333c5d..2ee73fc9e 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -223,7 +223,7 @@ def _run_single_schema_test(self, info, error_code, description,name, error_hand for test in tests: schema_string = "\n".join(test) try: - loaded_schema = from_string(schema_string, file_type=".mediawiki") + loaded_schema = from_string(schema_string, schema_format=".mediawiki") issues = loaded_schema.check_compliance() except HedFileError as e: issues = e.issues diff --git a/tests/schema/test_schema_converters.py b/tests/schema/test_schema_converters.py index e2e1eb465..5f7c1d121 100644 --- a/tests/schema/test_schema_converters.py +++ 
b/tests/schema/test_schema_converters.py @@ -47,7 +47,7 @@ def test_schema_as_string_wiki(self): with open(self.wiki_file) as file: hed_schema_as_string = "".join([line for line in file]) - string_schema = schema.from_string(hed_schema_as_string, file_type=".mediawiki") + string_schema = schema.from_string(hed_schema_as_string, schema_format=".mediawiki") self.assertEqual(string_schema, self.hed_schema_wiki) def test_wikischema2xml(self):