From 317c381d23907bacacdb6eeef0c3f0d3189913ed Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 23 May 2024 18:41:55 -0500 Subject: [PATCH] Minor tweaks to spreadsheet script/code Better error on no schemas changed Load an empty spreadsheet if no sheet found for .tsv format (this may need adjustment) --- hed/schema/hed_schema_df_constants.py | 4 ++-- hed/schema/schema_io/df2schema.py | 9 +++++---- hed/schema/schema_io/ontology_util.py | 16 ++++++++++++++++ hed/schema/schema_io/schema2df.py | 16 ++-------------- hed/scripts/convert_and_update_schema.py | 6 +++++- hed/scripts/script_util.py | 14 +++++++++++--- tests/scripts/test_script_util.py | 18 ++++++++++++++++-- 7 files changed, 57 insertions(+), 26 deletions(-) diff --git a/hed/schema/hed_schema_df_constants.py b/hed/schema/hed_schema_df_constants.py index 8642d914d..a1f3e4180 100644 --- a/hed/schema/hed_schema_df_constants.py +++ b/hed/schema/hed_schema_df_constants.py @@ -40,11 +40,11 @@ subclass_of = "omn:SubClassOf" attributes = "Attributes" description = "dc:description" -equivalent_to = "owm:EquivalentTo" +equivalent_to = "omn:EquivalentTo" has_unit_class = "hasUnitClass" struct_columns = [hed_id, name, attributes, subclass_of, description] -tag_columns = [hed_id, level, name, subclass_of, attributes, description, equivalent_to] +tag_columns = [hed_id, name, level, subclass_of, attributes, description, equivalent_to] unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, description, equivalent_to] # The columns for unit class, value class, and unit modifier diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py index 71de1a095..043617b61 100644 --- a/hed/schema/schema_io/df2schema.py +++ b/hed/schema/schema_io/df2schema.py @@ -4,7 +4,7 @@ import io import os -import hed.schema.schema_io.ontology_util +from hed.schema.schema_io import ontology_util from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, 
HedExceptions from hed.schema.schema_io.base2schema import SchemaLoader @@ -282,7 +282,7 @@ def _get_tag_attributes(self, row_number, row): dict: Dictionary of attributes. """ try: - return hed.schema.schema_io.ontology_util.get_attributes_from_row(row) + return ontology_util.get_attributes_from_row(row) except ValueError as e: self._add_fatal_error(row_number, str(row), str(e)) @@ -297,12 +297,13 @@ def _add_to_dict(self, line_number, line, entry, key_class): def load_dataframes(filenames): dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames) - dataframes = {} + dataframes = ontology_util.create_empty_dataframes() for key, filename in dict_filenames.items(): try: dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False) except OSError: - dataframes[key] = None + # todo: consider if we want to report this error(we probably do) + pass # We will use a blank one for this return dataframes diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py index 25d53c493..d11e362d4 100644 --- a/hed/schema/schema_io/ontology_util.py +++ b/hed/schema/schema_io/ontology_util.py @@ -399,3 +399,19 @@ def get_attributes_from_row(row): else: attr_string = "" return parse_attribute_string(attr_string) + + +def create_empty_dataframes(): + """Returns the default empty dataframes""" + return { + constants.STRUCT_KEY: pd.DataFrame(columns=constants.struct_columns, dtype=str), + constants.TAG_KEY: pd.DataFrame(columns=constants.tag_columns, dtype=str), + constants.UNIT_KEY: pd.DataFrame(columns=constants.unit_columns, dtype=str), + constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), + constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), + constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), + constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), + constants.DATA_KEY: 
pd.DataFrame(columns=constants.property_columns, dtype=str), + constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), + constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, dtype=str), + } diff --git a/hed/schema/schema_io/schema2df.py b/hed/schema/schema_io/schema2df.py index 1728a8dc5..46dc6f563 100644 --- a/hed/schema/schema_io/schema2df.py +++ b/hed/schema/schema_io/schema2df.py @@ -1,7 +1,7 @@ """Allows output of HedSchema objects as .mediawiki format""" from hed.schema.hed_schema_constants import HedSectionKey, HedKey -from hed.schema.schema_io.ontology_util import get_library_name_and_id, remove_prefix +from hed.schema.schema_io.ontology_util import get_library_name_and_id, remove_prefix, create_empty_dataframes from hed.schema.schema_io.schema2base import Schema2Base import pandas as pd import hed.schema.hed_schema_df_constants as constants @@ -56,18 +56,7 @@ def _get_object_id(self, object_name, base_id=0, include_prefix=False): # Required baseclass function # ========================================= def _initialize_output(self): - self.output = { - constants.STRUCT_KEY: pd.DataFrame(columns=constants.struct_columns, dtype=str), - constants.TAG_KEY: pd.DataFrame(columns=constants.tag_columns, dtype=str), - constants.UNIT_KEY: pd.DataFrame(columns=constants.unit_columns, dtype=str), - constants.UNIT_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), - constants.UNIT_MODIFIER_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), - constants.VALUE_CLASS_KEY: pd.DataFrame(columns=constants.other_columns, dtype=str), - constants.ANNOTATION_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), - constants.DATA_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), - constants.OBJECT_KEY: pd.DataFrame(columns=constants.property_columns, dtype=str), - constants.ATTRIBUTE_PROPERTY_KEY: pd.DataFrame(columns=constants.property_columns_reduced, 
dtype=str), - } + self.output = create_empty_dataframes() self._tag_rows = [] def _create_and_add_object_row(self, base_object, attributes="", description=""): @@ -327,4 +316,3 @@ def _calculate_attribute_type(attribute_entry): elif any(attribute in object_ranges for attribute in attributes): return "object" return "data" - diff --git a/hed/scripts/convert_and_update_schema.py b/hed/scripts/convert_and_update_schema.py index bc34d9e8d..4117985d7 100644 --- a/hed/scripts/convert_and_update_schema.py +++ b/hed/scripts/convert_and_update_schema.py @@ -19,7 +19,11 @@ def convert_and_update(filenames, set_ids): schema_files = sort_base_schemas(filenames) all_issues = validate_all_schemas(schema_files) - if all_issues or not schema_files: + if not schema_files: + print("No schema file changes found in the file list") + return 0 + + if all_issues: print("Did not attempt to update schemas due to validation failures") return 1 diff --git a/hed/scripts/script_util.py b/hed/scripts/script_util.py index 278415742..441ec736f 100644 --- a/hed/scripts/script_util.py +++ b/hed/scripts/script_util.py @@ -13,12 +13,20 @@ def validate_schema(file_path): """ validation_issues = [] try: + _, extension = os.path.splitext(file_path) + if extension.lower() != extension: + error_message = f"Only fully lowercase extensions are allowed for schema files. 
" \ + f"Invalid extension on file: {file_path}" + validation_issues.append(error_message) + return validation_issues + base_schema = load_schema(file_path) issues = base_schema.check_compliance() issues = [issue for issue in issues if issue["code"] != SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED] if issues: error_message = get_printable_issue_string(issues, title=file_path) validation_issues.append(error_message) + return validation_issues mediawiki_string = base_schema.get_as_mediawiki_string() reloaded_schema = from_string(mediawiki_string, schema_format=".mediawiki") @@ -47,7 +55,7 @@ def validate_schema(file_path): def add_extension(basename, extension): """Generate the final name for a given extension. Only .tsv varies notably.""" - if extension.lower() == ".tsv": + if extension == ".tsv": parent_path, basename = os.path.split(basename) return os.path.join(parent_path, "hedtsv", basename) return basename + extension @@ -74,10 +82,10 @@ def sort_base_schemas(filenames): schema_files = defaultdict(set) for file_path in filenames: basename, extension = os.path.splitext(file_path) - if extension.lower() == ".xml" or extension.lower() == ".mediawiki": + if extension == ".xml" or extension == ".mediawiki": schema_files[basename].add(extension) continue - elif extension.lower() == ".tsv": + elif extension == ".tsv": tsv_basename = basename.rpartition("_")[0] full_parent_path, real_basename = os.path.split(tsv_basename) full_parent_path, real_basename2 = os.path.split(full_parent_path) diff --git a/tests/scripts/test_script_util.py b/tests/scripts/test_script_util.py index 638ad5a84..694a61bf0 100644 --- a/tests/scripts/test_script_util.py +++ b/tests/scripts/test_script_util.py @@ -2,7 +2,7 @@ import os import shutil from hed import load_schema_version -from hed.scripts.script_util import add_extension, sort_base_schemas, validate_all_schema_formats +from hed.scripts.script_util import add_extension, sort_base_schemas, validate_all_schema_formats, validate_schema 
class TestAddExtension(unittest.TestCase): @@ -25,9 +25,10 @@ def test_empty_extension(self): def test_none_extension(self): """Test behavior with None as extension.""" - with self.assertRaises(AttributeError): + with self.assertRaises(TypeError): add_extension("filename", None) + class TestSortBaseSchemas(unittest.TestCase): def test_mixed_file_types(self): filenames = [ @@ -119,3 +120,16 @@ def test_error_no_error(self): def tearDownClass(cls): """Remove the entire directory created for testing to ensure a clean state.""" shutil.rmtree(cls.base_path) # This will delete the directory and all its contents + + +class TestValidateSchema(unittest.TestCase): + def test_load_invalid_extension(self): + # Verify capital letters fail validation + self.assertIn("Only fully lowercase extensions ", validate_schema("does_not_matter.MEDIAWIKI")[0]) + self.assertIn("Only fully lowercase extensions ", validate_schema("does_not_matter.Mediawiki")[0]) + self.assertIn("Only fully lowercase extensions ", validate_schema("does_not_matter.XML")[0]) + self.assertIn("Only fully lowercase extensions ", validate_schema("does_not_matter.Xml")[0]) + self.assertIn("Only fully lowercase extensions ", validate_schema("does_not_matter.TSV")[0]) + self.assertNotIn("Only fully lowercase extensions ", validate_schema("does_not_matter.tsv")[0]) + self.assertNotIn("Only fully lowercase extensions ", validate_schema("does_not_matter.xml")[0]) + self.assertNotIn("Only fully lowercase extensions ", validate_schema("does_not_matter.mediawiki")[0]) \ No newline at end of file