diff --git a/hed/schema/schema_io/ontology_util.py b/hed/schema/schema_io/ontology_util.py index 5e3803355..75b132505 100644 --- a/hed/schema/schema_io/ontology_util.py +++ b/hed/schema/schema_io/ontology_util.py @@ -64,7 +64,10 @@ def _get_hedid_range(schema_name, df_key): starting_id, ending_id = library_index_ranges[schema_name] start_object_range, end_object_range = object_type_id_offset[df_key] - initial_tag_adj = 1 # We always skip 1 + if df_key == constants.TAG_KEY: + initial_tag_adj = 1 # We always skip 1 for tags + else: + initial_tag_adj = 0 final_start = starting_id + start_object_range + initial_tag_adj final_end = starting_id + end_object_range if end_object_range == -1: @@ -108,19 +111,22 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids dataframes(dict of str:pd.DataFrames): The updated dataframes These dataframes can potentially have extra columns """ + hedid_errors = [] # 1. Verify existing hed ids don't conflict between schema/dataframes - for key, df in dataframes.items(): - section_key = constants.section_mapping.get(key) + for df_key, df in dataframes.items(): + section_key = constants.section_mapping.get(df_key) if not section_key: continue section = schema[section_key] - hedid_errors = _verify_hedid_matches(section, df) - if hedid_errors: - raise HedFileError(hedid_errors[0]['code'], - f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " - f"parameter on this exception for more details.", schema.name, - issues=hedid_errors) + unused_tag_ids = _get_hedid_range(schema_name, df_key) + hedid_errors += _verify_hedid_matches(section, df, unused_tag_ids) + + if hedid_errors: + raise HedFileError(hedid_errors[0]['code'], + f"{len(hedid_errors)} issues found with hedId mismatches. See the .issues " + f"parameter on this exception for more details.", schema.name, + issues=hedid_errors) # 2. Get the new schema as DFs from hed.schema.schema_io.schema2df import Schema2DF # Late import as this is recursive @@ -144,12 +150,13 @@ def update_dataframes_from_schema(dataframes, schema, schema_name="", get_as_ids return output_dfs -def _verify_hedid_matches(section, df): +def _verify_hedid_matches(section, df, unused_tag_ids): """ Verify ID's in both have the same label, and verify all entries in the dataframe are already in the schema Parameters: section(HedSchemaSection): The loaded schema section to compare ID's with df(pd.DataFrame): The loaded spreadsheet dataframe to compare with + unused_tag_ids(set): The valid range of ID's for this df Returns: error_list(list of str): A list of errors found matching id's @@ -168,6 +175,23 @@ def _verify_hedid_matches(section, df): f"'{label}' does not exist in the schema file provided, only the spreadsheet.") continue entry_id = entry.attributes.get(HedKey.HedID) + if df_id: + if not (df_id.startswith("HED_") and len(df_id) == len("HED_0000000")): + hedid_errors += schema_util.format_error(row_number, row, + f"'{label}' has an improperly formatted hedID in the dataframe.") + continue + id_value = remove_prefix(df_id, "HED_") + try: + id_int = int(id_value) + if id_int not in unused_tag_ids: + hedid_errors += schema_util.format_error(row_number, row, + f"'{label}' has id {id_int} which is outside of the valid range for this type. Valid range is: {min(unused_tag_ids)} to {max(unused_tag_ids)}") + continue + except ValueError: + hedid_errors += schema_util.format_error(row_number, row, + f"'{label}' has a non-numeric hedID in the dataframe.") + continue + if entry_id and entry_id != df_id: hedid_errors += schema_util.format_error(row_number, row, f"'{label}' has hedID '{df_id}' in dataframe, but '{entry_id}' in schema.") @@ -232,7 +256,9 @@ def convert_df_to_omn(dataframes): dataframes(dict): A set of dataframes representing a schema, potentially including extra columns Returns: - omn_file(str): A combined string representing (most of) a schema omn file. + tuple: + omn_file(str): A combined string representing (most of) a schema omn file. + omn_data(dict): a dict of DF_SUFFIXES:str, representing each .tsv file in omn format. """ from hed.schema.hed_schema_io import from_dataframes # Load the schema, so we can save it out with ID's @@ -243,13 +269,15 @@ def convert_df_to_omn(dataframes): # Write out the new dataframes in omn format annotation_props = _get_annotation_prop_ids(dataframes) full_text = "" + omn_data = {} for suffix, dataframe in dataframes.items(): if suffix == constants.STRUCT_KEY: # not handled here yet continue output_text = _convert_df_to_omn(dataframes[suffix], annotation_properties=annotation_props) + omn_data[suffix] = output_text full_text += output_text + "\n" - return full_text + return full_text, omn_data def _convert_df_to_omn(df, annotation_properties=("",)): diff --git a/tests/schema/test_ontology_util.py b/tests/schema/test_ontology_util.py index 224902dd4..9bada7075 100644 --- a/tests/schema/test_ontology_util.py +++ b/tests/schema/test_ontology_util.py @@ -36,7 +36,7 @@ def test_get_library_name_and_id_unknown(self): def test_get_hedid_range_normal_case(self): id_set = ontology_util._get_hedid_range("score", constants.DATA_KEY) self.assertTrue(40401 in id_set) - self.assertEqual(len(id_set), 200 - 1) # Check the range size + self.assertEqual(len(id_set), 200) # Check the range size def test_get_hedid_range_boundary(self): # Test boundary condition where end range is -1 @@ -55,27 +55,41 @@ def setUp(self): self.schema_id = load_schema_version("8.3.0") def test_no_hedid(self): - df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': '001'}, {'rdfs:label': 'Age-#', 'hedId': '002'}]) - errors = _verify_hedid_matches(self.schema_82.tags, df) + df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': ''}, {'rdfs:label': 'Age-#', 'hedId': ''}]) + errors = _verify_hedid_matches(self.schema_82.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 0) def test_id_matches(self): df = pd.DataFrame( [{'rdfs:label': 'Event', 'hedId': 'HED_0012001'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012475'}]) - errors = _verify_hedid_matches(self.schema_id.tags, df) + errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 0) def test_label_mismatch_id(self): df = pd.DataFrame( - [{'rdfs:label': 'Event', 'hedId': 'invalid_id'}, {'rdfs:label': 'Age-#', 'hedId': 'invalid_id'}]) + [{'rdfs:label': 'Event', 'hedId': 'HED_0012005'}, {'rdfs:label': 'Age-#', 'hedId': 'HED_0012007'}]) - errors = _verify_hedid_matches(self.schema_id.tags, df) + errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 2) def test_label_no_entry(self): df = pd.DataFrame([{'rdfs:label': 'NotARealEvent', 'hedId': 'does_not_matter'}]) - errors = _verify_hedid_matches(self.schema_id.tags, df) + errors = _verify_hedid_matches(self.schema_id.tags, df, ontology_util._get_hedid_range("", constants.TAG_KEY)) + self.assertEqual(len(errors), 1) + + def test_out_of_range(self): + df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': 'HED_0000000'}]) + + errors = _verify_hedid_matches(self.schema_82.tags, df, + ontology_util._get_hedid_range("", constants.TAG_KEY)) + self.assertEqual(len(errors), 1) + + def test_not_int(self): + df = pd.DataFrame([{'rdfs:label': 'Event', 'hedId': 'HED_AAAAAAA'}]) + + errors = _verify_hedid_matches(self.schema_82.tags, df, + ontology_util._get_hedid_range("", constants.TAG_KEY)) self.assertEqual(len(errors), 1) def test_get_all_ids_exists(self): @@ -143,7 +157,7 @@ def test_update_dataframes_from_schema(self): class TestConvertOmn(unittest.TestCase): def test_convert_df_to_omn(self): dataframes = load_schema_version("8.3.0").get_as_dataframes() - omn_version = convert_df_to_omn(dataframes) + omn_version, _ = convert_df_to_omn(dataframes) # make these more robust, for now just verify it's somewhere in the result for df_name, df in dataframes.items():