From 4c79d1b3c041cd37c9de3853a8f8e7ff4ec37a14 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 16 Mar 2023 17:01:16 -0500 Subject: [PATCH] Add some df tests. Update hed_assemble. Make the df utils also work on series. --- hed/models/df_util.py | 54 ++++++++----- hed/tools/analysis/analysis_util.py | 7 +- tests/models/test_df_util.py | 114 ++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 23 deletions(-) create mode 100644 tests/models/test_df_util.py diff --git a/hed/models/df_util.py b/hed/models/df_util.py index d877028aa..66b5c75be 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -1,4 +1,5 @@ from functools import partial +import pandas as pd from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput @@ -51,7 +52,7 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ for x in text_file_row] for text_file_row in tabular_file.dataframe_a.itertuples(index=False)], def_dict -def convert_to_form(df, hed_schema, tag_form, columns): +def convert_to_form(df, hed_schema, tag_form, columns=None): """ Convert all tags in underlying dataframe to the specified form. Converts in place @@ -61,51 +62,62 @@ def convert_to_form(df, hed_schema, tag_form, columns): tag_form(str): HedTag property to convert tags to. columns (list): The columns to modify on the dataframe """ - if columns is None: - columns = df.columns + if isinstance(df, pd.Series): + df = df.apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + else: + if columns is None: + columns = df.columns - for column in columns: - df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) + for column in columns: + df[column] = df[column].apply(partial(_convert_to_form, hed_schema=hed_schema, tag_form=tag_form)) return df -def shrink_defs(df, hed_schema, columns): +def shrink_defs(df, hed_schema, columns=None): """ Shrinks any def-expand tags found in the dataframe. Converts in place Parameters: - df (pd.Dataframe): The dataframe to modify + df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema or None): The schema to use to identify defs. - columns (list): The columns to modify on the dataframe + columns (list or None): The columns to modify on the dataframe """ - if columns is None: - columns = df.columns + if isinstance(df, pd.Series): + mask = df.str.contains('Def-expand/', case=False) + df[mask] = df[mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) + else: + if columns is None: + columns = df.columns - for column in columns: - mask = df[column].str.contains('Def-expand/', case=False) - df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) + for column in columns: + mask = df[column].str.contains('Def-expand/', case=False) + df[column][mask] = df[column][mask].apply(partial(_shrink_defs, hed_schema=hed_schema)) return df -def expand_defs(df, hed_schema, def_dict, columns): +def expand_defs(df, hed_schema, def_dict, columns=None): """ Expands any def tags found in the dataframe. Converts in place Parameters: - df (pd.Dataframe): The dataframe to modify + df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema or None): The schema to use to identify defs def_dict (DefinitionDict): The definitions to expand - columns (list): The columns to modify on the dataframe + columns (list or None): The columns to modify on the dataframe """ - if columns is None: - columns = df.columns + if isinstance(df, pd.Series): + mask = df.str.contains('Def/', case=False) + df[mask] = df[mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) + else: + if columns is None: + columns = df.columns - for column in columns: - mask = df[column].str.contains('Def/', case=False) - df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) + for column in columns: + mask = df[column].str.contains('Def/', case=False) + df[column][mask] = df[column][mask].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) return df diff --git a/hed/tools/analysis/analysis_util.py b/hed/tools/analysis/analysis_util.py index 27f442c3d..fcfd5284c 100644 --- a/hed/tools/analysis/analysis_util.py +++ b/hed/tools/analysis/analysis_util.py @@ -6,7 +6,7 @@ from hed.tools.util.data_util import separate_values from hed.models.hed_tag import HedTag from hed.models.hed_group import HedGroup -from hed.models.df_util import get_assembled, expand_defs +from hed.models import df_util def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False): @@ -29,7 +29,10 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs hed_string_list = data_input.series_a definitions = sidecar.get_def_dict(hed_schema=schema) if expand_defs: - expand_defs(hed_string_list, schema, definitions, columns=None) + df_util.expand_defs(hed_string_list, schema, definitions) + # Keep in mind hed_string_list is now a Series. The rest of the function should probably + # also be modified + # hed_obj_list, defs = get_assembled(data_input, sidecar, schema, extra_def_dicts=None, join_columns=True, # shrink_defs=False, expand_defs=True) # hed_string_list = [str(hed) for hed in hed_obj_list] diff --git a/tests/models/test_df_util.py b/tests/models/test_df_util.py new file mode 100644 index 000000000..bc9c907b7 --- /dev/null +++ b/tests/models/test_df_util.py @@ -0,0 +1,114 @@ +import unittest +import pandas as pd + + +from hed import load_schema_version +from hed.models.df_util import shrink_defs, expand_defs +from hed import DefinitionDict + + +class TestShrinkDefs(unittest.TestCase): + def setUp(self): + self.schema = load_schema_version() + + def test_shrink_defs_normal(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_placeholder(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_no_matching_tags(self): + df = pd.DataFrame({"column1": ["(Event/SomeEvent, Item/SomeItem,Age/25)"]}) + expected_df = pd.DataFrame({"column1": ["(Event/SomeEvent, Item/SomeItem,Age/25)"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_multiple_columns(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"], + "column2": ["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"], + "column2": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + result = shrink_defs(df, self.schema, ['column1', 'column2']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_multiple_defs_same_line(self): + df = pd.DataFrame({"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Age/30"]}) + expected_df = pd.DataFrame({"column1": ["Def/TestDefNormal,Def/TestDefPlaceholder/123,Age/30"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_mixed_tags(self): + df = pd.DataFrame({"column1": [ + "(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent,(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem,Age/25"]}) + expected_df = pd.DataFrame( + {"column1": ["Def/TestDefNormal,Event/SomeEvent,Def/TestDefPlaceholder/123,Item/SomeItem,Age/25"]}) + result = shrink_defs(df, self.schema, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_shrink_defs_series_normal(self): + series = pd.Series(["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]) + expected_series = pd.Series(["Def/TestDefNormal,Event/SomeEvent"]) + result = shrink_defs(series, self.schema, None) + pd.testing.assert_series_equal(result, expected_series) + + def test_shrink_defs_series_placeholder(self): + series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) + expected_series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) + result = shrink_defs(series, self.schema, None) + pd.testing.assert_series_equal(result, expected_series) + + +class TestExpandDefs(unittest.TestCase): + def setUp(self): + self.schema = load_schema_version() + self.def_dict = DefinitionDict(["(Definition/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2))", + "(Definition/TestDefPlaceholder/#,(Action/TestDef1/#,Action/TestDef2))"], + hed_schema=self.schema) + + def test_expand_defs_normal(self): + df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"]}) + expected_df = pd.DataFrame( + {"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_placeholder(self): + df = pd.DataFrame({"column1": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + expected_df = pd.DataFrame({"column1": [ + "(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_no_matching_tags(self): + df = pd.DataFrame({"column1": ["(Event/SomeEvent,Item/SomeItem,Age/25)"]}) + expected_df = pd.DataFrame({"column1": ["(Event/SomeEvent,Item/SomeItem,Age/25)"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_multiple_columns(self): + df = pd.DataFrame({"column1": ["Def/TestDefNormal,Event/SomeEvent"], + "column2": ["Def/TestDefPlaceholder/123,Item/SomeItem"]}) + expected_df = pd.DataFrame( + {"column1": ["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"], + "column2": [ + "(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]}) + result = expand_defs(df, self.schema, self.def_dict, ['column1', 'column2']) + pd.testing.assert_frame_equal(result, expected_df) + + def test_expand_defs_series_normal(self): + series = pd.Series(["Def/TestDefNormal,Event/SomeEvent"]) + expected_series = pd.Series(["(Def-expand/TestDefNormal,(Action/TestDef1/2471,Action/TestDef2)),Event/SomeEvent"]) + result = expand_defs(series, self.schema, self.def_dict, None) + pd.testing.assert_series_equal(result, expected_series) + + def test_expand_defs_series_placeholder(self): + series = pd.Series(["Def/TestDefPlaceholder/123,Item/SomeItem"]) + expected_series = pd.Series(["(Def-expand/TestDefPlaceholder/123,(Action/TestDef1/123,Action/TestDef2)),Item/SomeItem"]) + result = expand_defs(series, self.schema, self.def_dict, None) + pd.testing.assert_series_equal(result, expected_series) \ No newline at end of file