diff --git a/hed/models/df_util.py b/hed/models/df_util.py index f9fa19dcc..989299d2f 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -26,7 +26,7 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_ expand_defs: bool Expand any def tags found Returns: - tuple: A list of HedStrings, or a list of lists of HedStrings, DefinitionDict + tuple: A list of HedStrings or a list of lists of HedStrings, DefinitionDict """ if isinstance(sidecar, str): @@ -76,13 +76,13 @@ def convert_to_form(df, hed_schema, tag_form, columns=None): def shrink_defs(df, hed_schema, columns=None): - """ Shrinks any def-expand tags found in the dataframe. + """ Shrinks any def-expand tags found in the specified columns in the dataframe. Converts in place Parameters: df (pd.Dataframe or pd.Series): The dataframe or series to modify hed_schema (HedSchema or None): The schema to use to identify defs. - columns (list or None): The columns to modify on the dataframe + columns (list or None): The columns to modify on the dataframe. """ if isinstance(df, pd.Series): mask = df.str.contains('Def-expand/', case=False) diff --git a/hed/tools/analysis/analysis_util.py b/hed/tools/analysis/analysis_util.py index a4c57c9f6..aa13f288d 100644 --- a/hed/tools/analysis/analysis_util.py +++ b/hed/tools/analysis/analysis_util.py @@ -6,6 +6,7 @@ from hed.models.hed_tag import HedTag from hed.models.hed_group import HedGroup from hed.models import df_util +from hed.models import QueryParser def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False): @@ -44,6 +45,68 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs return df, definitions +def get_expression_parsers(queries, query_names=None): + """ Returns a list of expression parsers and query_names. + + Parameters: + queries (list): A list of query strings or QueryParser objects + query_names (list): A list of column names for results of queries. 
If missing --- query_0, query_1, etc. + + Returns: + tuple - a list of QueryParser objects and the list of query names + + Raises: + ValueError - if query names are invalid or duplicated, or if a query is empty or cannot be parsed. + + """ + expression_parsers = [] + if not query_names: + query_names = [f"query_{index}" for index in range(len(queries))] + elif len(queries) != len(query_names): + raise ValueError("QueryNamesLengthBad", + f"The query_names length {len(query_names)} must be empty or equal" + + f"to the queries length {len(queries)}.") + elif len(set(query_names)) != len(query_names): + raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates") + for index, query in enumerate(queries): + if not query: + raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty") + elif isinstance(query, str): + try: + next_query = QueryParser(query) + except Exception: + raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") + else: + next_query = query + expression_parsers.append(next_query) + return expression_parsers, query_names + + +def search_strings(hed_strings, queries, query_names=None): + """ Returns a DataFrame of factors based on results of queries. + + Parameters: + hed_strings (list): A list of HedString objects (empty entries or None entries are 0's) + queries (list): A list of query strings or QueryParser objects + query_names (list): A list of column names for results of queries. If missing --- query_0, query_1, etc. + + Returns: + DataFrame - containing the factor vectors with results of the queries + + Raises: + ValueError - if query names are invalid or duplicated. 
+ + """ + + expression_parsers, query_names = get_expression_parsers(queries, query_names=query_names) + df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names) + for parse_ind, parser in enumerate(expression_parsers): + for index, next_item in enumerate(hed_strings): + match = parser.search(next_item) + if match: + df_factors.at[index, query_names[parse_ind]] = 1 + return df_factors + # def get_assembled_strings(table, hed_schema=None, expand_defs=False): # """ Return HED string objects for a tabular file. # @@ -61,7 +124,7 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs # return hed_list # -# def search_tabular(data_input, hed_schema, query, columns_included=None): +# def search_tabular(data_input, sidecar, hed_schema, query, extra_def_dicts=None, columns_included=None): # """ Return a dataframe with results of query. # # Parameters: @@ -76,7 +139,8 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs # """ # # eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included) -# hed_list = get_assembled_strings(data_input, hed_schema=hed_schema, expand_defs=True) +# hed_list, definitions = df_util.get_assembled(data_input, sidecar, hed_schema, extra_def_dicts=None, join_columns=True, +# shrink_defs=False, expand_defs=True) # expression = QueryParser(query) # hed_tags = [] # row_numbers = [] diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index 930f1353f..ae1f35e63 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -8,6 +8,7 @@ from hed.models.sidecar import Sidecar from hed.models.expression_parser import QueryParser from hed.models.df_util import get_assembled +from hed.tools.analysis.analysis_util import get_expression_parsers, search_strings class FactorHedTagsOp(BaseOp): @@ 
-65,21 +66,8 @@ def __init__(self, parameters): self.queries = parameters['queries'] self.query_names = parameters['query_names'] self.remove_types = parameters['remove_types'] - if not self.query_names: - self.query_names = [f"query_{index}" for index in range(len(self.queries))] - elif len(self.queries) != len(self.query_names): - raise ValueError("QueryNamesLengthBad", - f"The query_names length {len(self.query_names)} must be empty or equal" + - f"to the queries length {len(self.queries)} .") - elif len(set(self.query_names)) != len(self.query_names): - raise ValueError("DuplicateQueryNames", f"The query names {str(self.query_names)} list has duplicates") - self.expression_parsers = [] - for index, query in enumerate(self.queries): - try: - next_query = QueryParser(query) - except Exception: - raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") - self.expression_parsers.append(next_query) + self.expression_parsers, self.query_names = get_expression_parsers(self.queries, + query_names=parameters['query_names']) def do_op(self, dispatcher, df, name, sidecar=None): """ Factor the column using HED tag queries. 
@@ -111,12 +99,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): df_list = [input_data.dataframe] hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None, join_columns=True, shrink_defs=False, expand_defs=True) - df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=self.query_names) - for parse_ind, parser in enumerate(self.expression_parsers): - for index, next_item in enumerate(hed_strings): - match = parser.search(next_item) - if match: - df_factors.at[index, self.query_names[parse_ind]] = 1 + df_factors = search_strings(hed_strings, self.expression_parsers, query_names=self.query_names) if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) diff --git a/tests/tools/analysis/test_analysis_util_assemble_hed.py b/tests/tools/analysis/test_analysis_util_assemble_hed.py index 318c3aa54..75d143659 100644 --- a/tests/tools/analysis/test_analysis_util_assemble_hed.py +++ b/tests/tools/analysis/test_analysis_util_assemble_hed.py @@ -3,9 +3,8 @@ from pandas import DataFrame from hed import schema as hedschema from hed.models import Sidecar, TabularInput, DefinitionDict -from hed.tools.analysis.analysis_util import assemble_hed - - +from hed.models import df_util +from hed.tools.analysis.analysis_util import assemble_hed, search_strings # noinspection PyBroadException @@ -25,7 +24,6 @@ def setUpClass(cls): schema = hedschema.load_schema(schema_path) cls.schema = schema sidecar1 = Sidecar(json_path, name='face_sub1_json') - cls.sidecar_path = sidecar1 cls.sidecar1 = sidecar1 cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar") @@ -96,27 +94,29 @@ def test_assemble_hed_bad_column_no_expand(self): self.assertNotEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag") self.assertEqual(first_str2.find('Def-expand/'), -1, 
"assemble_hed with def expand has Def-expand tags") - # def test_search_tabular(self): - # query1 = "sensory-event" - # df1 = search_tabular(self.input_data, self.schema, query1, columns_included=None) - # self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.") - # self.assertEqual(len(df1.columns), 2, "search_tabular has the right number of columns when query okay") - # self.assertEqual(len(df1.index), 155, "search_tabular has right number of rows when query okay") - # query2 = 'data-feature' - # df2 = search_tabular(self.input_data, self.hed_schema, query2, columns_included=None) - # self.assertFalse(df2, "search_tabular returns None when query is not satisfied.") - # - # query3 = "sensory-event" - # df3 = search_tabular(self.input_data, self.hed_schema, query3, columns_included=['event_type', 'rep_status']) - # self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") - # self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") - # self.assertEqual(len(df3.index), 155, "search_tabular has right number of rows when query okay") - # - # df4 = search_tabular(self.input_data, self.hed_schema, query3, - # columns_included=['onset', 'event_type', 'rep_status']) - # self.assertIsInstance(df4, DataFrame, "search_tabular returns a DataFrame when extra columns") - # self.assertEqual(len(df4.columns), 4, "search_tabular returns right number of columns when extra columns") - # self.assertEqual(len(df4.index), 155, "search_tabular has right number of rows when query okay") + def test_search_strings(self): + hed_strings, dict1 = df_util.get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None, + join_columns=True, shrink_defs=False, expand_defs=True) + queries1 = ["sensory-event"] + query_names1 = ["sensory"] + df1 = search_strings(hed_strings, queries1, query_names1) + self.assertIsInstance(df1, DataFrame, 
"search_tabular returns a dataframe when the query is satisfied.") + self.assertEqual(len(df1.columns), 1, "search_tabular has the right number of columns when query okay") + self.assertEqual(len(df1.index), 200, "search_tabular has right number of rows when query okay") + queries2 = ['data-feature', "sensory-event"] + query_names2 = ['data', 'sensory'] + df2 = search_strings(hed_strings, queries2, query_names2) + self.assertEqual(len(df2.columns), 2, "search_tabular has the right number of columns when query okay") + self.assertEqual(len(df2.index), 200, "search_tabular has right number of rows when query okay") + totals = df2.sum(axis=0) + self.assertFalse(totals.loc['data']) + self.assertEqual(totals.loc['sensory'], 155) + queries3 = ['image', "sensory-event", "face"] + query_names3 = ['image', 'sensory', "faced"] + df3 = search_strings(hed_strings, queries3, query_names3) + self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns") + self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns") + self.assertEqual(len(df3.index), 200, "search_tabular has right number of rows when query okay") if __name__ == '__main__':