Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions hed/models/df_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_assembled(tabular_file, sidecar, hed_schema, extra_def_dicts=None, join_
expand_defs: bool
Expand any def tags found
Returns:
tuple: A list of HedStrings, or a list of lists of HedStrings, DefinitionDict
tuple: A list of HedStrings or a list of lists of HedStrings, DefinitionDict

"""
if isinstance(sidecar, str):
Expand Down Expand Up @@ -76,13 +76,13 @@ def convert_to_form(df, hed_schema, tag_form, columns=None):


def shrink_defs(df, hed_schema, columns=None):
""" Shrinks any def-expand tags found in the dataframe.
""" Shrinks any def-expand tags found in the specified columns in the dataframe.

Converts in place
Parameters:
df (pd.Dataframe or pd.Series): The dataframe or series to modify
hed_schema (HedSchema or None): The schema to use to identify defs.
columns (list or None): The columns to modify on the dataframe
columns (list or None): The columns to modify on the dataframe.
"""
if isinstance(df, pd.Series):
mask = df.str.contains('Def-expand/', case=False)
Expand Down
68 changes: 66 additions & 2 deletions hed/tools/analysis/analysis_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from hed.models.hed_tag import HedTag
from hed.models.hed_group import HedGroup
from hed.models import df_util
from hed.models import QueryParser


def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs=False):
Expand Down Expand Up @@ -44,6 +45,68 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs
return df, definitions


def get_expression_parsers(queries, query_names=None):
    """ Return a list of expression parsers and the matching query names.

    Parameters:
        queries (list): A list of query strings or QueryParser objects.
        query_names (list or None): A list of column names for results of queries.
            If missing or empty, the names query_0, query_1, etc. are generated.

    Returns:
        tuple:
            - list: One parser per query. String queries are converted with QueryParser;
              any other non-empty entry is passed through unchanged.
            - list: The query names (either those passed in or the generated defaults).

    Raises:
        ValueError: If the number of query names does not match the number of queries,
            if the query names contain duplicates, if a query is empty,
            or if a query string cannot be parsed.

    """
    if not query_names:
        # Default names are zero-based: query_0, query_1, ...
        query_names = [f"query_{index}" for index in range(len(queries))]
    elif len(queries) != len(query_names):
        raise ValueError("QueryNamesLengthBad",
                         f"The query_names length {len(query_names)} must be empty or equal " +
                         f"to the queries length {len(queries)}.")
    elif len(set(query_names)) != len(query_names):
        raise ValueError("DuplicateQueryNames", f"The query names {str(query_names)} list has duplicates")

    expression_parsers = []
    for index, query in enumerate(queries):
        if not query:
            raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be empty")
        if isinstance(query, str):
            try:
                next_query = QueryParser(query)
            except Exception as ex:
                # Chain the original error so the underlying parse failure is not lost.
                raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed") from ex
        else:
            # Anything that is not a string is treated as an already constructed parser.
            next_query = query
        expression_parsers.append(next_query)
    return expression_parsers, query_names


def search_strings(hed_strings, queries, query_names=None):
    """ Return a DataFrame of factors based on results of queries.

    Parameters:
        hed_strings (list): A list of HedString objects. None entries are left as 0 in every column.
        queries (list): A list of query strings or QueryParser objects.
        query_names (list or None): A list of column names for results of queries.
            If missing or empty, the names are generated by get_expression_parsers.

    Returns:
        DataFrame: One row per entry of hed_strings and one column per query, holding 1 where
            the query's search of that entry returned a truthy result and 0 otherwise.

    Raises:
        ValueError: If the queries or query names are rejected by get_expression_parsers.

    """

    expression_parsers, query_names = get_expression_parsers(queries, query_names=query_names)
    df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=query_names)
    # get_expression_parsers guarantees one name per parser, so the two lists pair up exactly.
    for column_name, parser in zip(query_names, expression_parsers):
        for index, next_item in enumerate(hed_strings):
            if next_item is None:
                # Nothing to search; the factor stays at its initial 0.
                continue
            if parser.search(next_item):
                df_factors.at[index, column_name] = 1
    return df_factors

# def get_assembled_strings(table, hed_schema=None, expand_defs=False):
# """ Return HED string objects for a tabular file.
#
Expand All @@ -61,7 +124,7 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs
# return hed_list
#

# def search_tabular(data_input, hed_schema, query, columns_included=None):
# def search_tabular(data_input, sidecar, hed_schema, query, extra_def_dicts=None, columns_included=None):
# """ Return a dataframe with results of query.
#
# Parameters:
Expand All @@ -76,7 +139,8 @@ def assemble_hed(data_input, sidecar, schema, columns_included=None, expand_defs
# """
#
# eligible_columns, missing_columns = separate_values(list(data_input.dataframe.columns), columns_included)
# hed_list = get_assembled_strings(data_input, hed_schema=hed_schema, expand_defs=True)
# hed_list, definitions = df_util.get_assembled(data_input, sidecar, hed_schema, extra_def_dicts=None, join_columns=True,
# shrink_defs=False, expand_defs=True)
# expression = QueryParser(query)
# hed_tags = []
# row_numbers = []
Expand Down
25 changes: 4 additions & 21 deletions hed/tools/remodeling/operations/factor_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from hed.models.sidecar import Sidecar
from hed.models.expression_parser import QueryParser
from hed.models.df_util import get_assembled
from hed.tools.analysis.analysis_util import get_expression_parsers, search_strings


class FactorHedTagsOp(BaseOp):
Expand Down Expand Up @@ -65,21 +66,8 @@ def __init__(self, parameters):
self.queries = parameters['queries']
self.query_names = parameters['query_names']
self.remove_types = parameters['remove_types']
if not self.query_names:
self.query_names = [f"query_{index}" for index in range(len(self.queries))]
elif len(self.queries) != len(self.query_names):
raise ValueError("QueryNamesLengthBad",
f"The query_names length {len(self.query_names)} must be empty or equal" +
f"to the queries length {len(self.queries)} .")
elif len(set(self.query_names)) != len(self.query_names):
raise ValueError("DuplicateQueryNames", f"The query names {str(self.query_names)} list has duplicates")
self.expression_parsers = []
for index, query in enumerate(self.queries):
try:
next_query = QueryParser(query)
except Exception:
raise ValueError("BadQuery", f"Query [{index}]: {query} cannot be parsed")
self.expression_parsers.append(next_query)
self.expression_parsers, self.query_names = get_expression_parsers(self.queries,
query_names=parameters['query_names'])

def do_op(self, dispatcher, df, name, sidecar=None):
""" Factor the column using HED tag queries.
Expand Down Expand Up @@ -111,12 +99,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
df_list = [input_data.dataframe]
hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None,
join_columns=True, shrink_defs=False, expand_defs=True)
df_factors = pd.DataFrame(0, index=range(len(hed_strings)), columns=self.query_names)
for parse_ind, parser in enumerate(self.expression_parsers):
for index, next_item in enumerate(hed_strings):
match = parser.search(next_item)
if match:
df_factors.at[index, self.query_names[parse_ind]] = 1
df_factors = search_strings(hed_strings, self.expression_parsers, query_names=self.query_names)
if len(df_factors.columns) > 0:
df_list.append(df_factors)
df_new = pd.concat(df_list, axis=1)
Expand Down
50 changes: 25 additions & 25 deletions tests/tools/analysis/test_analysis_util_assemble_hed.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
from pandas import DataFrame
from hed import schema as hedschema
from hed.models import Sidecar, TabularInput, DefinitionDict
from hed.tools.analysis.analysis_util import assemble_hed


from hed.models import df_util
from hed.tools.analysis.analysis_util import assemble_hed, search_strings


# noinspection PyBroadException
Expand All @@ -25,7 +24,6 @@ def setUpClass(cls):
schema = hedschema.load_schema(schema_path)
cls.schema = schema
sidecar1 = Sidecar(json_path, name='face_sub1_json')
cls.sidecar_path = sidecar1
cls.sidecar1 = sidecar1
cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events")
cls.input_data_no_sidecar = TabularInput(events_path, name="face_sub1_events_no_sidecar")
Expand Down Expand Up @@ -96,27 +94,29 @@ def test_assemble_hed_bad_column_no_expand(self):
self.assertNotEqual(first_str2.find('Def/'), -1, "assemble_hed with def expand has no Def tag")
self.assertEqual(first_str2.find('Def-expand/'), -1, "assemble_hed with def expand has Def-expand tags")

# def test_search_tabular(self):
# query1 = "sensory-event"
# df1 = search_tabular(self.input_data, self.schema, query1, columns_included=None)
# self.assertIsInstance(df1, DataFrame, "search_tabular returns a dataframe when the query is satisfied.")
# self.assertEqual(len(df1.columns), 2, "search_tabular has the right number of columns when query okay")
# self.assertEqual(len(df1.index), 155, "search_tabular has right number of rows when query okay")
# query2 = 'data-feature'
# df2 = search_tabular(self.input_data, self.hed_schema, query2, columns_included=None)
# self.assertFalse(df2, "search_tabular returns None when query is not satisfied.")
#
# query3 = "sensory-event"
# df3 = search_tabular(self.input_data, self.hed_schema, query3, columns_included=['event_type', 'rep_status'])
# self.assertIsInstance(df3, DataFrame, "search_tabular returns a DataFrame when extra columns")
# self.assertEqual(len(df3.columns), 3, "search_tabular returns right number of columns when extra columns")
# self.assertEqual(len(df3.index), 155, "search_tabular has right number of rows when query okay")
#
# df4 = search_tabular(self.input_data, self.hed_schema, query3,
# columns_included=['onset', 'event_type', 'rep_status'])
# self.assertIsInstance(df4, DataFrame, "search_tabular returns a DataFrame when extra columns")
# self.assertEqual(len(df4.columns), 4, "search_tabular returns right number of columns when extra columns")
# self.assertEqual(len(df4.index), 155, "search_tabular has right number of rows when query okay")
def test_search_strings(self):
    """ Check the shape and totals of the factor DataFrame returned by search_strings. """
    # Assemble the HED strings once with definitions expanded; the definitions are not needed here.
    hed_strings, _ = df_util.get_assembled(self.input_data, self.sidecar1, self.schema, extra_def_dicts=None,
                                           join_columns=True, shrink_defs=False, expand_defs=True)

    # A single query produces a single factor column.
    queries1 = ["sensory-event"]
    query_names1 = ["sensory"]
    df1 = search_strings(hed_strings, queries1, query_names1)
    self.assertIsInstance(df1, DataFrame, "search_strings returns a DataFrame for a single query")
    self.assertEqual(len(df1.columns), 1, "search_strings has one column for a single query")
    self.assertEqual(len(df1.index), 200, "search_strings has one row per HED string for a single query")

    # Two queries: the first is expected to match no rows, the second to match 155 rows.
    queries2 = ['data-feature', "sensory-event"]
    query_names2 = ['data', 'sensory']
    df2 = search_strings(hed_strings, queries2, query_names2)
    self.assertEqual(len(df2.columns), 2, "search_strings has one column per query for two queries")
    self.assertEqual(len(df2.index), 200, "search_strings has one row per HED string for two queries")
    totals = df2.sum(axis=0)
    self.assertFalse(totals.loc['data'], "search_strings has no matches for a query that is not satisfied")
    self.assertEqual(totals.loc['sensory'], 155, "search_strings has the right number of matches")

    # Column names need not equal the query text.
    queries3 = ['image', "sensory-event", "face"]
    query_names3 = ['image', 'sensory', "faced"]
    df3 = search_strings(hed_strings, queries3, query_names3)
    self.assertIsInstance(df3, DataFrame, "search_strings returns a DataFrame for three queries")
    self.assertEqual(len(df3.columns), 3, "search_strings has one column per query for three queries")
    self.assertEqual(len(df3.index), 200, "search_strings has one row per HED string for three queries")


if __name__ == '__main__':
Expand Down