diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index 804a3e0cf..61704d625 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -32,17 +32,18 @@ def df_to_hed(dataframe, description_tag=True): description_tag (bool): If True description tag is included. Returns: - dict: A dictionary compatible compatible with BIDS JSON tabular file that includes HED. + dict: A dictionary compatible with BIDS JSON tabular file that includes HED. Notes: - The DataFrame must have the columns with names: column_name, column_value, description, and HED. """ - missing_cols = check_df_columns(dataframe) + df = dataframe.fillna('n/a') + missing_cols = check_df_columns(df) if missing_cols: raise HedFileError("RequiredColumnsMissing", f"Columns {str(missing_cols)} are missing from dataframe", "") hed_dict = {} - for index, row in dataframe.iterrows(): + for index, row in df.iterrows(): if row['HED'] == 'n/a' and row['description'] == 'n/a': continue if row['column_value'] == 'n/a': diff --git a/hed/tools/analysis/event_manager.py b/hed/tools/analysis/event_manager.py index 5c778d1b8..648f96358 100644 --- a/hed/tools/analysis/event_manager.py +++ b/hed/tools/analysis/event_manager.py @@ -1,17 +1,19 @@ """ Manages events of temporal extent. """ -from hed.tools.analysis.temporal_event import TemporalEvent from hed.models.model_constants import DefTagNames +from hed.models.df_util import get_assembled +from hed.tools.analysis.temporal_event import TemporalEvent class EventManager: - def __init__(self, hed_strings, onsets, def_dict): - """ Create an event manager for an events file. Manages events of temporal extent. This + def __init__(self, input_data, hed_schema, extra_defs=None): + """ Create an event manager for an events file. Manages events of temporal extent. Parameters: - data (TabularInput): A tabular input file. 
- schema (HedSchema): A HED schema + input_data (TabularInput): Represents an events file with its sidecar. + hed_schema (HedSchema): HED schema used to assemble the HED strings and definitions. + extra_defs (DefinitionDict): Extra definitions not included in the input_data information. :raises HedFileError: - if there are any unmatched offsets. @@ -21,27 +23,44 @@ def __init__(self, hed_strings, onsets, def_dict): """ - self.event_list = [[] for _ in range(len(onsets))] - self.onsets = onsets - self.hed_strings = hed_strings - self.def_dict = def_dict - self.contexts = [] - self._create_event_list() + self.event_list = [[] for _ in range(len(input_data.dataframe))] + self.hed_schema = hed_schema + self.def_dict = input_data.get_def_dict(hed_schema, extra_def_dicts=extra_defs) + self.onsets = input_data.dataframe['onset'].tolist() + self.hed_strings = None # Remaining HED strings copy.deepcopy(hed_strings) + self.anchor_dict = {} + self._create_event_list(input_data) + self._create_anchor_list() + + # def iter_context(self): + # """ Iterate rows of context. + # + # Yields: + # int: position in the dataFrame + # HedString: Context + # + # """ + # + # for index in range(len(self.contexts)): + # yield index, self.contexts[index] + + def _create_anchor_list(self): + """ Populate the dictionary of def names to list of temporal events. - def iter_context(self): - """ Iterate rows of context. + :raises HedFileError: + - If the hed_strings contain unmatched offsets. - Yields: - int: position in the dataFrame - HedString: Context + Notes: """ + for index, events in enumerate(self.event_list): + for event in events: + index_list = self.anchor_dict.get(event.anchor, []) + index_list.append(event) + self.anchor_dict[event.anchor] = index_list - for index in range(len(self.contexts)): - yield index, self.contexts[index] - - def _create_event_list(self): - """ Create a list of events of extended duration. 
+ def _create_event_list(self, input_data): + """ Populate the event_list with the events with temporal extent indexed by event number. :raises HedFileError: - If the hed_strings contain unmatched offsets. @@ -49,12 +68,16 @@ def _create_event_list(self): Notes: """ + hed_strings, def_dict = get_assembled(input_data, input_data._sidecar, self.hed_schema, + extra_def_dicts=None, join_columns=True, + shrink_defs=True, expand_defs=False) onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet. - for event_index, hed in enumerate(self.hed_strings): + for event_index, hed in enumerate(hed_strings): self._extract_temporal_events(hed, event_index, onset_dict) # Now handle the events that extend to end of list for item in onset_dict.values(): item.set_end(len(self.onsets), None) + self.hed_strings = hed_strings def _extract_temporal_events(self, hed, event_index, onset_dict): """ Extract the temporal events and remove them from the other HED strings. diff --git a/hed/tools/analysis/event_manager_copy.py b/hed/tools/analysis/event_manager_copy.py new file mode 100644 index 000000000..9a8dd02fa --- /dev/null +++ b/hed/tools/analysis/event_manager_copy.py @@ -0,0 +1,147 @@ +""" Manages events of temporal extent. """ + +from hed.tools.analysis.temporal_event import TemporalEvent +from hed.models.model_constants import DefTagNames + + +class EventManagerCopy: + + def __init__(self, input_data, hed_schema, extra_def_dict=None): + """ Create an event manager for an events file. Manages events of temporal extent. + + Parameters: + hed_strings (list): A list of HED strings + onsets (list): A list of onset times that is the same length as hed_strings + def_dict (DefinitionDict): Contains the definitions for this dataset. + + :raises HedFileError: + - if there are any unmatched offsets. + + Notes: Keeps the events of temporal extent by their starting index in events file. These events + are separated from the rest of the annotations. 
+ + """ + + self.event_list = [[] for _ in range(len(onsets))] + self.onsets = onsets + self.hed_strings = hed_strings ## copy.deepcopy(hed_strings) + self.def_dict = def_dict + self.anchor_dict ={} + self._create_event_list() + self._create_anchor_list() + + # def iter_context(self): + # """ Iterate rows of context. + # + # Yields: + # int: position in the dataFrame + # HedString: Context + # + # """ + # + # for index in range(len(self.contexts)): + # yield index, self.contexts[index] + + def _create_anchor_list(self): + """ Populate the dictionary of def names to list of temporal events. + + :raises HedFileError: + - If the hed_strings contain unmatched offsets. + + Notes: + + """ + for index, events in enumerate(self.event_list): + for event in events: + index_list = self.anchor_dict.get(event.anchor, []) + index_list.append(event) + self.anchor_dict[event.anchor] = index_list + + def _create_event_list(self): + """ Populate the event_list with the events with temporal extent indexed by event number. + + :raises HedFileError: + - If the hed_strings contain unmatched offsets. + + Notes: + + """ + onset_dict = {} # Temporary dictionary keeping track of temporal events that haven't ended yet. + for event_index, hed in enumerate(self.hed_strings): + self._extract_temporal_events(hed, event_index, onset_dict) + # Now handle the events that extend to end of list + for item in onset_dict.values(): + item.set_end(len(self.onsets), None) + + def _extract_temporal_events(self, hed, event_index, onset_dict): + """ Extract the temporal events and remove them from the other HED strings. + + Parameters: + hed (HedString): The assembled HedString at position event_index in the data. + event_index (int): The position of this string in the data. + onset_dict (dict): Running dict that keeps track of temporal events that haven't yet ended. + + Note: + This removes the events of temporal extent from the HED string. 
+ + """ + if not hed: + return + group_tuples = hed.find_top_level_tags(anchor_tags={DefTagNames.ONSET_KEY, DefTagNames.OFFSET_KEY}, + include_groups=2) + to_remove = [] + for tup in group_tuples: + anchor_tag = tup[1].find_def_tags(recursive=False, include_groups=0)[0] + anchor = anchor_tag.extension.lower() + if anchor in onset_dict or tup[0].short_base_tag.lower() == DefTagNames.OFFSET_KEY: + temporal_event = onset_dict.pop(anchor) + temporal_event.set_end(event_index, self.onsets[event_index]) + if tup[0] == DefTagNames.ONSET_KEY: + new_event = TemporalEvent(tup[1], event_index, self.onsets[event_index]) + self.event_list[event_index].append(new_event) + onset_dict[anchor] = new_event + to_remove.append(tup[1]) + hed.remove(to_remove) + + def _set_event_contexts(self): + """ Creates an event context for each hed string. + + Notes: + The event context would be placed in an event context group, but is kept in a separate array without the + event context group or tag. + + """ + # contexts = [[] for _ in range(len(self.hed_strings))] + # for onset in self.onset_list: + # for i in range(onset.start_index+1, onset.end_index): + # contexts[i].append(onset.contents) + # for i in range(len(self.hed_strings)): + # contexts[i] = HedString(",".join(contexts[i]), hed_schema=self.hed_schema) + # self.contexts = contexts + print("_set_event_contexts not implemented yet") + + def _update_onset_list(self, group, onset_dict, event_index): + """ Process one onset or offset group to create onset_list. + + Parameters: + group (HedGroup): The HedGroup containing the onset or offset. + onset_dict (dict): A dictionary of OnsetGroup objects that keep track of span of an event. + event_index (int): The event number in the list. + + :raises HedFileError: + - if an unmatched offset is encountered. + + Notes: + - Modifies onset_dict and onset_list. 
+ """ + # def_tags = group.find_def_tags(recursive=False, include_groups=0) + # name = def_tags[0].extension + # onset_element = onset_dict.pop(name, None) + # if onset_element: + # onset_element.end_index = event_index + # self.onset_list.append(onset_element) + # elif is_offset: + # raise HedFileError("UnmatchedOffset", f"Unmatched {name} offset at event {event_index}", " ") + # if not is_offset: + # onset_element = TemporalEvent(name, group, event_index) + # onset_dict[name] = onset_element diff --git a/hed/tools/analysis/hed_context_manager.py b/hed/tools/analysis/hed_context_manager.py index da4c13e8e..79ebb428b 100644 --- a/hed/tools/analysis/hed_context_manager.py +++ b/hed/tools/analysis/hed_context_manager.py @@ -45,7 +45,6 @@ def __init__(self, hed_strings, hed_schema): self.onset_list = [] self.onset_count = 0 self.offset_count = 0 - self.contexts = [] self._create_onset_list() self._set_event_contexts() diff --git a/hed/tools/analysis/hed_type_definitions.py b/hed/tools/analysis/hed_type_definitions.py index 7bdba7422..417083e44 100644 --- a/hed/tools/analysis/hed_type_definitions.py +++ b/hed/tools/analysis/hed_type_definitions.py @@ -5,7 +5,16 @@ class HedTypeDefinitions: + """ + Properties: + def_map (dict): keys are definition names, values are dict {type_values, description, tags} + Example: A definition 'famous-face-cond' with contents + `(Condition-variable/Face-type,Description/A face that should be recognized by the participants,(Image,(Face,Famous)))` + would have type_values ['face_type']. All items are strings not objects. + + + """ def __init__(self, definitions, hed_schema, type_tag='condition-variable'): """ Create a definition manager for a type of variable. @@ -14,8 +23,6 @@ def __init__(self, definitions, hed_schema, type_tag='condition-variable'): hed_schema (Hedschema or HedSchemaGroup): The schema used for parsing. type_tag (str): Lower-case HED tag string representing the type managed. 
- # TODO: [Refactor] - should dict be allowed for definitions. - """ self.type_tag = type_tag.lower() @@ -26,8 +33,8 @@ def __init__(self, definitions, hed_schema, type_tag='condition-variable'): self.definitions = definitions else: self.definitions = {} - self.def_map = self._extract_def_map() # maps def names to conditions. - self.type_map = self._extract_type_map() + self.def_map = self._extract_def_map() + self.type_map = self._extract_type_map() # def get_type_values(self, item): """ Return a list of type_tag values in item. diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py index 72822add0..e2f7f535b 100644 --- a/hed/tools/analysis/key_map.py +++ b/hed/tools/analysis/key_map.py @@ -40,8 +40,8 @@ def __init__(self, key_cols, target_cols=None, name=''): f"Key cols {str(key_cols)} and target cols {str(target_cols)} must be disjoint", "") self.name = name self.col_map = pd.DataFrame(columns=self.key_cols + self.target_cols) - self.map_dict = {} - self.count_dict = {} + self.map_dict = {} # Index of key to position in the col_map DataFrame + self.count_dict = {} # Keeps a running count of the number of times a key appears in the data @property def columns(self): @@ -51,15 +51,15 @@ def __str__(self): temp_list = [f"{self.name} counts for key [{str(self.key_cols)}]:"] for index, row in self.col_map.iterrows(): key_hash = get_row_hash(row, self.columns) - temp_list.append(f"{str(list(row.values))}\t{self.count_dict[key_hash]}") + temp_list.append(f"{str(list(row.values))}:\t{self.count_dict[key_hash]}") return "\n".join(temp_list) - def make_template(self, additional_cols=None): + def make_template(self, additional_cols=None, show_counts=True): """ Return a dataframe template. Parameters: additional_cols (list or None): Optional list of additional columns to append to the returned dataframe. - + show_counts (bool): If True, the count of each key combination is inserted as the first column (key_counts). Returns: DataFrame: A dataframe containing the template. 
@@ -77,8 +77,17 @@ def make_template(self, additional_cols=None): df = self.col_map[self.key_cols].copy() if additional_cols: df[additional_cols] = 'n/a' + if show_counts: + df.insert(0, 'key_counts', self._get_counts()) return df + def _get_counts(self): + counts = [0 for _ in range(len(self.col_map))] + for index, row in self.col_map.iterrows(): + key_hash = get_row_hash(row, self.key_cols) + counts[index] = self.count_dict[key_hash] + return counts + def remap(self, data): """ Remap the columns of a dataframe or columnar file. @@ -134,16 +143,12 @@ def resort(self): key_hash = get_row_hash(row, self.key_cols) self.map_dict[key_hash] = index - def update(self, data, allow_missing=True, keep_counts=True): + def update(self, data, allow_missing=True): """ Update the existing map with information from data. Parameters: data (DataFrame or str): DataFrame or filename of an events file or event map. allow_missing (bool): If true allow missing keys and add as n/a columns. - keep_counts (bool): If true keep a count of the times each key is present. - - Returns: - list: The indices of duplicates. :raises HedFileError: - If there are missing keys and allow_missing is False. @@ -165,35 +170,26 @@ def update(self, data, allow_missing=True, keep_counts=True): targets_present, targets_missing = separate_values(col_list, self.target_cols) if targets_present: base_df[targets_present] = df[targets_present].values - return self._update(base_df, track_duplicates=keep_counts) + self._update(base_df) - def _update(self, base_df, track_duplicates=True): + def _update(self, base_df): """ Update the dictionary of key values based on information in the dataframe. Parameters: base_df (DataFrame): DataFrame of consisting of the columns in the KeyMap - track_duplicates (bool): If true, keep counts of the indices. - - Returns: - list: List of key positions that appeared more than once or an empty list of no duplicates or - track_duplicates was false. 
""" - duplicate_indices = [] row_list = [] next_pos = len(self.col_map) for index, row in base_df.iterrows(): - key, pos_update = self._handle_update(row, row_list, next_pos, track_duplicates) + key, pos_update = self._handle_update(row, row_list, next_pos) next_pos += pos_update - if not track_duplicates and not pos_update: - duplicate_indices.append(index) if row_list: df = pd.DataFrame(row_list) self.col_map = pd.concat([self.col_map, df], axis=0, ignore_index=True) - return duplicate_indices - def _handle_update(self, row, row_list, next_pos, keep_counts): + def _handle_update(self, row, row_list, next_pos): """ Update the dictionary and counts of the number of times this combination of key columns appears. Parameters: @@ -201,6 +197,9 @@ def _handle_update(self, row, row_list, next_pos, keep_counts): row_list (list): A list of rows to be appended to hold the unique rows next_pos (int): Index into the + Returns: + tuple: (key, pos_update) key is the row hash and pos_update is 1 if new row or 0 otherwise. 
+ """ key = get_row_hash(row, self.key_cols) pos_update = 0 @@ -208,10 +207,8 @@ def _handle_update(self, row, row_list, next_pos, keep_counts): self.map_dict[key] = next_pos row_list.append(row) pos_update = 1 - if keep_counts: - self.count_dict[key] = 0 - if keep_counts: - self.count_dict[key] += 1 + self.count_dict[key] = 0 + self.count_dict[key] = self.count_dict[key] + 1 return key, pos_update @staticmethod diff --git a/hed/tools/analysis/temporal_event.py b/hed/tools/analysis/temporal_event.py index e82f1813d..876fee6ba 100644 --- a/hed/tools/analysis/temporal_event.py +++ b/hed/tools/analysis/temporal_event.py @@ -29,4 +29,4 @@ def _split_group(self): self.anchor = item.extension.lower() def __str__(self): - return f"{self.start_index}:{self.end_index} contents:{self.contents}" + return f"[{self.start_index}:{self.end_index}] anchor:{self.anchor} contents:{self.contents}" diff --git a/tests/tools/analysis/test_event_manager.py b/tests/tools/analysis/test_event_manager.py index 1e05ed2d9..5e7937f7e 100644 --- a/tests/tools/analysis/test_event_manager.py +++ b/tests/tools/analysis/test_event_manager.py @@ -25,19 +25,12 @@ def setUpClass(cls): cls.schema = schema def test_constructor(self): - self.assertTrue(True) - hed_strings, def_dict = get_assembled(self.input_data, self.sidecar, self.schema, - extra_def_dicts=None, join_columns=True, - shrink_defs=True, expand_defs=False) - def_dict = self.sidecar.get_def_dict(self.schema) - onsets = self.input_data.dataframe["onset"].tolist() - manager1 = EventManager(hed_strings, onsets, def_dict) + manager1 = EventManager(self.input_data, self.schema) self.assertIsInstance(manager1.event_list, list) - self.assertEqual(len(manager1.event_list), len(onsets)) + self.assertEqual(len(manager1.event_list), len(self.input_data.dataframe)) self.assertEqual(len(manager1.event_list[0]), 2) self.assertIsInstance(manager1.hed_strings, list) - self.assertEqual(len(manager1.hed_strings), len(onsets)) - + 
self.assertEqual(len(manager1.hed_strings), len(self.input_data.dataframe)) # self.assertEqual(len(manager1.event_list), len(self.input_data.dataframe)) # event_count = 0 # for index, item in enumerate(manager1.event_list): diff --git a/tests/tools/analysis/test_key_map.py b/tests/tools/analysis/test_key_map.py index d15730928..d06300667 100644 --- a/tests/tools/analysis/test_key_map.py +++ b/tests/tools/analysis/test_key_map.py @@ -54,19 +54,21 @@ def test_str(self): def test_make_template(self): t_map = KeyMap(self.key_cols1) - stern_df = get_new_dataframe(self.stern_map_path) + stern_df = get_new_dataframe(self.stern_test1_path) t_map.update(stern_df) - df1 = t_map.make_template() + df1 = t_map.make_template(show_counts=False) self.assertIsInstance(df1, pd.DataFrame, "make_template should return a DataFrame") self.assertEqual(len(df1.columns), 1, "make_template should return 1 column single key, no additional columns") + df2 = t_map.make_template() + self.assertEqual(len(df2.columns), 2, "make_template returns an extra column for counts") t_map2 = KeyMap(['event_type', 'type']) - t_map2.update(self.stern_map_path) - df2 = t_map2.make_template() - self.assertIsInstance(df2, pd.DataFrame, "make_template should return a DataFrame") - self.assertEqual(len(df2.columns), 2, "make_template should return 2 columns w 2 keys, no additional columns") + t_map2.update(self.stern_test1_path) + df3 = t_map2.make_template() + self.assertIsInstance(df3, pd.DataFrame, "make_template should return a DataFrame") + self.assertEqual(len(df3.columns), 3, "make_template should return 2 columns w 2 keys, no additional columns") df3 = t_map2.make_template(['bananas', 'pears', 'apples']) self.assertIsInstance(df3, pd.DataFrame, "make_template should return a DataFrame") - self.assertEqual(len(df3.columns), 5, "make_template should return 5 columns w 2 keys, 3 additional columns") + self.assertEqual(len(df3.columns), 6, "make_template should return 5 columns w 2 keys, 3 additional 
columns") def test_make_template_key_overlap(self): t_map = KeyMap(['event_type', 'type']) @@ -124,21 +126,12 @@ def test_remap_files(self): def test_update_map(self): t_map = KeyMap(self.key_cols1, self.target_cols1) stern_df = get_new_dataframe(self.stern_map_path) - duplicates = t_map.update(stern_df) + t_map.update(stern_df) df_map = t_map.col_map df_dict = t_map.map_dict self.assertEqual(len(df_map), len(stern_df), "update map should contain all the entries") self.assertEqual(len(df_dict.keys()), len(stern_df), "update dictionary should contain all the entries") - self.assertFalse(duplicates, "update should not have any duplicates for stern map") - - def test_update_map_row_list(self): - t_map = KeyMap(self.key_cols1, self.target_cols1) - stern_df = get_new_dataframe(self.stern_map_path) - duplicates1 = t_map.update(stern_df) - duplicates2 = t_map.update(stern_df) - self.assertFalse(duplicates1) - self.assertFalse(duplicates2) def test_update_map_missing(self): keys = self.key_cols1 + ['another'] @@ -159,20 +152,12 @@ def test_update_map_missing_allowed(self): stern_df = get_new_dataframe(self.stern_map_path) t_map.update(stern_df, allow_missing=True) - def test_update_map_duplicate_keys(self): - t_map = KeyMap(self.key_cols1, self.target_cols1) - stern_df = get_new_dataframe(self.stern_test2_path) - duplicates = t_map.update(stern_df, keep_counts=False) - self.assertTrue(duplicates, "update should return a list of duplicates if repeated keys") - def test_update_map_not_unique(self): t_map = KeyMap(self.key_cols1, self.target_cols1) stern_df = get_new_dataframe(self.stern_test2_path) - duplicates = t_map.update(stern_df, keep_counts=False) + t_map.update(stern_df) self.assertEqual(len(t_map.col_map.columns), 4, "update should produce correct number of columns") - self.assertEqual(len(t_map.col_map), len(stern_df) - len(duplicates), - "update should produce the correct number of rows") - self.assertTrue(duplicates, "update using event file has duplicates") + 
self.assertEqual(len(t_map.col_map), len(t_map.count_dict), "update should produce the correct number of rows") if __name__ == '__main__': diff --git a/tests/tools/analysis/test_temporal_event.py b/tests/tools/analysis/test_temporal_event.py index e1d854611..a05545bd6 100644 --- a/tests/tools/analysis/test_temporal_event.py +++ b/tests/tools/analysis/test_temporal_event.py @@ -45,12 +45,7 @@ def test_constructor_group(self): self.assertIsInstance(te.internal_group, HedGroup) def test_constructor_on_files(self): - hed_strings, def_dict = get_assembled(self.input_data, self.sidecar, self.schema, - extra_def_dicts=None, join_columns=True, - shrink_defs=True, expand_defs=False) - def_dict = self.sidecar.get_def_dict(self.schema) - onsets = self.input_data.dataframe["onset"].tolist() - manager1 = EventManager(hed_strings, onsets, def_dict) + manager1 = EventManager(self.input_data, self.schema) event_list = manager1.event_list for events in event_list: if not events: