diff --git a/hed/tools/remodeling/operations/number_groups_op.py b/hed/tools/remodeling/operations/number_groups_op.py index 2fc7a8b31..6c8bfb0b5 100644 --- a/hed/tools/remodeling/operations/number_groups_op.py +++ b/hed/tools/remodeling/operations/number_groups_op.py @@ -1,13 +1,36 @@ -""" Implementation in progress. """ +""" Number groups of rows in a dataframe based on start and stop markers. """ +import numpy as np from hed.tools.remodeling.operations.base_op import BaseOp -# TODO: This class is under development - - class NumberGroupsOp(BaseOp): - """ Implementation in progress. """ + """ Number groups of rows in a dataframe based on start and stop + markers. + + Required remodeling parameters: + - **number_column_name** (*str*): The name of the column to add + with the group numbers. + - **source_column** (*str*): The column to check for start and + stop markers. + - **start** (*dict*): Specification for start markers. + - **values** (*list*): List of values that mark the start of + a group. + - **inclusion** (*str*): Either "include" or "exclude" to + specify whether the start marker row should be included in + the group. + - **stop** (*dict*): Specification for stop markers. + - **values** (*list*): List of values that mark the end of + a group. + - **inclusion** (*str*): Either "include" or "exclude" to + specify whether the stop marker row should be included in + the group. + + Optional remodeling parameters: + - **overwrite** (*bool*): If true, overwrite an existing column + with the same name. + + """ NAME = "number_groups" PARAMS = { @@ -83,50 +106,105 @@ def __init__(self, parameters): self.overwrite = parameters.get('overwrite', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Add numbers to groups of events in dataframe. + """ Add numbers to groups of rows in the events dataframe. Parameters: dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. 
- name (str): Unique identifier for the dataframe -- often the original file path. + name (str): Unique identifier for the dataframe -- often the + original file path. sidecar (Sidecar or file-like): Only needed for HED operations. Returns: Dataframe: A new dataframe after processing. """ - # check if number_column_name exists and if so, check overwrite setting + # check if number_column_name exists and if so, check overwrite + # setting if self.number_column_name in df.columns: if self.overwrite is False: - raise ValueError("ExistingNumberColumn", - f"Column {self.number_column_name} already exists in event file.", "") + raise ValueError( + "ExistingNumberColumn", + f"Column {self.number_column_name} already exists " + f"in event file.", "") # check if source_column exists if self.source_column not in df.columns: - raise ValueError("MissingSourceColumn", - f"Column {self.source_column} does not exist in event file {name}.", "") + raise ValueError( + "MissingSourceColumn", + f"Column {self.source_column} does not exist in event " + f"file {name}.", "") - # check if all elements in value lists start and stop exist in the source_column + # check if all elements in value lists start and stop exist in + # the source_column missing = [] for element in self.start['values']: if element not in df[self.source_column].tolist(): missing.append(element) if len(missing) > 0: - raise ValueError("MissingValue", - f"Start value(s) {missing} does not exist in {self.source_column} of event file {name}") + raise ValueError( + "MissingValue", + f"Start value(s) {missing} does not exist in " + f"{self.source_column} of event file {name}") missing = [] for element in self.stop['values']: if element not in df[self.source_column].tolist(): missing.append(element) if len(missing) > 0: - raise ValueError("MissingValue", - f"Start value(s) {missing} does not exist in {self.source_column} of event file {name}") + raise ValueError( + "MissingValue", + f"Stop value(s) {missing} does not exist 
in " + f"{self.source_column} of event file {name}") df_new = df.copy() + df_new[self.number_column_name] = np.nan + + # Track current group number and whether we're inside a group + current_group = 0 + in_group = False + + for idx in range(len(df_new)): + # Use the original df to read source values in case we're + # overwriting the source column + value = df.iloc[idx][self.source_column] + + # Check if this is a start marker + if value in self.start['values']: + if not in_group: # Start a new group only if not already + # in one + current_group += 1 + in_group = True + if self.start['inclusion'] == 'include': + df_new.at[idx, self.number_column_name] = \ + current_group + # If already in a group and this is a start marker: + # - If inclusion is 'exclude', it acts as both end and + # start + elif self.start['inclusion'] == 'exclude': + # This marker ends the previous group and starts a + # new one + current_group += 1 + # Don't assign the number to this row (it's excluded) + continue + + # Check if this is a stop marker + if value in self.stop['values']: + if in_group: + if self.stop['inclusion'] == 'include': + df_new.at[idx, self.number_column_name] = \ + current_group + in_group = False + continue + + # Regular row - if in group, assign current group number + if in_group: + df_new.at[idx, self.number_column_name] = current_group + return df_new @staticmethod def validate_input_data(parameters): - """ Additional validation required of operation parameters not performed by JSON schema validator. """ + """ Additional validation required of operation parameters not + performed by JSON schema validator. """ return [] diff --git a/hed/tools/remodeling/operations/number_rows_op.py b/hed/tools/remodeling/operations/number_rows_op.py index e122be27e..fb327a68c 100644 --- a/hed/tools/remodeling/operations/number_rows_op.py +++ b/hed/tools/remodeling/operations/number_rows_op.py @@ -1,12 +1,25 @@ -""" Implementation in progress. 
""" +""" Number rows in a dataframe based on optional criteria. """ +import numpy as np from hed.tools.remodeling.operations.base_op import BaseOp -# TODO: This class is under development - class NumberRowsOp(BaseOp): - """ Implementation in progress. """ + """ Number rows in a dataframe based on optional criteria. + + Required remodeling parameters: + - **number_column_name** (*str*): The name of the column to add + with the row numbers. + + Optional remodeling parameters: + - **overwrite** (*bool*): If true, overwrite an existing column + with the same name. + - **match_value** (*dict*): If provided, only number rows where + the specified column matches the specified value. + - **column** (*str*): The column name to match. + - **value** (*str* or *number*): The value to match. + + """ NAME = "number_rows" PARAMS = { @@ -51,13 +64,14 @@ def __init__(self, parameters): self.match_value = parameters.get('match_value', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Add numbers events dataframe. + """ Add numbers to rows in the events dataframe. Parameters: dispatcher (Dispatcher): Manages operation I/O. df (DataFrame): - The DataFrame to be remodeled. - name (str): - Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + name (str): - Unique identifier for the dataframe -- often + the original file path. + sidecar (Sidecar or file-like): Only needed for HED operations. Returns: Dataframe: A new dataframe after processing. 
@@ -65,30 +79,39 @@ def do_op(self, dispatcher, df, name, sidecar=None): """ if self.number_column_name in df.columns: if self.overwrite is False: - raise ValueError("ExistingNumberColumn", - f"Column {self.number_column_name} already exists in event file.", "") + raise ValueError( + "ExistingNumberColumn", + f"Column {self.number_column_name} already exists " + f"in event file.", "") if self.match_value: if self.match_value['column'] not in df.columns: - raise ValueError("MissingMatchColumn", - f"Column {self.match_value['column']} does not exist in event file.", "") - if self.match_value['value'] not in df[self.match_value['column']].tolist(): - raise ValueError("MissingMatchValue", - f"Value {self.match_value['value']} does not exist in event file column" - f"{self.match_value['column']}.", "") + raise ValueError( + "MissingMatchColumn", + f"Column {self.match_value['column']} does not " + f"exist in event file.", "") + if self.match_value['value'] not in \ + df[self.match_value['column']].tolist(): + raise ValueError( + "MissingMatchValue", + f"Value {self.match_value['value']} does not exist " + f"in event file column " + f"{self.match_value['column']}.", "") df_new = df.copy() - # df_new[self.number_column_name] = np.nan - # if self.match_value: - # filter = df[self.match_value['column']] == self.match_value['value'] - # numbers = [*range(1, sum(filter)+1)] - # df_new.loc[filter, self.number_column_name] = numbers - # else: - # df_new[self.number_column_name] = df_new.index + 1 + df_new[self.number_column_name] = np.nan + if self.match_value: + filter_mask = \ + df[self.match_value['column']] == self.match_value['value'] + numbers = [*range(1, sum(filter_mask)+1)] + df_new.loc[filter_mask, self.number_column_name] = numbers + else: + df_new[self.number_column_name] = df_new.index + 1 return df_new @staticmethod def validate_input_data(parameters): - """ Additional validation required of operation parameters not performed by JSON schema validator. 
""" + """ Additional validation required of operation parameters not + performed by JSON schema validator. """ return [] diff --git a/tests/tools/remodeling/operations/test_number_groups.py b/tests/tools/remodeling/operations/test_number_groups.py index 6aa4cad9f..092c5d618 100644 --- a/tests/tools/remodeling/operations/test_number_groups.py +++ b/tests/tools/remodeling/operations/test_number_groups.py @@ -1,5 +1,7 @@ from copy import deepcopy import json +import numpy as np +import pandas as pd import unittest from hed.tools.remodeling.operations.number_groups_op import NumberGroupsOp @@ -151,54 +153,58 @@ def tearDownClass(cls): # test working def test_number_groups_new_column(self): - pass # Test when new column name is given with overwrite unspecified (=False) - # parms = json.loads(self.json_parms) - # op = NumberGroupsOp(parms) - # df = pd.DataFrame(self.sample_data, columns=self.sample_columns) - # df_check = pd.DataFrame(self.numbered_data, columns=self.numbered_columns) - # df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns) - # df_new = op.do_op(self.dispatcher, df_test, self.file_name) - # - # self.assertTrue(list(df_new.columns) == list(self.numbered_columns), - # "numbered_events should have the expected columns") - # self.assertTrue(len(df_new) == len(df_test), - # "numbered_events should have same length as original dataframe") - # self.assertTrue(np.nanmax(df_new["number"]) == 5.0, - # "max value in numbered_events should match the number of groups") - # - # # fill na to match postprocessing dispatcher - # df_new = df_new.fillna('n/a') - # self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()), - # "numbered_events should not differ from check") - # - # # Test that df has not been changed by the op - # self.assertTrue(list(df.columns) == list(df_test.columns), - # "number_rows should not change the input df columns") - # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), - # "number_rows should not 
change the input df values") - # - # def test_existing_column_overwrite_true(self): - # # Test when existing column name is given with overwrite True - # parms = json.loads(self.json_overwrite_true_parms) - # op = NumberGroupsOp(parms) - # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # df_check = pd.DataFrame(self.overwritten_data, columns=self.existing_sample_columns) - # df_new = op.do_op(self.dispatcher, df_test, self.file_name) - # - # self.assertTrue(list(df_new.columns) == list(self.existing_sample_columns), - # "numbered_events should have the same columns as original dataframe in case of overwrite") - # self.assertTrue(len(df_new) == len(df_test), - # "numbered_events should have same length as original dataframe") - # self.assertTrue(np.nanmax(df_new["number"]) == 5.0, - # "max value in numbered_events should match the number of groups") - # df_new = df_new.fillna('n/a') - # self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()), - # "numbered_events should not differ from check") - # - # # Test that df has not been changed by the op - # self.assertTrue(list(df.columns) == list(df_test.columns), - # "split_rows should not change the input df columns") - # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), - # "split_rows should not change the input df values") + parms = json.loads(self.json_parms) + op = NumberGroupsOp(parms) + df = pd.DataFrame(self.sample_data, columns=self.sample_columns) + df_check = pd.DataFrame(self.numbered_data, columns=self.numbered_columns) + df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns) + df_new = op.do_op(self.dispatcher, df_test, self.file_name) + + self.assertTrue(list(df_new.columns) == list(self.numbered_columns), + "numbered_events should have the expected columns") + self.assertTrue(len(df_new) == len(df_test), + "numbered_events should have same length 
as original dataframe") + self.assertTrue(np.nanmax(df_new["number"]) == 5.0, + "max value in numbered_events should match the number of groups") + + # fill na to match postprocessing dispatcher + df_new = df_new.fillna('n/a') + self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()), + "numbered_events should not differ from check") + + # Test that df has not been changed by the op + self.assertTrue(list(df.columns) == list(df_test.columns), + "number_groups should not change the input df columns") + self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), + "number_groups should not change the input df values") + + def test_existing_column_overwrite_true(self): + # Test when existing column name is given with overwrite True + parms = json.loads(self.json_overwrite_true_parms) + op = NumberGroupsOp(parms) + df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) + df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) + df_check = pd.DataFrame(self.overwritten_data, columns=self.existing_sample_columns) + df_new = op.do_op(self.dispatcher, df_test, self.file_name) + + self.assertTrue(list(df_new.columns) == list(self.existing_sample_columns), + "numbered_events should have the same columns as original dataframe in case of overwrite") + self.assertTrue(len(df_new) == len(df_test), + "numbered_events should have same length as original dataframe") + self.assertTrue(np.nanmax(df_new["number"]) == 5.0, + "max value in numbered_events should match the number of groups") + df_new = df_new.fillna('n/a') + self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()), + "numbered_events should not differ from check") + + # Test that df has not been changed by the op + self.assertTrue(list(df.columns) == list(df_test.columns), + "number_groups should not change the input df columns") + self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), + "number_groups should not change the input df values") 
+ + +if __name__ == '__main__': + unittest.main() + diff --git a/tests/tools/remodeling/operations/test_number_rows_op.py b/tests/tools/remodeling/operations/test_number_rows_op.py index 26cf50acc..6c446e872 100644 --- a/tests/tools/remodeling/operations/test_number_rows_op.py +++ b/tests/tools/remodeling/operations/test_number_rows_op.py @@ -1,4 +1,6 @@ import json +import numpy as np +import pandas as pd import unittest from hed.tools.remodeling.operations.number_rows_op import NumberRowsOp @@ -178,26 +180,26 @@ def test_number_rows_new_column(self): parms = json.loads(self.json_parms) op = NumberRowsOp(parms) self.assertIsInstance(op, NumberRowsOp) - # df = pd.DataFrame(self.sample_data, columns=self.sample_columns) - # df_check = pd.DataFrame(self.numbered_data, columns=self.numbered_columns) - # df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns) - # df_new = op.do_op(self.dispatcher, df_test, self.file_name) - # df_new = df_new.fillna('n/a') - - # self.assertTrue(list(df_new.columns) == list(df_check.columns), - # "numbered_events should have the expected columns") - # self.assertTrue(len(df_new) == len(df_test), - # "numbered_events should have same length as original dataframe") - # self.assertTrue(all([i + 1 == value for (i, value) in enumerate(df_new[parms['number_column_name']])]), - # "event should be numbered consecutively from 1 to length of the dataframe") - # self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()), - # "numbered_events should not differ from check") - - # # Test that df has not been changed by the op - # self.assertTrue(list(df.columns) == list(df_test.columns), - # "number_rows should not change the input df columns") - # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), - # "number_rows should not change the input df values") + df = pd.DataFrame(self.sample_data, columns=self.sample_columns) + df_check = pd.DataFrame(self.numbered_data, columns=self.numbered_columns) + df_test = 
pd.DataFrame(self.sample_data, columns=self.sample_columns) + df_new = op.do_op(self.dispatcher, df_test, self.file_name) + df_new = df_new.fillna('n/a') + + self.assertTrue(list(df_new.columns) == list(df_check.columns), + "numbered_events should have the expected columns") + self.assertTrue(len(df_new) == len(df_test), + "numbered_events should have same length as original dataframe") + self.assertTrue(all([i + 1 == value for (i, value) in enumerate(df_new[parms['number_column_name']])]), + "event should be numbered consecutively from 1 to length of the dataframe") + self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()), + "numbered_events should not differ from check") + + # Test that df has not been changed by the op + self.assertTrue(list(df.columns) == list(df_test.columns), + "number_rows should not change the input df columns") + self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), + "number_rows should not change the input df values") if __name__ == '__main__':