diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py index 48f862937..24dcddd08 100644 --- a/hed/tools/remodeling/dispatcher.py +++ b/hed/tools/remodeling/dispatcher.py @@ -203,7 +203,7 @@ def parse_operations(operation_list): @staticmethod def prep_data(df): - """ Replace all n/a entries in the data frame by np.NaN for processing. + """ Make a copy and replace all n/a entries in the data frame by np.NaN for processing. Parameters: df (DataFrame) - The DataFrame to be processed. diff --git a/hed/tools/remodeling/operations/base_op.py b/hed/tools/remodeling/operations/base_op.py index a524dca26..15423d64d 100644 --- a/hed/tools/remodeling/operations/base_op.py +++ b/hed/tools/remodeling/operations/base_op.py @@ -77,7 +77,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): """ - return df + return df.copy() @staticmethod def _check_list_type(param_value, param_type): diff --git a/hed/tools/remodeling/operations/factor_column_op.py b/hed/tools/remodeling/operations/factor_column_op.py index 953c327ed..e01a81d8b 100644 --- a/hed/tools/remodeling/operations/factor_column_op.py +++ b/hed/tools/remodeling/operations/factor_column_op.py @@ -60,7 +60,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: DataFrame: A new DataFrame with the factor columns appended. diff --git a/hed/tools/remodeling/operations/merge_consecutive_op.py b/hed/tools/remodeling/operations/merge_consecutive_op.py index 01a526a7a..9ce7a16d7 100644 --- a/hed/tools/remodeling/operations/merge_consecutive_op.py +++ b/hed/tools/remodeling/operations/merge_consecutive_op.py @@ -62,7 +62,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. diff --git a/hed/tools/remodeling/operations/remap_columns_op.py b/hed/tools/remodeling/operations/remap_columns_op.py index 480df8220..c83315795 100644 --- a/hed/tools/remodeling/operations/remap_columns_op.py +++ b/hed/tools/remodeling/operations/remap_columns_op.py @@ -100,7 +100,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -109,12 +109,13 @@ def do_op(self, dispatcher, df, name, sidecar=None): - If ignore_missing is false and source values from the data are not in the map. """ - df[self.source_columns] = df[self.source_columns].replace(np.NaN, 'n/a') + df1 = df.copy() + df1[self.source_columns] = df1[self.source_columns].replace(np.NaN, 'n/a') for column in self.integer_sources: - int_mask = df[column] != 'n/a' - df.loc[int_mask, column] = df.loc[int_mask, column].astype(int) - df[self.source_columns] = df[self.source_columns].astype(str) - df_new, missing = self.key_map.remap(df) + int_mask = df1[column] != 'n/a' + df1.loc[int_mask, column] = df1.loc[int_mask, column].astype(int) + df1[self.source_columns] = df1[self.source_columns].astype(str) + df_new, missing = self.key_map.remap(df1) if missing and not self.ignore_missing: raise ValueError("MapSourceValueMissing", f"{name}: Ignore missing is false, but source values [{missing}] are in data but not map") diff --git a/hed/tools/remodeling/operations/remove_columns_op.py b/hed/tools/remodeling/operations/remove_columns_op.py index 0a941ca5d..b0833cd1d 100644 --- a/hed/tools/remodeling/operations/remove_columns_op.py +++ b/hed/tools/remodeling/operations/remove_columns_op.py @@ -49,7 +49,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -58,10 +58,11 @@ def do_op(self, dispatcher, df, name, sidecar=None): - If ignore_missing is False and a column not in the data is to be removed. """ - + df_new = df.copy() try: - return df.drop(self.column_names, axis=1, errors=self.error_handling) + return df_new.drop(self.column_names, axis=1, errors=self.error_handling) except KeyError: raise KeyError("MissingColumnCannotBeRemoved", f"{name}: Ignore missing is False but a column in {str(self.column_names)} is " - f"not in the data columns [{str(df.columns)}]") + f"not in the data columns [{str(df_new.columns)}]") + return df_new diff --git a/hed/tools/remodeling/operations/remove_rows_op.py b/hed/tools/remodeling/operations/remove_rows_op.py index 2e684d2dd..217fb7934 100644 --- a/hed/tools/remodeling/operations/remove_rows_op.py +++ b/hed/tools/remodeling/operations/remove_rows_op.py @@ -46,15 +46,15 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. """ - - if self.column_name not in df.columns: - return df + df_new = df.copy() + if self.column_name not in df_new.columns: + return df_new for value in self.remove_values: - df = df.loc[df[self.column_name] != value, :] - return df + df_new = df_new.loc[df_new[self.column_name] != value, :] + return df_new diff --git a/hed/tools/remodeling/operations/rename_columns_op.py b/hed/tools/remodeling/operations/rename_columns_op.py index adc283c20..2a2f275a9 100644 --- a/hed/tools/remodeling/operations/rename_columns_op.py +++ b/hed/tools/remodeling/operations/rename_columns_op.py @@ -49,7 +49,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -58,9 +58,9 @@ def do_op(self, dispatcher, df, name, sidecar=None): - When ignore_missing is false and column_mapping has columns not in the data. """ - + df_new = df.copy() try: - return df.rename(columns=self.column_mapping, errors=self.error_handling) + return df_new.rename(columns=self.column_mapping, errors=self.error_handling) except KeyError: raise KeyError("MappedColumnsMissingFromData", f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]" diff --git a/hed/tools/remodeling/operations/reorder_columns_op.py b/hed/tools/remodeling/operations/reorder_columns_op.py index 6ae71b179..9607bb295 100644 --- a/hed/tools/remodeling/operations/reorder_columns_op.py +++ b/hed/tools/remodeling/operations/reorder_columns_op.py @@ -48,7 +48,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. @@ -57,17 +57,17 @@ def do_op(self, dispatcher, df, name, sidecar=None): - When ignore_missing is false and column_order has columns not in the data. """ - - current_columns = list(df.columns) - missing_columns = set(self.column_order).difference(set(df.columns)) + df_new = df.copy() + current_columns = list(df_new.columns) + missing_columns = set(self.column_order).difference(set(df_new.columns)) ordered = self.column_order if missing_columns and not self.ignore_missing: raise ValueError("MissingReorderedColumns", f"{str(missing_columns)} are not in dataframe columns " - f" [{str(df.columns)}] and not ignored.") + f" [{str(df_new.columns)}] and not ignored.") elif missing_columns: ordered = [elem for elem in self.column_order if elem not in list(missing_columns)] if self.keep_others: ordered += [elem for elem in current_columns if elem not in ordered] - df = df.loc[:, ordered] - return df + df_new = df_new.loc[:, ordered] + return df_new diff --git a/hed/tools/remodeling/operations/split_rows_op.py b/hed/tools/remodeling/operations/split_rows_op.py index e96e8b490..858ce7e28 100644 --- a/hed/tools/remodeling/operations/split_rows_op.py +++ b/hed/tools/remodeling/operations/split_rows_op.py @@ -51,7 +51,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: Dataframe: A new dataframe after processing. diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index d185d08b2..ed6082a45 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -49,28 +49,28 @@ def __init__(self, parameters): self.append_timecode = parameters.get('append_timecode', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Create a column name summary for df. Parameters: dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the relevant summary. """ - + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = ColumnNameSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({"name": name, "column_names": list(df.columns)}) - return df + summary.update_summary({"name": name, "column_names": list(df_new.columns)}) + return df_new class ColumnNameSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index 539bfe2bd..dc13790c7 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -65,28 +65,29 @@ def __init__(self, parameters): self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Create a summary of the column values in df. Parameters: dispatcher (Dispatcher): Manages the operation I/O. df (DataFrame): The DataFrame to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the relevant summary. """ - + + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = ColumnValueSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name}) - return df + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name}) + return df_new class ColumnValueSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py index 4e9407969..3169d63d0 100644 --- a/hed/tools/remodeling/operations/summarize_definitions_op.py +++ b/hed/tools/remodeling/operations/summarize_definitions_op.py @@ -58,17 +58,18 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Only needed for HED operations. Returns: - DataFrame: the same datafarme + DataFrame: a copy of df Side-effect: Updates the relevant summary. """ + df_new = df.copy() summary = dispatcher.summary_dicts.setdefault(self.summary_name, DefinitionSummary(self, dispatcher.hed_schema)) - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, 'sidecar': sidecar, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'sidecar': sidecar, 'schema': dispatcher.hed_schema}) - return df + return df_new class DefinitionSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 2c24de8ef..d74d87de6 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -63,7 +63,7 @@ def __init__(self, parameters): self.expand_context = parameters.get('expand_context', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Summarize the HED tags present in the dataset. Parameters: dispatcher (Dispatcher): Manages the operation I/O. @@ -72,19 +72,20 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Only needed for HED operations. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the context. """ + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = HedTagSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) - return df + return df_new class HedTagSummary(BaseSummary): @@ -100,7 +101,7 @@ def update_summary(self, new_info): Parameters: new_info (dict): A dictionary with the parameters needed to update a summary. - Notes: + Notes: - The summary needs a "name" str, a "schema", a "df, and a "Sidecar". """ diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 6a37b0578..9a27d22d2 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -67,19 +67,20 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Usually required unless event file has a HED column. Returns: - DataFrame: Input DataFrame, unchanged. + DataFrame: A copy of df Side-effect: Updates the relevant summary. """ + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = HedTypeSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) - return df + return df_new class HedTypeSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index 29812273d..d643e533d 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -24,7 +24,7 @@ class SummarizeHedValidationOp(BaseOp): "operation": "summarize_hed_validation", "required_parameters": { "summary_name": str, - "summary_filename": str + "summary_filename": str }, "optional_parameters": { "append_timecode": bool, @@ -64,19 +64,20 @@ def do_op(self, dispatcher, df, name, sidecar=None): sidecar (Sidecar or file-like): Usually needed unless only HED tags in HED column of event file. Returns: - DataFrame: Input DataFrame, unchanged. + DataFrame: A copy of df Side-effect: Updates the relevant summary. """ + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = HedValidationSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) - return df + return df_new class HedValidationSummary(BaseSummary): diff --git a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py index f206e2f5f..e0657ffef 100644 --- a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py +++ b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py @@ -57,28 +57,29 @@ def __init__(self, parameters): self.append_timecode = parameters.get('append_timecode', False) def do_op(self, dispatcher, df, name, sidecar=None): - """ Create factor columns corresponding to values in a specified column. + """ Extract a sidecar from events file. Parameters: dispatcher (Dispatcher): The dispatcher object for managing the operations. df (DataFrame): The tabular file to be remodeled. name (str): Unique identifier for the dataframe -- often the original file path. - sidecar (Sidecar or file-like): Only needed for HED operations. + sidecar (Sidecar or file-like): Not needed for this operation. Returns: - DataFrame: A new DataFrame with the factor columns appended. + DataFrame: A copy of df. Side-effect: Updates the associated summary if applicable. """ + df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: summary = EventsToSidecarSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name}) - return df + summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name}) + return df_new class EventsToSidecarSummary(BaseSummary): @@ -95,7 +96,8 @@ def update_summary(self, new_info): new_info (dict): A dictionary with the parameters needed to update a summary. Notes: - - The summary needs a "name" str and a "df". + - The summary needs a "name" str and a "df". + """ tab_sum = TabularSummary(value_cols=self.value_cols, skip_cols=self.skip_cols, name=new_info["name"]) @@ -164,7 +166,7 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Total events={result.get('total_events', 0)} " - f"Total files={result.get('total_files', 0)}", + f"Total files={result.get('total_files', 0)}", f"Skip columns: {str(result.get('skip_cols', []))}", f"Value columns: {str(result.get('value_cols', []))}", f"Sidecar:\n{json.dumps(result['sidecar'], indent=indent)}"] diff --git a/tests/tools/remodeling/operations/test_summarize_definitions_op.py b/tests/tools/remodeling/operations/test_summarize_definitions_op.py index e829c2739..4b4784f64 100644 --- a/tests/tools/remodeling/operations/test_summarize_definitions_op.py +++ b/tests/tools/remodeling/operations/test_summarize_definitions_op.py @@ -41,7 +41,7 @@ def test_do_op(self): parms = json.loads(self.json_parms) sum_op = SummarizeDefinitionsOp(parms) df = pd.read_csv(self.data_path, delimiter='\t', header=0, keep_default_na=False, na_values=",null") - df_new = sum_op.do_op(dispatch, dispatch.prep_data(df), 'subj2_run1', sidecar=self.json_path) + df_new = sum_op.do_op(dispatch, df, 'subj2_run1', sidecar=self.json_path) self.assertEqual(200, len(df_new), " dataframe length is correct") self.assertEqual(10, len(df_new.columns), " has correct number of columns") self.assertIn(sum_op.summary_name, dispatch.summary_dicts)