hed-standard · VisLab · Jun 16, 2023 · Jun 16, 2023
diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py
@@ -203,7 +203,7 @@ def parse_operations(operation_list):
 
     @staticmethod
     def prep_data(df):
-        """ Replace all n/a entries in the data frame by np.NaN for processing.
+        """ Make a copy of the data frame and replace all n/a entries with np.NaN for processing.
 
         Parameters:
             df (DataFrame) - The DataFrame to be processed.

diff --git a/hed/tools/remodeling/operations/base_op.py b/hed/tools/remodeling/operations/base_op.py
@@ -77,7 +77,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
 
         """
 
-        return df
+        return df.copy()
 
     @staticmethod
     def _check_list_type(param_value, param_type):

diff --git a/hed/tools/remodeling/operations/factor_column_op.py b/hed/tools/remodeling/operations/factor_column_op.py
@@ -60,7 +60,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like):  Only needed for HED operations.
+            sidecar (Sidecar or file-like): Not needed for this operation.
 
         Returns:
             DataFrame: A new DataFrame with the factor columns appended.

diff --git a/hed/tools/remodeling/operations/merge_consecutive_op.py b/hed/tools/remodeling/operations/merge_consecutive_op.py
@@ -62,7 +62,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like): Only needed for HED operations.
+            sidecar (Sidecar or file-like): Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.

diff --git a/hed/tools/remodeling/operations/remap_columns_op.py b/hed/tools/remodeling/operations/remap_columns_op.py
@@ -100,7 +100,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like): Only needed for HED operations.
+            sidecar (Sidecar or file-like): Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.
@@ -109,12 +109,13 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             - If ignore_missing is false and source values from the data are not in the map.
 
         """
-        df[self.source_columns] = df[self.source_columns].replace(np.NaN, 'n/a')
+        df1 = df.copy()
+        df1[self.source_columns] = df1[self.source_columns].replace(np.NaN, 'n/a')
         for column in self.integer_sources:
-            int_mask = df[column] != 'n/a'
-            df.loc[int_mask, column] = df.loc[int_mask, column].astype(int)
-        df[self.source_columns] = df[self.source_columns].astype(str)
-        df_new, missing = self.key_map.remap(df)
+            int_mask = df1[column] != 'n/a'
+            df1.loc[int_mask, column] = df1.loc[int_mask, column].astype(int)
+        df1[self.source_columns] = df1[self.source_columns].astype(str)
+        df_new, missing = self.key_map.remap(df1)
         if missing and not self.ignore_missing:
             raise ValueError("MapSourceValueMissing",
                              f"{name}: Ignore missing is false, but source values [{missing}] are in data but not map")

diff --git a/hed/tools/remodeling/operations/remove_columns_op.py b/hed/tools/remodeling/operations/remove_columns_op.py
@@ -49,7 +49,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like):  Only needed for HED operations.
+            sidecar (Sidecar or file-like):  Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.
@@ -58,10 +58,11 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             - If ignore_missing is False and a column not in the data is to be removed.
 
         """
-
+        df_new = df.copy()
         try:
-            return df.drop(self.column_names, axis=1, errors=self.error_handling)
+            return df_new.drop(self.column_names, axis=1, errors=self.error_handling)
         except KeyError:
             raise KeyError("MissingColumnCannotBeRemoved",
                            f"{name}: Ignore missing is False but a column in {str(self.column_names)} is "
-                           f"not in the data columns [{str(df.columns)}]")
+                           f"not in the data columns [{str(df_new.columns)}]")
+        return df_new
diff --git a/hed/tools/remodeling/operations/remove_rows_op.py b/hed/tools/remodeling/operations/remove_rows_op.py
@@ -46,15 +46,15 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str):  Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like): Only needed for HED operations.
+            sidecar (Sidecar or file-like): Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.
 
         """
-
-        if self.column_name not in df.columns:
-            return df
+        df_new = df.copy()
+        if self.column_name not in df_new.columns:
+            return df_new
         for value in self.remove_values:
-            df = df.loc[df[self.column_name] != value, :]
-        return df
+            df_new = df_new.loc[df_new[self.column_name] != value, :]
+        return df_new
diff --git a/hed/tools/remodeling/operations/rename_columns_op.py b/hed/tools/remodeling/operations/rename_columns_op.py
@@ -49,7 +49,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like):  Only needed for HED operations.
+            sidecar (Sidecar or file-like):  Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.
@@ -58,9 +58,9 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             - When ignore_missing is false and column_mapping has columns not in the data.
 
         """
-
+        df_new = df.copy()
         try:
-            return df.rename(columns=self.column_mapping, errors=self.error_handling)
+            return df_new.rename(columns=self.column_mapping, errors=self.error_handling)
         except KeyError:
             raise KeyError("MappedColumnsMissingFromData",
                            f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]"

diff --git a/hed/tools/remodeling/operations/reorder_columns_op.py b/hed/tools/remodeling/operations/reorder_columns_op.py
@@ -48,7 +48,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame):  The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like):   Only needed for HED operations.
+            sidecar (Sidecar or file-like):  Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.
@@ -57,17 +57,17 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             - When ignore_missing is false and column_order has columns not in the data.
 
         """
-
-        current_columns = list(df.columns)
-        missing_columns = set(self.column_order).difference(set(df.columns))
+        df_new = df.copy()
+        current_columns = list(df_new.columns)
+        missing_columns = set(self.column_order).difference(set(df_new.columns))
         ordered = self.column_order
         if missing_columns and not self.ignore_missing:
             raise ValueError("MissingReorderedColumns",
                              f"{str(missing_columns)} are not in dataframe columns "
-                             f" [{str(df.columns)}] and not ignored.")
+                             f" [{str(df_new.columns)}] and not ignored.")
         elif missing_columns:
             ordered = [elem for elem in self.column_order if elem not in list(missing_columns)]
         if self.keep_others:
             ordered += [elem for elem in current_columns if elem not in ordered]
-        df = df.loc[:, ordered]
-        return df
+        df_new = df_new.loc[:, ordered]
+        return df_new
diff --git a/hed/tools/remodeling/operations/split_rows_op.py b/hed/tools/remodeling/operations/split_rows_op.py
@@ -51,7 +51,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str):  Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like):  Only needed for HED operations.
+            sidecar (Sidecar or file-like):  Not needed for this operation.
 
         Returns:
             Dataframe: A new dataframe after processing.

diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py
@@ -49,28 +49,28 @@ def __init__(self, parameters):
         self.append_timecode = parameters.get('append_timecode', False)
 
     def do_op(self, dispatcher, df, name, sidecar=None):
-        """ Create factor columns corresponding to values in a specified column.
+        """ Create a column name summary for df.
 
         Parameters:
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str): Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like):   Only needed for HED operations.
+            sidecar (Sidecar or file-like):  Not needed for this operation.
 
         Returns:
-            DataFrame: A new DataFrame with the factor columns appended.
+            DataFrame: A copy of df.
 
         Side-effect:
             Updates the relevant summary.
 
         """
-
+        df_new = df.copy()
         summary = dispatcher.summary_dicts.get(self.summary_name, None)
         if not summary:
             summary = ColumnNameSummary(self)
             dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary({"name": name, "column_names": list(df.columns)})
-        return df
+        summary.update_summary({"name": name, "column_names": list(df_new.columns)})
+        return df_new
 
 
 class ColumnNameSummary(BaseSummary):

diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py
@@ -65,28 +65,29 @@ def __init__(self, parameters):
         self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE)
 
     def do_op(self, dispatcher, df, name, sidecar=None):
-        """ Create factor columns corresponding to values in a specified column.
+        """ Create a summary of the column values in df.
 
         Parameters:
             dispatcher (Dispatcher): Manages the operation I/O.
             df (DataFrame): The DataFrame to be remodeled.
             name (str):  Unique identifier for the dataframe -- often the original file path.
-            sidecar (Sidecar or file-like): Only needed for HED operations.
+            sidecar (Sidecar or file-like): Not needed for this operation.
 
         Returns:
-            DataFrame: A new DataFrame with the factor columns appended.
+            DataFrame: A copy of df.
 
         Side-effect:
             Updates the relevant summary.
 
         """
-
+
+        df_new = df.copy()
         summary = dispatcher.summary_dicts.get(self.summary_name, None)
         if not summary:
             summary = ColumnValueSummary(self)
             dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name})
-        return df
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name})
+        return df_new
 
 
 class ColumnValueSummary(BaseSummary):

diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py
@@ -58,17 +58,18 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             sidecar (Sidecar or file-like): Only needed for HED operations.
 
         Returns:
-            DataFrame: the same datafarme
+            DataFrame: A copy of df.
 
         Side-effect:
             Updates the relevant summary.
 
         """
+        df_new = df.copy()
         summary = dispatcher.summary_dicts.setdefault(self.summary_name,
                                                       DefinitionSummary(self, dispatcher.hed_schema))
-        summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name, 'sidecar': sidecar,
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'sidecar': sidecar,
                                 'schema': dispatcher.hed_schema})
-        return df
+        return df_new
 
 
 class DefinitionSummary(BaseSummary):

diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py
@@ -63,7 +63,7 @@ def __init__(self, parameters):
         self.expand_context = parameters.get('expand_context', False)
 
     def do_op(self, dispatcher, df, name, sidecar=None):
-        """ Create factor columns corresponding to values in a specified column.
+        """ Update the HED tag summary with the tags present in df.
 
         Parameters:
             dispatcher (Dispatcher): Manages the operation I/O.
@@ -72,19 +72,20 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             sidecar (Sidecar or file-like):  Only needed for HED operations.
 
         Returns:
-            DataFrame: A new DataFrame with the factor columns appended.
+            DataFrame: A copy of df.
 
         Side-effect:
             Updates the context.
 
         """
+        df_new = df.copy()
         summary = dispatcher.summary_dicts.get(self.summary_name, None)
         if not summary:
             summary = HedTagSummary(self)
             dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name,
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name,
                                 'schema': dispatcher.hed_schema, 'sidecar': sidecar})
-        return df
+        return df_new
 
 
 class HedTagSummary(BaseSummary):
@@ -100,7 +101,7 @@ def update_summary(self, new_info):
         Parameters:
             new_info (dict):  A dictionary with the parameters needed to update a summary.
 
-        Notes:  
+        Notes:
             - The summary needs a "name" str, a "schema", a "df, and a "Sidecar".
 
         """

diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py
@@ -67,19 +67,20 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             sidecar (Sidecar or file-like): Usually required unless event file has a HED column.
 
         Returns:
-            DataFrame: Input DataFrame, unchanged.
+            DataFrame: A copy of df.
 
         Side-effect:
             Updates the relevant summary.
 
         """
+        df_new = df.copy()
         summary = dispatcher.summary_dicts.get(self.summary_name, None)
         if not summary:
             summary = HedTypeSummary(self)
             dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name,
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name,
                                 'schema': dispatcher.hed_schema, 'sidecar': sidecar})
-        return df
+        return df_new
 
 
 class HedTypeSummary(BaseSummary):

diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py
@@ -24,7 +24,7 @@ class SummarizeHedValidationOp(BaseOp):
         "operation": "summarize_hed_validation",
         "required_parameters": {
             "summary_name": str,
-            "summary_filename": str 
+            "summary_filename": str
         },
         "optional_parameters": {
             "append_timecode": bool,
@@ -64,19 +64,20 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             sidecar (Sidecar or file-like): Usually needed unless only HED tags in HED column of event file.
 
         Returns:
-            DataFrame: Input DataFrame, unchanged.
+            DataFrame: A copy of df.
 
         Side-effect:
             Updates the relevant summary.
 
         """
+        df_new = df.copy()
         summary = dispatcher.summary_dicts.get(self.summary_name, None)
         if not summary:
             summary = HedValidationSummary(self)
             dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary({'df': dispatcher.post_proc_data(df), 'name': name,
+        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name,
                                 'schema': dispatcher.hed_schema, 'sidecar': sidecar})
-        return df
+        return df_new
 
 
 class HedValidationSummary(BaseSummary):