From 744a237a9fdda1e316ba2ef97d2a178fca003eca Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 12 Jun 2024 15:23:35 -0500 Subject: [PATCH 1/3] optimize KeyMap:_remap --- hed/tools/analysis/key_map.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py index 39a0ccf19..62b9bd25f 100644 --- a/hed/tools/analysis/key_map.py +++ b/hed/tools/analysis/key_map.py @@ -2,6 +2,7 @@ import pandas as pd +import numpy as np from hed.errors.exceptions import HedFileError from hed.tools.util import data_util @@ -128,27 +129,32 @@ def remap(self, data): return df_new, missing_indices def _remap(self, df): - """ Utility method that iterates through dataframes to do the remapping. + """ Utility method that does the remapping Parameters: df (DataFrame): DataFrame in which to perform the mapping. Returns: list: The row numbers that had no correspondence in the mapping. - """ + key_series = df.apply(lambda row: data_util.get_row_hash(row, self.key_cols), axis=1) + + # Add a column containing the mapped index for each row + map_series = pd.Series(self.map_dict) + key_values = key_series.map(map_series) + merged_df = df.assign(key_value=key_values.values) + + # Add new columns with the updated values + remapped_df = pd.merge(merged_df, self.col_map, left_on='key_value', right_index=True, + suffixes=('', '_new'), how='left').fillna("n/a") + + # Override the original columns with our newly calculated ones + for col in self.target_cols: + df[col] = remapped_df[col + '_new'] + + # Finally calculate missing indices + missing_indices = key_series.index[key_values.isna()].tolist() - missing_indices = [] - for index, row in df.iterrows(): - key = data_util.get_row_hash(row, self.key_cols) - key_value = self.map_dict.get(key, None) - if key_value is not None: - result = self.col_map.iloc[key_value] - row[self.target_cols] = result[self.target_cols].values - new_index = df.index.get_loc(index) # In case index and location don't agree. - df.iloc[new_index] = row - else: - missing_indices.append(index) return missing_indices def resort(self): From 51c54a22946bf17030a2bc2c68c68b820bfe2fc6 Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 12 Jun 2024 15:24:21 -0500 Subject: [PATCH 2/3] Remove extraneous import --- hed/tools/analysis/key_map.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py index 62b9bd25f..160279942 100644 --- a/hed/tools/analysis/key_map.py +++ b/hed/tools/analysis/key_map.py @@ -2,7 +2,6 @@ import pandas as pd -import numpy as np from hed.errors.exceptions import HedFileError from hed.tools.util import data_util From 69e627594429662dc42ec95e0b757acb44a0f359 Mon Sep 17 00:00:00 2001 From: IanCa Date: Wed, 12 Jun 2024 16:32:27 -0500 Subject: [PATCH 3/3] Add some more clarifying comments --- hed/tools/analysis/key_map.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py index 160279942..21dc7137b 100644 --- a/hed/tools/analysis/key_map.py +++ b/hed/tools/analysis/key_map.py @@ -1,6 +1,5 @@ """ A map of column value keys into new column values. """ - import pandas as pd from hed.errors.exceptions import HedFileError from hed.tools.util import data_util @@ -18,6 +17,7 @@ class KeyMap: The remapping does not support other types of columns. """ + def __init__(self, key_cols, target_cols=None, name=''): """ Information for remapping columns of tabular files. @@ -137,13 +137,17 @@ def _remap(self, df): list: The row numbers that had no correspondence in the mapping. """ key_series = df.apply(lambda row: data_util.get_row_hash(row, self.key_cols), axis=1) + # Key series now contains row_number: hash for each row in the dataframe # Add a column containing the mapped index for each row - map_series = pd.Series(self.map_dict) - key_values = key_series.map(map_series) + map_series = pd.Series(self.map_dict) # map_series is hash:row_index for each entry in the map_dict index + key_values = key_series.map(map_series) # key_values is df_row_number:map_dict_index + # e.g. a key_value entry of 0:79 means row 0 maps to row 79 in the map_dict + + # This adds the map_dict_index column, to merged_df as a new column "key_value" merged_df = df.assign(key_value=key_values.values) - # Add new columns with the updated values + # Copy all the map_dict data into merged_df as new columns, merging on the map_dict_index number of both remapped_df = pd.merge(merged_df, self.col_map, left_on='key_value', right_index=True, suffixes=('', '_new'), how='left').fillna("n/a")