From ddb40dfd71239fbc9e50a6afc3ed7e61346c729c Mon Sep 17 00:00:00 2001 From: IanCa Date: Mon, 22 May 2023 19:42:43 -0500 Subject: [PATCH] Fix to_excel function for new system --- hed/models/base_input.py | 19 +++++++-- tests/models/test_spreadsheet_input.py | 57 +++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 4e335a72c..2e9ae5adc 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -206,22 +206,33 @@ def to_excel(self, file, output_assembled=False): raise ValueError("Empty file name or object passed in to BaseInput.save.") dataframe = self._dataframe + old_columns = dataframe.columns if output_assembled: dataframe = self.dataframe_a + new_columns = dataframe.columns + else: + new_columns = old_columns if self._loaded_workbook: + column_mapping = {} # assembled dataframe column number to original worksheet number + for new_c, column in enumerate(new_columns): + for old_c, old_column in enumerate(old_columns): + if column == old_column: + column_mapping[new_c] = old_c + old_worksheet = self.get_worksheet(self._worksheet_name) # Excel spreadsheets are 1 based, then add another 1 for column names if present adj_row_for_col_names = 1 if self._has_column_names: adj_row_for_col_names += 1 adj_for_one_based_cols = 1 - for row_number, text_file_row in dataframe.iterrows(): - for column_number, column_text in enumerate(text_file_row): + for row_number in range(len(dataframe)): + for df_column_number, ws_column_number in column_mapping.items(): + cell_value = dataframe.iat[row_number, df_column_number] + old_worksheet.cell(row_number + adj_row_for_col_names, - column_number + adj_for_one_based_cols).value = \ - dataframe.iloc[row_number, column_number] + ws_column_number + adj_for_one_based_cols).value = cell_value self._loaded_workbook.save(file) else: dataframe.to_excel(file, header=self._has_column_names) diff --git a/tests/models/test_spreadsheet_input.py b/tests/models/test_spreadsheet_input.py index 0c31f35d8..7d3f590d4 100644 --- a/tests/models/test_spreadsheet_input.py +++ b/tests/models/test_spreadsheet_input.py @@ -9,9 +9,6 @@ import pandas as pd -# TODO: Add tests about correct handling of 'n/a' - - class Test(unittest.TestCase): @classmethod def setUpClass(cls): @@ -20,7 +17,7 @@ def setUpClass(cls): hed_xml_file = os.path.join(base, "schema_tests/HED8.0.0t.xml") cls.hed_schema = schema.load_schema(hed_xml_file) default = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "../data/validator_tests/ExcelMultipleSheets.xlsx") + "../data/spreadsheet_validator_tests/ExcelMultipleSheets.xlsx") cls.default_test_file_name = default cls.generic_file_input = SpreadsheetInput(default) base_output = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../data/tests_output/") @@ -186,7 +183,6 @@ def test_definitions_identified(self): '../data/model_tests/no_column_header_definition.tsv') hed_input = SpreadsheetInput(events_path, has_column_names=False, tag_columns=[0, 1]) - def test_loading_dataframe_directly(self): ds_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/no_column_header_definition.tsv') @@ -209,9 +205,58 @@ def test_ignoring_na_value_column(self): events_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/model_tests/na_value_column.tsv') sidecar_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../data/model_tests/na_value_column.json') + '../data/model_tests/na_value_column.json') hed_input = TabularInput(events_path, sidecar=sidecar_path) self.assertTrue(hed_input.dataframe_a.loc[1, 'Value'] == 'n/a') + def test_to_excel_workbook(self): + excel_book = SpreadsheetInput(self.default_test_file_name, worksheet_name="LKT 8HED3", + tag_columns=["HED tags"]) + test_output_name = self.base_output_folder + "ExcelMultipleSheets_resave_assembled.xlsx" + excel_book.convert_to_long(self.hed_schema) + excel_book.to_excel(test_output_name, True) + reloaded_df = SpreadsheetInput(test_output_name, worksheet_name="LKT 8HED3") + + self.assertTrue(excel_book.dataframe.equals(reloaded_df.dataframe)) + + excel_book = SpreadsheetInput(self.default_test_file_name, worksheet_name="LKT 8HED3", + tag_columns=["HED tags"], + column_prefix_dictionary={ + "Short label": "Label/", + "Description in text": "Description" + }) + test_output_name = self.base_output_folder + "ExcelMultipleSheets_resave_assembled_prefix.xlsx" + excel_book.convert_to_long(self.hed_schema) + excel_book.to_excel(test_output_name, True) + reloaded_df = SpreadsheetInput(test_output_name, worksheet_name="LKT 8HED3", + tag_columns=["Short label", "Description in text", "HED tags"]) + + self.assertTrue(excel_book.dataframe_a.equals(reloaded_df.dataframe_a)) + + def test_to_excel_workbook_no_col_names(self): + excel_book = SpreadsheetInput(self.default_test_file_name, worksheet_name="LKT 8HED3", + tag_columns=[4], has_column_names=False) + test_output_name = self.base_output_folder + "ExcelMultipleSheets_resave_assembled_no_col_names.xlsx" + excel_book.convert_to_long(self.hed_schema) + excel_book.to_excel(test_output_name, True) + reloaded_df = SpreadsheetInput(test_output_name, worksheet_name="LKT 8HED3", tag_columns=[4], + has_column_names=False) + self.assertTrue(excel_book.dataframe.equals(reloaded_df.dataframe)) + + excel_book = SpreadsheetInput(self.default_test_file_name, worksheet_name="LKT 8HED3", has_column_names=False, + tag_columns=[4], + column_prefix_dictionary={ + 1: "Label/", + 3: "Description" + }) + test_output_name = self.base_output_folder + "ExcelMultipleSheets_resave_assembled_prefix.xlsx" + excel_book.convert_to_long(self.hed_schema) + excel_book.to_excel(test_output_name, True) + reloaded_df = SpreadsheetInput(test_output_name, worksheet_name="LKT 8HED3", tag_columns=[1, 3, 4], + has_column_names=False) + + self.assertTrue(excel_book.dataframe_a.equals(reloaded_df.dataframe_a)) + + if __name__ == '__main__': unittest.main()