From a12efaaa927daa3dda4c04a4f14050ad2f562236 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Mon, 26 Jun 2023 08:51:10 -0500 Subject: [PATCH 1/2] Started working on allowing multiple iteration of remodeling --- ...name_summary.py => column_name_summary.py} | 2 +- hed/tools/analysis/tabular_summary.py | 29 +++++++++++++++++-- .../operations/summarize_column_names_op.py | 28 ++++++++++-------- .../operations/summarize_column_values_op.py | 2 +- .../operations/summarize_hed_tags_op.py | 11 +++---- ...ummary_word_cloud.py => tag_word_cloud.py} | 8 +++-- ...summary.py => test_column_name_summary.py} | 16 +++++----- tests/tools/analysis/test_tabular_summary.py | 18 +++++++++++- .../test_summarize_column_names_op.py | 8 ++--- ...y_word_cloud.py => test_tag_word_cloud.py} | 13 +++++---- 10 files changed, 92 insertions(+), 43 deletions(-) rename hed/tools/analysis/{tabular_column_name_summary.py => column_name_summary.py} (95%) rename hed/tools/visualizations/{tag_summary_word_cloud.py => tag_word_cloud.py} (83%) rename tests/tools/analysis/{test_tabular_column_name_summary.py => test_column_name_summary.py} (78%) rename tests/tools/visualizations/{test_tag_summary_word_cloud.py => test_tag_word_cloud.py} (79%) diff --git a/hed/tools/analysis/tabular_column_name_summary.py b/hed/tools/analysis/column_name_summary.py similarity index 95% rename from hed/tools/analysis/tabular_column_name_summary.py rename to hed/tools/analysis/column_name_summary.py index cd42651ae..5c7a710c9 100644 --- a/hed/tools/analysis/tabular_column_name_summary.py +++ b/hed/tools/analysis/column_name_summary.py @@ -3,7 +3,7 @@ import json -class TabularColumnNameSummary: +class ColumnNameSummary: def __init__(self, name=''): self.name = name self.file_dict = {} diff --git a/hed/tools/analysis/tabular_summary.py b/hed/tools/analysis/tabular_summary.py index d9fd79702..1262f368b 100644 --- a/hed/tools/analysis/tabular_summary.py +++ b/hed/tools/analysis/tabular_summary.py @@ -81,8 +81,9 @@ def get_summary(self, as_json=False): value_cols = {} for key in sorted_cols: value_cols[key] = self.value_info[key] - summary = {"Summary name": self.name, "Total events": self.total_events, "Total files": self.total_files, - "Categorical columns": categorical_cols, "Value columns": value_cols} + summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files, + "Categorical columns": categorical_cols, "Value columns": value_cols, + "Skip columns": self.skip_cols, "Files": self.files} if as_json: return json.dumps(summary, indent=4) else: @@ -215,6 +216,30 @@ def _update_dict_value(self, col_dict): self.value_info[col] = [self.value_info[col][0] + col_dict.value_info[col][0], self.value_info[col][1] + col_dict.value_info[col][1]] + @staticmethod + def extract_summary(summary_info): + """ Create a TabularSummary object from a serialized summary + + Parameters: + summary_info (dict or str): A JSON string or a dictionary containing contents of a TabularSummary. + + Returns: + TabularSummary: contains the information in summary_info as a TabularSummary object. + """ + + if isinstance(summary_info, str): + summary_info = json.loads(summary_info) + new_tab = TabularSummary(value_cols=summary_info.get('Value columns', {}).keys(), + skip_cols=summary_info.get('Skip columns', []), + name=summary_info.get('Summary name', '')) + new_tab.value_info = summary_info.get('Value_columns', {}) + new_tab.total_files = summary_info.get('Total files', 0) + new_tab.total_events = summary_info.get('Total events', 0) + new_tab.skip_cols = summary_info.get('Skip columns', []) + new_tab.categorical_info = summary_info.get('Categorical columns', {}) + new_tab.files = summary_info.get('Files', {}) + return new_tab + @staticmethod def get_columns_info(dataframe, skip_cols=None): """ Extract unique value counts for columns. diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index ed6082a45..2201827f9 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -1,6 +1,6 @@ """ Summarize the column names in a collection of tabular files. """ -from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary +from hed.tools.analysis.column_name_summary import ColumnNameSummary from hed.tools.remodeling.operations.base_op import BaseOp from hed.tools.remodeling.operations.base_summary import BaseSummary @@ -67,13 +67,13 @@ def do_op(self, dispatcher, df, name, sidecar=None): df_new = df.copy() summary = dispatcher.summary_dicts.get(self.summary_name, None) if not summary: - summary = ColumnNameSummary(self) + summary = ColumnNamesSummary(self) dispatcher.summary_dicts[self.summary_name] = summary summary.update_summary({"name": name, "column_names": list(df_new.columns)}) return df_new -class ColumnNameSummary(BaseSummary): +class ColumnNamesSummary(BaseSummary): def __init__(self, sum_op): super().__init__(sum_op) @@ -85,35 +85,39 @@ def update_summary(self, new_info): new_info (dict): A dictionary with the parameters needed to update a summary. Notes: - - The summary information is kept in separate TabularColumnNameSummary objects for each file. + - The summary information is kept in separate ColumnNameSummary objects for each file. - The summary needs a "name" str and a "column_names" list. - - The summary uses TabularColumnNameSummary as the summary object. + - The summary uses ColumnNameSummary as the summary object. """ name = new_info['name'] if name not in self.summary_dict: - self.summary_dict[name] = TabularColumnNameSummary(name=name) + self.summary_dict[name] = ColumnNameSummary(name=name) self.summary_dict[name].update(name, new_info["column_names"]) def get_details_dict(self, column_summary): """ Return the summary dictionary extracted from a ColumnNameSummary. Parameters: - column_summary (TabularColumnNameSummary): A column name summary for the data file. + column_summary (ColumnNameSummary): A column name summary for the data file. Returns: dict - a dictionary with the summary information for column names. """ - return column_summary.get_summary() + summary = column_summary.get_summary() + return {"Name": summary['Summary name'], "Total events": "n/a", + "Total files": summary['Number files'], + "Files": [name for name in column_summary.file_dict.keys()], + "Columns": summary['Columns']} def merge_all_info(self): - """ Create a TabularColumnNameSummary containing the overall dataset summary. + """ Create a ColumnNameSummary containing the overall dataset summary. Returns: - TabularColumnNameSummary - the overall summary object for column names. + ColumnNameSummary - the overall summary object for column names. """ - all_sum = TabularColumnNameSummary(name='Dataset') + all_sum = ColumnNameSummary(name='Dataset') for key, counts in self.summary_dict.items(): for name, pos in counts.file_dict.items(): all_sum.update(name, counts.unique_headers[pos]) @@ -152,7 +156,7 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"] - for element in result.get("Columns", []): + for element in result.get("Unique headers", []): sum_list.append(f"{indent}Columns: {str(element['Column names'])}") for file in element.get("Files", []): sum_list.append(f"{indent}{indent}{file}") diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index dc13790c7..0c80c6382 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -138,7 +138,7 @@ def merge_all_info(self): """ all_sum = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset') - for key, counts in self.summary_dict.items(): + for counts in self.summary_dict.values(): all_sum.update_summary(counts) return all_sum diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index d74d87de6..1eea941d2 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -133,8 +133,9 @@ def get_details_dict(self, merge_counts): for key, key_list in self.tags.items(): details[key] = self._get_details(key_list, template, verbose=True) leftovers = [value.get_info(verbose=True) for value in unmatched] - return {"name": merge_counts.name, "total_events": merge_counts.total_events, - "files": [name for name in merge_counts.files.keys()], + return {"Name": merge_counts.name, "Total events": merge_counts.total_events, + "Total files": len(merge_counts.files.keys()), + "Files": [name for name in merge_counts.files.keys()], "Main tags": details, "Other tags": leftovers} def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): @@ -185,8 +186,8 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - sum_list = [f"Dataset: Total events={result.get('total_events', 0)} " - f"Total files={len(result.get('files', []))}"] + sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " + f"Total files={len(result.get('Files', 0))}"] sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent) return "\n".join(sum_list) @@ -202,7 +203,7 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - sum_list = [f"Total events={result.get('total_events', 0)}"] + sum_list = [f"Total events={result.get('Total events', 0)}"] sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent) return "\n".join(sum_list) diff --git a/hed/tools/visualizations/tag_summary_word_cloud.py b/hed/tools/visualizations/tag_word_cloud.py similarity index 83% rename from hed/tools/visualizations/tag_summary_word_cloud.py rename to hed/tools/visualizations/tag_word_cloud.py index a3949206e..bee6cef56 100644 --- a/hed/tools/visualizations/tag_summary_word_cloud.py +++ b/hed/tools/visualizations/tag_word_cloud.py @@ -22,19 +22,21 @@ def create_wordcloud(word_dict, width=400, height=200): return wc -def convert_summary_to_word_dict(summary_json): +def summary_to_dict(summary): """Converts a HedTagSummary json dict into the word cloud input format Parameters: - summary_json(dict): The summary from a summarize hed tags op + summary(dict): The summary from a summarize hed tags op Returns: word_dict(dict): a dict of the words and their occurrence count :raises KeyError: A malformed dictionary was passed + """ - tag_dict = summary_json['Dataset']['Overall summary']['Main tags'] + overall_summary = summary.get("Overall summary", {}) + tag_dict = overall_summary.get("Main tags", {}) word_dict = {} for tag_sub_list in tag_dict.values(): for tag_sub_dict in tag_sub_list: diff --git a/tests/tools/analysis/test_tabular_column_name_summary.py b/tests/tools/analysis/test_column_name_summary.py similarity index 78% rename from tests/tools/analysis/test_tabular_column_name_summary.py rename to tests/tools/analysis/test_column_name_summary.py index d2825fcb8..31cb551c0 100644 --- a/tests/tools/analysis/test_tabular_column_name_summary.py +++ b/tests/tools/analysis/test_column_name_summary.py @@ -1,6 +1,6 @@ import json import unittest -from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary +from hed.tools.analysis.column_name_summary import ColumnNameSummary class Test(unittest.TestCase): @@ -17,16 +17,16 @@ def tearDownClass(cls): pass def test_constructor(self): - column_summary1 = TabularColumnNameSummary(name='Dataset') - self.assertIsInstance(column_summary1, TabularColumnNameSummary) + column_summary1 = ColumnNameSummary(name='Dataset') + self.assertIsInstance(column_summary1, ColumnNameSummary) self.assertEqual(column_summary1.name, 'Dataset') self.assertFalse(column_summary1.file_dict) self.assertFalse(column_summary1.unique_headers) - column_summary2 = TabularColumnNameSummary() - self.assertIsInstance(column_summary2, TabularColumnNameSummary) + column_summary2 = ColumnNameSummary() + self.assertIsInstance(column_summary2, ColumnNameSummary) def test_update(self): - column_summary = TabularColumnNameSummary() + column_summary = ColumnNameSummary() column_summary.update('run-01', self.columns1) column_summary.update('run-02', self.columns1) self.assertEqual(len(column_summary.unique_headers), 1) @@ -41,7 +41,7 @@ def test_update(self): self.assertEqual(context.exception.args[0], "FileHasChangedColumnNames") def test_update_headers(self): - column_summary = TabularColumnNameSummary() + column_summary = ColumnNameSummary() pos1 = column_summary.update_headers(self.columns1) self.assertEqual(pos1, 0) pos2 = column_summary.update_headers(self.columns1) @@ -50,7 +50,7 @@ def test_update_headers(self): self.assertEqual(pos3, 1) def test_get_summary(self): - column_summary = TabularColumnNameSummary('Dataset') + column_summary = ColumnNameSummary('Dataset') column_summary.update('run-01', self.columns1) column_summary.update('run-02', self.columns1) summary1 = column_summary.get_summary() diff --git a/tests/tools/analysis/test_tabular_summary.py b/tests/tools/analysis/test_tabular_summary.py index 1a35dabec..b983c6f8b 100644 --- a/tests/tools/analysis/test_tabular_summary.py +++ b/tests/tools/analysis/test_tabular_summary.py @@ -32,6 +32,22 @@ def test_constructor(self): self.assertIsInstance(dict2, TabularSummary, "TabularSummary: multiple values are okay in constructor") self.assertEqual(len(dict2.value_info.keys()), 3, "TabularSummary should have keys for each value column") + def test_extract_summary(self): + tab1 = TabularSummary() + stern_df = get_new_dataframe(self.stern_map_path) + tab1.update(stern_df) + sum_info = tab1.get_summary() + new_tab1 = TabularSummary.extract_summary(sum_info) + tab2 = TabularSummary(value_cols=['letter'], skip_cols=['event_type']) + tabular_info = {} + new_tab = TabularSummary.extract_summary(tabular_info) + self.assertIsInstance(new_tab, TabularSummary) + + def test_extract_summary_empty(self): + tabular_info = {} + new_tab = TabularSummary.extract_summary(tabular_info) + self.assertIsInstance(new_tab, TabularSummary) + def test_get_number_unique_values(self): dict1 = TabularSummary() wh_df = get_new_dataframe(self.wh_events_path) @@ -54,7 +70,7 @@ def test_get_summary(self): "TabularSummary categorical_info be columns minus skip and value columns") summary1 = dict1.get_summary(as_json=False) self.assertIsInstance(summary1, dict) - self.assertEqual(len(summary1), 5) + self.assertEqual(len(summary1), 7) summary2 = dict1.get_summary(as_json=True).replace('"', '') self.assertIsInstance(summary2, str) diff --git a/tests/tools/remodeling/operations/test_summarize_column_names_op.py b/tests/tools/remodeling/operations/test_summarize_column_names_op.py index ddd5a8658..2ef5eee27 100644 --- a/tests/tools/remodeling/operations/test_summarize_column_names_op.py +++ b/tests/tools/remodeling/operations/test_summarize_column_names_op.py @@ -2,9 +2,9 @@ import os import pandas as pd import unittest -from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary +# from hed.tools.analysis.column_name_summary import ColumnNameSummary from hed.tools.remodeling.dispatcher import Dispatcher -from hed.tools.remodeling.operations.summarize_column_names_op import ColumnNameSummary, SummarizeColumnNamesOp +from hed.tools.remodeling.operations.summarize_column_names_op import ColumnNamesSummary, SummarizeColumnNamesOp class Test(unittest.TestCase): @@ -77,7 +77,7 @@ def test_summary_op(self): new_summary = json.loads(json_value) self.assertIsInstance(new_summary, dict) merged1 = this_context.merge_all_info() - self.assertIsInstance(merged1, TabularColumnNameSummary) + # self.assertIsInstance(merged1, ColumnNameSummary) self.assertEqual(len(merged1.file_dict), 3) self.assertEqual(len(merged1.unique_headers), 2) with self.assertRaises(ValueError) as except_context: @@ -111,7 +111,7 @@ def test_text_summary(self): self.get_dfs(op, 'run-01', dispatch) self.get_dfs(op, 'run-02', dispatch) context = dispatch.summary_dicts['columns'] - self.assertIsInstance(context, ColumnNameSummary) + # self.assertIsInstance(context, ColumnNameSummary) text_summary1 = context.get_text_summary() self.assertIsInstance(text_summary1, dict) diff --git a/tests/tools/visualizations/test_tag_summary_word_cloud.py b/tests/tools/visualizations/test_tag_word_cloud.py similarity index 79% rename from tests/tools/visualizations/test_tag_summary_word_cloud.py rename to tests/tools/visualizations/test_tag_word_cloud.py index b1d5c4853..fa09e1710 100644 --- a/tests/tools/visualizations/test_tag_summary_word_cloud.py +++ b/tests/tools/visualizations/test_tag_word_cloud.py @@ -1,6 +1,7 @@ import unittest from wordcloud import WordCloud -from hed.tools.visualizations import tag_summary_word_cloud +from hed.tools.visualizations import tag_word_cloud + class TestWordCloudFunctions(unittest.TestCase): @@ -23,14 +24,14 @@ def test_convert_summary_to_word_dict(self): } expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7} - word_dict = tag_summary_word_cloud.convert_summary_to_word_dict(summary_json) + word_dict = tag_word_cloud.summary_to_dict(summary_json) self.assertEqual(word_dict, expected_output) def test_create_wordcloud(self): word_dict = {'tag1': 5, 'tag2': 3, 'tag3': 7} width = 400 height = 200 - wc = tag_summary_word_cloud.create_wordcloud(word_dict, width, height) + wc = tag_word_cloud.create_wordcloud(word_dict, width, height) self.assertIsInstance(wc, WordCloud) self.assertEqual(wc.width, width) @@ -40,12 +41,12 @@ def test_create_wordcloud_with_empty_dict(self): # Test creation of word cloud with an empty dictionary word_dict = {} with self.assertRaises(ValueError): - tag_summary_word_cloud.create_wordcloud(word_dict) + tag_word_cloud.create_wordcloud(word_dict) def test_create_wordcloud_with_single_word(self): # Test creation of word cloud with a single word word_dict = {'single_word': 1} - wc = tag_summary_word_cloud.create_wordcloud(word_dict) + wc = tag_word_cloud.create_wordcloud(word_dict) self.assertIsInstance(wc, WordCloud) # Check that the single word is in the word cloud - self.assertIn('single_word', wc.words_) \ No newline at end of file + self.assertIn('single_word', wc.words_) From 8f9647d092514b8ef20e7df09a761d43266720fd Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Mon, 26 Jun 2023 09:25:06 -0500 Subject: [PATCH 2/2] Added a Specifics level to the summary output --- hed/tools/remodeling/operations/summarize_hed_tags_op.py | 5 +++-- hed/tools/visualizations/tag_word_cloud.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 1eea941d2..5a504fed1 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -136,7 +136,7 @@ def get_details_dict(self, merge_counts): return {"Name": merge_counts.name, "Total events": merge_counts.total_events, "Total files": len(merge_counts.files.keys()), "Files": [name for name in merge_counts.files.keys()], - "Main tags": details, "Other tags": leftovers} + "Specifics": {"Main tags": details, "Other tags": leftovers}} def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): """ Return a formatted string with the summary for the indicated name. @@ -215,7 +215,8 @@ def _tag_details(tags): return tag_list @staticmethod - def _get_tag_list(tag_info, indent=BaseSummary.DISPLAY_INDENT): + def _get_tag_list(result, indent=BaseSummary.DISPLAY_INDENT): + tag_info = result["Specifics"] sum_list = [f"\n{indent}Main tags[events,files]:"] for category, tags in tag_info['Main tags'].items(): sum_list.append(f"{indent}{indent}{category}:") diff --git a/hed/tools/visualizations/tag_word_cloud.py b/hed/tools/visualizations/tag_word_cloud.py index bee6cef56..2f2c25236 100644 --- a/hed/tools/visualizations/tag_word_cloud.py +++ b/hed/tools/visualizations/tag_word_cloud.py @@ -36,7 +36,8 @@ def summary_to_dict(summary): """ overall_summary = summary.get("Overall summary", {}) - tag_dict = overall_summary.get("Main tags", {}) + specifics = overall_summary.get("Specifics", {}) + tag_dict = specifics.get("Main tags", {}) word_dict = {} for tag_sub_list in tag_dict.values(): for tag_sub_dict in tag_sub_list: