Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json


class TabularColumnNameSummary:
class ColumnNameSummary:
def __init__(self, name=''):
self.name = name
self.file_dict = {}
Expand Down
29 changes: 27 additions & 2 deletions hed/tools/analysis/tabular_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ def get_summary(self, as_json=False):
value_cols = {}
for key in sorted_cols:
value_cols[key] = self.value_info[key]
summary = {"Summary name": self.name, "Total events": self.total_events, "Total files": self.total_files,
"Categorical columns": categorical_cols, "Value columns": value_cols}
summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
"Categorical columns": categorical_cols, "Value columns": value_cols,
"Skip columns": self.skip_cols, "Files": self.files}
if as_json:
return json.dumps(summary, indent=4)
else:
Expand Down Expand Up @@ -215,6 +216,30 @@ def _update_dict_value(self, col_dict):
self.value_info[col] = [self.value_info[col][0] + col_dict.value_info[col][0],
self.value_info[col][1] + col_dict.value_info[col][1]]

@staticmethod
def extract_summary(summary_info):
    """ Create a TabularSummary object from a serialized summary.

    Parameters:
        summary_info (dict or str): A JSON string or a dictionary containing contents of a TabularSummary.

    Returns:
        TabularSummary: contains the information in summary_info as a TabularSummary object.

    Notes:
        - The keys read here are the ones written by get_summary.
        - The summary name is read from 'Name'; the older 'Summary name' key is used when 'Name' is absent.
        - Missing keys fall back to empty containers, an empty name, or zero counts.
    """

    if isinstance(summary_info, str):
        summary_info = json.loads(summary_info)

    # 'Value columns' (with a space) is the key get_summary writes; it supplies both
    # the constructor's value column names and the restored per-column counts.
    value_info = summary_info.get('Value columns', {})
    skip_cols = summary_info.get('Skip columns', [])
    name = summary_info.get('Name', summary_info.get('Summary name', ''))

    new_tab = TabularSummary(value_cols=list(value_info), skip_cols=skip_cols, name=name)
    new_tab.value_info = value_info
    new_tab.total_files = summary_info.get('Total files', 0)
    new_tab.total_events = summary_info.get('Total events', 0)
    new_tab.skip_cols = skip_cols
    new_tab.categorical_info = summary_info.get('Categorical columns', {})
    new_tab.files = summary_info.get('Files', {})
    return new_tab

@staticmethod
def get_columns_info(dataframe, skip_cols=None):
""" Extract unique value counts for columns.
Expand Down
28 changes: 16 additions & 12 deletions hed/tools/remodeling/operations/summarize_column_names_op.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" Summarize the column names in a collection of tabular files. """

from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary
from hed.tools.analysis.column_name_summary import ColumnNameSummary
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.remodeling.operations.base_summary import BaseSummary

Expand Down Expand Up @@ -67,13 +67,13 @@ def do_op(self, dispatcher, df, name, sidecar=None):
df_new = df.copy()
summary = dispatcher.summary_dicts.get(self.summary_name, None)
if not summary:
summary = ColumnNameSummary(self)
summary = ColumnNamesSummary(self)
dispatcher.summary_dicts[self.summary_name] = summary
summary.update_summary({"name": name, "column_names": list(df_new.columns)})
return df_new


class ColumnNameSummary(BaseSummary):
class ColumnNamesSummary(BaseSummary):

def __init__(self, sum_op):
super().__init__(sum_op)
Expand All @@ -85,35 +85,39 @@ def update_summary(self, new_info):
new_info (dict): A dictionary with the parameters needed to update a summary.

Notes:
- The summary information is kept in separate TabularColumnNameSummary objects for each file.
- The summary information is kept in separate ColumnNameSummary objects for each file.
- The summary needs a "name" str and a "column_names" list.
- The summary uses TabularColumnNameSummary as the summary object.
- The summary uses ColumnNameSummary as the summary object.
"""
name = new_info['name']
if name not in self.summary_dict:
self.summary_dict[name] = TabularColumnNameSummary(name=name)
self.summary_dict[name] = ColumnNameSummary(name=name)
self.summary_dict[name].update(name, new_info["column_names"])

def get_details_dict(self, column_summary):
""" Return the summary dictionary extracted from a ColumnNameSummary.

Parameters:
column_summary (TabularColumnNameSummary): A column name summary for the data file.
column_summary (ColumnNameSummary): A column name summary for the data file.

Returns:
dict - a dictionary with the summary information for column names.

"""
return column_summary.get_summary()
summary = column_summary.get_summary()
return {"Name": summary['Summary name'], "Total events": "n/a",
"Total files": summary['Number files'],
"Files": [name for name in column_summary.file_dict.keys()],
"Columns": summary['Columns']}

def merge_all_info(self):
""" Create a TabularColumnNameSummary containing the overall dataset summary.
""" Create a ColumnNameSummary containing the overall dataset summary.

Returns:
TabularColumnNameSummary - the overall summary object for column names.
ColumnNameSummary - the overall summary object for column names.

"""
all_sum = TabularColumnNameSummary(name='Dataset')
all_sum = ColumnNameSummary(name='Dataset')
for key, counts in self.summary_dict.items():
for name, pos in counts.file_dict.items():
all_sum.update(name, counts.unique_headers[pos])
Expand Down Expand Up @@ -152,7 +156,7 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT):

"""
sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"]
for element in result.get("Columns", []):
for element in result.get("Unique headers", []):
sum_list.append(f"{indent}Columns: {str(element['Column names'])}")
for file in element.get("Files", []):
sum_list.append(f"{indent}{indent}{file}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def merge_all_info(self):

"""
all_sum = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset')
for key, counts in self.summary_dict.items():
for counts in self.summary_dict.values():
all_sum.update_summary(counts)
return all_sum

Expand Down
16 changes: 9 additions & 7 deletions hed/tools/remodeling/operations/summarize_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,10 @@ def get_details_dict(self, merge_counts):
for key, key_list in self.tags.items():
details[key] = self._get_details(key_list, template, verbose=True)
leftovers = [value.get_info(verbose=True) for value in unmatched]
return {"name": merge_counts.name, "total_events": merge_counts.total_events,
"files": [name for name in merge_counts.files.keys()],
"Main tags": details, "Other tags": leftovers}
return {"Name": merge_counts.name, "Total events": merge_counts.total_events,
"Total files": len(merge_counts.files.keys()),
"Files": [name for name in merge_counts.files.keys()],
"Specifics": {"Main tags": details, "Other tags": leftovers}}

def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.
Expand Down Expand Up @@ -185,8 +186,8 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT):
str: Formatted string suitable for saving in a file or printing.

"""
sum_list = [f"Dataset: Total events={result.get('total_events', 0)} "
f"Total files={len(result.get('files', []))}"]
sum_list = [f"Dataset: Total events={result.get('Total events', 0)} "
f"Total files={len(result.get('Files', 0))}"]
sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent)
return "\n".join(sum_list)

Expand All @@ -202,7 +203,7 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT):
str: Formatted string suitable for saving in a file or printing.

"""
sum_list = [f"Total events={result.get('total_events', 0)}"]
sum_list = [f"Total events={result.get('Total events', 0)}"]
sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent)
return "\n".join(sum_list)

Expand All @@ -214,7 +215,8 @@ def _tag_details(tags):
return tag_list

@staticmethod
def _get_tag_list(tag_info, indent=BaseSummary.DISPLAY_INDENT):
def _get_tag_list(result, indent=BaseSummary.DISPLAY_INDENT):
tag_info = result["Specifics"]
sum_list = [f"\n{indent}Main tags[events,files]:"]
for category, tags in tag_info['Main tags'].items():
sum_list.append(f"{indent}{indent}{category}:")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,22 @@ def create_wordcloud(word_dict, width=400, height=200):
return wc


def convert_summary_to_word_dict(summary_json):
def summary_to_dict(summary):
"""Converts a HedTagSummary json dict into the word cloud input format

Parameters:
summary_json(dict): The summary from a summarize hed tags op
summary(dict): The summary from a summarize hed tags op

Returns:
word_dict(dict): a dict of the words and their occurrence count

:raises KeyError:
A malformed dictionary was passed

"""
tag_dict = summary_json['Dataset']['Overall summary']['Main tags']
overall_summary = summary.get("Overall summary", {})
specifics = overall_summary.get("Specifics", {})
tag_dict = specifics.get("Main tags", {})
word_dict = {}
for tag_sub_list in tag_dict.values():
for tag_sub_dict in tag_sub_list:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import unittest
from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary
from hed.tools.analysis.column_name_summary import ColumnNameSummary


class Test(unittest.TestCase):
Expand All @@ -17,16 +17,16 @@ def tearDownClass(cls):
pass

def test_constructor(self):
column_summary1 = TabularColumnNameSummary(name='Dataset')
self.assertIsInstance(column_summary1, TabularColumnNameSummary)
column_summary1 = ColumnNameSummary(name='Dataset')
self.assertIsInstance(column_summary1, ColumnNameSummary)
self.assertEqual(column_summary1.name, 'Dataset')
self.assertFalse(column_summary1.file_dict)
self.assertFalse(column_summary1.unique_headers)
column_summary2 = TabularColumnNameSummary()
self.assertIsInstance(column_summary2, TabularColumnNameSummary)
column_summary2 = ColumnNameSummary()
self.assertIsInstance(column_summary2, ColumnNameSummary)

def test_update(self):
column_summary = TabularColumnNameSummary()
column_summary = ColumnNameSummary()
column_summary.update('run-01', self.columns1)
column_summary.update('run-02', self.columns1)
self.assertEqual(len(column_summary.unique_headers), 1)
Expand All @@ -41,7 +41,7 @@ def test_update(self):
self.assertEqual(context.exception.args[0], "FileHasChangedColumnNames")

def test_update_headers(self):
column_summary = TabularColumnNameSummary()
column_summary = ColumnNameSummary()
pos1 = column_summary.update_headers(self.columns1)
self.assertEqual(pos1, 0)
pos2 = column_summary.update_headers(self.columns1)
Expand All @@ -50,7 +50,7 @@ def test_update_headers(self):
self.assertEqual(pos3, 1)

def test_get_summary(self):
column_summary = TabularColumnNameSummary('Dataset')
column_summary = ColumnNameSummary('Dataset')
column_summary.update('run-01', self.columns1)
column_summary.update('run-02', self.columns1)
summary1 = column_summary.get_summary()
Expand Down
18 changes: 17 additions & 1 deletion tests/tools/analysis/test_tabular_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@ def test_constructor(self):
self.assertIsInstance(dict2, TabularSummary, "TabularSummary: multiple values are okay in constructor")
self.assertEqual(len(dict2.value_info.keys()), 3, "TabularSummary should have keys for each value column")

def test_extract_summary(self):
    """ A summary produced by get_summary can be turned back into a TabularSummary. """
    tab1 = TabularSummary()
    stern_df = get_new_dataframe(self.stern_map_path)
    tab1.update(stern_df)
    sum_info = tab1.get_summary()
    new_tab1 = TabularSummary.extract_summary(sum_info)
    self.assertIsInstance(new_tab1, TabularSummary)
    # The counts are written and read under the same keys, so they must survive the round trip.
    self.assertEqual(new_tab1.total_events, tab1.total_events)
    self.assertEqual(new_tab1.total_files, tab1.total_files)
    # An empty dictionary still yields a usable object.
    tabular_info = {}
    new_tab = TabularSummary.extract_summary(tabular_info)
    self.assertIsInstance(new_tab, TabularSummary)

def test_extract_summary_empty(self):
    """ Extracting from an empty dictionary returns a TabularSummary. """
    extracted = TabularSummary.extract_summary({})
    self.assertIsInstance(extracted, TabularSummary)

def test_get_number_unique_values(self):
dict1 = TabularSummary()
wh_df = get_new_dataframe(self.wh_events_path)
Expand All @@ -54,7 +70,7 @@ def test_get_summary(self):
"TabularSummary categorical_info be columns minus skip and value columns")
summary1 = dict1.get_summary(as_json=False)
self.assertIsInstance(summary1, dict)
self.assertEqual(len(summary1), 5)
self.assertEqual(len(summary1), 7)
summary2 = dict1.get_summary(as_json=True).replace('"', '')
self.assertIsInstance(summary2, str)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import os
import pandas as pd
import unittest
from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary
# from hed.tools.analysis.column_name_summary import ColumnNameSummary
from hed.tools.remodeling.dispatcher import Dispatcher
from hed.tools.remodeling.operations.summarize_column_names_op import ColumnNameSummary, SummarizeColumnNamesOp
from hed.tools.remodeling.operations.summarize_column_names_op import ColumnNamesSummary, SummarizeColumnNamesOp


class Test(unittest.TestCase):
Expand Down Expand Up @@ -77,7 +77,7 @@ def test_summary_op(self):
new_summary = json.loads(json_value)
self.assertIsInstance(new_summary, dict)
merged1 = this_context.merge_all_info()
self.assertIsInstance(merged1, TabularColumnNameSummary)
# self.assertIsInstance(merged1, ColumnNameSummary)
self.assertEqual(len(merged1.file_dict), 3)
self.assertEqual(len(merged1.unique_headers), 2)
with self.assertRaises(ValueError) as except_context:
Expand Down Expand Up @@ -111,7 +111,7 @@ def test_text_summary(self):
self.get_dfs(op, 'run-01', dispatch)
self.get_dfs(op, 'run-02', dispatch)
context = dispatch.summary_dicts['columns']
self.assertIsInstance(context, ColumnNameSummary)
# self.assertIsInstance(context, ColumnNameSummary)
text_summary1 = context.get_text_summary()
self.assertIsInstance(text_summary1, dict)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest
from wordcloud import WordCloud
from hed.tools.visualizations import tag_summary_word_cloud
from hed.tools.visualizations import tag_word_cloud


class TestWordCloudFunctions(unittest.TestCase):

Expand All @@ -23,14 +24,14 @@ def test_convert_summary_to_word_dict(self):
}
expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7}

word_dict = tag_summary_word_cloud.convert_summary_to_word_dict(summary_json)
word_dict = tag_word_cloud.summary_to_dict(summary_json)
self.assertEqual(word_dict, expected_output)

def test_create_wordcloud(self):
word_dict = {'tag1': 5, 'tag2': 3, 'tag3': 7}
width = 400
height = 200
wc = tag_summary_word_cloud.create_wordcloud(word_dict, width, height)
wc = tag_word_cloud.create_wordcloud(word_dict, width, height)

self.assertIsInstance(wc, WordCloud)
self.assertEqual(wc.width, width)
Expand All @@ -40,12 +41,12 @@ def test_create_wordcloud_with_empty_dict(self):
# Test creation of word cloud with an empty dictionary
word_dict = {}
with self.assertRaises(ValueError):
tag_summary_word_cloud.create_wordcloud(word_dict)
tag_word_cloud.create_wordcloud(word_dict)

def test_create_wordcloud_with_single_word(self):
# Test creation of word cloud with a single word
word_dict = {'single_word': 1}
wc = tag_summary_word_cloud.create_wordcloud(word_dict)
wc = tag_word_cloud.create_wordcloud(word_dict)
self.assertIsInstance(wc, WordCloud)
# Check that the single word is in the word cloud
self.assertIn('single_word', wc.words_)
self.assertIn('single_word', wc.words_)