Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hed/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .remodeling.dispatcher import Dispatcher
from .remodeling.backup_manager import BackupManager
from .remodeling.operations.base_context import BaseContext
from .remodeling.operations.base_summary import BaseSummary
from .remodeling.operations.base_op import BaseOp
from .remodeling.operations.factor_column_op import FactorColumnOp
from .remodeling.operations.factor_hed_tags_op import FactorHedTagsOp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json


class ColumnNameSummary:
class TabularColumnNameSummary:
def __init__(self, name=''):
self.name = name
self.file_dict = {}
Expand Down
1 change: 1 addition & 0 deletions hed/tools/analysis/tabular_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def update(self, data, name=None):

Parameters:
data (DataFrame, str, or list): DataFrame containing data to update.
name (str): Name of the summary

"""

Expand Down
8 changes: 4 additions & 4 deletions hed/tools/remodeling/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(self, operation_list, data_root=None,
raise ValueError("InvalidOperationList", f"{these_errors}")
self.parsed_ops = op_list
self.hed_schema = get_schema(hed_versions)
self.context_dict = {}
self.summary_dicts = {}

def get_summaries(self, file_formats=['.txt', '.json']):
""" Return the summaries in a dictionary of strings suitable for saving or archiving.
Expand All @@ -62,8 +62,8 @@ def get_summaries(self, file_formats=['.txt', '.json']):

summary_list = []
time_stamp = '_' + get_timestamp()
for context_name, context_item in self.context_dict.items():
file_base = context_item.context_filename
for context_name, context_item in self.summary_dicts.items():
file_base = context_item.op.summary_filename
if self.data_root:
file_base = extract_suffix_path(self.data_root, file_base)
file_base = clean_filename(file_base)
Expand Down Expand Up @@ -171,7 +171,7 @@ def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="s
if not summary_dir:
summary_dir = self.get_summary_save_dir()
os.makedirs(summary_dir, exist_ok=True)
for context_name, context_item in self.context_dict.items():
for context_name, context_item in self.summary_dicts.items():
context_item.save(summary_dir, save_formats, individual_summaries=individual_summaries)

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
""" Abstract base class for the context of summary operations. """
""" Abstract base class for the contents of summary operations. """

import os
from abc import ABC, abstractmethod
import json
from hed.tools.util.io_util import get_timestamp


class BaseContext(ABC):
""" Abstract base class for summary contexts. Should not be instantiated.
class BaseSummary(ABC):
""" Abstract base class for summary contents. Should not be instantiated.

Parameters:
context_type (str) Type of summary.
context_name (str) Printable name -- should be unique.
context_filename (str) Base filename for saving the context.
sum_op (BaseOp): Operation corresponding to this summary.

"""

DISPLAY_INDENT = " "
INDIVIDUAL_SUMMARIES_PATH = 'individual_summaries'

def __init__(self, context_type, context_name, context_filename):
self.context_type = context_type
self.context_name = context_name
self.context_filename = context_filename
def __init__(self, sum_op):
self.op = sum_op
self.summary_dict = {}

def get_summary_details(self, include_individual=True):
Expand All @@ -39,19 +35,19 @@ def get_summary_details(self, include_individual=True):
- The 'Individual files' value is a dictionary whose keys are file names and values are
their corresponding summaries.

Users are expected to provide _merge_all and _get_details_dict to support this.
Users are expected to provide merge_all_info and get_details_dict to support this.

"""
merged_summary = self._merge_all()
merged_summary = self.merge_all_info()
if merged_summary:
details = self._get_details_dict(merged_summary)
details = self.get_details_dict(merged_summary)
else:
details = "Overall summary unavailable"

summary_details = {"Dataset": details, "Individual files": {}}
if include_individual:
for name, count in self.summary_dict.items():
summary_details["Individual files"][name] = self._get_details_dict(count)
summary_details["Individual files"][name] = self.get_details_dict(count)
return summary_details

def get_summary(self, individual_summaries="separate"):
Expand All @@ -71,8 +67,8 @@ def get_summary(self, individual_summaries="separate"):
"""
include_individual = individual_summaries == "separate" or individual_summaries == "consolidated"
summary_details = self.get_summary_details(include_individual=include_individual)
dataset_summary = {"Context name": self.context_name, "Context type": self.context_type,
"Context filename": self.context_filename, "Overall summary": summary_details['Dataset']}
dataset_summary = {"Summary name": self.op.summary_name, "Summary type": self.op.SUMMARY_TYPE,
"Summary filename": self.op.summary_filename, "Overall summary": summary_details['Dataset']}
summary = {"Dataset": dataset_summary, "Individual files": {}}
if summary_details["Individual files"]:
summary["Individual files"] = self.get_individual(summary_details["Individual files"],
Expand All @@ -83,8 +79,8 @@ def get_individual(self, summary_details, separately=True):
individual_dict = {}
for name, name_summary in summary_details.items():
if separately:
individual_dict[name] = {"Context name": self.context_name, "Context type": self.context_type,
"Context filename": self.context_filename, "File summary": name_summary}
individual_dict[name] = {"Summary name": self.op.summary_name, "summary type": self.op.SUMMARY_TYPE,
"Summary filename": self.op.summary_filename, "File summary": name_summary}
else:
individual_dict[name] = name_summary
return individual_dict
Expand All @@ -101,14 +97,16 @@ def get_text_summary_details(self, include_individual=True):
def get_text_summary(self, individual_summaries="separate"):
include_individual = individual_summaries == "separate" or individual_summaries == "consolidated"
summary_details = self.get_text_summary_details(include_individual=include_individual)
summary = {"Dataset": f"Context name: {self.context_name}\n" + f"Context type: {self.context_type}\n" +
f"Context filename: {self.context_filename}\n\n" + f"Overall summary:\n{summary_details['Dataset']}"}
summary = {"Dataset": f"Summary name: {self.op.summary_name}\n" +
f"Summary type: {self.op.SUMMARY_TYPE}\n" +
f"Summary filename: {self.op.summary_filename}\n\n" +
f"Overall summary:\n{summary_details['Dataset']}"}
if individual_summaries == "separate":
summary["Individual files"] = {}
for name, name_summary in summary_details["Individual files"].items():
summary["Individual files"][name] = f"Context name: {self.context_name}\n" + \
f"Context type: {self.context_type}\n" + \
f"Context filename: {self.context_filename}\n\n" + \
summary["Individual files"][name] = f"Summary name: {self.op.summary_name}\n" + \
f"Summary type: {self.op.SUMMARY_TYPE}\n" + \
f"Summary filename: {self.op.summary_filename}\n\n" + \
f"Summary for {name}:\n{name_summary}"
elif include_individual:
ind_list = []
Expand All @@ -132,17 +130,17 @@ def save(self, save_dir, file_formats=['.txt'], individual_summaries="separate")

def _save_summary_files(self, save_dir, file_format, summary, individual_summaries):
""" Save the files in the appropriate format.

Parameters:
save_dir (str): Path to the directory in which the summaries will be saved.
file_format (str): string representing the extension (including .), '.txt' or '.json'.
summary (dictionary): Dictionary of summaries (has "Dataset" and "Individual files" keys).

"""
time_stamp = '_' + get_timestamp()
this_save = os.path.join(save_dir, self.context_name + '/')
this_save = os.path.join(save_dir, self.op.summary_name + '/')
os.makedirs(os.path.realpath(this_save), exist_ok=True)
filename = os.path.realpath(os.path.join(this_save, self.context_filename + time_stamp + file_format))
filename = os.path.realpath(os.path.join(this_save, self.op.summary_filename + time_stamp + file_format))
individual = summary.get("Individual files", {})
if individual_summaries == "none" or not individual:
self.dump_summary(filename, summary["Dataset"])
Expand All @@ -159,7 +157,7 @@ def _save_summary_files(self, save_dir, file_format, summary, individual_summari

def _get_summary_filepath(self, individual_dir, name, time_stamp, file_format):
""" Return the filepath for the summary including the timestamp.

Parameters:
individual_dir (str): path of the directory in which the summary should be stored.
name (str): Path of the original file from which the summary was extracted.
Expand All @@ -175,7 +173,7 @@ def _get_summary_filepath(self, individual_dir, name, time_stamp, file_format):
match = True
filename = None
while match:
filename = f"{self.context_filename}_{this_name}_{count}{time_stamp}{file_format}"
filename = f"{self.op.summary_filename}_{this_name}_{count}{time_stamp}{file_format}"
filename = os.path.realpath(os.path.join(individual_dir, filename))
if not os.path.isfile(filename):
break
Expand Down Expand Up @@ -207,7 +205,7 @@ def dump_summary(filename, summary):
text_file.write(summary)

@abstractmethod
def _get_details_dict(self, summary_info):
def get_details_dict(self, summary_info):
""" Return the summary-specific information.

Parameters:
Expand All @@ -217,30 +215,30 @@ def _get_details_dict(self, summary_info):
dict: dictionary with the results.

Notes:
Abstract method be implemented by each individual context summary.
Abstract method to be implemented by each individual summary.

"""
raise NotImplementedError

@abstractmethod
def _merge_all(self):
def merge_all_info(self):
""" Return merged information.

Returns:
object: Consolidated summary of information.

Notes:
Abstract method be implemented by each individual context summary.
Abstract method to be implemented by each individual summary.

"""
raise NotImplementedError

@abstractmethod
def update_context(self, context_dict):
def update_summary(self, summary_dict):
""" Method to update summary for a given tabular input.

Parameters:
context_dict (dict) A context specific dictionary with the update information.
summary_dict (dict): A summary-specific dictionary with the update information.

"""
raise NotImplementedError
48 changes: 24 additions & 24 deletions hed/tools/remodeling/operations/summarize_column_names_op.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
""" Summarize the column names in a collection of tabular files. """

from hed.tools.analysis.column_name_summary import ColumnNameSummary
from hed.tools.analysis.tabular_column_name_summary import TabularColumnNameSummary
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.tools.remodeling.operations.base_context import BaseContext
from hed.tools.remodeling.operations.base_summary import BaseSummary


class SummarizeColumnNamesOp(BaseOp):
Expand Down Expand Up @@ -60,65 +60,65 @@ def do_op(self, dispatcher, df, name, sidecar=None):
DataFrame: The input DataFrame, returned unchanged (only its column names are recorded).

Side-effect:
Updates the context.
Updates the relevant summary.

"""

summary = dispatcher.context_dict.get(self.summary_name, None)
summary = dispatcher.summary_dicts.get(self.summary_name, None)
if not summary:
summary = ColumnNameSummaryContext(self)
dispatcher.context_dict[self.summary_name] = summary
summary.update_context({"name": name, "column_names": list(df.columns)})
summary = ColumnNameSummary(self)
dispatcher.summary_dicts[self.summary_name] = summary
summary.update_summary({"name": name, "column_names": list(df.columns)})
return df


class ColumnNameSummaryContext(BaseContext):
class ColumnNameSummary(BaseSummary):

def __init__(self, sum_op):
super().__init__(sum_op.SUMMARY_TYPE, sum_op.summary_name, sum_op.summary_filename)
super().__init__(sum_op)

def update_context(self, new_context):
def update_summary(self, new_info):
""" Update the summary for a given tabular input file.

Parameters:
new_context (dict): A dictionary with the parameters needed to update a summary.
new_info (dict): A dictionary with the parameters needed to update a summary.

Notes:
- The summary information is kept in separate ColumnNameSummary objects for each file.
- The summary information is kept in separate TabularColumnNameSummary objects for each file.
- The summary needs a "name" str and a "column_names" list.
- The summary uses ColumnNameSummary as the summary object.
- The summary uses TabularColumnNameSummary as the summary object.
"""
name = new_context['name']
name = new_info['name']
if name not in self.summary_dict:
self.summary_dict[name] = ColumnNameSummary(name=name)
self.summary_dict[name].update(name, new_context["column_names"])
self.summary_dict[name] = TabularColumnNameSummary(name=name)
self.summary_dict[name].update(name, new_info["column_names"])

def _get_details_dict(self, column_summary):
def get_details_dict(self, column_summary):
""" Return the summary dictionary extracted from a TabularColumnNameSummary.

Parameters:
column_summary (ColumnNameSummary): A column name summary for the data file.
column_summary (TabularColumnNameSummary): A column name summary for the data file.

Returns:
dict - a dictionary with the summary information for column names.

"""
return column_summary.get_summary()

def _merge_all(self):
""" Create a ColumnNameSummary containing the overall dataset summary.
def merge_all_info(self):
""" Create a TabularColumnNameSummary containing the overall dataset summary.

Returns:
ColumnNameSummary - the overall summary object for column names.
TabularColumnNameSummary - the overall summary object for column names.

"""
all_sum = ColumnNameSummary(name='Dataset')
all_sum = TabularColumnNameSummary(name='Dataset')
for key, counts in self.summary_dict.items():
for name, pos in counts.file_dict.items():
all_sum.update(name, counts.unique_headers[pos])
return all_sum

def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT):
""" Return a formatted string with the summary for the indicated name.

Parameters:
Expand All @@ -139,7 +139,7 @@ def _get_result_string(self, name, result, indent=BaseContext.DISPLAY_INDENT):
return f"{indent}{str(columns['Column names'])}"

@staticmethod
def _get_dataset_string(result, indent=BaseContext.DISPLAY_INDENT):
def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT):
""" Return a string with the overall summary for all of the tabular files.

Parameters:
Expand Down
Loading