From 8daf3839600b4a15dbc6dbd664fcf09d4bc692f7 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:40:13 -0500 Subject: [PATCH 1/3] Made the get_details_dict method return a uniform dict with Specifics --- .../remodeling/operations/base_summary.py | 11 +++- .../operations/summarize_column_names_op.py | 15 +++-- .../operations/summarize_column_values_op.py | 15 ++++- .../operations/summarize_definitions_op.py | 2 + .../operations/summarize_hed_tags_op.py | 12 ++-- .../operations/summarize_hed_type_op.py | 29 +++++---- .../operations/summarize_hed_validation_op.py | 41 ++++++------ .../summarize_sidecar_from_events_op.py | 30 +++++---- .../tag_word_cloud.py | 0 hedtools/conda_build_info.txt | 24 ------- hedtools/meta.yaml | 63 ------------------- .../test_summarize_column_names_op.py | 2 +- .../test_summarize_column_values_op.py | 18 +++--- .../test_summarize_definitions_op.py | 10 +++ .../operations/test_summarize_hed_type_op.py | 4 +- .../test_summarize_hed_validation_op.py | 40 ++++++++++-- .../visualizations/test_tag_word_cloud.py | 2 +- 17 files changed, 153 insertions(+), 165 deletions(-) rename hed/tools/{visualizations => visualization}/tag_word_cloud.py (100%) delete mode 100644 hedtools/conda_build_info.txt delete mode 100644 hedtools/meta.yaml diff --git a/hed/tools/remodeling/operations/base_summary.py b/hed/tools/remodeling/operations/base_summary.py index 9153ed670..2b732ae2d 100644 --- a/hed/tools/remodeling/operations/base_summary.py +++ b/hed/tools/remodeling/operations/base_summary.py @@ -38,9 +38,9 @@ def get_summary_details(self, include_individual=True): Users are expected to provide merge_all_info and get_details_dict to support this. 
""" - merged_summary = self.merge_all_info() - if merged_summary: - details = self.get_details_dict(merged_summary) + merged_counts = self.merge_all_info() + if merged_counts: + details = self.get_details_dict(merged_counts) else: details = "Overall summary unavailable" @@ -220,6 +220,11 @@ def get_details_dict(self, summary_info): Notes: Abstract method be implemented by each individual summary. + Notes: + The expected return value is a dictionary of the form: + + {"Name": "", "Total events": 0, "Total files": 0, "Files": [], "Specifics": {}} + """ raise NotImplementedError diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index 2201827f9..fd70e3f8d 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -108,7 +108,7 @@ def get_details_dict(self, column_summary): return {"Name": summary['Summary name'], "Total events": "n/a", "Total files": summary['Number files'], "Files": [name for name in column_summary.file_dict.keys()], - "Columns": summary['Columns']} + "Specifics": {"Columns": summary['Columns']}} def merge_all_info(self): """ Create a ColumnNameSummary containing the overall dataset summary. @@ -140,8 +140,11 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): """ if name == "Dataset": return self._get_dataset_string(result, indent) - columns = result["Columns"][0] - return f"{indent}{str(columns['Column names'])}" + columns = result.get("Specifics", {}).get("Columns", []) + if columns: + return f"{indent}{str(columns[0])}" + else: + return "" @staticmethod def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): @@ -155,8 +158,10 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. 
""" - sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"] - for element in result.get("Unique headers", []): + sum_list = [f"Dataset: Number of files={result.get('Total files', 0)}"] + specifics = result.get("Specifics", {}) + columns = specifics.get("Columns", {}) + for element in columns: sum_list.append(f"{indent}Columns: {str(element['Column names'])}") for file in element.get("Files", []): sum_list.append(f"{indent}{indent}{file}") diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index 0c80c6382..825594aea 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -128,7 +128,14 @@ def get_details_dict(self, summary): for key, dict_entry in this_summary['Categorical columns'].items(): num_disp, sorted_tuples = ColumnValueSummary.sort_dict(dict_entry, reverse=True) this_summary['Categorical columns'][key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)]) - return this_summary + return {"Name": this_summary['Name'], "Total events": this_summary["Total events"], + "Total files": this_summary['Total files'], + "Files": [name for name in this_summary['Files'].keys()], + "Specifics": {"Value columns": this_summary['Value columns'].keys(), + "Skip columns": this_summary['Skip columns'], + "Value columns": this_summary['Value columns'], + "Categorical columns": this_summary['Categorical columns'], + "Categorical counts": this_summary['Categorical counts']}} def merge_all_info(self): """ Create a TabularSummary containing the overall dataset summary. 
@@ -198,10 +205,11 @@ def _get_dataset_string(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " f"Total files={result.get('Total files', 0)}"] - cat_string = self._get_categorical_string(result, offset="", indent=indent) + specifics = result["Specifics"] + cat_string = self._get_categorical_string(specifics, offset="", indent=indent) if cat_string: sum_list.append(cat_string) - val_cols = result.get("Value columns", {}) + val_cols = specifics.get("Value columns", {}) if val_cols: sum_list.append(ColumnValueSummary._get_value_string(val_cols, offset="", indent=indent)) return "\n".join(sum_list) @@ -219,6 +227,7 @@ def _get_individual_string(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Total events={result.get('Total events', 0)}"] + specifics = result.get("Specifics", {}) cat_cols = result.get("Categorical columns", {}) if cat_cols: sum_list.append(self._get_categorical_string(cat_cols, offset=indent, indent=indent)) diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py index 3169d63d0..6be941352 100644 --- a/hed/tools/remodeling/operations/summarize_definitions_op.py +++ b/hed/tools/remodeling/operations/summarize_definitions_op.py @@ -128,6 +128,8 @@ def get_details_dict(self, def_gatherer): known_defs_summary.update(ambiguous_defs_summary) known_defs_summary.update(errors_summary) + return {"Name": "", "Total events": 0, "Total files": 0, "Files": [], "Specifics": known_defs_summary} + return known_defs_summary def merge_all_info(self): diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 5a504fed1..2379dc4a5 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -118,24 +118,24 @@ def update_summary(self, new_info): 
counts.update_event_counts(hed, new_info['name']) self.summary_dict[new_info["name"]] = counts - def get_details_dict(self, merge_counts): + def get_details_dict(self, tag_counts): """ Return the summary-specific information in a dictionary. Parameters: - merge_counts (HedTagCounts): Contains the counts of tags in the dataset. + tag_counts (HedTagCounts): Contains the counts of tags in the dataset. Returns: dict: dictionary with the summary results. """ - template, unmatched = merge_counts.organize_tags(self.tags) + template, unmatched = tag_counts.organize_tags(self.tags) details = {} for key, key_list in self.tags.items(): details[key] = self._get_details(key_list, template, verbose=True) leftovers = [value.get_info(verbose=True) for value in unmatched] - return {"Name": merge_counts.name, "Total events": merge_counts.total_events, - "Total files": len(merge_counts.files.keys()), - "Files": [name for name in merge_counts.files.keys()], + return {"Name": tag_counts.name, "Total events": tag_counts.total_events, + "Total files": len(tag_counts.files.keys()), + "Files": [name for name in tag_counts.files.keys()], "Specifics": {"Main tags": details, "Other tags": leftovers}} def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 9a27d22d2..04c1ad89b 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -95,7 +95,7 @@ def update_summary(self, new_info): Parameters: new_info (dict): A dictionary with the parameters needed to update a summary. - Notes: + Notes: - The summary needs a "name" str, a "schema", a "df, and a "Sidecar". 
""" @@ -104,7 +104,7 @@ def update_summary(self, new_info): if sidecar and not isinstance(sidecar, Sidecar): sidecar = Sidecar(sidecar) input_data = TabularInput(new_info['df'], sidecar=sidecar, name=new_info['name']) - hed_strings, definitions = get_assembled(input_data, sidecar, new_info['schema'], + hed_strings, definitions = get_assembled(input_data, sidecar, new_info['schema'], extra_def_dicts=None, join_columns=True, expand_defs=False) context_manager = HedContextManager(hed_strings, new_info['schema']) type_values = HedTypeValues(context_manager, definitions, new_info['name'], type_tag=self.type_tag) @@ -124,7 +124,12 @@ def get_details_dict(self, counts): dict: dictionary with the summary results. """ - return counts.get_summary() + summary = counts.get_summary() + files = summary.get('files', []) + return {"Name": summary.get("name", ""), "Total events": summary.get("total_events", 0), + "Total files": len(files), "Files": files, + "Specifics": {"Type tag": summary.get('type_tag', 'condition-variable'), + "Type info": summary.get('details', {})}} def merge_all_info(self): """ Create a HedTypeCounts containing the overall dataset HED type summary. @@ -170,11 +175,12 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. 
""" - details = result.get('details', {}) - sum_list = [f"Dataset: Type={result['type_tag']} Type values={len(details)} " - f"Total events={result.get('total_events', 0)} Total files={len(result.get('files', []))}"] + specifics = result.get('Specifics', {}) + type_info = specifics.get('Type info', {}) + sum_list = [f"Dataset: Type={specifics.get('Type tag', 'condition-variable')} Type values={len(type_info)} " + f"Total events={result.get('Total events', 0)} Total files={len(result.get('Files', []))}"] - for key, item in details.items(): + for key, item in type_info.items(): str1 = f"{item['events']} event(s) out of {item['total_events']} total events in " + \ f"{len(item['files'])} file(s)" if item['level_counts']: @@ -200,11 +206,12 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - details = result.get('details', {}) - sum_list = [f"Type={result['type_tag']} Type values={len(details)} " - f"Total events={result.get('total_events', 0)}"] + specifics = result.get('Specifics', {}) + type_info = specifics.get('Type info', {}) + sum_list = [f"Type={specifics.get('Type tag', 'condition-variable')} Type values={len(type_info)} " + f"Total events={result.get('Total events', 0)}"] - for key, item in details.items(): + for key, item in type_info.items(): sum_list.append(f"{indent*2}{key}: {item['levels']} levels in {item['events']} events") str1 = "" if item['direct_references']: diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index d643e533d..18d417f0e 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -101,17 +101,16 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): This gets the error list from "sidecar_issues" and "event_issues". 
""" - - if result["is_merged"]: - sum_list = [f"{name}: [{result['total_sidecar_files']} sidecar files, " - f"{result['total_event_files']} event files]"] - sum_list = sum_list + self.get_error_list(result['sidecar_issues'], count_only=True, indent=indent) - sum_list = sum_list + self.get_error_list(result['event_issues'], count_only=True, indent=indent) + specifics = result.get("Specifics", {}) + sum_list = [f"{name}: [{len(specifics['sidecar_files'])} sidecar files, " + f"{len(specifics['event_files'])} event files]"] + if specifics.get('is_merged'): + sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], count_only=True, indent=indent) + sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=True, indent=indent) else: - sum_list = [f"{indent}{name}: {result['total_sidecar_files']} sidecar files"] - sum_list = sum_list + self.get_error_list(result['sidecar_issues'], indent=indent*2) - if result['validation_completed']: - sum_list = sum_list + self.get_error_list(result['event_issues'], count_only=False, indent=indent*2) + sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], indent=indent*2) + if specifics['validation_completed']: + sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=False, indent=indent*2) else: sum_list = sum_list + [f"{indent*2}Event file validation was incomplete because of sidecar errors"] return "\n".join(sum_list) @@ -127,14 +126,14 @@ def update_summary(self, new_info): """ results = self.get_empty_results() - results["total_event_files"] = 1 + results["event_files"].append(new_info["name"]) results["event_issues"][new_info["name"]] = [] - self.summary_dict[new_info["name"]] = results sidecar = new_info.get('sidecar', None) filtered_issues = [] if sidecar: if not isinstance(sidecar, Sidecar): sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) + results["sidecar_files"].append(sidecar.name) 
results["sidecar_issues"][sidecar.name] = [] sidecar_issues = sidecar.validate(new_info['schema']) filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) @@ -142,7 +141,6 @@ def update_summary(self, new_info): sidecar_issues = filtered_issues results['sidecar_issues'][sidecar.name] = sidecar_issues results['total_sidecar_issues'] = len(sidecar_issues) - results['total_sidecar_files'] = 1 if not filtered_issues: results['validation_completed'] = True input_data = TabularInput(new_info['df'], sidecar=sidecar) @@ -151,6 +149,7 @@ def update_summary(self, new_info): issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR) results['event_issues'][new_info["name"]] = issues results['total_event_issues'] = len(issues) + self.summary_dict[new_info["name"]] = results def get_details_dict(self, summary_info): """Return the summary details from the summary_info. @@ -162,7 +161,11 @@ def get_details_dict(self, summary_info): dict: Same summary_info as was passed in. """ - return summary_info + + return {"Name": "", "Total events": "n/a", + "Total files": len(summary_info.get("event_files", [])), + "Files": summary_info.get("event_files", []), + "Specifics": summary_info} def merge_all_info(self): """ Create a dictionary containing all of the errors in the dataset. 
@@ -175,9 +178,10 @@ def merge_all_info(self): results = self.get_empty_results() results["is_merged"] = True for key, ind_results in self.summary_dict.items(): - results["total_event_files"] += ind_results["total_event_files"] + results["event_files"].append(key) results["total_event_issues"] += ind_results["total_event_issues"] - + results["total_sidecar_issues"] += ind_results["total_sidecar_issues"] + results["sidecar_files"] = results["sidecar_files"] + ind_results["sidecar_files"] for ikey, errors in ind_results["sidecar_issues"].items(): results["sidecar_issues"][ikey] = errors for ikey, errors in ind_results["event_issues"].items(): @@ -186,13 +190,12 @@ def merge_all_info(self): f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues" else: results["event_issues"][ikey] = f"{len(errors)}" - results["total_sidecar_files"] += ind_results["total_sidecar_files"] return results @staticmethod def get_empty_results(): - return {"total_event_files": 0, "total_event_issues": 0, "event_issues": {}, "is_merged": False, - "total_sidecar_files": 0, "total_sidecar_issues": 0, "sidecar_issues": {}, + return {"event_files": [], "total_event_issues": 0, "event_issues": {}, "is_merged": False, + "sidecar_files": [], "total_sidecar_issues": 0, "sidecar_issues": {}, "validation_completed": False} @staticmethod diff --git a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py index e0657ffef..28a0b9389 100644 --- a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py +++ b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py @@ -115,9 +115,13 @@ def get_details_dict(self, summary_info): """ - return {"files": summary_info.files, "total_files": summary_info.total_files, - "total_events": summary_info.total_events, "skip_cols": summary_info.skip_cols, - "sidecar": summary_info.extract_sidecar_template()} + return {"Name": 
summary_info.name, "Total events": summary_info.total_events, + "Total files": summary_info.total_files, + "Files": summary_info.files.keys(), + "Specifics": {"Categorical info": summary_info.categorical_info, + "Value info": summary_info.value_info, + "Skip columns": summary_info.skip_cols, + "Sidecar": summary_info.extract_sidecar_template()}} def merge_all_info(self): """ Merge summary information from all of the files @@ -165,11 +169,12 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - sum_list = [f"Dataset: Total events={result.get('total_events', 0)} " - f"Total files={result.get('total_files', 0)}", - f"Skip columns: {str(result.get('skip_cols', []))}", - f"Value columns: {str(result.get('value_cols', []))}", - f"Sidecar:\n{json.dumps(result['sidecar'], indent=indent)}"] + specifics = result.get("Specifics", {}) + sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " + f"Total files={result.get('Total files', 0)}", + f"Skip columns: {str(specifics.get('Skip columns', []))}", + f"Value columns: {str(specifics.get('Value info', {}).keys())}", + f"Sidecar:\n{json.dumps(specifics.get('Sidecar', {}), indent=indent)}"] return "\n".join(sum_list) @staticmethod @@ -184,8 +189,9 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. 
""" - sum_list = [f"Total events={result.get('total_events', 0)}", - f"Skip columns: {str(result.get('skip_cols', []))}", - f"Value columns: {str(result.get('value_cols', []))}", - f"Sidecar:\n{json.dumps(result['sidecar'], indent=indent)}"] + specifics = result.get("Specifics", {}) + sum_list = [f"Total events={result.get('Total events', 0)}", + f"Skip columns: {str(specifics.get('Skip columns', []))}", + f"Value columns: {str(specifics.get('Value info', {}).keys())}", + f"Sidecar:\n{json.dumps(specifics['Sidecar'], indent=indent)}"] return "\n".join(sum_list) diff --git a/hed/tools/visualizations/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py similarity index 100% rename from hed/tools/visualizations/tag_word_cloud.py rename to hed/tools/visualization/tag_word_cloud.py diff --git a/hedtools/conda_build_info.txt b/hedtools/conda_build_info.txt deleted file mode 100644 index 0ea449c6f..000000000 --- a/hedtools/conda_build_info.txt +++ /dev/null @@ -1,24 +0,0 @@ -To create the base meta.yaml I used grayskull: -https://conda-forge.org/docs/maintainer/adding_pkgs.html#build - -Commands for building(uploading to conda-forge - note this is just for testing): -# Make sure conda forge is allowed -conda config --add channels conda-forge -# Make sure we try to always get from the same repo if there are conflicts -# see https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html for more details on channels -# this avoids issues with conflicting c libraries and similar. -conda config --set channel_priority strict -# Actually build the recipe. 
-conda build hedtools - -To locally install it after building: -conda install --use-local hedtools - -Then you follow the instructions here to make a PR(this is the actual upload to conda forge): -https://conda-forge.org/docs/maintainer/adding_pkgs.html#staging-test-locally - -To install from conda-forge(in theory, doesn't work yet): -# Note the -c conda-forge shouldn't be required if you called the build config step above.(but users won't have done this) -conda install hedtools -c conda-forge - - diff --git a/hedtools/meta.yaml b/hedtools/meta.yaml deleted file mode 100644 index 80a8fcdef..000000000 --- a/hedtools/meta.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{% set name = "hedtools" %} -{% set version = "0.2.0" %} - -package: - name: {{ name|lower }} - version: {{ version }} - -source: - url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/hedtools-{{ version }}.tar.gz - sha256: 2452f8e4e79e50750147a437410ccfd9ea04ad4e6390edc14dbcd663a7a9fa08 - -build: - entry_points: - - run_remodel=hed.tools.remodeling.cli.run_remodel:main - - run_remodel_backup=hed.tools.remodeling.cli.run_remodel_backup:main - - run_remodel_restore=hed.tools.remodeling.cli.run_remodel_restore:main - noarch: python - script: {{ PYTHON }} -m pip install . -vv - number: 0 - -requirements: - host: - - python >=3.7 - - setuptools >=42 - - versioneer-518 - - pip - run: - - python >=3.7 - - defusedxml - - et-xmlfile - - inflect - - jdcal - - numpy - - openpyxl - - pandas - - portalocker - - python-dateutil - - pytz - - semantic_version - - six - - werkzeug - -test: - imports: - hed - commands: - - run_remodel --help - - run_remodel_backup --help - - run_remodel_restore --help - requires: - - pip - -about: - home: https://github.com/hed-standard/hed-python/ - summary: HED validation, summary, and analysis tools. 
- license: MIT - license_file: LICENSE - -extra: - recipe-maintainers: - - hed-maintainers - - VisLab - - IanCa \ No newline at end of file diff --git a/tests/tools/remodeling/operations/test_summarize_column_names_op.py b/tests/tools/remodeling/operations/test_summarize_column_names_op.py index 2ef5eee27..2aadd8e72 100644 --- a/tests/tools/remodeling/operations/test_summarize_column_names_op.py +++ b/tests/tools/remodeling/operations/test_summarize_column_names_op.py @@ -98,7 +98,7 @@ def test_summary(self): dataset_sum = summary['Dataset'] json_str = json.dumps(dataset_sum) json_obj = json.loads(json_str) - columns = json_obj["Overall summary"]["Columns"] + columns = json_obj["Overall summary"]["Specifics"]["Columns"] self.assertEqual(len(columns), 1) self.assertEqual(len(columns[0]['Files']), 2) ind_sum = summary['Individual files'] diff --git a/tests/tools/remodeling/operations/test_summarize_column_values_op.py b/tests/tools/remodeling/operations/test_summarize_column_values_op.py index 2d69655c0..c3cc322d1 100644 --- a/tests/tools/remodeling/operations/test_summarize_column_values_op.py +++ b/tests/tools/remodeling/operations/test_summarize_column_values_op.py @@ -5,7 +5,6 @@ from hed.tools.remodeling.dispatcher import Dispatcher from hed.tools.remodeling.operations.summarize_column_values_op import \ ColumnValueSummary, SummarizeColumnValuesOp -from hed.tools.util.io_util import get_file_list class Test(unittest.TestCase): @@ -68,17 +67,16 @@ def test_get_summary(self): self.get_dfs(sum_op, 'name1', dispatch) cont = dispatch.summary_dicts - context1 = cont.get("test summary", None) - self.assertIsInstance(context1, ColumnValueSummary, "get_summary testing ColumnValueSummary") - # summary1 = context1.get_summary() - # self.assertIsInstance(summary1, dict, "get_summary returns a dictionary") - # self.assertIsInstance(summary1["Dataset"], dict) - summary1a = context1.get_summary() + context = cont.get("test summary", None) + self.assertIsInstance(context, 
ColumnValueSummary, "get_summary testing ColumnValueSummary") + summary1a = context.get_summary() self.assertIsInstance(summary1a, dict) self.assertIsInstance(summary1a["Dataset"], dict) - text_summary = context1.get_text_summary(individual_summaries="separate") - self.assertIsInstance(text_summary, dict) - self.assertIsInstance(text_summary["Dataset"], str) + text_summary1 = context.get_text_summary(individual_summaries=None) + self.assertIsInstance(text_summary1, dict) + self.assertIsInstance(text_summary1["Dataset"], str) + text_summary1a = context.get_text_summary(individual_summaries="separate") + self.assertIsInstance(text_summary1a, dict) self.get_dfs(sum_op, 'name2', dispatch) self.get_dfs(sum_op, 'name3', dispatch) context2 = dispatch.summary_dicts.get(parms['summary_name'], None) diff --git a/tests/tools/remodeling/operations/test_summarize_definitions_op.py b/tests/tools/remodeling/operations/test_summarize_definitions_op.py index 4b4784f64..6cfddbd90 100644 --- a/tests/tools/remodeling/operations/test_summarize_definitions_op.py +++ b/tests/tools/remodeling/operations/test_summarize_definitions_op.py @@ -59,6 +59,16 @@ def test_summary(self): self.assertIsInstance(dispatch.summary_dicts[sum_op.summary_name], DefinitionSummary) # print(str(dispatch.summary_dicts[sum_op.summary_name].get_text_summary()['Dataset'])) + cont = dispatch.summary_dicts + context = cont.get("get_definition_summary", None) + self.assertIsInstance(context, DefinitionSummary, "get_summary testing DefinitionSummary") + summary1a = context.get_summary() + self.assertIsInstance(summary1a, dict) + self.assertIsInstance(summary1a["Dataset"], dict) + text_summary1 = context.get_text_summary(individual_summaries=None) + self.assertIsInstance(text_summary1, dict) + self.assertIsInstance(text_summary1["Dataset"], str) + def test_summary_errors(self): dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) parms = json.loads(self.json_parms) diff --git 
a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py index faa3b79bd..642539967 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py @@ -75,8 +75,8 @@ def test_summary(self): sum_op.do_op(dispatch, dispatch.prep_data(df), 'run-02', sidecar=self.sidecar_path) context2 = dispatch.summary_dicts['AOMIC_condition_variables'] summary2 = context2.get_summary(individual_summaries="separate") - self.assertEqual(summary2['Dataset']['Overall summary']['files'][0], 'run-01') - self.assertEqual(len(summary2['Dataset']['Overall summary']['files']), 2) + self.assertEqual(summary2['Dataset']['Overall summary']['Files'][0], 'run-01') + self.assertEqual(len(summary2['Dataset']['Overall summary']['Files']), 2) summary2a = context2.get_summary(individual_summaries="separate") self.assertIsInstance(summary2a["Individual files"]["run-02"], dict) diff --git a/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py b/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py index 7b76e7c1c..9ae1ef776 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py @@ -89,7 +89,7 @@ def test_get_summary_details(self): sum_obj4 = sum_context2.get_summary_details(include_individual=True) self.assertIsInstance(sum_obj4, dict) - def test_get_summary_text_summary(self): + def test_get_summary(self): dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) parms = json.loads(self.json_parms) sum_op = SummarizeHedValidationOp(parms) @@ -97,14 +97,42 @@ def test_get_summary_text_summary(self): df = dispatch.prep_data(df) sum_op.do_op(dispatch, df, 'subj2_run1', sidecar=self.bad_json_path) - sum_context1 = dispatch.summary_dicts[sum_op.summary_name] - text_sum1 = 
sum_context1.get_text_summary(individual_summaries="separate") + context = dispatch.summary_dicts[sum_op.summary_name] + sum1a = context.get_summary(individual_summaries="separate") + self.assertEqual(len(sum1a['Dataset']['Overall summary']['Files']), 1) + self.assertEqual(sum1a['Dataset']['Overall summary']['Files'][0], 'subj2_run1') + self.assertEqual(len(sum1a['Dataset']['Overall summary']), 5) + sum2a = context.get_summary(individual_summaries="separate") + self.assertIsInstance(sum2a["Individual files"]["subj2_run1"], dict) + sum_op.do_op(dispatch, df, 'subj2_run2', sidecar=self.json_path) + sum_op.do_op(dispatch, df, 'subj2_run3', sidecar=self.bad_json_path) + sum3a = context.get_summary(individual_summaries="none") + self.assertIsInstance(sum3a, dict) + self.assertFalse(sum3a["Individual files"]) + self.assertEqual(len(sum3a['Dataset']['Overall summary']['Files']), 3) + sum3b = context.get_summary(individual_summaries="consolidated") + self.assertEqual(len(sum3b["Individual files"]), 3) + self.assertEqual(sum3b['Dataset']['Overall summary']['Total files'], 3) + self.assertIsInstance(sum3b, dict) + + def test_get_text_summary(self): + dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) + parms = json.loads(self.json_parms) + sum_op = SummarizeHedValidationOp(parms) + df = pd.read_csv(self.data_path, delimiter='\t', header=0, keep_default_na=False, na_values=",null") + df = dispatch.prep_data(df) + sum_op.do_op(dispatch, df, 'subj2_run1', sidecar=self.bad_json_path) + context = dispatch.summary_dicts[sum_op.summary_name] + text_sum1 = context.get_text_summary(individual_summaries="separate") + self.assertEqual(len(text_sum1), 2) sum_op.do_op(dispatch, df, 'subj2_run2', sidecar=self.json_path) sum_op.do_op(dispatch, df, 'subj2_run3', sidecar=self.bad_json_path) - text_sum2 = sum_context1.get_text_summary(individual_summaries="none") - text_sum3 = sum_context1.get_text_summary(individual_summaries="consolidated") + text_sum2 = 
context.get_text_summary(individual_summaries="none") + text_sum3 = context.get_text_summary(individual_summaries="consolidated") self.assertIsInstance(text_sum3, dict) self.assertIsInstance(text_sum2, dict) + self.assertEqual(len(text_sum2), 1) + self.assertEqual(len(text_sum3), 1) def test_with_sample_data(self): dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) @@ -113,6 +141,8 @@ def test_with_sample_data(self): sum_op = SummarizeHedValidationOp(parms) sum_op.do_op(dispatch, df, 'sub-0013_task-stopsignal_acq-seq_events.tsv', sidecar=self.sample_sidecar_path) sum_context1 = dispatch.summary_dicts[sum_op.summary_name] + self.assertIsInstance(sum_context1, HedValidationSummary) + self.assertEqual(len(sum_context1.summary_dict), 1) if __name__ == '__main__': diff --git a/tests/tools/visualizations/test_tag_word_cloud.py b/tests/tools/visualizations/test_tag_word_cloud.py index fa09e1710..3b0878f79 100644 --- a/tests/tools/visualizations/test_tag_word_cloud.py +++ b/tests/tools/visualizations/test_tag_word_cloud.py @@ -1,6 +1,6 @@ import unittest from wordcloud import WordCloud -from hed.tools.visualizations import tag_word_cloud +from hed.tools.visualization import tag_word_cloud class TestWordCloudFunctions(unittest.TestCase): From def4fc80f71c871ce0ca61da925853c6fd24e9ce Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 29 Jun 2023 07:57:02 -0500 Subject: [PATCH 2/3] Minor refactoring to reduce complexity in remodeling validation sum op --- .../operations/summarize_hed_validation_op.py | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index 18d417f0e..211dc602b 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -109,7 +109,7 
@@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=True, indent=indent) else: sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], indent=indent*2) - if specifics['validation_completed']: + if specifics['sidecar_had_issues']: sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=False, indent=indent*2) else: sum_list = sum_list + [f"{indent*2}Event file validation was incomplete because of sidecar errors"] @@ -125,24 +125,11 @@ def update_summary(self, new_info): - The summary needs a "name" str, a schema, a "df", and a "Sidecar". """ - results = self.get_empty_results() - results["event_files"].append(new_info["name"]) - results["event_issues"][new_info["name"]] = [] sidecar = new_info.get('sidecar', None) - filtered_issues = [] - if sidecar: - if not isinstance(sidecar, Sidecar): - sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) - results["sidecar_files"].append(sidecar.name) - results["sidecar_issues"][sidecar.name] = [] - sidecar_issues = sidecar.validate(new_info['schema']) - filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) - if not self.check_for_warnings: - sidecar_issues = filtered_issues - results['sidecar_issues'][sidecar.name] = sidecar_issues - results['total_sidecar_issues'] = len(sidecar_issues) - if not filtered_issues: - results['validation_completed'] = True + if sidecar and not isinstance(sidecar, Sidecar): + sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) + results = self._get_sidecar_results(sidecar, new_info, self.check_for_warnings) + if not results['sidecar_had_issues']: input_data = TabularInput(new_info['df'], sidecar=sidecar) issues = input_data.validate(new_info['schema']) if not self.check_for_warnings: @@ -185,7 +172,7 @@ def merge_all_info(self): for ikey, errors in 
ind_results["sidecar_issues"].items(): results["sidecar_issues"][ikey] = errors for ikey, errors in ind_results["event_issues"].items(): - if not ind_results["validation_completed"]: + if not ind_results["sidecar_had_issues"]: results["event_issues"][ikey] = \ f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues" else: @@ -196,7 +183,7 @@ def merge_all_info(self): def get_empty_results(): return {"event_files": [], "total_event_issues": 0, "event_issues": {}, "is_merged": False, "sidecar_files": [], "total_sidecar_issues": 0, "sidecar_issues": {}, - "validation_completed": False} + "sidecar_had_issues": False} @staticmethod def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDENT): @@ -237,3 +224,21 @@ def format_error(error): def update_error_location(error_locations, location_name, location_key, error): if location_key in error: error_locations.append(f"{location_name}={error[location_key][0]}") + + @staticmethod + def _get_sidecar_results(sidecar, new_info, check_for_warnings): + results = HedValidationSummary.get_empty_results() + results["event_files"].append(new_info["name"]) + results["event_issues"][new_info["name"]] = [] + if sidecar: + results["sidecar_files"].append(sidecar.name) + results["sidecar_issues"][sidecar.name] = [] + sidecar_issues = sidecar.validate(new_info['schema']) + filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) + if filtered_issues: + results["sidecar_had_issues"] = True + if not check_for_warnings: + sidecar_issues = filtered_issues + results['sidecar_issues'][sidecar.name] = sidecar_issues + results['total_sidecar_issues'] = len(sidecar_issues) + return results From 2e5c30435f2b11c3f6a4b2f2d27294010b7760ae Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:00:28 -0500 Subject: [PATCH 3/3] Refactored merge_all in remodeling validation sum op to reduce complexity --- 
.../operations/summarize_hed_validation_op.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index 211dc602b..ce7595d55 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -161,24 +161,31 @@ def merge_all_info(self): dict - dictionary of issues organized into sidecar_issues and event_issues. """ - results = self.get_empty_results() results["is_merged"] = True for key, ind_results in self.summary_dict.items(): + HedValidationSummary._update_sidecar_results(results, ind_results) results["event_files"].append(key) - results["total_event_issues"] += ind_results["total_event_issues"] - results["total_sidecar_issues"] += ind_results["total_sidecar_issues"] - results["sidecar_files"] = results["sidecar_files"] + ind_results["sidecar_files"] - for ikey, errors in ind_results["sidecar_issues"].items(): - results["sidecar_issues"][ikey] = errors - for ikey, errors in ind_results["event_issues"].items(): - if not ind_results["sidecar_had_issues"]: - results["event_issues"][ikey] = \ - f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues" - else: - results["event_issues"][ikey] = f"{len(errors)}" + HedValidationSummary._update_events_results(results, ind_results) return results + @staticmethod + def _update_events_results(results, ind_results): + results["total_event_issues"] += ind_results["total_event_issues"] + for ikey, errors in ind_results["event_issues"].items(): + if ind_results["sidecar_had_issues"]: + results["event_issues"][ikey] = \ + f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues" + else: + results["event_issues"][ikey] = f"{len(errors)}" + + @staticmethod + def _update_sidecar_results(results, ind_results): + 
results["total_sidecar_issues"] += ind_results["total_sidecar_issues"] + results["sidecar_files"] = results["sidecar_files"] + ind_results["sidecar_files"] + for ikey, errors in ind_results["sidecar_issues"].items(): + results["sidecar_issues"][ikey] = errors + @staticmethod def get_empty_results(): return {"event_files": [], "total_event_issues": 0, "event_issues": {}, "is_merged": False, @@ -196,17 +203,17 @@ def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDE elif not len(item): error_list.append(f"{indent}{key} has no issues") else: - error_list.append(f"{indent}{key} issues:") - for this_item in item: - error_list.append(f"{indent*2}{HedValidationSummary.format_error(this_item)}") + HedValidationSummary._format_errors(error_list, key, item, indent) return error_list @staticmethod - def format_errors(error_list): - pass + def _format_errors(error_list, name, errors, indent): + error_list.append(f"{indent}{name} issues:") + for this_item in errors: + error_list.append(f"{indent * 2}{HedValidationSummary._format_error(this_item)}") @staticmethod - def format_error(error): + def _format_error(error): error_str = error['code'] error_locations = [] HedValidationSummary.update_error_location(error_locations, "row", "ec_row", error)