diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 120662503..0c7d2d647 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,7 +25,7 @@ jobs: - uses: actions/cache@v3 with: path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }} - name: Install dependencies run: | @@ -85,7 +85,7 @@ jobs: - uses: actions/cache@v3 with: path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }} - name: Install dependencies run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d02b3de6..316d45dfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +Release 0.3.1 July 3, 2023 +- Pinned the version of the pydantic and inflect libraries due to inflict. +- Reorganized JSON output of remodeling summaries so that all of consistent form. +- Fixed summarize_hed_tags_op so that tags were correctly categorized for output. +- Minor refactoring to reduce code complexity. +- BaseInput and Sidecar now raise HedFileError if input could not be read. + + Release 0.3.0 June 20, 2023 - Introduction of partnered schema. - Improved error handling for schema validation. 
diff --git a/LICENSE b/LICENSE index 2be171efb..2e7808f43 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -MIT License +The MIT License (MIT) Copyright (c) 2020+ HED Standard Working Group diff --git a/docs/requirements.txt b/docs/requirements.txt index 9783a3079..952120af1 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,5 @@ defusedxml>=0.7.1 -inflect>=6.0.2 -myst-parser>=0.18.1 +inflect>=6.0.5 numpy>=1.21.6 openpyxl>=3.1.0 pandas>=1.3.5 @@ -8,4 +7,4 @@ portalocker>=2.7.0 semantic_version>=2.10.0 Sphinx>=5.2.2 sphinx_rtd_theme>=1.0.0 -wordcloud>=1.9.2 +wordcloud==1.9.2 diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index 59ca86340..a1def1caf 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -2,6 +2,7 @@ class HedExceptions: + GENERIC_ERROR = 'GENERIC_ERROR' # A list of all exceptions that can be generated by the hedtools. FILE_NOT_FOUND = 'fileNotFound' BAD_PARAMETERS = 'badParameters' @@ -10,7 +11,7 @@ class HedExceptions: INVALID_EXTENSION = 'invalidExtension' INVALID_DATAFRAME = 'INVALID_DATAFRAME' - + INVALID_FILE_FORMAT = 'INVALID_FILE_FORMAT' # These are actual schema issues, not that the file cannot be found or parsed SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID' HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID' diff --git a/hed/models/base_input.py b/hed/models/base_input.py index f0c96eaaf..852d7bd6f 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -43,12 +43,10 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T - An invalid dataframe was passed with size 0 - An invalid extension was provided - A duplicate or empty column name appears - - :raises OSError: - Cannot open the indicated file - - :raises KeyError: - The specified worksheet name does not exist + - If the sidecar file or tabular file had invalid format and could not be read. 
+ """ if mapper is None: mapper = ColumnMapper() @@ -77,14 +75,20 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T elif not file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) elif input_type in self.TEXT_EXTENSION: - self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, - dtype=str, keep_default_na=True, na_values=None) + try: + self._dataframe = pandas.read_csv(file, delimiter='\t', header=pandas_header, + dtype=str, keep_default_na=True, na_values=None) + except Exception as e: + raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e # Convert nan values to a known value self._dataframe = self._dataframe.fillna("n/a") elif input_type in self.EXCEL_EXTENSION: - self._loaded_workbook = openpyxl.load_workbook(file) - loaded_worksheet = self.get_worksheet(self._worksheet_name) - self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names) + try: + self._loaded_workbook = openpyxl.load_workbook(file) + loaded_worksheet = self.get_worksheet(self._worksheet_name) + self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names) + except Exception as e: + raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e else: raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file) @@ -94,7 +98,7 @@ def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=T # todo: Can we get rid of this behavior now that we're using pandas? column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names) if column_issues: - raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.", + raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. 
See issues.", self.name, issues=column_issues) self.reset_mapper(mapper) @@ -285,7 +289,7 @@ def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_ta Notes: Any attribute of a HedTag that returns a string is a valid value of tag_form. - + :raises ValueError: - There is not a loaded dataframe diff --git a/hed/models/sidecar.py b/hed/models/sidecar.py index 735ad3f8b..d3038fff6 100644 --- a/hed/models/sidecar.py +++ b/hed/models/sidecar.py @@ -127,15 +127,13 @@ def load_sidecar_file(self, file): if not file: return {} elif isinstance(file, str): + if not self.name: + self.name = file try: with open(file, "r") as fp: - if not self.name: - self.name = file return self._load_json_file(fp) - except FileNotFoundError as e: - raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file) - except TypeError as e: - raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(e), file) + except OSError as e: + raise HedFileError(HedExceptions.FILE_NOT_FOUND, e.strerror, file) from e else: return self._load_json_file(file) @@ -189,12 +187,11 @@ def _load_json_file(self, fp): :raises HedFileError: - If the file cannot be parsed. - """ try: return json.load(fp) - except json.decoder.JSONDecodeError as e: - raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name) + except (json.decoder.JSONDecodeError, AttributeError) as e: + raise HedFileError(HedExceptions.CANNOT_PARSE_JSON, str(e), self.name) from e def extract_definitions(self, hed_schema=None, error_handler=None): """ Gather and validate definitions in metadata. diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index b88ed5581..1b9570105 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -13,8 +13,8 @@ def __init__(self, file=None, sidecar=None, name=None): """ Constructor for the TabularInput class. Parameters: - file (str or file like): A tsv file to open. 
- sidecar (str or Sidecar): A Sidecar filename or Sidecar + file (str or FileLike): A tsv file to open. + sidecar (str or Sidecar or FileLike): A Sidecar or source file/filename. name (str): The name to display for this file for error purposes. :raises HedFileError: diff --git a/hed/tools/analysis/hed_context_manager.py b/hed/tools/analysis/hed_context_manager.py index ebf053d2f..f3a5b8758 100644 --- a/hed/tools/analysis/hed_context_manager.py +++ b/hed/tools/analysis/hed_context_manager.py @@ -5,7 +5,8 @@ from hed.schema import HedSchema, HedSchemaGroup from hed.tools.analysis.analysis_util import hed_to_str -#TODO: [Refactor] clean up distinction between hed as strings versus objects -- maybe replace by event manager. +# TODO: [Refactor] clean up distinction between hed as strings versus objects -- maybe replace by event manager. +# TODO: Implement insets class OnsetGroup: def __init__(self, name, contents, start_index, end_index=None): diff --git a/hed/tools/analysis/hed_tag_counts.py b/hed/tools/analysis/hed_tag_counts.py index 845e448b5..300319820 100644 --- a/hed/tools/analysis/hed_tag_counts.py +++ b/hed/tools/analysis/hed_tag_counts.py @@ -21,11 +21,11 @@ def __init__(self, hed_tag, file_name): self.set_value(hed_tag) def set_value(self, hed_tag): - """ Update the tag term value counts for a HedTag. - + """ Update the tag term value counts for a HedTag. + Parameters: - hed_tag (HedTag or None): Item to use to update the value counts. - + hed_tag (HedTag or None): Item to use to update the value counts. + """ if not hed_tag: return @@ -43,13 +43,13 @@ def get_info(self, verbose=False): else: files = len(self.files) return {'tag': self.tag, 'events': self.events, 'files': files} - + def get_summary(self): """ Return a dictionary summary of the events and files for this tag. - + Returns: dict: dictionary summary of events and files that contain this tag. 
- + """ return {'tag': self.tag, 'events': self.events, 'files': [name for name in self.files]} @@ -63,12 +63,11 @@ def get_empty(self): class HedTagCounts: """ Counts of HED tags for a tabular file. - + Parameters: name (str): An identifier for these counts (usually the filename of the tabular file) total_events (int): The total number of events in the tabular file. - """ def __init__(self, name, total_events=0): @@ -76,15 +75,15 @@ def __init__(self, name, total_events=0): self.name = name self.files = {} self.total_events = total_events - + def update_event_counts(self, hed_string_obj, file_name, definitions=None): - """ Update the tag counts based on a hed string object. - + """ Update the tag counts based on a hed string object. + Parameters: hed_string_obj (HedString): The HED string whose tags should be counted. file_name (str): The name of the file corresponding to these counts. definitions (dict): The definitions associated with the HED string. - + """ if file_name not in self.files: self.files[file_name] = "" @@ -100,17 +99,20 @@ def update_event_counts(self, hed_string_obj, file_name, definitions=None): self.merge_tag_dicts(tag_dict) def organize_tags(self, tag_template): + """ Organize tags into categories as specified by the tag_template. + + Parameters: + tag_template (dict): A dictionary whose keys are titles and values are lists of HED tags (str). + + Returns: + dict - keys are tags (strings) and values are list of HedTagCount for items fitting template. + list - of HedTagCount objects corresponding to tags that don't fit the template. 
+ + """ template = self.create_template(tag_template) unmatched = [] - for key, tag_count in self.tag_dict.items(): - matched = False - for tag in reversed(tag_count.tag_terms): - if tag in template: - template[tag].append(tag_count) - matched = True - break - if not matched: - unmatched.append(tag_count) + for tag_count in self.tag_dict.values(): + self._update_template(tag_count, template, unmatched) return template, unmatched def merge_tag_dicts(self, other_dict): @@ -118,20 +120,21 @@ def merge_tag_dicts(self, other_dict): if tag not in self.tag_dict: self.tag_dict[tag] = count.get_empty() self.tag_dict[tag].events = self.tag_dict[tag].events + count.events - value_dict = self.tag_dict[tag].value_dict - for value, val_count in count.value_dict.items(): - if value in value_dict: - value_dict[value] = value_dict[value] + val_count - else: - value_dict[value] = val_count for file in count.files: self.tag_dict[tag].files[file] = '' + if not self.tag_dict[tag].value_dict: + continue + for value, val_count in count.value_dict.items(): + if value in self.tag_dict[tag].value_dict: + self.tag_dict[tag].value_dict[value] = self.tag_dict[tag].value_dict + val_count + else: + self.tag_dict[tag].value_dict[value] = val_count def get_summary(self): details = {} for tag, count in self.tag_dict.items(): details[tag] = count.get_summary() - return {'name': str(self.name), 'type_tag': self.type_tag, 'files': list(self.files.keys()), + return {'name': str(self.name), 'files': list(self.files.keys()), 'total_events': self.total_events, 'details': details} @staticmethod @@ -141,3 +144,19 @@ def create_template(tags): for element in key_list: template_dict[element.lower()] = [] return template_dict + + @staticmethod + def _update_template(tag_count, template, unmatched): + """ Update the template or unmatched with info in the tag_count. + + Parameters: + tag_count (HedTagCount): Information for a particular tag. 
+ template (dict): The + + """ + tag_list = reversed(list(tag_count.tag_terms)) + for tkey in tag_list: + if tkey in template.keys(): + template[tkey].append(tag_count) + return + unmatched.append(tag_count) diff --git a/hed/tools/analysis/hed_type_factors.py b/hed/tools/analysis/hed_type_factors.py index bf6afff2f..b4cc92af4 100644 --- a/hed/tools/analysis/hed_type_factors.py +++ b/hed/tools/analysis/hed_type_factors.py @@ -39,18 +39,18 @@ def get_factors(self, factor_encoding="one-hot"): DataFrame: DataFrame containing the factor vectors as the columns. """ - df = pd.DataFrame(0, index=range(self.number_elements), columns=[self.type_value]) - df.loc[list(self.direct_indices.keys()), [self.type_value]] = 1 + if not self.levels: + df = pd.DataFrame(0, index=range(self.number_elements), columns=[self.type_value]) + df.loc[list(self.direct_indices.keys()), [self.type_value]] = 1 return df levels = list(self.levels.keys()) levels_list = [f"{self.type_value}.{level}" for level in levels] - df_levels = pd.DataFrame(0, index=range(self.number_elements), columns=levels_list) + factors = pd.DataFrame(0, index=range(self.number_elements), columns=levels_list) for index, level in enumerate(levels): index_keys = list(self.levels[level].keys()) - df_levels.loc[index_keys, [levels_list[index]]] = 1 - factors = pd.concat([df, df_levels], axis=1) + factors.loc[index_keys, [levels_list[index]]] = 1 if factor_encoding == "one-hot": return factors sum_factors = factors.sum(axis=1) diff --git a/hed/tools/analysis/hed_type_manager.py b/hed/tools/analysis/hed_type_manager.py index 43ff826e8..3abe427ff 100644 --- a/hed/tools/analysis/hed_type_manager.py +++ b/hed/tools/analysis/hed_type_manager.py @@ -44,19 +44,21 @@ def get_factor_vectors(self, type_tag, type_values=None, factor_encoding="one-ho factor_encoding (str): Specifies type of factor encoding (one-hot or categorical). Returns: - DataFrame: DataFrame containing the factor vectors as the columns. 
+ DataFrame or None: DataFrame containing the factor vectors as the columns. """ - this_var = self.get_type_variable(type_tag) + this_var = self.get_type_variable(type_tag.lower()) if this_var is None: return None variables = this_var.get_type_value_names() - if variables is None: - variables = type_values - df_list = [0]*len(variables) - for index, variable in enumerate(variables): + if not type_values: + type_values = variables + df_list = [0]*len(type_values) + for index, variable in enumerate(type_values): var_sum = this_var._type_value_map[variable] df_list[index] = var_sum.get_factors(factor_encoding=factor_encoding) + if not df_list: + return None return pd.concat(df_list, axis=1) def get_type_variable(self, type_tag): diff --git a/hed/tools/remodeling/operations/base_summary.py b/hed/tools/remodeling/operations/base_summary.py index 9153ed670..2b732ae2d 100644 --- a/hed/tools/remodeling/operations/base_summary.py +++ b/hed/tools/remodeling/operations/base_summary.py @@ -38,9 +38,9 @@ def get_summary_details(self, include_individual=True): Users are expected to provide merge_all_info and get_details_dict to support this. """ - merged_summary = self.merge_all_info() - if merged_summary: - details = self.get_details_dict(merged_summary) + merged_counts = self.merge_all_info() + if merged_counts: + details = self.get_details_dict(merged_counts) else: details = "Overall summary unavailable" @@ -220,6 +220,11 @@ def get_details_dict(self, summary_info): Notes: Abstract method be implemented by each individual summary. 
+ Notes: + The expected return value is a dictionary of the form: + + {"Name": "", "Total events": 0, "Total files": 0, "Files": [], "Specifics": {}}" + """ raise NotImplementedError diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index 1d5674d7a..21057b798 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -78,7 +78,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): var_manager = HedTypeManager(hed_strings, dispatcher.hed_schema, definitions) var_manager.add_type_variable(self.type_tag.lower()) - df_factors = var_manager.get_factor_vectors(self.type_tag, [], factor_encoding="one-hot") + df_factors = var_manager.get_factor_vectors(self.type_tag, self.type_values, factor_encoding="one-hot") if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index 2201827f9..fd70e3f8d 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -108,7 +108,7 @@ def get_details_dict(self, column_summary): return {"Name": summary['Summary name'], "Total events": "n/a", "Total files": summary['Number files'], "Files": [name for name in column_summary.file_dict.keys()], - "Columns": summary['Columns']} + "Specifics": {"Columns": summary['Columns']}} def merge_all_info(self): """ Create a ColumnNameSummary containing the overall dataset summary. 
@@ -140,8 +140,11 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): """ if name == "Dataset": return self._get_dataset_string(result, indent) - columns = result["Columns"][0] - return f"{indent}{str(columns['Column names'])}" + columns = result.get("Specifics", {}).get("Columns", []) + if columns: + return f"{indent}{str(columns[0])}" + else: + return "" @staticmethod def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): @@ -155,8 +158,10 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - sum_list = [f"Dataset: Number of files={result.get('Number files', 0)}"] - for element in result.get("Unique headers", []): + sum_list = [f"Dataset: Number of files={result.get('Total files', 0)}"] + specifics = result.get("Specifics", {}) + columns = specifics.get("Columns", {}) + for element in columns: sum_list.append(f"{indent}Columns: {str(element['Column names'])}") for file in element.get("Files", []): sum_list.append(f"{indent}{indent}{file}") diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index 0c80c6382..825594aea 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -128,7 +128,14 @@ def get_details_dict(self, summary): for key, dict_entry in this_summary['Categorical columns'].items(): num_disp, sorted_tuples = ColumnValueSummary.sort_dict(dict_entry, reverse=True) this_summary['Categorical columns'][key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)]) - return this_summary + return {"Name": this_summary['Name'], "Total events": this_summary["Total events"], + "Total files": this_summary['Total files'], + "Files": [name for name in this_summary['Files'].keys()], + "Specifics": {"Value columns": this_summary['Value columns'].keys(), + "Skip 
columns": this_summary['Skip columns'], + "Value columns": this_summary['Value columns'], + "Categorical columns": this_summary['Categorical columns'], + "Categorical counts": this_summary['Categorical counts']}} def merge_all_info(self): """ Create a TabularSummary containing the overall dataset summary. @@ -198,10 +205,11 @@ def _get_dataset_string(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " f"Total files={result.get('Total files', 0)}"] - cat_string = self._get_categorical_string(result, offset="", indent=indent) + specifics = result["Specifics"] + cat_string = self._get_categorical_string(specifics, offset="", indent=indent) if cat_string: sum_list.append(cat_string) - val_cols = result.get("Value columns", {}) + val_cols = specifics.get("Value columns", {}) if val_cols: sum_list.append(ColumnValueSummary._get_value_string(val_cols, offset="", indent=indent)) return "\n".join(sum_list) @@ -219,6 +227,7 @@ def _get_individual_string(self, result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Total events={result.get('Total events', 0)}"] + specifics = result.get("Specifics", {}) cat_cols = result.get("Categorical columns", {}) if cat_cols: sum_list.append(self._get_categorical_string(cat_cols, offset=indent, indent=indent)) diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py index 3169d63d0..6be941352 100644 --- a/hed/tools/remodeling/operations/summarize_definitions_op.py +++ b/hed/tools/remodeling/operations/summarize_definitions_op.py @@ -128,6 +128,8 @@ def get_details_dict(self, def_gatherer): known_defs_summary.update(ambiguous_defs_summary) known_defs_summary.update(errors_summary) + return {"Name": "", "Total events": 0, "Total files": 0, "Files": [], "Specifics": known_defs_summary} + return known_defs_summary def merge_all_info(self): diff --git 
a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 5a504fed1..f88650ccb 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -35,6 +35,7 @@ class SummarizeHedTagsOp(BaseOp): }, "optional_parameters": { "append_timecode": bool, + "expand_definitions": bool, "expand_context": bool } } @@ -118,24 +119,24 @@ def update_summary(self, new_info): counts.update_event_counts(hed, new_info['name']) self.summary_dict[new_info["name"]] = counts - def get_details_dict(self, merge_counts): + def get_details_dict(self, tag_counts): """ Return the summary-specific information in a dictionary. Parameters: - merge_counts (HedTagCounts): Contains the counts of tags in the dataset. + tag_counts (HedTagCounts): Contains the counts of tags in the dataset. Returns: dict: dictionary with the summary results. """ - template, unmatched = merge_counts.organize_tags(self.tags) + template, unmatched = tag_counts.organize_tags(self.tags) details = {} for key, key_list in self.tags.items(): details[key] = self._get_details(key_list, template, verbose=True) leftovers = [value.get_info(verbose=True) for value in unmatched] - return {"Name": merge_counts.name, "Total events": merge_counts.total_events, - "Total files": len(merge_counts.files.keys()), - "Files": [name for name in merge_counts.files.keys()], + return {"Name": tag_counts.name, "Total events": tag_counts.total_events, + "Total files": len(tag_counts.files.keys()), + "Files": [name for name in tag_counts.files.keys()], "Specifics": {"Main tags": details, "Other tags": leftovers}} def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 9a27d22d2..04c1ad89b 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ 
b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -95,7 +95,7 @@ def update_summary(self, new_info): Parameters: new_info (dict): A dictionary with the parameters needed to update a summary. - Notes: + Notes: - The summary needs a "name" str, a "schema", a "df, and a "Sidecar". """ @@ -104,7 +104,7 @@ def update_summary(self, new_info): if sidecar and not isinstance(sidecar, Sidecar): sidecar = Sidecar(sidecar) input_data = TabularInput(new_info['df'], sidecar=sidecar, name=new_info['name']) - hed_strings, definitions = get_assembled(input_data, sidecar, new_info['schema'], + hed_strings, definitions = get_assembled(input_data, sidecar, new_info['schema'], extra_def_dicts=None, join_columns=True, expand_defs=False) context_manager = HedContextManager(hed_strings, new_info['schema']) type_values = HedTypeValues(context_manager, definitions, new_info['name'], type_tag=self.type_tag) @@ -124,7 +124,12 @@ def get_details_dict(self, counts): dict: dictionary with the summary results. """ - return counts.get_summary() + summary = counts.get_summary() + files = summary.get('files', []) + return {"Name": summary.get("name", ""), "Total events": summary.get("total_events", 0), + "Total files": len(files), "Files": files, + "Specifics": {"Type tag": summary.get('type_tag', 'condition-variable'), + "Type info": summary.get('details', {})}} def merge_all_info(self): """ Create a HedTypeCounts containing the overall dataset HED type summary. @@ -170,11 +175,12 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. 
""" - details = result.get('details', {}) - sum_list = [f"Dataset: Type={result['type_tag']} Type values={len(details)} " - f"Total events={result.get('total_events', 0)} Total files={len(result.get('files', []))}"] + specifics = result.get('Specifics', {}) + type_info = specifics.get('Type info', {}) + sum_list = [f"Dataset: Type={specifics.get('Type tag', 'condition-variable')} Type values={len(type_info)} " + f"Total events={result.get('Total events', 0)} Total files={len(result.get('Files', []))}"] - for key, item in details.items(): + for key, item in type_info.items(): str1 = f"{item['events']} event(s) out of {item['total_events']} total events in " + \ f"{len(item['files'])} file(s)" if item['level_counts']: @@ -200,11 +206,12 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - details = result.get('details', {}) - sum_list = [f"Type={result['type_tag']} Type values={len(details)} " - f"Total events={result.get('total_events', 0)}"] + specifics = result.get('Specifics', {}) + type_info = specifics.get('Type info', {}) + sum_list = [f"Type={specifics.get('Type tag', 'condition-variable')} Type values={len(type_info)} " + f"Total events={result.get('Total events', 0)}"] - for key, item in details.items(): + for key, item in type_info.items(): sum_list.append(f"{indent*2}{key}: {item['levels']} levels in {item['events']} events") str1 = "" if item['direct_references']: diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index d643e533d..ce7595d55 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -101,17 +101,16 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): This gets the error list from "sidecar_issues" and "event_issues". 
""" - - if result["is_merged"]: - sum_list = [f"{name}: [{result['total_sidecar_files']} sidecar files, " - f"{result['total_event_files']} event files]"] - sum_list = sum_list + self.get_error_list(result['sidecar_issues'], count_only=True, indent=indent) - sum_list = sum_list + self.get_error_list(result['event_issues'], count_only=True, indent=indent) + specifics = result.get("Specifics", {}) + sum_list = [f"{name}: [{len(specifics['sidecar_files'])} sidecar files, " + f"{len(specifics['event_files'])} event files]"] + if specifics.get('is_merged'): + sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], count_only=True, indent=indent) + sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=True, indent=indent) else: - sum_list = [f"{indent}{name}: {result['total_sidecar_files']} sidecar files"] - sum_list = sum_list + self.get_error_list(result['sidecar_issues'], indent=indent*2) - if result['validation_completed']: - sum_list = sum_list + self.get_error_list(result['event_issues'], count_only=False, indent=indent*2) + sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], indent=indent*2) + if specifics['sidecar_had_issues']: + sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=False, indent=indent*2) else: sum_list = sum_list + [f"{indent*2}Event file validation was incomplete because of sidecar errors"] return "\n".join(sum_list) @@ -126,31 +125,18 @@ def update_summary(self, new_info): - The summary needs a "name" str, a schema, a "df", and a "Sidecar". 
""" - results = self.get_empty_results() - results["total_event_files"] = 1 - results["event_issues"][new_info["name"]] = [] - self.summary_dict[new_info["name"]] = results sidecar = new_info.get('sidecar', None) - filtered_issues = [] - if sidecar: - if not isinstance(sidecar, Sidecar): - sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) - results["sidecar_issues"][sidecar.name] = [] - sidecar_issues = sidecar.validate(new_info['schema']) - filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) - if not self.check_for_warnings: - sidecar_issues = filtered_issues - results['sidecar_issues'][sidecar.name] = sidecar_issues - results['total_sidecar_issues'] = len(sidecar_issues) - results['total_sidecar_files'] = 1 - if not filtered_issues: - results['validation_completed'] = True + if sidecar and not isinstance(sidecar, Sidecar): + sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) + results = self._get_sidecar_results(sidecar, new_info, self.check_for_warnings) + if not results['sidecar_had_issues']: input_data = TabularInput(new_info['df'], sidecar=sidecar) issues = input_data.validate(new_info['schema']) if not self.check_for_warnings: issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR) results['event_issues'][new_info["name"]] = issues results['total_event_issues'] = len(issues) + self.summary_dict[new_info["name"]] = results def get_details_dict(self, summary_info): """Return the summary details from the summary_info. @@ -162,7 +148,11 @@ def get_details_dict(self, summary_info): dict: Same summary_info as was passed in. """ - return summary_info + + return {"Name": "", "Total events": "n/a", + "Total files": len(summary_info.get("event_files", [])), + "Files": summary_info.get("event_files", []), + "Specifics": summary_info} def merge_all_info(self): """ Create a dictionary containing all of the errors in the dataset. 
@@ -171,29 +161,36 @@ def merge_all_info(self): dict - dictionary of issues organized into sidecar_issues and event_issues. """ - results = self.get_empty_results() results["is_merged"] = True for key, ind_results in self.summary_dict.items(): - results["total_event_files"] += ind_results["total_event_files"] - results["total_event_issues"] += ind_results["total_event_issues"] - - for ikey, errors in ind_results["sidecar_issues"].items(): - results["sidecar_issues"][ikey] = errors - for ikey, errors in ind_results["event_issues"].items(): - if not ind_results["validation_completed"]: - results["event_issues"][ikey] = \ - f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues" - else: - results["event_issues"][ikey] = f"{len(errors)}" - results["total_sidecar_files"] += ind_results["total_sidecar_files"] + HedValidationSummary._update_sidecar_results(results, ind_results) + results["event_files"].append(key) + HedValidationSummary._update_events_results(results, ind_results) return results + @staticmethod + def _update_events_results(results, ind_results): + results["total_event_issues"] += ind_results["total_event_issues"] + for ikey, errors in ind_results["event_issues"].items(): + if ind_results["sidecar_had_issues"]: + results["event_issues"][ikey] = \ + f"Validation incomplete due to {ind_results['total_sidecar_issues']} sidecar issues" + else: + results["event_issues"][ikey] = f"{len(errors)}" + + @staticmethod + def _update_sidecar_results(results, ind_results): + results["total_sidecar_issues"] += ind_results["total_sidecar_issues"] + results["sidecar_files"] = results["sidecar_files"] + ind_results["sidecar_files"] + for ikey, errors in ind_results["sidecar_issues"].items(): + results["sidecar_issues"][ikey] = errors + @staticmethod def get_empty_results(): - return {"total_event_files": 0, "total_event_issues": 0, "event_issues": {}, "is_merged": False, - "total_sidecar_files": 0, "total_sidecar_issues": 0, "sidecar_issues": 
{}, - "validation_completed": False} + return {"event_files": [], "total_event_issues": 0, "event_issues": {}, "is_merged": False, + "sidecar_files": [], "total_sidecar_issues": 0, "sidecar_issues": {}, + "sidecar_had_issues": False} @staticmethod def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDENT): @@ -206,17 +203,17 @@ def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDE elif not len(item): error_list.append(f"{indent}{key} has no issues") else: - error_list.append(f"{indent}{key} issues:") - for this_item in item: - error_list.append(f"{indent*2}{HedValidationSummary.format_error(this_item)}") + HedValidationSummary._format_errors(error_list, key, item, indent) return error_list @staticmethod - def format_errors(error_list): - pass + def _format_errors(error_list, name, errors, indent): + error_list.append(f"{indent}{name} issues:") + for this_item in errors: + error_list.append(f"{indent * 2}{HedValidationSummary._format_error(this_item)}") @staticmethod - def format_error(error): + def _format_error(error): error_str = error['code'] error_locations = [] HedValidationSummary.update_error_location(error_locations, "row", "ec_row", error) @@ -234,3 +231,21 @@ def format_error(error): def update_error_location(error_locations, location_name, location_key, error): if location_key in error: error_locations.append(f"{location_name}={error[location_key][0]}") + + @staticmethod + def _get_sidecar_results(sidecar, new_info, check_for_warnings): + results = HedValidationSummary.get_empty_results() + results["event_files"].append(new_info["name"]) + results["event_issues"][new_info["name"]] = [] + if sidecar: + results["sidecar_files"].append(sidecar.name) + results["sidecar_issues"][sidecar.name] = [] + sidecar_issues = sidecar.validate(new_info['schema']) + filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) + if filtered_issues: + results["sidecar_had_issues"] = True + 
if not check_for_warnings: + sidecar_issues = filtered_issues + results['sidecar_issues'][sidecar.name] = sidecar_issues + results['total_sidecar_issues'] = len(sidecar_issues) + return results diff --git a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py index e0657ffef..28a0b9389 100644 --- a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py +++ b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py @@ -115,9 +115,13 @@ def get_details_dict(self, summary_info): """ - return {"files": summary_info.files, "total_files": summary_info.total_files, - "total_events": summary_info.total_events, "skip_cols": summary_info.skip_cols, - "sidecar": summary_info.extract_sidecar_template()} + return {"Name": summary_info.name, "Total events": summary_info.total_events, + "Total files": summary_info.total_files, + "Files": summary_info.files.keys(), + "Specifics": {"Categorical info": summary_info.categorical_info, + "Value info": summary_info.value_info, + "Skip columns": summary_info.skip_cols, + "Sidecar": summary_info.extract_sidecar_template()}} def merge_all_info(self): """ Merge summary information from all of the files @@ -165,11 +169,12 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. 
""" - sum_list = [f"Dataset: Total events={result.get('total_events', 0)} " - f"Total files={result.get('total_files', 0)}", - f"Skip columns: {str(result.get('skip_cols', []))}", - f"Value columns: {str(result.get('value_cols', []))}", - f"Sidecar:\n{json.dumps(result['sidecar'], indent=indent)}"] + specifics = result.get("Specifics", {}) + sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " + f"Total files={result.get('Total files', 0)}", + f"Skip columns: {str(specifics.get('Skip columns', []))}", + f"Value columns: {str(specifics.get('Value info', {}).keys())}", + f"Sidecar:\n{json.dumps(specifics.get('Sidecar', {}), indent=indent)}"] return "\n".join(sum_list) @staticmethod @@ -184,8 +189,9 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): str: Formatted string suitable for saving in a file or printing. """ - sum_list = [f"Total events={result.get('total_events', 0)}", - f"Skip columns: {str(result.get('skip_cols', []))}", - f"Value columns: {str(result.get('value_cols', []))}", - f"Sidecar:\n{json.dumps(result['sidecar'], indent=indent)}"] + specifics = result.get("Specifics", {}) + sum_list = [f"Total events={result.get('Total events', 0)}", + f"Skip columns: {str(specifics.get('Slip columns', []))}", + f"Value columns: {str(specifics.get('Value info', {}).keys())}", + f"Sidecar:\n{json.dumps(specifics['Sidecar'], indent=indent)}"] return "\n".join(sum_list) diff --git a/hed/tools/visualization/__init__.py b/hed/tools/visualization/__init__.py new file mode 100644 index 000000000..a40c0333b --- /dev/null +++ b/hed/tools/visualization/__init__.py @@ -0,0 +1 @@ +from .tag_word_cloud import create_wordcloud, summary_to_dict diff --git a/hed/tools/visualization/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py new file mode 100644 index 000000000..c8d4159d7 --- /dev/null +++ b/hed/tools/visualization/tag_word_cloud.py @@ -0,0 +1,112 @@ +import numpy as np +from PIL import Image +from 
hed.tools.visualization.word_cloud_util import default_color_func, WordCloud + + +def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=200, **kwargs): + """Takes a word dict and returns a generated word cloud object + + Parameters: + word_dict(dict): words and their frequencies + mask_path(str or None): The path of the mask file + background_color(str or None): If None, transparent background. + width(int): width in pixels + height(int): height in pixels + kwargs(kwargs): Any other parameters WordCloud accepts, overrides default values where relevant. + Returns: + word_cloud(WordCloud): The generated cloud. + Use .to_file to save it out as an image. + + :raises ValueError: + An empty dictionary was passed + """ + mask_image = None + if mask_path: + mask_image = load_and_resize_mask(mask_path, width, height) + width = mask_image.shape[1] + height = mask_image.shape[0] + kwargs.setdefault('contour_width', 3) + kwargs.setdefault('contour_color', 'black') + kwargs.setdefault('prefer_horizontal', 0.75) + kwargs.setdefault('color_func', default_color_func) + kwargs.setdefault('relative_scaling', 1) + kwargs.setdefault('max_font_size', height / 15) + kwargs.setdefault('min_font_size', 5) + + wc = WordCloud(background_color=background_color, mask=mask_image, + width=width, height=height, mode="RGBA", **kwargs) + + wc.generate_from_frequencies(word_dict) + + return wc + + +def summary_to_dict(summary, transform=np.log10, adjustment=5): + """Converts a HedTagSummary json dict into the word cloud input format + + Parameters: + summary(dict): The summary from a summarize hed tags op + transform(func): The function to transform the number of found tags + Default log10 + adjustment(int): Value added after transform. 
+ Returns: + word_dict(dict): a dict of the words and their occurrence count + + :raises KeyError: + A malformed dictionary was passed + + """ + if transform is None: + transform = lambda x: x + overall_summary = summary.get("Overall summary", {}) + specifics = overall_summary.get("Specifics", {}) + tag_dict = specifics.get("Main tags", {}) + word_dict = {} + for tag_sub_list in tag_dict.values(): + for tag_sub_dict in tag_sub_list: + word_dict[tag_sub_dict['tag']] = transform(tag_sub_dict['events']) + adjustment + + return word_dict + + +def load_and_resize_mask(mask_path, width=None, height=None): + """ Load a mask image and resize it according to given dimensions. + + The image is resized maintaining aspect ratio if only width or height is provided. + + Returns None if no mask_path. + + Parameters: + mask_path (str): The path to the mask image file. + width (int, optional): The desired width of the resized image. If only width is provided, + the image is scaled to maintain its original aspect ratio. Defaults to None. + height (int, optional): The desired height of the resized image. If only height is provided, + the image is scaled to maintain its original aspect ratio. Defaults to None. + + Returns: + numpy.ndarray: The loaded and processed mask image as a numpy array with binary values (0 or 255). 
+ """ + if mask_path: + mask_image = Image.open(mask_path) + + if width or height: + original_size = np.array((mask_image.width, mask_image.height)) + output_size = np.array((width, height)) + # Handle one missing param + if not height: + scale = original_size[0] / width + output_size = original_size / scale + elif not width: + scale = original_size[1] / height + output_size = original_size / scale + + mask_image = mask_image.resize(output_size.astype(int), Image.LANCZOS) + + # Convert to greyscale then to binary black and white (0 or 255) + mask_image = mask_image.convert('L') + mask_image_array = np.array(mask_image) + mask_image_array = np.where(mask_image_array > 127, 255, 0) + else: + mask_image_array = np.array(mask_image) + + return mask_image_array.astype(np.uint8) \ No newline at end of file diff --git a/hed/tools/visualization/word_cloud_util.py b/hed/tools/visualization/word_cloud_util.py new file mode 100644 index 000000000..ba25e0133 --- /dev/null +++ b/hed/tools/visualization/word_cloud_util.py @@ -0,0 +1,86 @@ +import random +from random import Random + +import numpy as np +from PIL import Image, ImageFilter +from matplotlib import cm +from wordcloud import WordCloud + + +def _draw_contour(wc, img): + """Slightly tweaked copy of internal WorldCloud function to allow transparency""" + if wc.mask is None or wc.contour_width == 0 or wc.contour_color is None: + return img + + mask = wc._get_bolean_mask(wc.mask) * 255 + contour = Image.fromarray(mask.astype(np.uint8)) + contour = contour.resize(img.size) + contour = contour.filter(ImageFilter.FIND_EDGES) + contour = np.array(contour) + + # make sure borders are not drawn before changing width + contour[[0, -1], :] = 0 + contour[:, [0, -1]] = 0 + + # use gaussian to change width, divide by 10 to give more resolution + radius = wc.contour_width / 10 + contour = Image.fromarray(contour) + contour = contour.filter(ImageFilter.GaussianBlur(radius=radius)) + contour = np.array(contour) > 0 + if img.mode == 
'RGBA': + contour = np.dstack((contour, contour, contour, contour)) + else: + contour = np.dstack((contour, contour, contour)) + + # color the contour + ret = np.array(img) * np.invert(contour) + color = np.array(Image.new(img.mode, img.size, wc.contour_color)) + ret += color * contour + + return Image.fromarray(ret) + +# Replace WordCloud function with one that can handle transparency +WordCloud._draw_contour = _draw_contour + + +def random_color_darker(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None): + """Random color generation func""" + if random_state is None: + random_state = Random() + return f"hsl({random_state.randint(0, 255)}, {random_state.randint(50, 100)}%, {random_state.randint(0, 50)}%)" + + +class ColormapColorFunc: + def __init__(self, colormap='nipy_spectral', color_range=(0.0, 0.5), color_step_range=(0.15, 0.25)): + """Initialize a word cloud color generator. + + Parameters: + colormap (str, optional): The name of the matplotlib colormap to use for generating colors. + Defaults to 'nipy_spectral'. + color_range (tuple of float, optional): A tuple containing the minimum and maximum values to use + from the colormap. Defaults to (0.0, 0.5). + color_step_range (tuple of float, optional): A tuple containing the minimum and maximum values to step + through the colormap. Defaults to (0.15, 0.25). + This is the speed at which it goes through the range chosen. + .25 means it will go through 1/4 of the range each pick. 
+ """ + self.colormap = cm.get_cmap(colormap) + self.color_range = color_range + self.color_step_range = color_step_range + self.current_fraction = random.uniform(0, 1) # Start at a random point + + def color_func(self, word, font_size, position, orientation, random_state=None, **kwargs): + # Update the current color fraction and wrap around if necessary + color_step = random.uniform(*self.color_step_range) + self.current_fraction = (self.current_fraction + color_step) % 1.0 + + # Scale the fraction to the desired range + scaled_fraction = self.color_range[0] + (self.current_fraction * (self.color_range[1] - self.color_range[0])) + + # Get the color from the colormap + color = self.colormap(scaled_fraction) + + return tuple(int(c * 255) for c in color[:3]) # Convert to RGB format + + +default_color_func = ColormapColorFunc().color_func diff --git a/hed/tools/visualizations/tag_word_cloud.py b/hed/tools/visualizations/tag_word_cloud.py deleted file mode 100644 index 2f2c25236..000000000 --- a/hed/tools/visualizations/tag_word_cloud.py +++ /dev/null @@ -1,46 +0,0 @@ -from wordcloud import WordCloud - - -def create_wordcloud(word_dict, width=400, height=200): - """Takes a word dict and returns a generated word cloud object - - Parameters: - word_dict(dict): words and their frequencies - width(int): width in pixels - height(int): height in pixels - Returns: - word_cloud(WordCloud): The generated cloud. - Use .to_file to save it out as an image. 
- - :raises ValueError: - An empty dictionary was passed - """ - wc = WordCloud(background_color='white', width=width, height=height) - - wc.generate_from_frequencies(word_dict) - - return wc - - -def summary_to_dict(summary): - """Converts a HedTagSummary json dict into the word cloud input format - - Parameters: - summary(dict): The summary from a summarize hed tags op - - Returns: - word_dict(dict): a dict of the words and their occurrence count - - :raises KeyError: - A malformed dictionary was passed - - """ - overall_summary = summary.get("Overall summary", {}) - specifics = overall_summary.get("Specifics", {}) - tag_dict = specifics.get("Main tags", {}) - word_dict = {} - for tag_sub_list in tag_dict.values(): - for tag_sub_dict in tag_sub_list: - word_dict[tag_sub_dict['tag']] = tag_sub_dict['events'] - - return word_dict diff --git a/hedtools/conda_build_info.txt b/hedtools/conda_build_info.txt deleted file mode 100644 index 0ea449c6f..000000000 --- a/hedtools/conda_build_info.txt +++ /dev/null @@ -1,24 +0,0 @@ -To create the base meta.yaml I used grayskull: -https://conda-forge.org/docs/maintainer/adding_pkgs.html#build - -Commands for building(uploading to conda-forge - note this is just for testing): -# Make sure conda forge is allowed -conda config --add channels conda-forge -# Make sure we try to always get from the same repo if there are conflicts -# see https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-channels.html for more details on channels -# this avoids issues with conflicting c libraries and similar. -conda config --set channel_priority strict -# Actually build the recipe. 
-conda build hedtools - -To locally install it after building: -conda install --use-local hedtools - -Then you follow the instructions here to make a PR(this is the actual upload to conda forge): -https://conda-forge.org/docs/maintainer/adding_pkgs.html#staging-test-locally - -To install from conda-forge(in theory, doesn't work yet): -# Note the -c conda-forge shouldn't be required if you called the build config step above.(but users won't have done this) -conda install hedtools -c conda-forge - - diff --git a/hedtools/meta.yaml b/hedtools/meta.yaml deleted file mode 100644 index 80a8fcdef..000000000 --- a/hedtools/meta.yaml +++ /dev/null @@ -1,63 +0,0 @@ -{% set name = "hedtools" %} -{% set version = "0.2.0" %} - -package: - name: {{ name|lower }} - version: {{ version }} - -source: - url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/hedtools-{{ version }}.tar.gz - sha256: 2452f8e4e79e50750147a437410ccfd9ea04ad4e6390edc14dbcd663a7a9fa08 - -build: - entry_points: - - run_remodel=hed.tools.remodeling.cli.run_remodel:main - - run_remodel_backup=hed.tools.remodeling.cli.run_remodel_backup:main - - run_remodel_restore=hed.tools.remodeling.cli.run_remodel_restore:main - noarch: python - script: {{ PYTHON }} -m pip install . -vv - number: 0 - -requirements: - host: - - python >=3.7 - - setuptools >=42 - - versioneer-518 - - pip - run: - - python >=3.7 - - defusedxml - - et-xmlfile - - inflect - - jdcal - - numpy - - openpyxl - - pandas - - portalocker - - python-dateutil - - pytz - - semantic_version - - six - - werkzeug - -test: - imports: - hed - commands: - - run_remodel --help - - run_remodel_backup --help - - run_remodel_restore --help - requires: - - pip - -about: - home: https://github.com/hed-standard/hed-python/ - summary: HED validation, summary, and analysis tools. 
- license: MIT - license_file: LICENSE - -extra: - recipe-maintainers: - - hed-maintainers - - VisLab - - IanCa \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7dd623faa..07c3304d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ defusedxml>=0.7.1 -inflect>=6.0.2 +inflect>=6.0.5 numpy>=1.21.6 openpyxl>=3.1.0 pandas>=1.3.5 portalocker>=2.7.0 semantic_version>=2.10.0 -wordcloud>=1.9.2 +wordcloud==1.9.2 \ No newline at end of file diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index bda6e1259..d93983d59 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -7,6 +7,8 @@ from hed.models.column_mapper import ColumnMapper from hed.models import DefinitionDict from hed import schema +from hed import HedFileError + import pandas as pd import numpy as np @@ -60,6 +62,19 @@ def test_gathered_defs(self): } self.assertEqual(defs, expected_defs) + def test_file_not_found(self): + with self.assertRaises(HedFileError): + BaseInput('nonexistent_file.tsv') + + def test_invalid_input_type_int(self): + with self.assertRaises(HedFileError): + BaseInput(123) + + def test_invalid_input_type_dict(self): + with self.assertRaises(HedFileError): + BaseInput({'key': 'value'}) + + class TestInsertColumns(unittest.TestCase): diff --git a/tests/models/test_sidecar.py b/tests/models/test_sidecar.py index 8383de6f8..897d01563 100644 --- a/tests/models/test_sidecar.py +++ b/tests/models/test_sidecar.py @@ -40,6 +40,18 @@ def setUpClass(cls): def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) + def test_file_not_found(self): + with self.assertRaises(HedFileError): + Sidecar('nonexistent_file.json') + + def test_invalid_input_type_int(self): + with self.assertRaises(HedFileError): + Sidecar(123) + + def test_invalid_input_type_dict(self): + with self.assertRaises(HedFileError): + Sidecar({'key': 'value'}) + def test_invalid_filenames(self): # Handle missing or invalid 
files. invalid_json = "invalidxmlfile.json" diff --git a/tests/models/test_spreadsheet_input.py b/tests/models/test_spreadsheet_input.py index 0a1d83a2e..eeee6bc8d 100644 --- a/tests/models/test_spreadsheet_input.py +++ b/tests/models/test_spreadsheet_input.py @@ -24,6 +24,7 @@ def setUpClass(cls): cls.base_output_folder = base_output os.makedirs(base_output, exist_ok=True) + @classmethod def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) diff --git a/tests/models/test_tabular_input.py b/tests/models/test_tabular_input.py index c3c01f2b7..95eb527f0 100644 --- a/tests/models/test_tabular_input.py +++ b/tests/models/test_tabular_input.py @@ -28,6 +28,8 @@ def setUpClass(cls): cls.sidecar2 = Sidecar(sidecar2_path, name='face_small_json') cls.base_output_folder = base_output_folder + cls.invalid_inputs = [123, {'key': 'value'}, 'nonexistent_file.tsv'] + @classmethod def tearDownClass(cls): shutil.rmtree(cls.base_output_folder) @@ -82,6 +84,19 @@ def test_validate_file_warnings(self): issues2a = input_file2.validate(hed_schema=self.hed_schema, error_handler=ErrorHandler(False)) breakHere = 3 + def test_invalid_file(self): + for invalid_input in self.invalid_inputs: + with self.subTest(input=invalid_input): + with self.assertRaises(HedFileError): + TabularInput(file=invalid_input) + + def test_invalid_sidecar(self): + for invalid_input in self.invalid_inputs: + with self.subTest(input=invalid_input): + with self.assertRaises(HedFileError): + # Replace 'valid_path.tsv' with a path to an existing .tsv file + TabularInput(file=self.events_path, sidecar=invalid_input) + if __name__ == '__main__': unittest.main() diff --git a/tests/tools/analysis/test_hed_tag_counts.py b/tests/tools/analysis/test_hed_tag_counts.py index 0950ea909..5f2eebc27 100644 --- a/tests/tools/analysis/test_hed_tag_counts.py +++ b/tests/tools/analysis/test_hed_tag_counts.py @@ -1,10 +1,10 @@ import os import unittest -from pandas import DataFrame from hed import schema as hedschema -from 
hed.models import Sidecar, TabularInput, HedString, HedTag +from hed.models import Sidecar, TabularInput, HedString +from hed.models.df_util import get_assembled from hed.tools import assemble_hed -from hed.tools.analysis.hed_tag_counts import HedTagCount, HedTagCounts +from hed.tools.analysis.hed_tag_counts import HedTagCounts # noinspection PyBroadException @@ -13,7 +13,7 @@ class Test(unittest.TestCase): @classmethod def setUpClass(cls): bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), - '../../data/bids_tests/eeg_ds003645s_hed')) + '../../data/bids_tests/eeg_ds003645s_hed')) schema_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data/schema_tests/HED8.0.0.xml')) cls.bids_root_path = bids_root_path @@ -25,9 +25,20 @@ def setUpClass(cls): cls.hed_schema = schema sidecar1 = Sidecar(json_path, name='face_sub1_json') input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") + cls.input_data = input_data + cls.sidecar1 = sidecar1 input_df, def_dict = assemble_hed(input_data, sidecar1, schema, expand_defs=False) cls.input_df = input_df cls.def_dict = def_dict + cls.tag_template = { + "Sensory events": ["Sensory-event", "Sensory-presentation", "Sensory-attribute", + "Experimental-stimulus", "Task-stimulus-role", + "Task-attentional-demand", "Incidental", "Instructional", "Warning"], + "Agent actions": ["Agent-action", "Agent", "Action", "Agent-task-role", + "Task-action-type", "Participant-response"], + "Objects": ["Item"], + "Other events": ["Event", "Task-event-role", "Mishap"] + } def test_constructor(self): counts = HedTagCounts('Base_name') @@ -58,10 +69,24 @@ def test_merge_tag_dicts(self): def test_hed_tag_count(self): name = 'Base_name1' counts1 = HedTagCounts(name, 0) - counts1.update_event_counts(HedString(self.input_df.iloc[0]['HED_assembled'], self.hed_schema), + counts1.update_event_counts(HedString(self.input_df.iloc[0]['HED_assembled'], 
self.hed_schema), file_name=name) self.assertIsInstance(counts1, HedTagCounts) + def test_organize_tags(self): + counts = HedTagCounts('Base_name') + hed_strings, definitions = get_assembled(self.input_data, self.sidecar1, self.hed_schema, + extra_def_dicts=None, join_columns=True, + shrink_defs=False, expand_defs=True) + # definitions = input_data.get_definitions().gathered_defs + for hed in hed_strings: + counts.update_event_counts(hed, 'run-1') + self.assertIsInstance(counts.tag_dict, dict) + self.assertEqual(len(counts.tag_dict), 47) + org_tags, leftovers = counts.organize_tags(self.tag_template) + self.assertEqual(len(org_tags), 19) + self.assertEqual(len(leftovers), 22) + if __name__ == '__main__': unittest.main() diff --git a/tests/tools/analysis/test_hed_type_factors.py b/tests/tools/analysis/test_hed_type_factors.py index 5821e2675..378617a12 100644 --- a/tests/tools/analysis/test_hed_type_factors.py +++ b/tests/tools/analysis/test_hed_type_factors.py @@ -139,8 +139,11 @@ def test_get_variable_factors(self): self.assertIsInstance(factors, pd.DataFrame, "get_factors contains dataframe.") self.assertEqual(len(factors), var_sum.number_elements, "get_factors has factors of same length as number of elements") - self.assertEqual(len(factors.columns), summary["levels"] + 1, - 'get_factors has factors levels + 1 (direct references)') + if not var_manager._type_value_map[variable].levels: + self.assertEqual(len(factors.columns), 1) + else: + self.assertEqual(len(factors.columns), summary["levels"], 'get_factors has factors levels') + self.assertEqual(len(factors.columns), len(var_manager._type_value_map[variable].levels)) def test_count_events(self): list1 = [0, 2, 6, 1, 2, 0, 0] diff --git a/tests/tools/analysis/test_hed_type_manager.py b/tests/tools/analysis/test_hed_type_manager.py index 9fd7abce2..657e9fec5 100644 --- a/tests/tools/analysis/test_hed_type_manager.py +++ b/tests/tools/analysis/test_hed_type_manager.py @@ -60,8 +60,8 @@ def 
test_get_factor_vectors(self): df_task = var_manager.get_factor_vectors("task") self.assertEqual(len(df_cond), base_length, "get_factor_vectors returns df same length as original") self.assertEqual(len(df_task), base_length, "get_factor_vectors returns df same length as original if 2 types") - self.assertEqual(len(df_cond.columns), 10, "get_factor_vectors has right number of factors") - self.assertEqual(len(df_task.columns), 4, "get_factor_vectors has right number of factors if 2 types") + self.assertEqual(len(df_cond.columns), 7, "get_factor_vectors has right number of factors") + self.assertEqual(len(df_task.columns), 2, "get_factor_vectors has right number of factors if 2 types") df_baloney = var_manager.get_factor_vectors("baloney") self.assertIsNone(df_baloney, "get_factor_vectors returns None if no factors") diff --git a/tests/tools/analysis/test_hed_type_values.py b/tests/tools/analysis/test_hed_type_values.py index 4b3125353..d8428e23c 100644 --- a/tests/tools/analysis/test_hed_type_values.py +++ b/tests/tools/analysis/test_hed_type_values.py @@ -116,10 +116,10 @@ def test_get_variable_factors(self): df_new1 = var_manager.get_type_factors() self.assertIsInstance(df_new1, DataFrame) self.assertEqual(len(df_new1), 200) - self.assertEqual(len(df_new1.columns), 10) + self.assertEqual(len(df_new1.columns), 7) df_new2 = var_manager.get_type_factors(type_values=["face-type"]) self.assertEqual(len(df_new2), 200) - self.assertEqual(len(df_new2.columns), 4) + self.assertEqual(len(df_new2.columns), 3) df_new3 = var_manager.get_type_factors(type_values=["junk"]) self.assertIsNone(df_new3) diff --git a/tests/tools/remodeling/operations/test_factor_hed_type_op.py b/tests/tools/remodeling/operations/test_factor_hed_type_op.py index 22f39617a..e43e0e803 100644 --- a/tests/tools/remodeling/operations/test_factor_hed_type_op.py +++ b/tests/tools/remodeling/operations/test_factor_hed_type_op.py @@ -33,11 +33,9 @@ def test_valid(self): op = 
FactorHedTypeOp(self.base_parameters) df_new = op.do_op(self.dispatch, self.data_path, 'subj2_run1', sidecar=self.json_path) self.assertEqual(len(df_new), 200, "factor_hed_type_op length is correct") - self.assertEqual(len(df_new.columns), 20, "factor_hed_type_op has correct number of columns") + self.assertEqual(len(df_new.columns), 17, "factor_hed_type_op has correct number of columns") def test_valid_specific_column(self): - # Not implemented yet - # Test correct when all valid and no unwanted information parms = self.base_parameters parms["type_values"] = ["key-assignment"] op = FactorHedTypeOp(parms) @@ -46,7 +44,7 @@ def test_valid_specific_column(self): df_new = op.do_op(dispatch, dispatch.prep_data(df_new), 'run-01', sidecar=self.json_path) df_new = dispatch.post_proc_data(df_new) self.assertEqual(len(df_new), 200, "factor_hed_type_op length is correct when type_values specified") - self.assertEqual(len(df_new.columns), 20, + self.assertEqual(len(df_new.columns), 11, "factor_hed_type_op has correct number of columns when type_values specified") diff --git a/tests/tools/remodeling/operations/test_summarize_column_names_op.py b/tests/tools/remodeling/operations/test_summarize_column_names_op.py index 2ef5eee27..2aadd8e72 100644 --- a/tests/tools/remodeling/operations/test_summarize_column_names_op.py +++ b/tests/tools/remodeling/operations/test_summarize_column_names_op.py @@ -98,7 +98,7 @@ def test_summary(self): dataset_sum = summary['Dataset'] json_str = json.dumps(dataset_sum) json_obj = json.loads(json_str) - columns = json_obj["Overall summary"]["Columns"] + columns = json_obj["Overall summary"]["Specifics"]["Columns"] self.assertEqual(len(columns), 1) self.assertEqual(len(columns[0]['Files']), 2) ind_sum = summary['Individual files'] diff --git a/tests/tools/remodeling/operations/test_summarize_column_values_op.py b/tests/tools/remodeling/operations/test_summarize_column_values_op.py index 2d69655c0..c3cc322d1 100644 --- 
a/tests/tools/remodeling/operations/test_summarize_column_values_op.py +++ b/tests/tools/remodeling/operations/test_summarize_column_values_op.py @@ -5,7 +5,6 @@ from hed.tools.remodeling.dispatcher import Dispatcher from hed.tools.remodeling.operations.summarize_column_values_op import \ ColumnValueSummary, SummarizeColumnValuesOp -from hed.tools.util.io_util import get_file_list class Test(unittest.TestCase): @@ -68,17 +67,16 @@ def test_get_summary(self): self.get_dfs(sum_op, 'name1', dispatch) cont = dispatch.summary_dicts - context1 = cont.get("test summary", None) - self.assertIsInstance(context1, ColumnValueSummary, "get_summary testing ColumnValueSummary") - # summary1 = context1.get_summary() - # self.assertIsInstance(summary1, dict, "get_summary returns a dictionary") - # self.assertIsInstance(summary1["Dataset"], dict) - summary1a = context1.get_summary() + context = cont.get("test summary", None) + self.assertIsInstance(context, ColumnValueSummary, "get_summary testing ColumnValueSummary") + summary1a = context.get_summary() self.assertIsInstance(summary1a, dict) self.assertIsInstance(summary1a["Dataset"], dict) - text_summary = context1.get_text_summary(individual_summaries="separate") - self.assertIsInstance(text_summary, dict) - self.assertIsInstance(text_summary["Dataset"], str) + text_summary1 = context.get_text_summary(individual_summaries=None) + self.assertIsInstance(text_summary1, dict) + self.assertIsInstance(text_summary1["Dataset"], str) + text_summary1a = context.get_text_summary(individual_summaries="separate") + self.assertIsInstance(text_summary1a, dict) self.get_dfs(sum_op, 'name2', dispatch) self.get_dfs(sum_op, 'name3', dispatch) context2 = dispatch.summary_dicts.get(parms['summary_name'], None) diff --git a/tests/tools/remodeling/operations/test_summarize_definitions_op.py b/tests/tools/remodeling/operations/test_summarize_definitions_op.py index 4b4784f64..6cfddbd90 100644 --- 
a/tests/tools/remodeling/operations/test_summarize_definitions_op.py +++ b/tests/tools/remodeling/operations/test_summarize_definitions_op.py @@ -59,6 +59,16 @@ def test_summary(self): self.assertIsInstance(dispatch.summary_dicts[sum_op.summary_name], DefinitionSummary) # print(str(dispatch.summary_dicts[sum_op.summary_name].get_text_summary()['Dataset'])) + cont = dispatch.summary_dicts + context = cont.get("get_definition_summary", None) + self.assertIsInstance(context, DefinitionSummary, "get_summary testing DefinitionSummary") + summary1a = context.get_summary() + self.assertIsInstance(summary1a, dict) + self.assertIsInstance(summary1a["Dataset"], dict) + text_summary1 = context.get_text_summary(individual_summaries=None) + self.assertIsInstance(text_summary1, dict) + self.assertIsInstance(text_summary1["Dataset"], str) + def test_summary_errors(self): dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) parms = json.loads(self.json_parms) diff --git a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py index faa3b79bd..642539967 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py @@ -75,8 +75,8 @@ def test_summary(self): sum_op.do_op(dispatch, dispatch.prep_data(df), 'run-02', sidecar=self.sidecar_path) context2 = dispatch.summary_dicts['AOMIC_condition_variables'] summary2 = context2.get_summary(individual_summaries="separate") - self.assertEqual(summary2['Dataset']['Overall summary']['files'][0], 'run-01') - self.assertEqual(len(summary2['Dataset']['Overall summary']['files']), 2) + self.assertEqual(summary2['Dataset']['Overall summary']['Files'][0], 'run-01') + self.assertEqual(len(summary2['Dataset']['Overall summary']['Files']), 2) summary2a = context2.get_summary(individual_summaries="separate") self.assertIsInstance(summary2a["Individual 
files"]["run-02"], dict) diff --git a/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py b/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py index 7b76e7c1c..9ae1ef776 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py @@ -89,7 +89,7 @@ def test_get_summary_details(self): sum_obj4 = sum_context2.get_summary_details(include_individual=True) self.assertIsInstance(sum_obj4, dict) - def test_get_summary_text_summary(self): + def test_get_summary(self): dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) parms = json.loads(self.json_parms) sum_op = SummarizeHedValidationOp(parms) @@ -97,14 +97,42 @@ def test_get_summary_text_summary(self): df = dispatch.prep_data(df) sum_op.do_op(dispatch, df, 'subj2_run1', sidecar=self.bad_json_path) - sum_context1 = dispatch.summary_dicts[sum_op.summary_name] - text_sum1 = sum_context1.get_text_summary(individual_summaries="separate") + context = dispatch.summary_dicts[sum_op.summary_name] + sum1a = context.get_summary(individual_summaries="separate") + self.assertEqual(len(sum1a['Dataset']['Overall summary']['Files']), 1) + self.assertEqual(sum1a['Dataset']['Overall summary']['Files'][0], 'subj2_run1') + self.assertEqual(len(sum1a['Dataset']['Overall summary']), 5) + sum2a = context.get_summary(individual_summaries="separate") + self.assertIsInstance(sum2a["Individual files"]["subj2_run1"], dict) + sum_op.do_op(dispatch, df, 'subj2_run2', sidecar=self.json_path) + sum_op.do_op(dispatch, df, 'subj2_run3', sidecar=self.bad_json_path) + sum3a = context.get_summary(individual_summaries="none") + self.assertIsInstance(sum3a, dict) + self.assertFalse(sum3a["Individual files"]) + self.assertEqual(len(sum3a['Dataset']['Overall summary']['Files']), 3) + sum3b = context.get_summary(individual_summaries="consolidated") + self.assertEqual(len(sum3b["Individual files"]), 
3) + self.assertEqual(sum3b['Dataset']['Overall summary']['Total files'], 3) + self.assertIsInstance(sum3b, dict) + + def test_get_text_summary(self): + dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) + parms = json.loads(self.json_parms) + sum_op = SummarizeHedValidationOp(parms) + df = pd.read_csv(self.data_path, delimiter='\t', header=0, keep_default_na=False, na_values=",null") + df = dispatch.prep_data(df) + sum_op.do_op(dispatch, df, 'subj2_run1', sidecar=self.bad_json_path) + context = dispatch.summary_dicts[sum_op.summary_name] + text_sum1 = context.get_text_summary(individual_summaries="separate") + self.assertEqual(len(text_sum1), 2) sum_op.do_op(dispatch, df, 'subj2_run2', sidecar=self.json_path) sum_op.do_op(dispatch, df, 'subj2_run3', sidecar=self.bad_json_path) - text_sum2 = sum_context1.get_text_summary(individual_summaries="none") - text_sum3 = sum_context1.get_text_summary(individual_summaries="consolidated") + text_sum2 = context.get_text_summary(individual_summaries="none") + text_sum3 = context.get_text_summary(individual_summaries="consolidated") self.assertIsInstance(text_sum3, dict) self.assertIsInstance(text_sum2, dict) + self.assertEqual(len(text_sum2), 1) + self.assertEqual(len(text_sum3), 1) def test_with_sample_data(self): dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0']) @@ -113,6 +141,8 @@ def test_with_sample_data(self): sum_op = SummarizeHedValidationOp(parms) sum_op.do_op(dispatch, df, 'sub-0013_task-stopsignal_acq-seq_events.tsv', sidecar=self.sample_sidecar_path) sum_context1 = dispatch.summary_dicts[sum_op.summary_name] + self.assertIsInstance(sum_context1, HedValidationSummary) + self.assertEqual(len(sum_context1.summary_dict), 1) if __name__ == '__main__': diff --git a/tests/tools/remodeling/test_dispatcher.py b/tests/tools/remodeling/test_dispatcher.py index e2ff311a9..177a4fe43 100644 --- a/tests/tools/remodeling/test_dispatcher.py +++ 
b/tests/tools/remodeling/test_dispatcher.py @@ -182,8 +182,8 @@ def test_run_operations_hed(self): df = dispatch.run_operations(events_path, sidecar=sidecar_path, verbose=False) self.assertIsInstance(df, pd.DataFrame) self.assertEqual(len(df), 200) - self.assertEqual(len(df.columns), 20) - self.assertIn('key-assignment', df.columns) + self.assertEqual(len(df.columns), 17) + self.assertIn('key-assignment.right-sym-cond', df.columns) def test_save_summaries(self): with open(self.summarize_model) as fp: diff --git a/tests/tools/visualization/test_tag_word_cloud.py b/tests/tools/visualization/test_tag_word_cloud.py new file mode 100644 index 000000000..2b515c941 --- /dev/null +++ b/tests/tools/visualization/test_tag_word_cloud.py @@ -0,0 +1,110 @@ +import unittest +from wordcloud import WordCloud +from hed.tools.visualization import tag_word_cloud +from hed.tools.visualization.tag_word_cloud import load_and_resize_mask +import numpy as np +from PIL import Image, ImageDraw +import os + + +class TestWordCloudFunctions(unittest.TestCase): + def test_convert_summary_to_word_dict(self): + # Assume we have a valid summary_json + summary_json = { + 'Overall summary': { + 'Specifics': { + 'Main tags': { + 'tag_category_1': [ + {'tag': 'tag1', 'events': 5}, + {'tag': 'tag2', 'events': 3} + ], + 'tag_category_2': [ + {'tag': 'tag3', 'events': 7} + ] + } + } + } + } + expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7} + + word_dict = tag_word_cloud.summary_to_dict(summary_json, transform=None, adjustment=0) + self.assertEqual(word_dict, expected_output) + + def test_create_wordcloud(self): + word_dict = {'tag1': 5, 'tag2': 3, 'tag3': 7} + width = 400 + height = 200 + wc = tag_word_cloud.create_wordcloud(word_dict, width=width, height=height) + + self.assertIsInstance(wc, WordCloud) + self.assertEqual(wc.width, width) + self.assertEqual(wc.height, height) + + def test_create_wordcloud_with_empty_dict(self): + # Test creation of word cloud with an empty dictionary + word_dict = {} 
+ with self.assertRaises(ValueError): + tag_word_cloud.create_wordcloud(word_dict) + + def test_create_wordcloud_with_single_word(self): + # Test creation of word cloud with a single word + word_dict = {'single_word': 1} + wc = tag_word_cloud.create_wordcloud(word_dict) + self.assertIsInstance(wc, WordCloud) + # Check that the single word is in the word cloud + self.assertIn('single_word', wc.words_) + + +class TestLoadAndResizeMask(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Create a simple black and white image + cls.original_size = (300, 200) + cls.img = Image.new('L', cls.original_size, 0) # Start with a black image + + # Draw a white circle in the middle of the image + d = ImageDraw.Draw(cls.img) + circle_radius = min(cls.original_size) // 4 # Radius of the circle is a quarter of the smaller dimension of the image + circle_center = (cls.original_size[0] // 2, cls.original_size[1] // 2) # Center of the circle is the center of the image + d.ellipse((circle_center[0] - circle_radius, + circle_center[1] - circle_radius, + circle_center[0] + circle_radius, + circle_center[1] + circle_radius), + fill=255) # Fill the ellipse with white + cls.img_path = 'temp_img.bmp' + cls.img.save(cls.img_path) + + @classmethod + def tearDownClass(cls): + # Clean up the temp image + os.remove(cls.img_path) + + def test_no_resizing(self): + mask = load_and_resize_mask(self.img_path) + mask_img = Image.fromarray(mask) + self.assertEqual((mask_img.width, mask_img.height), self.original_size) + + def test_width_resizing(self): + width = 150 + mask = load_and_resize_mask(self.img_path, width=width) + mask_img = Image.fromarray(mask) + expected_width, expected_height = width, int(self.original_size[1] * width / self.original_size[0]) + self.assertEqual((mask_img.width, mask_img.height), (expected_width, expected_height)) + + def test_height_resizing(self): + height = 100 + mask = load_and_resize_mask(self.img_path, height=height) + mask_img = Image.fromarray(mask) + 
expected_shape = (int(self.original_size[0] * height / self.original_size[1]), height) + self.assertEqual((mask_img.width, mask_img.height), expected_shape) + + def test_both_dimensions_resizing(self): + width, height = 100, 75 + mask = load_and_resize_mask(self.img_path, width=width, height=height) + self.assertEqual(mask.shape, (height, width)) + + def test_mask_color(self): + mask = load_and_resize_mask(self.img_path) + # The source image (mode 'L') was drawn with only fill values 0 and 255, so the mask should contain exactly those values + unique_values = np.unique(mask) + self.assertCountEqual(unique_values, [0, 255]) diff --git a/tests/tools/visualizations/test_tag_word_cloud.py b/tests/tools/visualizations/test_tag_word_cloud.py deleted file mode 100644 index fa09e1710..000000000 --- a/tests/tools/visualizations/test_tag_word_cloud.py +++ /dev/null @@ -1,52 +0,0 @@ -import unittest -from wordcloud import WordCloud -from hed.tools.visualizations import tag_word_cloud - - -class TestWordCloudFunctions(unittest.TestCase): - - def test_convert_summary_to_word_dict(self): - # Assume we have a valid summary_json - summary_json = { - 'Dataset': { - 'Overall summary': { - 'Main tags': { - 'tag_category_1': [ - {'tag': 'tag1', 'events': 5}, - {'tag': 'tag2', 'events': 3} - ], - 'tag_category_2': [ - {'tag': 'tag3', 'events': 7} - ] - } - } - } - } - expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7} - - word_dict = tag_word_cloud.summary_to_dict(summary_json) - self.assertEqual(word_dict, expected_output) - - def test_create_wordcloud(self): - word_dict = {'tag1': 5, 'tag2': 3, 'tag3': 7} - width = 400 - height = 200 - wc = tag_word_cloud.create_wordcloud(word_dict, width, height) - - self.assertIsInstance(wc, WordCloud) - self.assertEqual(wc.width, width) - self.assertEqual(wc.height, height) - - def test_create_wordcloud_with_empty_dict(self): - # Test creation of word cloud with an empty dictionary - word_dict = {} - with self.assertRaises(ValueError): - tag_word_cloud.create_wordcloud(word_dict) - - 
def test_create_wordcloud_with_single_word(self): - # Test creation of word cloud with a single word - word_dict = {'single_word': 1} - wc = tag_word_cloud.create_wordcloud(word_dict) - self.assertIsInstance(wc, WordCloud) - # Check that the single word is in the word cloud - self.assertIn('single_word', wc.words_)