diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index 82d0d99bc..7abe01bbd 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -4,8 +4,10 @@ import re import pandas as pd +from pandas import DataFrame, Series from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput + from hed.errors.exceptions import HedFileError from hed.models import df_util from hed.tools.bids.bids_dataset import BidsDataset @@ -63,21 +65,6 @@ def df_to_hed(dataframe, description_tag=True): return hed_dict -def series_to_factor(series): - """Convert a series to an integer factor list. - - Parameters: - series (Series) - Series to be converted to a list. - - Returns: - list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. - """ - replaced = series.replace('n/a', False) - filled = replaced.fillna(False) - bool_list = filled.astype(bool).tolist() - return [int(value) for value in bool_list] - - def extract_tags(hed_string, search_tag): """ Extract all instances of specified tag from a tag_string. @@ -129,6 +116,7 @@ def generate_sidecar_entry(column_name, column_values=None): sidecar_entry["HED"] = hed return sidecar_entry + def get_bids_dataset(data_root): """ Return a BIDS dataset object given a path to a dataset root. @@ -141,6 +129,7 @@ def get_bids_dataset(data_root): """ return BidsDataset(data_root) + def hed_to_df(sidecar_dict, col_names=None): """ Return a 4-column dataframe of HED portions of sidecar. @@ -202,6 +191,21 @@ def merge_hed_dict(sidecar_dict, hed_dict): sidecar_dict[key]['Levels'] = value_dict['Levels'] +def series_to_factor(series): + """Convert a series to an integer factor list. + + Parameters: + series (Series) - Series to be converted to a list. + + Returns: + list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. + """ + replaced = series.replace('n/a', False) + filled = replaced.fillna(False) + bool_list = filled.astype(bool).tolist() + return [int(value) for value in bool_list] + + def str_to_tabular(tsv_str, sidecar=None): """ Return a TabularInput a tsv string. @@ -223,9 +227,11 @@ def strs_to_sidecar(sidecar_strings): sidecar_strings (string or list): String or strings representing sidecars. Returns: - Sidecar: the merged sidecar from the list. + Sidecar or None: the merged sidecar from the list. """ + if not sidecar_strings: + return None if not isinstance(sidecar_strings, list): sidecar_strings = [sidecar_strings] if sidecar_strings: @@ -236,6 +242,7 @@ def strs_to_sidecar(sidecar_strings): else: return None + def to_factor(data, column=None): """Convert data to an integer factor list. @@ -246,11 +253,11 @@ def to_factor(data, column=None): Returns: list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. """ - if isinstance(data, pd.Series): + if isinstance(data, Series): series = data - elif isinstance(data, pd.DataFrame) and column: + elif isinstance(data, DataFrame) and column: series = data[column] - elif isinstance(data, pd.DataFrame): + elif isinstance(data, DataFrame): series = data.iloc[:, 0] else: raise HedFileError("CannotConvertToFactor", diff --git a/hed/tools/visualization/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py index fefef35fa..90a06a877 100644 --- a/hed/tools/visualization/tag_word_cloud.py +++ b/hed/tools/visualization/tag_word_cloud.py @@ -46,7 +46,7 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400 kwargs.setdefault('min_font_size', 8) if 'font_path' not in kwargs: kwargs['font_path'] = None - elif kwargs['font_path'] and not kwargs['font_path'].lower().endswith((".ttf", ".otf")): + elif kwargs['font_path'] and not kwargs['font_path'].lower().endswith((".ttf", ".otf", ".ttc")): raise HedFileError("InvalidFontPath", f"Font {kwargs['font_path']} not valid on this system", "") wc = WordCloud(background_color=background_color, mask=mask_image, diff --git a/tests/tools/analysis/test_annotation_util.py b/tests/tools/analysis/test_annotation_util.py index d7bc2de3a..0de57665b 100644 --- a/tests/tools/analysis/test_annotation_util.py +++ b/tests/tools/analysis/test_annotation_util.py @@ -9,6 +9,7 @@ from hed.models.sidecar import Sidecar from hed.models.hed_string import HedString from hed.models.tabular_input import TabularInput + from hed.tools.analysis import annotation_util from hed.tools.analysis.tabular_summary import TabularSummary from hed.tools.util import io_util @@ -196,17 +197,6 @@ def test_generate_sidecar_entry(self): self.assertIsInstance(entry2['HED'], str, "generate_sidecar_entry HED entry should be str when no column values") - def test_series_to_factor(self): - series1 = Series([1.0, 2.0, 3.0, 4.0]) - factor1 = annotation_util.series_to_factor(series1) - self.assertEqual(len(series1), len(factor1)) - self.assertEqual(sum(factor1), len(factor1)) - series2 = Series(['a', '', None, np.NAN, 'n/a']) - factor2 = annotation_util.series_to_factor(series2) - self.assertEqual(len(series2), len(factor2)) - self.assertEqual(sum(factor2), 1) - - def test_generate_sidecar_entry_non_letters(self): entry1 = annotation_util.generate_sidecar_entry('my !#$-123_10', column_values=['apple 1', '@banana', 'grape%cherry&']) @@ -294,7 +284,7 @@ def test_merge_hed_dict_full(self): skip_columns = ["onset", "duration", "sample", "trial", "response_time"] value_columns = ["rep_lag", "stim_file", "value"] event_files = io_util.get_file_list(self.bids_root_path, extensions=[".tsv"], name_suffix="_events", - exclude_dirs=exclude_dirs) + exclude_dirs=exclude_dirs) value_sum = TabularSummary(value_cols=value_columns, skip_cols=skip_columns) value_sum.update(event_files) sidecar_template = value_sum.extract_sidecar_template() @@ -305,6 +295,37 @@ def test_merge_hed_dict_full(self): annotation_util.merge_hed_dict(example_sidecar, spreadsheet_sidecar) self.assertEqual(6, len(example_sidecar), 'merge_hed_dict merges with the correct length') + def test_to_factor(self): + series1 = Series([1.0, 2.0, 3.0, 4.0]) + factor1 = annotation_util.to_factor(series1) + self.assertEqual(len(series1), len(factor1)) + self.assertEqual(sum(factor1), len(factor1)) + series2 = Series(['a', '', None, np.NAN, 'n/a']) + factor2 = annotation_util.to_factor(series2) + self.assertEqual(len(series2), len(factor2)) + self.assertEqual(sum(factor2), 1) + data = { + 'Name': ['Alice', '', 'n/a', 1.0], # Contains a space + 'Age': [25, np.NaN, 35, 0] + } + df = DataFrame(data) + factor3 = annotation_util.to_factor(df, column='Name') + self.assertEqual(sum(factor3), 2) + factor4 = annotation_util.to_factor(df) + self.assertEqual(sum(factor4), 2) + with self.assertRaises(HedFileError) as context5: + annotation_util.to_factor(data) + + def test_series_to_factor(self): + series1 = Series([1.0, 2.0, 3.0, 4.0]) + factor1 = annotation_util.series_to_factor(series1) + self.assertEqual(len(series1), len(factor1)) + self.assertEqual(sum(factor1), len(factor1)) + series2 = Series(['a', '', None, np.NAN, 'n/a']) + factor2 = annotation_util.series_to_factor(series2) + self.assertEqual(len(series2), len(factor2)) + self.assertEqual(sum(factor2), 1) + def test_strs_to_sidecar(self): with open(self.json_path, 'r') as fp: sidecar_dict = json.load(fp) @@ -336,7 +357,6 @@ def test_to_strlist(self): self.assertEqual(str_list2[0], 'Red,Sensory-event') self.assertEqual(str_list2[2], '') - def test_flatten_cat_col(self): col1 = self.sidecar2c["a"] col2 = self.sidecar2c["b"] @@ -358,13 +378,14 @@ def test_flatten_cat_col(self): "_flatten_cat_col should use the Description tag if available") def test_flatten_cat_col_only_description(self): - keys, values, descriptions, tags = annotation_util._flatten_cat_col("event_type", - {"HED": {"code1": "Description/Code 1 here."}}) + keys, values, descriptions, tags = \ + annotation_util._flatten_cat_col("event_type", {"HED": {"code1": "Description/Code 1 here."}}) self.assertIsInstance(tags, list) self.assertEqual(tags[0], 'n/a') def test_flatten_val_col_only_description(self): - keys, values, descriptions, tags = annotation_util._flatten_val_col("response", {"HED": "Description/Code 1 here."}) + keys, values, descriptions, tags = annotation_util._flatten_val_col("response", + {"HED": "Description/Code 1 here."}) self.assertEqual(descriptions[0], 'Code 1 here.') self.assertFalse(tags[0])