From 9c0b3d38a403ff6493bb47038e110ab880bcc674 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Sat, 27 Apr 2024 11:45:40 -0500 Subject: [PATCH 1/5] Updated the to_factor helper to also handle DataFrame --- hed/tools/analysis/annotation_util.py | 42 +++++++++++++------- tests/tools/analysis/test_annotation_util.py | 35 ++++++++++------ 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index d80ed8d2e..acd9ab208 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -2,7 +2,7 @@ import io import re -from pandas import DataFrame, to_numeric +from pandas import DataFrame, Series, to_numeric from hed.models import Sidecar, TabularInput from hed.errors.exceptions import HedFileError from hed.models.df_util import replace_ref @@ -60,20 +60,6 @@ def df_to_hed(dataframe, description_tag=True): return hed_dict -def series_to_factor(series): - """Convert a series to an integer factor list. - - Parameters: - series (Series) - Series to be converted to a list. - - Returns: - list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. - """ - replaced = series.replace('n/a', False) - filled = replaced.fillna(False) - bool_list = filled.astype(bool).tolist() - return [int(value) for value in bool_list] - def extract_tags(hed_string, search_tag): """ Extract all instances of specified tag from a tag_string. @@ -223,6 +209,32 @@ def strs_to_sidecar(sidecar_strings): return None +def to_factor(data, column=None): + """Convert data to an integer factor list. + + Parameters: + data (Series or DataFrame) - Series to be converted to a list. + column (str): Optional column name if DataFrame (otherwise column 0). + + Returns: + list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. + """ + if isinstance(data, Series): + series = data + elif isinstance(data, DataFrame) and column: + series = data[column] + elif isinstance(data, DataFrame): + series = data.iloc[:, 0] + else: + raise HedFileError("CannotConvertToFactor", + f"Expecting Series or DataFrame but got {type(data)}", "") + + replaced = series.replace('n/a', False) + filled = replaced.fillna(False) + bool_list = filled.astype(bool).tolist() + return [int(value) for value in bool_list] + + def to_strlist(obj_list): """ Return a list with the objects converted to string except for None elements. diff --git a/tests/tools/analysis/test_annotation_util.py b/tests/tools/analysis/test_annotation_util.py index c379d63b1..175b926cb 100644 --- a/tests/tools/analysis/test_annotation_util.py +++ b/tests/tools/analysis/test_annotation_util.py @@ -10,7 +10,7 @@ from hed.models.hed_string import HedString from hed.models.tabular_input import TabularInput from hed.tools.analysis.annotation_util import check_df_columns, df_to_hed, extract_tags, \ - hed_to_df, merge_hed_dict, series_to_factor, strs_to_sidecar, str_to_tabular, to_strlist + hed_to_df, merge_hed_dict, to_factor, strs_to_sidecar, str_to_tabular, to_strlist from hed.tools.analysis.annotation_util import _flatten_cat_col, _flatten_val_col, _get_value_entry, _tag_list_to_str, \ _update_cat_dict, generate_sidecar_entry from hed.tools.analysis.tabular_summary import TabularSummary @@ -199,17 +199,6 @@ def test_generate_sidecar_entry(self): self.assertIsInstance(entry2['HED'], str, "generate_sidecar_entry HED entry should be str when no column values") - def test_series_to_factor(self): - series1 = Series([1.0, 2.0, 3.0, 4.0]) - factor1 = series_to_factor(series1) - self.assertEqual(len(series1), len(factor1)) - self.assertEqual(sum(factor1), len(factor1)) - series2 = Series(['a', '', None, np.NAN, 'n/a']) - factor2 = series_to_factor(series2) - self.assertEqual(len(series2), len(factor2)) - self.assertEqual(sum(factor2), 1) - - def test_generate_sidecar_entry_non_letters(self): entry1 = generate_sidecar_entry('my !#$-123_10', column_values=['apple 1', '@banana', 'grape%cherry&']) self.assertIsInstance(entry1, dict, @@ -307,6 +296,27 @@ def test_merge_hed_dict_full(self): merge_hed_dict(example_sidecar, spreadsheet_sidecar) self.assertEqual(6, len(example_sidecar), 'merge_hed_dict merges with the correct length') + def test_to_factor(self): + series1 = Series([1.0, 2.0, 3.0, 4.0]) + factor1 = to_factor(series1) + self.assertEqual(len(series1), len(factor1)) + self.assertEqual(sum(factor1), len(factor1)) + series2 = Series(['a', '', None, np.NAN, 'n/a']) + factor2 = to_factor(series2) + self.assertEqual(len(series2), len(factor2)) + self.assertEqual(sum(factor2), 1) + data = { + 'Name': ['Alice', '', 'n/a', 1.0], # Contains a space + 'Age': [25, np.NaN, 35, 0] + } + df = DataFrame(data) + factor3 = to_factor(df, column='Name') + self.assertEqual(sum(factor3), 2) + factor4 = to_factor(df) + self.assertEqual(sum(factor4), 2) + with self.assertRaises(HedFileError) as context5: + to_factor(data) + def test_strs_to_sidecar(self): with open(self.json_path, 'r') as fp: sidecar_dict = json.load(fp) @@ -338,7 +348,6 @@ def test_to_strlist(self): self.assertEqual(str_list2[0], 'Red,Sensory-event') self.assertEqual(str_list2[2], '') - def test_flatten_cat_col(self): col1 = self.sidecar2c["a"] col2 = self.sidecar2c["b"] From 98775408b1d43fa3283f9c55fd026a7c9553f2b9 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 14 May 2024 15:46:21 -0500 Subject: [PATCH 2/5] Updated imports --- hed/tools/analysis/annotation_util.py | 16 +++++ tests/tools/analysis/test_annotation_util.py | 61 ++++++++++---------- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index f0a66c987..c8733d06b 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -4,6 +4,7 @@ import re import pandas as pd +from pandas import DataFrame, Series from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput @@ -189,6 +190,21 @@ def merge_hed_dict(sidecar_dict, hed_dict): sidecar_dict[key]['Levels'] = value_dict['Levels'] +def series_to_factor(series): + """Convert a series to an integer factor list. + + Parameters: + series (Series) - Series to be converted to a list. + + Returns: + list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. + """ + replaced = series.replace('n/a', False) + filled = replaced.fillna(False) + bool_list = filled.astype(bool).tolist() + return [int(value) for value in bool_list] + + def str_to_tabular(tsv_str, sidecar=None): """ Return a TabularInput a tsv string. diff --git a/tests/tools/analysis/test_annotation_util.py b/tests/tools/analysis/test_annotation_util.py index 09933a50d..0de57665b 100644 --- a/tests/tools/analysis/test_annotation_util.py +++ b/tests/tools/analysis/test_annotation_util.py @@ -197,20 +197,6 @@ def test_generate_sidecar_entry(self): self.assertIsInstance(entry2['HED'], str, "generate_sidecar_entry HED entry should be str when no column values") -<<<<<<< HEAD -======= - def test_series_to_factor(self): - series1 = Series([1.0, 2.0, 3.0, 4.0]) - factor1 = annotation_util.series_to_factor(series1) - self.assertEqual(len(series1), len(factor1)) - self.assertEqual(sum(factor1), len(factor1)) - series2 = Series(['a', '', None, np.NAN, 'n/a']) - factor2 = annotation_util.series_to_factor(series2) - self.assertEqual(len(series2), len(factor2)) - self.assertEqual(sum(factor2), 1) - - ->>>>>>> 1a62ba9386a7d690124be4998a6b4d9267b66201 def test_generate_sidecar_entry_non_letters(self): entry1 = annotation_util.generate_sidecar_entry('my !#$-123_10', column_values=['apple 1', '@banana', 'grape%cherry&']) @@ -298,7 +284,7 @@ def test_merge_hed_dict_full(self): skip_columns = ["onset", "duration", "sample", "trial", "response_time"] value_columns = ["rep_lag", "stim_file", "value"] event_files = io_util.get_file_list(self.bids_root_path, extensions=[".tsv"], name_suffix="_events", - exclude_dirs=exclude_dirs) + exclude_dirs=exclude_dirs) value_sum = TabularSummary(value_cols=value_columns, skip_cols=skip_columns) value_sum.update(event_files) sidecar_template = value_sum.extract_sidecar_template() @@ -309,26 +295,36 @@ def test_merge_hed_dict_full(self): annotation_util.merge_hed_dict(example_sidecar, spreadsheet_sidecar) self.assertEqual(6, len(example_sidecar), 'merge_hed_dict merges with the correct length') - def test_to_factor(self): + def test_to_factor(self): + series1 = Series([1.0, 2.0, 3.0, 4.0]) + factor1 = annotation_util.to_factor(series1) + self.assertEqual(len(series1), len(factor1)) + self.assertEqual(sum(factor1), len(factor1)) + series2 = Series(['a', '', None, np.NAN, 'n/a']) + factor2 = annotation_util.to_factor(series2) + self.assertEqual(len(series2), len(factor2)) + self.assertEqual(sum(factor2), 1) + data = { + 'Name': ['Alice', '', 'n/a', 1.0], # Contains a space + 'Age': [25, np.NaN, 35, 0] + } + df = DataFrame(data) + factor3 = annotation_util.to_factor(df, column='Name') + self.assertEqual(sum(factor3), 2) + factor4 = annotation_util.to_factor(df) + self.assertEqual(sum(factor4), 2) + with self.assertRaises(HedFileError) as context5: + annotation_util.to_factor(data) + + def test_series_to_factor(self): series1 = Series([1.0, 2.0, 3.0, 4.0]) - factor1 = to_factor(series1) + factor1 = annotation_util.series_to_factor(series1) self.assertEqual(len(series1), len(factor1)) self.assertEqual(sum(factor1), len(factor1)) series2 = Series(['a', '', None, np.NAN, 'n/a']) - factor2 = to_factor(series2) + factor2 = annotation_util.series_to_factor(series2) self.assertEqual(len(series2), len(factor2)) self.assertEqual(sum(factor2), 1) - data = { - 'Name': ['Alice', '', 'n/a', 1.0], # Contains a space - 'Age': [25, np.NaN, 35, 0] - } - df = DataFrame(data) - factor3 = to_factor(df, column='Name') - self.assertEqual(sum(factor3), 2) - factor4 = to_factor(df) - self.assertEqual(sum(factor4), 2) - with self.assertRaises(HedFileError) as context5: - to_factor(data) def test_strs_to_sidecar(self): with open(self.json_path, 'r') as fp: @@ -382,13 +378,14 @@ def test_flatten_cat_col(self): "_flatten_cat_col should use the Description tag if available") def test_flatten_cat_col_only_description(self): - keys, values, descriptions, tags = annotation_util._flatten_cat_col("event_type", - {"HED": {"code1": "Description/Code 1 here."}}) + keys, values, descriptions, tags = \ + annotation_util._flatten_cat_col("event_type", {"HED": {"code1": "Description/Code 1 here."}}) self.assertIsInstance(tags, list) self.assertEqual(tags[0], 'n/a') def test_flatten_val_col_only_description(self): - keys, values, descriptions, tags = annotation_util._flatten_val_col("response", {"HED": "Description/Code 1 here."}) + keys, values, descriptions, tags = annotation_util._flatten_val_col("response", + {"HED": "Description/Code 1 here."}) self.assertEqual(descriptions[0], 'Code 1 here.') self.assertFalse(tags[0]) From d5fef610e3449a7a3b3316085cf5c64f006d72eb Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 14 May 2024 17:15:38 -0500 Subject: [PATCH 3/5] Updated return value on strs_sidecar --- hed/tools/analysis/annotation_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index c8733d06b..13143e354 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -226,9 +226,11 @@ def strs_to_sidecar(sidecar_strings): sidecar_strings (string or list): String or strings representing sidecars. Returns: - Sidecar: the merged sidecar from the list. + Sidecar or None: the merged sidecar from the list. """ + if not sidecar_strings: + return None if not isinstance(sidecar_strings, list): sidecar_strings = [sidecar_strings] if sidecar_strings: From 1d880edb80c93145e4eb21a1388dbca44e97b617 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Wed, 15 May 2024 05:28:55 -0500 Subject: [PATCH 4/5] Updated allowed font extensions --- hed/tools/visualization/tag_word_cloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hed/tools/visualization/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py index fefef35fa..90a06a877 100644 --- a/hed/tools/visualization/tag_word_cloud.py +++ b/hed/tools/visualization/tag_word_cloud.py @@ -46,7 +46,7 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400 kwargs.setdefault('min_font_size', 8) if 'font_path' not in kwargs: kwargs['font_path'] = None - elif kwargs['font_path'] and not kwargs['font_path'].lower().endswith((".ttf", ".otf")): + elif kwargs['font_path'] and not kwargs['font_path'].lower().endswith((".ttf", ".otf", ".ttc")): raise HedFileError("InvalidFontPath", f"Font {kwargs['font_path']} not valid on this system", "") wc = WordCloud(background_color=background_color, mask=mask_image, From 10eae2870257148dd9c53bfeb2ca04e84574ce21 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Wed, 15 May 2024 08:41:49 -0500 Subject: [PATCH 5/5] Updated annotation_util formating --- hed/tools/analysis/annotation_util.py | 28 ++------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/hed/tools/analysis/annotation_util.py b/hed/tools/analysis/annotation_util.py index 13143e354..7abe01bbd 100644 --- a/hed/tools/analysis/annotation_util.py +++ b/hed/tools/analysis/annotation_util.py @@ -65,7 +65,6 @@ def df_to_hed(dataframe, description_tag=True): return hed_dict - def extract_tags(hed_string, search_tag): """ Extract all instances of specified tag from a tag_string. @@ -117,6 +116,7 @@ def generate_sidecar_entry(column_name, column_values=None): sidecar_entry["HED"] = hed return sidecar_entry + def get_bids_dataset(data_root): """ Return a BIDS dataset object given a path to a dataset root. @@ -129,6 +129,7 @@ def get_bids_dataset(data_root): """ return BidsDataset(data_root) + def hed_to_df(sidecar_dict, col_names=None): """ Return a 4-column dataframe of HED portions of sidecar. @@ -241,31 +242,6 @@ def strs_to_sidecar(sidecar_strings): else: return None -def to_factor(data, column=None): - """Convert data to an integer factor list. - - Parameters: - data (Series or DataFrame) - Series to be converted to a list. - column (str): Optional column name if DataFrame (otherwise column 0). - - Returns: - list - contains 0's and 1's, empty, 'n/a' and np.NAN are converted to 0. - """ - if isinstance(data, pd.Series): - series = data - elif isinstance(data, pd.DataFrame) and column: - series = data[column] - elif isinstance(data, pd.DataFrame): - series = data.iloc[:, 0] - else: - raise HedFileError("CannotConvertToFactor", - f"Expecting Series or DataFrame but got {type(data)}", "") - - replaced = series.replace('n/a', False) - filled = replaced.fillna(False) - bool_list = filled.astype(bool).tolist() - return [int(value) for value in bool_list] - def to_factor(data, column=None): """Convert data to an integer factor list.