From eca3e252dd9bf3514d9c33cf168b97aff3dd93ac Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 29 Jun 2023 17:46:05 -0500 Subject: [PATCH 1/3] Second pass word cloud --- docs/requirements.txt | 2 +- hed/tools/visualization/__init__.py | 1 + hed/tools/visualization/tag_word_cloud.py | 78 ++++++++++++- hed/tools/visualization/word_cloud_util.py | 86 ++++++++++++++ requirements.txt | 2 +- .../visualization/test_tag_word_cloud.py | 110 ++++++++++++++++++ .../visualizations/test_tag_word_cloud.py | 52 --------- 7 files changed, 271 insertions(+), 60 deletions(-) create mode 100644 hed/tools/visualization/__init__.py create mode 100644 hed/tools/visualization/word_cloud_util.py create mode 100644 tests/tools/visualization/test_tag_word_cloud.py delete mode 100644 tests/tools/visualizations/test_tag_word_cloud.py diff --git a/docs/requirements.txt b/docs/requirements.txt index 9783a3079..b5ee9d833 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,4 +8,4 @@ portalocker>=2.7.0 semantic_version>=2.10.0 Sphinx>=5.2.2 sphinx_rtd_theme>=1.0.0 -wordcloud>=1.9.2 +wordcloud==1.9.2 diff --git a/hed/tools/visualization/__init__.py b/hed/tools/visualization/__init__.py new file mode 100644 index 000000000..677aba8ac --- /dev/null +++ b/hed/tools/visualization/__init__.py @@ -0,0 +1 @@ +from tag_word_cloud import create_wordcloud, summary_to_dict diff --git a/hed/tools/visualization/tag_word_cloud.py b/hed/tools/visualization/tag_word_cloud.py index 2f2c25236..4e352f387 100644 --- a/hed/tools/visualization/tag_word_cloud.py +++ b/hed/tools/visualization/tag_word_cloud.py @@ -1,13 +1,18 @@ -from wordcloud import WordCloud +import numpy as np +from PIL import Image +from hed.tools.visualization.word_cloud_util import default_color_func, WordCloud -def create_wordcloud(word_dict, width=400, height=200): +def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=200, **kwargs): """Takes a word dict and returns a generated word cloud object Parameters: word_dict(dict): words and their frequencies + mask_path(str or None): The path of the mask file + background_color(str or None): If None, transparent background. width(int): width in pixels height(int): height in pixels + kwargs(kwargs): Any other parameters WordCloud accepts, overrides default values where relevant. Returns: word_cloud(WordCloud): The generated cloud. Use .to_file to save it out as an image. @@ -15,19 +20,35 @@ def create_wordcloud(word_dict, width=400, height=200): :raises ValueError: An empty dictionary was passed """ - wc = WordCloud(background_color='white', width=width, height=height) + mask_image = None + if mask_path: + mask_image = load_and_resize_mask(mask_path, width, height) + width = mask_image.shape[1] + height = mask_image.shape[0] + kwargs.setdefault('contour_width', 3) + kwargs.setdefault('contour_color', 'black') + kwargs.setdefault('prefer_horizontal', 0.75) + kwargs.setdefault('default_color_func', default_color_func) + kwargs.setdefault('relative_scaling', 1) + kwargs.setdefault('max_font_size', height / 15) + kwargs.setdefault('min_font_size', 5) + + wc = WordCloud(background_color=background_color, mask=mask_image, + width=width, height=height, mode="RGBA", **kwargs) wc.generate_from_frequencies(word_dict) return wc -def summary_to_dict(summary): +def summary_to_dict(summary, transform=np.log10, adjustment=5): """Converts a HedTagSummary json dict into the word cloud input format Parameters: summary(dict): The summary from a summarize hed tags op - + transform(func): The function to transform the number of found tags + Default log10 + adjustment(int): Value added after transform. Returns: word_dict(dict): a dict of the words and their occurrence count @@ -35,12 +56,57 @@ def summary_to_dict(summary): A malformed dictionary was passed """ + if transform is None: + transform = lambda x: x overall_summary = summary.get("Overall summary", {}) specifics = overall_summary.get("Specifics", {}) tag_dict = specifics.get("Main tags", {}) word_dict = {} for tag_sub_list in tag_dict.values(): for tag_sub_dict in tag_sub_list: - word_dict[tag_sub_dict['tag']] = tag_sub_dict['events'] + word_dict[tag_sub_dict['tag']] = transform(tag_sub_dict['events']) + adjustment return word_dict + + +def load_and_resize_mask(mask_path, width=None, height=None): + """ Load a mask image and resize it according to given dimensions. + + The image is resized maintaining aspect ratio if only width or height is provided. + + Returns None if no mask_path. + + Parameters: + mask_path (str): The path to the mask image file. + width (int, optional): The desired width of the resized image. If only width is provided, + the image is scaled to maintain its original aspect ratio. Defaults to None. + height (int, optional): The desired height of the resized image. If only height is provided, + the image is scaled to maintain its original aspect ratio. Defaults to None. + + Returns: + numpy.ndarray: The loaded and processed mask image as a numpy array with binary values (0 or 255). + """ + if mask_path: + mask_image = Image.open(mask_path) + + if width or height: + original_size = np.array((mask_image.width, mask_image.height)) + output_size = np.array((width, height)) + # Handle one missing param + if not height: + scale = original_size[0] / width + output_size = original_size / scale + elif not width: + scale = original_size[1] / height + output_size = original_size / scale + + mask_image = mask_image.resize(output_size.astype(int), Image.LANCZOS) + + # Convert to greyscale then to binary black and white (0 or 255) + mask_image = mask_image.convert('L') + mask_image_array = np.array(mask_image) + mask_image_array = np.where(mask_image_array > 127, 255, 0) + else: + mask_image_array = np.array(mask_image) + + return mask_image_array.astype(np.uint8) \ No newline at end of file diff --git a/hed/tools/visualization/word_cloud_util.py b/hed/tools/visualization/word_cloud_util.py new file mode 100644 index 000000000..ba25e0133 --- /dev/null +++ b/hed/tools/visualization/word_cloud_util.py @@ -0,0 +1,86 @@ +import random +from random import Random + +import numpy as np +from PIL import Image, ImageFilter +from matplotlib import cm +from wordcloud import WordCloud + + +def _draw_contour(wc, img): + """Slightly tweaked copy of internal WorldCloud function to allow transparency""" + if wc.mask is None or wc.contour_width == 0 or wc.contour_color is None: + return img + + mask = wc._get_bolean_mask(wc.mask) * 255 + contour = Image.fromarray(mask.astype(np.uint8)) + contour = contour.resize(img.size) + contour = contour.filter(ImageFilter.FIND_EDGES) + contour = np.array(contour) + + # make sure borders are not drawn before changing width + contour[[0, -1], :] = 0 + contour[:, [0, -1]] = 0 + + # use gaussian to change width, divide by 10 to give more resolution + radius = wc.contour_width / 10 + contour = Image.fromarray(contour) + contour = contour.filter(ImageFilter.GaussianBlur(radius=radius)) + contour = np.array(contour) > 0 + if img.mode == 'RGBA': + contour = np.dstack((contour, contour, contour, contour)) + else: + contour = np.dstack((contour, contour, contour)) + + # color the contour + ret = np.array(img) * np.invert(contour) + color = np.array(Image.new(img.mode, img.size, wc.contour_color)) + ret += color * contour + + return Image.fromarray(ret) + +# Replace WordCloud function with one that can handle transparency +WordCloud._draw_contour = _draw_contour + + +def random_color_darker(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None): + """Random color generation func""" + if random_state is None: + random_state = Random() + return f"hsl({random_state.randint(0, 255)}, {random_state.randint(50, 100)}%, {random_state.randint(0, 50)}%)" + + +class ColormapColorFunc: + def __init__(self, colormap='nipy_spectral', color_range=(0.0, 0.5), color_step_range=(0.15, 0.25)): + """Initialize a word cloud color generator. + + Parameters: + colormap (str, optional): The name of the matplotlib colormap to use for generating colors. + Defaults to 'nipy_spectral'. + color_range (tuple of float, optional): A tuple containing the minimum and maximum values to use + from the colormap. Defaults to (0.0, 0.5). + color_step_range (tuple of float, optional): A tuple containing the minimum and maximum values to step + through the colormap. Defaults to (0.15, 0.25). + This is the speed at which it goes through the range chosen. + .25 means it will go through 1/4 of the range each pick. + """ + self.colormap = cm.get_cmap(colormap) + self.color_range = color_range + self.color_step_range = color_step_range + self.current_fraction = random.uniform(0, 1) # Start at a random point + + def color_func(self, word, font_size, position, orientation, random_state=None, **kwargs): + # Update the current color fraction and wrap around if necessary + color_step = random.uniform(*self.color_step_range) + self.current_fraction = (self.current_fraction + color_step) % 1.0 + + # Scale the fraction to the desired range + scaled_fraction = self.color_range[0] + (self.current_fraction * (self.color_range[1] - self.color_range[0])) + + # Get the color from the colormap + color = self.colormap(scaled_fraction) + + return tuple(int(c * 255) for c in color[:3]) # Convert to RGB format + + +default_color_func = ColormapColorFunc().color_func diff --git a/requirements.txt b/requirements.txt index 7dd623faa..96a737c45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,4 @@ openpyxl>=3.1.0 pandas>=1.3.5 portalocker>=2.7.0 semantic_version>=2.10.0 -wordcloud>=1.9.2 +wordcloud==1.9.2 diff --git a/tests/tools/visualization/test_tag_word_cloud.py b/tests/tools/visualization/test_tag_word_cloud.py new file mode 100644 index 000000000..2b515c941 --- /dev/null +++ b/tests/tools/visualization/test_tag_word_cloud.py @@ -0,0 +1,110 @@ +import unittest +from wordcloud import WordCloud +from hed.tools.visualization import tag_word_cloud +from hed.tools.visualization.tag_word_cloud import load_and_resize_mask +import numpy as np +from PIL import Image, ImageDraw +import os + + +class TestWordCloudFunctions(unittest.TestCase): + def test_convert_summary_to_word_dict(self): + # Assume we have a valid summary_json + summary_json = { + 'Overall summary': { + 'Specifics': { + 'Main tags': { + 'tag_category_1': [ + {'tag': 'tag1', 'events': 5}, + {'tag': 'tag2', 'events': 3} + ], + 'tag_category_2': [ + {'tag': 'tag3', 'events': 7} + ] + } + } + } + } + expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7} + + word_dict = tag_word_cloud.summary_to_dict(summary_json, transform=None, adjustment=0) + self.assertEqual(word_dict, expected_output) + + def test_create_wordcloud(self): + word_dict = {'tag1': 5, 'tag2': 3, 'tag3': 7} + width = 400 + height = 200 + wc = tag_word_cloud.create_wordcloud(word_dict, width=width, height=height) + + self.assertIsInstance(wc, WordCloud) + self.assertEqual(wc.width, width) + self.assertEqual(wc.height, height) + + def test_create_wordcloud_with_empty_dict(self): + # Test creation of word cloud with an empty dictionary + word_dict = {} + with self.assertRaises(ValueError): + tag_word_cloud.create_wordcloud(word_dict) + + def test_create_wordcloud_with_single_word(self): + # Test creation of word cloud with a single word + word_dict = {'single_word': 1} + wc = tag_word_cloud.create_wordcloud(word_dict) + self.assertIsInstance(wc, WordCloud) + # Check that the single word is in the word cloud + self.assertIn('single_word', wc.words_) + + +class TestLoadAndResizeMask(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Create a simple black and white image + cls.original_size = (300, 200) + cls.img = Image.new('L', cls.original_size, 0) # Start with a black image + + # Draw a white circle in the middle of the image + d = ImageDraw.Draw(cls.img) + circle_radius = min(cls.original_size) // 4 # Radius of the circle is a quarter of the smaller dimension of the image + circle_center = (cls.original_size[0] // 2, cls.original_size[1] // 2) # Center of the circle is the center of the image + d.ellipse((circle_center[0] - circle_radius, + circle_center[1] - circle_radius, + circle_center[0] + circle_radius, + circle_center[1] + circle_radius), + fill=255) # Fill the ellipse with white + cls.img_path = 'temp_img.bmp' + cls.img.save(cls.img_path) + + @classmethod + def tearDownClass(cls): + # Clean up the temp image + os.remove(cls.img_path) + + def test_no_resizing(self): + mask = load_and_resize_mask(self.img_path) + mask_img = Image.fromarray(mask) + self.assertEqual((mask_img.width, mask_img.height), self.original_size) + + def test_width_resizing(self): + width = 150 + mask = load_and_resize_mask(self.img_path, width=width) + mask_img = Image.fromarray(mask) + expected_width, expected_height = width, int(self.original_size[1] * width / self.original_size[0]) + self.assertEqual((mask_img.width, mask_img.height), (expected_width, expected_height)) + + def test_height_resizing(self): + height = 100 + mask = load_and_resize_mask(self.img_path, height=height) + mask_img = Image.fromarray(mask) + expected_shape = (int(self.original_size[0] * height / self.original_size[1]), height) + self.assertEqual((mask_img.width, mask_img.height), expected_shape) + + def test_both_dimensions_resizing(self): + width, height = 100, 75 + mask = load_and_resize_mask(self.img_path, width=width, height=height) + self.assertEqual(mask.shape, (height, width)) + + def test_mask_color(self): + mask = load_and_resize_mask(self.img_path) + # Since we created an image with '1' mode, all values should be either 0 or 255 + unique_values = np.unique(mask) + self.assertCountEqual(unique_values, [0, 255]) diff --git a/tests/tools/visualizations/test_tag_word_cloud.py b/tests/tools/visualizations/test_tag_word_cloud.py deleted file mode 100644 index 3b0878f79..000000000 --- a/tests/tools/visualizations/test_tag_word_cloud.py +++ /dev/null @@ -1,52 +0,0 @@ -import unittest -from wordcloud import WordCloud -from hed.tools.visualization import tag_word_cloud - - -class TestWordCloudFunctions(unittest.TestCase): - - def test_convert_summary_to_word_dict(self): - # Assume we have a valid summary_json - summary_json = { - 'Dataset': { - 'Overall summary': { - 'Main tags': { - 'tag_category_1': [ - {'tag': 'tag1', 'events': 5}, - {'tag': 'tag2', 'events': 3} - ], - 'tag_category_2': [ - {'tag': 'tag3', 'events': 7} - ] - } - } - } - } - expected_output = {'tag1': 5, 'tag2': 3, 'tag3': 7} - - word_dict = tag_word_cloud.summary_to_dict(summary_json) - self.assertEqual(word_dict, expected_output) - - def test_create_wordcloud(self): - word_dict = {'tag1': 5, 'tag2': 3, 'tag3': 7} - width = 400 - height = 200 - wc = tag_word_cloud.create_wordcloud(word_dict, width, height) - - self.assertIsInstance(wc, WordCloud) - self.assertEqual(wc.width, width) - self.assertEqual(wc.height, height) - - def test_create_wordcloud_with_empty_dict(self): - # Test creation of word cloud with an empty dictionary - word_dict = {} - with self.assertRaises(ValueError): - tag_word_cloud.create_wordcloud(word_dict) - - def test_create_wordcloud_with_single_word(self): - # Test creation of word cloud with a single word - word_dict = {'single_word': 1} - wc = tag_word_cloud.create_wordcloud(word_dict) - self.assertIsInstance(wc, WordCloud) - # Check that the single word is in the word cloud - self.assertIn('single_word', wc.words_) From a47a3361eb78b2e905cbc023b4450a07306da46a Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 29 Jun 2023 17:58:11 -0500 Subject: [PATCH 2/3] Fix reference in init --- hed/tools/visualization/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hed/tools/visualization/__init__.py b/hed/tools/visualization/__init__.py index 677aba8ac..a40c0333b 100644 --- a/hed/tools/visualization/__init__.py +++ b/hed/tools/visualization/__init__.py @@ -1 +1 @@ -from tag_word_cloud import create_wordcloud, summary_to_dict +from .tag_word_cloud import create_wordcloud, summary_to_dict From 2cde33ff507a7db3134506ca6270123638271af0 Mon Sep 17 00:00:00 2001 From: IanCa Date: Fri, 30 Jun 2023 16:06:10 -0500 Subject: [PATCH 3/3] Fix for inflect --- .github/workflows/ci.yaml | 4 ++-- docs/requirements.txt | 4 ++-- requirements.txt | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 120662503..0c7d2d647 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,7 +25,7 @@ jobs: - uses: actions/cache@v3 with: path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }} - name: Install dependencies run: | @@ -85,7 +85,7 @@ jobs: - uses: actions/cache@v3 with: path: ${{ env.pythonLocation }} - key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('dev-requirements.txt') }} + key: ${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}-${{ hashFiles('docs/requirements.txt') }} - name: Install dependencies run: | diff --git a/docs/requirements.txt b/docs/requirements.txt index b5ee9d833..b3aa11ea9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,10 @@ defusedxml>=0.7.1 -inflect>=6.0.2 -myst-parser>=0.18.1 +inflect==6.0.2 numpy>=1.21.6 openpyxl>=3.1.0 pandas>=1.3.5 portalocker>=2.7.0 +pydantic<2 # For compatibility with inflect semantic_version>=2.10.0 Sphinx>=5.2.2 sphinx_rtd_theme>=1.0.0 diff --git a/requirements.txt b/requirements.txt index 96a737c45..f23f4bd4d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ defusedxml>=0.7.1 -inflect>=6.0.2 +inflect==6.0.2 numpy>=1.21.6 openpyxl>=3.1.0 pandas>=1.3.5 portalocker>=2.7.0 +pydantic<2 # For compatibility with inflect semantic_version>=2.10.0 -wordcloud==1.9.2 +wordcloud==1.9.2 \ No newline at end of file