Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion hed/errors/schema_error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,10 @@ def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor):
@hed_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID,
           actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID)
def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character):
    """ Build the message for an invalid allowedCharacter attribute value.

    Parameters:
        tag (str): Name of the schema entry carrying the attribute, interpolated into the message.
        invalid_character (str): The rejected allowedCharacter entry, interpolated into the message.

    Returns:
        str: The message text. The named character classes are listed from
            character_types, so the message stays in step with that table.
    """
    # NOTE(review): imported locally rather than at module level -- presumably to avoid a
    # circular import between hed.errors and hed.schema; confirm before hoisting.
    from hed.schema.hed_schema_constants import character_types
    return (f"Tag '{tag}' has an invalid allowedCharacter: '{invalid_character}'. "
            "Allowed characters are: a single character, "
            f"or one of the following - {', '.join(character_types)}.")


@hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID,
Expand Down
8 changes: 8 additions & 0 deletions hed/schema/hed_schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,11 @@ class HedKey:
NO_LOC_ATTRIB,
UNMERGED_ATTRIBUTE
}

# Named character classes that an allowedCharacter attribute may reference.
# Every entry maps to the set of characters it permits, except "nonascii",
# which maps to its own name as a marker (see the comment on that entry).
character_types = {}
character_types["letters"] = set("abcdefghijklmnopqrstuvwxyz") | set("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
character_types["blank"] = {" "}
character_types["digits"] = set("0123456789")
character_types["alphanumeric"] = character_types["letters"] | character_types["digits"]
character_types["nonascii"] = "nonascii"  # Special case for all other printable unicode characters
8 changes: 6 additions & 2 deletions hed/schema/schema_attribute_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from hed.errors.error_types import SchemaWarnings, ValidationErrors, SchemaAttributeErrors
from hed.errors.error_reporter import ErrorHandler
from hed.schema.hed_cache import get_hed_versions
from hed.schema.hed_schema_constants import HedKey
from hed.schema.hed_schema_constants import HedKey, character_types


def tag_is_placeholder_check(hed_schema, tag_entry, attribute_name):
Expand Down Expand Up @@ -141,6 +141,10 @@ def tag_is_deprecated_check(hed_schema, tag_entry, attribute_name):
deprecated_version = tag_entry.attributes.get(attribute_name, "")
library_name = tag_entry.has_attribute(HedKey.InLibrary, return_value=True)
all_versions = get_hed_versions(library_name=library_name)
if not library_name:
library_name = ""
if library_name == hed_schema.library and hed_schema.version_number not in all_versions:
all_versions.append(hed_schema.version_number)
if deprecated_version and deprecated_version not in all_versions:
issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID,
tag_entry.name,
Expand Down Expand Up @@ -182,7 +186,7 @@ def allowed_characters_check(hed_schema, tag_entry, attribute_name):

"""
issues = []
allowed_strings = {'letters', 'blank', 'digits', 'alphanumeric'}
allowed_strings = character_types

char_string = tag_entry.attributes.get(attribute_name, "")
characters = char_string.split(",")
Expand Down
1 change: 1 addition & 0 deletions hed/schema/schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def check_invalid_chars(self):
for tag_name, desc in self.hed_schema.get_desc_iter():
issues_list += validate_schema_description(tag_name, desc)

# todo: Do we want to add this?
# todo: Activate this section once we have clearer rules on spaces in unit names
# for unit in self.hed_schema.units:
# for i, char in enumerate(unit):
Expand Down
11 changes: 8 additions & 3 deletions hed/tools/visualization/tag_word_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import numpy as np
from PIL import Image
from hed.tools.visualization.word_cloud_util import default_color_func, WordCloud, generate_contour_svg
import matplotlib.font_manager as fm


def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=300, **kwargs):
def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400, height=300, font_path=None, **kwargs):
""" Takes a word dict and returns a generated word cloud object.

Parameters:
Expand All @@ -14,6 +15,8 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
background_color (str or None): If None, transparent background.
width (int): width in pixels.
height (int): height in pixels.
font_path (str): a filename or font name to use. Assumed to be a full file path if it ends with .ttf or .otf.
Font names will use a default if a close enough match isn't found.
kwargs (kwargs): Any other parameters WordCloud accepts, overrides default values where relevant.

Returns:
Expand Down Expand Up @@ -41,9 +44,11 @@ def create_wordcloud(word_dict, mask_path=None, background_color=None, width=400
kwargs.setdefault('color_func', default_color_func)
kwargs.setdefault('relative_scaling', 1)
kwargs.setdefault('max_font_size', height / 20)
kwargs.setdefault('min_font_size', 8),
kwargs.setdefault('min_font_size', 8)
if font_path and not font_path.endswith(".ttf") and not font_path.endswith(".otf"):
font_path = fm.findfont(font_path)

wc = WordCloud(background_color=background_color, mask=mask_image,
wc = WordCloud(font_path=font_path, background_color=background_color, mask=mask_image,
width=width, height=height, mode="RGBA", **kwargs)

wc.generate_from_frequencies(word_dict)
Expand Down
14 changes: 12 additions & 2 deletions hed/validator/hed_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

"""
import re
from semantic_version import Version
from hed.errors.error_types import ValidationErrors, DefinitionErrors
from hed.errors.error_reporter import ErrorHandler, check_for_any_errors

from hed.validator.def_validator import DefValidator
from hed.validator.tag_util import UnitValueValidator, CharValidator, StringValidator, TagValidator, GroupValidator
from hed.schema import HedSchema


class HedValidator:
Expand All @@ -31,8 +33,16 @@ def __init__(self, hed_schema, def_dicts=None, definitions_allowed=False):
self._def_validator = DefValidator(def_dicts, hed_schema)
self._definitions_allowed = definitions_allowed

self._unit_validator = UnitValueValidator()
self._char_validator = CharValidator()
self._validate_characters = False
# todo: This could still do validation on schema groups.
if isinstance(hed_schema, HedSchema):
validation_version = hed_schema.with_standard
if not validation_version:
validation_version = hed_schema.version_number
self._validate_characters = Version(validation_version) >= Version("8.3.0")

self._unit_validator = UnitValueValidator(modern_allowed_char_rules=self._validate_characters)
self._char_validator = CharValidator(modern_allowed_char_rules=self._validate_characters)
self._string_validator = StringValidator()
self._tag_validator = TagValidator()
self._group_validator = GroupValidator(hed_schema)
Expand Down
16 changes: 14 additions & 2 deletions hed/validator/tag_util/char_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ class CharValidator:
INVALID_STRING_CHARS = '[]{}~'
INVALID_STRING_CHARS_PLACEHOLDERS = '[]~'

def __init__(self, modern_allowed_char_rules=False):
"""Does basic character validation for hed strings/tags

Parameters:
modern_allowed_char_rules(bool): If True, use 8.3 style rules for unicode characters.
"""
self._validate_characters = modern_allowed_char_rules

def check_invalid_character_issues(self, hed_string, allow_placeholders):
""" Report invalid characters.

Expand All @@ -33,8 +41,12 @@ def check_invalid_character_issues(self, hed_string, allow_placeholders):
if allow_placeholders:
invalid_dict = self.INVALID_STRING_CHARS_PLACEHOLDERS
for index, character in enumerate(hed_string):
if character in invalid_dict or ord(character) > 127:
validation_issues += self._report_invalid_character_error(hed_string, index)
if self._validate_characters:
if character in invalid_dict or not character.isprintable():
validation_issues += self._report_invalid_character_error(hed_string, index)
else:
if character in invalid_dict or ord(character) > 127:
validation_issues += self._report_invalid_character_error(hed_string, index)

return validation_issues

Expand Down
65 changes: 35 additions & 30 deletions hed/validator/tag_util/class_util.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
""" Utilities to support HED validation. """
import datetime
import re
import functools


from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors
from hed.schema.hed_schema_constants import HedKey, character_types


class UnitValueValidator:
Expand All @@ -18,13 +20,14 @@ class UnitValueValidator:

VALUE_CLASS_ALLOWED_CACHE = 20

def __init__(self, value_validators=None):
def __init__(self, modern_allowed_char_rules=False, value_validators=None):
""" Validates the unit and value classes on a given tag.

Parameters:
value_validators(dict or None): Override or add value class validators

"""
self._validate_characters = modern_allowed_char_rules
self._value_validators = self._get_default_value_class_validators()
if value_validators and isinstance(value_validators, dict):
self._value_validators.update(value_validators)
Expand Down Expand Up @@ -97,25 +100,20 @@ def check_tag_value_class_valid(self, original_tag, validate_text, report_as=Non
"""
return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset)

# char_sets = {
# "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
# "blank": set(" "),
# "digits": set("0123456789"),
# "alphanumeric": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
# }
#
# @functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
# def _get_allowed_characters(self, value_classes):
# # This could be pre-computed
# character_set = set()
# for value_class in value_classes:
# allowed_types = value_class.attributes.get(HedKey.AllowedCharacter, "")
# for single_type in allowed_types.split(","):
# if single_type in self.char_sets:
# character_set.update(self.char_sets[single_type])
# else:
# character_set.add(single_type)
# return character_set
@staticmethod
@functools.lru_cache(maxsize=VALUE_CLASS_ALLOWED_CACHE)
def _expand_allowed_character_specs(allowed_specs):
    """ Expand allowedCharacter attribute strings into the entries they permit.

    Parameters:
        allowed_specs (tuple of str): One comma separated allowedCharacter string per value class.

    Returns:
        frozenset: Every character permitted by the named classes, plus each entry that
            is not a named class, stored as written. The "nonascii" name is also stored
            as written, so callers can test for it.
    """
    character_set = set()
    for allowed_types in allowed_specs:
        for single_type in allowed_types.split(","):
            if single_type in character_types and single_type != "nonascii":
                character_set.update(character_types[single_type])
            else:
                # Not a named class of characters: keep the entry itself.  This covers
                # single literal characters and the "nonascii" marker, whose table
                # value is a string rather than a set of characters.
                character_set.add(single_type)
    # for now, just always allow these special cases (it's validated extensively elsewhere)
    character_set.update("#/")
    return frozenset(character_set)


def _get_allowed_characters(self, value_classes):
    """ Return the characters allowed by the given value classes.

    Parameters:
        value_classes (iterable): Value class entries; each must expose an ``attributes``
            mapping, from which the allowedCharacter attribute is read.

    Returns:
        set: Permitted characters, always including '#' and '/'.  Contains the string
            "nonascii" when a value class lists it.

    Notes:
        - The expansion is cached on the attribute strings, not on ``self`` or on
          ``value_classes``.  A cache keyed on those arguments keeps every validator
          alive and never hits when the caller passes a fresh ``dict.values()`` view,
          because views hash by identity.
    """
    allowed_specs = tuple(value_class.attributes.get(HedKey.AllowedCharacter, "")
                          for value_class in value_classes)
    # Hand back a private copy so callers cannot alter the cached result.
    return set(self._expand_allowed_character_specs(allowed_specs))

def _get_problem_indexes(self, original_tag, stripped_value):
""" Return list of problem indices for error messages.
Expand All @@ -127,19 +125,24 @@ def _get_problem_indexes(self, original_tag, stripped_value):
Returns:
list: List of int locations in which error occurred.
"""
indexes = []
# Extra +1 for the slash
start_index = original_tag.extension.find(stripped_value) + len(original_tag.org_base_tag) + 1
if start_index == -1:
return []
return indexes

problem_indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
return problem_indexes
# Partial implementation of allowedCharacter
# allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())
# if allowed_characters:
# # Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
# indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters]
# pass
if self._validate_characters:
allowed_characters = self._get_allowed_characters(original_tag.value_classes.values())

if allowed_characters:
# Only test the strippedvalue - otherwise numericClass + unitClass won't validate reasonably.
indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char not in allowed_characters]
if "nonascii" in allowed_characters:
# Filter out ascii characters
indexes = [(char, index) for char, index in indexes if not (ord(char) > 127 and char.isprintable())]
else:
indexes = [(char, index + start_index) for index, char in enumerate(stripped_value) if char in "{}"]
return indexes

def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0):
""" Return any issues found if this is a value tag,
Expand Down Expand Up @@ -219,12 +222,14 @@ def validate_value_class_type(self, unit_or_value_portion, valid_types):
type_valid (bool): True if this is one of the valid_types validators.

"""
has_valid_func = False
for unit_class_type in valid_types:
valid_func = self._value_validators.get(unit_class_type)
if valid_func:
has_valid_func = True
if valid_func(unit_or_value_portion):
return True
return False
return not has_valid_func


def is_date_time(date_time_string):
Expand Down
Loading