diff --git a/.travis.yml b/.travis.yml index a979755..c00ee3e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,48 @@ -# This deliberately is not "python" as a work-around to support -# multi-os builds with custom Python versions in Travis CI. -language: cpp - -os: - - osx - - linux +language: generic env: matrix: - - PYTHON_EXE="`pyenv install -s 2.7.13 && pyenv local 2.7.13`" - - PYTHON_EXE="`pyenv install -s 3.5.3 && pyenv local 3.5.3`" + - PYTHON_EXE="`pyenv install -s 2.7.14 && pyenv local 2.7.14`" - PYTHON_EXE="`pyenv install -s 3.6.1 && pyenv local 3.6.1`" + +# Travis does not offer OSX with arbitrary python versions (like 2.7.13 above) +# So, you cannot simply have the following section in your build matrix: +# os: +# - linux +# - osx +# Instead, you have to include OSX entries into the build matrix manually. +# In particular, this means specifying the environment variables again. + +# The following was adapted from here: +# https://docs.travis-ci.com/user/multi-os/ +# Set `language: generic` to clear `language: python` from above +# Set `python:` (to empty) to clear it from the travis-ci web interface +# Set `osx_image: xcode7.3` to pin OSX version see here: +# https://docs.travis-ci.com/user/osx-ci-environment/ + +matrix: + include: + - os: osx + language: generic + python: + osx_image: xcode7.3 + env: PYTHON_EXE="`pyenv install -s 2.7.14 && pyenv local 2.7.14`" + - os: osx + language: generic + python: + osx_image: xcode7.3 + env: PYTHON_EXE="`pyenv install -s 3.6.1 && pyenv local 3.6.1`" + + install: - pyenv install --list + - echo $PYTHON_EXE + - python --version - ./configure before_script: - - bin/pip install aboutcode-toolkit - - bin/about-code check --show-all . + - bin/about-code check --verbose . script: - "bin/py.test -vvs" @@ -32,4 +56,4 @@ notifications: use_notice: true skip_join: true template: - - "%{repository_slug}#%{build_number} (%{branch} - %{commit} : %{author}): %{message} : %{build_url}" + - "%{repository_slug}#%{build_number} (%{branch}-%{commit}:%{author})-%{message}- %{build_url}" diff --git a/README.rst b/README.rst index 77f24fb..09587d0 100644 --- a/README.rst +++ b/README.rst @@ -104,6 +104,8 @@ And expression can be simplified: >>> expression2 = ' GPL-2.0 or (mit and LGPL 2.1) or bsd Or GPL-2.0 or (mit and LGPL 2.1)' >>> parsed2 = licensing.parse(expression2) + >>> str(parsed2) + 'GPL-2.0 OR (mit AND LGPL 2.1) OR BSD OR GPL-2.0 OR (mit AND LGPL 2.1)' >>> assert str(parsed2.simplify()) == 'BSD OR GPL-2.0 OR (LGPL 2.1 AND mit)' Two expressions can be compared for equivalence and containment: diff --git a/configure b/configure index 8ceb9d6..4f9fdcc 100755 --- a/configure +++ b/configure @@ -16,17 +16,15 @@ CONF_DEFAULT="etc/conf/dev" CFG_CMD_LINE_ARGS="$@" -if [ "$1" == "--init" ]; then - CFG_CMD_LINE_ARGS=$CONF_INIT -fi - -if [ "$1" == "" ]; then +if [[ "$1" == "" ]]; then # default for dev conf if not argument is provided CFG_CMD_LINE_ARGS=$CONF_DEFAULT fi -if [ "$PYTHON_EXE" == "" ]; then +CONFIGURE_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [[ "$PYTHON_EXE" == "" ]]; then PYTHON_EXE=python fi -$PYTHON_EXE etc/configure.py $CFG_CMD_LINE_ARGS +$PYTHON_EXE "$CONFIGURE_ROOT_DIR/etc/configure.py" $CFG_CMD_LINE_ARGS diff --git a/setup.py b/setup.py index ea2eae5..87a527c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name='license-expression', - version='0.98', + version='0.99', license='apache-2.0', description=desc, long_description=desc, @@ -30,7 +30,7 @@ include_package_data=True, zip_safe=False, 
classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: Apache Software License', 'Intended Audience :: Developers', 'Operating System :: OS Independent', @@ -48,6 +48,6 @@ 'licence' ], install_requires=[ - 'boolean.py >= 3.5, < 4.0.0', + 'boolean.py >= 3.6, < 4.0.0', ] ) diff --git a/src/license_expression/__init__.py b/src/license_expression/__init__.py index 43c7b2c..e37f183 100644 --- a/src/license_expression/__init__.py +++ b/src/license_expression/__init__.py @@ -31,16 +31,10 @@ from __future__ import unicode_literals from __future__ import print_function -# Python 2 and 3 support -try: - # Python 2 - unicode - str = unicode # NOQA -except NameError: - # Python 3 - unicode = str # NOQA - -import collections +from collections import defaultdict +from collections import deque +from collections import namedtuple +from collections import OrderedDict from copy import copy from copy import deepcopy from functools import total_ordering @@ -56,9 +50,11 @@ from boolean.boolean import PARSE_ERRORS from boolean.boolean import PARSE_INVALID_EXPRESSION from boolean.boolean import PARSE_INVALID_NESTING +from boolean.boolean import PARSE_INVALID_OPERATOR_SEQUENCE from boolean.boolean import PARSE_INVALID_SYMBOL_SEQUENCE from boolean.boolean import PARSE_UNBALANCED_CLOSING_PARENS from boolean.boolean import PARSE_UNKNOWN_TOKEN + from boolean.boolean import ParseError from boolean.boolean import TOKEN_SYMBOL from boolean.boolean import TOKEN_AND @@ -66,9 +62,19 @@ from boolean.boolean import TOKEN_LPAR from boolean.boolean import TOKEN_RPAR -from license_expression._pyahocorasick import Trie as Scanner -from license_expression._pyahocorasick import Output -from license_expression._pyahocorasick import Result +from license_expression._pyahocorasick import Trie as AdvancedTokenizer +from license_expression._pyahocorasick import Token + + +# Python 2 and 3 support +try: + # Python 2 + unicode + str = unicode # NOQA +except NameError: + # Python 3 + unicode = str # NOQA + # append new error codes to PARSE_ERRORS by monkey patching PARSE_EXPRESSION_NOT_UNICODE = 100 @@ -98,29 +104,65 @@ class ExpressionError(Exception): # Used for tokenizing -Keyword = collections.namedtuple('Keyword', 'value type') +Keyword = namedtuple('Keyword', 'value type') +Keyword.__len__ = lambda self: len(self.value) # id for "with" token which is not a proper boolean symbol but an expression symbol TOKEN_WITH = 10 -# actual keyword types +# keyword types that include operators and parens + KW_LPAR = Keyword('(', TOKEN_LPAR) KW_RPAR = Keyword(')', TOKEN_RPAR) -_KEYWORDS = [ - Keyword(' and ', TOKEN_AND), - Keyword(' or ', TOKEN_OR), - KW_LPAR, - KW_RPAR, - Keyword(' with ', TOKEN_WITH), -] +KW_AND = Keyword('and', TOKEN_AND) +KW_OR = Keyword('or', TOKEN_OR) +KW_WITH = Keyword('with', TOKEN_WITH) -KEYWORDS = tuple(kw.value for kw in _KEYWORDS) -KEYWORDS_STRIPPED = tuple(k.strip() for k in KEYWORDS) +KEYWORDS = (KW_AND, KW_OR, KW_LPAR, KW_RPAR, KW_WITH,) +KEYWORDS_STRINGS = set(kw.value for kw in KEYWORDS) + +# mapping of lowercase operator strings to an operator object +OPERATORS = {'and': KW_AND, 'or': KW_OR, 'with': KW_WITH} + +_simple_tokenizer = re.compile(''' + (?P[^\s\(\)]+) + | + (?P\s+) + | + (?P\() + | + (?P\)) + ''', + re.VERBOSE | re.MULTILINE | re.UNICODE +).finditer class Licensing(boolean.BooleanAlgebra): """ - Define a mini language to parse, validate and compare license expressions. 
+    Licensing defines a mini language to parse, validate and compare license
+    expressions. This is the main entry point of this library.
+
+    Some of the features are:
+
+    - licenses can be validated against user-provided lists of known license
+      "symbols" (such as ScanCode licenses or the SPDX list).
+
+    - flexible expression parsing and recognition of licenses (including
+      licenses with spaces and keywords (such as AND, OR, WITH) or parens in
+      their names).
+
+    - in an expression, licenses can be more than just identifiers, such as
+      short or long names.
+
+    - A license can have multiple aliases (such as GPLv2 or GPL2) and each will
+      be properly recognized when parsing.
+
+    - expressions can be simplified, normalized, sorted and compared for
+      containment and/or logical equivalence thanks to a built-in boolean logic
+      engine.
+
+    - Once parsed, expressions can be rendered using simple templates (for
+      instance to render HTML links in a GUI).

    For example:
@@ -154,9 +196,9 @@ class Licensing(boolean.BooleanAlgebra):
    def __init__(self, symbols=tuple(), quiet=True):
        """
-        Initialize a Licensing with an optional `symbols` sequence of LicenseSymbol
-        or LicenseSymbol-like objects or license key strings. If provided and this
-        list data is invalid, raise a ValueError.
+        Initialize a Licensing with an optional `symbols` sequence of
+        LicenseSymbol or LicenseSymbol-like objects or license key strings. If
+        provided and this list data is invalid, raise a ValueError.
        """
        super(Licensing, self).__init__(Symbol_class=LicenseSymbol, AND_class=AND, OR_class=OR)
@@ -168,9 +210,11 @@ def __init__(self, symbols=tuple(), quiet=True):
        if symbols:
            symbols = tuple(as_symbols(symbols))
            warns, errors = validate_symbols(symbols)
+
            if warns and not quiet:
                for w in warns:
                    print(w)
+
            if errors and not quiet:
                for e in errors:
                    print(e)
@@ -178,13 +222,14 @@ def __init__(self, symbols=tuple(), quiet=True):
            if errors:
                raise ValueError('\n'.join(warns + errors))

-        # mapping of known symbol used for parsing and resolution as (key, symbol)
-        # TODO: inject lpar, rpar and spaces sourround, before and after
-        # e.g "(sym)" "(sym " "sym)" " sym "
+        # mapping of known symbol key to symbol for reference
        self.known_symbols = {symbol.key: symbol for symbol in symbols}

-        # Aho-Corasick automaton-based Scanner used for expression tokenizing
-        self.scanner = None
+        # mapping of known symbol lowercase key to symbol for reference
+        self.known_symbols_lowercase = {symbol.key.lower(): symbol for symbol in symbols}
+
+        # Aho-Corasick automaton-based Advanced Tokenizer
+        self.advanced_tokenizer = None

    def is_equivalent(self, expression1, expression2, **kwargs):
        """
@@ -331,29 +376,33 @@ def unknown_license_keys(self, expression, unique=True, **kwargs):
        symbols = self.unknown_license_symbols(expression, unique=False, **kwargs)
        return self._keys(symbols, unique)

-    def parse(self, expression, validate=False, strict=False, **kwargs):
+    def parse(self, expression, validate=False, strict=False, simple=False, **kwargs):
        """
-        Return a new license LicenseExpression object by parsing a license expression
-        string. Check that the expression syntax is valid and raise an Exception,
-        ExpressionError or ParseError on errors. Return None for empty expressions.
-        `expression` is either a string or a LicenseExpression object. If this is a
-        LicenseExpression it is retruned as-si.
-
-        Symbols are always recognized from known symbols if `symbols` were provided
-        Licensing creation time: each license and exception is recognized from known
-        license keys (and from aliases for a symbol if available).
-
-        If `validate` is True and a symbol is unknown, an ExpressionError error
+        Return a new license LicenseExpression object by parsing a license
+        `expression` string. Check that the expression syntax is valid and raise
+        an Exception, an ExpressionError or a ParseError on errors.
+        Return None for empty expressions.
+        `expression` is either a string or a LicenseExpression object. If this
+        is a LicenseExpression it is returned as-is.
+        Symbols are always recognized from known symbols if `symbols` were
+        provided at Licensing creation time: each license and exception is
+        recognized from known license keys (and from aliases for a symbol if
+        available).
+
+        If `validate` is True and a license is unknown, an ExpressionError error
        is raised with a message listing the unknown license keys.
-        If `validate` is False, no error is triggered.
+        If `validate` is False, no error is raised. You can call the
+        `unknown_license_keys` or `unknown_license_symbols` methods to get
+        unknown license keys or symbols found in a parsed LicenseExpression.

-        You can call the `unknown_license_keys` or `unknown_license_symbols` methods
-        to get unknown license keys or symbols found in a parsed LicenseExpression.
+        If `strict` is True, additional exceptions will be raised in a
+        "WITH" expression such as "XXX with ZZZ" if the XXX symbol has
+        `is_exception` set to True or the ZZZ symbol has `is_exception` set to
+        False. This checks that symbols are used strictly as constructed.

-        If `strict` is True, additional exceptions will be raised if in a expression
-        such as "XXX with ZZZ" if the XXX symbol has `is_exception` set to True or
-        the YYY symbol has `is_exception` set to False.
+        If `simple` is True, parsing will use a simple tokenizer that assumes
+        that license symbols are all license keys that cannot contain spaces.

        For example:
        >>> expression = 'EPL-1.0 and Apache-1.1 OR GPL-2.0 with Classpath-exception'
@@ -369,7 +418,7 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
        if isinstance(expression, bytes):
            try:
-                expression = unicode(expression)
+                expression = str(expression)
            except:
                ext = type(expression)
                raise ExpressionError('expression must be a string and not: %(ext)r' % locals())
@@ -382,7 +431,7 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
            return
        try:
            # this will raise a ParseError on errors
-            tokens = list(self.tokenize(expression, strict=strict))
+            tokens = list(self.tokenize(expression, strict=strict, simple=simple))
            expression = super(Licensing, self).parse(tokens)
        except TypeError as e:
            msg = 'Invalid expression syntax: ' + repr(e)
@@ -399,7 +448,7 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
        return expression

-    def tokenize(self, expression, strict=False):
+    def tokenize(self, expression, strict=False, simple=False):
        """
        Return an iterable of 3-tuple describing each token given an expression
        unicode string. See boolean.BooleanAlgreba.tokenize() for API details.
        This 3-tuple contains these items: (token, token string, position):
        - token: either a Symbol instance or one of TOKEN_* token types..
        - token string: the original token unicode string.
-        - position: some simple object describing the starting position of the
-          original token string in the `expr` string. It can be an int for a
-          character offset, or a tuple of starting (row/line, column).
+        - position: the starting index of the token string in the `expression` string.

-        If `strict` is True, additional exceptions will be raised in a expression
-        such as "XXX with ZZZ" if the XXX symbol has is_exception` set to True or the
-        ZZZ symbol has `is_exception` set to False.
+        If `strict` is True, additional exceptions will be raised in an
+        expression such as "XXX with ZZZ" if the XXX symbol has `is_exception`
+        set to True or the ZZZ symbol has `is_exception` set to False.
+
+        If `simple` is True, use a simple tokenizer that assumes that license
+        symbols are all license keys that cannot contain spaces.
        """
-        if self.known_symbols:
-            # scan with an automaton, recognize whole symbols+keywords or only keywords
-            scanner = self.get_scanner()
-            results = scanner.scan(expression)
+        if not expression:
+            return
+
+        if not isinstance(expression, str):
+            raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE)
+
+        if simple:
+            tokens = self.simple_tokenizer(expression)
        else:
-            # scan with a simple regex-based splitter
-            results = splitter(expression)
+            advanced_tokenizer = self.get_advanced_tokenizer()
+            tokens = advanced_tokenizer.tokenize(expression)

-        results = strip_and_skip_spaces(results)
-        result_groups = group_results_for_with_subexpression(results)
+        # Assign symbol for unknown tokens
+        tokens = build_symbols_from_unknown_tokens(tokens)

-        for group in result_groups:
-            len_group = len(group)
-            if not len_group:
-                # This should never happen
-                continue
-            if len_group == 1:
-                # a single token
-                result = group[0]
-                pos = result.start
-                token_string = result.string
-                output = result.output
-                if output:
-                    val = output.value
-                    if isinstance(val, Keyword):
-                        # keyword
-                        token = val.type
-                        # WITH is not known from the boolean parser as a proper
-                        # boolean element so we handle validation ourselves: by
-                        # design a single group cannot be a single 'WITH' keyword:
-                        # this is an error that we catch and raise here.
-                        if token == TOKEN_WITH:
-                            raise ParseError(token_type=TOKEN_WITH,
-                                             token_string=result.string,
-                                             position=result.start,
-                                             error_code=PARSE_INVALID_EXPRESSION)
-
-                    elif isinstance(val, LicenseSymbol):
-                        if strict and val.is_exception:
-                            raise ParseError(token_type=TOKEN_SYMBOL,
-                                             token_string=result.string,
-                                             position=result.start,
-                                             error_code=PARSE_INVALID_EXCEPTION)
-
-                        # known symbol: The strict check above handled possible errors before.
- token = val - else: - # this should not be possible by design - raise Exception('Licensing.tokenize is internally confused...') - else: - token = LicenseSymbol(result.string) + # skip whitespace-only tokens + tokens = (t for t in tokens if t.string and t.string.strip()) + + # create atomic LicenseWithExceptionSymbol from WITH subexpressions + tokens = replace_with_subexpression_by_license_symbol(tokens, strict) + # finally yield the actual args expected by the boolean parser + for token in tokens: + pos = token.start + token_string = token.string + token_value = token.value + + if isinstance(token_value, BaseSymbol): + token_obj = token_value + elif isinstance(token_value, Keyword): + token_obj = token_value.type else: - if len_group != 3: - # this should never happen - string = ' '.join([res.string for res in group]) - start = group[0].start - raise ParseError( - TOKEN_SYMBOL, string, start, PARSE_INVALID_EXPRESSION) + raise ParseError(error_code=PARSE_INVALID_EXPRESSION) - # this is a A with B seq of three results - lic_res, WITH , exc_res = group - pos = lic_res.start - WITHs = ' ' + WITH.string.strip() + ' ' - token_string = ''.join([lic_res.string, WITHs, exc_res.string]) + yield token_obj, token_string, pos - # licenses - lic_out = lic_res.output - lic_sym = lic_out and lic_out.value + def get_advanced_tokenizer(self): + """ + Return an AdvancedTokenizer instance either cached or created as needed. + + If symbols were provided when this Licensing object was created, the + tokenizer will recognize known symbol keys and aliases (ignoring case) + when tokenizing expressions. - # this should not happen - if lic_sym and not isinstance(lic_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, lic_res.string, lic_res.start, - PARSE_INVALID_SYMBOL) + A license symbol is any string separated by keywords and parens (and it + can include spaces). + """ + if self.advanced_tokenizer is not None: + return self.advanced_tokenizer + + self.advanced_tokenizer = tokenizer = AdvancedTokenizer() + + add_item = tokenizer.add + for keyword in KEYWORDS: + add_item(keyword.value, keyword) + + # self.known_symbols has been created at Licensing initialization time and is + # already validated and trusted here + for key, symbol in self.known_symbols.items(): + # always use the key even if there are no aliases. + add_item(key, symbol) + aliases = getattr(symbol, 'aliases', []) + for alias in aliases: + # normalize spaces for each alias. The AdvancedTokenizer will lowercase them + if alias: + alias = ' '.join(alias.split()) + add_item(alias, symbol) - if not lic_sym: - lic_sym = LicenseSymbol(lic_res.string, is_exception=False) + tokenizer.make_automaton() + return tokenizer - if not isinstance(lic_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, lic_res.string, lic_res.start, - PARSE_INVALID_SYMBOL) + def advanced_tokenizer(self, expression): + """ + Return an iterable of Token describing each token given an expression + unicode string. + """ + tokenizer = self.get_advanced_tokenizer() + return tokenizer.tokenize(expression) - if strict and lic_sym.is_exception: - raise ParseError(TOKEN_SYMBOL, lic_res.string, lic_res.start, - PARSE_INVALID_EXCEPTION) + def simple_tokenizer(self, expression): + """ + Return an iterable of Token describing each token given an expression + unicode string. - # exception - exc_out = exc_res.output - exc_sym = exc_out and exc_out.value + The split is done on spaces, keywords and parens. Anything else is a + symbol token, e.g. 
a typically license key or license id (that contains + no spaces or parens). - # this should not happen - if exc_sym and not isinstance(exc_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, lic_sym.string, lic_sym.start, - PARSE_INVALID_SYMBOL) - if exc_sym: - exc_sym = copy(exc_sym) + If symbols were provided when this Licensing object was created, the + tokenizer will recognize known symbol keys (ignoring case) when + tokenizing expressions. + """ - if not exc_sym: - exc_sym = LicenseSymbol(exc_res.string) + symbols = self.known_symbols_lowercase or {} - if not isinstance(exc_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, exc_res.string, exc_res.start, - PARSE_INVALID_SYMBOL) + for match in _simple_tokenizer(expression): + if not match: + continue + # set start and end as string indexes + start, end = match.span() + end = end - 1 + match_getter = match.groupdict().get + + space = match_getter('space') + if space: + yield Token(start, end, space, None) + + lpar = match_getter('lpar') + if lpar: + yield Token(start, end, lpar, KW_LPAR) + + rpar = match_getter('rpar') + if rpar: + yield Token(start, end, rpar, KW_RPAR) + + sym_or_op = match_getter('symop') + if sym_or_op: + sym_or_op_lower = sym_or_op.lower() + + operator = OPERATORS.get(sym_or_op_lower) + if operator: + yield Token(start, end, sym_or_op, operator) + else: + sym = symbols.get(sym_or_op_lower) + if not sym: + sym = LicenseSymbol(key=sym_or_op) + yield Token(start, end, sym_or_op, sym) - if strict and self.known_symbols and not exc_sym.is_exception: - raise ParseError(TOKEN_SYMBOL, exc_res.string, exc_res.start, - PARSE_INVALID_SYMBOL_AS_EXCEPTION) - token = LicenseWithExceptionSymbol(lic_sym, exc_sym, strict) +def build_symbols_from_unknown_tokens(tokens): + """ + Yield Token given a sequence of Token replacing unmatched contiguous Tokens + by a single token with a LicenseSymbol. + """ + tokens = list(tokens) - yield token, token_string, pos + unmatched = deque() - def get_scanner(self): + def build_token_with_symbol(): """ - Return a scanner either cached or created as needed. If symbols were provided - when this Licensing object was created, the scanner will recognize known - symbols when tokenizing expressions. Otherwise, only keywords are recognized - and a license symbol is anything in between keywords. + Build and return a new Token from accumulated unmatched tokens or None. 
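+        (This is a generator: it yields the combined unmatched Token, if any,
+        followed by any trailing whitespace Tokens that were accumulated.)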
""" - if self.scanner is not None: - return self.scanner + if not unmatched: + return + # strip trailing spaces + trailing_spaces = [] + while unmatched and not unmatched[-1].string.strip(): + trailing_spaces.append(unmatched.pop()) + + if unmatched: + string = ' '.join(t.string for t in unmatched if t.string.strip()) + start = unmatched[0].start + end = unmatched[-1].end + toksym = LicenseSymbol(string) + unmatched.clear() + yield Token(start, end, string, toksym) + + for ts in trailing_spaces: + yield ts + + for tok in tokens: + if tok.value: + for symtok in build_token_with_symbol(): + yield symtok + yield tok + else: + if not unmatched and not tok.string.strip(): + # skip leading spaces + yield tok + else: + unmatched.append(tok) - self.scanner = scanner = Scanner(ignore_case=True) + # end remainders + for symtok in build_token_with_symbol(): + yield symtok - for keyword in _KEYWORDS: - scanner.add(keyword.value, keyword, priority=0) - # self.known_symbols has been created at Licensing initialization time and is - # already validated and trusted here - for key, symbol in self.known_symbols.items(): - # always use the key even if there are no aliases. - scanner.add(key, symbol, priority=1) - aliases = getattr(symbol, 'aliases', []) - for alias in aliases: - # normalize spaces for each alias. The Scanner will lowercase them - # since we created it with ignore_case=True - if alias: - alias = ' '.join(alias.split()) - if alias: - scanner.add(alias, symbol, priority=2) +def build_token_groups_for_with_subexpression(tokens): + """ + Yield tuples of Token given a sequence of Token such that: + - all symbol-with-symbol sequences of 3 tokens are grouped in a three-tuple + - other tokens are a single token wrapped in a tuple. + """ + + # if n-1 is sym, n is with and n+1 is sym: yield this as a group for a with + # exp otherwise: yield each single token as a group + + tokens = list(tokens) + + # check three contiguous tokens that may form "lic WITh exception" sequence + triple_len = 3 + + # shortcut if there are no grouping possible + if len(tokens) < triple_len: + for tok in tokens: + yield (tok,) + return + + # accumulate three contiguous tokens + triple = deque() + triple_popleft = triple.popleft + triple_clear = triple.clear + tripple_append = triple.append + + for tok in tokens: + if len(triple) == triple_len: + if is_with_subexpression(triple): + yield tuple(triple) + triple_clear() + else: + prev_tok = triple_popleft() + yield (prev_tok,) + tripple_append(tok) + + # end remainders + if triple: + if len(triple) == triple_len and is_with_subexpression(triple): + yield tuple(triple) + else: + for tok in triple: + yield (tok,) - scanner.make_automaton() - return scanner + +def is_with_subexpression(tokens_tripple): + """ + Return True if a Token tripple is a WITH license sub-expression. + """ + lic, wit, exc = tokens_tripple + return (isinstance(lic.value, LicenseSymbol) + and wit.value == KW_WITH + and isinstance(exc.value, LicenseSymbol) + ) + + +def replace_with_subexpression_by_license_symbol(tokens, strict=False): + """ + Given an iterable of Token, yiled token, replacing any XXX WITH ZZZ + subexpression by a LicenseWithExceptionSymbol symbol. + + Check validity of with subexpessions and raise ParseError as needed. + + If `strict` is True also raise ParseError if the left hand side + LicenseSymbol has is_exception True or if the right hand side + LicenseSymbol has is_exception False. 
+ """ + token_groups = build_token_groups_for_with_subexpression(tokens) + + for token_group in token_groups: + len_group = len(token_group) + + if not len_group: + # This should never happen + continue + + if len_group == 1: + # a single token + token = token_group[0] + tval = token.value + + if isinstance(tval, Keyword): + if tval.type == TOKEN_WITH: + # keyword + # a single group cannot be a single 'WITH' keyword: + # this is an error that we catch and raise here. + raise ParseError( + token_type=TOKEN_WITH, token_string=token.string, + position=token.start, error_code=PARSE_INVALID_EXPRESSION) + + elif isinstance(tval, LicenseSymbol): + if strict and tval.is_exception: + raise ParseError( + token_type=TOKEN_SYMBOL, token_string=token.string, + position=token.start, error_code=PARSE_INVALID_EXCEPTION) + + else: + # this should not be possible by design + raise Exception('Licensing.tokenize is internally confused...:' + repr(tval)) + + yield token + continue + + if len_group != 3: + # this should never happen + string = ' '.join([tok.string for tok in token_group]) + start = token_group[0].start + raise ParseError( + TOKEN_SYMBOL, string, start, PARSE_INVALID_EXPRESSION) + + # from now on we have a tripple of tokens: a WITH sub-expression such as "A with + # B" seq of three tokens + lic_token, WITH , exc_token = token_group + + token_string = ' '.join([ + lic_token.string, + WITH.string.strip(), + exc_token.string + ]) + + # the left hand side license symbol + lic_sym = lic_token.value + + # this should not happen + if not isinstance(lic_sym, LicenseSymbol): + raise ParseError( + TOKEN_SYMBOL, lic_token.string, lic_token.start, + PARSE_INVALID_SYMBOL) + + if strict and lic_sym.is_exception: + raise ParseError( + TOKEN_SYMBOL, lic_token.string, lic_token.start, + PARSE_INVALID_EXCEPTION) + + # the right hand side exception symbol + exc_sym = exc_token.value + + if not isinstance(exc_sym, LicenseSymbol): + raise ParseError( + TOKEN_SYMBOL, lic_sym.string, lic_sym.start, + PARSE_INVALID_SYMBOL) + + if strict and not exc_sym.is_exception: + raise ParseError( + TOKEN_SYMBOL, exc_token.string, exc_token.start, + PARSE_INVALID_SYMBOL_AS_EXCEPTION) + + lic_exc_sym = LicenseWithExceptionSymbol(lic_sym, exc_sym, strict) + + token = Token( + lic_token.start, + exc_token.end, + token_string, + lic_exc_sym, + ) + yield token class Renderable(object): @@ -608,7 +849,7 @@ def __contains__(self, other): is_valid_license_key = re.compile(r'^[-\w\s\.\+]+$', re.UNICODE).match -#FIXME: we need to implement comparison!!!! +# TODO: we need to implement comparison by hand instead @total_ordering class LicenseSymbol(BaseSymbol): """ @@ -623,7 +864,7 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): if not isinstance(key, str): if isinstance(key, bytes): try: - key = unicode(key) + key = str(key) except: raise ExpressionError( 'A license key must be a unicode string: %(key)r' % locals()) @@ -646,7 +887,7 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): # normalize for spaces key = ' '.join(key.split()) - if key.lower() in KEYWORDS_STRIPPED: + if key.lower() in KEYWORDS_STRINGS: raise ExpressionError( 'Invalid license key: a key cannot be a reserved keyword: "or", "and" or "with: "%(key)s"' % locals()) @@ -662,7 +903,7 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): def decompose(self): """ - Return an iterable the underlying symbols for this symbol + Return an iterable of the underlying symbols for this symbol. 
""" yield self @@ -698,6 +939,9 @@ def render(self, template='{symbol.key}', *args, **kwargs): def __str__(self): return self.key + def __len__(self): + return len(self.key) + def __repr__(self): cls = self.__class__.__name__ key = self.key @@ -716,12 +960,12 @@ def symbol_like(cls, symbol): return hasattr(symbol, 'key') and hasattr(symbol, 'is_exception') -#FIXME: we need to implement comparison!!!! +# TODO: we need to implement comparison by hand instead @total_ordering class LicenseSymbolLike(LicenseSymbol): """ - A LicenseSymbolLike object wraps a symbol-like object to expose a LicenseSymbol - behavior. + A LicenseSymbolLike object wraps a symbol-like object to expose a + LicenseSymbol behavior. """ def __init__(self, symbol_like, *args, **kwargs): @@ -777,7 +1021,7 @@ def __lt__(self, other): return NotImplemented -#FIXME: we need to implement comparison!!!! +# TODO: we need to implement comparison by hand instead @total_ordering class LicenseWithExceptionSymbol(BaseSymbol): """ @@ -921,6 +1165,8 @@ class AND(RenderableFunction, boolean.AND): """ def __init__(self, *args): + if len(args) < 2: + raise ExpressionError('AND requires two or more licenses as in: MIT AND BSD') super(AND, self).__init__(*args) self.operator = ' AND ' @@ -931,6 +1177,8 @@ class OR(RenderableFunction, boolean.OR): """ def __init__(self, *args): + if len(args) < 2: + raise ExpressionError('OR requires two or more licenses as in: MIT OR BSD') super(OR, self).__init__(*args) self.operator = ' OR ' @@ -949,84 +1197,13 @@ def ordered_unique(seq): return uniques -def strip_and_skip_spaces(results): - """ - Yield results given a sequence of Result skipping whitespace-only results - """ - for result in results: - if result.string.strip(): - yield result - - -def group_results_for_with_subexpression(results): - """ - Yield tuples of (Result) given a sequence of Result such that: - - all symbol-with-symbol subsequences of three results are grouped in a three-tuple - - other results are the single result in a tuple. - """ - - # if n-1 is sym, n is with and n+1 is sym: yield this as a group for a with exp - # otherwise: yield each single result as a group - - results = list(results) - - # check three contiguous result from scanning at a time - triple_len = 3 - - # shortcut if there are no grouping possible - if len(results) < triple_len: - for res in results: - yield (res,) - return - - # accumulate three contiguous results - triple = collections.deque() - triple_popleft = triple.popleft - triple_clear = triple.clear - tripple_append = triple.append - - for res in results: - if len(triple) == triple_len: - if is_with_subexpression(triple): - yield tuple(triple) - triple_clear() - else: - prev_res = triple_popleft() - yield (prev_res,) - tripple_append(res) - - # end remainders - if triple: - if len(triple) == triple_len and is_with_subexpression(triple): - yield tuple(triple) - else: - for res in triple: - yield (res,) - - -def is_symbol(result): - # either the output value is a known sym, or we have no output for unknown sym - return result.output and isinstance(result.output.value, LicenseSymbol) or not result.output - - -def is_with_keyword(result): - return (result.output - and isinstance(result.output.value, Keyword) - and result.output.value.type == TOKEN_WITH) - - -def is_with_subexpression(results): - lic, wit, exc = results - return (is_symbol(lic) and is_with_keyword(wit) and is_symbol(exc)) - - def as_symbols(symbols): """ Return an iterable of LicenseSymbol objects from a sequence of `symbols` or - strings. 
If an item is a string, then create a new LicenseSymbol for it using the - string as key. If this is not a string it must be a LicenseSymbol-like type. It - will raise a TypeError expection if an item is neither a string or LicenseSymbol- - like. + strings. If an item is a string, then create a new LicenseSymbol for it + using the string as key. If this is not a string it must be a LicenseSymbol- + like type. It will raise a TypeError expection if an item is neither a + string or LicenseSymbol- like. """ if symbols: for symbol in symbols: @@ -1034,11 +1211,11 @@ def as_symbols(symbols): continue if isinstance(symbol, bytes): try: - symbol = unicode(symbol) + symbol = str(symbol) except: raise TypeError('%(symbol)r is not a unicode string.' % locals()) - if isinstance(symbol, unicode): + if isinstance(symbol, str): if symbol.strip(): yield LicenseSymbol(symbol) @@ -1053,7 +1230,7 @@ def as_symbols(symbols): 'or a LicenseSymbol-like instance.' % locals()) -def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): +def validate_symbols(symbols, validate_keys=False): """ Return a tuple of (`warnings`, `errors`) given a sequence of `symbols` LicenseSymbol-like objects. @@ -1075,9 +1252,9 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): not_symbol_classes = [] dupe_keys = set() dupe_exceptions = set() - dupe_aliases = collections.defaultdict(list) + dupe_aliases = defaultdict(list) invalid_keys_as_kw = set() - invalid_alias_as_kw = collections.defaultdict(list) + invalid_alias_as_kw = defaultdict(list) # warning warning_dupe_aliases = set() @@ -1096,7 +1273,7 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): dupe_keys.add(key) # key cannot be an expression keyword - if keyl in _keywords: + if keyl in KEYWORDS_STRINGS: invalid_keys_as_kw.add(key) # keep a set of unique seen keys @@ -1129,7 +1306,7 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): dupe_aliases[alias].append(key) # an alias cannot be an expression keyword - if alias in _keywords: + if alias in KEYWORDS_STRINGS: invalid_alias_as_kw[key].append(alias) seen_aliases[alias] = keyl @@ -1169,75 +1346,3 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): errors.append('Duplicated or empty aliases ignored for license key: %(dupeal)r.' % locals()) return warnings, errors - - -_splitter = re.compile(''' - (?P[^\s\(\)]+) - | - (?P\s+) - | - (?P\() - | - (?P\)) - ''', - re.VERBOSE | re.MULTILINE | re.UNICODE -).finditer - - -def splitter(expression): - """ - Return an iterable of Result describing each token given an - expression unicode string. - - This is a simpler tokenizer used when the Licensing does not have - known symbols. The split is done on spaces and parens. Anything else - is either a token or a symbol. 
- """ - if not expression: - return - - if not isinstance(expression, str): - raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE) - - # mapping of lowercase token strings to a token type id - TOKENS = { - 'and': Keyword(value='and', type=TOKEN_AND), - 'or': Keyword(value='or', type=TOKEN_OR), - 'with': Keyword(value='with', type=TOKEN_WITH), - } - - for match in _splitter(expression): - if not match: - continue - - start, end = match.span() - end = end - 1 - mgd = match.groupdict() - - space = mgd.get('space') - if space: - yield Result(start, end, space, None) - - lpar = mgd.get('lpar') - if lpar: - yield Result(start, end, lpar, Output(lpar, KW_LPAR)) - - rpar = mgd.get('rpar') - if rpar: - yield Result(start, end, rpar, Output(rpar, KW_RPAR)) - - token_or_sym = mgd.get('symbol') - if not token_or_sym: - continue - - token = TOKENS.get(token_or_sym.lower()) - if token: - yield Result(start, end, token_or_sym, Output(token_or_sym, token)) -# elif token_or_sym.endswith('+') and token_or_sym != '+': -# val = token_or_sym[:-1] -# sym = LicenseSymbol(key=val) -# yield Result(start, end - 1, val, Output(val, sym)) -# yield Result(end, end, '+', Output('+', KW_PLUS)) - else: - sym = LicenseSymbol(key=token_or_sym) - yield Result(start, end, token_or_sym, Output(token_or_sym, sym)) diff --git a/src/license_expression/_pyahocorasick.py b/src/license_expression/_pyahocorasick.py index 4c73709..fefe51f 100644 --- a/src/license_expression/_pyahocorasick.py +++ b/src/license_expression/_pyahocorasick.py @@ -6,119 +6,171 @@ WWW : http://0x80.pl License : public domain -Modified for use in the license_expression library and in particular: - - add support for unicode key strinsg. - - rename word to key and output to value (to be more like a mapping/dict) - - case insensitive search +Modified for use in the license_expression library: + - add support for unicode strings. + - case insensitive search using sequence of words and not characters - improve returned results with the actual start,end and matched string. - support returning non-matched parts of a string """ -from __future__ import unicode_literals from __future__ import absolute_import from __future__ import print_function +from __future__ import unicode_literals from collections import deque from collections import OrderedDict import logging +import re + +# Python 2 and 3 support +try: + # Python 2 + unicode + str = unicode # NOQA +except NameError: + # Python 3 + unicode = str # NOQA + +TRACE = False logger = logging.getLogger(__name__) def logger_debug(*args): - return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + pass + -# uncomment for local debug logging -# import sys -# logging.basicConfig(stream=sys.stdout) -# logger.setLevel(logging.DEBUG) +if TRACE: + def logger_debug(*args): + return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + + import sys + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) # used to distinguish from None nil = object() +class TrieNode(object): + """ + Node of the Trie/Aho-Corasick automaton. + """ + __slots__ = ['token', 'output', 'fail', 'children'] + + def __init__(self, token, output=nil): + # token of a tokens string added to the Trie as a string + self.token = token + + # an output function (in the Aho-Corasick meaning) for this node: this + # is an object that contains the original key string and any + # additional value data associated to that key. Or "nil" for a node that + # is not a terminal leave for a key. 
It will be returned with a match. + self.output = output + + # failure link used by the Aho-Corasick automaton and its search procedure + self.fail = nil + + # children of this node as a mapping of char->node + self.children = {} + + def __repr__(self): + if self.output is not nil: + return 'TrieNode(%r, %r)' % (self.token, self.output) + else: + return 'TrieNode(%r)' % self.token + + class Trie(object): """ A Trie and Aho-Corasick automaton. This behaves more or less like a mapping of key->value. This is the main entry point. """ - def __init__(self, ignore_case=True): + def __init__(self): """ Initialize a new Trie. - - If `ignore_case` is True, searches in the Trie will be case insensitive. """ self.root = TrieNode('') - self.ignore_case = ignore_case - # set of any unique character in the trie, updated on each addition - # we keep track of the set of chars added to the trie to build the automaton + # set of any unique tokens in the trie, updated on each addition we keep + # track of the set of tokens added to the trie to build the automaton # these are needed to created the first level children failure links - self._known_chars = set() + self._known_tokens = set() # Flag set to True once a Trie has been converted to an Aho-Corasick automaton self._converted = False - def add(self, key, value=None, priority=0): + def add(self, tokens_string, value=None): """ - Add a new (key string, value) pair to the trie. If the key already exists in - the Trie, its value is replaced with the provided value. - A key is any unicode string. + Add a new tokens_string and its associated value to the trie. If the + tokens_string already exists in the Trie, its value is replaced with the + provided value, typically a Token object. If a value is not provided, + the tokens_string is used as value. + + A tokens_string is any unicode string. It will be tokenized when added + to the Trie. """ if self._converted: raise Exception('This Trie has been converted to an Aho-Corasick ' - 'automaton and cannot be further modified.') - if not key: + 'automaton and cannot be modified.') + + if not tokens_string or not isinstance(tokens_string, str): return - stored_key = self.ignore_case and key.lower() or key + tokens = [t for t in get_tokens(tokens_string) if t.strip()] - # we keep track of the set of chars added to the trie to build the automaton - # these are needed to created the first level children failure links - self._known_chars.update(stored_key) + # we keep track of the set of tokens added to the trie to build the + # automaton these are needed to created the first level children failure + # links + + self._known_tokens.update(tokens) node = self.root - for char in stored_key: + for token in tokens: try: - node = node.children[char] + node = node.children[token] except KeyError: - child = TrieNode(char) - node.children[char] = child + child = TrieNode(token) + node.children[token] = child node = child - # we always store the original key, not a possibly lowercased version - node.output = Output(key, value, priority) + node.output = (tokens_string, value or tokens_string) - def __get_node(self, key): + def __get_node(self, tokens_string): """ - Return a node for this key or None if the trie does not contain the key. - Private function retrieving a final node of trie for given key. + Return a node for this tokens_string or None if the trie does not + contain the tokens_string. Private function retrieving a final node of + the Trie for a given tokens_string. 
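+        Note that the lookup is token-based rather than character-based: the
+        tokens_string is split into tokens first, so extra whitespace between
+        tokens is not significant.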
""" - key = self.ignore_case and key.lower() or key + if not tokens_string or not isinstance(tokens_string, str): + return + + tokens = [t for t in get_tokens(tokens_string) if t.strip()] node = self.root - for char in key: + for token in tokens: try: - node = node.children[char] + node = node.children[token] except KeyError: return None return node - def get(self, key, default=nil): + def get(self, tokens_string, default=nil): """ - Return the Output tuple associated with a `key`. - If there is no such key in the Trie, return the default value (other - than nil): if default is not given or nil, raise a KeyError exception. + Return the output value found associated with a `tokens_string`. If + there is no such tokens_string in the Trie, return the default value + (other than nil). If `default` is not provided or is `nil`, raise a + KeyError. """ - node = self.__get_node(key) + node = self.__get_node(tokens_string) output = nil if node: output = node.output if output is nil: if default is nil: - raise KeyError(key) + raise KeyError(tokens_string) else: return default else: @@ -142,37 +194,36 @@ def items(self): """ items = [] - def walk(node, key): + def walk(node, tokens): """ Walk the trie, depth first. """ - key = key + node.char + tokens = [t for t in tokens + [node.token] if t] if node.output is not nil: - items.append((node.output.key, node.output.value)) + items.append((node.output[0], node.output[1],)) for child in node.children.values(): if child is not node: - walk(child, key) + walk(child, tokens) - walk(self.root, key='') + walk(self.root, tokens=[]) return iter(items) - def exists(self, key): + def exists(self, tokens_string): """ Return True if the key is present in this trie. """ - # TODO: add __contains__ magic for this - node = self.__get_node(key) + node = self.__get_node(tokens_string) if node: return bool(node.output != nil) return False - def is_prefix(self, key): + def is_prefix(self, tokens_string): """ - Return True if key is a prefix of any existing key in the trie. + Return True if tokens_string is a prefix of any existing tokens_string in the trie. """ - return (self.__get_node(key) is not None) + return bool(self.__get_node(tokens_string) is not None) def make_automaton(self): """ @@ -181,45 +232,45 @@ def make_automaton(self): converted to an Automaton. """ queue = deque() - queue_append = queue.append - queue_popleft = queue.popleft # 1. create root children for each known items range (e.g. all unique - # characters from all the added keys), failing to root. + # characters from all the added tokens), failing to root. # And build a queue of these - for char in self._known_chars: - if char in self.root.children: - node = self.root.children[char] + for token in self._known_tokens: + if token in self.root.children: + node = self.root.children[token] # e.g. f(s) = 0, Aho-Corasick-wise node.fail = self.root - queue_append(node) + queue.append(node) else: - self.root.children[char] = self.root + self.root.children[token] = self.root # 2. 
using the queue of all possible top level items/chars, walk the trie and # add failure links to nodes as needed while queue: - current_node = queue_popleft() + current_node = queue.popleft() for node in current_node.children.values(): - queue_append(node) + queue.append(node) state = current_node.fail - while node.char not in state.children: + while node.token not in state.children: state = state.fail - node.fail = state.children.get(node.char, self.root) + node.fail = state.children.get(node.token, self.root) # Mark the trie as converted so it cannot be modified anymore self._converted = True - def iter(self, string): + def iter(self, tokens_string, include_unmatched=False, include_space=False): """ - Yield Result objects for matched strings by performing the Aho-Corasick search procedure. + Yield Token objects for matched strings by performing the Aho-Corasick + search procedure. - The Result start and end positions in the searched string are such that the - matched string is "search_string[start:end+1]". And the start is computed - from the end_index collected by the Aho-Corasick search procedure such that - "start=end_index - n + 1" where n is the length of a matched key. + The Token start and end positions in the searched string are such that + the matched string is "tokens_string[start:end+1]". And the start is + computed from the end_index collected by the Aho-Corasick search + procedure such that + "start=end_index - n + 1" where n is the length of a matched string. - The Result.output is an Output object for a matched key. + The Token.value is an object associated with a matched string. For example: >>> a = Trie() @@ -229,17 +280,14 @@ def iter(self, string): >>> a.add('EFGH') >>> a.add('KL') >>> a.make_automaton() - >>> string = 'abcdefghijklm' - >>> results = Result.sort(a.iter(string)) - + >>> tokens_string = 'a bcdef ghij kl m' + >>> strings = Token.sort(a.iter(tokens_string)) >>> expected = [ - ... Result(1, 5, 'bcdef', Output('BCDEF')), - ... Result(2, 4, 'cde', Output('CDE')), - ... Result(3, 7, 'defgh', Output('DEFGH')), - ... Result(4, 7, 'efgh', Output('EFGH')), - ... Result(10, 11, 'kl', Output('KL')), + ... Token(2, 6, u'bcdef', u'BCDEF'), + ... Token(13, 14, u'kl', u'KL') ... 
] - >>> results == expected + + >>> strings == expected True >>> list(a.iter('')) == [] @@ -248,38 +296,78 @@ def iter(self, string): >>> list(a.iter(' ')) == [] True """ - if not string: + if not tokens_string: return - # keep a copy for results - original_string = string - string = self.ignore_case and string.lower() or string - - known_chars = self._known_chars + tokens = get_tokens(tokens_string) state = self.root - for end, char in enumerate(string): - if char not in known_chars: + + if TRACE: + logger_debug('Trie.iter() with:', repr(tokens_string)) + logger_debug(' tokens:', tokens) + + end_pos = -1 + for token_string in tokens: + end_pos += len(token_string) + if TRACE: + logger_debug() + logger_debug('token_string', repr(token_string)) + logger_debug(' end_pos', end_pos) + + if not include_space and not token_string.strip(): + if TRACE: + logger_debug(' include_space skipped') + continue + + if token_string not in self._known_tokens: state = self.root + if TRACE: + logger_debug(' unmatched') + if include_unmatched: + n = len(token_string) + start_pos = end_pos - n + 1 + tok = Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], None) + if TRACE: + logger_debug(' unmatched tok:', tok) + yield tok continue - # search for a matching character in the children, starting at root - while char not in state.children: + yielded = False + + # search for a matching token_string in the children, starting at root + while token_string not in state.children: state = state.fail - # we have a matching starting character - state = state.children.get(char, self.root) + + # we have a matching starting token_string + state = state.children.get(token_string, self.root) match = state while match is not nil: if match.output is not nil: - # TODO: this could be precomputed or cached - n = len(match.output.key) - start = end - n + 1 - yield Result(start, end, original_string[start:end + 1], match.output) + matched_string, output_value = match.output + if TRACE: + logger_debug(' type output', repr(output_value), type(matched_string)) + n = len(matched_string) + start_pos = end_pos - n + 1 + if TRACE: logger_debug(' start_pos', start_pos) + yield Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], output_value) + yielded = True match = match.fail - - def scan(self, string): - """ - Scan a string for matched and unmatched sub-sequences and yield non- - overlapping Result objects performing a modified Aho-Corasick search + if not yielded and include_unmatched: + if TRACE: + logger_debug(' unmatched but known token') + n = len(token_string) + start_pos = end_pos - n + 1 + tok = Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], None) + if TRACE: + logger_debug(' unmatched tok 2:', tok) + yield tok + + logger_debug() + + def tokenize(self, string, include_unmatched=True, include_space=False): + """ + tokenize a string for matched and unmatched sub-sequences and yield non- + overlapping Token objects performing a modified Aho-Corasick search procedure: - return both matched and unmatched sub-sequences. @@ -293,10 +381,8 @@ def scan(self, string): return the non-overlapping portion of the other discarded match as a non-match. - Each Result contains the start and end position, the corresponding string and - an Output object (with original key and any associated associated value). The - string and key are in their original case even if the automaton has the - `ignore_case` attribute. 
+ Each Token contains the start and end position, the corresponding string + and an associated value object. For example: >>> a = Trie() @@ -306,144 +392,175 @@ def scan(self, string): >>> a.add('EFGH') >>> a.add('KL') >>> a.make_automaton() - >>> string = 'abcdefghijkl' - >>> results = list(a.scan(string)) + >>> string = 'a bcdef ghij kl' + >>> tokens = list(a.tokenize(string, include_space=True)) >>> expected = [ - ... Result(start=0, end=0, string='a', output=None), - ... Result(start=1, end=5, string='bcdef', output=Output('BCDEF')), - ... Result(start=6, end=9, string='ghij', output=None), - ... Result(start=10, end=11, string='kl', output=Output('KL')), + ... Token(0, 0, u'a', None), + ... Token(1, 1, u' ', None), + ... Token(2, 6, u'bcdef', u'BCDEF'), + ... Token(7, 7, u' ', None), + ... Token(8, 11, u'ghij', None), + ... Token(12, 12, u' ', None), + ... Token(13, 14, u'kl', u'KL') ... ] - - >>> results == expected + >>> tokens == expected True """ - results = self.iter(string) - results = filter_overlapping(results) - results = add_unmatched(string, results) - return results + tokens = self.iter(string, + include_unmatched=include_unmatched, include_space=include_space) + tokens = list(tokens) + if TRACE: + logger_debug('tokenize.tokens:', tokens) + if not include_space: + tokens = [t for t in tokens if t.string.strip()] + tokens = filter_overlapping(tokens) + return tokens -class TrieNode(object): +def filter_overlapping(tokens): """ - Node of the Trie/Aho-Corasick automaton. - """ - __slots__ = ['char', 'output', 'fail', 'children'] + Return a new list from an iterable of `tokens` discarding contained and + overlaping Tokens using these rules: - def __init__(self, char, output=nil): - # character of a key string added to the Trie - self.char = char - - # an output function (in the Aho-Corasick meaning) for this node: this is an - # Output object that contains the original key string and any additional - # value data associated to that key. Or "nil" for a node that is not a - # terminal leave for a key. It will be returned with a match. - self.output = output - - # failure link used by the Aho-Corasick automaton and its search procedure - self.fail = nil - - # children of this node as a mapping of char->node - self.children = {} - - def __repr__(self): - if self.output is not nil: - return 'TrieNode(%r, %r)' % (self.char, self.output) - else: - return 'TrieNode(%r)' % self.char + - skip a token fully contained in another token. + - keep the biggest, left-most token of two overlapping tokens and skip the other + For example: + >>> tokens = [ + ... Token(0, 0, 'a'), + ... Token(1, 5, 'bcdef'), + ... Token(2, 4, 'cde'), + ... Token(3, 7, 'defgh'), + ... Token(4, 7, 'efgh'), + ... Token(8, 9, 'ij'), + ... Token(10, 13, 'klmn'), + ... Token(11, 15, 'lmnop'), + ... Token(16, 16, 'q'), + ... ] -class Output(object): - """ - An Output is used to track a key added to the Trie as a TrieNode and any - arbitrary value object corresponding to that key. + >>> expected = [ + ... Token(0, 0, 'a'), + ... Token(1, 5, 'bcdef'), + ... Token(8, 9, 'ij'), + ... Token(11, 15, 'lmnop'), + ... Token(16, 16, 'q'), + ... ] - - `key` is the original key unmodified unicode string. - - `value` is the associated value for this key as provided when adding this key. - - `priority` is an optional priority for this key used to disambiguate overalpping matches. 
+ >>> filtered = list(filter_overlapping(tokens)) + >>> filtered == expected + True """ - __slots__ = 'key', 'value', 'priority' - - def __init__(self, key, value=None, priority=0): - self.key = key - self.value = value - self.priority = priority - - def __repr__(self): - return self.__class__.__name__ + '(%(key)r, %(value)r, %(priority)r)' % self.as_dict() + tokens = Token.sort(tokens) - def __eq__(self, other): - return ( - isinstance(other, Output) - and self.key == other.key - and self.value == other.value - and self.priority == other.priority) + # compare pair of tokens in the sorted sequence: current and next + i = 0 + while i < len(tokens) - 1: + j = i + 1 + while j < len(tokens): + curr_tok = tokens[i] + next_tok = tokens[j] + + logger_debug('curr_tok, i, next_tok, j:', curr_tok, i, next_tok, j) + # disjoint tokens: break, there is nothing to do + if next_tok.is_after(curr_tok): + logger_debug(' break to next', curr_tok) + break - def __hash__(self): - return hash((self.key, self.value, self.priority,)) + # contained token: discard the contained token + if next_tok in curr_tok: + logger_debug(' del next_tok contained:', next_tok) + del tokens[j] + continue - def as_dict(self): - return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) + # overlap: Keep the longest token and skip the smallest overlapping + # tokens. In case of length tie: keep the left most + if curr_tok.overlap(next_tok): + if len(curr_tok) >= len(next_tok): + logger_debug(' del next_tok smaller overlap:', next_tok) + del tokens[j] + continue + else: + logger_debug(' del curr_tok smaller overlap:', curr_tok) + del tokens[i] + break + j += 1 + i += 1 + return tokens -class Result(object): +class Token(object): """ - A Result is used to track the result of a search with its start and end as - index position in the original string and other attributes: + A Token is used to track the tokenization an expression with its + start and end as index position in the original string and other attributes: - `start` and `end` are zero-based index in the original string S such that S[start:end+1] will yield `string`. - - `string` is the sub-string from the original searched string for this Result. - - `output` is the Output object for a matched string and is a marker that this is a - matched string. None otherwise for a Result for unmatched text. + - `string` is the matched substring from the original string for this Token. + - `value` is the corresponding object for this token as one of: + - a LicenseSymbol object + - a "Keyword" object (and, or, with, left and right parens) + - None if this is a space. 
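+
+    For example (start and end are inclusive indexes):
+
+    >>> tok = Token(0, 2, 'MIT')
+    >>> len(tok)
+    3
+    >>> Token(4, 6) in Token(0, 6)
+    True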
""" - __slots__ = 'start', 'end', 'string', 'output' + __slots__ = 'start', 'end', 'string', 'value', - def __init__(self, start, end, string='', output=None): + def __init__(self, start, end, string='', value=None): self.start = start self.end = end self.string = string - self.output = output + self.value = value def __repr__(self): - return self.__class__.__name__ + '(%(start)r, %(end)r, %(string)r, %(output)r)' % self.as_dict() + return self.__class__.__name__ + '(%(start)r, %(end)r, %(string)r, %(value)r)' % self.as_dict() def as_dict(self): return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) def __len__(self): - return self.end + 1 - self.start + return self.end - self.start + 1 def __eq__(self, other): - return isinstance(other, Result) and ( + return isinstance(other, Token) and ( self.start == other.start and self.end == other.end and self.string == other.string and - self.output == other.output + self.value == other.value ) def __hash__(self): - tup = self.start, self.end, self.string, self.output + tup = self.start, self.end, self.string, self.value return hash(tup) - @property - def priority(self): - return getattr(self.output, 'priority', 0) + @classmethod + def sort(cls, tokens): + """ + Return a new sorted sequence of tokens given a sequence of tokens. The + primary sort is on start and the secondary sort is on longer lengths. + Therefore if two tokens have the same start, the longer token will sort + first. + + For example: + >>> tokens = [Token(0, 0), Token(5, 5), Token(1, 1), Token(2, 4), Token(2, 5)] + >>> expected = [Token(0, 0), Token(1, 1), Token(2, 5), Token(2, 4), Token(5, 5)] + >>> expected == Token.sort(tokens) + True + """ + key = lambda s: (s.start, -len(s),) + return sorted(tokens, key=key) def is_after(self, other): """ - Return True if this result is after the other result. + Return True if this token is after the other token. For example: - >>> Result(1, 2).is_after(Result(5, 6)) + >>> Token(1, 2).is_after(Token(5, 6)) False - >>> Result(5, 6).is_after(Result(5, 6)) + >>> Token(5, 6).is_after(Token(5, 6)) False - >>> Result(2, 3).is_after(Result(1, 2)) + >>> Token(2, 3).is_after(Token(1, 2)) False - >>> Result(5, 6).is_after(Result(3, 4)) + >>> Token(5, 6).is_after(Token(3, 4)) True """ return self.start > other.end @@ -453,188 +570,57 @@ def is_before(self, other): def __contains__(self, other): """ - Return True if this result contains the other result. + Return True if this token contains the other token. For example: - >>> Result(5, 7) in Result(5, 7) + >>> Token(5, 7) in Token(5, 7) True - >>> Result(6, 8) in Result(5, 7) + >>> Token(6, 8) in Token(5, 7) False - >>> Result(6, 6) in Result(4, 8) + >>> Token(6, 6) in Token(4, 8) True - >>> Result(3, 9) in Result(4, 8) + >>> Token(3, 9) in Token(4, 8) False - >>> Result(4, 8) in Result(3, 9) + >>> Token(4, 8) in Token(3, 9) True """ return self.start <= other.start and other.end <= self.end def overlap(self, other): """ - Return True if this result and the other result overlap. + Return True if this token and the other token overlap. 
For example:
- >>> Result(1, 2).overlap(Result(5, 6))
+ >>> Token(1, 2).overlap(Token(5, 6))
False
- >>> Result(5, 6).overlap(Result(5, 6))
+ >>> Token(5, 6).overlap(Token(5, 6))
True
- >>> Result(4, 5).overlap(Result(5, 6))
+ >>> Token(4, 5).overlap(Token(5, 6))
True
- >>> Result(4, 5).overlap(Result(5, 7))
+ >>> Token(4, 5).overlap(Token(5, 7))
True
- >>> Result(4, 5).overlap(Result(6, 7))
+ >>> Token(4, 5).overlap(Token(6, 7))
False
"""
start = self.start
end = self.end
return (start <= other.start <= end) or (start <= other.end <= end)

- @classmethod
- def sort(cls, results):
- """
- Return a new sorted sequence of results given a sequence of results. The
- primary sort is on start and the secondary sort is on longer lengths.
- Therefore if two results have the same start, the longer result will sort
- first.
- For example:
- >>> results = [Result(0, 0), Result(5, 5), Result(1, 1), Result(2, 4), Result(2, 5)]
- >>> expected = [Result(0, 0), Result(1, 1), Result(2, 5), Result(2, 4), Result(5, 5)]
- >>> expected == Result.sort(results)
- True
- """
- key = lambda s: (s.start, -len(s),)
- return sorted(results, key=key)

+# tokenize to separate text from parens
+_tokenizer = re.compile('''
+ (?P<token>[^\s\(\)]+)
+ |
+ (?P<space>\s+)
+ |
+ (?P<paren>[\(\)])
+ ''',
+ re.VERBOSE | re.MULTILINE | re.UNICODE
+)

-def filter_overlapping(results):
+def get_tokens(tokens_string):
"""
- Return a new list from an iterable of `results` discarding contained and
- overlaping Results using these rules:
-
- - skip a result fully contained in another result.
- - keep the biggest, left-most result of two overlapping results and skip the other
-
- For example:
- >>> results = [
- ... Result(0, 0, 'a'),
- ... Result(1, 5, 'bcdef'),
- ... Result(2, 4, 'cde'),
- ... Result(3, 7, 'defgh'),
- ... Result(4, 7, 'efgh'),
- ... Result(8, 9, 'ij'),
- ... Result(10, 13, 'klmn'),
- ... Result(11, 15, 'lmnop'),
- ... Result(16, 16, 'q'),
- ... 
] - - >>> filtered = list(filter_overlapping(results)) - >>> filtered == expected - True - """ - results = Result.sort(results) - - # compare pair of results in the sorted sequence: current and next - i = 0 - while i < len(results) - 1: - j = i + 1 - while j < len(results): - curr_res = results[i] - next_res = results[j] - - logger_debug('curr_res, i, next_res, j:', curr_res, i, next_res, j) - # disjoint results: break, there is nothing to do - if next_res.is_after(curr_res): - logger_debug(' break to next', curr_res) - break - - # contained result: discard the contained result - if next_res in curr_res: - logger_debug(' del next_res contained:', next_res) - del results[j] - continue - - # overlap: keep the biggest result and skip the smallest overlapping results - # in case of length tie: keep the left most - if curr_res.overlap(next_res): - if curr_res.priority < next_res.priority: - logger_debug(' del next_res lower priority:', next_res) - del results[j] - continue - elif curr_res.priority > next_res.priority: - logger_debug(' del curr_res lower priority:', curr_res) - del results[i] - break - else: - if len(curr_res) >= len(next_res): - logger_debug(' del next_res smaller overlap:', next_res) - del results[j] - continue - else: - logger_debug(' del curr_res smaller overlap:', curr_res) - del results[i] - break - j += 1 - i += 1 - return results - - -def add_unmatched(string, results): - """ - Yield Result object from the original `string` and the search `results` iterable - of non-overlapping matched substring Result object. New unmatched Results are - added to the stream for unmatched parts. - - For example: - >>> string ='abcdefghijklmn' - >>> results = [ - ... Result(2, 3, 'cd'), - ... Result(7, 7, 'h', None), - ... Result(9, 10, 'jk', None), - ... ] - >>> expected = [ - ... Result(0, 1, 'ab'), - ... Result(2, 3, 'cd'), - ... Result(4, 6, 'efg'), - ... Result(7, 7, 'h'), - ... Result(8, 8, 'i'), - ... Result(9, 10, 'jk'), - ... Result(11, 13, 'lmn') - ... ] - >>> expected == list(add_unmatched(string, results)) - True - - >>> string ='abc2' - >>> results = [ - ... Result(0, 2, 'abc'), - ... ] - >>> expected = [ - ... Result(0, 2, 'abc'), - ... Result(3, 3, '2', None), - ... ] - >>> expected == list(add_unmatched(string, results)) - True - + Return an iterable of strings splitting on spaces and parens. """ - string_pos = 0 - for result in Result.sort(results): - if result.start > string_pos: - start = string_pos - end = result.start - 1 - yield Result(start, end, string[start:end + 1]) - yield result - string_pos = result.end + 1 - - len_string = len(string) - if string_pos < len_string: - start = string_pos - end = len_string - 1 - yield Result(start, end, string[start:end + 1]) + return [match for match in _tokenizer.split(tokens_string.lower()) if match] diff --git a/tests/test__pyahocorasick.py b/tests/test__pyahocorasick.py index 7b346b6..e7ad883 100644 --- a/tests/test__pyahocorasick.py +++ b/tests/test__pyahocorasick.py @@ -6,12 +6,7 @@ WWW : http://0x80.pl License : public domain -Modified for use in the license_expression library and in particular: - - add support for unicode key strinsg. - - rename word to key and output to value (to be more like a mapping/dict) - - case insensitive search - - improve returned results with the actual start,end and matched string. - - support returning non-matched parts of a string +Modified for use in the license_expression library. 
""" from __future__ import unicode_literals @@ -21,35 +16,34 @@ import unittest from license_expression._pyahocorasick import Trie -from license_expression._pyahocorasick import Output -from license_expression._pyahocorasick import Result +from license_expression._pyahocorasick import Token class TestTrie(unittest.TestCase): - def testAddedWordShouldBeCountedAndAvailableForRetrieval(self): + def test_add_can_get(self): t = Trie() t.add('python', 'value') - assert Output('python', 'value') == t.get('python') + assert ('python', 'value') == t.get('python') - def testAddingExistingWordShouldReplaceAssociatedValue(self): + def test_add_existing_WordShouldReplaceAssociatedValue(self): t = Trie() t.add('python', 'value') - assert Output('python', 'value') == t.get('python') + assert ('python', 'value') == t.get('python') t.add('python', 'other') - assert Output('python', 'other') == t.get('python') + assert ('python', 'other') == t.get('python') - def testGetUnknowWordWithoutDefaultValueShouldRaiseException(self): + def test_get_UnknowWordWithoutDefaultValueShouldRaiseException(self): t = Trie() with self.assertRaises(KeyError): t.get('python') - def testGetUnknowWordWithDefaultValueShouldReturnDefault(self): + def test_get_UnknowWordWithDefaultValueShouldReturnDefault(self): t = Trie() self.assertEqual(t.get('python', 'default'), 'default') - def testExistShouldDetectAddedWords(self): + def test_exists_ShouldDetectAddedWords(self): t = Trie() t.add('python', 'value') t.add('ada', 'value') @@ -57,7 +51,7 @@ def testExistShouldDetectAddedWords(self): self.assertTrue(t.exists('python')) self.assertTrue(t.exists('ada')) - def testExistShouldReturnFailOnUnknownWord(self): + def test_exists_ShouldReturnFailOnUnknownWord(self): t = Trie() t.add('python', 'value') @@ -66,20 +60,22 @@ def testExistShouldReturnFailOnUnknownWord(self): def test_is_prefix_ShouldDetecAllPrefixesIncludingWord(self): t = Trie() t.add('python', 'value') - t.add('ada', 'value') + t.add('ada lovelace', 'value') - self.assertTrue(t.is_prefix('a')) - self.assertTrue(t.is_prefix('ad')) + self.assertFalse(t.is_prefix('a')) + self.assertFalse(t.is_prefix('ad')) self.assertTrue(t.is_prefix('ada')) - self.assertTrue(t.is_prefix('p')) - self.assertTrue(t.is_prefix('py')) - self.assertTrue(t.is_prefix('pyt')) - self.assertTrue(t.is_prefix('pyth')) - self.assertTrue(t.is_prefix('pytho')) + self.assertFalse(t.is_prefix('p')) + self.assertFalse(t.is_prefix('py')) + self.assertFalse(t.is_prefix('pyt')) + self.assertFalse(t.is_prefix('pyth')) + self.assertFalse(t.is_prefix('pytho')) self.assertTrue(t.is_prefix('python')) - def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): + self.assertFalse(t.is_prefix('lovelace')) + + def test_items_ShouldReturnAllItemsAlreadyAddedToTheTrie(self): t = Trie() t.add('python', 1) @@ -87,6 +83,7 @@ def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): t.add('perl', 3) t.add('pascal', 4) t.add('php', 5) + t.add('php that', 6) result = list(t.items()) self.assertIn(('python', 1), result) @@ -94,8 +91,9 @@ def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): self.assertIn(('perl', 3), result) self.assertIn(('pascal', 4), result) self.assertIn(('php', 5), result) + self.assertIn(('php that', 6), result) - def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): + def test_keys_ShouldReturnAllKeysAlreadyAddedToTheTrie(self): t = Trie() t.add('python', 1) @@ -103,6 +101,7 @@ def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): t.add('perl', 3) t.add('pascal', 4) t.add('php', 5) + 
t.add('php that', 6)
result = list(t.keys())
self.assertIn('python', result)
@@ -110,8 +109,9 @@ def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self):
self.assertIn('perl', result)
self.assertIn('pascal', result)
self.assertIn('php', result)
+ self.assertIn('php that', result)

- def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self):
+ def test_values_ShouldReturnAllValuesAlreadyAddedToTheTrie(self):
t = Trie()

t.add('python', 1)
@@ -127,36 +127,60 @@ def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self):
self.assertIn(4, result)
self.assertIn(5, result)

- def test_iter_should_not_return_non_matches(self):
+ def test_iter_should_not_return_non_matches_by_default(self):

def get_test_automaton():
- words = "he her hers his she hi him man himan".split()
+ words = 'he her hers his she hi him man himan'.split()
t = Trie()
for w in words:
t.add(w, w)
t.make_automaton()
return t

- test_string = "he she himan"
+ test_string = 'he she himan'
t = get_test_automaton()
result = list(t.iter(test_string))
+ assert 'he she himan'.split() == [r.value for r in result]
+
+ def test_iter_can_return_non_matches_optionally(self):
+
+ def get_test_automaton():
+ words = 'he her hers his she hi him man himan'.split()
+ t = Trie()
+ for w in words:
+ t.add(w, w)
+ t.make_automaton()
+ return t
+
+ test_string = ' he she junk himan other stuffs '
+ # 111111111122222222223333333
+ # 0123456789012345678901234567890123456
+
+ t = get_test_automaton()
+ result = list(t.iter(test_string, include_unmatched=True, include_space=True))
expected = [
- Result(start=0, end=1, string='he', output=Output('he', 'he')),
- Result(start=3, end=5, string='she', output=Output('she', 'she')),
- Result(start=4, end=5, string='he', output=Output('he', 'he')),
- Result(start=7, end=8, string='hi', output=Output('hi', 'hi')),
- Result(start=7, end=9, string='him', output=Output('him', 'him')),
- Result(start=7, end=11, string='himan', output=Output('himan', 'himan')),
- Result(start=9, end=11, string='man', output=Output('man', 'man'))
+ Token(0, 1, u' ', None),
+ Token(2, 3, u'he', u'he'),
+ Token(4, 4, u' ', None),
+ Token(5, 7, u'she', u'she'),
+ Token(8, 8, u' ', None),
+ Token(9, 12, u'junk', None),
+ Token(13, 14, u' ', None),
+ Token(15, 19, u'himan', u'himan'),
+ Token(20, 21, u' ', None),
+ Token(22, 26, u'other', None),
+ Token(27, 27, u' ', None),
+ Token(28, 33, u'stuffs', None),
+ Token(34, 36, u' ', None),
]
assert expected == result

- def test_iter_vs_scan(self):
+ def test_iter_vs_tokenize(self):

def get_test_automaton():
- words = "( AND ) OR".split()
+ words = '( AND ) OR'.split()
t = Trie()
for w in words:
t.add(w, w)
@@ -166,41 +190,38 @@ def get_test_automaton():

test_string = '((l-a + AND l-b) OR (l -c+))'
t = get_test_automaton()
- result = list(t.iter(test_string))
+ result = list(t.iter(test_string, include_unmatched=True, include_space=True))
expected = [
- Result(0, 0, '(', Output('(', '(')),
- Result(1, 1, '(', Output('(', '(')),
- Result(8, 10, 'AND', Output('AND', 'AND')),
- Result(15, 15, ')', Output(')', ')')),
- Result(17, 18, 'OR', Output('OR', 'OR')),
- Result(20, 20, '(', Output('(', '(')),
- Result(26, 26, ')', Output(')', ')')),
- Result(27, 27, ')', Output(')', ')'))
+ Token(0, 0, u'(', u'('),
+ Token(1, 1, u'(', u'('),
+ Token(2, 4, u'l-a', None),
+ Token(5, 5, u' ', None),
+ Token(6, 6, u'+', None),
+ Token(7, 7, u' ', None),
+ Token(8, 10, u'AND', u'AND'),
+ Token(11, 11, u' ', None),
+ Token(12, 14, u'l-b', None),
+ Token(15, 15, u')', u')'),
+ Token(16, 16, u' ', 
None), + Token(17, 18, u'OR', u'OR'), + Token(19, 19, u' ', None), + Token(20, 20, u'(', u'('), + Token(21, 21, u'l', None), + Token(22, 22, u' ', None), + Token(23, 25, u'-c+', None), + Token(26, 26, u')', u')'), + Token(27, 27, u')', u')') ] + assert expected == result - result = list(t.scan(test_string)) - expected = [ - Result(0, 0, '(', Output('(', '(')), - Result(1, 1, '(', Output('(', '(')), - Result(2, 7, 'l-a + ', None), - Result(8, 10, 'AND', Output('AND', 'AND')), - Result(11, 14, ' l-b', None), - Result(15, 15, ')', Output(')', ')')), - Result(16, 16, ' ', None), - Result(17, 18, 'OR', Output('OR', 'OR')), - Result(19, 19, ' ', None), - Result(20, 20, '(', Output('(', '(')), - Result(21, 25, 'l -c+', None), - Result(26, 26, ')', Output(')', ')')), - Result(27, 27, ')', Output(')', ')')) - ] + result = list(t.tokenize(test_string, include_unmatched=True, include_space=True)) assert expected == result - def test_scan_with_unmatched(self): + def test_tokenize_with_unmatched_and_space(self): def get_test_automaton(): - words = "( AND ) OR".split() + words = '( AND ) OR'.split() t = Trie() for w in words: t.add(w, w) @@ -208,18 +229,44 @@ def get_test_automaton(): return t test_string = '((l-a + AND l-b) OR an (l -c+))' - + # 111111111122222222223 + # 0123456789012345678901234567890 t = get_test_automaton() - result = list(t.scan(test_string)) - assert test_string == ''.join(r.string for r in result) + result = list(t.tokenize(test_string, include_unmatched=True, include_space=True)) + expected = [ + Token(0, 0, u'(', u'('), + Token(1, 1, u'(', u'('), + Token(2, 4, u'l-a', None), + Token(5, 5, u' ', None), + Token(6, 6, u'+', None), + Token(7, 7, u' ', None), + Token(8, 10, u'AND', u'AND'), + Token(11, 11, u' ', None), + Token(12, 14, u'l-b', None), + Token(15, 15, u')', u')'), + Token(16, 16, u' ', None), + Token(17, 18, u'OR', u'OR'), + Token(19, 19, u' ', None), + Token(20, 21, u'an', None), + Token(22, 22, u' ', None), + Token(23, 23, u'(', u'('), + Token(24, 24, u'l', None), + Token(25, 25, u' ', None), + Token(26, 28, u'-c+', None), + Token(29, 29, u')', u')'), + Token(30, 30, u')', u')') + ] + + assert expected == result + assert test_string == ''.join(t.string for t in result) def test_iter_with_unmatched_simple(self): t = Trie() - t.add('AND', 'AND') + t.add('And', 'And') t.make_automaton() - test_string = 'AND an a and' + test_string = 'AND an a And' result = list(t.iter(test_string)) - assert 'ANDand' == ''.join(r.string for r in result) + assert ['And', 'And'] == [r.value for r in result] def test_iter_with_unmatched_simple2(self): t = Trie() @@ -227,5 +274,49 @@ def test_iter_with_unmatched_simple2(self): t.make_automaton() test_string = 'AND an a and' result = list(t.iter(test_string)) - assert 'ANDand' == ''.join(r.string for r in result) + assert ['AND', 'AND'] == [r.value for r in result] + + def test_iter_with_unmatched_simple3(self): + t = Trie() + t.add('AND', 'AND') + t.make_automaton() + test_string = 'AND an a andersom' + result = list(t.iter(test_string)) + assert ['AND'] == [r.value for r in result] + def test_iter_simple(self): + t = Trie() + t.add('AND', 'AND') + t.add('OR', 'OR') + t.add('WITH', 'WITH') + t.add('(', '(') + t.add(')', ')') + t.add('GPL-2.0', 'GPL-2.0') + t.add('mit', 'MIT') + t.add('Classpath', 'Classpath') + t.make_automaton() + test_string = '(GPL-2.0 with Classpath) or (gpl-2.0) and (classpath or gpl-2.0 OR mit) ' + # 111111111122222222223333333333444444444455555555556666666666777 + # 
0123456789012345678901234567890123456789012345678901234567890123456789012 + result = list(t.iter(test_string)) + expected = [ + Token(0, 0, u'(', u'('), + Token(1, 7, u'GPL-2.0', u'GPL-2.0'), + Token(9, 12, u'with', u'WITH'), + Token(14, 22, u'Classpath', u'Classpath'), + Token(23, 23, u')', u')'), + Token(25, 26, u'or', u'OR'), + Token(28, 28, u'(', u'('), + Token(29, 35, u'gpl-2.0', u'GPL-2.0'), + Token(36, 36, u')', u')'), + Token(38, 40, u'and', u'AND'), + Token(42, 42, u'(', u'('), + Token(43, 51, u'classpath', u'Classpath'), + Token(53, 54, u'or', u'OR'), + Token(57, 63, u'gpl-2.0', u'GPL-2.0'), + Token(65, 66, u'OR', u'OR'), + Token(68, 70, u'mit', u'MIT'), + Token(71, 71, u')', u')') + ] + + assert expected == result diff --git a/tests/test_license_expression.py b/tests/test_license_expression.py index 58312c4..6ceed6c 100644 --- a/tests/test_license_expression.py +++ b/tests/test_license_expression.py @@ -17,8 +17,10 @@ from __future__ import print_function from __future__ import unicode_literals +from collections import namedtuple from collections import OrderedDict from unittest import TestCase +from unittest.case import expectedFailure import sys from boolean.boolean import PARSE_UNBALANCED_CLOSING_PARENS @@ -28,6 +30,7 @@ from license_expression import PARSE_INVALID_NESTING from license_expression import PARSE_INVALID_EXCEPTION from license_expression import PARSE_INVALID_SYMBOL_AS_EXCEPTION +from license_expression import PARSE_INVALID_OPERATOR_SEQUENCE from license_expression import ExpressionError from license_expression import Keyword @@ -37,12 +40,9 @@ from license_expression import LicenseSymbolLike from license_expression import LicenseWithExceptionSymbol from license_expression import ParseError -from license_expression import Result -from license_expression import Output +from license_expression import Token -from license_expression import group_results_for_with_subexpression -from license_expression import splitter -from license_expression import strip_and_skip_spaces +from license_expression import build_token_groups_for_with_subexpression from license_expression import validate_symbols from license_expression import TOKEN_AND @@ -150,9 +150,9 @@ def test_tokenize_plain4(self): expected = [ (TOKEN_LPAR, '(', 0), (TOKEN_LPAR, '(', 1), - (LicenseSymbol(key=u'l-a+'), u'l-a+', 2), + (LicenseSymbol(key=u'l-a+'), 'l-a+', 2), (TOKEN_AND, 'AND', 7), - (LicenseSymbol(key=u'l-b'), u'l-b', 11), + (LicenseSymbol(key=u'l-b'), 'l-b', 11), (TOKEN_RPAR, ')', 14), (TOKEN_OR, 'OR', 16), (TOKEN_LPAR, '(', 19), @@ -198,27 +198,46 @@ def get_symbols_and_licensing(self): licensing = Licensing(symbols) return gpl_20, gpl_20_plus, lgpl_21, mit, licensing - def test_tokenize_1(self): + def test_tokenize_1_with_symbols(self): gpl_20, _gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() - result = licensing.tokenize('The GNU GPL 20 or LGPL-2.1 and mit') + + result = licensing.tokenize('The GNU GPL 20 or LGPL v2.1 AND MIT license ') + # 111111111122222222223333333333444 + # 0123456789012345678901234567890123456789012 + expected = [ (gpl_20, 'The GNU GPL 20', 0), - (TOKEN_OR, ' or ', 14), - (lgpl_21, 'LGPL-2.1', 18), - (TOKEN_AND, ' and ', 26), - (mit, 'mit', 31)] + (TOKEN_OR, 'or', 15), + (lgpl_21, 'LGPL v2.1', 18), + (TOKEN_AND, 'AND', 28), + (mit, 'MIT license', 32) + ] + assert expected == list(result) + + def test_tokenize_1_no_symbols(self): + licensing = Licensing() + + result = licensing.tokenize('The GNU GPL 20 or LGPL v2.1 AND MIT license') + + expected = [ + 
(LicenseSymbol(u'The GNU GPL 20'), 'The GNU GPL 20', 0), + (TOKEN_OR, 'or', 15), + (LicenseSymbol(u'LGPL v2.1'), 'LGPL v2.1', 18), + (TOKEN_AND, 'AND', 28), + (LicenseSymbol(u'MIT license'), 'MIT license', 32) + ] + assert expected == list(result) def test_tokenize_with_trailing_unknown(self): - gpl_20, _gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() + gpl_20, _gpl_20_plus, lgpl_21, _mit, licensing = self.get_symbols_and_licensing() result = licensing.tokenize('The GNU GPL 20 or LGPL-2.1 and mit2') expected = [ (gpl_20, 'The GNU GPL 20', 0), - (TOKEN_OR, ' or ', 14), + (TOKEN_OR, 'or', 15), (lgpl_21, 'LGPL-2.1', 18), - (TOKEN_AND, ' and ', 26), - (mit, 'mit', 31), - (LicenseSymbol(key='2'), '2', 34) + (TOKEN_AND, 'and', 27), + (LicenseSymbol(key='mit2'), 'mit2', 31), ] assert expected == list(result) @@ -228,14 +247,15 @@ def test_tokenize_3(self): result = licensing.tokenize('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit') expected = [ (gpl_20_plus, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl_21, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), - (gpl_20, 'The GNU GPL 20', 49), (2, ' or ', 63), + (TOKEN_OR, 'or', 46), + (gpl_20, 'The GNU GPL 20', 49), + (2, 'or', 64), (mit, 'mit', 67) ] assert expected == list(result) @@ -245,8 +265,41 @@ def test_tokenize_unknown_as_trailing_single_attached_character(self): l = Licensing(symbols) result = list(l.tokenize('mit2')) expected = [ - (LicenseSymbol(key='MIT', aliases=('MIT license',)), 'mit', 0), - (LicenseSymbol(key='2'), '2', 3), + (LicenseSymbol(u'mit2'), 'mit2', 0), + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_leading(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 AND gpl-2.0-plus', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseSymbol(key='gpl-2.0'), + TOKEN_AND, + LicenseSymbol(key='gpl-2.0-plus'), + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_contained(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 WITH exception-gpl-2.0-plus', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseWithExceptionSymbol( + LicenseSymbol(u'gpl-2.0'), + LicenseSymbol(u'exception-gpl-2.0-plus') + ) + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_trailing(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 AND exception-gpl-2.0', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseSymbol(u'gpl-2.0'), + TOKEN_AND, + LicenseSymbol(u'exception-gpl-2.0') ] assert expected == result @@ -270,7 +323,12 @@ def test_parse_raise_ParseError(self): licensing.parse(expression) self.fail('ParseError should be raised') except ParseError as pe: - expected = {'error_code': PARSE_UNBALANCED_CLOSING_PARENS, 'position': 48, 'token_string': ')', 'token_type': TOKEN_RPAR} + expected = { + 'error_code': PARSE_UNBALANCED_CLOSING_PARENS, + 'position': 48, + 'token_string': ')', + 'token_type': TOKEN_RPAR + } assert expected == _parse_error_as_dict(pe) def test_parse_raise_ExpressionError_when_validating(self): @@ -278,31 +336,65 @@ def test_parse_raise_ExpressionError_when_validating(self): licensing = Licensing() try: licensing.parse(expression, validate=True) + self.fail('Exception not 
raised')
except ExpressionError as ee:
assert 'Unknown license key(s): gpl, bsd, lgpl, exception' == str(ee)

- def test_parse_raise_ExpressionError_when_validating_strict(self):
+ def test_parse_raise_ParseError_when_validating_strict(self):
+ expression = 'gpl and bsd or lgpl with exception'
+ licensing = Licensing()
+ try:
+ licensing.parse(expression, validate=True, strict=True)
+ self.fail('Exception not raised')
+ except ParseError as pe:
+ expected = {
+ 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION,
+ 'position': 25,
+ 'token_string': 'exception',
+ 'token_type': TOKEN_SYMBOL
+ }
+ assert expected == _parse_error_as_dict(pe)
+
+ def test_parse_raise_ParseError_when_strict_no_validate(self):
expression = 'gpl and bsd or lgpl with exception'
licensing = Licensing()
+ try:
+ licensing.parse(expression, validate=False, strict=True)
+ self.fail('Exception not raised')
+ except ParseError as pe:
+ expected = {
+ 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION,
+ 'position': 25,
+ 'token_string': 'exception',
+ 'token_type': TOKEN_SYMBOL
+ }
+ assert expected == _parse_error_as_dict(pe)
+
+ def test_parse_raise_ExpressionError_when_validating_strict_with_unknown(self):
+ expression = 'gpl and bsd or lgpl with exception'
+ licensing = Licensing(symbols=[LicenseSymbol('exception', is_exception=True)])
try:
licensing.parse(expression, validate=True, strict=True)
except ExpressionError as ee:
- assert str(ee).startswith('exception_symbol must be an exception with "is_exception" set to True:')
+ assert 'Unknown license key(s): gpl, bsd, lgpl' == str(ee)

def test_parse_in_strict_mode_for_solo_symbol(self):
expression = 'lgpl'
licensing = Licensing()
licensing.parse(expression, strict=True)

- def test_parse_invalid_expression_raise_expression(self):
+ def test_parse_invalid_expression_raise_exception(self):
licensing = Licensing()
-
expr = 'wrong'
licensing.parse(expr)

+ def test_parse_not_invalid_expression_raise_no_exception(self):
+ licensing = Licensing()
expr = 'l-a AND none'
licensing.parse(expr)

+ def test_parse_invalid_expression_raise_exception3(self):
+ licensing = Licensing()
expr = '(l-a + AND l-b'
try:
licensing.parse(expr)
@@ -310,6 +402,8 @@ def test_parse_invalid_expression_raise_expression(self):
except ParseError:
pass

+ def test_parse_invalid_expression_raise_exception4(self):
+ licensing = Licensing()
expr = '(l-a + AND l-b))'
try:
licensing.parse(expr)
@@ -317,20 +411,33 @@
except ParseError:
pass

+ def test_parse_invalid_expression_raise_exception5(self):
+ licensing = Licensing()
expr = 'l-a AND'
try:
licensing.parse(expr)
self.fail("Exception not raised when validating '%s'" % expr)
- except ParseError:
- pass
+ except ExpressionError as ee:
+ assert 'AND requires two or more licenses as in: MIT AND BSD' == str(ee)

+ def test_parse_invalid_expression_raise_exception6(self):
+ licensing = Licensing()
expr = 'OR l-a'
try:
licensing.parse(expr)
self.fail("Exception not raised when validating '%s'" % expr)
- except ParseError:
- pass
+ except ParseError as pe:
+ expected = {
+ 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE,
+ 'position': 0,
+ 'token_string': 'OR',
+ 'token_type': TOKEN_OR
+ }
+ assert expected == _parse_error_as_dict(pe)

+ def test_parse_not_invalid_expression_raise_no_exception2(self):
+ licensing = Licensing()
expr = '+l-a'
licensing.parse(expr)

@@ -355,7 +462,12 @@ def test_parse_errors_catch_invalid_nesting(self):
licensing.parse('mit (and LGPL 2.1)') 
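+ # the '(' right after the 'mit' symbol is the invalid nesting
+ # reported at position 4 in the expected error below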
self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_NESTING, 'position': 4, 'token_string': '(', 'token_type': TOKEN_LPAR} + expected = { + 'error_code': PARSE_INVALID_NESTING, + 'position': 4, + 'token_string': '(', + 'token_type': TOKEN_LPAR + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_expression_with_bare_and(self): @@ -364,7 +476,12 @@ def test_parse_errors_catch_invalid_expression_with_bare_and(self): licensing.parse('and') self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_EXPRESSION, 'position':-1, 'token_string': '', 'token_type': None} + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 0, + 'token_string': 'and', + 'token_type': TOKEN_AND + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_expression_with_or_and_no_other(self): @@ -373,7 +490,12 @@ def test_parse_errors_catch_invalid_expression_with_or_and_no_other(self): licensing.parse('or that') self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_EXPRESSION, 'position':-1, 'token_string': '', 'token_type': None} + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 0, + 'token_string': 'or', + 'token_type': TOKEN_OR + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_expression_with_empty_parens(self): @@ -382,7 +504,12 @@ def test_parse_errors_catch_invalid_expression_with_empty_parens(self): licensing.parse('with ( )this') self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_EXPRESSION, 'position': 0, 'token_string': 'with', 'token_type': TOKEN_WITH} + expected = { + 'error_code': PARSE_INVALID_EXPRESSION, + 'position': 0, + 'token_string': 'with', + 'token_type': TOKEN_WITH + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_non_unicode_byte_strings_on_python3(self): @@ -580,15 +707,11 @@ def test_create_from_python(self): ) assert a == b - def test_parse_with_repeated_or_later_raise_parse_error(self): + def test_parse_with_repeated_or_later_does_not_raise_parse_error(self): l = Licensing() expr = 'LGPL2.1+ + and mit' - try: - l.parse(expr) - self.fail('Exception not raised') - except ParseError as ee: - expected = 'Invalid symbols sequence such as (A B) for token: "+" at position: 9' - assert expected == str(ee) + parsed = l.parse(expr) + assert 'LGPL2.1+ + AND mit' == str(parsed) def test_render_complex(self): licensing = Licensing() @@ -651,18 +774,6 @@ def test_parse_complex2(self): expected = 'GPL-2.0 OR (LGPL-2.1 AND mit)' assert expected == expr.render('{symbol.key}') - def test_Licensing_can_scan_valid_expressions_with_symbols_that_contain_and_with_or(self): - licensing = Licensing() - expression = 'orgpl or withbsd with orclasspath and andmit or andlgpl and ormit or withme' - result = [r.string for r in licensing.get_scanner().scan(expression)] - expected = [ - 'orgpl', ' or ', 'withbsd', ' with ', 'orclasspath', - ' and ', 'andmit', ' or ', 'andlgpl', ' and ', 'ormit', - ' or ', 'withme' - ] - - assert expected == result - def test_Licensing_can_tokenize_valid_expressions_with_symbols_that_contain_and_with_or(self): licensing = Licensing() expression = 'orgpl or withbsd with orclasspath and andmit or anlgpl and ormit or withme' @@ -687,6 +798,40 @@ def test_Licensing_can_tokenize_valid_expressions_with_symbols_that_contain_and_ assert 
expected == result + def test_Licensing_can_simple_tokenize_valid_expressions_with_symbols_that_contain_and_with_or(self): + licensing = Licensing() + expression = 'orgpl or withbsd with orclasspath and andmit or andlgpl and ormit or withme' + + result = [r.string for r in licensing.simple_tokenizer(expression)] + expected = [ + 'orgpl', + ' ', + 'or', + ' ', + 'withbsd', + ' ', + 'with', + ' ', + 'orclasspath', + ' ', + 'and', + ' ', + 'andmit', + ' ', + 'or', + ' ', + 'andlgpl', + ' ', + 'and', + ' ', + 'ormit', + ' ', + 'or', + ' ', + 'withme' + ] + assert expected == result + def test_Licensing_can_parse_valid_expressions_with_symbols_that_contain_and_with_or(self): licensing = Licensing() expression = 'orgpl or withbsd with orclasspath and andmit or anlgpl and ormit or withme' @@ -695,24 +840,97 @@ def test_Licensing_can_parse_valid_expressions_with_symbols_that_contain_and_wit expected = 'orgpl OR (withbsd WITH orclasspath AND andmit) OR (anlgpl AND ormit) OR withme' assert expected == result.render('{symbol.key}') + def test_Licensing_can_parse_valid_expressions_with_symbols_that_contain_spaces(self): + licensing = Licensing() + expression = ' GPL-2.0 or (mit and LGPL 2.1) or bsd Or GPL-2.0 or (mit and LGPL 2.1)' + parsed = licensing.parse(expression) + expected = 'GPL-2.0 OR (mit AND LGPL 2.1) OR bsd OR GPL-2.0 OR (mit AND LGPL 2.1)' + assert expected == str(parsed) -class LicensingParseWithSymbolsSimpleTest(TestCase): + def test_parse_invalid_expression_with_trailing_or(self): + licensing = Licensing() + expr = 'mit or' + try: + licensing.parse(expr) + self.fail("Exception not raised when validating '%s'" % expr) + except ExpressionError as ee: + assert 'OR requires two or more licenses as in: MIT OR BSD' == str(ee) + + def test_parse_invalid_expression_with_trailing_or_and_valid_start_does_not_raise_exception(self): + licensing = Licensing() + expression = ' mit or mit or ' + parsed = licensing.parse(expression) + # ExpressionError: OR requires two or more licenses as in: MIT OR BSD + expected = 'mit OR mit' + assert expected == str(parsed) + + def test_parse_invalid_expression_with_repeated_trailing_or_raise_exception(self): + licensing = Licensing() + expression = 'mit or mit or or' + try: + licensing.parse(expression, simple=False) + self.fail('Exception not raised') + except ParseError as pe: + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 14, + 'token_string': 'or', + 'token_type': TOKEN_OR + } + assert expected == _parse_error_as_dict(pe) - def test_Licensing_with_illegal_symbols_raise_Exception(self): + @expectedFailure + def test_parse_invalid_expression_with_single_trailing_or_raise_exception(self): + licensing = Licensing() + expression = 'mit or mit or' try: - Licensing([ - 'GPL-2.0 or LATER', - 'classpath Exception', - 'something with else+', - 'mit', - 'LGPL 2.1', - 'mit or later' - ]) + licensing.parse(expression, simple=False) + self.fail('Exception not raised') + except ParseError as pe: + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 14, + 'token_string': 'or', + 'token_type': TOKEN_OR + } + assert expected == _parse_error_as_dict(pe) + + def test_parse_invalid_expression_with_single_trailing_and_raise_exception(self): + licensing = Licensing() + expression = 'mit or mit and' + try: + licensing.parse(expression, simple=False) + self.fail('Exception not raised') except ExpressionError as ee: - expected = ('Invalid license key: "or later" words are reserved and ' - 'cannot be used in a key: "GPL-2.0 or 
LATER"') + assert 'AND requires two or more licenses as in: MIT AND BSD' == str(ee) + + def test_parse_invalid_expression_with_single_leading_or_raise_exception(self): + licensing = Licensing() + expression = 'or mit or mit' + try: + licensing.parse(expression, simple=False) + self.fail('Exception not raised') + except ParseError as pe: + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 0, + 'token_string': 'or', + 'token_type': TOKEN_OR + } + assert expected == _parse_error_as_dict(pe) - assert expected == str(ee) + +class LicensingParseWithSymbolsSimpleTest(TestCase): + + def test_Licensing_with_overlapping_symbols_with_keywords_does_not_raise_Exception(self): + Licensing([ + 'GPL-2.0 or LATER', + 'classpath Exception', + 'something with else+', + 'mit', + 'LGPL 2.1', + 'mit or later' + ]) def get_syms_and_licensing(self): a = LicenseSymbol('l-a') @@ -881,6 +1099,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_non_exception_symbol expression = 'gpl and bsd or lgpl with exception' try: licensing.parse(expression, validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION, @@ -895,6 +1114,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in licensing.parse('gpl with exception', validate=True, strict=True) try: licensing.parse('exception with gpl', validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_EXCEPTION, @@ -905,6 +1125,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in try: licensing.parse('gpl with gpl', validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION, @@ -913,6 +1134,31 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in 'token_type': TOKEN_SYMBOL} assert expected == _parse_error_as_dict(pe) + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_with(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus WITH openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_and(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_or(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus OR openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_known_symbol_string_contained_in_known_symbol_does_not_crash_or(self): + l = Licensing(['lgpl-3.0-plus', 'openssl-exception-lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus OR openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_known_symbol_string_contained_in_known_symbol_does_not_crash_with(self): + l = Licensing(['lgpl-3.0-plus', 'openssl-exception-lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus WITH openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + class LicensingSymbolsReplacement(TestCase): @@ -1011,14 +1257,18 @@ def get_symbols_and_licensing(self): licensing = Licensing(symbols) return gpl2, gpl2plus, lgpl, mit, mitand2, licensing - def test_parse_trailing_char_raise_exception(self): + def 
test_parse_trailing_char_does_not_raise_exception_without_validate(self): + _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() + e = licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2', validate=False) + assert 'gpl-2.0 OR (LGPL-2.1 AND mit2)' == str(e) + + def test_parse_trailing_char_raise_exception_with_validate(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() try: - licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2') - except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 34, - 'token_string': '2', 'token_type': LicenseSymbol('2')} - assert expected == _parse_error_as_dict(pe) + licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2', validate=True) + self.fail('Exception not raised') + except ExpressionError as ee: + assert 'Unknown license key(s): mit2' == str(ee) def test_parse_expression_with_trailing_unknown_should_raise_exception(self): gpl2, gpl2plus, lgpl, mit, _mitand2, licensing = self.get_symbols_and_licensing() @@ -1027,25 +1277,26 @@ def test_parse_expression_with_trailing_unknown_should_raise_exception(self): tokens = list(licensing.tokenize('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit 123')) expected = [ (gpl2plus, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), + (TOKEN_OR, 'or', 46), (gpl2, 'The GNU GPL 20', 49), - (TOKEN_OR, ' or ', 63), + (TOKEN_OR, 'or', 64), (mit, 'mit', 67), - (unknown, ' 123', 70) + (unknown, '123', 71) ] assert expected == tokens try: licensing.parse('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit 123') + self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 70, - 'token_string': ' 123', 'token_type': unknown} + expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 71, + 'token_string': '123', 'token_type': unknown} assert expected == _parse_error_as_dict(pe) def test_parse_expression_with_trailing_unknown_should_raise_exception2(self): @@ -1053,9 +1304,11 @@ def test_parse_expression_with_trailing_unknown_should_raise_exception2(self): unknown = LicenseSymbol(key='123') try: licensing.parse('The GNU GPL 20 or mit 123') + # 01234567890123456789012345 + self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 21, - 'token_string': ' 123', 'token_type': unknown} + expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 22, + 'token_string': '123', 'token_type': unknown} assert expected == _parse_error_as_dict(pe) def test_parse_expression_with_WITH(self): @@ -1069,15 +1322,15 @@ def test_parse_expression_with_WITH(self): tokens = list(licensing.tokenize(expr)) expected = [ (gpl_20_or_later, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), + (TOKEN_OR, 'or', 46), (gpl2, 'The GNU GPL 20', 49), - (TOKEN_OR, ' or ', 63), + (TOKEN_OR, 'or', 64), (LicenseWithExceptionSymbol(mit, mitexp), 'mit with mit exp', 67) ] @@ -1123,19 +1376,37 @@ def test_unknown_keys_with_trailing_char(self): assert [] == 
licensing.unknown_license_keys(parsed) assert [] == licensing.unknown_license_keys(expr) - def test_unknown_keys_with_trailing_char_2(self): + def test_unknown_keys_with_trailing_char_2_with_validate(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() expr = 'The GNU GPL 20 or LGPL-2.1 and mitand3' try: - licensing.parse(expr) - self.fail('ParseError should be raised') - except ParseError as pe: - expected = {'error_code': 5, 'position': 34, 'token_string': u'and3', 'token_type': LicenseSymbol(key=u'and3')} + licensing.parse(expr, validate=True) + self.fail('Exception should be raised') + except ExpressionError as ee: + assert 'Unknown license key(s): mitand3' == str(ee) - assert expected == _parse_error_as_dict(pe) + def test_unknown_keys_with_trailing_char_2_without_validate(self): + _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() + expr = 'The GNU GPL 20 or LGPL-2.1 and mitand3' + parsed = licensing.parse(expr, validate=False) + assert 'gpl-2.0 OR (LGPL-2.1 AND mitand3)' == str(parsed) + + def test_parse_with_overlapping_key_without_symbols(self): + expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 1111111111222222222233333333334444444444555555555566666 + # 0123456789012345678901234567890123456789012345678901234 + + licensing = Licensing() + results = str(licensing.parse(expression)) + expected = 'mit OR (mit AND zlib) OR mit OR mit WITH verylonglicense' + assert expected == results + + def test_advanced_tokenizer_tokenize_with_overlapping_key_with_symbols_and_trailing_unknown(self): + expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 111111111122222222223333333333444444444455555 + # 0123456789012345678901234567890123456789012345678901234 - def test_parse_with_overlapping_key_with_licensing(self): symbols = [ LicenseSymbol('MIT', ['MIT license']), LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), @@ -1145,11 +1416,104 @@ def test_parse_with_overlapping_key_with_licensing(self): LicenseSymbol('hmit', ['h verylonglicense']), ] licensing = Licensing(symbols) + results = list(licensing.get_advanced_tokenizer().tokenize(expression)) + expected = [ + Token(0, 2, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(4, 5, 'or', Keyword(value=u'or', type=2)), + Token(7, 9, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(11, 13, 'AND', Keyword(value=u'and', type=1)), + Token(15, 18, 'zlib', LicenseSymbol(u'zlib', aliases=(u'zlib',))), + Token(20, 21, 'or', Keyword(value=u'or', type=2)), + Token(23, 25, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(27, 28, 'or', Keyword(value=u'or', type=2)), + Token(30, 32, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(34, 37, 'with', Keyword(value=u'with', type=10)), + Token(39, 53, 'verylonglicense', None), + ] + + assert expected == results + + def test_advanced_tokenizer_iter_with_overlapping_key_with_symbols_and_trailing_unknown(self): + expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 111111111122222222223333333333444444444455555 + # 0123456789012345678901234567890123456789012345678901234 + + symbols = [ + LicenseSymbol('MIT', ['MIT license']), + LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), + LicenseSymbol('zlib', ['zlib']), + LicenseSymbol('d-zlib', ['D zlib']), + LicenseSymbol('mito', ['mit o']), + LicenseSymbol('hmit', ['h verylonglicense']), + ] + licensing = Licensing(symbols) + results = list(licensing.get_advanced_tokenizer().iter(expression, 
include_unmatched=True)) + expected = [ + Token(0, 2, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(4, 5, 'or', Keyword(value=u'or', type=2)), + Token(7, 9, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(11, 13, 'AND', Keyword(value=u'and', type=1)), + Token(15, 18, 'zlib', LicenseSymbol(u'zlib', aliases=(u'zlib',))), + Token(20, 21, 'or', Keyword(value=u'or', type=2)), + Token(23, 25, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(27, 28, 'or', Keyword(value=u'or', type=2)), + Token(30, 32, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(34, 37, 'with', Keyword(value=u'with', type=10)), + Token(39, 53, 'verylonglicense', None), + ] + assert expected == results + + def test_advanced_tokenizer_iter_with_overlapping_key_with_symbols_and_trailing_unknown2(self): + expression = 'mit with verylonglicense' + symbols = [ + LicenseSymbol('MIT', ['MIT license']), + LicenseSymbol('hmit', ['h verylonglicense']), + ] + licensing = Licensing(symbols) + results = list(licensing.get_advanced_tokenizer().iter(expression, include_unmatched=True)) + expected = [ + Token(0, 2, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(4, 7, 'with', Keyword(value=u'with', type=10)), + Token(9, 23, 'verylonglicense', None), + ] + assert expected == results + def test_tokenize_with_overlapping_key_with_symbols_and_trailing_unknown(self): expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 1111111111222222222233333333334444444444555555555566666 + # 0123456789012345678901234567890123456789012345678901234 + + symbols = [ + LicenseSymbol('MIT', ['MIT license']), + LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), + LicenseSymbol('zlib', ['zlib']), + LicenseSymbol('d-zlib', ['D zlib']), + LicenseSymbol('mito', ['mit o']), + LicenseSymbol('hmit', ['h verylonglicense']), + ] + licensing = Licensing(symbols) + + results = list(licensing.tokenize(expression)) + expected = [ + (LicenseSymbol(u'MIT', aliases=(u'MIT license',)), 'mit', 0), + (2, 'or', 4), + (LicenseSymbol(u'MIT', aliases=(u'MIT license',)), 'mit', 7), + (1, 'AND', 11), + (LicenseSymbol(u'zlib', aliases=(u'zlib',)), 'zlib', 15), + (2, 'or', 20), + (LicenseSymbol(u'MIT', aliases=(u'MIT license',)), 'mit', 23), + (2, 'or', 27), + (LicenseWithExceptionSymbol( + license_symbol=LicenseSymbol(u'MIT', aliases=(u'MIT license',)), + exception_symbol=LicenseSymbol(u'verylonglicense')), 'mit with verylonglicense', + 30) + ] + + assert expected == results + results = str(licensing.parse(expression)) - expected = 'mit OR (MIT AND zlib) OR mit OR MIT WITH verylonglicense' - self.assertEqual(expected, results) + expected = 'MIT OR (MIT AND zlib) OR MIT OR MIT WITH verylonglicense' + assert expected == results class LicensingSymbolsTest(TestCase): @@ -1286,104 +1650,105 @@ def test_primary_license_symbol_and_primary_license_key(self): class SplitAndTokenizeTest(TestCase): - def test_splitter(self): + def test_simple_tokenizer(self): expr = (' GPL-2.0 or later with classpath Exception and mit and ' 'mit with SOMETHING with ELSE+ or LGPL 2.1 and ' 'GPL-2.0 or LATER with (Classpath Exception and ' 'mit or later) or LGPL 2.1 or mit or GPL-2.0 or LATER ' 'with SOMETHING with ELSE+ and lgpl 2.1') - results = list(splitter(expr)) + licensing = Licensing() + results = list(licensing.simple_tokenizer(expr)) expected = [ - Result(0, 0, ' ', None), - Result(1, 7, 'GPL-2.0', Output('GPL-2.0', LicenseSymbol(key='GPL-2.0',))), - Result(8, 8, ' ', None), - Result(9, 10, 'or', Output('or', 
Keyword(value='or', type=TOKEN_OR))), - Result(11, 11, ' ', None), - Result(12, 16, 'later', Output('later', LicenseSymbol(key='later',))), - Result(17, 17, ' ', None), - Result(18, 21, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(22, 22, ' ', None), - Result(23, 31, 'classpath', Output('classpath', LicenseSymbol(key='classpath',))), - Result(32, 32, ' ', None), - Result(33, 41, 'Exception', Output('Exception', LicenseSymbol(key='Exception',))), - Result(42, 42, ' ', None), - Result(43, 45, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(46, 46, ' ', None), - Result(47, 49, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(50, 50, ' ', None), - Result(51, 53, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(54, 54, ' ', None), - Result(55, 57, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(58, 58, ' ', None), - Result(59, 62, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(63, 63, ' ', None), - Result(64, 72, 'SOMETHING', Output('SOMETHING', LicenseSymbol(key='SOMETHING',))), - Result(73, 73, ' ', None), - Result(74, 77, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(78, 78, ' ', None), - Result(79, 83, 'ELSE+', Output('ELSE+', LicenseSymbol(key='ELSE+',))), - Result(84, 84, ' ', None), - Result(85, 86, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(87, 87, ' ', None), - Result(88, 91, 'LGPL', Output('LGPL', LicenseSymbol(key='LGPL',))), - Result(92, 92, ' ', None), - Result(93, 95, '2.1', Output('2.1', LicenseSymbol(key='2.1',))), - Result(96, 96, ' ', None), - Result(97, 99, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(100, 100, ' ', None), - Result(101, 107, 'GPL-2.0', Output('GPL-2.0', LicenseSymbol(key='GPL-2.0',))), - Result(108, 108, ' ', None), - Result(109, 110, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(111, 111, ' ', None), - Result(112, 116, 'LATER', Output('LATER', LicenseSymbol(key='LATER',))), - Result(117, 117, ' ', None), - Result(118, 121, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(122, 122, ' ', None), - Result(123, 123, '(', Output('(', Keyword(value='(', type=TOKEN_LPAR))), - Result(124, 132, 'Classpath', Output('Classpath', LicenseSymbol(key='Classpath',))), - Result(133, 133, ' ', None), - Result(134, 142, 'Exception', Output('Exception', LicenseSymbol(key='Exception',))), - Result(143, 143, ' ', None), - Result(144, 146, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(147, 147, ' ', None), - Result(148, 150, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(151, 151, ' ', None), - Result(152, 153, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(154, 154, ' ', None), - Result(155, 159, 'later', Output('later', LicenseSymbol(key='later',))), - Result(160, 160, ')', Output(')', Keyword(value=')', type=TOKEN_RPAR))), - Result(161, 161, ' ', None), - Result(162, 163, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(164, 164, ' ', None), - Result(165, 168, 'LGPL', Output('LGPL', LicenseSymbol(key='LGPL',))), - Result(169, 169, ' ', None), - Result(170, 172, '2.1', Output('2.1', LicenseSymbol(key='2.1',))), - Result(173, 173, ' ', None), - Result(174, 175, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(176, 176, ' ', None), - Result(177, 179, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(180, 180, ' ', None), - Result(181, 182, 'or', 
Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(183, 183, ' ', None), - Result(184, 190, 'GPL-2.0', Output('GPL-2.0', LicenseSymbol(key='GPL-2.0',))), - Result(191, 191, ' ', None), - Result(192, 193, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(194, 194, ' ', None), - Result(195, 199, 'LATER', Output('LATER', LicenseSymbol(key='LATER',))), - Result(200, 200, ' ', None), - Result(201, 204, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(205, 205, ' ', None), - Result(206, 214, 'SOMETHING', Output('SOMETHING', LicenseSymbol(key='SOMETHING',))), - Result(215, 215, ' ', None), - Result(216, 219, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(220, 220, ' ', None), - Result(221, 225, 'ELSE+', Output('ELSE+', LicenseSymbol(key='ELSE+',))), - Result(226, 226, ' ', None), - Result(227, 229, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(230, 230, ' ', None), - Result(231, 234, 'lgpl', Output('lgpl', LicenseSymbol(key='lgpl',))), - Result(235, 235, ' ', None), - Result(236, 238, '2.1', Output('2.1', LicenseSymbol(key='2.1',))) + Token(0, 0, ' ', None), + Token(1, 7, 'GPL-2.0', LicenseSymbol(key='GPL-2.0')), + Token(8, 8, ' ', None), + Token(9, 10, 'or', Keyword(value='or', type=TOKEN_OR)), + Token(11, 11, ' ', None), + Token(12, 16, 'later', LicenseSymbol(key='later')), + Token(17, 17, ' ', None), + Token(18, 21, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(22, 22, ' ', None), + Token(23, 31, 'classpath', LicenseSymbol(key='classpath')), + Token(32, 32, ' ', None), + Token(33, 41, 'Exception', LicenseSymbol(key='Exception')), + Token(42, 42, ' ', None), + Token(43, 45, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(46, 46, ' ', None), + Token(47, 49, 'mit', LicenseSymbol(key='mit')), + Token(50, 50, ' ', None), + Token(51, 53, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(54, 54, ' ', None), + Token(55, 57, 'mit', LicenseSymbol(key='mit')), + Token(58, 58, ' ', None), + Token(59, 62, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(63, 63, ' ', None), + Token(64, 72, 'SOMETHING', LicenseSymbol(key='SOMETHING')), + Token(73, 73, ' ', None), + Token(74, 77, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(78, 78, ' ', None), + Token(79, 83, 'ELSE+', LicenseSymbol(key='ELSE+')), + Token(84, 84, ' ', None), + Token(85, 86, 'or', Keyword(value='or', type=TOKEN_OR)), + Token(87, 87, ' ', None), + Token(88, 91, 'LGPL', LicenseSymbol(key='LGPL')), + Token(92, 92, ' ', None), + Token(93, 95, '2.1', LicenseSymbol(key='2.1')), + Token(96, 96, ' ', None), + Token(97, 99, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(100, 100, ' ', None), + Token(101, 107, 'GPL-2.0', LicenseSymbol(key='GPL-2.0')), + Token(108, 108, ' ', None), + Token(109, 110, 'or', Keyword(value='or', type=TOKEN_OR)), + Token(111, 111, ' ', None), + Token(112, 116, 'LATER', LicenseSymbol(key='LATER')), + Token(117, 117, ' ', None), + Token(118, 121, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(122, 122, ' ', None), + Token(123, 123, '(', Keyword(value='(', type=TOKEN_LPAR)), + Token(124, 132, 'Classpath', LicenseSymbol(key='Classpath')), + Token(133, 133, ' ', None), + Token(134, 142, 'Exception', LicenseSymbol(key='Exception')), + Token(143, 143, ' ', None), + Token(144, 146, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(147, 147, ' ', None), + Token(148, 150, 'mit', LicenseSymbol(key='mit')), + Token(151, 151, ' ', None), + Token(152, 153, 'or', 
Keyword(value='or', type=TOKEN_OR)),
+            Token(154, 154, ' ', None),
+            Token(155, 159, 'later', LicenseSymbol(key='later')),
+            Token(160, 160, ')', Keyword(value=')', type=TOKEN_RPAR)),
+            Token(161, 161, ' ', None),
+            Token(162, 163, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(164, 164, ' ', None),
+            Token(165, 168, 'LGPL', LicenseSymbol(key='LGPL')),
+            Token(169, 169, ' ', None),
+            Token(170, 172, '2.1', LicenseSymbol(key='2.1')),
+            Token(173, 173, ' ', None),
+            Token(174, 175, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(176, 176, ' ', None),
+            Token(177, 179, 'mit', LicenseSymbol(key='mit')),
+            Token(180, 180, ' ', None),
+            Token(181, 182, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(183, 183, ' ', None),
+            Token(184, 190, 'GPL-2.0', LicenseSymbol(key='GPL-2.0')),
+            Token(191, 191, ' ', None),
+            Token(192, 193, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(194, 194, ' ', None),
+            Token(195, 199, 'LATER', LicenseSymbol(key='LATER')),
+            Token(200, 200, ' ', None),
+            Token(201, 204, 'with', Keyword(value='with', type=TOKEN_WITH)),
+            Token(205, 205, ' ', None),
+            Token(206, 214, 'SOMETHING', LicenseSymbol(key='SOMETHING')),
+            Token(215, 215, ' ', None),
+            Token(216, 219, 'with', Keyword(value='with', type=TOKEN_WITH)),
+            Token(220, 220, ' ', None),
+            Token(221, 225, 'ELSE+', LicenseSymbol(key='ELSE+')),
+            Token(226, 226, ' ', None),
+            Token(227, 229, 'and', Keyword(value='and', type=TOKEN_AND)),
+            Token(230, 230, ' ', None),
+            Token(231, 234, 'lgpl', LicenseSymbol(key='lgpl')),
+            Token(235, 235, ' ', None),
+            Token(236, 238, '2.1', LicenseSymbol(key='2.1'))
         ]

         assert expected == results

@@ -1422,103 +1787,82 @@ def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self):
             'mit or later or LGPL 2.1 or mit or GPL-2.0 or LATER '
             'with mitthing with ELSE+ and lgpl 2.1 or gpl-2.0')

-        # fist scan
-        scanner = licensing.get_scanner()
-        result = list(scanner.scan(expr))
-
-        WITH_KW = Keyword(value=' with ', type=10)
-        AND_KW = Keyword(value=' and ', type=1)
-        OR_KW = Keyword(value=' or ', type=2)
-
+        # first tokenize
+        tokenizer = licensing.get_advanced_tokenizer()
+        result = list(tokenizer.tokenize(expr))
         expected = [
-            Result(0, 0, ' ', None),
-            Result(1, 16, 'GPL-2.0 or later', Output('GPL-2.0 or LATER', gpl2plus, 1)),
-            Result(17, 22, ' with ', Output(' with ', WITH_KW, 0)),
-            Result(23, 41, 'classpath Exception', Output('classpath Exception', cpex, 1)),
-            Result(42, 46, ' and ', Output(' and ', AND_KW, 0)),
-            Result(47, 49, 'mit', Output('mit', mit, 1)),
-            Result(50, 54, ' and ', Output(' and ', AND_KW, 0)),
-            Result(55, 57, 'mit', Output('mit', mit, 1)),
-            Result(58, 63, ' with ', Output(' with ', WITH_KW, 0)),
-            Result(64, 82, 'mitthing with ELSE+', Output('mitthing with else+', mitthing_with_else, 1)),
-            Result(83, 86, ' or ', Output(' or ', OR_KW, 0)),
-            Result(87, 94, 'LGPL 2.1', Output('LGPL 2.1', lgpl, 1)),
-            Result(95, 99, ' and ', Output(' and ', AND_KW, 0)),
-            Result(100, 115, 'GPL-2.0 or LATER', Output('GPL-2.0 or LATER', gpl2plus, 1)),
-            Result(116, 121, ' with ', Output(' with ', WITH_KW, 0)),
-            Result(122, 140, 'Classpath Exception', Output('classpath Exception', cpex, 1)),
-            Result(141, 145, ' and ', Output(' and ', AND_KW, 0)),
-            Result(146, 157, 'mit or later', Output('mit or later', mitplus, 1)),
-            Result(158, 161, ' or ', Output(' or ', OR_KW, 0)),
-            Result(162, 169, 'LGPL 2.1', Output('LGPL 2.1', lgpl, 1)),
-            Result(170, 173, ' or ', Output(' or ', OR_KW, 0)),
-            Result(174, 176, 'mit', Output('mit', mit, 1)),
-            Result(177, 180, ' or ', Output(' 
or ', OR_KW, 0)), - Result(181, 196, 'GPL-2.0 or LATER', Output('GPL-2.0 or LATER', gpl2plus, 1)), - Result(197, 202, ' with ', Output(' with ', WITH_KW, 0)), - Result(203, 221, 'mitthing with ELSE+', Output('mitthing with else+', mitthing_with_else, 1)), - Result(222, 226, ' and ', Output(' and ', AND_KW, 0)), - Result(227, 234, 'lgpl 2.1', Output('LGPL 2.1', lgpl, 1)), - Result(235, 238, ' or ', Output(' or ', OR_KW, 0)), - Result(239, 245, 'gpl-2.0', Output('GPL-2.0', gpl2, 1)) + Token(1, 16, 'GPL-2.0 or later', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(18, 21, 'with', Keyword(value=u'with', type=10)), + Token(23, 41, 'classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True)), + Token(43, 45, 'and', Keyword(value=u'and', type=1)), + Token(47, 49, 'mit', LicenseSymbol(u'mit')), + Token(51, 53, 'and', Keyword(value=u'and', type=1)), + Token(55, 57, 'mit', LicenseSymbol(u'mit')), + Token(59, 62, 'with', Keyword(value=u'with', type=10)), + Token(64, 82, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+')), + Token(84, 85, 'or', Keyword(value=u'or', type=2)), + Token(87, 94, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')), + Token(96, 98, 'and', Keyword(value=u'and', type=1)), + Token(100, 115, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(117, 120, 'with', Keyword(value=u'with', type=10)), + Token(122, 140, 'Classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True)), + Token(142, 144, 'and', Keyword(value=u'and', type=1)), + Token(146, 157, 'mit or later', LicenseSymbol(u'mit or later')), + Token(159, 160, 'or', Keyword(value=u'or', type=2)), + Token(162, 169, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')), + Token(171, 172, 'or', Keyword(value=u'or', type=2)), + Token(174, 176, 'mit', LicenseSymbol(u'mit')), + Token(178, 179, 'or', Keyword(value=u'or', type=2)), + Token(181, 196, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(198, 201, 'with', Keyword(value=u'with', type=10)), + Token(203, 221, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+')), + Token(223, 225, 'and', Keyword(value=u'and', type=1)), + Token(227, 234, 'lgpl 2.1', LicenseSymbol(u'LGPL 2.1')), + Token(236, 237, 'or', Keyword(value=u'or', type=2)), + Token(239, 245, 'gpl-2.0', LicenseSymbol(u'GPL-2.0')) ] assert expected == result - assert 246 == expected[-1].end + 1 - assert 246 == sum(len(r.string) for r in result) - - # skip spaces - result = list(strip_and_skip_spaces(result)) - # here only the first token is a space - assert expected[1:] == result - - # group results - - gpl2pluso = Output('GPL-2.0 or LATER', LicenseSymbol('GPL-2.0 or LATER', is_exception=False), 1) - cpex0 = Output('classpath Exception', LicenseSymbol('classpath Exception', is_exception=True), 1) - mito = Output('mit', LicenseSymbol('mit', is_exception=False), 1) - mieo1 = Output('mitthing with else+', LicenseSymbol('mitthing with else+', is_exception=False), 1) - lgplo = Output('LGPL 2.1', LicenseSymbol('LGPL 2.1', is_exception=False), 1) - mitoo = Output('mit or later', LicenseSymbol('mit or later', is_exception=False), 1) - gpl202 = Output('GPL-2.0', LicenseSymbol('GPL-2.0', is_exception=False), 1) - - with_kw = Output(' with ', WITH_KW, 0) - and_kw = Output(' and ', AND_KW, 0) - or_kw = Output(' or ', OR_KW, 0) expected_groups = [ - (Result(1, 16, 'GPL-2.0 or later', gpl2pluso), - Result(17, 22, ' with ', with_kw), - Result(23, 41, 'classpath Exception', cpex0)), - (Result(42, 46, ' and ', and_kw),), - (Result(47, 49, 'mit', mito),), - (Result(50, 54, ' and ', 
and_kw),), - (Result(55, 57, 'mit', mito), - Result(58, 63, ' with ', with_kw), - Result(64, 82, 'mitthing with ELSE+', mieo1)), - (Result(83, 86, ' or ', or_kw),), - (Result(87, 94, 'LGPL 2.1', lgplo),), - (Result(95, 99, ' and ', and_kw),), - (Result(100, 115, 'GPL-2.0 or LATER', gpl2pluso), - Result(116, 121, ' with ', with_kw), - Result(122, 140, 'Classpath Exception', cpex0)), - (Result(141, 145, ' and ', and_kw),), - (Result(146, 157, 'mit or later', mitoo),), - (Result(158, 161, ' or ', or_kw),), - (Result(162, 169, 'LGPL 2.1', lgplo),), - (Result(170, 173, ' or ', or_kw),), - (Result(174, 176, 'mit', mito),), - (Result(177, 180, ' or ', or_kw),), - (Result(181, 196, 'GPL-2.0 or LATER', gpl2pluso), - Result(197, 202, ' with ', with_kw), - Result(203, 221, 'mitthing with ELSE+', mieo1)), - (Result(222, 226, ' and ', and_kw),), - (Result(227, 234, 'lgpl 2.1', lgplo),), - (Result(235, 238, ' or ', or_kw),), - (Result(239, 245, 'gpl-2.0', gpl202),) + (Token(1, 16, 'GPL-2.0 or later', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(18, 21, 'with', Keyword(value=u'with', type=10)), + Token(23, 41, 'classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True))), + + (Token(43, 45, 'and', Keyword(value=u'and', type=1)),), + (Token(47, 49, 'mit', LicenseSymbol(u'mit')),), + (Token(51, 53, 'and', Keyword(value=u'and', type=1)),), + + (Token(55, 57, 'mit', LicenseSymbol(u'mit')), + Token(59, 62, 'with', Keyword(value=u'with', type=10)), + Token(64, 82, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+'))), + + (Token(84, 85, 'or', Keyword(value=u'or', type=2)),), + (Token(87, 94, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')),), + (Token(96, 98, 'and', Keyword(value=u'and', type=1)),), + + (Token(100, 115, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(117, 120, 'with', Keyword(value=u'with', type=10)), + Token(122, 140, 'Classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True))), + + (Token(142, 144, 'and', Keyword(value=u'and', type=1)),), + (Token(146, 157, 'mit or later', LicenseSymbol(u'mit or later')),), + (Token(159, 160, 'or', Keyword(value=u'or', type=2)),), + (Token(162, 169, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')),), + (Token(171, 172, 'or', Keyword(value=u'or', type=2)),), + (Token(174, 176, 'mit', LicenseSymbol(u'mit')),), + (Token(178, 179, 'or', Keyword(value=u'or', type=2)),), + + (Token(181, 196, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(198, 201, 'with', Keyword(value=u'with', type=10)), + Token(203, 221, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+'))), + + (Token(223, 225, 'and', Keyword(value=u'and', type=1)),), + (Token(227, 234, 'lgpl 2.1', LicenseSymbol(u'LGPL 2.1')),), + (Token(236, 237, 'or', Keyword(value=u'or', type=2)),), + (Token(239, 245, 'gpl-2.0', LicenseSymbol(u'GPL-2.0')),) ] - - result_groups = list(group_results_for_with_subexpression(result)) + result_groups = list(build_token_groups_for_with_subexpression(result)) assert expected_groups == result_groups # finally retest it all with tokenize @@ -1530,25 +1874,25 @@ def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self): expected = [ (gpl2plus_with_cpex, 'GPL-2.0 or later with classpath Exception', 1), - (TOKEN_AND, ' and ', 42), + (TOKEN_AND, 'and', 43), (mit, 'mit', 47), - (TOKEN_AND, ' and ', 50), + (TOKEN_AND, 'and', 51), (mit_with_mitthing_with_else, 'mit with mitthing with ELSE+', 55), - (TOKEN_OR, ' or ', 83), + (TOKEN_OR, 'or', 84), (lgpl, 'LGPL 2.1', 87), - (TOKEN_AND, ' and ', 95), + 
(TOKEN_AND, 'and', 96),
             (gpl2plus_with_cpex, 'GPL-2.0 or LATER with Classpath Exception', 100),
-            (TOKEN_AND, ' and ', 141),
+            (TOKEN_AND, 'and', 142),
             (mitplus, 'mit or later', 146),
-            (TOKEN_OR, ' or ', 158),
+            (TOKEN_OR, 'or', 159),
             (lgpl, 'LGPL 2.1', 162),
-            (TOKEN_OR, ' or ', 170),
+            (TOKEN_OR, 'or', 171),
             (mit, 'mit', 174),
-            (TOKEN_OR, ' or ', 177),
+            (TOKEN_OR, 'or', 178),
             (gpl2plus_with_someplus, 'GPL-2.0 or LATER with mitthing with ELSE+', 181),
-            (TOKEN_AND, ' and ', 222),
+            (TOKEN_AND, 'and', 223),
             (lgpl, 'lgpl 2.1', 227),
-            (TOKEN_OR, ' or ', 235),
+            (TOKEN_OR, 'or', 236),
             (gpl2, 'gpl-2.0', 239),
         ]

@@ -1661,3 +2005,76 @@ def __init__(self, key, is_exception=False):

         expected = [l1, lx, lx2, lx3, l3, l2, l4]
         assert expected == sorted([l4, l3, l2, l1, lx , lx2, lx3])
+
+
+class MockLicensesTest(TestCase):
+
+    def test_licensing_can_use_mocklicense_tuple(self):
+        MockLicense = namedtuple('MockLicense', 'key aliases is_exception')
+
+        licenses = [
+            MockLicense('gpl-2.0', ['GPL-2.0'], False),
+            MockLicense('classpath-2.0', ['Classpath-Exception-2.0'], True),
+            MockLicense('gpl-2.0-plus', ['GPL-2.0-or-later', 'GPL-2.0 or-later'], False),
+            MockLicense('lgpl-2.1-plus', ['LGPL-2.1-or-later'], False),
+        ]
+        licensing = Licensing(licenses)
+
+        ex1 = '(GPL-2.0-or-later with Classpath-Exception-2.0 or GPL-2.0 or-later) and LGPL-2.1-or-later'
+        expression1 = licensing.parse(ex1, validate=False, strict=False)
+        assert ['gpl-2.0-plus', 'classpath-2.0', 'lgpl-2.1-plus'] == licensing.license_keys(expression1)
+
+        ex2 = 'LGPL-2.1-or-later and (GPL-2.0-or-later oR GPL-2.0-or-later with Classpath-Exception-2.0)'
+        expression2 = licensing.parse(ex2, validate=True, strict=False)
+
+        ex3 = 'LGPL-2.1-or-later and (GPL-2.0-or-later oR GPL-2.0-or-later)'
+        expression3 = licensing.parse(ex3, validate=True, strict=False)
+
+        self.assertTrue(licensing.is_equivalent(expression1, expression2))
+        self.assertTrue(licensing.is_equivalent(expression2, expression1))
+        self.assertFalse(licensing.is_equivalent(expression1, expression3))
+        self.assertFalse(licensing.is_equivalent(expression2, expression3))
+
+    def test_and_and_or_is_invalid(self):
+        expression = 'gpl-2.0 with classpath and and or gpl-2.0-plus'
+        licensing = Licensing()
+        try:
+            licensing.parse(expression)
+            self.fail('Exception not raised')
+        except ParseError as pe:
+            expected = {
+                'error_code': PARSE_INVALID_OPERATOR_SEQUENCE,
+                'position': 27,
+                'token_string': 'and',
+                'token_type': TOKEN_AND}
+            assert expected == _parse_error_as_dict(pe)
+
+    def test_or_or_is_invalid(self):
+        expression = 'gpl-2.0 with classpath or or or or gpl-2.0-plus'
+        licensing = Licensing()
+        try:
+            licensing.parse(expression)
+            self.fail('Exception not raised')
+        except ParseError as pe:
+            expected = {
+                'error_code': PARSE_INVALID_OPERATOR_SEQUENCE,
+                'position': 26,
+                'token_string': 'or',
+                'token_type': TOKEN_OR}
+            assert expected == _parse_error_as_dict(pe)
+
+    def test_tokenize_or_or(self):
+        expression = 'gpl-2.0 with classpath or or or gpl-2.0-plus'
+        licensing = Licensing()
+        results = list(licensing.tokenize(expression))
+        expected = [
+            (LicenseWithExceptionSymbol(
+                license_symbol=LicenseSymbol(u'gpl-2.0'),
+                exception_symbol=LicenseSymbol(u'classpath')), 'gpl-2.0 with classpath', 0),
+            (2, 'or', 23),
+            (2, 'or', 26),
+            (2, 'or', 29),
+            (LicenseSymbol(u'gpl-2.0-plus'), 'gpl-2.0-plus', 32)
+        ]
+
+        assert expected == results
diff --git a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl b/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl
deleted file mode 100644
index 
0710ef9..0000000 Binary files a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl and /dev/null differ diff --git a/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl new file mode 100644 index 0000000..72f8d99 Binary files /dev/null and b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl differ diff --git a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl.ABOUT b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl.ABOUT similarity index 55% rename from thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl.ABOUT rename to thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl.ABOUT index 9dd54ae..1884a86 100644 --- a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl.ABOUT +++ b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl.ABOUT @@ -1,17 +1,16 @@ -about_resource: aboutcode_toolkit-3.0.2-py2.py3-none-any.whl -checksum_md5: 7423e283e7c50979313f225065a5fea5 -checksum_sha1: 789d5d29437a11e8119da354a77218190d597d6d +about_resource: aboutcode_toolkit-3.1.1-py2.py3-none-any.whl +checksum_md5: 67e1f793b8421ce60800897bb5b9446d +checksum_sha1: 26466b098411fcce12efac48bb9098cdf4a83573 contact: http://www.nexb.com/contactus.html copyright: Copyright (c) 2013-2017 nexB Inc. description: AboutCode Toolkit is a tool to process ABOUT files. An ABOUT file provides a simple way to document the provenance (origin and license) 'about' a software component. This is a small text file stored in the codebase side-by-side with the documented software component. -download_url: https://pypi.python.org/packages/11/7c/07b565c8a66f8846dab007ad80e31078f15034981dcb7c5e26dd985e3f4a/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl#md5=7423e283e7c50979313f225065a5fea5 +download_url: https://files.pythonhosted.org/packages/48/05/c9dd903c5c6e0f06ec813a9911b27b252e3803fbd97ffa375d909694e26d/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl#sha256=68b2fd1d05dd0dbc8acc91e7bf1b676e43804ea631bab490d0b46ae0b65e51b5 homepage_url: https://aboutcode.org license_expression: apache-2.0 name: AboutCode toolkit -notice_file: NOTICE owner: nexB owner_url: http://www.nexb.com/ -version: 3.0.2 +version: 3.1.1 diff --git a/thirdparty/dev/more-itertools-py2.ABOUT b/thirdparty/dev/more-itertools-py2.ABOUT index b8af58d..6317114 100644 --- a/thirdparty/dev/more-itertools-py2.ABOUT +++ b/thirdparty/dev/more-itertools-py2.ABOUT @@ -8,5 +8,5 @@ description: More routines for operating on iterables, beyond itertools homepage_url: https://github.com/erikrose/more-itertools owner: Erik Rose license_expression: mit -notice_file: more_itertools.NOTICE +notice_file: more-itertools.NOTICE copyright: Copyright (c) 2012 Erik Rose diff --git a/thirdparty/dev/more-itertools-py3.ABOUT b/thirdparty/dev/more-itertools-py3.ABOUT index 8145db2..304fc26 100644 --- a/thirdparty/dev/more-itertools-py3.ABOUT +++ b/thirdparty/dev/more-itertools-py3.ABOUT @@ -8,5 +8,5 @@ description: More routines for operating on iterables, beyond itertools homepage_url: https://github.com/erikrose/more-itertools owner: Erik Rose license_expression: mit -notice_file: more_itertools.NOTICE +notice_file: more-itertools.NOTICE copyright: Copyright (c) 2012 Erik Rose diff --git a/thirdparty/prod/boolean.py-3.5-py2.py3-none-any.whl b/thirdparty/prod/boolean.py-3.5-py2.py3-none-any.whl deleted file mode 100644 index 5fe68e5..0000000 Binary files a/thirdparty/prod/boolean.py-3.5-py2.py3-none-any.whl and /dev/null differ diff --git 
a/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl new file mode 100644 index 0000000..b1f5579 Binary files /dev/null and b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl differ diff --git a/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.ABOUT b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.ABOUT new file mode 100644 index 0000000..3df1f3a --- /dev/null +++ b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.ABOUT @@ -0,0 +1,15 @@ +about_resource: boolean.py-3.6-py2.py3-none-any.whl +attribute: true +checksum_md5: da39999eb131b589e84ad935dc4ca642 +checksum_sha1: d31b55e7ad2ee917232b3213afe3ae9678156a9f +copyright: Copyright (c) 2009-2016 Sebastian Kraemer, basti.kr@gmail.com and others +description: Implements boolean algebra in one module. +download_url: https://files.pythonhosted.org/packages/9b/27/d22062a221010e17935237ba4b574cd828238ea02e0765337c238466a512/boolean.py-3.6-py2.py3-none-any.whl +homepage_url: https://github.com/bastikr/boolean.py +license_expression: bsd-simplified +license_file: bsd-simplified.LICENSE +name: boolean.py +notice_file: boolean.py-3.6-py2.py3-none-any.whl.NOTICE +notice_url: https://github.com/bastikr/boolean.py/blob/master/LICENSE.txt +owner: Sebastian Kraemer +version: '3.6' diff --git a/thirdparty/prod/boolean.py.LICENSE b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.NOTICE similarity index 93% rename from thirdparty/prod/boolean.py.LICENSE rename to thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.NOTICE index a0c637f..8819ea1 100644 --- a/thirdparty/prod/boolean.py.LICENSE +++ b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.NOTICE @@ -1,23 +1,23 @@ -Copyright (c) 2009-2016 Sebastian Kraemer, basti.kr@gmail.com and others -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +Copyright (c) 2009-2017 Sebastian Kraemer, basti.kr@gmail.com +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/thirdparty/prod/boolean.py.ABOUT b/thirdparty/prod/boolean.py.ABOUT deleted file mode 100644 index 31eb628..0000000 --- a/thirdparty/prod/boolean.py.ABOUT +++ /dev/null @@ -1,11 +0,0 @@ -about_resource: boolean.py-3.5-py2.py3-none-any.whl -version: 3.5 -download_url: https://pypi.python.org/packages/80/f3/0508ae7ba76b02f7fd666b705766edc1863fc8ef29d0519b4c95d60ab1bb/boolean.py-3.5-py2.py3-none-any.whl#md5=cf90b0c0530663bbf71a53fb58f6fa72 - -name: boolean.py - -copyright: Copyright (c) 2009-2016 Sebastian Kraemer, basti.kr@gmail.com and others -license_expression: bsd-simplified -license_file: boolean.py.LICENSE - -homepage_url: https://github.com/bastikr/boolean.py diff --git a/thirdparty/prod/bsd-simplified.LICENSE b/thirdparty/prod/bsd-simplified.LICENSE new file mode 100644 index 0000000..d99a0b1 --- /dev/null +++ b/thirdparty/prod/bsd-simplified.LICENSE @@ -0,0 +1,20 @@ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
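
Usage sketch, not part of the patch: the test hunks above replace the old
scanner/Result/Output API with an advanced tokenizer that yields Token
objects. A minimal exercise of that API, assuming the post-patch package is
installed, could look like the snippet below. The names used (Licensing,
LicenseSymbol, get_advanced_tokenizer, tokenize, LicenseWithExceptionSymbol)
all appear in this diff; the sample license keys are illustrative only.

    from license_expression import Licensing
    from license_expression import LicenseSymbol

    # Known symbols, comparable to the MockLicense namedtuples in the tests.
    licensing = Licensing([
        LicenseSymbol('gpl-2.0'),
        LicenseSymbol('classpath', is_exception=True),
        LicenseSymbol('mit'),
    ])

    expression = 'gpl-2.0 with classpath or mit'

    # Low-level, position-aware tokenization, as in the step-by-step test:
    # each Token carries start/end positions, the matched string and a
    # resolved value (a LicenseSymbol or an operator Keyword).
    tokenizer = licensing.get_advanced_tokenizer()
    for token in tokenizer.tokenize(expression):
        print(token)

    # The higher-level tokenize() yields (value, string, position) triples
    # and groups "with" sub-expressions into a LicenseWithExceptionSymbol,
    # as asserted in test_tokenize_or_or above.
    for value, string, position in licensing.tokenize(expression):
        print(position, string, value)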