diff --git a/.travis.yml b/.travis.yml index a979755..1fe01c4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,40 @@ -# This deliberately is not "python" as a work-around to support -# multi-os builds with custom Python versions in Travis CI. -language: cpp - -os: - - osx - - linux +language: python env: matrix: - - PYTHON_EXE="`pyenv install -s 2.7.13 && pyenv local 2.7.13`" - - PYTHON_EXE="`pyenv install -s 3.5.3 && pyenv local 3.5.3`" + - PYTHON_EXE="`pyenv install -s 2.7.14 && pyenv local 2.7.14`" - PYTHON_EXE="`pyenv install -s 3.6.1 && pyenv local 3.6.1`" + +# Travis does not offer OSX with arbitrary python versions (like 2.7.13 above) +# So, you cannot simply have the following section in your build matrix: +# os: +# - linux +# - osx +# Instead, you have to include OSX entries into the build matrix manually. +# In particular, this means specifying the environment variables again. + +# The following was adapted from here: +# https://docs.travis-ci.com/user/multi-os/ +# Set `language: generic` to clear `language: python` from above +# Set `python:` (to empty) to clear it from the travis-ci web interface +# Set `osx_image: xcode7.3` to pin OSX version see here: +# https://docs.travis-ci.com/user/osx-ci-environment/ + +matrix: + include: + - os: osx + language: generic + python: + osx_image: xcode7.3 + env: PYTHON_EXE="`pyenv install -s 2.7.14 && pyenv local 2.7.14`" + - os: osx + language: generic + python: + osx_image: xcode7.3 + env: PYTHON_EXE="`pyenv install -s 3.6.1 && pyenv local 3.6.1`" + + install: - pyenv install --list - ./configure @@ -32,4 +55,4 @@ notifications: use_notice: true skip_join: true template: - - "%{repository_slug}#%{build_number} (%{branch} - %{commit} : %{author}): %{message} : %{build_url}" + - "%{repository_slug}#%{build_number} (%{branch}-%{commit}:%{author})-%{message}- %{build_url}" diff --git a/README.rst b/README.rst index 77f24fb..6b09dd3 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ license-expression 
license-expression is a small utility library to parse, compare, simplify and normalize license expressions (e.g. SPDX license expressions) using boolean logic such as: -`GPL-2.0 or later WITH Classpath Exception AND MIT`. +`GPL-2.0-or-later WITH Classpath-Exception AND MIT`. See also for details: https://spdx.org/sites/cpstandard/files/pages/files/spdxversion2.1.pdf#page=95&zoom=auto @@ -102,27 +102,27 @@ And expression can be simplified: .. code-block:: python - >>> expression2 = ' GPL-2.0 or (mit and LGPL 2.1) or bsd Or GPL-2.0 or (mit and LGPL 2.1)' + >>> expression2 = ' GPL-2.0 or (mit and LGPL-2.1) or bsd Or GPL-2.0 or (mit and LGPL-2.1)' >>> parsed2 = licensing.parse(expression2) - >>> assert str(parsed2.simplify()) == 'BSD OR GPL-2.0 OR (LGPL 2.1 AND mit)' + >>> assert str(parsed2.simplify()) == 'BSD OR GPL-2.0 OR (LGPL-2.1 AND mit)' Two expressions can be compared for equivalence and containment: .. code-block:: python - >>> expr1 = licensing.parse(' GPL-2.0 or (LGPL 2.1 and mit) ') - >>> expr2 = licensing.parse(' (mit and LGPL 2.1) or GPL-2.0 ') + >>> expr1 = licensing.parse(' GPL-2.0 or (LGPL-2.1 and mit) ') + >>> expr2 = licensing.parse(' (mit and LGPL-2.1) or GPL-2.0 ') >>> licensing.is_equivalent(expr1, expr2) True - >>> licensing.is_equivalent(' GPL-2.0 or (LGPL 2.1 and mit) ', - ... ' (mit and LGPL 2.1) or GPL-2.0 ') + >>> licensing.is_equivalent(' GPL-2.0 or (LGPL-2.1 and mit) ', + ... 
' (mit and LGPL-2.1) or GPL-2.0 ') True >>> expr1.simplify() == expr2.simplify() True - >>> expr3 = licensing.parse(' GPL-2.0 or mit or LGPL 2.1') + >>> expr3 = licensing.parse(' GPL-2.0 or mit or LGPL-2.1') >>> licensing.is_equivalent(expr2, expr3) False - >>> expr4 = licensing.parse('mit and LGPL 2.1') + >>> expr4 = licensing.parse('mit and LGPL-2.1') >>> expr4.simplify() in expr2.simplify() True >>> licensing.contains(expr2, expr4) diff --git a/configure b/configure index 8ceb9d6..4f9fdcc 100755 --- a/configure +++ b/configure @@ -16,17 +16,15 @@ CONF_DEFAULT="etc/conf/dev" CFG_CMD_LINE_ARGS="$@" -if [ "$1" == "--init" ]; then - CFG_CMD_LINE_ARGS=$CONF_INIT -fi - -if [ "$1" == "" ]; then +if [[ "$1" == "" ]]; then # default for dev conf if not argument is provided CFG_CMD_LINE_ARGS=$CONF_DEFAULT fi -if [ "$PYTHON_EXE" == "" ]; then +CONFIGURE_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [[ "$PYTHON_EXE" == "" ]]; then PYTHON_EXE=python fi -$PYTHON_EXE etc/configure.py $CFG_CMD_LINE_ARGS +$PYTHON_EXE "$CONFIGURE_ROOT_DIR/etc/configure.py" $CFG_CMD_LINE_ARGS diff --git a/setup.py b/setup.py index ea2eae5..38f993d 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name='license-expression', - version='0.98', + version='0.99', license='apache-2.0', description=desc, long_description=desc, diff --git a/src/license_expression/__init__.py b/src/license_expression/__init__.py index 43c7b2c..a0434da 100644 --- a/src/license_expression/__init__.py +++ b/src/license_expression/__init__.py @@ -41,6 +41,7 @@ unicode = str # NOQA import collections +from collections import OrderedDict from copy import copy from copy import deepcopy from functools import total_ordering @@ -66,10 +67,6 @@ from boolean.boolean import TOKEN_LPAR from boolean.boolean import TOKEN_RPAR -from license_expression._pyahocorasick import Trie as Scanner -from license_expression._pyahocorasick import Output -from license_expression._pyahocorasick import Result - # append 
new error codes to PARSE_ERRORS by monkey patching PARSE_EXPRESSION_NOT_UNICODE = 100 if PARSE_EXPRESSION_NOT_UNICODE not in PARSE_ERRORS: @@ -179,12 +176,21 @@ def __init__(self, symbols=tuple(), quiet=True): raise ValueError('\n'.join(warns + errors)) # mapping of known symbol used for parsing and resolution as (key, symbol) - # TODO: inject lpar, rpar and spaces sourround, before and after - # e.g "(sym)" "(sym " "sym)" " sym " - self.known_symbols = {symbol.key: symbol for symbol in symbols} + self.known_symbols = known_symbols = {} + # mapping of {key or alias all lowercase -> known symbol} used for + # parsing and resolution + self.symbols_by_key = symbols_by_key = {} - # Aho-Corasick automaton-based Scanner used for expression tokenizing - self.scanner = None + for symbol in symbols: + known_symbols[symbol.key] = symbol + symbols_by_key[symbol.key.lower()] = symbol + aliases = getattr(symbol, 'aliases', []) + for alias in aliases: + # normalize spaces for each alias: we ignore aliases with spaces + alias = ' '.join(alias.split()) + if ' ' in alias: + continue + symbols_by_key[alias.lower()] = symbol def is_equivalent(self, expression1, expression2, **kwargs): """ @@ -415,14 +421,8 @@ def tokenize(self, expression, strict=False): such as "XXX with ZZZ" if the XXX symbol has is_exception` set to True or the ZZZ symbol has `is_exception` set to False. 
""" - if self.known_symbols: - # scan with an automaton, recognize whole symbols+keywords or only keywords - scanner = self.get_scanner() - results = scanner.scan(expression) - else: - # scan with a simple regex-based splitter - results = splitter(expression) - + # scan with a simple regex-based splitter and lookup keys and aliases in a table + results = splitter(expression, self.symbols_by_key) results = strip_and_skip_spaces(results) result_groups = group_results_for_with_subexpression(results) @@ -527,38 +527,6 @@ def tokenize(self, expression, strict=False): yield token, token_string, pos - def get_scanner(self): - """ - Return a scanner either cached or created as needed. If symbols were provided - when this Licensing object was created, the scanner will recognize known - symbols when tokenizing expressions. Otherwise, only keywords are recognized - and a license symbol is anything in between keywords. - """ - if self.scanner is not None: - return self.scanner - - self.scanner = scanner = Scanner(ignore_case=True) - - for keyword in _KEYWORDS: - scanner.add(keyword.value, keyword, priority=0) - - # self.known_symbols has been created at Licensing initialization time and is - # already validated and trusted here - for key, symbol in self.known_symbols.items(): - # always use the key even if there are no aliases. - scanner.add(key, symbol, priority=1) - aliases = getattr(symbol, 'aliases', []) - for alias in aliases: - # normalize spaces for each alias. 
The Scanner will lowercase them - since we created it with ignore_case=True - if alias: - alias = ' '.join(alias.split()) - if alias: - scanner.add(alias, symbol, priority=2) - - scanner.make_automaton() - return scanner - class Renderable(object): """ @@ -632,16 +600,20 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): 'A license key must be a unicode string: %(key)r' % locals()) key = key.strip() - if not key: raise ExpressionError( 'A license key cannot be blank: "%(key)s"' % locals()) - # note: key can contain spaces + # note: key CANNOT contain spaces + no_spaces_key = ''.join(key.split()) + if key != no_spaces_key: + raise ExpressionError( + 'A license key cannot contain spaces: "%(key)s"' % locals()) + if not is_valid_license_key(key): raise ExpressionError( 'Invalid license key: the valid characters are: letters and numbers, ' - 'underscore, dot or hyphen signs and spaces: "%(key)s"' % locals()) + 'underscore, dot/period and hyphen signs: "%(key)s"' % locals()) # normalize for spaces key = ' '.join(key.split()) @@ -1184,14 +1156,15 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): ).finditer -def splitter(expression): +def splitter(expression, known_symbols=None): """ - Return an iterable of Result describing each token given an + Return an iterable of Result describing each keyword given an expression unicode string. - This is a simpler tokenizer used when the Licensing does not have - known symbols. The split is done on spaces and parens. Anything else - is either a token or a symbol. + This is a simpler tokenizer used when the Licensing does not have known + symbols. The split is done on spaces and parens. Anything else is either a + keyword or a symbol. Symbols matched to a `known_symbols` mapping key + (ignoring case) are returned. Otherwise a new symbol object is created. 
""" if not expression: return @@ -1199,13 +1172,15 @@ def splitter(expression): if not isinstance(expression, str): raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE) - # mapping of lowercase token strings to a token type id - TOKENS = { + # mapping of lowercase keyword strings to a keywords type id + KEYWORDS = { 'and': Keyword(value='and', type=TOKEN_AND), 'or': Keyword(value='or', type=TOKEN_OR), 'with': Keyword(value='with', type=TOKEN_WITH), } + known_symbols = known_symbols or {} + for match in _splitter(expression): if not match: continue @@ -1226,18 +1201,91 @@ def splitter(expression): if rpar: yield Result(start, end, rpar, Output(rpar, KW_RPAR)) - token_or_sym = mgd.get('symbol') - if not token_or_sym: + keyword_or_symbol = mgd.get('symbol') + if not keyword_or_symbol: continue - token = TOKENS.get(token_or_sym.lower()) - if token: - yield Result(start, end, token_or_sym, Output(token_or_sym, token)) -# elif token_or_sym.endswith('+') and token_or_sym != '+': -# val = token_or_sym[:-1] -# sym = LicenseSymbol(key=val) -# yield Result(start, end - 1, val, Output(val, sym)) -# yield Result(end, end, '+', Output('+', KW_PLUS)) + kos_lower = keyword_or_symbol.lower() + keyword = KEYWORDS.get(kos_lower) + if keyword: + yield Result(start, end, keyword_or_symbol, Output(keyword_or_symbol, keyword)) else: - sym = LicenseSymbol(key=token_or_sym) - yield Result(start, end, token_or_sym, Output(token_or_sym, sym)) + # fetch a known symbol or build a new one + symbol = known_symbols.get(kos_lower) + if not symbol: + symbol = LicenseSymbol(key=keyword_or_symbol) + yield Result(start, end, keyword_or_symbol, Output(keyword_or_symbol, symbol)) + + +# FIXME: this needs to be simplified with no aho corasick in play +class Output(object): + """ + An Output is used to track a key added to the Trie as a TrieNode and any + arbitrary value object corresponding to that key. + + - `key` is the original key unmodified unicode string. 
+ - `value` is the associated value for this key as provided when adding this key. + """ + __slots__ = 'key', 'value', + + def __init__(self, key, value=None): + self.key = key + self.value = value + + def __repr__(self): + return self.__class__.__name__ + '(%(key)r, %(value)r)' % self.as_dict() + + def __eq__(self, other): + return ( + isinstance(other, Output) + and self.key == other.key + and self.value == other.value) + + def __hash__(self): + return hash((self.key, self.value,)) + + def as_dict(self): + return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) + + +class Result(object): + """ + A Result is used to track the result of a search with its start and end as + index position in the original string and other attributes: + + - `start` and `end` are zero-based index in the original string S such that + S[start:end+1] will yield `string`. + - `string` is the sub-string from the original searched string for this Result. + - `output` is the Output object for a matched string and is a marker that this is a + matched string. None otherwise for a Result for unmatched text. 
+ """ + + __slots__ = 'start', 'end', 'string', 'output' + + def __init__(self, start, end, string='', output=None): + self.start = start + self.end = end + self.string = string + self.output = output + + def __repr__(self): + return self.__class__.__name__ + '(%(start)r, %(end)r, %(string)r, %(output)r)' % self.as_dict() + + def as_dict(self): + return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) + + def __len__(self): + return self.end + 1 - self.start + + def __eq__(self, other): + return isinstance(other, Result) and ( + self.start == other.start and + self.end == other.end and + self.string == other.string and + self.output == other.output + ) + + def __hash__(self): + tup = self.start, self.end, self.string, self.output + return hash(tup) + diff --git a/src/license_expression/_pyahocorasick.ABOUT b/src/license_expression/_pyahocorasick.ABOUT deleted file mode 100644 index f57983a..0000000 --- a/src/license_expression/_pyahocorasick.ABOUT +++ /dev/null @@ -1,23 +0,0 @@ -about_resource: _pyahocorasick.py -download_url: https://github.com/WojciechMula/pyahocorasick/tree/ec2fb9cb393f571fd4316ea98ed7b65992f16127/py -name: pyahocorasick-python -version: ec2fb9 - -homepage_url: https://github.com/WojciechMula/pyahocorasick -license_expression: public-domain - -copyright: originally authored by Wojciech Mula, modified by the license_expression authors. - -notes: this is a vendored subset of the full pyahocorasick containing only the pure - python part with an implementation modified to return non-overlapping matches and - non-matches. - It has many limitation and in particular it does not pickle well and is much slower - than the full C-based implementation but is convenient to use as a vendored, pure - Python library. - -owner: nexB Inc. 
-author: Wojciech Mula http://0x80.pl/ - -vcs_tool: git -vcs_repository: https://github.com/WojciechMula/pyahocorasick.git - diff --git a/src/license_expression/_pyahocorasick.py b/src/license_expression/_pyahocorasick.py deleted file mode 100644 index 4c73709..0000000 --- a/src/license_expression/_pyahocorasick.py +++ /dev/null @@ -1,640 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Aho-Corasick string search algorithm. - -Original Author: Wojciech Muła, wojciech_mula@poczta.onet.pl -WWW : http://0x80.pl -License : public domain - -Modified for use in the license_expression library and in particular: - - add support for unicode key strinsg. - - rename word to key and output to value (to be more like a mapping/dict) - - case insensitive search - - improve returned results with the actual start,end and matched string. - - support returning non-matched parts of a string -""" - -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import print_function - -from collections import deque -from collections import OrderedDict -import logging - -logger = logging.getLogger(__name__) - - -def logger_debug(*args): - return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) - -# uncomment for local debug logging -# import sys -# logging.basicConfig(stream=sys.stdout) -# logger.setLevel(logging.DEBUG) - - -# used to distinguish from None -nil = object() - - -class Trie(object): - """ - A Trie and Aho-Corasick automaton. This behaves more or less like a mapping of - key->value. This is the main entry point. - """ - - def __init__(self, ignore_case=True): - """ - Initialize a new Trie. - - If `ignore_case` is True, searches in the Trie will be case insensitive. 
- """ - self.root = TrieNode('') - self.ignore_case = ignore_case - - # set of any unique character in the trie, updated on each addition - # we keep track of the set of chars added to the trie to build the automaton - # these are needed to created the first level children failure links - self._known_chars = set() - - # Flag set to True once a Trie has been converted to an Aho-Corasick automaton - self._converted = False - - def add(self, key, value=None, priority=0): - """ - Add a new (key string, value) pair to the trie. If the key already exists in - the Trie, its value is replaced with the provided value. - A key is any unicode string. - """ - if self._converted: - raise Exception('This Trie has been converted to an Aho-Corasick ' - 'automaton and cannot be further modified.') - if not key: - return - - stored_key = self.ignore_case and key.lower() or key - - # we keep track of the set of chars added to the trie to build the automaton - # these are needed to created the first level children failure links - self._known_chars.update(stored_key) - - node = self.root - for char in stored_key: - try: - node = node.children[char] - except KeyError: - child = TrieNode(char) - node.children[char] = child - node = child - - # we always store the original key, not a possibly lowercased version - node.output = Output(key, value, priority) - - def __get_node(self, key): - """ - Return a node for this key or None if the trie does not contain the key. - Private function retrieving a final node of trie for given key. - """ - key = self.ignore_case and key.lower() or key - node = self.root - for char in key: - try: - node = node.children[char] - except KeyError: - return None - return node - - def get(self, key, default=nil): - """ - Return the Output tuple associated with a `key`. - If there is no such key in the Trie, return the default value (other - than nil): if default is not given or nil, raise a KeyError exception. 
- """ - node = self.__get_node(key) - output = nil - if node: - output = node.output - - if output is nil: - if default is nil: - raise KeyError(key) - else: - return default - else: - return output - - def keys(self): - """ - Yield all keys stored in this trie. - """ - return (key for key, _ in self.items()) - - def values(self): - """ - Yield all values associated with keys stored in this trie. - """ - return (value for _, value in self.items()) - - def items(self): - """ - Yield tuple of all (key, value) stored in this trie. - """ - items = [] - - def walk(node, key): - """ - Walk the trie, depth first. - """ - key = key + node.char - if node.output is not nil: - items.append((node.output.key, node.output.value)) - - for child in node.children.values(): - if child is not node: - walk(child, key) - - walk(self.root, key='') - - return iter(items) - - def exists(self, key): - """ - Return True if the key is present in this trie. - """ - # TODO: add __contains__ magic for this - node = self.__get_node(key) - if node: - return bool(node.output != nil) - return False - - def is_prefix(self, key): - """ - Return True if key is a prefix of any existing key in the trie. - """ - return (self.__get_node(key) is not None) - - def make_automaton(self): - """ - Convert this trie to an Aho-Corasick automaton. - Note that this is an error to add new keys to a Trie once it has been - converted to an Automaton. - """ - queue = deque() - queue_append = queue.append - queue_popleft = queue.popleft - - # 1. create root children for each known items range (e.g. all unique - # characters from all the added keys), failing to root. - # And build a queue of these - for char in self._known_chars: - if char in self.root.children: - node = self.root.children[char] - # e.g. f(s) = 0, Aho-Corasick-wise - node.fail = self.root - queue_append(node) - else: - self.root.children[char] = self.root - - # 2. 
using the queue of all possible top level items/chars, walk the trie and - # add failure links to nodes as needed - while queue: - current_node = queue_popleft() - for node in current_node.children.values(): - queue_append(node) - state = current_node.fail - while node.char not in state.children: - state = state.fail - node.fail = state.children.get(node.char, self.root) - - # Mark the trie as converted so it cannot be modified anymore - self._converted = True - - def iter(self, string): - """ - Yield Result objects for matched strings by performing the Aho-Corasick search procedure. - - The Result start and end positions in the searched string are such that the - matched string is "search_string[start:end+1]". And the start is computed - from the end_index collected by the Aho-Corasick search procedure such that - "start=end_index - n + 1" where n is the length of a matched key. - - The Result.output is an Output object for a matched key. - - For example: - >>> a = Trie() - >>> a.add('BCDEF') - >>> a.add('CDE') - >>> a.add('DEFGH') - >>> a.add('EFGH') - >>> a.add('KL') - >>> a.make_automaton() - >>> string = 'abcdefghijklm' - >>> results = Result.sort(a.iter(string)) - - >>> expected = [ - ... Result(1, 5, 'bcdef', Output('BCDEF')), - ... Result(2, 4, 'cde', Output('CDE')), - ... Result(3, 7, 'defgh', Output('DEFGH')), - ... Result(4, 7, 'efgh', Output('EFGH')), - ... Result(10, 11, 'kl', Output('KL')), - ... 
] - >>> results == expected - True - - >>> list(a.iter('')) == [] - True - - >>> list(a.iter(' ')) == [] - True - """ - if not string: - return - - # keep a copy for results - original_string = string - string = self.ignore_case and string.lower() or string - - known_chars = self._known_chars - state = self.root - for end, char in enumerate(string): - if char not in known_chars: - state = self.root - continue - - # search for a matching character in the children, starting at root - while char not in state.children: - state = state.fail - # we have a matching starting character - state = state.children.get(char, self.root) - match = state - while match is not nil: - if match.output is not nil: - # TODO: this could be precomputed or cached - n = len(match.output.key) - start = end - n + 1 - yield Result(start, end, original_string[start:end + 1], match.output) - match = match.fail - - def scan(self, string): - """ - Scan a string for matched and unmatched sub-sequences and yield non- - overlapping Result objects performing a modified Aho-Corasick search - procedure: - - - return both matched and unmatched sub-sequences. - - do not return matches with positions that are contained or overlap with - another match: - - discard smaller matches contained in a larger match. - - when there is overlap (but not containment), the matches are sorted by - start and biggest length and then: - - we return the largest match of two overlaping matches - - if they have the same length, keep the match starting the earliest and - return the non-overlapping portion of the other discarded match as a - non-match. - - Each Result contains the start and end position, the corresponding string and - an Output object (with original key and any associated associated value). The - string and key are in their original case even if the automaton has the - `ignore_case` attribute. 
- - For example: - >>> a = Trie() - >>> a.add('BCDEF') - >>> a.add('CDE') - >>> a.add('DEFGH') - >>> a.add('EFGH') - >>> a.add('KL') - >>> a.make_automaton() - >>> string = 'abcdefghijkl' - >>> results = list(a.scan(string)) - - >>> expected = [ - ... Result(start=0, end=0, string='a', output=None), - ... Result(start=1, end=5, string='bcdef', output=Output('BCDEF')), - ... Result(start=6, end=9, string='ghij', output=None), - ... Result(start=10, end=11, string='kl', output=Output('KL')), - ... ] - - >>> results == expected - True - """ - results = self.iter(string) - results = filter_overlapping(results) - results = add_unmatched(string, results) - return results - - -class TrieNode(object): - """ - Node of the Trie/Aho-Corasick automaton. - """ - __slots__ = ['char', 'output', 'fail', 'children'] - - def __init__(self, char, output=nil): - # character of a key string added to the Trie - self.char = char - - # an output function (in the Aho-Corasick meaning) for this node: this is an - # Output object that contains the original key string and any additional - # value data associated to that key. Or "nil" for a node that is not a - # terminal leave for a key. It will be returned with a match. - self.output = output - - # failure link used by the Aho-Corasick automaton and its search procedure - self.fail = nil - - # children of this node as a mapping of char->node - self.children = {} - - def __repr__(self): - if self.output is not nil: - return 'TrieNode(%r, %r)' % (self.char, self.output) - else: - return 'TrieNode(%r)' % self.char - - -class Output(object): - """ - An Output is used to track a key added to the Trie as a TrieNode and any - arbitrary value object corresponding to that key. - - - `key` is the original key unmodified unicode string. - - `value` is the associated value for this key as provided when adding this key. - - `priority` is an optional priority for this key used to disambiguate overalpping matches. 
- """ - __slots__ = 'key', 'value', 'priority' - - def __init__(self, key, value=None, priority=0): - self.key = key - self.value = value - self.priority = priority - - def __repr__(self): - return self.__class__.__name__ + '(%(key)r, %(value)r, %(priority)r)' % self.as_dict() - - def __eq__(self, other): - return ( - isinstance(other, Output) - and self.key == other.key - and self.value == other.value - and self.priority == other.priority) - - def __hash__(self): - return hash((self.key, self.value, self.priority,)) - - def as_dict(self): - return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) - - -class Result(object): - """ - A Result is used to track the result of a search with its start and end as - index position in the original string and other attributes: - - - `start` and `end` are zero-based index in the original string S such that - S[start:end+1] will yield `string`. - - `string` is the sub-string from the original searched string for this Result. - - `output` is the Output object for a matched string and is a marker that this is a - matched string. None otherwise for a Result for unmatched text. 
- """ - - __slots__ = 'start', 'end', 'string', 'output' - - def __init__(self, start, end, string='', output=None): - self.start = start - self.end = end - self.string = string - self.output = output - - def __repr__(self): - return self.__class__.__name__ + '(%(start)r, %(end)r, %(string)r, %(output)r)' % self.as_dict() - - def as_dict(self): - return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) - - def __len__(self): - return self.end + 1 - self.start - - def __eq__(self, other): - return isinstance(other, Result) and ( - self.start == other.start and - self.end == other.end and - self.string == other.string and - self.output == other.output - ) - - def __hash__(self): - tup = self.start, self.end, self.string, self.output - return hash(tup) - - @property - def priority(self): - return getattr(self.output, 'priority', 0) - - def is_after(self, other): - """ - Return True if this result is after the other result. - - For example: - >>> Result(1, 2).is_after(Result(5, 6)) - False - >>> Result(5, 6).is_after(Result(5, 6)) - False - >>> Result(2, 3).is_after(Result(1, 2)) - False - >>> Result(5, 6).is_after(Result(3, 4)) - True - """ - return self.start > other.end - - def is_before(self, other): - return self.end < other.start - - def __contains__(self, other): - """ - Return True if this result contains the other result. - - For example: - >>> Result(5, 7) in Result(5, 7) - True - >>> Result(6, 8) in Result(5, 7) - False - >>> Result(6, 6) in Result(4, 8) - True - >>> Result(3, 9) in Result(4, 8) - False - >>> Result(4, 8) in Result(3, 9) - True - """ - return self.start <= other.start and other.end <= self.end - - def overlap(self, other): - """ - Return True if this result and the other result overlap. 
- - For example: - >>> Result(1, 2).overlap(Result(5, 6)) - False - >>> Result(5, 6).overlap(Result(5, 6)) - True - >>> Result(4, 5).overlap(Result(5, 6)) - True - >>> Result(4, 5).overlap(Result(5, 7)) - True - >>> Result(4, 5).overlap(Result(6, 7)) - False - """ - start = self.start - end = self.end - return (start <= other.start <= end) or (start <= other.end <= end) - - @classmethod - def sort(cls, results): - """ - Return a new sorted sequence of results given a sequence of results. The - primary sort is on start and the secondary sort is on longer lengths. - Therefore if two results have the same start, the longer result will sort - first. - - For example: - >>> results = [Result(0, 0), Result(5, 5), Result(1, 1), Result(2, 4), Result(2, 5)] - >>> expected = [Result(0, 0), Result(1, 1), Result(2, 5), Result(2, 4), Result(5, 5)] - >>> expected == Result.sort(results) - True - """ - key = lambda s: (s.start, -len(s),) - return sorted(results, key=key) - - -def filter_overlapping(results): - """ - Return a new list from an iterable of `results` discarding contained and - overlaping Results using these rules: - - - skip a result fully contained in another result. - - keep the biggest, left-most result of two overlapping results and skip the other - - For example: - >>> results = [ - ... Result(0, 0, 'a'), - ... Result(1, 5, 'bcdef'), - ... Result(2, 4, 'cde'), - ... Result(3, 7, 'defgh'), - ... Result(4, 7, 'efgh'), - ... Result(8, 9, 'ij'), - ... Result(10, 13, 'klmn'), - ... Result(11, 15, 'lmnop'), - ... Result(16, 16, 'q'), - ... ] - - >>> expected = [ - ... Result(0, 0, 'a'), - ... Result(1, 5, 'bcdef'), - ... Result(8, 9, 'ij'), - ... Result(11, 15, 'lmnop'), - ... Result(16, 16, 'q'), - ... 
] - - >>> filtered = list(filter_overlapping(results)) - >>> filtered == expected - True - """ - results = Result.sort(results) - - # compare pair of results in the sorted sequence: current and next - i = 0 - while i < len(results) - 1: - j = i + 1 - while j < len(results): - curr_res = results[i] - next_res = results[j] - - logger_debug('curr_res, i, next_res, j:', curr_res, i, next_res, j) - # disjoint results: break, there is nothing to do - if next_res.is_after(curr_res): - logger_debug(' break to next', curr_res) - break - - # contained result: discard the contained result - if next_res in curr_res: - logger_debug(' del next_res contained:', next_res) - del results[j] - continue - - # overlap: keep the biggest result and skip the smallest overlapping results - # in case of length tie: keep the left most - if curr_res.overlap(next_res): - if curr_res.priority < next_res.priority: - logger_debug(' del next_res lower priority:', next_res) - del results[j] - continue - elif curr_res.priority > next_res.priority: - logger_debug(' del curr_res lower priority:', curr_res) - del results[i] - break - else: - if len(curr_res) >= len(next_res): - logger_debug(' del next_res smaller overlap:', next_res) - del results[j] - continue - else: - logger_debug(' del curr_res smaller overlap:', curr_res) - del results[i] - break - j += 1 - i += 1 - return results - - -def add_unmatched(string, results): - """ - Yield Result object from the original `string` and the search `results` iterable - of non-overlapping matched substring Result object. New unmatched Results are - added to the stream for unmatched parts. - - For example: - >>> string ='abcdefghijklmn' - >>> results = [ - ... Result(2, 3, 'cd'), - ... Result(7, 7, 'h', None), - ... Result(9, 10, 'jk', None), - ... ] - >>> expected = [ - ... Result(0, 1, 'ab'), - ... Result(2, 3, 'cd'), - ... Result(4, 6, 'efg'), - ... Result(7, 7, 'h'), - ... Result(8, 8, 'i'), - ... Result(9, 10, 'jk'), - ... Result(11, 13, 'lmn') - ... 
] - >>> expected == list(add_unmatched(string, results)) - True - - >>> string ='abc2' - >>> results = [ - ... Result(0, 2, 'abc'), - ... ] - >>> expected = [ - ... Result(0, 2, 'abc'), - ... Result(3, 3, '2', None), - ... ] - >>> expected == list(add_unmatched(string, results)) - True - - """ - string_pos = 0 - for result in Result.sort(results): - if result.start > string_pos: - start = string_pos - end = result.start - 1 - yield Result(start, end, string[start:end + 1]) - yield result - string_pos = result.end + 1 - - len_string = len(string) - if string_pos < len_string: - start = string_pos - end = len_string - 1 - yield Result(start, end, string[start:end + 1]) diff --git a/tests/test__pyahocorasick.py b/tests/test__pyahocorasick.py deleted file mode 100644 index 7b346b6..0000000 --- a/tests/test__pyahocorasick.py +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests for Aho-Corasick string search algorithm. -Original Author: Wojciech Muła, wojciech_mula@poczta.onet.pl -WWW : http://0x80.pl -License : public domain - -Modified for use in the license_expression library and in particular: - - add support for unicode key strinsg. - - rename word to key and output to value (to be more like a mapping/dict) - - case insensitive search - - improve returned results with the actual start,end and matched string. 
- - support returning non-matched parts of a string -""" - -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import print_function - -import unittest - -from license_expression._pyahocorasick import Trie -from license_expression._pyahocorasick import Output -from license_expression._pyahocorasick import Result - - -class TestTrie(unittest.TestCase): - - def testAddedWordShouldBeCountedAndAvailableForRetrieval(self): - t = Trie() - t.add('python', 'value') - assert Output('python', 'value') == t.get('python') - - def testAddingExistingWordShouldReplaceAssociatedValue(self): - t = Trie() - t.add('python', 'value') - assert Output('python', 'value') == t.get('python') - - t.add('python', 'other') - assert Output('python', 'other') == t.get('python') - - def testGetUnknowWordWithoutDefaultValueShouldRaiseException(self): - t = Trie() - with self.assertRaises(KeyError): - t.get('python') - - def testGetUnknowWordWithDefaultValueShouldReturnDefault(self): - t = Trie() - self.assertEqual(t.get('python', 'default'), 'default') - - def testExistShouldDetectAddedWords(self): - t = Trie() - t.add('python', 'value') - t.add('ada', 'value') - - self.assertTrue(t.exists('python')) - self.assertTrue(t.exists('ada')) - - def testExistShouldReturnFailOnUnknownWord(self): - t = Trie() - t.add('python', 'value') - - self.assertFalse(t.exists('ada')) - - def test_is_prefix_ShouldDetecAllPrefixesIncludingWord(self): - t = Trie() - t.add('python', 'value') - t.add('ada', 'value') - - self.assertTrue(t.is_prefix('a')) - self.assertTrue(t.is_prefix('ad')) - self.assertTrue(t.is_prefix('ada')) - - self.assertTrue(t.is_prefix('p')) - self.assertTrue(t.is_prefix('py')) - self.assertTrue(t.is_prefix('pyt')) - self.assertTrue(t.is_prefix('pyth')) - self.assertTrue(t.is_prefix('pytho')) - self.assertTrue(t.is_prefix('python')) - - def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): - t = Trie() - - t.add('python', 1) - t.add('ada', 2) - 
t.add('perl', 3) - t.add('pascal', 4) - t.add('php', 5) - - result = list(t.items()) - self.assertIn(('python', 1), result) - self.assertIn(('ada', 2), result) - self.assertIn(('perl', 3), result) - self.assertIn(('pascal', 4), result) - self.assertIn(('php', 5), result) - - def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): - t = Trie() - - t.add('python', 1) - t.add('ada', 2) - t.add('perl', 3) - t.add('pascal', 4) - t.add('php', 5) - - result = list(t.keys()) - self.assertIn('python', result) - self.assertIn('ada', result) - self.assertIn('perl', result) - self.assertIn('pascal', result) - self.assertIn('php', result) - - def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self): - t = Trie() - - t.add('python', 1) - t.add('ada', 2) - t.add('perl', 3) - t.add('pascal', 4) - t.add('php', 5) - - result = list(t.values()) - self.assertIn(1, result) - self.assertIn(2, result) - self.assertIn(3, result) - self.assertIn(4, result) - self.assertIn(5, result) - - def test_iter_should_not_return_non_matches(self): - - def get_test_automaton(): - words = "he her hers his she hi him man himan".split() - t = Trie() - for w in words: - t.add(w, w) - t.make_automaton() - return t - - test_string = "he she himan" - - t = get_test_automaton() - result = list(t.iter(test_string)) - expected = [ - Result(start=0, end=1, string='he', output=Output('he', 'he')), - Result(start=3, end=5, string='she', output=Output('she', 'she')), - Result(start=4, end=5, string='he', output=Output('he', 'he')), - Result(start=7, end=8, string='hi', output=Output('hi', 'hi')), - Result(start=7, end=9, string='him', output=Output('him', 'him')), - Result(start=7, end=11, string='himan', output=Output('himan', 'himan')), - Result(start=9, end=11, string='man', output=Output('man', 'man')) - ] - - assert expected == result - - def test_iter_vs_scan(self): - - def get_test_automaton(): - words = "( AND ) OR".split() - t = Trie() - for w in words: - t.add(w, w) - t.make_automaton() - return 
t - - test_string = '((l-a + AND l-b) OR (l -c+))' - - t = get_test_automaton() - result = list(t.iter(test_string)) - expected = [ - Result(0, 0, '(', Output('(', '(')), - Result(1, 1, '(', Output('(', '(')), - Result(8, 10, 'AND', Output('AND', 'AND')), - Result(15, 15, ')', Output(')', ')')), - Result(17, 18, 'OR', Output('OR', 'OR')), - Result(20, 20, '(', Output('(', '(')), - Result(26, 26, ')', Output(')', ')')), - Result(27, 27, ')', Output(')', ')')) - ] - assert expected == result - - result = list(t.scan(test_string)) - expected = [ - Result(0, 0, '(', Output('(', '(')), - Result(1, 1, '(', Output('(', '(')), - Result(2, 7, 'l-a + ', None), - Result(8, 10, 'AND', Output('AND', 'AND')), - Result(11, 14, ' l-b', None), - Result(15, 15, ')', Output(')', ')')), - Result(16, 16, ' ', None), - Result(17, 18, 'OR', Output('OR', 'OR')), - Result(19, 19, ' ', None), - Result(20, 20, '(', Output('(', '(')), - Result(21, 25, 'l -c+', None), - Result(26, 26, ')', Output(')', ')')), - Result(27, 27, ')', Output(')', ')')) - ] - assert expected == result - - def test_scan_with_unmatched(self): - - def get_test_automaton(): - words = "( AND ) OR".split() - t = Trie() - for w in words: - t.add(w, w) - t.make_automaton() - return t - - test_string = '((l-a + AND l-b) OR an (l -c+))' - - t = get_test_automaton() - result = list(t.scan(test_string)) - assert test_string == ''.join(r.string for r in result) - - def test_iter_with_unmatched_simple(self): - t = Trie() - t.add('AND', 'AND') - t.make_automaton() - test_string = 'AND an a and' - result = list(t.iter(test_string)) - assert 'ANDand' == ''.join(r.string for r in result) - - def test_iter_with_unmatched_simple2(self): - t = Trie() - t.add('AND', 'AND') - t.make_automaton() - test_string = 'AND an a and' - result = list(t.iter(test_string)) - assert 'ANDand' == ''.join(r.string for r in result) - diff --git a/tests/test_license_expression.py b/tests/test_license_expression.py index 58312c4..31d5aa9 100644 --- 
a/tests/test_license_expression.py +++ b/tests/test_license_expression.py @@ -98,9 +98,18 @@ def test_LicenseSymbol(self): assert sym3 == sym5.exception_symbol sym6 = LicenseWithExceptionSymbol(sym4, sym3) - # symbol euqality is based ONLY on the key + # symbol equality is based ONLY on the key assert sym5 == sym6 + def test_license_symbols_key_cannot_contain_spaces(self): + LicenseSymbol('mit ') + LicenseSymbol(' mit ') + try: + LicenseSymbol(' m it ') + self.fail('Exception not raised') + except ExpressionError: + pass + class LicensingTest(TestCase): @@ -189,64 +198,97 @@ def test_tokenize_plain5(self): class LicensingTokenizeWithSymbolsTest(TestCase): def get_symbols_and_licensing(self): - gpl_20 = LicenseSymbol('GPL-2.0', ['The GNU GPL 20']) + gpl_20 = LicenseSymbol('GPL-2.0', ['The-GNU-GPL-20']) gpl_20_plus = LicenseSymbol('gpl-2.0+', - ['The GNU GPL 20 or later', 'GPL-2.0 or later', 'GPL v2.0 or later']) - lgpl_21 = LicenseSymbol('LGPL-2.1', ['LGPL v2.1']) - mit = LicenseSymbol('MIT', ['MIT license']) + ['The-GNU-GPL-20-or-later', 'GPL-2.0-or-later', 'GPL-v2.0-or-later']) + lgpl_21 = LicenseSymbol('LGPL-2.1', ['LGPL-v2.1']) + mit = LicenseSymbol('MIT', ['MIT-license']) symbols = [gpl_20, gpl_20_plus, lgpl_21, mit] licensing = Licensing(symbols) return gpl_20, gpl_20_plus, lgpl_21, mit, licensing def test_tokenize_1(self): gpl_20, _gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() - result = licensing.tokenize('The GNU GPL 20 or LGPL-2.1 and mit') + result = licensing.tokenize('The-GNU-GPL-20 or LGPL-2.1 and mit') expected = [ - (gpl_20, 'The GNU GPL 20', 0), - (TOKEN_OR, ' or ', 14), + (gpl_20, 'The-GNU-GPL-20', 0), + (TOKEN_OR, 'or', 15), (lgpl_21, 'LGPL-2.1', 18), - (TOKEN_AND, ' and ', 26), + (TOKEN_AND, 'and', 27), (mit, 'mit', 31)] assert expected == list(result) def test_tokenize_with_trailing_unknown(self): - gpl_20, _gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() - result = licensing.tokenize('The GNU 
GPL 20 or LGPL-2.1 and mit2') + gpl_20, _gpl_20_plus, lgpl_21, _mit, licensing = self.get_symbols_and_licensing() + result = licensing.tokenize('The-GNU-GPL-20 or LGPL-2.1 and mit2') expected = [ - (gpl_20, 'The GNU GPL 20', 0), - (TOKEN_OR, ' or ', 14), + (gpl_20, 'The-GNU-GPL-20', 0), + (TOKEN_OR, 'or', 15), (lgpl_21, 'LGPL-2.1', 18), - (TOKEN_AND, ' and ', 26), - (mit, 'mit', 31), - (LicenseSymbol(key='2'), '2', 34) + (TOKEN_AND, 'and', 27), + (LicenseSymbol(key='mit2'), 'mit2', 31) ] assert expected == list(result) def test_tokenize_3(self): gpl_20, gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() - result = licensing.tokenize('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit') + result = licensing.tokenize('The-GNU-GPL-20-or-later or (LGPL-2.1 and mit) or The-GNU-GPL-20 or mit') expected = [ - (gpl_20_plus, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (gpl_20_plus, 'The-GNU-GPL-20-or-later', 0), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl_21, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), - (gpl_20, 'The GNU GPL 20', 49), (2, ' or ', 63), + (TOKEN_OR, 'or', 46), + (gpl_20, 'The-GNU-GPL-20', 49), + (TOKEN_OR, 'or', 64), (mit, 'mit', 67) ] assert expected == list(result) - def test_tokenize_unknown_as_trailing_single_attached_character(self): - symbols = [LicenseSymbol('MIT', ['MIT license'])] + def test_tokenize_unknown_as_trailing_single_attached_character_does_not_match_known_license(self): + symbols = [LicenseSymbol('MIT', ['MIT-license'])] l = Licensing(symbols) result = list(l.tokenize('mit2')) expected = [ - (LicenseSymbol(key='MIT', aliases=('MIT license',)), 'mit', 0), - (LicenseSymbol(key='2'), '2', 3), + (LicenseSymbol(key='mit2'), 'mit2', 0), + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_leading(self): + l = Licensing(['gpl-2.0']) + result = 
list(l.tokenize('gpl-2.0 AND gpl-2.0-plus', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseSymbol(key='gpl-2.0'), + TOKEN_AND, + LicenseSymbol(key='gpl-2.0-plus'), + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_contained(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 WITH exception-gpl-2.0-plus', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseWithExceptionSymbol( + LicenseSymbol(u'gpl-2.0'), + LicenseSymbol(u'exception-gpl-2.0-plus') + ) + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_trailing(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 AND exception-gpl-2.0', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseSymbol(u'gpl-2.0'), + TOKEN_AND, + LicenseSymbol(u'exception-gpl-2.0') ] assert expected == result @@ -278,6 +320,7 @@ def test_parse_raise_ExpressionError_when_validating(self): licensing = Licensing() try: licensing.parse(expression, validate=True) + self.fail('Exception not raised') except ExpressionError as ee: assert 'Unknown license key(s): gpl, bsd, lgpl, exception' == str(ee) @@ -286,6 +329,7 @@ def test_parse_raise_ExpressionError_when_validating_strict(self): licensing = Licensing() try: licensing.parse(expression, validate=True, strict=True) + self.fail('Exception not raised') except ExpressionError as ee: assert str(ee).startswith('exception_symbol must be an exception with "is_exception" set to True:') @@ -651,16 +695,36 @@ def test_parse_complex2(self): expected = 'GPL-2.0 OR (LGPL-2.1 AND mit)' assert expected == expr.render('{symbol.key}') - def test_Licensing_can_scan_valid_expressions_with_symbols_that_contain_and_with_or(self): - licensing = Licensing() + def test_Licensing_can_split_valid_expressions_with_symbols_that_contain_and_with_or(self): expression = 'orgpl or withbsd with orclasspath and andmit or 
andlgpl and ormit or withme' - result = [r.string for r in licensing.get_scanner().scan(expression)] + result = [r.string for r in splitter(expression)] expected = [ - 'orgpl', ' or ', 'withbsd', ' with ', 'orclasspath', - ' and ', 'andmit', ' or ', 'andlgpl', ' and ', 'ormit', - ' or ', 'withme' + 'orgpl', + ' ', + 'or', + ' ', + 'withbsd', + ' ', + 'with', + ' ', + 'orclasspath', + ' ', + 'and', + ' ', + 'andmit', + ' ', + 'or', + ' ', + 'andlgpl', + ' ', + 'and', + ' ', + 'ormit', + ' ', + 'or', + ' ', + 'withme' ] - assert expected == result def test_Licensing_can_tokenize_valid_expressions_with_symbols_that_contain_and_with_or(self): @@ -708,10 +772,9 @@ def test_Licensing_with_illegal_symbols_raise_Exception(self): 'LGPL 2.1', 'mit or later' ]) + self.fail('Exception not raised') except ExpressionError as ee: - expected = ('Invalid license key: "or later" words are reserved and ' - 'cannot be used in a key: "GPL-2.0 or LATER"') - + expected = ('A license key cannot contains spaces: "GPL-2.0 or LATER"') assert expected == str(ee) def get_syms_and_licensing(self): @@ -733,7 +796,7 @@ def test_parse_license_expression1(self): def test_parse_license_expression_with_alias(self): _a, ap, _b, _c, licensing = self.get_syms_and_licensing() - express_string = 'l-a +' + express_string = 'l-a+' result = licensing.parse(express_string) assert 'L-a+' == str(result) expected = ap @@ -806,7 +869,7 @@ def test_parse_license_expression8_twice(self): def test_parse_license_expression_with_trailing_space_plus(self): symbols = [ LicenseSymbol('l-a'), - LicenseSymbol('L-a+', ['l-a +']), + LicenseSymbol('L-a+', ['l-a+']), LicenseSymbol('l-b'), LicenseSymbol('l-c'), ] @@ -818,7 +881,7 @@ def test_parse_license_expression_with_trailing_space_plus(self): assert [] == licensing.unknown_license_keys(result) # plus sign is not attached to the symbol, but an alias - expresssion_str = 'l-a +' + expresssion_str = 'l-a+' result = licensing.parse(expresssion_str) assert 'l-a+' == 
str(result).lower() assert [] == licensing.unknown_license_keys(result) @@ -881,6 +944,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_non_exception_symbol expression = 'gpl and bsd or lgpl with exception' try: licensing.parse(expression, validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION, @@ -895,6 +959,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in licensing.parse('gpl with exception', validate=True, strict=True) try: licensing.parse('exception with gpl', validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_EXCEPTION, @@ -905,6 +970,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in try: licensing.parse('gpl with gpl', validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION, @@ -913,8 +979,33 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in 'token_type': TOKEN_SYMBOL} assert expected == _parse_error_as_dict(pe) + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_with(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus WITH openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_and(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_or(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus OR openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def 
test_with_known_symbol_string_contained_in_known_symbol_does_not_crash_or(self): + l = Licensing(['lgpl-3.0-plus', 'openssl-exception-lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus OR openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) -class LicensingSymbolsReplacement(TestCase): + def test_with_known_symbol_string_contained_in_known_symbol_does_not_crash_with(self): + l = Licensing(['lgpl-3.0-plus', 'openssl-exception-lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus WITH openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + +class LicensingSymbolsReplacementTest(TestCase): def get_symbols_and_licensing(self): gpl2 = LicenseSymbol('gpl-2.0', ['The GNU GPL 20', 'GPL-2.0', 'GPL v2.0']) @@ -1002,11 +1093,11 @@ def test_multiple_substitutions_complex(self): class LicensingParseWithSymbolsAdvancedTest(TestCase): def get_symbols_and_licensing(self): - gpl2 = LicenseSymbol('gpl-2.0', ['The GNU GPL 20', 'GPL-2.0', 'GPL v2.0']) - gpl2plus = LicenseSymbol('gpl-2.0+', ['The GNU GPL 20 or later', 'GPL-2.0 or later', 'GPL v2.0 or later']) + gpl2 = LicenseSymbol('gpl-2.0', ['The-GNU-GPL-20', 'GPL-2.0', 'GPL-v2.0']) + gpl2plus = LicenseSymbol('gpl-2.0+', ['The-GNU-GPL-20-or-later', 'GPL-2.0-or-later', 'GPL-v2.0-or-later']) lgpl = LicenseSymbol('LGPL-2.1', ['LGPL v2.1']) mit = LicenseSymbol('MIT', ['MIT license']) - mitand2 = LicenseSymbol('mitand2', ['mitand2', 'mitand2 license']) + mitand2 = LicenseSymbol('mitand2', ['mitand2', 'mitand2-license']) symbols = [gpl2, gpl2plus, lgpl, mit, mitand2] licensing = Licensing(symbols) return gpl2, gpl2plus, lgpl, mit, mitand2, licensing @@ -1014,71 +1105,86 @@ def get_symbols_and_licensing(self): def test_parse_trailing_char_raise_exception(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() try: - licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2') + licensing.parse('The-GNU-GPL-20 or LGPL-2.1 and mit 2') + self.fail('Exception not raised') except 
ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 34, - 'token_string': '2', 'token_type': LicenseSymbol('2')} + expected = { + 'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, + 'position': 35, + 'token_string': '2', + 'token_type': LicenseSymbol('2') + } assert expected == _parse_error_as_dict(pe) + def test_parse_trailing_char_raise_exception_if_validate(self): + _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() + try: + licensing.parse('The-GNU-GPL-20 or LGPL-2.1 and mit2', validate=True) + self.fail('Exception not raised') + except ExpressionError as pe: + assert 'Unknown license key(s): mit2' in str(pe) + def test_parse_expression_with_trailing_unknown_should_raise_exception(self): gpl2, gpl2plus, lgpl, mit, _mitand2, licensing = self.get_symbols_and_licensing() unknown = LicenseSymbol(key='123') - tokens = list(licensing.tokenize('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit 123')) + tokens = list(licensing.tokenize('The-GNU-GPL-20-or-later or (LGPL-2.1 and mit) or The-GNU-GPL-20 or mit 123')) expected = [ - (gpl2plus, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (gpl2plus, 'The-GNU-GPL-20-or-later', 0), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), - (gpl2, 'The GNU GPL 20', 49), - (TOKEN_OR, ' or ', 63), + (TOKEN_OR, 'or', 46), + (gpl2, 'The-GNU-GPL-20', 49), + (TOKEN_OR, 'or', 64), (mit, 'mit', 67), - (unknown, ' 123', 70) + (unknown, '123', 71) ] assert expected == tokens try: - licensing.parse('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit 123') + licensing.parse('The-GNU-GPL-20-or-later or (LGPL-2.1 and mit) or The-GNU-GPL-20 or mit 123') + self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 70, - 
'token_string': ' 123', 'token_type': unknown} + expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 71, + 'token_string': '123', 'token_type': unknown} assert expected == _parse_error_as_dict(pe) def test_parse_expression_with_trailing_unknown_should_raise_exception2(self): _gpl2, _gpl2_plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() unknown = LicenseSymbol(key='123') try: - licensing.parse('The GNU GPL 20 or mit 123') + licensing.parse('The-GNU-GPL-20 or mit 123') + self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 21, - 'token_string': ' 123', 'token_type': unknown} + expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 22, + 'token_string': '123', 'token_type': unknown} assert expected == _parse_error_as_dict(pe) def test_parse_expression_with_WITH(self): gpl2, _gpl2plus, lgpl, mit, mitand2, _ = self.get_symbols_and_licensing() - mitexp = LicenseSymbol('mitexp', ('mit exp',), is_exception=True) - gpl_20_or_later = LicenseSymbol('GPL-2.0+', ['The GNU GPL 20 or later']) + mitexp = LicenseSymbol('mitexp', ('mit-exp',), is_exception=True) + gpl_20_or_later = LicenseSymbol('GPL-2.0+', ['The-GNU-GPL-20-or-later']) symbols = [gpl2, lgpl, mit, mitand2, mitexp, gpl_20_or_later] licensing = Licensing(symbols) - expr = 'The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit with mit exp' + expr = 'The-GNU-gpl-20-or-later or (LGPL-2.1 and mit) or The-GNU-GPL-20 or mit with mit-exp' tokens = list(licensing.tokenize(expr)) expected = [ - (gpl_20_or_later, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (gpl_20_or_later, 'The-GNU-gpl-20-or-later', 0), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), - (gpl2, 'The GNU GPL 20', 49), - (TOKEN_OR, ' or ', 63), - 
(LicenseWithExceptionSymbol(mit, mitexp), 'mit with mit exp', 67) + (TOKEN_OR, 'or', 46), + (gpl2, 'The-GNU-GPL-20', 49), + (TOKEN_OR, 'or', 64), + (LicenseWithExceptionSymbol(mit, mitexp), 'mit with mit-exp', 67) ] assert expected == tokens @@ -1092,11 +1198,11 @@ def test_parse_expression_with_WITH(self): def test_parse_expression_with_WITH_and_unknown_symbol(self): gpl2, _gpl2plus, lgpl, mit, mitand2, _ = self.get_symbols_and_licensing() mitexp = LicenseSymbol('mitexp', ('mit exp',), is_exception=True) - gpl_20_or_later = LicenseSymbol('GPL-2.0+', ['The GNU GPL 20 or later']) + gpl_20_or_later = LicenseSymbol('GPL-2.0+', ['The-GNU-GPL-20-or-later']) symbols = [gpl2, lgpl, mit, mitand2, mitexp, gpl_20_or_later] licensing = Licensing(symbols) - expr = 'The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit with 123' + expr = 'The-GNU-GPL-20-or-later or (LGPL-2.1 and mit) or The-GNU-GPL-20 or mit with 123' parsed = licensing.parse(expr) assert ['123'] == licensing.unknown_license_keys(parsed) @@ -1104,7 +1210,7 @@ def test_parse_expression_with_WITH_and_unknown_symbol(self): def test_unknown_keys(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() - expr = 'The GNU GPL 20 or LGPL-2.1 and mit' + expr = 'The-GNU-GPL-20 or LGPL-2.1 and mit' parsed = licensing.parse(expr) expected = 'gpl-2.0 OR (LGPL-2.1 AND MIT)' assert expected == str(parsed) @@ -1114,7 +1220,7 @@ def test_unknown_keys(self): def test_unknown_keys_with_trailing_char(self): gpl2, _gpl2plus, lgpl, _mit, mitand2, licensing = self.get_symbols_and_licensing() - expr = 'The GNU GPL 20 or LGPL-2.1 and mitand2' + expr = 'The-GNU-GPL-20 or LGPL-2.1 and mitand2' parsed = licensing.parse(expr) expected = [gpl2, lgpl, mitand2] assert expected == licensing.license_symbols(parsed) @@ -1125,30 +1231,35 @@ def test_unknown_keys_with_trailing_char(self): def test_unknown_keys_with_trailing_char_2(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = 
self.get_symbols_and_licensing() - expr = 'The GNU GPL 20 or LGPL-2.1 and mitand3' + expr = 'The-GNU-GPL-20 or LGPL-2.1 and mit and3' try: licensing.parse(expr) self.fail('ParseError should be raised') except ParseError as pe: - expected = {'error_code': 5, 'position': 34, 'token_string': u'and3', 'token_type': LicenseSymbol(key=u'and3')} + expected = { + 'error_code': 5, + 'position': 35, + 'token_string': 'and3', + 'token_type': LicenseSymbol(key='and3') + } assert expected == _parse_error_as_dict(pe) def test_parse_with_overlapping_key_with_licensing(self): symbols = [ - LicenseSymbol('MIT', ['MIT license']), - LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), + LicenseSymbol('MIT', ['MIT-license']), + LicenseSymbol('LGPL-2.1', ['LGPL-v2.1']), LicenseSymbol('zlib', ['zlib']), - LicenseSymbol('d-zlib', ['D zlib']), - LicenseSymbol('mito', ['mit o']), - LicenseSymbol('hmit', ['h verylonglicense']), + LicenseSymbol('d-zlib', ['D-zlib']), + LicenseSymbol('mito', ['mit-o']), + LicenseSymbol('hmit', ['h-verylonglicense']), ] licensing = Licensing(symbols) expression = 'mit or mit AND zlib or mit or mit with verylonglicense' results = str(licensing.parse(expression)) - expected = 'mit OR (MIT AND zlib) OR mit OR MIT WITH verylonglicense' + expected = 'MIT OR (MIT AND zlib) OR MIT OR MIT WITH verylonglicense' self.assertEqual(expected, results) @@ -1158,90 +1269,107 @@ def test_get_license_symbols(self): symbols = [ LicenseSymbol('GPL-2.0'), LicenseSymbol('mit'), - LicenseSymbol('LGPL 2.1') + LicenseSymbol('LGPL-2.1') ] l = Licensing(symbols) - assert symbols == l.license_symbols(l.parse(' GPL-2.0 and mit or LGPL 2.1 and mit ')) + assert symbols == l.license_symbols(l.parse(' GPL-2.0 and mit or LGPL-2.1 and mit ')) def test_get_license_symbols2(self): symbols = [ LicenseSymbol('GPL-2.0'), LicenseSymbol('LATER'), LicenseSymbol('mit'), - LicenseSymbol('LGPL 2.1+'), - LicenseSymbol('Foo exception', is_exception=True), + LicenseSymbol('LGPL-2.1+'), + LicenseSymbol('Foo-exception', 
is_exception=True), ] l = Licensing(symbols) - expr = ' GPL-2.0 or LATER and mit or LGPL 2.1+ and mit with Foo exception ' + expr = ' GPL-2.0 or LATER and mit or LGPL-2.1+ and mit with Foo-exception ' expected = [ LicenseSymbol('GPL-2.0'), LicenseSymbol('LATER'), LicenseSymbol('mit'), - LicenseSymbol('LGPL 2.1+'), + LicenseSymbol('LGPL-2.1+'), LicenseSymbol('mit'), - LicenseSymbol('Foo exception', is_exception=True), + LicenseSymbol('Foo-exception', is_exception=True), ] assert expected == l.license_symbols(l.parse(expr), unique=False) def test_get_license_symbols3(self): symbols = [ LicenseSymbol('mit'), - LicenseSymbol('LGPL 2.1+'), - LicenseSymbol('Foo exception', is_exception=True), + LicenseSymbol('LGPL-2.1+'), + LicenseSymbol('Foo-exception', is_exception=True), LicenseSymbol('GPL-2.0'), LicenseSymbol('LATER'), ] l = Licensing(symbols) - expr = 'mit or LGPL 2.1+ and mit with Foo exception or GPL-2.0 or LATER ' + expr = 'mit or LGPL-2.1+ and mit with Foo-exception or GPL-2.0 or LATER ' assert symbols == l.license_symbols(l.parse(expr)) def test_get_license_symbols4(self): symbols = [ LicenseSymbol('GPL-2.0'), LicenseSymbol('LATER'), - LicenseSymbol('big exception', is_exception=True), + LicenseSymbol('big-exception', is_exception=True), LicenseSymbol('mit'), - LicenseSymbol('LGPL 2.1+'), - LicenseSymbol('Foo exception', is_exception=True), + LicenseSymbol('LGPL-2.1+'), + LicenseSymbol('Foo-exception', is_exception=True), ] l = Licensing(symbols) - expr = (' GPL-2.0 or LATER with big exception and mit or ' - 'LGPL 2.1+ and mit or later with Foo exception ') + expr = (' GPL-2.0 or LATER with big-exception and mit or ' + 'LGPL-2.1+ and mit or later with Foo-exception ') expected = [ LicenseSymbol('GPL-2.0'), LicenseSymbol('LATER'), - LicenseSymbol('big exception', is_exception=True), + LicenseSymbol('big-exception', is_exception=True), LicenseSymbol('mit'), - LicenseSymbol('LGPL 2.1+'), + LicenseSymbol('LGPL-2.1+'), LicenseSymbol('mit'), LicenseSymbol('LATER'), - 
LicenseSymbol('Foo exception', is_exception=True), + LicenseSymbol('Foo-exception', is_exception=True), + ] + + assert expected == l.license_symbols(l.parse(expr), unique=False) + + def test_get_license_symbols5(self): + l = Licensing() + expr = (' GPL-2.0 or LATER with big-exception and mit or ' + 'LGPL-2.1+ and mit or later with Foo-exception ') + expected = [ + LicenseSymbol('GPL-2.0'), + LicenseSymbol('LATER'), + LicenseSymbol('big-exception', is_exception=False), + LicenseSymbol('mit'), + LicenseSymbol('LGPL-2.1+'), + LicenseSymbol('mit'), + LicenseSymbol('later'), + LicenseSymbol('Foo-exception', is_exception=False), ] assert expected == l.license_symbols(l.parse(expr), unique=False) def test_license_symbols(self): licensing = Licensing([ - 'GPL-2.0 or LATER', - 'classpath Exception', - 'something with else+', + 'GPL-2.0-or-LATER', + 'classpath-Exception', + 'something-with-else+', 'mit', - 'LGPL 2.1', - 'mit or later' + 'LGPL-2.1', + 'mit-or-later' ]) - expr = (' GPL-2.0 or LATER with classpath Exception and mit and ' - 'mit with SOMETHING with ELSE+ or LGPL 2.1 and ' - 'GPL-2.0 or LATER with classpath Exception and ' - 'mit or later or LGPL 2.1 or mit or GPL-2.0 or LATER ' - 'with SOMETHING with ELSE+ and lgpl 2.1') + expr = (' GPL-2.0-or-LATER with classpath-Exception and mit and ' + 'mit with SOMETHING-with-ELSE+ or LGPL-2.1 and ' + 'GPL-2.0-or-LATER with classpath-Exception and ' + 'mit-or-later or LGPL-2.1 or mit or GPL-2.0-or-LATER ' + 'with SOMETHING-with-ELSE+ and lgpl-2.1') - gpl2plus = LicenseSymbol(key='GPL-2.0 or LATER') - cpex = LicenseSymbol(key='classpath Exception') - someplus = LicenseSymbol(key='something with else+') - mitplus = LicenseSymbol(key='mit or later') + gpl2plus = LicenseSymbol(key='GPL-2.0-or-LATER') + cpex = LicenseSymbol(key='classpath-Exception') + someplus = LicenseSymbol(key='something-with-else+') + mitplus = LicenseSymbol(key='mit-or-later') mit = LicenseSymbol(key='mit') - lgpl = LicenseSymbol(key='LGPL 2.1') + lgpl = 
LicenseSymbol(key='LGPL-2.1') gpl_with_cp = LicenseWithExceptionSymbol(license_symbol=gpl2plus, exception_symbol=cpex) mit_with_some = LicenseWithExceptionSymbol(license_symbol=mit, exception_symbol=someplus) gpl2_with_someplus = LicenseWithExceptionSymbol(license_symbol=gpl2plus, exception_symbol=someplus) @@ -1262,24 +1390,24 @@ def test_license_symbols(self): def test_primary_license_symbol_and_primary_license_key(self): licensing = Licensing([ - 'GPL-2.0 or LATER', - 'classpath Exception', + 'GPL-2.0-or-LATER', + 'classpath-Exception', 'mit', - 'LGPL 2.1', - 'mit or later' + 'LGPL-2.1', + 'mit-or-later' ]) - expr = ' GPL-2.0 or LATER with classpath Exception and mit or LGPL 2.1 and mit or later ' - gpl = LicenseSymbol('GPL-2.0 or LATER') - cpex = LicenseSymbol('classpath Exception') + expr = ' GPL-2.0-or-LATER with classpath-Exception and mit or LGPL-2.1 and mit-or-later ' + gpl = LicenseSymbol('GPL-2.0-or-LATER') + cpex = LicenseSymbol('classpath-Exception') expected = LicenseWithExceptionSymbol(gpl, cpex) parsed = licensing.parse(expr) assert expected == licensing.primary_license_symbol(parsed, decompose=False) assert gpl == licensing.primary_license_symbol(parsed, decompose=True) - assert 'GPL-2.0 or LATER' == licensing.primary_license_key(parsed) + assert 'GPL-2.0-or-LATER' == licensing.primary_license_key(parsed) - expr = ' GPL-2.0 or later with classpath Exception and mit or LGPL 2.1 and mit or later ' - expected = 'GPL-2.0 or LATER WITH classpath Exception' + expr = ' GPL-2.0-or-later with classpath-Exception and mit or LGPL-2.1 and mit-or-later ' + expected = 'GPL-2.0-or-LATER WITH classpath-Exception' assert expected == licensing.primary_license_symbol( parsed, decompose=False).render('{symbol.key}') @@ -1389,19 +1517,19 @@ def test_splitter(self): def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self): gpl2 = LicenseSymbol(key='GPL-2.0') - gpl2plus = LicenseSymbol(key='GPL-2.0 or LATER') - cpex = LicenseSymbol(key='classpath Exception', 
is_exception=True) + gpl2plus = LicenseSymbol(key='GPL-2.0-or-LATER') + cpex = LicenseSymbol(key='classpath-Exception', is_exception=True) mitthing = LicenseSymbol(key='mithing') - mitthing_with_else = LicenseSymbol(key='mitthing with else+', is_exception=False) + mitthing_with_else = LicenseSymbol(key='mitthing-with-else+', is_exception=False) mit = LicenseSymbol(key='mit') - mitplus = LicenseSymbol(key='mit or later') + mitplus = LicenseSymbol(key='mit-or-later') elsish = LicenseSymbol(key='else') elsishplus = LicenseSymbol(key='else+') - lgpl = LicenseSymbol(key='LGPL 2.1') + lgpl = LicenseSymbol(key='LGPL-2.1') licensing = Licensing([ gpl2, @@ -1416,51 +1544,74 @@ def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self): lgpl, ]) - expr = (' GPL-2.0 or later with classpath Exception and mit and ' - 'mit with mitthing with ELSE+ or LGPL 2.1 and ' - 'GPL-2.0 or LATER with Classpath Exception and ' - 'mit or later or LGPL 2.1 or mit or GPL-2.0 or LATER ' - 'with mitthing with ELSE+ and lgpl 2.1 or gpl-2.0') + expr = (' GPL-2.0-or-later with classpath-Exception and mit and ' + 'mit with mitthing-with-ELSE+ or LGPL-2.1 and ' + 'GPL-2.0-or-LATER with Classpath-Exception and ' + 'mit-or-later or LGPL-2.1 or mit or GPL-2.0-or-LATER ' + 'with mitthing-with-ELSE+ and lgpl-2.1 or gpl-2.0') # fist scan - scanner = licensing.get_scanner() - result = list(scanner.scan(expr)) - - WITH_KW = Keyword(value=' with ', type=10) - AND_KW = Keyword(value=' and ', type=1) - OR_KW = Keyword(value=' or ', type=2) + result = list(splitter(expr, licensing.symbols_by_key)) expected = [ - Result(0, 0, ' ', None), - Result(1, 16, 'GPL-2.0 or later', Output('GPL-2.0 or LATER', gpl2plus, 1)), - Result(17, 22, ' with ', Output(' with ', WITH_KW, 0)), - Result(23, 41, 'classpath Exception', Output('classpath Exception', cpex, 1)), - Result(42, 46, ' and ', Output(' and ', AND_KW, 0)), - Result(47, 49, 'mit', Output('mit', mit, 1)), - Result(50, 54, ' and ', Output(' and ', AND_KW, 
0)), - Result(55, 57, 'mit', Output('mit', mit, 1)), - Result(58, 63, ' with ', Output(' with ', WITH_KW, 0)), - Result(64, 82, 'mitthing with ELSE+', Output('mitthing with else+', mitthing_with_else, 1)), - Result(83, 86, ' or ', Output(' or ', OR_KW, 0)), - Result(87, 94, 'LGPL 2.1', Output('LGPL 2.1', lgpl, 1)), - Result(95, 99, ' and ', Output(' and ', AND_KW, 0)), - Result(100, 115, 'GPL-2.0 or LATER', Output('GPL-2.0 or LATER', gpl2plus, 1)), - Result(116, 121, ' with ', Output(' with ', WITH_KW, 0)), - Result(122, 140, 'Classpath Exception', Output('classpath Exception', cpex, 1)), - Result(141, 145, ' and ', Output(' and ', AND_KW, 0)), - Result(146, 157, 'mit or later', Output('mit or later', mitplus, 1)), - Result(158, 161, ' or ', Output(' or ', OR_KW, 0)), - Result(162, 169, 'LGPL 2.1', Output('LGPL 2.1', lgpl, 1)), - Result(170, 173, ' or ', Output(' or ', OR_KW, 0)), - Result(174, 176, 'mit', Output('mit', mit, 1)), - Result(177, 180, ' or ', Output(' or ', OR_KW, 0)), - Result(181, 196, 'GPL-2.0 or LATER', Output('GPL-2.0 or LATER', gpl2plus, 1)), - Result(197, 202, ' with ', Output(' with ', WITH_KW, 0)), - Result(203, 221, 'mitthing with ELSE+', Output('mitthing with else+', mitthing_with_else, 1)), - Result(222, 226, ' and ', Output(' and ', AND_KW, 0)), - Result(227, 234, 'lgpl 2.1', Output('LGPL 2.1', lgpl, 1)), - Result(235, 238, ' or ', Output(' or ', OR_KW, 0)), - Result(239, 245, 'gpl-2.0', Output('GPL-2.0', gpl2, 1)) + Result(0, 0, u' ', None), + Result(1, 16, u'GPL-2.0-or-later', Output(u'GPL-2.0-or-later', LicenseSymbol(u'GPL-2.0-or-LATER', is_exception=False))), + Result(17, 17, u' ', None), + Result(18, 21, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(22, 22, u' ', None), + Result(23, 41, u'classpath-Exception', Output(u'classpath-Exception', LicenseSymbol(u'classpath-Exception', is_exception=True))), + Result(42, 42, u' ', None), + Result(43, 45, u'and', Output(u'and', Keyword(value=u'and', type=1))), + 
Result(46, 46, u' ', None), + Result(47, 49, u'mit', Output(u'mit', LicenseSymbol(u'mit', is_exception=False))), + Result(50, 50, u' ', None), + Result(51, 53, u'and', Output(u'and', Keyword(value=u'and', type=1))), + Result(54, 54, u' ', None), + Result(55, 57, u'mit', Output(u'mit', LicenseSymbol(u'mit', is_exception=False))), + Result(58, 58, u' ', None), + Result(59, 62, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(63, 63, u' ', None), + Result(64, 82, u'mitthing-with-ELSE+', Output(u'mitthing-with-ELSE+', LicenseSymbol(u'mitthing-with-else+', is_exception=False))), + Result(83, 83, u' ', None), + Result(84, 85, u'or', Output(u'or', Keyword(value=u'or', type=2))), + Result(86, 86, u' ', None), + Result(87, 94, u'LGPL-2.1', Output(u'LGPL-2.1', LicenseSymbol(u'LGPL-2.1', is_exception=False))), + Result(95, 95, u' ', None), + Result(96, 98, u'and', Output(u'and', Keyword(value=u'and', type=1))), + Result(99, 99, u' ', None), + Result(100, 115, u'GPL-2.0-or-LATER', Output(u'GPL-2.0-or-LATER', LicenseSymbol(u'GPL-2.0-or-LATER', is_exception=False))), + Result(116, 116, u' ', None), + Result(117, 120, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(121, 121, u' ', None), + Result(122, 140, u'Classpath-Exception', Output(u'Classpath-Exception', LicenseSymbol(u'classpath-Exception', is_exception=True))), + Result(141, 141, u' ', None), + Result(142, 144, u'and', Output(u'and', Keyword(value=u'and', type=1))), + Result(145, 145, u' ', None), + Result(146, 157, u'mit-or-later', Output(u'mit-or-later', LicenseSymbol(u'mit-or-later', is_exception=False))), + Result(158, 158, u' ', None), + Result(159, 160, u'or', Output(u'or', Keyword(value=u'or', type=2))), + Result(161, 161, u' ', None), + Result(162, 169, u'LGPL-2.1', Output(u'LGPL-2.1', LicenseSymbol(u'LGPL-2.1', is_exception=False))), + Result(170, 170, u' ', None), + Result(171, 172, u'or', Output(u'or', Keyword(value=u'or', type=2))), + Result(173, 173, u' ', None), + 
Result(174, 176, u'mit', Output(u'mit', LicenseSymbol(u'mit', is_exception=False))), + Result(177, 177, u' ', None), + Result(178, 179, u'or', Output(u'or', Keyword(value=u'or', type=2))), + Result(180, 180, u' ', None), + Result(181, 196, u'GPL-2.0-or-LATER', Output(u'GPL-2.0-or-LATER', LicenseSymbol(u'GPL-2.0-or-LATER', is_exception=False))), + Result(197, 197, u' ', None), + Result(198, 201, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(202, 202, u' ', None), + Result(203, 221, u'mitthing-with-ELSE+', Output(u'mitthing-with-ELSE+', LicenseSymbol(u'mitthing-with-else+', is_exception=False))), + Result(222, 222, u' ', None), + Result(223, 225, u'and', Output(u'and', Keyword(value=u'and', type=1))), + Result(226, 226, u' ', None), + Result(227, 234, u'lgpl-2.1', Output(u'lgpl-2.1', LicenseSymbol(u'LGPL-2.1', is_exception=False))), + Result(235, 235, u' ', None), + Result(236, 237, u'or', Output(u'or', Keyword(value=u'or', type=2))), + Result(238, 238, u' ', None), + Result(239, 245, u'gpl-2.0', Output(u'gpl-2.0', LicenseSymbol(u'GPL-2.0', is_exception=False))) ] assert expected == result @@ -1470,92 +1621,86 @@ def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self): # skip spaces result = list(strip_and_skip_spaces(result)) # here only the first token is a space - assert expected[1:] == result + expected_no_spaces = [r for r in expected if r.output] + assert expected_no_spaces == result # group results - gpl2pluso = Output('GPL-2.0 or LATER', LicenseSymbol('GPL-2.0 or LATER', is_exception=False), 1) - cpex0 = Output('classpath Exception', LicenseSymbol('classpath Exception', is_exception=True), 1) - mito = Output('mit', LicenseSymbol('mit', is_exception=False), 1) - mieo1 = Output('mitthing with else+', LicenseSymbol('mitthing with else+', is_exception=False), 1) - lgplo = Output('LGPL 2.1', LicenseSymbol('LGPL 2.1', is_exception=False), 1) - mitoo = Output('mit or later', LicenseSymbol('mit or later', is_exception=False), 1) - 
gpl202 = Output('GPL-2.0', LicenseSymbol('GPL-2.0', is_exception=False), 1) - - with_kw = Output(' with ', WITH_KW, 0) - and_kw = Output(' and ', AND_KW, 0) - or_kw = Output(' or ', OR_KW, 0) - expected_groups = [ - (Result(1, 16, 'GPL-2.0 or later', gpl2pluso), - Result(17, 22, ' with ', with_kw), - Result(23, 41, 'classpath Exception', cpex0)), - (Result(42, 46, ' and ', and_kw),), - (Result(47, 49, 'mit', mito),), - (Result(50, 54, ' and ', and_kw),), - (Result(55, 57, 'mit', mito), - Result(58, 63, ' with ', with_kw), - Result(64, 82, 'mitthing with ELSE+', mieo1)), - (Result(83, 86, ' or ', or_kw),), - (Result(87, 94, 'LGPL 2.1', lgplo),), - (Result(95, 99, ' and ', and_kw),), - (Result(100, 115, 'GPL-2.0 or LATER', gpl2pluso), - Result(116, 121, ' with ', with_kw), - Result(122, 140, 'Classpath Exception', cpex0)), - (Result(141, 145, ' and ', and_kw),), - (Result(146, 157, 'mit or later', mitoo),), - (Result(158, 161, ' or ', or_kw),), - (Result(162, 169, 'LGPL 2.1', lgplo),), - (Result(170, 173, ' or ', or_kw),), - (Result(174, 176, 'mit', mito),), - (Result(177, 180, ' or ', or_kw),), - (Result(181, 196, 'GPL-2.0 or LATER', gpl2pluso), - Result(197, 202, ' with ', with_kw), - Result(203, 221, 'mitthing with ELSE+', mieo1)), - (Result(222, 226, ' and ', and_kw),), - (Result(227, 234, 'lgpl 2.1', lgplo),), - (Result(235, 238, ' or ', or_kw),), - (Result(239, 245, 'gpl-2.0', gpl202),) + (Result(1, 16, u'GPL-2.0-or-later', Output(u'GPL-2.0-or-later', LicenseSymbol(u'GPL-2.0-or-LATER', is_exception=False))), + Result(18, 21, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(23, 41, u'classpath-Exception', Output(u'classpath-Exception', LicenseSymbol(u'classpath-Exception', is_exception=True)))), + (Result(43, 45, u'and', Output(u'and', Keyword(value=u'and', type=1))),), + (Result(47, 49, u'mit', Output(u'mit', LicenseSymbol(u'mit', is_exception=False))),), + (Result(51, 53, u'and', Output(u'and', Keyword(value=u'and', type=1))),), + 
(Result(55, 57, u'mit', Output(u'mit', LicenseSymbol(u'mit', is_exception=False))), + Result(59, 62, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(64, 82, u'mitthing-with-ELSE+', Output(u'mitthing-with-ELSE+', LicenseSymbol(u'mitthing-with-else+', is_exception=False)))), + (Result(84, 85, u'or', Output(u'or', Keyword(value=u'or', type=2))),), + (Result(87, 94, u'LGPL-2.1', Output(u'LGPL-2.1', LicenseSymbol(u'LGPL-2.1', is_exception=False))),), + (Result(96, 98, u'and', Output(u'and', Keyword(value=u'and', type=1))),), + (Result(100, 115, u'GPL-2.0-or-LATER', Output(u'GPL-2.0-or-LATER', LicenseSymbol(u'GPL-2.0-or-LATER', is_exception=False))), + Result(117, 120, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(122, 140, u'Classpath-Exception', Output(u'Classpath-Exception', LicenseSymbol(u'classpath-Exception', is_exception=True)))), + (Result(142, 144, u'and', Output(u'and', Keyword(value=u'and', type=1))),), + (Result(146, 157, u'mit-or-later', Output(u'mit-or-later', LicenseSymbol(u'mit-or-later', is_exception=False))),), + (Result(159, 160, u'or', Output(u'or', Keyword(value=u'or', type=2))),), + (Result(162, 169, u'LGPL-2.1', Output(u'LGPL-2.1', LicenseSymbol(u'LGPL-2.1', is_exception=False))),), + (Result(171, 172, u'or', Output(u'or', Keyword(value=u'or', type=2))),), + (Result(174, 176, u'mit', Output(u'mit', LicenseSymbol(u'mit', is_exception=False))),), + (Result(178, 179, u'or', Output(u'or', Keyword(value=u'or', type=2))),), + (Result(181, 196, u'GPL-2.0-or-LATER', Output(u'GPL-2.0-or-LATER', LicenseSymbol(u'GPL-2.0-or-LATER', is_exception=False))), + Result(198, 201, u'with', Output(u'with', Keyword(value=u'with', type=10))), + Result(203, 221, u'mitthing-with-ELSE+', Output(u'mitthing-with-ELSE+', LicenseSymbol(u'mitthing-with-else+', is_exception=False)))), + (Result(223, 225, u'and', Output(u'and', Keyword(value=u'and', type=1))),), + (Result(227, 234, u'lgpl-2.1', Output(u'lgpl-2.1', 
LicenseSymbol(u'LGPL-2.1', is_exception=False))),), + (Result(236, 237, u'or', Output(u'or', Keyword(value=u'or', type=2))),), + (Result(239, 245, u'gpl-2.0', Output(u'gpl-2.0', LicenseSymbol(u'GPL-2.0', is_exception=False))),) ] result_groups = list(group_results_for_with_subexpression(result)) assert expected_groups == result_groups # finally retest it all with tokenize - gpl2plus_with_cpex = LicenseWithExceptionSymbol(license_symbol=gpl2plus, exception_symbol=cpex) gpl2plus_with_someplus = LicenseWithExceptionSymbol(license_symbol=gpl2plus, exception_symbol=mitthing_with_else) - mit_with_mitthing_with_else = LicenseWithExceptionSymbol(license_symbol=mit, exception_symbol=mitthing_with_else) expected = [ - (gpl2plus_with_cpex, 'GPL-2.0 or later with classpath Exception', 1), - (TOKEN_AND, ' and ', 42), - (mit, 'mit', 47), - (TOKEN_AND, ' and ', 50), - (mit_with_mitthing_with_else, 'mit with mitthing with ELSE+', 55), - (TOKEN_OR, ' or ', 83), - (lgpl, 'LGPL 2.1', 87), - (TOKEN_AND, ' and ', 95), - (gpl2plus_with_cpex, 'GPL-2.0 or LATER with Classpath Exception', 100), - (TOKEN_AND, ' and ', 141), - (mitplus, 'mit or later', 146), - (TOKEN_OR, ' or ', 158), - (lgpl, 'LGPL 2.1', 162), - (TOKEN_OR, ' or ', 170), - (mit, 'mit', 174), - (TOKEN_OR, ' or ', 177), - (gpl2plus_with_someplus, 'GPL-2.0 or LATER with mitthing with ELSE+', 181), - (TOKEN_AND, ' and ', 222), - (lgpl, 'lgpl 2.1', 227), - (TOKEN_OR, ' or ', 235), - (gpl2, 'gpl-2.0', 239), + (gpl2plus_with_cpex, + u'GPL-2.0-or-later with classpath-Exception', + 1), + (1, u'and', 43), + (LicenseSymbol(u'mit', is_exception=False), u'mit', 47), + (1, u'and', 51), + (mit_with_mitthing_with_else, + u'mit with mitthing-with-ELSE+', + 55), + (2, u'or', 84), + (LicenseSymbol(u'LGPL-2.1', is_exception=False), u'LGPL-2.1', 87), + (1, u'and', 96), + (gpl2plus_with_cpex, + u'GPL-2.0-or-LATER with Classpath-Exception', + 100), + (1, u'and', 142), + (LicenseSymbol(u'mit-or-later', is_exception=False), u'mit-or-later', 146), + 
(2, u'or', 159), + (LicenseSymbol(u'LGPL-2.1', is_exception=False), u'LGPL-2.1', 162), + (2, u'or', 171), + (LicenseSymbol(u'mit', is_exception=False), u'mit', 174), + (2, u'or', 178), + (gpl2plus_with_someplus, + u'GPL-2.0-or-LATER with mitthing-with-ELSE+', + 181), + (1, u'and', 223), + (LicenseSymbol(u'LGPL-2.1', is_exception=False), u'lgpl-2.1', 227), + (2, u'or', 236), + (gpl2, u'gpl-2.0', 239) ] - assert expected == list(licensing.tokenize(expr)) -class LicensingExpression(TestCase): +class LicensingExpressionTest(TestCase): def test_is_equivalent_with_same_Licensing(self): licensing = Licensing() @@ -1601,27 +1746,27 @@ def test_is_equivalent_with_different_Licensing_and_simple_expression(self): def test_is_equivalent_with_symbols_and_complex_expression(self): licensing_no_sym = Licensing() licensing1 = Licensing([ - 'GPL-2.0 or LATER', - 'classpath Exception', + 'GPL-2.0-or-LATER', + 'classpath-Exception', 'agpl+', 'mit', - 'LGPL 2.1', + 'LGPL-2.1', ]) licensing2 = Licensing([ - 'GPL-2.0 or LATER', - 'classpath Exception', + 'GPL-2.0-or-LATER', + 'classpath-Exception', 'agpl+', 'mit', - 'LGPL 2.1', + 'LGPL-2.1', ]) - parsed1 = licensing1.parse(' ((LGPL 2.1 or mit) and GPL-2.0 or LATER with classpath Exception) and agpl+') - parsed2 = licensing2.parse(' agpl+ and (GPL-2.0 or LATER with classpath Exception and (mit or LGPL 2.1))') + parsed1 = licensing1.parse(' ((LGPL-2.1 or mit) and GPL-2.0-or-LATER with classpath-Exception) and agpl+') + parsed2 = licensing2.parse(' agpl+ and (GPL-2.0-or-LATER with classpath-Exception and (mit or LGPL-2.1))') assert licensing1.is_equivalent(parsed1, parsed2) assert licensing2.is_equivalent(parsed1, parsed2) assert licensing_no_sym.is_equivalent(parsed1, parsed2) - parsed3 = licensing1.parse(' ((LGPL 2.1 or mit) OR GPL-2.0 or LATER with classpath Exception) and agpl+') + parsed3 = licensing1.parse(' ((LGPL-2.1 or mit) OR GPL-2.0-or-LATER with classpath-Exception) and agpl+') assert not licensing1.is_equivalent(parsed1, 
parsed3) assert not licensing2.is_equivalent(parsed1, parsed3) assert not licensing_no_sym.is_equivalent(parsed1, parsed3)