diff --git a/.travis.yml b/.travis.yml index a979755..c00ee3e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,48 @@ -# This deliberately is not "python" as a work-around to support -# multi-os builds with custom Python versions in Travis CI. -language: cpp - -os: - - osx - - linux +language: generic env: matrix: - - PYTHON_EXE="`pyenv install -s 2.7.13 && pyenv local 2.7.13`" - - PYTHON_EXE="`pyenv install -s 3.5.3 && pyenv local 3.5.3`" + - PYTHON_EXE="`pyenv install -s 2.7.14 && pyenv local 2.7.14`" - PYTHON_EXE="`pyenv install -s 3.6.1 && pyenv local 3.6.1`" + +# Travis does not offer OSX with arbitrary python versions (like 2.7.13 above) +# So, you cannot simply have the following section in your build matrix: +# os: +# - linux +# - osx +# Instead, you have to include OSX entries into the build matrix manually. +# In particular, this means specifying the environment variables again. + +# The following was adapted from here: +# https://docs.travis-ci.com/user/multi-os/ +# Set `language: generic` to clear `language: python` from above +# Set `python:` (to empty) to clear it from the travis-ci web interface +# Set `osx_image: xcode7.3` to pin OSX version see here: +# https://docs.travis-ci.com/user/osx-ci-environment/ + +matrix: + include: + - os: osx + language: generic + python: + osx_image: xcode7.3 + env: PYTHON_EXE="`pyenv install -s 2.7.14 && pyenv local 2.7.14`" + - os: osx + language: generic + python: + osx_image: xcode7.3 + env: PYTHON_EXE="`pyenv install -s 3.6.1 && pyenv local 3.6.1`" + + install: - pyenv install --list + - echo $PYTHON_EXE + - python --version - ./configure before_script: - - bin/pip install aboutcode-toolkit - - bin/about-code check --show-all . + - bin/about-code check --verbose . script: - "bin/py.test -vvs" @@ -32,4 +56,4 @@ notifications: use_notice: true skip_join: true template: - - "%{repository_slug}#%{build_number} (%{branch} - %{commit} : %{author}): %{message} : %{build_url}" + - "%{repository_slug}#%{build_number} (%{branch}-%{commit}:%{author})-%{message}- %{build_url}" diff --git a/README.rst b/README.rst index 77f24fb..09587d0 100644 --- a/README.rst +++ b/README.rst @@ -104,6 +104,8 @@ And expression can be simplified: >>> expression2 = ' GPL-2.0 or (mit and LGPL 2.1) or bsd Or GPL-2.0 or (mit and LGPL 2.1)' >>> parsed2 = licensing.parse(expression2) + >>> str(parsed2) + 'GPL-2.0 OR (mit AND LGPL 2.1) OR BSD OR GPL-2.0 OR (mit AND LGPL 2.1)' >>> assert str(parsed2.simplify()) == 'BSD OR GPL-2.0 OR (LGPL 2.1 AND mit)' Two expressions can be compared for equivalence and containment: diff --git a/configure b/configure index 8ceb9d6..4f9fdcc 100755 --- a/configure +++ b/configure @@ -16,17 +16,15 @@ CONF_DEFAULT="etc/conf/dev" CFG_CMD_LINE_ARGS="$@" -if [ "$1" == "--init" ]; then - CFG_CMD_LINE_ARGS=$CONF_INIT -fi - -if [ "$1" == "" ]; then +if [[ "$1" == "" ]]; then # default for dev conf if not argument is provided CFG_CMD_LINE_ARGS=$CONF_DEFAULT fi -if [ "$PYTHON_EXE" == "" ]; then +CONFIGURE_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [[ "$PYTHON_EXE" == "" ]]; then PYTHON_EXE=python fi -$PYTHON_EXE etc/configure.py $CFG_CMD_LINE_ARGS +$PYTHON_EXE "$CONFIGURE_ROOT_DIR/etc/configure.py" $CFG_CMD_LINE_ARGS diff --git a/setup.py b/setup.py index ea2eae5..87a527c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name='license-expression', - version='0.98', + version='0.99', license='apache-2.0', description=desc, long_description=desc, @@ -30,7 +30,7 @@ include_package_data=True, zip_safe=False, 
classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: Apache Software License', 'Intended Audience :: Developers', 'Operating System :: OS Independent', @@ -48,6 +48,6 @@ 'licence' ], install_requires=[ - 'boolean.py >= 3.5, < 4.0.0', + 'boolean.py >= 3.6, < 4.0.0', ] ) diff --git a/src/license_expression/__init__.py b/src/license_expression/__init__.py index 43c7b2c..e37f183 100644 --- a/src/license_expression/__init__.py +++ b/src/license_expression/__init__.py @@ -31,16 +31,10 @@ from __future__ import unicode_literals from __future__ import print_function -# Python 2 and 3 support -try: - # Python 2 - unicode - str = unicode # NOQA -except NameError: - # Python 3 - unicode = str # NOQA - -import collections +from collections import defaultdict +from collections import deque +from collections import namedtuple +from collections import OrderedDict from copy import copy from copy import deepcopy from functools import total_ordering @@ -56,9 +50,11 @@ from boolean.boolean import PARSE_ERRORS from boolean.boolean import PARSE_INVALID_EXPRESSION from boolean.boolean import PARSE_INVALID_NESTING +from boolean.boolean import PARSE_INVALID_OPERATOR_SEQUENCE from boolean.boolean import PARSE_INVALID_SYMBOL_SEQUENCE from boolean.boolean import PARSE_UNBALANCED_CLOSING_PARENS from boolean.boolean import PARSE_UNKNOWN_TOKEN + from boolean.boolean import ParseError from boolean.boolean import TOKEN_SYMBOL from boolean.boolean import TOKEN_AND @@ -66,9 +62,19 @@ from boolean.boolean import TOKEN_LPAR from boolean.boolean import TOKEN_RPAR -from license_expression._pyahocorasick import Trie as Scanner -from license_expression._pyahocorasick import Output -from license_expression._pyahocorasick import Result +from license_expression._pyahocorasick import Trie as AdvancedTokenizer +from license_expression._pyahocorasick import Token + + +# Python 2 and 3 support +try: + # Python 2 + unicode + str = unicode # NOQA +except NameError: + # Python 3 + unicode = str # NOQA + # append new error codes to PARSE_ERRORS by monkey patching PARSE_EXPRESSION_NOT_UNICODE = 100 @@ -98,29 +104,65 @@ class ExpressionError(Exception): # Used for tokenizing -Keyword = collections.namedtuple('Keyword', 'value type') +Keyword = namedtuple('Keyword', 'value type') +Keyword.__len__ = lambda self: len(self.value) # id for "with" token which is not a proper boolean symbol but an expression symbol TOKEN_WITH = 10 -# actual keyword types +# keyword types that include operators and parens + KW_LPAR = Keyword('(', TOKEN_LPAR) KW_RPAR = Keyword(')', TOKEN_RPAR) -_KEYWORDS = [ - Keyword(' and ', TOKEN_AND), - Keyword(' or ', TOKEN_OR), - KW_LPAR, - KW_RPAR, - Keyword(' with ', TOKEN_WITH), -] +KW_AND = Keyword('and', TOKEN_AND) +KW_OR = Keyword('or', TOKEN_OR) +KW_WITH = Keyword('with', TOKEN_WITH) -KEYWORDS = tuple(kw.value for kw in _KEYWORDS) -KEYWORDS_STRIPPED = tuple(k.strip() for k in KEYWORDS) +KEYWORDS = (KW_AND, KW_OR, KW_LPAR, KW_RPAR, KW_WITH,) +KEYWORDS_STRINGS = set(kw.value for kw in KEYWORDS) + +# mapping of lowercase operator strings to an operator object +OPERATORS = {'and': KW_AND, 'or': KW_OR, 'with': KW_WITH} + +_simple_tokenizer = re.compile(''' + (?P[^\s\(\)]+) + | + (?P\s+) + | + (?P\() + | + (?P\)) + ''', + re.VERBOSE | re.MULTILINE | re.UNICODE +).finditer class Licensing(boolean.BooleanAlgebra): """ - Define a mini language to parse, validate and compare license expressions. 
+    Licensing defines a mini language to parse, validate and compare license
+    expressions. This is the main entry point of this library.
+
+    Some of the features are:
+
+    - licenses can be validated against user-provided lists of known license
+      "symbols" (such as ScanCode licenses or the SPDX list).
+
+    - flexible expression parsing and recognition of licenses (including
+      licenses with spaces and keywords (such as AND, OR, WITH) or parens in
+      their names).
+
+    - in an expression, licenses can be more than just identifiers, such as
+      short or long names.
+
+    - A license can have multiple aliases (such as GPLv2 or GPL2) and each will
+      be properly recognized when parsing.
+
+    - expressions can be simplified, normalized, sorted and compared for
+      containment and/or logical equivalence thanks to a built-in boolean logic
+      engine.
+
+    - Once parsed, expressions can be rendered using simple templates (for
+      instance to render HTML links in a GUI).

    For example:
@@ -154,9 +196,9 @@ class Licensing(boolean.BooleanAlgebra):
    def __init__(self, symbols=tuple(), quiet=True):
        """
-        Initialize a Licensing with an optional `symbols` sequence of LicenseSymbol
-        or LicenseSymbol-like objects or license key strings. If provided and this
-        list data is invalid, raise a ValueError.
+        Initialize a Licensing with an optional `symbols` sequence of
+        LicenseSymbol or LicenseSymbol-like objects or license key strings. If
+        provided and this list data is invalid, raise a ValueError.
        """
        super(Licensing, self).__init__(Symbol_class=LicenseSymbol, AND_class=AND, OR_class=OR)
@@ -168,9 +210,11 @@ def __init__(self, symbols=tuple(), quiet=True):
        if symbols:
            symbols = tuple(as_symbols(symbols))
            warns, errors = validate_symbols(symbols)
+
            if warns and not quiet:
                for w in warns:
                    print(w)
+
            if errors and not quiet:
                for e in errors:
                    print(e)
@@ -178,13 +222,14 @@ def __init__(self, symbols=tuple(), quiet=True):
            if errors:
                raise ValueError('\n'.join(warns + errors))

-        # mapping of known symbol used for parsing and resolution as (key, symbol)
-        # TODO: inject lpar, rpar and spaces sourround, before and after
-        # e.g "(sym)" "(sym " "sym)" " sym "
+        # mapping of known symbol key to symbol for reference
        self.known_symbols = {symbol.key: symbol for symbol in symbols}

-        # Aho-Corasick automaton-based Scanner used for expression tokenizing
-        self.scanner = None
+        # mapping of known symbol lowercase key to symbol for reference
+        self.known_symbols_lowercase = {symbol.key.lower(): symbol for symbol in symbols}
+
+        # Aho-Corasick automaton-based Advanced Tokenizer
+        self.advanced_tokenizer = None

    def is_equivalent(self, expression1, expression2, **kwargs):
        """
@@ -331,29 +376,33 @@ def unknown_license_keys(self, expression, unique=True, **kwargs):
        symbols = self.unknown_license_symbols(expression, unique=False, **kwargs)
        return self._keys(symbols, unique)

-    def parse(self, expression, validate=False, strict=False, **kwargs):
+    def parse(self, expression, validate=False, strict=False, simple=False, **kwargs):
        """
-        Return a new license LicenseExpression object by parsing a license expression
-        string. Check that the expression syntax is valid and raise an Exception,
-        ExpressionError or ParseError on errors. Return None for empty expressions.
-        `expression` is either a string or a LicenseExpression object. If this is a
-        LicenseExpression it is retruned as-si.
-
-        Symbols are always recognized from known symbols if `symbols` were provided
-        Licensing creation time: each license and exception is recognized from known
-        license keys (and from aliases for a symbol if available).
-
-        If `validate` is True and a symbol is unknown, an ExpressionError error
+        Return a new license LicenseExpression object by parsing a license
+        `expression` string. Check that the expression syntax is valid and raise
+        an Exception, an ExpressionError or a ParseError on errors.
+        Return None for empty expressions.
+        `expression` is either a string or a LicenseExpression object. If this
+        is a LicenseExpression it is returned as-is.
+        Symbols are always recognized from known symbols if `symbols` were
+        provided at Licensing creation time: each license and exception is
+        recognized from known license keys (and from aliases for a symbol if
+        available).
+
+        If `validate` is True and a license is unknown, an ExpressionError error
        is raised with a message listing the unknown license keys.
-        If `validate` is False, no error is triggered.
+        If `validate` is False, no error is raised. You can call the
+        `unknown_license_keys` or `unknown_license_symbols` methods to get
+        unknown license keys or symbols found in a parsed LicenseExpression.

-        You can call the `unknown_license_keys` or `unknown_license_symbols` methods
-        to get unknown license keys or symbols found in a parsed LicenseExpression.
+        If `strict` is True, additional exceptions will be raised in a
+        "WITH" expression such as "XXX with ZZZ" if the XXX symbol has
+        `is_exception` set to True or the ZZZ symbol has `is_exception` set to
+        False. This checks that symbols are used strictly as constructed.

-        If `strict` is True, additional exceptions will be raised if in a expression
-        such as "XXX with ZZZ" if the XXX symbol has `is_exception` set to True or
-        the YYY symbol has `is_exception` set to False.
+        If `simple` is True, parsing will use a simple tokenizer that assumes
+        that license symbols are all license keys that cannot contain spaces.

        For example:
        >>> expression = 'EPL-1.0 and Apache-1.1 OR GPL-2.0 with Classpath-exception'
@@ -369,7 +418,7 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
        if isinstance(expression, bytes):
            try:
-                expression = unicode(expression)
+                expression = str(expression)
            except:
                ext = type(expression)
                raise ExpressionError('expression must be a string and not: %(ext)r' % locals())
@@ -382,7 +431,7 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
            return
        try:
            # this will raise a ParseError on errors
-            tokens = list(self.tokenize(expression, strict=strict))
+            tokens = list(self.tokenize(expression, strict=strict, simple=simple))
            expression = super(Licensing, self).parse(tokens)
        except TypeError as e:
            msg = 'Invalid expression syntax: ' + repr(e)
@@ -399,7 +448,7 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
        return expression

-    def tokenize(self, expression, strict=False):
+    def tokenize(self, expression, strict=False, simple=False):
        """
        Return an iterable of 3-tuple describing each token given an expression
        unicode string. See boolean.BooleanAlgreba.tokenize() for API details.
        This 3-tuple contains these items: (token, token string, position):
        - token: either a Symbol instance or one of TOKEN_* token types..
        - token string: the original token unicode string.
-        - position: some simple object describing the starting position of the
-          original token string in the `expr` string. It can be an int for a
-          character offset, or a tuple of starting (row/line, column).
+        - position: the starting index of the token string in the `expression` string.

-        If `strict` is True, additional exceptions will be raised in a expression
-        such as "XXX with ZZZ" if the XXX symbol has is_exception` set to True or the
-        ZZZ symbol has `is_exception` set to False.
+        If `strict` is True, additional exceptions will be raised in an
+        expression such as "XXX with ZZZ" if the XXX symbol has `is_exception`
+        set to True or the ZZZ symbol has `is_exception` set to False.
+
+        If `simple` is True, use a simple tokenizer that assumes that license
+        symbols are all license keys that cannot contain spaces.
        """
-        if self.known_symbols:
-            # scan with an automaton, recognize whole symbols+keywords or only keywords
-            scanner = self.get_scanner()
-            results = scanner.scan(expression)
+        if not expression:
+            return
+
+        if not isinstance(expression, str):
+            raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE)
+
+        if simple:
+            tokens = self.simple_tokenizer(expression)
        else:
-            # scan with a simple regex-based splitter
-            results = splitter(expression)
+            advanced_tokenizer = self.get_advanced_tokenizer()
+            tokens = advanced_tokenizer.tokenize(expression)

-        results = strip_and_skip_spaces(results)
-        result_groups = group_results_for_with_subexpression(results)
+        # Assign symbol for unknown tokens
+        tokens = build_symbols_from_unknown_tokens(tokens)

-        for group in result_groups:
-            len_group = len(group)
-            if not len_group:
-                # This should never happen
-                continue
-            if len_group == 1:
-                # a single token
-                result = group[0]
-                pos = result.start
-                token_string = result.string
-                output = result.output
-                if output:
-                    val = output.value
-                    if isinstance(val, Keyword):
-                        # keyword
-                        token = val.type
-                        # WITH is not known from the boolean parser as a proper
-                        # boolean element so we handle validation ourselves: by
-                        # design a single group cannot be a single 'WITH' keyword:
-                        # this is an error that we catch and raise here.
-                        if token == TOKEN_WITH:
-                            raise ParseError(token_type=TOKEN_WITH,
-                                             token_string=result.string,
-                                             position=result.start,
-                                             error_code=PARSE_INVALID_EXPRESSION)
-
-                    elif isinstance(val, LicenseSymbol):
-                        if strict and val.is_exception:
-                            raise ParseError(token_type=TOKEN_SYMBOL,
-                                             token_string=result.string,
-                                             position=result.start,
-                                             error_code=PARSE_INVALID_EXCEPTION)
-
-                        # known symbol: The strict check above handled possible errors before.
- token = val - else: - # this should not be possible by design - raise Exception('Licensing.tokenize is internally confused...') - else: - token = LicenseSymbol(result.string) + # skip whitespace-only tokens + tokens = (t for t in tokens if t.string and t.string.strip()) + + # create atomic LicenseWithExceptionSymbol from WITH subexpressions + tokens = replace_with_subexpression_by_license_symbol(tokens, strict) + # finally yield the actual args expected by the boolean parser + for token in tokens: + pos = token.start + token_string = token.string + token_value = token.value + + if isinstance(token_value, BaseSymbol): + token_obj = token_value + elif isinstance(token_value, Keyword): + token_obj = token_value.type else: - if len_group != 3: - # this should never happen - string = ' '.join([res.string for res in group]) - start = group[0].start - raise ParseError( - TOKEN_SYMBOL, string, start, PARSE_INVALID_EXPRESSION) + raise ParseError(error_code=PARSE_INVALID_EXPRESSION) - # this is a A with B seq of three results - lic_res, WITH , exc_res = group - pos = lic_res.start - WITHs = ' ' + WITH.string.strip() + ' ' - token_string = ''.join([lic_res.string, WITHs, exc_res.string]) + yield token_obj, token_string, pos - # licenses - lic_out = lic_res.output - lic_sym = lic_out and lic_out.value + def get_advanced_tokenizer(self): + """ + Return an AdvancedTokenizer instance either cached or created as needed. + + If symbols were provided when this Licensing object was created, the + tokenizer will recognize known symbol keys and aliases (ignoring case) + when tokenizing expressions. - # this should not happen - if lic_sym and not isinstance(lic_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, lic_res.string, lic_res.start, - PARSE_INVALID_SYMBOL) + A license symbol is any string separated by keywords and parens (and it + can include spaces). + """ + if self.advanced_tokenizer is not None: + return self.advanced_tokenizer + + self.advanced_tokenizer = tokenizer = AdvancedTokenizer() + + add_item = tokenizer.add + for keyword in KEYWORDS: + add_item(keyword.value, keyword) + + # self.known_symbols has been created at Licensing initialization time and is + # already validated and trusted here + for key, symbol in self.known_symbols.items(): + # always use the key even if there are no aliases. + add_item(key, symbol) + aliases = getattr(symbol, 'aliases', []) + for alias in aliases: + # normalize spaces for each alias. The AdvancedTokenizer will lowercase them + if alias: + alias = ' '.join(alias.split()) + add_item(alias, symbol) - if not lic_sym: - lic_sym = LicenseSymbol(lic_res.string, is_exception=False) + tokenizer.make_automaton() + return tokenizer - if not isinstance(lic_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, lic_res.string, lic_res.start, - PARSE_INVALID_SYMBOL) + def advanced_tokenizer(self, expression): + """ + Return an iterable of Token describing each token given an expression + unicode string. + """ + tokenizer = self.get_advanced_tokenizer() + return tokenizer.tokenize(expression) - if strict and lic_sym.is_exception: - raise ParseError(TOKEN_SYMBOL, lic_res.string, lic_res.start, - PARSE_INVALID_EXCEPTION) + def simple_tokenizer(self, expression): + """ + Return an iterable of Token describing each token given an expression + unicode string. - # exception - exc_out = exc_res.output - exc_sym = exc_out and exc_out.value + The split is done on spaces, keywords and parens. Anything else is a + symbol token, e.g. 
a typically license key or license id (that contains + no spaces or parens). - # this should not happen - if exc_sym and not isinstance(exc_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, lic_sym.string, lic_sym.start, - PARSE_INVALID_SYMBOL) - if exc_sym: - exc_sym = copy(exc_sym) + If symbols were provided when this Licensing object was created, the + tokenizer will recognize known symbol keys (ignoring case) when + tokenizing expressions. + """ - if not exc_sym: - exc_sym = LicenseSymbol(exc_res.string) + symbols = self.known_symbols_lowercase or {} - if not isinstance(exc_sym, LicenseSymbol): - raise ParseError(TOKEN_SYMBOL, exc_res.string, exc_res.start, - PARSE_INVALID_SYMBOL) + for match in _simple_tokenizer(expression): + if not match: + continue + # set start and end as string indexes + start, end = match.span() + end = end - 1 + match_getter = match.groupdict().get + + space = match_getter('space') + if space: + yield Token(start, end, space, None) + + lpar = match_getter('lpar') + if lpar: + yield Token(start, end, lpar, KW_LPAR) + + rpar = match_getter('rpar') + if rpar: + yield Token(start, end, rpar, KW_RPAR) + + sym_or_op = match_getter('symop') + if sym_or_op: + sym_or_op_lower = sym_or_op.lower() + + operator = OPERATORS.get(sym_or_op_lower) + if operator: + yield Token(start, end, sym_or_op, operator) + else: + sym = symbols.get(sym_or_op_lower) + if not sym: + sym = LicenseSymbol(key=sym_or_op) + yield Token(start, end, sym_or_op, sym) - if strict and self.known_symbols and not exc_sym.is_exception: - raise ParseError(TOKEN_SYMBOL, exc_res.string, exc_res.start, - PARSE_INVALID_SYMBOL_AS_EXCEPTION) - token = LicenseWithExceptionSymbol(lic_sym, exc_sym, strict) +def build_symbols_from_unknown_tokens(tokens): + """ + Yield Token given a sequence of Token replacing unmatched contiguous Tokens + by a single token with a LicenseSymbol. + """ + tokens = list(tokens) - yield token, token_string, pos + unmatched = deque() - def get_scanner(self): + def build_token_with_symbol(): """ - Return a scanner either cached or created as needed. If symbols were provided - when this Licensing object was created, the scanner will recognize known - symbols when tokenizing expressions. Otherwise, only keywords are recognized - and a license symbol is anything in between keywords. + Build and return a new Token from accumulated unmatched tokens or None. 
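+        (This is a generator: it yields the combined unmatched Token, if any,
+        followed by any trailing whitespace Tokens that were accumulated.)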
""" - if self.scanner is not None: - return self.scanner + if not unmatched: + return + # strip trailing spaces + trailing_spaces = [] + while unmatched and not unmatched[-1].string.strip(): + trailing_spaces.append(unmatched.pop()) + + if unmatched: + string = ' '.join(t.string for t in unmatched if t.string.strip()) + start = unmatched[0].start + end = unmatched[-1].end + toksym = LicenseSymbol(string) + unmatched.clear() + yield Token(start, end, string, toksym) + + for ts in trailing_spaces: + yield ts + + for tok in tokens: + if tok.value: + for symtok in build_token_with_symbol(): + yield symtok + yield tok + else: + if not unmatched and not tok.string.strip(): + # skip leading spaces + yield tok + else: + unmatched.append(tok) - self.scanner = scanner = Scanner(ignore_case=True) + # end remainders + for symtok in build_token_with_symbol(): + yield symtok - for keyword in _KEYWORDS: - scanner.add(keyword.value, keyword, priority=0) - # self.known_symbols has been created at Licensing initialization time and is - # already validated and trusted here - for key, symbol in self.known_symbols.items(): - # always use the key even if there are no aliases. - scanner.add(key, symbol, priority=1) - aliases = getattr(symbol, 'aliases', []) - for alias in aliases: - # normalize spaces for each alias. The Scanner will lowercase them - # since we created it with ignore_case=True - if alias: - alias = ' '.join(alias.split()) - if alias: - scanner.add(alias, symbol, priority=2) +def build_token_groups_for_with_subexpression(tokens): + """ + Yield tuples of Token given a sequence of Token such that: + - all symbol-with-symbol sequences of 3 tokens are grouped in a three-tuple + - other tokens are a single token wrapped in a tuple. + """ + + # if n-1 is sym, n is with and n+1 is sym: yield this as a group for a with + # exp otherwise: yield each single token as a group + + tokens = list(tokens) + + # check three contiguous tokens that may form "lic WITh exception" sequence + triple_len = 3 + + # shortcut if there are no grouping possible + if len(tokens) < triple_len: + for tok in tokens: + yield (tok,) + return + + # accumulate three contiguous tokens + triple = deque() + triple_popleft = triple.popleft + triple_clear = triple.clear + tripple_append = triple.append + + for tok in tokens: + if len(triple) == triple_len: + if is_with_subexpression(triple): + yield tuple(triple) + triple_clear() + else: + prev_tok = triple_popleft() + yield (prev_tok,) + tripple_append(tok) + + # end remainders + if triple: + if len(triple) == triple_len and is_with_subexpression(triple): + yield tuple(triple) + else: + for tok in triple: + yield (tok,) - scanner.make_automaton() - return scanner + +def is_with_subexpression(tokens_tripple): + """ + Return True if a Token tripple is a WITH license sub-expression. + """ + lic, wit, exc = tokens_tripple + return (isinstance(lic.value, LicenseSymbol) + and wit.value == KW_WITH + and isinstance(exc.value, LicenseSymbol) + ) + + +def replace_with_subexpression_by_license_symbol(tokens, strict=False): + """ + Given an iterable of Token, yiled token, replacing any XXX WITH ZZZ + subexpression by a LicenseWithExceptionSymbol symbol. + + Check validity of with subexpessions and raise ParseError as needed. + + If `strict` is True also raise ParseError if the left hand side + LicenseSymbol has is_exception True or if the right hand side + LicenseSymbol has is_exception False. 
+ """ + token_groups = build_token_groups_for_with_subexpression(tokens) + + for token_group in token_groups: + len_group = len(token_group) + + if not len_group: + # This should never happen + continue + + if len_group == 1: + # a single token + token = token_group[0] + tval = token.value + + if isinstance(tval, Keyword): + if tval.type == TOKEN_WITH: + # keyword + # a single group cannot be a single 'WITH' keyword: + # this is an error that we catch and raise here. + raise ParseError( + token_type=TOKEN_WITH, token_string=token.string, + position=token.start, error_code=PARSE_INVALID_EXPRESSION) + + elif isinstance(tval, LicenseSymbol): + if strict and tval.is_exception: + raise ParseError( + token_type=TOKEN_SYMBOL, token_string=token.string, + position=token.start, error_code=PARSE_INVALID_EXCEPTION) + + else: + # this should not be possible by design + raise Exception('Licensing.tokenize is internally confused...:' + repr(tval)) + + yield token + continue + + if len_group != 3: + # this should never happen + string = ' '.join([tok.string for tok in token_group]) + start = token_group[0].start + raise ParseError( + TOKEN_SYMBOL, string, start, PARSE_INVALID_EXPRESSION) + + # from now on we have a tripple of tokens: a WITH sub-expression such as "A with + # B" seq of three tokens + lic_token, WITH , exc_token = token_group + + token_string = ' '.join([ + lic_token.string, + WITH.string.strip(), + exc_token.string + ]) + + # the left hand side license symbol + lic_sym = lic_token.value + + # this should not happen + if not isinstance(lic_sym, LicenseSymbol): + raise ParseError( + TOKEN_SYMBOL, lic_token.string, lic_token.start, + PARSE_INVALID_SYMBOL) + + if strict and lic_sym.is_exception: + raise ParseError( + TOKEN_SYMBOL, lic_token.string, lic_token.start, + PARSE_INVALID_EXCEPTION) + + # the right hand side exception symbol + exc_sym = exc_token.value + + if not isinstance(exc_sym, LicenseSymbol): + raise ParseError( + TOKEN_SYMBOL, lic_sym.string, lic_sym.start, + PARSE_INVALID_SYMBOL) + + if strict and not exc_sym.is_exception: + raise ParseError( + TOKEN_SYMBOL, exc_token.string, exc_token.start, + PARSE_INVALID_SYMBOL_AS_EXCEPTION) + + lic_exc_sym = LicenseWithExceptionSymbol(lic_sym, exc_sym, strict) + + token = Token( + lic_token.start, + exc_token.end, + token_string, + lic_exc_sym, + ) + yield token class Renderable(object): @@ -608,7 +849,7 @@ def __contains__(self, other): is_valid_license_key = re.compile(r'^[-\w\s\.\+]+$', re.UNICODE).match -#FIXME: we need to implement comparison!!!! +# TODO: we need to implement comparison by hand instead @total_ordering class LicenseSymbol(BaseSymbol): """ @@ -623,7 +864,7 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): if not isinstance(key, str): if isinstance(key, bytes): try: - key = unicode(key) + key = str(key) except: raise ExpressionError( 'A license key must be a unicode string: %(key)r' % locals()) @@ -646,7 +887,7 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): # normalize for spaces key = ' '.join(key.split()) - if key.lower() in KEYWORDS_STRIPPED: + if key.lower() in KEYWORDS_STRINGS: raise ExpressionError( 'Invalid license key: a key cannot be a reserved keyword: "or", "and" or "with: "%(key)s"' % locals()) @@ -662,7 +903,7 @@ def __init__(self, key, aliases=tuple(), is_exception=False, *args, **kwargs): def decompose(self): """ - Return an iterable the underlying symbols for this symbol + Return an iterable of the underlying symbols for this symbol. 
""" yield self @@ -698,6 +939,9 @@ def render(self, template='{symbol.key}', *args, **kwargs): def __str__(self): return self.key + def __len__(self): + return len(self.key) + def __repr__(self): cls = self.__class__.__name__ key = self.key @@ -716,12 +960,12 @@ def symbol_like(cls, symbol): return hasattr(symbol, 'key') and hasattr(symbol, 'is_exception') -#FIXME: we need to implement comparison!!!! +# TODO: we need to implement comparison by hand instead @total_ordering class LicenseSymbolLike(LicenseSymbol): """ - A LicenseSymbolLike object wraps a symbol-like object to expose a LicenseSymbol - behavior. + A LicenseSymbolLike object wraps a symbol-like object to expose a + LicenseSymbol behavior. """ def __init__(self, symbol_like, *args, **kwargs): @@ -777,7 +1021,7 @@ def __lt__(self, other): return NotImplemented -#FIXME: we need to implement comparison!!!! +# TODO: we need to implement comparison by hand instead @total_ordering class LicenseWithExceptionSymbol(BaseSymbol): """ @@ -921,6 +1165,8 @@ class AND(RenderableFunction, boolean.AND): """ def __init__(self, *args): + if len(args) < 2: + raise ExpressionError('AND requires two or more licenses as in: MIT AND BSD') super(AND, self).__init__(*args) self.operator = ' AND ' @@ -931,6 +1177,8 @@ class OR(RenderableFunction, boolean.OR): """ def __init__(self, *args): + if len(args) < 2: + raise ExpressionError('OR requires two or more licenses as in: MIT OR BSD') super(OR, self).__init__(*args) self.operator = ' OR ' @@ -949,84 +1197,13 @@ def ordered_unique(seq): return uniques -def strip_and_skip_spaces(results): - """ - Yield results given a sequence of Result skipping whitespace-only results - """ - for result in results: - if result.string.strip(): - yield result - - -def group_results_for_with_subexpression(results): - """ - Yield tuples of (Result) given a sequence of Result such that: - - all symbol-with-symbol subsequences of three results are grouped in a three-tuple - - other results are the single result in a tuple. - """ - - # if n-1 is sym, n is with and n+1 is sym: yield this as a group for a with exp - # otherwise: yield each single result as a group - - results = list(results) - - # check three contiguous result from scanning at a time - triple_len = 3 - - # shortcut if there are no grouping possible - if len(results) < triple_len: - for res in results: - yield (res,) - return - - # accumulate three contiguous results - triple = collections.deque() - triple_popleft = triple.popleft - triple_clear = triple.clear - tripple_append = triple.append - - for res in results: - if len(triple) == triple_len: - if is_with_subexpression(triple): - yield tuple(triple) - triple_clear() - else: - prev_res = triple_popleft() - yield (prev_res,) - tripple_append(res) - - # end remainders - if triple: - if len(triple) == triple_len and is_with_subexpression(triple): - yield tuple(triple) - else: - for res in triple: - yield (res,) - - -def is_symbol(result): - # either the output value is a known sym, or we have no output for unknown sym - return result.output and isinstance(result.output.value, LicenseSymbol) or not result.output - - -def is_with_keyword(result): - return (result.output - and isinstance(result.output.value, Keyword) - and result.output.value.type == TOKEN_WITH) - - -def is_with_subexpression(results): - lic, wit, exc = results - return (is_symbol(lic) and is_with_keyword(wit) and is_symbol(exc)) - - def as_symbols(symbols): """ Return an iterable of LicenseSymbol objects from a sequence of `symbols` or - strings. 
If an item is a string, then create a new LicenseSymbol for it using the - string as key. If this is not a string it must be a LicenseSymbol-like type. It - will raise a TypeError expection if an item is neither a string or LicenseSymbol- - like. + strings. If an item is a string, then create a new LicenseSymbol for it + using the string as key. If this is not a string it must be a LicenseSymbol- + like type. It will raise a TypeError expection if an item is neither a + string or LicenseSymbol- like. """ if symbols: for symbol in symbols: @@ -1034,11 +1211,11 @@ def as_symbols(symbols): continue if isinstance(symbol, bytes): try: - symbol = unicode(symbol) + symbol = str(symbol) except: raise TypeError('%(symbol)r is not a unicode string.' % locals()) - if isinstance(symbol, unicode): + if isinstance(symbol, str): if symbol.strip(): yield LicenseSymbol(symbol) @@ -1053,7 +1230,7 @@ def as_symbols(symbols): 'or a LicenseSymbol-like instance.' % locals()) -def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): +def validate_symbols(symbols, validate_keys=False): """ Return a tuple of (`warnings`, `errors`) given a sequence of `symbols` LicenseSymbol-like objects. @@ -1075,9 +1252,9 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): not_symbol_classes = [] dupe_keys = set() dupe_exceptions = set() - dupe_aliases = collections.defaultdict(list) + dupe_aliases = defaultdict(list) invalid_keys_as_kw = set() - invalid_alias_as_kw = collections.defaultdict(list) + invalid_alias_as_kw = defaultdict(list) # warning warning_dupe_aliases = set() @@ -1096,7 +1273,7 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): dupe_keys.add(key) # key cannot be an expression keyword - if keyl in _keywords: + if keyl in KEYWORDS_STRINGS: invalid_keys_as_kw.add(key) # keep a set of unique seen keys @@ -1129,7 +1306,7 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): dupe_aliases[alias].append(key) # an alias cannot be an expression keyword - if alias in _keywords: + if alias in KEYWORDS_STRINGS: invalid_alias_as_kw[key].append(alias) seen_aliases[alias] = keyl @@ -1169,75 +1346,3 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS): errors.append('Duplicated or empty aliases ignored for license key: %(dupeal)r.' % locals()) return warnings, errors - - -_splitter = re.compile(''' - (?P[^\s\(\)]+) - | - (?P\s+) - | - (?P\() - | - (?P\)) - ''', - re.VERBOSE | re.MULTILINE | re.UNICODE -).finditer - - -def splitter(expression): - """ - Return an iterable of Result describing each token given an - expression unicode string. - - This is a simpler tokenizer used when the Licensing does not have - known symbols. The split is done on spaces and parens. Anything else - is either a token or a symbol. 
- """ - if not expression: - return - - if not isinstance(expression, str): - raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE) - - # mapping of lowercase token strings to a token type id - TOKENS = { - 'and': Keyword(value='and', type=TOKEN_AND), - 'or': Keyword(value='or', type=TOKEN_OR), - 'with': Keyword(value='with', type=TOKEN_WITH), - } - - for match in _splitter(expression): - if not match: - continue - - start, end = match.span() - end = end - 1 - mgd = match.groupdict() - - space = mgd.get('space') - if space: - yield Result(start, end, space, None) - - lpar = mgd.get('lpar') - if lpar: - yield Result(start, end, lpar, Output(lpar, KW_LPAR)) - - rpar = mgd.get('rpar') - if rpar: - yield Result(start, end, rpar, Output(rpar, KW_RPAR)) - - token_or_sym = mgd.get('symbol') - if not token_or_sym: - continue - - token = TOKENS.get(token_or_sym.lower()) - if token: - yield Result(start, end, token_or_sym, Output(token_or_sym, token)) -# elif token_or_sym.endswith('+') and token_or_sym != '+': -# val = token_or_sym[:-1] -# sym = LicenseSymbol(key=val) -# yield Result(start, end - 1, val, Output(val, sym)) -# yield Result(end, end, '+', Output('+', KW_PLUS)) - else: - sym = LicenseSymbol(key=token_or_sym) - yield Result(start, end, token_or_sym, Output(token_or_sym, sym)) diff --git a/src/license_expression/_pyahocorasick.py b/src/license_expression/_pyahocorasick.py index 4c73709..fefe51f 100644 --- a/src/license_expression/_pyahocorasick.py +++ b/src/license_expression/_pyahocorasick.py @@ -6,119 +6,171 @@ WWW : http://0x80.pl License : public domain -Modified for use in the license_expression library and in particular: - - add support for unicode key strinsg. - - rename word to key and output to value (to be more like a mapping/dict) - - case insensitive search +Modified for use in the license_expression library: + - add support for unicode strings. + - case insensitive search using sequence of words and not characters - improve returned results with the actual start,end and matched string. - support returning non-matched parts of a string """ -from __future__ import unicode_literals from __future__ import absolute_import from __future__ import print_function +from __future__ import unicode_literals from collections import deque from collections import OrderedDict import logging +import re + +# Python 2 and 3 support +try: + # Python 2 + unicode + str = unicode # NOQA +except NameError: + # Python 3 + unicode = str # NOQA + +TRACE = False logger = logging.getLogger(__name__) def logger_debug(*args): - return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + pass + -# uncomment for local debug logging -# import sys -# logging.basicConfig(stream=sys.stdout) -# logger.setLevel(logging.DEBUG) +if TRACE: + def logger_debug(*args): + return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + + import sys + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) # used to distinguish from None nil = object() +class TrieNode(object): + """ + Node of the Trie/Aho-Corasick automaton. + """ + __slots__ = ['token', 'output', 'fail', 'children'] + + def __init__(self, token, output=nil): + # token of a tokens string added to the Trie as a string + self.token = token + + # an output function (in the Aho-Corasick meaning) for this node: this + # is an object that contains the original key string and any + # additional value data associated to that key. Or "nil" for a node that + # is not a terminal leave for a key. 
It will be returned with a match. + self.output = output + + # failure link used by the Aho-Corasick automaton and its search procedure + self.fail = nil + + # children of this node as a mapping of char->node + self.children = {} + + def __repr__(self): + if self.output is not nil: + return 'TrieNode(%r, %r)' % (self.token, self.output) + else: + return 'TrieNode(%r)' % self.token + + class Trie(object): """ A Trie and Aho-Corasick automaton. This behaves more or less like a mapping of key->value. This is the main entry point. """ - def __init__(self, ignore_case=True): + def __init__(self): """ Initialize a new Trie. - - If `ignore_case` is True, searches in the Trie will be case insensitive. """ self.root = TrieNode('') - self.ignore_case = ignore_case - # set of any unique character in the trie, updated on each addition - # we keep track of the set of chars added to the trie to build the automaton + # set of any unique tokens in the trie, updated on each addition we keep + # track of the set of tokens added to the trie to build the automaton # these are needed to created the first level children failure links - self._known_chars = set() + self._known_tokens = set() # Flag set to True once a Trie has been converted to an Aho-Corasick automaton self._converted = False - def add(self, key, value=None, priority=0): + def add(self, tokens_string, value=None): """ - Add a new (key string, value) pair to the trie. If the key already exists in - the Trie, its value is replaced with the provided value. - A key is any unicode string. + Add a new tokens_string and its associated value to the trie. If the + tokens_string already exists in the Trie, its value is replaced with the + provided value, typically a Token object. If a value is not provided, + the tokens_string is used as value. + + A tokens_string is any unicode string. It will be tokenized when added + to the Trie. """ if self._converted: raise Exception('This Trie has been converted to an Aho-Corasick ' - 'automaton and cannot be further modified.') - if not key: + 'automaton and cannot be modified.') + + if not tokens_string or not isinstance(tokens_string, str): return - stored_key = self.ignore_case and key.lower() or key + tokens = [t for t in get_tokens(tokens_string) if t.strip()] - # we keep track of the set of chars added to the trie to build the automaton - # these are needed to created the first level children failure links - self._known_chars.update(stored_key) + # we keep track of the set of tokens added to the trie to build the + # automaton these are needed to created the first level children failure + # links + + self._known_tokens.update(tokens) node = self.root - for char in stored_key: + for token in tokens: try: - node = node.children[char] + node = node.children[token] except KeyError: - child = TrieNode(char) - node.children[char] = child + child = TrieNode(token) + node.children[token] = child node = child - # we always store the original key, not a possibly lowercased version - node.output = Output(key, value, priority) + node.output = (tokens_string, value or tokens_string) - def __get_node(self, key): + def __get_node(self, tokens_string): """ - Return a node for this key or None if the trie does not contain the key. - Private function retrieving a final node of trie for given key. + Return a node for this tokens_string or None if the trie does not + contain the tokens_string. Private function retrieving a final node of + the Trie for a given tokens_string. 
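+        Note that the lookup is token-based rather than character-based: the
+        tokens_string is split into tokens first, so extra whitespace between
+        tokens is not significant.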
""" - key = self.ignore_case and key.lower() or key + if not tokens_string or not isinstance(tokens_string, str): + return + + tokens = [t for t in get_tokens(tokens_string) if t.strip()] node = self.root - for char in key: + for token in tokens: try: - node = node.children[char] + node = node.children[token] except KeyError: return None return node - def get(self, key, default=nil): + def get(self, tokens_string, default=nil): """ - Return the Output tuple associated with a `key`. - If there is no such key in the Trie, return the default value (other - than nil): if default is not given or nil, raise a KeyError exception. + Return the output value found associated with a `tokens_string`. If + there is no such tokens_string in the Trie, return the default value + (other than nil). If `default` is not provided or is `nil`, raise a + KeyError. """ - node = self.__get_node(key) + node = self.__get_node(tokens_string) output = nil if node: output = node.output if output is nil: if default is nil: - raise KeyError(key) + raise KeyError(tokens_string) else: return default else: @@ -142,37 +194,36 @@ def items(self): """ items = [] - def walk(node, key): + def walk(node, tokens): """ Walk the trie, depth first. """ - key = key + node.char + tokens = [t for t in tokens + [node.token] if t] if node.output is not nil: - items.append((node.output.key, node.output.value)) + items.append((node.output[0], node.output[1],)) for child in node.children.values(): if child is not node: - walk(child, key) + walk(child, tokens) - walk(self.root, key='') + walk(self.root, tokens=[]) return iter(items) - def exists(self, key): + def exists(self, tokens_string): """ Return True if the key is present in this trie. """ - # TODO: add __contains__ magic for this - node = self.__get_node(key) + node = self.__get_node(tokens_string) if node: return bool(node.output != nil) return False - def is_prefix(self, key): + def is_prefix(self, tokens_string): """ - Return True if key is a prefix of any existing key in the trie. + Return True if tokens_string is a prefix of any existing tokens_string in the trie. """ - return (self.__get_node(key) is not None) + return bool(self.__get_node(tokens_string) is not None) def make_automaton(self): """ @@ -181,45 +232,45 @@ def make_automaton(self): converted to an Automaton. """ queue = deque() - queue_append = queue.append - queue_popleft = queue.popleft # 1. create root children for each known items range (e.g. all unique - # characters from all the added keys), failing to root. + # characters from all the added tokens), failing to root. # And build a queue of these - for char in self._known_chars: - if char in self.root.children: - node = self.root.children[char] + for token in self._known_tokens: + if token in self.root.children: + node = self.root.children[token] # e.g. f(s) = 0, Aho-Corasick-wise node.fail = self.root - queue_append(node) + queue.append(node) else: - self.root.children[char] = self.root + self.root.children[token] = self.root # 2. 
using the queue of all possible top level items/chars, walk the trie and # add failure links to nodes as needed while queue: - current_node = queue_popleft() + current_node = queue.popleft() for node in current_node.children.values(): - queue_append(node) + queue.append(node) state = current_node.fail - while node.char not in state.children: + while node.token not in state.children: state = state.fail - node.fail = state.children.get(node.char, self.root) + node.fail = state.children.get(node.token, self.root) # Mark the trie as converted so it cannot be modified anymore self._converted = True - def iter(self, string): + def iter(self, tokens_string, include_unmatched=False, include_space=False): """ - Yield Result objects for matched strings by performing the Aho-Corasick search procedure. + Yield Token objects for matched strings by performing the Aho-Corasick + search procedure. - The Result start and end positions in the searched string are such that the - matched string is "search_string[start:end+1]". And the start is computed - from the end_index collected by the Aho-Corasick search procedure such that - "start=end_index - n + 1" where n is the length of a matched key. + The Token start and end positions in the searched string are such that + the matched string is "tokens_string[start:end+1]". And the start is + computed from the end_index collected by the Aho-Corasick search + procedure such that + "start=end_index - n + 1" where n is the length of a matched string. - The Result.output is an Output object for a matched key. + The Token.value is an object associated with a matched string. For example: >>> a = Trie() @@ -229,17 +280,14 @@ def iter(self, string): >>> a.add('EFGH') >>> a.add('KL') >>> a.make_automaton() - >>> string = 'abcdefghijklm' - >>> results = Result.sort(a.iter(string)) - + >>> tokens_string = 'a bcdef ghij kl m' + >>> strings = Token.sort(a.iter(tokens_string)) >>> expected = [ - ... Result(1, 5, 'bcdef', Output('BCDEF')), - ... Result(2, 4, 'cde', Output('CDE')), - ... Result(3, 7, 'defgh', Output('DEFGH')), - ... Result(4, 7, 'efgh', Output('EFGH')), - ... Result(10, 11, 'kl', Output('KL')), + ... Token(2, 6, u'bcdef', u'BCDEF'), + ... Token(13, 14, u'kl', u'KL') ... 
] - >>> results == expected + + >>> strings == expected True >>> list(a.iter('')) == [] @@ -248,38 +296,78 @@ def iter(self, string): >>> list(a.iter(' ')) == [] True """ - if not string: + if not tokens_string: return - # keep a copy for results - original_string = string - string = self.ignore_case and string.lower() or string - - known_chars = self._known_chars + tokens = get_tokens(tokens_string) state = self.root - for end, char in enumerate(string): - if char not in known_chars: + + if TRACE: + logger_debug('Trie.iter() with:', repr(tokens_string)) + logger_debug(' tokens:', tokens) + + end_pos = -1 + for token_string in tokens: + end_pos += len(token_string) + if TRACE: + logger_debug() + logger_debug('token_string', repr(token_string)) + logger_debug(' end_pos', end_pos) + + if not include_space and not token_string.strip(): + if TRACE: + logger_debug(' include_space skipped') + continue + + if token_string not in self._known_tokens: state = self.root + if TRACE: + logger_debug(' unmatched') + if include_unmatched: + n = len(token_string) + start_pos = end_pos - n + 1 + tok = Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], None) + if TRACE: + logger_debug(' unmatched tok:', tok) + yield tok continue - # search for a matching character in the children, starting at root - while char not in state.children: + yielded = False + + # search for a matching token_string in the children, starting at root + while token_string not in state.children: state = state.fail - # we have a matching starting character - state = state.children.get(char, self.root) + + # we have a matching starting token_string + state = state.children.get(token_string, self.root) match = state while match is not nil: if match.output is not nil: - # TODO: this could be precomputed or cached - n = len(match.output.key) - start = end - n + 1 - yield Result(start, end, original_string[start:end + 1], match.output) + matched_string, output_value = match.output + if TRACE: + logger_debug(' type output', repr(output_value), type(matched_string)) + n = len(matched_string) + start_pos = end_pos - n + 1 + if TRACE: logger_debug(' start_pos', start_pos) + yield Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], output_value) + yielded = True match = match.fail - - def scan(self, string): - """ - Scan a string for matched and unmatched sub-sequences and yield non- - overlapping Result objects performing a modified Aho-Corasick search + if not yielded and include_unmatched: + if TRACE: + logger_debug(' unmatched but known token') + n = len(token_string) + start_pos = end_pos - n + 1 + tok = Token(start_pos, end_pos, tokens_string[start_pos: end_pos + 1], None) + if TRACE: + logger_debug(' unmatched tok 2:', tok) + yield tok + + logger_debug() + + def tokenize(self, string, include_unmatched=True, include_space=False): + """ + tokenize a string for matched and unmatched sub-sequences and yield non- + overlapping Token objects performing a modified Aho-Corasick search procedure: - return both matched and unmatched sub-sequences. @@ -293,10 +381,8 @@ def scan(self, string): return the non-overlapping portion of the other discarded match as a non-match. - Each Result contains the start and end position, the corresponding string and - an Output object (with original key and any associated associated value). The - string and key are in their original case even if the automaton has the - `ignore_case` attribute. 
+ Each Token contains the start and end position, the corresponding string + and an associated value object. For example: >>> a = Trie() @@ -306,144 +392,175 @@ def scan(self, string): >>> a.add('EFGH') >>> a.add('KL') >>> a.make_automaton() - >>> string = 'abcdefghijkl' - >>> results = list(a.scan(string)) + >>> string = 'a bcdef ghij kl' + >>> tokens = list(a.tokenize(string, include_space=True)) >>> expected = [ - ... Result(start=0, end=0, string='a', output=None), - ... Result(start=1, end=5, string='bcdef', output=Output('BCDEF')), - ... Result(start=6, end=9, string='ghij', output=None), - ... Result(start=10, end=11, string='kl', output=Output('KL')), + ... Token(0, 0, u'a', None), + ... Token(1, 1, u' ', None), + ... Token(2, 6, u'bcdef', u'BCDEF'), + ... Token(7, 7, u' ', None), + ... Token(8, 11, u'ghij', None), + ... Token(12, 12, u' ', None), + ... Token(13, 14, u'kl', u'KL') ... ] - - >>> results == expected + >>> tokens == expected True """ - results = self.iter(string) - results = filter_overlapping(results) - results = add_unmatched(string, results) - return results + tokens = self.iter(string, + include_unmatched=include_unmatched, include_space=include_space) + tokens = list(tokens) + if TRACE: + logger_debug('tokenize.tokens:', tokens) + if not include_space: + tokens = [t for t in tokens if t.string.strip()] + tokens = filter_overlapping(tokens) + return tokens -class TrieNode(object): +def filter_overlapping(tokens): """ - Node of the Trie/Aho-Corasick automaton. - """ - __slots__ = ['char', 'output', 'fail', 'children'] + Return a new list from an iterable of `tokens` discarding contained and + overlaping Tokens using these rules: - def __init__(self, char, output=nil): - # character of a key string added to the Trie - self.char = char - - # an output function (in the Aho-Corasick meaning) for this node: this is an - # Output object that contains the original key string and any additional - # value data associated to that key. Or "nil" for a node that is not a - # terminal leave for a key. It will be returned with a match. - self.output = output - - # failure link used by the Aho-Corasick automaton and its search procedure - self.fail = nil - - # children of this node as a mapping of char->node - self.children = {} - - def __repr__(self): - if self.output is not nil: - return 'TrieNode(%r, %r)' % (self.char, self.output) - else: - return 'TrieNode(%r)' % self.char + - skip a token fully contained in another token. + - keep the biggest, left-most token of two overlapping tokens and skip the other + For example: + >>> tokens = [ + ... Token(0, 0, 'a'), + ... Token(1, 5, 'bcdef'), + ... Token(2, 4, 'cde'), + ... Token(3, 7, 'defgh'), + ... Token(4, 7, 'efgh'), + ... Token(8, 9, 'ij'), + ... Token(10, 13, 'klmn'), + ... Token(11, 15, 'lmnop'), + ... Token(16, 16, 'q'), + ... ] -class Output(object): - """ - An Output is used to track a key added to the Trie as a TrieNode and any - arbitrary value object corresponding to that key. + >>> expected = [ + ... Token(0, 0, 'a'), + ... Token(1, 5, 'bcdef'), + ... Token(8, 9, 'ij'), + ... Token(11, 15, 'lmnop'), + ... Token(16, 16, 'q'), + ... ] - - `key` is the original key unmodified unicode string. - - `value` is the associated value for this key as provided when adding this key. - - `priority` is an optional priority for this key used to disambiguate overalpping matches. 
+ >>> filtered = list(filter_overlapping(tokens)) + >>> filtered == expected + True """ - __slots__ = 'key', 'value', 'priority' - - def __init__(self, key, value=None, priority=0): - self.key = key - self.value = value - self.priority = priority - - def __repr__(self): - return self.__class__.__name__ + '(%(key)r, %(value)r, %(priority)r)' % self.as_dict() + tokens = Token.sort(tokens) - def __eq__(self, other): - return ( - isinstance(other, Output) - and self.key == other.key - and self.value == other.value - and self.priority == other.priority) + # compare pair of tokens in the sorted sequence: current and next + i = 0 + while i < len(tokens) - 1: + j = i + 1 + while j < len(tokens): + curr_tok = tokens[i] + next_tok = tokens[j] + + logger_debug('curr_tok, i, next_tok, j:', curr_tok, i, next_tok, j) + # disjoint tokens: break, there is nothing to do + if next_tok.is_after(curr_tok): + logger_debug(' break to next', curr_tok) + break - def __hash__(self): - return hash((self.key, self.value, self.priority,)) + # contained token: discard the contained token + if next_tok in curr_tok: + logger_debug(' del next_tok contained:', next_tok) + del tokens[j] + continue - def as_dict(self): - return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) + # overlap: Keep the longest token and skip the smallest overlapping + # tokens. In case of length tie: keep the left most + if curr_tok.overlap(next_tok): + if len(curr_tok) >= len(next_tok): + logger_debug(' del next_tok smaller overlap:', next_tok) + del tokens[j] + continue + else: + logger_debug(' del curr_tok smaller overlap:', curr_tok) + del tokens[i] + break + j += 1 + i += 1 + return tokens -class Result(object): +class Token(object): """ - A Result is used to track the result of a search with its start and end as - index position in the original string and other attributes: + A Token is used to track the tokenization an expression with its + start and end as index position in the original string and other attributes: - `start` and `end` are zero-based index in the original string S such that S[start:end+1] will yield `string`. - - `string` is the sub-string from the original searched string for this Result. - - `output` is the Output object for a matched string and is a marker that this is a - matched string. None otherwise for a Result for unmatched text. + - `string` is the matched substring from the original string for this Token. + - `value` is the corresponding object for this token as one of: + - a LicenseSymbol object + - a "Keyword" object (and, or, with, left and right parens) + - None if this is a space. 
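+
+    For example (start and end are inclusive indexes):
+
+    >>> tok = Token(0, 2, 'MIT')
+    >>> len(tok)
+    3
+    >>> Token(4, 6) in Token(0, 6)
+    True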
""" - __slots__ = 'start', 'end', 'string', 'output' + __slots__ = 'start', 'end', 'string', 'value', - def __init__(self, start, end, string='', output=None): + def __init__(self, start, end, string='', value=None): self.start = start self.end = end self.string = string - self.output = output + self.value = value def __repr__(self): - return self.__class__.__name__ + '(%(start)r, %(end)r, %(string)r, %(output)r)' % self.as_dict() + return self.__class__.__name__ + '(%(start)r, %(end)r, %(string)r, %(value)r)' % self.as_dict() def as_dict(self): return OrderedDict([(s, getattr(self, s)) for s in self.__slots__]) def __len__(self): - return self.end + 1 - self.start + return self.end - self.start + 1 def __eq__(self, other): - return isinstance(other, Result) and ( + return isinstance(other, Token) and ( self.start == other.start and self.end == other.end and self.string == other.string and - self.output == other.output + self.value == other.value ) def __hash__(self): - tup = self.start, self.end, self.string, self.output + tup = self.start, self.end, self.string, self.value return hash(tup) - @property - def priority(self): - return getattr(self.output, 'priority', 0) + @classmethod + def sort(cls, tokens): + """ + Return a new sorted sequence of tokens given a sequence of tokens. The + primary sort is on start and the secondary sort is on longer lengths. + Therefore if two tokens have the same start, the longer token will sort + first. + + For example: + >>> tokens = [Token(0, 0), Token(5, 5), Token(1, 1), Token(2, 4), Token(2, 5)] + >>> expected = [Token(0, 0), Token(1, 1), Token(2, 5), Token(2, 4), Token(5, 5)] + >>> expected == Token.sort(tokens) + True + """ + key = lambda s: (s.start, -len(s),) + return sorted(tokens, key=key) def is_after(self, other): """ - Return True if this result is after the other result. + Return True if this token is after the other token. For example: - >>> Result(1, 2).is_after(Result(5, 6)) + >>> Token(1, 2).is_after(Token(5, 6)) False - >>> Result(5, 6).is_after(Result(5, 6)) + >>> Token(5, 6).is_after(Token(5, 6)) False - >>> Result(2, 3).is_after(Result(1, 2)) + >>> Token(2, 3).is_after(Token(1, 2)) False - >>> Result(5, 6).is_after(Result(3, 4)) + >>> Token(5, 6).is_after(Token(3, 4)) True """ return self.start > other.end @@ -453,188 +570,57 @@ def is_before(self, other): def __contains__(self, other): """ - Return True if this result contains the other result. + Return True if this token contains the other token. For example: - >>> Result(5, 7) in Result(5, 7) + >>> Token(5, 7) in Token(5, 7) True - >>> Result(6, 8) in Result(5, 7) + >>> Token(6, 8) in Token(5, 7) False - >>> Result(6, 6) in Result(4, 8) + >>> Token(6, 6) in Token(4, 8) True - >>> Result(3, 9) in Result(4, 8) + >>> Token(3, 9) in Token(4, 8) False - >>> Result(4, 8) in Result(3, 9) + >>> Token(4, 8) in Token(3, 9) True """ return self.start <= other.start and other.end <= self.end def overlap(self, other): """ - Return True if this result and the other result overlap. + Return True if this token and the other token overlap. 
For example:
- >>> Result(1, 2).overlap(Result(5, 6))
+ >>> Token(1, 2).overlap(Token(5, 6))
False
- >>> Result(5, 6).overlap(Result(5, 6))
+ >>> Token(5, 6).overlap(Token(5, 6))
True
- >>> Result(4, 5).overlap(Result(5, 6))
+ >>> Token(4, 5).overlap(Token(5, 6))
True
- >>> Result(4, 5).overlap(Result(5, 7))
+ >>> Token(4, 5).overlap(Token(5, 7))
True
- >>> Result(4, 5).overlap(Result(6, 7))
+ >>> Token(4, 5).overlap(Token(6, 7))
False
"""
start = self.start
end = self.end
return (start <= other.start <= end) or (start <= other.end <= end)

- @classmethod
- def sort(cls, results):
- """
- Return a new sorted sequence of results given a sequence of results. The
- primary sort is on start and the secondary sort is on longer lengths.
- Therefore if two results have the same start, the longer result will sort
- first.
- For example:
- >>> results = [Result(0, 0), Result(5, 5), Result(1, 1), Result(2, 4), Result(2, 5)]
- >>> expected = [Result(0, 0), Result(1, 1), Result(2, 5), Result(2, 4), Result(5, 5)]
- >>> expected == Result.sort(results)
- True
- """
- key = lambda s: (s.start, -len(s),)
- return sorted(results, key=key)

+# tokenize to separate text from parens
+_tokenizer = re.compile('''
+ (?P<token>[^\s\(\)]+)
+ |
+ (?P<space>\s+)
+ |
+ (?P<paren>[\(\)])
+ ''',
+ re.VERBOSE | re.MULTILINE | re.UNICODE
+)

-def filter_overlapping(results):
+def get_tokens(tokens_string):
"""
- Return a new list from an iterable of `results` discarding contained and
- overlaping Results using these rules:
-
- - skip a result fully contained in another result.
- - keep the biggest, left-most result of two overlapping results and skip the other
-
- For example:
- >>> results = [
- ... Result(0, 0, 'a'),
- ... Result(1, 5, 'bcdef'),
- ... Result(2, 4, 'cde'),
- ... Result(3, 7, 'defgh'),
- ... Result(4, 7, 'efgh'),
- ... Result(8, 9, 'ij'),
- ... Result(10, 13, 'klmn'),
- ... Result(11, 15, 'lmnop'),
- ... Result(16, 16, 'q'),
- ... 
] - - >>> filtered = list(filter_overlapping(results)) - >>> filtered == expected - True - """ - results = Result.sort(results) - - # compare pair of results in the sorted sequence: current and next - i = 0 - while i < len(results) - 1: - j = i + 1 - while j < len(results): - curr_res = results[i] - next_res = results[j] - - logger_debug('curr_res, i, next_res, j:', curr_res, i, next_res, j) - # disjoint results: break, there is nothing to do - if next_res.is_after(curr_res): - logger_debug(' break to next', curr_res) - break - - # contained result: discard the contained result - if next_res in curr_res: - logger_debug(' del next_res contained:', next_res) - del results[j] - continue - - # overlap: keep the biggest result and skip the smallest overlapping results - # in case of length tie: keep the left most - if curr_res.overlap(next_res): - if curr_res.priority < next_res.priority: - logger_debug(' del next_res lower priority:', next_res) - del results[j] - continue - elif curr_res.priority > next_res.priority: - logger_debug(' del curr_res lower priority:', curr_res) - del results[i] - break - else: - if len(curr_res) >= len(next_res): - logger_debug(' del next_res smaller overlap:', next_res) - del results[j] - continue - else: - logger_debug(' del curr_res smaller overlap:', curr_res) - del results[i] - break - j += 1 - i += 1 - return results - - -def add_unmatched(string, results): - """ - Yield Result object from the original `string` and the search `results` iterable - of non-overlapping matched substring Result object. New unmatched Results are - added to the stream for unmatched parts. - - For example: - >>> string ='abcdefghijklmn' - >>> results = [ - ... Result(2, 3, 'cd'), - ... Result(7, 7, 'h', None), - ... Result(9, 10, 'jk', None), - ... ] - >>> expected = [ - ... Result(0, 1, 'ab'), - ... Result(2, 3, 'cd'), - ... Result(4, 6, 'efg'), - ... Result(7, 7, 'h'), - ... Result(8, 8, 'i'), - ... Result(9, 10, 'jk'), - ... Result(11, 13, 'lmn') - ... ] - >>> expected == list(add_unmatched(string, results)) - True - - >>> string ='abc2' - >>> results = [ - ... Result(0, 2, 'abc'), - ... ] - >>> expected = [ - ... Result(0, 2, 'abc'), - ... Result(3, 3, '2', None), - ... ] - >>> expected == list(add_unmatched(string, results)) - True - + Return an iterable of strings splitting on spaces and parens. """ - string_pos = 0 - for result in Result.sort(results): - if result.start > string_pos: - start = string_pos - end = result.start - 1 - yield Result(start, end, string[start:end + 1]) - yield result - string_pos = result.end + 1 - - len_string = len(string) - if string_pos < len_string: - start = string_pos - end = len_string - 1 - yield Result(start, end, string[start:end + 1]) + return [match for match in _tokenizer.split(tokens_string.lower()) if match] diff --git a/tests/test__pyahocorasick.py b/tests/test__pyahocorasick.py index 7b346b6..e7ad883 100644 --- a/tests/test__pyahocorasick.py +++ b/tests/test__pyahocorasick.py @@ -6,12 +6,7 @@ WWW : http://0x80.pl License : public domain -Modified for use in the license_expression library and in particular: - - add support for unicode key strinsg. - - rename word to key and output to value (to be more like a mapping/dict) - - case insensitive search - - improve returned results with the actual start,end and matched string. - - support returning non-matched parts of a string +Modified for use in the license_expression library. 
""" from __future__ import unicode_literals @@ -21,35 +16,34 @@ import unittest from license_expression._pyahocorasick import Trie -from license_expression._pyahocorasick import Output -from license_expression._pyahocorasick import Result +from license_expression._pyahocorasick import Token class TestTrie(unittest.TestCase): - def testAddedWordShouldBeCountedAndAvailableForRetrieval(self): + def test_add_can_get(self): t = Trie() t.add('python', 'value') - assert Output('python', 'value') == t.get('python') + assert ('python', 'value') == t.get('python') - def testAddingExistingWordShouldReplaceAssociatedValue(self): + def test_add_existing_WordShouldReplaceAssociatedValue(self): t = Trie() t.add('python', 'value') - assert Output('python', 'value') == t.get('python') + assert ('python', 'value') == t.get('python') t.add('python', 'other') - assert Output('python', 'other') == t.get('python') + assert ('python', 'other') == t.get('python') - def testGetUnknowWordWithoutDefaultValueShouldRaiseException(self): + def test_get_UnknowWordWithoutDefaultValueShouldRaiseException(self): t = Trie() with self.assertRaises(KeyError): t.get('python') - def testGetUnknowWordWithDefaultValueShouldReturnDefault(self): + def test_get_UnknowWordWithDefaultValueShouldReturnDefault(self): t = Trie() self.assertEqual(t.get('python', 'default'), 'default') - def testExistShouldDetectAddedWords(self): + def test_exists_ShouldDetectAddedWords(self): t = Trie() t.add('python', 'value') t.add('ada', 'value') @@ -57,7 +51,7 @@ def testExistShouldDetectAddedWords(self): self.assertTrue(t.exists('python')) self.assertTrue(t.exists('ada')) - def testExistShouldReturnFailOnUnknownWord(self): + def test_exists_ShouldReturnFailOnUnknownWord(self): t = Trie() t.add('python', 'value') @@ -66,20 +60,22 @@ def testExistShouldReturnFailOnUnknownWord(self): def test_is_prefix_ShouldDetecAllPrefixesIncludingWord(self): t = Trie() t.add('python', 'value') - t.add('ada', 'value') + t.add('ada lovelace', 'value') - self.assertTrue(t.is_prefix('a')) - self.assertTrue(t.is_prefix('ad')) + self.assertFalse(t.is_prefix('a')) + self.assertFalse(t.is_prefix('ad')) self.assertTrue(t.is_prefix('ada')) - self.assertTrue(t.is_prefix('p')) - self.assertTrue(t.is_prefix('py')) - self.assertTrue(t.is_prefix('pyt')) - self.assertTrue(t.is_prefix('pyth')) - self.assertTrue(t.is_prefix('pytho')) + self.assertFalse(t.is_prefix('p')) + self.assertFalse(t.is_prefix('py')) + self.assertFalse(t.is_prefix('pyt')) + self.assertFalse(t.is_prefix('pyth')) + self.assertFalse(t.is_prefix('pytho')) self.assertTrue(t.is_prefix('python')) - def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): + self.assertFalse(t.is_prefix('lovelace')) + + def test_items_ShouldReturnAllItemsAlreadyAddedToTheTrie(self): t = Trie() t.add('python', 1) @@ -87,6 +83,7 @@ def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): t.add('perl', 3) t.add('pascal', 4) t.add('php', 5) + t.add('php that', 6) result = list(t.items()) self.assertIn(('python', 1), result) @@ -94,8 +91,9 @@ def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): self.assertIn(('perl', 3), result) self.assertIn(('pascal', 4), result) self.assertIn(('php', 5), result) + self.assertIn(('php that', 6), result) - def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): + def test_keys_ShouldReturnAllKeysAlreadyAddedToTheTrie(self): t = Trie() t.add('python', 1) @@ -103,6 +101,7 @@ def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): t.add('perl', 3) t.add('pascal', 4) t.add('php', 5) + 
t.add('php that', 6)
result = list(t.keys())
self.assertIn('python', result)
@@ -110,8 +109,9 @@ def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self):
self.assertIn('perl', result)
self.assertIn('pascal', result)
self.assertIn('php', result)
+ self.assertIn('php that', result)

- def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self):
+ def test_values_ShouldReturnAllValuesAlreadyAddedToTheTrie(self):
t = Trie()

t.add('python', 1)
@@ -127,36 +127,60 @@ def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self):
self.assertIn(4, result)
self.assertIn(5, result)

- def test_iter_should_not_return_non_matches(self):
+ def test_iter_should_not_return_non_matches_by_default(self):

def get_test_automaton():
- words = "he her hers his she hi him man himan".split()
+ words = 'he her hers his she hi him man himan'.split()
t = Trie()
for w in words:
t.add(w, w)
t.make_automaton()
return t

- test_string = "he she himan"
+ test_string = 'he she himan'
t = get_test_automaton()
result = list(t.iter(test_string))
+ assert 'he she himan'.split() == [r.value for r in result]
+
+ def test_iter_can_return_non_matches_optionally(self):
+
+ def get_test_automaton():
+ words = 'he her hers his she hi him man himan'.split()
+ t = Trie()
+ for w in words:
+ t.add(w, w)
+ t.make_automaton()
+ return t
+
+ test_string = ' he she junk himan other stuffs '
+ # 111111111122222222223333333
+ # 0123456789012345678901234567890123456
+
+ t = get_test_automaton()
+ result = list(t.iter(test_string, include_unmatched=True, include_space=True))
expected = [
- Result(start=0, end=1, string='he', output=Output('he', 'he')),
- Result(start=3, end=5, string='she', output=Output('she', 'she')),
- Result(start=4, end=5, string='he', output=Output('he', 'he')),
- Result(start=7, end=8, string='hi', output=Output('hi', 'hi')),
- Result(start=7, end=9, string='him', output=Output('him', 'him')),
- Result(start=7, end=11, string='himan', output=Output('himan', 'himan')),
- Result(start=9, end=11, string='man', output=Output('man', 'man'))
+ Token(0, 1, u' ', None),
+ Token(2, 3, u'he', u'he'),
+ Token(4, 4, u' ', None),
+ Token(5, 7, u'she', u'she'),
+ Token(8, 8, u' ', None),
+ Token(9, 12, u'junk', None),
+ Token(13, 14, u' ', None),
+ Token(15, 19, u'himan', u'himan'),
+ Token(20, 21, u' ', None),
+ Token(22, 26, u'other', None),
+ Token(27, 27, u' ', None),
+ Token(28, 33, u'stuffs', None),
+ Token(34, 36, u' ', None),
]
assert expected == result

- def test_iter_vs_scan(self):
+ def test_iter_vs_tokenize(self):

def get_test_automaton():
- words = "( AND ) OR".split()
+ words = '( AND ) OR'.split()
t = Trie()
for w in words:
t.add(w, w)
@@ -166,41 +190,38 @@ def get_test_automaton():

test_string = '((l-a + AND l-b) OR (l -c+))'
t = get_test_automaton()
- result = list(t.iter(test_string))
+ result = list(t.iter(test_string, include_unmatched=True, include_space=True))
expected = [
- Result(0, 0, '(', Output('(', '(')),
- Result(1, 1, '(', Output('(', '(')),
- Result(8, 10, 'AND', Output('AND', 'AND')),
- Result(15, 15, ')', Output(')', ')')),
- Result(17, 18, 'OR', Output('OR', 'OR')),
- Result(20, 20, '(', Output('(', '(')),
- Result(26, 26, ')', Output(')', ')')),
- Result(27, 27, ')', Output(')', ')'))
+ Token(0, 0, u'(', u'('),
+ Token(1, 1, u'(', u'('),
+ Token(2, 4, u'l-a', None),
+ Token(5, 5, u' ', None),
+ Token(6, 6, u'+', None),
+ Token(7, 7, u' ', None),
+ Token(8, 10, u'AND', u'AND'),
+ Token(11, 11, u' ', None),
+ Token(12, 14, u'l-b', None),
+ Token(15, 15, u')', u')'),
+ Token(16, 16, u' ', 
None), + Token(17, 18, u'OR', u'OR'), + Token(19, 19, u' ', None), + Token(20, 20, u'(', u'('), + Token(21, 21, u'l', None), + Token(22, 22, u' ', None), + Token(23, 25, u'-c+', None), + Token(26, 26, u')', u')'), + Token(27, 27, u')', u')') ] + assert expected == result - result = list(t.scan(test_string)) - expected = [ - Result(0, 0, '(', Output('(', '(')), - Result(1, 1, '(', Output('(', '(')), - Result(2, 7, 'l-a + ', None), - Result(8, 10, 'AND', Output('AND', 'AND')), - Result(11, 14, ' l-b', None), - Result(15, 15, ')', Output(')', ')')), - Result(16, 16, ' ', None), - Result(17, 18, 'OR', Output('OR', 'OR')), - Result(19, 19, ' ', None), - Result(20, 20, '(', Output('(', '(')), - Result(21, 25, 'l -c+', None), - Result(26, 26, ')', Output(')', ')')), - Result(27, 27, ')', Output(')', ')')) - ] + result = list(t.tokenize(test_string, include_unmatched=True, include_space=True)) assert expected == result - def test_scan_with_unmatched(self): + def test_tokenize_with_unmatched_and_space(self): def get_test_automaton(): - words = "( AND ) OR".split() + words = '( AND ) OR'.split() t = Trie() for w in words: t.add(w, w) @@ -208,18 +229,44 @@ def get_test_automaton(): return t test_string = '((l-a + AND l-b) OR an (l -c+))' - + # 111111111122222222223 + # 0123456789012345678901234567890 t = get_test_automaton() - result = list(t.scan(test_string)) - assert test_string == ''.join(r.string for r in result) + result = list(t.tokenize(test_string, include_unmatched=True, include_space=True)) + expected = [ + Token(0, 0, u'(', u'('), + Token(1, 1, u'(', u'('), + Token(2, 4, u'l-a', None), + Token(5, 5, u' ', None), + Token(6, 6, u'+', None), + Token(7, 7, u' ', None), + Token(8, 10, u'AND', u'AND'), + Token(11, 11, u' ', None), + Token(12, 14, u'l-b', None), + Token(15, 15, u')', u')'), + Token(16, 16, u' ', None), + Token(17, 18, u'OR', u'OR'), + Token(19, 19, u' ', None), + Token(20, 21, u'an', None), + Token(22, 22, u' ', None), + Token(23, 23, u'(', u'('), + Token(24, 24, u'l', None), + Token(25, 25, u' ', None), + Token(26, 28, u'-c+', None), + Token(29, 29, u')', u')'), + Token(30, 30, u')', u')') + ] + + assert expected == result + assert test_string == ''.join(t.string for t in result) def test_iter_with_unmatched_simple(self): t = Trie() - t.add('AND', 'AND') + t.add('And', 'And') t.make_automaton() - test_string = 'AND an a and' + test_string = 'AND an a And' result = list(t.iter(test_string)) - assert 'ANDand' == ''.join(r.string for r in result) + assert ['And', 'And'] == [r.value for r in result] def test_iter_with_unmatched_simple2(self): t = Trie() @@ -227,5 +274,49 @@ def test_iter_with_unmatched_simple2(self): t.make_automaton() test_string = 'AND an a and' result = list(t.iter(test_string)) - assert 'ANDand' == ''.join(r.string for r in result) + assert ['AND', 'AND'] == [r.value for r in result] + + def test_iter_with_unmatched_simple3(self): + t = Trie() + t.add('AND', 'AND') + t.make_automaton() + test_string = 'AND an a andersom' + result = list(t.iter(test_string)) + assert ['AND'] == [r.value for r in result] + def test_iter_simple(self): + t = Trie() + t.add('AND', 'AND') + t.add('OR', 'OR') + t.add('WITH', 'WITH') + t.add('(', '(') + t.add(')', ')') + t.add('GPL-2.0', 'GPL-2.0') + t.add('mit', 'MIT') + t.add('Classpath', 'Classpath') + t.make_automaton() + test_string = '(GPL-2.0 with Classpath) or (gpl-2.0) and (classpath or gpl-2.0 OR mit) ' + # 111111111122222222223333333333444444444455555555556666666666777 + # 
0123456789012345678901234567890123456789012345678901234567890123456789012 + result = list(t.iter(test_string)) + expected = [ + Token(0, 0, u'(', u'('), + Token(1, 7, u'GPL-2.0', u'GPL-2.0'), + Token(9, 12, u'with', u'WITH'), + Token(14, 22, u'Classpath', u'Classpath'), + Token(23, 23, u')', u')'), + Token(25, 26, u'or', u'OR'), + Token(28, 28, u'(', u'('), + Token(29, 35, u'gpl-2.0', u'GPL-2.0'), + Token(36, 36, u')', u')'), + Token(38, 40, u'and', u'AND'), + Token(42, 42, u'(', u'('), + Token(43, 51, u'classpath', u'Classpath'), + Token(53, 54, u'or', u'OR'), + Token(57, 63, u'gpl-2.0', u'GPL-2.0'), + Token(65, 66, u'OR', u'OR'), + Token(68, 70, u'mit', u'MIT'), + Token(71, 71, u')', u')') + ] + + assert expected == result diff --git a/tests/test_license_expression.py b/tests/test_license_expression.py index 58312c4..6ceed6c 100644 --- a/tests/test_license_expression.py +++ b/tests/test_license_expression.py @@ -17,8 +17,10 @@ from __future__ import print_function from __future__ import unicode_literals +from collections import namedtuple from collections import OrderedDict from unittest import TestCase +from unittest.case import expectedFailure import sys from boolean.boolean import PARSE_UNBALANCED_CLOSING_PARENS @@ -28,6 +30,7 @@ from license_expression import PARSE_INVALID_NESTING from license_expression import PARSE_INVALID_EXCEPTION from license_expression import PARSE_INVALID_SYMBOL_AS_EXCEPTION +from license_expression import PARSE_INVALID_OPERATOR_SEQUENCE from license_expression import ExpressionError from license_expression import Keyword @@ -37,12 +40,9 @@ from license_expression import LicenseSymbolLike from license_expression import LicenseWithExceptionSymbol from license_expression import ParseError -from license_expression import Result -from license_expression import Output +from license_expression import Token -from license_expression import group_results_for_with_subexpression -from license_expression import splitter -from license_expression import strip_and_skip_spaces +from license_expression import build_token_groups_for_with_subexpression from license_expression import validate_symbols from license_expression import TOKEN_AND @@ -150,9 +150,9 @@ def test_tokenize_plain4(self): expected = [ (TOKEN_LPAR, '(', 0), (TOKEN_LPAR, '(', 1), - (LicenseSymbol(key=u'l-a+'), u'l-a+', 2), + (LicenseSymbol(key=u'l-a+'), 'l-a+', 2), (TOKEN_AND, 'AND', 7), - (LicenseSymbol(key=u'l-b'), u'l-b', 11), + (LicenseSymbol(key=u'l-b'), 'l-b', 11), (TOKEN_RPAR, ')', 14), (TOKEN_OR, 'OR', 16), (TOKEN_LPAR, '(', 19), @@ -198,27 +198,46 @@ def get_symbols_and_licensing(self): licensing = Licensing(symbols) return gpl_20, gpl_20_plus, lgpl_21, mit, licensing - def test_tokenize_1(self): + def test_tokenize_1_with_symbols(self): gpl_20, _gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() - result = licensing.tokenize('The GNU GPL 20 or LGPL-2.1 and mit') + + result = licensing.tokenize('The GNU GPL 20 or LGPL v2.1 AND MIT license ') + # 111111111122222222223333333333444 + # 0123456789012345678901234567890123456789012 + expected = [ (gpl_20, 'The GNU GPL 20', 0), - (TOKEN_OR, ' or ', 14), - (lgpl_21, 'LGPL-2.1', 18), - (TOKEN_AND, ' and ', 26), - (mit, 'mit', 31)] + (TOKEN_OR, 'or', 15), + (lgpl_21, 'LGPL v2.1', 18), + (TOKEN_AND, 'AND', 28), + (mit, 'MIT license', 32) + ] + assert expected == list(result) + + def test_tokenize_1_no_symbols(self): + licensing = Licensing() + + result = licensing.tokenize('The GNU GPL 20 or LGPL v2.1 AND MIT license') + + expected = [ + 
(LicenseSymbol(u'The GNU GPL 20'), 'The GNU GPL 20', 0), + (TOKEN_OR, 'or', 15), + (LicenseSymbol(u'LGPL v2.1'), 'LGPL v2.1', 18), + (TOKEN_AND, 'AND', 28), + (LicenseSymbol(u'MIT license'), 'MIT license', 32) + ] + assert expected == list(result) def test_tokenize_with_trailing_unknown(self): - gpl_20, _gpl_20_plus, lgpl_21, mit, licensing = self.get_symbols_and_licensing() + gpl_20, _gpl_20_plus, lgpl_21, _mit, licensing = self.get_symbols_and_licensing() result = licensing.tokenize('The GNU GPL 20 or LGPL-2.1 and mit2') expected = [ (gpl_20, 'The GNU GPL 20', 0), - (TOKEN_OR, ' or ', 14), + (TOKEN_OR, 'or', 15), (lgpl_21, 'LGPL-2.1', 18), - (TOKEN_AND, ' and ', 26), - (mit, 'mit', 31), - (LicenseSymbol(key='2'), '2', 34) + (TOKEN_AND, 'and', 27), + (LicenseSymbol(key='mit2'), 'mit2', 31), ] assert expected == list(result) @@ -228,14 +247,15 @@ def test_tokenize_3(self): result = licensing.tokenize('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit') expected = [ (gpl_20_plus, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl_21, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), - (gpl_20, 'The GNU GPL 20', 49), (2, ' or ', 63), + (TOKEN_OR, 'or', 46), + (gpl_20, 'The GNU GPL 20', 49), + (2, 'or', 64), (mit, 'mit', 67) ] assert expected == list(result) @@ -245,8 +265,41 @@ def test_tokenize_unknown_as_trailing_single_attached_character(self): l = Licensing(symbols) result = list(l.tokenize('mit2')) expected = [ - (LicenseSymbol(key='MIT', aliases=('MIT license',)), 'mit', 0), - (LicenseSymbol(key='2'), '2', 3), + (LicenseSymbol(u'mit2'), 'mit2', 0), + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_leading(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 AND gpl-2.0-plus', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseSymbol(key='gpl-2.0'), + TOKEN_AND, + LicenseSymbol(key='gpl-2.0-plus'), + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_contained(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 WITH exception-gpl-2.0-plus', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseWithExceptionSymbol( + LicenseSymbol(u'gpl-2.0'), + LicenseSymbol(u'exception-gpl-2.0-plus') + ) + ] + assert expected == result + + def test_tokenize_with_unknown_symbol_containing_known_symbol_trailing(self): + l = Licensing(['gpl-2.0']) + result = list(l.tokenize('gpl-2.0 AND exception-gpl-2.0', strict=False)) + result = [s for s, _, _ in result] + expected = [ + LicenseSymbol(u'gpl-2.0'), + TOKEN_AND, + LicenseSymbol(u'exception-gpl-2.0') ] assert expected == result @@ -270,7 +323,12 @@ def test_parse_raise_ParseError(self): licensing.parse(expression) self.fail('ParseError should be raised') except ParseError as pe: - expected = {'error_code': PARSE_UNBALANCED_CLOSING_PARENS, 'position': 48, 'token_string': ')', 'token_type': TOKEN_RPAR} + expected = { + 'error_code': PARSE_UNBALANCED_CLOSING_PARENS, + 'position': 48, + 'token_string': ')', + 'token_type': TOKEN_RPAR + } assert expected == _parse_error_as_dict(pe) def test_parse_raise_ExpressionError_when_validating(self): @@ -278,31 +336,65 @@ def test_parse_raise_ExpressionError_when_validating(self): licensing = Licensing() try: licensing.parse(expression, validate=True) + self.fail('Exception not 
raised')
except ExpressionError as ee:
assert 'Unknown license key(s): gpl, bsd, lgpl, exception' == str(ee)

- def test_parse_raise_ExpressionError_when_validating_strict(self):
+ def test_parse_raise_ParseError_when_validating_strict(self):
+ expression = 'gpl and bsd or lgpl with exception'
+ licensing = Licensing()
+ try:
+ licensing.parse(expression, validate=True, strict=True)
+ self.fail('Exception not raised')
+ except ParseError as pe:
+ expected = {
+ 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION,
+ 'position': 25,
+ 'token_string': 'exception',
+ 'token_type': TOKEN_SYMBOL
+ }
+ assert expected == _parse_error_as_dict(pe)
+
+ def test_parse_raise_ParseError_when_strict_no_validate(self):
expression = 'gpl and bsd or lgpl with exception'
licensing = Licensing()
+ try:
+ licensing.parse(expression, validate=False, strict=True)
+ self.fail('Exception not raised')
+ except ParseError as pe:
+ expected = {
+ 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION,
+ 'position': 25,
+ 'token_string': 'exception',
+ 'token_type': TOKEN_SYMBOL
+ }
+ assert expected == _parse_error_as_dict(pe)
+
+ def test_parse_raise_ExpressionError_when_validating_strict_with_unknown(self):
+ expression = 'gpl and bsd or lgpl with exception'
+ licensing = Licensing(symbols=[LicenseSymbol('exception', is_exception=True)])
try:
licensing.parse(expression, validate=True, strict=True)
except ExpressionError as ee:
- assert str(ee).startswith('exception_symbol must be an exception with "is_exception" set to True:')
+ assert 'Unknown license key(s): gpl, bsd, lgpl' == str(ee)

def test_parse_in_strict_mode_for_solo_symbol(self):
expression = 'lgpl'
licensing = Licensing()
licensing.parse(expression, strict=True)

- def test_parse_invalid_expression_raise_expression(self):
+ def test_parse_invalid_expression_raise_exception(self):
licensing = Licensing()
-
expr = 'wrong'
licensing.parse(expr)

+ def test_parse_not_invalid_expression_raise_no_exception(self):
+ licensing = Licensing()
expr = 'l-a AND none'
licensing.parse(expr)

+ def test_parse_invalid_expression_raise_exception3(self):
+ licensing = Licensing()
expr = '(l-a + AND l-b'
try:
licensing.parse(expr)
@@ -310,6 +402,8 @@ def test_parse_invalid_expression_raise_expression(self):
except ParseError:
pass

+ def test_parse_invalid_expression_raise_exception4(self):
+ licensing = Licensing()
expr = '(l-a + AND l-b))'
try:
licensing.parse(expr)
@@ -317,20 +411,33 @@
except ParseError:
pass

+ def test_parse_invalid_expression_raise_exception5(self):
+ licensing = Licensing()
expr = 'l-a AND'
try:
licensing.parse(expr)
self.fail("Exception not raised when validating '%s'" % expr)
- except ParseError:
- pass
+ except ExpressionError as ee:
+ assert 'AND requires two or more licenses as in: MIT AND BSD' == str(ee)

+ def test_parse_invalid_expression_raise_exception6(self):
+ licensing = Licensing()
expr = 'OR l-a'
try:
licensing.parse(expr)
self.fail("Exception not raised when validating '%s'" % expr)
- except ParseError:
- pass
+ except ParseError as pe:
+ expected = {
+ 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE,
+ 'position': 0,
+ 'token_string': 'OR',
+ 'token_type': TOKEN_OR
+ }
+ assert expected == _parse_error_as_dict(pe)

+ def test_parse_not_invalid_expression_raise_no_exception2(self):
+ licensing = Licensing()
expr = '+l-a'
licensing.parse(expr)

@@ -355,7 +462,12 @@ def test_parse_errors_catch_invalid_nesting(self):
licensing.parse('mit (and LGPL 2.1)') 
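+ # the '(' right after the 'mit' symbol is the invalid nesting
+ # reported at position 4 in the expected error below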
self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_NESTING, 'position': 4, 'token_string': '(', 'token_type': TOKEN_LPAR} + expected = { + 'error_code': PARSE_INVALID_NESTING, + 'position': 4, + 'token_string': '(', + 'token_type': TOKEN_LPAR + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_expression_with_bare_and(self): @@ -364,7 +476,12 @@ def test_parse_errors_catch_invalid_expression_with_bare_and(self): licensing.parse('and') self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_EXPRESSION, 'position':-1, 'token_string': '', 'token_type': None} + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 0, + 'token_string': 'and', + 'token_type': TOKEN_AND + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_expression_with_or_and_no_other(self): @@ -373,7 +490,12 @@ def test_parse_errors_catch_invalid_expression_with_or_and_no_other(self): licensing.parse('or that') self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_EXPRESSION, 'position':-1, 'token_string': '', 'token_type': None} + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 0, + 'token_string': 'or', + 'token_type': TOKEN_OR + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_expression_with_empty_parens(self): @@ -382,7 +504,12 @@ def test_parse_errors_catch_invalid_expression_with_empty_parens(self): licensing.parse('with ( )this') self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_EXPRESSION, 'position': 0, 'token_string': 'with', 'token_type': TOKEN_WITH} + expected = { + 'error_code': PARSE_INVALID_EXPRESSION, + 'position': 0, + 'token_string': 'with', + 'token_type': TOKEN_WITH + } assert expected == _parse_error_as_dict(pe) def test_parse_errors_catch_invalid_non_unicode_byte_strings_on_python3(self): @@ -580,15 +707,11 @@ def test_create_from_python(self): ) assert a == b - def test_parse_with_repeated_or_later_raise_parse_error(self): + def test_parse_with_repeated_or_later_does_not_raise_parse_error(self): l = Licensing() expr = 'LGPL2.1+ + and mit' - try: - l.parse(expr) - self.fail('Exception not raised') - except ParseError as ee: - expected = 'Invalid symbols sequence such as (A B) for token: "+" at position: 9' - assert expected == str(ee) + parsed = l.parse(expr) + assert 'LGPL2.1+ + AND mit' == str(parsed) def test_render_complex(self): licensing = Licensing() @@ -651,18 +774,6 @@ def test_parse_complex2(self): expected = 'GPL-2.0 OR (LGPL-2.1 AND mit)' assert expected == expr.render('{symbol.key}') - def test_Licensing_can_scan_valid_expressions_with_symbols_that_contain_and_with_or(self): - licensing = Licensing() - expression = 'orgpl or withbsd with orclasspath and andmit or andlgpl and ormit or withme' - result = [r.string for r in licensing.get_scanner().scan(expression)] - expected = [ - 'orgpl', ' or ', 'withbsd', ' with ', 'orclasspath', - ' and ', 'andmit', ' or ', 'andlgpl', ' and ', 'ormit', - ' or ', 'withme' - ] - - assert expected == result - def test_Licensing_can_tokenize_valid_expressions_with_symbols_that_contain_and_with_or(self): licensing = Licensing() expression = 'orgpl or withbsd with orclasspath and andmit or anlgpl and ormit or withme' @@ -687,6 +798,40 @@ def test_Licensing_can_tokenize_valid_expressions_with_symbols_that_contain_and_ assert 
expected == result + def test_Licensing_can_simple_tokenize_valid_expressions_with_symbols_that_contain_and_with_or(self): + licensing = Licensing() + expression = 'orgpl or withbsd with orclasspath and andmit or andlgpl and ormit or withme' + + result = [r.string for r in licensing.simple_tokenizer(expression)] + expected = [ + 'orgpl', + ' ', + 'or', + ' ', + 'withbsd', + ' ', + 'with', + ' ', + 'orclasspath', + ' ', + 'and', + ' ', + 'andmit', + ' ', + 'or', + ' ', + 'andlgpl', + ' ', + 'and', + ' ', + 'ormit', + ' ', + 'or', + ' ', + 'withme' + ] + assert expected == result + def test_Licensing_can_parse_valid_expressions_with_symbols_that_contain_and_with_or(self): licensing = Licensing() expression = 'orgpl or withbsd with orclasspath and andmit or anlgpl and ormit or withme' @@ -695,24 +840,97 @@ def test_Licensing_can_parse_valid_expressions_with_symbols_that_contain_and_wit expected = 'orgpl OR (withbsd WITH orclasspath AND andmit) OR (anlgpl AND ormit) OR withme' assert expected == result.render('{symbol.key}') + def test_Licensing_can_parse_valid_expressions_with_symbols_that_contain_spaces(self): + licensing = Licensing() + expression = ' GPL-2.0 or (mit and LGPL 2.1) or bsd Or GPL-2.0 or (mit and LGPL 2.1)' + parsed = licensing.parse(expression) + expected = 'GPL-2.0 OR (mit AND LGPL 2.1) OR bsd OR GPL-2.0 OR (mit AND LGPL 2.1)' + assert expected == str(parsed) -class LicensingParseWithSymbolsSimpleTest(TestCase): + def test_parse_invalid_expression_with_trailing_or(self): + licensing = Licensing() + expr = 'mit or' + try: + licensing.parse(expr) + self.fail("Exception not raised when validating '%s'" % expr) + except ExpressionError as ee: + assert 'OR requires two or more licenses as in: MIT OR BSD' == str(ee) + + def test_parse_invalid_expression_with_trailing_or_and_valid_start_does_not_raise_exception(self): + licensing = Licensing() + expression = ' mit or mit or ' + parsed = licensing.parse(expression) + # ExpressionError: OR requires two or more licenses as in: MIT OR BSD + expected = 'mit OR mit' + assert expected == str(parsed) + + def test_parse_invalid_expression_with_repeated_trailing_or_raise_exception(self): + licensing = Licensing() + expression = 'mit or mit or or' + try: + licensing.parse(expression, simple=False) + self.fail('Exception not raised') + except ParseError as pe: + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 14, + 'token_string': 'or', + 'token_type': TOKEN_OR + } + assert expected == _parse_error_as_dict(pe) - def test_Licensing_with_illegal_symbols_raise_Exception(self): + @expectedFailure + def test_parse_invalid_expression_with_single_trailing_or_raise_exception(self): + licensing = Licensing() + expression = 'mit or mit or' try: - Licensing([ - 'GPL-2.0 or LATER', - 'classpath Exception', - 'something with else+', - 'mit', - 'LGPL 2.1', - 'mit or later' - ]) + licensing.parse(expression, simple=False) + self.fail('Exception not raised') + except ParseError as pe: + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 14, + 'token_string': 'or', + 'token_type': TOKEN_OR + } + assert expected == _parse_error_as_dict(pe) + + def test_parse_invalid_expression_with_single_trailing_and_raise_exception(self): + licensing = Licensing() + expression = 'mit or mit and' + try: + licensing.parse(expression, simple=False) + self.fail('Exception not raised') except ExpressionError as ee: - expected = ('Invalid license key: "or later" words are reserved and ' - 'cannot be used in a key: "GPL-2.0 or 
LATER"') + assert 'AND requires two or more licenses as in: MIT AND BSD' == str(ee) + + def test_parse_invalid_expression_with_single_leading_or_raise_exception(self): + licensing = Licensing() + expression = 'or mit or mit' + try: + licensing.parse(expression, simple=False) + self.fail('Exception not raised') + except ParseError as pe: + expected = { + 'error_code': PARSE_INVALID_OPERATOR_SEQUENCE, + 'position': 0, + 'token_string': 'or', + 'token_type': TOKEN_OR + } + assert expected == _parse_error_as_dict(pe) - assert expected == str(ee) + +class LicensingParseWithSymbolsSimpleTest(TestCase): + + def test_Licensing_with_overlapping_symbols_with_keywords_does_not_raise_Exception(self): + Licensing([ + 'GPL-2.0 or LATER', + 'classpath Exception', + 'something with else+', + 'mit', + 'LGPL 2.1', + 'mit or later' + ]) def get_syms_and_licensing(self): a = LicenseSymbol('l-a') @@ -881,6 +1099,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_non_exception_symbol expression = 'gpl and bsd or lgpl with exception' try: licensing.parse(expression, validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION, @@ -895,6 +1114,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in licensing.parse('gpl with exception', validate=True, strict=True) try: licensing.parse('exception with gpl', validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_EXCEPTION, @@ -905,6 +1125,7 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in try: licensing.parse('gpl with gpl', validate=True, strict=True) + self.fail('Exception not raised') except ParseError as pe: expected = { 'error_code': PARSE_INVALID_SYMBOL_AS_EXCEPTION, @@ -913,6 +1134,31 @@ def test_parse_raise_ParseError_when_validating_strict_with_exception_symbols_in 'token_type': TOKEN_SYMBOL} assert expected == _parse_error_as_dict(pe) + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_with(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus WITH openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_and(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus AND openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_unknown_symbol_string_contained_in_known_symbol_does_not_crash_or(self): + l = Licensing(['lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus OR openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_known_symbol_string_contained_in_known_symbol_does_not_crash_or(self): + l = Licensing(['lgpl-3.0-plus', 'openssl-exception-lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus OR openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + + def test_with_known_symbol_string_contained_in_known_symbol_does_not_crash_with(self): + l = Licensing(['lgpl-3.0-plus', 'openssl-exception-lgpl-3.0-plus']) + license_expression = 'lgpl-3.0-plus WITH openssl-exception-lgpl-3.0-plus' + l.parse(license_expression) + class LicensingSymbolsReplacement(TestCase): @@ -1011,14 +1257,18 @@ def get_symbols_and_licensing(self): licensing = Licensing(symbols) return gpl2, gpl2plus, lgpl, mit, mitand2, licensing - def test_parse_trailing_char_raise_exception(self): + def 
test_parse_trailing_char_does_not_raise_exception_without_validate(self): + _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() + e = licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2', validate=False) + assert 'gpl-2.0 OR (LGPL-2.1 AND mit2)' == str(e) + + def test_parse_trailing_char_raise_exception_with_validate(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() try: - licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2') - except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 34, - 'token_string': '2', 'token_type': LicenseSymbol('2')} - assert expected == _parse_error_as_dict(pe) + licensing.parse('The GNU GPL 20 or LGPL-2.1 and mit2', validate=True) + self.fail('Exception not raised') + except ExpressionError as ee: + assert 'Unknown license key(s): mit2' == str(ee) def test_parse_expression_with_trailing_unknown_should_raise_exception(self): gpl2, gpl2plus, lgpl, mit, _mitand2, licensing = self.get_symbols_and_licensing() @@ -1027,25 +1277,26 @@ def test_parse_expression_with_trailing_unknown_should_raise_exception(self): tokens = list(licensing.tokenize('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit 123')) expected = [ (gpl2plus, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), + (TOKEN_OR, 'or', 46), (gpl2, 'The GNU GPL 20', 49), - (TOKEN_OR, ' or ', 63), + (TOKEN_OR, 'or', 64), (mit, 'mit', 67), - (unknown, ' 123', 70) + (unknown, '123', 71) ] assert expected == tokens try: licensing.parse('The GNU GPL 20 or later or (LGPL-2.1 and mit) or The GNU GPL 20 or mit 123') + self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 70, - 'token_string': ' 123', 'token_type': unknown} + expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 71, + 'token_string': '123', 'token_type': unknown} assert expected == _parse_error_as_dict(pe) def test_parse_expression_with_trailing_unknown_should_raise_exception2(self): @@ -1053,9 +1304,11 @@ def test_parse_expression_with_trailing_unknown_should_raise_exception2(self): unknown = LicenseSymbol(key='123') try: licensing.parse('The GNU GPL 20 or mit 123') + # 01234567890123456789012345 + self.fail('Exception not raised') except ParseError as pe: - expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 21, - 'token_string': ' 123', 'token_type': unknown} + expected = {'error_code': PARSE_INVALID_SYMBOL_SEQUENCE, 'position': 22, + 'token_string': '123', 'token_type': unknown} assert expected == _parse_error_as_dict(pe) def test_parse_expression_with_WITH(self): @@ -1069,15 +1322,15 @@ def test_parse_expression_with_WITH(self): tokens = list(licensing.tokenize(expr)) expected = [ (gpl_20_or_later, 'The GNU GPL 20 or later', 0), - (TOKEN_OR, ' or ', 23), + (TOKEN_OR, 'or', 24), (TOKEN_LPAR, '(', 27), (lgpl, 'LGPL-2.1', 28), - (TOKEN_AND, ' and ', 36), + (TOKEN_AND, 'and', 37), (mit, 'mit', 41), (TOKEN_RPAR, ')', 44), - (TOKEN_OR, ' or ', 45), + (TOKEN_OR, 'or', 46), (gpl2, 'The GNU GPL 20', 49), - (TOKEN_OR, ' or ', 63), + (TOKEN_OR, 'or', 64), (LicenseWithExceptionSymbol(mit, mitexp), 'mit with mit exp', 67) ] @@ -1123,19 +1376,37 @@ def test_unknown_keys_with_trailing_char(self): assert [] == 
licensing.unknown_license_keys(parsed) assert [] == licensing.unknown_license_keys(expr) - def test_unknown_keys_with_trailing_char_2(self): + def test_unknown_keys_with_trailing_char_2_with_validate(self): _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() expr = 'The GNU GPL 20 or LGPL-2.1 and mitand3' try: - licensing.parse(expr) - self.fail('ParseError should be raised') - except ParseError as pe: - expected = {'error_code': 5, 'position': 34, 'token_string': u'and3', 'token_type': LicenseSymbol(key=u'and3')} + licensing.parse(expr, validate=True) + self.fail('Exception should be raised') + except ExpressionError as ee: + assert 'Unknown license key(s): mitand3' == str(ee) - assert expected == _parse_error_as_dict(pe) + def test_unknown_keys_with_trailing_char_2_without_validate(self): + _gpl2, _gpl2plus, _lgpl, _mit, _mitand2, licensing = self.get_symbols_and_licensing() + expr = 'The GNU GPL 20 or LGPL-2.1 and mitand3' + parsed = licensing.parse(expr, validate=False) + assert 'gpl-2.0 OR (LGPL-2.1 AND mitand3)' == str(parsed) + + def test_parse_with_overlapping_key_without_symbols(self): + expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 1111111111222222222233333333334444444444555555555566666 + # 0123456789012345678901234567890123456789012345678901234 + + licensing = Licensing() + results = str(licensing.parse(expression)) + expected = 'mit OR (mit AND zlib) OR mit OR mit WITH verylonglicense' + assert expected == results + + def test_advanced_tokenizer_tokenize_with_overlapping_key_with_symbols_and_trailing_unknown(self): + expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 111111111122222222223333333333444444444455555 + # 0123456789012345678901234567890123456789012345678901234 - def test_parse_with_overlapping_key_with_licensing(self): symbols = [ LicenseSymbol('MIT', ['MIT license']), LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), @@ -1145,11 +1416,104 @@ def test_parse_with_overlapping_key_with_licensing(self): LicenseSymbol('hmit', ['h verylonglicense']), ] licensing = Licensing(symbols) + results = list(licensing.get_advanced_tokenizer().tokenize(expression)) + expected = [ + Token(0, 2, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(4, 5, 'or', Keyword(value=u'or', type=2)), + Token(7, 9, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(11, 13, 'AND', Keyword(value=u'and', type=1)), + Token(15, 18, 'zlib', LicenseSymbol(u'zlib', aliases=(u'zlib',))), + Token(20, 21, 'or', Keyword(value=u'or', type=2)), + Token(23, 25, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(27, 28, 'or', Keyword(value=u'or', type=2)), + Token(30, 32, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(34, 37, 'with', Keyword(value=u'with', type=10)), + Token(39, 53, 'verylonglicense', None), + ] + + assert expected == results + + def test_advanced_tokenizer_iter_with_overlapping_key_with_symbols_and_trailing_unknown(self): + expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 111111111122222222223333333333444444444455555 + # 0123456789012345678901234567890123456789012345678901234 + + symbols = [ + LicenseSymbol('MIT', ['MIT license']), + LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), + LicenseSymbol('zlib', ['zlib']), + LicenseSymbol('d-zlib', ['D zlib']), + LicenseSymbol('mito', ['mit o']), + LicenseSymbol('hmit', ['h verylonglicense']), + ] + licensing = Licensing(symbols) + results = list(licensing.get_advanced_tokenizer().iter(expression, 
include_unmatched=True)) + expected = [ + Token(0, 2, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(4, 5, 'or', Keyword(value=u'or', type=2)), + Token(7, 9, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(11, 13, 'AND', Keyword(value=u'and', type=1)), + Token(15, 18, 'zlib', LicenseSymbol(u'zlib', aliases=(u'zlib',))), + Token(20, 21, 'or', Keyword(value=u'or', type=2)), + Token(23, 25, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(27, 28, 'or', Keyword(value=u'or', type=2)), + Token(30, 32, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(34, 37, 'with', Keyword(value=u'with', type=10)), + Token(39, 53, 'verylonglicense', None), + ] + assert expected == results + + def test_advanced_tokenizer_iter_with_overlapping_key_with_symbols_and_trailing_unknown2(self): + expression = 'mit with verylonglicense' + symbols = [ + LicenseSymbol('MIT', ['MIT license']), + LicenseSymbol('hmit', ['h verylonglicense']), + ] + licensing = Licensing(symbols) + results = list(licensing.get_advanced_tokenizer().iter(expression, include_unmatched=True)) + expected = [ + Token(0, 2, 'mit', LicenseSymbol(u'MIT', aliases=(u'MIT license',))), + Token(4, 7, 'with', Keyword(value=u'with', type=10)), + Token(9, 23, 'verylonglicense', None), + ] + assert expected == results + def test_tokenize_with_overlapping_key_with_symbols_and_trailing_unknown(self): expression = 'mit or mit AND zlib or mit or mit with verylonglicense' + # 1111111111222222222233333333334444444444555555555566666 + # 0123456789012345678901234567890123456789012345678901234 + + symbols = [ + LicenseSymbol('MIT', ['MIT license']), + LicenseSymbol('LGPL-2.1', ['LGPL v2.1']), + LicenseSymbol('zlib', ['zlib']), + LicenseSymbol('d-zlib', ['D zlib']), + LicenseSymbol('mito', ['mit o']), + LicenseSymbol('hmit', ['h verylonglicense']), + ] + licensing = Licensing(symbols) + + results = list(licensing.tokenize(expression)) + expected = [ + (LicenseSymbol(u'MIT', aliases=(u'MIT license',)), 'mit', 0), + (2, 'or', 4), + (LicenseSymbol(u'MIT', aliases=(u'MIT license',)), 'mit', 7), + (1, 'AND', 11), + (LicenseSymbol(u'zlib', aliases=(u'zlib',)), 'zlib', 15), + (2, 'or', 20), + (LicenseSymbol(u'MIT', aliases=(u'MIT license',)), 'mit', 23), + (2, 'or', 27), + (LicenseWithExceptionSymbol( + license_symbol=LicenseSymbol(u'MIT', aliases=(u'MIT license',)), + exception_symbol=LicenseSymbol(u'verylonglicense')), 'mit with verylonglicense', + 30) + ] + + assert expected == results + results = str(licensing.parse(expression)) - expected = 'mit OR (MIT AND zlib) OR mit OR MIT WITH verylonglicense' - self.assertEqual(expected, results) + expected = 'MIT OR (MIT AND zlib) OR MIT OR MIT WITH verylonglicense' + assert expected == results class LicensingSymbolsTest(TestCase): @@ -1286,104 +1650,105 @@ def test_primary_license_symbol_and_primary_license_key(self): class SplitAndTokenizeTest(TestCase): - def test_splitter(self): + def test_simple_tokenizer(self): expr = (' GPL-2.0 or later with classpath Exception and mit and ' 'mit with SOMETHING with ELSE+ or LGPL 2.1 and ' 'GPL-2.0 or LATER with (Classpath Exception and ' 'mit or later) or LGPL 2.1 or mit or GPL-2.0 or LATER ' 'with SOMETHING with ELSE+ and lgpl 2.1') - results = list(splitter(expr)) + licensing = Licensing() + results = list(licensing.simple_tokenizer(expr)) expected = [ - Result(0, 0, ' ', None), - Result(1, 7, 'GPL-2.0', Output('GPL-2.0', LicenseSymbol(key='GPL-2.0',))), - Result(8, 8, ' ', None), - Result(9, 10, 'or', Output('or', 
Keyword(value='or', type=TOKEN_OR))), - Result(11, 11, ' ', None), - Result(12, 16, 'later', Output('later', LicenseSymbol(key='later',))), - Result(17, 17, ' ', None), - Result(18, 21, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(22, 22, ' ', None), - Result(23, 31, 'classpath', Output('classpath', LicenseSymbol(key='classpath',))), - Result(32, 32, ' ', None), - Result(33, 41, 'Exception', Output('Exception', LicenseSymbol(key='Exception',))), - Result(42, 42, ' ', None), - Result(43, 45, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(46, 46, ' ', None), - Result(47, 49, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(50, 50, ' ', None), - Result(51, 53, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(54, 54, ' ', None), - Result(55, 57, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(58, 58, ' ', None), - Result(59, 62, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(63, 63, ' ', None), - Result(64, 72, 'SOMETHING', Output('SOMETHING', LicenseSymbol(key='SOMETHING',))), - Result(73, 73, ' ', None), - Result(74, 77, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(78, 78, ' ', None), - Result(79, 83, 'ELSE+', Output('ELSE+', LicenseSymbol(key='ELSE+',))), - Result(84, 84, ' ', None), - Result(85, 86, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(87, 87, ' ', None), - Result(88, 91, 'LGPL', Output('LGPL', LicenseSymbol(key='LGPL',))), - Result(92, 92, ' ', None), - Result(93, 95, '2.1', Output('2.1', LicenseSymbol(key='2.1',))), - Result(96, 96, ' ', None), - Result(97, 99, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(100, 100, ' ', None), - Result(101, 107, 'GPL-2.0', Output('GPL-2.0', LicenseSymbol(key='GPL-2.0',))), - Result(108, 108, ' ', None), - Result(109, 110, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(111, 111, ' ', None), - Result(112, 116, 'LATER', Output('LATER', LicenseSymbol(key='LATER',))), - Result(117, 117, ' ', None), - Result(118, 121, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(122, 122, ' ', None), - Result(123, 123, '(', Output('(', Keyword(value='(', type=TOKEN_LPAR))), - Result(124, 132, 'Classpath', Output('Classpath', LicenseSymbol(key='Classpath',))), - Result(133, 133, ' ', None), - Result(134, 142, 'Exception', Output('Exception', LicenseSymbol(key='Exception',))), - Result(143, 143, ' ', None), - Result(144, 146, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(147, 147, ' ', None), - Result(148, 150, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(151, 151, ' ', None), - Result(152, 153, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(154, 154, ' ', None), - Result(155, 159, 'later', Output('later', LicenseSymbol(key='later',))), - Result(160, 160, ')', Output(')', Keyword(value=')', type=TOKEN_RPAR))), - Result(161, 161, ' ', None), - Result(162, 163, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(164, 164, ' ', None), - Result(165, 168, 'LGPL', Output('LGPL', LicenseSymbol(key='LGPL',))), - Result(169, 169, ' ', None), - Result(170, 172, '2.1', Output('2.1', LicenseSymbol(key='2.1',))), - Result(173, 173, ' ', None), - Result(174, 175, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(176, 176, ' ', None), - Result(177, 179, 'mit', Output('mit', LicenseSymbol(key='mit',))), - Result(180, 180, ' ', None), - Result(181, 182, 'or', 
Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(183, 183, ' ', None), - Result(184, 190, 'GPL-2.0', Output('GPL-2.0', LicenseSymbol(key='GPL-2.0',))), - Result(191, 191, ' ', None), - Result(192, 193, 'or', Output('or', Keyword(value='or', type=TOKEN_OR))), - Result(194, 194, ' ', None), - Result(195, 199, 'LATER', Output('LATER', LicenseSymbol(key='LATER',))), - Result(200, 200, ' ', None), - Result(201, 204, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(205, 205, ' ', None), - Result(206, 214, 'SOMETHING', Output('SOMETHING', LicenseSymbol(key='SOMETHING',))), - Result(215, 215, ' ', None), - Result(216, 219, 'with', Output('with', Keyword(value='with', type=TOKEN_WITH))), - Result(220, 220, ' ', None), - Result(221, 225, 'ELSE+', Output('ELSE+', LicenseSymbol(key='ELSE+',))), - Result(226, 226, ' ', None), - Result(227, 229, 'and', Output('and', Keyword(value='and', type=TOKEN_AND))), - Result(230, 230, ' ', None), - Result(231, 234, 'lgpl', Output('lgpl', LicenseSymbol(key='lgpl',))), - Result(235, 235, ' ', None), - Result(236, 238, '2.1', Output('2.1', LicenseSymbol(key='2.1',))) + Token(0, 0, ' ', None), + Token(1, 7, 'GPL-2.0', LicenseSymbol(key='GPL-2.0')), + Token(8, 8, ' ', None), + Token(9, 10, 'or', Keyword(value='or', type=TOKEN_OR)), + Token(11, 11, ' ', None), + Token(12, 16, 'later', LicenseSymbol(key='later')), + Token(17, 17, ' ', None), + Token(18, 21, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(22, 22, ' ', None), + Token(23, 31, 'classpath', LicenseSymbol(key='classpath')), + Token(32, 32, ' ', None), + Token(33, 41, 'Exception', LicenseSymbol(key='Exception')), + Token(42, 42, ' ', None), + Token(43, 45, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(46, 46, ' ', None), + Token(47, 49, 'mit', LicenseSymbol(key='mit')), + Token(50, 50, ' ', None), + Token(51, 53, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(54, 54, ' ', None), + Token(55, 57, 'mit', LicenseSymbol(key='mit')), + Token(58, 58, ' ', None), + Token(59, 62, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(63, 63, ' ', None), + Token(64, 72, 'SOMETHING', LicenseSymbol(key='SOMETHING')), + Token(73, 73, ' ', None), + Token(74, 77, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(78, 78, ' ', None), + Token(79, 83, 'ELSE+', LicenseSymbol(key='ELSE+')), + Token(84, 84, ' ', None), + Token(85, 86, 'or', Keyword(value='or', type=TOKEN_OR)), + Token(87, 87, ' ', None), + Token(88, 91, 'LGPL', LicenseSymbol(key='LGPL')), + Token(92, 92, ' ', None), + Token(93, 95, '2.1', LicenseSymbol(key='2.1')), + Token(96, 96, ' ', None), + Token(97, 99, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(100, 100, ' ', None), + Token(101, 107, 'GPL-2.0', LicenseSymbol(key='GPL-2.0')), + Token(108, 108, ' ', None), + Token(109, 110, 'or', Keyword(value='or', type=TOKEN_OR)), + Token(111, 111, ' ', None), + Token(112, 116, 'LATER', LicenseSymbol(key='LATER')), + Token(117, 117, ' ', None), + Token(118, 121, 'with', Keyword(value='with', type=TOKEN_WITH)), + Token(122, 122, ' ', None), + Token(123, 123, '(', Keyword(value='(', type=TOKEN_LPAR)), + Token(124, 132, 'Classpath', LicenseSymbol(key='Classpath')), + Token(133, 133, ' ', None), + Token(134, 142, 'Exception', LicenseSymbol(key='Exception')), + Token(143, 143, ' ', None), + Token(144, 146, 'and', Keyword(value='and', type=TOKEN_AND)), + Token(147, 147, ' ', None), + Token(148, 150, 'mit', LicenseSymbol(key='mit')), + Token(151, 151, ' ', None), + Token(152, 153, 'or', 
Keyword(value='or', type=TOKEN_OR)),
+            Token(154, 154, ' ', None),
+            Token(155, 159, 'later', LicenseSymbol(key='later')),
+            Token(160, 160, ')', Keyword(value=')', type=TOKEN_RPAR)),
+            Token(161, 161, ' ', None),
+            Token(162, 163, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(164, 164, ' ', None),
+            Token(165, 168, 'LGPL', LicenseSymbol(key='LGPL')),
+            Token(169, 169, ' ', None),
+            Token(170, 172, '2.1', LicenseSymbol(key='2.1')),
+            Token(173, 173, ' ', None),
+            Token(174, 175, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(176, 176, ' ', None),
+            Token(177, 179, 'mit', LicenseSymbol(key='mit')),
+            Token(180, 180, ' ', None),
+            Token(181, 182, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(183, 183, ' ', None),
+            Token(184, 190, 'GPL-2.0', LicenseSymbol(key='GPL-2.0')),
+            Token(191, 191, ' ', None),
+            Token(192, 193, 'or', Keyword(value='or', type=TOKEN_OR)),
+            Token(194, 194, ' ', None),
+            Token(195, 199, 'LATER', LicenseSymbol(key='LATER')),
+            Token(200, 200, ' ', None),
+            Token(201, 204, 'with', Keyword(value='with', type=TOKEN_WITH)),
+            Token(205, 205, ' ', None),
+            Token(206, 214, 'SOMETHING', LicenseSymbol(key='SOMETHING')),
+            Token(215, 215, ' ', None),
+            Token(216, 219, 'with', Keyword(value='with', type=TOKEN_WITH)),
+            Token(220, 220, ' ', None),
+            Token(221, 225, 'ELSE+', LicenseSymbol(key='ELSE+')),
+            Token(226, 226, ' ', None),
+            Token(227, 229, 'and', Keyword(value='and', type=TOKEN_AND)),
+            Token(230, 230, ' ', None),
+            Token(231, 234, 'lgpl', LicenseSymbol(key='lgpl')),
+            Token(235, 235, ' ', None),
+            Token(236, 238, '2.1', LicenseSymbol(key='2.1'))
         ]

         assert expected == results

@@ -1422,103 +1787,82 @@ def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self):
             'mit or later or LGPL 2.1 or mit or GPL-2.0 or LATER '
             'with mitthing with ELSE+ and lgpl 2.1 or gpl-2.0')

-        # fist scan
-        scanner = licensing.get_scanner()
-        result = list(scanner.scan(expr))
-
-        WITH_KW = Keyword(value=' with ', type=10)
-        AND_KW = Keyword(value=' and ', type=1)
-        OR_KW = Keyword(value=' or ', type=2)
-
+        # first tokenize
+        tokenizer = licensing.get_advanced_tokenizer()
+        result = list(tokenizer.tokenize(expr))
         expected = [
-            Result(0, 0, ' ', None),
-            Result(1, 16, 'GPL-2.0 or later', Output('GPL-2.0 or LATER', gpl2plus, 1)),
-            Result(17, 22, ' with ', Output(' with ', WITH_KW, 0)),
-            Result(23, 41, 'classpath Exception', Output('classpath Exception', cpex, 1)),
-            Result(42, 46, ' and ', Output(' and ', AND_KW, 0)),
-            Result(47, 49, 'mit', Output('mit', mit, 1)),
-            Result(50, 54, ' and ', Output(' and ', AND_KW, 0)),
-            Result(55, 57, 'mit', Output('mit', mit, 1)),
-            Result(58, 63, ' with ', Output(' with ', WITH_KW, 0)),
-            Result(64, 82, 'mitthing with ELSE+', Output('mitthing with else+', mitthing_with_else, 1)),
-            Result(83, 86, ' or ', Output(' or ', OR_KW, 0)),
-            Result(87, 94, 'LGPL 2.1', Output('LGPL 2.1', lgpl, 1)),
-            Result(95, 99, ' and ', Output(' and ', AND_KW, 0)),
-            Result(100, 115, 'GPL-2.0 or LATER', Output('GPL-2.0 or LATER', gpl2plus, 1)),
-            Result(116, 121, ' with ', Output(' with ', WITH_KW, 0)),
-            Result(122, 140, 'Classpath Exception', Output('classpath Exception', cpex, 1)),
-            Result(141, 145, ' and ', Output(' and ', AND_KW, 0)),
-            Result(146, 157, 'mit or later', Output('mit or later', mitplus, 1)),
-            Result(158, 161, ' or ', Output(' or ', OR_KW, 0)),
-            Result(162, 169, 'LGPL 2.1', Output('LGPL 2.1', lgpl, 1)),
-            Result(170, 173, ' or ', Output(' or ', OR_KW, 0)),
-            Result(174, 176, 'mit', Output('mit', mit, 1)),
-            Result(177, 180, ' or ', Output(' 
or ', OR_KW, 0)), - Result(181, 196, 'GPL-2.0 or LATER', Output('GPL-2.0 or LATER', gpl2plus, 1)), - Result(197, 202, ' with ', Output(' with ', WITH_KW, 0)), - Result(203, 221, 'mitthing with ELSE+', Output('mitthing with else+', mitthing_with_else, 1)), - Result(222, 226, ' and ', Output(' and ', AND_KW, 0)), - Result(227, 234, 'lgpl 2.1', Output('LGPL 2.1', lgpl, 1)), - Result(235, 238, ' or ', Output(' or ', OR_KW, 0)), - Result(239, 245, 'gpl-2.0', Output('GPL-2.0', gpl2, 1)) + Token(1, 16, 'GPL-2.0 or later', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(18, 21, 'with', Keyword(value=u'with', type=10)), + Token(23, 41, 'classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True)), + Token(43, 45, 'and', Keyword(value=u'and', type=1)), + Token(47, 49, 'mit', LicenseSymbol(u'mit')), + Token(51, 53, 'and', Keyword(value=u'and', type=1)), + Token(55, 57, 'mit', LicenseSymbol(u'mit')), + Token(59, 62, 'with', Keyword(value=u'with', type=10)), + Token(64, 82, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+')), + Token(84, 85, 'or', Keyword(value=u'or', type=2)), + Token(87, 94, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')), + Token(96, 98, 'and', Keyword(value=u'and', type=1)), + Token(100, 115, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(117, 120, 'with', Keyword(value=u'with', type=10)), + Token(122, 140, 'Classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True)), + Token(142, 144, 'and', Keyword(value=u'and', type=1)), + Token(146, 157, 'mit or later', LicenseSymbol(u'mit or later')), + Token(159, 160, 'or', Keyword(value=u'or', type=2)), + Token(162, 169, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')), + Token(171, 172, 'or', Keyword(value=u'or', type=2)), + Token(174, 176, 'mit', LicenseSymbol(u'mit')), + Token(178, 179, 'or', Keyword(value=u'or', type=2)), + Token(181, 196, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(198, 201, 'with', Keyword(value=u'with', type=10)), + Token(203, 221, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+')), + Token(223, 225, 'and', Keyword(value=u'and', type=1)), + Token(227, 234, 'lgpl 2.1', LicenseSymbol(u'LGPL 2.1')), + Token(236, 237, 'or', Keyword(value=u'or', type=2)), + Token(239, 245, 'gpl-2.0', LicenseSymbol(u'GPL-2.0')) ] assert expected == result - assert 246 == expected[-1].end + 1 - assert 246 == sum(len(r.string) for r in result) - - # skip spaces - result = list(strip_and_skip_spaces(result)) - # here only the first token is a space - assert expected[1:] == result - - # group results - - gpl2pluso = Output('GPL-2.0 or LATER', LicenseSymbol('GPL-2.0 or LATER', is_exception=False), 1) - cpex0 = Output('classpath Exception', LicenseSymbol('classpath Exception', is_exception=True), 1) - mito = Output('mit', LicenseSymbol('mit', is_exception=False), 1) - mieo1 = Output('mitthing with else+', LicenseSymbol('mitthing with else+', is_exception=False), 1) - lgplo = Output('LGPL 2.1', LicenseSymbol('LGPL 2.1', is_exception=False), 1) - mitoo = Output('mit or later', LicenseSymbol('mit or later', is_exception=False), 1) - gpl202 = Output('GPL-2.0', LicenseSymbol('GPL-2.0', is_exception=False), 1) - - with_kw = Output(' with ', WITH_KW, 0) - and_kw = Output(' and ', AND_KW, 0) - or_kw = Output(' or ', OR_KW, 0) expected_groups = [ - (Result(1, 16, 'GPL-2.0 or later', gpl2pluso), - Result(17, 22, ' with ', with_kw), - Result(23, 41, 'classpath Exception', cpex0)), - (Result(42, 46, ' and ', and_kw),), - (Result(47, 49, 'mit', mito),), - (Result(50, 54, ' and ', 
and_kw),), - (Result(55, 57, 'mit', mito), - Result(58, 63, ' with ', with_kw), - Result(64, 82, 'mitthing with ELSE+', mieo1)), - (Result(83, 86, ' or ', or_kw),), - (Result(87, 94, 'LGPL 2.1', lgplo),), - (Result(95, 99, ' and ', and_kw),), - (Result(100, 115, 'GPL-2.0 or LATER', gpl2pluso), - Result(116, 121, ' with ', with_kw), - Result(122, 140, 'Classpath Exception', cpex0)), - (Result(141, 145, ' and ', and_kw),), - (Result(146, 157, 'mit or later', mitoo),), - (Result(158, 161, ' or ', or_kw),), - (Result(162, 169, 'LGPL 2.1', lgplo),), - (Result(170, 173, ' or ', or_kw),), - (Result(174, 176, 'mit', mito),), - (Result(177, 180, ' or ', or_kw),), - (Result(181, 196, 'GPL-2.0 or LATER', gpl2pluso), - Result(197, 202, ' with ', with_kw), - Result(203, 221, 'mitthing with ELSE+', mieo1)), - (Result(222, 226, ' and ', and_kw),), - (Result(227, 234, 'lgpl 2.1', lgplo),), - (Result(235, 238, ' or ', or_kw),), - (Result(239, 245, 'gpl-2.0', gpl202),) + (Token(1, 16, 'GPL-2.0 or later', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(18, 21, 'with', Keyword(value=u'with', type=10)), + Token(23, 41, 'classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True))), + + (Token(43, 45, 'and', Keyword(value=u'and', type=1)),), + (Token(47, 49, 'mit', LicenseSymbol(u'mit')),), + (Token(51, 53, 'and', Keyword(value=u'and', type=1)),), + + (Token(55, 57, 'mit', LicenseSymbol(u'mit')), + Token(59, 62, 'with', Keyword(value=u'with', type=10)), + Token(64, 82, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+'))), + + (Token(84, 85, 'or', Keyword(value=u'or', type=2)),), + (Token(87, 94, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')),), + (Token(96, 98, 'and', Keyword(value=u'and', type=1)),), + + (Token(100, 115, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(117, 120, 'with', Keyword(value=u'with', type=10)), + Token(122, 140, 'Classpath Exception', LicenseSymbol(u'classpath Exception', is_exception=True))), + + (Token(142, 144, 'and', Keyword(value=u'and', type=1)),), + (Token(146, 157, 'mit or later', LicenseSymbol(u'mit or later')),), + (Token(159, 160, 'or', Keyword(value=u'or', type=2)),), + (Token(162, 169, 'LGPL 2.1', LicenseSymbol(u'LGPL 2.1')),), + (Token(171, 172, 'or', Keyword(value=u'or', type=2)),), + (Token(174, 176, 'mit', LicenseSymbol(u'mit')),), + (Token(178, 179, 'or', Keyword(value=u'or', type=2)),), + + (Token(181, 196, 'GPL-2.0 or LATER', LicenseSymbol(u'GPL-2.0 or LATER')), + Token(198, 201, 'with', Keyword(value=u'with', type=10)), + Token(203, 221, 'mitthing with ELSE+', LicenseSymbol(u'mitthing with else+'))), + + (Token(223, 225, 'and', Keyword(value=u'and', type=1)),), + (Token(227, 234, 'lgpl 2.1', LicenseSymbol(u'LGPL 2.1')),), + (Token(236, 237, 'or', Keyword(value=u'or', type=2)),), + (Token(239, 245, 'gpl-2.0', LicenseSymbol(u'GPL-2.0')),) ] - - result_groups = list(group_results_for_with_subexpression(result)) + result_groups = list(build_token_groups_for_with_subexpression(result)) assert expected_groups == result_groups # finally retest it all with tokenize @@ -1530,25 +1874,25 @@ def test_tokenize_step_by_step_does_not_munge_trailing_symbols(self): expected = [ (gpl2plus_with_cpex, 'GPL-2.0 or later with classpath Exception', 1), - (TOKEN_AND, ' and ', 42), + (TOKEN_AND, 'and', 43), (mit, 'mit', 47), - (TOKEN_AND, ' and ', 50), + (TOKEN_AND, 'and', 51), (mit_with_mitthing_with_else, 'mit with mitthing with ELSE+', 55), - (TOKEN_OR, ' or ', 83), + (TOKEN_OR, 'or', 84), (lgpl, 'LGPL 2.1', 87), - (TOKEN_AND, ' and ', 95), + 
(TOKEN_AND, 'and', 96),
             (gpl2plus_with_cpex, 'GPL-2.0 or LATER with Classpath Exception', 100),
-            (TOKEN_AND, ' and ', 141),
+            (TOKEN_AND, 'and', 142),
             (mitplus, 'mit or later', 146),
-            (TOKEN_OR, ' or ', 158),
+            (TOKEN_OR, 'or', 159),
             (lgpl, 'LGPL 2.1', 162),
-            (TOKEN_OR, ' or ', 170),
+            (TOKEN_OR, 'or', 171),
             (mit, 'mit', 174),
-            (TOKEN_OR, ' or ', 177),
+            (TOKEN_OR, 'or', 178),
             (gpl2plus_with_someplus, 'GPL-2.0 or LATER with mitthing with ELSE+', 181),
-            (TOKEN_AND, ' and ', 222),
+            (TOKEN_AND, 'and', 223),
             (lgpl, 'lgpl 2.1', 227),
-            (TOKEN_OR, ' or ', 235),
+            (TOKEN_OR, 'or', 236),
             (gpl2, 'gpl-2.0', 239),
         ]

@@ -1661,3 +2005,76 @@ def __init__(self, key, is_exception=False):

         expected = [l1, lx, lx2, lx3, l3, l2, l4]
         assert expected == sorted([l4, l3, l2, l1, lx , lx2, lx3])
+
+
+class MockLicensesTest(TestCase):
+
+    def test_licensing_can_use_mocklicense_tuple(self):
+        MockLicense = namedtuple('MockLicense', 'key aliases is_exception')
+
+        licenses = [
+            MockLicense('gpl-2.0', ['GPL-2.0'], False),
+            MockLicense('classpath-2.0', ['Classpath-Exception-2.0'], True),
+            MockLicense('gpl-2.0-plus', ['GPL-2.0-or-later', 'GPL-2.0 or-later'], False),
+            MockLicense('lgpl-2.1-plus', ['LGPL-2.1-or-later'], False),
+        ]
+        licensing = Licensing(licenses)
+
+        ex1 = '(GPL-2.0-or-later with Classpath-Exception-2.0 or GPL-2.0 or-later) and LGPL-2.1-or-later'
+        expression1 = licensing.parse(ex1, validate=False, strict=False)
+        assert ['gpl-2.0-plus', 'classpath-2.0', 'lgpl-2.1-plus'] == licensing.license_keys(expression1)
+
+        ex2 = 'LGPL-2.1-or-later and (GPL-2.0-or-later oR GPL-2.0-or-later with Classpath-Exception-2.0)'
+        expression2 = licensing.parse(ex2, validate=True, strict=False)
+
+        ex3 = 'LGPL-2.1-or-later and (GPL-2.0-or-later oR GPL-2.0-or-later)'
+        expression3 = licensing.parse(ex3, validate=True, strict=False)
+
+        self.assertTrue(licensing.is_equivalent(expression1, expression2))
+        self.assertTrue(licensing.is_equivalent(expression2, expression1))
+        self.assertFalse(licensing.is_equivalent(expression1, expression3))
+        self.assertFalse(licensing.is_equivalent(expression2, expression3))
+
+    def test_and_and_or_is_invalid(self):
+        expression = 'gpl-2.0 with classpath and and or gpl-2.0-plus'
+        licensing = Licensing()
+        try:
+            licensing.parse(expression)
+            self.fail('Exception not raised')
+        except ParseError as pe:
+            expected = {
+                'error_code': PARSE_INVALID_OPERATOR_SEQUENCE,
+                'position': 27,
+                'token_string': 'and',
+                'token_type': TOKEN_AND}
+            assert expected == _parse_error_as_dict(pe)
+
+    def test_or_or_is_invalid(self):
+        expression = 'gpl-2.0 with classpath or or or or gpl-2.0-plus'
+        licensing = Licensing()
+        try:
+            licensing.parse(expression)
+            self.fail('Exception not raised')
+        except ParseError as pe:
+            expected = {
+                'error_code': PARSE_INVALID_OPERATOR_SEQUENCE,
+                'position': 26,
+                'token_string': 'or',
+                'token_type': TOKEN_OR}
+            assert expected == _parse_error_as_dict(pe)
+
+    def test_tokenize_or_or(self):
+        expression = 'gpl-2.0 with classpath or or or gpl-2.0-plus'
+        licensing = Licensing()
+        results = list(licensing.tokenize(expression))
+        expected = [
+            (LicenseWithExceptionSymbol(
+                license_symbol=LicenseSymbol(u'gpl-2.0'),
+                exception_symbol=LicenseSymbol(u'classpath')), 'gpl-2.0 with classpath', 0),
+            (2, 'or', 23),
+            (2, 'or', 26),
+            (2, 'or', 29),
+            (LicenseSymbol(u'gpl-2.0-plus'), 'gpl-2.0-plus', 32)
+        ]
+
+        assert expected == results
diff --git a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl b/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl
deleted file mode 100644
index 
0710ef9..0000000 Binary files a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl and /dev/null differ diff --git a/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl new file mode 100644 index 0000000..72f8d99 Binary files /dev/null and b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl differ diff --git a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl.ABOUT b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl.ABOUT similarity index 55% rename from thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl.ABOUT rename to thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl.ABOUT index 9dd54ae..1884a86 100644 --- a/thirdparty/dev/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl.ABOUT +++ b/thirdparty/dev/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl.ABOUT @@ -1,17 +1,16 @@ -about_resource: aboutcode_toolkit-3.0.2-py2.py3-none-any.whl -checksum_md5: 7423e283e7c50979313f225065a5fea5 -checksum_sha1: 789d5d29437a11e8119da354a77218190d597d6d +about_resource: aboutcode_toolkit-3.1.1-py2.py3-none-any.whl +checksum_md5: 67e1f793b8421ce60800897bb5b9446d +checksum_sha1: 26466b098411fcce12efac48bb9098cdf4a83573 contact: http://www.nexb.com/contactus.html copyright: Copyright (c) 2013-2017 nexB Inc. description: AboutCode Toolkit is a tool to process ABOUT files. An ABOUT file provides a simple way to document the provenance (origin and license) 'about' a software component. This is a small text file stored in the codebase side-by-side with the documented software component. -download_url: https://pypi.python.org/packages/11/7c/07b565c8a66f8846dab007ad80e31078f15034981dcb7c5e26dd985e3f4a/aboutcode_toolkit-3.0.2-py2.py3-none-any.whl#md5=7423e283e7c50979313f225065a5fea5 +download_url: https://files.pythonhosted.org/packages/48/05/c9dd903c5c6e0f06ec813a9911b27b252e3803fbd97ffa375d909694e26d/aboutcode_toolkit-3.1.1-py2.py3-none-any.whl#sha256=68b2fd1d05dd0dbc8acc91e7bf1b676e43804ea631bab490d0b46ae0b65e51b5 homepage_url: https://aboutcode.org license_expression: apache-2.0 name: AboutCode toolkit -notice_file: NOTICE owner: nexB owner_url: http://www.nexb.com/ -version: 3.0.2 +version: 3.1.1 diff --git a/thirdparty/dev/more-itertools-py2.ABOUT b/thirdparty/dev/more-itertools-py2.ABOUT index b8af58d..6317114 100644 --- a/thirdparty/dev/more-itertools-py2.ABOUT +++ b/thirdparty/dev/more-itertools-py2.ABOUT @@ -8,5 +8,5 @@ description: More routines for operating on iterables, beyond itertools homepage_url: https://github.com/erikrose/more-itertools owner: Erik Rose license_expression: mit -notice_file: more_itertools.NOTICE +notice_file: more-itertools.NOTICE copyright: Copyright (c) 2012 Erik Rose diff --git a/thirdparty/dev/more-itertools-py3.ABOUT b/thirdparty/dev/more-itertools-py3.ABOUT index 8145db2..304fc26 100644 --- a/thirdparty/dev/more-itertools-py3.ABOUT +++ b/thirdparty/dev/more-itertools-py3.ABOUT @@ -8,5 +8,5 @@ description: More routines for operating on iterables, beyond itertools homepage_url: https://github.com/erikrose/more-itertools owner: Erik Rose license_expression: mit -notice_file: more_itertools.NOTICE +notice_file: more-itertools.NOTICE copyright: Copyright (c) 2012 Erik Rose diff --git a/thirdparty/prod/boolean.py-3.5-py2.py3-none-any.whl b/thirdparty/prod/boolean.py-3.5-py2.py3-none-any.whl deleted file mode 100644 index 5fe68e5..0000000 Binary files a/thirdparty/prod/boolean.py-3.5-py2.py3-none-any.whl and /dev/null differ diff --git 
a/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl new file mode 100644 index 0000000..b1f5579 Binary files /dev/null and b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl differ diff --git a/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.ABOUT b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.ABOUT new file mode 100644 index 0000000..3df1f3a --- /dev/null +++ b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.ABOUT @@ -0,0 +1,15 @@ +about_resource: boolean.py-3.6-py2.py3-none-any.whl +attribute: true +checksum_md5: da39999eb131b589e84ad935dc4ca642 +checksum_sha1: d31b55e7ad2ee917232b3213afe3ae9678156a9f +copyright: Copyright (c) 2009-2016 Sebastian Kraemer, basti.kr@gmail.com and others +description: Implements boolean algebra in one module. +download_url: https://files.pythonhosted.org/packages/9b/27/d22062a221010e17935237ba4b574cd828238ea02e0765337c238466a512/boolean.py-3.6-py2.py3-none-any.whl +homepage_url: https://github.com/bastikr/boolean.py +license_expression: bsd-simplified +license_file: bsd-simplified.LICENSE +name: boolean.py +notice_file: boolean.py-3.6-py2.py3-none-any.whl.NOTICE +notice_url: https://github.com/bastikr/boolean.py/blob/master/LICENSE.txt +owner: Sebastian Kraemer +version: '3.6' diff --git a/thirdparty/prod/boolean.py.LICENSE b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.NOTICE similarity index 93% rename from thirdparty/prod/boolean.py.LICENSE rename to thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.NOTICE index a0c637f..8819ea1 100644 --- a/thirdparty/prod/boolean.py.LICENSE +++ b/thirdparty/prod/boolean.py-3.6-py2.py3-none-any.whl.NOTICE @@ -1,23 +1,23 @@ -Copyright (c) 2009-2016 Sebastian Kraemer, basti.kr@gmail.com and others -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +Copyright (c) 2009-2017 Sebastian Kraemer, basti.kr@gmail.com +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/thirdparty/prod/boolean.py.ABOUT b/thirdparty/prod/boolean.py.ABOUT deleted file mode 100644 index 31eb628..0000000 --- a/thirdparty/prod/boolean.py.ABOUT +++ /dev/null @@ -1,11 +0,0 @@ -about_resource: boolean.py-3.5-py2.py3-none-any.whl -version: 3.5 -download_url: https://pypi.python.org/packages/80/f3/0508ae7ba76b02f7fd666b705766edc1863fc8ef29d0519b4c95d60ab1bb/boolean.py-3.5-py2.py3-none-any.whl#md5=cf90b0c0530663bbf71a53fb58f6fa72 - -name: boolean.py - -copyright: Copyright (c) 2009-2016 Sebastian Kraemer, basti.kr@gmail.com and others -license_expression: bsd-simplified -license_file: boolean.py.LICENSE - -homepage_url: https://github.com/bastikr/boolean.py diff --git a/thirdparty/prod/bsd-simplified.LICENSE b/thirdparty/prod/bsd-simplified.LICENSE new file mode 100644 index 0000000..d99a0b1 --- /dev/null +++ b/thirdparty/prod/bsd-simplified.LICENSE @@ -0,0 +1,20 @@ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list +of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
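
Usage sketch, not part of the patch: the test hunks above replace the old
scanner/Result/Output API with an advanced tokenizer that yields Token
objects. A minimal exercise of that API, assuming the post-patch package is
installed, could look like the snippet below. The names used (Licensing,
LicenseSymbol, get_advanced_tokenizer, tokenize, LicenseWithExceptionSymbol)
all appear in this diff; the sample license keys are illustrative only.

    from license_expression import Licensing
    from license_expression import LicenseSymbol

    # Known symbols, comparable to the MockLicense namedtuples in the tests.
    licensing = Licensing([
        LicenseSymbol('gpl-2.0'),
        LicenseSymbol('classpath', is_exception=True),
        LicenseSymbol('mit'),
    ])

    expression = 'gpl-2.0 with classpath or mit'

    # Low-level, position-aware tokenization, as in the step-by-step test:
    # each Token carries start/end positions, the matched string and a
    # resolved value (a LicenseSymbol or an operator Keyword).
    tokenizer = licensing.get_advanced_tokenizer()
    for token in tokenizer.tokenize(expression):
        print(token)

    # The higher-level tokenize() yields (value, string, position) triples
    # and groups "with" sub-expressions into a LicenseWithExceptionSymbol,
    # as asserted in test_tokenize_or_or above.
    for value, string, position in licensing.tokenize(expression):
        print(position, string, value)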