From 3a7e45585bbd5d4bc616729b18935d10e78da40b Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 7 Aug 2020 12:32:35 -0400
Subject: [PATCH 1/5] Switch to directly encoding to UTF-8.

* Bump the version of simplejson to a version that properly handles
  ensure_ascii=False
* Remove the manual unascii-ifying and use ensure_ascii=False, which
  is now at least as fast.
---
 canonicaljson.py | 84 ++----------------------------------------------
 setup.py         |  2 +-
 2 files changed, 4 insertions(+), 82 deletions(-)

diff --git a/canonicaljson.py b/canonicaljson.py
index afab92a..f4858a6 100644
--- a/canonicaljson.py
+++ b/canonicaljson.py
@@ -32,14 +32,6 @@ def _default(obj):
                     obj.__class__.__name__)
 
 
-# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
-# much quicker (assuming c speedups are enabled) that it's actually much
-# quicker to let it do that and then substitute back (it's about 2.5x faster).
-#
-# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
-# as per https://github.com/simplejson/simplejson/issues/206).
-#
-
 # Declare these in the module scope, but they get configured in
 # set_json_library.
 _canonical_encoder = None
@@ -56,7 +48,7 @@ def set_json_library(json_lib):
     """
     global _canonical_encoder
     _canonical_encoder = json_lib.JSONEncoder(
-        ensure_ascii=True,
+        ensure_ascii=False,
         separators=(',', ':'),
         sort_keys=True,
         default=_default,
@@ -64,7 +56,7 @@ def set_json_library(json_lib):
 
     global _pretty_encoder
     _pretty_encoder = json_lib.JSONEncoder(
-        ensure_ascii=True,
+        ensure_ascii=False,
         indent=4,
         sort_keys=True,
         default=_default,
@@ -77,76 +69,6 @@ def set_json_library(json_lib):
 _U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
 
 
-def _unascii(s):
-    """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
-
-    This method takes the output of the JSONEncoder and expands any \\uNNNN
-    escapes it finds (except for \\u0000 to \\u001F, which are converted to
-    \\xNN escapes).
-
-    For performance, it assumes that the input is valid JSON, and performs few
-    sanity checks.
-    """
-
-    # make the fast path fast: if there are no matches in the string, the
-    # whole thing is ascii. We have to turn it into a bytes, which is
-    # quickest with encode('utf-8')
-    m = _U_ESCAPE.search(s)
-    if not m:
-        return s.encode('utf-8')
-
-    # appending to a string (or a bytes) is slooow, so we accumulate sections
-    # of string result in 'chunks', and join them all together later.
-    # (It doesn't seem to make much difference whether we accumulate
-    # utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
-    #
-    chunks = []
-
-    # 'pos' tracks the index in 's' that we have processed into 'chunks' so
-    # far.
-    pos = 0
-
-    while m:
-        start = m.start()
-        end = m.end()
-
-        g = m.group(1)
-
-        if g is None:
-            # escaped backslash: pass it through along with anything before the
-            # match
-            chunks.append(s[pos:end])
-        else:
-            # \uNNNN, but we have to watch out for surrogate pairs.
-            #
-            # str.encode("utf-8") complains about surrogates, so we have to
-            # unpack them.
-            c = int(g, 16)
-
-            if c < 0x20:
-                # leave as a \uNNNN escape
-                chunks.append(s[pos:end])
-            else:
-                if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
-                    esc2 = s[end + 2:end + 6]
-                    c2 = int(esc2, 16)
-                    if c2 & 0xfc00 == 0xdc00:
-                        c = 0x10000 + (((c - 0xd800) << 10) |
-                                       (c2 - 0xdc00))
-                        end += 6
-
-                chunks.append(s[pos:start])
-                chunks.append(chr(c))
-
-        pos = end
-        m = _U_ESCAPE.search(s, pos)
-
-    # pass through anything after the last match
-    chunks.append(s[pos:])
-
-    return (''.join(chunks)).encode("utf-8")
-
-
 def encode_canonical_json(json_object):
     """Encodes the shortest UTF-8 JSON encoding with dictionary keys
     lexicographically sorted by unicode code point.
@@ -157,7 +79,7 @@ def encode_canonical_json(json_object):
     Returns:
         bytes encoding the JSON object"""
     s = _canonical_encoder.encode(json_object)
-    return _unascii(s)
+    return s.encode("utf-8")
 
 
 def encode_pretty_printed_json(json_object):
diff --git a/setup.py b/setup.py
index 3138d43..a4fc72e 100755
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@ def exec_file(path_segments, name):
     py_modules=["canonicaljson"],
     description="Canonical JSON",
     install_requires=[
-        "simplejson>=3.6.5",
+        "simplejson>=3.14.0",
         "frozendict>=1.0",
     ],
     zip_safe=True,

From 7f17bdca0e34be14c8549ce7d068823d467d5c41 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 7 Aug 2020 12:38:44 -0400
Subject: [PATCH 2/5] Remove more unused code.

---
 canonicaljson.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/canonicaljson.py b/canonicaljson.py
index f4858a6..63118a6 100644
--- a/canonicaljson.py
+++ b/canonicaljson.py
@@ -15,7 +15,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
 import platform
 
 from frozendict import frozendict
@@ -63,12 +62,6 @@ def set_json_library(json_lib):
     )
 
 
-# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
-# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
-# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
-_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
-
-
 def encode_canonical_json(json_object):
     """Encodes the shortest UTF-8 JSON encoding with dictionary keys
     lexicographically sorted by unicode code point.

From e98dc2f7717a36c835e01636098ec0406e9e82c6 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 7 Aug 2020 14:50:53 -0400
Subject: [PATCH 3/5] Add tests for characters 0x00 - 0x7E.

---
 test_canonicaljson.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/test_canonicaljson.py b/test_canonicaljson.py
index b190aaa..2032c65 100644
--- a/test_canonicaljson.py
+++ b/test_canonicaljson.py
@@ -62,6 +62,36 @@ def test_encode_canonical(self):
             b'"\\\\u1234"',
         )
 
+    def test_ascii(self):
+        """
+        Ensure the proper ASCII characters are escaped.
+
+        See https://matrix.org/docs/spec/appendices#grammar.
+        """
+        # Some characters go to their common shorthands.
+        escaped = {
+            0x08: b'"\\b"',
+            0x09: b'"\\t"',
+            0x0A: b'"\\n"',
+            0x0C: b'"\\f"',
+            0x0D: b'"\\r"',
+            0x22: b'"\\""',
+            0x5C: b'"\\\\"',
+        }
+        for c, expected in escaped.items():
+            self.assertEqual(encode_canonical_json(chr(c)), expected)
+
+        # Others go to the \uXXXX.
+        hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20))
+        for c in hex_escaped:
+            self.assertEqual(encode_canonical_json(chr(c)), b'"\\u00%02x"' % (c,))
+
+        # And other characters are passed unescaped.
+        unescaped = [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E))
+        for c in unescaped:
+            c = chr(c)
+            self.assertEqual(encode_canonical_json(c), b'"' + c.encode("ascii") + b'"')
+
     def test_encode_pretty_printed(self):
         self.assertEqual(encode_pretty_printed_json({}), b'{}')
 

From d95cb6421c3bfc9b96d60fce9f4b8b8e02ab51a7 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 7 Aug 2020 15:00:18 -0400
Subject: [PATCH 4/5] Add a comment to the dependencies.

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index a4fc72e..2d2774e 100755
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,8 @@ def exec_file(path_segments, name):
     py_modules=["canonicaljson"],
     description="Canonical JSON",
     install_requires=[
+        # simplejson versions before 3.14.0 had a bug with some characters
+        # (e.g. \u2028) if ensure_ascii was set to False.
         "simplejson>=3.14.0",
         "frozendict>=1.0",
     ],

From 0d1f4e73446fdd27445d0218798f74e6eca7146b Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 7 Aug 2020 16:16:43 -0400
Subject: [PATCH 5/5] Lint

---
 test_canonicaljson.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/test_canonicaljson.py b/test_canonicaljson.py
index 2032c65..d06c66a 100644
--- a/test_canonicaljson.py
+++ b/test_canonicaljson.py
@@ -84,13 +84,20 @@ def test_ascii(self):
         # Others go to the \uXXXX.
         hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20))
         for c in hex_escaped:
-            self.assertEqual(encode_canonical_json(chr(c)), b'"\\u00%02x"' % (c,))
+            self.assertEqual(
+                encode_canonical_json(chr(c)),
+                b'"\\u00%02x"' % (c,)
+            )
 
         # And other characters are passed unescaped.
-        unescaped = [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E))
+        unescaped = (
+            [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)))
         for c in unescaped:
             c = chr(c)
-            self.assertEqual(encode_canonical_json(c), b'"' + c.encode("ascii") + b'"')
+            self.assertEqual(
+                encode_canonical_json(c),
+                b'"' + c.encode("ascii") + b'"'
+            )
 
     def test_encode_pretty_printed(self):
         self.assertEqual(encode_pretty_printed_json({}), b'{}')