From 3a7e45585bbd5d4bc616729b18935d10e78da40b Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 7 Aug 2020 12:32:35 -0400 Subject: [PATCH 1/5] Switch to directly decoding to UTF-8. * Bump the version of simplejson to a version that properly handles ensure_ascii=False * Remove the manual unascii-ifying and use ensure_ascii=False, which is now at least as fast. --- canonicaljson.py | 84 ++---------------------------------------------- setup.py | 2 +- 2 files changed, 4 insertions(+), 82 deletions(-) diff --git a/canonicaljson.py b/canonicaljson.py index afab92a..f4858a6 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -32,14 +32,6 @@ def _default(obj): obj.__class__.__name__) -# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so -# much quicker (assuming c speedups are enabled) that it's actually much -# quicker to let it do that and then substitute back (it's about 2.5x faster). -# -# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, -# as per https://github.com/simplejson/simplejson/issues/206). -# - # Declare these in the module scope, but they get configured in # set_json_library. _canonical_encoder = None @@ -56,7 +48,7 @@ def set_json_library(json_lib): """ global _canonical_encoder _canonical_encoder = json_lib.JSONEncoder( - ensure_ascii=True, + ensure_ascii=False, separators=(',', ':'), sort_keys=True, default=_default, @@ -64,7 +56,7 @@ def set_json_library(json_lib): global _pretty_encoder _pretty_encoder = json_lib.JSONEncoder( - ensure_ascii=True, + ensure_ascii=False, indent=4, sort_keys=True, default=_default, @@ -77,76 +69,6 @@ def set_json_library(json_lib): _U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\") -def _unascii(s): - """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8 - - This method takes the output of the JSONEncoder and expands any \\uNNNN - escapes it finds (except for \\u0000 to \\u001F, which are converted to - \\xNN escapes). 
- - For performance, it assumes that the input is valid JSON, and performs few - sanity checks. - """ - - # make the fast path fast: if there are no matches in the string, the - # whole thing is ascii. We have to turn it into a bytes, which is - # quickest with encode('utf-8') - m = _U_ESCAPE.search(s) - if not m: - return s.encode('utf-8') - - # appending to a string (or a bytes) is slooow, so we accumulate sections - # of string result in 'chunks', and join them all together later. - # (It doesn't seem to make much difference whether we accumulate - # utf8-encoded bytes, or strings which we utf-8 encode after rejoining) - # - chunks = [] - - # 'pos' tracks the index in 's' that we have processed into 'chunks' so - # far. - pos = 0 - - while m: - start = m.start() - end = m.end() - - g = m.group(1) - - if g is None: - # escaped backslash: pass it through along with anything before the - # match - chunks.append(s[pos:end]) - else: - # \uNNNN, but we have to watch out for surrogate pairs. - # - # str.encode("utf-8") complains about surrogates, so we have to - # unpack them. - c = int(g, 16) - - if c < 0x20: - # leave as a \uNNNN escape - chunks.append(s[pos:end]) - else: - if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u': - esc2 = s[end + 2:end + 6] - c2 = int(esc2, 16) - if c2 & 0xfc00 == 0xdc00: - c = 0x10000 + (((c - 0xd800) << 10) | - (c2 - 0xdc00)) - end += 6 - - chunks.append(s[pos:start]) - chunks.append(chr(c)) - - pos = end - m = _U_ESCAPE.search(s, pos) - - # pass through anything after the last match - chunks.append(s[pos:]) - - return (''.join(chunks)).encode("utf-8") - - def encode_canonical_json(json_object): """Encodes the shortest UTF-8 JSON encoding with dictionary keys lexicographically sorted by unicode code point. 
@@ -157,7 +79,7 @@ def encode_canonical_json(json_object): Returns: bytes encoding the JSON object""" s = _canonical_encoder.encode(json_object) - return _unascii(s) + return s.encode("utf-8") def encode_pretty_printed_json(json_object): diff --git a/setup.py b/setup.py index 3138d43..a4fc72e 100755 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ def exec_file(path_segments, name): py_modules=["canonicaljson"], description="Canonical JSON", install_requires=[ - "simplejson>=3.6.5", + "simplejson>=3.14.0", "frozendict>=1.0", ], zip_safe=True, From 7f17bdca0e34be14c8549ce7d068823d467d5c41 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 7 Aug 2020 12:38:44 -0400 Subject: [PATCH 2/5] Remove more unused code. --- canonicaljson.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/canonicaljson.py b/canonicaljson.py index f4858a6..63118a6 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re import platform from frozendict import frozendict @@ -63,12 +62,6 @@ def set_json_library(json_lib): ) -# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it -# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN -# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'. -_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\") - - def encode_canonical_json(json_object): """Encodes the shortest UTF-8 JSON encoding with dictionary keys lexicographically sorted by unicode code point. From e98dc2f7717a36c835e01636098ec0406e9e82c6 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 7 Aug 2020 14:50:53 -0400 Subject: [PATCH 3/5] Add tests for characters 0x00 - 0x7E. 
--- test_canonicaljson.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test_canonicaljson.py b/test_canonicaljson.py index b190aaa..2032c65 100644 --- a/test_canonicaljson.py +++ b/test_canonicaljson.py @@ -62,6 +62,36 @@ def test_encode_canonical(self): b'"\\\\u1234"', ) + def test_ascii(self): + """ + Ensure the proper ASCII characters are escaped. + + See https://matrix.org/docs/spec/appendices#grammar. + """ + # Some characters go to their common shorthands. + escaped = { + 0x08: b'"\\b"', + 0x09: b'"\\t"', + 0x0A: b'"\\n"', + 0x0C: b'"\\f"', + 0x0D: b'"\\r"', + 0x22: b'"\\""', + 0x5C: b'"\\\\"', + } + for c, expected in escaped.items(): + self.assertEqual(encode_canonical_json(chr(c)), expected) + + # Others go to the \uXXXX. + hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20)) + for c in hex_escaped: + self.assertEqual(encode_canonical_json(chr(c)), b'"\\u00%02x"' % (c,)) + + # And other characters are passed unescaped. + unescaped = [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)) + for c in unescaped: + c = chr(c) + self.assertEqual(encode_canonical_json(c), b'"' + c.encode("ascii") + b'"') + def test_encode_pretty_printed(self): self.assertEqual(encode_pretty_printed_json({}), b'{}') From d95cb6421c3bfc9b96d60fce9f4b8b8e02ab51a7 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 7 Aug 2020 15:00:18 -0400 Subject: [PATCH 4/5] Add a comment to the dependencies. --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index a4fc72e..2d2774e 100755 --- a/setup.py +++ b/setup.py @@ -46,6 +46,8 @@ def exec_file(path_segments, name): py_modules=["canonicaljson"], description="Canonical JSON", install_requires=[ + # simplejson versions before 3.14.0 had a bug with some characters + # (e.g. \u2028) if ensure_ascii was set to false. 
"simplejson>=3.14.0", "frozendict>=1.0", ], From 0d1f4e73446fdd27445d0218798f74e6eca7146b Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 7 Aug 2020 16:16:43 -0400 Subject: [PATCH 5/5] Lint --- test_canonicaljson.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/test_canonicaljson.py b/test_canonicaljson.py index 2032c65..d06c66a 100644 --- a/test_canonicaljson.py +++ b/test_canonicaljson.py @@ -84,13 +84,20 @@ def test_ascii(self): # Others go to the \uXXXX. hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20)) for c in hex_escaped: - self.assertEqual(encode_canonical_json(chr(c)), b'"\\u00%02x"' % (c,)) + self.assertEqual( + encode_canonical_json(chr(c)), + b'"\\u00%02x"' % (c,) + ) # And other characters are passed unescaped. - unescaped = [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)) + unescaped = ( + [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E))) for c in unescaped: c = chr(c) - self.assertEqual(encode_canonical_json(c), b'"' + c.encode("ascii") + b'"') + self.assertEqual( + encode_canonical_json(c), + b'"' + c.encode("ascii") + b'"' + ) def test_encode_pretty_printed(self): self.assertEqual(encode_pretty_printed_json({}), b'{}')