diff --git a/canonicaljson.py b/canonicaljson.py index afab92a..63118a6 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re import platform from frozendict import frozendict @@ -32,14 +31,6 @@ def _default(obj): obj.__class__.__name__) -# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so -# much quicker (assuming c speedups are enabled) that it's actually much -# quicker to let it do that and then substitute back (it's about 2.5x faster). -# -# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, -# as per https://github.com/simplejson/simplejson/issues/206). -# - # Declare these in the module scope, but they get configured in # set_json_library. _canonical_encoder = None @@ -56,7 +47,7 @@ def set_json_library(json_lib): """ global _canonical_encoder _canonical_encoder = json_lib.JSONEncoder( - ensure_ascii=True, + ensure_ascii=False, separators=(',', ':'), sort_keys=True, default=_default, @@ -64,89 +55,13 @@ def set_json_library(json_lib): global _pretty_encoder _pretty_encoder = json_lib.JSONEncoder( - ensure_ascii=True, + ensure_ascii=False, indent=4, sort_keys=True, default=_default, ) -# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it -# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN -# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'. -_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\") - - -def _unascii(s): - """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8 - - This method takes the output of the JSONEncoder and expands any \\uNNNN - escapes it finds (except for \\u0000 to \\u001F, which are converted to - \\xNN escapes). - - For performance, it assumes that the input is valid JSON, and performs few - sanity checks. 
- """ - - # make the fast path fast: if there are no matches in the string, the - # whole thing is ascii. We have to turn it into a bytes, which is - # quickest with encode('utf-8') - m = _U_ESCAPE.search(s) - if not m: - return s.encode('utf-8') - - # appending to a string (or a bytes) is slooow, so we accumulate sections - # of string result in 'chunks', and join them all together later. - # (It doesn't seem to make much difference whether we accumulate - # utf8-encoded bytes, or strings which we utf-8 encode after rejoining) - # - chunks = [] - - # 'pos' tracks the index in 's' that we have processed into 'chunks' so - # far. - pos = 0 - - while m: - start = m.start() - end = m.end() - - g = m.group(1) - - if g is None: - # escaped backslash: pass it through along with anything before the - # match - chunks.append(s[pos:end]) - else: - # \uNNNN, but we have to watch out for surrogate pairs. - # - # str.encode("utf-8") complains about surrogates, so we have to - # unpack them. - c = int(g, 16) - - if c < 0x20: - # leave as a \uNNNN escape - chunks.append(s[pos:end]) - else: - if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u': - esc2 = s[end + 2:end + 6] - c2 = int(esc2, 16) - if c2 & 0xfc00 == 0xdc00: - c = 0x10000 + (((c - 0xd800) << 10) | - (c2 - 0xdc00)) - end += 6 - - chunks.append(s[pos:start]) - chunks.append(chr(c)) - - pos = end - m = _U_ESCAPE.search(s, pos) - - # pass through anything after the last match - chunks.append(s[pos:]) - - return (''.join(chunks)).encode("utf-8") - - def encode_canonical_json(json_object): """Encodes the shortest UTF-8 JSON encoding with dictionary keys lexicographically sorted by unicode code point. 
@@ -157,7 +72,7 @@ def encode_canonical_json(json_object): Returns: bytes encoding the JSON object""" s = _canonical_encoder.encode(json_object) - return _unascii(s) + return s.encode("utf-8") def encode_pretty_printed_json(json_object): diff --git a/setup.py b/setup.py index 3138d43..2d2774e 100755 --- a/setup.py +++ b/setup.py @@ -46,7 +46,9 @@ def exec_file(path_segments, name): py_modules=["canonicaljson"], description="Canonical JSON", install_requires=[ - "simplejson>=3.6.5", + # simplejson versions before 3.14.0 had a bug with some characters + # (e.g. \u2028) if ensure_ascii was set to false. + "simplejson>=3.14.0", "frozendict>=1.0", ], zip_safe=True, diff --git a/test_canonicaljson.py b/test_canonicaljson.py index b190aaa..d06c66a 100644 --- a/test_canonicaljson.py +++ b/test_canonicaljson.py @@ -62,6 +62,43 @@ def test_encode_canonical(self): b'"\\\\u1234"', ) + def test_ascii(self): + """ + Ensure the proper ASCII characters are escaped. + + See https://matrix.org/docs/spec/appendices#grammar. + """ + # Some characters go to their common shorthands. + escaped = { + 0x08: b'"\\b"', + 0x09: b'"\\t"', + 0x0A: b'"\\n"', + 0x0C: b'"\\f"', + 0x0D: b'"\\r"', + 0x22: b'"\\""', + 0x5C: b'"\\\\"', + } + for c, expected in escaped.items(): + self.assertEqual(encode_canonical_json(chr(c)), expected) + + # Others go to the \uXXXX. + hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20)) + for c in hex_escaped: + self.assertEqual( + encode_canonical_json(chr(c)), + b'"\\u00%02x"' % (c,) + ) + + # And other characters are passed unescaped. + unescaped = ( + [0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E))) + for c in unescaped: + c = chr(c) + self.assertEqual( + encode_canonical_json(c), + b'"' + c.encode("ascii") + b'"' + ) + def test_encode_pretty_printed(self): self.assertEqual(encode_pretty_printed_json({}), b'{}')