canonicaljson.py (3 additions, 88 deletions)

```diff
@@ -15,7 +15,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
 import platform
 
 from frozendict import frozendict
@@ -32,14 +31,6 @@ def _default(obj):
                     obj.__class__.__name__)
 
 
-# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
-# much quicker (assuming c speedups are enabled) that it's actually much
-# quicker to let it do that and then substitute back (it's about 2.5x faster).
-#
-# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
-# as per https://github.com/simplejson/simplejson/issues/206).
-#
-
 # Declare these in the module scope, but they get configured in
 # set_json_library.
 _canonical_encoder = None
@@ -56,97 +47,21 @@ def set_json_library(json_lib):
     """
     global _canonical_encoder
     _canonical_encoder = json_lib.JSONEncoder(
-        ensure_ascii=True,
+        ensure_ascii=False,
         separators=(',', ':'),
         sort_keys=True,
         default=_default,
     )
 
     global _pretty_encoder
     _pretty_encoder = json_lib.JSONEncoder(
-        ensure_ascii=True,
+        ensure_ascii=False,
         indent=4,
         sort_keys=True,
         default=_default,
     )
 
 
-# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
-# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
-# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
-_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
-
-
-def _unascii(s):
-    """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
-
-    This method takes the output of the JSONEncoder and expands any \\uNNNN
-    escapes it finds (except for \\u0000 to \\u001F, which are converted to
-    \\xNN escapes).
-
-    For performance, it assumes that the input is valid JSON, and performs few
-    sanity checks.
-    """
-
-    # make the fast path fast: if there are no matches in the string, the
-    # whole thing is ascii. We have to turn it into a bytes, which is
-    # quickest with encode('utf-8')
-    m = _U_ESCAPE.search(s)
-    if not m:
-        return s.encode('utf-8')
-
-    # appending to a string (or a bytes) is slooow, so we accumulate sections
-    # of string result in 'chunks', and join them all together later.
-    # (It doesn't seem to make much difference whether we accumulate
-    # utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
-    #
-    chunks = []
-
-    # 'pos' tracks the index in 's' that we have processed into 'chunks' so
-    # far.
-    pos = 0
-
-    while m:
-        start = m.start()
-        end = m.end()
-
-        g = m.group(1)
-
-        if g is None:
-            # escaped backslash: pass it through along with anything before the
-            # match
-            chunks.append(s[pos:end])
-        else:
-            # \uNNNN, but we have to watch out for surrogate pairs.
-            #
-            # str.encode("utf-8") complains about surrogates, so we have to
-            # unpack them.
-            c = int(g, 16)
-
-            if c < 0x20:
-                # leave as a \uNNNN escape
-                chunks.append(s[pos:end])
-            else:
-                if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
-                    esc2 = s[end + 2:end + 6]
-                    c2 = int(esc2, 16)
-                    if c2 & 0xfc00 == 0xdc00:
-                        c = 0x10000 + (((c - 0xd800) << 10) |
-                                       (c2 - 0xdc00))
-                        end += 6
-
-                chunks.append(s[pos:start])
-                chunks.append(chr(c))
-
-        pos = end
-        m = _U_ESCAPE.search(s, pos)
-
-    # pass through anything after the last match
-    chunks.append(s[pos:])
-
-    return (''.join(chunks)).encode("utf-8")
-
-
 def encode_canonical_json(json_object):
     """Encodes the shortest UTF-8 JSON encoding with dictionary keys
     lexicographically sorted by unicode code point.
@@ -157,7 +72,7 @@ def encode_canonical_json(json_object):
     Returns:
        bytes encoding the JSON object"""
     s = _canonical_encoder.encode(json_object)
-    return _unascii(s)
+    return s.encode("utf-8")
 
 
 def encode_pretty_printed_json(json_object):
```
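With `ensure_ascii=False`, the encoder emits non-ASCII characters directly, so the canonical output is now just the encoder result encoded as UTF-8, and the `_unascii` post-processing pass (including its surrogate-pair unpacking) can go. A minimal sketch of the new behaviour, assuming simplejson >= 3.14.0 is installed:

```python
import simplejson

# Mirror the canonical encoder configuration set up in set_json_library.
encoder = simplejson.JSONEncoder(
    ensure_ascii=False,
    separators=(',', ':'),
    sort_keys=True,
)

# Non-ASCII characters come out raw rather than as \uNNNN escapes, so a
# single UTF-8 encode yields the canonical bytes.
assert encoder.encode({"key": "\u2028"}).encode("utf-8") == \
    b'{"key":"\xe2\x80\xa8"}'

# Astral-plane characters, which _unascii previously had to reassemble
# from surrogate pairs, are likewise emitted as plain UTF-8.
assert encoder.encode("\U0001d11e").encode("utf-8") == b'"\xf0\x9d\x84\x9e"'
```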
setup.py (3 additions, 1 deletion)

```diff
@@ -46,7 +46,9 @@ def exec_file(path_segments, name):
     py_modules=["canonicaljson"],
     description="Canonical JSON",
     install_requires=[
-        "simplejson>=3.6.5",
+        # simplejson versions before 3.14.0 had a bug with some characters
+        # (e.g. \u2028) if ensure_ascii was set to False.
+        "simplejson>=3.14.0",
         "frozendict>=1.0",
     ],
     zip_safe=True,
```
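The new floor is easy to sanity-check against a local install. A quick hedged snippet (not part of this PR) that asserts the fixed behaviour the comment above describes:

```python
import simplejson

print(simplejson.__version__)  # expect 3.14.0 or later

# Fixed versions pass U+2028 (LINE SEPARATOR) through untouched when
# ensure_ascii=False; affected versions mangled such characters.
assert simplejson.dumps("\u2028", ensure_ascii=False) == '"\u2028"'
```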
test_canonicaljson.py (37 additions, 0 deletions)

```diff
@@ -62,6 +62,43 @@ def test_encode_canonical(self):
b'"\\\\u1234"',
)

def test_ascii(self):
"""
Ensure the proper ASCII characters are escaped.

See https://matrix.org/docs/spec/appendices#grammar.
"""
# Some characters go to their common shorthands.
escaped = {
0x08: b'"\\b"',
0x09: b'"\\t"',
0x0A: b'"\\n"',
0x0C: b'"\\f"',
0x0D: b'"\\r"',
0x22: b'"\\""',
0x5C: b'"\\\\"',
}
for c, expected in escaped.items():
self.assertEqual(encode_canonical_json(chr(c)), expected)

# Others go to the \uXXXX.
hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20))
for c in hex_escaped:
self.assertEqual(
encode_canonical_json(chr(c)),
b'"\\u00%02x"' % (c,)
)

# And other characters are passed unescaped.
unescaped = (
[0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)))
for c in unescaped:
c = chr(c)
self.assertEqual(
encode_canonical_json(c),
b'"' + c.encode("ascii") + b'"'
)

def test_encode_pretty_printed(self):
self.assertEqual(encode_pretty_printed_json({}), b'{}')

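The new test only exercises ASCII escaping; the encoder's other canonical properties (keys sorted by code point, compact separators, UTF-8 bytes out) are unchanged by this PR. An illustrative check, assuming the patched module is importable:

```python
from canonicaljson import encode_canonical_json

# Keys sorted by code point, compact separators, UTF-8 bytes out.
assert encode_canonical_json({"b": "2", "a": "1"}) == b'{"a":"1","b":"2"}'

# With ensure_ascii=False, non-ASCII values are emitted as raw UTF-8
# rather than as \uNNNN escapes.
assert encode_canonical_json({"a": "\u00e9"}) == b'{"a":"\xc3\xa9"}'
```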