From 48630127d0098af7476c3ccfeb51fe7ea0c53bea Mon Sep 17 00:00:00 2001
From: Thiago Macieira
Date: Tue, 4 Feb 2025 17:28:42 -0800
Subject: [PATCH] CBOR-to-JSON: do properly escape JSON strings

We hadn't bothered, as this was just example-like code to show how one
could convert from CBOR to JSON. But as it was added to the library (no
extra dependency), we should Do The Right Thing (DTRT) and escape.

This patch could have used cbor_value_to_pretty() to print the string,
which has better support for UTF-8 escaping and thus checks for UTF-8
correctness, but that would make map_to_json()'s metadata functionality
much more complex, especially since we cannot rely on open_memstream()
always being available. Therefore, we are partially duplicating
cborpretty.c's utf8EscapedDump().

Signed-off-by: Thiago Macieira
---
Review changes relative to the original submission:
 - mul_check_overflow(): the non-builtin fallback reported an overflow for
   ordinary products (e.g. 2 * 6); it now divides back to detect wrap-around,
   and the __has_builtin() probe names the builtin that is actually called.
 - text_string_to_escaped(): the empty-string buffer is obtained through
   cbor_realloc() instead of an unchecked strdup(), and the buffer is
   released if cbor_value_finish_string_iteration() fails.
 - Added short comments describing the new helpers.

 src/cbortojson.c            | 138 +++++++++++++++++++++++++++++++++---
 src/compilersupport_p.h     |  12 ++++
 src/memory.h                |   5 +-
 tests/tojson/tst_tojson.cpp |  21 +++++++
 4 files changed, 164 insertions(+), 12 deletions(-)

diff --git a/src/cbortojson.c b/src/cbortojson.c
index 1f74e32f..a617db2d 100644
--- a/src/cbortojson.c
+++ b/src/cbortojson.c
@@ -170,9 +170,17 @@ typedef struct ConversionStatus {
 static CborError value_to_json(FILE *out, CborValue *it, int flags, CborType type,
                                int nestingLevel, ConversionStatus *status);
 
-static CborError dump_bytestring_base16(char **result, CborValue *it)
+/* Writes the two lowercase hexadecimal digits of byte to buffer[0..1] (no NUL terminator). */
+static void append_hex(void *buffer, uint8_t byte)
 {
     static const char characters[] = "0123456789abcdef";
+    char *str = buffer;
+    str[0] = characters[byte >> 4];
+    str[1] = characters[byte & 0xf];
+}
+
+static CborError dump_bytestring_base16(char **result, CborValue *it)
+{
     size_t i;
     size_t n = 0;
     uint8_t *buffer;
@@ -195,8 +203,7 @@ static CborError dump_bytestring_base16(char **result, CborValue *it)
 
     for (i = 0; i < n; ++i) {
         uint8_t byte = buffer[n + i];
-        buffer[2*i]     = characters[byte >> 4];
-        buffer[2*i + 1] = characters[byte & 0xf];
+        append_hex(buffer + 2 * i, byte);
     }
     return CborNoError;
 }
@@ -293,6 +300,113 @@ static CborError dump_bytestring_base64url(char **result, CborValue *it)
     return generic_dump_base64(result, it, alphabet);
 }
 
+/*
+ * Appends the JSON-escaped form of input[0..len) to the heap buffer *str and
+ * NUL-terminates it, growing the buffer with cbor_realloc() when needed.
+ *  - alloc: if non-NULL, holds the capacity of *str and is updated on growth;
+ *    if NULL, the buffer is reallocated on every call.
+ *  - offsetp: if non-NULL, holds the write position and is updated on return;
+ *    if NULL, writing starts at the beginning of the buffer.
+ * On failure, *str is left unchanged and remains owned by the caller.
+ */
+static CborError escape_text_string(char **str, size_t *alloc, size_t *offsetp, const char *input, size_t len)
+{
+    /* JSON requires escaping some characters in strings, so we iterate and
+     * escape as necessary
+     * https://www.rfc-editor.org/rfc/rfc8259#section-7:
+     *   All Unicode characters may be placed within the
+     *   quotation marks, except for the characters that MUST be escaped:
+     *   quotation mark, reverse solidus, and the control characters (U+0000
+     *   through U+001F).
+     * We additionally choose to escape BS, HT, CR, LF and FF.
+     */
+    char *buf = *str;
+
+    /* Ensure we have enough space for this chunk. In the worst case, we
+     * have 6 escaped characters per input character.
+     *
+     * The overflow checking here is only practically useful for 32-bit
+     * machines, as SIZE_MAX/6 for a 64-bit machine is 2.6667 exabytes.
+     * That is much more than any current architecture can even address and
+     * cbor_value_get_text_string_chunk() only works for data already
+     * loaded into memory.
+     */
+    size_t needed;
+    size_t offset = offsetp ? *offsetp : 0;
+    if (mul_check_overflow(len, 6, &needed) || add_check_overflow(needed, offset, &needed)
+            || add_check_overflow(needed, 1, &needed)) {
+        return CborErrorDataTooLarge;
+    }
+    if (!alloc || needed > *alloc) {
+        buf = cbor_realloc(buf, needed);
+        if (!buf)
+            return CborErrorOutOfMemory;
+        if (alloc)
+            *alloc = needed;
+    }
+
+    for (size_t i = 0; i < len; ++i) {
+        static const char escapeChars[] = "\b\t\n\r\f\"\\";
+        static const char escapedChars[] = "btnrf\"\\";
+        unsigned char c = input[i];
+
+        char *esc = c > 0 ? strchr(escapeChars, c) : NULL;
+        if (esc) {
+            buf[offset++] = '\\';
+            buf[offset++] = escapedChars[esc - escapeChars];
+        } else if (c <= 0x1F) {
+            buf[offset++] = '\\';
+            buf[offset++] = 'u';
+            buf[offset++] = '0';
+            buf[offset++] = '0';
+            append_hex(buf + offset, c);
+            offset += 2;
+        } else {
+            buf[offset++] = c;
+        }
+    }
+    buf[offset] = '\0';
+    *str = buf;
+    if (offsetp)
+        *offsetp = offset;
+    return CborNoError;
+}
+
+/*
+ * Converts the text string at it into a newly allocated, NUL-terminated and
+ * JSON-escaped buffer stored in *str, consuming the string chunk by chunk.
+ * On any failure the buffer is released and *str is set to NULL.
+ */
+static CborError text_string_to_escaped(char **str, CborValue *it)
+{
+    size_t alloc = 0, offset = 0;
+    CborError err;
+
+    *str = NULL;
+    err = cbor_value_begin_string_iteration(it);
+    while (err == CborNoError) {
+        const char *chunk;
+        size_t len;
+        err = cbor_value_get_text_string_chunk(it, &chunk, &len, it);
+        if (err == CborNoError)
+            err = escape_text_string(str, &alloc, &offset, chunk, len);
+    }
+
+    if (likely(err == CborErrorNoMoreStringChunks)) {
+        /* A string with no chunks never allocated a buffer: create an empty one
+         * via cbor_realloc() so it matches the cbor_free() used below. */
+        err = *str ? CborNoError : escape_text_string(str, &alloc, &offset, "", 0);
+        if (err == CborNoError)
+            err = cbor_value_finish_string_iteration(it);
+        if (err == CborNoError)
+            return CborNoError;
+    }
+
+    cbor_free(*str);
+    *str = NULL;
+    return err;
+}
+
 static CborError add_value_metadata(FILE *out, CborType type, const ConversionStatus *status)
 {
     int flags = status->flags;
@@ -420,14 +534,20 @@ static CborError stringify_map_key(char **key, CborValue *it, int flags, CborTyp
     return CborErrorJsonNotImplemented;
 #else
     size_t size;
+    char *stringified;
 
-    FILE *memstream = open_memstream(key, &size);
+    FILE *memstream = open_memstream(&stringified, &size);
     if (memstream == NULL)
         return CborErrorOutOfMemory;        /* could also be EMFILE, but it's unlikely */
     CborError err = cbor_value_to_pretty_advance(memstream, it);
 
-    if (unlikely(fclose(memstream) < 0 || *key == NULL))
+    if (unlikely(fclose(memstream) < 0 || stringified == NULL))
         return CborErrorInternalError;
+    if (err == CborNoError) {
+        /* escape the stringified CBOR stream */
+        err = escape_text_string(key, NULL, NULL, stringified, size);
+    }
+    cbor_free(stringified);
     return err;
 #endif
 }
@@ -452,15 +572,14 @@ static CborError map_to_json(FILE *out, CborValue *it, int flags, int nestingLev
     const char *comma = "";
     CborError err;
     while (!cbor_value_at_end(it)) {
-        char *key;
+        char *key = NULL;
         if (fprintf(out, "%s", comma) < 0)
             return CborErrorIO;
         comma = ",";
 
         CborType keyType = cbor_value_get_type(it);
         if (likely(keyType == CborTextStringType)) {
-            size_t n = 0;
-            err = cbor_value_dup_text_string(it, &key, &n, it);
+            err = text_string_to_escaped(&key, it);
         } else if (flags & CborConvertStringifyMapKeys) {
             err = stringify_map_key(&key, it, flags, keyType);
         } else {
@@ -570,8 +689,7 @@ static CborError value_to_json(FILE *out, CborValue *it, int flags, CborType typ
             err = dump_bytestring_base64url(&str, it);
             status->flags = TypeWasNotNative;
         } else {
-            size_t n = 0;
-            err = cbor_value_dup_text_string(it, &str, &n, it);
+            err = text_string_to_escaped(&str, it);
         }
         if (err)
             return err;
diff --git a/src/compilersupport_p.h b/src/compilersupport_p.h
index 55a26e4a..c91ea933 100644
--- a/src/compilersupport_p.h
+++ b/src/compilersupport_p.h
@@ -234,4 +234,16 @@ static inline bool add_check_overflow(size_t v1, size_t v2, size_t *r)
 #endif
 }
 
+/* Stores v1 * v2 in *r and returns true if the multiplication overflowed size_t. */
+static inline bool mul_check_overflow(size_t v1, size_t v2, size_t *r)
+{
+#if ((defined(__GNUC__) && (__GNUC__ >= 5)) && !defined(__INTEL_COMPILER)) || __has_builtin(__builtin_mul_overflow)
+    return __builtin_mul_overflow(v1, v2, r);
+#else
+    /* unsigned multiplication wraps on overflow; detect it by dividing back */
+    *r = v1 * v2;
+    return v1 != 0 && *r / v1 != v2;
+#endif
+}
+
 #endif /* COMPILERSUPPORT_H */
diff --git a/src/memory.h b/src/memory.h
index 0032b93b..6686eb26 100644
--- a/src/memory.h
+++ b/src/memory.h
@@ -26,6 +26,7 @@
 #  include CBOR_CUSTOM_ALLOC_INCLUDE
 #else
 #  include <stdlib.h>
-#  define cbor_malloc malloc
-#  define cbor_free free
+#  define cbor_malloc   malloc
+#  define cbor_realloc  realloc
+#  define cbor_free     free
 #endif
diff --git a/tests/tojson/tst_tojson.cpp b/tests/tojson/tst_tojson.cpp
index 27603ea6..a1b03663 100644
--- a/tests/tojson/tst_tojson.cpp
+++ b/tests/tojson/tst_tojson.cpp
@@ -160,6 +160,18 @@ void addTextStringsData()
     QTest::newRow("_textstring5*2") << raw("\x7f\x63Hel\x62lo\xff") << "\"Hello\"";
     QTest::newRow("_textstring5*5") << raw("\x7f\x61H\x61""e\x61l\x61l\x61o\xff") << "\"Hello\"";
     QTest::newRow("_textstring5*6") << raw("\x7f\x61H\x61""e\x61l\x60\x61l\x61o\xff") << "\"Hello\"";
+
+    // strings containing characters that are escaped in JSON
+    QTest::newRow("null") << raw("\x61\0") << R"("\u0000")";
+    QTest::newRow("bell") << raw("\x61\7") << R"("\u0007")"; // not \\a
+    QTest::newRow("backspace") << raw("\x61\b") << R"("\b")";
+    QTest::newRow("tab") << raw("\x61\t") << R"("\t")";
+    QTest::newRow("carriage-return") << raw("\x61\r") << R"("\r")";
+    QTest::newRow("line-feed") << raw("\x61\n") << R"("\n")";
+    QTest::newRow("form-feed") << raw("\x61\f") << R"("\f")";
+    QTest::newRow("esc") << raw("\x61\x1f") << R"("\u001f")";
+    QTest::newRow("quote") << raw("\x61\"") << R"("\"")";
+    QTest::newRow("backslash") << raw("\x61\\") << R"("\\")";
 }
 
 void addNonJsonData()
@@ -412,6 +424,15 @@ void tst_ToJson::nonStringKeyMaps_data()
     QTest::newRow("map-24-0") << raw("\xa1\x18\x18\0") << "{24: 0}";
     QTest::newRow("_map-0-24") << raw("\xbf\0\x18\x18\xff") << "{_ 0: 24}";
     QTest::newRow("_map-24-0") << raw("\xbf\x18\x18\0\xff") << "{_ 24: 0}";
+
+    // nested strings ought to be escaped
+    QTest::newRow("array-emptystring") << raw("\x81\x60") << R"([\"\"])";
+    QTest::newRow("array-string1") << raw("\x81\x61 ") << R"([\" \"])";
+
+    // and escaped characters in strings end up doubly escaped
+    QTest::newRow("array-string-null") << raw("\x81\x61\0") << R"([\"\\u0000\"])";
+    QTest::newRow("array-string-quote") << raw("\x81\x61\"") << R"([\"\\\"\"])";
+    QTest::newRow("array-string-backslash") << raw("\x81\x61\\") << R"([\"\\\\\"])";
 }
 
 void tst_ToJson::nonStringKeyMaps()