From 58c62a2e45288f83596906e19bc0fa0ae2b8ca8d Mon Sep 17 00:00:00 2001 From: Chris Dickinson Date: Wed, 1 Apr 2015 13:38:36 -0700 Subject: [PATCH 1/6] wip,src: add utf8 consumer/validator --- src/util.cc | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/util.h | 1 + 2 files changed, 189 insertions(+) diff --git a/src/util.cc b/src/util.cc index f382b3d565a8cf..7c764f0918cfd1 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1,8 +1,196 @@ #include "util.h" #include "string_bytes.h" +#define UNI_SUR_HIGH_START (uint32_t) 0xD800 +#define UNI_SUR_LOW_END (uint32_t) 0xDFFF +#define UNI_REPLACEMENT_CHAR (uint32_t) 0x0000FFFD +#define UNI_MAX_LEGAL_UTF32 (uint32_t) 0x0010FFFF + +#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +#define HAS_GCC_BUILTIN_CLZ +#endif + +#ifdef WIN32 +#include +static uint32_t __inline clz(uint32_t xs) { + unsigned long result = 0; + _BitScanReverse(&result, xs); + return (31 - result); +} +#elif defined(HAS_GCC_BUILTIN_CLZ) +#define clz(xs) __builtin_clz(xs) +#else +static inline uint32_t clz(uint32_t xs) { + uint32_t out = 0; + while (xs >> (31 - out)) { + ++out; + } + return out; +} +#endif + namespace node { +typedef size_t (*ErrorStrategy)( + const size_t, const char*, const size_t); +typedef void (*GlyphStrategy)( + const char*, const size_t, const uint32_t, const size_t); + + +static size_t Skip( + const size_t remaining, const char* input, const size_t glyph_size) { + if (remaining > glyph_size) { + return 1; + } + return 0; +} + + +static size_t Halt( + const size_t remaining, const char* input, const size_t glyph_size) { + return 0; +} + + +static void DiscardGlyph( + const char* glyph, + size_t glyph_size, + uint32_t glyph_value, + const size_t pos) { +} + + +static bool IsLegalUTF8Glyph(const uint8_t* input, const size_t length) { + uint8_t acc; + const uint8_t* srcptr = input + length; + switch (length) { + default: return false; + case 4: + acc = (*--srcptr); + if (acc < 0x80 || acc > 0xBF) + return false; + case 3: + acc = (*--srcptr); + if (acc < 0x80 || acc > 0xBF) + return false; + case 2: + acc = (*--srcptr); + if (acc > 0xBF) + return false; + switch (*input) { + case 0xE0: + if (acc < 0xA0) + return false; + break; + case 0xED: + if (acc > 0x9F) + return false; + break; + case 0xF0: + if (acc < 0x90) + return false; + break; + case 0xF4: + if (acc > 0x8F) + return false; + break; + default: + if (acc < 0x80) + return false; + } + case 1: + if (*input >= 0x80 && *input < 0xC2) { + return false; + } + } + return *input <= 0xF4; +} + + +static const uint32_t offsets_from_utf8[6] = { + 0x00000000, 0x00003080, 0x000E2080, + 0x03C82080, 0xFA082080, 0x82082080 +}; + + +template +static size_t Utf8Consume( + char* const input, + const size_t length, + const GlyphStrategy OnGlyph) { + size_t idx = 0; + while (idx < length) { + size_t advance = 0; + uint32_t glyph = 0; + uint8_t extrabytes = (uint8_t)clz(~(static_cast(input[idx])<<24)); + size_t i = idx; + + if (extrabytes + idx > length) { + advance = OnError(length - idx, input, extrabytes); + } else if (!IsLegalUTF8Glyph( + reinterpret_cast(input + idx), extrabytes + 1)) { + advance = OnError(length - idx, input, extrabytes); + } else { + switch (extrabytes) { + case 5: + glyph += (uint8_t) input[i++]; + glyph <<= 6; + case 4: + glyph += (uint8_t) input[i++]; + glyph <<= 6; + case 3: + glyph += (uint8_t) input[i++]; + glyph <<= 6; + case 2: + glyph += (uint8_t) input[i++]; + glyph <<= 6; + case 1: + glyph += (uint8_t) input[i++]; + glyph <<= 6; + case 0: + glyph += (uint8_t) input[i]; + } + + glyph -= offsets_from_utf8[extrabytes]; + + if (glyph > UNI_MAX_LEGAL_UTF32 || + (glyph >= UNI_SUR_HIGH_START && glyph <= UNI_SUR_LOW_END)) { + advance = OnError(length - idx, input, extrabytes); + } else { + advance = extrabytes + 1; + OnGlyph(input + idx, extrabytes + 1, glyph, idx); + } + } + + if (advance == 0) { + break; + } + idx += advance; + } + return idx; +} + + +size_t StripInvalidUtf8Glyphs(char * input, const size_t size) { + size_t idx = 0; + auto on_glyph = [&input, &idx]( + const char* data, size_t size, uint32_t glyph, size_t pos) { + size_t old_idx = idx; + idx += size; + if (pos == old_idx) + return; + memcpy(input + old_idx, data, size); + }; + + return Utf8Consume(input, size, on_glyph); +} + + +bool Utf8Value::IsValidUTF8(char * const input, const size_t size) { + return Utf8Consume(input, size, DiscardGlyph) == size; +} + + Utf8Value::Utf8Value(v8::Isolate* isolate, v8::Handle value) : length_(0), str_(str_st_) { if (value.IsEmpty()) diff --git a/src/util.h b/src/util.h index ea17a155745993..85b7d49286a323 100644 --- a/src/util.h +++ b/src/util.h @@ -189,6 +189,7 @@ class Utf8Value { return length_; }; + static bool IsValidUTF8(char * const, const size_t); private: size_t length_; char* str_; From e7918fecb9b456c49cbe8ec05c8ab4f684f0ce9a Mon Sep 17 00:00:00 2001 From: Chris Dickinson Date: Wed, 1 Apr 2015 14:04:10 -0700 Subject: [PATCH 2/6] nix C-style casts --- src/util.cc | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/util.cc b/src/util.cc index 7c764f0918cfd1..9ee4515e900ff4 100644 --- a/src/util.cc +++ b/src/util.cc @@ -1,10 +1,10 @@ #include "util.h" #include "string_bytes.h" -#define UNI_SUR_HIGH_START (uint32_t) 0xD800 -#define UNI_SUR_LOW_END (uint32_t) 0xDFFF -#define UNI_REPLACEMENT_CHAR (uint32_t) 0x0000FFFD -#define UNI_MAX_LEGAL_UTF32 (uint32_t) 0x0010FFFF +#define UNI_SUR_HIGH_START 0xD800UL +#define UNI_SUR_LOW_END 0xDFFFUL +#define UNI_REPLACEMENT_CHAR 0x0000FFFDUL +#define UNI_MAX_LEGAL_UTF32 0x0010FFFFUL #if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) #define HAS_GCC_BUILTIN_CLZ @@ -122,7 +122,8 @@ static size_t Utf8Consume( while (idx < length) { size_t advance = 0; uint32_t glyph = 0; - uint8_t extrabytes = (uint8_t)clz(~(static_cast(input[idx])<<24)); + uint8_t extrabytes = static_cast( + clz(~(static_cast(input[idx])<<24))); size_t i = idx; if (extrabytes + idx > length) { @@ -133,22 +134,22 @@ static size_t Utf8Consume( } else { switch (extrabytes) { case 5: - glyph += (uint8_t) input[i++]; + glyph += static_cast(input[i++]); glyph <<= 6; case 4: - glyph += (uint8_t) input[i++]; + glyph += static_cast(input[i++]); glyph <<= 6; case 3: - glyph += (uint8_t) input[i++]; + glyph += static_cast(input[i++]); glyph <<= 6; case 2: - glyph += (uint8_t) input[i++]; + glyph += static_cast(input[i++]); glyph <<= 6; case 1: - glyph += (uint8_t) input[i++]; + glyph += static_cast(input[i++]); glyph <<= 6; case 0: - glyph += (uint8_t) input[i]; + glyph += static_cast(input[i]); } glyph -= offsets_from_utf8[extrabytes]; From 09cd52f5b92738072d5b95917dd36f2f7d791726 Mon Sep 17 00:00:00 2001 From: Chris Dickinson Date: Wed, 1 Apr 2015 14:42:41 -0700 Subject: [PATCH 3/6] style fixes, switch from char* to uint8_t* --- src/util.cc | 59 ++++++++++++++++++++++++++++++----------------------- src/util.h | 2 +- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/src/util.cc b/src/util.cc index 9ee4515e900ff4..c9f6bb6d1ef8ef 100644 --- a/src/util.cc +++ b/src/util.cc @@ -31,14 +31,14 @@ static inline uint32_t clz(uint32_t xs) { namespace node { -typedef size_t (*ErrorStrategy)( - const size_t, const char*, const size_t); -typedef void (*GlyphStrategy)( - const char*, const size_t, const uint32_t, const size_t); +typedef size_t (*ErrorStrategy)(size_t, const uint8_t*, size_t); +typedef void (*GlyphStrategy)(const uint8_t*, size_t, uint32_t, size_t); -static size_t Skip( - const size_t remaining, const char* input, const size_t glyph_size) { +inline size_t Skip( + const size_t remaining, + const uint8_t* input, + const size_t glyph_size) { if (remaining > glyph_size) { return 1; } @@ -46,21 +46,23 @@ static size_t Skip( } -static size_t Halt( - const size_t remaining, const char* input, const size_t glyph_size) { +inline size_t Halt( + const size_t remaining, + const uint8_t* input, + const size_t glyph_size) { return 0; } -static void DiscardGlyph( - const char* glyph, - size_t glyph_size, - uint32_t glyph_value, +inline void DiscardGlyph( + const uint8_t* glyph, + const size_t glyph_size, + const uint32_t glyph_value, const size_t pos) { } -static bool IsLegalUTF8Glyph(const uint8_t* input, const size_t length) { +inline bool IsLegalUtf8Glyph(const uint8_t* input, const size_t length) { uint8_t acc; const uint8_t* srcptr = input + length; switch (length) { @@ -69,10 +71,12 @@ static bool IsLegalUTF8Glyph(const uint8_t* input, const size_t length) { acc = (*--srcptr); if (acc < 0x80 || acc > 0xBF) return false; + // fall-through case 3: acc = (*--srcptr); if (acc < 0x80 || acc > 0xBF) return false; + // fall-through case 2: acc = (*--srcptr); if (acc > 0xBF) @@ -98,6 +102,7 @@ static bool IsLegalUTF8Glyph(const uint8_t* input, const size_t length) { if (acc < 0x80) return false; } + // fall-through case 1: if (*input >= 0x80 && *input < 0xC2) { return false; @@ -115,7 +120,7 @@ static const uint32_t offsets_from_utf8[6] = { template static size_t Utf8Consume( - char* const input, + const uint8_t* const input, const size_t length, const GlyphStrategy OnGlyph) { size_t idx = 0; @@ -128,28 +133,32 @@ static size_t Utf8Consume( if (extrabytes + idx > length) { advance = OnError(length - idx, input, extrabytes); - } else if (!IsLegalUTF8Glyph( - reinterpret_cast(input + idx), extrabytes + 1)) { + } else if (!IsLegalUtf8Glyph(input + idx, extrabytes + 1)) { advance = OnError(length - idx, input, extrabytes); } else { switch (extrabytes) { case 5: - glyph += static_cast(input[i++]); + glyph += input[i++]; glyph <<= 6; + // fall-through case 4: - glyph += static_cast(input[i++]); + glyph += input[i++]; glyph <<= 6; + // fall-through case 3: - glyph += static_cast(input[i++]); + glyph += input[i++]; glyph <<= 6; + // fall-through case 2: - glyph += static_cast(input[i++]); + glyph += input[i++]; glyph <<= 6; + // fall-through case 1: - glyph += static_cast(input[i++]); + glyph += input[i++]; glyph <<= 6; + // fall-through case 0: - glyph += static_cast(input[i]); + glyph += input[i]; } glyph -= offsets_from_utf8[extrabytes]; @@ -172,10 +181,10 @@ static size_t Utf8Consume( } -size_t StripInvalidUtf8Glyphs(char * input, const size_t size) { +size_t StripInvalidUtf8Glyphs(uint8_t* input, const size_t size) { size_t idx = 0; auto on_glyph = [&input, &idx]( - const char* data, size_t size, uint32_t glyph, size_t pos) { + const uint8_t* data, size_t size, uint32_t glyph, size_t pos) { size_t old_idx = idx; idx += size; if (pos == old_idx) @@ -187,7 +196,7 @@ size_t StripInvalidUtf8Glyphs(char * input, const size_t size) { } -bool Utf8Value::IsValidUTF8(char * const input, const size_t size) { +bool Utf8Value::IsValidUtf8(const uint8_t * const input, const size_t size) { return Utf8Consume(input, size, DiscardGlyph) == size; } diff --git a/src/util.h b/src/util.h index 85b7d49286a323..dd8e97b449aeb6 100644 --- a/src/util.h +++ b/src/util.h @@ -189,7 +189,7 @@ class Utf8Value { return length_; }; - static bool IsValidUTF8(char * const, const size_t); + static bool IsValidUtf8(const uint8_t* const, const size_t); private: size_t length_; char* str_; From 5cb6d3dd090a0adb4814616ec6050875b8dc593d Mon Sep 17 00:00:00 2001 From: Chris Dickinson Date: Wed, 1 Apr 2015 14:44:12 -0700 Subject: [PATCH 4/6] switch remaining functions from static to inline --- src/util.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util.cc b/src/util.cc index c9f6bb6d1ef8ef..e99c5fa2d17a24 100644 --- a/src/util.cc +++ b/src/util.cc @@ -12,7 +12,7 @@ #ifdef WIN32 #include -static uint32_t __inline clz(uint32_t xs) { +inline uint32_t __inline clz(uint32_t xs) { unsigned long result = 0; _BitScanReverse(&result, xs); return (31 - result); @@ -20,7 +20,7 @@ static uint32_t __inline clz(uint32_t xs) { #elif defined(HAS_GCC_BUILTIN_CLZ) #define clz(xs) __builtin_clz(xs) #else -static inline uint32_t clz(uint32_t xs) { +inline uint32_t clz(uint32_t xs) { uint32_t out = 0; while (xs >> (31 - out)) { ++out; @@ -119,7 +119,7 @@ static const uint32_t offsets_from_utf8[6] = { template -static size_t Utf8Consume( +inline size_t Utf8Consume( const uint8_t* const input, const size_t length, const GlyphStrategy OnGlyph) { From 7cd5465e9d96365db40666c0c2621d915f37a7e4 Mon Sep 17 00:00:00 2001 From: Chris Dickinson Date: Wed, 1 Apr 2015 15:36:07 -0700 Subject: [PATCH 5/6] review fixes --- src/util.cc | 47 +++++++++++++++++++++++------------------------ src/util.h | 1 + 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/util.cc b/src/util.cc index e99c5fa2d17a24..bdd8c2aed754b6 100644 --- a/src/util.cc +++ b/src/util.cc @@ -12,20 +12,27 @@ #ifdef WIN32 #include -inline uint32_t __inline clz(uint32_t xs) { +inline uint32_t clz(uint8_t xs) { unsigned long result = 0; + uint32_t input = static_cast(xs) << 24; _BitScanReverse(&result, xs); return (31 - result); } #elif defined(HAS_GCC_BUILTIN_CLZ) -#define clz(xs) __builtin_clz(xs) +#define clz(xs) __builtin_clz(static_cast(xs) << 24) #else -inline uint32_t clz(uint32_t xs) { - uint32_t out = 0; - while (xs >> (31 - out)) { - ++out; - } - return out; +inline uint32_t log2(uint8_t v) { + const uint32_t r = (v > 15) << 2; + v >>= r; + const uint32_t s = (v > 3) << 1; + v >>= s; + v >>= 1; + return r | s | v; +} + +inline uint32_t clz(uint8_t v) { + // clz(0) == 7. Add a zero check if that's an issue. + return 7 - log2(v); } #endif @@ -127,8 +134,7 @@ inline size_t Utf8Consume( while (idx < length) { size_t advance = 0; uint32_t glyph = 0; - uint8_t extrabytes = static_cast( - clz(~(static_cast(input[idx])<<24))); + uint8_t extrabytes = input[idx] ? clz(~input[idx]) : 0; size_t i = idx; if (extrabytes + idx > length) { @@ -136,15 +142,8 @@ inline size_t Utf8Consume( } else if (!IsLegalUtf8Glyph(input + idx, extrabytes + 1)) { advance = OnError(length - idx, input, extrabytes); } else { + ASSERT(extrabytes < 4); switch (extrabytes) { - case 5: - glyph += input[i++]; - glyph <<= 6; - // fall-through - case 4: - glyph += input[i++]; - glyph <<= 6; - // fall-through case 3: glyph += input[i++]; glyph <<= 6; @@ -181,18 +180,18 @@ inline size_t Utf8Consume( } -size_t StripInvalidUtf8Glyphs(uint8_t* input, const size_t size) { +size_t Utf8Value::StripInvalidUtf8Glyphs(uint8_t* const input, const size_t size) { size_t idx = 0; - auto on_glyph = [&input, &idx]( + auto on_glyph = [input, &idx]( const uint8_t* data, size_t size, uint32_t glyph, size_t pos) { size_t old_idx = idx; idx += size; - if (pos == old_idx) - return; - memcpy(input + old_idx, data, size); + if (old_idx == pos) return; + memmove(input + old_idx, data, size); }; - return Utf8Consume(input, size, on_glyph); + size_t copied = Utf8Consume(input, size, on_glyph); + return idx; } diff --git a/src/util.h b/src/util.h index dd8e97b449aeb6..a2881adb6b6924 100644 --- a/src/util.h +++ b/src/util.h @@ -190,6 +190,7 @@ class Utf8Value { }; static bool IsValidUtf8(const uint8_t* const, const size_t); + static size_t StripInvalidUtf8Glyphs(uint8_t* const, const size_t); private: size_t length_; char* str_; From e4bc82f145d541fd9359782c0b9c745e6ab5b5a2 Mon Sep 17 00:00:00 2001 From: Chris Dickinson Date: Wed, 1 Apr 2015 20:00:09 -0700 Subject: [PATCH 6/6] =?UTF-8?q?nix=20clz=20=E2=80=93=20turns=20out=20@bnoo?= =?UTF-8?q?rdhuis'=20implementation=20is=20faster=20=F0=9F=8F=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/util.cc | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/src/util.cc b/src/util.cc index bdd8c2aed754b6..e17a6f286b9e6c 100644 --- a/src/util.cc +++ b/src/util.cc @@ -6,21 +6,6 @@ #define UNI_REPLACEMENT_CHAR 0x0000FFFDUL #define UNI_MAX_LEGAL_UTF32 0x0010FFFFUL -#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) -#define HAS_GCC_BUILTIN_CLZ -#endif - -#ifdef WIN32 -#include -inline uint32_t clz(uint8_t xs) { - unsigned long result = 0; - uint32_t input = static_cast(xs) << 24; - _BitScanReverse(&result, xs); - return (31 - result); -} -#elif defined(HAS_GCC_BUILTIN_CLZ) -#define clz(xs) __builtin_clz(static_cast(xs) << 24) -#else inline uint32_t log2(uint8_t v) { const uint32_t r = (v > 15) << 2; v >>= r; @@ -31,10 +16,8 @@ inline uint32_t log2(uint8_t v) { } inline uint32_t clz(uint8_t v) { - // clz(0) == 7. Add a zero check if that's an issue. return 7 - log2(v); } -#endif namespace node { @@ -190,8 +173,7 @@ size_t Utf8Value::StripInvalidUtf8Glyphs(uint8_t* const input, const size_t size memmove(input + old_idx, data, size); }; - size_t copied = Utf8Consume(input, size, on_glyph); - return idx; + return Utf8Consume(input, size, on_glyph); }