From 549e589dec375cfbb0f921bad418ac3133c0b90a Mon Sep 17 00:00:00 2001 From: Vojtech Forejt Date: Tue, 25 Apr 2017 09:13:11 +0100 Subject: [PATCH 1/2] Remove an unused conversion function that is broken anyway, and tidy utf32 to utf8. --- src/util/unicode.cpp | 39 ++++++++------------------------------- src/util/unicode.h | 1 - 2 files changed, 8 insertions(+), 32 deletions(-) diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index 1e280783aff..e357353d767 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -154,17 +154,17 @@ std::wstring widen(const std::string &s) /*******************************************************************\ -Function: utf32_to_utf8 +Function: utf8_append_code - Inputs: + Inputs: character to append, string to append to Outputs: - Purpose: + Purpose: Appends a unicode character to a utf8-encoded string \*******************************************************************/ -void utf32_to_utf8(unsigned int c, std::string &result) +static void utf8_append_code(unsigned int c, std::string &result) { if(c<=0x7f) result+=static_cast(c); @@ -192,9 +192,10 @@ void utf32_to_utf8(unsigned int c, std::string &result) Function: utf32_to_utf8 - Inputs: + Inputs: utf32-encoded wide string - Outputs: + Outputs: utf8-encoded string with the same unicode characters + as the input. 
Purpose: @@ -207,31 +208,7 @@ std::string utf32_to_utf8(const std::basic_string &s) result.reserve(s.size()); // at least that long for(const auto c : s) - utf32_to_utf8(c, result); - - return result; -} - -/*******************************************************************\ - -Function: utf16_to_utf8 - - Inputs: - - Outputs: - - Purpose: - -\*******************************************************************/ - -std::string utf16_to_utf8(const std::basic_string &s) -{ - std::string result; - - result.reserve(s.size()); // at least that long - - for(const auto c : s) - utf32_to_utf8(c, result); + utf8_append_code(c, result); return result; } diff --git a/src/util/unicode.h b/src/util/unicode.h index c4bcab617d4..15aafe8ae13 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -20,7 +20,6 @@ std::string narrow(const std::wstring &s); std::wstring widen(const std::string &s); std::string utf32_to_utf8(const std::basic_string &s); -std::string utf16_to_utf8(const std::basic_string &s); std::wstring utf8_to_utf16_big_endian(const std::string &); std::wstring utf8_to_utf16_little_endian(const std::string &); From 49b77cea3b2d2dac5cc906f38a36b101b1cba484 Mon Sep 17 00:00:00 2001 From: Vojtech Forejt Date: Tue, 25 Apr 2017 10:09:31 +0100 Subject: [PATCH 2/2] Change utf8 to utf16 conversion to not require codecvt. A unit test is added to check (on a few instances) equivalence with the original implementation. Decrease the required gcc version. --- COMPILING | 2 +- src/util/unicode.cpp | 160 +++++++++++++++++++++++++++++++++++++++---- unit/Makefile | 4 ++ unit/unicode.cpp | 94 +++++++++++++++++++++++++ 4 files changed, 245 insertions(+), 15 deletions(-) create mode 100644 unit/unicode.cpp diff --git a/COMPILING b/COMPILING index 7cbdfcaab13..4b47b38b790 100644 --- a/COMPILING +++ b/COMPILING @@ -38,7 +38,7 @@ We assume that you have a Debian/Ubuntu or Red Hat-like distribution. 
yum install gcc gcc-c++ flex bison perl-libwww-perl patch devtoolset-6 - Note that you need g++ version 5.2 or newer. + Note that you need g++ version 4.9 or newer. 1) As a user, get the CBMC source via diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index e357353d767..9788ea6a897 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -8,9 +8,9 @@ Author: Daniel Kroening, kroening@kroening.com #include #include -#include #include #include +#include #include "unicode.h" @@ -20,6 +20,24 @@ Author: Daniel Kroening, kroening@kroening.com /*******************************************************************\ +Function: is_little_endian_arch + + Inputs: + + Outputs: True if the architecture is little_endian + + Purpose: Determine endianness of the architecture + +\*******************************************************************/ + +bool is_little_endian_arch() +{ + uint32_t i=1; + return reinterpret_cast(i); +} + +/*******************************************************************\ + Function: narrow Inputs: @@ -242,20 +260,141 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide) /*******************************************************************\ +Function: do_swap_bytes + + Inputs: A 16-bit integer + + Outputs: A 16-bit integer with bytes swapped + + Purpose: A helper function for dealing with different UTF16 endians + +\*******************************************************************/ + +uint16_t do_swap_bytes(uint16_t x) +{ + uint16_t b1=x & 0xFF; + uint16_t b2=x & 0xFF00; + return (b1 << 8) | (b2 >> 8); +} + + +void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result) +{ + // we do not treat 0xD800 to 0xDFFF, although + // they are not valid unicode symbols + + if(code<0xFFFF) + { // code is encoded as one UTF16 character + // we just take the code and possibly swap the bytes + unsigned int a=(swap_bytes)?do_swap_bytes(code):code; + result+=static_cast(a); + } + else // code is encoded as two UTF16 
characters + { + // if this is valid unicode, we have + // code<0x10FFFF + // but let's not check it programmatically + + // encode the code in UTF16, possibly swapping bytes. + code=code-0x10000; + unsigned int i1=((code>>10) & 0x3ff) | 0xD800; + unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast(i1)):i1; + result+=static_cast(a1); + unsigned int i2=(code & 0x3ff) | 0xDC00; + unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast(i2)):i2; + result+=static_cast(a2); + } +} + + +/*******************************************************************\ + +Function: utf8_to_utf16 + + Inputs: String in UTF-8 format, bool value indicating whether the + endianness should be different from the architecture one. + + Outputs: String in UTF-16 format. The encoding follows the + endianness of the architecture iff swap_bytes is false. + + Purpose: + +\*******************************************************************/ +std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes) +{ + std::wstring result; + result.reserve(in.size()); + int i=0; + while(i > converter; - return converter.from_bytes(in); + bool swap_bytes=is_little_endian_arch(); + return utf8_to_utf16(in, swap_bytes); } /*******************************************************************\ @@ -266,21 +405,14 @@ Function: utf8_to_utf16_little_endian Outputs: String in UTF-16LE format - Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+ + Purpose: \*******************************************************************/ std::wstring utf8_to_utf16_little_endian(const std::string& in) { - const std::codecvt_mode mode=std::codecvt_mode::little_endian; - - // default largest value codecvt_utf8_utf16 reads without error is 0x10ffff - // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16 - const unsigned long maxcode=0x10ffff; - - typedef std::codecvt_utf8_utf16 codecvt_utf8_utf16t; - std::wstring_convert converter; - return converter.from_bytes(in); + bool swap_bytes=!is_little_endian_arch(); 
+ return utf8_to_utf16(in, swap_bytes); } /*******************************************************************\ diff --git a/unit/Makefile b/unit/Makefile index 049d3f84ac0..f916b59e593 100644 --- a/unit/Makefile +++ b/unit/Makefile @@ -10,6 +10,7 @@ SRC = cpp_parser.cpp \ sharing_node.cpp \ smt2_parser.cpp \ string_utils.cpp \ + unicode.cpp \ wp.cpp \ # Empty last line @@ -76,3 +77,6 @@ sharing_map$(EXEEXT): sharing_map$(OBJEXT) sharing_node$(EXEEXT): sharing_node$(OBJEXT) $(LINKBIN) +unicode$(EXEEXT): unicode$(OBJEXT) + $(LINKBIN) + diff --git a/unit/unicode.cpp b/unit/unicode.cpp new file mode 100644 index 00000000000..9670856419f --- /dev/null +++ b/unit/unicode.cpp @@ -0,0 +1,94 @@ +/*******************************************************************\ + +Module: Unicode conversion tests. + +Author: Vojtech Forejt, forejtv@diffblue.com + +\*******************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include + +// This unit test compares our implementation with codecvt implementation, +// checking bit-by-bit equivalence of results. 
+ +bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) +{ + if(a.size() != b.size()) + return false; + const char + *pa=reinterpret_cast(&a[0]), + *pb=reinterpret_cast(&b[0]); + for(std::size_t i=0; ib.size())?a.size():b.size(); + const unsigned char + *pa=reinterpret_cast(&a[0]), + *pb=reinterpret_cast(&b[0]); + for(std::size_t i=0; i codecvt_utf8_utf16t; + std::wstring_convert converter; + std::wstring s2=converter.from_bytes(in); + + assert(paranoid_wstr_equals(s1, s2)); +} + +void compare_utf8_to_utf16_little_endian(std::string& in) +{ + std::wstring s1=utf8_to_utf16_little_endian(in); + + const std::codecvt_mode mode=std::codecvt_mode::little_endian; + const unsigned long maxcode=0x10ffff; + + typedef std::codecvt_utf8_utf16 codecvt_utf8_utf16t; + std::wstring_convert converter; + std::wstring s2=converter.from_bytes(in); + + assert(paranoid_wstr_equals(s1, s2)); +} + +int main() +{ + std::string s; + s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8"; + compare_utf8_to_utf16_big_endian(s); + compare_utf8_to_utf16_little_endian(s); + s=u8"$¢€𐍈"; + compare_utf8_to_utf16_big_endian(s); + compare_utf8_to_utf16_little_endian(s); + s=u8"𐐏𤭢"; + compare_utf8_to_utf16_big_endian(s); + compare_utf8_to_utf16_little_endian(s); + s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ"; + compare_utf8_to_utf16_big_endian(s); + compare_utf8_to_utf16_little_endian(s); +} +