From 549e589dec375cfbb0f921bad418ac3133c0b90a Mon Sep 17 00:00:00 2001
From: Vojtech Forejt <forejtv@gmail.com>
Date: Tue, 25 Apr 2017 09:13:11 +0100
Subject: [PATCH 1/2] Remove an unused conversion function that is broken
 anyway, and tidy utf32 to utf8.

---
 src/util/unicode.cpp | 39 ++++++++-------------------------------
 src/util/unicode.h   |  1 -
 2 files changed, 8 insertions(+), 32 deletions(-)
diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp
index 1e280783aff..e357353d767 100644
--- a/src/util/unicode.cpp
+++ b/src/util/unicode.cpp
@@ -154,17 +154,17 @@ std::wstring widen(const std::string &s)
 
 /*******************************************************************\
 
-Function: utf32_to_utf8
+Function: utf8_append_code
 
-  Inputs:
+  Inputs: character to append, string to append to
 
  Outputs:
 
- Purpose:
+ Purpose: Appends a unicode character to a utf8-encoded string
 
 \*******************************************************************/
 
-void utf32_to_utf8(unsigned int c, std::string &result)
+static void utf8_append_code(unsigned int c, std::string &result)
 {
   if(c<=0x7f)
     result+=static_cast<char>(c);
@@ -192,9 +192,10 @@ void utf32_to_utf8(unsigned int c, std::string &result)
 
 Function: utf32_to_utf8
 
-  Inputs:
+  Inputs: utf32-encoded wide string
 
- Outputs:
+ Outputs: utf8-encoded string with the same unicode characters
+          as the input.
 
  Purpose:
 
@@ -207,31 +208,7 @@ std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
   result.reserve(s.size()); // at least that long
 
   for(const auto c : s)
-    utf32_to_utf8(c, result);
-
-  return result;
-}
-
-/*******************************************************************\
-
-Function: utf16_to_utf8
-
-  Inputs:
-
- Outputs:
-
- Purpose:
-
-\*******************************************************************/
-
-std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s)
-{
-  std::string result;
-
-  result.reserve(s.size()); // at least that long
-
-  for(const auto c : s)
-    utf32_to_utf8(c, result);
+    utf8_append_code(c, result);
 
   return result;
 }
diff --git a/src/util/unicode.h b/src/util/unicode.h
index c4bcab617d4..15aafe8ae13 100644
--- a/src/util/unicode.h
+++ b/src/util/unicode.h
@@ -20,7 +20,6 @@ std::string narrow(const std::wstring &s);
 std::wstring widen(const std::string &s);
 
 std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
-std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);
 
 std::wstring utf8_to_utf16_big_endian(const std::string &);
 std::wstring utf8_to_utf16_little_endian(const std::string &);

From 49b77cea3b2d2dac5cc906f38a36b101b1cba484 Mon Sep 17 00:00:00 2001
From: Vojtech Forejt <forejtv@gmail.com>
Date: Tue, 25 Apr 2017 10:09:31 +0100
Subject: [PATCH 2/2] Change utf8 to utf16 conversion to not require codecvt.

A unit test is added to check (on a few instances) equivalence
with the original implementation.

Decrease the required gcc version.
---
 COMPILING            |   2 +-
 src/util/unicode.cpp | 160 +++++++++++++++++++++++++++++++++++++++----
 unit/Makefile        |   4 ++
 unit/unicode.cpp     |  94 +++++++++++++++++++++++++
 4 files changed, 245 insertions(+), 15 deletions(-)
 create mode 100644 unit/unicode.cpp

diff --git a/COMPILING b/COMPILING
index 7cbdfcaab13..4b47b38b790 100644
--- a/COMPILING
+++ b/COMPILING
@@ -38,7 +38,7 @@ We assume that you have a Debian/Ubuntu or Red Hat-like distribution.
 
    yum install gcc gcc-c++ flex bison perl-libwww-perl patch devtoolset-6
 
-   Note that you need g++ version 5.2 or newer.
+   Note that you need g++ version 4.9 or newer.
 
 1) As a user, get the CBMC source via
 
diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp
index e357353d767..9788ea6a897 100644
--- a/src/util/unicode.cpp
+++ b/src/util/unicode.cpp
@@ -8,9 +8,9 @@ Author: Daniel Kroening, kroening@kroening.com
 
 #include <cstring>
 #include <locale>
-#include <codecvt>
 #include <iomanip>
 #include <sstream>
+#include <cstdint>
 
 #include "unicode.h"
 
@@ -20,6 +20,24 @@ Author: Daniel Kroening, kroening@kroening.com
 
 /*******************************************************************\
 
+Function: is_little_endian_arch
+
+  Inputs:
+
+ Outputs: True if the architecture is little_endian
+
+ Purpose: Determine endianness of the architecture
+
+\*******************************************************************/
+
+bool is_little_endian_arch()
+{
+  uint32_t i=1; // value 1: only the least significant byte is non-zero
+  return reinterpret_cast<uint8_t &>(i); // first byte in memory is 1 iff LE
+}
+
+/*******************************************************************\
+
 Function: narrow
 
   Inputs:
@@ -242,20 +260,141 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
 
 /*******************************************************************\
 
+Function: do_swap_bytes
+
+  Inputs: A 16-bit integer
+
+ Outputs: A 16-bit integer with bytes swapped
+
+ Purpose: A helper function for dealing with different UTF16 endianness
+
+\*******************************************************************/
+
+uint16_t do_swap_bytes(uint16_t x)
+{
+  uint16_t b1=x & 0xFF; // low byte
+  uint16_t b2=x & 0xFF00; // high byte, still in bits 8-15
+  return (b1 << 8) | (b2 >> 8); // exchange the two bytes
+}
+
+
+void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
+{
+  // Appends the UTF-16 encoding of the code point `code` to `result`; every
+  // 16-bit unit is byte-swapped first when `swap_bytes` is set. Values in
+  // 0xD800 to 0xDFFF are not valid unicode symbols but are passed through.
+  if(code<=0xFFFF)
+  { // code is encoded as one UTF16 character; this includes 0xFFFF itself,
+    // which would otherwise wrap around in the subtraction below
+    unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
+    result+=static_cast<wchar_t>(a);
+  }
+  else // code is encoded as two UTF16 characters (a surrogate pair)
+  {
+    // if this is valid unicode, we have
+    // code<=0x10FFFF
+    // but let's not check it programmatically
+
+    // encode the code in UTF16, possibly swapping bytes.
+    code=code-0x10000;
+    unsigned int i1=((code>>10) & 0x3ff) | 0xD800; // high surrogate
+    unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
+    result+=static_cast<wchar_t>(a1);
+    unsigned int i2=(code & 0x3ff) | 0xDC00; // low surrogate
+    unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
+    result+=static_cast<wchar_t>(a2);
+  }
+}
+
+
+/*******************************************************************\
+
+Function: utf8_to_utf16
+
+  Inputs: String in UTF-8 format, bool value indicating whether the
+          endianness should be different from the architecture one.
+
+ Outputs: String in UTF-16 format. The encoding follows the
+          endianness of the architecture iff swap_bytes is false.
+
+ Purpose:
+
+\*******************************************************************/
+std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
+{
+    // Decodes `in` one unicode character at a time and appends each to the
+    // result through utf16_append_code, passing swap_bytes on unchanged.
+    std::wstring result;
+    result.reserve(in.size());
+    std::size_t i=0; // unsigned, like in.size()
+    while(i<in.size())
+    {
+      unsigned char c=in[i++];
+      unsigned int code=0;
+      // the ifs that follow find out how many UTF8 characters (1-4) store the
+      // next unicode character, as determined by the most significant bits.
+      if(c<=0x7F)
+      {
+        // if it's one character, then code is exactly the value
+        code=c;
+      }
+      else if(c>=0xC0 && c<=0xDF && i<in.size())
+      { // in other cases, we need to read the right number of chars and decode
+        // note: to capture all incorrect strings, we should also check that
+        // whatever follows the first character starts with bits 10.
+        code=(c & 0x1F) << 6;
+        c=in[i++];
+        code+=c  & 0x3F;
+      }
+      else if(c>=0xE0 && c<=0xEF && i+1<in.size())
+      {
+        code=(c & 0xF) << 12;
+        c=in[i++];
+        code+=(c & 0x3F) << 6;
+        c=in[i++];
+        code+=c & 0x3F;
+      }
+      else if(c>=0xF0 && c<=0xF7 && i+2<in.size())
+      {
+        code=(c & 0x7) << 18;
+        c=in[i++];
+        code+=(c & 0x3F) << 12;
+        c=in[i++];
+        code+=(c & 0x3F) << 6;
+        c=in[i++];
+        code+=c & 0x3F;
+      }
+      else
+      {
+        // The string is not a valid UTF8 string! Either it has some characters
+        // missing from a multi-character unicode symbol, or a symbol starts
+        // with a continuation char (bits 10) or a char with too high value.
+        // For now, let's replace the character with a space
+        code=32;
+      }
+
+      utf16_append_code(code, swap_bytes, result);
+    }
+
+    return result;
+}
+
+/*******************************************************************\
+
 Function: utf8_to_utf16_big_endian
 
   Inputs: String in UTF-8 format
 
  Outputs: String in UTF-16BE format
 
- Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
+ Purpose:
 
 \*******************************************************************/
 
 std::wstring utf8_to_utf16_big_endian(const std::string& in)
 {
-  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t> > converter;
-  return converter.from_bytes(in);
+  bool swap_bytes=is_little_endian_arch(); // swap on little-endian hosts
+  return utf8_to_utf16(in, swap_bytes);
 }
 
 /*******************************************************************\
@@ -266,21 +405,14 @@ Function: utf8_to_utf16_little_endian
 
  Outputs: String in UTF-16LE format
 
- Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
+ Purpose:
 
 \*******************************************************************/
 
 std::wstring utf8_to_utf16_little_endian(const std::string& in)
 {
-  const std::codecvt_mode mode=std::codecvt_mode::little_endian;
-
-  // default largest value codecvt_utf8_utf16 reads without error is 0x10ffff
-  // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
-  const unsigned long maxcode=0x10ffff;
-
-  typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
-  std::wstring_convert<codecvt_utf8_utf16t> converter;
-  return converter.from_bytes(in);
+  bool swap_bytes=!is_little_endian_arch(); // swap on big-endian hosts
+  return utf8_to_utf16(in, swap_bytes);
 }
 
 /*******************************************************************\
diff --git a/unit/Makefile b/unit/Makefile
index 049d3f84ac0..f916b59e593 100644
--- a/unit/Makefile
+++ b/unit/Makefile
@@ -10,6 +10,7 @@ SRC = cpp_parser.cpp \
       sharing_node.cpp \
       smt2_parser.cpp \
       string_utils.cpp \
+      unicode.cpp \
       wp.cpp \
       # Empty last line
 
@@ -76,3 +77,6 @@ sharing_map$(EXEEXT): sharing_map$(OBJEXT)
 sharing_node$(EXEEXT): sharing_node$(OBJEXT)
 	$(LINKBIN)
 
+unicode$(EXEEXT): unicode$(OBJEXT)
+	$(LINKBIN)
+
diff --git a/unit/unicode.cpp b/unit/unicode.cpp
new file mode 100644
index 00000000000..9670856419f
--- /dev/null
+++ b/unit/unicode.cpp
@@ -0,0 +1,94 @@
+/*******************************************************************\
+
+Module: Unicode conversion tests.
+
+Author: Vojtech Forejt, forejtv@diffblue.com
+
+\*******************************************************************/
+
+#include <cassert>
+#include <vector>
+#include <string>
+#include <codecvt>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+
+#include <util/unicode.h>
+
+// This unit test compares our implementation with codecvt implementation,
+// checking bit-by-bit equivalence of results.
+
+bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
+{
+  if(a.size() != b.size()) // different lengths can never be equal
+    return false;
+  const char // view both character buffers as raw bytes
+    *pa=reinterpret_cast<const char *>(&a[0]),
+    *pb=reinterpret_cast<const char *>(&b[0]);
+  for(std::size_t i=0; i<a.size() * sizeof(a.front()); ++i) // byte count
+  {
+    if(pa[i] != pb[i])
+      return false;
+  }
+  return true;
+}
+
+// helper print function, can be called for debugging problems
+void wstr_print(const std::wstring &a, const std::wstring &b)
+{
+  // Dumps the raw bytes of a and b side by side, one byte pair per line;
+  // "x" marks positions past the end of the shorter string.
+  const std::size_t unit=sizeof(a.front());
+  const std::size_t endi=(a.size()>b.size())?a.size():b.size();
+  const unsigned char
+    *pa=reinterpret_cast<const unsigned char *>(&a[0]),
+    *pb=reinterpret_cast<const unsigned char *>(&b[0]);
+  for(std::size_t i=0; i<endi*unit; ++i)
+    std::cout << ((i<a.size()*unit)?std::to_string(pa[i]):"x") << ' '
+      << ((i<b.size()*unit)?std::to_string(pb[i]):"x") << '\n';
+  std::cout << '\n';
+}
+
+void compare_utf8_to_utf16_big_endian(const std::string& in)
+{
+  std::wstring s1=utf8_to_utf16_big_endian(in); // implementation under test
+  // reference result from the standard library's codecvt facet
+  typedef std::codecvt_utf8_utf16<wchar_t> codecvt_utf8_utf16t;
+  std::wstring_convert<codecvt_utf8_utf16t> converter;
+  std::wstring s2=converter.from_bytes(in);
+
+  assert(paranoid_wstr_equals(s1, s2));
+}
+
+void compare_utf8_to_utf16_little_endian(const std::string& in)
+{
+  std::wstring s1=utf8_to_utf16_little_endian(in); // implementation under test
+  // reference result from codecvt configured for little-endian output
+  const std::codecvt_mode mode=std::codecvt_mode::little_endian;
+  const unsigned long maxcode=0x10ffff;
+
+  typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
+  std::wstring_convert<codecvt_utf8_utf16t> converter;
+  std::wstring s2=converter.from_bytes(in);
+
+  assert(paranoid_wstr_equals(s1, s2));
+}
+
+int main()
+{
+  std::string s;
+  s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8"; // U+0070, then U+00xx
+  compare_utf8_to_utf16_big_endian(s);
+  compare_utf8_to_utf16_little_endian(s);
+  s=u8"$¢€𐍈"; // 1-, 2-, 3- and 4-byte UTF-8 sequences
+  compare_utf8_to_utf16_big_endian(s);
+  compare_utf8_to_utf16_little_endian(s);
+  s=u8"𐐏𤭢"; // both above U+FFFF, i.e. two UTF-16 units each
+  compare_utf8_to_utf16_big_endian(s);
+  compare_utf8_to_utf16_little_endian(s);
+  s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
+  compare_utf8_to_utf16_big_endian(s);
+  compare_utf8_to_utf16_little_endian(s);
+}
+