-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMSF_UTF.cpp
More file actions
270 lines (238 loc) · 12.7 KB
/
MSF_UTF.cpp
File metadata and controls
270 lines (238 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#include "MSF_UTF.h"
#include "MSF_Assert.h"
#include "MSF_Utilities.h"
//-------------------------------------------------------------------------------------------------
// The leading bits of the first byte tell you how many bytes the character uses.
// 0b0xxxxxxx - ascii character, 1 byte
// 0b110xxxxx - 2 bytes, 5 bits in this segment (total 11 bits)
// 0b1110xxxx - 3 bytes, 4 bits in this segment (total 16 bits)
// 0b11110xxx - 4 bytes, 3 bits in this segment (total 21 bits)
//
// Subsequent bytes all have the form 0b10xxxxxx and carry 6 bits each.
//-------------------------------------------------------------------------------------------------
// Decodes the UTF-8 sequence that starts at aString.
// Returns the decoded code point together with the number of bytes consumed. When the
// null terminator is met before the sequence is complete, the code point is reported as 0
// and the count covers only the bytes that precede the terminator.
MSF_CodeRead MSF_ReadCodePoint(char const* aString)
{
    // view the input as unsigned bytes so the range comparisons below behave
    uint8_t const* const bytes = (uint8_t const*)aString;
    uint32_t const first = bytes[0];

    // 0xxxxxxx: plain ascii, the byte is the code point
    if (first < 0x80)
        return { first, 1 };

    // every remaining form needs at least one continuation byte
    if (bytes[1] == 0)
        return { 0, 1 };

    // any lead below 0b11100000 is decoded as a two byte sequence (5 + 6 bits)
    if (first < 0xE0)
    {
        uint32_t const twoByteCode =
            ((first & 0x1F) << 6) |
            (bytes[1] & 0x3F);
        return { twoByteCode, 2 };
    }

    if (bytes[2] == 0)
        return { 0, 2 };

    // 1110xxxx: three byte sequence (4 + 6 + 6 bits)
    if (first < 0xF0)
    {
        uint32_t const threeByteCode =
            ((first & 0x0F) << 12) |
            ((bytes[1] & 0x3F) << 6) |
            (bytes[2] & 0x3F);
        return { threeByteCode, 3 };
    }

    if (bytes[3] == 0)
        return { 0, 3 };

    // everything else is decoded as a four byte sequence (3 + 6 + 6 + 6 bits)
    uint32_t const fourByteCode =
        ((first & 0x07) << 18) |
        ((bytes[1] & 0x3F) << 12) |
        ((bytes[2] & 0x3F) << 6) |
        (bytes[3] & 0x3F);
    return { fourByteCode, 4 };
}
//-------------------------------------------------------------------------------------------------
// char8_t flavour: the data is UTF-8, so reinterpret it as plain char and use the
// char const* overload.
MSF_CodeRead MSF_ReadCodePoint(char8_t const* aString)
{
    char const* const utf8Bytes = (char const*)aString;
    return MSF_ReadCodePoint(utf8Bytes);
}
//-------------------------------------------------------------------------------------------------
// Values in the range 0xD800-0xDFFF are reserved and used to identify when a Unicode character is split
// into two UTF-16 code units (a surrogate pair).
//
// When split, 0x10000 is subtracted from the code point before it is stored, so the pair only has to
// carry 20 bits of information in total (10 bits in each code unit).
//-------------------------------------------------------------------------------------------------
// Decodes the UTF-16 sequence that starts at aString.
// Returns the decoded code point together with the number of 16 bit units consumed.
// A unit in the surrogate range 0xD800-0xDFFF is combined with the unit that follows it;
// if that following unit is the null terminator, the code point is reported as 0 with a
// count of 1.
// NOTE(review): a low surrogate (0xDC00-0xDFFF) in the lead position is not rejected, it is
// decoded as if it were a high surrogate.
MSF_CodeRead MSF_ReadCodePoint(char16_t const* aString)
{
    uint32_t const lead = aString[0];

    // if (lead >= 0xD800 && lead < 0xE000) but with a single branch:
    // the unsigned subtraction wraps for values below 0xD800 and fails the compare
    if ((lead - 0xD800) < 0xE000 - 0xD800)
    {
        // unexpected end of string, consume the lead but return null terminator
        if (!aString[1])
        {
            return { 0, 1 };
        }

        // Each surrogate carries 10 payload bits. The 20 bit payload is the code point
        // minus 0x10000, so the offset has to be ADDED back: payloads of 0x10000 and above
        // already have bit 16 set and OR-ing would silently lose the offset.
        uint32_t const payload =
            ((lead & 0b1111111111) << 10) |
            (aString[1] & 0b1111111111);
        uint32_t const code = 0x10000 + payload;
        return { code, 2 };
    }
    return { lead, 1 };
}
//-------------------------------------------------------------------------------------------------
// UTF-32 needs no decoding: the first element is the code point and exactly one
// element is consumed.
MSF_CodeRead MSF_ReadCodePoint(char32_t const* aString)
{
    return MSF_CodeRead{ aString[0], 1 };
}
//-------------------------------------------------------------------------------------------------
// wchar_t flavour: reinterprets the string as MSF_WChar and forwards to the overload
// for that type.
// NOTE(review): MSF_WChar is declared outside this file; presumably it is the fixed width
// character type matching the platform's wchar_t - confirm.
MSF_CodeRead MSF_ReadCodePoint(wchar_t const* aString)
{
    MSF_WChar const* const units = (MSF_WChar const*)aString;
    return MSF_ReadCodePoint(units);
}
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Encodes aCodePoint as UTF-8 into aStringOut and returns the number of bytes written (1-4).
// No terminator is written; aStringOut must have room for the bytes produced.
inline uint32_t MSF_WriteCodePointInternal(uint32_t aCodePoint, char* aStringOut)
{
    // builds a continuation byte (10xxxxxx) from the six bits that start at aShift
    auto const continuation = [aCodePoint](uint32_t aShift) -> char
    {
        return char(0x80 | ((aCodePoint >> aShift) & 0x3F));
    };

    // up to 7 bits: ascii, stored as is
    if (aCodePoint < 0x80)
    {
        aStringOut[0] = char(aCodePoint);
        return 1;
    }

    // up to 11 bits: 110xxxxx 10xxxxxx
    if (aCodePoint < 0x800)
    {
        aStringOut[0] = char(0xC0 | (aCodePoint >> 6));
        aStringOut[1] = continuation(0);
        return 2;
    }

    // up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
    if (aCodePoint < 0x10000)
    {
        aStringOut[0] = char(0xE0 | (aCodePoint >> 12));
        aStringOut[1] = continuation(6);
        aStringOut[2] = continuation(0);
        return 3;
    }

    // everything else: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (bits above 21 are dropped)
    aStringOut[0] = char(0xF0 | ((aCodePoint >> 18) & 0x07));
    aStringOut[1] = continuation(12);
    aStringOut[2] = continuation(6);
    aStringOut[3] = continuation(0);
    return 4;
}
//-------------------------------------------------------------------------------------------------
// Encodes aCodePoint as UTF-16 into aStringOut and returns the number of 16 bit units
// written: 1 for values up to 0xffff, otherwise 2 (a surrogate pair).
// No terminator is written; aStringOut must have room for the units produced.
inline uint32_t MSF_WriteCodePointInternal(uint32_t aCodePoint, char16_t* aStringOut)
{
    if (aCodePoint > 0xffff)
    {
        // the pair stores the code point minus 0x10000, ten bits per unit
        uint32_t const payload = aCodePoint - 0x10000;
        aStringOut[0] = char16_t(0xD800 | (payload >> 10));
        aStringOut[1] = char16_t(0xDC00 | (payload & 0x3FF));
        return 2;
    }

    // a single unit must not land in the range reserved for surrogates
    MSF_ASSERT(aCodePoint < 0xd800 || aCodePoint >= 0xE000);
    aStringOut[0] = (char16_t)aCodePoint;
    return 1;
}
//-------------------------------------------------------------------------------------------------
// UTF-32 needs no encoding: stores the code point as a single element and returns 1.
inline uint32_t MSF_WriteCodePointInternal(uint32_t aCodePoint, char32_t* aStringOut)
{
    aStringOut[0] = aCodePoint;
    return 1;
}
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Public UTF-8 writer for char buffers. Forwards to the internal char encoder and returns
// the number of elements it wrote.
uint32_t MSF_WriteCodePoint(uint32_t aCodePoint, char aStringOut[4 / sizeof(char)])
{
    uint32_t const elementsWritten = MSF_WriteCodePointInternal(aCodePoint, aStringOut);
    return elementsWritten;
}
//-------------------------------------------------------------------------------------------------
// Public UTF-8 writer for char8_t buffers. The buffer is reinterpreted as plain char so the
// internal char encoder can be reused; returns the number of elements it wrote.
uint32_t MSF_WriteCodePoint(uint32_t aCodePoint, char8_t aStringOut[4 / sizeof(char8_t)])
{
    char* const bytesOut = (char*)aStringOut;
    return MSF_WriteCodePointInternal(aCodePoint, bytesOut);
}
//-------------------------------------------------------------------------------------------------
// Public UTF-16 writer. Forwards to the internal char16_t encoder and returns the number
// of elements it wrote.
uint32_t MSF_WriteCodePoint(uint32_t aCodePoint, char16_t aStringOut[4 / sizeof(char16_t)])
{
    uint32_t const elementsWritten = MSF_WriteCodePointInternal(aCodePoint, aStringOut);
    return elementsWritten;
}
//-------------------------------------------------------------------------------------------------
// Public UTF-32 writer. Forwards to the internal char32_t encoder and returns the number
// of elements it wrote.
uint32_t MSF_WriteCodePoint(uint32_t aCodePoint, char32_t aStringOut[4 / sizeof(char32_t)])
{
    uint32_t const elementsWritten = MSF_WriteCodePointInternal(aCodePoint, aStringOut);
    return elementsWritten;
}
//-------------------------------------------------------------------------------------------------
uint32_t MSF_WriteCodePoint(uint32_t aCodePoint, wchar_t aStringOut[4 / sizeof(wchar_t)])
{
return MSF_WriteCodePointInternal(aCodePoint, (MSF_WChar*)aStringOut);
}
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Shared implementation behind every MSF_UTFCopy overload: re-encodes the null terminated
// string aStringIn into the encoding implied by CharTo.
//
// aStringOut      - destination buffer; may be null, in which case nothing is stored but the
//                   counts are still produced (aBufferLength still bounds them).
// aBufferLength   - capacity of the destination, in CharTo elements.
// aStringIn       - null terminated source string.
// aCharacterLimit - maximum number of code points to convert.
//
// Returns the number of CharTo elements produced (Elements) and the number of code points
// converted (Characters). Conversion stops at the source terminator, at the character
// limit, or before the first code point whose encoding no longer fits. A terminator is
// written only when at least one element of space is left after the converted text, and it
// is not included in the returned counts.
template <typename CharTo, typename CharFrom>
MSF_CharactersWritten MSF_UTFCopyShared(CharTo* aStringOut, size_t aBufferLength, CharFrom const* aStringIn, size_t aCharacterLimit)
{
    MSF_CharactersWritten written = { 0, 0 };
    while (*aStringIn && written.Characters < aCharacterLimit)
    {
        MSF_CodeRead const read = MSF_ReadCodePoint(aStringIn);
        aStringIn += read.CharsRead;

        // Encode into a local scratch array first so the size is known before touching the
        // destination. Zero initialised so no indeterminate element is ever read.
        CharTo encoded[4 / sizeof(CharTo)] = {};
        uint32_t const write = MSF_WriteCodePoint(read.CodePoint, encoded);
        if (write > aBufferLength)
            break;

        if (aStringOut)
        {
            // Copy element by element. A 4 byte block store through a casted pointer would
            // be misaligned for char/char16_t destinations, would violate strict aliasing,
            // and would spill bytes past the encoded length.
            for (uint32_t i = 0; i < write; ++i)
            {
                aStringOut[i] = encoded[i];
            }
            aStringOut += write;
        }

        aBufferLength -= write;
        written.Elements += write;
        ++written.Characters;
    }

    // terminate only if there is still room for it
    if (aStringOut && aBufferLength)
        *aStringOut = 0;
    return written;
}
//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Public MSF_UTFCopy overloads: one for every (destination, source) pairing of char, char8_t,
// char16_t, char32_t and wchar_t. Each overload forwards its arguments unchanged to
// MSF_UTFCopyShared, which deduces the character types from the pointers.
// The macro only exists to stamp out the 25 identical bodies and is removed again below.
#define MSF_UTFCOPY_OVERLOAD(CharTo, CharFrom) \
    MSF_CharactersWritten MSF_UTFCopy(CharTo* aStringOut, size_t aBufferLength, CharFrom const* aStringIn, size_t aCharacterLimit) \
    { \
        return MSF_UTFCopyShared(aStringOut, aBufferLength, aStringIn, aCharacterLimit); \
    }

// destination: char
MSF_UTFCOPY_OVERLOAD(char, char)
MSF_UTFCOPY_OVERLOAD(char, char8_t)
MSF_UTFCOPY_OVERLOAD(char, char16_t)
MSF_UTFCOPY_OVERLOAD(char, char32_t)
MSF_UTFCOPY_OVERLOAD(char, wchar_t)

// destination: char8_t
MSF_UTFCOPY_OVERLOAD(char8_t, char)
MSF_UTFCOPY_OVERLOAD(char8_t, char8_t)
MSF_UTFCOPY_OVERLOAD(char8_t, char16_t)
MSF_UTFCOPY_OVERLOAD(char8_t, char32_t)
MSF_UTFCOPY_OVERLOAD(char8_t, wchar_t)

// destination: char16_t
MSF_UTFCOPY_OVERLOAD(char16_t, char)
MSF_UTFCOPY_OVERLOAD(char16_t, char8_t)
MSF_UTFCOPY_OVERLOAD(char16_t, char16_t)
MSF_UTFCOPY_OVERLOAD(char16_t, char32_t)
MSF_UTFCOPY_OVERLOAD(char16_t, wchar_t)

// destination: char32_t
MSF_UTFCOPY_OVERLOAD(char32_t, char)
MSF_UTFCOPY_OVERLOAD(char32_t, char8_t)
MSF_UTFCOPY_OVERLOAD(char32_t, char16_t)
MSF_UTFCOPY_OVERLOAD(char32_t, char32_t)
MSF_UTFCOPY_OVERLOAD(char32_t, wchar_t)

// destination: wchar_t
MSF_UTFCOPY_OVERLOAD(wchar_t, char)
MSF_UTFCOPY_OVERLOAD(wchar_t, char8_t)
MSF_UTFCOPY_OVERLOAD(wchar_t, char16_t)
MSF_UTFCOPY_OVERLOAD(wchar_t, char32_t)
MSF_UTFCOPY_OVERLOAD(wchar_t, wchar_t)

#undef MSF_UTFCOPY_OVERLOAD