diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c index 77a4e1407bcd9a..a9ed639ea8bf20 100644 --- a/src/native/minipal/utf8.c +++ b/src/native/minipal/utf8.c @@ -1,2150 +1,669 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#include - #include #include #include -#include - -#define HIGH_SURROGATE_START 0xd800 -#define HIGH_SURROGATE_END 0xdbff -#define LOW_SURROGATE_START 0xdc00 -#define LOW_SURROGATE_END 0xdfff - -// Test if the wide character is a high surrogate -static bool IsHighSurrogate(const CHAR16_T c) -{ - return (c & 0xFC00) == HIGH_SURROGATE_START; -} +#include -// Test if the wide character is a low surrogate -static bool IsLowSurrogate(const CHAR16_T c) -{ - return (c & 0xFC00) == LOW_SURROGATE_START; -} +#include -// Test if the wide character is a surrogate half -static bool IsSurrogate(const CHAR16_T c) -{ - return (c & 0xF800) == HIGH_SURROGATE_START; -} +#ifdef _MSC_VER +#define BYTESWAP16 _byteswap_ushort +#else +#define BYTESWAP16 __builtin_bswap16 +#endif // MSC_VER -typedef struct -{ - // Store our default string - unsigned char* byteStart; - CHAR16_T* charEnd; - const CHAR16_T strDefault[2]; - int strDefaultLength; - int fallbackCount; - int fallbackIndex; -} DecoderBuffer; - -static CHAR16_T DecoderReplacementFallbackBuffer_GetNextChar(DecoderBuffer* self) +size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, uint32_t flags) { - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. We could have a flag but we already have this counter. - self->fallbackCount--; - self->fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (self->fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. - // Make sure it didn't wrap from the fast count-- path - if (self->fallbackCount == INT_MAX) - { - self->fallbackCount = -1; - return '\0'; - } + errno = 0; - // Now make sure its in the expected range - assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0); + if (sourceLength == 0) + return 0; - return self->strDefault[self->fallbackIndex]; -} + size_t sourceIndex = 0; + size_t destinationLength = 0; + bool useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS); -// Fallback Methods -static bool DecoderReplacementFallbackBuffer_Fallback(DecoderBuffer* self) -{ - // We expect no previous fallback in our buffer - // We can't call recursively but others might (note, we don't test on last char!!!) - assert(self->fallbackCount < 1); + while (sourceIndex < sourceLength) + { + unsigned char currentChar = source[sourceIndex]; + size_t sequenceLength = 0; + uint32_t codePoint = 0; - // Go ahead and get our fallback - if (self->strDefaultLength == 0) - return false; + if (currentChar <= 0x7F) + { + destinationLength++; + sourceIndex++; + continue; + } - self->fallbackCount = self->strDefaultLength; - self->fallbackIndex = -1; + if ((currentChar & 0xC0) == 0x80 || (currentChar & 0xFE) == 0xFE || (currentChar & 0xFF) == 0xFF) + { + if (useFallback) + { + destinationLength++; + sourceIndex++; + } + else + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + continue; + } - return true; -} + if ((currentChar & 0xE0) == 0xC0) + { + sequenceLength = 2; + } + else if ((currentChar & 0xF0) == 0xE0) + { + sequenceLength = 3; + } + else if ((currentChar & 0xF8) == 0xF0) + { + sequenceLength = 4; + } + else + { + if (useFallback) + { + destinationLength++; + sourceIndex++; + } + else + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + continue; + } -// Fallback the current byte by sticking it into the remaining char buffer. -// This can only be called by our encodings (other have to use the public fallback methods), so -// we can use our DecoderNLS here too (except we don't). -// Returns true if we are successful, false if we can't fallback the character (no buffer space) -// So caller needs to throw buffer space if return false. -// Right now this has both bytes and bytes[], since we might have extra bytes, hence the -// array, and we might need the index, hence the byte* -// Don't touch ref chars unless we succeed -static bool DecoderReplacementFallbackBuffer_InternalFallback_Copy(DecoderBuffer* self, CHAR16_T** chars, CHAR16_T* pAllocatedBufferEnd) -{ - assert(self->byteStart != NULL); + if (sourceIndex + sequenceLength > sourceLength) + { + if (useFallback) + { + destinationLength += sequenceLength >= 3 ? 2 : 1; + sourceIndex = sourceLength; + } + else + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + continue; + } - bool fallbackResult = DecoderReplacementFallbackBuffer_Fallback(self); + bool validSequence = true; + for (size_t i = 1; i < sequenceLength; ++i) + { + if ((source[sourceIndex + i] & 0xC0) != 0x80) + { + validSequence = false; + break; + } + } - // See if there's a fallback character and we have an output buffer then copy our string. - if (fallbackResult) - { - // Copy the chars to our output - CHAR16_T ch; - CHAR16_T* charTemp = *chars; - bool bHighSurrogate = false; - (void)bHighSurrogate; // unused in release build - while ((ch = DecoderReplacementFallbackBuffer_GetNextChar(self)) != 0) + if (!validSequence) { - // Make sure no mixed up surrogates - if (IsSurrogate(ch)) + if (useFallback) { - if (IsHighSurrogate(ch)) - { - // High Surrogate - assert(!bHighSurrogate); - bHighSurrogate = true; - } - else + destinationLength++; + + do { - // Low surrogate - assert(bHighSurrogate); - bHighSurrogate = false; + sourceIndex++; } + while (sourceIndex < sourceLength && (source[sourceIndex] & 0xC0) == 0x80); + continue; } - - if (charTemp >= self->charEnd) + else { - // No buffer space - return false; + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } + } - *(charTemp++) = ch; - if (charTemp > pAllocatedBufferEnd) + if ((currentChar == 0xC0 && sourceIndex + 1 < sourceLength && (unsigned char)source[sourceIndex + 1] == 0x80) || + (currentChar == 0xE0 && sourceIndex + 2 < sourceLength && + (unsigned char)source[sourceIndex + 1] == 0x80 && (unsigned char)source[sourceIndex + 2] == 0x80) || + (currentChar == 0xF0 && sourceIndex + 3 < sourceLength && + (unsigned char)source[sourceIndex + 1] == 0x80 && (unsigned char)source[sourceIndex + 2] == 0x80 && + (unsigned char)source[sourceIndex + 3] == 0x80)) + { + if (useFallback) { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return false; + if (sequenceLength < 3) + destinationLength++; + sourceIndex++; // Move to the end of the invalid sequence + continue; + } + else + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } } - // Need to make sure that bHighSurrogate isn't true - assert(!bHighSurrogate); - - // Now we aren't going to be false, so its OK to update chars - *chars = charTemp; - } - - return true; -} - -// Clear the buffer -static void DecoderReplacementFallbackBuffer_Reset(DecoderBuffer* self) -{ - self->fallbackCount = -1; - self->fallbackIndex = -1; - self->byteStart = NULL; -} + if (sequenceLength == 2) + { + codePoint = ((currentChar & 0x1F) << 6) | (source[sourceIndex + 1] & 0x3F); + } + else if (sequenceLength == 3) + { + codePoint = ((currentChar & 0x0F) << 12) | ((source[sourceIndex + 1] & 0x3F) << 6) | (source[sourceIndex + 2] & 0x3F); + } + else if (sequenceLength == 4) + { + codePoint = ((currentChar & 0x07) << 18) | ((source[sourceIndex + 1] & 0x3F) << 12) | ((source[sourceIndex + 2] & 0x3F) << 6) | (source[sourceIndex + 3] & 0x3F); + } -typedef struct -{ - const CHAR16_T strDefault[3]; - int strDefaultLength; - CHAR16_T* charStart; - CHAR16_T* charEnd; - bool setEncoder; - bool bUsedEncoder; - bool bFallingBack; - int iRecursionCount; - int fallbackCount; - int fallbackIndex; -} EncoderBuffer; - -#define MAX_RECURSION 250 - -// Set the above values -// This can't be part of the constructor because EncoderFallbacks would have to know how to implement these. -static void EncoderReplacementFallbackBuffer_InternalInitialize(EncoderBuffer* self, CHAR16_T* charStart, CHAR16_T* charEnd, bool setEncoder) -{ - self->charStart = charStart; - self->charEnd = charEnd; - self->setEncoder = setEncoder; - self->bUsedEncoder = false; - self->bFallingBack = false; - self->iRecursionCount = 0; -} + if (sequenceLength < 4 || codePoint <= 0xFFFF) + { + destinationLength++; + } + else + { + destinationLength += 2; + } -static CHAR16_T EncoderReplacementFallbackBuffer_InternalGetNextChar(EncoderBuffer* self) -{ - // We want it to get < 0 because == 0 means that the current/last character is a fallback - // and we need to detect recursion. We could have a flag but we already have this counter. - self->fallbackCount--; - self->fallbackIndex++; - - // Do we have anything left? 0 is now last fallback char, negative is nothing left - if (self->fallbackCount < 0) - return '\0'; - - // Need to get it out of the buffer. - // Make sure it didn't wrap from the fast count-- path - if (self->fallbackCount == INT_MAX) - { - self->fallbackCount = -1; - return '\0'; + sourceIndex += sequenceLength; } - // Now make sure its in the expected range - assert(self->fallbackIndex < self->strDefaultLength && self->fallbackIndex >= 0); - - CHAR16_T ch = self->strDefault[self->fallbackIndex]; - self->bFallingBack = (ch != 0); - if (ch == 0) self->iRecursionCount = 0; - return ch; -} - -// Fallback Methods -static bool EncoderReplacementFallbackBuffer_Fallback(EncoderBuffer* self) -{ - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - assert(self->fallbackCount < 1); - - // Go ahead and get our fallback - // Divide by 2 because we aren't a surrogate pair - self->fallbackCount = self->strDefaultLength / 2; - self->fallbackIndex = -1; - - return self->fallbackCount != 0; + return destinationLength; } -static bool EncoderReplacementFallbackBuffer_Fallback_Unknown(EncoderBuffer* self) +size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, uint32_t flags) { - // If we had a buffer already we're being recursive, throw, it's probably at the suspect - // character in our array. - assert(self->fallbackCount < 1); - - // Go ahead and get our fallback - self->fallbackCount = self->strDefaultLength; - self->fallbackIndex = -1; + errno = 0; - return self->fallbackCount != 0; -} + if (sourceLength == 0) + return 0; -// Fallback the current character using the remaining buffer and encoder if necessary -// This can only be called by our encodings (other have to use the public fallback methods), so -// we can use our EncoderNLS here too. -// setEncoder is true if we're calling from a GetBytes method, false if we're calling from a GetByteCount -// -// Note that this could also change the contents of self->buffer.encoder, which is the same -// object that the caller is using, so the caller could mess up the encoder for us -// if they aren't careful. -static bool EncoderReplacementFallbackBuffer_InternalFallback(EncoderBuffer* self, CHAR16_T ch, CHAR16_T** chars) -{ - // Shouldn't have null charStart - assert(self->charStart != NULL); + size_t sourceIndex = 0; + size_t destinationLength = 0; + bool useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS); - // See if it was a high surrogate - if (IsHighSurrogate(ch)) + while (sourceIndex < sourceLength) { - // See if there's a low surrogate to go with it - if (*chars >= self->charEnd) + CHAR16_T currentChar = source[sourceIndex]; + size_t sequenceLength = 0; + + if (currentChar <= 0x7F) { - // Nothing left in input buffer - // No input, return 0 + sequenceLength = 1; } - else + else if (currentChar <= 0x7FF) + { + sequenceLength = 2; + } + else if (currentChar >= 0xD800 && currentChar <= 0xDBFF) { - // Might have a low surrogate - CHAR16_T cNext = **chars; - if (IsLowSurrogate(cNext)) + if (sourceIndex + 1 >= sourceLength || (source[sourceIndex + 1] < 0xDC00 || source[sourceIndex + 1] > 0xDFFF)) { - // If already falling back then fail - assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION); - - // Next is a surrogate, add it as surrogate pair, and increment chars - (*chars)++; - self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback_Unknown(self); - return self->bFallingBack; + if (useFallback) + { + sequenceLength = 3; // Replacement character + } + else + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + } + else + { + sequenceLength = 4; + sourceIndex++; // Skip the second part of the surrogate pair } - - // Next isn't a low surrogate, just fallback the high surrogate } - } - - // If already falling back then fail - assert(!self->bFallingBack || self->iRecursionCount++ <= MAX_RECURSION); - - // Fall back our char - self->bFallingBack = EncoderReplacementFallbackBuffer_Fallback(self); - - return self->bFallingBack; -} + else if (currentChar >= 0xDC00 && currentChar <= 0xDFFF) + { + if (useFallback) + { + sequenceLength = 3; // Replacement character + } + else + { + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + } + else + { + sequenceLength = 3; + } -static bool EncoderReplacementFallbackBuffer_MovePrevious(EncoderBuffer* self) -{ - // Back up one, only if we just processed the last character (or earlier) - if (self->fallbackCount >= -1 && self->fallbackIndex >= 0) - { - self->fallbackIndex--; - self->fallbackCount++; - return true; + destinationLength += sequenceLength; + sourceIndex++; } - // Return false 'cause we couldn't do it. - return false; -} - -typedef struct -{ - union - { - DecoderBuffer decoder; - EncoderBuffer encoder; - } buffer; - - bool useFallback; - -#if BIGENDIAN - bool treatAsLE; -#endif -} UTF8Encoding; - -// These are bitmasks used to maintain the state in the decoder. They occupy the higher bits -// while the actual character is being built in the lower bits. They are shifted together -// with the actual bits of the character. - -// bits 30 & 31 are used for pending bits fixup -#define FinalByte (1 << 29) -#define SupplimentarySeq (1 << 28) -#define ThreeByteSeq (1 << 27) - -static bool InRange(int c, int begin, int end) -{ - return begin <= c && c <= end; + return destinationLength; } -// During GetChars we had an invalid byte sequence -// pSrc is backed up to the start of the bad sequence if we didn't have room to -// fall it back. Otherwise pSrc remains where it is. -static bool FallbackInvalidByteSequence_Copy(UTF8Encoding* self, unsigned char** pSrc, CHAR16_T** pTarget, CHAR16_T* pAllocatedBufferEnd) +size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, uint32_t flags) { - assert(self->useFallback); - - // Get our byte[] - unsigned char* pStart = *pSrc; - bool fallbackResult = DecoderReplacementFallbackBuffer_InternalFallback_Copy(&self->buffer.decoder, pTarget, pAllocatedBufferEnd); + errno = 0; - // Do the actual fallback - if (!fallbackResult) + if (sourceLength == 0) { - // Oops, it failed, back up to pStart - *pSrc = pStart; - return false; + return 0; } - // It worked - return true; -} + size_t sourceIndex = 0; + size_t destinationIndex = 0; + bool useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS); +#if BIGENDIAN + bool treatAsLE = !!(flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN); +#endif -static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t count) -{ - assert(bytes != NULL); - assert(count >= 0); - - // Initialize stuff - unsigned char *pSrc = bytes; - unsigned char *pEnd = pSrc + count; - size_t availableBytes; - int chc; - - // Start by assuming we have as many as count, charCount always includes the adjustment - // for the character being decoded - size_t charCount = count; - int ch = 0; - bool fallbackUsed = false; - - while (true) + while (sourceIndex < sourceLength && destinationIndex < destinationLength) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) break; - - // read next byte. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - // no pending bits - if (ch == 0) goto ReadChar; - - pSrc++; + uint32_t currentChar = (unsigned char)source[sourceIndex]; - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) + if (currentChar < 0x80) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - charCount += (ch >> 30); - goto InvalidByteSequence; + // 1-byte sequence + sourceIndex++; } - - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) + else if (currentChar < 0xE0) { - assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0); - - if ((ch & SupplimentarySeq) != 0) + // 2-byte sequence - based on ABNF (Augmented Backus-Naur Form) syntax described at + // https://datatracker.ietf.org/doc/html/rfc3629#section-4 + if (sourceIndex + 1 >= sourceLength || + (currentChar == 0xC0 || currentChar == 0xC1) || + (source[sourceIndex + 1] & 0xC0) != 0x80) { - if ((ch & (FinalByte >> 6)) != 0) + if (useFallback) { - // this is 3rd byte (of 4 byte supplimentary) - nothing to do - continue; + currentChar = 0xFFFD; + sourceIndex++; } - - // 2nd byte, check for non-shortest form of supplimentary char and the valid - // supplimentary characters in range 0x010000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) + else { - goto InvalidByteSequence; + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } } else { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate - { - goto InvalidByteSequence; - } + currentChar = ((currentChar & 0x1F) << 6) | (source[sourceIndex + 1] & 0x3F); + sourceIndex += 2; } - continue; - } - - // ready to punch - - // adjust for surrogates in non-shortest form - if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) charCount--; - - goto EncodeChar; - - InvalidByteSequence: - if (!self->useFallback) - { - errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; - return 0; - } - - if (!fallbackUsed) - { - fallbackUsed = true; - self->buffer.decoder.byteStart = bytes; - self->buffer.decoder.charEnd = NULL; } - charCount += self->buffer.decoder.strDefaultLength; - - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) + else if (currentChar < 0xF0) { - // If its > 0x7F, its start of a new multi-byte sequence - - // Long sequence, so unreserve our char. - charCount--; - - // bit 6 has to be non-zero for start of multibyte chars. - if ((ch & 0x40) == 0) goto InvalidByteSequence; - - // start a new long code - if ((ch & 0x20) != 0) + // 3-byte sequence - based on ABNF (Augmented Backus-Naur Form) syntax described at + // https://datatracker.ietf.org/doc/html/rfc3629#section-4 + if (sourceIndex + 2 >= sourceLength || + (currentChar == 0xE0 && (source[sourceIndex + 1] & 0xE0) != 0xA0) || + (currentChar >= 0xE1 && currentChar <= 0xEC && (source[sourceIndex + 1] & 0xC0) != 0x80) || + (currentChar == 0xED && (source[sourceIndex + 1] & 0xF0) != 0x80) || + (currentChar >= 0xEE && currentChar <= 0xEF && (source[sourceIndex + 1] & 0xC0) != 0x80) || + (source[sourceIndex + 2] & 0xC0) != 0x80) { - if ((ch & 0x10) != 0) + if (useFallback) { - // 4 byte encoding - supplimentary character (2 surrogates) + uint32_t nextChar = 0; + uint8_t missingBytes = 0; - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) + // Check second byte validity + if (sourceIndex + 1 < sourceLength) { - ch |= 0xf0; - goto InvalidByteSequence; + nextChar = (unsigned char)source[sourceIndex + 1]; + } + if ((nextChar & 0xC0) != 0x80) + { + missingBytes = 1; + } + else + { + // Check third byte validity + if (sourceIndex + 2 < sourceLength) + { + nextChar = (unsigned char)source[sourceIndex + 2]; + if ((nextChar & 0xC0) != 0x80 || + (currentChar == 0xE0 && (nextChar < 0xA0 || nextChar > 0xBF)) || + (currentChar == 0xED && (nextChar < 0x80 || nextChar > 0x9F))) + { + missingBytes = 2; + } + else + { + missingBytes = 3; + } + } + else + { + missingBytes = 2; // Only one byte is guaranteed missing + } } - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - // Final byte flag, count fix if we don't make final byte & supplimentary sequence flag. - ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now - (1 << 30) | // If it dies on next byte we'll need an extra char - (3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - - // Our character count will be 2 characters for these 4 bytes, so subtract another char - charCount--; + currentChar = 0xFFFD; // Replacement character for invalid sequences + sourceIndex += missingBytes; } else { - // 3 byte encoding - // Add bit flags so that when we check new characters & rotate we'll be flagged correctly. - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - - // We'll expect 1 character for these 3 bytes, so subtract another char. - charCount--; + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } } else { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) - { - ch |= 0xc0; - goto InvalidByteSequence; - } - - // Add bit flags so we'll be flagged correctly - ch |= (FinalByte >> 6); + currentChar = ((currentChar & 0x0F) << 12) | ((source[sourceIndex + 1] & 0x3F) << 6) | (source[sourceIndex + 2] & 0x3F); + sourceIndex += 3; } - continue; } - - EncodeChar: - - availableBytes = (size_t)(pEnd - pSrc); - - // don't fall into the fast decoding loop if we don't have enough bytes - if (availableBytes <= 13) + else if (currentChar >= 0xF0) { - // try to get over the remainder of the ascii characters fast though - unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) + // 4-byte sequence - based on ABNF (Augmented Backus-Naur Form) syntax described at + // https://datatracker.ietf.org/doc/html/rfc3629#section-4 + if ((sourceIndex + 3 >= sourceLength) || + ((unsigned char)source[sourceIndex] < 0xF0 || (unsigned char)source[sourceIndex] > 0xF4) || + ((unsigned char)source[sourceIndex] == 0xF0 && + ((sourceIndex + 1 >= sourceLength || (unsigned char)source[sourceIndex + 1] < 0x90 || (unsigned char)source[sourceIndex + 1] > 0xBF) || + (sourceIndex + 2 >= sourceLength || (unsigned char)source[sourceIndex + 2] < 0x80 || (unsigned char)source[sourceIndex + 2] > 0xBF) || + (sourceIndex + 3 >= sourceLength || (unsigned char)source[sourceIndex + 3] < 0x80 || (unsigned char)source[sourceIndex + 3] > 0xBF))) || + ((unsigned char)source[sourceIndex] >= 0xF1 && (unsigned char)source[sourceIndex] <= 0xF3 && + ((sourceIndex + 1 >= sourceLength || (unsigned char)source[sourceIndex + 1] < 0x80 || (unsigned char)source[sourceIndex + 1] > 0xBF) || + (sourceIndex + 2 >= sourceLength || (unsigned char)source[sourceIndex + 2] < 0x80 || (unsigned char)source[sourceIndex + 2] > 0xBF) || + (sourceIndex + 3 >= sourceLength || (unsigned char)source[sourceIndex + 3] < 0x80 || (unsigned char)source[sourceIndex + 3] > 0xBF))) || + ((unsigned char)source[sourceIndex] == 0xF4 && + ((sourceIndex + 1 >= sourceLength || (unsigned char)source[sourceIndex + 1] < 0x80 || (unsigned char)source[sourceIndex + 1] > 0x8F) || + (sourceIndex + 2 >= sourceLength || (unsigned char)source[sourceIndex + 2] < 0x80 || (unsigned char)source[sourceIndex + 2] > 0xBF) || + (sourceIndex + 3 >= sourceLength || (unsigned char)source[sourceIndex + 3] < 0x80 || (unsigned char)source[sourceIndex + 3] > 0xBF)))) { - ch = *pSrc; - pSrc++; + if (useFallback) + { + uint8_t missingBytes = 0; - if (ch > 0x7F) - goto ProcessChar; - } - // we are done - ch = 0; - break; - } + // Check second byte validity + if (sourceIndex + 1 < sourceLength && ((unsigned char)source[sourceIndex + 1] & 0xC0) == 0x80) + { + missingBytes++; + } - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - unsigned char *pStop = pSrc + availableBytes - 7; + // Check third byte validity + if (sourceIndex + 2 < sourceLength && ((unsigned char)source[sourceIndex + 2] & 0xC0) == 0x80) + { + missingBytes++; + } - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; + // Check fourth byte validity + if (sourceIndex + 3 < sourceLength) + { + uint8_t fourthByte = source[sourceIndex + 3]; + if ((fourthByte & 0xC0) == 0x80) + { + missingBytes++; + } + + // Increment missingBytes for each invalid byte + missingBytes = 5 - missingBytes; + if (fourthByte > 0) + { + missingBytes += 3; + } + } + else + { + missingBytes += 1; + } - if (ch > 0x7F) - { - goto LongCode; - } + if(missingBytes > 4) + { + destination[destinationIndex++] = 0xFFFD; + destination[destinationIndex++] = 0xFFFD; + destination[destinationIndex++] = 0xFFFD; + + #if BIGENDIAN + if (!treatAsLE) + { + destination[destinationIndex - 3] = BYTESWAP16(destination[destinationIndex - 2]); + destination[destinationIndex - 2] = BYTESWAP16(destination[destinationIndex - 2]); + destination[destinationIndex - 1] = BYTESWAP16(destination[destinationIndex - 1]); + } + #endif + + destinationIndex++; + sourceIndex += missingBytes; + + continue; + } - // get pSrc 2-byte aligned - if (((size_t)pSrc & 0x1) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) - { - goto LongCode; + currentChar = 0xFFFD; // Replacement character for invalid sequences + sourceIndex += missingBytes; } - } - - // get pSrc 4-byte aligned - if (((size_t)pSrc & 0x2) != 0) - { - ch = *(unsigned short*)pSrc; - if ((ch & 0x8080) != 0) + else { - goto LongCodeWithMask16; + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } - pSrc += 2; } - - - // Run 8 + 8 characters at a time! - while (pSrc < pStop) + else { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) + currentChar = ((currentChar & 0x07) << 18) | ((source[sourceIndex + 1] & 0x3F) << 12) | ((source[sourceIndex + 2] & 0x3F) << 6) | (source[sourceIndex + 3] & 0x3F); + sourceIndex += 4; + + if (currentChar >= 0x10000) { - goto LongCodeWithMask32; - } - pSrc += 8; + // Surrogate pair + currentChar -= 0x10000; + if (destinationIndex + 1 >= destinationLength) + { + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; + } - // This is a really small loop - unroll it - if (pSrc >= pStop) - break; + destination[destinationIndex++] = (CHAR16_T)(0xD800 | (currentChar >> 10)); - ch = *(int*)pSrc; - chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) - { - goto LongCodeWithMask32; + #if BIGENDIAN + if (!treatAsLE) + { + destination[destinationIndex - 1] = BYTESWAP16(destination[destinationIndex - 1]); + } + #endif + currentChar = 0xDC00 | (currentChar & 0x3FF); } - pSrc += 8; } - break; - - LongCodeWithMask32 : -#if BIGENDIAN - // be careful about the sign extension - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#endif - ch &= 0xFF; + } - LongCodeWithMask16: -#if BIGENDIAN - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8); - else -#endif - ch &= 0xFF; - - pSrc++; - if (ch <= 0x7F) + if (destinationIndex >= destinationLength) { - continue; + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - LongCode: - chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) - { - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) - { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc += 2; - - // extra byte - charCount--; - } - else - { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - // extra byte - charCount--; - } - } - else - { - // 2 byte encoding - - // check for non-shortest form - if ((ch & 0x1E) == 0) goto BadLongCode; - } - - // extra byte - charCount--; + #if BIGENDIAN + if (!treatAsLE) + { + currentChar = BYTESWAP16(currentChar); } + #endif - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; + destination[destinationIndex++] = (CHAR16_T)currentChar; } - // May have a problem if we have to flush - if (ch != 0) + if (sourceIndex < sourceLength && (destinationIndex > destinationLength || (destinationIndex == destinationLength && sourceIndex + 1 < sourceLength))) { - // We were already adjusting for these, so need to unadjust - charCount += (ch >> 30); - charCount += self->buffer.decoder.strDefaultLength; + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - // Shouldn't have anything in fallback buffer for GetCharCount - // (don't have to check m_throwOnOverflow for count) - assert(!fallbackUsed || !self->useFallback || self->buffer.decoder.fallbackCount < 0); - - return charCount; + return destinationIndex; } -#define ENSURE_BUFFER_INC \ - pTarget++; \ - if (pTarget > pAllocatedBufferEnd) \ - { \ - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; \ - return 0; \ - } - -static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount) +size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, uint32_t flags) { - assert(chars != NULL); - assert(byteCount >= 0); - assert(charCount >= 0); - assert(bytes != NULL); - - unsigned char *pSrc = bytes; - CHAR16_T *pTarget = chars; - - unsigned char *pEnd = pSrc + byteCount; - CHAR16_T *pAllocatedBufferEnd = pTarget + charCount; + errno = 0; - int ch = 0; - int chc; + if (sourceLength == 0) + return 0; - bool fallbackUsed = false; + size_t sourceIndex = 0; + size_t destinationIndex = 0; + bool useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS); +#if BIGENDIAN + bool treatAsLE = !!(flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN); +#endif - while (true) + while (sourceIndex < sourceLength && destinationIndex < destinationLength) { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) break; - - // read next byte. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - int cha = *pSrc; - - if (ch == 0) - { - // no pending bits - goto ReadChar; - } - - pSrc++; - - // we are expecting to see trailing bytes like 10vvvvvv - if ((cha & 0xC0) != 0x80) + uint32_t currentChar = source[sourceIndex]; + #if BIGENDIAN + if (!treatAsLE) { - // This can be a valid starting byte for another UTF8 byte sequence, so let's put - // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence - pSrc--; - goto InvalidByteSequence; + currentChar = BYTESWAP16(currentChar); } + #endif - // fold in the new byte - ch = (ch << 6) | (cha & 0x3F); - - if ((ch & FinalByte) == 0) + if (currentChar >= 0xD800 && currentChar <= 0xDBFF) { - // Not at last byte yet - assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0); - - if ((ch & SupplimentarySeq) != 0) - { - // Its a 4-byte supplimentary sequence - if ((ch & (FinalByte >> 6)) != 0) - { - // this is 3rd byte of 4 byte sequence - nothing to do - continue; - } - - // 2nd byte of 4 bytes - // check for non-shortest form of surrogate and the valid surrogate - // range 0x000000 - 0x10FFFF at the same time - if (!InRange(ch & 0x1F0, 0x10, 0x100)) - { - goto InvalidByteSequence; - } - } - else + if (sourceIndex + 1 >= sourceLength) { - // Must be 2nd byte of a 3-byte sequence - // check for non-shortest form of 3 byte seq - if ((ch & (0x1F << 5)) == 0 || // non-shortest form - (ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate + if (!useFallback) { - goto InvalidByteSequence; + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } - } - continue; - } - - // ready to punch - - // surrogate in shortest form? - // Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte? - if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) - { - // let the range check for the second char throw the exception - if (pTarget < pAllocatedBufferEnd) - { - *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) + - (HIGH_SURROGATE_START - (0x10000 >> 10))); - - ENSURE_BUFFER_INC - - ch = (ch & 0x3FF) + - (int)(LOW_SURROGATE_START); - } - } - - goto EncodeChar; - - InvalidByteSequence: - if (!self->useFallback) - { - errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; - return 0; - } - - // this code fragment should be close to the gotos referencing it - // Have to do fallback for invalid bytes - if (!fallbackUsed) - { - fallbackUsed = true; - self->buffer.decoder.byteStart = bytes; - self->buffer.decoder.charEnd = pAllocatedBufferEnd; - } - - // That'll back us up the appropriate # of bytes if we didn't get anywhere - if (!FallbackInvalidByteSequence_Copy(self, &pSrc, &pTarget, pAllocatedBufferEnd)) - { - if (errno == MINIPAL_ERROR_INSUFFICIENT_BUFFER) return 0; - - // Check if we ran out of buffer space - assert(pSrc >= bytes); - - DecoderReplacementFallbackBuffer_Reset(&self->buffer.decoder); - ch = 0; - break; - } - - assert(pSrc >= bytes); - - ch = 0; - continue; - - ReadChar: - ch = *pSrc; - pSrc++; - - ProcessChar: - if (ch > 0x7F) - { - // If its > 0x7F, its start of a new multi-byte sequence - - // bit 6 has to be non-zero - if ((ch & 0x40) == 0) goto InvalidByteSequence; - - // start a new long code - if ((ch & 0x20) != 0) - { - if ((ch & 0x10) != 0) + else { - // 4 byte encoding - supplimentary character (2 surrogates) - - ch &= 0x0F; - - // check that bit 4 is zero and the valid supplimentary character - // range 0x000000 - 0x10FFFF at the same time - if (ch > 0x04) + if (destinationIndex + 3 >= destinationLength) { - ch |= 0xf0; - goto InvalidByteSequence; + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) | - (SupplimentarySeq) | (SupplimentarySeq >> 6) | - (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6); - } - else - { - // 3 byte encoding - ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) | - (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6)); - } - } - else - { - // 2 byte encoding + // Replacement characters + destination[destinationIndex++] = 0xEF; + destination[destinationIndex++] = 0xBF; + destination[destinationIndex++] = 0xBD; - ch &= 0x1F; + sourceIndex++; - // check for non-shortest form - if (ch <= 1) - { - ch |= 0xc0; - goto InvalidByteSequence; + continue; } + } - ch |= (FinalByte >> 6); + uint32_t nextChar = source[sourceIndex + 1]; + #if BIGENDIAN + if (!treatAsLE) + { + nextChar = BYTESWAP16(nextChar); } - continue; - } + #endif - EncodeChar: - // write the pending character - if (pTarget >= pAllocatedBufferEnd) - { - // Fix chars so we make sure to throw if we didn't output anything - ch &= 0x1fffff; - if (ch > 0x7f) + if (nextChar < 0xDC00 || nextChar > 0xDFFF) { - if (ch > 0x7ff) + if (!useFallback) { - if (ch >= LOW_SURROGATE_START && - ch <= LOW_SURROGATE_END) - { - pSrc--; // It was 4 bytes - pTarget--; // 1 was stored already, but we can't remember 1/2, so back up - } - else if (ch > 0xffff) + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; + } + else + { + if (destinationIndex + 3 >= destinationLength) { - pSrc--; // It was 4 bytes, nothing was stored + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - pSrc--; // It was at least 3 bytes - } - pSrc--; // It was at least 2 bytes - } - pSrc--; - - assert(pSrc >= bytes); - // Don't store ch in decoder, we already backed up to its start - ch = 0; + // Replacement characters + destination[destinationIndex++] = 0xEF; + destination[destinationIndex++] = 0xBF; + destination[destinationIndex++] = 0xBD; - // Didn't throw, just use this buffer size. - break; - } - *pTarget = (CHAR16_T)ch; - ENSURE_BUFFER_INC - - size_t availableChars = (size_t)(pAllocatedBufferEnd - pTarget); - size_t availableBytes = (size_t)(pEnd - pSrc); + sourceIndex++; - // don't fall into the fast decoding loop if we don't have enough bytes - // Test for availableChars is done because pStop would be <= pTarget. - if (availableBytes <= 13) - { - // we may need as many as 1 character per byte - if (availableChars < availableBytes) - { - // not enough output room. no pending bits at this point - ch = 0; - continue; + continue; + } } - // try to get over the remainder of the ascii characters fast though - unsigned char* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) + // Combine the surrogate pair into one code point + if (nextChar >= 0xDC00 && nextChar <= 0xDFFF) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) goto ProcessChar; - - *pTarget = (CHAR16_T)ch; - ENSURE_BUFFER_INC + currentChar = (((currentChar - 0xD800) << 10) | (nextChar - 0xDC00)) + 0x10000; + sourceIndex++; } - // we are done - ch = 0; - break; } - - // we may need as many as 1 character per byte, so reduce the byte count if necessary. - // If availableChars is too small, pStop will be before pTarget and we won't do fast loop. - if (availableChars < availableBytes) availableBytes = availableChars; - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences - CHAR16_T *pStop = pTarget + availableBytes - 7; - - while (pTarget < pStop) + else if (currentChar >= 0xDC00 && currentChar <= 0xDFFF) { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) goto LongCode; - - *pTarget = (CHAR16_T)ch; - ENSURE_BUFFER_INC - - // get pSrc to be 2-byte aligned - if ((((size_t)pSrc) & 0x1) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) goto LongCode; - - *pTarget = (CHAR16_T)ch; - ENSURE_BUFFER_INC - } - - // get pSrc to be 4-byte aligned - if ((((size_t)pSrc) & 0x2) != 0) + if (!useFallback) { - ch = *(unsigned short*)pSrc; - if ((ch & 0x8080) != 0) goto LongCodeWithMask16; - - - if (pTarget + 2 > pAllocatedBufferEnd) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return 0; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - if (!self->treatAsLE) - { - *pTarget = (CHAR16_T)((ch >> 8) & 0x7F); - pSrc += 2; - *(pTarget + 1) = (CHAR16_T)(ch & 0x7F); - pTarget += 2; - } - else -#endif - { - *pTarget = (CHAR16_T)(ch & 0x7F); - pSrc += 2; - *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F); - pTarget += 2; - } + errno = MINIPAL_ERROR_NO_UNICODE_TRANSLATION; + return 0; } - - // Run 8 characters at a time! - while (pTarget < pStop) + else { - ch = *(int*)pSrc; - int chb = *(int*)(pSrc + 4); - if (((ch | chb) & (int)0x80808080) != 0) goto LongCodeWithMask32; - - if (pTarget + 8 > pAllocatedBufferEnd) + if (destinationIndex + 3 >= destinationLength) { errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; return 0; } - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - if (!self->treatAsLE) - { - *pTarget = (CHAR16_T)((ch >> 24) & 0x7F); - *(pTarget + 1) = (CHAR16_T)((ch >> 16) & 0x7F); - *(pTarget + 2) = (CHAR16_T)((ch >> 8) & 0x7F); - *(pTarget + 3) = (CHAR16_T)(ch & 0x7F); - pSrc += 8; - *(pTarget + 4) = (CHAR16_T)((chb >> 24) & 0x7F); - *(pTarget + 5) = (CHAR16_T)((chb >> 16) & 0x7F); - *(pTarget + 6) = (CHAR16_T)((chb >> 8) & 0x7F); - *(pTarget + 7) = (CHAR16_T)(chb & 0x7F); - pTarget += 8; - } - else -#endif - { - *pTarget = (CHAR16_T)(ch & 0x7F); - *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F); - *(pTarget + 2) = (CHAR16_T)((ch >> 16) & 0x7F); - *(pTarget + 3) = (CHAR16_T)((ch >> 24) & 0x7F); - pSrc += 8; - *(pTarget + 4) = (CHAR16_T)(chb & 0x7F); - *(pTarget + 5) = (CHAR16_T)((chb >> 8) & 0x7F); - *(pTarget + 6) = (CHAR16_T)((chb >> 16) & 0x7F); - *(pTarget + 7) = (CHAR16_T)((chb >> 24) & 0x7F); - pTarget += 8; - } - } - break; - - LongCodeWithMask32 : -#if BIGENDIAN - // be careful about the sign extension - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#endif - ch &= 0xFF; + // Replacement characters + destination[destinationIndex++] = 0xEF; + destination[destinationIndex++] = 0xBF; + destination[destinationIndex++] = 0xBD; - LongCodeWithMask16: -#if BIGENDIAN - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 8); - else -#endif - ch &= 0xFF; + sourceIndex++; - pSrc++; - if (ch <= 0x7F) - { - *pTarget = (CHAR16_T)ch; - ENSURE_BUFFER_INC continue; } + } - LongCode: - chc = *pSrc; - pSrc++; - - if ( - // bit 6 has to be zero - (ch & 0x40) == 0 || - // we are expecting to see trailing bytes like 10vvvvvv - (chc & 0xC0) != 0x80) + if (currentChar < 0x80) + { + if ((short)(destinationIndex - 1) > (int)destinationLength) { - goto BadLongCode; + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - - chc &= 0x3F; - - // start a new long code - if ((ch & 0x20) != 0) + destination[destinationIndex++] = (char)currentChar; + } + else if (currentChar < 0x800) + { + if (destinationIndex > destinationLength) { - - // fold the first two bytes together - chc |= (ch & 0x0F) << 6; - - if ((ch & 0x10) != 0) - { - // 4 byte encoding - surrogate - ch = *pSrc; - if ( - // check that bit 4 is zero, the non-shortest form of surrogate - // and the valid surrogate range 0x000000 - 0x10FFFF at the same time - !InRange(chc >> 4, 0x01, 0x10) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - - chc = (chc << 6) | (ch & 0x3F); - - ch = *(pSrc + 1); - // we are expecting to see trailing bytes like 10vvvvvv - if ((ch & 0xC0) != 0x80) goto BadLongCode; - - pSrc += 2; - - ch = (chc << 6) | (ch & 0x3F); - - *pTarget = (CHAR16_T)(((ch >> 10) & 0x7FF) + - (HIGH_SURROGATE_START - (0x10000 >> 10))); - ENSURE_BUFFER_INC - - ch = (ch & 0x3FF) + (LOW_SURROGATE_START); - - // extra byte, we're already planning 2 chars for 2 of these bytes, - // but the big loop is testing the target against pStop, so we need - // to subtract 2 more or we risk overrunning the input. Subtract - // one here and one below. - pStop--; - } - else - { - // 3 byte encoding - ch = *pSrc; - if ( - // check for non-shortest form of 3 byte seq - (chc & (0x1F << 5)) == 0 || - // Can't have surrogates here. - (chc & (0xF800 >> 6)) == (0xD800 >> 6) || - // we are expecting to see trailing bytes like 10vvvvvv - (ch & 0xC0) != 0x80) - { - goto BadLongCode; - } - pSrc++; - - ch = (chc << 6) | (ch & 0x3F); - - // extra byte, we're only expecting 1 char for each of these 3 bytes, - // but the loop is testing the target (not source) against pStop, so - // we need to subtract 2 more or we risk overrunning the input. - // Subtract 1 here and one more below - pStop--; - } + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - else + destination[destinationIndex++] = (char)(0xC0 | (currentChar >> 6)); + destination[destinationIndex++] = (char)(0x80 | (currentChar & 0x3F)); + } + else if (currentChar < 0x10000) + { + if (destinationIndex + 1 > destinationLength) { - // 2 byte encoding - - ch &= 0x1F; - - // check for non-shortest form - if (ch <= 1) goto BadLongCode; - - ch = (ch << 6) | chc; + errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; + return 0; } - - *pTarget = (CHAR16_T)ch; - ENSURE_BUFFER_INC - - // extra byte, we're only expecting 1 char for each of these 2 bytes, - // but the loop is testing the target (not source) against pStop. - // subtract an extra count from pStop so that we don't overrun the input. - pStop--; + destination[destinationIndex++] = (char)(0xE0 | (currentChar >> 12)); + destination[destinationIndex++] = (char)(0x80 | ((currentChar >> 6) & 0x3F)); + destination[destinationIndex++] = (char)(0x80 | (currentChar & 0x3F)); } - - assert(pTarget <= pAllocatedBufferEnd); - - // no pending bits at this point - ch = 0; - continue; - - BadLongCode: - pSrc -= 2; - ch = 0; - continue; - } - - if (ch != 0) - { - // This'll back us up the appropriate # of bytes if we didn't get anywhere - if (!self->useFallback) + else { - assert(pSrc >= bytes || pTarget == chars); - - // Ran out of buffer space - // Need to throw an exception? - if (pTarget == chars) + if (destinationIndex + 2 > destinationLength) { errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; return 0; } + destination[destinationIndex++] = (char)(0xF0 | (currentChar >> 18)); + destination[destinationIndex++] = (char)(0x80 | ((currentChar>> 12) & 0x3F)); + destination[destinationIndex++] = (char)(0x80 | ((currentChar >> 6) & 0x3F)); + destination[destinationIndex++] = (char)(0x80 | (currentChar & 0x3F)); } - assert(pSrc >= bytes); - ch = 0; - } - // Shouldn't have anything in fallback buffer for GetChars - // (don't have to check m_throwOnOverflow for chars) - assert(!fallbackUsed || self->buffer.decoder.fallbackCount < 0); + sourceIndex++; + } - if (pSrc < pEnd) + if (sourceIndex < sourceLength && destinationIndex >= destinationLength) { errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; return 0; } - return (size_t)(pTarget - chars); -} - -static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount) -{ - assert(chars != NULL); - assert(byteCount >= 0); - assert(charCount >= 0); - assert(bytes != NULL); - - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - bool fallbackUsed = false; - CHAR16_T *pSrc = chars; - unsigned char *pTarget = bytes; - - CHAR16_T *pEnd = pSrc + charCount; - unsigned char *pAllocatedBufferEnd = pTarget + byteCount; - - int ch = 0; - int chd; - - // assume that JIT will enregister pSrc, pTarget and ch - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - - if (pSrc >= pEnd) - { - if (ch == 0) - { - // Check if there's anything left to get out of the fallback buffer - ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0; - if (ch > 0) goto ProcessChar; - } - else - { - // Case of leftover surrogates in the fallback buffer - if (fallbackUsed && self->buffer.encoder.bFallingBack) - { - assert(ch >= 0xD800 && ch <= 0xDBFF); - - int cha = ch; - - ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder); - - if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END)) - { - ch = ch + (cha << 10) + (0x10000 - LOW_SURROGATE_START - (HIGH_SURROGATE_START << 10)); - goto EncodeChar; - } - else if (ch > 0) - { - goto ProcessChar; - } - - break; - } - } - - // attempt to encode the partial surrogate (will fail or ignore) - if (ch > 0) goto EncodeChar; - - // We're done - break; - } - - if (ch > 0) - { - // We have a high surrogate left over from a previous loop. - assert(ch >= 0xD800 && ch <= 0xDBFF); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END)) - { - ch = cha + (ch << 10) + - (0x10000 - - LOW_SURROGATE_START - - (HIGH_SURROGATE_START << 10)); - - pSrc++; - } - // else ch is still high surrogate and encoding will fail - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackUsed) - { - ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder); - if (ch > 0) goto ProcessChar; - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END)) continue; - - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed, we have to do fallback for them - // Have to make a fallback buffer if we don't have one - if (!fallbackUsed) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallbackUsed = true; - - // Set our internal fallback interesting things. - EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, pEnd, true); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. - EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc); - - // Ignore it if we don't throw - ch = 0; - continue; - } - - // Count bytes needed - int bytesNeeded = 1; - if (ch > 0x7F) - { - if (ch > 0x7FF) - { - if (ch > 0xFFFF) - { - bytesNeeded++; // 4 bytes (surrogate pair) - } - bytesNeeded++; // 3 bytes (800-FFFF) - } - bytesNeeded++; // 2 bytes (80-7FF) - } - - if (pTarget > pAllocatedBufferEnd - bytesNeeded) - { - // Left over surrogate from last time will cause pSrc == chars, so we'll throw - if (fallbackUsed && self->buffer.encoder.bFallingBack) - { - EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Didn't use this fallback char - if (ch > 0xFFFF) - EncoderReplacementFallbackBuffer_MovePrevious(&self->buffer.encoder); // Was surrogate, didn't use 2nd part either - } - else - { - pSrc--; // Didn't use this char - if (ch > 0xFFFF) - pSrc--; // Was surrogate, didn't use 2nd part either - } - - assert(pSrc >= chars || pTarget == bytes); - - if (pTarget == bytes) // Throw if we must - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return 0; - } - ch = 0; // Nothing left over (we backed up to start of pair if supplimentary) - break; - } - - if (ch <= 0x7F) - { - *pTarget = (unsigned char)ch; - } - else - { - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int chb; - if (ch <= 0x7FF) - { - // 2 unsigned char encoding - chb = (unsigned char)(0xC0 | (ch >> 6)); - } - else - { - if (ch <= 0xFFFF) - { - chb = (unsigned char)(0xE0 | (ch >> 12)); - } - else - { - *pTarget = (unsigned char)(0xF0 | (ch >> 18)); - ENSURE_BUFFER_INC - - chb = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (unsigned char)chb; - ENSURE_BUFFER_INC - - chb = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (unsigned char)chb; - ENSURE_BUFFER_INC - - *pTarget = (unsigned char)0x80 | (ch & 0x3F); - } - - ENSURE_BUFFER_INC - - // If still have fallback don't do fast loop - if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0) - goto ProcessChar; - - size_t availableChars = (size_t)(pEnd - pSrc); - size_t availableBytes = (size_t)(pAllocatedBufferEnd - pTarget); - - // don't fall into the fast decoding loop if we don't have enough characters - // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop. - if (availableChars <= 13) - { - // we are hoping for 1 unsigned char per char - if (availableBytes < availableChars) - { - // not enough output room. no pending bits at this point - ch = 0; - continue; - } - - // try to get over the remainder of the ascii characters fast though - CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - - // Not ASCII, need more than 1 unsigned char per char - if (ch > 0x7F) goto ProcessChar; - - *pTarget = (unsigned char)ch; - ENSURE_BUFFER_INC - } - // we are done, let ch be 0 to clear encoder - ch = 0; - break; - } - - // we need at least 1 unsigned char per character, but Convert might allow us to convert - // only part of the input, so try as much as we can. Reduce charCount if necessary - if (availableBytes < availableChars) - { - availableChars = availableBytes; - } - - // FASTLOOP: - // - optimistic range checks - // - fallbacks to the slow loop for all special cases, exception throwing, etc. - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop. - CHAR16_T *pStop = pSrc + availableChars - 5; - - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) goto LongCode; - - *pTarget = (unsigned char)ch; - ENSURE_BUFFER_INC - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) goto LongCode; - - *pTarget = (unsigned char)ch; - ENSURE_BUFFER_INC - } - - // Run 4 characters at a time! - while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - - if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask; - - if (pTarget + 4 > pAllocatedBufferEnd) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return 0; - } - - // Unfortunately, this is endianness sensitive -#if BIGENDIAN - if (!self->treatAsLE) - { - *pTarget = (unsigned char)(ch >> 16); - *(pTarget + 1) = (unsigned char)ch; - pSrc += 4; - *(pTarget + 2) = (unsigned char)(chc >> 16); - *(pTarget + 3) = (unsigned char)chc; - pTarget += 4; - } - else -#endif - { - *pTarget = (unsigned char)ch; - *(pTarget + 1) = (unsigned char)(ch >> 16); - pSrc += 4; - *(pTarget + 2) = (unsigned char)chc; - *(pTarget + 3) = (unsigned char)(chc >> 16); - pTarget += 4; - } - } - continue; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#endif - ch = (CHAR16_T)ch; - pSrc++; - - if (ch > 0x7F) goto LongCode; - - *pTarget = (unsigned char)ch; - ENSURE_BUFFER_INC - continue; - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch <= 0x7FF) - { - // 2 unsigned char encoding - chd = 0xC0 | (ch >> 6); - } - else - { - if (!InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) - { - // 3 unsigned char encoding - chd = 0xE0 | (ch >> 12); - } - else - { - // 4 unsigned char encoding - high surrogate + low surrogate - if (ch > HIGH_SURROGATE_END) - { - // low without high -> bad, try again in slow loop - pSrc -= 1; - break; - } - - chd = *pSrc; - pSrc++; - - if (!InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END)) - { - // high not followed by low -> bad, try again in slow loop - pSrc -= 2; - break; - } - - ch = chd + (ch << 10) + - (0x10000 - - LOW_SURROGATE_START - - (HIGH_SURROGATE_START << 10)); - - *pTarget = (unsigned char)(0xF0 | (ch >> 18)); - // pStop - this unsigned char is compensated by the second surrogate character - // 2 input chars require 4 output bytes. 2 have been anticipated already - // and 2 more will be accounted for by the 2 pStop-- calls below. - ENSURE_BUFFER_INC - - chd = 0x80 | ((ch >> 12) & 0x3F); - } - *pTarget = (unsigned char)chd; - pStop--; // 3 unsigned char sequence for 1 char, so need pStop-- and the one below too. - ENSURE_BUFFER_INC - - chd = 0x80 | ((ch >> 6) & 0x3F); - } - *pTarget = (unsigned char)chd; - pStop--; // 2 unsigned char sequence for 1 char so need pStop--. - ENSURE_BUFFER_INC - - *pTarget = (unsigned char)(0x80 | (ch & 0x3F)); - // pStop - this unsigned char is already included - ENSURE_BUFFER_INC - } - - assert(pTarget <= pAllocatedBufferEnd); - - // no pending char at this point - ch = 0; - } - - if (pSrc < pEnd) - { - errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; - return 0; - } - - return (size_t)(pTarget - bytes); -} - -static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) -{ - // For fallback we may need a fallback buffer. - // We wait to initialize it though in case we don't have any broken input unicode - bool fallbackUsed = false; - CHAR16_T *pSrc = chars; - CHAR16_T *pEnd = pSrc + count; - - // Start by assuming we have as many as count - size_t byteCount = count; - - int ch = 0; - - while (true) - { - // SLOWLOOP: does all range checks, handles all special cases, but it is slow - if (pSrc >= pEnd) - { - - if (ch == 0) - { - // Unroll any fallback that happens at the end - ch = fallbackUsed ? EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder) : 0; - if (ch > 0) - { - byteCount++; - goto ProcessChar; - } - } - else - { - // Case of surrogates in the fallback. - if (fallbackUsed && self->buffer.encoder.bFallingBack) - { - assert(ch >= 0xD800 && ch <= 0xDBFF); - - ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder); - byteCount++; - - if (InRange(ch, LOW_SURROGATE_START, LOW_SURROGATE_END)) - { - ch = 0xfffd; - byteCount++; - goto EncodeChar; - } - else if (ch > 0) - { - goto ProcessChar; - } - else - { - byteCount--; // ignore last one. - break; - } - } - } - - if (ch <= 0) - { - break; - } - - // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1. - byteCount++; - goto EncodeChar; - } - - if (ch > 0) - { - assert(ch >= 0xD800 && ch <= 0xDBFF); - - // use separate helper variables for local contexts so that the jit optimizations - // won't get confused about the variable lifetimes - int cha = *pSrc; - - // count the pending surrogate - byteCount++; - - // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here. - if (InRange(cha, LOW_SURROGATE_START, LOW_SURROGATE_END)) - { - // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do. - ch = 0xfffd; - // ch = cha + (ch << 10) + - // (0x10000 - // - LOW_SURROGATE_START - // - (HIGH_SURROGATE_START << 10) ); - - // Use this next char - pSrc++; - } - // else ch is still high surrogate and encoding will fail (so don't add count) - - // attempt to encode the surrogate or partial surrogate - goto EncodeChar; - } - - // If we've used a fallback, then we have to check for it - if (fallbackUsed) - { - ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder); - if (ch > 0) - { - // We have an extra byte we weren't expecting. - byteCount++; - goto ProcessChar; - } - } - - // read next char. The JIT optimization seems to be getting confused when - // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead - ch = *pSrc; - pSrc++; - - ProcessChar: - if (InRange(ch, HIGH_SURROGATE_START, HIGH_SURROGATE_END)) - { - // we will count this surrogate next time around - byteCount--; - continue; - } - // either good char or partial surrogate - - EncodeChar: - // throw exception on partial surrogate if necessary - if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) - { - // Lone surrogates aren't allowed - // Have to make a fallback buffer if we don't have one - if (!fallbackUsed) - { - // wait on fallbacks if we can - // For fallback we may need a fallback buffer - fallbackUsed = true; - - // Set our internal fallback interesting things. - EncoderReplacementFallbackBuffer_InternalInitialize(&self->buffer.encoder, chars, chars + count, false); - } - - // Do our fallback. Actually we already know its a mixed up surrogate, - // so the ref pSrc isn't gonna do anything. - EncoderReplacementFallbackBuffer_InternalFallback(&self->buffer.encoder, (CHAR16_T)ch, &pSrc); - - // Ignore it if we don't throw (we had preallocated this ch) - byteCount--; - ch = 0; - continue; - } - - // Count them - if (ch > 0x7F) - { - if (ch > 0x7FF) - { - // the extra surrogate byte was compensated by the second surrogate character - // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char) - byteCount++; - } - byteCount++; - } - -#if WIN64 - // check for overflow - if (byteCount < 0) - { - break; - } -#endif - - // If still have fallback don't do fast loop - if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0) - { - // We're reserving 1 byte for each char by default - byteCount++; - goto ProcessChar; - } - - size_t availableChars = (size_t)(pEnd - pSrc); - - // don't fall into the fast decoding loop if we don't have enough characters - if (availableChars <= 13) - { - // try to get over the remainder of the ascii characters fast though - CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered - while (pSrc < pLocalEnd) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) goto ProcessChar; - } - - // we are done - break; - } - -#if WIN64 - // make sure that we won't get a silent overflow inside the fast loop - // (Fall out to slow loop if we have this many characters) - availableChars &= 0x0FFFFFFF; -#endif - - // To compute the upper bound, assume that all characters are ASCII characters at this point, - // the boundary will be decreased for every non-ASCII character we encounter - // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates - CHAR16_T *pStop = pSrc + availableChars - (3 + 4); - - while (pSrc < pStop) - { - ch = *pSrc; - pSrc++; - - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - - // get pSrc aligned - if (((size_t)pSrc & 0x2) != 0) - { - ch = *pSrc; - pSrc++; - if (ch > 0x7F) // Not ASCII - { - if (ch > 0x7FF) // Not 2 Byte - { - if ((ch & 0xF800) == 0xD800) // See if its a Surrogate - goto LongCode; - byteCount++; - } - byteCount++; - } - } - - // Run 2 * 4 characters at a time! - while (pSrc < pStop) - { - ch = *(int*)pSrc; - int chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - - if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - - ch = *(int*)pSrc; - chc = *(int*)(pSrc + 2); - if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII - { - if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte - { - goto LongCodeWithMask; - } - - if ((ch & (int)0xFF800000) != 0) - byteCount++; - if ((ch & (int)0xFF80) != 0) - byteCount++; - if ((chc & (int)0xFF800000) != 0) - byteCount++; - if ((chc & (int)0xFF80) != 0) - byteCount++; - } - pSrc += 4; - } - break; - - LongCodeWithMask: -#if BIGENDIAN - // be careful about the sign extension - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); - else -#endif - ch = (CHAR16_T)ch; - - pSrc++; - - if (ch <= 0x7F) - { - continue; - } - - LongCode: - // use separate helper variables for slow and fast loop so that the jit optimizations - // won't get confused about the variable lifetimes - if (ch > 0x7FF) - { - if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) - { - // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; - if ( - ch > HIGH_SURROGATE_END || - !InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END)) - { - // Back up and drop out to slow loop to figure out error - pSrc--; - break; - } - pSrc++; - - // byteCount - this byte is compensated by the second surrogate character - } - byteCount++; - } - byteCount++; - - // byteCount - the last byte is already included - } - - // no pending char at this point - ch = 0; - } - -#if WIN64 - // check for overflow - assert(byteCount >= 0); -#endif - assert(!fallbackUsed || self->buffer.encoder.fallbackCount < 0); - - return byteCount; -} - -size_t minipal_get_length_utf8_to_utf16(const char* source, size_t sourceLength, unsigned int flags) -{ - errno = 0; - - if (sourceLength == 0) - return 0; - - UTF8Encoding enc = - { - .buffer = { .decoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0 }, .strDefaultLength = 1 } }, - .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS), -#if BIGENDIAN - .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) -#endif - }; - - return GetCharCount(&enc, (unsigned char*)source, sourceLength); -} - -size_t minipal_get_length_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, unsigned int flags) -{ - errno = 0; - - if (sourceLength == 0) - return 0; - - UTF8Encoding enc = - { - // repeat replacement char (0xFFFD) twice for a surrogate pair - .buffer = { .encoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0xFFFD, 0 }, .strDefaultLength = 2 } }, - .useFallback = true, -#if BIGENDIAN - .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) -#endif - }; - -#if !BIGENDIAN - (void)flags; // unused -#endif - - return GetByteCount(&enc, (CHAR16_T*)source, sourceLength); -} - -size_t minipal_convert_utf8_to_utf16(const char* source, size_t sourceLength, CHAR16_T* destination, size_t destinationLength, unsigned int flags) -{ - size_t ret; - errno = 0; - - if (sourceLength == 0) - return 0; - - UTF8Encoding enc = - { - .buffer = { .decoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0 }, .strDefaultLength = 1 } }, - .useFallback = !(flags & MINIPAL_MB_NO_REPLACE_INVALID_CHARS), -#if BIGENDIAN - .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) -#endif - }; - - ret = GetChars(&enc, (unsigned char*)source, sourceLength, destination, destinationLength); - if (errno) ret = 0; - - return ret; -} - -size_t minipal_convert_utf16_to_utf8(const CHAR16_T* source, size_t sourceLength, char* destination, size_t destinationLength, unsigned int flags) -{ - size_t ret; - errno = 0; - - if (sourceLength == 0) - return 0; - - UTF8Encoding enc = - { - // repeat replacement char (0xFFFD) twice for a surrogate pair - .buffer = { .encoder = { .fallbackCount = -1, .fallbackIndex = -1, .strDefault = { 0xFFFD, 0xFFFD, 0 }, .strDefaultLength = 2 } }, - .useFallback = true, -#if BIGENDIAN - .treatAsLE = (flags & MINIPAL_TREAT_AS_LITTLE_ENDIAN) -#endif - }; - -#if !BIGENDIAN - (void)flags; // unused -#endif - - ret = GetBytes(&enc, (CHAR16_T*)source, sourceLength, (unsigned char*)destination, destinationLength); - if (errno) ret = 0; - - return ret; + return destinationIndex; }