Skip to content
This repository was archived by the owner on Oct 12, 2022. It is now read-only.
/ druntime Public archive
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 33 additions & 14 deletions src/rt/util/utf.d
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,15 @@
* $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
* $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
*
* Copyright: Copyright Digital Mars 2003 - 2009.
* Copyright: Copyright Digital Mars 2003 - 2016.
* License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
* Authors: Walter Bright, Sean Kelly
* Source: $(DRUNTIMESRC src/rt/util/_utf.d)
*/

/* Copyright Digital Mars 2003 - 2009.
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
module rt.util.utf;


// Declaration only (no body here) of a C-linkage error hook taking a message,
// an index, and the caller's file/line (defaulted to __FILE__ / __LINE__).
// The second line is the same declaration with @safe pure attributes added.
// NOTE(review): presumably called when a malformed UTF sequence is found at
// index idx -- confirm against the definition.
extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ );
extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

/*******************************
* Test if c is a valid UTF-32 character.
Expand All @@ -38,6 +33,7 @@ extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__,
* Returns: true if it is, false if not.
*/

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
/* Note: FFFE and FFFF are specifically permitted by the
Expand Down Expand Up @@ -87,6 +83,7 @@ static immutable UTF8stride =
* The number of bytes in the UTF-8 sequence or
* 0xFF meaning s[i] is not the start of a UTF-8 sequence.
*/
@safe @nogc pure nothrow
uint stride(in char[] s, size_t i)
{
return UTF8stride[s[i]];
Expand All @@ -96,6 +93,7 @@ uint stride(in char[] s, size_t i)
* stride() returns the length of a UTF-16 sequence starting at index i
* in string s.
*/
@safe @nogc pure nothrow
uint stride(in wchar[] s, size_t i)
{ uint u = s[i];
return 1 + (u >= 0xD800 && u <= 0xDBFF);
Expand All @@ -106,6 +104,7 @@ uint stride(in wchar[] s, size_t i)
* in string s.
* Returns: The return value will always be 1.
*/
@safe @nogc pure nothrow
uint stride(in dchar[] s, size_t i)
{
return 1;
Expand All @@ -116,7 +115,7 @@ uint stride(in dchar[] s, size_t i)
* and assuming that index i is at the start of a UTF character,
* determine the number of UCS characters up to that index i.
*/

@safe pure
size_t toUCSindex(in char[] s, size_t i)
{
size_t n;
Expand All @@ -135,6 +134,7 @@ size_t toUCSindex(in char[] s, size_t i)
}

/** ditto */
@safe pure
size_t toUCSindex(in wchar[] s, size_t i)
{
size_t n;
Expand All @@ -153,6 +153,7 @@ size_t toUCSindex(in wchar[] s, size_t i)
}

/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(in dchar[] s, size_t i)
{
return i;
Expand All @@ -161,7 +162,7 @@ size_t toUCSindex(in dchar[] s, size_t i)
/******************************************
* Given a UCS index n into an array of characters s[], return the UTF index.
*/

@safe pure
size_t toUTFindex(in char[] s, size_t n)
{
size_t i;
Expand All @@ -177,6 +178,7 @@ size_t toUTFindex(in char[] s, size_t n)
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in wchar[] s, size_t n)
{
size_t i;
Expand All @@ -190,6 +192,7 @@ size_t toUTFindex(in wchar[] s, size_t n)
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(in dchar[] s, size_t n)
{
return n;
Expand All @@ -202,6 +205,7 @@ size_t toUTFindex(in dchar[] s, size_t n)
* decoded character. If the character is not well formed, a UtfException is
* thrown and idx remains unchanged.
*/
@safe pure
dchar decode(in char[] s, ref size_t idx)
in
{
Expand Down Expand Up @@ -342,7 +346,7 @@ unittest
}

/** ditto */

@safe pure
dchar decode(in wchar[] s, ref size_t idx)
in
{
Expand Down Expand Up @@ -400,7 +404,7 @@ dchar decode(in wchar[] s, ref size_t idx)
}

/** ditto */

@safe pure
dchar decode(in dchar[] s, ref size_t idx)
in
{
Expand All @@ -427,6 +431,7 @@ dchar decode(in dchar[] s, ref size_t idx)
/*******************************
* Encodes character c and appends it to array s[].
*/
@safe pure nothrow
void encode(ref char[] s, dchar c)
in
{
Expand Down Expand Up @@ -495,7 +500,7 @@ unittest
}

/** ditto */

@safe pure nothrow
void encode(ref wchar[] s, dchar c)
in
{
Expand All @@ -521,6 +526,7 @@ void encode(ref wchar[] s, dchar c)
}

/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
in
{
Expand All @@ -535,7 +541,7 @@ void encode(ref dchar[] s, dchar c)
Returns the number of code units needed to encode $(D c) when $(D C) is
the code unit type. The length is a count of $(D C) elements, not bytes.
*/

@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
static if (C.sizeof == 1)
Expand Down Expand Up @@ -564,6 +570,7 @@ Checks to see if string is well formed or not. $(D S) can be an array
of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
if it is not. Use to check all untrusted input for correctness.
*/
@safe pure
void validate(S)(in S s)
{
auto len = s.length;
Expand All @@ -575,6 +582,7 @@ void validate(S)(in S s)

/* =================== Conversion to UTF8 ======================= */

@safe pure nothrow @nogc
char[] toUTF8(return out char[4] buf, dchar c)
in
{
Expand Down Expand Up @@ -614,6 +622,7 @@ char[] toUTF8(return out char[4] buf, dchar c)
/*******************
* Encodes string s into UTF-8 and returns the encoded string.
*/
@safe pure nothrow
string toUTF8(string s)
in
{
Expand All @@ -625,6 +634,7 @@ string toUTF8(string s)
}

/** ditto */
@trusted pure
string toUTF8(in wchar[] s)
{
char[] r;
Expand Down Expand Up @@ -652,6 +662,7 @@ string toUTF8(in wchar[] s)
}

/** ditto */
@trusted pure
string toUTF8(in dchar[] s)
{
char[] r;
Expand Down Expand Up @@ -680,6 +691,7 @@ string toUTF8(in dchar[] s)

/* =================== Conversion to UTF16 ======================= */

@safe pure nothrow @nogc
wchar[] toUTF16(return out wchar[2] buf, dchar c)
in
{
Expand All @@ -705,6 +717,7 @@ wchar[] toUTF16(return out wchar[2] buf, dchar c)
* toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
* an LPWSTR or LPCWSTR argument.
*/
@trusted pure
wstring toUTF16(in char[] s)
{
wchar[] r;
Expand All @@ -731,6 +744,7 @@ wstring toUTF16(in char[] s)

// wptr: shorthand for a pointer to const wchar; used as the return type of toUTF16z.
alias const(wchar)* wptr;
/** ditto */
@safe pure
wptr toUTF16z(in char[] s)
{
wchar[] r;
Expand All @@ -757,6 +771,7 @@ wptr toUTF16z(in char[] s)
}

/** ditto */
@safe pure nothrow
wstring toUTF16(wstring s)
in
{
Expand All @@ -768,6 +783,7 @@ wstring toUTF16(wstring s)
}

/** ditto */
@trusted pure nothrow
wstring toUTF16(in dchar[] s)
{
wchar[] r;
Expand All @@ -787,6 +803,7 @@ wstring toUTF16(in dchar[] s)
/*****
* Encodes string s into UTF-32 and returns the encoded string.
*/
@trusted pure
dstring toUTF32(in char[] s)
{
dchar[] r;
Expand All @@ -807,6 +824,7 @@ dstring toUTF32(in char[] s)
}

/** ditto */
@trusted pure
dstring toUTF32(in wchar[] s)
{
dchar[] r;
Expand All @@ -827,6 +845,7 @@ dstring toUTF32(in wchar[] s)
}

/** ditto */
@safe pure nothrow
dstring toUTF32(dstring s)
in
{
Expand Down