From 0c550b75ceec0c302f78fab4a47d7dde3c534a59 Mon Sep 17 00:00:00 2001 From: Jack Stouffer Date: Fri, 22 Jul 2016 10:46:40 -0400 Subject: [PATCH 1/6] Make std.utf.toUTF8 DRY by using byChar internally --- std/utf.d | 73 +++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/std/utf.d b/std/utf.d index 84b25dbda9a..99fe4007237 100644 --- a/std/utf.d +++ b/std/utf.d @@ -2462,67 +2462,32 @@ char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe } } -/******************* - * Encodes string $(D_PARAM s) into UTF-8 and returns the encoded string. +/** + * Encodes string `s` into UTF-8 and returns the encoded string. + * + * Params: + * s = the string to encode + * Returns: + * A UTF-8 string + * See_Also: + * For a lazy, non-allocating version of these functions, see $(LREF byUTF). */ -string toUTF8(scope const char[] s) @safe +string toUTF8(S)(S s) if (isSomeString!S) { - validate(s); - return s.idup; + import std.array : array; + return s.byChar.array; } -/// ditto -string toUTF8(scope const wchar[] s) @safe -{ - char[] r; - size_t i; - immutable slen = s.length; - - r.length = slen; - for (i = 0; i < slen; i++) - { - immutable c = s[i]; - - if (c <= 0x7F) - r[i] = cast(char)c; // fast path for ascii - else - { - r.length = i; - while (i < slen) - encode(r, decode(s, i)); - break; - } - } - - return r; -} - -/// ditto -string toUTF8(scope const dchar[] s) @safe +/// +@safe pure unittest { - char[] r; - size_t i; - immutable slen = s.length; - - r.length = slen; - for (i = 0; i < slen; i++) - { - immutable c = s[i]; + import std.algorithm.comparison : equal; - if (c <= 0x7F) - r[i] = cast(char)c; // fast path for ascii - else - { - r.length = i; - foreach (dchar d; s[i .. 
slen]) - { - encode(r, d); - } - break; - } - } + // The ø is represented by two UTF-8 code units + assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8])); - return r; + // 𐐷 is four code units in UTF-8 + assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); } From 65c4648c273756f81672120054e19769a4ab0f81 Mon Sep 17 00:00:00 2001 From: Jack Stouffer Date: Fri, 22 Jul 2016 10:53:33 -0400 Subject: [PATCH 2/6] Remove over applied pure attribute from std.utf --- std/utf.d | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/std/utf.d b/std/utf.d index 99fe4007237..8525f82521b 100644 --- a/std/utf.d +++ b/std/utf.d @@ -2418,11 +2418,7 @@ void validate(S)(in S str) @safe pure } /* =================== Conversion to UTF8 ======================= */ - -pure -{ - -char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe +char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe pure { if (c <= 0x7F) { @@ -2493,7 +2489,7 @@ string toUTF8(S)(S s) if (isSomeString!S) /* =================== Conversion to UTF16 ======================= */ -wchar[] toUTF16(return ref wchar[2] buf, dchar c) nothrow @nogc @safe +wchar[] toUTF16(return ref wchar[2] buf, dchar c) nothrow @nogc @safe pure in { assert(isValidDchar(c)); @@ -2516,7 +2512,7 @@ body /**************** * Encodes string $(D s) into UTF-16 and returns the encoded string. 
*/ -wstring toUTF16(scope const char[] s) @safe +wstring toUTF16(scope const char[] s) @safe pure { wchar[] r; immutable slen = s.length; @@ -2542,14 +2538,14 @@ wstring toUTF16(scope const char[] s) @safe } /// ditto -wstring toUTF16(scope const wchar[] s) @safe +wstring toUTF16(scope const wchar[] s) @safe pure { validate(s); return s.idup; } /// ditto -wstring toUTF16(scope const dchar[] s) @safe +wstring toUTF16(scope const dchar[] s) @safe pure { wchar[] r; immutable slen = s.length; @@ -2570,7 +2566,7 @@ wstring toUTF16(scope const dchar[] s) @safe /***** * Encodes string $(D_PARAM s) into UTF-32 and returns the encoded string. */ -dstring toUTF32(scope const char[] s) @safe +dstring toUTF32(scope const char[] s) @safe pure { dchar[] r; immutable slen = s.length; @@ -2591,7 +2587,7 @@ dstring toUTF32(scope const char[] s) @safe } /// ditto -dstring toUTF32(scope const wchar[] s) @safe +dstring toUTF32(scope const wchar[] s) @safe pure { dchar[] r; immutable slen = s.length; @@ -2612,15 +2608,12 @@ dstring toUTF32(scope const wchar[] s) @safe } /// ditto -dstring toUTF32(scope const dchar[] s) @safe +dstring toUTF32(scope const dchar[] s) @safe pure { validate(s); return s.idup; } -} // Convert functions are @safe - - /* =================== toUTFz ======================= */ /++ From 2f24a0aba421ab9705126e718ad4a7b0650c320b Mon Sep 17 00:00:00 2001 From: Jack Stouffer Date: Fri, 22 Jul 2016 16:11:10 -0400 Subject: [PATCH 3/6] use appender instead of array --- std/utf.d | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/std/utf.d b/std/utf.d index 8525f82521b..9a4da600d2f 100644 --- a/std/utf.d +++ b/std/utf.d @@ -2470,8 +2470,15 @@ char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe pure */ string toUTF8(S)(S s) if (isSomeString!S) { - import std.array : array; - return s.byChar.array; + import std.array : appender; + + auto app = appender!string(); + app.reserve(s.length); + + foreach (c; s.byChar) + app.put(c); + + 
return app.data; } /// From 3cc0a7cba4a0b59cfd34e433201f89b2998cd089 Mon Sep 17 00:00:00 2001 From: Jack Stouffer Date: Sat, 23 Jul 2016 17:03:37 -0400 Subject: [PATCH 4/6] Added string specific logic --- std/utf.d | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/std/utf.d b/std/utf.d index 9a4da600d2f..7ad2cdb5524 100644 --- a/std/utf.d +++ b/std/utf.d @@ -2470,15 +2470,19 @@ char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe pure */ string toUTF8(S)(S s) if (isSomeString!S) { - import std.array : appender; - - auto app = appender!string(); - app.reserve(s.length); - - foreach (c; s.byChar) - app.put(c); - - return app.data; + static if (is(S : string)) + { + return s.idup; + } + else + { + import std.array : appender; + auto app = appender!string(); + app.reserve(s.length); + foreach (c; s.byUTF2!char) + app.put(c); + return app.data; + } } /// From ed92b3d33feb8be8a8d29f6be6dc838338c52815 Mon Sep 17 00:00:00 2001 From: Jack Stouffer Date: Sat, 23 Jul 2016 17:06:04 -0400 Subject: [PATCH 5/6] Range-ified --- std/utf.d | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/std/utf.d b/std/utf.d index 7ad2cdb5524..958d0666a2d 100644 --- a/std/utf.d +++ b/std/utf.d @@ -2468,7 +2468,7 @@ char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe pure * See_Also: * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 
*/ -string toUTF8(S)(S s) if (isSomeString!S) +string toUTF8(S)(S s) if (isInputRange!S && isSomeChar!(ElementEncodingType!S)) { static if (is(S : string)) { @@ -2478,9 +2478,13 @@ string toUTF8(S)(S s) if (isSomeString!S) { import std.array : appender; auto app = appender!string(); - app.reserve(s.length); + + static if (hasLength!S || isSomeString!S) + app.reserve(s.length); + foreach (c; s.byUTF2!char) app.put(c); + return app.data; } } From e096f29e186100faf187218acca9466f388a93c2 Mon Sep 17 00:00:00 2001 From: Jack Stouffer Date: Sat, 23 Jul 2016 21:11:21 -0400 Subject: [PATCH 6/6] Improved docs and added tests --- std/utf.d | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/std/utf.d b/std/utf.d index 958d0666a2d..34279aec95e 100644 --- a/std/utf.d +++ b/std/utf.d @@ -2459,7 +2459,8 @@ char[] toUTF8(return out char[4] buf, dchar c) nothrow @nogc @safe pure } /** - * Encodes string `s` into UTF-8 and returns the encoded string. + * Encodes the elements of `s` to UTF-8 and returns a newly allocated + * string of the elements. * * Params: * s = the string to encode @@ -2482,7 +2483,7 @@ string toUTF8(S)(S s) if (isInputRange!S && isSomeChar!(ElementEncodingType!S)) static if (hasLength!S || isSomeString!S) app.reserve(s.length); - foreach (c; s.byUTF2!char) + foreach (c; s.byUTF!char) app.put(c); return app.data; @@ -2501,6 +2502,18 @@ string toUTF8(S)(S s) if (isInputRange!S && isSomeChar!(ElementEncodingType!S)) assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); } +@system pure unittest +{ + import std.internal.test.dummyrange : ReferenceInputRange; + import std.algorithm.comparison : equal; + + auto r1 = new ReferenceInputRange!dchar("Hellø"); + auto r2 = new ReferenceInputRange!dchar("𐐷"); + + assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8])); + assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); +} + /* =================== Conversion to UTF16 ======================= */