diff --git a/README.md b/README.md index fabd559..f3173ac 100644 --- a/README.md +++ b/README.md @@ -13,3 +13,8 @@ Current modules included: * std.dateparse * std.regexp * std.stream and friends + +Some individual functions have been moved here rather than full Phobos modules. They are + +* undead.doformat: Contains the `doFormat` function from std.format +* undead.string: Contains regex style pattern matching functions from std.string diff --git a/posix.mak b/posix.mak index 05f0b35..b7f23e5 100644 --- a/posix.mak +++ b/posix.mak @@ -20,7 +20,8 @@ LFLAGS=-L/map/co $(DMD) -c $(DFLAGS) $* SRC= $S/bitarray.d $S/regexp.d $S/datebase.d $S/date.d $S/dateparse.d \ - $S/cstream.d $S/stream.d $S/socketstream.d $S/doformat.d + $S/cstream.d $S/stream.d $S/socketstream.d $S/doformat.d $S/string.d \ + $S/internal/file.d SOURCE= $(SRC) win32.mak posix.mak LICENSE README.md dub.json diff --git a/src/undead/string.d b/src/undead/string.d new file mode 100644 index 0000000..17280f1 --- /dev/null +++ b/src/undead/string.d @@ -0,0 +1,295 @@ +/** + * Contains the obsolete pattern matching functions from Phobos' + * `std.string`. + */ +module undead.string; + +import std.traits; + +/*********************************************** + * See if character c is in the pattern. + * Patterns: + * + * A $(I pattern) is an array of characters much like a $(I character + * class) in regular expressions. A sequence of characters + * can be given, such as "abcde". The '-' can represent a range + * of characters, as "a-e" represents the same pattern as "abcde". + * "a-fA-F0-9" represents all the hex characters. + * If the first character of a pattern is '^', then the pattern + * is negated, i.e. "^0-9" means any character except a digit. + * The functions inPattern, $(B countchars), $(B removeschars), + * and $(B squeeze) use patterns. + * + * Note: In the future, the pattern syntax may be improved + * to be more like regular expression character classes. + */ +bool inPattern(S)(dchar c, in S pattern) @safe pure @nogc +if (isSomeString!S) +{ + bool result = false; + int range = 0; + dchar lastc; + + foreach (size_t i, dchar p; pattern) + { + if (p == '^' && i == 0) + { + result = true; + if (i + 1 == pattern.length) + return (c == p); // or should this be an error? + } + else if (range) + { + range = 0; + if (lastc <= c && c <= p || c == p) + return !result; + } + else if (p == '-' && i > result && i + 1 < pattern.length) + { + range = 1; + continue; + } + else if (c == p) + return !result; + lastc = p; + } + return result; +} + + +@safe pure @nogc unittest +{ + assertCTFEable!( + { + assert(inPattern('x', "x") == 1); + assert(inPattern('x', "y") == 0); + assert(inPattern('x', string.init) == 0); + assert(inPattern('x', "^y") == 1); + assert(inPattern('x', "yxxy") == 1); + assert(inPattern('x', "^yxxy") == 0); + assert(inPattern('x', "^abcd") == 1); + assert(inPattern('^', "^^") == 0); + assert(inPattern('^', "^") == 1); + assert(inPattern('^', "a^") == 1); + assert(inPattern('x', "a-z") == 1); + assert(inPattern('x', "A-Z") == 0); + assert(inPattern('x', "^a-z") == 0); + assert(inPattern('x', "^A-Z") == 1); + assert(inPattern('-', "a-") == 1); + assert(inPattern('-', "^A-") == 0); + assert(inPattern('a', "z-a") == 1); + assert(inPattern('z', "z-a") == 1); + assert(inPattern('x', "z-a") == 0); + }); +} + + +/** + * See if character c is in the intersection of the patterns. + */ +bool inPattern(S)(dchar c, S[] patterns) @safe pure @nogc +if (isSomeString!S) +{ + foreach (string pattern; patterns) + { + if (!inPattern(c, pattern)) + { + return false; + } + } + return true; +} + + +/** + * Count characters in s that match pattern. + */ +size_t countchars(S, S1)(S s, in S1 pattern) @safe pure @nogc +if (isSomeString!S && isSomeString!S1) +{ + size_t count; + foreach (dchar c; s) + { + count += inPattern(c, pattern); + } + return count; +} + +@safe pure @nogc unittest +{ + assertCTFEable!( + { + assert(countchars("abc", "a-c") == 3); + assert(countchars("hello world", "or") == 3); + }); +} + + +/** + * Return string that is s with all characters removed that match pattern. + */ +S removechars(S)(S s, in S pattern) @safe pure +if (isSomeString!S) +{ + import std.utf : encode; + + Unqual!(typeof(s[0]))[] r; + bool changed = false; + + foreach (size_t i, dchar c; s) + { + if (inPattern(c, pattern)) + { + if (!changed) + { + changed = true; + r = s[0 .. i].dup; + } + continue; + } + if (changed) + { + encode(r, c); + } + } + if (changed) + return r; + else + return s; +} + +@safe pure unittest +{ + assertCTFEable!( + { + assert(removechars("abc", "a-c").length == 0); + assert(removechars("hello world", "or") == "hell wld"); + assert(removechars("hello world", "d") == "hello worl"); + assert(removechars("hah", "h") == "a"); + }); +} + +@safe pure unittest +{ + assert(removechars("abc", "x") == "abc"); +} + + +/*************************************************** + * Return string where sequences of a character in s[] from pattern[] + * are replaced with a single instance of that character. + * If pattern is null, it defaults to all characters. + */ +S squeeze(S)(S s, in S pattern = null) +{ + import std.utf : encode, stride; + + Unqual!(typeof(s[0]))[] r; + dchar lastc; + size_t lasti; + int run; + bool changed; + + foreach (size_t i, dchar c; s) + { + if (run && lastc == c) + { + changed = true; + } + else if (pattern is null || inPattern(c, pattern)) + { + run = 1; + if (changed) + { + if (r is null) + r = s[0 .. lasti].dup; + encode(r, c); + } + else + lasti = i + stride(s, i); + lastc = c; + } + else + { + run = 0; + if (changed) + { + if (r is null) + r = s[0 .. lasti].dup; + encode(r, c); + } + } + } + return changed ? ((r is null) ? s[0 .. lasti] : cast(S) r) : s; +} + +@system pure unittest +{ + assertCTFEable!( + { + string s; + + assert(squeeze("hello") == "helo"); + + s = "abcd"; + assert(squeeze(s) is s); + s = "xyzz"; + assert(squeeze(s).ptr == s.ptr); // should just be a slice + + assert(squeeze("hello goodbyee", "oe") == "hello godbye"); + }); +} + +/*************************************************************** + Finds the position $(D_PARAM pos) of the first character in $(D_PARAM + s) that does not match $(D_PARAM pattern) (in the terminology used by + $(REF inPattern, std,string)). Updates $(D_PARAM s = + s[pos..$]). Returns the slice from the beginning of the original + (before update) string up to, and excluding, $(D_PARAM pos). + +The $(D_PARAM munch) function is mostly convenient for skipping +certain category of characters (e.g. whitespace) when parsing +strings. (In such cases, the return value is not used.) + */ +S1 munch(S1, S2)(ref S1 s, S2 pattern) @safe pure @nogc +{ + size_t j = s.length; + foreach (i, dchar c; s) + { + if (!inPattern(c, pattern)) + { + j = i; + break; + } + } + scope(exit) s = s[j .. $]; + return s[0 .. j]; +} + +/// +@safe pure @nogc unittest +{ + string s = "123abc"; + string t = munch(s, "0123456789"); + assert(t == "123" && s == "abc"); + t = munch(s, "0123456789"); + assert(t == "" && s == "abc"); +} + +@safe pure @nogc unittest +{ + string s = "123€abc"; + string t = munch(s, "0123456789"); + assert(t == "123" && s == "€abc"); + t = munch(s, "0123456789"); + assert(t == "" && s == "€abc"); + t = munch(s, "£$€¥"); + assert(t == "€" && s == "abc"); +} + +// helper function for unit tests +private @property void assertCTFEable(alias dg)() +{ + static assert({ cast(void) dg(); return true; }()); + cast(void) dg(); +} \ No newline at end of file diff --git a/win32.mak b/win32.mak index f4db3d3..cd55b5f 100644 --- a/win32.mak +++ b/win32.mak @@ -20,7 +20,8 @@ LFLAGS=-L/map/co $(DMD) -c $(DFLAGS) $* SRC= $S\bitarray.d $S\regexp.d $S\datebase.d $S\date.d $S\dateparse.d \ - $S\cstream.d $S\stream.d $S\socketstream.d $S\doformat.d $S\internal\file.d + $S\cstream.d $S\stream.d $S\socketstream.d $S\doformat.d $S/string.d \ + $S\internal\file.d SOURCE= $(SRC) win32.mak win64.mak posix.mak LICENSE README.md dub.json diff --git a/win64.mak b/win64.mak index 609b2bf..45a1045 100644 --- a/win64.mak +++ b/win64.mak @@ -20,7 +20,8 @@ LFLAGS=-L/map/co $(DMD) -c $(DFLAGS) $* SRC= $S\bitarray.d $S\regexp.d $S\datebase.d $S\date.d $S\dateparse.d \ - $S\cstream.d $S\stream.d $S\socketstream.d $S\doformat.d $S\internal\file.d + $S\cstream.d $S\stream.d $S\socketstream.d $S\doformat.d $S/string.d \ + $S\internal\file.d SOURCE= $(SRC) win32.mak win64.mak posix.mak LICENSE README.md dub.json