From 66048ae334a9fa2bec2b47e706b3c8691daf311c Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Tue, 19 Apr 2016 13:48:32 +0300 Subject: [PATCH 01/23] [Refactor] Generalize kickstart engines, in preparation for more to come --- posix.mak | 2 +- std/regex/internal/backtracking.d | 2 +- std/regex/internal/ir.d | 16 +- std/regex/internal/parser.d | 6 +- std/regex/internal/{kickstart.d => shiftor.d} | 162 ++++++++++-------- std/regex/internal/thompson.d | 2 +- win32.mak | 2 +- win64.mak | 2 +- 8 files changed, 111 insertions(+), 83 deletions(-) rename std/regex/internal/{kickstart.d => shiftor.d} (82%) diff --git a/posix.mak b/posix.mak index 8b9f55ea391..749ce932736 100644 --- a/posix.mak +++ b/posix.mak @@ -192,7 +192,7 @@ PACKAGE_std_experimental_ndslice = package iteration selection slice PACKAGE_std_net = curl isemail PACKAGE_std_range = interfaces package primitives PACKAGE_std_regex = package $(addprefix internal/,generator ir parser \ - backtracking kickstart tests thompson) + backtracking shiftor tests thompson) # Modules in std (including those in packages) STD_MODULES=$(call P2MODULES,$(STD_PACKAGES)) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index a7c360c5e88..5427b1b380e 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -216,7 +216,7 @@ template BacktrackingMatcher(bool CTregex) } static if (kicked) { - if (!re.kickstart.empty) + if (re.kickstart) { for (;;) { diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index b5d3417b950..912b435b87d 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -452,6 +452,17 @@ struct Group(DataIndex) writeln("\t", disassemble(slice, pc, dict)); } +/+ + Generic interface for kickstart engine components. + The goal of kickstart is to advance input to the next potential match, + the more accurate & fast the better. 
++/ +interface Kickstart(Char){ +@trusted: + bool opCall(ref Input!Char input); + @property bool empty() const; +} + /++ $(D Regex) object holds regular expression pattern in compiled form. Instances of this object are constructed via calls to $(D regex). @@ -513,7 +524,6 @@ struct Regex(Char) } package(std.regex): - import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency NamedGroup[] dict; // maps name -> user group number uint ngroup; // number of internal groups uint maxCounterDepth; // max depth of nested {n,m} repetitions @@ -622,10 +632,10 @@ struct Input(Char) @property bool atEnd(){ return _index == _origin.length; } + bool search(Kickstart)(ref Kickstart kick, ref dchar res, ref size_t pos) { - size_t idx = kick.search(_origin, _index); - _index = idx; + kick(this); return nextChar(res, pos); } diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 49f6b45573f..ade9c8a3e65 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -1604,7 +1604,11 @@ struct Parser(R, Generator) } checkIfOneShot(); if (!(flags & RegexInfo.oneShot)) - kickstart = Kickstart!Char(zis, new uint[](256)); + { + kickstart = new ShiftOr!Char(zis); + if(kickstart.empty) + kickstart = null; + } debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); optimize(zis); } diff --git a/std/regex/internal/kickstart.d b/std/regex/internal/shiftor.d similarity index 82% rename from std/regex/internal/kickstart.d rename to std/regex/internal/shiftor.d index f052a955509..f57dbe20420 100644 --- a/std/regex/internal/kickstart.d +++ b/std/regex/internal/shiftor.d @@ -2,7 +2,7 @@ Kickstart is a coarse-grained "filter" engine that finds likely matches to be verified by full-blown matcher. 
*/ -module std.regex.internal.kickstart; +module std.regex.internal.shiftor; package(std.regex): @@ -26,7 +26,7 @@ uint effectiveSize(Char)() Kickstart engine using ShiftOr algorithm, a bit parallel technique for inexact string searching. */ -struct ShiftOr(Char) +class ShiftOr(Char) : Kickstart!Char { private: uint[] table; @@ -127,13 +127,13 @@ private: } public: - @trusted this(ref Regex!Char re, uint[] memory) + @trusted this(ref Regex!Char re) { static import std.algorithm.comparison; import std.algorithm.searching : countUntil; import std.conv : text; import std.range : assumeSorted; - assert(memory.length == 256); + uint[] memory = new uint[256]; fChar = uint.max; // FNV-1a flavored hash (uses 32bits at a time) ulong hash(uint[] tab) @@ -385,22 +385,23 @@ public: } } - @property bool empty() const { return n_length == 0; } + final @property bool empty() const { return n_length == 0; } - @property uint length() const{ return n_length/charSize; } + final @property uint length() const{ return n_length/charSize; } // lookup compatible bit pattern in haystack, return starting index // has a useful trait: if supplied with valid UTF indexes, // returns only valid UTF indexes // (that given the haystack in question is valid UTF string) - @trusted size_t search(const(Char)[] haystack, size_t idx) + final @trusted bool opCall(ref Input!Char s) {//@BUG: apparently assumes little endian machines import std.conv : text; import core.stdc.string : memchr; assert(!empty); - auto p = cast(const(ubyte)*)(haystack.ptr+idx); + auto haystack = s._origin; uint state = uint.max; uint limit = 1u<<(n_length - 1u); + auto p = cast(const(ubyte)*)(haystack.ptr+s._index); debug(std_regex_search) writefln("Limit: %32b",limit); if (fChar != uint.max) { @@ -415,11 +416,17 @@ public: assert(p <= end, text(p," vs ", end)); p = cast(ubyte*)memchr(p, fChar, end - p); if (!p) - return haystack.length; + { + s._index = haystack.length; + return false; + } if ((cast(size_t)p & (Char.sizeof-1)) == 
orginalAlign) break; if (++p == end) - return haystack.length; + { + s._index = haystack.length; + return false; + } } state = ~1u; assert((cast(size_t)p & (Char.sizeof-1)) == orginalAlign); @@ -433,8 +440,10 @@ public: p++; //first char is tested, see if that's all if (!(state & limit)) - return (p-cast(ubyte*)haystack.ptr)/Char.sizeof - -length; + { + s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length; + return true; + } } else {//have some bits/states for possible matches, @@ -452,8 +461,10 @@ public: p++; } if (!(state & limit)) - return (p-cast(ubyte*)haystack.ptr)/Char.sizeof - -length; + { + s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length; + return true; + } } debug(std_regex_search) writefln("State: %32b", state); } @@ -471,8 +482,10 @@ public: state = (state<<1) | table[p[2]]; p += 4; if (!(state & limit))//division rounds down for dchar - return (p-cast(ubyte*)haystack.ptr)/Char.sizeof - -length; + { + s._index = (p-cast(ubyte*)haystack.ptr)/Char.sizeof-length; + return true; + } } } else @@ -483,23 +496,31 @@ public: { state = (state<<1) | table[p[i++]]; if (!(state & limit)) - return idx+i/Char.sizeof-length; + { + s._index += i/Char.sizeof-length; + return true; + } } while (i < len) { state = (state<<1) | table[p[i++]]; if (!(state & limit)) - return idx+i/Char.sizeof - -length; + { + s._index += i/Char.sizeof-length; + return true; + } state = (state<<1) | table[p[i++]]; if (!(state & limit)) - return idx+i/Char.sizeof - -length; + { + s._index += i/Char.sizeof-length; + return true; + } debug(std_regex_search) writefln("State: %32b", state); } } } - return haystack.length; + s._index = haystack.length; + return false; } @system debug static void dump(uint[] table) @@ -515,65 +536,58 @@ public: unittest { import std.conv, std.regex; - @trusted void test_fixed(alias Kick)() + auto shiftOrLength(C)(const(C)[] pat, uint length) + { + auto r = regex(pat); + auto kick = new ShiftOr!C(r); + assert(kick.length == length, text(C.stringof, 
" == ", kick.length)); + return kick; + } + auto searches(C)(const (C)[] source, ShiftOr!C kick, uint[] results...) { - foreach (i, v; AliasSeq!(char, wchar, dchar)) + auto inp = Input!C(source); + foreach(r; results) { - alias Char = v; - alias String = immutable(v)[]; - auto r = regex(to!String(`abc$`)); - auto kick = Kick!Char(r, new uint[256]); - assert(kick.length == 3, text(Kick.stringof," ",v.stringof, " == ", kick.length)); - auto r2 = regex(to!String(`(abc){2}a+`)); - kick = Kick!Char(r2, new uint[256]); - assert(kick.length == 7, text(Kick.stringof,v.stringof," == ", kick.length)); - auto r3 = regex(to!String(`\b(a{2}b{3}){2,4}`)); - kick = Kick!Char(r3, new uint[256]); - assert(kick.length == 10, text(Kick.stringof,v.stringof," == ", kick.length)); - auto r4 = regex(to!String(`\ba{2}c\bxyz`)); - kick = Kick!Char(r4, new uint[256]); - assert(kick.length == 6, text(Kick.stringof,v.stringof, " == ", kick.length)); - auto r5 = regex(to!String(`\ba{2}c\b`)); - kick = Kick!Char(r5, new uint[256]); - size_t x = kick.search("aabaacaa", 0); - assert(x == 3, text(Kick.stringof,v.stringof," == ", kick.length)); - x = kick.search("aabaacaa", x+1); - assert(x == 8, text(Kick.stringof,v.stringof," == ", kick.length)); + kick(inp); + dchar ch; + size_t idx; + assert(inp._index == r, text(inp._index, " vs ", r)); + inp.nextChar(ch, idx); } } - @trusted void test_flex(alias Kick)() + + foreach(i, Char; AliasSeq!(char, wchar, dchar)) { - foreach (i, v; AliasSeq!(char, wchar, dchar)) - { - alias Char = v; - alias String = immutable(v)[]; - auto r = regex(to!String(`abc[a-z]`)); - auto kick = Kick!Char(r, new uint[256]); - auto x = kick.search(to!String("abbabca"), 0); - assert(x == 3, text("real x is ", x, " ",v.stringof)); + alias String = immutable(Char)[]; + shiftOrLength(`abc`.to!String, 3); + shiftOrLength(`abc$`.to!String, 3); + shiftOrLength(`(abc){2}a+`.to!String, 7); + shiftOrLength(`\b(a{2}b{3}){2,4}`.to!String, 10); + shiftOrLength(`\ba{2}c\bxyz`.to!String, 6); 
+ auto kick = shiftOrLength(`\ba{2}c\b`.to!String, 3); + auto inp = Input!Char("aabaacaa"); + assert(kick(inp)); + assert(inp._index == 3, text(Char.stringof," == ", kick.length)); + dchar ch; + size_t idx; + inp.nextChar(ch, idx); + assert(!kick(inp)); + assert(inp._index == 8, text(Char.stringof," == ", kick.length)); + } - auto r2 = regex(to!String(`(ax|bd|cdy)`)); - String s2 = to!String("abdcdyabax"); - kick = Kick!Char(r2, new uint[256]); - x = kick.search(s2, 0); - assert(x == 1, text("real x is ", x)); - x = kick.search(s2, x+1); - assert(x == 3, text("real x is ", x)); - x = kick.search(s2, x+1); - assert(x == 8, text("real x is ", x)); - auto rdot = regex(to!String(`...`)); - kick = Kick!Char(rdot, new uint[256]); - assert(kick.length == 0); - auto rN = regex(to!String(`a(b+|c+)x`)); - kick = Kick!Char(rN, new uint[256]); - assert(kick.length == 3, to!string(kick.length)); - assert(kick.search("ababx",0) == 2); - assert(kick.search("abaacba",0) == 3);//expected inexact + foreach(i, Char; AliasSeq!(char, wchar, dchar)) + { + alias String = immutable(Char)[]; + auto kick = shiftOrLength(`abc[a-z]`.to!String, 4); + searches("abbabca".to!String, kick, 3); + kick = shiftOrLength(`(ax|bd|cdy)`.to!String, 2); + searches("abdcdyabax".to!String, kick, 1, 3, 8); + + shiftOrLength(`...`.to!String, 0); + kick = shiftOrLength(`a(b+|c+)x`.to!String, 3); + searches("ababx".to!String, kick, 2); + searches("abaacba".to!String, kick, 3); //expected inexact - } } - test_fixed!(ShiftOr)(); - test_flex!(ShiftOr)(); } -alias Kickstart = ShiftOr; diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index 3065ee6fc2a..530e5c503e0 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -922,7 +922,7 @@ template ThompsonOps(E,S, bool withInput:false) return matchOneShot(matches); } static if (kicked) - if (!re.kickstart.empty) + if (re.kickstart) return matchImpl!(true)(matches); return matchImpl!(false)(matches); } diff --git 
a/win32.mak b/win32.mak index 38cc8242fc8..b093f880b27 100644 --- a/win32.mak +++ b/win32.mak @@ -224,7 +224,7 @@ SRC_STD_REGEX= \ std\regex\internal\tests.d \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ - std\regex\internal\kickstart.d \ + std\regex\internal\shiftor.d \ std\regex\internal\generator.d SRC_STD_C= \ diff --git a/win64.mak b/win64.mak index 838595e31c2..fe496708924 100644 --- a/win64.mak +++ b/win64.mak @@ -243,7 +243,7 @@ SRC_STD_REGEX= \ std\regex\internal\tests.d \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ - std\regex\internal\kickstart.d \ + std\regex\internal\shiftor.d \ std\regex\internal\generator.d SRC_STD_C= \ From 1416ddb0183553f291e09c009d4bcded1d549c29 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Mon, 11 Apr 2016 18:09:25 +0300 Subject: [PATCH 02/23] A start on bit-NFA --- std/regex/internal/bitnfa.d | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 std/regex/internal/bitnfa.d diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d new file mode 100644 index 00000000000..045a547dfce --- /dev/null +++ b/std/regex/internal/bitnfa.d @@ -0,0 +1,60 @@ +//Written in the D programming language +/* + Implementation of a concept "NFA in a word" which is + bit-parallel impementation of regex where each bit represents + a state in an NFA. Execution is Thompson-style achieved via bit tricks. + + There is a great number of limitations inlcuding not tracking any state (captures) + and not supporting even basic assertions such as ^, $ or \b. +*/ +import std.regex.internal.ir; + +// since there is no way to mark a starting position +// need 2 instance of BitNfa - one to find the end, and the other +// to run backwards to find the start. 
+struct BitNfa +{ + uint asciiTab[128]; // state mask for ascii characters + UintTrie2 uniTab; // state mask for unicode characters + uint[uint] controlFlow; // maps each bit pattern to resulting jumps pattern + uint controlFlowMask; // masks all control flow bits + uint finalMask; // marks final states terminating the NFA + + bool opCall(Input)(ref Input r) + { + dchar ch; + size_t idx; + uint word = ~0u; + while(r.nextChar(ch, idx)){ + word <<= 1; // shift - create a state + // cfMask has 1 for each control-flow op + uint cflow = ~word & controlFlowMask; + word = word | controlFlowMask; // kill cflow + word |= controlFlow[cflow]; // map normal ops + if(word & finalMask != finalMask) + return true; + // mask away failing states + if(ch < 0x80) + word |= assciiTab[ch]; + else + word |= uniTab[ch]; + } + return false; + } +} + +final class BitMatcher +{ + BitNfa forward, backward; + bool opCall(Input)(ref Input r) + { + bool res = forward(r); + if(res){ + auto backward = r.loopBack + backward(backward); + r.reset(backward._index); + } + return res; + } +} + From 99095eebfc7d8af7181b11c20a289c9cbad58916 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Wed, 13 Apr 2016 17:13:56 +0300 Subject: [PATCH 03/23] ASCII-only version of Bit-NFA --- posix.mak | 2 +- std/regex/internal/bitnfa.d | 512 ++++++++++++++++++++++++++++++++++-- std/regex/internal/ir.d | 108 ++++++++ std/regex/internal/parser.d | 2 +- 4 files changed, 607 insertions(+), 17 deletions(-) diff --git a/posix.mak b/posix.mak index 749ce932736..b97bd52d3c6 100644 --- a/posix.mak +++ b/posix.mak @@ -192,7 +192,7 @@ PACKAGE_std_experimental_ndslice = package iteration selection slice PACKAGE_std_net = curl isemail PACKAGE_std_range = interfaces package primitives PACKAGE_std_regex = package $(addprefix internal/,generator ir parser \ - backtracking shiftor tests thompson) + backtracking bitnfa tests thompson shiftor) # Modules in std (including those in packages) STD_MODULES=$(call 
P2MODULES,$(STD_PACKAGES)) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 045a547dfce..049f5fdac61 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -1,60 +1,542 @@ //Written in the D programming language /* Implementation of a concept "NFA in a word" which is - bit-parallel impementation of regex where each bit represents + bit-parallel impementation of regex where each bit represents a state in an NFA. Execution is Thompson-style achieved via bit tricks. There is a great number of limitations inlcuding not tracking any state (captures) and not supporting even basic assertions such as ^, $ or \b. */ +module std.regex.internal.bitnfa; + +package(std.regex): + import std.regex.internal.ir; -// since there is no way to mark a starting position -// need 2 instance of BitNfa - one to find the end, and the other +debug(std_regex_bitnfa) import std.stdio; + + + +struct HashTab() +{ + @disable this(this); + + uint opIndex(uint key) + { + auto p = locate(key, table); + assert(p.occupied); + return p.value; + } + + void opIndexAssign(uint value, uint key) + { + if(table.length == 0) grow(); + auto p = locate(key, table); + if(!p.occupied) + { + items++; + if(4*items >= table.length*3) + { + grow(); + p = locate(key, table); + } + p.occupied = true; + p.key = key; + } + p.value = value; + } + + auto keys() + { + auto app = appender!(uint[])(); + foreach(i, v; table) + { + if(v.occupied) + app.put(v.key); + } + return app.data; + } + + auto values() + { + auto app = appender!(uint[])(); + foreach(i, v; table) + { + if(v.occupied) + app.put(v.value); + } + return app.data; + } + +private: + static uint hashOf(uint val) + { + return (val >> 20) ^ (val>>8) ^ val; + } + + struct Node + { + uint key; + uint value; + bool occupied; + } + Node[] table; + size_t items; + + static Node* locate(uint key, Node[] table) + { + size_t slot = hashOf(key) & (table.length-1); + while(table.ptr[slot].occupied) + { + 
if(table.ptr[slot].key == key) + break; + slot += 1; + if(slot == table.length) + slot = 0; + } + return table.ptr+slot; + } + + void grow() + { + Node[] newTable = new Node[table.length ? table.length*2 : 4]; + foreach(i, v; table) + { + if(v.occupied) + { + auto p = locate(v.key, newTable); + *p = v; + } + } + } +} + + +// Specialized 2-level trie of uint masks for BitNfa. +// Uses the concept of CoW: a page gets modified in place +// if the block's ref-count is 1, else a newblock is allocated +// and ref count is decreased +struct UIntTrie2 +{ + ushort[] index; // pages --> blocks + ushort[] refCounts; // ref counts for each block + uint[] hashes; // hashes of blocks + uint[] blocks; // linear array with blocks + uint[] scratch; // temporary block + enum blockSize = 2<<8; // size of block + + static uint hash(uint[] data) + { + uint h = 5183; + foreach(v; data) + { + h = 31*h + v; + } + return h; + } + + static UIntTrie2 opCall() + { + UIntTrie2 ut; + ut.index.length = 2<<13; + ut.blocks = new uint[blockSize]; + ut.blocks[] = uint.max; // all ones + ut.scratch = new uint[blockSize]; + ut.refCounts = new ushort[1]; + ut.refCounts[0] = 2<<13; + ut.hashes = new uint[1]; + ut.hashes[0] = hash(ut.blocks); + return ut; + } + + bool opIndex(dchar ch) + { + return false; // TODO: stub + } + + void opIndexOpAssign(string op)(uint val, dchar ch) + { + // TODO: stub + } + + void opSliceOpAssign(string op)(uint val, uint start, uint end) + { + // TODO: stub + } +} + +// Since there is no way to mark a starting position +// we need 2 instances of BitNfa: one to find the end, and the other // to run backwards to find the start. 
struct BitNfa { - uint asciiTab[128]; // state mask for ascii characters - UintTrie2 uniTab; // state mask for unicode characters + uint[128] asciiTab; // state mask for ascii characters + UIntTrie2 uniTab; // state mask for unicode characters uint[uint] controlFlow; // maps each bit pattern to resulting jumps pattern uint controlFlowMask; // masks all control flow bits uint finalMask; // marks final states terminating the NFA + bool empty; // if this engine is empty + + void combineControlFlow() + { + uint[] keys = controlFlow.keys; + uint[] values = controlFlow.values; + auto selection = new bool[keys.length]; + bool nextChoice() + { + uint i; + for(i=0;i %d %s", j, ir[j].mnemonic); + paths.push(j+IRL!Option); + //writefln(">> %d", j+IRL!Option); + j = j + ir[j].data + IRL!Option; + } + break; + case GotoEndOr: + paths.push(j+IRL!GotoEndOr+ir[j].data); + break; + case OrEnd, Wordboundary, Notwordboundary, Bol, Eol, Nop, GroupStart, GroupEnd: + paths.push(j+ir[j].length); + break; + case LookaheadStart, NeglookaheadStart, LookbehindStart, + NeglookbehindStart: + paths.push(j + IRL!LookaheadStart + ir[j].data + IRL!LookaheadEnd); + break; + case InfiniteStart, InfiniteQStart: + paths.push(j+IRL!InfiniteStart); + paths.push(j+ir[j].data+IRL!InfiniteEnd); + break; + case InfiniteBloomStart: + paths.push(j+IRL!InfiniteStart); + paths.push(j+ir[j].data+IRL!InfiniteBloomEnd); + break; + case InfiniteEnd, InfiniteQEnd: + paths.push(j-ir[j].data); + paths.push(j+IRL!InfiniteEnd); + break; + case InfiniteBloomEnd: + paths.push(j-ir[j].data); + paths.push(j+IRL!InfiniteBloomEnd); + break; + default: + result ~= j; + } + } + return result; + } + + this(Char)(auto ref Regex!Char re) + { + asciiTab[] = uint.max; // all ones + uniTab = UIntTrie2(); + controlFlow[0] = 0; + // pc -> bit number + uint[] bitMapping = new uint[re.ir.length]; + uint bitCount = 0, nesting=0, lastNonnested=0; + bool stop = false; + with(re) +outer: for(uint i=0; i Date: Thu, 21 Apr 2016 16:48:19 +0300 
Subject: [PATCH 04/23] [std.regex] Bit-NFA: implement unicode trie --- std/regex/internal/bitnfa.d | 98 ++++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 11 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 049f5fdac61..3433e4ed62f 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -14,7 +14,7 @@ package(std.regex): import std.regex.internal.ir; debug(std_regex_bitnfa) import std.stdio; - +import std.algorithm; struct HashTab() @@ -118,12 +118,14 @@ private: // and ref count is decreased struct UIntTrie2 { - ushort[] index; // pages --> blocks - ushort[] refCounts; // ref counts for each block - uint[] hashes; // hashes of blocks - uint[] blocks; // linear array with blocks - uint[] scratch; // temporary block - enum blockSize = 2<<8; // size of block + ushort[] index; // pages --> blocks + ushort[] refCounts; // ref counts for each block + uint[] hashes; // hashes of blocks + uint[] blocks; // linear array with blocks + uint[] scratch; // temporary block + enum blockBits = 8; // size of block in bits + enum blockSize = 1<>blockBits]; + //writeln(">blk = ", blk); + return blocks.ptr[blk*blockSize + (ch & (blockSize-1))]; + } + + void setPageRange(string op)(uint val, uint low, uint high) { - return false; // TODO: stub + immutable blk = index[low>>blockBits]; + //writeln(" x[1] == h)) + { + if(scratch[] == blocks[i*blockSize .. 
(i+1)*blockSize]) + { + // re-route to existing page + index[low>>blockBits] = cast(ushort)i; + refCounts[i]++; // inc refs + found = true; + break; + } + } + if(!found) + { + index[low>>blockBits] = cast(ushort)hashes.length; + blocks ~= scratch[]; + refCounts ~= 1; + hashes ~= h; + } + } } void opIndexOpAssign(string op)(uint val, dchar ch) { - // TODO: stub + setPageRange!op(val, ch, ch+1); } void opSliceOpAssign(string op)(uint val, uint start, uint end) { - // TODO: stub + uint startBlk = start >> blockBits; + uint endBlk = end >> blockBits; + uint first = min(startBlk*blockSize+blockSize, end); + setPageRange!op(val, start, first); + foreach(blk; startBlk..endBlk) + setPageRange!op(val, blk*blockSize, (blk+1)*blockSize); + if(first != end) + { + setPageRange!op(val, endBlk*blockSize, end); + } } } +unittest +{ + UIntTrie2 trie = UIntTrie2(); + trie['d'] &= 3; + assert(trie['d'] == 3); + trie['\u0280'] &= 1; + assert(trie['\u0280'] == 1); + import std.uni; + UIntTrie2 trie2 = UIntTrie2(); + auto letters = unicode("L"); + foreach(r; letters.byInterval) + trie2[r.a..r.b] &= 1; + foreach(ch; letters.byCodepoint) + assert(trie2[ch] == 1); + auto space = unicode("WhiteSpace"); + auto trie3 = UIntTrie2(); + foreach(r; space.byInterval) + trie3[r.a..r.b] &= 2; + foreach(ch; space.byCodepoint) + assert(trie3[ch] == 2); +} + // Since there is no way to mark a starting position // we need 2 instances of BitNfa: one to find the end, and the other // to run backwards to find the start. 
From 5c21564f6ff23f75b271e58762672857cd7157fa Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Thu, 21 Apr 2016 17:18:54 +0300 Subject: [PATCH 05/23] [std.regex] Improved hash-table for Bit-NFA --- std/regex/internal/bitnfa.d | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 3433e4ed62f..58164f6000b 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -17,11 +17,11 @@ debug(std_regex_bitnfa) import std.stdio; import std.algorithm; -struct HashTab() +struct HashTab { @disable this(this); - uint opIndex(uint key) + uint opIndex()(uint key) { auto p = locate(key, table); assert(p.occupied); @@ -69,7 +69,7 @@ struct HashTab() } private: - static uint hashOf(uint val) + static uint hashOf()(uint val) { return (val >> 20) ^ (val>>8) ^ val; } @@ -83,12 +83,12 @@ private: Node[] table; size_t items; - static Node* locate(uint key, Node[] table) + static Node* locate()(uint key, Node[] table) { size_t slot = hashOf(key) & (table.length-1); - while(table.ptr[slot].occupied) + while(table[slot].occupied) { - if(table.ptr[slot].key == key) + if(table[slot].key == key) break; slot += 1; if(slot == table.length) @@ -108,6 +108,7 @@ private: *p = v; } } + table = newTable; } } @@ -248,7 +249,7 @@ struct BitNfa { uint[128] asciiTab; // state mask for ascii characters UIntTrie2 uniTab; // state mask for unicode characters - uint[uint] controlFlow; // maps each bit pattern to resulting jumps pattern + HashTab controlFlow; // maps each bit pattern to resulting jumps pattern uint controlFlowMask; // masks all control flow bits uint finalMask; // marks final states terminating the NFA bool empty; // if this engine is empty @@ -609,6 +610,7 @@ unittest // stop on repetition "abcdef1".checkBit("a[a-z]{5}", 1); "ads@email.com".checkBit(`\S+@\S+`); + //"abc".checkBit(`([^ ]*)?`); } unittest From 7d930c1de845b931d8c8d9572f1befe938cee5d7 Mon Sep 17 00:00:00 2001 
From: Dmitry Olshansky Date: Thu, 21 Apr 2016 17:31:27 +0300 Subject: [PATCH 06/23] [std.regex] Even faster hash table for Bit-NFA --- std/regex/internal/bitnfa.d | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 58164f6000b..db5ba78fe35 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -40,8 +40,8 @@ struct HashTab grow(); p = locate(key, table); } - p.occupied = true; - p.key = key; + p.key_ = key; + p.setOccupied(); } p.value = value; } @@ -76,9 +76,12 @@ private: struct Node { - uint key; + uint key_; uint value; - bool occupied; + @property uint key()(){ return key_ & 0x7fff_ffff; } + @property bool occupied()(){ return (key_ & 0x8000_0000) != 0; } + void setOccupied(){ key_ |= 0x8000_0000; } + } Node[] table; size_t items; From 845551b3d6cc5b8f2aca5aa0074d274eae4947a1 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 22 Apr 2016 10:40:10 +0300 Subject: [PATCH 07/23] [std.regex] Bit-NFA fix premature stop on repetitions --- std/regex/internal/bitnfa.d | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index db5ba78fe35..0b4f3f244f1 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -352,7 +352,6 @@ struct BitNfa // pc -> bit number uint[] bitMapping = new uint[re.ir.length]; uint bitCount = 0, nesting=0, lastNonnested=0; - bool stop = false; with(re) outer: for(uint i=0; i Date: Fri, 22 Apr 2016 11:13:05 +0300 Subject: [PATCH 08/23] [std.regex] Bit-NFA - fix inversion of the right sub-portion of regex --- std/regex/internal/bitnfa.d | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 0b4f3f244f1..cdd1d9b2fc0 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -255,7 +255,9 @@ struct BitNfa 
HashTab controlFlow; // maps each bit pattern to resulting jumps pattern uint controlFlowMask; // masks all control flow bits uint finalMask; // marks final states terminating the NFA - bool empty; // if this engine is empty + uint length; // if this engine is empty + + @property bool empty() const { return length == 0; } void combineControlFlow() { @@ -391,12 +393,10 @@ outer: for(uint i=0; i Date: Fri, 22 Apr 2016 11:21:54 +0300 Subject: [PATCH 09/23] [std.regex] Add Bit-NFA to win32/win64 makefiles --- win32.mak | 1 + win64.mak | 1 + 2 files changed, 2 insertions(+) diff --git a/win32.mak b/win32.mak index b093f880b27..743f6f3260a 100644 --- a/win32.mak +++ b/win32.mak @@ -225,6 +225,7 @@ SRC_STD_REGEX= \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ std\regex\internal\shiftor.d \ + std\regex\internal\bitnfa.d \ std\regex\internal\generator.d SRC_STD_C= \ diff --git a/win64.mak b/win64.mak index fe496708924..7a7040a7d6b 100644 --- a/win64.mak +++ b/win64.mak @@ -244,6 +244,7 @@ SRC_STD_REGEX= \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ std\regex\internal\shiftor.d \ + std\regex\internal\bitnfa.d \ std\regex\internal\generator.d SRC_STD_C= \ From 138a2f3dcb80a15b9cacdc521024a628855f7d20 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 22 Apr 2016 11:37:44 +0300 Subject: [PATCH 10/23] [std.regex] Integrate Bit-NFA into std.regex --- std/regex/internal/bitnfa.d | 8 +++++++- std/regex/internal/parser.d | 11 ++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index cdd1d9b2fc0..6eded48171c 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -182,8 +182,9 @@ struct UIntTrie2 mixin("scratch[lowIdx..highIdx] "~op~"= val;"); uint h = hash(scratch); bool found = false; - foreach(i,_; hashes.enumerate.filter!(x => x[1] == h)) + foreach(i,x; hashes) { + if(x != h) continue; if(scratch[] == blocks[i*blockSize .. 
(i+1)*blockSize]) { // re-route to existing page @@ -424,6 +425,11 @@ outer: for(uint i=0; i Date: Fri, 22 Apr 2016 12:22:36 +0300 Subject: [PATCH 11/23] [std.regex] Limit ShiftOr to only apply where it's better than Bit-NFA --- std/regex/internal/shiftor.d | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/std/regex/internal/shiftor.d b/std/regex/internal/shiftor.d index f57dbe20420..4c12bec0833 100644 --- a/std/regex/internal/shiftor.d +++ b/std/regex/internal/shiftor.d @@ -339,25 +339,6 @@ public: t.pc += IRL!(IR.RepeatEnd); } break; - case IR.InfiniteStart, IR.InfiniteQStart: - t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); - goto case IR.InfiniteEnd; //both Q and non-Q - case IR.InfiniteEnd: - case IR.InfiniteQEnd: - auto slot = re.ir[t.pc+1].raw+t.counter; - auto val = hash(t.tab); - if (val in merge[slot]) - goto L_StopThread; // merge equivalent - merge[slot][val] = true; - uint len = re.ir[t.pc].data; - uint pc1, pc2; //branches to take in priority order - if (++t.hops == 32) - goto L_StopThread; - pc1 = t.pc + IRL!(IR.InfiniteEnd); - pc2 = t.pc - len; - trs ~= fork(t, pc2, t.counter); - t.pc = pc1; - break; case IR.GroupStart, IR.GroupEnd: t.pc += IRL!(IR.GroupStart); break; @@ -385,7 +366,7 @@ public: } } - final @property bool empty() const { return n_length == 0; } + final @property bool empty() const { return n_length < 3 && fChar == uint.max; } final @property uint length() const{ return n_length/charSize; } @@ -584,7 +565,7 @@ unittest searches("abdcdyabax".to!String, kick, 1, 3, 8); shiftOrLength(`...`.to!String, 0); - kick = shiftOrLength(`a(b+|c+)x`.to!String, 3); + kick = shiftOrLength(`a(b{1,2}|c{1,2})x`.to!String, 3); searches("ababx".to!String, kick, 2); searches("abaacba".to!String, kick, 3); //expected inexact From 2011a1a75c2a2f4eb11e7c4cb0c7bc8ddab5b1e6 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 22 Apr 2016 12:36:55 +0300 Subject: [PATCH 12/23] Trailing whitespace --- 
std/regex/internal/bitnfa.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 6eded48171c..6e389b5bad1 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -172,7 +172,7 @@ struct UIntTrie2 immutable highIdx = high - low + lowIdx; mixin("blocks[lowIdx..highIdx] "~op~"= val;"); } - else + else { // create a new page refCounts[blk]--; From 20c5e399d3923a0804a3a5209eadd136ce94102f Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 22 Apr 2016 13:18:34 +0300 Subject: [PATCH 13/23] Try to reduce memory usage in CT-regex tests --- std/regex/internal/tests.d | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index a098fcc431c..993164cc2fb 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -353,7 +353,7 @@ unittest void run_tests(alias matchFn)() { int i; - foreach (Char; AliasSeq!( char, wchar, dchar)) + foreach(Char; AliasSeq!( char, wchar, dchar)) (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 alias String = immutable(Char)[]; String produceExpected(M,Range)(auto ref M m, Range fmt) @@ -363,7 +363,7 @@ unittest return app.data; } Regex!(Char) r; - foreach (a, tvd; tv) + foreach(a, tvd; tv) { uint c = tvd.result[0]; debug(std_regex_test) writeln(" Test #", a, " pattern: ", tvd.pattern, " with Char = ", Char.stringof); @@ -380,7 +380,7 @@ unittest assert((c == 'c') ? 
!i : i, "failed to compile pattern "~tvd.pattern); - if (c != 'c') + if(c != 'c') { auto m = matchFn(to!(String)(tvd.input), r); i = !m.empty; @@ -427,11 +427,11 @@ unittest alias Tests = Sequence!(220, tv.length); } else - alias Tests = AliasSeq!(Sequence!(0, 30), Sequence!(235, tv.length-5)); - foreach (a, v; Tests) + alias Tests = AliasSeq!(Sequence!(0, 25), Sequence!(238, tv.length-5)); + foreach(a, v; Tests) (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 enum tvd = tv[v]; - static if (tvd.result == "c") + static if(tvd.result == "c") { static assert(!__traits(compiles, (){ enum r = regex(tvd.pattern, tvd.flags); @@ -449,11 +449,11 @@ unittest bool ok = (c == 'y') ^ m.empty; assert(ok, text("ctRegex: failed to match pattern #", a ,": ", tvd.pattern)); - if (c == 'y') + if(c == 'y') { import std.stdio; auto result = produceExpected(m, tvd.format); - if (result != tvd.replace) + if(result != tvd.replace) writeln("ctRegex mismatch pattern #", a, ": ", tvd.pattern," expected: ", tvd.replace, " vs ", result); } @@ -568,7 +568,7 @@ unittest string s = "a quick brown fox jumps over a lazy dog"; auto r1 = regex("\\b[a-z]+\\b","g"); string[] test; - foreach (m; matchFn(s, r1)) + foreach(m; matchFn(s, r1)) test ~= m.hit; assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); auto free_reg = regex(` @@ -689,7 +689,7 @@ unittest { import std.uni : toUpper; - foreach (i, v; AliasSeq!(string, wstring, dstring)) + foreach(i, v; AliasSeq!(string, wstring, dstring)) { auto baz(Cap)(Cap m) if (is(Cap == Captures!(Cap.String))) @@ -805,7 +805,7 @@ unittest auto r = regex( r"^NAME = (?P[a-zA-Z0-9_]+):*(?P[a-zA-Z0-9_]*)","gm"); auto uniCapturesNew = match(uniFileOld, r); - for (int i = 0; i < 20; i++) + for(int i = 0; i < 20; i++) foreach (matchNew; uniCapturesNew) {} //a second issue with same symptoms auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); @@ -1007,7 +1007,7 @@ unittest unittest { auto ctPat2 = 
regex(r"^[CDF]$", "i"); - foreach (v; ["C", "c", "D", "d", "F", "f"]) + foreach(v; ["C", "c", "D", "d", "F", "f"]) assert(matchAll(v, ctPat2).front.hit == v); } From 3544686b7474507ecee4c1c974f81cf33d6d3df2 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Sun, 24 Apr 2016 17:02:33 +0300 Subject: [PATCH 14/23] WIP fixing multi-pattern match --- std/regex/internal/bitnfa.d | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 6e389b5bad1..49f3f7b6d31 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -158,14 +158,12 @@ struct UIntTrie2 uint opIndex(dchar ch) { immutable blk = index[ch>>blockBits]; - //writeln(">blk = ", blk); return blocks.ptr[blk*blockSize + (ch & (blockSize-1))]; } void setPageRange(string op)(uint val, uint low, uint high) { immutable blk = index[low>>blockBits]; - //writeln(" Date: Wed, 4 May 2016 23:18:14 +0300 Subject: [PATCH 15/23] [std.regex] Get Bit-NFA working on the testsuite --- posix.mak | 2 +- std/regex/internal/backtracking.d | 1 + std/regex/internal/bitnfa.d | 327 ++++++++++------- std/regex/internal/ir.d | 7 +- std/regex/internal/parser.d | 10 +- std/regex/internal/shiftor.d | 15 +- std/regex/internal/tests.d | 573 +----------------------------- std/regex/internal/tests2.d | 268 ++++++++++++++ std/regex/internal/tests3.d | 305 ++++++++++++++++ win32.mak | 2 + win64.mak | 2 + 11 files changed, 807 insertions(+), 705 deletions(-) create mode 100644 std/regex/internal/tests2.d create mode 100644 std/regex/internal/tests3.d diff --git a/posix.mak b/posix.mak index b97bd52d3c6..5ce6959a644 100644 --- a/posix.mak +++ b/posix.mak @@ -192,7 +192,7 @@ PACKAGE_std_experimental_ndslice = package iteration selection slice PACKAGE_std_net = curl isemail PACKAGE_std_range = interfaces package primitives PACKAGE_std_regex = package $(addprefix internal/,generator ir parser \ - backtracking bitnfa tests 
thompson shiftor) + backtracking bitnfa tests tests2 tests3 thompson shiftor) # Modules in std (including those in packages) STD_MODULES=$(call P2MODULES,$(STD_PACKAGES)) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 5427b1b380e..2d14f604178 100644 --- a/std/regex/internal/backtracking.d +++ b/std/regex/internal/backtracking.d @@ -225,6 +225,7 @@ template BacktrackingMatcher(bool CTregex) return val; else { + import std.stdio; if (atEnd) break; search(); diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 49f3f7b6d31..e1ca33a5e32 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -1,3 +1,4 @@ + //Written in the D programming language /* Implementation of a concept "NFA in a word" which is @@ -28,14 +29,20 @@ struct HashTab return p.value; } + bool opBinaryRight(string op:"in")(uint key) + { + auto p = locate(key, table); + return p.occupied; + } + void opIndexAssign(uint value, uint key) { - if(table.length == 0) grow(); + if (table.length == 0) grow(); auto p = locate(key, table); - if(!p.occupied) + if (!p.occupied) { items++; - if(4*items >= table.length*3) + if (4*items >= table.length*3) { grow(); p = locate(key, table); @@ -49,9 +56,9 @@ struct HashTab auto keys() { auto app = appender!(uint[])(); - foreach(i, v; table) + foreach (i, v; table) { - if(v.occupied) + if (v.occupied) app.put(v.key); } return app.data; @@ -60,9 +67,9 @@ struct HashTab auto values() { auto app = appender!(uint[])(); - foreach(i, v; table) + foreach (i, v; table) { - if(v.occupied) + if (v.occupied) app.put(v.value); } return app.data; @@ -89,12 +96,12 @@ private: static Node* locate()(uint key, Node[] table) { size_t slot = hashOf(key) & (table.length-1); - while(table[slot].occupied) + while (table[slot].occupied) { - if(table[slot].key == key) + if (table[slot].key == key) break; slot += 1; - if(slot == table.length) + if (slot == table.length) slot = 0; } return table.ptr+slot; @@ 
-103,9 +110,9 @@ private: void grow() { Node[] newTable = new Node[table.length ? table.length*2 : 4]; - foreach(i, v; table) + foreach (i, v; table) { - if(v.occupied) + if (v.occupied) { auto p = locate(v.key, newTable); *p = v; @@ -134,7 +141,7 @@ struct UIntTrie2 static uint hash(uint[] data) { uint h = 5183; - foreach(v; data) + foreach (v; data) { h = 31*h + v; } @@ -164,7 +171,7 @@ struct UIntTrie2 void setPageRange(string op)(uint val, uint low, uint high) { immutable blk = index[low>>blockBits]; - if(refCounts[blk] == 1) // modify in-place + if (refCounts[blk] == 1) // modify in-place { immutable lowIdx = blk*blockSize + (low & (blockSize-1)); immutable highIdx = high - low + lowIdx; @@ -180,10 +187,10 @@ struct UIntTrie2 mixin("scratch[lowIdx..highIdx] "~op~"= val;"); uint h = hash(scratch); bool found = false; - foreach(i,x; hashes) + foreach (i,x; hashes) { - if(x != h) continue; - if(scratch[] == blocks[i*blockSize .. (i+1)*blockSize]) + if (x != h) continue; + if (scratch[] == blocks[i*blockSize .. 
(i+1)*blockSize]) { // re-route to existing page index[low>>blockBits] = cast(ushort)i; @@ -192,7 +199,7 @@ struct UIntTrie2 break; } } - if(!found) + if (!found) { index[low>>blockBits] = cast(ushort)hashes.length; blocks ~= scratch[]; @@ -213,9 +220,9 @@ struct UIntTrie2 uint endBlk = end >> blockBits; uint first = min(startBlk*blockSize+blockSize, end); setPageRange!op(val, start, first); - foreach(blk; startBlk..endBlk) + foreach (blk; startBlk..endBlk) setPageRange!op(val, blk*blockSize, (blk+1)*blockSize); - if(first != end) + if (first != end) { setPageRange!op(val, endBlk*blockSize, end); } @@ -232,15 +239,15 @@ unittest import std.uni; UIntTrie2 trie2 = UIntTrie2(); auto letters = unicode("L"); - foreach(r; letters.byInterval) + foreach (r; letters.byInterval) trie2[r.a..r.b] &= 1; - foreach(ch; letters.byCodepoint) + foreach (ch; letters.byCodepoint) assert(trie2[ch] == 1); auto space = unicode("WhiteSpace"); auto trie3 = UIntTrie2(); - foreach(r; space.byInterval) + foreach (r; space.byInterval) trie3[r.a..r.b] &= 2; - foreach(ch; space.byCodepoint) + foreach (ch; space.byCodepoint) assert(trie3[ch] == 2); } @@ -266,22 +273,22 @@ struct BitNfa bool nextChoice() { uint i; - for(i=0;i %d %s", j, ir[j].mnemonic); @@ -315,7 +325,7 @@ struct BitNfa case GotoEndOr: paths.push(j+IRL!GotoEndOr+ir[j].data); break; - case OrEnd, Wordboundary, Notwordboundary, Bol, Eol, Nop, GroupStart, GroupEnd: + case OrEnd, Wordboundary, Notwordboundary, Bof, Bol, Eol, Eof, Nop, GroupStart, GroupEnd: paths.push(j+ir[j].length); break; case LookaheadStart, NeglookaheadStart, LookbehindStart, @@ -324,11 +334,11 @@ struct BitNfa break; case InfiniteStart, InfiniteQStart: paths.push(j+IRL!InfiniteStart); - paths.push(j+ir[j].data+IRL!InfiniteEnd); + paths.push(j+IRL!InfiniteStart+ir[j].data+IRL!InfiniteEnd); break; case InfiniteBloomStart: paths.push(j+IRL!InfiniteStart); - paths.push(j+ir[j].data+IRL!InfiniteBloomEnd); + 
paths.push(j+IRL!InfiniteBloomStart+ir[j].data+IRL!InfiniteBloomEnd); break; case InfiniteEnd, InfiniteQEnd: paths.push(j-ir[j].data); @@ -354,16 +364,16 @@ struct BitNfa uint[] bitMapping = new uint[re.ir.length]; uint bitCount = 0, nesting=0, lastNonnested=0; with(re) -outer: for(uint i=0; i= privateUseStart && current <= privateUseEnd) { g.endPattern(current - privateUseStart + 1); - break; } - auto op = Bytecode(IR.Char, current); + else + { + auto op = Bytecode(IR.Char, current); + g.put(op); + } next(); - g.put(op); } } diff --git a/std/regex/internal/shiftor.d b/std/regex/internal/shiftor.d index 4c12bec0833..6066efd45ed 100644 --- a/std/regex/internal/shiftor.d +++ b/std/regex/internal/shiftor.d @@ -374,7 +374,7 @@ public: // has a useful trait: if supplied with valid UTF indexes, // returns only valid UTF indexes // (that given the haystack in question is valid UTF string) - final @trusted bool opCall(ref Input!Char s) + final @trusted bool search(ref Input!Char s) {//@BUG: apparently assumes little endian machines import std.conv : text; import core.stdc.string : memchr; @@ -504,6 +504,12 @@ public: return false; } + final @trusted bool match(ref Input!Char s) + { + //TODO: stub + return false; + } + @system debug static void dump(uint[] table) {//@@@BUG@@@ writef(ln) is @system import std.stdio : writefln; @@ -529,7 +535,7 @@ unittest auto inp = Input!C(source); foreach(r; results) { - kick(inp); + kick.search(inp); dchar ch; size_t idx; assert(inp._index == r, text(inp._index, " vs ", r)); @@ -547,12 +553,12 @@ unittest shiftOrLength(`\ba{2}c\bxyz`.to!String, 6); auto kick = shiftOrLength(`\ba{2}c\b`.to!String, 3); auto inp = Input!Char("aabaacaa"); - assert(kick(inp)); + assert(kick.search(inp)); assert(inp._index == 3, text(Char.stringof," == ", kick.length)); dchar ch; size_t idx; inp.nextChar(ch, idx); - assert(!kick(inp)); + assert(!kick.search(inp)); assert(inp._index == 8, text(Char.stringof," == ", kick.length)); } @@ -568,7 +574,6 @@ unittest 
kick = shiftOrLength(`a(b{1,2}|c{1,2})x`.to!String, 3); searches("ababx".to!String, kick, 2); searches("abaacba".to!String, kick, 3); //expected inexact - } } diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 993164cc2fb..a339e3bed92 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -8,8 +8,6 @@ package(std.regex): import std.conv, std.exception, std.meta, std.range, std.typecons, std.regex; -import std.regex.internal.parser : Escapables; // characters that need escaping - alias Sequence(int B, int E) = staticIota!(B, E); unittest @@ -315,6 +313,7 @@ unittest TestVectors( `\b[A-Za-z0-9.]+(?=(@(?!gmail)))`, "a@gmail,x@com", "y", "$&-$1", "x-@"), TestVectors( `x()(abc)(?=(d)(e)(f)\2)`, "xabcdefabc", "y", "$&", "xabc"), TestVectors( `x()(abc)(?=(d)(e)(f)()\3\4\5)`, "xabcdefdef", "y", "$&", "xabc"), + //lookback TestVectors( `(?<=(ab))\d`, "12ba3ab4", "y", "$&-$1", "4-ab", "i"), TestVectors( `\w(?"); - assert(bmatch("texttext", greed).hit - == "text"); -} - -unittest -{ - import std.algorithm.comparison : equal; - auto cr8 = ctRegex!("^(a)(b)?(c*)"); - auto m8 = bmatch("abcc",cr8); - assert(m8); - assert(m8.captures[1] == "a"); - assert(m8.captures[2] == "b"); - assert(m8.captures[3] == "cc"); - auto cr9 = ctRegex!("q(a|b)*q"); - auto m9 = match("xxqababqyy",cr9); - assert(m9); - assert(equal(bmatch("xxqababqyy",cr9).captures, ["qababq", "b"])); -} - -unittest -{ - import std.algorithm.comparison : equal; - auto rtr = regex("a|b|c"); - enum ctr = regex("a|b|c"); - assert(equal(rtr.ir,ctr.ir)); - //CTFE parser BUG is triggered by group - //in the middle of alternation (at least not first and not last) - enum testCT = regex(`abc|(edf)|xyz`); - auto testRT = regex(`abc|(edf)|xyz`); - assert(equal(testCT.ir,testRT.ir)); -} - -unittest -{ - import std.algorithm.iteration : map; - import std.algorithm.comparison : equal; - enum cx = ctRegex!"(A|B|C)"; - auto mx = match("B",cx); - assert(mx); - assert(equal(mx.captures, [ 
"B", "B"])); - enum cx2 = ctRegex!"(A|B)*"; - assert(match("BAAA",cx2)); - - enum cx3 = ctRegex!("a{3,4}","i"); - auto mx3 = match("AaA",cx3); - assert(mx3); - assert(mx3.captures[0] == "AaA"); - enum cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); - auto mx4 = match("aaaabc", cx4); - assert(mx4); - assert(mx4.captures[0] == "aaaab"); - auto cr8 = ctRegex!("(a)(b)?(c*)"); - auto m8 = bmatch("abcc",cr8); - assert(m8); - assert(m8.captures[1] == "a"); - assert(m8.captures[2] == "b"); - assert(m8.captures[3] == "cc"); - auto cr9 = ctRegex!(".*$", "gm"); - auto m9 = match("First\rSecond", cr9); - assert(m9); - assert(equal(map!"a.hit"(m9), ["First", "", "Second"])); -} - -unittest -{ - import std.algorithm.iteration : map; - import std.algorithm.comparison : equal; -//global matching - void test_body(alias matchFn)() - { - string s = "a quick brown fox jumps over a lazy dog"; - auto r1 = regex("\\b[a-z]+\\b","g"); - string[] test; - foreach(m; matchFn(s, r1)) - test ~= m.hit; - assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); - auto free_reg = regex(` - - abc - \s+ - " - ( - [^"]+ - | \\ " - )+ - " - z - `, "x"); - auto m = match(`abc "quoted string with \" inside"z`,free_reg); - assert(m); - string mails = " hey@you.com no@spam.net "; - auto rm = regex(`@(?<=\S+@)\S+`,"g"); - assert(equal(map!"a[0]"(matchFn(mails, rm)), ["@you.com", "@spam.net"])); - auto m2 = matchFn("First line\nSecond line",regex(".*$","gm")); - assert(equal(map!"a[0]"(m2), ["First line", "", "Second line"])); - auto m2a = matchFn("First line\nSecond line",regex(".+$","gm")); - assert(equal(map!"a[0]"(m2a), ["First line", "Second line"])); - auto m2b = matchFn("First line\nSecond line",regex(".+?$","gm")); - assert(equal(map!"a[0]"(m2b), ["First line", "Second line"])); - debug(std_regex_test) writeln("!!! 
FReD FLAGS test done "~matchFn.stringof~" !!!"); - } - test_body!bmatch(); - test_body!match(); -} - -//tests for accumulated std.regex issues and other regressions -unittest -{ - import std.algorithm.iteration : map; - import std.algorithm.comparison : equal; - void test_body(alias matchFn)() - { - //issue 5857 - //matching goes out of control if ... in (...){x} has .*/.+ - auto c = matchFn("axxxzayyyyyzd",regex("(a.*z){2}d")).captures; - assert(c[0] == "axxxzayyyyyzd"); - assert(c[1] == "ayyyyyz"); - auto c2 = matchFn("axxxayyyyyd",regex("(a.*){2}d")).captures; - assert(c2[0] == "axxxayyyyyd"); - assert(c2[1] == "ayyyyy"); - //issue 2108 - //greedy vs non-greedy - auto nogreed = regex(""); - assert(matchFn("texttext", nogreed).hit - == "text"); - auto greed = regex(""); - assert(matchFn("texttext", greed).hit - == "texttext"); - //issue 4574 - //empty successful match still advances the input - string[] pres, posts, hits; - foreach (m; matchFn("abcabc", regex("","g"))) - { - pres ~= m.pre; - posts ~= m.post; - assert(m.hit.empty); - - } - auto heads = [ - "abcabc", - "abcab", - "abca", - "abc", - "ab", - "a", - "" - ]; - auto tails = [ - "abcabc", - "bcabc", - "cabc", - "abc", - "bc", - "c", - "" - ]; - assert(pres == array(retro(heads))); - assert(posts == tails); - //issue 6076 - //regression on .* - auto re = regex("c.*|d"); - auto m = matchFn("mm", re); - assert(!m); - debug(std_regex_test) writeln("!!! 
FReD REGRESSION test done "~matchFn.stringof~" !!!"); - auto rprealloc = regex(`((.){5}.{1,10}){5}`); - auto arr = array(repeat('0',100)); - auto m2 = matchFn(arr, rprealloc); - assert(m2); - assert(collectException( - regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$") - ) is null); - foreach (ch; [Escapables]) - { - assert(match(to!string(ch),regex(`[\`~ch~`]`))); - assert(!match(to!string(ch),regex(`[^\`~ch~`]`))); - assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`))); - } - //bugzilla 7718 - string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'"; - auto reStrCmd = regex (`(".*")|('.*')`, "g"); - assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)), - [`"/GIT/Ruby Apps/sec"`, `'notimer'`])); - } - test_body!bmatch(); - test_body!match(); -} - -// tests for replace -unittest -{ - void test(alias matchFn)() - { - import std.uni : toUpper; - - foreach(i, v; AliasSeq!(string, wstring, dstring)) - { - auto baz(Cap)(Cap m) - if (is(Cap == Captures!(Cap.String))) - { - return toUpper(m.hit); - } - alias String = v; - assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r")), to!String("c")) - == to!String("ack rapacity")); - assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r"), "g"), to!String("c")) - == to!String("ack capacity")); - assert(std.regex.replace!(matchFn)(to!String("noon"), regex(to!String("^n")), to!String("[$&]")) - == to!String("[n]oon")); - assert(std.regex.replace!(matchFn)( - to!String("test1 test2"), regex(to!String(`\w+`),"g"), to!String("$`:$'") - ) == to!String(": test2 test1 :")); - auto s = std.regex.replace!(baz!(Captures!(String)))(to!String("Strap a rocket engine on a chicken."), - regex(to!String("[ar]"), "g")); - assert(s == "StRAp A Rocket engine on A chicken."); - } - debug(std_regex_test) writeln("!!! 
Replace test done "~matchFn.stringof~" !!!"); - } - test!(bmatch)(); - test!(match)(); -} - -// tests for splitter -unittest -{ - import std.algorithm.comparison : equal; - auto s1 = ", abc, de, fg, hi, "; - auto sp1 = splitter(s1, regex(", *")); - auto w1 = ["", "abc", "de", "fg", "hi", ""]; - assert(equal(sp1, w1)); - - auto s2 = ", abc, de, fg, hi"; - auto sp2 = splitter(s2, regex(", *")); - auto w2 = ["", "abc", "de", "fg", "hi"]; - - uint cnt; - foreach (e; sp2) - { - assert(w2[cnt++] == e); - } - assert(equal(sp2, w2)); -} - -unittest -{ - char[] s1 = ", abc, de, fg, hi, ".dup; - auto sp2 = splitter(s1, regex(", *")); -} - -unittest -{ - import std.algorithm.comparison : equal; - auto s1 = ", abc, de, fg, hi, "; - auto w1 = ["", "abc", "de", "fg", "hi", ""]; - assert(equal(split(s1, regex(", *")), w1[])); -} - -unittest -{ // bugzilla 7141 - string pattern = `[a\--b]`; - assert(match("-", pattern)); - assert(match("b", pattern)); - string pattern2 = `[&-z]`; - assert(match("b", pattern2)); -} -unittest -{//bugzilla 7111 - assert(match("", regex("^"))); -} -unittest -{//bugzilla 7300 - assert(!match("a"d, "aa"d)); -} - -// bugzilla 7551 -unittest -{ - auto r = regex("[]abc]*"); - assert("]ab".matchFirst(r).hit == "]ab"); - assertThrown(regex("[]")); - auto r2 = regex("[]abc--ab]*"); - assert("]ac".matchFirst(r2).hit == "]"); -} - -unittest -{//bugzilla 7674 - assert("1234".replace(regex("^"), "$$") == "$1234"); - assert("hello?".replace(regex(r"\?", "g"), r"\?") == r"hello\?"); - assert("hello?".replace(regex(r"\?", "g"), r"\\?") != r"hello\?"); -} -unittest -{// bugzilla 7679 - import std.algorithm.comparison : equal; - foreach (S; AliasSeq!(string, wstring, dstring)) - (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 - enum re = ctRegex!(to!S(r"\.")); - auto str = to!S("a.b"); - assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")])); - assert(split(str, re) == [to!S("a"), to!S("b")]); - }(); -} -unittest -{//bugzilla 8203 - 
string data = " - NAME = XPAW01_STA:STATION - NAME = XPAW01_STA - "; - auto uniFileOld = data; - auto r = regex( - r"^NAME = (?P[a-zA-Z0-9_]+):*(?P[a-zA-Z0-9_]*)","gm"); - auto uniCapturesNew = match(uniFileOld, r); - for(int i = 0; i < 20; i++) - foreach (matchNew; uniCapturesNew) {} - //a second issue with same symptoms - auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); - match("аллея Театральная", r2); -} -unittest -{// bugzilla 8637 purity of enforce - auto m = match("hello world", regex("world")); - enforce(m); -} - -// bugzilla 8725 -unittest -{ - static italic = regex( r"\* - (?!\s+) - (.*?) - (?!\s+) - \*", "gx" ); - string input = "this * is* interesting, *very* interesting"; - assert(replace(input, italic, "$1") == - "this * is* interesting, very interesting"); -} - -// bugzilla 8349 -unittest -{ - enum peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)"; - enum peakRegex = ctRegex!(peakRegexStr); - //note that the regex pattern itself is probably bogus - assert(match(r"\>wgEncode-blah-Tfbs.narrow", peakRegex)); -} - -// bugzilla 9211 -unittest -{ - import std.algorithm.comparison : equal; - auto rx_1 = regex(r"^(\w)*(\d)"); - auto m = match("1234", rx_1); - assert(equal(m.front, ["1234", "3", "4"])); - auto rx_2 = regex(r"^([0-9])*(\d)"); - auto m2 = match("1234", rx_2); - assert(equal(m2.front, ["1234", "3", "4"])); -} - -// bugzilla 9280 -unittest -{ - string tomatch = "a!b@c"; - static r = regex(r"^(?P.*?)!(?P.*?)@(?P.*?)$"); - auto nm = match(tomatch, r); - assert(nm); - auto c = nm.captures; - assert(c[1] == "a"); - assert(c["nick"] == "a"); -} - - -// bugzilla 9579 -unittest -{ - char[] input = ['a', 'b', 'c']; - string format = "($1)"; - // used to give a compile error: - auto re = regex(`(a)`, "g"); - auto r = replace(input, re, format); - assert(r == "(a)bc"); -} - -// bugzilla 9634 -unittest -{ - auto re = ctRegex!"(?:a+)"; - assert(match("aaaa", re).hit == "aaaa"); -} - -//bugzilla 10798 -unittest -{ - auto cr = 
ctRegex!("[abcd--c]*"); - auto m = "abc".match(cr); - assert(m); - assert(m.hit == "ab"); -} - -// bugzilla 10913 -unittest -{ - @system static string foo(const(char)[] s) - { - return s.dup; - } - @safe static string bar(const(char)[] s) - { - return s.dup; - } - () @system { - replace!((a) => foo(a.hit))("blah", regex(`a`)); - }(); - () @safe { - replace!((a) => bar(a.hit))("blah", regex(`a`)); - }(); -} - -// bugzilla 11262 -unittest -{ - enum reg = ctRegex!(r",", "g"); - auto str = "This,List"; - str = str.replace(reg, "-"); - assert(str == "This-List"); -} - -// bugzilla 11775 -unittest -{ - assert(collectException(regex("a{1,0}"))); -} - -// bugzilla 11839 -unittest -{ - import std.algorithm.comparison : equal; - assert(regex(`(?P\w+)`).namedCaptures.equal(["var1"])); - assert(collectException(regex(`(?P<1>\w+)`))); - assert(regex(`(?P\w+)`).namedCaptures.equal(["v1"])); - assert(regex(`(?P<__>\w+)`).namedCaptures.equal(["__"])); - assert(regex(`(?P<я>\w+)`).namedCaptures.equal(["я"])); -} - -// bugzilla 12076 -unittest -{ - auto RE = ctRegex!(r"(?abc)`); - assert(collectException("abc".matchFirst(r)["b"])); -} - -// bugzilla 12691 -unittest -{ - assert(bmatch("e@", "^([a-z]|)*$").empty); - assert(bmatch("e@", ctRegex!`^([a-z]|)*$`).empty); -} - -//bugzilla 12713 -unittest -{ - assertThrown(regex("[[a-z]([a-z]|(([[a-z])))")); -} - -//bugzilla 12747 -unittest -{ - assertThrown(regex(`^x(\1)`)); - assertThrown(regex(`^(x(\1))`)); - assertThrown(regex(`^((x)(?=\1))`)); -} - -// bugzilla 14504 -unittest -{ - auto p = ctRegex!("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?" 
~ - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); -} - -// bugzilla 14529 -unittest -{ - auto ctPat2 = regex(r"^[CDF]$", "i"); - foreach(v; ["C", "c", "D", "d", "F", "f"]) - assert(matchAll(v, ctPat2).front.hit == v); -} - -// bugzilla 14615 -unittest -{ - import std.stdio : writeln; - import std.regex : replaceFirst, replaceFirstInto, regex; - import std.array : appender; - - auto example = "Hello, world!"; - auto pattern = regex("^Hello, (bug)"); // won't find this one - auto result = replaceFirst(example, pattern, "$1 Sponge Bob"); - assert(result == "Hello, world!"); // Ok. - - auto sink = appender!string; - replaceFirstInto(sink, example, pattern, "$1 Sponge Bob"); - assert(sink.data == "Hello, world!"); - replaceAllInto(sink, example, pattern, "$1 Sponge Bob"); - assert(sink.data == "Hello, world!Hello, world!"); -} - -// bugzilla 15573 -unittest -{ - auto rx = regex("[c d]", "x"); - assert("a b".matchFirst(rx)); -} - -// bugzilla 15864 -unittest -{ - regex(`("); + assert(bmatch("texttext", greed).hit + == "text"); +} + +unittest +{ + auto cr8 = ctRegex!("^(a)(b)?(c*)"); + auto m8 = bmatch("abcc",cr8); + assert(m8); + assert(m8.captures[1] == "a"); + assert(m8.captures[2] == "b"); + assert(m8.captures[3] == "cc"); + auto cr9 = ctRegex!("q(a|b)*q"); + auto m9 = match("xxqababqyy",cr9); + assert(m9); + assert(equal(bmatch("xxqababqyy",cr9).captures, ["qababq", "b"])); +} + +unittest +{ + auto rtr = regex("a|b|c"); + enum ctr = regex("a|b|c"); + assert(equal(rtr.ir,ctr.ir)); + //CTFE parser BUG is triggered by group + //in the middle of alternation (at least not first and not last) + enum testCT = regex(`abc|(edf)|xyz`); + auto testRT = regex(`abc|(edf)|xyz`); + assert(equal(testCT.ir,testRT.ir)); +} + +unittest +{ + enum cx = ctRegex!"(A|B|C)"; + auto mx = match("B",cx); + assert(mx); + assert(equal(mx.captures, [ "B", "B"])); + enum cx2 = ctRegex!"(A|B)*"; + assert(match("BAAA",cx2)); + + enum cx3 = ctRegex!("a{3,4}","i"); + auto mx3 = match("AaA",cx3); + assert(mx3); + 
assert(mx3.captures[0] == "AaA"); + enum cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); + auto mx4 = match("aaaabc", cx4); + assert(mx4); + assert(mx4.captures[0] == "aaaab"); + auto cr8 = ctRegex!("(a)(b)?(c*)"); + auto m8 = bmatch("abcc",cr8); + assert(m8); + assert(m8.captures[1] == "a"); + assert(m8.captures[2] == "b"); + assert(m8.captures[3] == "cc"); + auto cr9 = ctRegex!(".*$", "gm"); + auto m9 = match("First\rSecond", cr9); + assert(m9); + assert(equal(map!"a.hit"(m9), ["First", "", "Second"])); +} + +unittest +{ +//global matching + void test_body(alias matchFn)() + { + string s = "a quick brown fox jumps over a lazy dog"; + auto r1 = regex("\\b[a-z]+\\b","g"); + string[] test; + foreach(m; matchFn(s, r1)) + test ~= m.hit; + assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); + auto free_reg = regex(` + + abc + \s+ + " + ( + [^"]+ + | \\ " + )+ + " + z + `, "x"); + auto m = match(`abc "quoted string with \" inside"z`,free_reg); + assert(m); + string mails = " hey@you.com no@spam.net "; + auto rm = regex(`@(?<=\S+@)\S+`,"g"); + assert(equal(map!"a[0]"(matchFn(mails, rm)), ["@you.com", "@spam.net"])); + auto m2 = matchFn("First line\nSecond line",regex(".*$","gm")); + assert(equal(map!"a[0]"(m2), ["First line", "", "Second line"])); + auto m2a = matchFn("First line\nSecond line",regex(".+$","gm")); + assert(equal(map!"a[0]"(m2a), ["First line", "Second line"])); + auto m2b = matchFn("First line\nSecond line",regex(".+?$","gm")); + assert(equal(map!"a[0]"(m2b), ["First line", "Second line"])); + debug(std_regex_test) writeln("!!! FReD FLAGS test done "~matchFn.stringof~" !!!"); + } + test_body!bmatch(); + test_body!match(); +} + +//tests for accumulated std.regex issues and other regressions +unittest +{ + void test_body(alias matchFn)() + { + //issue 5857 + //matching goes out of control if ... 
in (...){x} has .*/.+ + auto c = matchFn("axxxzayyyyyzd",regex("(a.*z){2}d")).captures; + assert(c[0] == "axxxzayyyyyzd"); + assert(c[1] == "ayyyyyz"); + auto c2 = matchFn("axxxayyyyyd",regex("(a.*){2}d")).captures; + assert(c2[0] == "axxxayyyyyd"); + assert(c2[1] == "ayyyyy"); + //issue 2108 + //greedy vs non-greedy + auto nogreed = regex(""); + assert(matchFn("texttext", nogreed).hit + == "text"); + auto greed = regex(""); + assert(matchFn("texttext", greed).hit + == "texttext"); + //issue 4574 + //empty successful match still advances the input + string[] pres, posts, hits; + foreach(m; matchFn("abcabc", regex("","g"))) { + pres ~= m.pre; + posts ~= m.post; + assert(m.hit.empty); + + } + auto heads = [ + "abcabc", + "abcab", + "abca", + "abc", + "ab", + "a", + "" + ]; + auto tails = [ + "abcabc", + "bcabc", + "cabc", + "abc", + "bc", + "c", + "" + ]; + assert(pres == array(retro(heads))); + assert(posts == tails); + //issue 6076 + //regression on .* + auto re = regex("c.*|d"); + auto m = matchFn("mm", re); + assert(!m); + debug(std_regex_test) writeln("!!! 
FReD REGRESSION test done "~matchFn.stringof~" !!!"); + auto rprealloc = regex(`((.){5}.{1,10}){5}`); + auto arr = array(repeat('0',100)); + auto m2 = matchFn(arr, rprealloc); + assert(m2); + assert(collectException( + regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$") + ) is null); + foreach(ch; [Escapables]) + { + assert(match(to!string(ch),regex(`[\`~ch~`]`))); + assert(!match(to!string(ch),regex(`[^\`~ch~`]`))); + assert(match(to!string(ch),regex(`[\`~ch~`-\`~ch~`]`))); + } + //bugzilla 7718 + string strcmd = "./myApp.rb -os OSX -path \"/GIT/Ruby Apps/sec\" -conf 'notimer'"; + auto reStrCmd = regex (`(".*")|('.*')`, "g"); + assert(equal(map!"a[0]"(matchFn(strcmd, reStrCmd)), + [`"/GIT/Ruby Apps/sec"`, `'notimer'`])); + } + test_body!bmatch(); + test_body!match(); +} + +// tests for replace +unittest +{ + void test(alias matchFn)() + { + import std.uni : toUpper; + + foreach(i, v; AliasSeq!(string, wstring, dstring)) + { + auto baz(Cap)(Cap m) + if (is(Cap == Captures!(Cap.String))) + { + return toUpper(m.hit); + } + alias String = v; + assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r")), to!String("c")) + == to!String("ack rapacity")); + assert(std.regex.replace!(matchFn)(to!String("ark rapacity"), regex(to!String("r"), "g"), to!String("c")) + == to!String("ack capacity")); + assert(std.regex.replace!(matchFn)(to!String("noon"), regex(to!String("^n")), to!String("[$&]")) + == to!String("[n]oon")); + assert(std.regex.replace!(matchFn)(to!String("test1 test2"), regex(to!String(`\w+`),"g"), to!String("$`:$'")) + == to!String(": test2 test1 :")); + auto s = std.regex.replace!(baz!(Captures!(String)))(to!String("Strap a rocket engine on a chicken."), + regex(to!String("[ar]"), "g")); + assert(s == "StRAp A Rocket engine on A chicken.", text(s)); + } + debug(std_regex_test) writeln("!!! 
Replace test done "~matchFn.stringof~" !!!"); + } + test!(bmatch)(); + test!(match)(); +} + +// tests for splitter +unittest +{ + auto s1 = ", abc, de, fg, hi, "; + auto sp1 = splitter(s1, regex(", *")); + auto w1 = ["", "abc", "de", "fg", "hi", ""]; + assert(equal(sp1, w1)); + + auto s2 = ", abc, de, fg, hi"; + auto sp2 = splitter(s2, regex(", *")); + auto w2 = ["", "abc", "de", "fg", "hi"]; + + uint cnt; + foreach(e; sp2) { + assert(w2[cnt++] == e); + } + assert(equal(sp2, w2)); +} + +unittest +{ + char[] s1 = ", abc, de, fg, hi, ".dup; + auto sp2 = splitter(s1, regex(", *")); +} + +unittest +{ + auto s1 = ", abc, de, fg, hi, "; + auto w1 = ["", "abc", "de", "fg", "hi", ""]; + assert(equal(split(s1, regex(", *")), w1[])); +} diff --git a/std/regex/internal/tests3.d b/std/regex/internal/tests3.d new file mode 100644 index 00000000000..237a221738b --- /dev/null +++ b/std/regex/internal/tests3.d @@ -0,0 +1,305 @@ +/* + Regualar expressions package test suite part 3. +*/ +module std.regex.internal.tests3; + +package(std.regex): + +import std.algorithm, std.conv, std.exception, std.meta, std.range, + std.typecons, std.regex; + +unittest +{ // bugzilla 7141 + string pattern = `[a\--b]`; + assert(match("-", pattern)); + assert(match("b", pattern)); + string pattern2 = `[&-z]`; + assert(match("b", pattern2)); +} +unittest +{//bugzilla 7111 + assert(match("", regex("^"))); +} +unittest +{//bugzilla 7300 + assert(!match("a"d, "aa"d)); +} + +// bugzilla 7551 +unittest +{ + auto r = regex("[]abc]*"); + assert("]ab".matchFirst(r).hit == "]ab"); + assertThrown(regex("[]")); + auto r2 = regex("[]abc--ab]*"); + assert("]ac".matchFirst(r2).hit == "]"); +} + +unittest +{//bugzilla 7674 + assert("1234".replace(regex("^"), "$$") == "$1234"); + assert("hello?".replace(regex(r"\?", "g"), r"\?") == r"hello\?"); + assert("hello?".replace(regex(r"\?", "g"), r"\\?") != r"hello\?"); +} +unittest +{// bugzilla 7679 + foreach(S; AliasSeq!(string, wstring, dstring)) + (){ // avoid slow 
optimizations for large functions @@@BUG@@@ 2396 + enum re = ctRegex!(to!S(r"\.")); + auto str = to!S("a.b"); + assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")])); + assert(split(str, re) == [to!S("a"), to!S("b")]); + }(); +} +unittest +{//bugzilla 8203 + string data = " + NAME = XPAW01_STA:STATION + NAME = XPAW01_STA + "; + auto uniFileOld = data; + auto r = regex( + r"^NAME = (?P[a-zA-Z0-9_]+):*(?P[a-zA-Z0-9_]*)","gm"); + auto uniCapturesNew = match(uniFileOld, r); + for(int i = 0; i < 20; i++) + foreach (matchNew; uniCapturesNew) {} + //a second issue with same symptoms + auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); + match("аллея Театральная", r2); +} +unittest +{// bugzilla 8637 purity of enforce + auto m = match("hello world", regex("world")); + enforce(m); +} + +// bugzilla 8725 +unittest +{ + static italic = regex( r"\* + (?!\s+) + (.*?) + (?!\s+) + \*", "gx" ); + string input = "this * is* interesting, *very* interesting"; + assert(replace(input, italic, "$1") == + "this * is* interesting, very interesting"); +} + +// bugzilla 8349 +unittest +{ + enum peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)"; + enum peakRegex = ctRegex!(peakRegexStr); + //note that the regex pattern itself is probably bogus + assert(match(r"\>wgEncode-blah-Tfbs.narrow", peakRegex)); +} + +// bugzilla 9211 +unittest +{ + auto rx_1 = regex(r"^(\w)*(\d)"); + auto m = match("1234", rx_1); + assert(equal(m.front, ["1234", "3", "4"])); + auto rx_2 = regex(r"^([0-9])*(\d)"); + auto m2 = match("1234", rx_2); + assert(equal(m2.front, ["1234", "3", "4"])); +} + +// bugzilla 9280 +unittest +{ + string tomatch = "a!b@c"; + static r = regex(r"^(?P.*?)!(?P.*?)@(?P.*?)$"); + auto nm = match(tomatch, r); + assert(nm); + auto c = nm.captures; + assert(c[1] == "a"); + assert(c["nick"] == "a"); +} + + +// bugzilla 9579 +unittest +{ + char[] input = ['a', 'b', 'c']; + string format = "($1)"; + // used to give a compile error: + auto re = regex(`(a)`, "g"); + 
auto r = replace(input, re, format); + assert(r == "(a)bc"); +} + +// bugzilla 9634 +unittest +{ + auto re = ctRegex!"(?:a+)"; + assert(match("aaaa", re).hit == "aaaa"); +} + +//bugzilla 10798 +unittest +{ + auto cr = ctRegex!("[abcd--c]*"); + auto m = "abc".match(cr); + assert(m); + assert(m.hit == "ab"); +} + +// bugzilla 10913 +unittest +{ + @system static string foo(const(char)[] s) + { + return s.dup; + } + @safe static string bar(const(char)[] s) + { + return s.dup; + } + () @system { + replace!((a) => foo(a.hit))("blah", regex(`a`)); + }(); + () @safe { + replace!((a) => bar(a.hit))("blah", regex(`a`)); + }(); +} + +// bugzilla 11262 +unittest +{ + enum reg = ctRegex!(r",", "g"); + auto str = "This,List"; + str = str.replace(reg, "-"); + assert(str == "This-List"); +} + +// bugzilla 11775 +unittest +{ + assert(collectException(regex("a{1,0}"))); +} + +// bugzilla 11839 +unittest +{ + assert(regex(`(?P<var1>\w+)`).namedCaptures.equal(["var1"])); + assert(collectException(regex(`(?P<1>\w+)`))); + assert(regex(`(?P<v1>\w+)`).namedCaptures.equal(["v1"])); + assert(regex(`(?P<__>\w+)`).namedCaptures.equal(["__"])); + assert(regex(`(?P<я>\w+)`).namedCaptures.equal(["я"])); +} + +// bugzilla 12076 +unittest +{ + auto RE = ctRegex!(r"(?abc)`); + assert(collectException("abc".matchFirst(r)["b"])); +} + +// bugzilla 12691 +unittest +{ + assert(bmatch("e@", "^([a-z]|)*$").empty); + assert(bmatch("e@", ctRegex!`^([a-z]|)*$`).empty); +} + +//bugzilla 12713 +unittest +{ + assertThrown(regex("[[a-z]([a-z]|(([[a-z])))")); +} + +//bugzilla 12747 +unittest +{ + assertThrown(regex(`^x(\1)`)); + assertThrown(regex(`^(x(\1))`)); + assertThrown(regex(`^((x)(?=\1))`)); +} + +// bugzilla 14504 +unittest +{ + auto p = ctRegex!("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?" 
~ + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); +} + +// bugzilla 14529 +unittest +{ + auto ctPat2 = regex(r"^[CDF]$", "i"); + foreach(v; ["C", "c", "D", "d", "F", "f"]) + assert(matchAll(v, ctPat2).front.hit == v); +} + +// bugzilla 14615 +unittest +{ + import std.stdio : writeln; + import std.regex : replaceFirst, replaceFirstInto, regex; + import std.array : appender; + + auto example = "Hello, world!"; + auto pattern = regex("^Hello, (bug)"); // won't find this one + auto result = replaceFirst(example, pattern, "$1 Sponge Bob"); + assert(result == "Hello, world!"); // Ok. + + auto sink = appender!string; + replaceFirstInto(sink, example, pattern, "$1 Sponge Bob"); + assert(sink.data == "Hello, world!"); + replaceAllInto(sink, example, pattern, "$1 Sponge Bob"); + assert(sink.data == "Hello, world!Hello, world!"); +} + +// bugzilla 15573 +unittest +{ + auto rx = regex("[c d]", "x"); + assert("a b".matchFirst(rx)); +} + +// bugzilla 15864 +unittest +{ + regex(`( Date: Sat, 7 May 2016 18:23:22 +0300 Subject: [PATCH 16/23] [std.regex] Re-style pass --- std/regex/internal/ir.d | 24 ++++++++++++------------ std/regex/internal/parser.d | 4 ++-- std/regex/internal/shiftor.d | 6 +++--- std/regex/internal/tests.d | 14 +++++++------- std/regex/internal/tests2.d | 10 +++++----- std/regex/internal/tests3.d | 6 +++--- 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 1658831f5bc..6d47becc401 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -479,7 +479,7 @@ interface Kickstart(Char){ assert(!empty); auto val = data[$ - 1]; data = data[0 .. 
$ - 1]; - if(!__ctfe) + if (!__ctfe) cast(void)data.assumeSafeAppend(); return val; } @@ -499,23 +499,23 @@ interface Kickstart(Char){ Stack!(Tuple!(uint, uint, uint)) stack; uint start = 0; uint end = cast(uint)code.length; - for(;;) + for (;;) { - for(uint pc = start; pc < end; ) + for (uint pc = start; pc < end; ) { uint len = code[pc].length; - if(code[pc].code == IR.GotoEndOr) + if (code[pc].code == IR.GotoEndOr) break; //pick next alternation branch - if(code[pc].isAtom) + if (code[pc].isAtom) { rev[revPc - len .. revPc] = code[pc .. pc + len]; revPc -= len; pc += len; } - else if(code[pc].isStart || code[pc].isEnd) + else if (code[pc].isStart || code[pc].isEnd) { //skip over other embedded lookbehinds they are reversed - if(code[pc].code == IR.LookbehindStart + if (code[pc].code == IR.LookbehindStart || code[pc].code == IR.NeglookbehindStart) { uint blockLen = len + code[pc].data @@ -529,15 +529,15 @@ interface Kickstart(Char){ uint secLen = code[second].length; rev[revPc - secLen .. revPc] = code[second .. 
second + secLen]; revPc -= secLen; - if(code[pc].code == IR.OrStart) + if (code[pc].code == IR.OrStart) { //we pass len bytes forward, but secLen in reverse uint revStart = revPc - (second + len - secLen - pc); uint r = revStart; uint i = pc + IRL!(IR.OrStart); - while(code[i].code == IR.Option) + while (code[i].code == IR.Option) { - if(code[i - 1].code != IR.OrStart) + if (code[i - 1].code != IR.OrStart) { assert(code[i - 1].code == IR.GotoEndOr); rev[r - 1] = code[i - 1]; @@ -546,7 +546,7 @@ interface Kickstart(Char){ auto newStart = i + IRL!(IR.Option); auto newEnd = newStart + code[i].data; auto newRpc = r + code[i].data + IRL!(IR.Option); - if(code[newEnd].code != IR.OrEnd) + if (code[newEnd].code != IR.OrEnd) { newRpc--; } @@ -562,7 +562,7 @@ interface Kickstart(Char){ pc += len; } } - if(stack.empty) + if (stack.empty) break; start = stack.top[0]; end = stack.top[1]; diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 3429dc5e6b6..8c7568a8e12 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -1607,10 +1607,10 @@ struct Parser(R, Generator) if (!(flags & RegexInfo.oneShot)) { kickstart = new ShiftOr!Char(zis); - if(kickstart.empty) + if (kickstart.empty) { kickstart = new BitMatcher!Char(zis); - if(kickstart.empty) + if (kickstart.empty) kickstart = null; } } diff --git a/std/regex/internal/shiftor.d b/std/regex/internal/shiftor.d index 6066efd45ed..404c3c52679 100644 --- a/std/regex/internal/shiftor.d +++ b/std/regex/internal/shiftor.d @@ -533,7 +533,7 @@ unittest auto searches(C)(const (C)[] source, ShiftOr!C kick, uint[] results...) 
{ auto inp = Input!C(source); - foreach(r; results) + foreach (r; results) { kick.search(inp); dchar ch; @@ -543,7 +543,7 @@ unittest } } - foreach(i, Char; AliasSeq!(char, wchar, dchar)) + foreach (i, Char; AliasSeq!(char, wchar, dchar)) { alias String = immutable(Char)[]; shiftOrLength(`abc`.to!String, 3); @@ -562,7 +562,7 @@ unittest assert(inp._index == 8, text(Char.stringof," == ", kick.length)); } - foreach(i, Char; AliasSeq!(char, wchar, dchar)) + foreach (i, Char; AliasSeq!(char, wchar, dchar)) { alias String = immutable(Char)[]; auto kick = shiftOrLength(`abc[a-z]`.to!String, 4); diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index a339e3bed92..6a3db845991 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -352,7 +352,7 @@ unittest void run_tests(alias matchFn)() { int i; - foreach(Char; AliasSeq!( char, wchar, dchar)) + foreach (Char; AliasSeq!( char, wchar, dchar)) (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 alias String = immutable(Char)[]; String produceExpected(M,Range)(auto ref M m, Range fmt) @@ -362,7 +362,7 @@ unittest return app.data; } Regex!(Char) r; - foreach(a, tvd; tv) + foreach (a, tvd; tv) { uint c = tvd.result[0]; debug(std_regex_test) writeln(" Test #", a, " pattern: ", tvd.pattern, " with Char = ", Char.stringof); @@ -379,7 +379,7 @@ unittest assert((c == 'c') ? 
!i : i, "failed to compile pattern "~tvd.pattern); - if(c != 'c') + if (c != 'c') { auto m = matchFn(to!(String)(tvd.input), r); i = !m.empty; @@ -427,10 +427,10 @@ unittest } else alias Tests = AliasSeq!(Sequence!(0, 25)); - foreach(a, v; Tests) + foreach (a, v; Tests) (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 enum tvd = tv[v]; - static if(tvd.result == "c") + static if (tvd.result == "c") { static assert(!__traits(compiles, (){ enum r = regex(tvd.pattern, tvd.flags); @@ -448,11 +448,11 @@ unittest bool ok = (c == 'y') ^ m.empty; assert(ok, text("ctRegex: failed to match pattern #", a ,": ", tvd.pattern)); - if(c == 'y') + if (c == 'y') { import std.stdio; auto result = produceExpected(m, tvd.format); - if(result != tvd.replace) + if (result != tvd.replace) writeln("ctRegex mismatch pattern #", a, ": ", tvd.pattern," expected: ", tvd.replace, " vs ", result); } diff --git a/std/regex/internal/tests2.d b/std/regex/internal/tests2.d index 72d3c011763..19286fa31f8 100644 --- a/std/regex/internal/tests2.d +++ b/std/regex/internal/tests2.d @@ -91,7 +91,7 @@ unittest string s = "a quick brown fox jumps over a lazy dog"; auto r1 = regex("\\b[a-z]+\\b","g"); string[] test; - foreach(m; matchFn(s, r1)) + foreach (m; matchFn(s, r1)) test ~= m.hit; assert(equal(test, [ "a", "quick", "brown", "fox", "jumps", "over", "a", "lazy", "dog"])); auto free_reg = regex(` @@ -147,7 +147,7 @@ unittest //issue 4574 //empty successful match still advances the input string[] pres, posts, hits; - foreach(m; matchFn("abcabc", regex("","g"))) { + foreach (m; matchFn("abcabc", regex("","g"))) { pres ~= m.pre; posts ~= m.post; assert(m.hit.empty); @@ -186,7 +186,7 @@ unittest assert(collectException( regex(r"^(import|file|binary|config)\s+([^\(]+)\(?([^\)]*)\)?\s*$") ) is null); - foreach(ch; [Escapables]) + foreach (ch; [Escapables]) { assert(match(to!string(ch),regex(`[\`~ch~`]`))); assert(!match(to!string(ch),regex(`[^\`~ch~`]`))); @@ -209,7 +209,7 @@ unittest { 
import std.uni : toUpper; - foreach(i, v; AliasSeq!(string, wstring, dstring)) + foreach (i, v; AliasSeq!(string, wstring, dstring)) { auto baz(Cap)(Cap m) if (is(Cap == Captures!(Cap.String))) @@ -248,7 +248,7 @@ unittest auto w2 = ["", "abc", "de", "fg", "hi"]; uint cnt; - foreach(e; sp2) { + foreach (e; sp2) { assert(w2[cnt++] == e); } assert(equal(sp2, w2)); diff --git a/std/regex/internal/tests3.d b/std/regex/internal/tests3.d index 237a221738b..07541fbb62d 100644 --- a/std/regex/internal/tests3.d +++ b/std/regex/internal/tests3.d @@ -43,7 +43,7 @@ unittest } unittest {// bugzilla 7679 - foreach(S; AliasSeq!(string, wstring, dstring)) + foreach (S; AliasSeq!(string, wstring, dstring)) (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 enum re = ctRegex!(to!S(r"\.")); auto str = to!S("a.b"); @@ -61,7 +61,7 @@ unittest auto r = regex( r"^NAME = (?P[a-zA-Z0-9_]+):*(?P[a-zA-Z0-9_]*)","gm"); auto uniCapturesNew = match(uniFileOld, r); - for(int i = 0; i < 20; i++) + for (int i = 0; i < 20; i++) foreach (matchNew; uniCapturesNew) {} //a second issue with same symptoms auto r2 = regex(`([а-яА-Я\-_]+\s*)+(?<=[\s\.,\^])`); @@ -261,7 +261,7 @@ unittest unittest { auto ctPat2 = regex(r"^[CDF]$", "i"); - foreach(v; ["C", "c", "D", "d", "F", "f"]) + foreach (v; ["C", "c", "D", "d", "F", "f"]) assert(matchAll(v, ctPat2).front.hit == v); } From 3bed8491259b7dcc1e46e1bd3b5cc2ab0e72b924 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Sat, 7 May 2016 20:51:30 +0300 Subject: [PATCH 17/23] [std.regex] Save memory by building regex in 3 parts --- win32.mak | 28 +++++++++++++++++++--------- win64.mak | 26 +++++++++++++++++--------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/win32.mak b/win32.mak index fd0cc4a8e94..1d4a6409795 100644 --- a/win32.mak +++ b/win32.mak @@ -220,15 +220,19 @@ SRC_STD_RANGE= \ SRC_STD_REGEX= \ std\regex\internal\ir.d \ std\regex\package.d \ - std\regex\internal\parser.d \ std\regex\internal\tests.d \ - 
std\regex\internal\tests2.d \ - std\regex\internal\tests3.d \ + std\regex\internal\generator.d + +SRC_STD_REGEX_2 = \ + std\regex\internal\parser.d \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ + std\regex\internal\tests2.d + +SRC_STD_REGEX_3 = \ std\regex\internal\shiftor.d \ std\regex\internal\bitnfa.d \ - std\regex\internal\generator.d + std\regex\internal\tests3.d SRC_STD_C= \ std\c\process.d \ @@ -355,6 +359,8 @@ SRC_TO_COMPILE= \ $(SRC_STD_NET) \ $(SRC_STD_RANGE) \ $(SRC_STD_REGEX) \ + $(SRC_STD_REGEX_2) \ + $(SRC_STD_REGEX_3) \ $(SRC_STD_C) \ $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) \ @@ -574,6 +580,8 @@ UNITTEST_OBJS= \ unittest8d.obj \ unittest8e.obj \ unittest8f.obj \ + unittest8g.obj \ + unittest8h.obj \ unittest9a.obj unittest : $(LIB) @@ -588,11 +596,13 @@ unittest : $(LIB) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest6.obj $(SRC_STD_6) $(SRC_STD_CONTAINER) $(SRC_STD_EXP_ALLOC) $(SRC_STD_EXP_LOGGER) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest7.obj $(SRC_STD_7) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8a.obj $(SRC_STD_REGEX) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8b.obj $(SRC_STD_NET) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8c.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8d.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8e.obj $(SRC_ETC) $(SRC_ETC_C) - $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8f.obj $(SRC_STD_EXP) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8b.obj $(SRC_STD_REGEX_2) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8c.obj $(SRC_STD_REGEX_3) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8d.obj $(SRC_STD_NET) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8e.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8f.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) 
$(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8g.obj $(SRC_ETC) $(SRC_ETC_C) + $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest8h.obj $(SRC_STD_EXP) $(DMD) $(UDFLAGS) -L/co -c -unittest -ofunittest9a.obj $(SRC_STD_EXP_NDSLICE) $(DMD) $(UDFLAGS) -L/co -unittest unittest.d $(UNITTEST_OBJS) \ $(ZLIB) $(DRUNTIMELIB) diff --git a/win64.mak b/win64.mak index d352d322b8a..9c2a2a487aa 100644 --- a/win64.mak +++ b/win64.mak @@ -239,15 +239,19 @@ SRC_STD_RANGE= \ SRC_STD_REGEX= \ std\regex\internal\ir.d \ std\regex\package.d \ - std\regex\internal\parser.d \ std\regex\internal\tests.d \ - std\regex\internal\tests2.d \ - std\regex\internal\tests3.d \ + std\regex\internal\generator.d + +SRC_STD_REGEX_2 = \ + std\regex\internal\parser.d \ std\regex\internal\backtracking.d \ std\regex\internal\thompson.d \ + std\regex\internal\tests2.d + +SRC_STD_REGEX_3 = \ std\regex\internal\shiftor.d \ std\regex\internal\bitnfa.d \ - std\regex\internal\generator.d + std\regex\internal\tests3.d SRC_STD_C= \ std\c\process.d \ @@ -374,6 +378,8 @@ SRC_TO_COMPILE= \ $(SRC_STD_NET) \ $(SRC_STD_RANGE) \ $(SRC_STD_REGEX) \ + $(SRC_STD_REGEX_2) \ + $(SRC_STD_REGEX_3) \ $(SRC_STD_C) \ $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) \ @@ -624,11 +630,13 @@ unittest : $(LIB) $(DMD) $(UDFLAGS) -c -unittest -ofunittest6i.obj $(SRC_STD_6i) $(DMD) $(UDFLAGS) -c -unittest -ofunittest7.obj $(SRC_STD_7) $(SRC_STD_EXP_LOGGER) $(DMD) $(UDFLAGS) -c -unittest -ofunittest8a.obj $(SRC_STD_REGEX) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8b.obj $(SRC_STD_NET) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8c.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8d.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8e.obj $(SRC_ETC) $(SRC_ETC_C) - $(DMD) $(UDFLAGS) -c -unittest -ofunittest8f.obj $(SRC_STD_EXP) + $(DMD) $(UDFLAGS) -c -unittest 
-ofunittest8b.obj $(SRC_STD_REGEX_2) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8c.obj $(SRC_STD_REGEX_3) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8d.obj $(SRC_STD_NET) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8e.obj $(SRC_STD_C) $(SRC_STD_WIN) $(SRC_STD_C_WIN) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8f.obj $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8g.obj $(SRC_ETC) $(SRC_ETC_C) + $(DMD) $(UDFLAGS) -c -unittest -ofunittest8h.obj $(SRC_STD_EXP) $(DMD) $(UDFLAGS) -c -unittest -ofunittest9.obj $(SRC_STD_EXP_ALLOC) $(DMD) $(UDFLAGS) -c -unittest -ofunittest9a.obj $(SRC_STD_EXP_NDSLICE) $(DMD) $(UDFLAGS) -L/OPT:NOICF -unittest unittest.d $(UNITTEST_OBJS) \ From 91f0671bc74d161b00cf3d6dc73475f3da914dfb Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Sat, 30 Jul 2016 17:01:27 +0200 Subject: [PATCH 18/23] [std.regex] rebase on top of latest master, fix shiftor --- std/regex/internal/bitnfa.d | 2 ++ std/regex/internal/shiftor.d | 11 ++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index e1ca33a5e32..691ad71a989 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -55,6 +55,7 @@ struct HashTab auto keys() { + import std.array : appender; auto app = appender!(uint[])(); foreach (i, v; table) { @@ -66,6 +67,7 @@ struct HashTab auto values() { + import std.array : appender; auto app = appender!(uint[])(); foreach (i, v; table) { diff --git a/std/regex/internal/shiftor.d b/std/regex/internal/shiftor.d index 404c3c52679..39f1c3f6137 100644 --- a/std/regex/internal/shiftor.d +++ b/std/regex/internal/shiftor.d @@ -1,6 +1,6 @@ /* - Kickstart is a coarse-grained "filter" engine that finds likely matches - to be verified by full-blown matcher. 
+ ShiftOr is a kickstart engine, a coarse-grained "filter" engine that finds + potential matches to be verified by a full-blown matcher. */ module std.regex.internal.shiftor; @@ -351,7 +351,6 @@ public: default: L_StopThread: assert(re.ir[t.pc].code >= 0x80, text(re.ir[t.pc].code)); - debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic); n_length = std.algorithm.comparison.min(t.idx, n_length); break L_Eval_Thread; } @@ -543,6 +542,7 @@ unittest } } + foreach (i, Char; AliasSeq!(char, wchar, dchar)) { alias String = immutable(Char)[]; @@ -567,13 +567,14 @@ unittest alias String = immutable(Char)[]; auto kick = shiftOrLength(`abc[a-z]`.to!String, 4); searches("abbabca".to!String, kick, 3); - kick = shiftOrLength(`(ax|bd|cdy)`.to!String, 2); - searches("abdcdyabax".to!String, kick, 1, 3, 8); + kick = shiftOrLength(`(axx|bdx|cdy)`.to!String, 3); + searches("abdcdxabax".to!String, kick, 3); shiftOrLength(`...`.to!String, 0); kick = shiftOrLength(`a(b{1,2}|c{1,2})x`.to!String, 3); searches("ababx".to!String, kick, 2); searches("abaacba".to!String, kick, 3); //expected inexact } + } From e98fa4ad5ad39487844c91357cfec4f698e88230 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 7 Oct 2016 01:20:32 +0300 Subject: [PATCH 19/23] [std.regex] Fix issue 9391 - const regex, ctRegex is immutable --- std/regex/internal/backtracking.d | 232 +++++++++++++++++------------- std/regex/internal/bitnfa.d | 37 ++--- std/regex/internal/generator.d | 6 +- std/regex/internal/ir.d | 99 +++++++------ std/regex/internal/parser.d | 129 ++--------------- std/regex/internal/shiftor.d | 20 +-- std/regex/internal/tests.d | 2 +- std/regex/internal/thompson.d | 202 ++++++++++++++------------ std/regex/package.d | 139 +++++++++--------- std/uni.d | 13 -- 10 files changed, 412 insertions(+), 467 deletions(-) diff --git a/std/regex/internal/backtracking.d b/std/regex/internal/backtracking.d index 2d14f604178..fb3b357ec67 100644 --- a/std/regex/internal/backtracking.d +++ 
b/std/regex/internal/backtracking.d @@ -30,7 +30,13 @@ template BacktrackingMatcher(bool CTregex) alias String = const(Char)[]; alias RegEx = Regex!Char; alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream)); - RegEx re; //regex program + const(Bytecode)[] ir; + uint ngroup; + uint flags; + const(Interval[])[] charsets; + const(CharMatcher)[] matchers; + const(BitTable)[] filters; + const Kickstart!Char kickstart; static if (CTregex) MatchFn nativeFn; //native code for that program //Stream state @@ -79,12 +85,17 @@ template BacktrackingMatcher(bool CTregex) static size_t initialMemory(const ref RegEx re) { - return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; + return stackSize(re.ngroup)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof; } - static size_t stackSize(const ref RegEx re) + size_t initialMemory() { - return initialStack*(stateSize + re.ngroup*(Group!DataIndex).sizeof/size_t.sizeof)+1; + return stackSize(ngroup)*size_t.sizeof + merge.length*Trace.sizeof; + } + + static size_t stackSize(uint ngroup) + { + return initialStack*(stateSize + ngroup*(Group!DataIndex).sizeof/size_t.sizeof)+1; } @property bool atStart(){ return index == 0; } @@ -101,7 +112,7 @@ template BacktrackingMatcher(bool CTregex) { static if (kicked) { - if (!s.search(re.kickstart, front, index)) + if (!s.search(kickstart, front, index)) { index = s.lastIndex; } @@ -113,46 +124,69 @@ template BacktrackingMatcher(bool CTregex) // void newStack() { - auto chunk = mallocArray!(size_t)(stackSize(re)); + auto chunk = mallocArray!(size_t)(stackSize(ngroup)); chunk[0] = cast(size_t)(memory.ptr); memory = chunk[1..$]; } - void initExternalMemory(void[] memBlock) + void initExternalMemory(void[] memBlock, size_t hotspotTableSize) { - merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock); + merge = arrayInChunk!(Trace)(hotspotTableSize, memBlock); merge[] = Trace.init; memory = cast(size_t[])memBlock; memory[0] = 0; //hidden pointer memory = memory[1..$]; 
} - void initialize(ref RegEx program, Stream stream, void[] memBlock) + void dupTo(void[] memory) { - re = program; - s = stream; - exhausted = false; - initExternalMemory(memBlock); - backrefed = null; + initExternalMemory(memory, merge.length); } - auto dupTo(void[] memory) + this(Matcher)(ref Matcher matcher, Stream stream, void[] memBlock, dchar ch, DataIndex idx) { - typeof(this) tmp = this; - tmp.initExternalMemory(memory); - return tmp; + ir = matcher.ir; + charsets = matcher.charsets; + filters = matcher.filters; + matchers = matcher.matchers; + ngroup = matcher.ngroup; + flags = matcher.flags; + s = stream; + exhausted = false; + initExternalMemory(memBlock, matcher.merge.length); + backrefed = null; + front = ch; + index = idx; } - this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx) + this(Matcher)(ref Matcher matcher, Stream stream, void[] memBlock) { - initialize(program, stream, memBlock); - front = ch; - index = idx; + ir = matcher.ir; + charsets = matcher.charsets; + filters = matcher.filters; + matchers = matcher.matchers; + ngroup = matcher.ngroup; + flags = matcher.flags; + s = stream; + exhausted = false; + initExternalMemory(memBlock, matcher.merge.length); + backrefed = null; + next(); } - this(ref RegEx program, Stream stream, void[] memBlock) + this()(ref const RegEx program, Stream stream, void[] memBlock, uint regexFlags) { - initialize(program, stream, memBlock); + kickstart = program.kickstart; + ir = program.ir; + charsets = program.charsets; + filters = program.filters; + matchers = program.matchers; + ngroup = program.ngroup; + flags = regexFlags; + s = stream; + exhausted = false; + initExternalMemory(memBlock, program.hotspotTableSize); + backrefed = null; next(); } @@ -160,7 +194,7 @@ template BacktrackingMatcher(bool CTregex) { alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, Stream); - auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, 
index); + auto fwdMatcher = BackMatcher(matcher, s, memBlock, front, index); return fwdMatcher; } @@ -169,7 +203,7 @@ template BacktrackingMatcher(bool CTregex) alias BackMatcherTempl = .BacktrackingMatcher!(CTregex); alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index))); auto fwdMatcher = - BackMatcher(matcher.re, s.loopBack(index), memBlock); + BackMatcher(matcher, s.loopBack(index), memBlock); return fwdMatcher; } @@ -182,7 +216,7 @@ template BacktrackingMatcher(bool CTregex) {//stream is updated here matches[0].begin = start; matches[0].end = index; - if (!(re.flags & RegexOption.global) || atEnd) + if (!(flags & RegexOption.global) || atEnd) exhausted = true; if (start == index)//empty match advances input next(); @@ -202,7 +236,7 @@ template BacktrackingMatcher(bool CTregex) if (exhausted) //all matches collected return false; this.matches = matches; - if (re.flags & RegexInfo.oneShot) + if (flags & RegexInfo.oneShot) { exhausted = true; const DataIndex start = index; @@ -216,7 +250,7 @@ template BacktrackingMatcher(bool CTregex) } static if (kicked) { - if (re.kickstart) + if (kickstart) { for (;;) { @@ -285,19 +319,19 @@ template BacktrackingMatcher(bool CTregex) { debug(std_regex_matcher) writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s", - pc, counter, disassemble(re.ir, pc, re.dict), + pc, counter, disassemble(ir, pc), front, s._index); - switch (re.ir[pc].code) + switch (ir[pc].code) { case IR.OrChar://assumes IRL!(OrChar) == 1 if (atEnd) goto L_backtrack; - uint len = re.ir[pc].sequence; + uint len = ir[pc].sequence; uint end = pc + len; - if (re.ir[pc].data != front && re.ir[pc+1].data != front) + if (ir[pc].data != front && ir[pc+1].data != front) { for (pc = pc+2; pc < end; pc++) - if (re.ir[pc].data == front) + if (ir[pc].data == front) break; if (pc == end) goto L_backtrack; @@ -306,7 +340,7 @@ template BacktrackingMatcher(bool CTregex) next(); break; case IR.Char: - if (atEnd || front != re.ir[pc].data) + if (atEnd || front != 
ir[pc].data) goto L_backtrack; pc += IRL!(IR.Char); next(); @@ -318,13 +352,13 @@ template BacktrackingMatcher(bool CTregex) next(); break; case IR.CodepointSet: - if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front)) + if (atEnd || !charsets[ir[pc].data].scanFor(front)) goto L_backtrack; next(); pc += IRL!(IR.CodepointSet); break; case IR.Trie: - if (atEnd || !re.matchers[re.ir[pc].data][front]) + if (atEnd || !matchers[ir[pc].data][front]) goto L_backtrack; next(); pc += IRL!(IR.Trie); @@ -412,10 +446,10 @@ template BacktrackingMatcher(bool CTregex) goto L_backtrack; break; case IR.InfiniteStart, IR.InfiniteQStart: - pc += re.ir[pc].data + IRL!(IR.InfiniteStart); + pc += ir[pc].data + IRL!(IR.InfiniteStart); //now pc is at end IR.Infinite(Q)End - uint len = re.ir[pc].data; - if (re.ir[pc].code == IR.InfiniteEnd) + uint len = ir[pc].data; + if (ir[pc].code == IR.InfiniteEnd) { pushState(pc+IRL!(IR.InfiniteEnd), counter); pc -= len; @@ -427,29 +461,29 @@ template BacktrackingMatcher(bool CTregex) } break; case IR.InfiniteBloomStart: - pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart); + pc += ir[pc].data + IRL!(IR.InfiniteBloomStart); //now pc is at end IR.InfiniteBloomEnd - immutable len = re.ir[pc].data; - immutable filterIdx = re.ir[pc+2].raw; - if (re.filters[filterIdx][front]) + immutable len = ir[pc].data; + immutable filterIdx = ir[pc+2].raw; + if (filters[filterIdx][front]) pushState(pc+IRL!(IR.InfiniteBloomEnd), counter); pc -= len; break; case IR.RepeatStart, IR.RepeatQStart: - pc += re.ir[pc].data + IRL!(IR.RepeatStart); + pc += ir[pc].data + IRL!(IR.RepeatStart); break; case IR.RepeatEnd: case IR.RepeatQEnd: - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! 
goto L_backtrack; } //len, step, min, max - immutable len = re.ir[pc].data; - immutable step = re.ir[pc+2].raw; - immutable min = re.ir[pc+3].raw; - immutable max = re.ir[pc+4].raw; + immutable len = ir[pc].data; + immutable step = ir[pc+2].raw; + immutable min = ir[pc+3].raw; + immutable max = ir[pc+4].raw; if (counter < min) { counter += step; @@ -457,7 +491,7 @@ template BacktrackingMatcher(bool CTregex) } else if (counter < max) { - if (re.ir[pc].code == IR.RepeatEnd) + if (ir[pc].code == IR.RepeatEnd) { pushState(pc + IRL!(IR.RepeatEnd), counter%step); counter += step; @@ -479,13 +513,13 @@ template BacktrackingMatcher(bool CTregex) case IR.InfiniteEnd: case IR.InfiniteQEnd: debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } - immutable len = re.ir[pc].data; - if (re.ir[pc].code == IR.InfiniteEnd) + immutable len = ir[pc].data; + if (ir[pc].code == IR.InfiniteEnd) { pushState(pc + IRL!(IR.InfiniteEnd), counter); pc -= len; @@ -498,14 +532,14 @@ template BacktrackingMatcher(bool CTregex) break; case IR.InfiniteBloomEnd: debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting); - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! goto L_backtrack; } - immutable len = re.ir[pc].data; - immutable filterIdx = re.ir[pc+2].raw; - if (re.filters[filterIdx][front]) + immutable len = ir[pc].data; + immutable filterIdx = ir[pc+2].raw; + if (filters[filterIdx][front]) { infiniteNesting--; pushState(pc + IRL!(IR.InfiniteBloomEnd), counter); @@ -514,7 +548,7 @@ template BacktrackingMatcher(bool CTregex) pc -= len; break; case IR.OrEnd: - if (merge[re.ir[pc + 1].raw+counter].mark(index)) + if (merge[ir[pc + 1].raw+counter].mark(index)) { // merged! 
goto L_backtrack; @@ -525,34 +559,34 @@ template BacktrackingMatcher(bool CTregex) pc += IRL!(IR.OrStart); goto case; case IR.Option: - immutable len = re.ir[pc].data; - if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one + immutable len = ir[pc].data; + if (ir[pc+len].code == IR.GotoEndOr)//not a last one { pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch } pc += IRL!(IR.Option); break; case IR.GotoEndOr: - pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr); + pc = pc + ir[pc].data + IRL!(IR.GotoEndOr); break; case IR.GroupStart: - immutable n = re.ir[pc].data; + immutable n = ir[pc].data; matches[n].begin = index; debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index); pc += IRL!(IR.GroupStart); break; case IR.GroupEnd: - immutable n = re.ir[pc].data; + immutable n = ir[pc].data; matches[n].end = index; debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index); pc += IRL!(IR.GroupEnd); break; case IR.LookaheadStart: case IR.NeglookaheadStart: - immutable len = re.ir[pc].data; + immutable len = ir[pc].data; auto save = index; - immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; - auto mem = malloc(initialMemory(re))[0..initialMemory(re)]; + immutable ms = ir[pc+1].raw, me = ir[pc+2].raw; + auto mem = malloc(initialMemory())[0..initialMemory()]; scope(exit) free(mem.ptr); static if (Stream.isLoopback) { @@ -564,10 +598,10 @@ template BacktrackingMatcher(bool CTregex) } matcher.matches = matches[ms .. me]; matcher.backrefed = backrefed.empty ? matches : backrefed; - matcher.re.ir = re.ir[ + matcher.ir = ir[ pc+IRL!(IR.LookaheadStart) .. 
pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd) ]; - immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart); + immutable match = (matcher.matchImpl() != 0) ^ (ir[pc].code == IR.NeglookaheadStart); s.reset(save); next(); if (!match) @@ -579,26 +613,26 @@ template BacktrackingMatcher(bool CTregex) break; case IR.LookbehindStart: case IR.NeglookbehindStart: - immutable len = re.ir[pc].data; - immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw; - auto mem = malloc(initialMemory(re))[0..initialMemory(re)]; + immutable len = ir[pc].data; + immutable ms = ir[pc+1].raw, me = ir[pc+2].raw; + auto mem = malloc(initialMemory())[0..initialMemory()]; scope(exit) free(mem.ptr); static if (Stream.isLoopback) { alias Matcher = BacktrackingMatcher!(Char, Stream); - auto matcher = Matcher(re, s, mem, front, index); + auto matcher = Matcher(this, s, mem, front, index); } else { alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index))); - auto matcher = Matcher(re, s.loopBack(index), mem); + auto matcher = Matcher(this, s.loopBack(index), mem); } matcher.matches = matches[ms .. me]; - matcher.re.ir = re.ir[ + matcher.ir = ir[ pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd) ]; matcher.backrefed = backrefed.empty ? matches : backrefed; - immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart); + immutable match = (matcher.matchImpl() != 0) ^ (ir[pc].code == IR.NeglookbehindStart); if (!match) goto L_backtrack; else @@ -607,8 +641,8 @@ template BacktrackingMatcher(bool CTregex) } break; case IR.Backref: - immutable n = re.ir[pc].data; - auto referenced = re.ir[pc].localRef + immutable n = ir[pc].data; + auto referenced = ir[pc].localRef ? s[matches[n].begin .. matches[n].end] : s[backrefed[n].begin .. 
backrefed[n].end]; while (!atEnd && !referenced.empty && front == referenced.front) @@ -629,9 +663,9 @@ template BacktrackingMatcher(bool CTregex) case IR.LookbehindEnd: case IR.NeglookbehindEnd: case IR.End: - return re.ir[pc].data; + return ir[pc].data; default: - debug printBytecode(re.ir[0..$]); + debug printBytecode(ir[0..$]); assert(0); L_backtrack: if (!popState()) @@ -660,7 +694,7 @@ template BacktrackingMatcher(bool CTregex) { import core.stdc.stdlib : free; free(memory.ptr);//last segment is freed in RegexMatch - immutable size = initialStack*(stateSize + 2*re.ngroup); + immutable size = initialStack*(stateSize + 2*ngroup); memory = prev[0..size]; lastState = size; return true; @@ -795,7 +829,7 @@ struct CtContext //to mark the portion of matches to save int match, total_matches; int reserved; - CodepointSet[] charsets; + const Interval[][] charsets; //state of codegenerator @@ -805,7 +839,7 @@ struct CtContext int addr; } - this(Char)(Regex!Char re) + this(Char)(const Regex!Char re) { match = 1; reserved = 1; //first match is skipped @@ -867,7 +901,7 @@ struct CtContext } // - CtState ctGenBlock(Bytecode[] ir, int addr) + CtState ctGenBlock(const(Bytecode)[] ir, int addr) { CtState result; result.addr = addr; @@ -881,7 +915,7 @@ struct CtContext } // - CtState ctGenGroup(ref Bytecode[] ir, int addr) + CtState ctGenGroup(ref const(Bytecode)[] ir, int addr) { import std.algorithm.comparison : max; auto bailOut = "goto L_backtrack;"; @@ -944,7 +978,7 @@ struct CtContext //(neg)lookaround piece ends } auto save = index; - auto mem = malloc(initialMemory(re))[0..initialMemory(re)]; + auto mem = malloc(initialMemory())[0..initialMemory()]; scope(exit) free(mem.ptr); static if (typeof(matcher.s).isLoopback) auto lookaround = $$; @@ -983,7 +1017,7 @@ struct CtContext } //generate source for bytecode contained in OrStart ... 
OrEnd - CtState ctGenAlternation(Bytecode[] ir, int addr) + CtState ctGenAlternation(const(Bytecode)[] ir, int addr) { CtState[] pieces; CtState r; @@ -1023,11 +1057,11 @@ struct CtContext // generate fixup code for instruction in ir, // fixup means it has an alternative way for control flow - string ctGenFixupCode(Bytecode[] ir, int addr, int fixup) + string ctGenFixupCode(const(Bytecode)[] ir, int addr, int fixup) { return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version } - string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup) + string ctGenFixupCode(ref const(Bytecode)[] ir, int addr, int fixup) { string r; string testCode; @@ -1181,7 +1215,7 @@ struct CtContext } - string ctQuickTest(Bytecode[] ir, int id) + string ctQuickTest(const(Bytecode)[] ir, int id) { uint pc = 0; while (pc < ir.length && ir[pc].isAtom) @@ -1208,7 +1242,7 @@ struct CtContext } //process & generate source for simple bytecodes at front of ir using address addr - CtState ctGenAtom(ref Bytecode[] ir, int addr) + CtState ctGenAtom(ref const(Bytecode)[] ir, int addr) { CtState result; result.code = ctAtomCode(ir, addr); @@ -1218,7 +1252,7 @@ struct CtContext } //D code for atom at ir using address addr, addr < 0 means quickTest - string ctAtomCode(Bytecode[] ir, int addr) + string ctAtomCode(const(Bytecode)[] ir, int addr) { string code; string bailOut, nextInstr; @@ -1263,7 +1297,7 @@ struct CtContext break; case IR.Any: code ~= ctSub( ` - if (atEnd || (!(re.flags & RegexOption.singleline) + if (atEnd || (!(flags & RegexOption.singleline) && (front == '\r' || front == '\n'))) $$ $$ @@ -1273,7 +1307,7 @@ struct CtContext if (charsets.length) { string name = `func_`~to!string(addr+1); - string funcCode = charsets[ir[0].data].toSourceCode(name); + string funcCode = CodepointSet(charsets[ir[0].data]).toSourceCode(name); code ~= ctSub( ` static $$ if (atEnd || !$$(front)) @@ -1283,16 +1317,16 @@ struct CtContext } else code ~= ctSub( ` - if (atEnd || 
!re.charsets[$$].scanFor(front)) + if (atEnd || !charsets[$$].scanFor(front)) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); break; case IR.Trie: - if (charsets.length && charsets[ir[0].data].byInterval.length <= 8) + if (charsets.length && charsets[ir[0].data].length <= 8) goto case IR.CodepointSet; code ~= ctSub( ` - if (atEnd || !re.matchers[$$][front]) + if (atEnd || !matchers[$$][front]) $$ $$ $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr); @@ -1430,7 +1464,7 @@ struct CtContext } //generate D code for the whole regex - public string ctGenRegEx(Bytecode[] ir) + public string ctGenRegEx(const(Bytecode)[] ir) { auto bdy = ctGenBlock(ir, 0); auto r = ` @@ -1476,7 +1510,7 @@ struct CtContext } -string ctGenRegExCode(Char)(Regex!Char re) +string ctGenRegExCode(Char)(const Regex!Char re) { auto context = CtContext(re); return context.ctGenRegEx(re.ir); diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 691ad71a989..3e7fbd61bfd 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -20,16 +20,17 @@ import std.algorithm; struct HashTab { +pure: @disable this(this); - uint opIndex()(uint key) + uint opIndex()(uint key) const { auto p = locate(key, table); assert(p.occupied); return p.value; } - bool opBinaryRight(string op:"in")(uint key) + bool opBinaryRight(string op:"in")(uint key) const { auto p = locate(key, table); return p.occupied; @@ -53,7 +54,7 @@ struct HashTab p.value = value; } - auto keys() + auto keys() const { import std.array : appender; auto app = appender!(uint[])(); @@ -65,7 +66,7 @@ struct HashTab return app.data; } - auto values() + auto values() const { import std.array : appender; auto app = appender!(uint[])(); @@ -85,17 +86,17 @@ private: struct Node { + pure: uint key_; uint value; - @property uint key()(){ return key_ & 0x7fff_ffff; } - @property bool occupied()(){ return (key_ & 0x8000_0000) != 0; } + @property uint key()() const { return key_ & 
0x7fff_ffff; } + @property bool occupied()() const { return (key_ & 0x8000_0000) != 0; } void setOccupied(){ key_ |= 0x8000_0000; } - } Node[] table; size_t items; - static Node* locate()(uint key, Node[] table) + static N* locate(N)(uint key, N[] table) { size_t slot = hashOf(key) & (table.length-1); while (table[slot].occupied) @@ -131,6 +132,7 @@ private: // and ref count is decreased struct UIntTrie2 { +pure: ushort[] index; // pages --> blocks ushort[] refCounts; // ref counts for each block uint[] hashes; // hashes of blocks @@ -164,7 +166,7 @@ struct UIntTrie2 return ut; } - uint opIndex(dchar ch) + uint opIndex(dchar ch) const { immutable blk = index[ch>>blockBits]; return blocks.ptr[blk*blockSize + (ch & (blockSize-1))]; @@ -258,6 +260,7 @@ unittest // to run backwards to find the start. struct BitNfa { +pure: uint[128] asciiTab; // state mask for ascii characters UIntTrie2 uniTab; // state mask for unicode characters HashTab controlFlow; // maps each bit pattern to resulting jumps pattern @@ -468,7 +471,7 @@ outer: for (uint i=0; i user group number uint ngroup; // number of internal groups uint maxCounterDepth; // max depth of nested {n,m} repetitions uint hotspotTableSize; // number of entries in merge table uint threadCount; // upper bound on number of Thompson VM threads uint flags; // global regex flags - public const(CharMatcher)[] matchers; // tables that represent character sets - public const(BitTable)[] filters; // bloom filters for conditional loops + Interval[][] charsets; // intervals of characters + const(CharMatcher)[] matchers; // tables that represent character sets + const(BitTable)[] filters; // bloom filters for conditional loops uint[] backrefed; // bit array of backreferenced submatches Kickstart!Char kickstart; @@ -696,11 +693,10 @@ package(std.regex): public: Regex!Char _regex; alias _regex this; - this(Regex!Char re, MatchFn fn) + this(immutable Regex!Char re, MatchFn fn) immutable { _regex = re; nativeFn = fn; - } } @@ -742,7 
+738,7 @@ struct Input(Char) return _index == _origin.length; } - bool search(Kickstart!Char kick, ref dchar res, ref size_t pos) + bool search(const Kickstart!Char kick, ref dchar res, ref size_t pos) { kick.search(this); return nextChar(res, pos); @@ -824,8 +820,8 @@ template BackLooper(E) } // -@trusted uint lookupNamedGroup(String)(NamedGroup[] dict, String name) -{//equal is @system? +@safe uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name) +{ import std.range : assumeSorted; import std.conv : text; import std.algorithm.iteration : map; @@ -861,6 +857,7 @@ public class RegexException : Exception // simple 128-entry bit-table used with a hash function struct BitTable { +pure: uint[4] filter; this(CodepointSet set){ @@ -889,7 +886,7 @@ struct BitTable { struct CharMatcher { BitTable ascii; // fast path for ASCII Trie trie; // slow path for Unicode - +pure: this(CodepointSet set) { auto asciiSet = set & unicode.ASCII; diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 8c7568a8e12..9e9be2fdc57 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -7,11 +7,11 @@ module std.regex.internal.parser; import std.regex.internal.ir, std.regex.internal.shiftor, std.regex.internal.bitnfa; import std.range.primitives, std.uni, std.meta, - std.traits, std.typecons, std.exception; + std.traits, std.typecons, std.exception, std.range; static import std.ascii; // package relevant info from parser into a regex object -auto makeRegex(S, CG)(Parser!(S, CG) p) +auto makeRegex(S, CG)(Parser!(S, CG) p) pure { Regex!(BasicElementOf!S) re; auto g = p.g; @@ -22,7 +22,10 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) ngroup = g.ngroup; maxCounterDepth = g.counterDepth; flags = p.re_flags; - charsets = g.charsets; + charsets = g.charsets + .map!(x => + x.byInterval.map!(x=>Interval(x.a,x.b)).array + ).array; matchers = g.matchers; backrefed = g.backrefed; re.postprocess(); @@ -77,87 +80,6 @@ unittest assert(nc.equal(cp[1 .. 
$ - 1])); } - -@trusted void reverseBytecode()(Bytecode[] code) -{ - Bytecode[] rev = new Bytecode[code.length]; - uint revPc = cast(uint)rev.length; - Stack!(Tuple!(uint, uint, uint)) stack; - uint start = 0; - uint end = cast(uint)code.length; - for (;;) - { - for (uint pc = start; pc < end; ) - { - immutable len = code[pc].length; - if (code[pc].code == IR.GotoEndOr) - break; //pick next alternation branch - if (code[pc].isAtom) - { - rev[revPc - len .. revPc] = code[pc .. pc + len]; - revPc -= len; - pc += len; - } - else if (code[pc].isStart || code[pc].isEnd) - { - //skip over other embedded lookbehinds they are reversed - if (code[pc].code == IR.LookbehindStart - || code[pc].code == IR.NeglookbehindStart) - { - immutable blockLen = len + code[pc].data - + code[pc].pairedLength; - rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; - pc += blockLen; - revPc -= blockLen; - continue; - } - immutable second = code[pc].indexOfPair(pc); - immutable secLen = code[second].length; - rev[revPc - secLen .. revPc] = code[second .. 
second + secLen]; - revPc -= secLen; - if (code[pc].code == IR.OrStart) - { - //we pass len bytes forward, but secLen in reverse - immutable revStart = revPc - (second + len - secLen - pc); - uint r = revStart; - uint i = pc + IRL!(IR.OrStart); - while (code[i].code == IR.Option) - { - if (code[i - 1].code != IR.OrStart) - { - assert(code[i - 1].code == IR.GotoEndOr); - rev[r - 1] = code[i - 1]; - } - rev[r] = code[i]; - auto newStart = i + IRL!(IR.Option); - auto newEnd = newStart + code[i].data; - auto newRpc = r + code[i].data + IRL!(IR.Option); - if (code[newEnd].code != IR.OrEnd) - { - newRpc--; - } - stack.push(tuple(newStart, newEnd, newRpc)); - r += code[i].data + IRL!(IR.Option); - i += code[i].data + IRL!(IR.Option); - } - pc = i; - revPc = revStart; - assert(code[pc].code == IR.OrEnd); - } - else - pc += len; - } - } - if (stack.empty) - break; - start = stack.top[0]; - end = stack.top[1]; - revPc = stack.top[2]; - stack.pop(); - } - code[] = rev[]; -} - //test if a given string starts with hex number of maxDigit that's a valid codepoint //returns it's value and skips these maxDigit chars on success, throws on failure dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) @@ -212,7 +134,7 @@ auto caseEnclose(CodepointSet set) /+ fetch codepoint set corresponding to a name (InBlock or binary property) +/ -@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) +@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) pure { CodepointSet s = unicode(name); //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) @@ -223,35 +145,9 @@ auto caseEnclose(CodepointSet set) return s; } -//basic stack, just in case it gets used anywhere else then Parser -@trusted struct Stack(T) -{ - T[] data; - @property bool empty(){ return data.empty; } - - @property size_t length(){ return data.length; } - - void push(T val){ data ~= val; } - - T pop() - { - assert(!empty); - auto val = data[$ - 1]; - data = data[0 .. 
$ - 1]; - if (!__ctfe) - cast(void)data.assumeSafeAppend(); - return val; - } - - @property ref T top() - { - assert(!empty); - return data[$ - 1]; - } -} - struct CodeGen { +pure: Bytecode[] ir; // resulting bytecode Stack!(uint) fixupStack; // stack of opened start instructions NamedGroup[] dict; // maps name -> user group number @@ -336,7 +232,7 @@ struct CodeGen } if (ivals.length*2 > maxCharsetUsed) { - auto t = getMatcher(set); + auto t = CharMatcher(set); put(Bytecode(IR.Trie, cast(uint)matchers.length)); matchers ~= t; debug(std_regex_allocation) writeln("Trie generated"); @@ -617,6 +513,7 @@ enum infinite = ~0u; struct Parser(R, Generator) if (isForwardRange!R && is(ElementType!R : dchar)) { +pure: dchar _current; bool empty; R pat, origin; //keep full pattern for pretty printing error messages @@ -1543,7 +1440,7 @@ struct Parser(R, Generator) /+ Postproces the IR, then optimize. +/ -@trusted void postprocess(Char)(ref Regex!Char zis) +@trusted void postprocess(Char)(ref Regex!Char zis) pure {//@@@BUG@@@ write is @system with(zis) { @@ -1663,7 +1560,7 @@ void fixupBytecode()(Bytecode[] ir) assert(fixups.empty); } -void optimize(Char)(ref Regex!Char zis) +void optimize(Char)(ref Regex!Char zis) pure { import std.array : insertInPlace; CodepointSet nextSet(uint idx) @@ -1680,7 +1577,7 @@ void optimize(Char)(ref Regex!Char zis) goto default; //TODO: OrChar case Trie, CodepointSet: - set = zis.charsets[ir[i].data]; + set = .CodepointSet(zis.charsets[ir[i].data]); goto default; case GroupStart,GroupEnd: break; diff --git a/std/regex/internal/shiftor.d b/std/regex/internal/shiftor.d index 39f1c3f6137..48bfebfebe8 100644 --- a/std/regex/internal/shiftor.d +++ b/std/regex/internal/shiftor.d @@ -29,6 +29,7 @@ uint effectiveSize(Char)() class ShiftOr(Char) : Kickstart!Char { private: +pure: uint[] table; uint fChar; uint n_length; @@ -115,8 +116,8 @@ private: { auto t = worklist[$-1]; worklist.length -= 1; - if (!__ctfe) - cast(void)worklist.assumeSafeAppend(); + 
//if (!__ctfe) + // cast(void)worklist.assumeSafeAppend(); return t; } @@ -241,9 +242,9 @@ public: static immutable codeBounds = [0x0, 0x7F, 0x80, 0x7FF, 0x800, 0xFFFF, 0x10000, 0x10FFFF]; else //== 2 static immutable codeBounds = [0x0, 0xFFFF, 0x10000, 0x10FFFF]; - uint[] arr = new uint[set.byInterval.length * 2]; + uint[] arr = new uint[set.length * 2]; size_t ofs = 0; - foreach (ival; set.byInterval) + foreach (ival; set) { arr[ofs++] = ival.a; arr[ofs++] = ival.b; @@ -262,7 +263,8 @@ public: auto chars = set.length; if (chars > charsetThreshold) goto L_StopThread; - foreach (ch; set.byCodepoint) + foreach (ival; set) + foreach (ch; ival.a..ival.b) { //avoid surrogate pairs if (0xD800 <= ch && ch <= 0xDFFF) @@ -373,7 +375,7 @@ public: // has a useful trait: if supplied with valid UTF indexes, // returns only valid UTF indexes // (that given the haystack in question is valid UTF string) - final @trusted bool search(ref Input!Char s) + final @trusted bool search(ref Input!Char s) const {//@BUG: apparently assumes little endian machines import std.conv : text; import core.stdc.string : memchr; @@ -503,7 +505,7 @@ public: return false; } - final @trusted bool match(ref Input!Char s) + final @trusted bool match(ref Input!Char s) const { //TODO: stub return false; @@ -514,7 +516,7 @@ public: import std.stdio : writefln; for (size_t i = 0; i < table.length; i += 4) { - writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); + debug writefln("%32b %32b %32b %32b",table[i], table[i+1], table[i+2], table[i+3]); } } } @@ -524,7 +526,7 @@ unittest import std.conv, std.regex; auto shiftOrLength(C)(const(C)[] pat, uint length) { - auto r = regex(pat); + auto r = regex(pat, "s"); auto kick = new ShiftOr!C(r); assert(kick.length == length, text(C.stringof, " == ", kick.length)); return kick; diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 6a3db845991..347c268da28 100644 --- a/std/regex/internal/tests.d +++ 
b/std/regex/internal/tests.d @@ -433,7 +433,7 @@ unittest static if (tvd.result == "c") { static assert(!__traits(compiles, (){ - enum r = regex(tvd.pattern, tvd.flags); + static r = regex(tvd.pattern, tvd.flags); }), "errornously compiles regex pattern: " ~ tvd.pattern); } else diff --git a/std/regex/internal/thompson.d b/std/regex/internal/thompson.d index 530e5c503e0..9cced4f3c8a 100644 --- a/std/regex/internal/thompson.d +++ b/std/regex/internal/thompson.d @@ -93,7 +93,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - finish(t, matches, re.ir[t.pc].data); + finish(t, matches, ir[t.pc].data); //fix endpoint of the whole match matches[0].end = index; recycle(t); @@ -243,35 +243,35 @@ template ThompsonOps(E, S, bool withInput:true) static bool op(IR code:IR.InfiniteStart)(E* e, S* state) { with(e) with(state) - t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); + t.pc += ir[t.pc].data + IRL!(IR.InfiniteStart); return op!(IR.InfiniteEnd)(e,state); } static bool op(IR code:IR.InfiniteBloomStart)(E* e, S* state) { with(e) with(state) - t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart); + t.pc += ir[t.pc].data + IRL!(IR.InfiniteBloomStart); return op!(IR.InfiniteBloomEnd)(e,state); } static bool op(IR code:IR.InfiniteQStart)(E* e, S* state) { with(e) with(state) - t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart); + t.pc += ir[t.pc].data + IRL!(IR.InfiniteQStart); return op!(IR.InfiniteQEnd)(e,state); } static bool op(IR code:IR.RepeatStart)(E* e, S* state) { with(e) with(state) - t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart); + t.pc += ir[t.pc].data + IRL!(IR.RepeatStart); return op!(IR.RepeatEnd)(e,state); } static bool op(IR code:IR.RepeatQStart)(E* e, S* state) { with(e) with(state) - t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart); + t.pc += ir[t.pc].data + IRL!(IR.RepeatQStart); return op!(IR.RepeatQEnd)(e,state); } @@ -281,32 +281,32 @@ template ThompsonOps(E, S, bool withInput:true) with(e) with(state) { //len, step, min, 
max - uint len = re.ir[t.pc].data; - uint step = re.ir[t.pc+2].raw; - uint min = re.ir[t.pc+3].raw; + uint len = ir[t.pc].data; + uint step = ir[t.pc+2].raw; + uint min = ir[t.pc+3].raw; if (t.counter < min) { t.counter += step; t.pc -= len; return true; } - if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + if (merge[ir[t.pc + 1].raw+t.counter] < genCounter) { debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", - t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); - merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + t.pc, index, genCounter, merge[ir[t.pc + 1].raw+t.counter] ); + merge[ir[t.pc + 1].raw+t.counter] = genCounter; } else { debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", - t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + t.pc, index, genCounter, merge[ir[t.pc + 1].raw+t.counter] ); return popState(e); } - uint max = re.ir[t.pc+4].raw; + uint max = ir[t.pc+4].raw; if (t.counter < max) { - if (re.ir[t.pc].code == IR.RepeatEnd) + if (ir[t.pc].code == IR.RepeatEnd) { //queue out-of-loop thread worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step)); @@ -335,21 +335,21 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + if (merge[ir[t.pc + 1].raw+t.counter] < genCounter) { debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", - t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); - merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + t.pc, index, genCounter, merge[ir[t.pc + 1].raw+t.counter] ); + merge[ir[t.pc + 1].raw+t.counter] = genCounter; } else { debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", - t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + t.pc, index, genCounter, merge[ir[t.pc + 
1].raw+t.counter] ); return popState(e); } - uint len = re.ir[t.pc].data; + uint len = ir[t.pc].data; uint pc1, pc2; //branches to take in priority order - if (re.ir[t.pc].code == IR.InfiniteEnd) + if (ir[t.pc].code == IR.InfiniteEnd) { pc1 = t.pc - len; pc2 = t.pc + IRL!(IR.InfiniteEnd); @@ -370,24 +370,24 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + if (merge[ir[t.pc + 1].raw+t.counter] < genCounter) { debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", - t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); - merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + t.pc, index, genCounter, merge[ir[t.pc + 1].raw+t.counter] ); + merge[ir[t.pc + 1].raw+t.counter] = genCounter; } else { debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", - t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); + t.pc, index, genCounter, merge[ir[t.pc + 1].raw+t.counter] ); return popState(e); } - uint len = re.ir[t.pc].data; + uint len = ir[t.pc].data; uint pc1, pc2; //branches to take in priority order pc1 = t.pc - len; pc2 = t.pc + IRL!(IR.InfiniteBloomEnd); - uint filterIndex = re.ir[t.pc + 2].raw; - if (re.filters[filterIndex][front]) + uint filterIndex = ir[t.pc + 2].raw; + if (filters[filterIndex][front]) worklist.insertFront(fork(t, pc2, t.counter)); t.pc = pc1; return true; @@ -398,17 +398,17 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) + if (merge[ir[t.pc + 1].raw+t.counter] < genCounter) { debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", - t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); - merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; + t.pc, s[index .. 
s.lastIndex], genCounter, merge[ir[t.pc + 1].raw + t.counter] ); + merge[ir[t.pc + 1].raw+t.counter] = genCounter; t.pc += IRL!(IR.OrEnd); } else { debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", - t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); + t.pc, s[index .. s.lastIndex], genCounter, merge[ir[t.pc + 1].raw + t.counter] ); return popState(e); } return true; @@ -428,9 +428,9 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); + uint next = t.pc + ir[t.pc].data + IRL!(IR.Option); //queue next Option - if (re.ir[next].code == IR.Option) + if (ir[next].code == IR.Option) { worklist.insertFront(fork(t, next, t.counter)); } @@ -443,7 +443,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr); + t.pc = t.pc + ir[t.pc].data + IRL!(IR.GotoEndOr); return op!(IR.OrEnd)(e, state); } } @@ -452,7 +452,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - uint n = re.ir[t.pc].data; + uint n = ir[t.pc].data; t.matches.ptr[n].begin = index; t.pc += IRL!(IR.GroupStart); return true; @@ -462,7 +462,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - uint n = re.ir[t.pc].data; + uint n = ir[t.pc].data; t.matches.ptr[n].end = index; t.pc += IRL!(IR.GroupEnd); return true; @@ -473,8 +473,8 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - uint n = re.ir[t.pc].data; - Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; + uint n = ir[t.pc].data; + Group!DataIndex* source = ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; assert(source); if (source[n].begin == source[n].end)//zero-width Backref! 
{ @@ -511,15 +511,15 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - uint len = re.ir[t.pc].data; - uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw; + uint len = ir[t.pc].data; + uint ms = ir[t.pc + 1].raw, me = ir[t.pc + 2].raw; uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart); - bool positive = re.ir[t.pc].code == IR.LookbehindStart; + bool positive = ir[t.pc].code == IR.LookbehindStart; static if (Stream.isLoopback) auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); else auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); - matcher.re.ngroup = me - ms; + matcher.ngroup = me - ms; matcher.backrefed = backrefed.empty ? t.matches : backrefed; //backMatch auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart)); @@ -540,15 +540,15 @@ template ThompsonOps(E, S, bool withInput:true) with(e) with(state) { auto save = index; - uint len = re.ir[t.pc].data; - uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw; + uint len = ir[t.pc].data; + uint ms = ir[t.pc+1].raw, me = ir[t.pc+2].raw; uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart); - bool positive = re.ir[t.pc].code == IR.LookaheadStart; + bool positive = ir[t.pc].code == IR.LookaheadStart; static if (Stream.isLoopback) auto matcher = bwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); else auto matcher = fwdMatcher(t.pc, end, subCounters.get(t.pc, 0)); - matcher.re.ngroup = me - ms; + matcher.ngroup = me - ms; matcher.backrefed = backrefed.empty ? t.matches : backrefed; auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart)); freelist = matcher.freelist; @@ -570,7 +570,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data); + finish(t, matches.ptr[0 .. 
ngroup], ir[t.pc].data); recycle(t); //cut off low priority threads recycle(clist); @@ -589,11 +589,11 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - uint len = re.ir[t.pc].sequence; + uint len = ir[t.pc].sequence; uint end = t.pc + len; static assert(IRL!(IR.OrChar) == 1); for (; t.pc < end; t.pc++) - if (re.ir[t.pc].data == front) + if (ir[t.pc].data == front) break; if (t.pc != end) { @@ -611,7 +611,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if (front == re.ir[t.pc].data) + if (front == ir[t.pc].data) { t.pc += IRL!(IR.Char); nlist.insertBack(t); @@ -638,7 +638,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if (re.charsets[re.ir[t.pc].data].scanFor(front)) + if (charsets[ir[t.pc].data].scanFor(front)) { t.pc += IRL!(IR.CodepointSet); nlist.insertBack(t); @@ -656,7 +656,7 @@ template ThompsonOps(E, S, bool withInput:true) { with(e) with(state) { - if (re.matchers[re.ir[t.pc].data][front]) + if (matchers[ir[t.pc].data][front]) { t.pc += IRL!(IR.Trie); nlist.insertBack(t); @@ -688,8 +688,8 @@ template ThompsonOps(E,S, bool withInput:false) { with(e) with(state) { - uint n = re.ir[t.pc].data; - Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; + uint n = ir[t.pc].data; + Group!DataIndex* source = ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; assert(source); if (source[n].begin == source[n].end)//zero-width Backref! 
{ @@ -725,20 +725,27 @@ template ThompsonOps(E,S, bool withInput:false) Thread!DataIndex* freelist; ThreadList!DataIndex clist, nlist; DataIndex[] merge; - Group!DataIndex[] backrefed; - Regex!Char re; //regex program + const(Bytecode)[] ir; + int ngroup; // number of capturing groups + uint flags; + const(Interval[])[] charsets; + const(CharMatcher)[] matchers; + const(BitTable)[] filters; Stream s; dchar front; DataIndex index; - DataIndex genCounter; //merge trace counter, goes up on every dchar - size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter + DataIndex genCounter; // merge trace counter, goes up on every dchar OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode OpFunc[] opCacheFalse; // ditto OpBackFunc[] opCacheBackTrue; // ditto OpBackFunc[] opCacheBackFalse; // ditto size_t threadSize; + size_t threadCount; int matched; bool exhausted; + const Kickstart!Char kickstart; + Group!DataIndex[] backrefed; + size_t[size_t] subCounters; // a table of gen counter per sub-engine: PC -> counter static struct State { @@ -799,7 +806,7 @@ template ThompsonOps(E,S, bool withInput:false) bool search() { - if (!s.search(re.kickstart, front, index)) + if (!s.search(kickstart, front, index)) { index = s.lastIndex; return false; @@ -808,24 +815,23 @@ template ThompsonOps(E,S, bool withInput:false) } } - void initExternalMemory(void[] memory) + void initExternalMemory(void[] memory, size_t hotspotTableSize) { - threadSize = getThreadSize(re); - prepareFreeList(re.threadCount, memory); - if (re.hotspotTableSize) + prepareFreeList(threadCount, memory); + if (hotspotTableSize) { - merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory); + merge = arrayInChunk!(DataIndex)(hotspotTableSize, memory); merge[] = 0; } - opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory); - opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory); - opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory); - opCacheBackFalse = 
arrayInChunk!(OpBackFunc)(re.ir.length, memory); + opCacheTrue = arrayInChunk!(OpFunc)(ir.length, memory); + opCacheFalse = arrayInChunk!(OpFunc)(ir.length, memory); + opCacheBackTrue = arrayInChunk!(OpBackFunc)(ir.length, memory); + opCacheBackFalse = arrayInChunk!(OpBackFunc)(ir.length, memory); - for (uint pc = 0; pc 1) { @@ -344,10 +342,17 @@ public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char); } else pat = patterns[0]; + return regexImpl!S(pat, flags); +} +@trusted public auto regex(S)(S[] patterns, const(char)[] flags="") + if (isSomeString!(S)) +{ + import std.functional : memoize; + enum cacheSize = 8; if (__ctfe) - return regexImpl(pat, flags); - return memoize!(regexImpl!S, cacheSize)(pat, flags); + return regexPure(patterns, flags); + return memoize!(regexPure!S, cacheSize)(patterns, flags); } ///ditto @@ -371,7 +376,7 @@ unittest assert(m.front[1] == "12"); } -public auto regexImpl(S)(S pattern, const(char)[] flags="") +public auto regexImpl(S)(S pattern, const(char)[] flags="") pure if (isSomeString!(S)) { import std.regex.internal.parser : Parser, CodeGen; @@ -384,7 +389,7 @@ public auto regexImpl(S)(S pattern, const(char)[] flags="") template ctRegexImpl(alias pattern, string flags=[]) { import std.regex.internal.parser, std.regex.internal.backtracking; - enum r = regex(pattern, flags); + static immutable r = cast(immutable)regexPure([pattern], flags); alias Char = BasicElementOf!(typeof(pattern)); enum source = ctGenRegExCode(r); alias Matcher = BacktrackingMatcher!(true); @@ -393,7 +398,7 @@ template ctRegexImpl(alias pattern, string flags=[]) debug(std_regex_ctr) pragma(msg, source); mixin(source); } - enum nr = StaticRegex!Char(r, &func); + static immutable nr = immutable StaticRegex!Char(r, &func); } /++ @@ -406,7 +411,7 @@ template ctRegexImpl(alias pattern, string flags=[]) pattern = Regular expression flags = The _attributes (g, i, m and x accepted) +/ -public enum ctRegex(alias pattern, alias flags=[]) = 
ctRegexImpl!(pattern, flags).nr; +public static immutable ctRegex(alias pattern, alias flags=[]) = ctRegexImpl!(pattern, flags).nr; enum isRegexFor(RegEx, R) = is(RegEx == Regex!(BasicElementOf!R)) || is(RegEx == StaticRegex!(BasicElementOf!R)); @@ -436,9 +441,9 @@ private: } uint _f, _b; uint _refcount; // ref count or SMALL MASK + num groups - NamedGroup[] _names; + const NamedGroup[] _names; - this()(R input, uint n, NamedGroup[] named) + this()(R input, uint n, const(NamedGroup)[] named) { _input = input; _names = named; @@ -447,16 +452,6 @@ private: _f = 0; } - this(alias Engine)(ref RegexMatch!(R,Engine) rmatch) - { - _input = rmatch._input; - _names = rmatch._engine.re.dict; - immutable n = rmatch._engine.re.ngroup; - newMatches(n); - _b = n; - _f = 0; - } - @property inout(Group!DataIndex[]) matches() inout { return (_refcount & SMALL_MASK) ? small_matches[0 .. _refcount & 0xFF] : big_matches; @@ -660,21 +655,24 @@ private: alias EngineType = Engine!Char; EngineType _engine; R _input; + uint _ngroup; Captures!(R,EngineType.DataIndex) _captures; void[] _memory;//is ref-counted - this(RegEx)(R input, RegEx prog) + this(RegEx)(R input, RegEx prog, uint reFlags) { import std.exception : enforce; _input = input; + _ngroup = prog.ngroup; immutable size = EngineType.initialMemory(prog)+size_t.sizeof; _memory = (enforce(malloc(size), "malloc failed")[0..size]); scope(failure) free(_memory.ptr); *cast(size_t*)_memory.ptr = 1; - _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$]); - static if (is(RegEx == StaticRegex!(BasicElementOf!R))) + _engine = EngineType(prog, Input!Char(input), + _memory[size_t.sizeof..$], reFlags); + static if (is(typeof(prog.nativeFn))) _engine.nativeFn = prog.nativeFn; - _captures = Captures!(R,EngineType.DataIndex)(this); + _captures = Captures!(R,EngineType.DataIndex)(input, prog.ngroup, prog.dict); _captures._nMatch = _engine.match(_captures.matches); debug(std_regex_allocation) writefln("RefCount (ctor): %x %d", 
_memory.ptr, counter); } @@ -743,16 +741,16 @@ public: if (counter != 1) {//do cow magic first counter--;//we abandon this reference - immutable size = EngineType.initialMemory(_engine.re)+size_t.sizeof; + immutable size = _memory.length; _memory = (enforce(malloc(size), "malloc failed")[0..size]); - _engine = _engine.dupTo(_memory[size_t.sizeof..size]); + _engine.dupTo(_memory[size_t.sizeof..size]); counter = 1;//points to new chunk } if (!_captures.unique) { // has external references - allocate new space - _captures.newMatches(_engine.re.ngroup); + _captures.newMatches(_ngroup); } _captures._nMatch = _engine.match(_captures.matches); } @@ -771,7 +769,7 @@ public: } -private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re) +private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, const RegEx re) { import core.stdc.stdlib : malloc, free; import std.exception : enforce; @@ -782,17 +780,16 @@ private @trusted auto matchOnce(alias Engine, RegEx, R)(R input, RegEx re) void[] memory = enforce(malloc(size), "malloc failed")[0..size]; scope(exit) free(memory.ptr); auto captures = Captures!(R, EngineType.DataIndex)(input, re.ngroup, re.dict); - auto engine = EngineType(re, Input!Char(input), memory); - static if (is(RegEx == StaticRegex!(BasicElementOf!R))) + auto engine = EngineType(re, Input!Char(input), memory, re.flags); + static if (is(typeof(re.nativeFn))) engine.nativeFn = re.nativeFn; captures._nMatch = engine.match(captures.matches); return captures; } -private auto matchMany(alias Engine, RegEx, R)(R input, RegEx re) +private auto matchMany(alias Engine, RegEx, R)(R input, const RegEx re) { - re.flags |= RegexOption.global; - return RegexMatch!(R, Engine)(input, re); + return RegexMatch!(R, Engine)(input, re, re.flags | RegexOption.global); } unittest @@ -847,7 +844,7 @@ private void replaceMatchesInto(alias output, Sink, R, T) } // a general skeleton of replaceFirst -private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re) 
+private R replaceFirstWith(alias output, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { import std.array : appender; @@ -862,7 +859,7 @@ private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re) // ditto for replaceAll // the method parameter allows old API to ride on the back of the new one private R replaceAllWith(alias output, - alias method=matchAll, R, RegEx)(R input, RegEx re) + alias method=matchAll, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { import std.array : appender; @@ -891,11 +888,12 @@ private R replaceAllWith(alias output, Returns: a $(D RegexMatch) object holding engine state after first match. +/ -public auto match(R, RegEx)(R input, RegEx re) +public auto match(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.thompson : ThompsonMatcher; - return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, re); + return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher) + (input, re, re.flags); } ///ditto @@ -903,14 +901,17 @@ public auto match(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { import std.regex.internal.thompson : ThompsonMatcher; - return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher)(input, regex(re)); + auto r = regex(re); + return RegexMatch!(Unqual!(typeof(input)),ThompsonMatcher) + (input, r, r.flags); } -public auto match(R, RegEx)(R input, RegEx re) +public auto match(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); + return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true) + (input, re, re.flags); } /++ @@ -931,7 +932,7 @@ public auto match(R, RegEx)(R input, RegEx re) $(LREF Captures) containing the extent of a match together 
with all submatches if there was a match, otherwise an empty $(LREF Captures) object. +/ -public auto matchFirst(R, RegEx)(R input, RegEx re) +public auto matchFirst(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.thompson : ThompsonMatcher; @@ -954,7 +955,7 @@ public auto matchFirst(R, String)(R input, String[] re...) return matchOnce!ThompsonMatcher(input, regex(re)); } -public auto matchFirst(R, RegEx)(R input, RegEx re) +public auto matchFirst(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; @@ -982,7 +983,7 @@ public auto matchFirst(R, RegEx)(R input, RegEx re) $(LREF RegexMatch) object that represents matcher state after the first match was found or an empty one if not present. +/ -public auto matchAll(R, RegEx)(R input, RegEx re) +public auto matchAll(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.thompson : ThompsonMatcher; @@ -1005,7 +1006,7 @@ public auto matchAll(R, String)(R input, String[] re...) return matchMany!ThompsonMatcher(input, regex(re)); } -public auto matchAll(R, RegEx)(R input, RegEx re) +public auto matchAll(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; @@ -1071,11 +1072,12 @@ public auto matchAll(R, RegEx)(R input, RegEx re) state after first match. 
+/ -public auto bmatch(R, RegEx)(R input, RegEx re) +public auto bmatch(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == Regex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, re); + return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false) + (input, re, re.flags); } ///ditto @@ -1083,14 +1085,17 @@ public auto bmatch(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false)(input, regex(re)); + auto r = regex(re); + return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false) + (input, r, r.flags); } -public auto bmatch(R, RegEx)(R input, RegEx re) +public auto bmatch(R, RegEx)(R input, const RegEx re) if (isSomeString!R && is(RegEx == StaticRegex!(BasicElementOf!R))) { import std.regex.internal.backtracking : BacktrackingMatcher; - return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true)(input, re); + return RegexMatch!(Unqual!(typeof(input)),BacktrackingMatcher!true) + (input, re, re.flags); } // produces replacement string from format using captures for substitution @@ -1183,7 +1188,7 @@ L_Replace_Loop: A string of the same type with the first match (if any) replaced. If no match is found returns the input string itself. +/ -public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format) +public R replaceFirst(R, C, RegEx)(R input, const RegEx re, const(C)[] format) if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { return replaceFirstWith!((m, sink) => replaceFmt(format, m, sink))(input, re); @@ -1210,7 +1215,7 @@ unittest replaced by return values of $(D fun). If no matches found returns the $(D input) itself. 
+/ -public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re) +public R replaceFirst(alias fun, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceFirstWith!((m, sink) => sink.put(fun(m)))(input, re); @@ -1236,7 +1241,7 @@ unittest and the one with the user defined callback. +/ public @trusted void replaceFirstInto(Sink, R, C, RegEx) - (ref Sink sink, R input, RegEx re, const(C)[] format) + (ref Sink sink, R input, const RegEx re, const(C)[] format) if (isOutputRange!(Sink, dchar) && isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { @@ -1246,7 +1251,7 @@ public @trusted void replaceFirstInto(Sink, R, C, RegEx) ///ditto public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx) - (Sink sink, R input, RegEx re) + (Sink sink, R input, const RegEx re) if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) { replaceCapturesInto!fun(sink, input, matchFirst(input, re)); @@ -1301,7 +1306,7 @@ unittest of the matches (if any) replaced. If no match is found returns the input string itself. +/ -public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format) +public @trusted R replaceAll(R, C, RegEx)(R input, const RegEx re, const(C)[] format) if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { return replaceAllWith!((m, sink) => replaceFmt(format, m, sink))(input, re); @@ -1335,7 +1340,7 @@ unittest re = compiled regular expression fun = delegate to use +/ -public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re) +public @trusted R replaceAll(alias fun, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceAllWith!((m, sink) => sink.put(fun(m)))(input, re); @@ -1364,7 +1369,7 @@ unittest the other one with a user defined functor. 
+/ public @trusted void replaceAllInto(Sink, R, C, RegEx) - (Sink sink, R input, RegEx re, const(C)[] format) + (Sink sink, R input, const RegEx re, const(C)[] format) if (isOutputRange!(Sink, dchar) && isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) { @@ -1374,7 +1379,7 @@ public @trusted void replaceAllInto(Sink, R, C, RegEx) ///ditto public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) - (Sink sink, R input, RegEx re) + (Sink sink, R input, const RegEx re) if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R)) { replaceMatchesInto!fun(sink, input, matchAll(input, re)); @@ -1446,14 +1451,14 @@ public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) The use of this function is $(RED discouraged), please use $(LREF replaceAll) or $(LREF replaceFirst) explicitly. +/ -public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format) +public R replace(alias scheme = match, R, C, RegEx)(R input, const RegEx re, const(C)[] format) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), match)(input, re); } ///ditto -public R replace(alias fun, R, RegEx)(R input, RegEx re) +public R replace(alias fun, R, RegEx)(R input, const RegEx re) if (isSomeString!R && isRegexFor!(RegEx, R)) { return replaceAllWith!(fun, match)(input, re); @@ -1475,15 +1480,14 @@ public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, private: Range _input; size_t _offset; - alias Rx = typeof(match(Range.init,RegEx.init)); + alias Rx = typeof(matchAll(Range.init,RegEx.init)); Rx _match; static if (keepSeparators) bool onMatch = false; - @trusted this(Range input, RegEx separator) + @trusted this(Range input, const RegEx separator) {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted _input = input; - separator.flags |= RegexOption.global; if (_input.empty) { //there is nothing to match at all, make _offset > 0 @@ -1491,7 +1495,7 @@ private: 
} else { - _match = Rx(_input, separator); + _match = matchAll(_input, separator); static if (keepSeparators) if (_match.pre.empty) @@ -1579,8 +1583,9 @@ public: /// ditto public Splitter!(keepSeparators, Range, RegEx) splitter( - Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat) if ( - is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) + Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx) + (Range r, const RegEx pat) + if (is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) { return Splitter!(keepSeparators, Range, RegEx)(r, pat); } @@ -1611,7 +1616,7 @@ unittest } ///An eager version of $(D splitter) that creates an array with splitted slices of $(D input). -public @trusted String[] split(String, RegEx)(String input, RegEx rx) +public @trusted String[] split(String, RegEx)(String input, const RegEx rx) if (isSomeString!String && isRegexFor!(RegEx, String)) { import std.array : appender; diff --git a/std/uni.d b/std/uni.d index 8b7fa32fdcb..84c7ee31399 100644 --- a/std/uni.d +++ b/std/uni.d @@ -2120,19 +2120,6 @@ public: assert(!gothic['$']); } - // Linear scan for $(D ch). Useful only for small sets. - // TODO: - // used internally in std.regex - // should be properly exposed in a public API ? 
- package auto scanFor()(dchar ch) const - { - immutable len = data.length; - for (size_t i = 0; i < len; i++) - if (ch < data[i]) - return i & 1; - return 0; - } - /// Number of $(CODEPOINTS) in this set @property size_t length() { From 87847ed4804ccbbff84eeb60d5c6061ad3c138a5 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 7 Oct 2016 01:41:09 +0300 Subject: [PATCH 20/23] [std.regex] More tests for issue 9391 --- std/regex/internal/tests.d | 9 +++++---- std/regex/package.d | 32 +++++++------------------------- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 347c268da28..4f52f819c5d 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -468,15 +468,16 @@ unittest unittest { - auto cr = ctRegex!("abc"); + immutable cr = ctRegex!("abc"); assert(bmatch("abc",cr).hit == "abc"); - auto cr2 = ctRegex!("ab*c"); + immutable cr2 = ctRegex!("ab*c"); assert(bmatch("abbbbc",cr2).hit == "abbbbc"); } + unittest { - auto cr3 = ctRegex!("^abc$"); + immutable cr3 = ctRegex!("^abc$"); assert(bmatch("abc",cr3).hit == "abc"); - auto cr4 = ctRegex!(`\b(a\B[a-z]b)\b`); + immutable cr4 = ctRegex!(`\b(a\B[a-z]b)\b`); assert(array(match("azb",cr4).captures) == ["azb", "azb"]); } diff --git a/std/regex/package.d b/std/regex/package.d index 074ff33c094..0d9e99272b1 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -1023,7 +1023,7 @@ public auto matchAll(R, RegEx)(R input, const RegEx re) foreach (String; AliasSeq!(string, wstring, const(dchar)[])) { auto str1 = "blah-bleh".to!String(); - auto pat1 = "bl[ae]h".to!String(); + const pat1 = "bl[ae]h".to!String(); auto mf = matchFirst(str1, pat1); assert(mf.equal(["blah".to!String()])); auto mAll = matchAll(str1, pat1); @@ -1031,7 +1031,7 @@ public auto matchAll(R, RegEx)(R input, const RegEx re) ([["blah".to!String()], ["bleh".to!String()]])); auto str2 = "1/03/12 - 3/03/12".to!String(); - auto pat2 = 
regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); + const pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); auto mf2 = matchFirst(str2, pat2); assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)())); auto mAll2 = matchAll(str2, pat2); @@ -1041,7 +1041,7 @@ public auto matchAll(R, RegEx)(R input, const RegEx re) mf2.popFrontN(3); assert(mf2.equal(["12".to!String()])); - auto ctPat = ctRegex!(`(?P\d+)/(?P\d+)`.to!String()); + const ctPat = ctRegex!(`(?P\d+)/(?P\d+)`.to!String()); auto str = "2 + 34/56 - 6/1".to!String(); auto cmf = matchFirst(str, ctPat); assert(cmf.equal(["34/56", "34", "56"].map!(to!String)())); @@ -1270,24 +1270,6 @@ unittest assert(result.data == "first\nsecond\n"); } -//examples for replaceFirst -@system unittest -{ - import std.conv; - string list = "#21 out of 46"; - string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1)) - (list, regex(`[0-9]+`)); - assert(newList == "#22 out of 46"); - import std.array; - string m1 = "first message\n"; - string m2 = "second message\n"; - auto result = appender!string(); - replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1"); - //equivalent of the above with user-defined callback - replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`)); - assert(result.data == "first\nsecond\n"); -} - /++ Construct a new string from $(D input) by replacing all of the fragments that match a pattern $(D re) with a string generated @@ -1316,7 +1298,7 @@ public @trusted R replaceAll(R, C, RegEx)(R input, const RegEx re, const(C)[] fo unittest { // insert comma as thousands delimiter - auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); + const re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100"); } @@ -1416,8 +1398,8 @@ public @trusted void replaceAllInto(alias fun, Sink, R, RegEx) S t2F = "hound dome".to!S(); S t1A = "court trial".to!S(); S t2A = "hound home".to!S(); - auto re1 = 
regex("curt".to!S()); - auto re2 = regex("[dr]o".to!S()); + const re1 = regex("curt".to!S()); + const re2 = regex("[dr]o".to!S()); assert(replaceFirst(s1, re1, "court") == t1F); assert(replaceFirst(s2, re2, "ho") == t2F); @@ -1604,7 +1586,7 @@ unittest { import std.algorithm.comparison : equal; - auto pattern = regex(`([\.,])`); + const pattern = regex(`([\.,])`); assert("2003.04.05" .splitter!(Yes.keepSeparators)(pattern) From cd2c28f40220366b3591da40cd209da39400bf09 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 7 Oct 2016 01:43:05 +0300 Subject: [PATCH 21/23] [std.regex] Trailing whites --- std/regex/internal/parser.d | 2 +- std/regex/package.d | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index 9e9be2fdc57..3f0fb806730 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -23,7 +23,7 @@ auto makeRegex(S, CG)(Parser!(S, CG) p) pure maxCounterDepth = g.counterDepth; flags = p.re_flags; charsets = g.charsets - .map!(x => + .map!(x => x.byInterval.map!(x=>Interval(x.a,x.b)).array ).array; matchers = g.matchers; diff --git a/std/regex/package.d b/std/regex/package.d index 0d9e99272b1..11902dfba7b 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -668,7 +668,7 @@ private: _memory = (enforce(malloc(size), "malloc failed")[0..size]); scope(failure) free(_memory.ptr); *cast(size_t*)_memory.ptr = 1; - _engine = EngineType(prog, Input!Char(input), + _engine = EngineType(prog, Input!Char(input), _memory[size_t.sizeof..$], reFlags); static if (is(typeof(prog.nativeFn))) _engine.nativeFn = prog.nativeFn; @@ -1085,7 +1085,7 @@ public auto bmatch(R, String)(R input, String re) if (isSomeString!R && isSomeString!String) { import std.regex.internal.backtracking : BacktrackingMatcher; - auto r = regex(re); + auto r = regex(re); return RegexMatch!(Unqual!(typeof(input)), BacktrackingMatcher!false) (input, r, r.flags); } @@ -1566,7 +1566,7 @@ public: 
/// ditto public Splitter!(keepSeparators, Range, RegEx) splitter( Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx) - (Range r, const RegEx pat) + (Range r, const RegEx pat) if (is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range)) { return Splitter!(keepSeparators, Range, RegEx)(r, pat); From d1d53c556ca180909ed803109c1c5be5253f5d2b Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Fri, 7 Oct 2016 02:17:40 +0300 Subject: [PATCH 22/23] [std.regex] Fixes for recent compiler version --- std/regex/internal/tests2.d | 12 ++++++------ std/regex/internal/tests3.d | 8 ++++---- std/regex/package.d | 20 +++++++++++++------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/std/regex/internal/tests2.d b/std/regex/internal/tests2.d index 19286fa31f8..5aaa360e185 100644 --- a/std/regex/internal/tests2.d +++ b/std/regex/internal/tests2.d @@ -45,29 +45,29 @@ unittest unittest { auto rtr = regex("a|b|c"); - enum ctr = regex("a|b|c"); + const ctr = regex("a|b|c"); assert(equal(rtr.ir,ctr.ir)); //CTFE parser BUG is triggered by group //in the middle of alternation (at least not first and not last) - enum testCT = regex(`abc|(edf)|xyz`); + const testCT = regex(`abc|(edf)|xyz`); auto testRT = regex(`abc|(edf)|xyz`); assert(equal(testCT.ir,testRT.ir)); } unittest { - enum cx = ctRegex!"(A|B|C)"; + immutable cx = ctRegex!"(A|B|C)"; auto mx = match("B",cx); assert(mx); assert(equal(mx.captures, [ "B", "B"])); - enum cx2 = ctRegex!"(A|B)*"; + immutable cx2 = ctRegex!"(A|B)*"; assert(match("BAAA",cx2)); - enum cx3 = ctRegex!("a{3,4}","i"); + immutable cx3 = ctRegex!("a{3,4}","i"); auto mx3 = match("AaA",cx3); assert(mx3); assert(mx3.captures[0] == "AaA"); - enum cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); + immutable cx4 = ctRegex!(`^a{3,4}?[a-zA-Z0-9~]{1,2}`,"i"); auto mx4 = match("aaaabc", cx4); assert(mx4); assert(mx4.captures[0] == "aaaab"); diff --git a/std/regex/internal/tests3.d b/std/regex/internal/tests3.d index 
07541fbb62d..3bd8cb8f336 100644 --- a/std/regex/internal/tests3.d +++ b/std/regex/internal/tests3.d @@ -45,7 +45,7 @@ unittest {// bugzilla 7679 foreach (S; AliasSeq!(string, wstring, dstring)) (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 - enum re = ctRegex!(to!S(r"\.")); + const re = ctRegex!(to!S(r"\.")); auto str = to!S("a.b"); assert(equal(std.regex.splitter(str, re), [to!S("a"), to!S("b")])); assert(split(str, re) == [to!S("a"), to!S("b")]); @@ -89,8 +89,8 @@ unittest // bugzilla 8349 unittest { - enum peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)"; - enum peakRegex = ctRegex!(peakRegexStr); + const peakRegexStr = r"\>(wgEncode.*Tfbs.*\.(?:narrow)|(?:broad)Peak.gz)"; + const peakRegex = ctRegex!(peakRegexStr); //note that the regex pattern itself is probably bogus assert(match(r"\>wgEncode-blah-Tfbs.narrow", peakRegex)); } @@ -168,7 +168,7 @@ unittest // bugzilla 11262 unittest { - enum reg = ctRegex!(r",", "g"); + const reg = ctRegex!(r",", "g"); auto str = "This,List"; str = str.replace(reg, "-"); assert(str == "This-List"); diff --git a/std/regex/package.d b/std/regex/package.d index 11902dfba7b..e4566d47c6f 100644 --- a/std/regex/package.d +++ b/std/regex/package.d @@ -324,7 +324,7 @@ public alias StaticRegex(Char) = std.regex.internal.ir.StaticRegex!(Char); if (isSomeString!(S)) { import std.array : appender; - S pat; + Unqual!S pat; if (patterns.length > 1) { auto app = appender!S(); @@ -386,18 +386,24 @@ public auto regexImpl(S)(S pattern, const(char)[] flags="") pure } -template ctRegexImpl(alias pattern, string flags=[]) +private template IsolatedFunc(Char, alias source) { - import std.regex.internal.parser, std.regex.internal.backtracking; - static immutable r = cast(immutable)regexPure([pattern], flags); - alias Char = BasicElementOf!(typeof(pattern)); - enum source = ctGenRegExCode(r); + import std.regex.internal.backtracking; alias Matcher = BacktrackingMatcher!(true); - @trusted bool func(ref 
Matcher!Char matcher) + @trusted bool IsolatedFunc(ref Matcher!Char matcher) { debug(std_regex_ctr) pragma(msg, source); mixin(source); } +} + +template ctRegexImpl(alias pattern, string flags=[]) +{ + import std.regex.internal.parser, std.regex.internal.backtracking; + static immutable r = cast(immutable)regexPure([pattern], flags); + alias Char = BasicElementOf!(typeof(pattern)); + enum source = ctGenRegExCode(r); + alias func = IsolatedFunc!(Char, source); static immutable nr = immutable StaticRegex!Char(r, &func); } From f8b3eea065ac3264d2e6784836b4a69a5d4a2be4 Mon Sep 17 00:00:00 2001 From: Dmitry Olshansky Date: Sun, 9 Oct 2016 15:35:32 +0300 Subject: [PATCH 23/23] [std.regex] Addressing review comments --- std/regex/internal/bitnfa.d | 33 +++++++++++++++++++++------------ std/regex/internal/ir.d | 8 ++++---- std/regex/package.d | 23 ++++++++++++----------- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/std/regex/internal/bitnfa.d b/std/regex/internal/bitnfa.d index 3e7fbd61bfd..2dd0b8f0d41 100644 --- a/std/regex/internal/bitnfa.d +++ b/std/regex/internal/bitnfa.d @@ -1,4 +1,3 @@ - //Written in the D programming language /* Implementation of a concept "NFA in a word" which is @@ -43,7 +42,7 @@ pure: if (!p.occupied) { items++; - if (4*items >= table.length*3) + if (4 * items >= table.length * 3) { grow(); p = locate(key, table); @@ -107,7 +106,7 @@ private: if (slot == table.length) slot = 0; } - return table.ptr+slot; + return table.ptr + slot; } void grow() @@ -125,6 +124,17 @@ private: } } +unittest +{ + HashTab tab; + tab[3] = 1; + tab[7] = 2; + tab[11] = 3; + assert(tab[3] == 1); + assert(tab[7] == 2); + assert(tab[11] == 3); +} + // Specialized 2-level trie of uint masks for BitNfa. 
// Uses the concept of CoW: a page gets modified in place @@ -586,11 +596,6 @@ auto reverseBitNfa(Char)(auto ref Regex!Char re, uint length) pure while (ir[pc].code == Option) { size_t size = ir[pc].data; - if (ir[pc+size-IRL!GotoEndOr].code == GotoEndOr) - { - ir[pc+size-IRL!(GotoEndOr)].data = ir[pc+size-IRL!(GotoEndOr)].data+1; - size -= IRL!GotoEndOr; - } size_t j = pc + IRL!Option; if (ir[j].code == End) { @@ -657,7 +662,7 @@ version(unittest) { import std.regex, std.conv; import std.stdio; - auto rex = regex(re); + auto rex = regex(re, "s"); auto m = make(rex); auto s = Input!char(input); assert(m.search(s), text("Failed @", line, " ", input, " with ", re)); @@ -672,7 +677,7 @@ version(unittest) { import std.regex, std.conv; import std.stdio; - auto rex = regex(re); + auto rex = regex(re, "s"); auto m = make(rex); auto s = Input!char(input); assert(!m.search(s), text("Should have failed @", line, " " , input, " with ", re)); @@ -709,6 +714,8 @@ unittest .checkBit("0123456789_0123456789_0123456789_0123456789", 31); "0123456789_0123456789_0123456789_012" .checkBit("0123456789(0123456789_0123456789_0123456789_0123456789|01234)",10); + "0123456789_0123456789_0123456789_012" + .checkBit("0123456789_0123456789_012345678[890]", 31); // assertions ignored "0abc1".checkBit("(?